From b67cf314ec21663c1b509424c5eb4bc50fa0a8da Mon Sep 17 00:00:00 2001
From: Julian Bilcke
Date: Tue, 27 Aug 2024 23:18:18 +0200
Subject: [PATCH] implement #74

Add an optional lip sync step to the resolve API:

- add a Replicate.com provider (runLipSync) which sends the video of the
  segment and the audio of the first dialogue segment to the model set in
  settings.videoLipsyncWorkflow
- run it from the resolve route as an extra step, right before the segment
  is returned, when a lip sync provider, a model, a video and a dialogue
  audio are all available
- return lipsyncWorkflow / lipsyncProvider / lipsyncEngine from
  getSegmentWorkflowProviderAndEngine (only set for video segments)

runLipSync rethrows provider failures after logging them, so the route does
not decode and mark as completed a segment that was never lip-synced.
---
 .../resolve/providers/replicate/runLipSync.ts | 89 +++++++++++++++++++
 packages/app/src/app/api/resolve/route.ts     | 56 ++++++++++++
 .../getSegmentWorkflowProviderAndEngine.ts    | 17 ++++
 3 files changed, 162 insertions(+)
 create mode 100644 packages/app/src/app/api/resolve/providers/replicate/runLipSync.ts

diff --git a/packages/app/src/app/api/resolve/providers/replicate/runLipSync.ts b/packages/app/src/app/api/resolve/providers/replicate/runLipSync.ts
new file mode 100644
--- /dev/null
+++ b/packages/app/src/app/api/resolve/providers/replicate/runLipSync.ts
@@ -0,0 +1,89 @@
+import Replicate from 'replicate'
+import { ClapSegmentCategory } from '@aitube/clap'
+import { TimelineSegment } from '@aitube/timeline'
+import { ResolveRequest } from '@aitube/clapper-services'
+
+/**
+ * Lip-syncs a video segment using a model hosted on Replicate.com.
+ *
+ * The model named by `settings.videoLipsyncWorkflow.data` receives the video
+ * of `request.segment` and the audio of the first dialogue segment found in
+ * `request.segments`. On success the segment is mutated in place: its
+ * `assetUrl` is replaced by the stringified model output.
+ *
+ * @param request - resolve request carrying the settings, the segment to
+ *   process and the segments searched for a dialogue
+ * @returns the same segment object as `request.segment`
+ * @throws Error when the API key, the model, the video or the dialogue audio
+ *   is missing, when the segment is not a video, or when the Replicate call
+ *   fails or returns an empty output
+ */
+export async function runLipSync(
+  request: ResolveRequest
+): Promise<TimelineSegment> {
+  if (!request.settings.replicateApiKey) {
+    throw new Error(`Missing API key for "Replicate.com"`)
+  }
+
+  const replicate = new Replicate({ auth: request.settings.replicateApiKey })
+
+  const segment: TimelineSegment = request.segment
+
+  // only video segments can be lip-synced for now
+  if (segment.category !== ClapSegmentCategory.VIDEO) {
+    throw new Error(
+      `Clapper doesn't support lip sync for the "${segment.category}" category using Replicate.com yet`
+    )
+  }
+
+  const videoLipsyncWorkflowModel =
+    request.settings.videoLipsyncWorkflow.data || ''
+
+  if (!videoLipsyncWorkflowModel) {
+    throw new Error(
+      `cannot run the lip sync without a videoLipsyncWorkflowModel`
+    )
+  }
+
+  if (!segment.assetUrl) {
+    throw new Error(`cannot run the lip sync without a video`)
+  }
+
+  // the voice track is the asset of the first dialogue segment of the request
+  const firstDialogueAudio = request.segments.find(
+    (s) => s.category === ClapSegmentCategory.DIALOGUE
+  )?.assetUrl
+
+  if (!firstDialogueAudio) {
+    throw new Error(`cannot run the lip sync without a dialogue speech`)
+  }
+
+  try {
+    const response = (await replicate.run(videoLipsyncWorkflowModel as any, {
+      input: {
+        // note: this is actually a VIDEO (they call it face, but it's a face video)
+        face: segment.assetUrl,
+        input_audio: firstDialogueAudio,
+
+        disable_safety_checker:
+          !request.settings.censorNotForAllAudiencesContent,
+      },
+    })) as any
+
+    // the raw output is coerced to a string and becomes the new asset URL
+    const videoResult = `${response || ''}`
+
+    if (!videoResult) {
+      throw new Error(`the generated video is empty`)
+    }
+
+    segment.assetUrl = videoResult
+  } catch (err) {
+    console.error(`failed to run a lip sync using Replicate.com:`, err)
+    // rethrow so the caller does not post-process and mark as completed a
+    // segment that was never lip-synced
+    throw err
+  }
+
+  return segment
+}
diff --git a/packages/app/src/app/api/resolve/route.ts b/packages/app/src/app/api/resolve/route.ts
index 2d32ba3d..9962c78d 100644
--- a/packages/app/src/app/api/resolve/route.ts
+++ b/packages/app/src/app/api/resolve/route.ts
@@ -36,6 +36,7 @@ import { getMediaInfo } from '@/lib/ffmpeg/getMediaInfo'
 import { getSegmentWorkflowProviderAndEngine } from '@/services/editors/workflow-editor/getSegmentWorkflowProviderAndEngine'
 import { runFaceSwap as runFaceswapWithFalAi } from './providers/falai/runFaceSwap'
 import { runFaceSwap as runFaceswapWithReplicate } from './providers/replicate/runFaceSwap'
+import { runLipSync as runLipSyncWithReplicate } from './providers/replicate/runLipSync'
 
 type ProviderFn = (request: ResolveRequest) => Promise<TimelineSegment>
 
@@ -54,6 +55,9 @@ export async function POST(req: NextRequest) {
     faceswapWorkflow,
     faceswapProvider,
     faceswapEngine,
+    lipsyncWorkflow,
+    lipsyncProvider,
+    lipsyncEngine,
   } = getSegmentWorkflowProviderAndEngine(request)
 
   /*
@@ -202,5 +206,57 @@ export async function POST(req: NextRequest) {
     }
   }
 
+  // extra step: lip sync
+  // for this we need to have a valid video
+  // (or we could use a simple image + audio model)
+
+  const hasValidVideo =
+    segment.category === ClapSegmentCategory.VIDEO && segment.assetUrl
+
+  const firstDialogue = request.segments.find(
+    (s) => s.category === ClapSegmentCategory.DIALOGUE
+  )
+  const hasValidAudio = firstDialogue?.assetUrl
+
+  if (
+    lipsyncProvider &&
+    request.settings.videoLipsyncWorkflow.data &&
+    hasValidVideo &&
+    hasValidAudio
+  ) {
+    const lipsyncProviders: Partial<Record<ClapWorkflowProvider, ProviderFn>> =
+      {
+        // TODO use Fal.ai? I think they only have SadTalker?
+        [ClapWorkflowProvider.REPLICATE]: runLipSyncWithReplicate,
+      }
+
+    const lipsync: ProviderFn | undefined =
+      lipsyncProviders[lipsyncProvider] || undefined
+
+    if (lipsync) {
+      try {
+        await lipsync(request)
+
+        // we clean-up and parse the output from all the resolvers:
+        // this will download files hosted on CDNs, convert WAV files to MP3 etc
+
+        segment.assetUrl = await decodeOutput(segment.assetUrl)
+
+        segment.assetSourceType = getClapAssetSourceType(segment.assetUrl)
+
+        segment.status = ClapSegmentStatus.COMPLETED
+
+        const { assetFileFormat, outputType } = getTypeAndExtension(
+          segment.assetUrl
+        )
+
+        segment.assetFileFormat = assetFileFormat
+        segment.outputType = outputType
+      } catch (err) {
+        console.error(`failed to run the lipsync (${err})`)
+      }
+    }
+  }
+
   return NextResponse.json(segment)
 }
diff --git a/packages/app/src/services/editors/workflow-editor/getSegmentWorkflowProviderAndEngine.ts b/packages/app/src/services/editors/workflow-editor/getSegmentWorkflowProviderAndEngine.ts
index b6297131..e390098c 100644
--- a/packages/app/src/services/editors/workflow-editor/getSegmentWorkflowProviderAndEngine.ts
+++ b/packages/app/src/services/editors/workflow-editor/getSegmentWorkflowProviderAndEngine.ts
@@ -20,6 +20,9 @@ export function getSegmentWorkflowProviderAndEngine({
   faceswapWorkflow?: ClapWorkflow
   faceswapProvider?: ClapWorkflowProvider
   faceswapEngine?: ClapWorkflowEngine
+  lipsyncWorkflow?: ClapWorkflow
+  lipsyncProvider?: ClapWorkflowProvider
+  lipsyncEngine?: ClapWorkflowEngine
 } {
   const generationWorkflow: ClapWorkflow | undefined =
     segment.category === ClapSegmentCategory.STORYBOARD
@@ -53,6 +56,17 @@ export function getSegmentWorkflowProviderAndEngine({
   const faceswapEngine: ClapWorkflowEngine | undefined =
     faceswapWorkflow?.engine || undefined
 
+  const lipsyncWorkflow: ClapWorkflow | undefined =
+    segment.category === ClapSegmentCategory.VIDEO
+      ? settings.videoLipsyncWorkflow
+      : undefined
+
+  const lipsyncProvider: ClapWorkflowProvider | undefined =
+    lipsyncWorkflow?.provider || undefined
+
+  const lipsyncEngine: ClapWorkflowEngine | undefined =
+    lipsyncWorkflow?.engine || undefined
+
   return {
     generationWorkflow,
     generationProvider,
@@ -60,5 +74,8 @@ export function getSegmentWorkflowProviderAndEngine({
     faceswapWorkflow,
     faceswapProvider,
     faceswapEngine,
+    lipsyncWorkflow,
+    lipsyncProvider,
+    lipsyncEngine,
   }
 }