/**
 * Converts a base64 data URI (e.g. `data:image/png;base64,iVBORw0...`) into a `File`.
 *
 * The MIME type is read from the header (the text between the first `:` and
 * the first `;` that follows it); when no such header is found the file gets
 * an empty type. The payload is whatever follows the last comma, so a bare
 * base64 string without any header is accepted too.
 *
 * @param dataUrl base64 data URI (or bare base64 string) to decode
 * @param fileName name given to the resulting file
 * @returns a `File` holding the decoded bytes
 * @throws if the payload is not valid base64 (error raised by `atob`)
 */
export function base64DataUriToFile(dataUrl: string, fileName: string): File {
  const firstComma = dataUrl.indexOf(',')
  const header = firstComma === -1 ? dataUrl : dataUrl.slice(0, firstComma)
  // lastIndexOf returns -1 when there is no comma, which makes slice(0) keep the whole input
  const payload = dataUrl.slice(dataUrl.lastIndexOf(',') + 1)

  const mime = header.match(/:(.*?);/)?.[1] ?? ''

  // atob yields a "binary string": one character (code 0-255) per decoded byte
  const binary = atob(payload)
  const bytes = Uint8Array.from(binary, (char) => char.charCodeAt(0))

  return new File([bytes], fileName, { type: mime })
}
- */ -async function initializeFFmpeg() { - const [ffmpegSt, ffmpegMt] = [new FFmpeg(), new FFmpeg()] - const baseStURL = 'https://unpkg.com/@ffmpeg/core@0.12.6/dist/umd' - const baseMtURL = 'https://unpkg.com/@ffmpeg/core-mt@0.12.6/dist/umd' - - ffmpegSt.on('log', ({ message }) => { - console.log(TAG, 'FFmpeg Single-Thread:', message) - }) - - ffmpegMt.on('log', ({ message }) => { - console.log(TAG, 'FFmpeg Multi-Thread:', message) - }) - - await ffmpegSt.load({ - coreURL: await toBlobURL(`${baseStURL}/ffmpeg-core.js`, 'text/javascript'), - wasmURL: await toBlobURL( - `${baseStURL}/ffmpeg-core.wasm`, - 'application/wasm' - ), - }) - - await ffmpegMt.load({ - coreURL: await toBlobURL(`${baseMtURL}/ffmpeg-core.js`, 'text/javascript'), - wasmURL: await toBlobURL( - `${baseMtURL}/ffmpeg-core.wasm`, - 'application/wasm' - ), - workerURL: await toBlobURL( - `${baseMtURL}/ffmpeg-core.worker.js`, - 'text/javascript' - ), - }) - - return [ffmpegSt, ffmpegMt] as [FFmpeg, FFmpeg] -} - -/** - * Get loaded FFmpeg. - */ -let ffmpegInstance: [FFmpeg, FFmpeg] -export async function loadFFmpegSt() { - if (!ffmpegInstance) ffmpegInstance = await initializeFFmpeg() - return ffmpegInstance[0] -} - -export async function loadFFmpegMt() { - if (!ffmpegInstance) ffmpegInstance = await initializeFFmpeg() - return ffmpegInstance[1] -} - -/** - * Creates an exclusive logger for the FFmpeg calls inside the provided method, - * it calculates the progress based on raw FFmpeg logs and the provided `totalTimeInMs`. - * - * @param totalTimeInMs - * @param method - * @param callback - * @param {number} callback.progress - The progress of the FFmpeg process from 0 to 100. 
- * @returns - */ -async function captureFFmpegProgress( - ffmpeg: FFmpeg, - totalTimeInMs: number, - method: () => any, - callback: (progress: number) => void -): Promise { - const extractProgressTimeMsFromLogs = (log: string): number | null => { - // `frame` for videos, `size` for audios - if (!log.startsWith('frame') && !log.startsWith('size')) return null - const timeRegex = /time=(\d{2}):(\d{2}):(\d{2})\.(\d{2})/ - const match = log.match(timeRegex) - if (match) { - const hours = parseInt(match[1]) - const minutes = parseInt(match[2]) - const seconds = parseInt(match[3]) - const centiseconds = parseInt(match[4]) - const totalMilliseconds = - hours * 3600000 + minutes * 60000 + seconds * 1000 + centiseconds * 10 - return totalMilliseconds - } - return null - } - let ffmpegLog = true - ffmpeg.on('log', ({ message }) => { - if (!ffmpegLog) return - const timeInMs = extractProgressTimeMsFromLogs(message) - if (timeInMs) callback((timeInMs / totalTimeInMs) * 100) - }) - const result = await method() - ffmpegLog = false - return result -} - -/** - * It will calculate a proportional progress between a targetProgress and a startProgress - * - * @param startProgress e.g. 50 - * @param progress e.g. 50 - * @param targetProgress e.g. 70 - * @returns e.g. 60, because 50% of progress between 70% and 50%, would result on 60% - */ -function calculateProgress( - startProgress: number, - progress: number, - targetProgress: number -): number { - return startProgress + (progress * (targetProgress - startProgress)) / 100 -} - -/** - * Creates an empty black video and appends it to the - * provided `fileListContentArray`. 
- * - * @param duration time in milliseconds - * @param width - * @param height - * @param filename - * @param fileListContentArray fileList.txt where to append the file name - * @param onProgress callback to capture the progress of this method - */ -export async function addEmptyVideo( - durationInSecs: number, - width: number, - height: number, - filename: string, - fileListContentArray: string[], - onProgress?: (progress: number, message?: string) => void -) { - const ffmpeg = await loadFFmpegMt() - let targetPartialProgress = 0 - - // For some reason, creating empty video with silent audio - // in one exec doesn't work, we need to split it. - - console.log( - TAG, - 'Creating empty video', - filename, - width, - height, - durationInSecs - ) - let currentProgress = 0 - targetPartialProgress = 50 - - await captureFFmpegProgress( - ffmpeg, - durationInSecs * 1000, - async () => { - await ffmpeg.exec([ - '-f', - 'lavfi', - '-i', - `color=c=black:s=${width}x${height}:d=${durationInSecs}`, - '-c:v', - 'libx264', - '-t', - `${durationInSecs}`, - '-loglevel', - 'verbose', - `base_${filename}`, - ]) - }, - (progress) => { - onProgress?.((progress / 100) * targetPartialProgress) - } - ) - - console.log( - TAG, - 'Adding silent audio to empty video', - filename, - width, - height, - durationInSecs - ) - currentProgress = 50 - targetPartialProgress = 100 - - const exitCode = await ffmpeg.exec([ - '-i', - `base_${filename}`, - '-f', - 'lavfi', - '-i', - 'anullsrc', - '-c:v', - 'copy', - '-c:a', - 'aac', - '-t', - `${durationInSecs}`, - '-loglevel', - 'verbose', - filename, - ]) - - if (exitCode) { - throw new Error(`${TAG}: Unexpect error while creating empty video`) - } - - console.log(TAG, 'Empty video created', filename) - fileListContentArray.push(`file ${filename}`) -} - -/** - * Creates the full mixed audio including silence - * segments and loads it into ffmpeg with the given `filename`. 
- * @param onProgress callback to capture the progress of this method - * @throws Error if ffmpeg returns exit code 1 - */ -export async function createFullAudio( - audios: FFMPegAudioInput[], - filename: string, - totalVideoDurationInMs: number, - onProgress?: (progress: number, message: string) => void -): Promise { - console.log(TAG, 'Creating full audio', filename) - - const ffmpeg = await loadFFmpegSt() - const filterComplexParts = [] - const baseFilename = `base_${filename}` - let currentProgress = 0 - let targetProgress = 25 - - // To mix audios at given times, we need a first empty base audio track - - await captureFFmpegProgress( - ffmpeg, - totalVideoDurationInMs, - async () => { - await ffmpeg.exec([ - '-f', - 'lavfi', - '-i', - 'anullsrc', - '-t', - `${totalVideoDurationInMs / 1000}`, - '-loglevel', - 'verbose', - !audios.length ? filename : baseFilename, - ]) - }, - (progress) => { - onProgress?.( - calculateProgress(currentProgress, progress, targetProgress), - 'Creating base audio...' 
- ) - } - ) - - // If there is no audios, the base audio is the final one - if (!audios.length) return onProgress?.(100, 'Prepared audios...') - - currentProgress = targetProgress - targetProgress = 50 - - // Mix audios based on their start times - - const audioInputFiles = ['-i', baseFilename] - for (let index = 0; index < audios.length; index++) { - onProgress?.(currentProgress, 'Creating base audio...') - console.log(TAG, `Processing audio #${index}`) - const audio = audios[index] - const expectedProgressForItem = ((1 / audios.length) * targetProgress) / 100 - if (!audio.data) continue - const audioFilename = `audio_${UUID()}.mp3` - await ffmpeg.writeFile(audioFilename, audio.data) - audioInputFiles.push('-i', audioFilename) - const delay = audio.startTimeInMs - const durationInSecs = audio.endTimeInMs - audio.startTimeInMs / 1000 - filterComplexParts.push( - `[${index + 1}:a]atrim=0:${durationInSecs},adelay=${delay}|${delay}[delayed${index}]` - ) - currentProgress += expectedProgressForItem * 100 - } - - const amixInputs = `[0:a]${audios.map((_, index) => `[delayed${index}]`).join('')}amix=inputs=${audios.length + 1}:duration=longest` - filterComplexParts.push(`${amixInputs}[a]`) - const filterComplex = filterComplexParts.join('; ') - - currentProgress = targetProgress - targetProgress = 100 - - const createFullAudioExitCode = await captureFFmpegProgress( - ffmpeg, - totalVideoDurationInMs, - async () => { - await ffmpeg.exec([ - ...audioInputFiles, - '-filter_complex', - filterComplex, - '-map', - '[a]', - '-t', - `${totalVideoDurationInMs / 1000}`, - '-loglevel', - 'verbose', - filename, - ]) - }, - (progress) => { - onProgress?.( - calculateProgress(currentProgress, progress, targetProgress), - 'Mixing audios...' 
- ) - } - ) - - if (createFullAudioExitCode) { - throw new Error(`${TAG}: Error while creating full audio!`) - } - onProgress?.(targetProgress, 'Prepared audios...') -} - -/** - * Creates the full silent video including empty black - * segments and loads it into ffmpeg with the given `filename`. - * @param onProgress callback to capture the progress of this method - * @throws Error if ffmpeg returns exit code 1 - */ -export async function createFullSilentVideo( - videos: FFMPegVideoInput[], - filename: string, - totalVideoDurationInMs: number, - width: number, - height: number, - excludeEmptyContent = false, - onProgress?: (progress: number, message: string) => void -) { - const ffmpeg = await loadFFmpegMt() - const fileList = 'fileList.txt' - const fileListContentArray = [] - - // Complete array of videos including concatenated empty segments - // This is helpful for cleaner progress log - let lastStartTimeVideoInMs = 0 - let videosWithGaps: FFMPegVideoInput[] - - if (!videos.length) { - videosWithGaps = [ - { - startTimeInMs: 0, - endTimeInMs: totalVideoDurationInMs, - data: null, - durationInSecs: totalVideoDurationInMs / 1000, - }, - ] - } else { - videosWithGaps = videos.reduce((arr: FFMPegVideoInput[], video, index) => { - const emptyVideoDurationInMs = - video.startTimeInMs - lastStartTimeVideoInMs - if (emptyVideoDurationInMs) { - arr.push({ - startTimeInMs: lastStartTimeVideoInMs, - endTimeInMs: lastStartTimeVideoInMs + emptyVideoDurationInMs, - data: null, - durationInSecs: emptyVideoDurationInMs / 1000, - }) - } - arr.push(video) - lastStartTimeVideoInMs = video.endTimeInMs - if ( - index == videos.length - 1 && - lastStartTimeVideoInMs < totalVideoDurationInMs - ) { - arr.push({ - startTimeInMs: lastStartTimeVideoInMs, - endTimeInMs: totalVideoDurationInMs, - data: null, - durationInSecs: - (totalVideoDurationInMs - lastStartTimeVideoInMs) / 1000, - }) - } - return arr - }, []) - } - - onProgress?.(0, 'Preparing videos...') - - // Arbitrary percentage, 
as `concat` is fast, - // then estimate the generation of gap videos - // as the 70% of the work - let currentProgress = 0 - let targetProgress = 70 - - for (const video of videosWithGaps) { - const expectedProgressForItem = - (((video.durationInSecs * 1000) / totalVideoDurationInMs) * - targetProgress) / - 100 - if (!video.data) { - if (excludeEmptyContent) continue - let collectedProgress = 0 - await addEmptyVideo( - video.durationInSecs, - width, - height, - `empty_video_${UUID()}.mp4`, - fileListContentArray, - (progress) => { - const subProgress = progress / 100 - currentProgress += - (expectedProgressForItem * subProgress - collectedProgress) * 100 - console.log(TAG, 'Current progress', currentProgress) - onProgress?.(currentProgress, 'Preparing videos...') - collectedProgress = expectedProgressForItem * subProgress - } - ) - } else { - const videoFilename = `video_${UUID()}.mp4` - await ffmpeg.writeFile(videoFilename, video.data) - fileListContentArray.push(`file ${videoFilename}`) - currentProgress += expectedProgressForItem * 100 - console.log(TAG, 'Current progress', currentProgress) - onProgress?.(currentProgress, 'Preparing videos...') - } - } - - onProgress?.(targetProgress, 'Concatenating videos...') - currentProgress = 70 - targetProgress = 100 - - const fileListContent = fileListContentArray.join('\n') - await ffmpeg.writeFile(fileList, fileListContent) - - const creatBaseFullVideoExitCode = await captureFFmpegProgress( - ffmpeg, - totalVideoDurationInMs, - async () => { - await ffmpeg.exec([ - '-f', - 'concat', - '-safe', - '0', - '-i', - fileList, - '-loglevel', - 'verbose', - '-c', - 'copy', - filename, - ]) - }, - (progress: number) => { - onProgress?.( - calculateProgress(currentProgress, progress, targetProgress), - 'Merging audio and video...' 
- ) - } - ) - - if (creatBaseFullVideoExitCode) { - throw new Error(`${TAG}: Error while creating base full video!`) - } - onProgress?.(targetProgress, 'Concatenating videos...') -} +import { + calculateProgress, + captureFFmpegProgress, + createFullAudio, + createFullSilentVideo, + FFMPegAudioInput, + FFMPegVideoInput, + loadFFmpegMt, + loadFFmpegSt, + TAG, +} from './ffmpegUtils' /** * Creates full video with audio using `@ffmpeg/ffmpeg` multi-core, diff --git a/src/services/io/extractCaptionFromFrame.ts b/src/services/io/extractCaptionFromFrameMoondream.ts similarity index 92% rename from src/services/io/extractCaptionFromFrame.ts rename to src/services/io/extractCaptionFromFrameMoondream.ts index 014124d0..d25eb23a 100644 --- a/src/services/io/extractCaptionFromFrame.ts +++ b/src/services/io/extractCaptionFromFrameMoondream.ts @@ -5,7 +5,7 @@ import { RawImage, } from '@xenova/transformers' -export async function extractCaptionFromFrame( +export async function extractCaptionFromFrameMoondream( imageInBase64DataUri: string ): Promise { if (!(navigator as any).gpu) { @@ -16,7 +16,8 @@ export async function extractCaptionFromFrame( 2. You need to enable WebGPU (depends on your browser, see below) 2.1 For Chrome: Perform the following operations in the Chrome / Microsoft Edge address bar -The chrome://flags/#enable-unsafe-webgpu flag must be enabled (not enable-webgpu-developer-features). Linux experimental support also requires launching the browser with --enable-features=Vulkan. +The chrome://flags/#enable-unsafe-webgpu flag must be enabled (not enable-webgpu-developer-features). +Linux experimental support also requires launching the browser with --enable-features=Vulkan. 
import {
  AutoProcessor,
  AutoTokenizer,
  Florence2ForConditionalGeneration,
  RawImage,
} from '@xenova/transformers'

/**
 * Generates one caption per image with the Florence-2 model
 * (`onnx-community/Florence-2-base-ft`), running in the browser.
 *
 * Progress reporting: 0 -> 15 while the model, processor and tokenizer are
 * loaded, then 15 -> 90 spread evenly over the images.
 *
 * @param images images to caption, as URLs accepted by `RawImage.fromURL`
 *   (the callers in this file pass base64 data URIs)
 * @param onProgress called with the overall progress, the number of images
 *   captioned so far (0 while loading, then 1..n) and the total number of images
 * @returns the captions, in the same order as `images`
 * @throws Error when `navigator.gpu` is not available (WebGPU disabled)
 */
export async function extractCaptionsFromFrames(
  images: string[] = [],
  onProgress: (
    progress: number,
    storyboardIndex: number,
    nbStoryboards: number
  ) => void
): Promise<string[]> {
  if (!(navigator as any).gpu) {
    throw new Error(`Please enable WebGPU to analyze video frames:

1. You need a modern browser such as Google Chrome 113+, Microsoft Edge 113+, Safari 18 (macOS 15), Firefox Nightly

2. You need to enable WebGPU (depends on your browser, see below)

2.1 For Chrome: Perform the following operations in the Chrome / Microsoft Edge address bar
The chrome://flags/#enable-unsafe-webgpu flag must be enabled (not enable-webgpu-developer-features).
Linux experimental support also requires launching the browser with --enable-features=Vulkan.

2.2 For Safari 18 (macOS 15): WebGPU is enabled by default

2.3 For Firefox Nightly: Type about:config in the address bar and set 'dom.webgpu.enabled" to true
`)
  }

  const nbImages = images.length
  // share of the progress bar spent loading the model vs. captioning images
  const loadingShare = 15
  const captioningShare = 75

  onProgress(0, 0, nbImages)
  // for code example, see:
  // https://github.com/xenova/transformers.js/pull/545#issuecomment-2183625876

  // Load model, processor, and tokenizer
  const model_id = 'onnx-community/Florence-2-base-ft'
  const model = await Florence2ForConditionalGeneration.from_pretrained(
    model_id,
    {
      dtype: 'fp32',
    }
  )
  onProgress(5, 0, nbImages)

  const processor = await AutoProcessor.from_pretrained(model_id)
  onProgress(10, 0, nbImages)

  const tokenizer = await AutoTokenizer.from_pretrained(model_id)
  onProgress(loadingShare, 0, nbImages)

  // not all prompts will work properly, see the official examples:
  // https://huggingface.co/microsoft/Florence-2-base-ft/blob/e7a5acc73559546de6e12ec0319cd7cc1fa2437c/processing_florence2.py#L115-L117

  // Prepare text inputs (identical for every image, so tokenized once)
  const prompts = 'Describe with a paragraph what is shown in the image.'
  const text_inputs = tokenizer(prompts)

  const captions: string[] = []
  for (const [index, imageInBase64DataUri] of images.entries()) {
    console.log('analyzing image:', imageInBase64DataUri.slice(0, 64))
    // Prepare vision inputs
    const image = await RawImage.fromURL(imageInBase64DataUri)
    const vision_inputs = await processor(image)

    console.log(' - generating caption..')
    // Generate text
    const generated_ids = await model.generate({
      ...text_inputs,
      ...vision_inputs,
      max_new_tokens: 100,
    })

    // Decode generated text
    const generated_text = tokenizer.batch_decode(generated_ids, {
      skip_special_tokens: true,
    })

    const caption = `${generated_text[0] || ''}`
    console.log(' - caption:', caption)
    captions.push(caption)

    // The counter used to stay stuck at 1; report the real number of
    // captioned images and derive the progress from it.
    const nbCaptioned = index + 1
    onProgress(
      loadingShare + (nbCaptioned / nbImages) * captioningShare,
      nbCaptioned,
      nbImages
    )
  }
  return captions
}
+50,33 @@ async function extractFramesFromVideo( reader.readAsArrayBuffer(videoBlob.slice(offset, offset + chunkSize)) }) + if (options.debug) { + console.log('calling await mediaInfo.analyzeData(getSize, readChunk)') + } + const result = await mediaInfo.analyzeData(getSize, readChunk) + if (options.debug) { + console.log('result = ', result) + } let duration: number = 0 for (const track of result.media?.track || []) { - /// '@type': "General" | "Video" | "Audio" | "Text" | "Image" | "Menu" | "Other" + if (options.debug) { + console.log('track = ', track) + } + let maybeDuration: number = 0 if (track['@type'] === 'Audio') { const audioTrack = track as AudioTrack - maybeDuration = audioTrack.Duration || 0 + maybeDuration = audioTrack.Duration + ? parseFloat(`${audioTrack.Duration || 0}`) + : 0 } else if (track['@type'] === 'Video') { const videoTrack = track as VideoTrack - maybeDuration = videoTrack.Duration || 0 + maybeDuration = videoTrack.Duration + ? parseFloat(`${videoTrack.Duration || 0}`) + : 0 } if ( typeof maybeDuration === 'number' && @@ -69,6 +91,10 @@ async function extractFramesFromVideo( throw new Error('Could not determine video duration (or it is length 0)') } + if (options.debug) { + console.log('duration in seconds:', duration) + } + // Initialize FFmpeg const ffmpeg = new FFmpeg() const baseURL = 'https://unpkg.com/@ffmpeg/core@0.12.6/dist/umd' @@ -78,17 +104,26 @@ async function extractFramesFromVideo( wasmURL: await toBlobURL(`${baseURL}/ffmpeg-core.wasm`, 'application/wasm'), }) + if (options.debug) { + console.log('FFmpeg loaded!') + } + // Write video file to FFmpeg's file system const videoUint8Array = new Uint8Array(await videoBlob.arrayBuffer()) await ffmpeg.writeFile('input.mp4', videoUint8Array) - + if (options.debug) { + console.log('input.mp4 written!') + } // Prepare FFmpeg command const sceneFilter = `select='gt(scene,0.4)'` const additionalFramesFilter = `select='not(mod(n,${Math.floor(100 / options.sceneSamplingRate)}))'` - const 
scaleFilter = `scale=iw*min(${options.maxWidth}/iw\,${options.maxHeight}/ih):ih*min(${options.maxWidth}/iw\,${options.maxHeight}/ih)` + const scaleFilter = `scale='min(${options.maxWidth},iw)':min'(${options.maxHeight},ih)':force_original_aspect_ratio=decrease` let lastProgress = 0 ffmpeg.on('log', ({ message }) => { + if (options.debug) { + console.log('FFmpeg log:', message) + } const timeMatch = message.match(/time=(\d{2}):(\d{2}):(\d{2}\.\d{2})/) if (timeMatch) { const [, hours, minutes, seconds] = timeMatch @@ -102,40 +137,82 @@ async function extractFramesFromVideo( } }) - await ffmpeg.exec([ + const ffmpegCommand = [ '-i', 'input.mp4', + '-loglevel', + 'verbose', '-vf', `${sceneFilter},${additionalFramesFilter},${scaleFilter}`, '-vsync', - '0', + '2', '-q:v', '2', + '-f', + 'image2', + '-frames:v', + '1000', // Limit the number of frames to extract `frames_%03d.${options.format}`, - ]) + ] + + if (options.debug) { + console.log('Executing FFmpeg command:', ffmpegCommand.join(' ')) + } + + try { + await ffmpeg.exec(ffmpegCommand) + } catch (error) { + console.error('FFmpeg execution error:', error) + throw error + } // Read generated frames const files = await ffmpeg.listDir('/') + if (options.debug) { + console.log('All files in FFmpeg filesystem:', files) + } const frameFiles = files.filter( (file) => file.name.startsWith('frames_') && file.name.endsWith(`.${options.format}`) ) + if (options.debug) { + console.log('Frame files found:', frameFiles.length) + } const frames: string[] = [] + const encoder = new TextEncoder() + for (let i = 0; i < frameFiles.length; i++) { const file = frameFiles[i] - const frameData = await ffmpeg.readFile(file.name) - const base64Frame = btoa( - String.fromCharCode.apply(null, frameData as unknown as number[]) - ) - frames.push(`data:image/${options.format};base64,${base64Frame}`) - - // Update progress for frame processing (from 90% to 100%) - options.onProgress?.(90 + Math.round(((i + 1) / frameFiles.length) * 10)) + if 
(options.debug) { + console.log(`Processing frame file: ${file.name}`) + } + try { + const frameData = await ffmpeg.readFile(file.name) + + // Convert Uint8Array to Base64 string without using btoa + let binary = '' + const bytes = new Uint8Array(frameData as any) + const len = bytes.byteLength + for (let i = 0; i < len; i++) { + binary += String.fromCharCode(bytes[i]) + } + const base64Frame = window.btoa(binary) + + frames.push(`data:image/${options.format};base64,${base64Frame}`) + + // Update progress for frame processing (from 90% to 100%) + options.onProgress?.(90 + Math.round(((i + 1) / frameFiles.length) * 10)) + } catch (error) { + console.error(`Error processing frame ${file.name}:`, error) + // You can choose to either skip this frame or throw an error + // throw error; // Uncomment this line if you want to stop processing on any error + } } + if (options.debug) { + console.log(`Total frames processed: ${frames.length}`) + } return frames } - -export default extractFramesFromVideo diff --git a/src/services/io/ffmpegUtils.ts b/src/services/io/ffmpegUtils.ts new file mode 100644 index 00000000..40b85972 --- /dev/null +++ b/src/services/io/ffmpegUtils.ts @@ -0,0 +1,478 @@ +import { UUID } from '@aitube/clap' +import { FFmpeg } from '@ffmpeg/ffmpeg' +import { toBlobURL } from '@ffmpeg/util' + +export const TAG = 'io/createFullVideo' + +export type FFMPegVideoInput = { + data: Uint8Array | null + startTimeInMs: number + endTimeInMs: number + durationInSecs: number +} + +export type FFMPegAudioInput = FFMPegVideoInput + +/** + * Download and load single and multi-threading FFMPeg. + * MT for video + * ST for audio (as MT has issues with it) + * toBlobURL is used to bypass CORS issues, urls with the same domain can be used directly. 
/**
 * Downloads and loads two FFmpeg instances: a single-thread core and a
 * multi-thread core.
 * MT for video
 * ST for audio (as MT has issues with it)
 * `toBlobURL` is used to bypass CORS issues, urls with the same domain can be used directly.
 *
 * @returns the tuple `[singleThread, multiThread]`, both already loaded
 */
export async function initializeFFmpeg(): Promise<[FFmpeg, FFmpeg]> {
  const ffmpegSt = new FFmpeg()
  const ffmpegMt = new FFmpeg()
  const baseStURL = 'https://unpkg.com/@ffmpeg/core@0.12.6/dist/umd'
  const baseMtURL = 'https://unpkg.com/@ffmpeg/core-mt@0.12.6/dist/umd'

  ffmpegSt.on('log', ({ message }) => {
    console.log(TAG, 'FFmpeg Single-Thread:', message)
  })

  ffmpegMt.on('log', ({ message }) => {
    console.log(TAG, 'FFmpeg Multi-Thread:', message)
  })

  // The five downloads are independent from each other: fetch them in parallel
  const [stCoreURL, stWasmURL, mtCoreURL, mtWasmURL, mtWorkerURL] =
    await Promise.all([
      toBlobURL(`${baseStURL}/ffmpeg-core.js`, 'text/javascript'),
      toBlobURL(`${baseStURL}/ffmpeg-core.wasm`, 'application/wasm'),
      toBlobURL(`${baseMtURL}/ffmpeg-core.js`, 'text/javascript'),
      toBlobURL(`${baseMtURL}/ffmpeg-core.wasm`, 'application/wasm'),
      toBlobURL(`${baseMtURL}/ffmpeg-core.worker.js`, 'text/javascript'),
    ])

  await ffmpegSt.load({
    coreURL: stCoreURL,
    wasmURL: stWasmURL,
  })

  await ffmpegMt.load({
    coreURL: mtCoreURL,
    wasmURL: mtWasmURL,
    workerURL: mtWorkerURL,
  })

  return [ffmpegSt, ffmpegMt]
}

/**
 * In-flight or completed initialization, shared by every caller.
 * The promise itself is cached (not only its result) so that callers arriving
 * while the first load is still running reuse it instead of creating a second
 * pair of FFmpeg instances, each with its own virtual file system.
 */
let ffmpegInstance: Promise<[FFmpeg, FFmpeg]> | undefined

/**
 * Returns the shared initialization promise, starting the load on first use.
 * A failed load clears the cache so that the next call retries.
 */
function getFFmpegInstances(): Promise<[FFmpeg, FFmpeg]> {
  if (!ffmpegInstance) {
    ffmpegInstance = initializeFFmpeg().catch((error: unknown) => {
      ffmpegInstance = undefined
      throw error
    })
  }
  return ffmpegInstance
}

/**
 * Get the loaded single-thread FFmpeg (loads both instances on first use).
 */
export async function loadFFmpegSt(): Promise<FFmpeg> {
  const [ffmpegSt] = await getFFmpegInstances()
  return ffmpegSt
}

/**
 * Get the loaded multi-thread FFmpeg (loads both instances on first use).
 */
export async function loadFFmpegMt(): Promise<FFmpeg> {
  const [, ffmpegMt] = await getFFmpegInstances()
  return ffmpegMt
}
/**
 * Extracts the elapsed time, in milliseconds, from an FFmpeg progress log line.
 *
 * Only lines starting with `frame` (videos) or `size` (audios) are considered,
 * and they must contain `time=HH:MM:SS.cc`.
 *
 * @returns the elapsed time in milliseconds, or `null` for any other line
 */
function extractProgressTimeMsFromLogs(log: string): number | null {
  if (!log.startsWith('frame') && !log.startsWith('size')) return null
  const match = log.match(/time=(\d{2}):(\d{2}):(\d{2})\.(\d{2})/)
  if (!match) return null
  const [hours, minutes, seconds, centiseconds] = match
    .slice(1)
    .map((part) => parseInt(part, 10))
  return hours * 3600000 + minutes * 60000 + seconds * 1000 + centiseconds * 10
}

/**
 * Creates an exclusive logger for the FFmpeg calls inside the provided method,
 * it calculates the progress based on raw FFmpeg logs and the provided `totalTimeInMs`.
 *
 * The log listener is registered only for the duration of `method` and is
 * always removed afterwards, including when `method` throws.
 *
 * @param ffmpeg instance whose `log` events are observed
 * @param totalTimeInMs expected duration of the media being produced
 * @param method work to run while the logs are observed
 * @param callback receives the progress of the FFmpeg process, from 0 to 100
 * @returns whatever `method` resolves to
 */
export async function captureFFmpegProgress(
  ffmpeg: FFmpeg,
  totalTimeInMs: number,
  method: () => any,
  callback: (progress: number) => void
): Promise<any> {
  const onLog = ({ message }: { message: string }) => {
    const timeInMs = extractProgressTimeMsFromLogs(message)
    // FFmpeg may report a time slightly past the expected duration (and the
    // duration may be 0): never report more than 100%
    if (timeInMs) callback(Math.min(100, (timeInMs / totalTimeInMs) * 100))
  }

  ffmpeg.on('log', onLog)
  try {
    return await method()
  } finally {
    // without this, one extra listener would stay attached after every call
    ffmpeg.off('log', onLog)
  }
}

/**
 * It will calculate a proportional progress between a targetProgress and a startProgress
 *
 * @param startProgress e.g. 50
 * @param progress percentage (0 to 100) of the way from start to target, e.g. 50
 * @param targetProgress e.g. 70
 * @returns e.g. 60, because 50% of progress between 70% and 50%, would result on 60%
 */
export function calculateProgress(
  startProgress: number,
  progress: number,
  targetProgress: number
): number {
  return startProgress + (progress * (targetProgress - startProgress)) / 100
}
/**
 * Creates an empty black video (with a silent audio track) in the
 * multi-thread FFmpeg file system and appends it to the provided
 * `fileListContentArray`.
 *
 * The work is done in two FFmpeg runs: the black video first (reported as
 * 0 -> 50), then the silent audio track is added to it (100 is reported once
 * it is done).
 *
 * @param durationInSecs duration of the video, in seconds
 * @param width width of the video, in pixels
 * @param height height of the video, in pixels
 * @param filename name of the file to create in the FFmpeg file system
 * @param fileListContentArray fileList.txt where to append the file name
 * @param onProgress callback to capture the progress (0 to 100) of this method
 * @throws Error if one of the two FFmpeg runs returns a non-zero exit code
 */
export async function addEmptyVideo(
  durationInSecs: number,
  width: number,
  height: number,
  filename: string,
  fileListContentArray: string[],
  onProgress?: (progress: number, message?: string) => void
): Promise<void> {
  const ffmpeg = await loadFFmpegMt()
  const baseFilename = `base_${filename}`
  // share of the progress attributed to the first run (black video)
  const baseVideoShare = 50

  // For some reason, creating empty video with silent audio
  // in one exec doesn't work, we need to split it.

  console.log(
    TAG,
    'Creating empty video',
    filename,
    width,
    height,
    durationInSecs
  )

  // The exit code is returned so that a failure of this first run is detected
  const baseExitCode = await captureFFmpegProgress(
    ffmpeg,
    durationInSecs * 1000,
    () =>
      ffmpeg.exec([
        '-f',
        'lavfi',
        '-i',
        `color=c=black:s=${width}x${height}:d=${durationInSecs}`,
        '-c:v',
        'libx264',
        '-t',
        `${durationInSecs}`,
        '-loglevel',
        'verbose',
        baseFilename,
      ]),
    (progress) => {
      onProgress?.((progress / 100) * baseVideoShare)
    }
  )

  if (baseExitCode) {
    throw new Error(`${TAG}: Unexpect error while creating empty video`)
  }

  console.log(
    TAG,
    'Adding silent audio to empty video',
    filename,
    width,
    height,
    durationInSecs
  )

  const exitCode = await ffmpeg.exec([
    '-i',
    baseFilename,
    '-f',
    'lavfi',
    '-i',
    'anullsrc',
    '-c:v',
    'copy',
    '-c:a',
    'aac',
    '-t',
    `${durationInSecs}`,
    '-loglevel',
    'verbose',
    filename,
  ])

  if (exitCode) {
    throw new Error(`${TAG}: Unexpect error while creating empty video`)
  }

  // The second run has no log-based progress: report completion explicitly,
  // otherwise callers never see more than 50% for this video
  onProgress?.(100)

  console.log(TAG, 'Empty video created', filename)
  fileListContentArray.push(`file ${filename}`)
}
/**
 * Creates the full mixed audio including silence
 * segments and loads it into ffmpeg with the given `filename`.
 *
 * Steps and their share of the reported progress:
 * 1. 0 -> 25: a silent base track lasting `totalVideoDurationInMs` is created
 *    (it is directly the final file when `audios` is empty);
 * 2. 25 -> 50: every audio that has data is written to the FFmpeg file system;
 * 3. 50 -> 100: the audios are trimmed to their own duration, delayed to
 *    their start time and mixed on top of the base track.
 *
 * @param audios audio segments to mix; segments without `data` are ignored
 * @param filename name of the resulting file in the FFmpeg file system
 * @param totalVideoDurationInMs duration of the resulting audio, in milliseconds
 * @param onProgress callback to capture the progress of this method
 * @throws Error if ffmpeg returns a non-zero exit code
 */
export async function createFullAudio(
  audios: FFMPegAudioInput[],
  filename: string,
  totalVideoDurationInMs: number,
  onProgress?: (progress: number, message: string) => void
): Promise<void> {
  console.log(TAG, 'Creating full audio', filename)

  const ffmpeg = await loadFFmpegSt()
  const baseFilename = `base_${filename}`
  const totalDurationInSecs = `${totalVideoDurationInMs / 1000}`

  // To mix audios at given times, we need a first empty base audio track

  const baseExitCode = await captureFFmpegProgress(
    ffmpeg,
    totalVideoDurationInMs,
    () =>
      ffmpeg.exec([
        '-f',
        'lavfi',
        '-i',
        'anullsrc',
        '-t',
        totalDurationInSecs,
        '-loglevel',
        'verbose',
        !audios.length ? filename : baseFilename,
      ]),
    (progress) => {
      onProgress?.(calculateProgress(0, progress, 25), 'Creating base audio...')
    }
  )

  if (baseExitCode) {
    throw new Error(`${TAG}: Error while creating full audio!`)
  }

  // If there is no audios, the base audio is the final one
  if (!audios.length) {
    onProgress?.(100, 'Prepared audios...')
    return
  }

  // Mix audios based on their start times

  const audioInputFiles = ['-i', baseFilename]
  const filterComplexParts: string[] = []
  // labels of the streams that really exist, in the order of the `-i` inputs
  const delayedLabels: string[] = []

  for (let index = 0; index < audios.length; index++) {
    onProgress?.(
      calculateProgress(25, (index / audios.length) * 100, 50),
      'Creating base audio...'
    )
    console.log(TAG, `Processing audio #${index}`)
    const audio = audios[index]
    if (!audio.data) continue

    const audioFilename = `audio_${UUID()}.mp3`
    await ffmpeg.writeFile(audioFilename, audio.data)
    audioInputFiles.push('-i', audioFilename)

    // Input 0 is the base track. Audios without data are not passed to
    // FFmpeg, so the stream index must be counted on the written inputs
    // only, not on the position in `audios`.
    const inputIndex = delayedLabels.length + 1
    const label = `[delayed${index}]`
    const delay = audio.startTimeInMs
    // parentheses matter: the whole difference is converted to seconds
    const durationInSecs = (audio.endTimeInMs - audio.startTimeInMs) / 1000
    filterComplexParts.push(
      `[${inputIndex}:a]atrim=0:${durationInSecs},adelay=${delay}|${delay}${label}`
    )
    delayedLabels.push(label)
  }

  const amixInputs = `[0:a]${delayedLabels.join('')}amix=inputs=${delayedLabels.length + 1}:duration=longest`
  filterComplexParts.push(`${amixInputs}[a]`)
  const filterComplex = filterComplexParts.join('; ')

  // The exit code is returned by the callback, otherwise the check below
  // could never detect a failure
  const createFullAudioExitCode = await captureFFmpegProgress(
    ffmpeg,
    totalVideoDurationInMs,
    () =>
      ffmpeg.exec([
        ...audioInputFiles,
        '-filter_complex',
        filterComplex,
        '-map',
        '[a]',
        '-t',
        totalDurationInSecs,
        '-loglevel',
        'verbose',
        filename,
      ]),
    (progress) => {
      onProgress?.(calculateProgress(50, progress, 100), 'Mixing audios...')
    }
  )

  if (createFullAudioExitCode) {
    throw new Error(`${TAG}: Error while creating full audio!`)
  }
  onProgress?.(100, 'Prepared audios...')
}
as `concat` is fast, + // then estimate the generation of gap videos + // as the 70% of the work + let currentProgress = 0 + let targetProgress = 70 + + for (const video of videosWithGaps) { + const expectedProgressForItem = + (((video.durationInSecs * 1000) / totalVideoDurationInMs) * + targetProgress) / + 100 + if (!video.data) { + if (excludeEmptyContent) continue + let collectedProgress = 0 + await addEmptyVideo( + video.durationInSecs, + width, + height, + `empty_video_${UUID()}.mp4`, + fileListContentArray, + (progress) => { + const subProgress = progress / 100 + currentProgress += + (expectedProgressForItem * subProgress - collectedProgress) * 100 + console.log(TAG, 'Current progress', currentProgress) + onProgress?.(currentProgress, 'Preparing videos...') + collectedProgress = expectedProgressForItem * subProgress + } + ) + } else { + const videoFilename = `video_${UUID()}.mp4` + await ffmpeg.writeFile(videoFilename, video.data) + fileListContentArray.push(`file ${videoFilename}`) + currentProgress += expectedProgressForItem * 100 + console.log(TAG, 'Current progress', currentProgress) + onProgress?.(currentProgress, 'Preparing videos...') + } + } + + onProgress?.(targetProgress, 'Concatenating videos...') + currentProgress = 70 + targetProgress = 100 + + const fileListContent = fileListContentArray.join('\n') + await ffmpeg.writeFile(fileList, fileListContent) + + const creatBaseFullVideoExitCode = await captureFFmpegProgress( + ffmpeg, + totalVideoDurationInMs, + async () => { + await ffmpeg.exec([ + '-f', + 'concat', + '-safe', + '0', + '-i', + fileList, + '-loglevel', + 'verbose', + '-c', + 'copy', + filename, + ]) + }, + (progress: number) => { + onProgress?.( + calculateProgress(currentProgress, progress, targetProgress), + 'Merging audio and video...' 
+ ) + } + ) + + if (creatBaseFullVideoExitCode) { + throw new Error(`${TAG}: Error while creating base full video!`) + } + onProgress?.(targetProgress, 'Concatenating videos...') +} diff --git a/src/services/io/parseFileIntoSegments.ts b/src/services/io/parseFileIntoSegments.ts index 8d6d2f1e..9c14daa3 100644 --- a/src/services/io/parseFileIntoSegments.ts +++ b/src/services/io/parseFileIntoSegments.ts @@ -13,6 +13,8 @@ import { SegmentEditionStatus, SegmentVisibility, TimelineSegment, + useTimeline, + TimelineStore } from '@aitube/timeline' import { blobToBase64DataUri } from '@/lib/utils/blobToBase64DataUri' @@ -28,6 +30,7 @@ export async function parseFileIntoSegments({ */ file: File }): Promise { + const timeline: TimelineStore = useTimeline.getState() // console.log(`parseFileIntoSegments(): filename = ${file.name}`) // console.log(`parseFileIntoSegments(): file size = ${file.size} bytes`) // console.log(`parseFileIntoSegments(): file type = ${file.type}`) @@ -44,11 +47,61 @@ export async function parseFileIntoSegments({ const newSegments: TimelineSegment[] = [] switch (file.type) { + case 'image/jpeg': + case 'image/png': + case 'image/avif': + case 'image/heic': case 'image/webp': type = 'image' resourceCategory = 'control_image' + const startTimeInMs = cursorInSteps * DEFAULT_DURATION_IN_MS_PER_STEP + const durationInSteps = 4 + const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP + const endTimeInMs = startTimeInMs + durationInMs + + // ok let's stop for a minute there: + // if someone drops a .mp3, and assuming we don't yet have the UI to select the category, + // do you think it should be a SOUND, a VOICE or a MUSIC by default? + // I expect people will use AI service providers for sound and voice, + // maybe in some case music too, but there are also many people + // who will want to use their own track eg. 
to create a music video + const category = ClapSegmentCategory.MUSIC + + const assetUrl = await blobToBase64DataUri(file) + + const newSegmentData: Partial = { + prompt: 'audio track', + startTimeInMs, // start time of the segment + endTimeInMs, // end time of the segment (startTimeInMs + durationInMs) + status: ClapSegmentStatus.COMPLETED, + // track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index + label: `${file.name} (${Math.round(durationInMs / 1000)}s @ ${Math.round(bpm * 100) / 100} BPM)`, // a short label to name the segment (optional, can be human or LLM-defined) + category, + assetUrl, + assetDurationInMs: endTimeInMs, + assetSourceType: ClapAssetSource.DATA, + assetFileFormat: `${file.type}`, + } + + const timelineSegment = await clapSegmentToTimelineSegment( + newSegment(newSegmentData) + ) + timelineSegment.outputType = ClapOutputType.AUDIO + timelineSegment.outputGain = 1.0 + timelineSegment.audioBuffer = audioBuffer + + // we assume we want it to be immediately visible + timelineSegment.visibility = SegmentVisibility.VISIBLE + + // console.log("newSegment:", audioSegment) + + // poof! type disappears.. 
it's magic + newSegments.push(timelineSegment) break + break + + case 'audio/mpeg': // this is the "official" one case 'audio/mp3': // this is just an alias case 'audio/wav': diff --git a/src/services/io/useIO.ts b/src/services/io/useIO.ts index 9e11f667..e1d05f0b 100644 --- a/src/services/io/useIO.ts +++ b/src/services/io/useIO.ts @@ -40,11 +40,11 @@ import { formatSegmentForExport, } from '@/lib/utils/formatSegmentForExport' import { sleep } from '@/lib/utils/sleep' -import { - FFMPegAudioInput, - FFMPegVideoInput, - createFullVideo, -} from './createFullVideo' +import { FFMPegAudioInput, FFMPegVideoInput } from './ffmpegUtils' +import { createFullVideo } from './createFullVideo' +import { extractFramesFromVideo } from './extractFramesFromVideo' +import { extractCaptionsFromFrames } from './extractCaptionsFromFrames' +import { base64DataUriToFile } from '@/lib/utils/base64DataUriToFile' export const useIO = create((set, get) => ({ ...getDefaultIOState(), @@ -107,6 +107,69 @@ export const useIO = create((set, get) => ({ } const isVideoFile = fileType.startsWith('video/') + if (isVideoFile) { + const storyboardExtractionTask = useTasks.getState().add({ + category: TaskCategory.IMPORT, + visibility: TaskVisibility.BLOCKER, + initialMessage: `Extracting storyboards..`, + successMessage: `Extracting storyboards.. 100% done`, + value: 0, + }) + + const frames = await extractFramesFromVideo(file, { + format: 'png', // in theory we could also use 'jpg', but this freezes FFmpeg + maxWidth: 1024, + maxHeight: 576, + sceneSamplingRate: 100, + onProgress: (progress: number) => { + storyboardExtractionTask.setProgress({ + message: `Extracting storyboards.. 
${progress}% done`, + value: progress, + }) + }, + }) + + let i = 0 + for (const frame of frames) { + const frameFile = base64DataUriToFile(frame, `storyboard_${i++}.png`) + const newSegments = await parseFileIntoSegments({ file: frameFile }) + + console.log('calling timeline.addSegments with:', newSegments) + await timeline.addSegments({ + segments: newSegments, + }) + } + + storyboardExtractionTask.success() + + const enableCaptioning = false + + if (enableCaptioning) { + const captioningTask = useTasks.getState().add({ + category: TaskCategory.IMPORT, + // visibility: TaskVisibility.BLOCKER, + + // since this is very long task, we can run it in the background + visibility: TaskVisibility.BACKGROUND, + initialMessage: `Analyzing storyboards..`, + successMessage: `Analyzing storyboards.. 100% done`, + value: 0, + }) + + console.log('calling extractCaptionsFromFrames() with:', frames) + const captions = await extractCaptionsFromFrames(frames, (progress: number, storyboardIndex: number, nbStoryboards: number) => { + captioningTask.setProgress({ + message: `Analyzing storyboards (${progress}%)`, + value: progress, + }) + }) + console.log('captions:', captions) + // TODO: add + + captioningTask.success() + } + + } } }, openScreenplay: async (