big rewrite of the AI assistant
jbilcke-hf committed Aug 4, 2024
1 parent 3e3af53 commit 4cd2157
Showing 31 changed files with 6,334 additions and 5,018 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -18,9 +18,13 @@ short_description: 🎬 Clapper

🎬 Clapper is an open-source AI story visualization tool.

Clapper can interpret a screenplay and render it to storyboards, videos, voice, sound and music.
Prototyped [a year ago](https://www.loom.com/share/25b60750a32c4183b7fadc622d7c0120?sid=f1173e95-1ec8-4be2-831d-54b18e835367), Clapper is not designed to replace traditional video editors or modern AI editors using 3D scenes as input.

Please note, however, that the tool is at an early stage of development; for the moment it is not really meant to be used by "normal" people (some features don't work, there are no tutorials, etc.).
Clapper's philosophy is to let anyone create videos using AI through an interactive, iterative and intuitive process, without the need for external tools, filmmaking skills or AI engineering skills.

In Clapper you don't edit a sequence of video and audio files directly; instead, you iterate (with the help of your AI assistant) over your story using high-level abstractions such as characters, locations, weather, time period, style, etc.

To this end I am also working on a Director's Mode, where you can just put the video in fullscreen, sit comfortably in your director's chair (or couch), and shout orders at your AI set assistant to produce your movie.

# Public alpha access

9,464 changes: 4,995 additions & 4,469 deletions package-lock.json

Large diffs are not rendered by default.

28 changes: 15 additions & 13 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "clapper",
"version": "0.0.7",
"version": "0.0.8",
"private": true,
"description": "🎬 Clapper",
"license": "GPL-3.0-only",
@@ -36,24 +36,24 @@
"electron:make": "npm run build && electron-forge make"
},
"dependencies": {
"@aitube/broadway": "0.1.2",
"@aitube/broadway": "0.1.3",
"@aitube/clap": "0.1.2",
"@aitube/clapper-services": "0.1.2-14",
"@aitube/clapper-services": "0.1.5",
"@aitube/engine": "0.1.2",
"@aitube/timeline": "0.1.2-3",
"@aitube/timeline": "0.1.2-4",
"@fal-ai/serverless-client": "^0.13.0",
"@ffmpeg/ffmpeg": "^0.12.10",
"@ffmpeg/util": "^0.12.1",
"@gradio/client": "^1.3.0",
"@gradio/client": "^1.4.0",
"@huggingface/hub": "^0.15.1",
"@huggingface/inference": "^2.8.0",
"@langchain/anthropic": "^0.2.6",
"@langchain/anthropic": "^0.2.12",
"@langchain/cohere": "^0.2.1",
"@langchain/core": "^0.2.17",
"@langchain/google-vertexai": "^0.0.20",
"@langchain/core": "^0.2.20",
"@langchain/google-vertexai": "^0.0.21",
"@langchain/groq": "^0.0.15",
"@langchain/mistralai": "^0.0.26",
"@langchain/openai": "^0.2.4",
"@langchain/mistralai": "^0.0.27",
"@langchain/openai": "^0.2.5",
"@monaco-editor/react": "^4.6.0",
"@radix-ui/react-accordion": "^1.1.2",
"@radix-ui/react-avatar": "^1.0.4",
@@ -116,10 +116,12 @@
"react-hook-consent": "^3.5.3",
"react-hotkeys-hook": "^4.5.0",
"react-icons": "^5.2.1",
"react-markdown": "^9.0.1",
"react-reflex": "^4.2.6",
"react-speakup": "^1.0.0",
"replicate": "^0.31.1",
"sharp": "^0.33.4",
"remark-gfm": "^4.0.0",
"replicate": "^0.32.0",
"sharp": "0.33.4",
"sonner": "^1.5.0",
"tailwind-merge": "^2.4.0",
"tailwindcss-animate": "^1.0.7",
@@ -128,7 +130,7 @@
"use-file-picker": "^2.1.2",
"usehooks-ts": "^2.14.0",
"uuid": "^9.0.1",
"web-audio-beat-detector": "^8.2.10",
"web-audio-beat-detector": "^8.2.12",
"yaml": "^2.4.5",
"zustand": "4.5.2",
"zx": "^8.1.3"
182 changes: 120 additions & 62 deletions src/app/api/assistant/askAnyAssistant.ts
@@ -12,7 +12,6 @@ import {
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts'
import { StructuredOutputParser } from '@langchain/core/output_parsers'
import { ChatOpenAI } from '@langchain/openai'
import { ChatGroq } from '@langchain/groq'
import { ChatAnthropic } from '@langchain/anthropic'
@@ -23,17 +22,19 @@ import { ChatVertexAI } from '@langchain/google-vertexai'
// import { ChatHuggingFace } from "@langchain/huggingface"

import {
AssistantInput,
AssistantAction,
AssistantMessage,
AssistantRequest,
AssistantResponse,
AssistantSceneSegment,
AssistantStorySentence,
ComputeProvider,
} from '@aitube/clapper-services'

import { SimplifiedSegmentData, simplifiedSegmentDataZ } from './types'
import { examples, humanTemplate, systemTemplate } from './templates'

const parser = StructuredOutputParser.fromZodSchema(simplifiedSegmentDataZ)

const formatInstructions = parser.getFormatInstructions()
import { isValidNumber } from '@/lib/utils'
import { assistantMessageParser, formatInstructions } from './parser'
import { parseRawInputToAction } from '@/services/assistant/parseRawInputToAction'

/**
* Query the preferred language model on the user prompt + the segments of the current scene
@@ -61,7 +62,7 @@ export async function askAnyAssistant({
projectInfo = '',

history = [],
}: AssistantRequest): Promise<AssistantResponse> {
}: AssistantRequest): Promise<AssistantMessage> {
const provider = settings.assistantProvider

if (!provider) {
@@ -121,28 +122,53 @@
['human', humanTemplate],
])

//const storySentences: AssistantStorySentence[] = fullScene.split(/(?:. |\n)/).map(storySentence => {
//})

const storySentences: AssistantStorySentence[] = [
{
sentenceId: 0,
sentence: fullScene,
},
{
sentenceId: 1,
sentence: actionLine,
},
]

// we don't give the whole thing to the LLM so as not to confuse it,
// and also to keep things tight and performant
const inputData: SimplifiedSegmentData[] = segments.map(
(segment) =>
({
prompt: segment.prompt,
category: segment.category,
}) as SimplifiedSegmentData
)
const sceneSegments: AssistantSceneSegment[] = segments.map((segment, i) => ({
segmentId: i,
prompt: segment.prompt,
startTimeInMs: segment.startTimeInMs,
endTimeInMs: segment.endTimeInMs,
category: segment.category,
}))

// TODO put this into a type
const inputData: AssistantInput = {
directorRequest: prompt,
storySentences,
sceneSegments,
}

// console.log("INPUT:", JSON.stringify(inputData, null, 2))

const chain = chatPrompt.pipe(coerceable).pipe(parser)
const chain = chatPrompt.pipe(coerceable).pipe(assistantMessageParser)

const assistantMessage: AssistantMessage = {
comment: '',
action: AssistantAction.NONE,
updatedStorySentences: [],
updatedSceneSegments: [],
}
try {
const result = await chain.invoke({
const rawResponse = await chain.invoke({
formatInstructions,
examples,
projectInfo,
fullScene,
actionLine,
userPrompt: prompt,
inputData: JSON.stringify(inputData),
chatHistory: history.map(
({
eventId,
@@ -161,57 +187,89 @@
}
}
),
inputData: JSON.stringify(inputData),
})

// console.log("OUTPUT:", JSON.stringify(result, null, 2))
console.log(
'LLM replied this rawResponse:',
JSON.stringify(rawResponse, null, 2)
)

/*
this whole code doesn't work well actually..
// this is a fallback in case of LLM failure
if (!rawResponse) {
// complete failure
} else if (typeof rawResponse === 'string') {
assistantMessage.action = parseRawInputToAction(rawResponse)
if (assistantMessage.action === AssistantAction.NONE) {
assistantMessage.comment = rawResponse
}
} else {
assistantMessage.comment =
typeof rawResponse.comment === 'string' ? rawResponse.comment : ''

let match: SegmentData | undefined = segments[result.index] || undefined
assistantMessage.action = Object.keys(AssistantAction).includes(
`${rawResponse.action || ''}`.toUpperCase()
)
? rawResponse.action
: AssistantAction.NONE

// LLM gave an object, but the index is wrong
if (!match) {
match = segments.find(s => s.category === result.category) || undefined
}
*/

// let's create a new segment then!
const categoryName: ClapSegmentCategory =
result?.category &&
Object.keys(ClapSegmentCategory).includes(result.category.toUpperCase())
? (result.category as ClapSegmentCategory)
: ClapSegmentCategory.GENERIC

return {
prompt: result?.prompt || '',
categoryName,
llmOutput: '',
}
} catch (err1) {
// a common scenario is when the output from the LLM is just not a JSON
// this can happen quite often, for instance if the user tried to bypass
// our prompt, or if they are just asking generic questions
const errObj = err1 as any
try {
const keys = Object.keys(errObj)
if (errObj.llmOutput) {
return {
prompt: '',
categoryName: ClapSegmentCategory.GENERIC,
llmOutput: `${errObj.llmOutput || ''}`,
let i = 0
for (const segment of rawResponse.updatedSceneSegments || []) {
i++
const segmentId = isValidNumber(segment.segmentId)
? segment.segmentId!
: i

const category: ClapSegmentCategory =
segment.category &&
Object.keys(ClapSegmentCategory).includes(
segment.category.toUpperCase()
)
? (segment.category as ClapSegmentCategory)
: ClapSegmentCategory.GENERIC

const startTimeInMs: number = isValidNumber(segment.startTimeInMs)
? segment.startTimeInMs
: 0
const endTimeInMs: number = isValidNumber(segment.endTimeInMs)
? segment.endTimeInMs
: 0

const prompt = segment?.prompt || ''

// we treat a missing prompt as an error
if (prompt) {
assistantMessage.updatedSceneSegments.push({
segmentId,
prompt,
startTimeInMs,
endTimeInMs,
category,
})
}
}
} catch (err2) {
// err2 is just the error for when the LLM failed to reply
console.error(`----<${err1}>----`)
}
} catch (err) {
let errorPlainText = `${err}`
errorPlainText =
errorPlainText.split(`Error: Failed to parse. Text: "`).pop() ||
errorPlainText
errorPlainText =
errorPlainText.split(`". Error: SyntaxError`).shift() || errorPlainText

return {
prompt: '',
categoryName: ClapSegmentCategory.GENERIC,
llmOutput: '',
if (errorPlainText) {
console.log(
`result wasn't a JSON, switching to the fallback LLM response parser..`
)
assistantMessage.comment = errorPlainText
assistantMessage.action = AssistantAction.NONE
assistantMessage.updatedSceneSegments = []
assistantMessage.updatedStorySentences = []
} else {
throw new Error(
`couldn't process the request or parse the response (${err})`
)
}
}

return assistantMessage
}
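
For readers following the rewrite: below is a minimal, hypothetical sketch of how a caller might consume the `AssistantMessage` object that `askAnyAssistant` now resolves to. The `handleAssistantMessage` helper and its logging are illustrative only and not part of this commit; the `AssistantMessage` and `AssistantAction` imports mirror the types used in the diff above.

```ts
// Hypothetical consumer sketch (not part of this commit).
import { AssistantAction, AssistantMessage } from '@aitube/clapper-services'

function handleAssistantMessage(message: AssistantMessage) {
  // free-form chat reply, shown to the user as-is
  if (message.comment) {
    console.log(`assistant: ${message.comment}`)
  }

  // editor-level action requested by the assistant (NONE means "nothing to do")
  if (message.action !== AssistantAction.NONE) {
    console.log(`requested action: ${message.action}`)
  }

  // structured edits proposed for the current scene's segments
  for (const segment of message.updatedSceneSegments) {
    console.log(
      `segment #${segment.segmentId} [${segment.category}] ` +
        `${segment.startTimeInMs}..${segment.endTimeInMs} ms: ${segment.prompt}`
    )
  }

  // updated story sentences, if the assistant rewrote part of the screenplay
  for (const sentence of message.updatedStorySentences) {
    console.log(`sentence #${sentence.sentenceId}: ${sentence.sentence}`)
  }
}
```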
68 changes: 68 additions & 0 deletions src/app/api/assistant/parser.ts
@@ -0,0 +1,68 @@
import z from 'zod'
import { StructuredOutputParser } from '@langchain/core/output_parsers'
import { ClapSegmentCategory } from '@aitube/clap'
import { AssistantAction } from '@aitube/clapper-services'

export const zAssistantAction = z
.nativeEnum(AssistantAction)
.describe('The type of action to perform within the video editor.')

export const zClapSegmentCategory = z.nativeEnum(ClapSegmentCategory).describe(
`Type of the facet: is it about sound, music, visuals, etc. (the most commonly used types are ${[
ClapSegmentCategory.ACTION,
ClapSegmentCategory.DIALOGUE,
ClapSegmentCategory.WEATHER,
// ERA should be deprecated, TIME is a better and more generic approach
ClapSegmentCategory.ERA, // "early 1800", "in 1995", "during WWII", etc..
ClapSegmentCategory.TIME, // "early 1800", "in 1995", "during WWII", etc..
ClapSegmentCategory.LOCATION, // "in the alps", "in a sand desert", etc..
ClapSegmentCategory.LIGHTING, // "candle lit", "direct sunlight", "neon lights" etc
ClapSegmentCategory.CAMERA, // "medium shot", "close-up shot", etc..
ClapSegmentCategory.CHARACTER, // "male viking, in his 20s, brown hair, scar across the face, looking resolute" and so on
ClapSegmentCategory.SOUND,
ClapSegmentCategory.MUSIC,
].join(', ')})`
)

export const zAssistantSceneSegment = z.object({
segmentId: z.number().describe('unique identifier'),
prompt: z
.string()
.describe(
'Textual description of one of the facets of the current scene, such as how it looks or sounds. Each facet is temporally indexed (as segments in the movie timeline).'
),
startTimeInMs: z
.number()
.describe(
'Start position of the facet within a temporal story timeline (in millisec)'
),
endTimeInMs: z
.number()
.describe(
'Ending position of the facet within the story timeline (also in millisec)'
),
category: zClapSegmentCategory,
})

export const zAssistantStorySentence = z.object({
sentenceId: z.number().describe('unique identifier'),
sentence: z
.string()
.describe('A sentence extracted from the story plain-text.'),
})

export const zAssistantMessage = z.object({
comment: z
.string()
.describe(
'A free-form comment and chat message, allowing you to answer the user directly.'
),
action: zAssistantAction,
updatedStorySentences: z.array(zAssistantStorySentence),
updatedSceneSegments: z.array(zAssistantSceneSegment),
})

export const assistantMessageParser =
StructuredOutputParser.fromZodSchema(zAssistantMessage)

export const formatInstructions = assistantMessageParser.getFormatInstructions()
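
To make the new parser's role concrete, here is a small hypothetical usage sketch. The `demo` function and the sample payload are invented for illustration; only `parse()` and `getFormatInstructions()` are actual LangChain `StructuredOutputParser` methods, and the enum members come straight from the schema above.

```ts
// Hypothetical usage sketch of the parser defined above (not part of this commit).
import { ClapSegmentCategory } from '@aitube/clap'
import { AssistantAction } from '@aitube/clapper-services'
import { assistantMessageParser, formatInstructions } from './parser'

async function demo() {
  // formatInstructions is the text injected into the chat prompt template
  // so the LLM knows which JSON shape to produce
  console.log(formatInstructions)

  // simulate a well-formed LLM reply (a real reply comes from the model)
  const raw = JSON.stringify({
    comment: 'I dimmed the lighting of the scene.',
    action: AssistantAction.NONE,
    updatedStorySentences: [],
    updatedSceneSegments: [
      {
        segmentId: 0,
        prompt: 'dim candle light, heavy shadows',
        startTimeInMs: 0,
        endTimeInMs: 2000,
        category: ClapSegmentCategory.LIGHTING,
      },
    ],
  })

  // parse() validates the reply against zAssistantMessage and returns a typed object;
  // malformed replies throw, which is what the fallback path in askAnyAssistant handles
  const message = await assistantMessageParser.parse(raw)
  console.log(message.updatedSceneSegments[0].prompt)
}
```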