big rewrite of the AI assistant
jbilcke-hf committed Aug 4, 2024
1 parent 3e3af53 commit 4cd2157
Showing 31 changed files with 6,334 additions and 5,018 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -18,9 +18,13 @@ short_description: 🎬 Clapper

🎬 Clapper is an open-source AI story visualization tool.

Clapper can interpret a screenplay and render it to storyboards, videos, voice, sound and music.
Prototyped [a year ago](https://www.loom.com/share/25b60750a32c4183b7fadc622d7c0120?sid=f1173e95-1ec8-4be2-831d-54b18e835367), Clapper is not designed to replace traditional video editors or modern AI editors using 3D scenes as input.

Please note, however, that the tool is at an early stage of development; for the moment it is not really meant to be used by "normal" people (some features don't work, there are no tutorials, etc.).
Clapper's philosophy is to let anyone create videos using AI through an interactive, iterative and intuitive process, without the need for external tools, filmmaking skills or AI engineering skills.

In Clapper you don't edit a sequence of video and audio files directly; instead, you iterate (with the help of your AI assistant) over your story using high-level abstractions such as characters, locations, weather, time period, style, etc.

To this end I am also working on a Director's Mode, where you can just put the video in fullscreen, sit comfortably in your director's chair (or couch), and shout orders at your AI set assistant to produce your movie.

# Public alpha access

9,464 changes: 4,995 additions & 4,469 deletions package-lock.json

Large diffs are not rendered by default.

28 changes: 15 additions & 13 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "clapper",
"version": "0.0.7",
"version": "0.0.8",
"private": true,
"description": "🎬 Clapper",
"license": "GPL-3.0-only",
@@ -36,24 +36,24 @@
"electron:make": "npm run build && electron-forge make"
},
"dependencies": {
"@aitube/broadway": "0.1.2",
"@aitube/broadway": "0.1.3",
"@aitube/clap": "0.1.2",
"@aitube/clapper-services": "0.1.2-14",
"@aitube/clapper-services": "0.1.5",
"@aitube/engine": "0.1.2",
"@aitube/timeline": "0.1.2-3",
"@aitube/timeline": "0.1.2-4",
"@fal-ai/serverless-client": "^0.13.0",
"@ffmpeg/ffmpeg": "^0.12.10",
"@ffmpeg/util": "^0.12.1",
"@gradio/client": "^1.3.0",
"@gradio/client": "^1.4.0",
"@huggingface/hub": "^0.15.1",
"@huggingface/inference": "^2.8.0",
"@langchain/anthropic": "^0.2.6",
"@langchain/anthropic": "^0.2.12",
"@langchain/cohere": "^0.2.1",
"@langchain/core": "^0.2.17",
"@langchain/google-vertexai": "^0.0.20",
"@langchain/core": "^0.2.20",
"@langchain/google-vertexai": "^0.0.21",
"@langchain/groq": "^0.0.15",
"@langchain/mistralai": "^0.0.26",
"@langchain/openai": "^0.2.4",
"@langchain/mistralai": "^0.0.27",
"@langchain/openai": "^0.2.5",
"@monaco-editor/react": "^4.6.0",
"@radix-ui/react-accordion": "^1.1.2",
"@radix-ui/react-avatar": "^1.0.4",
@@ -116,10 +116,12 @@
"react-hook-consent": "^3.5.3",
"react-hotkeys-hook": "^4.5.0",
"react-icons": "^5.2.1",
"react-markdown": "^9.0.1",
"react-reflex": "^4.2.6",
"react-speakup": "^1.0.0",
"replicate": "^0.31.1",
"sharp": "^0.33.4",
"remark-gfm": "^4.0.0",
"replicate": "^0.32.0",
"sharp": "0.33.4",
"sonner": "^1.5.0",
"tailwind-merge": "^2.4.0",
"tailwindcss-animate": "^1.0.7",
@@ -128,7 +130,7 @@
"use-file-picker": "^2.1.2",
"usehooks-ts": "^2.14.0",
"uuid": "^9.0.1",
"web-audio-beat-detector": "^8.2.10",
"web-audio-beat-detector": "^8.2.12",
"yaml": "^2.4.5",
"zustand": "4.5.2",
"zx": "^8.1.3"
182 changes: 120 additions & 62 deletions src/app/api/assistant/askAnyAssistant.ts
@@ -12,7 +12,6 @@ import {
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts'
import { StructuredOutputParser } from '@langchain/core/output_parsers'
import { ChatOpenAI } from '@langchain/openai'
import { ChatGroq } from '@langchain/groq'
import { ChatAnthropic } from '@langchain/anthropic'
@@ -23,17 +22,19 @@ import { ChatVertexAI } from '@langchain/google-vertexai'
// import { ChatHuggingFace } from "@langchain/huggingface"

import {
AssistantInput,
AssistantAction,
AssistantMessage,
AssistantRequest,
AssistantResponse,
AssistantSceneSegment,
AssistantStorySentence,
ComputeProvider,
} from '@aitube/clapper-services'

import { SimplifiedSegmentData, simplifiedSegmentDataZ } from './types'
import { examples, humanTemplate, systemTemplate } from './templates'

const parser = StructuredOutputParser.fromZodSchema(simplifiedSegmentDataZ)

const formatInstructions = parser.getFormatInstructions()
import { isValidNumber } from '@/lib/utils'
import { assistantMessageParser, formatInstructions } from './parser'
import { parseRawInputToAction } from '@/services/assistant/parseRawInputToAction'

/**
* Query the preferred language model on the user prompt + the segments of the current scene
@@ -61,7 +62,7 @@ export async function askAnyAssistant({
projectInfo = '',

history = [],
}: AssistantRequest): Promise<AssistantResponse> {
}: AssistantRequest): Promise<AssistantMessage> {
const provider = settings.assistantProvider

if (!provider) {
@@ -121,28 +122,53 @@
['human', humanTemplate],
])

//const storySentences: AssistantStorySentence[] = fullScene.split(/(?:. |\n)/).map(storySentence => {
//})

const storySentences: AssistantStorySentence[] = [
{
sentenceId: 0,
sentence: fullScene,
},
{
sentenceId: 1,
sentence: actionLine,
},
]

// we don't give the whole thing to the LLM so as not to confuse it,
// and also to keep things tight and performant
const inputData: SimplifiedSegmentData[] = segments.map(
(segment) =>
({
prompt: segment.prompt,
category: segment.category,
}) as SimplifiedSegmentData
)
const sceneSegments: AssistantSceneSegment[] = segments.map((segment, i) => ({
segmentId: i,
prompt: segment.prompt,
startTimeInMs: segment.startTimeInMs,
endTimeInMs: segment.endTimeInMs,
category: segment.category,
}))

// TODO put this into a type
const inputData: AssistantInput = {
directorRequest: prompt,
storySentences,
sceneSegments,
}

// console.log("INPUT:", JSON.stringify(inputData, null, 2))

const chain = chatPrompt.pipe(coerceable).pipe(parser)
const chain = chatPrompt.pipe(coerceable).pipe(assistantMessageParser)

const assistantMessage: AssistantMessage = {
comment: '',
action: AssistantAction.NONE,
updatedStorySentences: [],
updatedSceneSegments: [],
}
try {
const result = await chain.invoke({
const rawResponse = await chain.invoke({
formatInstructions,
examples,
projectInfo,
fullScene,
actionLine,
userPrompt: prompt,
inputData: JSON.stringify(inputData),
chatHistory: history.map(
({
eventId,
@@ -161,57 +187,89 @@
}
}
),
inputData: JSON.stringify(inputData),
})

// console.log("OUTPUT:", JSON.stringify(result, null, 2))
console.log(
'LLM replied this rawResponse:',
JSON.stringify(rawResponse, null, 2)
)

/*
this whole code doesn't work well actually..
// this is a fallback in case of LLM failure
if (!rawResponse) {
// complete failure
} else if (typeof rawResponse === 'string') {
assistantMessage.action = parseRawInputToAction(rawResponse)
if (assistantMessage.action === AssistantAction.NONE) {
assistantMessage.comment = rawResponse
}
} else {
assistantMessage.comment =
typeof rawResponse.comment === 'string' ? rawResponse.comment : ''

let match: SegmentData | undefined = segments[result.index] || undefined
assistantMessage.action = Object.keys(AssistantAction).includes(
`${rawResponse.action || ''}`.toUpperCase()
)
? rawResponse.action
: AssistantAction.NONE

// LLM gave an object, but the index is wrong
if (!match) {
match = segments.find(s => s.category === result.category) || undefined
}
*/

// let's create a new segment then!
const categoryName: ClapSegmentCategory =
result?.category &&
Object.keys(ClapSegmentCategory).includes(result.category.toUpperCase())
? (result.category as ClapSegmentCategory)
: ClapSegmentCategory.GENERIC

return {
prompt: result?.prompt || '',
categoryName,
llmOutput: '',
}
} catch (err1) {
// a common scenario is when the output from the LLM is just not a JSON
// this can happen quite often, for instance if the user tried to bypass
// our prompt, or if they are just asking generic questions
const errObj = err1 as any
try {
const keys = Object.keys(errObj)
if (errObj.llmOutput) {
return {
prompt: '',
categoryName: ClapSegmentCategory.GENERIC,
llmOutput: `${errObj.llmOutput || ''}`,
let i = 0
for (const segment of rawResponse.updatedSceneSegments || []) {
i++
const segmentId = isValidNumber(segment.segmentId)
? segment.segmentId!
: i

const category: ClapSegmentCategory =
segment.category &&
Object.keys(ClapSegmentCategory).includes(
segment.category.toUpperCase()
)
? (segment.category as ClapSegmentCategory)
: ClapSegmentCategory.GENERIC

const startTimeInMs: number = isValidNumber(segment.startTimeInMs)
? segment.startTimeInMs
: 0
const endTimeInMs: number = isValidNumber(segment.endTimeInMs)
? segment.endTimeInMs
: 0

const prompt = segment?.prompt || ''

// we treat a missing prompt as an error
if (prompt) {
assistantMessage.updatedSceneSegments.push({
segmentId,
prompt,
startTimeInMs,
endTimeInMs,
category,
})
}
}
} catch (err2) {
// err2 is just the error for when the LLM failed to reply
console.error(`----<${err1}>----`)
}
} catch (err) {
let errorPlainText = `${err}`
errorPlainText =
errorPlainText.split(`Error: Failed to parse. Text: "`).pop() ||
errorPlainText
errorPlainText =
errorPlainText.split(`". Error: SyntaxError`).shift() || errorPlainText

return {
prompt: '',
categoryName: ClapSegmentCategory.GENERIC,
llmOutput: '',
if (errorPlainText) {
console.log(
`result wasn't a JSON, switching to the fallback LLM response parser..`
)
assistantMessage.comment = errorPlainText
assistantMessage.action = AssistantAction.NONE
assistantMessage.updatedSceneSegments = []
assistantMessage.updatedStorySentences = []
} else {
throw new Error(
`couldn't process the request or parse the response (${err})`
)
}
}

return assistantMessage
}
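
For readers following the rewrite: below is a minimal, hypothetical sketch of how a caller might consume the `AssistantMessage` object that `askAnyAssistant` now resolves to. The `handleAssistantMessage` helper and its logging are illustrative only and not part of this commit; the `AssistantMessage` and `AssistantAction` imports mirror the types used in the diff above.

```ts
// Hypothetical consumer sketch (not part of this commit).
import { AssistantAction, AssistantMessage } from '@aitube/clapper-services'

function handleAssistantMessage(message: AssistantMessage) {
  // free-form chat reply, shown to the user as-is
  if (message.comment) {
    console.log(`assistant: ${message.comment}`)
  }

  // editor-level action requested by the assistant (NONE means "nothing to do")
  if (message.action !== AssistantAction.NONE) {
    console.log(`requested action: ${message.action}`)
  }

  // structured edits proposed for the current scene's segments
  for (const segment of message.updatedSceneSegments) {
    console.log(
      `segment #${segment.segmentId} [${segment.category}] ` +
        `${segment.startTimeInMs}..${segment.endTimeInMs} ms: ${segment.prompt}`
    )
  }

  // updated story sentences, if the assistant rewrote part of the screenplay
  for (const sentence of message.updatedStorySentences) {
    console.log(`sentence #${sentence.sentenceId}: ${sentence.sentence}`)
  }
}
```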
68 changes: 68 additions & 0 deletions src/app/api/assistant/parser.ts
@@ -0,0 +1,68 @@
import z from 'zod'
import { StructuredOutputParser } from '@langchain/core/output_parsers'
import { ClapSegmentCategory } from '@aitube/clap'
import { AssistantAction } from '@aitube/clapper-services'

export const zAssistantAction = z
.nativeEnum(AssistantAction)
.describe('The type of action to perform within the video editor.')

export const zClapSegmentCategory = z.nativeEnum(ClapSegmentCategory).describe(
`Type of the facet: is it about sound, music, visuals, etc. (the most commonly used types are ${[
ClapSegmentCategory.ACTION,
ClapSegmentCategory.DIALOGUE,
ClapSegmentCategory.WEATHER,
// ERA should be deprecated, TIME is a better and more generic approach
ClapSegmentCategory.ERA, // "early 1800", "in 1995", "during WWII", etc..
ClapSegmentCategory.TIME, // "early 1800", "in 1995", "during WWII", etc..
ClapSegmentCategory.LOCATION, // "in the alps", "in a sand desert", etc..
ClapSegmentCategory.LIGHTING, // "candle lit", "direct sunlight", "neon lights" etc
ClapSegmentCategory.CAMERA, // "medium shot", "close-up shot", etc..
ClapSegmentCategory.CHARACTER, // "male viking, in his 20s, brown hair, scar across the face, looking resolute" and so on
ClapSegmentCategory.SOUND,
ClapSegmentCategory.MUSIC,
].join(', ')})`
)

export const zAssistantSceneSegment = z.object({
segmentId: z.number().describe('unique identifier'),
prompt: z
.string()
.describe(
'Textual description of one of the facets of the current scene, such as how it looks or sounds. Each facet is temporally indexed (as segments in the movie timeline).'
),
startTimeInMs: z
.number()
.describe(
'Start position of the facet within a temporal story timeline (in millisec)'
),
endTimeInMs: z
.number()
.describe(
'Ending position of the facet within the story timeline (also in millisec)'
),
category: zClapSegmentCategory,
})

export const zAssistantStorySentence = z.object({
sentenceId: z.number().describe('unique identifier'),
sentence: z
.string()
.describe('A sentence extracted from the story plain-text.'),
})

export const zAssistantMessage = z.object({
comment: z
.string()
.describe(
'A free-form comment and chat message, allowing you to answer the user directly.'
),
action: zAssistantAction,
updatedStorySentences: z.array(zAssistantStorySentence),
updatedSceneSegments: z.array(zAssistantSceneSegment),
})

export const assistantMessageParser =
StructuredOutputParser.fromZodSchema(zAssistantMessage)

export const formatInstructions = assistantMessageParser.getFormatInstructions()
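
To make the new parser's role concrete, here is a small hypothetical usage sketch. The `demo` function and the sample payload are invented for illustration; only `parse()` and `getFormatInstructions()` are actual LangChain `StructuredOutputParser` methods, and the enum members come straight from the schema above.

```ts
// Hypothetical usage sketch of the parser defined above (not part of this commit).
import { ClapSegmentCategory } from '@aitube/clap'
import { AssistantAction } from '@aitube/clapper-services'
import { assistantMessageParser, formatInstructions } from './parser'

async function demo() {
  // formatInstructions is the text injected into the chat prompt template
  // so the LLM knows which JSON shape to produce
  console.log(formatInstructions)

  // simulate a well-formed LLM reply (a real reply comes from the model)
  const raw = JSON.stringify({
    comment: 'I dimmed the lighting of the scene.',
    action: AssistantAction.NONE,
    updatedStorySentences: [],
    updatedSceneSegments: [
      {
        segmentId: 0,
        prompt: 'dim candle light, heavy shadows',
        startTimeInMs: 0,
        endTimeInMs: 2000,
        category: ClapSegmentCategory.LIGHTING,
      },
    ],
  })

  // parse() validates the reply against zAssistantMessage and returns a typed object;
  // malformed replies throw, which is what the fallback path in askAnyAssistant handles
  const message = await assistantMessageParser.parse(raw)
  console.log(message.updatedSceneSegments[0].prompt)
}
```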