-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(feat/extract) Refactor and Reranker improvements (#1100)
* Reapply "Nick: extract api reference" This reverts commit 61d7ba7. * Nick: refactor analyzer * Nick: formatting * Nick: * Update extraction-service.ts * Nick: fixes * Nick: * Nick: wip * Nick: reverted to the old re-ranker * Nick: * Update extract-status.ts
- Loading branch information
1 parent
ad06cde
commit 6b9e65c
Showing
13 changed files
with
588 additions
and
239 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import { generateSchemaFromPrompt } from "../../../scraper/scrapeURL/transformers/llmExtract"; | ||
import { TokenUsage } from "../../../controllers/v1/types"; | ||
import { z } from "zod"; | ||
import { | ||
buildAnalyzeSchemaPrompt, | ||
buildAnalyzeSchemaUserPrompt, | ||
} from "../build-prompts"; | ||
import OpenAI from "openai"; | ||
// Module-level OpenAI client, constructed with no explicit options.
const openai = new OpenAI(); | ||
|
||
/**
 * Sends the schema, the prompt and the URLs to the `gpt-4o` chat model and
 * returns the structured answer it produced together with token usage.
 *
 * @param urls - URLs forwarded to `buildAnalyzeSchemaUserPrompt`.
 * @param schema - Schema to analyze; when falsy, one is generated from
 *   `prompt` via `generateSchemaFromPrompt`.
 * @param prompt - Prompt text, used for schema generation (if needed) and for
 *   building the user message.
 * @returns The validated `isMultiEntity`, `multiEntityKeys`, `reasoning` and
 *   `keyIndicators` fields, plus the token usage reported for the request
 *   (each count falls back to 0 when the response carries no usage value).
 * @throws If the parsed model output fails validation, including the case
 *   where `isMultiEntity` is true but `multiEntityKeys` is empty.
 */
export async function analyzeSchemaAndPrompt(
  urls: string[],
  schema: any,
  prompt: string,
): Promise<{
  isMultiEntity: boolean;
  multiEntityKeys: string[];
  reasoning?: string;
  keyIndicators?: string[];
  tokenUsage: TokenUsage;
}> {
  const modelName = "gpt-4o";

  // Fall back to a prompt-derived schema when the caller did not supply one.
  const effectiveSchema = schema
    ? schema
    : await generateSchemaFromPrompt(prompt);
  const serializedSchema = JSON.stringify(effectiveSchema);

  // Runtime validator for the model's reply; the refinement rejects a
  // multi-entity verdict that names no keys.
  const replyValidator = z
    .object({
      isMultiEntity: z.boolean(),
      multiEntityKeys: z.array(z.string()).optional().default([]),
      reasoning: z.string(),
      keyIndicators: z.array(z.string()),
    })
    .refine(
      (candidate) =>
        candidate.isMultiEntity ? candidate.multiEntityKeys.length > 0 : true,
      "isMultiEntity was true, but no multiEntityKeys",
    );

  // JSON Schema sent to the API as the requested response format.
  const stringList = { type: "array", items: { type: "string" } };
  const replyJsonSchema = {
    type: "object",
    properties: {
      isMultiEntity: { type: "boolean" },
      multiEntityKeys: stringList,
      reasoning: { type: "string" },
      keyIndicators: stringList,
    },
    required: ["isMultiEntity", "multiEntityKeys", "reasoning", "keyIndicators"],
    additionalProperties: false,
  };

  const systemMessage = buildAnalyzeSchemaPrompt();
  const userMessage = buildAnalyzeSchemaUserPrompt(
    serializedSchema,
    prompt,
    urls,
  );

  const completion = await openai.beta.chat.completions.parse({
    model: modelName,
    messages: [
      { role: "system", content: systemMessage },
      { role: "user", content: userMessage },
    ],
    response_format: {
      type: "json_schema",
      json_schema: {
        schema: replyJsonSchema,
        name: "checkSchema",
      },
    },
  });

  // Only the first choice is inspected.
  const verdict = replyValidator.parse(completion.choices[0].message.parsed);

  const usage = completion.usage;
  const tokenUsage: TokenUsage = {
    promptTokens: usage?.prompt_tokens ?? 0,
    completionTokens: usage?.completion_tokens ?? 0,
    totalTokens: usage?.total_tokens ?? 0,
    model: modelName,
  };

  return {
    isMultiEntity: verdict.isMultiEntity,
    multiEntityKeys: verdict.multiEntityKeys,
    reasoning: verdict.reasoning,
    keyIndicators: verdict.keyIndicators,
    tokenUsage,
  };
}
Oops, something went wrong.