Skip to content

Commit

Permalink
(feat/extract) Refactor and Reranker improvements (#1100)
Browse files Browse the repository at this point in the history
* Reapply "Nick: extract api reference"

This reverts commit 61d7ba7.

* Nick: refactor analyzer

* Nick: formatting

* Nick:

* Update extraction-service.ts

* Nick: fixes

* NIck:

* Nick: wip

* Nick: reverted to the old re-ranker

* Nick:

* Update extract-status.ts
  • Loading branch information
nickscamara authored Jan 27, 2025
1 parent ad06cde commit 6b9e65c
Show file tree
Hide file tree
Showing 13 changed files with 588 additions and 239 deletions.
2 changes: 2 additions & 0 deletions apps/api/src/controllers/v1/extract-status.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ export async function extractStatusController(
data = jobData[0].docs;
}

// console.log(extract.sources);
return res.status(200).json({
success: extract.status === "failed" ? false : true,
data: data,
Expand All @@ -38,5 +39,6 @@ export async function extractStatusController(
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
steps: extract.showSteps ? extract.steps : undefined,
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
// sources: extract.sources,
});
}
4 changes: 4 additions & 0 deletions apps/api/src/controllers/v1/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,7 @@ export interface URLTrace {
};
relevanceScore?: number;
usedInCompletion?: boolean;
extractedFields?: string[];
}

export interface ExtractResponse {
Expand All @@ -547,6 +548,9 @@ export interface ExtractResponse {
id?: string;
warning?: string;
urlTrace?: URLTrace[];
sources?: {
[key: string]: string[];
};
}

export interface ExtractResponseRequestTest {
Expand Down
156 changes: 156 additions & 0 deletions apps/api/src/lib/__tests__/mix-schemas.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1023,4 +1023,160 @@ describe("mixSchemaObjects function", () => {

expect(finalResult).toEqual(singleAnswerResult);
});

// Verifies that an empty multi-entity result leaves the single-answer result
// untouched, including its empty strings and empty arrays.
it("should handle empty objects correctly (id: 30)", async () => {
  const originalSchema = {
    type: "object",
    properties: {
      business_details: {
        type: "object",
        properties: {
          name: { type: "string" },
          years_in_operation: { type: "string" },
          services_offered: {
            type: "array",
            items: { type: "string" }
          },
          experience_highlights: { type: "string" }
        },
        required: ["name"]
      },
      management: {
        type: "object",
        properties: {
          owner_name: { type: "string" },
          credentials: {
            type: "array",
            items: { type: "string" }
          }
        }
      },
      contact_information: {
        type: "object",
        properties: {
          address: { type: "string" },
          phone: { type: "string" }
        }
      },
      reputation: {
        type: "object",
        properties: {
          client_feedback: { type: "string" },
          operational_quality: { type: "string" }
        }
      }
    },
    required: ["business_details"]
  };

  const singleAnswerResult = {
    business_details: {
      name: "Red Hill Mobility Group",
      years_in_operation: "12 years",
      services_offered: [
        "Recovery equipment for military",
        "Vehicle mobility solutions",
        "Product development for military vehicles"
      ],
      experience_highlights: "More than 12 years of combined experience overseas on over 25 active combat deployments."
    },
    management: {
      owner_name: "",
      credentials: []
    },
    contact_information: {
      address: "659 Shell Drive, Spring Lake, NC 28390",
      phone: "910-638-7836"
    },
    reputation: {
      client_feedback: "",
      operational_quality: ""
    }
  };

  // The multi-entity side is deliberately empty for this case.
  const multiEntityResult = {};

  // Pass the named fixture (previously declared but unused; a literal `{}`
  // was passed instead) so the intent of the third argument is explicit.
  const finalResult = await mixSchemaObjects(
    originalSchema,
    singleAnswerResult,
    multiEntityResult
  );

  expect(finalResult).toEqual(singleAnswerResult);
});

// Checks that mixSchemaObjects returns the single-answer result unchanged when
// the multi-entity argument carries no data.
// NOTE(review): the title says "undefined", but the third argument passed at
// the bottom of this test is an empty object literal — confirm which input
// was intended.
it("should return single answer result when multi entity is undefined", async () => {
// JSON schema describing the four top-level sections of the expected answer.
const originalSchema = {
type: "object",
properties: {
business_details: {
type: "object",
properties: {
name: { type: "string" },
years_in_operation: { type: "string" },
services_offered: {
type: "array",
items: { type: "string" }
},
experience_highlights: { type: "string" }
},
required: ["name"]
},
management: {
type: "object",
properties: {
owner_name: { type: "string" },
credentials: {
type: "array",
items: { type: "string" }
}
}
},
contact_information: {
type: "object",
properties: {
address: { type: "string" },
phone: { type: "string" }
}
},
reputation: {
type: "object",
properties: {
client_feedback: { type: "string" },
operational_quality: { type: "string" }
}
}
},
required: ["business_details"]
};

// Single-answer fixture; note the empty strings and empty array, which the
// assertion below expects to come back as-is.
const singleAnswerResult = {
business_details: {
name: "Red Hill Mobility Group",
years_in_operation: "12 years",
services_offered: [
"Recovery equipment for military",
"Vehicle mobility solutions",
"Product development for military vehicles"
],
experience_highlights: "More than 12 years of combined experience overseas on over 25 active combat deployments."
},
management: {
owner_name: "",
credentials: []
},
contact_information: {
address: "659 Shell Drive, Spring Lake, NC 28390",
phone: "910-638-7836"
},
reputation: {
client_feedback: "",
operational_quality: ""
}
};

// Third argument is `{}` here, not `undefined` (see the review note above).
const finalResult = await mixSchemaObjects(originalSchema, singleAnswerResult, {});

expect(finalResult).toEqual(singleAnswerResult);
});
});
65 changes: 64 additions & 1 deletion apps/api/src/lib/extract/build-prompts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,68 @@ to determine their relevance to the user's query and intent.
}

/**
 * Builds the user-role message for the URL reranker.
 *
 * The message asks the model to pick the links relevant to the extraction
 * request, score them between 0 and 1, and keep only those scoring 0.6+.
 *
 * @param searchQuery - The user's extraction request, embedded verbatim in quotes.
 * @returns The prompt text.
 */
export function buildRerankerUserPrompt(searchQuery: string): string {
  // Only the current prompt is returned; the superseded one-line prompt that
  // preceded it made this statement unreachable and has been dropped.
  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
}

// Multi-entity schema analyzer
/**
 * Builds the system prompt for the schema analyzer, which classifies an
 * extraction query as Single-Answer or Multi-Entity.
 *
 * @returns The fixed, newline-separated instruction text.
 */
export function buildAnalyzeSchemaPrompt(): string {
  // One array entry per line of the prompt; joined with "\n" below.
  const promptLines: string[] = [
    "You are a query classifier for a web scraping system. Classify the data extraction query as either:",
    "A) Single-Answer: One answer across a few pages, possibly containing small arrays.",
    "B) Multi-Entity: Many items across many pages, often involving large arrays.",
    "Consider:",
    "1. Answer Cardinality: Single or multiple items?",
    "2. Page Distribution: Found on 1-3 pages or many?",
    "3. Verification Needs: Cross-page verification or independent extraction?",
    "Provide:",
    "- Method: [Single-Answer/Multi-Entity]",
    "- Confidence: [0-100%]",
    "- Reasoning: Why this classification?",
    "- Key Indicators: Specific aspects leading to this decision.",
    "Examples:",
    '- "Is this company a non-profit?" -> Single-Answer',
    '- "Extract all product prices" -> Multi-Entity',
    "For Single-Answer, arrays may be present but are typically small. For Multi-Entity, if arrays have multiple items not from a single page, return keys with large arrays. If nested, return the full key (e.g., 'ecommerce.products').",
  ];
  return promptLines.join("\n");
}

/**
 * Builds the user-role message for the schema analyzer.
 *
 * @param schemaString - The target schema, already serialized to a string.
 * @param prompt - The user's extraction prompt.
 * @param urls - Candidate URLs; interpolated directly, so they appear
 *   comma-separated with no spaces.
 * @returns A four-line message: instruction, schema, prompt, URLs.
 */
export function buildAnalyzeSchemaUserPrompt(
  schemaString: string,
  prompt: string,
  urls: string[],
): string {
  const instruction =
    "Classify the query as Single-Answer or Multi-Entity. For Multi-Entity, return keys with large arrays; otherwise, return none:";
  const sections = [
    instruction,
    `Schema: ${schemaString}`,
    `Prompt: ${prompt}`,
    `Relevant URLs: ${urls}`,
  ];
  return sections.join("\n");
}

// Should Extract

/**
 * Builds the system prompt for the "should extract" relevance check, which
 * asks the model for a true/false verdict on whether content is worth
 * extracting from.
 *
 * @returns The fixed instruction text.
 */
export function buildShouldExtractSystemPrompt(): string {
  // One entry per sentence; joined with single spaces.
  const sentences = [
    "You are a content relevance checker.",
    "Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt.",
    "Return true only if the content appears relevant and contains information that could help answer the prompt.",
    "Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
  ];
  return sentences.join(" ");
}

/**
 * Builds the user-role message for the "should extract" relevance check.
 *
 * @param prompt - The user's extraction prompt, embedded in double quotes.
 * @param schema - The user's schema; serialized with JSON.stringify.
 * @returns The question followed by a line asking for only true or false.
 */
export function buildShouldExtractUserPrompt(
  prompt: string,
  schema: any,
): string {
  const schemaJson = JSON.stringify(schema);
  const question = `Should the following content be used to extract information for this prompt: "${prompt}"`;
  return `${question} User schema is: ${schemaJson}\nReturn only true or false.`;
}

// Batch extract
/**
 * Builds the system prompt for batch (multi-entity) extraction.
 *
 * @param systemPrompt - Optional caller-supplied prompt; when truthy it is
 *   placed first, followed by a newline.
 * @param multiEntitySchema - Schema serialized into the instruction text.
 * @param links - URLs the user provided, listed at the end separated by ", ".
 * @returns The assembled system prompt.
 */
export function buildBatchExtractSystemPrompt(
  systemPrompt: string,
  multiEntitySchema: any,
  links: string[],
): string {
  const preamble = systemPrompt ? `${systemPrompt}\n` : "";
  const schemaJson = JSON.stringify(multiEntitySchema);
  const linkList = links.join(", ");
  return `${preamble}Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${schemaJson}, return null. Here are the urls the user provided of which he wants to extract information from: ${linkList}`;
}

/**
 * Builds the user-role message for batch extraction by prefixing the prompt
 * with a timestamp line.
 *
 * @param prompt - The user's extraction prompt.
 * @param now - The instant to report as "today"; defaults to the current
 *   time. Pass a fixed Date to get deterministic output.
 * @returns `Today is: <ISO-8601 timestamp>` followed by a newline and the prompt.
 */
export function buildBatchExtractPrompt(
  prompt: string,
  now: Date = new Date(),
): string {
  return `Today is: ${now.toISOString()}\n${prompt}`;
}
94 changes: 94 additions & 0 deletions apps/api/src/lib/extract/completions/analyzeSchemaAndPrompt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import { generateSchemaFromPrompt } from "../../../scraper/scrapeURL/transformers/llmExtract";
import { TokenUsage } from "../../../controllers/v1/types";
import { z } from "zod";
import {
buildAnalyzeSchemaPrompt,
buildAnalyzeSchemaUserPrompt,
} from "../build-prompts";
import OpenAI from "openai";
// Module-level OpenAI client shared by this file, constructed with no explicit options.
// NOTE(review): presumably picks up its credentials from the environment — confirm.
const openai = new OpenAI();

/**
 * Asks the model whether an extraction request is single-answer or
 * multi-entity, and which schema keys hold the multi-entity data.
 *
 * When no schema is supplied, one is first generated from the prompt via
 * `generateSchemaFromPrompt`. The model's structured reply is then validated
 * with zod before being returned.
 *
 * @param urls - Candidate URLs, forwarded into the user message.
 * @param schema - The user's schema; a falsy value triggers schema generation.
 * @param prompt - The user's extraction prompt.
 * @returns The classification, its reasoning and key indicators, plus token
 *   usage for the call (zeros when the response carries no usage data).
 * @throws Whatever the zod validator's `parse` raises when the reply does not
 *   match the expected shape, including a multi-entity verdict with no keys.
 */
export async function analyzeSchemaAndPrompt(
  urls: string[],
  schema: any,
  prompt: string,
): Promise<{
  isMultiEntity: boolean;
  multiEntityKeys: string[];
  reasoning?: string;
  keyIndicators?: string[];
  tokenUsage: TokenUsage;
}> {
  // Fall back to a generated schema when the caller did not provide one.
  const effectiveSchema = schema
    ? schema
    : await generateSchemaFromPrompt(prompt);
  const serializedSchema = JSON.stringify(effectiveSchema);

  // Runtime validator for the model's reply. The refinement rejects a
  // multi-entity verdict that names no keys.
  const replyValidator = z
    .object({
      isMultiEntity: z.boolean(),
      multiEntityKeys: z.array(z.string()).optional().default([]),
      reasoning: z.string(),
      keyIndicators: z.array(z.string()),
    })
    .refine(
      (reply) => !reply.isMultiEntity || reply.multiEntityKeys.length > 0,
      "isMultiEntity was true, but no multiEntityKeys",
    );

  // JSON schema handed to the API as the structured response format.
  const replyJsonSchema = {
    type: "object",
    properties: {
      isMultiEntity: { type: "boolean" },
      multiEntityKeys: { type: "array", items: { type: "string" } },
      reasoning: { type: "string" },
      keyIndicators: { type: "array", items: { type: "string" } },
    },
    required: [
      "isMultiEntity",
      "multiEntityKeys",
      "reasoning",
      "keyIndicators",
    ],
    additionalProperties: false,
  };

  const modelName = "gpt-4o";

  const completion = await openai.beta.chat.completions.parse({
    model: modelName,
    messages: [
      { role: "system", content: buildAnalyzeSchemaPrompt() },
      {
        role: "user",
        content: buildAnalyzeSchemaUserPrompt(serializedSchema, prompt, urls),
      },
    ],
    response_format: {
      type: "json_schema",
      json_schema: {
        schema: replyJsonSchema,
        name: "checkSchema",
      },
    },
  });

  // Only the first choice is inspected.
  const verdict = replyValidator.parse(completion.choices[0].message.parsed);

  const usage = completion.usage;
  const tokenUsage: TokenUsage = {
    promptTokens: usage?.prompt_tokens ?? 0,
    completionTokens: usage?.completion_tokens ?? 0,
    totalTokens: usage?.total_tokens ?? 0,
    model: modelName,
  };

  return {
    isMultiEntity: verdict.isMultiEntity,
    multiEntityKeys: verdict.multiEntityKeys,
    reasoning: verdict.reasoning,
    keyIndicators: verdict.keyIndicators,
    tokenUsage,
  };
}
Loading

0 comments on commit 6b9e65c

Please sign in to comment.