Skip to content

Commit 385dc0d

Browse files
committed
feat(runWebScraper): retry a scrape max 3 times in a crawl if the status code indicates failure
1 parent 6b17a53 commit 385dc0d

File tree

7 files changed

+108
-72
lines changed

7 files changed

+108
-72
lines changed

apps/api/logview.js

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,19 @@
11
const fs = require("fs");
22

3-
const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
4-
.split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
3+
// METHOD: Winston log file
4+
// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
5+
// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
6+
7+
// METHOD: GCloud export
8+
const logs = [
9+
"downloaded-logs-20241213-225607.json",
10+
"downloaded-logs-20241213-225654.json",
11+
"downloaded-logs-20241213-225720.json",
12+
"downloaded-logs-20241213-225758.json",
13+
"downloaded-logs-20241213-225825.json",
14+
"downloaded-logs-20241213-225843.json",
15+
].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);
16+
517

618
const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
719

apps/api/src/controllers/v0/scrape.ts

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
88
import { PlanType, RateLimiterMode } from "../../types";
99
import { logJob } from "../../services/logging/log_job";
1010
import {
11-
Document,
1211
fromLegacyCombo,
1312
toLegacyDocument,
1413
url as urlSchema,
@@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node";
2928
import { getJobPriority } from "../../lib/job-priority";
3029
import { fromLegacyScrapeOptions } from "../v1/types";
3130
import { ZodError } from "zod";
31+
import { Document as V0Document } from "./../../lib/entities";
3232

3333
export async function scrapeHelper(
3434
jobId: string,
@@ -42,7 +42,7 @@ export async function scrapeHelper(
4242
): Promise<{
4343
success: boolean;
4444
error?: string;
45-
data?: Document | { url: string };
45+
data?: V0Document | { url: string };
4646
returnCode: number;
4747
}> {
4848
const url = urlSchema.parse(req.body.url);
@@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
241241
const endTime = new Date().getTime();
242242
const timeTakenInSeconds = (endTime - startTime) / 1000;
243243
const numTokens =
244-
result.data && (result.data as Document).markdown
244+
result.data && (result.data as V0Document).markdown
245245
? numTokensFromString(
246-
(result.data as Document).markdown!,
246+
(result.data as V0Document).markdown!,
247247
"gpt-3.5-turbo",
248248
)
249249
: 0;
@@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) {
276276

277277
let doc = result.data;
278278
if (!pageOptions || !pageOptions.includeRawHtml) {
279-
if (doc && (doc as Document).rawHtml) {
280-
delete (doc as Document).rawHtml;
279+
if (doc && (doc as V0Document).rawHtml) {
280+
delete (doc as V0Document).rawHtml;
281281
}
282282
}
283283

284284
if (pageOptions && pageOptions.includeExtract) {
285-
if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
286-
delete (doc as Document).markdown;
285+
if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
286+
delete (doc as V0Document).markdown;
287287
}
288288
}
289289

apps/api/src/controllers/v1/extract.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import { Request, Response } from "express";
22
import {
3-
// Document,
3+
Document,
44
RequestWithAuth,
55
ExtractRequest,
66
extractRequestSchema,
77
ExtractResponse,
88
MapDocument,
99
scrapeOptions,
1010
} from "./types";
11-
import { Document } from "../../lib/entities";
11+
// import { Document } from "../../lib/entities";
1212
import Redis from "ioredis";
1313
import { configDotenv } from "dotenv";
1414
import { performRanking } from "../../lib/ranker";

apps/api/src/controllers/v1/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ export type Document = {
396396
articleSection?: string;
397397
url?: string;
398398
sourceURL?: string;
399-
statusCode?: number;
399+
statusCode: number;
400400
error?: string;
401401
[key: string]: string | string[] | number | undefined;
402402
};

apps/api/src/main/runWebScraper.ts

Lines changed: 81 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
4949
bull_job_id: job.id.toString(),
5050
priority: job.opts.priority,
5151
is_scrape: job.data.is_scrape ?? false,
52+
is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
5253
});
5354
}
5455

@@ -63,73 +64,63 @@ export async function runWebScraper({
6364
bull_job_id,
6465
priority,
6566
is_scrape = false,
67+
is_crawl = false,
6668
}: RunWebScraperParams): Promise<ScrapeUrlResponse> {
69+
const tries = is_crawl ? 3 : 1;
70+
6771
let response: ScrapeUrlResponse | undefined = undefined;
6872
let engines: EngineResultsTracker = {};
69-
try {
70-
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
71-
priority,
72-
...internalOptions,
73-
});
74-
if (!response.success) {
75-
if (response.error instanceof Error) {
76-
throw response.error;
77-
} else {
78-
throw new Error(
79-
"scrapeURL error: " +
80-
(Array.isArray(response.error)
81-
? JSON.stringify(response.error)
82-
: typeof response.error === "object"
83-
? JSON.stringify({ ...response.error })
84-
: response.error),
85-
);
86-
}
87-
}
73+
let error: any = undefined;
8874

89-
if (is_scrape === false) {
90-
let creditsToBeBilled = 1; // Assuming 1 credit per document
91-
if (scrapeOptions.extract) {
92-
creditsToBeBilled = 5;
93-
}
94-
95-
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
96-
logger.error(
97-
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
98-
);
99-
// Optionally, you could notify an admin or add to a retry queue here
100-
});
75+
for (let i = 0; i < tries; i++) {
76+
if (i > 0) {
77+
logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error });
10178
}
10279

103-
// This is where the return value from the job is set
104-
// onSuccess(response.document, mode);
80+
response = undefined;
81+
engines = {};
82+
error = undefined;
10583

106-
engines = response.engines;
107-
return response;
108-
} catch (error) {
109-
engines =
110-
response !== undefined
111-
? response.engines
112-
: typeof error === "object" && error !== null
113-
? ((error as any).results ?? {})
114-
: {};
84+
try {
85+
response = await scrapeURL(bull_job_id, url, scrapeOptions, {
86+
priority,
87+
...internalOptions,
88+
});
89+
if (!response.success) {
90+
if (response.error instanceof Error) {
91+
throw response.error;
92+
} else {
93+
throw new Error(
94+
"scrapeURL error: " +
95+
(Array.isArray(response.error)
96+
? JSON.stringify(response.error)
97+
: typeof response.error === "object"
98+
? JSON.stringify({ ...response.error })
99+
: response.error),
100+
);
101+
}
102+
}
103+
104+
// This is where the return value from the job is set
105+
// onSuccess(response.document, mode);
106+
107+
engines = response.engines;
115108

116-
if (response !== undefined) {
117-
return {
118-
...response,
119-
success: false,
120-
error,
121-
};
122-
} else {
123-
return {
124-
success: false,
125-
error,
126-
logs: ["no logs -- error coming from runWebScraper"],
127-
engines,
128-
};
109+
if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) {
110+
// status code is good -- do not attempt retry
111+
break;
112+
}
113+
} catch (error) {
114+
engines =
115+
response !== undefined
116+
? response.engines
117+
: typeof error === "object" && error !== null
118+
? ((error as any).results ?? {})
119+
: {};
129120
}
130-
// onError(error);
131-
} finally {
132-
const engineOrder = Object.entries(engines)
121+
}
122+
123+
const engineOrder = Object.entries(engines)
133124
.sort((a, b) => a[1].startedAt - b[1].startedAt)
134125
.map((x) => x[0]) as Engine[];
135126

@@ -158,6 +149,38 @@ export async function runWebScraper({
158149
},
159150
});
160151
}
152+
153+
if (error === undefined && response?.success) {
154+
if (is_scrape === false) {
155+
let creditsToBeBilled = 1; // Assuming 1 credit per document
156+
if (scrapeOptions.extract) {
157+
creditsToBeBilled = 5;
158+
}
159+
160+
billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
161+
logger.error(
162+
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
163+
);
164+
// Optionally, you could notify an admin or add to a retry queue here
165+
});
166+
}
167+
168+
return response;
169+
} else {
170+
if (response !== undefined) {
171+
return {
172+
...response,
173+
success: false,
174+
error,
175+
};
176+
} else {
177+
return {
178+
success: false,
179+
error,
180+
logs: ["no logs -- error coming from runWebScraper"],
181+
engines,
182+
};
183+
}
161184
}
162185
}
163186

apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { Meta } from "..";
55
export function extractMetadata(
66
meta: Meta,
77
html: string,
8-
): Document["metadata"] {
8+
): Partial<Document["metadata"]> {
99
let title: string | undefined = undefined;
1010
let description: string | undefined = undefined;
1111
let language: string | undefined = undefined;

apps/api/src/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export interface RunWebScraperParams {
5555
bull_job_id: string;
5656
priority?: number;
5757
is_scrape?: boolean;
58+
is_crawl?: boolean;
5859
}
5960

6061
export type RunWebScraperResult =

0 commit comments

Comments (0)