
Commit 36cf49c

Merge remote-tracking branch 'origin/main' into nsc/new-extract
2 parents 91f4fd8 + a313367 · commit 36cf49c

File tree

25 files changed: +269 −217 lines


apps/api/src/controllers/v0/crawl.ts

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) {
     await checkTeamCredits(chunk, team_id, limitCheck);
 
     if (!creditsCheckSuccess) {
-      return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
+      return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
     }
 
     // TODO: need to do this to v1

apps/api/src/controllers/v0/scrape.ts

Lines changed: 1 addition & 1 deletion
@@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) {
       earlyReturn = true;
       return res.status(500).json({
         error:
-          "Error checking team credits. Please contact hello@firecrawl.com for help.",
+          "Error checking team credits. Please contact help@firecrawl.com for help.",
       });
     }
 

apps/api/src/controllers/v1/batch-scrape.ts

Lines changed: 6 additions & 0 deletions
@@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
 import { getJobPriority } from "../../lib/job-priority";
 import { addScrapeJobs } from "../../services/queue-jobs";
+import { callWebhook } from "../../services/webhook";
 
 export async function batchScrapeController(
   req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@@ -66,6 +67,7 @@ export async function batchScrapeController(
       crawl_id: id,
       sitemapped: true,
       v1: true,
+      webhook: req.body.webhook,
     },
     opts: {
       jobId: uuidv4(),
@@ -85,6 +87,10 @@ export async function batchScrapeController(
   );
   await addScrapeJobs(jobs);
 
+  if(req.body.webhook) {
+    await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
+  }
+
   const protocol = process.env.ENV === "local" ? req.protocol : "https";
 
   return res.status(200).json({
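In short, a batch scrape request can now carry a webhook: it is stored on every queued job and, once the jobs are enqueued, callWebhook fires an immediate "batch_scrape.started" notification. A minimal client sketch of how a caller might exercise this; the endpoint path, the response shape, and any BatchScrapeRequest fields beyond webhook are assumptions here, not taken from this diff:

// Hypothetical client call; the real request schema is defined by
// BatchScrapeRequest in the codebase and is not reproduced exactly here.
const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    urls: ["https://example.com/a", "https://example.com/b"],
    // New in this commit: the webhook is forwarded to each job, and
    // callWebhook(..., "batch_scrape.started") runs right after
    // addScrapeJobs(jobs) succeeds.
    webhook: "https://my-service.example.com/hooks/firecrawl",
  }),
});
const { id } = await res.json(); // the controller returns at least a batch id (shape assumed)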

apps/api/src/controllers/v1/crawl-status-ws.ts

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
     logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
     return close(ws, 1011, {
       type: "error",
-      error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
+      error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
     });
   }
 }

apps/api/src/controllers/v1/map.ts

Lines changed: 109 additions & 135 deletions
@@ -1,11 +1,6 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import {
-  MapDocument,
-  mapRequestSchema,
-  RequestWithAuth,
-  scrapeOptions,
-} from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
 import { configDotenv } from "dotenv";
@@ -65,11 +60,13 @@ export async function getMapResults({
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
+  let mapResults: MapDocument[] = [];
 
   const sc: StoredCrawl = {
     originUrl: url,
     crawlerOptions: {
       ...crawlerOptions,
+      limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
@@ -81,105 +78,130 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
-  let urlWithoutWww = url.replace("www.", "");
-
-  let mapUrl = search && allowExternalLinks
-    ? `${search} ${urlWithoutWww}`
-    : search ? `${search} site:${urlWithoutWww}`
-    : `site:${url}`;
-
-  const resultsPerPage = 100;
-  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
-
-  const cacheKey = `fireEngineMap:${mapUrl}`;
-  const cachedResult = null;
-
-  let allResults: any[] = [];
-  let pagePromises: Promise<any>[] = [];
+  // If sitemapOnly is true, only get links from sitemap
+  if (crawlerOptions.sitemapOnly) {
+    if (includeMetadata) {
+      throw new Error("includeMetadata is not supported with sitemapOnly");
+    }
 
-  if (cachedResult) {
-    allResults = JSON.parse(cachedResult);
-  } else {
-    const fetchPage = async (page: number) => {
-      return fireEngineMap(mapUrl, {
-        numResults: resultsPerPage,
-        page: page,
+    const sitemap = await crawler.tryGetSitemap(true, true);
+    if (sitemap !== null) {
+      sitemap.forEach((x) => {
+        links.push(x.url);
       });
-    };
+      links = links.slice(1)
+        .map((x) => {
+          try {
+            return checkAndUpdateURLForMap(x).url.trim();
+          } catch (_) {
+            return null;
+          }
+        })
+        .filter((x) => x !== null) as string[];
+      // links = links.slice(1, limit); // don't slice, unnecessary
+    }
+  } else {
+    let urlWithoutWww = url.replace("www.", "");
 
-    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = await Promise.all(pagePromises);
+    let mapUrl = search && allowExternalLinks
+      ? `${search} ${urlWithoutWww}`
+      : search ? `${search} site:${urlWithoutWww}`
+      : `site:${url}`;
 
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
-  }
+    const resultsPerPage = 100;
+    const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
 
-  console.log("allResults", allResults);
-  // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    ignoreSitemap ? null : crawler.tryGetSitemap(),
-    ...(cachedResult ? [] : pagePromises),
-  ]);
+    const cacheKey = `fireEngineMap:${mapUrl}`;
+    const cachedResult = null;
 
-  if (!cachedResult) {
-    allResults = searchResults;
-  }
+    let allResults: any[] = [];
+    let pagePromises: Promise<any>[] = [];
 
-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
+    if (cachedResult) {
+      allResults = JSON.parse(cachedResult);
+    } else {
+      const fetchPage = async (page: number) => {
+        return fireEngineMap(mapUrl, {
+          numResults: resultsPerPage,
+          page: page,
+        });
+      };
+
+      pagePromises = Array.from({ length: maxPages }, (_, i) =>
+        fetchPage(i + 1)
+      );
+      allResults = await Promise.all(pagePromises);
+
+      await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    }
 
-  let mapResults : MapDocument[] = allResults
-    .flat()
-    .filter((result) => result !== null && result !== undefined);
+    // Parallelize sitemap fetch with serper search
+    const [sitemap, ...searchResults] = await Promise.all([
+      ignoreSitemap ? null : crawler.tryGetSitemap(),
+      ...(cachedResult ? [] : pagePromises),
+    ]);
 
-  const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
-  if (mapResults.length > minumumCutoff) {
-    mapResults = mapResults.slice(0, minumumCutoff);
-  }
+    if (!cachedResult) {
+      allResults = searchResults;
+    }
 
-  if (mapResults.length > 0) {
-    if (search) {
-      // Ensure all map results are first, maintaining their order
-      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
-        ...links,
-      ];
-    } else {
-      mapResults.map((x) => {
+    if (sitemap !== null) {
+      sitemap.forEach((x) => {
         links.push(x.url);
       });
     }
-  }
 
-  // Perform cosine similarity between the search query and the list of links
-  if (search) {
-    const searchQuery = search.toLowerCase();
-    links = performCosineSimilarity(links, searchQuery);
-  }
+    mapResults = allResults
+      .flat()
+      .filter((result) => result !== null && result !== undefined);
+
+    const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
+    if (mapResults.length > minumumCutoff) {
+      mapResults = mapResults.slice(0, minumumCutoff);
+    }
 
-  links = links
-    .map((x) => {
-      try {
-        return checkAndUpdateURLForMap(x).url.trim();
-      } catch (_) {
-        return null;
+    if (mapResults.length > 0) {
+      if (search) {
+        // Ensure all map results are first, maintaining their order
+        links = [
+          mapResults[0].url,
+          ...mapResults.slice(1).map((x) => x.url),
+          ...links,
+        ];
+      } else {
+        mapResults.map((x) => {
+          links.push(x.url);
+        });
       }
-    })
-    .filter((x) => x !== null) as string[];
+    }
 
-  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, url));
+    // Perform cosine similarity between the search query and the list of links
+    if (search) {
+      const searchQuery = search.toLowerCase();
+      links = performCosineSimilarity(links, searchQuery);
+    }
 
-  // if includeSubdomains is false, filter out subdomains
-  if (!includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, url));
-  }
+    links = links
+      .map((x) => {
+        try {
+          return checkAndUpdateURLForMap(x).url.trim();
+        } catch (_) {
+          return null;
+        }
+      })
+      .filter((x) => x !== null) as string[];
+
+    // allows for subdomains to be included
+    links = links.filter((x) => isSameDomain(x, url));
+
+    // if includeSubdomains is false, filter out subdomains
+    if (!includeSubdomains) {
+      links = links.filter((x) => isSameSubdomain(x, url));
+    }
 
-  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
+    // remove duplicates that could be due to http/https or www
+    links = removeDuplicateUrls(links);
+  }
 
   const linksToReturn = links.slice(0, limit);
 
@@ -241,52 +263,4 @@ export async function mapController(
   };
 
   return res.status(200).json(response);
-}
-
-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
+}
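Taken together, getMapResults now branches on crawlerOptions.sitemapOnly: the new path reads links straight from crawler.tryGetSitemap(true, true) and normalizes them with checkAndUpdateURLForMap, while the existing path keeps the fire-engine search, cosine re-ranking, and subdomain/duplicate filtering. A hedged sketch of what a caller might send to hit the new path; the endpoint path, the response shape, and whether sitemapOnly is exposed in the public request body (rather than only as an internal crawlerOptions flag) are assumptions, not confirmed by this diff:

// Hypothetical usage of the map endpoint with the new sitemap-only behavior.
const res = await fetch("https://api.firecrawl.dev/v1/map", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://example.com",
    // Assumed field: when set, the controller skips the fire-engine search
    // and returns only links discovered in the site's sitemap.
    sitemapOnly: true,
  }),
});
const body = await res.json(); // expected to contain the discovered links (exact shape assumed)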

apps/api/src/controllers/v1/scrape-status.ts

Lines changed: 5 additions & 1 deletion
@@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) {
   await rateLimiter.consume(iptoken);
 
   const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+  const allowedTeams = [
+    "41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
+    "511544f2-2fce-4183-9c59-6c29b02c69b5"
+  ];
 
-  if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+  if(!allowedTeams.includes(job?.team_id)){
     return res.status(403).json({
       success: false,
       error: "You are not allowed to access this resource.",

0 commit comments
