 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import {
-  MapDocument,
-  mapRequestSchema,
-  RequestWithAuth,
-  scrapeOptions,
-} from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
 import { configDotenv } from "dotenv";
@@ -65,11 +60,13 @@ export async function getMapResults({
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
+  let mapResults: MapDocument[] = [];
 
   const sc: StoredCrawl = {
     originUrl: url,
     crawlerOptions: {
       ...crawlerOptions,
+      limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
@@ -81,105 +78,130 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
-  let urlWithoutWww = url.replace("www.", "");
-
-  let mapUrl = search && allowExternalLinks
-    ? `${search} ${urlWithoutWww}`
-    : search ? `${search} site:${urlWithoutWww}`
-    : `site:${url}`;
-
-  const resultsPerPage = 100;
-  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
-
-  const cacheKey = `fireEngineMap:${mapUrl}`;
-  const cachedResult = null;
-
-  let allResults: any[] = [];
-  let pagePromises: Promise<any>[] = [];
+  // If sitemapOnly is true, only get links from sitemap
+  if (crawlerOptions.sitemapOnly) {
+    if (includeMetadata) {
+      throw new Error("includeMetadata is not supported with sitemapOnly");
+    }
 
-  if (cachedResult) {
-    allResults = JSON.parse(cachedResult);
-  } else {
-    const fetchPage = async (page: number) => {
-      return fireEngineMap(mapUrl, {
-        numResults: resultsPerPage,
-        page: page,
+    const sitemap = await crawler.tryGetSitemap(true, true);
+    if (sitemap !== null) {
+      sitemap.forEach((x) => {
+        links.push(x.url);
       });
-    };
+      links = links.slice(1)
+        .map((x) => {
+          try {
+            return checkAndUpdateURLForMap(x).url.trim();
+          } catch (_) {
+            return null;
+          }
+        })
+        .filter((x) => x !== null) as string[];
+      // links = links.slice(1, limit); // don't slice, unnecessary
+    }
+  } else {
+    let urlWithoutWww = url.replace("www.", "");
 
-    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = await Promise.all(pagePromises);
+    let mapUrl = search && allowExternalLinks
+      ? `${search} ${urlWithoutWww}`
+      : search ? `${search} site:${urlWithoutWww}`
+      : `site:${url}`;
 
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
-  }
+    const resultsPerPage = 100;
+    const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
 
-  console.log("allResults", allResults);
-  // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    ignoreSitemap ? null : crawler.tryGetSitemap(),
-    ...(cachedResult ? [] : pagePromises),
-  ]);
+    const cacheKey = `fireEngineMap:${mapUrl}`;
+    const cachedResult = null;
 
-  if (!cachedResult) {
-    allResults = searchResults;
-  }
+    let allResults: any[] = [];
+    let pagePromises: Promise<any>[] = [];
 
-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
+    if (cachedResult) {
+      allResults = JSON.parse(cachedResult);
+    } else {
+      const fetchPage = async (page: number) => {
+        return fireEngineMap(mapUrl, {
+          numResults: resultsPerPage,
+          page: page,
+        });
+      };
+
+      pagePromises = Array.from({ length: maxPages }, (_, i) =>
+        fetchPage(i + 1)
+      );
+      allResults = await Promise.all(pagePromises);
+
+      await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    }
 
-  let mapResults: MapDocument[] = allResults
-    .flat()
-    .filter((result) => result !== null && result !== undefined);
+    // Parallelize sitemap fetch with serper search
+    const [sitemap, ...searchResults] = await Promise.all([
+      ignoreSitemap ? null : crawler.tryGetSitemap(),
+      ...(cachedResult ? [] : pagePromises),
+    ]);
 
-  const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
-  if (mapResults.length > minumumCutoff) {
-    mapResults = mapResults.slice(0, minumumCutoff);
-  }
+    if (!cachedResult) {
+      allResults = searchResults;
+    }
 
-  if (mapResults.length > 0) {
-    if (search) {
-      // Ensure all map results are first, maintaining their order
-      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
-        ...links,
-      ];
-    } else {
-      mapResults.map((x) => {
+    if (sitemap !== null) {
+      sitemap.forEach((x) => {
         links.push(x.url);
       });
     }
-  }
 
-  // Perform cosine similarity between the search query and the list of links
-  if (search) {
-    const searchQuery = search.toLowerCase();
-    links = performCosineSimilarity(links, searchQuery);
-  }
+    mapResults = allResults
+      .flat()
+      .filter((result) => result !== null && result !== undefined);
+
+    const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
+    if (mapResults.length > minumumCutoff) {
+      mapResults = mapResults.slice(0, minumumCutoff);
+    }
 
-  links = links
-    .map((x) => {
-      try {
-        return checkAndUpdateURLForMap(x).url.trim();
-      } catch (_) {
-        return null;
+    if (mapResults.length > 0) {
+      if (search) {
+        // Ensure all map results are first, maintaining their order
+        links = [
+          mapResults[0].url,
+          ...mapResults.slice(1).map((x) => x.url),
+          ...links,
+        ];
+      } else {
+        mapResults.map((x) => {
+          links.push(x.url);
+        });
       }
-    })
-    .filter((x) => x !== null) as string[];
+    }
 
-  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, url));
+    // Perform cosine similarity between the search query and the list of links
+    if (search) {
+      const searchQuery = search.toLowerCase();
+      links = performCosineSimilarity(links, searchQuery);
+    }
 
-  // if includeSubdomains is false, filter out subdomains
-  if (!includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, url));
-  }
+    links = links
+      .map((x) => {
+        try {
+          return checkAndUpdateURLForMap(x).url.trim();
+        } catch (_) {
+          return null;
+        }
+      })
+      .filter((x) => x !== null) as string[];
+
+    // allows for subdomains to be included
+    links = links.filter((x) => isSameDomain(x, url));
+
+    // if includeSubdomains is false, filter out subdomains
+    if (!includeSubdomains) {
+      links = links.filter((x) => isSameSubdomain(x, url));
+    }
 
-  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
+    // remove duplicates that could be due to http/https or www
+    links = removeDuplicateUrls(links);
+  }
 
   const linksToReturn = links.slice(0, limit);
 
@@ -241,52 +263,4 @@ export async function mapController(
   };
 
   return res.status(200).json(response);
-}
-
-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
+}
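
For reference, a minimal caller sketch of the two paths this diff introduces. This is not part of the PR: the argument names are read off the destructured parameters visible in the hunks above (`url`, `crawlerOptions`, `limit`, `search`), `getMapResults` is assumed to be exported from this module, and any fields the real controller requires beyond these (e.g. team or plan identifiers) are omitted.

// Hypothetical usage sketch, assuming getMapResults is exported from "./map".
import { getMapResults } from "./map";

async function example() {
  // New fast path: sitemapOnly skips the fire-engine search entirely and
  // returns a MapResult whose links come only from the site's sitemap. The
  // internal crawl limit is lifted (10,000,000) so the full sitemap is
  // enumerated, and each URL is normalized via checkAndUpdateURLForMap.
  // Note that includeMetadata throws on this path.
  const sitemapResult = await getMapResults({
    url: "https://example.com",
    crawlerOptions: { sitemapOnly: true },
    limit: 5000,
  });

  // Existing path (now the else branch): fire-engine search results, cached
  // in Redis for 24 hours, merged with the sitemap unless ignoreSitemap is
  // set, then similarity-sorted against `search`, domain-filtered,
  // deduplicated, and capped at `limit`.
  const searchResult = await getMapResults({
    url: "https://example.com",
    search: "pricing",
    crawlerOptions: {},
    limit: 100,
  });
}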