 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import {
-  MapDocument,
-  mapRequestSchema,
-  RequestWithAuth,
-  scrapeOptions,
-} from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
 import { configDotenv } from "dotenv";
@@ -65,11 +60,13 @@ export async function getMapResults({
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
+  let mapResults: MapDocument[] = [];
 
   const sc: StoredCrawl = {
     originUrl: url,
     crawlerOptions: {
       ...crawlerOptions,
+      limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
@@ -81,105 +78,130 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
-  let urlWithoutWww = url.replace("www.", "");
-
-  let mapUrl = search && allowExternalLinks
-    ? `${search} ${urlWithoutWww}`
-    : search ? `${search} site:${urlWithoutWww}`
-    : `site:${url}`;
-
-  const resultsPerPage = 100;
-  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
-
-  const cacheKey = `fireEngineMap:${mapUrl}`;
-  const cachedResult = null;
-
-  let allResults: any[] = [];
-  let pagePromises: Promise<any>[] = [];
+  // If sitemapOnly is true, only get links from sitemap
+  if (crawlerOptions.sitemapOnly) {
+    if (includeMetadata) {
+      throw new Error("includeMetadata is not supported with sitemapOnly");
+    }
 
-  if (cachedResult) {
-    allResults = JSON.parse(cachedResult);
-  } else {
-    const fetchPage = async (page: number) => {
-      return fireEngineMap(mapUrl, {
-        numResults: resultsPerPage,
-        page: page,
+    const sitemap = await crawler.tryGetSitemap(true, true);
+    if (sitemap !== null) {
+      sitemap.forEach((x) => {
+        links.push(x.url);
       });
-    };
+      links = links.slice(1)
+        .map((x) => {
+          try {
+            return checkAndUpdateURLForMap(x).url.trim();
+          } catch (_) {
+            return null;
+          }
+        })
+        .filter((x) => x !== null) as string[];
+      // links = links.slice(1, limit); // don't slice, unnecessary
+    }
+  } else {
+    let urlWithoutWww = url.replace("www.", "");
 
-    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = await Promise.all(pagePromises);
+    let mapUrl = search && allowExternalLinks
+      ? `${search} ${urlWithoutWww}`
+      : search ? `${search} site:${urlWithoutWww}`
+      : `site:${url}`;
 
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
-  }
+    const resultsPerPage = 100;
+    const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
 
-  console.log("allResults", allResults);
-  // Parallelize sitemap fetch with serper search
-  const [sitemap, ...searchResults] = await Promise.all([
-    ignoreSitemap ? null : crawler.tryGetSitemap(),
-    ...(cachedResult ? [] : pagePromises),
-  ]);
+    const cacheKey = `fireEngineMap:${mapUrl}`;
+    const cachedResult = null;
 
-  if (!cachedResult) {
-    allResults = searchResults;
-  }
+    let allResults: any[] = [];
+    let pagePromises: Promise<any>[] = [];
 
-  if (sitemap !== null) {
-    sitemap.forEach((x) => {
-      links.push(x.url);
-    });
-  }
+    if (cachedResult) {
+      allResults = JSON.parse(cachedResult);
+    } else {
+      const fetchPage = async (page: number) => {
+        return fireEngineMap(mapUrl, {
+          numResults: resultsPerPage,
+          page: page,
+        });
+      };
+
+      pagePromises = Array.from({ length: maxPages }, (_, i) =>
+        fetchPage(i + 1)
+      );
+      allResults = await Promise.all(pagePromises);
+
+      await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    }
 
-  let mapResults: MapDocument[] = allResults
-    .flat()
-    .filter((result) => result !== null && result !== undefined);
+    // Parallelize sitemap fetch with serper search
+    const [sitemap, ...searchResults] = await Promise.all([
+      ignoreSitemap ? null : crawler.tryGetSitemap(),
+      ...(cachedResult ? [] : pagePromises),
+    ]);
 
-  const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
-  if (mapResults.length > minumumCutoff) {
-    mapResults = mapResults.slice(0, minumumCutoff);
-  }
+    if (!cachedResult) {
+      allResults = searchResults;
+    }
 
-  if (mapResults.length > 0) {
-    if (search) {
-      // Ensure all map results are first, maintaining their order
-      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
-        ...links,
-      ];
-    } else {
-      mapResults.map((x) => {
+    if (sitemap !== null) {
+      sitemap.forEach((x) => {
         links.push(x.url);
       });
     }
-  }
 
-  // Perform cosine similarity between the search query and the list of links
-  if (search) {
-    const searchQuery = search.toLowerCase();
-    links = performCosineSimilarity(links, searchQuery);
-  }
+    mapResults = allResults
+      .flat()
+      .filter((result) => result !== null && result !== undefined);
+
+    const minumumCutoff = Math.min(MAX_MAP_LIMIT, limit);
+    if (mapResults.length > minumumCutoff) {
+      mapResults = mapResults.slice(0, minumumCutoff);
+    }
 
-  links = links
-    .map((x) => {
-      try {
-        return checkAndUpdateURLForMap(x).url.trim();
-      } catch (_) {
-        return null;
+    if (mapResults.length > 0) {
+      if (search) {
+        // Ensure all map results are first, maintaining their order
+        links = [
+          mapResults[0].url,
+          ...mapResults.slice(1).map((x) => x.url),
+          ...links,
+        ];
+      } else {
+        mapResults.map((x) => {
+          links.push(x.url);
+        });
       }
-    })
-    .filter((x) => x !== null) as string[];
+    }
 
-  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, url));
+    // Perform cosine similarity between the search query and the list of links
+    if (search) {
+      const searchQuery = search.toLowerCase();
+      links = performCosineSimilarity(links, searchQuery);
+    }
 
-  // if includeSubdomains is false, filter out subdomains
-  if (!includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, url));
-  }
+    links = links
+      .map((x) => {
+        try {
+          return checkAndUpdateURLForMap(x).url.trim();
+        } catch (_) {
+          return null;
+        }
+      })
+      .filter((x) => x !== null) as string[];
+
+    // allows for subdomains to be included
+    links = links.filter((x) => isSameDomain(x, url));
+
+    // if includeSubdomains is false, filter out subdomains
+    if (!includeSubdomains) {
+      links = links.filter((x) => isSameSubdomain(x, url));
+    }
 
-  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
+    // remove duplicates that could be due to http/https or www
+    links = removeDuplicateUrls(links);
+  }
 
   const linksToReturn = links.slice(0, limit);
 
@@ -241,52 +263,4 @@ export async function mapController(
   };
 
   return res.status(200).json(response);
-}
-
-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
+}
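
For reference, a minimal caller sketch of the two paths this diff introduces. This is not part of the PR: the argument names are read off the destructured parameters visible in the hunks above (`url`, `crawlerOptions`, `limit`, `search`), `getMapResults` is assumed to be exported from this module, and any fields the real controller requires beyond these (e.g. team or plan identifiers) are omitted.

// Hypothetical usage sketch, assuming getMapResults is exported from "./map".
import { getMapResults } from "./map";

async function example() {
  // New fast path: sitemapOnly skips the fire-engine search entirely and
  // returns a MapResult whose links come only from the site's sitemap. The
  // internal crawl limit is lifted (10,000,000) so the full sitemap is
  // enumerated, and each URL is normalized via checkAndUpdateURLForMap.
  // Note that includeMetadata throws on this path.
  const sitemapResult = await getMapResults({
    url: "https://example.com",
    crawlerOptions: { sitemapOnly: true },
    limit: 5000,
  });

  // Existing path (now the else branch): fire-engine search results, cached
  // in Redis for 24 hours, merged with the sitemap unless ignoreSitemap is
  // set, then similarity-sorted against `search`, domain-filtered,
  // deduplicated, and capped at `limit`.
  const searchResult = await getMapResults({
    url: "https://example.com",
    search: "pricing",
    crawlerOptions: {},
    limit: 100,
  });
}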