@@ -31,6 +31,88 @@ const {
31
31
32
32
const { version : reffyVersion } = require ( '../../package.json' ) ;
33
33
34
/**
 * To be friendly with servers, requests get serialized by origin server,
 * and the code sleeps a bit in between requests to a given origin server.
 * To achieve this, the code needs to take a lock on the origin it wants to
 * send a request to.
 *
 * Maps a loose origin (as returned by getOrigin) to a lock object of the
 * form { locked, last }, where `locked` tells whether a request to that
 * origin is currently in flight and `last` is the Date.now() timestamp of
 * the last request sent to that origin.
 */
const originLocks = {};
41
+
42
+
43
/**
 * Helper function to sleep for a specified number of milliseconds.
 *
 * @param {number} ms - Number of milliseconds to sleep
 * @returns {Promise<string>} A promise that resolves with 'slept' once the
 *   requested delay has elapsed
 */
function sleep(ms) {
  return new Promise(resolve => {
    setTimeout(() => resolve('slept'), ms);
  });
}
49
+
50
+
51
/**
 * Helper function to interleave values of a list of arrays.
 *
 * For example:
 * interleave([0, 2, 4, 6, 8], [1, 3, 5]) returns [0, 1, 2, 3, 4, 5, 6, 8]
 * interleave([0, 3], [1, 4], [2, 5]) returns [0, 1, 2, 3, 4, 5]
 *
 * The function is used to sort the list of specs to crawl so as to distribute
 * origins throughout the list.
 *
 * Note: unlike the previous recursive implementation, this version does not
 * modify the arrays it receives, and it runs iteratively in time linear in
 * the total number of items, so very long lists cannot overflow the stack.
 *
 * @param {...Array} arrays - The arrays whose values need to be interleaved.
 *   Nullish entries are tolerated and treated as empty arrays.
 * @returns {Array} A new array with the values of all arrays interleaved
 *   round-robin, preserving the relative order within each array.
 */
function interleave(...arrays) {
  const total = arrays.reduce((sum, arr) => sum + (arr?.length ?? 0), 0);
  const result = [];
  // Take the item at position `round` from each array in turn, skipping
  // arrays that are already exhausted, until all items have been consumed.
  // This yields the same order as rotating through the arrays one item at
  // a time.
  for (let round = 0; result.length < total; round++) {
    for (const arr of arrays) {
      if (round < (arr?.length ?? 0)) {
        result.push(arr[round]);
      }
    }
  }
  return result;
}
83
+
84
+
85
/**
 * Helper function that returns the "origin" of a URL, defined in a loose way
 * as the part of the true origin that identifies the server that's going to
 * serve the resource.
 *
 * For example "github.io" for all specs under github.io, "whatwg.org" for
 * all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
 * and FXTF specs since they are served by the same server).
 *
 * @param {string} url - The URL to compute a loose origin for. A falsy value
 *   yields the empty string.
 * @returns {string} The loose origin of the URL
 */
function getOrigin(url) {
  if (!url) {
    return '';
  }
  const { origin } = new URL(url);
  // Host suffixes that get collapsed to a single loose origin because the
  // underlying resources are served by the same server.
  const sharedServers = [
    ['whatwg.org', ['.whatwg.org']],
    ['github.io', ['.github.io']],
    ['csswg.org', ['.csswg.org', '.css-houdini.org', '.fxtf.org']]
  ];
  for (const [server, suffixes] of sharedServers) {
    if (suffixes.some(suffix => origin.endsWith(suffix))) {
      return server;
    }
  }
  return origin;
}
114
+
115
+
34
116
/**
35
117
* Return the spec if crawl succeeded or crawl result from given fallback list
36
118
* if crawl yielded an error (and fallback does exist).
@@ -95,24 +177,51 @@ async function crawlSpec(spec, crawlOptions) {
95
177
result = { } ;
96
178
}
97
179
else {
98
- result = await processSpecification (
99
- urlToCrawl ,
100
- ( spec , modules ) => {
101
- const idToHeading = modules . find ( m => m . needsIdToHeadingMap ) ?
102
- window . reffy . mapIdsToHeadings ( ) : null ;
103
- const res = {
104
- crawled : window . location . toString ( )
105
- } ;
106
- modules . forEach ( mod => {
107
- res [ mod . property ] = window . reffy [ mod . name ] ( spec , idToHeading ) ;
108
- } ) ;
109
- return res ;
110
- } ,
111
- [ spec , crawlOptions . modules ] ,
112
- { quiet : crawlOptions . quiet ,
113
- forceLocalFetch : crawlOptions . forceLocalFetch ,
114
- ...cacheInfo }
115
- ) ;
180
+ // To be friendly with servers, requests are serialized per origin
181
+ // and only sent after a couple of seconds.
182
+ const origin = getOrigin ( urlToCrawl . url ) ;
183
+ let originLock = originLocks [ origin ] ;
184
+ if ( ! originLock ) {
185
+ originLock = {
186
+ locked : false ,
187
+ last : 0
188
+ } ;
189
+ originLocks [ origin ] = originLock ;
190
+ }
191
+ // Wait for the "lock" on the origin. Once we can take it, sleep as
192
+ // needed to only send a request after enough time has elapsed.
193
+ while ( originLock . locked ) {
194
+ await sleep ( 100 ) ;
195
+ }
196
+ originLock . locked = true ;
197
+ const now = Date . now ( ) ;
198
+ if ( now - originLock . last < 2000 ) {
199
+ await sleep ( 2000 - ( now - originLock . last ) ) ;
200
+ }
201
+ try {
202
+ result = await processSpecification (
203
+ urlToCrawl ,
204
+ ( spec , modules ) => {
205
+ const idToHeading = modules . find ( m => m . needsIdToHeadingMap ) ?
206
+ window . reffy . mapIdsToHeadings ( ) : null ;
207
+ const res = {
208
+ crawled : window . location . toString ( )
209
+ } ;
210
+ modules . forEach ( mod => {
211
+ res [ mod . property ] = window . reffy [ mod . name ] ( spec , idToHeading ) ;
212
+ } ) ;
213
+ return res ;
214
+ } ,
215
+ [ spec , crawlOptions . modules ] ,
216
+ { quiet : crawlOptions . quiet ,
217
+ forceLocalFetch : crawlOptions . forceLocalFetch ,
218
+ ...cacheInfo }
219
+ ) ;
220
+ }
221
+ finally {
222
+ originLock . last = Date . now ( ) ;
223
+ originLock . locked = false ;
224
+ }
116
225
if ( result . status === "notmodified" && fallback ) {
117
226
crawlOptions . quiet ?? console . warn ( `skipping ${ spec . url } , no change` ) ;
118
227
const copy = Object . assign ( { } , fallback ) ;
@@ -343,14 +452,33 @@ async function crawlList(speclist, crawlOptions) {
343
452
return { spec, readyToCrawl, resolve, reject } ;
344
453
} ) ;
345
454
455
+ // While we want results to be returned following the initial order of the
456
+ // specs, to avoid sending too many requests at once to the same origin,
457
+ // we'll sort specs so that origins get interleaved.
458
+ // Note: there may be specs without URL (ISO specs)
459
+ const specsByOrigin = { } ;
460
+ for ( const spec of list ) {
461
+ const toCrawl = crawlOptions . publishedVersion ?
462
+ ( spec . release ?? spec . nightly ) :
463
+ spec . nightly ;
464
+ const origin = getOrigin ( toCrawl ?. url ) ;
465
+ if ( ! specsByOrigin [ origin ] ) {
466
+ specsByOrigin [ origin ] = [ ] ;
467
+ }
468
+ specsByOrigin [ origin ] . push ( spec ) ;
469
+ }
470
+ const spreadList = interleave ( ...Object . values ( specsByOrigin ) ) ;
471
+
346
472
// In debug mode, specs are processed one by one. In normal mode,
347
473
// specs are processing in chunks
348
474
const chunkSize = Math . min ( ( crawlOptions . debug ? 1 : 4 ) , list . length ) ;
349
475
350
476
let pos = 0 ;
351
477
function flagNextSpecAsReadyToCrawl ( ) {
352
- if ( pos < listAndPromise . length ) {
353
- listAndPromise [ pos ] . resolve ( ) ;
478
+ if ( pos < spreadList . length ) {
479
+ const spec = spreadList [ pos ] ;
480
+ const specAndPromise = listAndPromise . find ( sp => sp . spec === spec ) ;
481
+ specAndPromise . resolve ( ) ;
354
482
pos += 1 ;
355
483
}
356
484
}
0 commit comments