Commit c63cb0a

Reduce crawler load on servers (#1581)
The crawler has a hard time crawling all specs nowadays due to more stringent restrictions on servers that lead to network timeouts and errors. See: w3c/webref#1244

The goal of this update is to reduce the load that the crawler puts on servers. Two changes:

1. The list of specs to crawl gets sorted so as to distribute origins throughout the list. This should help dilute the requests sent to any specific server at a given time. The notion of "origin" used in the code is loose: it identifies the server that serves the resource rather than the actual origin.
2. Requests sent to a given origin are serialized, and sent no less than 2 seconds after the previous request to that origin was sent (and processed). The crawler otherwise still processes the list 4 specs at a time, provided the specs are to be retrieved from different origins.

A consequence of change 1 is that specs are no longer processed in order, so the logs make the crawler look a bit drunk, processing specs seemingly at random, as in:

```
1/610 - https://aomediacodec.github.io/afgs1-spec/ - crawling
8/610 - https://compat.spec.whatwg.org/ - crawling
12/610 - https://datatracker.ietf.org/doc/html/draft-davidben-http-client-hint-reliability - crawling
13/610 - https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis - crawling
12/610 - https://datatracker.ietf.org/doc/html/draft-davidben-http-client-hint-reliability - done
16/610 - https://drafts.css-houdini.org/css-typed-om-2/ - crawling
13/610 - https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis - done
45/610 - https://fidoalliance.org/specs/fido-v2.1-ps-20210615/fido-client-to-authenticator-protocol-v2.1-ps-errata-20220621.html - crawling
https://compat.spec.whatwg.org/ [error] Multiple event handler named orientationchange, cannot associate reliably to an interface in Compatibility Standard
8/610 - https://compat.spec.whatwg.org/ - done
66/610 - https://registry.khronos.org/glTF/specs/2.0/glTF-2.0.html - crawling
https://aomediacodec.github.io/afgs1-spec/ [log] extract refs without rules
1/610 - https://aomediacodec.github.io/afgs1-spec/ - done
```
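To make the reordering concrete, here is a minimal standalone sketch, not part of the commit (the `byOrigin` groups and spec names are invented for illustration), of how interleaving per-origin groups spreads consecutive requests across servers:

```js
// Simplified equivalent of the interleave() helper the commit adds to
// src/lib/specs-crawler.js: take one item from each array in turn.
function interleave(first, ...rest) {
    if (first?.length > 0) {
        return [first.shift(), ...interleave(...rest, first)];
    }
    return rest.length > 0 ? interleave(...rest) : [];
}

// Hypothetical crawl list, grouped by loose origin.
const byOrigin = {
    'github.io':  ['g1', 'g2', 'g3'],
    'whatwg.org': ['w1', 'w2'],
    'csswg.org':  ['c1']
};

console.log(interleave(...Object.values(byOrigin)));
// → ['g1', 'w1', 'c1', 'g2', 'w2', 'g3']
// Consecutive crawls now mostly target different servers, which is why the
// progress log above jumps around the original list order.
```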
1 parent 2be9f4c commit c63cb0a

File tree: 2 files changed, +154 −20 lines

Diff for: src/lib/mock-server.js (+6)

```diff
@@ -123,6 +123,12 @@ mockAgent
     .reply(200, '')
     .persist();
 
+mockAgent
+    .get("https://www.w3.org")
+    .intercept({ method: "GET", path: "/StyleSheets/TR/2021/dark.css" })
+    .reply(200, '')
+    .persist();
+
 mockAgent
     .get("https://www.w3.org")
     .intercept({ method: "GET", path: "/Tools/respec/respec-highlight" })
```

Diff for: src/lib/specs-crawler.js (+148 −20)

```diff
@@ -31,6 +31,88 @@ const {
 
 const {version: reffyVersion} = require('../../package.json');
 
+/**
+ * To be friendly with servers, requests get serialized by origin server,
+ * and the code sleeps a bit in between requests to a given origin server.
+ * To achieve this, the code needs to take a lock on the origin it wants to
+ * send a request to.
+ */
+const originLocks = {};
+
+
+/**
+ * Helper function to sleep for a specified number of milliseconds
+ */
+function sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
+}
+
+
+/**
+ * Helper function to interleave values of a list of arrays.
+ *
+ * For example:
+ * interleave([0, 2, 4, 6, 8], [1, 3, 5]) returns [0, 1, 2, 3, 4, 5, 6, 8]
+ * interleave([0, 3], [1, 4], [2, 5]) returns [0, 1, 2, 3, 4, 5]
+ *
+ * The function is used to sort the list of specs to crawl so as to distribute
+ * origins throughout the list.
+ *
+ * Note the function happily modifies (and empties in practice) the arrays
+ * it receives as arguments.
+ */
+function interleave(firstArray, ...furtherArrays) {
+    if (firstArray?.length > 0) {
+        // Return the concatenation of the first item in the first array,
+        // and of the result of interleaving the remaining arrays, putting
+        // the first array last in the list.
+        const firstItem = firstArray.shift();
+        return [firstItem, ...interleave(...furtherArrays, firstArray)];
+    }
+    else {
+        // First array is empty, let's proceed with the remaining arrays
+        // until there's nothing left to process.
+        if (furtherArrays.length > 0) {
+            return interleave(...furtherArrays);
+        }
+        else {
+            return [];
+        }
+    }
+}
+
+
+/**
+ * Helper function that returns the "origin" of a URL, defined in a loose way
+ * as the part of the true origin that identifies the server that's going to
+ * serve the resource.
+ *
+ * For example "github.io" for all specs under github.io, "whatwg.org" for
+ * all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
+ * and FXTF specs since they are served by the same server).
+ */
+function getOrigin(url) {
+    if (!url) {
+        return '';
+    }
+    const origin = (new URL(url)).origin;
+    if (origin.endsWith('.whatwg.org')) {
+        return 'whatwg.org';
+    }
+    else if (origin.endsWith('.github.io')) {
+        return 'github.io';
+    }
+    else if (origin.endsWith('.csswg.org') ||
+             origin.endsWith('.css-houdini.org') ||
+             origin.endsWith('.fxtf.org')) {
+        return 'csswg.org';
+    }
+    else {
+        return origin;
+    }
+}
+
+
 /**
  * Return the spec if crawl succeeded or crawl result from given fallback list
  * if crawl yielded an error (and fallback does exist).
```
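As a quick illustration, not in the commit itself but following directly from the branches above, the loose origin mapping groups specs like this:

```js
getOrigin('https://compat.spec.whatwg.org/');                // 'whatwg.org'
getOrigin('https://aomediacodec.github.io/afgs1-spec/');     // 'github.io'
getOrigin('https://drafts.css-houdini.org/css-typed-om-2/'); // 'csswg.org'
getOrigin('https://registry.khronos.org/glTF/specs/2.0/glTF-2.0.html');
// 'https://registry.khronos.org' (true origin, kept as-is)
```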
```diff
@@ -95,24 +177,51 @@ async function crawlSpec(spec, crawlOptions) {
         result = {};
     }
     else {
-        result = await processSpecification(
-            urlToCrawl,
-            (spec, modules) => {
-                const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
-                    window.reffy.mapIdsToHeadings() : null;
-                const res = {
-                    crawled: window.location.toString()
-                };
-                modules.forEach(mod => {
-                    res[mod.property] = window.reffy[mod.name](spec, idToHeading);
-                });
-                return res;
-            },
-            [spec, crawlOptions.modules],
-            { quiet: crawlOptions.quiet,
-              forceLocalFetch: crawlOptions.forceLocalFetch,
-              ...cacheInfo}
-        );
+        // To be friendly with servers, requests are serialized per origin
+        // and only sent after a couple of seconds.
+        const origin = getOrigin(urlToCrawl.url);
+        let originLock = originLocks[origin];
+        if (!originLock) {
+            originLock = {
+                locked: false,
+                last: 0
+            };
+            originLocks[origin] = originLock;
+        }
+        // Wait for the "lock" on the origin. Once we can take it, sleep as
+        // needed to only send a request after enough time has elapsed.
+        while (originLock.locked) {
+            await sleep(100);
+        }
+        originLock.locked = true;
+        const now = Date.now();
+        if (now - originLock.last < 2000) {
+            await sleep(2000 - (now - originLock.last));
+        }
+        try {
+            result = await processSpecification(
+                urlToCrawl,
+                (spec, modules) => {
+                    const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
+                        window.reffy.mapIdsToHeadings() : null;
+                    const res = {
+                        crawled: window.location.toString()
+                    };
+                    modules.forEach(mod => {
+                        res[mod.property] = window.reffy[mod.name](spec, idToHeading);
+                    });
+                    return res;
+                },
+                [spec, crawlOptions.modules],
+                { quiet: crawlOptions.quiet,
+                  forceLocalFetch: crawlOptions.forceLocalFetch,
+                  ...cacheInfo}
+            );
+        }
+        finally {
+            originLock.last = Date.now();
+            originLock.locked = false;
+        }
         if (result.status === "notmodified" && fallback) {
             crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
             const copy = Object.assign({}, fallback);
```
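The take-lock / pace / release sequence above could be captured in a reusable helper. Below is a hypothetical sketch, not part of the commit (`withOriginLock` and its parameters are invented names), that implements the same pattern, assuming the `sleep()` helper defined earlier:

```js
// Hypothetical refactoring (not in the commit): serialize calls per key
// and enforce a minimum delay between them. The lock is released in
// `finally` so that an error in fn() cannot wedge the origin forever.
async function withOriginLock(locks, key, minDelayMs, fn) {
    const lock = locks[key] ?? (locks[key] = { locked: false, last: 0 });
    // Poll in 100ms slices until the lock is free.
    while (lock.locked) {
        await sleep(100);
    }
    lock.locked = true;
    // Space this request at least minDelayMs after the previous one.
    const elapsed = Date.now() - lock.last;
    if (elapsed < minDelayMs) {
        await sleep(minDelayMs - elapsed);
    }
    try {
        return await fn();
    }
    finally {
        lock.last = Date.now();
        lock.locked = false;
    }
}
```

Polling in 100 ms slices is a simple fit here: with at most 4 specs in flight, the busy-wait is negligible, whereas a per-origin promise queue would avoid it at the cost of extra bookkeeping.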
```diff
@@ -343,14 +452,33 @@ async function crawlList(speclist, crawlOptions) {
         return { spec, readyToCrawl, resolve, reject };
     });
 
+    // While we want results to be returned following the initial order of the
+    // specs, to avoid sending too many requests at once to the same origin,
+    // we'll sort specs so that origins get interleaved.
+    // Note: there may be specs without URL (ISO specs)
+    const specsByOrigin = {};
+    for (const spec of list) {
+        const toCrawl = crawlOptions.publishedVersion ?
+            (spec.release ?? spec.nightly) :
+            spec.nightly;
+        const origin = getOrigin(toCrawl?.url);
+        if (!specsByOrigin[origin]) {
+            specsByOrigin[origin] = [];
+        }
+        specsByOrigin[origin].push(spec);
+    }
+    const spreadList = interleave(...Object.values(specsByOrigin));
+
     // In debug mode, specs are processed one by one. In normal mode,
     // specs are processed in chunks.
     const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);
 
     let pos = 0;
     function flagNextSpecAsReadyToCrawl() {
-        if (pos < listAndPromise.length) {
-            listAndPromise[pos].resolve();
+        if (pos < spreadList.length) {
+            const spec = spreadList[pos];
+            const specAndPromise = listAndPromise.find(sp => sp.spec === spec);
+            specAndPromise.resolve();
             pos += 1;
         }
     }
```
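Note that the interleaved `spreadList` only changes the order in which specs become ready to crawl: `listAndPromise` stays in the original list order, so results are still collected and reported in that order. That is also why the log excerpt in the commit message shows indices such as `12/610` completing before `1/610`.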
