feat(perf): offload generators to worker threads (#247)

avivkeller · web-flow · commit 91df7496b674 · 2025-04-12T09:15:29.000-04:00
diff --git a/bin/cli.mjs b/bin/cli.mjs
@@ -2,21 +2,22 @@
 
 import { resolve } from 'node:path';
 import process from 'node:process';
+import { cpus } from 'node:os';
 
 import { Command, Option } from 'commander';
 
 import { coerce } from 'semver';
 import { DOC_NODE_CHANGELOG_URL, DOC_NODE_VERSION } from '../src/constants.mjs';
 import createGenerator from '../src/generators.mjs';
-import generators from '../src/generators/index.mjs';
+import { publicGenerators } from '../src/generators/index.mjs';
 import createLinter from '../src/linter/index.mjs';
 import reporters from '../src/linter/reporters/index.mjs';
 import rules from '../src/linter/rules/index.mjs';
 import createMarkdownLoader from '../src/loaders/markdown.mjs';
 import createMarkdownParser from '../src/parsers/markdown.mjs';
 import createNodeReleases from '../src/releases.mjs';
 
-const availableGenerators = Object.keys(generators);
+const availableGenerators = Object.keys(publicGenerators);
 
 const program = new Command();
 
@@ -77,10 +78,16 @@ program
       .choices(Object.keys(reporters))
       .default('console')
   )
+  .addOption(
+    new Option(
+      '-p, --threads <number>',
+      'The maximum number of threads to use. Set to 1 to disable parallelism'
+    ).default(Math.max(1, cpus().length - 1))
+  )
   .parse(process.argv);
 
 /**
- * @typedef {keyof generators} Target A list of the available generator names.
+ * @typedef {keyof publicGenerators} Target A list of the available generator names.
  *
  * @typedef {Object} Options
  * @property {Array<string>|string} input Specifies the glob/path for input files.
@@ -108,6 +115,7 @@ const {
   lintDryRun,
   gitRef,
   reporter,
+  threads,
 } = program.opts();
 
 const linter = createLinter(lintDryRun, disableRule);
@@ -142,6 +150,8 @@ if (target) {
     // An URL containing a git ref URL pointing to the commit or ref that was used
     // to generate the API docs. This is used to link to the source code of the
     gitRef,
+    // How many threads should be used
+    threads,
   });
 }
 
diff --git a/src/generators.mjs b/src/generators.mjs
@@ -1,16 +1,7 @@
 'use strict';
 
-import publicGenerators from './generators/index.mjs';
-import astJs from './generators/ast-js/index.mjs';
-import oramaDb from './generators/orama-db/index.mjs';
-
-const availableGenerators = {
-  ...publicGenerators,
-  // This one is a little special since we don't want it to run unless we need
-  // it and we also don't want it to be publicly accessible through the CLI.
-  'ast-js': astJs,
-  'orama-db': oramaDb,
-};
+import { allGenerators } from './generators/index.mjs';
+import WorkerPool from './threading/index.mjs';
 
 /**
  * @typedef {{ ast: GeneratorMetadata<ApiDocMetadataEntry, ApiDocMetadataEntry>}} AstGenerator The AST "generator" is a facade for the AST tree and it isn't really a generator
@@ -43,30 +34,39 @@ const createGenerator = markdownInput => {
    */
   const cachedGenerators = { ast: Promise.resolve(markdownInput) };
 
+  const threadPool = new WorkerPool();
+
   /**
    * Runs the Generator engine with the provided top-level input and the given generator options
    *
    * @param {GeneratorOptions} options The options for the generator runtime
    */
-  const runGenerators = async ({ generators, ...extra }) => {
+  const runGenerators = async ({ generators, threads, ...extra }) => {
     // Note that this method is blocking, and will only execute one generator per-time
     // but it ensures all dependencies are resolved, and that multiple bottom-level generators
     // can reuse the already parsed content from the top-level/dependency generators
     for (const generatorName of generators) {
-      const { dependsOn, generate } = availableGenerators[generatorName];
+      const { dependsOn, generate } = allGenerators[generatorName];
 
       // If the generator dependency has not yet been resolved, we resolve
       // the dependency first before running the current generator
       if (dependsOn && dependsOn in cachedGenerators === false) {
-        await runGenerators({ ...extra, generators: [dependsOn] });
+        await runGenerators({
+          ...extra,
+          threads,
+          generators: [dependsOn],
+        });
       }
 
       // Ensures that the dependency output gets resolved before we run the current
       // generator with its dependency output as the input
       const dependencyOutput = await cachedGenerators[dependsOn];
 
       // Adds the current generator execution Promise to the cache
-      cachedGenerators[generatorName] = generate(dependencyOutput, extra);
+      cachedGenerators[generatorName] =
+        threads < 2
+          ? generate(dependencyOutput, extra) // Run in main thread
+          : threadPool.run(generatorName, dependencyOutput, threads, extra); // Offload to worker thread
     }
 
     // Returns the value of the last generator of the current pipeline
diff --git a/src/generators/index.mjs b/src/generators/index.mjs
@@ -9,8 +9,9 @@ import legacyJsonAll from './legacy-json-all/index.mjs';
 import addonVerify from './addon-verify/index.mjs';
 import apiLinks from './api-links/index.mjs';
 import oramaDb from './orama-db/index.mjs';
+import astJs from './ast-js/index.mjs';
 
-export default {
+export const publicGenerators = {
   'json-simple': jsonSimple,
   'legacy-html': legacyHtml,
   'legacy-html-all': legacyHtmlAll,
@@ -21,3 +22,10 @@ export default {
   'api-links': apiLinks,
   'orama-db': oramaDb,
 };
+
+export const allGenerators = {
+  ...publicGenerators,
+  // This one is a little special since we don't want it to run unless we need
+  // it and we also don't want it to be publicly accessible through the CLI.
+  'ast-js': astJs,
+};
diff --git a/src/generators/json-simple/index.mjs b/src/generators/json-simple/index.mjs
@@ -6,7 +6,6 @@ import { join } from 'node:path';
 import { remove } from 'unist-util-remove';
 
 import createQueries from '../../utils/queries/index.mjs';
-import { getRemark } from '../../utils/remark.mjs';
 
 /**
  * This generator generates a simplified JSON version of the API docs and returns it as a string
@@ -35,9 +34,6 @@ export default {
    * @param {Partial<GeneratorOptions>} options
    */
   async generate(input, options) {
-    // Gets a remark processor for stringifying the AST tree into JSON
-    const remarkProcessor = getRemark();
-
     // Iterates the input (ApiDocMetadataEntry) and performs a few changes
     const mappedInput = input.map(node => {
       // Deep clones the content nodes to avoid affecting upstream nodes
@@ -50,12 +46,6 @@ export default {
         createQueries.UNIST.isHeading,
       ]);
 
-      /**
-       * For the JSON generate we want to transform the whole content into JSON
-       * @returns {string} The stringified JSON version of the content
-       */
-      content.toJSON = () => remarkProcessor.stringify(content);
-
       return { ...node, content };
     });
 
diff --git a/src/generators/legacy-html-all/index.mjs b/src/generators/legacy-html-all/index.mjs
@@ -86,7 +86,7 @@ export default {
       .replace('__ID__', 'all')
       .replace(/__FILENAME__/g, 'all')
       .replace('__SECTION__', 'All')
-      .replace(/__VERSION__/g, `v${version.toString()}`)
+      .replace(/__VERSION__/g, `v${version.version}`)
       .replace(/__TOC__/g, tableOfContents.wrapToC(aggregatedToC))
       .replace(/__GTOC__/g, parsedSideNav)
       .replace('__CONTENT__', aggregatedContent)
diff --git a/src/generators/legacy-html/index.mjs b/src/generators/legacy-html/index.mjs
@@ -84,7 +84,6 @@ export default {
      */
     const replaceTemplateValues = values => {
       const { api, added, section, version, toc, nav, content } = values;
-
       return apiTemplate
         .replace('__ID__', api)
         .replace(/__FILENAME__/g, api)
@@ -139,7 +138,7 @@ export default {
         api: head.api,
         added: head.introduced_in ?? '',
         section: head.heading.data.name || apiAsHeading,
-        version: `v${version.toString()}`,
+        version: `v${version.version}`,
         toc: String(parsedToC),
         nav: String(activeSideNav),
         content: parsedContent,
diff --git a/src/generators/legacy-html/utils/buildDropdowns.mjs b/src/generators/legacy-html/utils/buildDropdowns.mjs
@@ -60,8 +60,9 @@ const buildNavigation = navigationContents =>
 const buildVersions = (api, added, versions) => {
   // All Node.js versions that support the current API; If there's no "introduced_at" field,
   // we simply show all versions, as we cannot pinpoint the exact version
+  const coercedMajor = added ? major(coerceSemVer(added)) : undefined;
   const compatibleVersions = versions.filter(({ version }) =>
-    added ? major(version) >= major(coerceSemVer(added)) : true
+    added ? version.major >= coercedMajor : true
   );
 
   // Parses the SemVer version into something we use for URLs and to display the Node.js version
diff --git a/src/generators/legacy-json/utils/buildSection.mjs b/src/generators/legacy-json/utils/buildSection.mjs
@@ -58,7 +58,7 @@ export const createSectionBuilder = () => {
    * @param {import('../types.d.ts').HierarchizedEntry} entry - The entry providing stability information.
    */
   const parseStability = (section, nodes, { stability }) => {
-    const stabilityInfo = stability.toJSON()?.[0];
+    const stabilityInfo = stability.children[0]?.data;
 
     if (stabilityInfo) {
       section.stability = stabilityInfo.index;
diff --git a/src/generators/types.d.ts b/src/generators/types.d.ts
@@ -1,11 +1,11 @@
 import type { SemVer } from 'semver';
 import type { ApiDocReleaseEntry } from '../types';
-import type availableGenerators from './index.mjs';
+import type { publicGenerators } from './index.mjs';
 
 declare global {
   // All available generators as an inferable type, to allow Generator interfaces
   // to be type complete and runtime friendly within `runGenerators`
-  export type AvailableGenerators = typeof availableGenerators;
+  export type AvailableGenerators = typeof publicGenerators;
 
   // This is the runtime config passed to the API doc generators
   export interface GeneratorOptions {
@@ -36,6 +36,9 @@ declare global {
     // i.e. https://github.com/nodejs/node/tree/2cb1d07e0f6d9456438016bab7db4688ab354fd2
     // i.e. https://gitlab.com/someone/node/tree/HEAD
     gitRef: string;
+
+    // The number of threads the process is allowed to use
+    threads: number;
   }
 
   export interface GeneratorMetadata<I extends any, O extends any> {
diff --git a/src/linter/tests/fixtures/entries.mjs b/src/linter/tests/fixtures/entries.mjs
@@ -1,10 +1,3 @@
-/**
- * Noop function.
- *
- * @returns {any}
- */
-const noop = () => {};
-
 /**
  * @type {ApiDocMetadataEntry}
  */
@@ -69,12 +62,10 @@ export const assertEntry = {
       slug: 'assert',
       type: 'property',
     },
-    toJSON: noop,
   },
   stability: {
     type: 'root',
     children: [],
-    toJSON: noop,
   },
   content: {
     type: 'root',
diff --git a/src/metadata.mjs b/src/metadata.mjs
@@ -140,17 +140,6 @@ const createMetadata = slugger => {
       internalMetadata.heading.data.type =
         type ?? internalMetadata.heading.data.type;
 
-      /**
-       * Defines the toJSON method for the Heading AST node to be converted as JSON
-       */
-      internalMetadata.heading.toJSON = () => internalMetadata.heading.data;
-
-      /**
-       * Maps the Stability Index AST nodes into a JSON objects from their data properties
-       */
-      internalMetadata.stability.toJSON = () =>
-        internalMetadata.stability.children.map(node => node.data);
-
       // Returns the Metadata entry for the API doc
       return {
         api: apiDoc.stem,
diff --git a/src/test/metadata.test.mjs b/src/test/metadata.test.mjs
@@ -33,7 +33,6 @@ describe('createMetadata', () => {
     };
     metadata.addStability(stability);
     const actual = metadata.create(new VFile(), {}).stability;
-    delete actual.toJSON;
     deepStrictEqual(actual, {
       children: [stability],
       type: 'root',
@@ -82,8 +81,15 @@ describe('createMetadata', () => {
       yaml_position: {},
     };
     const actual = metadata.create(apiDoc, section);
-    delete actual.stability.toJSON;
-    delete actual.heading.toJSON;
     deepStrictEqual(actual, expected);
   });
+
+  it('should be serializable', () => {
+    const { create } = createMetadata(new GitHubSlugger());
+    const actual = create(new VFile({ path: 'test.md' }), {
+      type: 'root',
+      children: [],
+    });
+    deepStrictEqual(structuredClone(actual), actual);
+  });
 });
diff --git a/src/threading/index.mjs b/src/threading/index.mjs
diff --git a/src/threading/worker.mjs b/src/threading/worker.mjs
diff --git a/src/types.d.ts b/src/types.d.ts