diff --git a/packages/metascraper-readability/package.json b/packages/metascraper-readability/package.json
index 74650b0ad..d9a44ff8c 100644
--- a/packages/metascraper-readability/package.json
+++ b/packages/metascraper-readability/package.json
@@ -25,6 +25,7 @@
   "dependencies": {
     "@metascraper/helpers": "workspace:*",
     "@mozilla/readability": "~0.5.0",
+    "async-memoize-one": "~1.1.8",
     "happy-dom": "~16.5.3"
   },
   "devDependencies": {
@@ -37,7 +38,11 @@
     "src"
   ],
   "scripts": {
-    "test": "NODE_PATH=.. TZ=UTC ava --timeout 15s"
+    "test": "NODE_PATH=.. TZ=UTC ava"
   },
-  "license": "MIT"
+  "license": "MIT",
+  "ava": {
+    "workerThreads": false,
+    "timeout": "15s"
+  }
 }
diff --git a/packages/metascraper-readability/src/index.d.ts b/packages/metascraper-readability/src/index.d.ts
index 684b26689..778c1b395 100644
--- a/packages/metascraper-readability/src/index.d.ts
+++ b/packages/metascraper-readability/src/index.d.ts
@@ -1,5 +1,4 @@
 type Options = {
-  getDocument: ({url: string, html: string }) => Document,
   readabilityOpts: import('readability').ReadabilityOptions,
 }
 
diff --git a/packages/metascraper-readability/src/index.js b/packages/metascraper-readability/src/index.js
index e63df13a6..e1c258b35 100644
--- a/packages/metascraper-readability/src/index.js
+++ b/packages/metascraper-readability/src/index.js
@@ -1,36 +1,37 @@
 'use strict'
 
 const { memoizeOne, composeRule } = require('@metascraper/helpers')
-const { Readability } = require('@mozilla/readability')
-
-const parseReader = reader => {
-  try {
-    return reader.parse()
-  } catch (_) {
-    return {}
-  }
-}
-
-const defaultGetDocument = ({ url, html }) => {
-  const { Window } = require('happy-dom')
-  const window = new Window({ url })
-  const document = window.document
-  document.documentElement.innerHTML = html
-  return document
-}
-
-module.exports = ({
-  getDocument = defaultGetDocument,
-  readabilityOpts
-} = {}) => {
-  const readability = memoizeOne((url, html, getDocument) => {
-    const document = getDocument({ url, html })
-    const reader = new Readability(document, readabilityOpts)
-    return parseReader(reader)
-  }, memoizeOne.EqualityFirstArgument)
-
+const asyncMemoizeOne = require('async-memoize-one')
+const { Worker } = require('worker_threads')
+const path = require('path')
+
+const SCRIPT_PATH = path.resolve(__dirname, 'worker.js')
+
+// Parse in a worker thread so Readability never blocks the main event loop.
+const runReadability = (url, html, readabilityOpts) =>
+  new Promise((resolve, reject) => {
+    const worker = new Worker(SCRIPT_PATH, {
+      workerData: { url, html, readabilityOpts }
+    })
+    worker.once('message', message => resolve(JSON.parse(message)))
+    worker.once('error', reject)
+    // Settle even if the worker dies without posting a result; this is a
+    // no-op when the promise has already been resolved or rejected.
+    worker.once('exit', code =>
+      reject(new Error(`readability worker exited with code ${code}`))
+    )
+  })
+
+module.exports = ({ readabilityOpts } = {}) => {
+  // Memoize per instance: the cache key is the URL alone, so a shared cache
+  // would leak results between instances with different `readabilityOpts`.
+  const readability = asyncMemoizeOne(
+    (url, html) => runReadability(url, html, readabilityOpts),
+    memoizeOne.EqualityFirstArgument
+  )
+
   const getReadbility = composeRule(($, url) =>
-    readability(url, $.html(), getDocument)
+    readability(url, $.html())
   )
 
   const rules = {
diff --git a/packages/metascraper-readability/src/worker.js b/packages/metascraper-readability/src/worker.js
new file mode 100644
index 000000000..36d4fdad1
--- /dev/null
+++ b/packages/metascraper-readability/src/worker.js
@@ -0,0 +1,28 @@
+'use strict'
+
+const { workerData, parentPort } = require('node:worker_threads')
+const { Readability } = require('@mozilla/readability')
+
+const parseReader = reader => {
+  try {
+    return reader.parse()
+  } catch (_) {
+    return {}
+  }
+}
+
+const getDocument = ({ url, html }) => {
+  const { Window } = require('happy-dom')
+  const window = new Window({ url })
+  const document = window.document
+  document.documentElement.innerHTML = html
+  return document
+}
+
+const main = async ({ url, html, readabilityOpts } = {}) => {
+  const document = getDocument({ url, html })
+  const reader = new Readability(document, readabilityOpts)
+  return parseReader(reader)
+}
+
+main(workerData).then(result => parentPort.postMessage(JSON.stringify(result)))