Skip to content

Commit f9968be

Browse files
authored
Merge pull request #738 from microlinkhq/happydom
perf(readability): use happy-dom
2 parents 2c11034 + 4480f76 commit f9968be

File tree

8 files changed

+1429
-12
lines changed

8 files changed

+1429
-12
lines changed

packages/metascraper-readability/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,19 @@
1414
$ npm install metascraper-readability --save
1515
```
1616

17+
## API
18+
19+
### metascraper-readability([options])
20+
21+
#### options
22+
23+
##### getDocument
24+
25+
Type: `function`<br>
26+
Default: [source code](https://github.com/microlinkhq/metascraper/blob/master/packages/metascraper-readability/src/index.js#L14-L20)
27+
28+
The function called to parse the HTML into a DOM document. It receives an object with `url` and `html`, and must return the document that is passed to Readability.
29+
1730
## License
1831

1932
**metascraper-readability** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/metascraper/blob/master/LICENSE.md) License.<br>

packages/metascraper-readability/benchmark/fixture.html

Lines changed: 1345 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
'use strict'
2+
3+
const { readFileSync } = require('fs')
4+
5+
const url = 'https://arxiv.org/pdf/2412.06592'
6+
const html = readFileSync('./fixture.html', 'utf8')
7+
8+
const jsdom = () => {
9+
const { JSDOM, VirtualConsole } = require('jsdom')
10+
const dom = new JSDOM(html, { url, virtualConsole: new VirtualConsole() })
11+
return dom.window.document
12+
}
13+
14+
const happydom = () => {
15+
const { Window } = require('happy-dom')
16+
const window = new Window({ url })
17+
const document = window.document
18+
document.documentElement.innerHTML = html
19+
return document
20+
}
21+
22+
const { Readability } = require('@mozilla/readability')
23+
24+
const measure = fn => {
25+
const now = Date.now()
26+
const parsed = new Readability(fn()).parse()
27+
return { parsed, duration: Date.now() - now }
28+
}
29+
30+
const jsdomResult = measure(jsdom)
31+
const happydomResult = measure(happydom)
32+
33+
const isEqual = (value1, value2) =>
34+
JSON.stringify(value1) === JSON.stringify(value2)
35+
36+
if (!isEqual(jsdomResult.parsed, happydomResult.parsed)) {
37+
console.error('Results are different')
38+
process.exit(1)
39+
}
40+
41+
console.log(` jsdom: ${jsdomResult.duration}ms`)
42+
console.log(`happydom: ${happydomResult.duration}ms`)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"name": "@metascraper-readability/benchmark",
3+
"private": true,
4+
"version": "1.0.0",
5+
"devDependencies": {
6+
"dom-parser": "latest",
7+
"happy-dom": "latest"
8+
}
9+
}

packages/metascraper-readability/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"dependencies": {
2626
"@metascraper/helpers": "workspace:*",
2727
"@mozilla/readability": "~0.5.0",
28-
"jsdom": "~25.0.1"
28+
"happy-dom": "~16.5.3"
2929
},
3030
"devDependencies": {
3131
"ava": "5",

packages/metascraper-readability/src/index.js

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
'use strict'
22

33
const { memoizeOne, composeRule } = require('@metascraper/helpers')
4-
54
const { Readability } = require('@mozilla/readability')
6-
const { JSDOM, VirtualConsole } = require('jsdom')
75

86
const parseReader = reader => {
97
try {
@@ -13,15 +11,25 @@ const parseReader = reader => {
1311
}
1412
}
1513

16-
const readability = memoizeOne((url, html) => {
17-
const dom = new JSDOM(html, { url, virtualConsole: new VirtualConsole() })
18-
const reader = new Readability(dom.window.document)
19-
return parseReader(reader)
20-
}, memoizeOne.EqualityFirstArgument)
14+
const defaultGetDocument = ({ url, html }) => {
15+
const { Window } = require('happy-dom')
16+
const window = new Window({ url })
17+
const document = window.document
18+
document.documentElement.innerHTML = html
19+
return document
20+
}
21+
22+
module.exports = ({ getDocument = defaultGetDocument } = {}) => {
23+
const readability = memoizeOne((url, html, getDocument) => {
24+
const document = getDocument({ url, html })
25+
const reader = new Readability(document)
26+
return parseReader(reader)
27+
}, memoizeOne.EqualityFirstArgument)
2128

22-
const getReadbility = composeRule(($, url) => readability(url, $.html()))
29+
const getReadbility = composeRule(($, url) =>
30+
readability(url, $.html(), getDocument)
31+
)
2332

24-
module.exports = () => {
2533
return {
2634
author: getReadbility({ from: 'byline', to: 'author' }),
2735
description: getReadbility({ from: 'excerpt', to: 'description' }),

packages/metascraper-readability/test/snapshots/index.js.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ Generated by [AVA](https://avajs.dev).
4646
4747
{
4848
author: null,
49-
description: null,
49+
description: 'Virtual Tour of 219 Shale Rd.',
5050
lang: null,
5151
publisher: null,
52-
title: null,
52+
title: '219 Shale Rd - Virtual Tour',
5353
}
Binary file not shown.

0 commit comments

Comments
 (0)