|
1 | | -import * as cheerio from "cheerio" |
2 | | -import TurndownService from "turndown" |
3 | | -import { Readability, isProbablyReaderable } from "@mozilla/readability" |
| 1 | +import Defuddle, { createMarkdownContent } from "defuddle/full" |
4 | 2 |
|
5 | | -export const defaultExtractContent = (html: string) => { |
6 | | - const doc = new DOMParser().parseFromString(html, "text/html") |
7 | | - if (isProbablyReaderable(doc)) { |
8 | | - const reader = new Readability(doc) |
9 | | - const article = reader.parse() |
10 | | - if (article && article.content) { |
11 | | - const $article = cheerio.load(article.content) |
12 | | - $article("script, style, link, svg, [src^='data:image/']").remove() |
13 | | - article.content = $article.html() || "" |
14 | | - } |
15 | | - const turndownService = new TurndownService({ |
16 | | - headingStyle: "atx", |
17 | | - codeBlockStyle: "fenced" |
18 | | - }) |
19 | | - return turndownService.turndown(article?.content || "").trim() |
20 | | - } |
| 3 | +export const defaultExtractContent = (html: string, url: string = "") => { |
| 4 | + if (!html) return "" |
21 | 5 |
|
22 | | - const $ = cheerio.load(html) |
| 6 | + try { |
| 7 | + const doc = new DOMParser().parseFromString(html, "text/html") |
23 | 8 |
|
24 | | - $("script, style, link, svg, [src^='data:image/']").remove() |
| 9 | + const result = new Defuddle(doc as unknown as Document, { url }).parse() |
25 | 10 |
|
26 | | - $("*").each((_, element) => { |
27 | | - if ("attribs" in element) { |
28 | | - const attributes = element.attribs |
29 | | - for (const attr in attributes) { |
30 | | - if (attr !== "href" && attr !== "src") { |
31 | | - $(element).removeAttr(attr) |
32 | | - } |
33 | | - } |
| 11 | + if (result?.content && result.content.trim().length > 0) { |
| 12 | + return createMarkdownContent(result.content, url).trim() |
34 | 13 | } |
35 | | - }) |
36 | | - |
37 | | - const mainContent = |
38 | | - $('[role="main"]').html() || $("main").html() || $("body").html() || "" |
39 | | - |
40 | | - const turndownService = new TurndownService({ |
41 | | - headingStyle: "atx", |
42 | | - codeBlockStyle: "fenced" |
43 | | - }) |
44 | | - const markdown = turndownService.turndown(mainContent) |
| 14 | + } catch (error) { |
| 15 | + console.warn( |
| 16 | + "[defaultExtractContent] defuddle extraction failed, falling back:", |
| 17 | + error |
| 18 | + ) |
| 19 | + } |
45 | 20 |
|
46 | | - return markdown.trim() |
| 21 | + try { |
| 22 | + const doc = new DOMParser().parseFromString(html, "text/html") |
| 23 | + doc |
| 24 | + .querySelectorAll("script, style, link, noscript, svg, [aria-hidden=\"true\"]") |
| 25 | + .forEach((el) => el.remove()) |
| 26 | + const body = |
| 27 | + doc.querySelector("[role=\"main\"]") || |
| 28 | + doc.querySelector("main") || |
| 29 | + doc.querySelector("article") || |
| 30 | + doc.body |
| 31 | + return createMarkdownContent(body?.innerHTML || html, url).trim() |
| 32 | + } catch (error) { |
| 33 | + console.warn( |
| 34 | + "[defaultExtractContent] fallback markdown conversion failed:", |
| 35 | + error |
| 36 | + ) |
| 37 | + return "" |
| 38 | + } |
47 | 39 | } |
0 commit comments