|
| 1 | +import setNamespace from 'debug'; |
1 | 2 | import axios from 'axios'; |
2 | 3 | import path from 'path'; |
3 | 4 | import fs from 'fs/promises'; |
4 | 5 | import process from 'process'; |
5 | 6 | import * as cheerio from 'cheerio'; |
6 | 7 | import fsc from 'fs-cheerio'; |
| 8 | +import mime from 'mime-types'; |
7 | 9 | import { createWriteStream } from 'fs'; |
8 | 10 |
|
// Namespaced logger from the `debug` package, used for file-write tracing.
const debug = setNamespace('page-loader');

// Cheerio root of the page currently being processed.
// NOTE(review): this is module-level state, so every PageLoader instance in the
// process reads and writes the same `$`; overlapping `load()` calls would
// overwrite each other's document — confirm instances are never used concurrently.
let $;
10 | 14 |
|
11 | 15 | export class PageLoader { |
12 | 16 | #url; |
13 | | - #destFolder; |
14 | | - #resourceDist; |
| 17 | + #outputDir; |
| 18 | + #resourceDir; |
15 | 19 |
|
16 | | - constructor(urlString, destFolder = process.cwd()) { |
| 20 | + constructor(urlString, outputDir = process.cwd()) { |
17 | 21 | this.#url = new URL(urlString); |
18 | | - this.#destFolder = destFolder; |
19 | | - this.#resourceDist = `${this.#generateFileName(this.#url.href)}_files`; |
| 22 | + this.#outputDir = this.#normalizeDirPath(outputDir); |
| 23 | + this.#resourceDir = `${this.#generateFileName(this.#url.href)}_files`; |
20 | 24 | } |
21 | 25 |
|
22 | 26 | async load() { |
23 | | - const { filepath, html } = await this.#loadHtml(); |
24 | | - $ = cheerio.load(html); |
25 | | - |
| 27 | + await this.#loadDom(); |
| 28 | + await this.#ensureDirExists(this.#outputDir); |
26 | 29 | await this.#createResourceDir(); |
27 | 30 | await this.#loadResources(); |
28 | 31 |
|
29 | | - await fsc.writeFile(filepath, $); |
| 32 | + const filepath = await this.#saveHtml(); |
30 | 33 |
|
31 | 34 | return { filepath }; |
32 | 35 | } |
33 | 36 |
|
34 | | - async #createResourceDir() { |
| 37 | + async #saveHtml() { |
| 38 | + const htmlFilename = this.#generateFileName(this.#url.href) + '.html'; |
| 39 | + const filepath = path.join(this.#outputDir, htmlFilename); |
| 40 | + |
| 41 | + await fsc.writeFile(filepath, $); |
| 42 | + |
| 43 | + return filepath; |
| 44 | + } |
| 45 | + |
| 46 | + async #ensureDirExists(dirPath) { |
35 | 47 | try { |
36 | | - await fs.mkdir(path.join(this.#destFolder, this.#resourceDist)); |
| 48 | + await fs.access(dirPath); |
37 | 49 | } catch (error) { |
38 | | - console.log(error); |
| 50 | + if (error.code === 'ENOENT') { |
| 51 | + await fs.mkdir(dirPath, { recursive: true }); |
| 52 | + } else { |
| 53 | + throw error; |
| 54 | + } |
39 | 55 | } |
40 | 56 | } |
41 | 57 |
|
42 | | - async #loadHtml() { |
43 | | - const htmlFilename = this.#generateFileName(this.#url.href) + '.html'; |
44 | | - const filepath = path.join(this.#destFolder, htmlFilename); |
45 | | - |
46 | | - const { data: html } = await axios.get(this.#url.toString()); |
| 58 | + #normalizeDirPath(pathToFolder) { |
| 59 | + return path.resolve(process.cwd(), pathToFolder); |
| 60 | + } |
47 | 61 |
|
48 | | - await fs.writeFile(filepath, html); |
| 62 | + async #createResourceDir() { |
| 63 | + await fs.mkdir(path.join(this.#outputDir, this.#resourceDir), { recursive: true }); |
| 64 | + } |
49 | 65 |
|
50 | | - return { filepath, html }; |
| 66 | + async #loadDom() { |
| 67 | + const { data } = await axios.get(this.#url.toString()); |
| 68 | + $ = cheerio.load(data); |
51 | 69 | } |
52 | 70 |
|
53 | 71 | async #loadResources() { |
54 | 72 | const $links = $('link'); |
55 | 73 | const $images = $('img'); |
56 | 74 | const $scripts = $('script'); |
57 | 75 |
|
58 | | - const promises = [$links, $images, $scripts].flatMap(($elements) => |
59 | | - $elements.toArray().map((el) => this.#loadResource(el)), |
| 76 | + const results = await Promise.allSettled( |
| 77 | + [$links, $images, $scripts].flatMap(($elements) => |
| 78 | + $elements.toArray().reduce((promises, el) => { |
| 79 | + const resourceUrl = this.#getResourceUrl(el); |
| 80 | + |
| 81 | + if (!this.#isResourceLocal(resourceUrl)) { |
| 82 | + return promises; |
| 83 | + } |
| 84 | + |
| 85 | + return promises.concat( |
| 86 | + this.#loadResource(resourceUrl.href).then((resp) => ({ el, resp })), |
| 87 | + ); |
| 88 | + }, []), |
| 89 | + ), |
60 | 90 | ); |
61 | 91 |
|
62 | | - await Promise.allSettled(promises); |
| 92 | + await Promise.allSettled( |
| 93 | + results |
| 94 | + .reduce((acc, res) => (res.value ? acc.concat(res.value) : acc), []) |
| 95 | + .map(({ el, resp }) => { |
| 96 | + const { url } = resp.config; |
| 97 | + const extname = path.extname(url) || `.${mime.extension(resp.headers['content-type'])}`; |
| 98 | + const resourcePath = this.#getResourceFilePath(url, extname); |
| 99 | + console.log(resourcePath); |
| 100 | + |
| 101 | + this.#changeElementUrl(el, resourcePath); |
| 102 | + |
| 103 | + return this.#saveResource(resp.data, path.join(this.#outputDir, resourcePath)); |
| 104 | + }), |
| 105 | + ); |
63 | 106 | } |
64 | 107 |
|
65 | | - #getUrlAttr(element) { |
66 | | - return element.name === 'link' ? 'href' : 'src'; |
| 108 | + #getResourceUrl(element) { |
| 109 | + const urlAttr = this.#getUrlAttr(element); |
| 110 | + return new URL(element.attribs[urlAttr], this.#url.href); |
67 | 111 | } |
68 | 112 |
|
69 | 113 | #isResourceLocal(resourceUrl) { |
70 | 114 | return resourceUrl.origin === this.#url.origin; |
71 | 115 | } |
72 | 116 |
|
73 | | - async #loadResource(element) { |
74 | | - const urlAttr = this.#getUrlAttr(element); |
75 | | - const resourceUrl = new URL(element.attribs[urlAttr], this.#url.href); |
76 | | - |
77 | | - if (!this.#isResourceLocal(resourceUrl)) { |
78 | | - return Promise.resolve(); |
79 | | - } |
80 | | - |
81 | | - let resp; |
82 | | - |
| 117 | + async #loadResource(url) { |
83 | 118 | try { |
84 | | - resp = await axios.get(resourceUrl.href, { responseType: 'stream' }); |
| 119 | + const resp = await axios.get(url, { responseType: 'stream' }); |
| 120 | + return resp; |
85 | 121 | } catch (error) { |
86 | | - return Promise.reject(error); |
| 122 | + console.error(`Error downloading ${url}`, error); |
| 123 | + throw error; |
87 | 124 | } |
| 125 | + } |
88 | 126 |
|
89 | | - const resourcePath = this.#getResourceFilePath(resourceUrl.href); |
90 | | - $(element).attr(urlAttr, resourcePath); |
| 127 | + #changeElementUrl(element, newUrl) { |
| 128 | + const urlAttr = this.#getUrlAttr(element); |
| 129 | + $(element).attr(urlAttr, newUrl); |
| 130 | + } |
91 | 131 |
|
92 | | - const writer = createWriteStream(path.join(this.#destFolder, resourcePath)); |
93 | | - resp.data.pipe(writer); |
| 132 | + async #saveResource(data, path) { |
| 133 | + const writer = createWriteStream(path); |
| 134 | + data.pipe(writer); |
| 135 | + |
| 136 | + debug(`start writing to path: ${path}`); |
94 | 137 |
|
95 | 138 | return new Promise((resolve, reject) => { |
96 | | - writer.on('finish', resolve); |
97 | | - writer.on('error', reject); |
| 139 | + writer.on('finish', () => { |
| 140 | + debug(`finish writing to path: ${path}`); |
| 141 | + resolve(); |
| 142 | + }); |
| 143 | + writer.on('error', (error) => { |
| 144 | + debug(`error writing to path: ${path}\n`, error); |
| 145 | + reject(error); |
| 146 | + }); |
98 | 147 | }); |
99 | 148 | } |
100 | 149 |
|
101 | | - #getResourceFilePath(rawName) { |
102 | | - return path.join(this.#resourceDist, this.#generateFileName(rawName, { saveExt: true })); |
| 150 | + #getUrlAttr(element) { |
| 151 | + return element.name === 'link' ? 'href' : 'src'; |
103 | 152 | } |
104 | 153 |
|
105 | | - #generateFileName(string, { saveExt = false } = {}) { |
106 | | - const dotIndex = string.lastIndexOf('.'); |
107 | | - const ext = string.slice(dotIndex); |
108 | | - |
109 | | - let filename = string.trim(); |
| 154 | + #getResourceFilePath(rawName, extname) { |
| 155 | + return path.join(this.#resourceDir, this.#generateFileName(rawName, extname)); |
| 156 | + } |
110 | 157 |
|
111 | | - if (saveExt) { |
112 | | - filename = filename.slice(0, dotIndex); |
113 | | - } |
| 158 | + #generateFileName(string, extname = null) { |
| 159 | + const regex = new RegExp(`${extname ? `(?!\\${extname})` : ''}[^a-z0-9]`, 'gi'); |
114 | 160 |
|
115 | | - filename = filename.replace(/^https?:\/\//, '').replace(/[^a-z0-9]/gi, '-'); |
| 161 | + let filename = string |
| 162 | + .trim() |
| 163 | + .replace(/^https?:\/\//, '') |
| 164 | + .replace(regex, '-'); |
116 | 165 |
|
117 | | - if (saveExt) { |
118 | | - filename = filename.concat(ext); |
| 166 | + if (extname && !path.extname(filename)) { |
| 167 | + filename = filename.concat(extname); |
119 | 168 | } |
120 | 169 |
|
121 | 170 | return filename; |
|
0 commit comments