|
| 1 | +import setNamespace from 'debug'; |
1 | 2 | import axios from 'axios';
|
2 | 3 | import path from 'path';
|
3 | 4 | import fs from 'fs/promises';
|
4 | 5 | import process from 'process';
|
5 | 6 | import * as cheerio from 'cheerio';
|
6 | 7 | import fsc from 'fs-cheerio';
|
| 8 | +import mime from 'mime-types'; |
7 | 9 | import { createWriteStream } from 'fs';
|
8 | 10 |
|
| 11 | +const debug = setNamespace('page-loader'); |
| 12 | + |
9 | 13 | let $;
|
10 | 14 |
|
11 | 15 | export class PageLoader {
|
12 | 16 | #url;
|
13 |
| - #destFolder; |
14 |
| - #resourceDist; |
| 17 | + #outputDir; |
| 18 | + #resourceDir; |
15 | 19 |
|
16 |
| - constructor(urlString, destFolder = process.cwd()) { |
| 20 | + constructor(urlString, outputDir = process.cwd()) { |
17 | 21 | this.#url = new URL(urlString);
|
18 |
| - this.#destFolder = destFolder; |
19 |
| - this.#resourceDist = `${this.#generateFileName(this.#url.href)}_files`; |
| 22 | + this.#outputDir = this.#normalizeDirPath(outputDir); |
| 23 | + this.#resourceDir = `${this.#generateFileName(this.#url.href)}_files`; |
20 | 24 | }
|
21 | 25 |
|
22 | 26 | async load() {
|
23 |
| - const { filepath, html } = await this.#loadHtml(); |
24 |
| - $ = cheerio.load(html); |
25 |
| - |
| 27 | + await this.#loadDom(); |
| 28 | + await this.#ensureDirExists(this.#outputDir); |
26 | 29 | await this.#createResourceDir();
|
27 | 30 | await this.#loadResources();
|
28 | 31 |
|
29 |
| - await fsc.writeFile(filepath, $); |
| 32 | + const filepath = await this.#saveHtml(); |
30 | 33 |
|
31 | 34 | return { filepath };
|
32 | 35 | }
|
33 | 36 |
|
34 |
| - async #createResourceDir() { |
| 37 | + async #saveHtml() { |
| 38 | + const htmlFilename = this.#generateFileName(this.#url.href) + '.html'; |
| 39 | + const filepath = path.join(this.#outputDir, htmlFilename); |
| 40 | + |
| 41 | + await fsc.writeFile(filepath, $); |
| 42 | + |
| 43 | + return filepath; |
| 44 | + } |
| 45 | + |
| 46 | + async #ensureDirExists(dirPath) { |
35 | 47 | try {
|
36 |
| - await fs.mkdir(path.join(this.#destFolder, this.#resourceDist)); |
| 48 | + await fs.access(dirPath); |
37 | 49 | } catch (error) {
|
38 |
| - console.log(error); |
| 50 | + if (error.code === 'ENOENT') { |
| 51 | + await fs.mkdir(dirPath, { recursive: true }); |
| 52 | + } else { |
| 53 | + throw error; |
| 54 | + } |
39 | 55 | }
|
40 | 56 | }
|
41 | 57 |
|
42 |
| - async #loadHtml() { |
43 |
| - const htmlFilename = this.#generateFileName(this.#url.href) + '.html'; |
44 |
| - const filepath = path.join(this.#destFolder, htmlFilename); |
45 |
| - |
46 |
| - const { data: html } = await axios.get(this.#url.toString()); |
| 58 | + #normalizeDirPath(pathToFolder) { |
| 59 | + return path.resolve(process.cwd(), pathToFolder); |
| 60 | + } |
47 | 61 |
|
48 |
| - await fs.writeFile(filepath, html); |
| 62 | + async #createResourceDir() { |
| 63 | + await fs.mkdir(path.join(this.#outputDir, this.#resourceDir), { recursive: true }); |
| 64 | + } |
49 | 65 |
|
50 |
| - return { filepath, html }; |
| 66 | + async #loadDom() { |
| 67 | + const { data } = await axios.get(this.#url.toString()); |
| 68 | + $ = cheerio.load(data); |
51 | 69 | }
|
52 | 70 |
|
53 | 71 | async #loadResources() {
|
54 | 72 | const $links = $('link');
|
55 | 73 | const $images = $('img');
|
56 | 74 | const $scripts = $('script');
|
57 | 75 |
|
58 |
| - const promises = [$links, $images, $scripts].flatMap(($elements) => |
59 |
| - $elements.toArray().map((el) => this.#loadResource(el)), |
| 76 | + const results = await Promise.allSettled( |
| 77 | + [$links, $images, $scripts].flatMap(($elements) => |
| 78 | + $elements.toArray().reduce((promises, el) => { |
| 79 | + const resourceUrl = this.#getResourceUrl(el); |
| 80 | + |
| 81 | + if (!this.#isResourceLocal(resourceUrl)) { |
| 82 | + return promises; |
| 83 | + } |
| 84 | + |
| 85 | + return promises.concat( |
| 86 | + this.#loadResource(resourceUrl.href).then((resp) => ({ el, resp })), |
| 87 | + ); |
| 88 | + }, []), |
| 89 | + ), |
60 | 90 | );
|
61 | 91 |
|
62 |
| - await Promise.allSettled(promises); |
| 92 | + await Promise.allSettled( |
| 93 | + results |
| 94 | + .reduce((acc, res) => (res.value ? acc.concat(res.value) : acc), []) |
| 95 | + .map(({ el, resp }) => { |
| 96 | + const { url } = resp.config; |
| 97 | + const extname = path.extname(url) || `.${mime.extension(resp.headers['content-type'])}`; |
| 98 | + const resourcePath = this.#getResourceFilePath(url, extname); |
| 99 | + console.log(resourcePath); |
| 100 | + |
| 101 | + this.#changeElementUrl(el, resourcePath); |
| 102 | + |
| 103 | + return this.#saveResource(resp.data, path.join(this.#outputDir, resourcePath)); |
| 104 | + }), |
| 105 | + ); |
63 | 106 | }
|
64 | 107 |
|
65 |
| - #getUrlAttr(element) { |
66 |
| - return element.name === 'link' ? 'href' : 'src'; |
| 108 | + #getResourceUrl(element) { |
| 109 | + const urlAttr = this.#getUrlAttr(element); |
| 110 | + return new URL(element.attribs[urlAttr], this.#url.href); |
67 | 111 | }
|
68 | 112 |
|
69 | 113 | #isResourceLocal(resourceUrl) {
|
70 | 114 | return resourceUrl.origin === this.#url.origin;
|
71 | 115 | }
|
72 | 116 |
|
73 |
| - async #loadResource(element) { |
74 |
| - const urlAttr = this.#getUrlAttr(element); |
75 |
| - const resourceUrl = new URL(element.attribs[urlAttr], this.#url.href); |
76 |
| - |
77 |
| - if (!this.#isResourceLocal(resourceUrl)) { |
78 |
| - return Promise.resolve(); |
79 |
| - } |
80 |
| - |
81 |
| - let resp; |
82 |
| - |
| 117 | + async #loadResource(url) { |
83 | 118 | try {
|
84 |
| - resp = await axios.get(resourceUrl.href, { responseType: 'stream' }); |
| 119 | + const resp = await axios.get(url, { responseType: 'stream' }); |
| 120 | + return resp; |
85 | 121 | } catch (error) {
|
86 |
| - return Promise.reject(error); |
| 122 | + console.error(`Error downloading ${url}`, error); |
| 123 | + throw error; |
87 | 124 | }
|
| 125 | + } |
88 | 126 |
|
89 |
| - const resourcePath = this.#getResourceFilePath(resourceUrl.href); |
90 |
| - $(element).attr(urlAttr, resourcePath); |
| 127 | + #changeElementUrl(element, newUrl) { |
| 128 | + const urlAttr = this.#getUrlAttr(element); |
| 129 | + $(element).attr(urlAttr, newUrl); |
| 130 | + } |
91 | 131 |
|
92 |
| - const writer = createWriteStream(path.join(this.#destFolder, resourcePath)); |
93 |
| - resp.data.pipe(writer); |
| 132 | + async #saveResource(data, path) { |
| 133 | + const writer = createWriteStream(path); |
| 134 | + data.pipe(writer); |
| 135 | + |
| 136 | + debug(`start writing to path: ${path}`); |
94 | 137 |
|
95 | 138 | return new Promise((resolve, reject) => {
|
96 |
| - writer.on('finish', resolve); |
97 |
| - writer.on('error', reject); |
| 139 | + writer.on('finish', () => { |
| 140 | + debug(`finish writing to path: ${path}`); |
| 141 | + resolve(); |
| 142 | + }); |
| 143 | + writer.on('error', (error) => { |
| 144 | + debug(`error writing to path: ${path}\n`, error); |
| 145 | + reject(error); |
| 146 | + }); |
98 | 147 | });
|
99 | 148 | }
|
100 | 149 |
|
101 |
| - #getResourceFilePath(rawName) { |
102 |
| - return path.join(this.#resourceDist, this.#generateFileName(rawName, { saveExt: true })); |
| 150 | + #getUrlAttr(element) { |
| 151 | + return element.name === 'link' ? 'href' : 'src'; |
103 | 152 | }
|
104 | 153 |
|
105 |
| - #generateFileName(string, { saveExt = false } = {}) { |
106 |
| - const dotIndex = string.lastIndexOf('.'); |
107 |
| - const ext = string.slice(dotIndex); |
108 |
| - |
109 |
| - let filename = string.trim(); |
| 154 | + #getResourceFilePath(rawName, extname) { |
| 155 | + return path.join(this.#resourceDir, this.#generateFileName(rawName, extname)); |
| 156 | + } |
110 | 157 |
|
111 |
| - if (saveExt) { |
112 |
| - filename = filename.slice(0, dotIndex); |
113 |
| - } |
| 158 | + #generateFileName(string, extname = null) { |
| 159 | + const regex = new RegExp(`${extname ? `(?!\\${extname})` : ''}[^a-z0-9]`, 'gi'); |
114 | 160 |
|
115 |
| - filename = filename.replace(/^https?:\/\//, '').replace(/[^a-z0-9]/gi, '-'); |
| 161 | + let filename = string |
| 162 | + .trim() |
| 163 | + .replace(/^https?:\/\//, '') |
| 164 | + .replace(regex, '-'); |
116 | 165 |
|
117 |
| - if (saveExt) { |
118 |
| - filename = filename.concat(ext); |
| 166 | + if (extname && !path.extname(filename)) { |
| 167 | + filename = filename.concat(extname); |
119 | 168 | }
|
120 | 169 |
|
121 | 170 | return filename;
|
|
0 commit comments