Skip to content

Commit a26a8b9

Browse files
committed
fix: make downloading canonical html files
1 parent b9c818e commit a26a8b9

File tree

4 files changed

+124
-63
lines changed

4 files changed

+124
-63
lines changed

__tests__/page-loader.test.js

+12-3
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ describe('downloads html', () => {
3333
beforeEach(async () => {
3434
nock(/ru\.hexlet\.io/)
3535
.get(/\/courses/)
36-
.reply(200, html);
36+
.reply(200, html, { 'Content-Type': 'text/html' });
3737
});
3838

3939
test('to the specified folder', async () => {
@@ -79,8 +79,9 @@ describe('downloads local resources', () => {
7979

8080
beforeEach(async () => {
8181
nock(/ru\.hexlet\.io/)
82+
.persist()
8283
.get(/\/courses/)
83-
.reply(200, htmlWithResources);
84+
.reply(200, htmlWithResources, { 'Content-type': 'text/html' });
8485

8586
nock(/ru\.hexlet\.io/)
8687
.get(/\/assets\/professions\/nodejs.png/)
@@ -99,7 +100,7 @@ describe('downloads local resources', () => {
99100
nock(/ru\.hexlet\.io/)
100101
.get(/\/packs\/js\/runtime.js/)
101102
.reply(200, script, {
102-
'Content-Type': 'text/javascript',
103+
'Content-Type': 'application/javascript',
103104
'Content-Length': (_, __, body) => body.length,
104105
});
105106

@@ -127,9 +128,17 @@ describe('downloads local resources', () => {
127128
'ru-hexlet-io-assets-application.css',
128129
);
129130

131+
const canonHtmlPath = path.join(
132+
tmpFolder,
133+
'ru-hexlet-io-courses_files',
134+
'ru-hexlet-io-courses.html',
135+
);
136+
130137
const actualCss = await fs.readFile(cssPath, 'utf-8');
138+
const actualCanonHtml = await fs.readFile(canonHtmlPath, 'utf-8');
131139

132140
expect(css).toBe(actualCss);
141+
expect(htmlWithResources).toBe(actualCanonHtml);
133142
});
134143

135144
test('scripts', async () => {

package-lock.json

+3-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+5-3
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,11 @@
77
"scripts": {
88
"test": "jest --runInBand",
99
"test:log-nock": "DEBUG=nock.* jest --runInBand",
10-
"test:log-axios": "DEBUG=axios jest --runInBand"
10+
"test:log-axios": "DEBUG=axios jest --runInBand",
11+
"test:log": "DEBUG=page-loader jest --runInBand"
1112
},
1213
"bin": {
13-
"page-loader": "npx babel-node cli"
14+
"page-loader": "babel-node cli.js"
1415
},
1516
"keywords": [],
1617
"author": "",
@@ -32,6 +33,7 @@
3233
"cheerio": "^1.0.0-rc.12",
3334
"commander": "^11.0.0",
3435
"debug": "^4.3.4",
35-
"fs-cheerio": "^3.0.0"
36+
"fs-cheerio": "^3.0.0",
37+
"mime-types": "^2.1.35"
3638
}
3739
}

page-loader.js

+104-55
Original file line number | Diff line number | Diff line change
@@ -1,121 +1,170 @@
1+
import setNamespace from 'debug';
12
import axios from 'axios';
23
import path from 'path';
34
import fs from 'fs/promises';
45
import process from 'process';
56
import * as cheerio from 'cheerio';
67
import fsc from 'fs-cheerio';
8+
import mime from 'mime-types';
79
import { createWriteStream } from 'fs';
810

11+
const debug = setNamespace('page-loader');
12+
913
let $;
1014

1115
export class PageLoader {
1216
#url;
13-
#destFolder;
14-
#resourceDist;
17+
#outputDir;
18+
#resourceDir;
1519

16-
constructor(urlString, destFolder = process.cwd()) {
20+
constructor(urlString, outputDir = process.cwd()) {
1721
this.#url = new URL(urlString);
18-
this.#destFolder = destFolder;
19-
this.#resourceDist = `${this.#generateFileName(this.#url.href)}_files`;
22+
this.#outputDir = this.#normalizeDirPath(outputDir);
23+
this.#resourceDir = `${this.#generateFileName(this.#url.href)}_files`;
2024
}
2125

2226
async load() {
23-
const { filepath, html } = await this.#loadHtml();
24-
$ = cheerio.load(html);
25-
27+
await this.#loadDom();
28+
await this.#ensureDirExists(this.#outputDir);
2629
await this.#createResourceDir();
2730
await this.#loadResources();
2831

29-
await fsc.writeFile(filepath, $);
32+
const filepath = await this.#saveHtml();
3033

3134
return { filepath };
3235
}
3336

34-
async #createResourceDir() {
37+
async #saveHtml() {
38+
const htmlFilename = this.#generateFileName(this.#url.href) + '.html';
39+
const filepath = path.join(this.#outputDir, htmlFilename);
40+
41+
await fsc.writeFile(filepath, $);
42+
43+
return filepath;
44+
}
45+
46+
async #ensureDirExists(dirPath) {
3547
try {
36-
await fs.mkdir(path.join(this.#destFolder, this.#resourceDist));
48+
await fs.access(dirPath);
3749
} catch (error) {
38-
console.log(error);
50+
if (error.code === 'ENOENT') {
51+
await fs.mkdir(dirPath, { recursive: true });
52+
} else {
53+
throw error;
54+
}
3955
}
4056
}
4157

42-
async #loadHtml() {
43-
const htmlFilename = this.#generateFileName(this.#url.href) + '.html';
44-
const filepath = path.join(this.#destFolder, htmlFilename);
45-
46-
const { data: html } = await axios.get(this.#url.toString());
58+
#normalizeDirPath(pathToFolder) {
59+
return path.resolve(process.cwd(), pathToFolder);
60+
}
4761

48-
await fs.writeFile(filepath, html);
62+
async #createResourceDir() {
63+
await fs.mkdir(path.join(this.#outputDir, this.#resourceDir), { recursive: true });
64+
}
4965

50-
return { filepath, html };
66+
async #loadDom() {
67+
const { data } = await axios.get(this.#url.toString());
68+
$ = cheerio.load(data);
5169
}
5270

5371
async #loadResources() {
5472
const $links = $('link');
5573
const $images = $('img');
5674
const $scripts = $('script');
5775

58-
const promises = [$links, $images, $scripts].flatMap(($elements) =>
59-
$elements.toArray().map((el) => this.#loadResource(el)),
76+
const results = await Promise.allSettled(
77+
[$links, $images, $scripts].flatMap(($elements) =>
78+
$elements.toArray().reduce((promises, el) => {
79+
const resourceUrl = this.#getResourceUrl(el);
80+
81+
if (!this.#isResourceLocal(resourceUrl)) {
82+
return promises;
83+
}
84+
85+
return promises.concat(
86+
this.#loadResource(resourceUrl.href).then((resp) => ({ el, resp })),
87+
);
88+
}, []),
89+
),
6090
);
6191

62-
await Promise.allSettled(promises);
92+
await Promise.allSettled(
93+
results
94+
.reduce((acc, res) => (res.value ? acc.concat(res.value) : acc), [])
95+
.map(({ el, resp }) => {
96+
const { url } = resp.config;
97+
const extname = path.extname(url) || `.${mime.extension(resp.headers['content-type'])}`;
98+
const resourcePath = this.#getResourceFilePath(url, extname);
99+
console.log(resourcePath);
100+
101+
this.#changeElementUrl(el, resourcePath);
102+
103+
return this.#saveResource(resp.data, path.join(this.#outputDir, resourcePath));
104+
}),
105+
);
63106
}
64107

65-
#getUrlAttr(element) {
66-
return element.name === 'link' ? 'href' : 'src';
108+
#getResourceUrl(element) {
109+
const urlAttr = this.#getUrlAttr(element);
110+
return new URL(element.attribs[urlAttr], this.#url.href);
67111
}
68112

69113
#isResourceLocal(resourceUrl) {
70114
return resourceUrl.origin === this.#url.origin;
71115
}
72116

73-
async #loadResource(element) {
74-
const urlAttr = this.#getUrlAttr(element);
75-
const resourceUrl = new URL(element.attribs[urlAttr], this.#url.href);
76-
77-
if (!this.#isResourceLocal(resourceUrl)) {
78-
return Promise.resolve();
79-
}
80-
81-
let resp;
82-
117+
async #loadResource(url) {
83118
try {
84-
resp = await axios.get(resourceUrl.href, { responseType: 'stream' });
119+
const resp = await axios.get(url, { responseType: 'stream' });
120+
return resp;
85121
} catch (error) {
86-
return Promise.reject(error);
122+
console.error(`Error downloading ${url}`, error);
123+
throw error;
87124
}
125+
}
88126

89-
const resourcePath = this.#getResourceFilePath(resourceUrl.href);
90-
$(element).attr(urlAttr, resourcePath);
127+
#changeElementUrl(element, newUrl) {
128+
const urlAttr = this.#getUrlAttr(element);
129+
$(element).attr(urlAttr, newUrl);
130+
}
91131

92-
const writer = createWriteStream(path.join(this.#destFolder, resourcePath));
93-
resp.data.pipe(writer);
132+
async #saveResource(data, path) {
133+
const writer = createWriteStream(path);
134+
data.pipe(writer);
135+
136+
debug(`start writing to path: ${path}`);
94137

95138
return new Promise((resolve, reject) => {
96-
writer.on('finish', resolve);
97-
writer.on('error', reject);
139+
writer.on('finish', () => {
140+
debug(`finish writing to path: ${path}`);
141+
resolve();
142+
});
143+
writer.on('error', (error) => {
144+
debug(`error writing to path: ${path}\n`, error);
145+
reject(error);
146+
});
98147
});
99148
}
100149

101-
#getResourceFilePath(rawName) {
102-
return path.join(this.#resourceDist, this.#generateFileName(rawName, { saveExt: true }));
150+
#getUrlAttr(element) {
151+
return element.name === 'link' ? 'href' : 'src';
103152
}
104153

105-
#generateFileName(string, { saveExt = false } = {}) {
106-
const dotIndex = string.lastIndexOf('.');
107-
const ext = string.slice(dotIndex);
108-
109-
let filename = string.trim();
154+
#getResourceFilePath(rawName, extname) {
155+
return path.join(this.#resourceDir, this.#generateFileName(rawName, extname));
156+
}
110157

111-
if (saveExt) {
112-
filename = filename.slice(0, dotIndex);
113-
}
158+
#generateFileName(string, extname = null) {
159+
const regex = new RegExp(`${extname ? `(?!\\${extname})` : ''}[^a-z0-9]`, 'gi');
114160

115-
filename = filename.replace(/^https?:\/\//, '').replace(/[^a-z0-9]/gi, '-');
161+
let filename = string
162+
.trim()
163+
.replace(/^https?:\/\//, '')
164+
.replace(regex, '-');
116165

117-
if (saveExt) {
118-
filename = filename.concat(ext);
166+
if (extname && !path.extname(filename)) {
167+
filename = filename.concat(extname);
119168
}
120169

121170
return filename;

0 commit comments

Comments
 (0)