-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathindex.js
88 lines (79 loc) · 2.74 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
const puppeteer = require("puppeteer");
const fs = require("fs");
const path = require("path");
/**
 * Configuration settings for the web scraper.
 * @typedef {Object} Config
 * @property {string} baseUrl - The base URL used for creating absolute URLs.
 * @property {boolean} removeJS - Whether to remove JavaScript code from the scraped HTML.
 * @property {boolean} addBaseURL - Whether to add a base URL to the head of the HTML.
 * @property {string} cacheFolder - The folder for caching scraped HTML content.
 */
/**
 * Configuration object with settings.
 * Frozen so shared config can't be mutated at runtime by accident.
 * @type {Config}
 */
const CONFIG = Object.freeze({
  baseUrl: "https://example.com",
  removeJS: true,
  addBaseURL: true,
  cacheFolder: "path_to_cache_folder", // NOTE(review): looks like a placeholder — confirm real cache path
});
/**
 * Ensure the cache directory hierarchy exists for a scraped URL.
 *
 * The protocol prefix (e.g. "https://") is stripped the same way `scrap`
 * strips it when building the file-write path, so the directories created
 * here always line up with the destination of the cached `index.html`.
 *
 * @param {string} directory - The URL whose cache folders should be created.
 */
const createFolders = (directory) => {
  // Same protocol-strip regex as `scrap` uses for its cache path, keeping
  // both sides of the cache layout consistent.
  const relativePath = directory.replace(/(^\w+:|^)\/\//, "");
  // Recursive mkdir replaces the old segment-by-segment loop, which split on
  // path.sep — "\\" on Windows — and therefore never matched the "/"
  // separators in a URL; it also no-ops when the directories already exist.
  fs.mkdirSync(path.join(CONFIG.cacheFolder, relativePath), {
    recursive: true,
  });
};
/**
 * Main scraping function.
 *
 * Navigates a headless browser to `pathUrl`, snapshots the rendered HTML,
 * optionally strips <script> elements and injects a <base> tag (per CONFIG),
 * then writes the result to `<cacheFolder>/<url-without-protocol>/index.html`.
 *
 * Errors are logged and swallowed (never rethrown), so a failed scrape does
 * not crash the caller — matching the original best-effort contract.
 *
 * @param {string} pathUrl - The URL to scrape.
 */
const scrap = async (pathUrl) => {
  // Declared outside the try so the finally block can always reach it.
  let browser;
  try {
    // Launch Puppeteer browser
    browser = await puppeteer.launch({
      headless: "new",
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    // Create a new page in the browser
    const page = await browser.newPage();
    // networkidle2: resolve once no more than 2 network requests are in flight.
    await page.goto(pathUrl, { waitUntil: "networkidle2" });
    // Get the outer HTML of the entire document
    let html = await page.evaluate(() => document.documentElement.outerHTML);
    // Remove JavaScript code from the HTML if configured to do so
    if (CONFIG.removeJS) {
      html = html.replace(
        /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
        "",
      );
    }
    // Add base URL to the head if configured to do so
    if (CONFIG.addBaseURL) {
      html = html.replace(/<head>/gi, `<head><base href="${CONFIG.baseUrl}">`);
    }
    // Create necessary folders for caching based on the URL
    createFolders(pathUrl);
    // Cache key: the URL minus its protocol. Renamed from `path` to stop
    // shadowing the required `path` module inside this function.
    const cachePath = pathUrl.replace(/(^\w+:|^)\/\//, "");
    // Write the HTML content to a file in the cache folder
    fs.writeFileSync(`${CONFIG.cacheFolder}/${cachePath}/index.html`, html);
  } catch (error) {
    // Log any errors that occur during the scraping process
    console.error(error);
  } finally {
    // Always release the browser process — the original closed it only on
    // success, leaking a Chromium instance whenever scraping failed.
    if (browser) {
      await browser.close();
    }
  }
};
// Public API: expose the scraper entry point to consumers of this module.
module.exports.scrap = scrap;