// app.js
'use strict';
const cheerio = require('cheerio');
const axios = require('axios');
const fastq = require('fastq');
const {writeFileSync, mkdirSync} = require('fs');
const {join} = require('path');
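// Broken-link crawler: starting from https://<domain>/, it follows same-domain
// links and records every URL that answers with a 4xx or 5xx status.
// Usage: node app.js example.com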
const domain = process.argv[2];
if (!domain) {
  console.error('Usage: node app.js <domain>');
  process.exit(1);
}
// Escape every dot so the domain matches literally inside the regex
const domainReg = new RegExp(`^https?://${domain.replace(/\./g, '\\.')}`);
const results = {
  '4xx': [],
  '5xx': []
};
let crawled = 0;
const known = ['', '#'];
let lastEmptyTime = 0;
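// Each recorded entry is {url, referrer, status}; the final report therefore
// looks like (illustrative): {"4xx": [{"url": "…", "referrer": "…", "status": 404}], "404": […], "5xx": []}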
/**
 * Adds a URL to the crawling queue if it is new, on-domain, and crawlable
 * @param referrer - page the URL was found on
 * @param url - URL to queue
 */
const addURLToQueue = (referrer, url = '') => {
  url = url.trim();
  // Resolve root-relative URLs ("/path") against the target domain;
  // page-relative URLs ("foo.html") fail the domain check below and are skipped
  if (url.match(/^\//)) {
    url = url.replace(/^\//, `https://${domain}/`);
  }
  if (!known.includes(url) && url.match(domainReg) !== null && url.match(/^javascript/) === null) {
    known.push(url);
    crawlQueue.push({referrer, url});
  }
};
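// Note: addURLToQueue references crawlQueue, declared further down; this is
// safe because the function only runs once the queues exist.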
/**
 * Extracts URLs from a crawled page and queues them
 * @param data - {url, referrer, content} of the crawled page
 * @returns {Promise<void>}
 */
const extractURLs = async (data) => {
  const $ = cheerio.load(data.content);
  $('a[href], link[href]').each((i, el) => {
    addURLToQueue(data.url, el.attribs.href);
  });
  $('img[src], script[src]').each((i, el) => {
    addURLToQueue(data.url, el.attribs.src);
  });
};
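// Only href/src attributes of <a>, <link>, <img> and <script> are followed;
// URLs referenced from CSS or built dynamically by JavaScript are not crawled.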
/**
 * Stores a result under both its status class (e.g. '4xx') and its exact
 * status code (e.g. '404')
 * @param referrer
 * @param url
 * @param response
 */
const storeResult = (referrer, url, response) => {
  const statusType = Math.floor(response.status / 100) + 'xx';
  const result = {
    url,
    referrer,
    status: response.status
  };
  results[statusType] = results[statusType] || [];
  results[response.status] = results[response.status] || [];
  results[statusType].push(result);
  results[response.status].push(result);
};
/**
 * Crawls a URL and hands the response body to the extraction queue
 * @param data - {referrer, url}
 * @returns {Promise<AxiosResponse<any>>}
 */
const crawl = (data) => {
  return axios.get(data.url)
    .catch(error => {
      if (error.response) {
        // The server answered with a non-2xx status: record it
        storeResult(data.referrer, data.url, error.response);
      } else if (error.request) {
        // The request was sent but no response arrived (timeout, DNS, ...)
        console.log(data.url, 'error.request', error.request);
      } else {
        console.log(data.url, 'error.message', error.message);
        throw error;
      }
    })
    .then(response => {
      if (response) {
        return extractURLsQueue.push({url: data.url, referrer: data.referrer, content: response.data});
      }
    })
    .catch(error => {
      console.log('ERROR ON URL', data.url);
      throw error;
    })
    .then(() => {
      crawled++;
      // Progress report
      console.log(crawled, 'URLs crawled so far.', crawlQueue.length(), 'URLs to be crawled.', `[4xx] ${results['4xx'].length}`, `[5xx] ${results['5xx'].length}`);
    });
};
/**
 * Stores the results in a JSON report file and exits the process
 */
const storeAndExit = () => {
  // Make sure the reports directory exists before writing
  mkdirSync(join(__dirname, 'reports'), {recursive: true});
  const fileName = join(__dirname, 'reports', domain + '.json');
  writeFileSync(fileName, JSON.stringify(results));
  console.log('Data stored at', fileName);
  process.exit(0);
};
/**
 * Checks every 5 seconds whether the crawl has finished: the process ends
 * only once both queues have been empty on two consecutive checks
 */
const endProcess = () => {
  if (lastEmptyTime === 0) {
    if (crawlQueue.length() === 0 && extractURLsQueue.length() === 0 && crawled > 0) {
      console.log('About to end...');
      lastEmptyTime = Date.now();
    }
    setTimeout(endProcess, 5000);
  } else {
    if (crawlQueue.length() === 0 && extractURLsQueue.length() === 0) {
      console.log('Work done: saving data...');
      storeAndExit();
    } else {
      console.log('No end for this time...');
      lastEmptyTime = 0;
      setTimeout(endProcess, 5000);
    }
  }
};
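// fastq.promise(worker, concurrency) runs at most `concurrency` tasks at a
// time; queue.push(task) returns a promise that settles once the worker has
// processed that task.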
// Create the queues
const crawlQueue = fastq.promise(crawl, 4);
const extractURLsQueue = fastq.promise(extractURLs, 15);
// Handle manual process end (Ctrl+C)
process.on('SIGINT', () => {
  console.log('Manual interruption: saving data...');
  crawlQueue.kill();
  extractURLsQueue.kill();
  storeAndExit();
});
// Push the first URL to crawl to start the process
crawlQueue.push({referrer: '', url: 'https://' + domain + '/'});
// Launch the "has the process ended yet" mechanism
endProcess();