-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparseFiles.js
105 lines (87 loc) · 3.04 KB
/
parseFiles.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// Parse the files, check whether Heaps' law holds, and count the size of the vocabulary.
// Count the total number of terms and report how fast the vocabulary size grows.
var fs = require('fs');
var path = require('path');
//Regex for the files, tokenizing , get words
/**
 * Tokenizes the <body> of an HTML document and counts how often each word occurs.
 *
 * Markup tags are replaced by spaces, the punctuation characters
 * • : ^ " · . - | , [ ] ( ) are deleted, and what is left is split on
 * whitespace. Words are case-sensitive ("Hello" and "hello" are distinct).
 *
 * @param {string} text - Raw contents of an HTML file.
 * @returns {Object<string, number>} Word -> number of occurrences. Empty when
 *   the text contains no <body>...</body> element or the body is empty.
 */
function countTextWords (text) {
    // Flatten every whitespace character to a plain space first, so that `.`
    // in the body pattern can span what used to be line breaks.
    var flattened = text.replace(/\s/g, ' ');
    // `.*?` (not `.+?`) so that an empty <body></body> still matches.
    var bodyMatch = /<body[^>]*>(.*?)<\/body>/i.exec(flattened);
    if (bodyMatch === null) {
        // Not an HTML document (or no body): it contributes no words.
        return {};
    }
    return bodyMatch[1]
        .replace(/(<([^>]+)>)/ig, " ")
        .replace(/[•:\^"·.\-|,[\]()]/g,"")
        .split(/\s/)
        .filter(function (word) {
            return word.length; // drop the empty strings left by consecutive spaces
        })
        .reduce(function (counts, word) {
            // `| 0` turns both `undefined` (first occurrence) and inherited
            // Object.prototype members (e.g. the word "constructor") into 0.
            counts[word] = (counts[word] | 0) + 1;
            return counts;
        }, {});
}
// Get all directories/files
/**
 * Synchronously walks a directory tree and reports every regular file in it.
 *
 * Entries are visited in the order returned by fs.readdirSync; directories
 * are descended into recursively, and entries that are neither a file nor a
 * directory are ignored.
 *
 * @param {string} currentDirPath - Directory to start from.
 * @param {function(string, fs.Stats)} callback - Invoked with the path and
 *   the fs.statSync result of each regular file found.
 */
function walkSync(currentDirPath, callback) {
    var entries = fs.readdirSync(currentDirPath);
    for (var idx = 0; idx < entries.length; idx++) {
        var entryPath = path.join(currentDirPath, entries[idx]);
        var info = fs.statSync(entryPath);
        if (info.isDirectory()) {
            walkSync(entryPath, callback);
        } else if (info.isFile()) {
            callback(entryPath, info);
        }
    }
}
// Corpus statistics for every file under `directory`: running vocabulary
// growth reports, a rank/frequency table, and the Heaps' law parameter k.
var directory = "en1"; // folder/directory given
var directoryStats = {}; // file path -> { word: occurrences in that file }
var numTerms = 0; // total number of tokens (word occurrences) merged so far
// Vocabulary size at the previous progress report. It is kept across files so
// each report measures growth since the last one; starts at 1 to avoid a
// division by zero on the first report.
var previousVocSize = 1;

// Count the words of every file found under the directory.
walkSync(directory, function (filePath) {
    directoryStats[filePath] = countTextWords(fs.readFileSync(filePath, 'utf8'));
});

// Merge the per-file counts into one overall word -> count table, printing a
// progress report after every 100th distinct word of each file.
var overallResults = {};
Object.keys(directoryStats).forEach(function (crntPath) {
    var crntFileStats = directoryStats[crntPath];
    Object.keys(crntFileStats).forEach(function (crntWord, wordIndex) {
        // `| 0` turns both `undefined` (first occurrence) and inherited
        // Object.prototype members (e.g. the word "constructor") into 0.
        overallResults[crntWord] = (overallResults[crntWord] | 0) + crntFileStats[crntWord];
        // A token is one occurrence of a word, so add the word's full count.
        numTerms += crntFileStats[crntWord];
        if ((wordIndex + 1) % 100 === 0) {
            var sizeOfVoc = Object.keys(overallResults).length;
            var freqIncr = ((sizeOfVoc - previousVocSize) / previousVocSize) * 100;
            console.log("Size of vocabulary: " + sizeOfVoc + " || Number of tokens: " + numTerms + " || Increase Freq of Voc: " + freqIncr + "%");
            console.log("Log10 of vocabulary: " + Math.log10(sizeOfVoc) + " || Log10 of the number of tokens: " + Math.log10(numTerms));
            previousVocSize = sizeOfVoc;
        }
    });
});

//FINAL PRINTS
console.log("Total number of terms is: " + numTerms);
var indTerms = Object.keys(overallResults).length; // vocabulary size (distinct terms)
console.log("Individualy: " + indTerms);

// One row per distinct term, most frequent first.
// NOTE(review): `freq` divides by the vocabulary size (indTerms), not by the
// token total (numTerms) - confirm this denominator is the intended one.
var table = Object.keys(overallResults).map(function (key) {
    return {
        key: key,
        count: overallResults[key],
        freq: overallResults[key] / indTerms,
    };
}).sort(function (row1, row2) {
    return row2.count - row1.count;
});

// Per-row constant = freq * rank (rank is 1-based); accumulated for averaging below.
var overallconstant = 0;
for (var i = 0; i < table.length; i++) {
    table[i].const = table[i].freq * (i + 1);
    overallconstant += table[i].const;
}

//FINAL PRINTS
// Heaps' law: M = k * T^b with b fixed at 0.49; solve for k from the measured
// vocabulary size M (indTerms) and token count T (numTerms).
var b = 0.49;
var k = indTerms / (Math.pow(numTerms, b));
console.log("Size of Voc: " + indTerms + " Number of tokens: " + numTerms + " Parameter b: " + b + " Parameter k: " + k);
overallconstant = overallconstant / indTerms; // mean of the per-row constants
console.log("Overall constant is: " + overallconstant);
console.log(table);