-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathad_processing.js
199 lines (196 loc) · 12 KB
/
ad_processing.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/* -------------------------- Main functions -------------------------- */
/**
 * Entry point: extracts the adverts from the results page, tokenises their
 * text, fetches the probability tables and adds a coloured border to each
 * advert according to its classification (sensitive or non-sensitive).
 * Debug information is written to the console along the way.
 */
function main() {
  // Get adverts from results page
  const ads = extractAds(); // HTMLCollection
  console.log(`${ads.length} ads found:\n\n`);

  // Extract advert text, remove stop words and stem words
  const ads_proc = processAds(ads);

  // The loop variable is declared with const so that it does not become an
  // implicit global (an undeclared assignment throws in strict mode).
  for (const ad of ads_proc) {
    console.log(ad);
  }
  console.log("---------------\n");

  const [row_probs, col_probs] = getProbs();

  // Add a border to the adverts according to their classifications
  highlightAds(ads, ads_proc, row_probs, col_probs);
}
/**
 * Collects the advert elements of the currently loaded page.
 * @return {HTMLCollection} Array-like object of advert elements.
 */
function extractAds() {
  // Class name carried by Google adverts on the results page
  const adClassName = "ads-ad";
  return document.getElementsByClassName(adClassName);
}
/**
 * Turns every advert element into its list of tokens.
 * For each advert the text is extracted and then tokenised (stop words
 * removed, remaining words stemmed).
 * @param {HTMLCollection} ads Array-like object containing advert elements.
 * @return {Array} One array of word strings per advert, in the same order.
 */
function processAds(ads) {
  return [...ads].map((advert) => tokeniseText(getAdText(advert)));
}
/**
 * Marks every advert with a dashed border that reflects its classification
 * (sensitive: red; non-sensitive: green) and attaches the relabel button.
 * A console group is opened for each advert before it is classified; the
 * matching console.groupEnd() call is issued by testAd.
 * @param {HTMLCollection} ads Array-like object containing advert elements.
 * @param {Array} ads_proc Arrays of strings representing individual words from the advert text.
 * @param {*} row_probs Probability data forwarded unchanged to testAd.
 * @param {*} col_probs Probability data forwarded unchanged to testAd.
 */
function highlightAds(ads, ads_proc, row_probs, col_probs) {
  for (let idx = 0; idx < ads.length; idx += 1) {
    const advert = ads.item(idx);
    const words = ads_proc[idx];

    // A small gap keeps neighbouring borders from touching
    advert.style.marginBottom = "2px";
    console.group("Ad", idx + 1);

    const sensitive = testAd(words, row_probs, col_probs);
    advert.style.border = sensitive ? "2px dashed red" : "2px dashed green";
    changeLabel(advert, sensitive ? "sensitive" : "other", words);
  }
}
/* -------------------------- Other functions -------------------------- */
/**
 * Puts a "Wrong?" button at the top of an advert so the user can report a
 * misclassification. The first click flips the advert's border colour,
 * records the advert in the training data under the opposite label and then
 * detaches the click handler, so later clicks do nothing.
 * @param {HTMLElement} ad Advert element.
 * @param {String} orig_label Original label of the advert.
 * @param {Array} ad_txt Array containing the advert text.
 */
function changeLabel(ad, orig_label, ad_txt) {
  // Build the button and insert it before the advert's first child
  const caption = document.createTextNode("Wrong?");
  const button = document.createElement("button");
  button.setAttribute("style", "float: right; position: relative");
  button.setAttribute("id", "label_btn");
  button.appendChild(caption);
  ad.insertBefore(button, ad.firstChild);

  const onFirstClick = () => {
    // Give visual feedback on the button itself
    button.setAttribute("style", "float: right; position: relative; background-color: #ff6666");
    button.replaceChild(document.createTextNode("Label changed!"), caption);

    // Flip the label and repaint the border to match
    const wasSensitive = orig_label === "sensitive";
    ad.style.border = wasSensitive ? "2px dashed green" : "2px dashed red";
    const correctedLabel = wasSensitive ? "other" : "sensitive";

    // Add advert to the training data with the correct label
    addTrainingData(ad_txt, correctedLabel);
    button.removeEventListener("click", onFirstClick);
  };

  button.addEventListener("click", onFirstClick);
}
/**
 * Decides whether an advert is sensitive by comparing the two scores
 * returned by getPRI: the advert is sensitive when the first score is at
 * least as large as the second. The verdict is logged and the currently
 * open console group is closed.
 * @param {Array} ad Array of strings representing individual words from the advert.
 * @param {*} row_probs Probability data forwarded unchanged to getPRI.
 * @param {*} col_probs Probability data forwarded unchanged to getPRI.
 * @return {Boolean} True if advert is sensitive; false otherwise.
 */
function testAd(ad, row_probs, col_probs) {
  const scores = getPRI(ad, row_probs, col_probs);
  const sensitive = scores[0] >= scores[1];

  // Debug output: winning label together with its score
  console.log(sensitive ? `SENSITIVE ${scores[0]}` : `other ${scores[1]}`);
  console.log("---------------");
  console.groupEnd();

  return sensitive;
}
/**
 * Extracts text of advert elements recursively.
 * Nodes that carry no advert copy are skipped together with everything
 * below them: the ad notice, the ad icon, the ad url and the relabel button
 * that changeLabel inserts into each advert.
 * Every text node contributes its content followed by a single space.
 * @param {Node} ad Advert element (or, while recursing, any node below it).
 * @return {String} Advert text.
 */
function getAdText(ad) {
  const isIgnored = ad.className === "_lBb" // Ad notice
    || ad.className === "_mB"               // Ad icon
    || ad.className === "_WGk"              // Ad url
    // Button to change advert label: changeLabel creates it with the id
    // "label_btn"; the older spelling "btn_label" is still accepted.
    || ad.id === "label_btn"
    || ad.id === "btn_label";
  if (isIgnored) {
    return '';
  }

  // Recurse down into inner nodes until a text node is reached
  if (ad.hasChildNodes()) {
    let txt = '';
    const children = ad.childNodes;
    for (let i = 0; i < children.length; i++) {
      txt += getAdText(children[i]);
    }
    return txt;
  }

  // Leaf node: only text nodes contribute
  if (ad.nodeType === Node.TEXT_NODE) {
    return ad.textContent + ' ';
  }
  return '';
}
/**
 * Removes stop words from text, stems the remaining words and splits the
 * result into individual tokens.
 * Empty tokens are dropped, so text that is empty or consists solely of
 * stop words yields an empty array rather than an array holding "".
 * @param {String} text Raw text.
 * @return {Array} Stemmed words of the text, without stop words.
 */
function tokeniseText(text) {
  // Remove stop words
  const withoutStopWords = removeStopWords(text); // String
  // Stem words
  const stemmed = stemWords(withoutStopWords); // String
  // Split string into individual words; "".split(' ') would give [""]
  return stemmed.split(' ').filter((token) => token !== '');
}
// Comma-separated list of stop words, searched by isStopWord.
// The list is written as two concatenated literals because a quoted string
// must not contain a raw line break.
var stopWords = "a,about,across,above,after,afterwards,again,against,ain,all,almost,alone,along,already,also,although,always,am,among,amongst,amoungst,amount,amp,an,and,another,any,anyhow,anyone,anything,anyway,anywhere,are,aren,around,as,at,back,b,be,became,because,become,becomes,becoming,been,before,beforehand,behind,being,below,beside,besides,between,beyond,bill,both,bottom,but,by,call,can,cannot,cant,co,computer,con,could,couldn,couldnt,cry,d,de,describe,detail,did,didn,didn\'t,do,don\'t,does,doesn,doing,don,done,down,due,during,each,eg,eight,either,eleven,else,elsewhere,em,empty,enough,etc,eu,even,ever,every,everyone,everything,everywhere,except,few,fifteen,fifty,fill,find,fire,first,five,for,former,formerly,forty,found,four,from,front,full,further,get,give,go,had,hadn,has,hasn,hasnt,have,haven,having,he,hence,her,here,hereafter,hereby,herein,hereupon,hers,herself,hi,him,himself,his,how,however,hundred,i,i \'m,ie,if,in,inc,indeed,interest,into,is,isn,it,its,itself,just,keep,last,latter,latterly,least,less,lo,ltd,ll,m,ma,made,many,may,me,meanwhile,might,mill,mine,mightn,more,moreover,most,mostly,move,much,must,mustn,my,myself,name,namely,nbsp,neither,never,needn,nevertheless,next,nine,no,nobody,none,noone,nor,not,nothing,now,nowhere,o,of,off,often,on,once,one,only,onto,or,other,others,otherwise,our,ours,ourselves,out,over,own,part,per,perhaps,please,put,rather,re,s,same,see,seem,seemed,seeming,seems,serious,several,shan,she,should,shouldn,shouldn\'t,show,side,since,sincere,six,sixty,so,some,somehow,someone,something,sometime,sometimes,somewhere,still,such,system,t,take,ten,than,that,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,therefore,therein,thereupon,these,they,thick,thin,third,this,those,three,through,throughout,thru,thus,to,together,too,top,toward,towards,twelve,twenty,two,un,under,until,up,upon,us,ve,very,via,was,wasn,we,well,were,weren,what,whatever,when,whence,whenever,where,whereafter,whereas,whereby,wherein,whereupon,wherever," +
  "whether,which,while,whither,who,whoever,whole,whom,whose,why,will,with,within,without,would,won,wouldn,y,yet,you,your,yours,yourself,yourselves,www,htm,html,com,pfx,forum,aspx,nav,messages,webtag,ab,li,file,localhost,span,price,online,free,official";
/**
 * Strips the stop words out of a piece of text.
 * The text is cut into runs of word characters; every run that isStopWord
 * rejects is discarded and the survivors are joined with single spaces.
 * @param {String} str Raw text.
 * @return {String} Text without stop words.
 */
function removeStopWords(str) {
  const candidates = str.match(/\b[\w]+\b/g) || [];
  return candidates
    .filter((candidate) => !isStopWord(candidate))
    .join(' ');
}
/**
 * Verifies whether the input is a stop word.
 * A word counts as a stop word when it consists of digits only, or when it
 * occurs as a whole word (case-insensitively) in the stopWords list.
 * @param {String} word Word to be verified.
 * @return {Boolean} True if word is a stop word; false otherwise.
 */
function isStopWord(word) {
  const token = String(word);

  // Pure numbers are always treated as stop words
  if (/^\d+$/.test(token)) {
    return true;
  }

  // Escape regex metacharacters so that input such as "c++" or "(" is
  // matched literally instead of throwing or matching unrelated entries.
  const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const wholeWord = new RegExp("\\b" + escaped + "\\b", "i");
  return wholeWord.test(stopWords);
}
/**
 * Stems the words of a text.
 * The text is lower-cased, split on single spaces, each piece is passed to
 * stemmer and the results are joined with single spaces again.
 * @param {String} text Space-separated words.
 * @return {String} Space-separated stemmed words.
 */
function stemWords(text) {
  return text
    .toLowerCase()
    .split(' ')
    // Call stemmer with the word only: passing it directly to map would
    // also hand over the index and the array, and stemmer treats its second
    // argument as a flag.
    .map((word) => stemmer(word))
    .join(' ');
}
// TODO: include copyright notice (how?) https://goo.gl/YUQCYg
// Modified from https://github.com/kristopolous/Porter-Stemmer/blob/master/PorterStemmer1980.min.js
/**
 * Minified Porter stemmer, built by an immediately-invoked function that
 * keeps its two suffix replacement tables (`j` and `k`) private.
 * The returned function takes the word to stem as its first argument. Its
 * second argument only selects between two hook functions (`h` and `i`)
 * that are both empty here, so it has no effect on the result.
 * Words shorter than three characters are returned unchanged. A leading "y"
 * is upper-cased while the rules run and lower-cased again before the word
 * is returned. The strings "1a", "1b", "1c", "2", "3", "4" and "5" handed
 * to the hook label the rule groups, which are applied in that order.
 * NOTE(review): the suffix patterns are written in lower case, so the input
 * is presumably expected to be lower-cased already - confirm for callers
 * other than stemWords.
 */
var stemmer=function(){function h(){}function i(){}var j={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},k={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""};return function(a,l){var d,b,g,c,f,e;e=l?i:h;if(3>a.length)return a;
g=a.substr(0,1);"y"==g&&(a=g.toUpperCase()+a.substr(1));c=/^(.+?)(ss|i)es$/;b=/^(.+?)([^s])s$/;c.test(a)?(a=a.replace(c,"$1$2"),e("1a",c,a)):b.test(a)&&(a=a.replace(b,"$1$2"),e("1a",b,a));c=/^(.+?)eed$/;b=/^(.+?)(ed|ing)$/;c.test(a)?(b=c.exec(a),c=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*/,c.test(b[1])&&(c=/.$/,a=a.replace(c,""),e("1b",c,a))):b.test(a)&&(b=b.exec(a),d=b[1],b=/^([^aeiou][^aeiouy]*)?[aeiouy]/,b.test(d)&&(a=d,e("1b",b,a),b=/(at|bl|iz)$/,f=/([^aeiouylsz])\1$/,d=/^[^aeiou][^aeiouy]*[aeiouy][^aeiouwxy]$/,
b.test(a)?(a+="e",e("1b",b,a)):f.test(a)?(c=/.$/,a=a.replace(c,""),e("1b",f,a)):d.test(a)&&(a+="e",e("1b",d,a))));c=/^(.*[aeiouy].*)y$/;c.test(a)&&(b=c.exec(a),d=b[1],a=d+"i",e("1c",c,a));c=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;c.test(a)&&(b=c.exec(a),d=b[1],b=b[2],c=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*/,c.test(d)&&(a=d+j[b],e("2",c,a)));c=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
c.test(a)&&(b=c.exec(a),d=b[1],b=b[2],c=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*/,c.test(d)&&(a=d+k[b],e("3",c,a)));c=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;b=/^(.+?)(s|t)(ion)$/;c.test(a)?(b=c.exec(a),d=b[1],c=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*[aeiouy][aeiou]*[^aeiou][^aeiouy]*/,c.test(d)&&(a=d,e("4",c,a))):b.test(a)&&(b=b.exec(a),d=b[1]+b[2],b=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*[aeiouy][aeiou]*[^aeiou][^aeiouy]*/,
b.test(d)&&(a=d,e("4",b,a)));c=/^(.+?)e$/;if(c.test(a)&&(b=c.exec(a),d=b[1],c=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*[aeiouy][aeiou]*[^aeiou][^aeiouy]*/,b=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*([aeiouy][aeiou]*)?$/,f=/^[^aeiou][^aeiouy]*[aeiouy][^aeiouwxy]$/,c.test(d)||b.test(d)&&!f.test(d)))a=d,e("5",c,b,f,a);c=/ll$/;b=/^([^aeiou][^aeiouy]*)?[aeiouy][aeiou]*[^aeiou][^aeiouy]*[aeiouy][aeiou]*[^aeiou][^aeiouy]*/;c.test(a)&&b.test(a)&&(c=/.$/,a=a.replace(c,""),e("5",
c,b,a));"y"==g&&(a=g.toLowerCase()+a.substr(1));return a}}();