-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute_ngd.R
54 lines (48 loc) · 1.75 KB
/
compute_ngd.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(RCurl)
library(here)
#' Scrape the approximate Google hit count for a set of search terms
#'
#' Submits the terms to Google's search form, pauses for a random 0.5-3
#' seconds (0.1 s steps) after the request, writes the raw response to
#' `tmp/tmp_google.txt` under the project root reported by `here::here()`,
#' and parses the number that follows the second occurrence of the
#' "resultStats" marker in the returned HTML.
#'
#' @param searchTerms Character vector of search terms; the terms are joined
#'   with "+" into a single query string.
#' @param language Value sent as the `hl` form field.
#' @param ... Accepted for backward compatibility; currently unused.
#' @return The parsed hit count as a numeric scalar, or `NA` when the marker
#'   (or a parsable number next to it) is not present in the response.
getGoogleCount <- function(searchTerms = NULL, language = "en", ...) {
  entry <- paste(searchTerms, collapse = "+")
  siteHTML <- RCurl::getForm(
    "http://www.google.com/search",
    hl = language, lr = "", q = entry, btnG = "Search"
  )

  # Random pause between requests.
  Sys.sleep(sample(seq(0.5, 3, by = 0.1), 1))

  # Keep a copy of the latest response for debugging. The folder is created
  # on demand so that a missing tmp/ directory does not abort the lookup.
  dump_dir <- file.path(here::here(), "tmp")
  dir.create(dump_dir, showWarnings = FALSE, recursive = TRUE)
  write.table(siteHTML, file = file.path(dump_dir, "tmp_google.txt"))

  indicatorWord <- "resultStats"
  posExtractStart <- gregexpr(indicatorWord, siteHTML, fixed = TRUE)[[1]]

  # The count is read next to the SECOND occurrence of the marker; without
  # one there is nothing to parse.
  if (length(posExtractStart) < 2L) {
    return(NA_real_)
  }
  marker_pos <- posExtractStart[2]

  # Cut a small window around the marker and peel the surrounding markup
  # off step by step: text after the marker, before the word "results",
  # and after the closing ">" of the tag.
  stringExtract <- as.character(
    substring(siteHTML, first = marker_pos - 30, last = marker_pos + 50)
  )
  count <- strsplit(stringExtract, indicatorWord, fixed = TRUE)[[1]][2]
  count <- strsplit(count, split = "results")[[1]][1]
  count <- strsplit(count, split = ">")[[1]][2]

  # A two-word remainder is a qualifier followed by the number; keep only
  # the number.
  words <- strsplit(count, split = " ")[[1]]
  if (length(words) == 2) {
    count <- words[2]
  }

  # Drop thousands separators before converting.
  as.numeric(gsub(",", "", count))
}
#' Normalized Google Distance between two search terms
#'
#' Looks up three hit counts via `getGoogleCount()` (both terms together,
#' then each term alone) and combines them as
#' `(max(log fx, log fy) - log fxy) / (log M - min(log fx, log fy))`.
#'
#' @param x First search term.
#' @param y Second search term.
#' @param M Normalising constant used in the denominator. Defaults to the
#'   value previously hard-coded here (presumably an estimate of the number
#'   of pages indexed by Google -- confirm before changing).
#' @return The distance as a numeric scalar. An `NA` count from any of the
#'   three lookups propagates to the result.
NGD <- function(x, y, M = 25270000000) {
  # Lookup order (pair first, then x, then y) is kept as in the original.
  # getGoogleCount() already returns numeric values, so no re-parsing is
  # needed here.
  count_xy <- getGoogleCount(c(x, y))
  count_x <- getGoogleCount(c(x))
  count_y <- getGoogleCount(c(y))

  log_x <- log(count_x)
  log_y <- log(count_y)

  (max(log_x, log_y) - log(count_xy)) / (log(M) - min(log_x, log_y))
}
#' Mean Normalized Google Distance over all pairs of topics
#'
#' Builds every unordered pair from `topiclist` with `combn()`, computes
#' `NGD()` for each pair, and averages the results while ignoring `NA`s.
#'
#' @param topiclist Vector of search terms; `combn()` needs at least two
#'   elements to form a pair.
#' @param show If `TRUE`, print each pair and its distance on its own line.
#' @return The mean of the pairwise distances (`NA` values removed).
compute_NGD_for_combinations <- function(topiclist, show = FALSE) {
  pairs <- combn(topiclist, 2, simplify = FALSE)

  distances <- vapply(pairs, function(pair) {
    ngd <- NGD(pair[1], pair[2])
    if (show) {
      # Trailing newline keeps one pair per line in the console output.
      cat("NGD for words ", pair[1], " and ", pair[2], ":", ngd, "\n")
    }
    ngd
  }, numeric(1))

  mean(distances, na.rm = TRUE)
}