-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathterm_pca.R
62 lines (43 loc) · 1.56 KB
/
term_pca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library(tidyverse)
library(tidytext)
library(irlba)
library(quanteda)
# Load the `stop_words` data set (from tidytext) into the workspace; it is
# used below to filter out common words.
data(stop_words)

# Read the bioRxiv export and keep only the two columns this script uses.
papers <- select(read_csv("full_biorxiv_data.csv"), titles, abstracts)

# One row per (title, word) pair with its occurrence count `n`, sorted so the
# most frequent pairs come first.
word_counts <- ungroup(
  count(unnest_tokens(papers, word, abstracts), titles, word, sort = TRUE)
)

# Remove stop words, then add the tf, idf and tf_idf columns, treating each
# title as one document.
word_freqs <- bind_tf_idf(anti_join(word_counts, stop_words), word, titles, n)

# Dense document-by-term matrix (rows = titles, columns = words) whose cells
# hold the term frequency `tf`.
term_mat <- as.matrix(cast_dfm(word_freqs, titles, word, tf))
# Number of principal components to keep. Defined once so the irlba() call
# and the PC column names below cannot drift apart.
n_components <- 5

# Earlier exact-PCA call, left disabled:
# term_pca <- prcomp(term_mat, center = TRUE, scale. = TRUE)

# Truncated PCA: irlba() returns the leading right singular vectors of the
# implicitly column-centered matrix (nu = 0 / right_only = TRUE skip the left
# vectors). Projecting term_mat onto them yields one row of component scores
# per title.
# NOTE: term_mat itself is not centered before the projection, so each
# component is offset by a constant; distances between rows are unaffected.
term_pca <- term_mat %*% irlba(
  term_mat,
  nv = n_components,
  nu = 0,
  center = colMeans(term_mat),
  right_only = TRUE
)$v

# Name the score columns PC1..PCn on the matrix itself, which replaces the
# deprecated as_data_frame() + rename_(.dots = ...) combination.
colnames(term_pca) <- paste0("PC", seq_len(n_components))

# as_tibble() drops the matrix row names, so carry the titles over explicitly.
term_pca_df <- as_tibble(term_pca) %>%
  mutate(title = rownames(term_pca))
# 2-D scatter of the first two principal components, one translucent point
# per row of term_pca_df.
ggplot(data = term_pca_df, mapping = aes(PC1, PC2)) +
  geom_point(alpha = 0.2)

# plotly is attached here, just before its first use; report which version
# is installed.
library(plotly)
packageVersion("plotly")

# Interactive 3-D scatter of the first three components. The text for each
# marker is built from the paper title.
layout(
  add_markers(
    plot_ly(
      term_pca_df,
      x = ~PC1,
      y = ~PC2,
      z = ~PC3,
      opacity = 0.2,
      text = ~ paste("Title:", title)
    )
  ),
  scene = list(
    xaxis = list(title = "PC1"),
    yaxis = list(title = "PC2"),
    zaxis = list(title = "PC3")
  )
)
# Euclidean distances (dist() default) between papers in the space of the
# first three principal components, expanded to a full symmetric matrix.
# The intermediate is named pc_dist so it no longer shadows stats::dist, and
# the columns are selected by name rather than by position.
pc_dist <- dist(term_pca_df[, c("PC1", "PC2", "PC3")])
m <- as.matrix(pc_dist)

# Find the most different papers for users: row/column positions of every
# entry of `m` that equals one of its 30 largest values. This must run before
# the zeros are replaced below, otherwise the Inf entries would fill the top
# of the ranking.
most_diff <- which(
  matrix(m %in% head(sort(m, decreasing = TRUE), 30), nrow = nrow(m)),
  arr.ind = TRUE
)

# Replace every zero distance (which includes each paper's distance to itself
# on the diagonal) with Inf so that we don't pick ourselves.
m[m == 0] <- Inf

save(m, file = "../papr/rec_matrix.Rda")
# Add an index to the papers dataset.
# `lookup` was referenced here without being defined anywhere in this script,
# which made the join fail. Build it from term_pca_df so each title maps to
# its row position there, which is also its row/column position in the
# distance matrix `m`.
# NOTE(review): the column name `index` is inferred from the comment above;
# confirm it matches what the consumer of biorxiv_data.Rda expects.
lookup <- tibble(
  titles = term_pca_df$title,
  index = seq_len(nrow(term_pca_df))
)

# Keep only the papers that have a row in the PCA output, with their index.
dat <- papers %>%
  inner_join(lookup, by = "titles")

save(dat, file = "../papr/biorxiv_data.Rda")