This repository has been archived by the owner on Mar 2, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscience_pulse_functions.R
562 lines (438 loc) · 20.7 KB
/
science_pulse_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
#########################################################################################
#########################################################################################
########################### ###########################
########################### SCIENCE PULSE'S FUNCTIONS ###########################
########################### ###########################
#########################################################################################
### POPULAR WITHIN PULSE
# Among tweets in the sample, which ones were retweeted the most (by the whole
# universe of Twitter users)? The count includes RTs by users outside Pulse.
# However, the original tweets were posted only by monitored accounts.
# Builds the ranked list of the (up to) 5 most retweeted original tweets,
# keeping at most one tweet per account.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   screen_name, retweet_count, created_at, language, text, name, status_id.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each selected tweet prefixed by its rank, or the table produced by
#   not_enough_tweets_en() when the sample holds no original tweets.
popular_within_pulse <- function(dataset) {
  originals <- dataset %>%
    filter(is_retweet == FALSE)

  # Not enough tweets message, if necessary
  if (nrow(originals) == 0) {
    return(not_enough_tweets_en())
  }

  # Most retweeted tweet of each account; ties go to the earliest created_at
  best_per_user <- originals %>%
    group_by(screen_name) %>%
    arrange(desc(retweet_count), created_at) %>%
    slice(1) %>%
    ungroup()

  # The 5 most retweeted among those
  top_five <- best_per_user %>%
    arrange(desc(retweet_count), created_at) %>%
    slice(1:5)

  top_five %>%
    # Embed code; text and name are pasted into the HTML as they are (no escaping)
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         name, '(@',
                         screen_name, ') <a href="https://twitter.com/',
                         screen_name, '/status/',
                         status_id, '">',
                         created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its rank (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### RT-RATIO
# Among tweets in the sample (posted in the sample), which ones have the highest
# RT/followers ratio? Count includes RTs by users outside Pulse. However, original
# tweets were posted only by monitored accounts. Excludes tweets with less than 2 RTs,
# to avoid irrelevant tweets from users with a low number of followers from appearing.
# Builds the ranked list of the (up to) 5 original tweets with the highest
# retweets-per-follower ratio, keeping at most one tweet per account.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_count, followers_count, screen_name, created_at, language, text,
#   name, status_id.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each selected tweet prefixed by its rank, or the table produced by
#   not_enough_tweets_en() when no tweet passes the filter.
rising_popularity <- function(dataset) {
  candidates <- dataset %>%
    # Coerce the counts first so the threshold below is a numeric comparison
    # even if the columns arrive as character
    mutate(followers_count = as.numeric(followers_count),
           retweet_count = as.numeric(retweet_count)) %>%
    # Original tweets with more than 1 RT
    filter(is_retweet == FALSE,
           retweet_count > 1) %>%
    mutate(ratio = retweet_count / followers_count)

  # Not enough tweets message, if necessary
  if (nrow(candidates) == 0) {
    return(not_enough_tweets_en())
  }

  # Highest-ratio tweet of each account; ties go to the earliest created_at
  best_per_user <- candidates %>%
    group_by(screen_name) %>%
    arrange(desc(ratio), created_at) %>%
    slice(1) %>%
    ungroup()

  # The 5 highest ratios among those
  top_five <- best_per_user %>%
    arrange(desc(ratio), created_at) %>%
    slice(1:5)

  top_five %>%
    # Embed code; text and name are pasted into the HTML as they are (no escaping)
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         name, '(@',
                         screen_name, ') <a href="https://twitter.com/',
                         screen_name, '/status/',
                         status_id, '">',
                         created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its rank (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### OVERPERFORMING
# Identify posts with the highest number of interactions relative to the usual
# number of interactions that account gets over a long period of time. It takes
# the weighted sum of RTs and likes (a RT is more "valuable" than a like) of
# each post and compares it with an average over that account's latest posts.
# A "penalty" is also applied to accounts with fewer followers, so that the
# final score is not merely the result of a small number of tweets.
# This measure is inspired by CrowdTangle's overperforming metric.
# Builds the ranked list of the (up to) 5 original tweets with the highest
# final_score, keeping at most one tweet per account.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_count, final_score, screen_name, created_at, language, text, name,
#   status_id. final_score is only read here, never computed, so it must
#   already be present in the input.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each selected tweet prefixed by its rank, or the table produced by
#   not_enough_tweets_en() when no tweet passes the filter.
overperforming <- function(dataset) {
  candidates <- dataset %>%
    # Original tweets with more than 2 RTs
    filter(is_retweet == FALSE,
           retweet_count > 2)

  # Not enough tweets message, if necessary. Uses the English helper, the only
  # "not enough tweets" helper defined in this file.
  if (nrow(candidates) == 0) {
    return(not_enough_tweets_en())
  }

  # Highest-scoring tweet of each account; ties go to the earliest created_at
  best_per_user <- candidates %>%
    group_by(screen_name) %>%
    arrange(desc(final_score), created_at) %>%
    slice(1) %>%
    ungroup()

  # The 5 highest final_scores among those
  top_five <- best_per_user %>%
    arrange(desc(final_score), created_at) %>%
    slice(1:5)

  top_five %>%
    # Embed code; text and name are pasted into the HTML as they are (no escaping)
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         name, '(@',
                         screen_name, ') <a href="https://twitter.com/',
                         screen_name, '/status/',
                         status_id, '">',
                         created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its rank (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### SAMPLE TWEETS WITH 1+ RTS
# Random sample of tweets with more than 1 RT from 3 profiles' strata:
# (1) non-male; (2) institutions; (3) males.
# Draws a small random sample of original tweets with more than 1 RT,
# stratified by the group3 column: up to 2 tweets from each non-missing
# group3 value and up to 1 tweet from the rows where group3 is NA.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_count, group3, language, text, name, screen_name, status_id,
#   created_at.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each sampled tweet prefixed by its position, or the table produced by
#   not_enough_tweets_en() when no tweet passes the filter.
sample_more_than_one <- function(dataset) {
  eligible <- dataset %>%
    # Original tweets with more than 1 RT
    filter(is_retweet == FALSE,
           retweet_count > 1)

  # Not enough tweets message, if necessary
  if (nrow(eligible) == 0) {
    return(not_enough_tweets_en())
  }

  eligible %>%
    group_by(group3) %>%
    # One scalar sample size per stratum, capped at the stratum's row count so
    # that a small stratum yields fewer tweets instead of an error
    slice(sample.int(n(), min(n(), if (is.na(group3[1])) 1L else 2L))) %>%
    ungroup() %>%
    # Descending group3 order (the original comment says this shows the
    # not-male stratum first)
    arrange(desc(group3)) %>%
    # Embed code; text and name are pasted into the HTML as they are (no escaping)
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         name, '(@',
                         screen_name, ') <a href="https://twitter.com/',
                         screen_name, '/status/',
                         status_id, '">',
                         created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its position (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### MOST ACTIVE USERS
# Monitored accounts with the highest n. of posts in the last 12h
# Lists the (up to) 5 accounts with the most rows in the dataset.
#
# dataset: data frame with one row per post and a screen_name column.
# Returns a one-column data frame whose header is a Font Awesome "users" icon
#   and whose cells are HTML links to each account's Twitter profile.
active_users <- function(dataset) {
  # Number of posts per account, most active first
  posts_per_user <- dataset %>%
    group_by(screen_name) %>%
    count(sort = TRUE) %>%
    ungroup()

  busiest <- slice(posts_per_user, 1:5)

  busiest %>%
    # Keep only the account column, turned into a Twitter hyperlink
    transmute(screen_name = paste0("@<a href='https://twitter.com/", screen_name, "' target='_blank' style='color: #d91c5c'>", screen_name, "</a>")) %>%
    rename("<i class='fas fa-users'></i>" = screen_name)
}
#########################################################################################
### MOST USED HASHTAGS
# Most used hashtags on posts monitored by Science Pulse over the last 12h
# Lists the (up to) 5 most frequent hashtags in the dataset, case-insensitively.
#
# dataset: data frame with a hashtag column (one hashtag per row).
# Returns a one-column data frame whose header is a Font Awesome "hashtag" icon
#   and whose cells are HTML links to each hashtag's Twitter page.
most_hashtags <- function(dataset) {
  hashtag_counts <- dataset %>%
    # Upper-case so that differently-cased spellings are counted together
    mutate(hashtag = toupper(hashtag)) %>%
    # Drops the literal string "NA"; rows where the comparison itself is NA
    # are dropped by filter() as well
    filter(hashtag != "NA") %>%
    count(hashtag, sort = TRUE)

  hashtag_counts %>%
    slice(1:5) %>%
    # Keep only the hashtag column, turned into a Twitter hyperlink
    transmute(hashtag = paste0("#<a href='https://twitter.com/hashtag/", hashtag, "' target='_blank' style='color: #d91c5c'>", hashtag, "</a>")) %>%
    rename("<i class='fas fa-hashtag'></i>" = hashtag)
}
#########################################################################################
### ALSO POPULAR ON PULSE
# Among tweets in the sample (posted in the sample), which ones were more RT
# (by the whole universe of twitter users)? Count includes RTs by users outside
# Pulse. However, original tweets were posted only by members.
# Clusters original tweets by retweet count (k-means, 4 groups) and returns the
# ranked (up to) 5 most retweeted tweets of cluster 2, one tweet per account.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_count, screen_name, created_at, language, text, name, status_id.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each selected tweet prefixed by its rank, or the table produced by
#   not_enough_tweets_en() when clustering is not possible or cluster 2 is
#   empty.
# Side effect: calls set.seed(12345), which resets the session's RNG state.
also_popular <- function(dataset) {
  # Filter only non-retweet posts
  own_sample_trends <- dataset %>%
    filter(is_retweet == FALSE)

  rt_counts <- as.numeric(own_sample_trends$retweet_count)

  # kmeans() stops with an error when asked for more centers than there are
  # distinct data points, so show the message instead of crashing
  if (length(unique(rt_counts)) < 4) {
    return(not_enough_tweets_en())
  }

  set.seed(12345) # set.seed to keep the same results

  # First run finds the 4 centers; sorting them makes cluster ids ordered by
  # retweet count (1 = lowest center, 4 = highest center)
  centers <- sort(kmeans(rt_counts, centers = 4, nstart = 1000)$centers)

  # Second run starts from the sorted centers, so ids follow that order
  own_sample_trends$cluster <- kmeans(rt_counts, centers = centers)$cluster

  # NOTE(review): with ascending centers, cluster 2 is the second-LOWEST group;
  # the original comment called it "the 2nd group with the most RTs" — confirm
  # which one is intended.
  own_sample_trends <- own_sample_trends %>%
    filter(cluster == 2)

  # Not enough tweets message, if necessary
  if (nrow(own_sample_trends) == 0) {
    return(not_enough_tweets_en())
  }

  own_sample_trends %>%
    # Most retweeted tweet of each account; ties go to the earliest created_at
    group_by(screen_name) %>%
    arrange(desc(retweet_count), created_at) %>%
    slice(1) %>%
    ungroup() %>%
    # The 5 most retweeted among those
    arrange(desc(retweet_count), created_at) %>%
    slice(1:5) %>%
    # Embed code; text and name are pasted into the HTML as they are (no escaping)
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         name, '(@',
                         screen_name, ') <a href="https://twitter.com/',
                         screen_name, '/status/',
                         status_id, '">',
                         created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its rank (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### PULSE RADAR
# A random sample of 5 tweets among popular ones on Pulse. This group of popular
# tweets do not reach the highest-ranked in absolute number of RTs.
# Clusters original tweets by retweet count (k-means, 4 groups) and returns a
# sample of 5 tweets drawn from cluster 2.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_count, language, text, name, screen_name, status_id, created_at.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of the 5 sampled tweets, or the table produced by not_enough_tweets_en()
#   when clustering is not possible or cluster 2 has fewer than 5 tweets.
# Side effect: calls set.seed(12345), which resets the session's RNG state.
#   sample_n() runs after it, so the draw is reproducible for a given input.
pulse_radar <- function(dataset) {
  # Filter only non-retweet posts
  own_sample_trends <- dataset %>%
    filter(is_retweet == FALSE)

  rt_counts <- as.numeric(own_sample_trends$retweet_count)

  # kmeans() stops with an error when asked for more centers than there are
  # distinct data points, so show the message instead of crashing
  if (length(unique(rt_counts)) < 4) {
    return(not_enough_tweets_en())
  }

  set.seed(12345) # set.seed to keep the same results

  # First run finds the 4 centers; sorting them makes cluster ids ordered by
  # retweet count (1 = lowest center, 4 = highest center)
  centers <- sort(kmeans(rt_counts, centers = 4, nstart = 1000)$centers)

  # Second run starts from the sorted centers, so ids follow that order
  own_sample_trends$cluster <- kmeans(rt_counts, centers = centers)$cluster

  # NOTE(review): with ascending centers, cluster 2 is the second-LOWEST group;
  # the original comment called it "the 2nd group with the most RTs" — confirm
  # which one is intended.
  cluster2 <- own_sample_trends %>%
    filter(cluster == 2)

  # Not enough tweets message, if necessary. Uses the English helper, the only
  # "not enough tweets" helper defined in this file.
  if (nrow(cluster2) < 5) {
    return(not_enough_tweets_en())
  }

  cluster2 %>%
    # Random sample 5 tweets
    sample_n(5) %>%
    # Embed code; text and name are pasted into the HTML as they are (no escaping)
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         name, '(@',
                         screen_name, ') <a href="https://twitter.com/',
                         screen_name, '/status/',
                         status_id, '">',
                         created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>')) %>%
    select(text) %>%
    # Unranked list: every entry gets the same "//" marker
    mutate(text = paste0("<strong>//</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### POPULAR AMONG SCIENTISTS
# Among posts retweeted by Pulse's list members, which ones had the highest
# number of RTs, counting only RTs by accounts monitored by Science Pulse?
# They include tweets from any Twitter account.
# Builds the ranked list of the (up to) 5 tweets that were retweeted most often
# within the dataset itself, keeping at most one tweet per original author.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_status_id, language, text, retweet_name, retweet_screen_name,
#   retweet_created_at.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each selected tweet prefixed by its rank, or the table produced by
#   not_enough_tweets_en() when no tweet was retweeted more than once in the
#   dataset.
popular_among_scientists <- function(dataset) {
  shared_retweets <- dataset %>%
    filter(is_retweet == TRUE) %>%
    # numero = how many rows of the dataset retweet the same original tweet
    group_by(retweet_status_id) %>%
    mutate(numero = n()) %>%
    ungroup() %>%
    filter(numero > 1)

  # Not enough tweets message, if necessary. Uses the English helper, the only
  # "not enough tweets" helper defined in this file.
  if (nrow(shared_retweets) == 0) {
    return(not_enough_tweets_en())
  }

  shared_retweets %>%
    # Collapse rows that are identical on the selected columns
    select(language, text, retweet_name, retweet_screen_name,
           retweet_status_id, retweet_created_at, numero) %>%
    distinct() %>%
    # Most retweeted tweet of each original author; ties broken by status id
    group_by(retweet_screen_name) %>%
    arrange(desc(numero), retweet_status_id) %>%
    slice(1) %>%
    ungroup() %>%
    # The 5 most retweeted among those
    arrange(desc(numero), retweet_status_id) %>%
    slice(1:5) %>%
    # Embed code; text and retweet_name are pasted into the HTML as they are
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         retweet_name, '(@',
                         retweet_screen_name, ') <a href="https://twitter.com/',
                         retweet_screen_name, '/status/',
                         retweet_status_id, '">',
                         retweet_created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its rank (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong> ", text)) %>%
    rename(" " = text)
}
#########################################################################################
### OTHER POPULAR TWEETS
# Among RTs that appear in the sample, which ones have the highest overall n. of
# RTs? This count includes RTs from all Twitter users, but they need to have
# been RTed at least once by a Pulse's list member.
# Builds the ranked list of the (up to) 5 retweets in the dataset with the
# highest retweet_count, keeping at most one tweet per original author.
#
# dataset: data frame of tweets. The columns read here are is_retweet,
#   retweet_count, language, text, retweet_name, retweet_screen_name,
#   retweet_status_id, retweet_created_at.
# Returns a one-column data frame (column named " ") with the HTML embed code
#   of each selected tweet prefixed by its rank, or the table produced by
#   not_enough_tweets_en() when no retweet passes the filter.
other_popular_tweets <- function(dataset) {
  candidates <- dataset %>%
    # Retweets with more than 1 RT
    filter(is_retweet == TRUE,
           retweet_count > 1)

  # Not enough tweets message, if necessary. Uses the English helper, the only
  # "not enough tweets" helper defined in this file.
  if (nrow(candidates) == 0) {
    return(not_enough_tweets_en())
  }

  candidates %>%
    # Collapse rows that are identical on the selected columns
    select(language, text, retweet_name, retweet_screen_name,
           retweet_status_id, retweet_created_at, retweet_count) %>%
    distinct() %>%
    # Most retweeted tweet of each original author; ties broken by status id
    group_by(retweet_screen_name) %>%
    arrange(desc(retweet_count), retweet_status_id) %>%
    slice(1) %>%
    ungroup() %>%
    # The 5 most retweeted among those
    arrange(desc(retweet_count), retweet_status_id) %>%
    slice(1:5) %>%
    # Embed code; text and retweet_name are pasted into the HTML as they are
    mutate(text = paste0('<blockquote class="twitter-tweet"><p lang="',
                         language, '" dir="ltr">',
                         text, '</p>—',
                         retweet_name, '(@',
                         retweet_screen_name, ') <a href="https://twitter.com/',
                         retweet_screen_name, '/status/',
                         retweet_status_id, '">',
                         retweet_created_at, '</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
    )) %>%
    select(text) %>%
    # Prefix each embed with its rank (1º, 2º, ...)
    mutate(text = paste0("<strong>", seq_len(n()), "º //</strong>", text)) %>%
    rename(" " = text)
}
#########################################################################################
### NOT ENOUGH TWEETS
# Message to explain that there are not enough tweets to create the column
# Placeholder table shown when a metric has no tweets to display.
#
# Takes no arguments.
# Returns a one-row, one-column data frame whose column is named " " (same
#   shape as the tables returned by the metric functions) holding the
#   English "not enough tweets" message.
not_enough_tweets_en <- function() {
  message_table <- data.frame(
    variable = "Sorry, right now there are not enough tweets for this metric.\nPlease check again soon!",
    stringsAsFactors = FALSE
  )
  # A blank header, so the table renders without a visible column title
  names(message_table) <- " "
  message_table
}
#########################################################################################
### SPINNERS
# Spinners to appear while data is loading. Different types:
# Large column: round spinner
# Wraps a table output in a loading spinner (spinner type 6 by default).
#
# output: id passed to tableOutput().
# Returns the value of withSpinner(). Type, color, size and background color
#   come from the "spinner.*" options when set, otherwise from the defaults
#   below.
include_spinner_large_column <- function(output) {
  table_ui <- tableOutput(output)

  # Reserve 300px for the spinner unless the element's markup already
  # declares a height
  placeholder_height <- if (grepl("height:\\s*\\d", table_ui)) NULL else "300px"

  withSpinner(
    table_ui,
    type = getOption("spinner.type", default = 6),
    color = getOption("spinner.color", default = "#d91c5c"),
    size = getOption("spinner.size", default = 1),
    color.background = getOption("spinner.color.background", default = "#d91c5c"),
    custom.css = FALSE,
    proxy.height = placeholder_height
  )
}
# Thin column: rectangular spinner
# Wraps a table output in a loading spinner (spinner type 1 by default).
#
# output: id passed to tableOutput().
# Returns the value of withSpinner(). Type, color, size and background color
#   come from the "spinner.*" options when set, otherwise from the defaults
#   below.
include_spinner_thin_column <- function(output) {
  table_ui <- tableOutput(output)

  # Reserve 300px for the spinner unless the element's markup already
  # declares a height
  placeholder_height <- if (grepl("height:\\s*\\d", table_ui)) NULL else "300px"

  withSpinner(
    table_ui,
    type = getOption("spinner.type", default = 1),
    color = getOption("spinner.color", default = "#d91c5c"),
    size = getOption("spinner.size", default = 1),
    color.background = getOption("spinner.color.background", default = "#d91c5c"),
    custom.css = FALSE,
    proxy.height = placeholder_height
  )
}
# Texts: small and circular spinner
# Wraps a text output in a small loading spinner (type 7, size 0.4 by default).
#
# output: id passed to textOutput().
# Returns the value of withSpinner(). Type, color, size and background color
#   come from the "spinner.*" options when set, otherwise from the defaults
#   below; the placeholder height is fixed at 20px.
include_spinner_small <- function(output) {
  text_ui <- textOutput(output)

  withSpinner(
    text_ui,
    type = getOption("spinner.type", default = 7),
    color = getOption("spinner.color", default = "#d91c5c"),
    size = getOption("spinner.size", default = 0.4),
    color.background = getOption("spinner.color.background", default = "#d91c5c"),
    custom.css = FALSE,
    proxy.height = "20px"
  )
}
# Tables: large and round spinner
# Wraps a DT table output in a loading spinner (spinner type 6 by default).
#
# output: id passed to DT::dataTableOutput().
# Returns the value of withSpinner(). Type, color, size and background color
#   come from the "spinner.*" options when set, otherwise from the defaults
#   below.
include_spinner_tables <- function(output) {
  datatable_ui <- DT::dataTableOutput(output)

  # Reserve 300px for the spinner unless the element's markup already
  # declares a height
  placeholder_height <- if (grepl("height:\\s*\\d", datatable_ui)) NULL else "300px"

  withSpinner(
    datatable_ui,
    type = getOption("spinner.type", default = 6),
    color = getOption("spinner.color", default = "#d91c5c"),
    size = getOption("spinner.size", default = 1),
    color.background = getOption("spinner.color.background", default = "#d91c5c"),
    custom.css = FALSE,
    proxy.height = placeholder_height
  )
}
#########################################################################################
## GGPLOT THEMES
# Science Pulse ggplot theme: light grey (#eeeeee) backgrounds, no axis lines
# or ticks, horizontal legend without a title, bold left-aligned plot title.
#
# base_size: base font size forwarded to theme_foundation() (default 14).
# base_family: base font family forwarded to theme_foundation()
#   (default "Barlow").
# Returns the sum of theme_foundation() and the theme() overrides below, to be
#   added to a plot with `+`.
# NOTE(review): theme_foundation() is not defined in this file — presumably it
#   comes from ggthemes; confirm the package is attached by the caller.
sp_theme <- function(base_size = 14, base_family = "Barlow") {
  theme_foundation(base_size = base_size, base_family = base_family) +
    theme(
      plot.background = element_rect(colour = "#eeeeee", fill = "#eeeeee"),
      panel.background = element_rect(colour = "#eeeeee", fill = "#eeeeee"),
      text = element_text(colour = "#231f20"),
      axis.text = element_text(size = rel(0.8), margin = margin(0, 40, 0, 0)),
      axis.ticks = element_blank(),
      axis.line = element_blank(),
      axis.title = element_text(size = rel(0.9), colour = "#999999"),
      legend.text = element_text(size = rel(0.9), angle = 0),
      legend.title = element_blank(),
      legend.key = element_rect(fill = "#eeeeee", colour = "#eeeeee", size = 0.5, linetype = 'dashed'),
      legend.key.width = unit(0.6, "cm"),
      legend.position = NULL,
      legend.justification = c(-0.05, 0),
      legend.background = element_blank(),
      legend.direction = "horizontal",
      # Plain call: the original wrapped this in `(margin = ...)`, which also
      # bound a local variable named `margin`
      legend.margin = margin(0, 0, 0, 0),
      legend.box = NULL,
      panel.border = element_rect(colour = "#eeeeee", fill = NA, size = 2),
      panel.grid.major = element_line(colour = "#e4e4e4"),
      panel.grid.minor = element_line(colour = "#e6e6e6"),
      panel.grid.minor.x = element_line(colour = "#e4e4e4"),
      plot.title = element_text(hjust = 0, size = rel(1.3), face = "bold", colour = "#231f20"),
      plot.title.position = "plot",
      strip.background = element_rect(colour = "#eeeeee", fill = "#eeeeee"),
      plot.subtitle = element_text(hjust = 0, margin = margin(0, 0, 40, 0), size = rel(1), lineheight = 1),
      plot.caption = element_text(size = rel(0.75), hjust = 1, margin = margin(20, 0, 0, 0), colour = "#555555", lineheight = 1),
      plot.margin = unit(c(1, 2, 1, 1), "lines")
    )
}