#04_analyze_removed.R
#Analyze records removed from PMC Public Health collections
#install.packages("tidyverse")
library(tidyverse)
library(lubridate)
library(jsonlite)
#---------------------------------------
#define function to run different mini-analyses
AnalyzeRemoved <- function(publisher, df){
  #count (incl doi) - only for removed records
  removed_count <- df %>%
    filter(collection %in% publisher) %>%
    filter(!is.na(removed)) %>%
    summarise(pmcid = sum(!is.na(pmcid)),
              doi = sum(!is.na(doi)))
  #latest version - only for removed records
  removed_latest <- df %>%
    filter(collection %in% publisher) %>%
    filter(!is.na(removed)) %>% #only for removed records
    count(version_latest)
  #publication year - compare all and removed records
  removed_pubyear <- df %>%
    filter(collection %in% publisher) %>%
    #filter(!is.na(removed)) %>%
    group_by(pubyear) %>%
    summarise(all = sum(!is.na(pmcid)),
              removed = sum(!is.na(removed))) %>%
    arrange(desc(pubyear))
  #license - compare all and removed records
  removed_license <- df %>%
    filter(collection %in% publisher) %>%
    #filter(!is.na(removed)) %>%
    group_by(license_summary) %>%
    summarise(all = sum(!is.na(pmcid)),
              removed = sum(!is.na(removed)))
  #publication type - compare all and removed records
  removed_type <- df %>%
    filter(collection %in% publisher) %>%
    #filter(!is.na(removed)) %>%
    group_by(publication_type) %>%
    summarise(all = sum(!is.na(pmcid)),
              removed = sum(!is.na(removed)))
  #oa-status - compare all and removed records
  removed_oa_status <- df %>%
    filter(collection %in% publisher) %>%
    #filter(!is.na(removed)) %>%
    group_by(upw_current) %>%
    summarise(all = sum(!is.na(pmcid)),
              removed = sum(!is.na(removed)))
  #journals with removed records, compare all w removed
  removed_journal_names <- df %>%
    filter(collection %in% publisher) %>%
    filter(!is.na(removed)) %>%
    pull(journal) %>%
    unique()
  removed_journals <- df %>%
    filter(collection %in% publisher) %>%
    filter(journal %in% removed_journal_names) %>%
    group_by(journal) %>%
    summarise(all = sum(!is.na(pmcid)),
              removed = sum(!is.na(removed))) %>%
    arrange(desc(removed))
  #collect results into list
  res <- list(removed_count = removed_count,
              removed_latest = removed_latest,
              removed_pubyear = removed_pubyear,
              removed_license = removed_license,
              removed_type = removed_type,
              removed_oa_status = removed_oa_status,
              removed_journals = removed_journals)
  return(res)
}
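#example call (a minimal sketch, kept commented out so the script's behavior is unchanged;
#"SomePublisher" is a placeholder collection name and records_analyze is the joined
#data frame built further down in this script)
#res_single <- AnalyzeRemoved("SomePublisher", df = records_analyze)
#res_single$removed_pubyear #inspect one of the returned summary tables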
#--------------------------------------
#read files
records_all <- read_csv("output/records_all_unique.csv")
records_upw <- read_csv("data/Unpaywall/PMC_all_UPW_instances_2021-11-17.csv")
#for reference
records_count <- read_csv("output/records_count.csv")
#-------------------------------------
#identify publishers with removed records
removed_publishers <- records_count %>%
  #set cutoff manually depending on need
  filter(removed > 100) %>%
  pull(collection)
#create joined df with all relevant variables
records_upw_join <- records_upw %>%
  select(doi_pmc,
         publication_type,
         upw_current,
         upw_green_pmc_current,
         last_updated_current) %>%
  #deduplicate because some records are in multiple collections
  distinct()
records_analyze <- records_all %>%
  select(-c(license_url, license_text)) %>%
  left_join(records_upw_join, by = c("doi" = "doi_pmc"))
#run analyses
res <- map(removed_publishers, ~ AnalyzeRemoved(., df = records_analyze))
#add publisher names to list
res_names <- set_names(res, removed_publishers)
# Save as json file
res_names_json <- toJSON(res_names, pretty = TRUE, auto_unbox = TRUE)
write(res_names_json, "output/removed_records/removed_records_analysis.json")
# Read json file
# NB reading the file directly without the readLines() step reports UTF-8 encoding errors
readlines <- readLines("output/removed_records/removed_records_analysis.json", warn = FALSE)
res_removed <- fromJSON(readlines)
rm(readlines)
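#example access of the re-imported results (a sketch, kept commented out;
#assumes at least one publisher passed the cutoff above)
#res_removed[[1]]$removed_count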