-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path02_compare_dates.R
199 lines (152 loc) · 6.03 KB
/
02_compare_dates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#Get licenses for COVID initiative collections in PMC
#info on COVID initiative
#https://www.ncbi.nlm.nih.gov/pmc/about/covid-19/
#links to collections (part of PMC special collections)
#https://www.ncbi.nlm.nih.gov/pmc/journals/collections/?titles=current&search=journals
#info on using rentrez package:
#https://ropensci.org/tutorials/rentrez_tutorial/
#https://docs.ropensci.org/rentrez/articles/rentrez_tutorial.html
#https://www.ncbi.nlm.nih.gov/books/NBK25499/
#install.packages("tidyverse")
library(tidyverse)
library(lubridate)
#Read and row-bind all csv files for one sampling date
#date: sampling date, used as subfolder name under `folder` ("yyyy-mm-dd")
#folder: parent data folder
#returns: a single tibble with the rows of every matched csv, in file order
#NB pattern = "csv" matches "csv" anywhere in the file name, not just the
#extension - assumes the date folders contain only the sampled csv files
readFolder <- function(date, folder){
  #return the bound rows directly; no intermediate assignment needed
  list.files(file.path(folder, date),
             pattern = "csv", full.names = TRUE) %>%
    map_dfr(read_csv)
}
#--------------------------------------
#sampling date: default to today
date <- Sys.Date()
#or set manually
#date <- "yyyy-mm-dd"
date <- "2022-08-28"
#NOTE(review): the manual assignment above always overrides Sys.Date(), and
#`date` does not appear to be used again in this script (readFolder's `date`
#is a local parameter) - confirm it is dead before removing
#create vector with sampling dates: the yyyy-mm-dd subfolder names of data/
dates <- list.files(path = "data/", pattern = "^\\d{4}-\\d{2}-\\d{2}")
#----------------------------------------
#create df of records with presence on sample dates
#read each sampling date's folder into a list, named by date
list_records <- dates %>%
  set_names() %>%
  map(readFolder, folder = "data")
#bind all rows; .id turns the list names (the dates) into a "date" column
records_all <- bind_rows(list_records, .id = "date") %>%
  #remove AIP as not a declared PHE collection
  filter(collection != "AIP") %>%
  #reorder columns
  select(date, collection, pmcid, doi, everything())
#interim matching table of PMCIDs and DOIs (dropping records without a doi)
dois <- records_all %>%
  select(pmcid, doi) %>%
  filter(!is.na(doi)) %>%
  distinct()
#transform into wide form: one row per (collection, pmcid), one TRUE/NA
#column per sampling date
records_unique <- records_all %>%
  #including columns pmc_live_date, pubyear, license_text and/or journal gives duplicates as apparently multiple values
  #select(date, collection, pmcid, pmc_live_date, pubyear, license_text) %>%
  select(date, collection, pmcid) %>%
  mutate(included = TRUE) %>%
  distinct() %>%
  pivot_wider(names_from = date, values_from = included) %>%
  arrange(collection) %>%
  #add dois from interim matching table; key made explicit so the join can
  #never silently pick up additional shared columns
  left_join(dois, by = "pmcid") %>%
  #reorder columns
  select(collection, pmcid, doi, everything())
#n=267100 of which 266316 unique pmcids - some records are in multiple collections?
rm(dois, list_records)
#use results from one time step to get DOIs from CORD19 for records removed prior to 2021-11-01
#done using script 02a_get_dois_CORD19.R
#from 2021-11-01 onwards, dois collected directly from PMC
CORD_ids <- read_csv("data/CORD19/CORDids_2021-11-01.csv") %>%
  rename(doi_cord = doi)
records_unique <- records_unique %>%
  left_join(CORD_ids, by = "pmcid") %>%
  #fill missing dois from CORD19: coalesce() takes doi where present,
  #doi_cord otherwise (replaces the equivalent case_when construct)
  mutate(doi = coalesce(doi, doi_cord)) %>%
  select(-doi_cord)
rm(CORD_ids)
#3639 dois added, 222 still missing (in November 2021)
#identify dropped/removed records over time
#sufficient to check which records are not present in latest collection!
#add new element for each added sample_date
#identify last column (the most recent sampling date)
col_last <- ncol(records_unique)
#records absent (NA) in the latest sample were removed at some point
records_removed <- records_unique %>%
  filter(is.na(.[[col_last]])) %>%
  mutate(removed = "removed") %>%
  select(pmcid, removed) %>%
  distinct()
#flag removed records in the main table
records_unique <- records_unique %>%
  left_join(records_removed, by = "pmcid")
rm(records_removed)
#------------------------------------------------------------------
#add parameters from original collections
#as collected pmc_live_date, pubyear, license_text and/or journal were found to occasionally differ between dates (see 04.compare_dates.R)
#take additional parameters from last instance where record was present in collection
#add column with latest date present in collection
#NB some pmcids are present in multiple collections!
records_latest <- records_unique %>%
  select(-c(doi, removed)) %>%
  pivot_longer(!c(pmcid, collection), names_to = "version", values_to = "included") %>%
  #keep only versions with record included
  filter(!is.na(included)) %>%
  #convert version date to date format so the maximum is chronological
  mutate(version = as_date(version)) %>%
  #keep only the latest version per record per collection
  #(no ties possible: the wide table had one value per date column)
  group_by(pmcid, collection) %>%
  slice_max(version, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  #convert version date back to character for subsequent matching
  mutate(version = as.character(version)) %>%
  select(-included)
#add column to records with latest version in which record occurs
records_unique <- records_unique %>%
  left_join(records_latest,
            by = c("collection", "pmcid")) %>%
  rename(version_latest = version)
rm(records_latest)
#----------------------------------------------
#add parameters from full records (exclude publisher as not needed in addition to collection)
#match each record to its row in records_all at its latest version date
records_unique <- records_unique %>%
  left_join(select(records_all, -c(doi, publisher)),
            by = c("collection",
                   "pmcid",
                   "version_latest" = "date")) %>%
  distinct()
#---------------------------------------------------
#quantification: per collection, count non-NA entries in every remaining
#column (i.e. number of records present on each sampling date)
records_count <- records_unique %>%
  select(-c(version_latest,
            pmc_live_date,
            pubyear,
            journal,
            license_url,
            license_text)) %>%
  group_by(collection) %>%
  #across(everything()) replaces the superseded summarise_all()
  summarise(across(everything(), ~ sum(!is.na(.x)))) %>%
  ungroup()
#---------------------------------------------------
#write results to csv
#(paths passed directly; the paste0() wrappers around constant strings and
#the reused `filename` variable added nothing)
write_csv(records_unique, "output/records_all_unique.csv")
write_csv(records_count, "output/records_count.csv")
#--------------------------------------------------
#enrich data with longitudinal Unpaywall data through COKI
#SQL query: sql/PMC_all_UPW_instances.sql
#data: data/Unpaywall/PMC_all_UPW_instances_2021-11-17.csv
#---------------------------------------------------
#create list of unique licenses
#process further in 03_analyze_licenses.R
#distinct() replaces the count()/select(-n) detour; arrange() keeps the
#sorted row order that count() produced
license_unique <- records_all %>%
  distinct(license_url, license_text) %>%
  arrange(license_url, license_text)
write_csv(license_unique, "output/licenses/license_unique.csv")