@@ -58,9 +58,12 @@ countries <- fread(here("data", "countries.csv"), header=FALSE)
5858authors2 <- left_join(authors, countries, by=c("country"="V1"))
5959authors2$country <- authors2$V2
6060
61+ authors2$country[authors2$country == "USA"] <- "United States"
62+ authors2$country[authors2$country == "The Netherlands"] <- "Netherlands"
63+
6164authors2 <- authors2 %>% mutate(country2 = case_when(
6265 country == "United Kingdom" ~ "UK",
63- country == "USA " ~ "USA",
66+ country == "United States " ~ "USA",
6467 country == "China" ~ "China",
6568 # country == "Sweden" ~ "Sweden",
6669 # country == "Germany" ~ "Germany",
@@ -74,13 +77,18 @@ authors2 <- authors2 %>% mutate(country2 = case_when(
7477
7578table(authors2$country2)
7679
77-
7880temp <- authors2 %>%
7981 filter(!duplicated(pmid)) %>% inner_join(., subset(abstracts, select=c(pmid, pub_date)), by="pmid") %>%
8082 group_by(country2, year=year(pub_date)) %>%
8183 summarise(n=n())
8284data_ends <- filter(temp, year == "2023")
8385
86+ temp_all <- authors2 %>%
87+ filter(!duplicated(pmid)) %>% inner_join(., subset(abstracts, select=c(pmid, pub_date)), by="pmid") %>%
88+ group_by(country, year=year(pub_date)) %>%
89+ summarise(n=n())
90+
91+
8492temp %>%
8593 ggplot(., aes(x=year, y=n, colour=country2)) +
8694 # geom_smooth(se=FALSE) +
@@ -106,9 +114,121 @@ temp %>%
106114 aes(label = country2), data = data_ends
107115 ) +
108116 xlim(2005,2023) +
109- theme(legend.position="none")
117+ theme(legend.position="none") +
118+ labs(y="Number of papers", x="Year")
119+ ggsave("country.pdf", width=8, height=4)
120+ ```
121+
122+ ## Paper mills
123+
124+ ``` {r}
125+ paper_mill <- tibble(perc=c(0.92, 0.02, 0, 0.06), country2=c("China", "USA", "UK", "All other countries"))
126+ temp <- left_join(temp, paper_mill, by="country2")
127+ temp$n_adj <- temp$n * (1 - temp$perc)
128+ data_ends <- filter(temp, year == "2023")
129+ temp %>%
130+ ggplot(., aes(x=year, y=n_adj, colour=country2)) +
131+ # geom_smooth(se=FALSE) +
132+ geom_line() +
133+ geom_point(data=data_ends, aes(x=year, y=n_adj, colour=country2)) +
134+ geom_text_repel(
135+ aes(label = country2), data = data_ends
136+ ) +
137+ xlim(2005,2023) +
138+ theme(legend.position="none") +
139+ labs(y="Relative number of papers adjusted for paper mills", x="Year")
140+
141+ ```
142+
143+ Retraction watch paper mill papers
144+
145+ ``` {r}
146+ format_date <- function(x) {
147+ do.call(rbind, strsplit(x, " "))[,1] %>% gsub("/", "-", .) %>% as.Date(., format="%m-%d-%Y") %>% lubridate::year()
148+ }
149+
150+ format_country <- function(x) {
151+ case_when(grepl("China", x) ~ "China", grepl("United States", x) ~ "USA", grepl("United Kingdom", x) ~ "UK", TRUE ~ "All other countries")
152+ }
153+
154+ rw <- read.csv("~/Downloads/retraction_watch.csv")
155+
156+ # split the Country column by ";" and duplicate the row for every country
157+ dim(rw)
158+ rw <- rw %>% tidyr::separate_rows(Country, sep=";")
159+
160+ table(rw$Country)
161+
162+ pm <- paper_milly <- rw %>% filter(grepl("paper mill", Reason, ignore.case=TRUE)) %>%
163+ mutate(year=format_date(OriginalPaperDate), country2 = format_country(Country)) %>%
164+ group_by(year, country=Country) %>% summarise(nret=n())
165+
166+ left_join(temp_all, pm) %>%
167+ mutate(nret = ifelse(is.na(nret), 0, nret)) %>%
168+ filter(year < 2023) %>%
169+ ggplot(., aes(x=nret, y=n, colour=year, group=country)) + geom_point() + geom_line(aes(group=country))
170+
171+ left_join(temp_all, pm) %>%
172+ mutate(nret = ifelse(is.na(nret), 0, nret)) %>%
173+ filter(country == "China" & year < 2023) %>%
174+ ggplot(., aes(x=nret, y=n, colour=year)) + geom_point() + geom_line()
175+ ```
176+
177+
178+ ``` {r}
179+ paper_milly <- rw %>% filter(grepl("paper mill", Reason, ignore.case=TRUE)) %>%
180+ mutate(year=format_date(OriginalPaperDate), country2 = format_country(Country)) %>%
181+ group_by(year, country2) %>% summarise(nret=n())
182+
183+ paper_mill_y <- group_by(paper_milly, year) %>% summarise(nret_all=sum(nret))
184+ paper_milly <- left_join(paper_milly, paper_mill_y, by="year") %>% mutate(percrety=nret/nret_all)
185+ ggplot(paper_milly, aes(x=year, y=nret, colour=country2)) + geom_line() + geom_point() + theme(legend.position="bottom") + labs(y="Proportion of papers in paper mills", x="Year") + scale_colour_brewer(type="qual") + xlim(2015,2022)
186+
187+ temp <- left_join(temp, paper_milly, by=c("year"="year", "country2"="country2"))
188+ temp$percrety[is.na(temp$percrety)] <- 0
189+
190+ temp$n_adjy <- temp$n * (1 - temp$percrety)
191+ data_ends <- filter(temp, year == "2023")
192+ temp %>%
193+ ggplot(., aes(x=year, y=n_adj, colour=country2)) +
194+ # geom_smooth(se=FALSE) +
195+ geom_line() +
196+ geom_point(data=data_ends, aes(x=year, y=n_adj, colour=country2)) +
197+ geom_text_repel(
198+ aes(label = country2), data = data_ends
199+ ) +
200+ xlim(2005,2023) +
201+ theme(legend.position="none", ) +
202+ labs(y="Relative number of papers adjusted for paper mills", x="Year")
203+
204+ temp %>%
205+ ggplot(., aes(x=year, y=n_adjy, colour=country2)) +
206+ # geom_smooth(se=FALSE) +
207+ geom_line() +
208+ geom_point(data=data_ends, aes(x=year, y=n_adjy, colour=country2)) +
209+ geom_text_repel(
210+ aes(label = country2), data = data_ends
211+ ) +
212+ xlim(2005,2023) +
213+ theme(legend.position="none") +
214+ labs(y="Relative number of papers adjusted for paper mills", x="Year")
110215```
111216
217+ ``` {r}
218+ temp %>%
219+ group_by(year) %>%
220+ summarise(n=sum(n), n_adjy=sum(n_adjy)) %>%
221+ tidyr::pivot_longer(cols=c(n, n_adjy), names_to="type", values_to="value") %>%
222+ ggplot(., aes(x=year, y=value)) +
223+ # geom_smooth(se=FALSE) +
224+ geom_line(aes(colour=type)) +
225+ xlim(2005,2023) +
226+ labs(y="Relative number of papers adjusted for paper mills", x="Year")
227+
228+ ```
229+
230+
231+
112232## China universities
113233
114234``` {r}
0 commit comments