Skip to content

Commit acf9ee9

Browse files
upd
1 parent ce54e9d commit acf9ee9

File tree

3 files changed

+881
-4
lines changed

3 files changed

+881
-4
lines changed

docs/country.Rmd

Lines changed: 123 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,12 @@ countries <- fread(here("data", "countries.csv"), header=FALSE)
5858
authors2 <- left_join(authors, countries, by=c("country"="V1"))
5959
authors2$country <- authors2$V2
6060
61+
authors2$country[authors2$country == "USA"] <- "United States"
62+
authors2$country[authors2$country == "The Netherlands"] <- "Netherlands"
63+
6164
authors2 <- authors2 %>% mutate(country2 = case_when(
6265
country == "United Kingdom" ~ "UK",
63-
country == "USA" ~ "USA",
66+
country == "United States" ~ "USA",
6467
country == "China" ~ "China",
6568
# country == "Sweden" ~ "Sweden",
6669
# country == "Germany" ~ "Germany",
@@ -74,13 +77,18 @@ authors2 <- authors2 %>% mutate(country2 = case_when(
7477
7578
table(authors2$country2)
7679
77-
7880
temp <- authors2 %>%
7981
filter(!duplicated(pmid)) %>% inner_join(., subset(abstracts, select=c(pmid, pub_date)), by="pmid") %>%
8082
group_by(country2, year=year(pub_date)) %>%
8183
summarise(n=n())
8284
data_ends <- filter(temp, year == "2023")
8385
86+
temp_all <- authors2 %>%
87+
filter(!duplicated(pmid)) %>% inner_join(., subset(abstracts, select=c(pmid, pub_date)), by="pmid") %>%
88+
group_by(country, year=year(pub_date)) %>%
89+
summarise(n=n())
90+
91+
8492
temp %>%
8593
ggplot(., aes(x=year, y=n, colour=country2)) +
8694
# geom_smooth(se=FALSE) +
@@ -106,9 +114,121 @@ temp %>%
106114
aes(label = country2), data = data_ends
107115
) +
108116
xlim(2005,2023) +
109-
theme(legend.position="none")
117+
theme(legend.position="none") +
118+
labs(y="Number of papers", x="Year")
119+
ggsave("country.pdf", width=8, height=4)
120+
```
121+
122+
## Paper mills
123+
124+
```{r}
125+
paper_mill <- tibble(perc=c(0.92, 0.02, 0, 0.06), country2=c("China", "USA", "UK", "All other countries"))
126+
temp <- left_join(temp, paper_mill, by="country2")
127+
temp$n_adj <- temp$n * (1 - temp$perc)
128+
data_ends <- filter(temp, year == "2023")
129+
temp %>%
130+
ggplot(., aes(x=year, y=n_adj, colour=country2)) +
131+
# geom_smooth(se=FALSE) +
132+
geom_line() +
133+
geom_point(data=data_ends, aes(x=year, y=n_adj, colour=country2)) +
134+
geom_text_repel(
135+
aes(label = country2), data = data_ends
136+
) +
137+
xlim(2005,2023) +
138+
theme(legend.position="none") +
139+
labs(y="Relative number of papers adjusted for paper mills", x="Year")
140+
141+
```
142+
143+
Retraction watch paper mill papers
144+
145+
```{r}
146+
format_date <- function(x) {
147+
do.call(rbind, strsplit(x, " "))[,1] %>% gsub("/", "-", .) %>% as.Date(., format="%m-%d-%Y") %>% lubridate::year()
148+
}
149+
150+
format_country <- function(x) {
151+
case_when(grepl("China", x) ~ "China", grepl("United States", x) ~ "USA", grepl("United Kingdom", x) ~ "UK", TRUE ~ "All other countries")
152+
}
153+
154+
rw <- read.csv("~/Downloads/retraction_watch.csv")
155+
156+
# split the Country column by ";" and duplicate the row for every country
157+
dim(rw)
158+
rw <- rw %>% tidyr::separate_rows(Country, sep=";")
159+
160+
table(rw$Country)
161+
162+
pm <- paper_milly <- rw %>% filter(grepl("paper mill", Reason, ignore.case=TRUE)) %>%
163+
mutate(year=format_date(OriginalPaperDate), country2 = format_country(Country)) %>%
164+
group_by(year, country=Country) %>% summarise(nret=n())
165+
166+
left_join(temp_all, pm) %>%
167+
mutate(nret = ifelse(is.na(nret), 0, nret)) %>%
168+
filter(year < 2023) %>%
169+
ggplot(., aes(x=nret, y=n, colour=year, group=country)) + geom_point() + geom_line(aes(group=country))
170+
171+
left_join(temp_all, pm) %>%
172+
mutate(nret = ifelse(is.na(nret), 0, nret)) %>%
173+
filter(country == "China" & year < 2023) %>%
174+
ggplot(., aes(x=nret, y=n, colour=year)) + geom_point() + geom_line()
175+
```
176+
177+
178+
```{r}
179+
paper_milly <- rw %>% filter(grepl("paper mill", Reason, ignore.case=TRUE)) %>%
180+
mutate(year=format_date(OriginalPaperDate), country2 = format_country(Country)) %>%
181+
group_by(year, country2) %>% summarise(nret=n())
182+
183+
paper_mill_y <- group_by(paper_milly, year) %>% summarise(nret_all=sum(nret))
184+
paper_milly <- left_join(paper_milly, paper_mill_y, by="year") %>% mutate(percrety=nret/nret_all)
185+
ggplot(paper_milly, aes(x=year, y=nret, colour=country2)) + geom_line() + geom_point() + theme(legend.position="bottom") + labs(y="Proportion of papers in paper mills", x="Year") + scale_colour_brewer(type="qual") + xlim(2015,2022)
186+
187+
temp <- left_join(temp, paper_milly, by=c("year"="year", "country2"="country2"))
188+
temp$percrety[is.na(temp$percrety)] <- 0
189+
190+
temp$n_adjy <- temp$n * (1 - temp$percrety)
191+
data_ends <- filter(temp, year == "2023")
192+
temp %>%
193+
ggplot(., aes(x=year, y=n_adj, colour=country2)) +
194+
# geom_smooth(se=FALSE) +
195+
geom_line() +
196+
geom_point(data=data_ends, aes(x=year, y=n_adj, colour=country2)) +
197+
geom_text_repel(
198+
aes(label = country2), data = data_ends
199+
) +
200+
xlim(2005,2023) +
201+
theme(legend.position="none", ) +
202+
labs(y="Relative number of papers adjusted for paper mills", x="Year")
203+
204+
temp %>%
205+
ggplot(., aes(x=year, y=n_adjy, colour=country2)) +
206+
# geom_smooth(se=FALSE) +
207+
geom_line() +
208+
geom_point(data=data_ends, aes(x=year, y=n_adjy, colour=country2)) +
209+
geom_text_repel(
210+
aes(label = country2), data = data_ends
211+
) +
212+
xlim(2005,2023) +
213+
theme(legend.position="none") +
214+
labs(y="Relative number of papers adjusted for paper mills", x="Year")
110215
```
111216

217+
```{r}
218+
temp %>%
219+
group_by(year) %>%
220+
summarise(n=sum(n), n_adjy=sum(n_adjy)) %>%
221+
tidyr::pivot_longer(cols=c(n, n_adjy), names_to="type", values_to="value") %>%
222+
ggplot(., aes(x=year, y=value)) +
223+
# geom_smooth(se=FALSE) +
224+
geom_line(aes(colour=type)) +
225+
xlim(2005,2023) +
226+
labs(y="Relative number of papers adjusted for paper mills", x="Year")
227+
228+
```
229+
230+
231+
112232
## China universities
113233

114234
```{r}

docs/eje.html

Lines changed: 578 additions & 0 deletions
Large diffs are not rendered by default.

docs/eje.rmd

Lines changed: 180 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ library(here)
1111
library(tidyr)
1212
library(ggplot2)
1313
library(ggrepel)
14+
library(janitor)
1415
```
1516

1617

@@ -39,4 +40,182 @@ ggplot(eje, aes(x=year, y=count, fill=type, group=type, colour=type)) +
3940
labs(y="Submissions received by EJE annually") +
4041
theme(legend.position = "none") +
4142
expand_limits(y = 0)
42-
```
43+
```
44+
45+
46+
## EJE MR papers
47+
48+
```{r}
49+
a <- read_xlsx("~/Downloads/Mendelian Randomisation Submissions_2015_24.xlsx")
50+
a <- clean_names(a)
51+
a$date <- lubridate::mdy(a$initial_date_submitted)
52+
str(a)
53+
54+
table(a$country)
55+
a$type <- ifelse(a$country == "CHINA", "China", "All other countries")
56+
57+
a$year <- lubridate::year(a$date)
58+
a2 <- a %>% group_by(year, type) %>%
59+
summarise(count=n())
60+
61+
62+
eje_e2 <- subset(a2, year == "2023")
63+
64+
ggplot(a2 %>% filter(year != 2024), aes(x=year, y=count, fill=type, group=type, colour=type)) +
65+
geom_line() +
66+
geom_point(data=eje_e2, aes(x=year, y=count)) +
67+
geom_text_repel(
68+
aes(label = type), data = eje_e2
69+
) +
70+
labs(y="MR Submissions received by EJE annually") +
71+
theme(legend.position = "none") +
72+
expand_limits(y = 0)
73+
```
74+
75+
76+
```{r}
77+
eje$year <- as.numeric(eje$year)
78+
eje_all <- inner_join(eje, a2, by=c("year", "type"), suffix=c("_all", "_mr"))
79+
eje_all$count_other <- eje_all$count_all - eje_all$count_mr
80+
81+
tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
82+
ggplot(aes(x=year, y=value, group=type, colour=name)) +
83+
geom_point() +
84+
geom_line(aes(group=paste(type, name), linetype=type)) +
85+
labs(y="Submissions received by EJE annually")
86+
87+
tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
88+
filter(name != "count_all") %>%
89+
mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR papers", name == "count_other" ~ "Other papers")) %>%
90+
ggplot(aes(x=year, y=value, group=type, fill=name)) +
91+
geom_bar(aes(fill=name, group=type), stat="identity", position="stack") +
92+
facet_grid( ~ type) +
93+
labs(x="Year", y="Submissions received by EJE annually", fill="Submission type")
94+
```
95+
96+
Ratios
97+
98+
```{r}
99+
eje_all
100+
101+
tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
102+
group_by(name) %>%
103+
do({
104+
china <- subset(., type == "China")
105+
other <- subset(., type == "All other countries")
106+
a <- inner_join(china, other, by="year")
107+
a$prop <- a$value.x / (a$value.y + a$value.x)
108+
a
109+
}) %>%
110+
mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR", name == "count_other" ~ "Other")) %>%
111+
ggplot(aes(x=year, y=prop, colour=name)) +
112+
geom_line() +
113+
scale_colour_brewer(type="qual") +
114+
labs(x="Year", y="Proportion of submissions from China", colour="Paper type")
115+
116+
```
117+
118+
Growth rates
119+
120+
```{r}
121+
calculate_exp_growth <- function(time, counts) {
122+
# Fit exponential model by taking log of counts
123+
# N(t) = N0 * e^(rt) becomes log(N(t)) = log(N0) + rt
124+
model <- summary(lm(log(counts) ~ time))
125+
126+
# Extract growth coefficient (r)
127+
r <- model$coefficients[2,1]
128+
r_se <- model$coefficients[2,2]
129+
130+
# Calculate R-squared to assess fit
131+
r_squared <- model$r.squared
132+
133+
# Calculate doubling time
134+
doubling_time <- log(2)/r
135+
136+
rdist <- rnorm(1000, r, r_se)
137+
doubling_time_se <- log(2) * r_se / r^2
138+
doubling_time_se_emp <- sd(log(2) / rdist)
139+
140+
141+
142+
# Return results
143+
return(list(
144+
growth_coefficient = r,
145+
growth_coefficient_se = r_se,
146+
r_squared = r_squared,
147+
doubling_time = doubling_time,
148+
doubling_time_se = doubling_time_se,
149+
doubling_time_se_emp = doubling_time_se_emp,
150+
initial_value = exp(coef(model)[1])
151+
) %>% as_tibble())
152+
}
153+
154+
tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
155+
filter(name != "count_all") %>%
156+
mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR papers", name == "count_other" ~ "Other papers")) %>%
157+
group_by(name, type) %>%
158+
do({
159+
calculate_exp_growth(.$year, .$value)
160+
}) %>% as.data.frame
161+
162+
```
163+
164+
## Adjusting for estimated paper mill fraction
165+
166+
Suppose that some fraction of all papers come from paper mills.
167+
168+
169+
170+
```{r}
171+
172+
paper_mill_fraction <- tibble(
173+
174+
)
175+
176+
tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
177+
filter(name != "count_all") %>%
178+
mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR papers", name == "count_other" ~ "Other papers")) %>%
179+
group_by(name, type) %>%
180+
do({
181+
calculate_exp_growth(.$year, .$value)
182+
}) %>% as.data.frame
183+
184+
```
185+
186+
187+
188+
## Nature Index
189+
190+
191+
192+
```{r}
193+
ni <- lapply(2016:2024, \(y) {
194+
a <- read.csv(file.path("~/Downloads", paste0(y, "-research-leading-countries.csv"))) %>% clean_names()
195+
i <- which(names(a) == paste0("share_", y-1))
196+
names(a)[i] <- "share"
197+
i <- which(names(a) == paste0("count_", y-1))
198+
names(a)[i] <- "count"
199+
a <- a %>% select(position, country=country_territory, share, count) %>% mutate(year=y)
200+
}) %>% bind_rows()
201+
202+
ni$country2 <- ni$country
203+
ni$country2[! ni$country2 %in% c("United States of America (USA)", "United Kingdom (UK)", "China")] <- "All other countries"
204+
ni <- ni %>% group_by(country2, year) %>% summarise(share = sum(share), count=sum(count), position=mean(position))
205+
206+
ggplot(ni, aes(x=year, y=share, group=country2, colour=country2)) +
207+
geom_line() +
208+
geom_point() +
209+
labs(y="Nature Index share of top 10 countries") +
210+
expand_limits(y = 0) +
211+
scale_colour_brewer(type="qual")
212+
213+
ggplot(ni, aes(x=year, y=count, group=country2, colour=country2)) +
214+
geom_line() +
215+
geom_point() +
216+
labs(y="Nature Index count of top 10 countries") +
217+
expand_limits(y = 0) +
218+
scale_colour_brewer(type="qual")
219+
```
220+
221+

0 commit comments

Comments
 (0)