upd

explodecomputer · explodecomputer · commit acf9ee9f6acc · 2024-12-24T23:43:47.000Z
diff --git a/docs/country.Rmd b/docs/country.Rmd
@@ -58,9 +58,12 @@ countries <- fread(here("data", "countries.csv"), header=FALSE)
 authors2 <- left_join(authors, countries, by=c("country"="V1"))
 authors2$country <- authors2$V2
 
+authors2$country[authors2$country == "USA"] <- "United States"
+authors2$country[authors2$country == "The Netherlands"] <- "Netherlands"
+
 authors2 <- authors2 %>% mutate(country2 = case_when(
     country == "United Kingdom" ~ "UK",
-    country == "USA" ~ "USA",
+    country == "United States" ~ "USA",
     country == "China" ~ "China",
     # country == "Sweden" ~ "Sweden",
     # country == "Germany" ~ "Germany",
@@ -74,13 +77,18 @@ authors2 <- authors2 %>% mutate(country2 = case_when(
 
 table(authors2$country2)
 
-
 temp <- authors2 %>%
     filter(!duplicated(pmid)) %>% inner_join(., subset(abstracts, select=c(pmid, pub_date)), by="pmid") %>%
     group_by(country2, year=year(pub_date)) %>%
     summarise(n=n()) 
 data_ends <- filter(temp, year == "2023")
 
+temp_all <- authors2 %>%
+    filter(!duplicated(pmid)) %>% inner_join(., subset(abstracts, select=c(pmid, pub_date)), by="pmid") %>%
+    group_by(country, year=year(pub_date)) %>%
+    summarise(n=n())
+
+
 temp %>%
     ggplot(., aes(x=year, y=n, colour=country2)) + 
     # geom_smooth(se=FALSE) + 
@@ -106,9 +114,121 @@ temp %>%
         aes(label = country2), data = data_ends
     ) +
    xlim(2005,2023) +
-   theme(legend.position="none")
+   theme(legend.position="none") +
+   labs(y="Number of papers", x="Year")
+ggsave("country.pdf", width=8, height=4)
+```
+
+## Paper mills
+
+```{r}
+paper_mill <- tibble(perc=c(0.92, 0.02, 0, 0.06), country2=c("China", "USA", "UK", "All other countries"))
+temp <- left_join(temp, paper_mill, by="country2")
+temp$n_adj <- temp$n * (1 - temp$perc)
+data_ends <- filter(temp, year == "2023")
+temp %>%
+    ggplot(., aes(x=year, y=n_adj, colour=country2)) + 
+    # geom_smooth(se=FALSE) + 
+    geom_line() +
+    geom_point(data=data_ends, aes(x=year, y=n_adj, colour=country2)) +
+    geom_text_repel(
+        aes(label = country2), data = data_ends
+    ) +
+   xlim(2005,2023) +
+   theme(legend.position="none") +
+   labs(y="Relative number of papers adjusted for paper mills", x="Year")
+
+```
+
+Retraction Watch records flagged as paper mill papers
+
+```{r}
+format_date <- function(x) {  # x: "m/d/Y[ time]" strings -> numeric year, one value per element (NA where the date does not parse)
+    sub(" .*$", "", x) |> gsub(pattern="/", replacement="-") |> as.Date(format="%m-%d-%Y") |> format(format="%Y") |> as.numeric()  # sub() is element-wise, so ""/zero-length input keeps its length (rbind(strsplit()) did not)
+}
+
+format_country <- function(x) {  # collapse country strings into "China" / "USA" / "UK" / "All other countries"; patterns are tested in that order and the first match wins
+    case_when(grepl("China", x) ~ "China", grepl("United States", x) ~ "USA", grepl("United Kingdom", x) ~ "UK", TRUE ~ "All other countries")
+}
+
+rw <- read.csv("~/Downloads/retraction_watch.csv")
+
+# split the Country column by ";" and duplicate the row for every country
+dim(rw)
+rw <- rw %>% tidyr::separate_rows(Country, sep=";")
+
+table(rw$Country)
+
+# Rows whose Reason mentions "paper mill", counted per year (parsed from OriginalPaperDate) and per Country value.
+pm <- rw %>% filter(grepl("paper mill", Reason, ignore.case=TRUE)) %>%
+    mutate(year=format_date(OriginalPaperDate)) %>% group_by(year, country=Country) %>% summarise(nret=n())
+# Join keys are spelled out so the match cannot change silently if either table gains another shared column name.
+left_join(temp_all, pm, by=c("country", "year")) %>%
+    mutate(nret = ifelse(is.na(nret), 0, nret)) %>%
+    filter(year < 2023) %>%
+    ggplot(., aes(x=nret, y=n, colour=year, group=country)) + geom_point() + geom_line(aes(group=country))
+# Same comparison restricted to the rows where country == "China".
+left_join(temp_all, pm, by=c("country", "year")) %>%
+    mutate(nret = ifelse(is.na(nret), 0, nret)) %>%
+    filter(country == "China" & year < 2023) %>%
+    ggplot(., aes(x=nret, y=n, colour=year)) + geom_point() + geom_line()
+```
+
+
+```{r}
+# Rows whose Reason mentions "paper mill", counted per year and per country bucket produced by format_country().
+paper_milly <- rw %>% filter(grepl("paper mill", Reason, ignore.case=TRUE)) %>%
+    mutate(year=format_date(OriginalPaperDate), country2 = format_country(Country)) %>% group_by(year, country2) %>% summarise(nret=n())
+paper_mill_y <- group_by(paper_milly, year) %>% summarise(nret_all=sum(nret))
+paper_milly <- left_join(paper_milly, paper_mill_y, by="year") %>% mutate(percrety=nret/nret_all)  # percrety: bucket's share of that year's counted rows
+# The plot maps y to the raw count nret, so the axis is labelled as a count; map y=percrety instead if the share was intended.
+ggplot(paper_milly, aes(x=year, y=nret, colour=country2)) + geom_line() + geom_point() + theme(legend.position="bottom") + labs(y="Number of paper mill retractions", x="Year") + scale_colour_brewer(type="qual") + xlim(2015,2022)
+
+temp <- left_join(temp, paper_milly, by=c("year"="year", "country2"="country2"))
+temp$percrety[is.na(temp$percrety)] <- 0  # year/bucket pairs absent from paper_milly get a zero share
+# n_adjy: yearly paper count n scaled down by the year-specific share percrety.
+temp$n_adjy <- temp$n * (1 - temp$percrety)
+data_ends <- filter(temp, year == "2023")
+temp %>%
+    ggplot(., aes(x=year, y=n_adj, colour=country2)) +  # NOTE(review): plots n_adj (fixed-fraction column), not the n_adjy just computed — confirm this repeat is intended
+    # geom_smooth(se=FALSE) + 
+    geom_line() +
+    geom_point(data=data_ends, aes(x=year, y=n_adj, colour=country2)) +
+    geom_text_repel(
+        aes(label = country2), data = data_ends
+    ) +
+   xlim(2005,2023) +
+   theme(legend.position="none") +
+   labs(y="Relative number of papers adjusted for paper mills", x="Year")
+
+temp %>%
+    ggplot(., aes(x=year, y=n_adjy, colour=country2)) + 
+    # geom_smooth(se=FALSE) + 
+    geom_line() +
+    geom_point(data=data_ends, aes(x=year, y=n_adjy, colour=country2)) +
+    geom_text_repel(
+        aes(label = country2), data = data_ends
+    ) +
+   xlim(2005,2023) +
+   theme(legend.position="none") +
+   labs(y="Relative number of papers adjusted for paper mills", x="Year")
 ```
 
+```{r}
+temp %>%
+    group_by(year) %>%
+    summarise(n=sum(n), n_adjy=sum(n_adjy)) %>%
+    tidyr::pivot_longer(cols=c(n, n_adjy), names_to="type", values_to="value") %>%
+    ggplot(., aes(x=year, y=value)) + 
+    # geom_smooth(se=FALSE) + 
+    geom_line(aes(colour=type)) +
+   xlim(2005,2023) +
+   labs(y="Relative number of papers adjusted for paper mills", x="Year")
+
+```
+
+
+
 ## China universities
 
 ```{r}
diff --git a/docs/eje.html b/docs/eje.html
diff --git a/docs/eje.rmd b/docs/eje.rmd
@@ -11,6 +11,7 @@ library(here)
 library(tidyr)
 library(ggplot2)
 library(ggrepel)
+library(janitor)
 ```
 
 
@@ -39,4 +40,182 @@ ggplot(eje, aes(x=year, y=count, fill=type, group=type, colour=type)) +
   labs(y="Submissions received by EJE annually") +
   theme(legend.position = "none") +
   expand_limits(y = 0)
-```
+```
+
+
+## EJE MR papers
+
+```{r}
+a <- read_xlsx("~/Downloads/Mendelian Randomisation Submissions_2015_24.xlsx")
+a <- clean_names(a)
+a$date <- lubridate::mdy(a$initial_date_submitted)
+str(a)
+
+table(a$country)
+a$type <- ifelse(a$country == "CHINA", "China", "All other countries")
+
+a$year <- lubridate::year(a$date)
+a2 <- a %>% group_by(year, type) %>%
+  summarise(count=n())
+
+
+eje_e2 <- subset(a2, year == "2023")
+
+ggplot(a2 %>% filter(year != 2024), aes(x=year, y=count, fill=type, group=type, colour=type)) +
+  geom_line() +
+    geom_point(data=eje_e2, aes(x=year, y=count)) +
+    geom_text_repel(
+        aes(label = type), data = eje_e2
+    ) +
+  labs(y="MR Submissions received by EJE annually") +
+  theme(legend.position = "none") +
+  expand_limits(y = 0)
+```
+
+
+```{r}
+eje$year <- as.numeric(eje$year)
+eje_all <- inner_join(eje, a2, by=c("year", "type"), suffix=c("_all", "_mr"))
+eje_all$count_other <- eje_all$count_all - eje_all$count_mr
+
+tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
+  ggplot(aes(x=year, y=value, group=type, colour=name)) +
+  geom_point() +
+  geom_line(aes(group=paste(type, name), linetype=type)) +
+  labs(y="Submissions received by EJE annually")
+
+tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
+  filter(name != "count_all") %>% 
+  mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR papers", name == "count_other" ~ "Other papers")) %>%
+  ggplot(aes(x=year, y=value, group=type, fill=name)) +
+  geom_bar(aes(fill=name, group=type), stat="identity", position="stack") +
+  facet_grid( ~ type) +
+  labs(x="Year", y="Submissions received by EJE annually", fill="Submission type")
+```
+
+Ratios
+
+```{r}
+eje_all
+
+tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
+  group_by(name) %>%
+  do({
+    china <- subset(., type == "China")
+    other <- subset(., type == "All other countries")
+    a <- inner_join(china, other, by="year")
+    a$prop <- a$value.x / (a$value.y + a$value.x)
+    a
+  }) %>%
+  mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR", name == "count_other" ~ "Other")) %>%
+  ggplot(aes(x=year, y=prop, colour=name)) +
+  geom_line() +
+  scale_colour_brewer(type="qual") +
+  labs(x="Year", y="Proportion of submissions from China", colour="Paper type")
+
+```
+
+Growth rates
+
+```{r}
+calculate_exp_growth <- function(time, counts) {
+    # Log-linear fit of counts against time; returns a one-row tibble of growth statistics.
+    # Exponential growth N(t) = N0 * exp(r * t) is linear on the log scale:
+    # log(N(t)) = log(N0) + r * t, so r is the slope of log(counts) on time.
+    fit_summary <- summary(lm(log(counts) ~ time))
+    coef_table <- fit_summary$coefficients
+
+    # Second row of the coefficient table is the slope: estimate and standard error
+    rate <- coef_table[2, 1]
+    rate_se <- coef_table[2, 2]
+
+    # Doubling time T = log(2) / r. Its SE is computed two ways:
+    #   * delta method: |dT/dr| * se(r) = log(2) * se(r) / r^2
+    #   * empirically: SD of log(2) / r over 1000 normal draws of r
+    # (no seed is set here, so the empirical SE varies between calls)
+    simulated_rates <- rnorm(1000, rate, rate_se)
+    analytic_se <- log(2) * rate_se / rate^2
+    empirical_se <- sd(log(2) / simulated_rates)
+
+    # initial_value is exp(intercept), i.e. the fitted count at time = 0 on
+    # whatever scale `time` is supplied in.
+    growth_summary <- list(
+        growth_coefficient = rate,
+        growth_coefficient_se = rate_se,
+        r_squared = fit_summary$r.squared,
+        doubling_time = log(2) / rate,
+        doubling_time_se = analytic_se,
+        doubling_time_se_emp = empirical_se,
+        initial_value = exp(coef(fit_summary)[1])
+    )
+    as_tibble(growth_summary)
+}
+
+tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
+  filter(name != "count_all") %>% 
+  mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR papers", name == "count_other" ~ "Other papers")) %>%
+  group_by(name, type) %>%
+  do({
+    calculate_exp_growth(.$year, .$value)
+  }) %>% as.data.frame
+
+```
+
+## Adjusting for estimated paper mill fraction
+
+Suppose that some fraction of all papers come from paper mills. 
+
+
+
+```{r}
+
+paper_mill_fraction <- tibble(
+  # TODO(review): empty placeholder — no columns are defined yet and the pipeline below does not read this object
+)
+
+tidyr::pivot_longer(eje_all, c(count_all, count_mr, count_other)) %>%
+  filter(name != "count_all") %>% 
+  mutate(name = case_when(name == "count_all" ~ "All", name == "count_mr" ~ "MR papers", name == "count_other" ~ "Other papers")) %>%
+  group_by(name, type) %>%
+  do({
+    calculate_exp_growth(.$year, .$value)
+  }) %>% as.data.frame
+
+```
+
+
+
+## Nature Index
+
+
+
+```{r}
+ni <- lapply(2016:2024, \(y) {  # one table per annual CSV; the tables are stacked by bind_rows() below
+  a <- read.csv(file.path("~/Downloads", paste0(y, "-research-leading-countries.csv"))) %>% clean_names()
+  i <- which(names(a) == paste0("share_", y-1))  # the file for year y is searched for columns suffixed with y-1
+  names(a)[i] <- "share"  # rename to a year-agnostic name so the tables can be stacked
+  i <- which(names(a) == paste0("count_", y-1))
+  names(a)[i] <- "count"  # if no column matched, nothing is renamed and select() below fails on the missing name
+  a <- a %>% select(position, country=country_territory, share, count) %>% mutate(year=y)  # tag each row with the file's year
+}) %>% bind_rows()
+
+ni$country2 <- ni$country
+ni$country2[! ni$country2 %in% c("United States of America (USA)", "United Kingdom (UK)", "China")] <- "All other countries"
+ni <- ni %>% group_by(country2, year)  %>% summarise(share = sum(share), count=sum(count), position=mean(position))
+
+ggplot(ni, aes(x=year, y=share, group=country2, colour=country2)) +
+  geom_line() +
+  geom_point() +
+  labs(y="Nature Index share of top 10 countries") +
+  expand_limits(y = 0) +
+  scale_colour_brewer(type="qual")
+
+ggplot(ni, aes(x=year, y=count, group=country2, colour=country2)) +
+  geom_line() +
+  geom_point() +
+  labs(y="Nature Index count of top 10 countries") +
+  expand_limits(y = 0) +
+  scale_colour_brewer(type="qual")
+```
+
+