This repository has been archived by the owner on Mar 27, 2023. It is now read-only.

Commit

Merge pull request #13 from subugoe/may_19_update
May 19 update
njahn82 authored May 24, 2019
2 parents b53d851 + 21c7211 commit b0e8944
Showing 13 changed files with 388,989 additions and 262,659 deletions.
12 changes: 5 additions & 7 deletions R/add_emails.R
@@ -1,10 +1,8 @@
#' emails add
library(tidyverse)
emails_df <- readr::read_csv("../data/emails_normalized.csv")
hybrid <- readr::read_csv("../data/hybrid_publications.csv") %>%
select(1:country_name)
hybrid_emails <- hybrid %>%
emails_df <- readr::read_csv("../data/emails_normalized.csv") %>%
distinct(doi, .keep_all = TRUE)
hybrid <- readr::read_csv("../data/hybrid_publications.csv")
hybrid %>%
left_join(emails_df, by = c("doi_oa" = "doi")) %>%
distinct(doi_oa, .keep_all = TRUE)
readr::write_csv(hybrid_emails, "../data/hybrid_publications.csv")

readr::write_csv("../data/hybrid_publications.csv")
19 changes: 13 additions & 6 deletions R/cr_fetching.R
@@ -23,7 +23,7 @@ knitr::opts_chunk$set(
library(tidyverse)
library(countrycode)
library(jsonlite)
library(rcrossref) # v 0.8
library(rcrossref)
#' link to dataset
u <-
"https://raw.githubusercontent.com/OpenAPC/openapc-de/master/data/apc_de.csv"
@@ -135,7 +135,7 @@ issns_list <-
filter(publisher == x, journal_full_title == y) %>%
.$issn
names(issns) <- rep("issn", length(issns))
as.list(issns)
issns
})
#' search crossref
jn_facets <- purrr::map(issns_list, .f = purrr::safely(function(x) {
@@ -218,22 +218,29 @@ hybrid_licenses <- jn_facets_df %>%
#' licenses were not issued for delayed open access articles by
#' additionally using the self-explanatory filter `license.url` and
#' `license.delay`. We also obtain parsed metadata for these hybrid open
#' access articles stored as list-column.
#' access articles stored as list-column. The metadata fields we parse are
#' defined in `cr_md_fields`.
cr_md_fields <- c("URL", "member", "created", "license",
"ISSN", "container-title", "issued", "approved",
"indexed", "accepted", "DOI", "funder", "published-print",
"subject", "published-online", "link", "type", "publisher",
"issn-type", "deposited", "content-created")
cr_license <- purrr::map2(hybrid_licenses$license_ref, hybrid_licenses$issn,
.f = purrr::safely(function(x, y) {
u <- x
issn <- y
names(issn) <-rep("issn", length(issn))
tmp <- rcrossref::cr_works(filter = c(issn,
license.url = u,
license.delay = 0,
type = "journal-article",
from_pub_date = "2013-01-01",
until_pub_date = "2019-12-31"),
cursor = "*", cursor_max = 5000L,
limit = 1000L)
limit = 1000L,
select = cr_md_fields)
tibble::tibble(
issn = list(issn),
year_published = list(tmp$facets$published),
license = u,
md = list(tmp$data)
)
@@ -246,7 +253,7 @@ cr_license_df <- cr_license %>%
dplyr::bind_rows(cr_license_df$md) %>%
jsonlite::stream_out(file("../data/hybrid_license_md.json"))
#' only DOIs and how we retrieved them
purrr::map(cr_license_df$md, "DOI") %>%
purrr::map(cr_license_df$md, "doi") %>%
data_frame(dois = ., issn = cr_license_df$issn, license = cr_license_df$license) %>%
jsonlite::stream_out(file("../data/hybrid_license_dois.json"))

68 changes: 31 additions & 37 deletions R/cr_match.R
@@ -3,67 +3,54 @@ library(jsonlite)
#' full data set
license_df <- jsonlite::stream_in(file("../data/hybrid_license_md.json")) %>%
as_data_frame() %>%
select(-title,-`clinical-trial-number`, -subtitle, -archive, -abstract, -archive_ , -NA.) %>%
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE, ignore.case = TRUE),
"Springer Nature", publisher))
"Springer Nature", publisher))
#' get duplicate dois
license_df %>%
select(DOI, container.title, publisher) %>%
group_by(DOI) %>%
select(doi, container.title, publisher) %>%
group_by(doi) %>%
filter(n() > 1)
#' which journals and publishers are affected
license_df %>%
select(DOI, container.title, publisher) %>%
group_by(DOI) %>%
select(doi, container.title, publisher) %>%
group_by(doi) %>%
filter(n() > 1) %>%
ungroup() %>%
count(container.title, publisher) %>%
arrange(desc(n))
#' create a tidy dataset
license_df %>%
select(DOI, ISSN, issued) %>%
select(doi, issn, issued) %>%
# to year
mutate(issued = lubridate::parse_date_time(issued, c('y', 'ymd', 'ym'))) %>%
mutate(issued = lubridate::year(issued)) %>%
# issn
separate(ISSN, into = c("issn_1", "issn_2", "issn_3"), sep = ",") %>%
separate(issn, into = c("issn_1", "issn_2", "issn_3"), sep = ",") %>%
gather(issn_1, issn_2, issn_3, key = "issn_type", value = "issn") %>%
filter(!is.na(issn)) -> tidy_oahybrid_df
#' add licensing info
jn_all <- jsonlite::stream_in(file("../data/jn_facets_df.json"), simplifyDataFrame = FALSE)
#' tidy import, (we need another backup strategy to avoid the following steps)
issn <- map_df(jn_all, "issn")
jn_df <- issn %>%
mutate(journal_title = map_chr(jn_all, "journal_title")) %>%
mutate(publisher = map_chr(jn_all, "publisher")) %>%
mutate(year_published = map(jn_all, c("year_published"))) %>%
mutate(license_refs = map(jn_all, c("license_refs"))) %>%
mutate(license_refs = map(license_refs, bind_rows)) %>%
gather(issn, issn.1, issn.2, issn.3, key = "issn_type", value = "issn") %>%
jn_all <- jsonlite::stream_in(file("../data/jn_facets_df.json"))
jn_df <- jn_all %>%
unnest(issn, .preserve = year_published) %>%
filter(!is.na(issn)) %>%
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE, ignore.case = TRUE),
"Springer Nature", publisher))
jn_df %>%
mutate(year_published = map(year_published, bind_rows)) -> jn_df
#' load doi set
dois <- jsonlite::stream_in(file("../data/hybrid_license_dois.json"), simplifyDataFrame = FALSE)
map_df(dois, "issn") %>%
mutate(doi_oa = map(dois, "dois")) %>%
mutate(license = map_chr(dois, "license")) %>%
# issn
gather(issn, issn.1, issn.2, issn.3, key = "issn_type", value = "issn") %>%
filter(!is.na(issn)) -> dois_df
dois <- jsonlite::stream_in(file("../data/hybrid_license_dois.json"))
dois %>%
unnest(issn, .preserve = dois) %>%
filter(!is.na(issn)) -> dois_df
#' join article datasets
tmp <- inner_join(dois_df, jn_df, by = c("issn" = "issn")) %>%
distinct(journal_title, publisher, license, .keep_all = TRUE)

hybrid_oa_df <- tmp %>%
select(1:2, journal_title, publisher, year_published) %>%
select(-issn) %>%
# remove delayed licenses
filter(map(doi_oa, length) > 0) %>%
unnest(doi_oa, .preserve = year_published) %>%
distinct(doi_oa, .keep_all = TRUE) %>%
left_join(tidy_oahybrid_df, by = c("doi_oa" = "DOI"))
filter(map(dois, length) > 0) %>%
unnest(dois, .preserve = year_published) %>%
distinct(dois, .keep_all = TRUE) %>%
left_join(tidy_oahybrid_df, by = c("dois" = "doi"))
#' check for DOAJ
#' #'
#' ## Dealing with flipped journals
@@ -110,11 +97,12 @@ hybrid_oa_df %>%
# export
flipped_jns %>%
select(-year_published) %>%
distinct(doi_oa,.keep_all = TRUE) %>%
distinct(dois,.keep_all = TRUE) %>%
readr::write_csv("../data/flipped_jns_doaj.csv")
#' remove flipped journals from hybrid license data set and store into json
hybrid_oa_df %>%
filter(!doi_oa %in% flipped_jns$doi_oa) %>%
rename(doi_oa = dois) %>%
filter(!doi_oa %in% flipped_jns$dois) %>%
# clean license URIS
mutate(license = gsub("\\/$", "", license)) %>%
mutate(license = gsub("https", "http", license)) -> hybrid_oa_df
@@ -132,8 +120,6 @@ hybrid_oa_df %>%
mutate(year = lubridate::parse_date_time(year, 'y')) %>%
mutate(year = lubridate::year(year)) %>%
left_join(indicator_df, by = c("journal_title", "publisher", "year" = "issued")) -> indicator_df


#' yearly
#' get journals that are probably flipped, defined as prop > 0.95 in at least two years
indicator_df %>%
@@ -144,9 +130,16 @@ indicator_df %>%
ungroup() %>%
group_by(journal_title, publisher) %>%
filter(n() > 1) -> prob_flipped
#' also check against journals found by Matthias et al.,
#' 10.5281/zenodo.2553582
rv_flip <- readr::read_csv("https://zenodo.org/record/2553582/files/reverse_flips_dataset.csv?download=1")
indicator_df %>%
inner_join(rv_flip, by = c("journal_title" = "journal_name")) %>%
filter(year < year_reverse_flipped) -> rev_flip_list
#' export and exclude them
readr::write_csv(prob_flipped, "../data/flipped_jns.csv")
anti_join(indicator_df, prob_flipped, by = c("journal_title", "publisher", "year")) -> indicator_df
anti_join(indicator_df, prob_flipped, by = c("journal_title", "publisher", "year")) %>%
anti_join(rev_flip_list, by = c("journal_title", "publisher", "year")) -> indicator_df
#' calculate publishers article volume and add this info to the dataset
indicator_df %>%
distinct(journal_title, publisher, year,.keep_all = TRUE) %>%
@@ -160,6 +153,7 @@ readr::write_csv(indicator_df, "../data/indicator.csv")
hybrid_dois <- hybrid_oa_df %>%
# remove flipped journals
anti_join(prob_flipped, by = c("journal_title", "publisher", "issued" = "year")) %>%
anti_join(rev_flip_list, by = c("journal_title", "publisher","issued" = "year")) %>%
# make sure dois are lower case
mutate(doi_oa = tolower(doi_oa)) %>%
distinct(doi_oa, .keep_all = TRUE) %>%
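Taken together, the reverse-flip handling added above reads the published dataset by Matthias et al. (10.5281/zenodo.2553582), flags indicator rows from years before a journal's reverse flip, and removes them along with the probably-flipped journals. A sketch of just that step, assuming `indicator_df` and `prob_flipped` as built earlier in `R/cr_match.R`:

```r
library(tidyverse)

# journals that flipped back from full open access
rv_flip <- readr::read_csv(
  "https://zenodo.org/record/2553582/files/reverse_flips_dataset.csv?download=1"
)

# rows covering years before the reverse flip took place
rev_flip_list <- indicator_df %>%
  inner_join(rv_flip, by = c("journal_title" = "journal_name")) %>%
  filter(year < year_reverse_flipped)

# exclude both groups from the indicator data
indicator_df <- indicator_df %>%
  anti_join(prob_flipped, by = c("journal_title", "publisher", "year")) %>%
  anti_join(rev_flip_list, by = c("journal_title", "publisher", "year"))
```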
52 changes: 31 additions & 21 deletions R/unpaywall_integration.R
@@ -9,15 +9,14 @@ con <- dbConnect(bigrquery::bigquery(),
project = "api-project-764811344545",
dataset = "oadoi_full")
#' test connection
oadoi <- tbl(con, "mongo_13_sep_18")
oadoi <- tbl(con, "apr_19_mongo_export_2013_Apr2019_full")
#' my bq query
sql_unnested <-
"SELECT doi, evidence, publisher, journal_name, journal_issns, year, is_best, license, host_type
FROM `oadoi_full.mongo_13_sep_18`, UNNEST(oa_locations)
WHERE `journal_is_in_doaj`=false AND data_standard=2 AND EXISTS (
FROM `oadoi_full.apr_19_mongo_export_2013_Apr2019_full` , UNNEST(oa_locations)
WHERE `journal_is_in_doaj`= false AND data_standard=2 AND EXISTS (
SELECT evidence FROM UNNEST(oa_locations)
WHERE evidence LIKE '%license%'
)
WHERE evidence = 'open (via crossref license)' OR evidence = 'open (via page says license)')
"
#' call bq
dbGetQuery(con, sql_unnested) -> my_license
@@ -30,17 +29,13 @@ my_license %>%
gather(issn_1:issn_4, key = "issn_position", value = "issn") %>%
filter(!is.na(issn)) -> oadoi_issns
#' get issn variants from crossref
hybrid_issn <-
jsonlite::stream_in(
file("../data/jn_facets_df.json"),
simplifyDataFrame = FALSE
)
issn <- map_df(hybrid_issn, "issn")
jn_df <- issn %>%
mutate(journal_title = map_chr(hybrid_issn, "journal_title")) %>%
mutate(publisher = map_chr(hybrid_issn, "publisher")) %>%
gather(issn, issn.1, issn.2, issn.3, key = "issn_type", value = "issn") %>%
filter(!is.na(issn))
jn_all <- jsonlite::stream_in(file("../data/jn_facets_df.json"))
jn_df <- jn_all %>%
select(issn, publisher, journal_title) %>%
unnest(issn) %>%
filter(!is.na(issn)) %>%
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE, ignore.case = TRUE),
"Springer Nature", publisher))
#' only journals from unpaywall that are in our sample as well
oadoi_issns %>%
filter(issn %in% jn_df$issn) -> hybrid_oadoi_sub
@@ -76,9 +71,24 @@ hybrid_oadoi_sub %>%
oadoi_indicators %>%
filter(in_dashbaord == FALSE) %>%
select(-in_dashbaord) %>%
rename(jn_y_unpaywall_others = articles) %>%
left_join(hybrid_dash,
.,
by = c("journal_title", "publisher", "issued" = "year")) -> dash_new
rename(jn_y_unpaywall_others = articles) -> oadoi_others

left_join(hybrid_dash, oadoi_others,
by = c("journal_title" = "journal_title", "publisher" = "publisher", "issued" = "year")) -> dash_new
unpaywall_df <- dash_new %>%
rename(year = issued) %>%
group_by(year, journal_title, publisher, jn_y_unpaywall_others) %>%
summarise(n = n_distinct(doi_oa)) %>%
gather(n, jn_y_unpaywall_others, key = "source", value = "articles") %>%
ungroup() %>%
group_by(year,journal_title, publisher, source) %>%
summarise(articles = sum(articles, na.rm = TRUE)) %>%
mutate(
source = ifelse(
source == "n",
"Crossref immediate license",
"Other license information\n(Unpaywall)"
)
)
#' export
write_csv(dash_new, "../data/hybrid_publications.csv")
readr::write_csv(unpaywall_df, "../data/unpaywall_df.csv")
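The switch to the April 2019 Unpaywall snapshot boils down to connecting to the new BigQuery table and restricting the evidence to immediate license statements. A minimal sketch of that step, assuming access to the `api-project-764811344545` project referenced above:

```r
library(DBI)
library(bigrquery)

# connection to the Unpaywall snapshot hosted on Google BigQuery
con <- dbConnect(bigrquery::bigquery(),
                 project = "api-project-764811344545",
                 dataset = "oadoi_full")

# unnest oa_locations and keep only articles with license-based OA evidence
sql_unnested <- "
SELECT doi, evidence, publisher, journal_name, journal_issns, year,
       is_best, license, host_type
FROM `oadoi_full.apr_19_mongo_export_2013_Apr2019_full`, UNNEST(oa_locations)
WHERE journal_is_in_doaj = false AND data_standard = 2 AND EXISTS (
  SELECT evidence FROM UNNEST(oa_locations)
  WHERE evidence = 'open (via crossref license)'
     OR evidence = 'open (via page says license)')
"
my_license <- dbGetQuery(con, sql_unnested)
```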
42 changes: 24 additions & 18 deletions dashboard.Rmd
@@ -31,6 +31,8 @@ hybrid_df <- readr::read_csv("data/hybrid_publications.csv") %>%
mutate(hybrid_type = factor(hybrid_type, levels = c("Open APC (TA)", "Open APC (Hybrid)", "SCOAP<sup>3</sup>"))) %>%
mutate(domain = paste(domain, suffix, sep =".")) %>%
arrange(desc(yearly_publisher_volume))
unpaywall_df <- readr::read_csv("data/unpaywall_df.csv") %>%
mutate(year = factor(year, levels = c("2013", "2014", "2015","2016", "2017", "2018", "2019")))
```

Overview
@@ -85,23 +87,23 @@ if (length(unique(jn_f()$publisher)) > 1) {
}
})
# using unpaywall indicators
unpaywall <- reactive({
hybrid_df %>%
filter(journal_title %in% jn_f()$journal_title) %>%
group_by(year, journal_title, publisher, jn_y_unpaywall_others) %>%
summarise(n = n_distinct(doi_oa)) %>%
gather(n, jn_y_unpaywall_others, key = "source", value = "articles") %>%
ungroup() %>%
group_by(year, source) %>%
summarise(articles = sum(articles, na.rm = TRUE)) %>%
mutate(
source = ifelse(
source == "n",
"Crossref immediate license",
"Other license information\n(Unpaywall)"
)
)
})
# unpaywall <- reactive({
# hybrid_df %>%
# filter(journal_title %in% jn_f()$journal_title) %>%
# group_by(year, journal_title, publisher, jn_y_unpaywall_others) %>%
# summarise(n = n_distinct(doi_oa)) %>%
# gather(n, jn_y_unpaywall_others, key = "source", value = "articles") %>%
# ungroup() %>%
# group_by(year, source) %>%
# summarise(articles = sum(articles, na.rm = TRUE)) %>%
# mutate(
# source = ifelse(
# source == "n",
# "Crossref immediate license",
# "Other license information\n(Unpaywall)"
# )
# )
# })
```

Notice that only those hybrid open access journals were included where
@@ -215,7 +217,11 @@ renderPlotly({

```{r}
renderPlotly({
p <- ggplot(unpaywall(), aes(year, articles, fill = source)) +
p <- unpaywall_df %>%
filter(journal_title %in% jn_f()$journal_title) %>%
group_by(year, source) %>%
summarise(articles = sum(articles, na.rm = TRUE)) %>%
ggplot(aes(year, articles, fill = source)) +
geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
xlab("Year") +
ylab("Articles") +
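With the Unpaywall comparison now pre-computed into `data/unpaywall_df.csv`, the dashboard chunk only filters, re-aggregates, and plots. A sketch of the full chunk under that assumption; the `jn_f()` reactive comes from earlier in `dashboard.Rmd`, and the theme and final `ggplotly()` call are assumptions because the end of the chunk is truncated above:

```r
renderPlotly({
  p <- unpaywall_df %>%
    filter(journal_title %in% jn_f()$journal_title) %>%
    group_by(year, source) %>%
    summarise(articles = sum(articles, na.rm = TRUE)) %>%
    ggplot(aes(year, articles, fill = source)) +
    geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
    xlab("Year") +
    ylab("Articles") +
    theme_minimal()   # assumed styling, not shown in the diff
  ggplotly(p)         # assumed hand-off to plotly, as in the other charts
})
```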
29 changes: 25 additions & 4 deletions data/flipped_jns.csv
@@ -1,13 +1,34 @@
journal_title,publisher,year,yearly_jn_volume,n_year,prop
3 Biotech,Springer Nature,2014,78,77,0.9871794871794872
3 Biotech,Springer Nature,2015,48,47,0.9791666666666666
3 Biotech,Springer Nature,2016,254,253,0.9960629921259843
Applied Nanoscience,Springer Nature,2013,104,102,0.9807692307692307
Applied Nanoscience,Springer Nature,2014,103,101,0.9805825242718447
Applied Nanoscience,Springer Nature,2015,125,125,1
Applied Nanoscience,Springer Nature,2016,30,30,1
Applied Nanoscience,Springer Nature,2017,89,89,1
Integrating Materials and Manufacturing Innovation,Springer Nature,2013,5,5,1
Integrating Materials and Manufacturing Innovation,Springer Nature,2016,14,14,1
EPMA Journal,Springer Nature,2013,25,25,1
EPMA Journal,Springer Nature,2014,187,187,1
Gold Bulletin,Springer Nature,2013,56,56,1
Gold Bulletin,Springer Nature,2014,22,22,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2016,929,929,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2017,785,785,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2018,732,732,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2019,56,56,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2019,240,240,1
Journal of Remanufacturing,Springer Nature,2015,12,12,1
Journal of Remanufacturing,Springer Nature,2016,3,3,1
Maritime Studies,Springer Nature,2016,15,15,1
Maritime Studies,Springer Nature,2017,22,22,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2013,63,61,0.9682539682539683
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2014,57,57,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2015,52,50,0.9615384615384616
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2016,61,60,0.9836065573770492
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2017,64,64,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2018,56,56,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2019,36,36,1
Research in the Mathematical Sciences,Springer Nature,2016,41,41,1
Research in the Mathematical Sciences,Springer Nature,2017,28,28,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2016,87,87,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2017,88,88,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2018,169,168,0.9940828402366864
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2019,16,16,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2019,68,68,1