This repository has been archived by the owner on Mar 27, 2023. It is now read-only.

Commit

Merge pull request #13 from subugoe/may_19_update
May 19 update
njahn82 authored May 24, 2019
2 parents b53d851 + 21c7211 commit b0e8944
Showing 13 changed files with 388,989 additions and 262,659 deletions.
12 changes: 5 additions & 7 deletions R/add_emails.R
@@ -1,10 +1,8 @@
#' emails add
library(tidyverse)
emails_df <- readr::read_csv("../data/emails_normalized.csv")
hybrid <- readr::read_csv("../data/hybrid_publications.csv") %>%
select(1:country_name)
hybrid_emails <- hybrid %>%
emails_df <- readr::read_csv("../data/emails_normalized.csv") %>%
distinct(doi, .keep_all = TRUE)
hybrid <- readr::read_csv("../data/hybrid_publications.csv")
hybrid %>%
left_join(emails_df, by = c("doi_oa" = "doi")) %>%
distinct(doi_oa, .keep_all = TRUE)
readr::write_csv(hybrid_emails, "../data/hybrid_publications.csv")

readr::write_csv("../data/hybrid_publications.csv")
19 changes: 13 additions & 6 deletions R/cr_fetching.R
@@ -23,7 +23,7 @@ knitr::opts_chunk$set(
library(tidyverse)
library(countrycode)
library(jsonlite)
library(rcrossref) # v 0.8
library(rcrossref)
#' link to dataset
u <-
"https://raw.githubusercontent.com/OpenAPC/openapc-de/master/data/apc_de.csv"
@@ -135,7 +135,7 @@ issns_list <-
filter(publisher == x, journal_full_title == y) %>%
.$issn
names(issns) <- rep("issn", length(issns))
as.list(issns)
issns
})
#' search crossref
jn_facets <- purrr::map(issns_list, .f = purrr::safely(function(x) {
@@ -218,22 +218,29 @@ hybrid_licenses <- jn_facets_df %>%
#' licenses were not issued for delayed open access articles by
#' additionally using the self-explanatory filter `license.url` and
#' `license.delay`. We also obtain parsed metadata for these hybrid open
#' access articles stored as list-column.
#' access articles stored as list-column. The metadata fields we parse are
#' defined in `cr_md_fields`.
cr_md_fields <- c("URL", "member", "created", "license",
"ISSN", "container-title", "issued", "approved",
"indexed", "accepted", "DOI", "funder", "published-print",
"subject", "published-online", "link", "type", "publisher",
"issn-type", "deposited", "content-created")
cr_license <- purrr::map2(hybrid_licenses$license_ref, hybrid_licenses$issn,
.f = purrr::safely(function(x, y) {
u <- x
issn <- y
names(issn) <-rep("issn", length(issn))
tmp <- rcrossref::cr_works(filter = c(issn,
license.url = u,
license.delay = 0,
type = "journal-article",
from_pub_date = "2013-01-01",
until_pub_date = "2019-12-31"),
cursor = "*", cursor_max = 5000L,
limit = 1000L)
limit = 1000L,
select = cr_md_fields)
tibble::tibble(
issn = list(issn),
year_published = list(tmp$facets$published),
license = u,
md = list(tmp$data)
)
@@ -246,7 +253,7 @@ cr_license_df <- cr_license %>%
dplyr::bind_rows(cr_license_df$md) %>%
jsonlite::stream_out(file("../data/hybrid_license_md.json"))
#' only DOIs and how we retrieved them
purrr::map(cr_license_df$md, "DOI") %>%
purrr::map(cr_license_df$md, "doi") %>%
data_frame(dois = ., issn = cr_license_df$issn, license = cr_license_df$license) %>%
jsonlite::stream_out(file("../data/hybrid_license_dois.json"))

68 changes: 31 additions & 37 deletions R/cr_match.R
@@ -3,67 +3,54 @@ library(jsonlite)
#' full data set
license_df <- jsonlite::stream_in(file("../data/hybrid_license_md.json")) %>%
as_data_frame() %>%
select(-title,-`clinical-trial-number`, -subtitle, -archive, -abstract, -archive_ , -NA.) %>%
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE, ignore.case = TRUE),
"Springer Nature", publisher))
"Springer Nature", publisher))
#' get duplicate dois
license_df %>%
select(DOI, container.title, publisher) %>%
group_by(DOI) %>%
select(doi, container.title, publisher) %>%
group_by(doi) %>%
filter(n() > 1)
#' which journals and publishers are affected
license_df %>%
select(DOI, container.title, publisher) %>%
group_by(DOI) %>%
select(doi, container.title, publisher) %>%
group_by(doi) %>%
filter(n() > 1) %>%
ungroup() %>%
count(container.title, publisher) %>%
arrange(desc(n))
#' create a tidy dataset
license_df %>%
select(DOI, ISSN, issued) %>%
select(doi, issn, issued) %>%
# to year
mutate(issued = lubridate::parse_date_time(issued, c('y', 'ymd', 'ym'))) %>%
mutate(issued = lubridate::year(issued)) %>%
# issn
separate(ISSN, into = c("issn_1", "issn_2", "issn_3"), sep = ",") %>%
separate(issn, into = c("issn_1", "issn_2", "issn_3"), sep = ",") %>%
gather(issn_1, issn_2, issn_3, key = "issn_type", value = "issn") %>%
filter(!is.na(issn)) -> tidy_oahybrid_df
#' add licensing info
jn_all <- jsonlite::stream_in(file("../data/jn_facets_df.json"), simplifyDataFrame = FALSE)
#' tidy import, (we need another backup strategy to avoid the following steps)
issn <- map_df(jn_all, "issn")
jn_df <- issn %>%
mutate(journal_title = map_chr(jn_all, "journal_title")) %>%
mutate(publisher = map_chr(jn_all, "publisher")) %>%
mutate(year_published = map(jn_all, c("year_published"))) %>%
mutate(license_refs = map(jn_all, c("license_refs"))) %>%
mutate(license_refs = map(license_refs, bind_rows)) %>%
gather(issn, issn.1, issn.2, issn.3, key = "issn_type", value = "issn") %>%
jn_all <- jsonlite::stream_in(file("../data/jn_facets_df.json"))
jn_df <- jn_all %>%
unnest(issn, .preserve = year_published) %>%
filter(!is.na(issn)) %>%
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE, ignore.case = TRUE),
"Springer Nature", publisher))
jn_df %>%
mutate(year_published = map(year_published, bind_rows)) -> jn_df
#' load doi set
dois <- jsonlite::stream_in(file("../data/hybrid_license_dois.json"), simplifyDataFrame = FALSE)
map_df(dois, "issn") %>%
mutate(doi_oa = map(dois, "dois")) %>%
mutate(license = map_chr(dois, "license")) %>%
# issn
gather(issn, issn.1, issn.2, issn.3, key = "issn_type", value = "issn") %>%
filter(!is.na(issn)) -> dois_df
dois <- jsonlite::stream_in(file("../data/hybrid_license_dois.json"))
dois %>%
unnest(issn, .preserve = dois) %>%
filter(!is.na(issn)) -> dois_df
#' join article datasets
tmp <- inner_join(dois_df, jn_df, by = c("issn" = "issn")) %>%
distinct(journal_title, publisher, license, .keep_all = TRUE)

hybrid_oa_df <- tmp %>%
select(1:2, journal_title, publisher, year_published) %>%
select(-issn) %>%
# remove delayed licenses
filter(map(doi_oa, length) > 0) %>%
unnest(doi_oa, .preserve = year_published) %>%
distinct(doi_oa, .keep_all = TRUE) %>%
left_join(tidy_oahybrid_df, by = c("doi_oa" = "DOI"))
filter(map(dois, length) > 0) %>%
unnest(dois, .preserve = year_published) %>%
distinct(dois, .keep_all = TRUE) %>%
left_join(tidy_oahybrid_df, by = c("dois" = "doi"))
#' check for DOAJ
#' #'
#' ## Dealing with flipped journals
@@ -110,11 +97,12 @@ hybrid_oa_df %>%
# export
flipped_jns %>%
select(-year_published) %>%
distinct(doi_oa,.keep_all = TRUE) %>%
distinct(dois,.keep_all = TRUE) %>%
readr::write_csv("../data/flipped_jns_doaj.csv")
#' remove flipped journals from hybrid license data set and store into json
hybrid_oa_df %>%
filter(!doi_oa %in% flipped_jns$doi_oa) %>%
rename(doi_oa = dois) %>%
filter(!doi_oa %in% flipped_jns$dois) %>%
# clean license URIS
mutate(license = gsub("\\/$", "", license)) %>%
mutate(license = gsub("https", "http", license)) -> hybrid_oa_df
@@ -132,8 +120,6 @@ hybrid_oa_df %>%
mutate(year = lubridate::parse_date_time(year, 'y')) %>%
mutate(year = lubridate::year(year)) %>%
left_join(indicator_df, by = c("journal_title", "publisher", "year" = "issued")) -> indicator_df


#' yearly
#' get journals that are probably flipped, defined as prop > 0.95 in at least two years
indicator_df %>%
@@ -144,9 +130,16 @@ indicator_df %>%
ungroup() %>%
group_by(journal_title, publisher) %>%
filter(n() > 1) -> prob_flipped
#' also check against journals found by Matthias et al.,
#' 10.5281/zenodo.2553582
rv_flip <- readr::read_csv("https://zenodo.org/record/2553582/files/reverse_flips_dataset.csv?download=1")
indicator_df %>%
inner_join(rv_flip, by = c("journal_title" = "journal_name")) %>%
filter(year < year_reverse_flipped) -> rev_flip_list
#' export and exclude them
readr::write_csv(prob_flipped, "../data/flipped_jns.csv")
anti_join(indicator_df, prob_flipped, by = c("journal_title", "publisher", "year")) -> indicator_df
anti_join(indicator_df, prob_flipped, by = c("journal_title", "publisher", "year")) %>%
anti_join(rev_flip_list, by = c("journal_title", "publisher", "year")) -> indicator_df
#' calculate publishers article volume and add this info to the dataset
indicator_df %>%
distinct(journal_title, publisher, year,.keep_all = TRUE) %>%
@@ -160,6 +153,7 @@ readr::write_csv(indicator_df, "../data/indicator.csv")
hybrid_dois <- hybrid_oa_df %>%
# remove flipped journals
anti_join(prob_flipped, by = c("journal_title", "publisher", "issued" = "year")) %>%
anti_join(rev_flip_list, by = c("journal_title", "publisher","issued" = "year")) %>%
# make sure dois are lower case
mutate(doi_oa = tolower(doi_oa)) %>%
distinct(doi_oa, .keep_all = TRUE) %>%
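Taken together, the reverse-flip handling added above reads the published dataset by Matthias et al. (10.5281/zenodo.2553582), flags indicator rows from years before a journal's reverse flip, and removes them along with the probably-flipped journals. A sketch of just that step, assuming `indicator_df` and `prob_flipped` as built earlier in `R/cr_match.R`:

```r
library(tidyverse)

# journals that flipped back from full open access
rv_flip <- readr::read_csv(
  "https://zenodo.org/record/2553582/files/reverse_flips_dataset.csv?download=1"
)

# rows covering years before the reverse flip took place
rev_flip_list <- indicator_df %>%
  inner_join(rv_flip, by = c("journal_title" = "journal_name")) %>%
  filter(year < year_reverse_flipped)

# exclude both groups from the indicator data
indicator_df <- indicator_df %>%
  anti_join(prob_flipped, by = c("journal_title", "publisher", "year")) %>%
  anti_join(rev_flip_list, by = c("journal_title", "publisher", "year"))
```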
52 changes: 31 additions & 21 deletions R/unpaywall_integration.R
@@ -9,15 +9,14 @@ con <- dbConnect(bigrquery::bigquery(),
project = "api-project-764811344545",
dataset = "oadoi_full")
#' test connection
oadoi <- tbl(con, "mongo_13_sep_18")
oadoi <- tbl(con, "apr_19_mongo_export_2013_Apr2019_full")
#' my bq query
sql_unnested <-
"SELECT doi, evidence, publisher, journal_name, journal_issns, year, is_best, license, host_type
FROM `oadoi_full.mongo_13_sep_18`, UNNEST(oa_locations)
WHERE `journal_is_in_doaj`=false AND data_standard=2 AND EXISTS (
FROM `oadoi_full.apr_19_mongo_export_2013_Apr2019_full` , UNNEST(oa_locations)
WHERE `journal_is_in_doaj`= false AND data_standard=2 AND EXISTS (
SELECT evidence FROM UNNEST(oa_locations)
WHERE evidence LIKE '%license%'
)
WHERE evidence = 'open (via crossref license)' OR evidence = 'open (via page says license)')
"
#' call bq
dbGetQuery(con, sql_unnested) -> my_license
@@ -30,17 +29,13 @@ my_license %>%
gather(issn_1:issn_4, key = "issn_position", value = "issn") %>%
filter(!is.na(issn)) -> oadoi_issns
#' get issn variants from crossref
hybrid_issn <-
jsonlite::stream_in(
file("../data/jn_facets_df.json"),
simplifyDataFrame = FALSE
)
issn <- map_df(hybrid_issn, "issn")
jn_df <- issn %>%
mutate(journal_title = map_chr(hybrid_issn, "journal_title")) %>%
mutate(publisher = map_chr(hybrid_issn, "publisher")) %>%
gather(issn, issn.1, issn.2, issn.3, key = "issn_type", value = "issn") %>%
filter(!is.na(issn))
jn_all <- jsonlite::stream_in(file("../data/jn_facets_df.json"))
jn_df <- jn_all %>%
select(issn, publisher, journal_title) %>%
unnest(issn) %>%
filter(!is.na(issn)) %>%
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE, ignore.case = TRUE),
"Springer Nature", publisher))
#' only journals from unpaywall that are in our sample as well
oadoi_issns %>%
filter(issn %in% jn_df$issn) -> hybrid_oadoi_sub
@@ -76,9 +71,24 @@ hybrid_oadoi_sub %>%
oadoi_indicators %>%
filter(in_dashbaord == FALSE) %>%
select(-in_dashbaord) %>%
rename(jn_y_unpaywall_others = articles) %>%
left_join(hybrid_dash,
.,
by = c("journal_title", "publisher", "issued" = "year")) -> dash_new
rename(jn_y_unpaywall_others = articles) -> oadoi_others

left_join(hybrid_dash, oadoi_others,
by = c("journal_title" = "journal_title", "publisher" = "publisher", "issued" = "year")) -> dash_new
unpaywall_df <- dash_new %>%
rename(year = issued) %>%
group_by(year, journal_title, publisher, jn_y_unpaywall_others) %>%
summarise(n = n_distinct(doi_oa)) %>%
gather(n, jn_y_unpaywall_others, key = "source", value = "articles") %>%
ungroup() %>%
group_by(year,journal_title, publisher, source) %>%
summarise(articles = sum(articles, na.rm = TRUE)) %>%
mutate(
source = ifelse(
source == "n",
"Crossref immediate license",
"Other license information\n(Unpaywall)"
)
)
#' export
write_csv(dash_new, "../data/hybrid_publications.csv")
readr::write_csv(unpaywall_df, "../data/unpaywall_df.csv")
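The switch to the April 2019 Unpaywall snapshot boils down to connecting to the new BigQuery table and restricting the evidence to immediate license statements. A minimal sketch of that step, assuming access to the `api-project-764811344545` project referenced above:

```r
library(DBI)
library(bigrquery)

# connection to the Unpaywall snapshot hosted on Google BigQuery
con <- dbConnect(bigrquery::bigquery(),
                 project = "api-project-764811344545",
                 dataset = "oadoi_full")

# unnest oa_locations and keep only articles with license-based OA evidence
sql_unnested <- "
SELECT doi, evidence, publisher, journal_name, journal_issns, year,
       is_best, license, host_type
FROM `oadoi_full.apr_19_mongo_export_2013_Apr2019_full`, UNNEST(oa_locations)
WHERE journal_is_in_doaj = false AND data_standard = 2 AND EXISTS (
  SELECT evidence FROM UNNEST(oa_locations)
  WHERE evidence = 'open (via crossref license)'
     OR evidence = 'open (via page says license)')
"
my_license <- dbGetQuery(con, sql_unnested)
```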
42 changes: 24 additions & 18 deletions dashboard.Rmd
@@ -31,6 +31,8 @@ hybrid_df <- readr::read_csv("data/hybrid_publications.csv") %>%
mutate(hybrid_type = factor(hybrid_type, levels = c("Open APC (TA)", "Open APC (Hybrid)", "SCOAP<sup>3</sup>"))) %>%
mutate(domain = paste(domain, suffix, sep =".")) %>%
arrange(desc(yearly_publisher_volume))
unpaywall_df <- readr::read_csv("data/unpaywall_df.csv") %>%
mutate(year = factor(year, levels = c("2013", "2014", "2015","2016", "2017", "2018", "2019")))
```

Overview
@@ -85,23 +87,23 @@ if (length(unique(jn_f()$publisher)) > 1) {
}
})
# using unpaywall indicators
unpaywall <- reactive({
hybrid_df %>%
filter(journal_title %in% jn_f()$journal_title) %>%
group_by(year, journal_title, publisher, jn_y_unpaywall_others) %>%
summarise(n = n_distinct(doi_oa)) %>%
gather(n, jn_y_unpaywall_others, key = "source", value = "articles") %>%
ungroup() %>%
group_by(year, source) %>%
summarise(articles = sum(articles, na.rm = TRUE)) %>%
mutate(
source = ifelse(
source == "n",
"Crossref immediate license",
"Other license information\n(Unpaywall)"
)
)
})
# unpaywall <- reactive({
# hybrid_df %>%
# filter(journal_title %in% jn_f()$journal_title) %>%
# group_by(year, journal_title, publisher, jn_y_unpaywall_others) %>%
# summarise(n = n_distinct(doi_oa)) %>%
# gather(n, jn_y_unpaywall_others, key = "source", value = "articles") %>%
# ungroup() %>%
# group_by(year, source) %>%
# summarise(articles = sum(articles, na.rm = TRUE)) %>%
# mutate(
# source = ifelse(
# source == "n",
# "Crossref immediate license",
# "Other license information\n(Unpaywall)"
# )
# )
# })
```

Notice that only those hybrid open access journals were included where
@@ -215,7 +217,11 @@ renderPlotly({

```{r}
renderPlotly({
p <- ggplot(unpaywall(), aes(year, articles, fill = source)) +
p <- unpaywall_df %>%
filter(journal_title %in% jn_f()$journal_title) %>%
group_by(year, source) %>%
summarise(articles = sum(articles, na.rm = TRUE)) %>%
ggplot(aes(year, articles, fill = source)) +
geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
xlab("Year") +
ylab("Articles") +
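With the Unpaywall comparison now pre-computed into `data/unpaywall_df.csv`, the dashboard chunk only filters, re-aggregates, and plots. A sketch of the full chunk under that assumption; the `jn_f()` reactive comes from earlier in `dashboard.Rmd`, and the theme and final `ggplotly()` call are assumptions because the end of the chunk is truncated above:

```r
renderPlotly({
  p <- unpaywall_df %>%
    filter(journal_title %in% jn_f()$journal_title) %>%
    group_by(year, source) %>%
    summarise(articles = sum(articles, na.rm = TRUE)) %>%
    ggplot(aes(year, articles, fill = source)) +
    geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
    xlab("Year") +
    ylab("Articles") +
    theme_minimal()   # assumed styling, not shown in the diff
  ggplotly(p)         # assumed hand-off to plotly, as in the other charts
})
```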
29 changes: 25 additions & 4 deletions data/flipped_jns.csv
@@ -1,13 +1,34 @@
journal_title,publisher,year,yearly_jn_volume,n_year,prop
3 Biotech,Springer Nature,2014,78,77,0.9871794871794872
3 Biotech,Springer Nature,2015,48,47,0.9791666666666666
3 Biotech,Springer Nature,2016,254,253,0.9960629921259843
Applied Nanoscience,Springer Nature,2013,104,102,0.9807692307692307
Applied Nanoscience,Springer Nature,2014,103,101,0.9805825242718447
Applied Nanoscience,Springer Nature,2015,125,125,1
Applied Nanoscience,Springer Nature,2016,30,30,1
Applied Nanoscience,Springer Nature,2017,89,89,1
Integrating Materials and Manufacturing Innovation,Springer Nature,2013,5,5,1
Integrating Materials and Manufacturing Innovation,Springer Nature,2016,14,14,1
EPMA Journal,Springer Nature,2013,25,25,1
EPMA Journal,Springer Nature,2014,187,187,1
Gold Bulletin,Springer Nature,2013,56,56,1
Gold Bulletin,Springer Nature,2014,22,22,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2016,929,929,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2017,785,785,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2018,732,732,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2019,56,56,1
Investigative Opthalmology & Visual Science,Association for Research in Vision and Ophthalmology (ARVO),2019,240,240,1
Journal of Remanufacturing,Springer Nature,2015,12,12,1
Journal of Remanufacturing,Springer Nature,2016,3,3,1
Maritime Studies,Springer Nature,2016,15,15,1
Maritime Studies,Springer Nature,2017,22,22,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2013,63,61,0.9682539682539683
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2014,57,57,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2015,52,50,0.9615384615384616
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2016,61,60,0.9836065573770492
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2017,64,64,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2018,56,56,1
Raumforschung und Raumordnung,Walter de Gruyter GmbH,2019,36,36,1
Research in the Mathematical Sciences,Springer Nature,2016,41,41,1
Research in the Mathematical Sciences,Springer Nature,2017,28,28,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2016,87,87,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2017,88,88,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2018,169,168,0.9940828402366864
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2019,16,16,1
Translational Vision Science & Technology,Association for Research in Vision and Ophthalmology (ARVO),2019,68,68,1