├── README.md
├── LICENSE
├── .github
│   └── workflows
│       └── build-deploy-rmd.yaml
└── joss-submission-analytics.Rmd
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# joss-analytics
Analysis of JOSS data and statistics

Browse at http://www.theoj.org/joss-analytics/joss-submission-analytics.html
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Open Journals

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 | -------------------------------------------------------------------------------- /.github/workflows/build-deploy-rmd.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | schedule: 9 | - cron: '0 9 * * 3' 10 | workflow_dispatch: 11 | 12 | name: build-deploy-rmd 13 | 14 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 15 | # permissions: 16 | # contents: read 17 | # pages: write 18 | # id-token: write 19 | 20 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 21 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 22 | # concurrency: 23 | # group: "pages" 24 | # cancel-in-progress: false 25 | 26 | jobs: 27 | build-rmd: 28 | runs-on: ${{ matrix.config.os }} 29 | 30 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | config: 36 | - {os: macOS-latest, r: 'release'} 37 | 38 | env: 39 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 40 | RSPM: ${{ matrix.config.rspm }} 41 | CRAN: ${{ matrix.config.cran }} 42 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 43 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | steps: 46 | - name: Check out repo 47 | uses: actions/checkout@v2 48 | with: 49 | submodules: true 50 | 51 | - name: Set up R 52 | uses: r-lib/actions/setup-r@v2 53 | with: 54 | r-version: ${{ matrix.config.r }} 55 | 56 | - name: Set up pandoc 57 | uses: r-lib/actions/setup-pandoc@v2 58 | 59 | ## rcrossref requires an email address associated with a query 60 | - name: Set up crossref email 61 | run: | 62 | echo crossref_email=\"${{ secrets.CROSSREF_EMAIL }}\" >> ~/.Renviron 63 | 64 | - name: Install dependencies 65 | run: | 66 | install.packages(c('remotes', 'dplyr', 'ggplot2', 'rmarkdown', 67 | 'knitr', 'tibble', 'tidyr', 68 | 'lubridate', 'gh', 
'jsonlite', 'purrr', 69 | 'DT', 'plotly', 'citecorp', 'readr', 70 | 'viridis', 'wordcloud', 'stringr', 'gt', 71 | 'rworldmap', 'openalexR'), Ncpu = 2L) 72 | remotes::install_github('ropensci/rcrossref') 73 | shell: Rscript {0} 74 | 75 | - name: Session info 76 | run: | 77 | install.packages('sessioninfo', Ncpus = 2L) 78 | options(width = 100) 79 | pkgs <- installed.packages()[, "Package"] 80 | sessioninfo::session_info(pkgs, include_base = TRUE) 81 | shell: Rscript {0} 82 | 83 | - name: Render site 84 | run: | 85 | rmarkdown::render(input = "joss-submission-analytics.Rmd", clean = FALSE) 86 | shell: Rscript {0} 87 | 88 | - name: Prepare files to deploy 89 | run: | 90 | mkdir _site 91 | touch _site/.nojekyll 92 | cp -r joss-submission-analytics_files joss-submission-analytics.html _site/ 93 | cp -r joss_submission_analytics.rds _site/ 94 | cp -r joss_submission_citations_byjournal.tsv _site/ 95 | cp -r joss_submission_citations.tsv _site/ 96 | 97 | - name: Deploy 🚀 98 | uses: JamesIves/github-pages-deploy-action@releases/v4 99 | with: 100 | ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} 101 | BRANCH: gh-pages 102 | FOLDER: _site 103 | 104 | - name: Upload check results 105 | if: failure() 106 | uses: actions/upload-artifact@v4 107 | with: 108 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 109 | path: check 110 | 111 | - name: upload artifact 112 | uses: actions/upload-artifact@v4 113 | with: 114 | name: submissionanalytics 115 | path: joss_submission_analytics.rds 116 | 117 | ## From 2024-06-30, need to use GitHub Actions for deployment 118 | # - name: Setup Pages 119 | # uses: actions/configure-pages@v5 120 | # 121 | # - name: Build with Jekyll 122 | # uses: actions/jekyll-build-pages@v1 123 | # with: 124 | # source: ./_site 125 | # destination: ./_site_jkl 126 | # 127 | # - name: Upload artifact 128 | # uses: actions/upload-pages-artifact@v3 129 | # with: 130 | # path: ./_site_jkl 131 | 132 | # Deployment job 133 | # deploy: 134 | # environment: 135 | # name: 
github-pages 136 | # url: ${{ steps.deployment.outputs.page_url }} 137 | # runs-on: ubuntu-latest 138 | # needs: build-rmd 139 | # 140 | # steps: 141 | # - name: Deploy to GitHub Pages 142 | # id: deployment 143 | # uses: actions/deploy-pages@v4 144 | -------------------------------------------------------------------------------- /joss-submission-analytics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "JOSS submission analytics" 3 | date: "`r Sys.time()`" 4 | output: 5 | html_document: 6 | code_folding: hide 7 | theme: united 8 | toc: true 9 | toc_float: true 10 | editor_options: 11 | chunk_output_type: console 12 | --- 13 | 14 | ```{r setup, include=FALSE} 15 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf")) 16 | ``` 17 | 18 | # Introduction 19 | 20 | In this report, we extract information about published JOSS papers and generate 21 | graphics as well as a summary table that can be downloaded and used for further 22 | analyses. 
23 | 24 | # Load required R packages 25 | 26 | ```{r load-packages, class.source = 'fold-show'} 27 | suppressPackageStartupMessages({ 28 | library(tibble) 29 | library(rcrossref) 30 | library(dplyr) 31 | library(tidyr) 32 | library(ggplot2) 33 | library(lubridate) 34 | library(gh) 35 | library(purrr) 36 | library(jsonlite) 37 | library(DT) 38 | library(plotly) 39 | library(citecorp) 40 | library(readr) 41 | library(rworldmap) 42 | library(gt) 43 | library(stringr) 44 | library(openalexR) 45 | }) 46 | ``` 47 | 48 | ```{r source-track, class.source = 'fold-hide'} 49 | ## Keep track of the source of each column 50 | source_track <- c() 51 | 52 | ## Determine whether to add a caption with today's date to the (non-interactive) plots 53 | add_date_caption <- TRUE 54 | if (add_date_caption) { 55 | dcap <- lubridate::today() 56 | } else { 57 | dcap <- "" 58 | } 59 | ``` 60 | 61 | ```{r} 62 | ## Get list of countries and populations (2022) from the rworldmap/gt packages 63 | data("countrySynonyms") 64 | country_names <- countrySynonyms |> 65 | select(-ID) |> 66 | pivot_longer(names_to = "tmp", values_to = "name", -ISO3) |> 67 | filter(name != "") |> 68 | select(-tmp) 69 | 70 | ## Country population data from the World Bank (https://data.worldbank.org/indicator/SP.POP.TOTL), 71 | ## distributed via the gt R package 72 | country_populations <- countrypops |> 73 | filter(year == 2022) 74 | ``` 75 | 76 | ```{r} 77 | ## Read archived version of summary data frame, to use for filling in 78 | ## information about software repositories (due to limit on API requests) 79 | ## Sort by the date when software repo info was last obtained 80 | papers_archive <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))) %>% 81 | dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained) 82 | 83 | ## Similarly for citation analysis, to avoid having to pull down the 84 | ## same information multiple times 85 | 
citations_archive <- readr::read_delim( 86 | url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"), 87 | col_types = cols(.default = "c"), col_names = TRUE, 88 | delim = "\t") 89 | ``` 90 | 91 | # Collect information about papers 92 | 93 | ## Pull down paper info from Crossref and citation information from OpenAlex 94 | 95 | We get the information about published JOSS papers from Crossref, using the 96 | `rcrossref` R package. The `openalexR` R package is used to extract citation 97 | counts from OpenAlex. 98 | 99 | ```{r pull-crossref, class.source = 'fold-show'} 100 | ## First check how many records there are in Crossref 101 | issn <- "2475-9066" 102 | joss_details <- rcrossref::cr_journals(issn, works = FALSE) %>% 103 | pluck("data") 104 | (total_dois <- joss_details$total_dois) 105 | 106 | ## Pull down all records from Crossref 107 | papers <- rcrossref::cr_journals(issn, works = TRUE, cursor = "*", 108 | cursor_max = joss_details$total_dois * 2) %>% 109 | pluck("data") 110 | 111 | ## Only keep articles 112 | papers <- papers %>% 113 | dplyr::filter(type == "journal-article") 114 | dim(papers) 115 | dim(papers %>% distinct()) 116 | 117 | ## Check that all papers were pulled down and stop otherwise 118 | if (!(nrow(papers %>% distinct()) >= total_dois)) { 119 | stop("Not all papers were pulled down from Crossref!") 120 | } 121 | 122 | ## A few papers don't have alternative.ids - generate them from the DOI 123 | noaltid <- which(is.na(papers$alternative.id)) 124 | papers$alternative.id[noaltid] <- papers$doi[noaltid] 125 | 126 | ## Get citation info from Crossref and merge with paper details 127 | # cit <- rcrossref::cr_citation_count(doi = papers$alternative.id) 128 | # papers <- papers %>% dplyr::left_join( 129 | # cit %>% dplyr::rename(citation_count = count), 130 | # by = c("alternative.id" = "doi") 131 | # ) 132 | 133 | ## Remove one duplicated paper 134 | papers <- papers %>% dplyr::filter(alternative.id 
!= "10.21105/joss.00688") 135 | dim(papers) 136 | dim(papers %>% distinct()) 137 | papers$alternative.id[duplicated(papers$alternative.id)] 138 | 139 | source_track <- c(source_track, 140 | structure(rep("crossref", ncol(papers)), 141 | names = colnames(papers))) 142 | ``` 143 | 144 | ```{r} 145 | ## Get info from openalexR and merge with paper details 146 | ## Helper function to extract countries from affiliations. Note that this 147 | ## information is not available for all papers. 148 | .get_countries <- function(df, wh = "first") { 149 | if ((length(df) == 1 && is.na(df)) || is.null(df$affiliations)) { 150 | "" 151 | } else { 152 | if (wh == "first") { 153 | ## Only first affiliation for each author 154 | tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 155 | dplyr::filter(!duplicated(id) & !is.na(affiliations_country_code)) |> 156 | pull(affiliations_country_code) 157 | } else { 158 | ## All affiliations 159 | tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 160 | dplyr::filter(!is.na(affiliations_country_code)) |> 161 | pull(affiliations_country_code) 162 | } 163 | if (length(tmp) > 0) { 164 | tmp |> 165 | unique() |> 166 | paste(collapse = ";") 167 | } else { 168 | "" 169 | } 170 | } 171 | } 172 | 173 | oa <- oa_fetch(entity = "works", 174 | primary_location.source.id = "s4210214273") |> 175 | mutate(affil_countries_all = vapply(authorships, .get_countries, "", wh = "all"), 176 | affil_countries_first = vapply(authorships, .get_countries, "", wh = "first")) 177 | dim(oa) 178 | length(unique(oa$doi)) 179 | 180 | papers <- papers %>% dplyr::left_join( 181 | oa %>% dplyr::mutate(alternative.id = sub("https://doi.org/", "", doi)) %>% 182 | dplyr::select(alternative.id, cited_by_count, id, 183 | affil_countries_all, affil_countries_first) %>% 184 | dplyr::rename(citation_count = cited_by_count, 185 | openalex_id = id), 186 | by = "alternative.id" 187 | ) 188 | dim(papers) 189 | dim(papers %>% distinct()) 190 | 191 | source_track <- 
c(source_track, 192 | structure(rep("OpenAlex", length(setdiff(colnames(papers), 193 | names(source_track)))), 194 | names = setdiff(colnames(papers), names(source_track)))) 195 | ``` 196 | 197 | ## Pull down info from JOSS API 198 | 199 | For each published paper, we use the JOSS API to get information about 200 | pre-review and review issue numbers, corresponding software repository etc. 201 | 202 | ```{r pull-joss-api, class.source = 'fold-show'} 203 | joss_api <- list() 204 | p <- 1 205 | a0 <- NULL 206 | a <- jsonlite::fromJSON( 207 | url(paste0("https://joss.theoj.org/papers/published.json?page=", p)), 208 | simplifyDataFrame = FALSE 209 | ) 210 | while (length(a) > 0 && !identical(a, a0)) { 211 | joss_api <- c(joss_api, a) 212 | p <- p + 1 213 | a0 <- a 214 | a <- tryCatch({ 215 | jsonlite::fromJSON( 216 | url(paste0("https://joss.theoj.org/papers/published.json?page=", p)), 217 | simplifyDataFrame = FALSE 218 | )}, 219 | error = function(e) return(numeric(0)) 220 | ) 221 | } 222 | 223 | joss_api <- do.call(dplyr::bind_rows, lapply(joss_api, function(w) { 224 | data.frame(api_title = w$title, 225 | api_state = w$state, 226 | author_affiliations = paste(unique(unlist(lapply(w$authors, "[[", "affiliation"))), collapse = ";"), 227 | editor = paste(w$editor, collapse = ","), 228 | reviewers = paste(w$reviewers, collapse = ","), 229 | nbr_reviewers = length(w$reviewers), 230 | repo_url = w$software_repository, 231 | review_issue_id = sub("https://github.com/openjournals/joss-reviews/issues/", 232 | "", w$paper_review), 233 | doi = w$doi, 234 | prereview_issue_id = ifelse(!is.null(w$meta_review_issue_id), 235 | w$meta_review_issue_id, NA_integer_), 236 | languages = gsub(", ", ",", w$languages), 237 | archive_doi = w$software_archive) 238 | })) 239 | dim(joss_api) 240 | dim(joss_api %>% distinct()) 241 | ## Check that all papers were pulled down and stop otherwise 242 | if (!(nrow(joss_api %>% distinct()) >= total_dois)) { 243 | stop("Not all papers were pulled 
down from the JOSS API!") 244 | } 245 | joss_api$repo_url[duplicated(joss_api$repo_url)] 246 | 247 | papers <- papers %>% dplyr::left_join(joss_api, by = c("alternative.id" = "doi")) 248 | dim(papers) 249 | dim(papers %>% distinct()) 250 | papers$repo_url[duplicated(papers$repo_url)] 251 | 252 | source_track <- c(source_track, 253 | structure(rep("JOSS_API", length(setdiff(colnames(papers), 254 | names(source_track)))), 255 | names = setdiff(colnames(papers), names(source_track)))) 256 | ``` 257 | 258 | ## Combine with info from GitHub issues 259 | 260 | From each pre-review and review issue, we extract information about review 261 | times and assigned labels. 262 | 263 | ```{r pull-github, class.source = 'fold-show', message = FALSE} 264 | ## Pull down info on all issues in the joss-reviews repository 265 | issues <- gh("/repos/openjournals/joss-reviews/issues", 266 | .limit = 15000, state = "all") 267 | ``` 268 | 269 | ```{r extract-github, class.source = 'fold-show', message = FALSE} 270 | ## From each issue, extract required information 271 | iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) { 272 | data.frame(title = i$title, 273 | number = i$number, 274 | state = i$state, 275 | opened = i$created_at, 276 | closed = ifelse(!is.null(i$closed_at), 277 | i$closed_at, NA_character_), 278 | ncomments = i$comments, 279 | labels = paste(setdiff( 280 | vapply(i$labels, getElement, 281 | name = "name", character(1L)), 282 | c("review", "pre-review", "query-scope", "paused")), 283 | collapse = ",")) 284 | })) 285 | 286 | ## Split into REVIEW, PRE-REVIEW, and other issues (the latter category 287 | ## is discarded) 288 | issother <- iss %>% dplyr::filter(!grepl("\\[PRE REVIEW\\]", title) & 289 | !grepl("\\[REVIEW\\]", title)) 290 | dim(issother) 291 | head(issother) 292 | 293 | ## For REVIEW issues, generate the DOI of the paper from the issue number 294 | getnbrzeros <- function(s) { 295 | paste(rep(0, 5 - nchar(s)), collapse = "") 296 | } 297 | issrev <- iss 
%>% dplyr::filter(grepl("\\[REVIEW\\]", title)) %>% 298 | dplyr::mutate(nbrzeros = purrr::map_chr(number, getnbrzeros)) %>% 299 | dplyr::mutate(alternative.id = paste0("10.21105/joss.", 300 | nbrzeros, 301 | number)) %>% 302 | dplyr::select(-nbrzeros) %>% 303 | dplyr::mutate(title = gsub("\\[REVIEW\\]: ", "", title)) %>% 304 | dplyr::rename_at(vars(-alternative.id), ~ paste0("review_", .)) 305 | ``` 306 | 307 | ```{r get-rejection-info, class.source = 'fold-show', message = FALSE} 308 | ## For pre-review and review issues, respectively, get the number of 309 | ## issues closed each month, and the number of those that have the 310 | ## 'rejected' label 311 | review_rejected <- iss %>% 312 | dplyr::filter(grepl("\\[REVIEW\\]", title)) %>% 313 | dplyr::filter(!is.na(closed)) %>% 314 | dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>% 315 | dplyr::group_by(closedmonth) %>% 316 | dplyr::summarize(nbr_issues_closed = length(labels), 317 | nbr_rejections = sum(grepl("rejected", labels))) %>% 318 | dplyr::mutate(itype = "review") 319 | 320 | prereview_rejected <- iss %>% 321 | dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>% 322 | dplyr::filter(!is.na(closed)) %>% 323 | dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>% 324 | dplyr::group_by(closedmonth) %>% 325 | dplyr::summarize(nbr_issues_closed = length(labels), 326 | nbr_rejections = sum(grepl("rejected", labels))) %>% 327 | dplyr::mutate(itype = "pre-review") 328 | 329 | all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected) 330 | ``` 331 | 332 | ```{r get-submission-info, class.source = 'fold-show', message = FALSE} 333 | ## Get only pre-review issues plus review issues opened before 2016-09-18, 334 | ## will use these as a proxy for the number of submissions 335 | pi1 <- iss |> 336 | dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) |> 337 | dplyr::mutate(opened = as.Date(opened)) 338 | dim(pi1) 339 | pi2 <- iss |> 340 | 
dplyr::filter(grepl("\\[REVIEW\\]", title)) |> 341 | dplyr::mutate(opened = as.Date(opened)) |> 342 | dplyr::filter(opened <= as.Date("2016-09-18")) 343 | dim(pi2) 344 | prereview_issues <- dplyr::bind_rows(pi1, pi2) 345 | ``` 346 | 347 | ```{r extract-github-2, class.source = 'fold-show', message = FALSE} 348 | ## For PRE-REVIEW issues, add information about the corresponding REVIEW 349 | ## issue number 350 | isspre <- iss %>% dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>% 351 | dplyr::filter(!grepl("withdrawn", labels)) %>% 352 | dplyr::filter(!grepl("rejected", labels)) 353 | ## Some titles have multiple pre-review issues. In these cases, keep the latest 354 | isspre <- isspre %>% dplyr::arrange(desc(number)) %>% 355 | dplyr::filter(!duplicated(title)) %>% 356 | dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>% 357 | dplyr::rename_all(~ paste0("prerev_", .)) 358 | 359 | papers <- papers %>% dplyr::left_join(issrev, by = "alternative.id") %>% 360 | dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>% 361 | dplyr::mutate(prerev_opened = as.Date(prerev_opened), 362 | prerev_closed = as.Date(prerev_closed), 363 | review_opened = as.Date(review_opened), 364 | review_closed = as.Date(review_closed)) %>% 365 | dplyr::mutate(days_in_pre = prerev_closed - prerev_opened, 366 | days_in_rev = review_closed - review_opened, 367 | to_review = !is.na(review_opened)) 368 | dim(papers) 369 | dim(papers %>% distinct()) 370 | 371 | source_track <- c(source_track, 372 | structure(rep("joss-github", length(setdiff(colnames(papers), 373 | names(source_track)))), 374 | names = setdiff(colnames(papers), names(source_track)))) 375 | ``` 376 | 377 | ## Add information from software repositories 378 | 379 | ```{r check-software-repos, class.source = 'fold-show', message = FALSE} 380 | ## Reorder so that software repositories that were interrogated longest 381 | ## ago are checked first 382 | tmporder <- order(match(papers$alternative.id, 
papers_archive$alternative.id), 383 | na.last = FALSE) 384 | software_urls <- papers$repo_url[tmporder] 385 | software_urls[duplicated(software_urls)] 386 | is_github <- grepl("github", software_urls) 387 | length(is_github) 388 | sum(is_github) 389 | software_urls[!is_github] 390 | ``` 391 | 392 | ```{r get-software-repos, class.source = 'fold-show', message = FALSE, results = 'hide', warning = FALSE} 393 | df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) { 394 | u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u))) 395 | if (grepl("/tree/", u0)) { 396 | u0 <- strsplit(u0, "/tree/")[[1]][1] 397 | } 398 | if (grepl("/blob/", u0)) { 399 | u0 <- strsplit(u0, "/blob/")[[1]][1] 400 | } 401 | info <- try({ 402 | gh(gsub("(https://)?(www.)?github.com/", "/repos/", u0)) 403 | }) 404 | languages <- try({ 405 | gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/languages"), 406 | .limit = 500) 407 | }) 408 | topics <- try({ 409 | gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/topics"), 410 | .accept = "application/vnd.github.mercy-preview+json", .limit = 500) 411 | }) 412 | contribs <- try({ 413 | gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/contributors"), 414 | .limit = 500) 415 | }) 416 | if (!is(info, "try-error") && length(info) > 1) { 417 | if (!is(contribs, "try-error")) { 418 | if (length(contribs) == 0) { 419 | repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_ 420 | } else { 421 | repo_nbr_contribs <- length(contribs) 422 | repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_)) 423 | if (is.na(repo_nbr_contribs_2ormore)) { 424 | print(contribs) 425 | } 426 | } 427 | } else { 428 | repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_ 429 | } 430 | 431 | if (!is(languages, "try-error")) { 432 | if (length(languages) == 0) { 433 | repolang <- "" 434 | } else { 435 | repolang <- 
paste(paste(names(unlist(languages)), 436 | unlist(languages), sep = ":"), collapse = ",") 437 | } 438 | } else { 439 | repolang <- "" 440 | } 441 | 442 | if (!is(topics, "try-error")) { 443 | if (length(topics$names) == 0) { 444 | repotopics <- "" 445 | } else { 446 | repotopics <- paste(unlist(topics$names), collapse = ",") 447 | } 448 | } else { 449 | repotopics <- "" 450 | } 451 | 452 | data.frame(repo_url = u, 453 | repo_created = info$created_at, 454 | repo_updated = info$updated_at, 455 | repo_pushed = info$pushed_at, 456 | repo_nbr_stars = info$stargazers_count, 457 | repo_language = ifelse(!is.null(info$language), 458 | info$language, NA_character_), 459 | repo_languages_bytes = repolang, 460 | repo_topics = repotopics, 461 | repo_license = ifelse(!is.null(info$license), 462 | info$license$key, NA_character_), 463 | repo_nbr_contribs = repo_nbr_contribs, 464 | repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore 465 | ) 466 | } else { 467 | NULL 468 | } 469 | })) %>% 470 | dplyr::mutate(repo_created = as.Date(repo_created), 471 | repo_updated = as.Date(repo_updated), 472 | repo_pushed = as.Date(repo_pushed)) %>% 473 | dplyr::distinct() %>% 474 | dplyr::mutate(repo_info_obtained = lubridate::today()) 475 | ``` 476 | 477 | ```{r get-software-repos-print1, class.source = 'fold-show', message = FALSE, warning = FALSE} 478 | if (length(unique(df$repo_url)) != length(df$repo_url)) { 479 | print(length(unique(df$repo_url))) 480 | print(length(df$repo_url)) 481 | print(df$repo_url[duplicated(df$repo_url)]) 482 | } 483 | stopifnot(length(unique(df$repo_url)) == length(df$repo_url)) 484 | dim(df) 485 | ``` 486 | 487 | ```{r get-software-repos-print2, class.source = 'fold-show', message = FALSE, warning = FALSE} 488 | ## For papers not in df (i.e., for which we didn't get a valid response 489 | ## from the GitHub API query), use information from the archived data frame 490 | dfarchive <- papers_archive %>% 491 | dplyr::select(colnames(df)[colnames(df) %in% 
colnames(papers_archive)]) %>% 492 | dplyr::filter(!(repo_url %in% df$repo_url)) %>% 493 | dplyr::arrange(desc(repo_info_obtained)) %>% 494 | dplyr::filter(!duplicated(repo_url)) 495 | head(dfarchive) 496 | dim(dfarchive) 497 | df <- dplyr::bind_rows(df, dfarchive) 498 | stopifnot(length(unique(df$repo_url)) == length(df$repo_url)) 499 | dim(df) 500 | 501 | papers <- papers %>% dplyr::left_join(df, by = "repo_url") 502 | dim(papers) 503 | 504 | source_track <- c(source_track, 505 | structure(rep("sw-github", length(setdiff(colnames(papers), 506 | names(source_track)))), 507 | names = setdiff(colnames(papers), names(source_track)))) 508 | ``` 509 | 510 | ## Clean up a bit 511 | 512 | ```{r clean-up, class.source = 'fold-show'} 513 | ## Convert publication date to Date format 514 | ## Add information about the half year (H1, H2) of publication 515 | ## Count number of authors 516 | papers <- papers %>% dplyr::select(-reference, -license, -link) %>% 517 | dplyr::mutate(published.date = as.Date(published.print)) %>% 518 | dplyr::mutate( 519 | halfyear = paste0(year(published.date), 520 | ifelse(month(published.date) <= 6, "H1", "H2")) 521 | ) %>% dplyr::mutate( 522 | halfyear = factor(halfyear, 523 | levels = paste0(rep(sort(unique(year(published.date))), 524 | each = 2), c("H1", "H2"))) 525 | ) %>% dplyr::mutate(nbr_authors = vapply(author, function(a) nrow(a), NA_integer_)) 526 | dim(papers) 527 | dupidx <- which(papers$alternative.id %in% papers$alternative.id[duplicated(papers)]) 528 | papers[dupidx, ] %>% arrange(alternative.id) %>% head(n = 10) 529 | 530 | papers <- papers %>% dplyr::distinct() 531 | dim(papers) 532 | 533 | source_track <- c(source_track, 534 | structure(rep("cleanup", length(setdiff(colnames(papers), 535 | names(source_track)))), 536 | names = setdiff(colnames(papers), names(source_track)))) 537 | ``` 538 | 539 | # Tabulate number of missing values 540 | 541 | In some cases, fetching information from (e.g.) 
the GitHub API fails for a 542 | subset of the publications. There are also other reasons for missing values 543 | (for example, the earliest submissions do not have an associated pre-review 544 | issue). The table below lists the number of missing values for each of the 545 | variables in the data frame. 546 | 547 | ```{r nbr-missing, class.source = 'fold-hide'} 548 | DT::datatable( 549 | data.frame(variable = colnames(papers), 550 | nbr_missing = colSums(is.na(papers))) %>% 551 | dplyr::mutate(source = source_track[variable]), 552 | escape = FALSE, rownames = FALSE, 553 | filter = list(position = 'top', clear = FALSE), 554 | options = list(scrollX = TRUE) 555 | ) 556 | ``` 557 | 558 | # Number of published papers per month 559 | 560 | ```{r papers-month, class.source = 'fold-hide', fig.width = 10, message = FALSE} 561 | monthly_pubs <- papers %>% 562 | dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>% 563 | dplyr::group_by(pubmonth) %>% 564 | dplyr::summarize(npub = n()) 565 | ggplot(monthly_pubs, 566 | aes(x = factor(pubmonth), y = npub)) + 567 | geom_bar(stat = "identity") + theme_minimal() + 568 | labs(x = "", y = "Number of published papers per month", caption = dcap) + 569 | theme(axis.title = element_text(size = 15), 570 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 571 | ``` 572 | 573 | ```{r} 574 | DT::datatable( 575 | monthly_pubs %>% 576 | dplyr::rename("Number of papers" = "npub", 577 | "Month of publication" = "pubmonth"), 578 | escape = FALSE, rownames = FALSE, 579 | filter = list(position = 'top', clear = FALSE), 580 | options = list(scrollX = TRUE) 581 | ) 582 | ``` 583 | 584 | # Number of published papers per year 585 | 586 | ```{r papers-year, class.source = 'fold-hide', fig.width = 8, message = FALSE} 587 | yearly_pubs <- papers %>% 588 | dplyr::mutate(pubyear = lubridate::year(published.date)) %>% 589 | dplyr::group_by(pubyear) %>% 590 | dplyr::summarize(npub = n()) 591 | ggplot(yearly_pubs, 592 | 
aes(x = factor(pubyear), y = npub)) + 593 | geom_bar(stat = "identity") + theme_minimal() + 594 | labs(x = "", y = "Number of published papers per year", caption = dcap) + 595 | theme(axis.title = element_text(size = 15), 596 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 597 | ``` 598 | 599 | ```{r} 600 | DT::datatable( 601 | yearly_pubs %>% 602 | dplyr::rename("Number of papers" = "npub", 603 | "Year of publication" = "pubyear"), 604 | escape = FALSE, rownames = FALSE, 605 | filter = list(position = 'top', clear = FALSE), 606 | options = list(scrollX = TRUE) 607 | ) 608 | ``` 609 | 610 | # Number of submissions per month 611 | 612 | We use the number of opened pre-review issues in a month as a proxy for the 613 | number of submissions. 614 | 615 | ```{r submissions-month, class.source = 'fold-hide', fig.width = 10, message = FALSE} 616 | monthly_subs <- prereview_issues |> 617 | dplyr::mutate(submonth = lubridate::floor_date(opened, "month")) |> 618 | dplyr::group_by(submonth) |> 619 | dplyr::summarize(nsub = n()) 620 | ggplot(monthly_subs, 621 | aes(x = factor(submonth), y = nsub)) + 622 | geom_bar(stat = "identity") + theme_minimal() + 623 | labs(x = "", y = "Number of submissions per month", caption = dcap) + 624 | theme(axis.title = element_text(size = 15), 625 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 626 | ``` 627 | 628 | ```{r} 629 | DT::datatable( 630 | monthly_subs |> 631 | dplyr::rename("Number of submissions" = "nsub", 632 | "Month of submission" = "submonth"), 633 | escape = FALSE, rownames = FALSE, 634 | filter = list(position = 'top', clear = FALSE), 635 | options = list(scrollX = TRUE) 636 | ) 637 | ``` 638 | 639 | # Fraction rejected papers 640 | 641 | The plots below illustrate the fraction of pre-review and review issues closed 642 | during each month that have the 'rejected' label attached. 

```{r rejections, class.source = 'fold-hide', fig.width = 10, fig.height = 8, message = FALSE}
## Fraction of closed (pre-)review issues carrying the 'rejected' label,
## per month of closing, faceted by issue type.
ggplot(all_rejected,
       aes(x = factor(closedmonth), y = nbr_rejections/nbr_issues_closed)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  facet_wrap(~ itype, ncol = 1) +
  labs(x = "Month of issue closing", y = "Fraction of issues rejected",
       caption = dcap) +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
```

# Citation distribution

Papers with 20 or more citations are grouped in the ">=20" category.

```{r citation-distribution, class.source = 'fold-hide'}
## Histogram of OpenAlex citation counts; counts >= 20 are collapsed into a
## single ">=20" bin so the bulk of the distribution stays readable.
ggplot(papers %>%
         dplyr::mutate(citation_count = replace(citation_count,
                                                citation_count >= 20, ">=20")) %>%
         dplyr::mutate(citation_count = factor(citation_count,
                                               levels = c(0:20, ">=20"))) %>%
         dplyr::group_by(citation_count) %>%
         dplyr::tally(),
       aes(x = citation_count, y = n)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "OpenAlex citation count", y = "Number of publications", caption = dcap)
```


# Most cited papers

The table below sorts the JOSS papers in decreasing order by the number of
citations in OpenAlex.

```{r most-cited, class.source = 'fold-hide'}
## Render each paper URL as a clickable link (the table is created with
## escape = FALSE so the HTML is interpreted).
## NOTE(review): the <a> markup had been stripped from this line
## (paste0("", url, "")), leaving a no-op concatenation; reconstructed the
## anchor tag here -- confirm against the rendered report.
DT::datatable(
  papers %>%
    dplyr::mutate(url = paste0("<a href='", url, "' target='_blank'>",
                               url, "</a>")) %>%
    dplyr::arrange(desc(citation_count)) %>%
    dplyr::select(title, url, published.date, citation_count),
  escape = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)
```

# Citation count vs time since publication

```{r citations-vs-time, class.source = 'fold-hide', message = FALSE}
## Interactive scatter plot (square-root y scale) of citation count against
## publication date, with hover tooltips showing the paper title.
plotly::ggplotly(
  ggplot(papers, aes(x = published.date, y = citation_count, label = title)) +
    geom_point(alpha = 0.5) + theme_bw() + scale_y_sqrt() +
    geom_smooth() +
    labs(x = "Date of publication", y = "OpenAlex citation count", caption = dcap) +
    theme(axis.title = element_text(size = 15)),
  tooltip = c("label", "x", "y")
)
```

# Power law of citation count within each half year

Here, we plot the citation count for all papers published within each half year,
sorted in decreasing order.

```{r power-law-citations, class.source = 'fold-hide', fig.width = 10, fig.height = 10}
## Within each half year, rank papers by decreasing citation count; the
## global arrange preserves within-group order, so seq_along() gives the
## per-group rank.
ggplot(papers %>% dplyr::group_by(halfyear) %>%
         dplyr::arrange(desc(citation_count)) %>%
         dplyr::mutate(idx = seq_along(citation_count)),
       aes(x = idx, y = citation_count)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ halfyear, scales = "free") +
  theme_bw() +
  labs(x = "Index", y = "OpenAlex citation count", caption = dcap)
```


# Pre-review/review time over time

In these plots we investigate whether the time a submission spends in the
pre-review or review stage (or their sum) has changed over time. The blue curve
corresponds to a rolling median for submissions over 120 days.

```{r smoothing-helpers, class.source = 'fold-hide'}
## Helper functions (modified from https://stackoverflow.com/questions/65147186/geom-smooth-with-median-instead-of-mean)

## Rolling-median smoother usable as a custom `method` for geom_smooth().
## For each unique x value, the fitted y is the median of all y values whose
## x lies strictly within a window of total width `xwindow` centered on x.
## Returns an object of class "rollmed" with the sorted x, the smoothed y,
## and an interpolating function used by predict.rollmed().
rolling_median <- function(formula, data, xwindow = 120, ...) {
  ## Get order of x-values and sort x/y
  ordr <- order(data$x)
  x <- data$x[ordr]
  y <- data$y[ordr]

  ## Initialize vector for smoothed y-values
  ys <- rep(NA, length(x))
  ## Calculate median y-value for each unique x-value
  for (xs in setdiff(unique(x), NA)) {
    ## Get x-values in the window, and calculate median of corresponding y
    j <- ((xs - xwindow/2) < x) & (x < (xs + xwindow/2))
    ys[x == xs] <- median(y[j], na.rm = TRUE)
  }
  y <- ys
  structure(list(x = x, y = y, f = approxfun(x, y)), class = "rollmed")
}

## predict() method for the "rollmed" class: linear interpolation of the
## smoothed values at the requested x positions (used by geom_smooth()).
predict.rollmed <- function(mod, newdata, ...) {
  setNames(mod$f(newdata$x), newdata$x)
}
```

```{r review-time, class.source = 'fold-hide', message = FALSE, warning = FALSE}
## Days spent in pre-review, over time.
ggplot(papers, aes(x = prerev_opened, y = as.numeric(days_in_pre))) +
    geom_point() +
    geom_smooth(formula = y ~ x, method = "rolling_median",
                se = FALSE, method.args = list(xwindow = 120)) +
    theme_bw() +
    labs(x = "Date of pre-review opening", y = "Number of days in pre-review",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))

## Days spent in review, over time.
ggplot(papers, aes(x = review_opened, y = as.numeric(days_in_rev))) +
    geom_point() +
    geom_smooth(formula = y ~ x, method = "rolling_median",
                se = FALSE, method.args = list(xwindow = 120)) +
    theme_bw() +
    labs(x = "Date of review opening", y = "Number of days in review",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))

## Total days (pre-review + review), over time.
ggplot(papers, aes(x = prerev_opened,
                   y = as.numeric(days_in_pre) + as.numeric(days_in_rev))) +
    geom_point() +
    geom_smooth(formula = y ~ x, method = "rolling_median",
                se = FALSE, method.args = list(xwindow = 120)) +
    theme_bw() +
    labs(x = "Date of pre-review opening", y = "Number of days in pre-review + review",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

# Languages

Next, we consider the languages used by the submissions, both as reported by
JOSS and based on the information encoded in available GitHub repositories
(for the latter, we also record the number of bytes of code written in each
language). Note that a given submission can use multiple languages.

```{r languages, class.source = 'fold-hide', fig.width = 9, message = FALSE}
## Language information from JOSS
sspl <- strsplit(papers$languages, ",")
all_languages <- unique(unlist(sspl))
## For each language, count the submissions that list it. FUN.VALUE is
## logical(1) since `%in%` returns a logical (previously `0`, which only
## worked via vapply's logical->double promotion).
langs <- do.call(dplyr::bind_rows, lapply(all_languages, function(l) {
    data.frame(language = l,
               nbr_submissions_JOSS_API = sum(vapply(sspl, function(v) l %in% v,
                                                     logical(1))))
}))

## Language information from GitHub software repos
## Each entry of papers$repo_languages_bytes is "lang1:bytes1,lang2:bytes2,..."
a <- lapply(strsplit(papers$repo_languages_bytes, ","), function(w) strsplit(w, ":"))
## Drop repos without language information (vapply: type-stable sapply).
a <- a[vapply(a, length, integer(1)) > 0]
langbytes <- as.data.frame(t(as.data.frame(a))) %>%
    setNames(c("language", "bytes")) %>%
    dplyr::mutate(bytes = as.numeric(bytes)) %>%
    dplyr::filter(!is.na(language)) %>%
    dplyr::group_by(language) %>%
    dplyr::summarize(nbr_bytes_GitHub = sum(bytes),
                     nbr_repos_GitHub = length(bytes)) %>%
    dplyr::arrange(desc(nbr_bytes_GitHub))

langs <- dplyr::full_join(langs, langbytes, by = "language")
```

```{r language-plot, class.source = 'fold-hide', message = FALSE}
## Languages used by more than 10 submissions, ordered by frequency.
ggplot(langs %>% dplyr::arrange(desc(nbr_submissions_JOSS_API)) %>%
           dplyr::filter(nbr_submissions_JOSS_API > 10) %>%
           dplyr::mutate(language = factor(language, levels = language)),
       aes(x = language, y = nbr_submissions_JOSS_API)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(x = "", y = "Number of submissions", caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

```{r language-bytes, class.source = 'fold-hide', message = FALSE}
DT::datatable(
    langs %>% dplyr::arrange(desc(nbr_bytes_GitHub)),
    escape = FALSE,
    filter = list(position = 'top', clear = FALSE),
    options = list(scrollX = TRUE)
)
```

```{r language-bytes-plot, class.source = 'fold-hide', message = FALSE, warning = FALSE}
ggplot(langs, aes(x = nbr_repos_GitHub, y = nbr_bytes_GitHub)) +
    geom_point() + scale_x_log10() + scale_y_log10() + geom_smooth() +
    theme_bw() +
    labs(x = "Number of repos using the language",
         y = "Total number of bytes of code\nwritten in the language",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

# Association between number of citations and number of stars of the GitHub repo

```{r citation-stars, class.source = 'fold-hide'}
## Explicit plotly:: qualification, for consistency with the other
## interactive plots in this document.
plotly::ggplotly(
    ggplot(papers, aes(x = citation_count, y = repo_nbr_stars,
                       label = title)) +
        geom_point(alpha = 0.5) + scale_x_sqrt() + scale_y_sqrt() +
        theme_bw() +
        labs(x = "OpenAlex citation count", y = "Number of stars, GitHub repo",
             caption = dcap) +
        theme(axis.title = element_text(size = 15)),
    tooltip = c("label", "x", "y")
)
```

# Distribution of time between GitHub repo creation and JOSS submission

```{r creation-to-submission, class.source = 'fold-hide', warning = FALSE}
ggplot(papers, aes(x = as.numeric(prerev_opened - repo_created))) +
    geom_histogram(bins = 50) +
    theme_bw() +
    labs(x = "Time (days) from repo creation to JOSS pre-review start",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

# Distribution of time between JOSS acceptance and last commit

```{r acceptance-to-commit, class.source = 'fold-hide', warning = FALSE, fig.width = 8, fig.height = 8}
## Negative values are possible if the last push predates review closure.
ggplot(papers, aes(x = as.numeric(repo_pushed - review_closed))) +
    geom_histogram(bins = 50) +
    theme_bw() +
    labs(x = "Time (days) from closure of JOSS review to most recent commit in repo",
         caption = dcap) +
    theme(axis.title = element_text(size = 15)) +
    facet_wrap(~ year(published.date), scales = "free_y")
```

# Number of authors per paper

List the papers with the largest number of authors, and display the distribution
of the number of authors per paper, for papers with at most 20 authors.

```{r nbr-authors-top, class.source = 'fold-show'}
## Papers with largest number of authors
papers %>% dplyr::arrange(desc(nbr_authors)) %>%
    dplyr::select(title, published.date, url, nbr_authors) %>%
    as.data.frame() %>% head(10)
```

```{r nbr-authors, class.source = 'fold-hide', message = FALSE, fig.width = 8, fig.height = 8}
## One bin per distinct author count up to 20.
nbins <- max(papers$nbr_authors[papers$nbr_authors <= 20])
ggplot(papers %>% dplyr::filter(nbr_authors <= 20),
       aes(x = nbr_authors)) +
    geom_histogram(bins = nbins, fill = "lightgrey", color = "grey50") +
    theme_bw() +
    facet_wrap(~ year(published.date), scales = "free_y") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Number of authors",
         y = "Number of publications with\na given number of authors",
         caption = dcap)
```

```{r nbr-authors-all, class.source = 'fold-hide', message = FALSE}
## Per-year fraction of submissions with 1, 2, ..., 5 or >5 authors.
## .drop = FALSE keeps empty year/author-count combinations so that geom_area
## gets a complete grid.
ggplot(papers %>%
           dplyr::mutate(nbr_authors = replace(nbr_authors, nbr_authors > 5, ">5")) %>%
           dplyr::mutate(nbr_authors = factor(nbr_authors, levels = c("1", "2", "3",
                                                                      "4", "5", ">5"))) %>%
           dplyr::mutate(year = year(published.date)) %>%
           dplyr::mutate(year = factor(year)) %>%
           dplyr::group_by(year, nbr_authors, .drop = FALSE) %>%
           dplyr::summarize(n = n()) %>%
           dplyr::mutate(freq = n/sum(n)) %>%
           dplyr::mutate(year = as.integer(as.character(year))),
       aes(x = year, y = freq, fill = nbr_authors)) + geom_area() +
    theme_minimal() +
    scale_fill_brewer(palette = "Set1", name = "Number of\nauthors",
                      na.value = "grey") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Year", y = "Fraction of submissions", caption = dcap)

```

# Number of authors vs number of contributors to the GitHub repo

Note that points are slightly jittered to reduce the overlap.

```{r nbr-authors-contribs, class.source = 'fold-hide', message = FALSE}
plotly::ggplotly(
    ggplot(papers, aes(x = nbr_authors, y = repo_nbr_contribs_2ormore, label = title)) +
        geom_abline(slope = 1, intercept = 0) +
        geom_jitter(width = 0.05, height = 0.05, alpha = 0.5) +
        # geom_point(alpha = 0.5) +
        theme_bw() +
        scale_x_sqrt() + scale_y_sqrt() +
        labs(x = "Number of authors",
             y = "Number of contributors\nwith at least 2 commits",
             caption = dcap) +
        theme(axis.title = element_text(size = 15)),
    tooltip = c("label", "x", "y")
)
```

# Number of reviewers per paper

Submissions associated with rOpenSci and pyOpenSci are not considered here,
since they are not explicitly reviewed at JOSS.

```{r nbr-reviewers, class.source = 'fold-hide', message = FALSE, fig.width = 8, fig.height = 8}
ggplot(papers %>%
           dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
           dplyr::mutate(year = year(published.date)),
       aes(x = nbr_reviewers)) + geom_bar() +
    facet_wrap(~ year) + theme_bw() +
    labs(x = "Number of reviewers", y = "Number of submissions", caption = dcap)
```

# Most active reviewers

Submissions associated with rOpenSci and pyOpenSci are not considered here,
since they are not explicitly reviewed at JOSS.

## All time

```{r most-reviewers, class.source = 'fold-hide', message = FALSE}
## Build an interactive table of reviewers with their number of reviews and
## active timespan, optionally restricted to papers published on or after
## `min_date`. Shared by the "all time" / "past 5 years" / "past year"
## sections below (previously three copies of the same code).
reviewer_table <- function(papers, min_date = NULL) {
    reviewers <- papers %>%
        dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
        dplyr::mutate(year = year(published.date))
    if (!is.null(min_date)) {
        reviewers <- reviewers %>%
            dplyr::filter(as.Date(published.date) >= min_date)
    }
    reviewers <- reviewers %>%
        dplyr::select(reviewers, year) %>%
        tidyr::separate_rows(reviewers, sep = ",")

    ## Most active reviewers
    DT::datatable(
        reviewers %>% dplyr::group_by(reviewers) %>%
            dplyr::summarize(nbr_reviews = length(year),
                             timespan = paste(unique(c(min(year), max(year))),
                                              collapse = " - ")) %>%
            dplyr::arrange(desc(nbr_reviews)),
        escape = FALSE, rownames = FALSE,
        filter = list(position = 'top', clear = FALSE),
        options = list(scrollX = TRUE)
    )
}

reviewer_table(papers)
```

## Past 5 years

```{r most-reviewers-past-5years, class.source = 'fold-hide', message = FALSE}
reviewer_table(papers, min_date = lubridate::today() - 5 * 365.25)
```

## Past year

```{r most-reviewers-past-year, class.source = 'fold-hide', message = FALSE}
reviewer_table(papers, min_date = lubridate::today() - 365.25)
```

# Number of papers per editor and year

```{r papers-per-editor, class.source = 'fold-hide', message = FALSE, fig.width = 16, fig.height = 15}
## Bars are shaded by whether the submission came through rOpenSci/pyOpenSci.
ggplot(papers %>%
           dplyr::mutate(year = year(published.date),
                         `r/pyOpenSci` = factor(
                             grepl("rOpenSci|pyOpenSci", prerev_labels),
                             levels = c("TRUE", "FALSE"))),
       aes(x = editor)) + geom_bar(aes(fill = `r/pyOpenSci`)) +
    theme_bw() + facet_wrap(~ year, ncol = 1) +
    scale_fill_manual(values = c(`TRUE` = "grey65", `FALSE` = "grey35")) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(x = "Editor", y = "Number of submissions", caption = dcap)
```


# Distribution of software repo licenses

```{r repo-license, class.source = 'fold-hide', warning = FALSE, message = FALSE, fig.width = 8, fig.height = 8}
## Order license levels by family (Apache, BSD, MIT, GPL, MPL, then the rest)
## for a stable, readable x-axis. (`<-` rather than `=` for assignment.)
all_licenses <- sort(unique(papers$repo_license))
license_levels <- c(grep("apache", all_licenses, value = TRUE),
                    grep("bsd", all_licenses, value = TRUE),
                    grep("mit", all_licenses, value = TRUE),
                    grep("gpl", all_licenses, value = TRUE),
                    grep("mpl", all_licenses, value = TRUE))
license_levels <- c(license_levels, setdiff(all_licenses, license_levels))
ggplot(papers %>%
           dplyr::mutate(repo_license = factor(repo_license,
                                               levels = license_levels)),
       aes(x = repo_license)) +
    geom_bar() +
    theme_bw() +
    labs(x = "Software license", y = "Number of submissions", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    facet_wrap(~ year(published.date), scales = "free_y")
```

```{r repl-license, class.source = 'fold-show', warning = FALSE, message = FALSE}
## For plots below, replace licenses present in less
## than 2.5% of the submissions by 'other'
tbl <- table(papers$repo_license)
to_replace <- names(tbl[tbl <= 0.025 * nrow(papers)])
```

```{r plot-repo-license, class.source = 'fold-hide', warning = FALSE, message = FALSE}
## Absolute number of submissions per license and year.
ggplot(papers %>%
           dplyr::mutate(year = year(published.date)) %>%
           dplyr::mutate(repo_license = replace(repo_license,
                                                repo_license %in% to_replace,
                                                "other")) %>%
           dplyr::mutate(year = factor(year),
                         repo_license = factor(
                             repo_license,
                             levels = license_levels[license_levels %in% repo_license]
                         )) %>%
           dplyr::group_by(year, repo_license, .drop = FALSE) %>%
           dplyr::count() %>%
           dplyr::mutate(year = as.integer(as.character(year))),
       aes(x = year, y = n, fill = repo_license)) + geom_area() +
    theme_minimal() +
    scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                      na.value = "grey") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Year", y = "Number of submissions", caption = dcap)

## Per-year fraction of submissions per license.
ggplot(papers %>%
           dplyr::mutate(year = year(published.date)) %>%
           dplyr::mutate(repo_license = replace(repo_license,
                                                repo_license %in% to_replace,
                                                "other")) %>%
           dplyr::mutate(year = factor(year),
                         repo_license = factor(
                             repo_license,
                             levels = license_levels[license_levels %in% repo_license]
                         )) %>%
           dplyr::group_by(year, repo_license, .drop = FALSE) %>%
           dplyr::summarize(n = n()) %>%
           dplyr::mutate(freq = n/sum(n)) %>%
           dplyr::mutate(year = as.integer(as.character(year))),
       aes(x = year, y = freq, fill = repo_license)) + geom_area() +
    theme_minimal() +
    scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                      na.value = "grey") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Year", y = "Fraction of submissions", caption = dcap)
```

# Most common GitHub repo topics

```{r github-topics, class.source = 'fold-hide', fig.width = 12, fig.height = 12, warning = FALSE, message = FALSE}
## Word cloud (sqrt-scaled frequencies) plus a searchable frequency table.
a <- unlist(strsplit(papers$repo_topics, ","))
a <- a[!is.na(a)]
topicfreq <- table(a)

colors <- viridis::viridis(100)
set.seed(1234)
wordcloud::wordcloud(
    names(topicfreq), sqrt(topicfreq), min.freq = 1, max.words = 300,
    random.order = FALSE, rot.per = 0.05, use.r.layout = FALSE,
    colors = colors, scale = c(10, 0.1), random.color = TRUE,
    ordered.colors = FALSE, vfont = c("serif", "plain")
)

DT::datatable(as.data.frame(topicfreq) %>%
                  dplyr::rename(topic = a, nbr_repos = Freq) %>%
                  dplyr::arrange(desc(nbr_repos)),
              escape = FALSE, rownames = FALSE,
              filter = list(position = 'top', clear = FALSE),
              options = list(scrollX = TRUE))
```


# Citation analysis

Here, we take a more detailed look at the papers that cite JOSS papers, using
data from the Open Citations Corpus.

## Get citing papers for each submission

```{r get-citing-papers, class.source = 'fold-show', warning = FALSE, message = FALSE}
## Split into several queries
## Randomize the splitting since a whole query may fail if one ID is not recognized
papidx <- seq_len(nrow(papers))
idxL <- split(sample(papidx, length(papidx), replace = FALSE), ceiling(papidx / 50))
citationsL <- lapply(idxL, function(idx) {
    tryCatch({
        citecorp::oc_coci_cites(doi = papers$alternative.id[idx]) %>%
            dplyr::distinct() %>%
            dplyr::mutate(citation_info_obtained = as.character(lubridate::today()))
    }, error = function(e) {
        ## Best-effort: a failed batch is simply dropped below.
        NULL
    })
})
citationsL <- citationsL[vapply(citationsL, function(df) !is.null(df) && nrow(df) > 0, FALSE)]
if (length(citationsL) > 0) {
    citations <- do.call(dplyr::bind_rows, citationsL)
} else {
    citations <- NULL
}
dim(citations)

if (!is.null(citations) && is.data.frame(citations) && "oci" %in% colnames(citations)) {
    ## Keep only citations not already in the archive, with a non-empty citing DOI.
    citations <- citations %>%
        dplyr::filter(!(oci %in% citations_archive$oci) &
                          citing != "")

    tmpj <- rcrossref::cr_works(dois = unique(citations$citing))$data %>%
        dplyr::select(contains("doi"), contains("container.title"), contains("issn"),
                      contains("type"), contains("publisher"), contains("prefix"))
    citations <- citations %>% dplyr::left_join(tmpj, by = c("citing" = "doi"))

    ## bioRxiv preprints don't have a 'container.title' or 'issn', but we'll assume
    ## that they can be identified from the prefix 10.1101 - set the container.title
    ## for these records manually; we may or may not want to count these
    ## (would it count citations twice, both preprint and publication?)
    citations$container.title[citations$prefix == "10.1101"] <- "bioRxiv"

    ## JOSS is represented by 'The Journal of Open Source Software' as well as
    ## 'Journal of Open Source Software'
    citations$container.title[citations$container.title ==
                                  "Journal of Open Source Software"] <-
        "The Journal of Open Source Software"

    ## Remove real self citations (cited DOI = citing DOI)
    citations <- citations %>% dplyr::filter(cited != citing)

    ## Merge with the archive
    citations <- dplyr::bind_rows(citations, citations_archive)
} else {
    citations <- citations_archive
    if (is.null(citations[["citation_info_obtained"]])) {
        citations$citation_info_obtained <- NA_character_
    }
}

## Records predating the tracking of retrieval dates get the date on which
## this column was introduced.
citations$citation_info_obtained[is.na(citations$citation_info_obtained)] <-
    "2021-08-11"

write.table(citations, file = "joss_submission_citations.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
```

## Summary statistics

```{r citation-summary-stats, class.source = 'fold-show', warning = FALSE, message = FALSE}
## Latest successful update of new citation data
max(as.Date(citations$citation_info_obtained))

## Number of JOSS papers with >0 citations included in this collection
length(unique(citations$cited))

## Number of JOSS papers with >0 citations according to OpenAlex
length(which(papers$citation_count > 0))
```

```{r citation-merge, class.source = 'fold-hide', warning = FALSE, message = FALSE}
## Number of citations from Open Citations Corpus vs OpenAlex.
## The NA->0 replacement must happen *after* the full_join (papers without any
## Open Citations record get n = NA from the join); previously it was applied
## to the tally() result before joining, where no NAs can occur, so it had no
## effect.
df0 <- papers %>% dplyr::select(doi, citation_count) %>%
    dplyr::full_join(citations %>% dplyr::group_by(cited) %>%
                         dplyr::tally(),
                     by = c("doi" = "cited")) %>%
    dplyr::mutate(n = replace(n, is.na(n), 0))
```

```{r citation-fraction, class.source = 'fold-show', warning = FALSE, message = FALSE}
## Total citation count OpenAlex
sum(df0$citation_count, na.rm = TRUE)

## Total citation count Open Citations Corpus
sum(df0$n, na.rm = TRUE)

## Ratio of total citation count Open Citations Corpus/OpenAlex
sum(df0$n, na.rm = TRUE)/sum(df0$citation_count, na.rm = TRUE)
```

```{r citation-plot-crossref, class.source = 'fold-hide', warning = FALSE, message = FALSE}
ggplot(df0, aes(x = citation_count, y = n)) +
    geom_abline(slope = 1, intercept = 0) +
    geom_point(size = 3, alpha = 0.5) +
    labs(x = "OpenAlex citation count", y = "Open Citations Corpus citation count",
         caption = dcap) +
    theme_bw()

## Zoom in
ggplot(df0, aes(x = citation_count, y = n)) +
    geom_abline(slope = 1, intercept = 0) +
    geom_point(size = 3, alpha = 0.5) +
    labs(x = "OpenAlex citation count", y = "Open Citations Corpus citation count",
         caption = dcap) +
    theme_bw() +
    coord_cartesian(xlim = c(0, 75), ylim = c(0, 75))
```

```{r citation-nbr-journals, class.source = 'fold-show'}
## Number of journals citing JOSS papers
length(unique(citations$container.title))
length(unique(citations$issn))
```

## Most citing journals

```{r citation-top-journals, class.source = 'fold-hide', message = FALSE}
topcit <- citations %>% dplyr::group_by(container.title) %>%
    dplyr::summarize(nbr_citations_of_joss_papers = length(cited),
                     nbr_cited_joss_papers = length(unique(cited)),
                     nbr_citing_papers = length(unique(citing)),
                     nbr_selfcitations_of_joss_papers = sum(author_sc == "yes"),
                     fraction_selfcitations =
                         signif(nbr_selfcitations_of_joss_papers /
                                    nbr_citations_of_joss_papers, digits = 3)) %>%
    dplyr::arrange(desc(nbr_cited_joss_papers))
DT::datatable(topcit,
              escape = FALSE, rownames = FALSE,
              filter = list(position = 'top', clear = FALSE),
              options = list(scrollX = TRUE))
```

```{r citation-journals-plot, class.source = 'fold-hide'}
plotly::ggplotly(
    ggplot(topcit, aes(x = nbr_citations_of_joss_papers, y = nbr_cited_joss_papers,
                       label = container.title)) +
        geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") +
        geom_point(size = 3, alpha = 0.5) +
        theme_bw() +
        labs(caption = dcap, x = "Number of citations of JOSS papers",
             y = "Number of cited JOSS papers")
)
plotly::ggplotly(
    ggplot(topcit, aes(x = nbr_citations_of_joss_papers, y = nbr_cited_joss_papers,
                       label = container.title)) +
        geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") +
        geom_point(size = 3, alpha = 0.5) +
        theme_bw() +
        coord_cartesian(xlim = c(0, 100), ylim = c(0, 50)) +
        labs(caption = dcap, x = "Number of citations of JOSS papers",
             y = "Number of cited JOSS papers")
)
```

```{r}
write.table(topcit, file = "joss_submission_citations_byjournal.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
```


# Save object

The tibble object with all data collected above is serialized to a file that
can be downloaded and reused.

```{r save-data}
## Preview the collected data, then serialize the full tibble so that it can
## be downloaded and reused (it is published on the gh-pages branch).
head(papers) %>% as.data.frame()
saveRDS(papers, file = "joss_submission_analytics.rds")
```

To read the current version of this file directly from GitHub, use the
following code:

```{r, class.source = 'fold-show', eval = FALSE}
papers <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true")))
```

# Session info

```{r session-info}
sessionInfo()
```