├── README.md
├── LICENSE
├── .github
│   └── workflows
│       └── build-deploy-rmd.yaml
└── joss-submission-analytics.Rmd
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# joss-analytics
Analysis of JOSS data and statistics

Browse at http://www.theoj.org/joss-analytics/joss-submission-analytics.html
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Open Journals

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 | -------------------------------------------------------------------------------- /.github/workflows/build-deploy-rmd.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | schedule: 9 | - cron: '0 9 * * 3' 10 | workflow_dispatch: 11 | 12 | name: build-deploy-rmd 13 | 14 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 15 | # permissions: 16 | # contents: read 17 | # pages: write 18 | # id-token: write 19 | 20 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 21 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 22 | # concurrency: 23 | # group: "pages" 24 | # cancel-in-progress: false 25 | 26 | jobs: 27 | build-rmd: 28 | runs-on: ${{ matrix.config.os }} 29 | 30 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | config: 36 | - {os: macOS-latest, r: 'release'} 37 | 38 | env: 39 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 40 | RSPM: ${{ matrix.config.rspm }} 41 | CRAN: ${{ matrix.config.cran }} 42 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 43 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | steps: 46 | - name: Check out repo 47 | uses: actions/checkout@v2 48 | with: 49 | submodules: true 50 | 51 | - name: Set up R 52 | uses: r-lib/actions/setup-r@v2 53 | with: 54 | r-version: ${{ matrix.config.r }} 55 | 56 | - name: Set up pandoc 57 | uses: r-lib/actions/setup-pandoc@v2 58 | 59 | ## rcrossref requires an email address associated with a query 60 | - name: Set up crossref email 61 | run: | 62 | echo crossref_email=\"${{ secrets.CROSSREF_EMAIL }}\" >> ~/.Renviron 63 | 64 | - name: Install dependencies 65 | run: | 66 | install.packages(c('remotes', 'dplyr', 'ggplot2', 'rmarkdown', 67 | 'knitr', 'tibble', 'tidyr', 68 | 'lubridate', 'gh', 
'jsonlite', 'purrr', 69 | 'DT', 'plotly', 'citecorp', 'readr', 70 | 'viridis', 'wordcloud', 'stringr', 'gt', 71 | 'rworldmap', 'openalexR'), Ncpu = 2L) 72 | remotes::install_github('ropensci/rcrossref') 73 | shell: Rscript {0} 74 | 75 | - name: Session info 76 | run: | 77 | install.packages('sessioninfo', Ncpus = 2L) 78 | options(width = 100) 79 | pkgs <- installed.packages()[, "Package"] 80 | sessioninfo::session_info(pkgs, include_base = TRUE) 81 | shell: Rscript {0} 82 | 83 | - name: Render site 84 | run: | 85 | rmarkdown::render(input = "joss-submission-analytics.Rmd", clean = FALSE) 86 | shell: Rscript {0} 87 | 88 | - name: Prepare files to deploy 89 | run: | 90 | mkdir _site 91 | touch _site/.nojekyll 92 | cp -r joss-submission-analytics_files joss-submission-analytics.html _site/ 93 | cp -r joss_submission_analytics.rds _site/ 94 | cp -r joss_submission_citations_byjournal.tsv _site/ 95 | cp -r joss_submission_citations.tsv _site/ 96 | 97 | - name: Deploy 🚀 98 | uses: JamesIves/github-pages-deploy-action@releases/v4 99 | with: 100 | ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} 101 | BRANCH: gh-pages 102 | FOLDER: _site 103 | 104 | - name: Upload check results 105 | if: failure() 106 | uses: actions/upload-artifact@v4 107 | with: 108 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 109 | path: check 110 | 111 | - name: upload artifact 112 | uses: actions/upload-artifact@v4 113 | with: 114 | name: submissionanalytics 115 | path: joss_submission_analytics.rds 116 | 117 | ## From 2024-06-30, need to use GitHub Actions for deployment 118 | # - name: Setup Pages 119 | # uses: actions/configure-pages@v5 120 | # 121 | # - name: Build with Jekyll 122 | # uses: actions/jekyll-build-pages@v1 123 | # with: 124 | # source: ./_site 125 | # destination: ./_site_jkl 126 | # 127 | # - name: Upload artifact 128 | # uses: actions/upload-pages-artifact@v3 129 | # with: 130 | # path: ./_site_jkl 131 | 132 | # Deployment job 133 | # deploy: 134 | # environment: 135 | # name: 
github-pages 136 | # url: ${{ steps.deployment.outputs.page_url }} 137 | # runs-on: ubuntu-latest 138 | # needs: build-rmd 139 | # 140 | # steps: 141 | # - name: Deploy to GitHub Pages 142 | # id: deployment 143 | # uses: actions/deploy-pages@v4 144 | -------------------------------------------------------------------------------- /joss-submission-analytics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "JOSS submission analytics" 3 | date: "`r Sys.time()`" 4 | output: 5 | html_document: 6 | code_folding: hide 7 | theme: united 8 | toc: true 9 | toc_float: true 10 | editor_options: 11 | chunk_output_type: console 12 | --- 13 | 14 | ```{r setup, include=FALSE} 15 | knitr::opts_chunk$set(echo = TRUE, dev = c("png", "pdf")) 16 | ``` 17 | 18 | # Introduction 19 | 20 | In this report, we extract information about published JOSS papers and generate 21 | graphics as well as a summary table that can be downloaded and used for further 22 | analyses. 
23 | 24 | # Load required R packages 25 | 26 | ```{r load-packages, class.source = 'fold-show'} 27 | suppressPackageStartupMessages({ 28 | library(tibble) 29 | library(rcrossref) 30 | library(dplyr) 31 | library(tidyr) 32 | library(ggplot2) 33 | library(lubridate) 34 | library(gh) 35 | library(purrr) 36 | library(jsonlite) 37 | library(DT) 38 | library(plotly) 39 | library(citecorp) 40 | library(readr) 41 | library(rworldmap) 42 | library(gt) 43 | library(stringr) 44 | library(openalexR) 45 | }) 46 | ``` 47 | 48 | ```{r source-track, class.source = 'fold-hide'} 49 | ## Keep track of the source of each column 50 | source_track <- c() 51 | 52 | ## Determine whether to add a caption with today's date to the (non-interactive) plots 53 | add_date_caption <- TRUE 54 | if (add_date_caption) { 55 | dcap <- lubridate::today() 56 | } else { 57 | dcap <- "" 58 | } 59 | ``` 60 | 61 | ```{r} 62 | ## Get list of countries and populations (2022) from the rworldmap/gt packages 63 | data("countrySynonyms") 64 | country_names <- countrySynonyms |> 65 | select(-ID) |> 66 | pivot_longer(names_to = "tmp", values_to = "name", -ISO3) |> 67 | filter(name != "") |> 68 | select(-tmp) 69 | 70 | ## Country population data from the World Bank (https://data.worldbank.org/indicator/SP.POP.TOTL), 71 | ## distributed via the gt R package 72 | country_populations <- countrypops |> 73 | filter(year == 2022) 74 | ``` 75 | 76 | ```{r} 77 | ## Read archived version of summary data frame, to use for filling in 78 | ## information about software repositories (due to limit on API requests) 79 | ## Sort by the date when software repo info was last obtained 80 | papers_archive <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))) %>% 81 | dplyr::arrange(!is.na(repo_info_obtained), repo_info_obtained) 82 | 83 | ## Similarly for citation analysis, to avoid having to pull down the 84 | ## same information multiple times 85 | 
citations_archive <- readr::read_delim( 86 | url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"), 87 | col_types = cols(.default = "c"), col_names = TRUE, 88 | delim = "\t") 89 | ``` 90 | 91 | # Collect information about papers 92 | 93 | ## Pull down paper info from Crossref and citation information from OpenAlex 94 | 95 | We get the information about published JOSS papers from Crossref, using the 96 | `rcrossref` R package. The `openalexR` R package is used to extract citation 97 | counts from OpenAlex. 98 | 99 | ```{r pull-crossref, class.source = 'fold-show'} 100 | ## First check how many records there are in Crossref 101 | issn <- "2475-9066" 102 | joss_details <- rcrossref::cr_journals(issn, works = FALSE) %>% 103 | pluck("data") 104 | (total_dois <- joss_details$total_dois) 105 | 106 | ## Pull down all records from Crossref 107 | papers <- rcrossref::cr_journals(issn, works = TRUE, cursor = "*", 108 | cursor_max = joss_details$total_dois * 2) %>% 109 | pluck("data") 110 | 111 | ## Only keep articles 112 | papers <- papers %>% 113 | dplyr::filter(type == "journal-article") 114 | dim(papers) 115 | dim(papers %>% distinct()) 116 | 117 | ## Check that all papers were pulled down and stop otherwise 118 | if (!(nrow(papers %>% distinct()) >= total_dois)) { 119 | stop("Not all papers were pulled down from Crossref!") 120 | } 121 | 122 | ## A few papers don't have alternative.ids - generate them from the DOI 123 | noaltid <- which(is.na(papers$alternative.id)) 124 | papers$alternative.id[noaltid] <- papers$doi[noaltid] 125 | 126 | ## Get citation info from Crossref and merge with paper details 127 | # cit <- rcrossref::cr_citation_count(doi = papers$alternative.id) 128 | # papers <- papers %>% dplyr::left_join( 129 | # cit %>% dplyr::rename(citation_count = count), 130 | # by = c("alternative.id" = "doi") 131 | # ) 132 | 133 | ## Remove one duplicated paper 134 | papers <- papers %>% dplyr::filter(alternative.id 
!= "10.21105/joss.00688") 135 | dim(papers) 136 | dim(papers %>% distinct()) 137 | papers$alternative.id[duplicated(papers$alternative.id)] 138 | 139 | source_track <- c(source_track, 140 | structure(rep("crossref", ncol(papers)), 141 | names = colnames(papers))) 142 | ``` 143 | 144 | ```{r} 145 | ## Get info from openalexR and merge with paper details 146 | ## Helper function to extract countries from affiliations. Note that this 147 | ## information is not available for all papers. 148 | .get_countries <- function(df, wh = "first") { 149 | if ((length(df) == 1 && is.na(df)) || is.null(df$affiliations)) { 150 | "" 151 | } else { 152 | if (wh == "first") { 153 | ## Only first affiliation for each author 154 | tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 155 | dplyr::filter(!duplicated(id) & !is.na(affiliations_country_code)) |> 156 | pull(affiliations_country_code) 157 | } else { 158 | ## All affiliations 159 | tmp <- unnest(df, cols = c(affiliations), names_sep = "_") |> 160 | dplyr::filter(!is.na(affiliations_country_code)) |> 161 | pull(affiliations_country_code) 162 | } 163 | if (length(tmp) > 0) { 164 | tmp |> 165 | unique() |> 166 | paste(collapse = ";") 167 | } else { 168 | "" 169 | } 170 | } 171 | } 172 | 173 | oa <- oa_fetch(entity = "works", 174 | primary_location.source.id = "s4210214273") |> 175 | mutate(affil_countries_all = vapply(authorships, .get_countries, "", wh = "all"), 176 | affil_countries_first = vapply(authorships, .get_countries, "", wh = "first")) 177 | dim(oa) 178 | length(unique(oa$doi)) 179 | 180 | papers <- papers %>% dplyr::left_join( 181 | oa %>% dplyr::mutate(alternative.id = sub("https://doi.org/", "", doi)) %>% 182 | dplyr::select(alternative.id, cited_by_count, id, 183 | affil_countries_all, affil_countries_first) %>% 184 | dplyr::rename(citation_count = cited_by_count, 185 | openalex_id = id), 186 | by = "alternative.id" 187 | ) 188 | dim(papers) 189 | dim(papers %>% distinct()) 190 | 191 | source_track <- 
c(source_track, 192 | structure(rep("OpenAlex", length(setdiff(colnames(papers), 193 | names(source_track)))), 194 | names = setdiff(colnames(papers), names(source_track)))) 195 | ``` 196 | 197 | ## Pull down info from JOSS API 198 | 199 | For each published paper, we use the JOSS API to get information about 200 | pre-review and review issue numbers, corresponding software repository etc. 201 | 202 | ```{r pull-joss-api, class.source = 'fold-show'} 203 | joss_api <- list() 204 | p <- 1 205 | a0 <- NULL 206 | a <- jsonlite::fromJSON( 207 | url(paste0("https://joss.theoj.org/papers/published.json?page=", p)), 208 | simplifyDataFrame = FALSE 209 | ) 210 | while (length(a) > 0 && !identical(a, a0)) { 211 | joss_api <- c(joss_api, a) 212 | p <- p + 1 213 | a0 <- a 214 | a <- tryCatch({ 215 | jsonlite::fromJSON( 216 | url(paste0("https://joss.theoj.org/papers/published.json?page=", p)), 217 | simplifyDataFrame = FALSE 218 | )}, 219 | error = function(e) return(numeric(0)) 220 | ) 221 | } 222 | 223 | joss_api <- do.call(dplyr::bind_rows, lapply(joss_api, function(w) { 224 | data.frame(api_title = w$title, 225 | api_state = w$state, 226 | author_affiliations = paste(unique(unlist(lapply(w$authors, "[[", "affiliation"))), collapse = ";"), 227 | editor = paste(w$editor, collapse = ","), 228 | reviewers = paste(w$reviewers, collapse = ","), 229 | nbr_reviewers = length(w$reviewers), 230 | repo_url = w$software_repository, 231 | review_issue_id = sub("https://github.com/openjournals/joss-reviews/issues/", 232 | "", w$paper_review), 233 | doi = w$doi, 234 | prereview_issue_id = ifelse(!is.null(w$meta_review_issue_id), 235 | w$meta_review_issue_id, NA_integer_), 236 | languages = gsub(", ", ",", w$languages), 237 | archive_doi = w$software_archive) 238 | })) 239 | dim(joss_api) 240 | dim(joss_api %>% distinct()) 241 | ## Check that all papers were pulled down and stop otherwise 242 | if (!(nrow(joss_api %>% distinct()) >= total_dois)) { 243 | stop("Not all papers were pulled 
down from the JOSS API!") 244 | } 245 | joss_api$repo_url[duplicated(joss_api$repo_url)] 246 | 247 | papers <- papers %>% dplyr::left_join(joss_api, by = c("alternative.id" = "doi")) 248 | dim(papers) 249 | dim(papers %>% distinct()) 250 | papers$repo_url[duplicated(papers$repo_url)] 251 | 252 | source_track <- c(source_track, 253 | structure(rep("JOSS_API", length(setdiff(colnames(papers), 254 | names(source_track)))), 255 | names = setdiff(colnames(papers), names(source_track)))) 256 | ``` 257 | 258 | ## Combine with info from GitHub issues 259 | 260 | From each pre-review and review issue, we extract information about review 261 | times and assigned labels. 262 | 263 | ```{r pull-github, class.source = 'fold-show', message = FALSE} 264 | ## Pull down info on all issues in the joss-reviews repository 265 | issues <- gh("/repos/openjournals/joss-reviews/issues", 266 | .limit = 15000, state = "all") 267 | ``` 268 | 269 | ```{r extract-github, class.source = 'fold-show', message = FALSE} 270 | ## From each issue, extract required information 271 | iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) { 272 | data.frame(title = i$title, 273 | number = i$number, 274 | state = i$state, 275 | opened = i$created_at, 276 | closed = ifelse(!is.null(i$closed_at), 277 | i$closed_at, NA_character_), 278 | ncomments = i$comments, 279 | labels = paste(setdiff( 280 | vapply(i$labels, getElement, 281 | name = "name", character(1L)), 282 | c("review", "pre-review", "query-scope", "paused")), 283 | collapse = ",")) 284 | })) 285 | 286 | ## Split into REVIEW, PRE-REVIEW, and other issues (the latter category 287 | ## is discarded) 288 | issother <- iss %>% dplyr::filter(!grepl("\\[PRE REVIEW\\]", title) & 289 | !grepl("\\[REVIEW\\]", title)) 290 | dim(issother) 291 | head(issother) 292 | 293 | ## For REVIEW issues, generate the DOI of the paper from the issue number 294 | getnbrzeros <- function(s) { 295 | paste(rep(0, 5 - nchar(s)), collapse = "") 296 | } 297 | issrev <- iss 
%>% dplyr::filter(grepl("\\[REVIEW\\]", title)) %>% 298 | dplyr::mutate(nbrzeros = purrr::map_chr(number, getnbrzeros)) %>% 299 | dplyr::mutate(alternative.id = paste0("10.21105/joss.", 300 | nbrzeros, 301 | number)) %>% 302 | dplyr::select(-nbrzeros) %>% 303 | dplyr::mutate(title = gsub("\\[REVIEW\\]: ", "", title)) %>% 304 | dplyr::rename_at(vars(-alternative.id), ~ paste0("review_", .)) 305 | ``` 306 | 307 | ```{r get-rejection-info, class.source = 'fold-show', message = FALSE} 308 | ## For pre-review and review issues, respectively, get the number of 309 | ## issues closed each month, and the number of those that have the 310 | ## 'rejected' label 311 | review_rejected <- iss %>% 312 | dplyr::filter(grepl("\\[REVIEW\\]", title)) %>% 313 | dplyr::filter(!is.na(closed)) %>% 314 | dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>% 315 | dplyr::group_by(closedmonth) %>% 316 | dplyr::summarize(nbr_issues_closed = length(labels), 317 | nbr_rejections = sum(grepl("rejected", labels))) %>% 318 | dplyr::mutate(itype = "review") 319 | 320 | prereview_rejected <- iss %>% 321 | dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>% 322 | dplyr::filter(!is.na(closed)) %>% 323 | dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>% 324 | dplyr::group_by(closedmonth) %>% 325 | dplyr::summarize(nbr_issues_closed = length(labels), 326 | nbr_rejections = sum(grepl("rejected", labels))) %>% 327 | dplyr::mutate(itype = "pre-review") 328 | 329 | all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected) 330 | ``` 331 | 332 | ```{r get-submission-info, class.source = 'fold-show', message = FALSE} 333 | ## Get only pre-review issues plus review issues opened before 2016-09-18, 334 | ## will use these as a proxy for the number of submissions 335 | pi1 <- iss |> 336 | dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) |> 337 | dplyr::mutate(opened = as.Date(opened)) 338 | dim(pi1) 339 | pi2 <- iss |> 340 | 
dplyr::filter(grepl("\\[REVIEW\\]", title)) |> 341 | dplyr::mutate(opened = as.Date(opened)) |> 342 | dplyr::filter(opened <= as.Date("2016-09-18")) 343 | dim(pi2) 344 | prereview_issues <- dplyr::bind_rows(pi1, pi2) 345 | ``` 346 | 347 | ```{r extract-github-2, class.source = 'fold-show', message = FALSE} 348 | ## For PRE-REVIEW issues, add information about the corresponding REVIEW 349 | ## issue number 350 | isspre <- iss %>% dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>% 351 | dplyr::filter(!grepl("withdrawn", labels)) %>% 352 | dplyr::filter(!grepl("rejected", labels)) 353 | ## Some titles have multiple pre-review issues. In these cases, keep the latest 354 | isspre <- isspre %>% dplyr::arrange(desc(number)) %>% 355 | dplyr::filter(!duplicated(title)) %>% 356 | dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>% 357 | dplyr::rename_all(~ paste0("prerev_", .)) 358 | 359 | papers <- papers %>% dplyr::left_join(issrev, by = "alternative.id") %>% 360 | dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>% 361 | dplyr::mutate(prerev_opened = as.Date(prerev_opened), 362 | prerev_closed = as.Date(prerev_closed), 363 | review_opened = as.Date(review_opened), 364 | review_closed = as.Date(review_closed)) %>% 365 | dplyr::mutate(days_in_pre = prerev_closed - prerev_opened, 366 | days_in_rev = review_closed - review_opened, 367 | to_review = !is.na(review_opened)) 368 | dim(papers) 369 | dim(papers %>% distinct()) 370 | 371 | source_track <- c(source_track, 372 | structure(rep("joss-github", length(setdiff(colnames(papers), 373 | names(source_track)))), 374 | names = setdiff(colnames(papers), names(source_track)))) 375 | ``` 376 | 377 | ## Add information from software repositories 378 | 379 | ```{r check-software-repos, class.source = 'fold-show', message = FALSE} 380 | ## Reorder so that software repositories that were interrogated longest 381 | ## ago are checked first 382 | tmporder <- order(match(papers$alternative.id, 
papers_archive$alternative.id), 383 | na.last = FALSE) 384 | software_urls <- papers$repo_url[tmporder] 385 | software_urls[duplicated(software_urls)] 386 | is_github <- grepl("github", software_urls) 387 | length(is_github) 388 | sum(is_github) 389 | software_urls[!is_github] 390 | ``` 391 | 392 | ```{r get-software-repos, class.source = 'fold-show', message = FALSE, results = 'hide', warning = FALSE} 393 | df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) { 394 | u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u))) 395 | if (grepl("/tree/", u0)) { 396 | u0 <- strsplit(u0, "/tree/")[[1]][1] 397 | } 398 | if (grepl("/blob/", u0)) { 399 | u0 <- strsplit(u0, "/blob/")[[1]][1] 400 | } 401 | info <- try({ 402 | gh(gsub("(https://)?(www.)?github.com/", "/repos/", u0)) 403 | }) 404 | languages <- try({ 405 | gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/languages"), 406 | .limit = 500) 407 | }) 408 | topics <- try({ 409 | gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/topics"), 410 | .accept = "application/vnd.github.mercy-preview+json", .limit = 500) 411 | }) 412 | contribs <- try({ 413 | gh(paste0(gsub("(https://)?(www.)?github.com/", "/repos/", u0), "/contributors"), 414 | .limit = 500) 415 | }) 416 | if (!is(info, "try-error") && length(info) > 1) { 417 | if (!is(contribs, "try-error")) { 418 | if (length(contribs) == 0) { 419 | repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_ 420 | } else { 421 | repo_nbr_contribs <- length(contribs) 422 | repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_)) 423 | if (is.na(repo_nbr_contribs_2ormore)) { 424 | print(contribs) 425 | } 426 | } 427 | } else { 428 | repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_ 429 | } 430 | 431 | if (!is(languages, "try-error")) { 432 | if (length(languages) == 0) { 433 | repolang <- "" 434 | } else { 435 | repolang <- 
paste(paste(names(unlist(languages)), 436 | unlist(languages), sep = ":"), collapse = ",") 437 | } 438 | } else { 439 | repolang <- "" 440 | } 441 | 442 | if (!is(topics, "try-error")) { 443 | if (length(topics$names) == 0) { 444 | repotopics <- "" 445 | } else { 446 | repotopics <- paste(unlist(topics$names), collapse = ",") 447 | } 448 | } else { 449 | repotopics <- "" 450 | } 451 | 452 | data.frame(repo_url = u, 453 | repo_created = info$created_at, 454 | repo_updated = info$updated_at, 455 | repo_pushed = info$pushed_at, 456 | repo_nbr_stars = info$stargazers_count, 457 | repo_language = ifelse(!is.null(info$language), 458 | info$language, NA_character_), 459 | repo_languages_bytes = repolang, 460 | repo_topics = repotopics, 461 | repo_license = ifelse(!is.null(info$license), 462 | info$license$key, NA_character_), 463 | repo_nbr_contribs = repo_nbr_contribs, 464 | repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore 465 | ) 466 | } else { 467 | NULL 468 | } 469 | })) %>% 470 | dplyr::mutate(repo_created = as.Date(repo_created), 471 | repo_updated = as.Date(repo_updated), 472 | repo_pushed = as.Date(repo_pushed)) %>% 473 | dplyr::distinct() %>% 474 | dplyr::mutate(repo_info_obtained = lubridate::today()) 475 | ``` 476 | 477 | ```{r get-software-repos-print1, class.source = 'fold-show', message = FALSE, warning = FALSE} 478 | if (length(unique(df$repo_url)) != length(df$repo_url)) { 479 | print(length(unique(df$repo_url))) 480 | print(length(df$repo_url)) 481 | print(df$repo_url[duplicated(df$repo_url)]) 482 | } 483 | stopifnot(length(unique(df$repo_url)) == length(df$repo_url)) 484 | dim(df) 485 | ``` 486 | 487 | ```{r get-software-repos-print2, class.source = 'fold-show', message = FALSE, warning = FALSE} 488 | ## For papers not in df (i.e., for which we didn't get a valid response 489 | ## from the GitHub API query), use information from the archived data frame 490 | dfarchive <- papers_archive %>% 491 | dplyr::select(colnames(df)[colnames(df) %in% 
colnames(papers_archive)]) %>% 492 | dplyr::filter(!(repo_url %in% df$repo_url)) %>% 493 | dplyr::arrange(desc(repo_info_obtained)) %>% 494 | dplyr::filter(!duplicated(repo_url)) 495 | head(dfarchive) 496 | dim(dfarchive) 497 | df <- dplyr::bind_rows(df, dfarchive) 498 | stopifnot(length(unique(df$repo_url)) == length(df$repo_url)) 499 | dim(df) 500 | 501 | papers <- papers %>% dplyr::left_join(df, by = "repo_url") 502 | dim(papers) 503 | 504 | source_track <- c(source_track, 505 | structure(rep("sw-github", length(setdiff(colnames(papers), 506 | names(source_track)))), 507 | names = setdiff(colnames(papers), names(source_track)))) 508 | ``` 509 | 510 | ## Clean up a bit 511 | 512 | ```{r clean-up, class.source = 'fold-show'} 513 | ## Convert publication date to Date format 514 | ## Add information about the half year (H1, H2) of publication 515 | ## Count number of authors 516 | papers <- papers %>% dplyr::select(-reference, -license, -link) %>% 517 | dplyr::mutate(published.date = as.Date(published.print)) %>% 518 | dplyr::mutate( 519 | halfyear = paste0(year(published.date), 520 | ifelse(month(published.date) <= 6, "H1", "H2")) 521 | ) %>% dplyr::mutate( 522 | halfyear = factor(halfyear, 523 | levels = paste0(rep(sort(unique(year(published.date))), 524 | each = 2), c("H1", "H2"))) 525 | ) %>% dplyr::mutate(nbr_authors = vapply(author, function(a) nrow(a), NA_integer_)) 526 | dim(papers) 527 | dupidx <- which(papers$alternative.id %in% papers$alternative.id[duplicated(papers)]) 528 | papers[dupidx, ] %>% arrange(alternative.id) %>% head(n = 10) 529 | 530 | papers <- papers %>% dplyr::distinct() 531 | dim(papers) 532 | 533 | source_track <- c(source_track, 534 | structure(rep("cleanup", length(setdiff(colnames(papers), 535 | names(source_track)))), 536 | names = setdiff(colnames(papers), names(source_track)))) 537 | ``` 538 | 539 | # Tabulate number of missing values 540 | 541 | In some cases, fetching information from (e.g.) 
the GitHub API fails for a 542 | subset of the publications. There are also other reasons for missing values 543 | (for example, the earliest submissions do not have an associated pre-review 544 | issue). The table below lists the number of missing values for each of the 545 | variables in the data frame. 546 | 547 | ```{r nbr-missing, class.source = 'fold-hide'} 548 | DT::datatable( 549 | data.frame(variable = colnames(papers), 550 | nbr_missing = colSums(is.na(papers))) %>% 551 | dplyr::mutate(source = source_track[variable]), 552 | escape = FALSE, rownames = FALSE, 553 | filter = list(position = 'top', clear = FALSE), 554 | options = list(scrollX = TRUE) 555 | ) 556 | ``` 557 | 558 | # Number of published papers per month 559 | 560 | ```{r papers-month, class.source = 'fold-hide', fig.width = 10, message = FALSE} 561 | monthly_pubs <- papers %>% 562 | dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>% 563 | dplyr::group_by(pubmonth) %>% 564 | dplyr::summarize(npub = n()) 565 | ggplot(monthly_pubs, 566 | aes(x = factor(pubmonth), y = npub)) + 567 | geom_bar(stat = "identity") + theme_minimal() + 568 | labs(x = "", y = "Number of published papers per month", caption = dcap) + 569 | theme(axis.title = element_text(size = 15), 570 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 571 | ``` 572 | 573 | ```{r} 574 | DT::datatable( 575 | monthly_pubs %>% 576 | dplyr::rename("Number of papers" = "npub", 577 | "Month of publication" = "pubmonth"), 578 | escape = FALSE, rownames = FALSE, 579 | filter = list(position = 'top', clear = FALSE), 580 | options = list(scrollX = TRUE) 581 | ) 582 | ``` 583 | 584 | # Number of published papers per year 585 | 586 | ```{r papers-year, class.source = 'fold-hide', fig.width = 8, message = FALSE} 587 | yearly_pubs <- papers %>% 588 | dplyr::mutate(pubyear = lubridate::year(published.date)) %>% 589 | dplyr::group_by(pubyear) %>% 590 | dplyr::summarize(npub = n()) 591 | ggplot(yearly_pubs, 592 | 
aes(x = factor(pubyear), y = npub)) + 593 | geom_bar(stat = "identity") + theme_minimal() + 594 | labs(x = "", y = "Number of published papers per year", caption = dcap) + 595 | theme(axis.title = element_text(size = 15), 596 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 597 | ``` 598 | 599 | ```{r} 600 | DT::datatable( 601 | yearly_pubs %>% 602 | dplyr::rename("Number of papers" = "npub", 603 | "Year of publication" = "pubyear"), 604 | escape = FALSE, rownames = FALSE, 605 | filter = list(position = 'top', clear = FALSE), 606 | options = list(scrollX = TRUE) 607 | ) 608 | ``` 609 | 610 | # Number of submissions per month 611 | 612 | We use the number of opened pre-review issues in a month as a proxy for the 613 | number of submissions. 614 | 615 | ```{r submissions-month, class.source = 'fold-hide', fig.width = 10, message = FALSE} 616 | monthly_subs <- prereview_issues |> 617 | dplyr::mutate(submonth = lubridate::floor_date(opened, "month")) |> 618 | dplyr::group_by(submonth) |> 619 | dplyr::summarize(nsub = n()) 620 | ggplot(monthly_subs, 621 | aes(x = factor(submonth), y = nsub)) + 622 | geom_bar(stat = "identity") + theme_minimal() + 623 | labs(x = "", y = "Number of submissions per month", caption = dcap) + 624 | theme(axis.title = element_text(size = 15), 625 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 626 | ``` 627 | 628 | ```{r} 629 | DT::datatable( 630 | monthly_subs |> 631 | dplyr::rename("Number of submissions" = "nsub", 632 | "Month of submission" = "submonth"), 633 | escape = FALSE, rownames = FALSE, 634 | filter = list(position = 'top', clear = FALSE), 635 | options = list(scrollX = TRUE) 636 | ) 637 | ``` 638 | 639 | # Fraction rejected papers 640 | 641 | The plots below illustrate the fraction of pre-review and review issues closed 642 | during each month that have the 'rejected' label attached. 

```{r rejections, class.source = 'fold-hide', fig.width = 10, fig.height = 8, message = FALSE}
## Fraction of closed (pre-)review issues carrying the 'rejected' label,
## per month of closing, faceted by issue type.
ggplot(all_rejected,
       aes(x = factor(closedmonth), y = nbr_rejections/nbr_issues_closed)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  facet_wrap(~ itype, ncol = 1) +
  labs(x = "Month of issue closing", y = "Fraction of issues rejected",
       caption = dcap) +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
```

# Citation distribution

Papers with 20 or more citations are grouped in the ">=20" category.

```{r citation-distribution, class.source = 'fold-hide'}
## Histogram of OpenAlex citation counts; counts >= 20 are collapsed into a
## single ">=20" bin so the bulk of the distribution stays readable.
ggplot(papers %>%
         dplyr::mutate(citation_count = replace(citation_count,
                                                citation_count >= 20, ">=20")) %>%
         dplyr::mutate(citation_count = factor(citation_count,
                                               levels = c(0:20, ">=20"))) %>%
         dplyr::group_by(citation_count) %>%
         dplyr::tally(),
       aes(x = citation_count, y = n)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "OpenAlex citation count", y = "Number of publications", caption = dcap)
```


# Most cited papers

The table below sorts the JOSS papers in decreasing order by the number of
citations in OpenAlex.

```{r most-cited, class.source = 'fold-hide'}
## Render each paper URL as a clickable link (the table is created with
## escape = FALSE so the HTML is interpreted).
## NOTE(review): the <a> markup had been stripped from this line
## (paste0("", url, "")), leaving a no-op concatenation; reconstructed the
## anchor tag here -- confirm against the rendered report.
DT::datatable(
  papers %>%
    dplyr::mutate(url = paste0("<a href='", url, "' target='_blank'>",
                               url, "</a>")) %>%
    dplyr::arrange(desc(citation_count)) %>%
    dplyr::select(title, url, published.date, citation_count),
  escape = FALSE,
  filter = list(position = 'top', clear = FALSE),
  options = list(scrollX = TRUE)
)
```

# Citation count vs time since publication

```{r citations-vs-time, class.source = 'fold-hide', message = FALSE}
## Interactive scatter plot (square-root y scale) of citation count against
## publication date, with hover tooltips showing the paper title.
plotly::ggplotly(
  ggplot(papers, aes(x = published.date, y = citation_count, label = title)) +
    geom_point(alpha = 0.5) + theme_bw() + scale_y_sqrt() +
    geom_smooth() +
    labs(x = "Date of publication", y = "OpenAlex citation count", caption = dcap) +
    theme(axis.title = element_text(size = 15)),
  tooltip = c("label", "x", "y")
)
```

# Power law of citation count within each half year

Here, we plot the citation count for all papers published within each half year,
sorted in decreasing order.

```{r power-law-citations, class.source = 'fold-hide', fig.width = 10, fig.height = 10}
## Within each half year, rank papers by decreasing citation count; the
## global arrange preserves within-group order, so seq_along() gives the
## per-group rank.
ggplot(papers %>% dplyr::group_by(halfyear) %>%
         dplyr::arrange(desc(citation_count)) %>%
         dplyr::mutate(idx = seq_along(citation_count)),
       aes(x = idx, y = citation_count)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ halfyear, scales = "free") +
  theme_bw() +
  labs(x = "Index", y = "OpenAlex citation count", caption = dcap)
```


# Pre-review/review time over time

In these plots we investigate whether the time a submission spends in the
pre-review or review stage (or their sum) has changed over time. The blue curve
corresponds to a rolling median for submissions over 120 days.

```{r smoothing-helpers, class.source = 'fold-hide'}
## Helper functions (modified from https://stackoverflow.com/questions/65147186/geom-smooth-with-median-instead-of-mean)

## Rolling-median smoother usable as a custom `method` for geom_smooth().
## For each unique x value, the fitted y is the median of all y values whose
## x lies strictly within a window of total width `xwindow` centered on x.
## Returns an object of class "rollmed" with the sorted x, the smoothed y,
## and an interpolating function used by predict.rollmed().
rolling_median <- function(formula, data, xwindow = 120, ...) {
  ## Get order of x-values and sort x/y
  ordr <- order(data$x)
  x <- data$x[ordr]
  y <- data$y[ordr]

  ## Initialize vector for smoothed y-values
  ys <- rep(NA, length(x))
  ## Calculate median y-value for each unique x-value
  for (xs in setdiff(unique(x), NA)) {
    ## Get x-values in the window, and calculate median of corresponding y
    j <- ((xs - xwindow/2) < x) & (x < (xs + xwindow/2))
    ys[x == xs] <- median(y[j], na.rm = TRUE)
  }
  y <- ys
  structure(list(x = x, y = y, f = approxfun(x, y)), class = "rollmed")
}

## predict() method for the "rollmed" class: linear interpolation of the
## smoothed values at the requested x positions (used by geom_smooth()).
predict.rollmed <- function(mod, newdata, ...) {
  setNames(mod$f(newdata$x), newdata$x)
}
```

```{r review-time, class.source = 'fold-hide', message = FALSE, warning = FALSE}
## Days spent in pre-review, over time.
ggplot(papers, aes(x = prerev_opened, y = as.numeric(days_in_pre))) +
    geom_point() +
    geom_smooth(formula = y ~ x, method = "rolling_median",
                se = FALSE, method.args = list(xwindow = 120)) +
    theme_bw() +
    labs(x = "Date of pre-review opening", y = "Number of days in pre-review",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))

## Days spent in review, over time.
ggplot(papers, aes(x = review_opened, y = as.numeric(days_in_rev))) +
    geom_point() +
    geom_smooth(formula = y ~ x, method = "rolling_median",
                se = FALSE, method.args = list(xwindow = 120)) +
    theme_bw() +
    labs(x = "Date of review opening", y = "Number of days in review",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))

## Total days (pre-review + review), over time.
ggplot(papers, aes(x = prerev_opened,
                   y = as.numeric(days_in_pre) + as.numeric(days_in_rev))) +
    geom_point() +
    geom_smooth(formula = y ~ x, method = "rolling_median",
                se = FALSE, method.args = list(xwindow = 120)) +
    theme_bw() +
    labs(x = "Date of pre-review opening", y = "Number of days in pre-review + review",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

# Languages

Next, we consider the languages used by the submissions, both as reported by
JOSS and based on the information encoded in available GitHub repositories
(for the latter, we also record the number of bytes of code written in each
language). Note that a given submission can use multiple languages.

```{r languages, class.source = 'fold-hide', fig.width = 9, message = FALSE}
## Language information from JOSS
sspl <- strsplit(papers$languages, ",")
all_languages <- unique(unlist(sspl))
## For each language, count the submissions that list it. FUN.VALUE is
## logical(1) since `%in%` returns a logical (previously `0`, which only
## worked via vapply's logical->double promotion).
langs <- do.call(dplyr::bind_rows, lapply(all_languages, function(l) {
    data.frame(language = l,
               nbr_submissions_JOSS_API = sum(vapply(sspl, function(v) l %in% v,
                                                     logical(1))))
}))

## Language information from GitHub software repos
## Each entry of papers$repo_languages_bytes is "lang1:bytes1,lang2:bytes2,..."
a <- lapply(strsplit(papers$repo_languages_bytes, ","), function(w) strsplit(w, ":"))
## Drop repos without language information (vapply: type-stable sapply).
a <- a[vapply(a, length, integer(1)) > 0]
langbytes <- as.data.frame(t(as.data.frame(a))) %>%
    setNames(c("language", "bytes")) %>%
    dplyr::mutate(bytes = as.numeric(bytes)) %>%
    dplyr::filter(!is.na(language)) %>%
    dplyr::group_by(language) %>%
    dplyr::summarize(nbr_bytes_GitHub = sum(bytes),
                     nbr_repos_GitHub = length(bytes)) %>%
    dplyr::arrange(desc(nbr_bytes_GitHub))

langs <- dplyr::full_join(langs, langbytes, by = "language")
```

```{r language-plot, class.source = 'fold-hide', message = FALSE}
## Languages used by more than 10 submissions, ordered by frequency.
ggplot(langs %>% dplyr::arrange(desc(nbr_submissions_JOSS_API)) %>%
           dplyr::filter(nbr_submissions_JOSS_API > 10) %>%
           dplyr::mutate(language = factor(language, levels = language)),
       aes(x = language, y = nbr_submissions_JOSS_API)) +
    geom_bar(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(x = "", y = "Number of submissions", caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

```{r language-bytes, class.source = 'fold-hide', message = FALSE}
DT::datatable(
    langs %>% dplyr::arrange(desc(nbr_bytes_GitHub)),
    escape = FALSE,
    filter = list(position = 'top', clear = FALSE),
    options = list(scrollX = TRUE)
)
```

```{r language-bytes-plot, class.source = 'fold-hide', message = FALSE, warning = FALSE}
ggplot(langs, aes(x = nbr_repos_GitHub, y = nbr_bytes_GitHub)) +
    geom_point() + scale_x_log10() + scale_y_log10() + geom_smooth() +
    theme_bw() +
    labs(x = "Number of repos using the language",
         y = "Total number of bytes of code\nwritten in the language",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

# Association between number of citations and number of stars of the GitHub repo

```{r citation-stars, class.source = 'fold-hide'}
## Explicit plotly:: qualification, for consistency with the other
## interactive plots in this document.
plotly::ggplotly(
    ggplot(papers, aes(x = citation_count, y = repo_nbr_stars,
                       label = title)) +
        geom_point(alpha = 0.5) + scale_x_sqrt() + scale_y_sqrt() +
        theme_bw() +
        labs(x = "OpenAlex citation count", y = "Number of stars, GitHub repo",
             caption = dcap) +
        theme(axis.title = element_text(size = 15)),
    tooltip = c("label", "x", "y")
)
```

# Distribution of time between GitHub repo creation and JOSS submission

```{r creation-to-submission, class.source = 'fold-hide', warning = FALSE}
ggplot(papers, aes(x = as.numeric(prerev_opened - repo_created))) +
    geom_histogram(bins = 50) +
    theme_bw() +
    labs(x = "Time (days) from repo creation to JOSS pre-review start",
         caption = dcap) +
    theme(axis.title = element_text(size = 15))
```

# Distribution of time between JOSS acceptance and last commit

```{r acceptance-to-commit, class.source = 'fold-hide', warning = FALSE, fig.width = 8, fig.height = 8}
## Negative values are possible if the last push predates review closure.
ggplot(papers, aes(x = as.numeric(repo_pushed - review_closed))) +
    geom_histogram(bins = 50) +
    theme_bw() +
    labs(x = "Time (days) from closure of JOSS review to most recent commit in repo",
         caption = dcap) +
    theme(axis.title = element_text(size = 15)) +
    facet_wrap(~ year(published.date), scales = "free_y")
```

# Number of authors per paper

List the papers with the largest number of authors, and display the distribution
of the number of authors per paper, for papers with at most 20 authors.

```{r nbr-authors-top, class.source = 'fold-show'}
## Papers with largest number of authors
papers %>% dplyr::arrange(desc(nbr_authors)) %>%
    dplyr::select(title, published.date, url, nbr_authors) %>%
    as.data.frame() %>% head(10)
```

```{r nbr-authors, class.source = 'fold-hide', message = FALSE, fig.width = 8, fig.height = 8}
## One bin per distinct author count up to 20.
nbins <- max(papers$nbr_authors[papers$nbr_authors <= 20])
ggplot(papers %>% dplyr::filter(nbr_authors <= 20),
       aes(x = nbr_authors)) +
    geom_histogram(bins = nbins, fill = "lightgrey", color = "grey50") +
    theme_bw() +
    facet_wrap(~ year(published.date), scales = "free_y") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Number of authors",
         y = "Number of publications with\na given number of authors",
         caption = dcap)
```

```{r nbr-authors-all, class.source = 'fold-hide', message = FALSE}
## Per-year fraction of submissions with 1, 2, ..., 5 or >5 authors.
## .drop = FALSE keeps empty year/author-count combinations so that geom_area
## gets a complete grid.
ggplot(papers %>%
           dplyr::mutate(nbr_authors = replace(nbr_authors, nbr_authors > 5, ">5")) %>%
           dplyr::mutate(nbr_authors = factor(nbr_authors, levels = c("1", "2", "3",
                                                                      "4", "5", ">5"))) %>%
           dplyr::mutate(year = year(published.date)) %>%
           dplyr::mutate(year = factor(year)) %>%
           dplyr::group_by(year, nbr_authors, .drop = FALSE) %>%
           dplyr::summarize(n = n()) %>%
           dplyr::mutate(freq = n/sum(n)) %>%
           dplyr::mutate(year = as.integer(as.character(year))),
       aes(x = year, y = freq, fill = nbr_authors)) + geom_area() +
    theme_minimal() +
    scale_fill_brewer(palette = "Set1", name = "Number of\nauthors",
                      na.value = "grey") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Year", y = "Fraction of submissions", caption = dcap)

```

# Number of authors vs number of contributors to the GitHub repo

Note that points are slightly jittered to reduce the overlap.

```{r nbr-authors-contribs, class.source = 'fold-hide', message = FALSE}
plotly::ggplotly(
    ggplot(papers, aes(x = nbr_authors, y = repo_nbr_contribs_2ormore, label = title)) +
        geom_abline(slope = 1, intercept = 0) +
        geom_jitter(width = 0.05, height = 0.05, alpha = 0.5) +
        # geom_point(alpha = 0.5) +
        theme_bw() +
        scale_x_sqrt() + scale_y_sqrt() +
        labs(x = "Number of authors",
             y = "Number of contributors\nwith at least 2 commits",
             caption = dcap) +
        theme(axis.title = element_text(size = 15)),
    tooltip = c("label", "x", "y")
)
```

# Number of reviewers per paper

Submissions associated with rOpenSci and pyOpenSci are not considered here,
since they are not explicitly reviewed at JOSS.

```{r nbr-reviewers, class.source = 'fold-hide', message = FALSE, fig.width = 8, fig.height = 8}
ggplot(papers %>%
           dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
           dplyr::mutate(year = year(published.date)),
       aes(x = nbr_reviewers)) + geom_bar() +
    facet_wrap(~ year) + theme_bw() +
    labs(x = "Number of reviewers", y = "Number of submissions", caption = dcap)
```

# Most active reviewers

Submissions associated with rOpenSci and pyOpenSci are not considered here,
since they are not explicitly reviewed at JOSS.

## All time

```{r most-reviewers, class.source = 'fold-hide', message = FALSE}
## Build an interactive table of reviewers with their number of reviews and
## active timespan, optionally restricted to papers published on or after
## `min_date`. Shared by the "all time" / "past 5 years" / "past year"
## sections below (previously three copies of the same code).
reviewer_table <- function(papers, min_date = NULL) {
    reviewers <- papers %>%
        dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
        dplyr::mutate(year = year(published.date))
    if (!is.null(min_date)) {
        reviewers <- reviewers %>%
            dplyr::filter(as.Date(published.date) >= min_date)
    }
    reviewers <- reviewers %>%
        dplyr::select(reviewers, year) %>%
        tidyr::separate_rows(reviewers, sep = ",")

    ## Most active reviewers
    DT::datatable(
        reviewers %>% dplyr::group_by(reviewers) %>%
            dplyr::summarize(nbr_reviews = length(year),
                             timespan = paste(unique(c(min(year), max(year))),
                                              collapse = " - ")) %>%
            dplyr::arrange(desc(nbr_reviews)),
        escape = FALSE, rownames = FALSE,
        filter = list(position = 'top', clear = FALSE),
        options = list(scrollX = TRUE)
    )
}

reviewer_table(papers)
```

## Past 5 years

```{r most-reviewers-past-5years, class.source = 'fold-hide', message = FALSE}
reviewer_table(papers, min_date = lubridate::today() - 5 * 365.25)
```

## Past year

```{r most-reviewers-past-year, class.source = 'fold-hide', message = FALSE}
reviewer_table(papers, min_date = lubridate::today() - 365.25)
```

# Number of papers per editor and year

```{r papers-per-editor, class.source = 'fold-hide', message = FALSE, fig.width = 16, fig.height = 15}
## Bars are shaded by whether the submission came through rOpenSci/pyOpenSci.
ggplot(papers %>%
           dplyr::mutate(year = year(published.date),
                         `r/pyOpenSci` = factor(
                             grepl("rOpenSci|pyOpenSci", prerev_labels),
                             levels = c("TRUE", "FALSE"))),
       aes(x = editor)) + geom_bar(aes(fill = `r/pyOpenSci`)) +
    theme_bw() + facet_wrap(~ year, ncol = 1) +
    scale_fill_manual(values = c(`TRUE` = "grey65", `FALSE` = "grey35")) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(x = "Editor", y = "Number of submissions", caption = dcap)
```


# Distribution of software repo licenses

```{r repo-license, class.source = 'fold-hide', warning = FALSE, message = FALSE, fig.width = 8, fig.height = 8}
## Order license levels by family (Apache, BSD, MIT, GPL, MPL, then the rest)
## for a stable, readable x-axis. (`<-` rather than `=` for assignment.)
all_licenses <- sort(unique(papers$repo_license))
license_levels <- c(grep("apache", all_licenses, value = TRUE),
                    grep("bsd", all_licenses, value = TRUE),
                    grep("mit", all_licenses, value = TRUE),
                    grep("gpl", all_licenses, value = TRUE),
                    grep("mpl", all_licenses, value = TRUE))
license_levels <- c(license_levels, setdiff(all_licenses, license_levels))
ggplot(papers %>%
           dplyr::mutate(repo_license = factor(repo_license,
                                               levels = license_levels)),
       aes(x = repo_license)) +
    geom_bar() +
    theme_bw() +
    labs(x = "Software license", y = "Number of submissions", caption = dcap) +
    theme(axis.title = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    facet_wrap(~ year(published.date), scales = "free_y")
```

```{r repl-license, class.source = 'fold-show', warning = FALSE, message = FALSE}
## For plots below, replace licenses present in less
## than 2.5% of the submissions by 'other'
tbl <- table(papers$repo_license)
to_replace <- names(tbl[tbl <= 0.025 * nrow(papers)])
```

```{r plot-repo-license, class.source = 'fold-hide', warning = FALSE, message = FALSE}
## Absolute number of submissions per license and year.
ggplot(papers %>%
           dplyr::mutate(year = year(published.date)) %>%
           dplyr::mutate(repo_license = replace(repo_license,
                                                repo_license %in% to_replace,
                                                "other")) %>%
           dplyr::mutate(year = factor(year),
                         repo_license = factor(
                             repo_license,
                             levels = license_levels[license_levels %in% repo_license]
                         )) %>%
           dplyr::group_by(year, repo_license, .drop = FALSE) %>%
           dplyr::count() %>%
           dplyr::mutate(year = as.integer(as.character(year))),
       aes(x = year, y = n, fill = repo_license)) + geom_area() +
    theme_minimal() +
    scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                      na.value = "grey") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Year", y = "Number of submissions", caption = dcap)

## Per-year fraction of submissions per license.
ggplot(papers %>%
           dplyr::mutate(year = year(published.date)) %>%
           dplyr::mutate(repo_license = replace(repo_license,
                                                repo_license %in% to_replace,
                                                "other")) %>%
           dplyr::mutate(year = factor(year),
                         repo_license = factor(
                             repo_license,
                             levels = license_levels[license_levels %in% repo_license]
                         )) %>%
           dplyr::group_by(year, repo_license, .drop = FALSE) %>%
           dplyr::summarize(n = n()) %>%
           dplyr::mutate(freq = n/sum(n)) %>%
           dplyr::mutate(year = as.integer(as.character(year))),
       aes(x = year, y = freq, fill = repo_license)) + geom_area() +
    theme_minimal() +
    scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                      na.value = "grey") +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Year", y = "Fraction of submissions", caption = dcap)
```

# Most common GitHub repo topics

```{r github-topics, class.source = 'fold-hide', fig.width = 12, fig.height = 12, warning = FALSE, message = FALSE}
## Word cloud (sqrt-scaled frequencies) plus a searchable frequency table.
a <- unlist(strsplit(papers$repo_topics, ","))
a <- a[!is.na(a)]
topicfreq <- table(a)

colors <- viridis::viridis(100)
set.seed(1234)
wordcloud::wordcloud(
    names(topicfreq), sqrt(topicfreq), min.freq = 1, max.words = 300,
    random.order = FALSE, rot.per = 0.05, use.r.layout = FALSE,
    colors = colors, scale = c(10, 0.1), random.color = TRUE,
    ordered.colors = FALSE, vfont = c("serif", "plain")
)

DT::datatable(as.data.frame(topicfreq) %>%
                  dplyr::rename(topic = a, nbr_repos = Freq) %>%
                  dplyr::arrange(desc(nbr_repos)),
              escape = FALSE, rownames = FALSE,
              filter = list(position = 'top', clear = FALSE),
              options = list(scrollX = TRUE))
```


# Citation analysis

Here, we take a more detailed look at the papers that cite JOSS papers, using
data from the Open Citations Corpus.

## Get citing papers for each submission

```{r get-citing-papers, class.source = 'fold-show', warning = FALSE, message = FALSE}
## Split into several queries
## Randomize the splitting since a whole query may fail if one ID is not recognized
papidx <- seq_len(nrow(papers))
idxL <- split(sample(papidx, length(papidx), replace = FALSE), ceiling(papidx / 50))
citationsL <- lapply(idxL, function(idx) {
    tryCatch({
        citecorp::oc_coci_cites(doi = papers$alternative.id[idx]) %>%
            dplyr::distinct() %>%
            dplyr::mutate(citation_info_obtained = as.character(lubridate::today()))
    }, error = function(e) {
        ## Best-effort: a failed batch is simply dropped below.
        NULL
    })
})
citationsL <- citationsL[vapply(citationsL, function(df) !is.null(df) && nrow(df) > 0, FALSE)]
if (length(citationsL) > 0) {
    citations <- do.call(dplyr::bind_rows, citationsL)
} else {
    citations <- NULL
}
dim(citations)

if (!is.null(citations) && is.data.frame(citations) && "oci" %in% colnames(citations)) {
    ## Keep only citations not already in the archive, with a non-empty citing DOI.
    citations <- citations %>%
        dplyr::filter(!(oci %in% citations_archive$oci) &
                          citing != "")

    tmpj <- rcrossref::cr_works(dois = unique(citations$citing))$data %>%
        dplyr::select(contains("doi"), contains("container.title"), contains("issn"),
                      contains("type"), contains("publisher"), contains("prefix"))
    citations <- citations %>% dplyr::left_join(tmpj, by = c("citing" = "doi"))

    ## bioRxiv preprints don't have a 'container.title' or 'issn', but we'll assume
    ## that they can be identified from the prefix 10.1101 - set the container.title
    ## for these records manually; we may or may not want to count these
    ## (would it count citations twice, both preprint and publication?)
    citations$container.title[citations$prefix == "10.1101"] <- "bioRxiv"

    ## JOSS is represented by 'The Journal of Open Source Software' as well as
    ## 'Journal of Open Source Software'
    citations$container.title[citations$container.title ==
                                  "Journal of Open Source Software"] <-
        "The Journal of Open Source Software"

    ## Remove real self citations (cited DOI = citing DOI)
    citations <- citations %>% dplyr::filter(cited != citing)

    ## Merge with the archive
    citations <- dplyr::bind_rows(citations, citations_archive)
} else {
    citations <- citations_archive
    if (is.null(citations[["citation_info_obtained"]])) {
        citations$citation_info_obtained <- NA_character_
    }
}

## Records predating the tracking of retrieval dates get the date on which
## this column was introduced.
citations$citation_info_obtained[is.na(citations$citation_info_obtained)] <-
    "2021-08-11"

write.table(citations, file = "joss_submission_citations.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
```

## Summary statistics

```{r citation-summary-stats, class.source = 'fold-show', warning = FALSE, message = FALSE}
## Latest successful update of new citation data
max(as.Date(citations$citation_info_obtained))

## Number of JOSS papers with >0 citations included in this collection
length(unique(citations$cited))

## Number of JOSS papers with >0 citations according to OpenAlex
length(which(papers$citation_count > 0))
```

```{r citation-merge, class.source = 'fold-hide', warning = FALSE, message = FALSE}
## Number of citations from Open Citations Corpus vs OpenAlex.
## The NA->0 replacement must happen *after* the full_join (papers without any
## Open Citations record get n = NA from the join); previously it was applied
## to the tally() result before joining, where no NAs can occur, so it had no
## effect.
df0 <- papers %>% dplyr::select(doi, citation_count) %>%
    dplyr::full_join(citations %>% dplyr::group_by(cited) %>%
                         dplyr::tally(),
                     by = c("doi" = "cited")) %>%
    dplyr::mutate(n = replace(n, is.na(n), 0))
```

```{r citation-fraction, class.source = 'fold-show', warning = FALSE, message = FALSE}
## Total citation count OpenAlex
sum(df0$citation_count, na.rm = TRUE)

## Total citation count Open Citations Corpus
sum(df0$n, na.rm = TRUE)

## Ratio of total citation count Open Citations Corpus/OpenAlex
sum(df0$n, na.rm = TRUE)/sum(df0$citation_count, na.rm = TRUE)
```

```{r citation-plot-crossref, class.source = 'fold-hide', warning = FALSE, message = FALSE}
ggplot(df0, aes(x = citation_count, y = n)) +
    geom_abline(slope = 1, intercept = 0) +
    geom_point(size = 3, alpha = 0.5) +
    labs(x = "OpenAlex citation count", y = "Open Citations Corpus citation count",
         caption = dcap) +
    theme_bw()

## Zoom in
ggplot(df0, aes(x = citation_count, y = n)) +
    geom_abline(slope = 1, intercept = 0) +
    geom_point(size = 3, alpha = 0.5) +
    labs(x = "OpenAlex citation count", y = "Open Citations Corpus citation count",
         caption = dcap) +
    theme_bw() +
    coord_cartesian(xlim = c(0, 75), ylim = c(0, 75))
```

```{r citation-nbr-journals, class.source = 'fold-show'}
## Number of journals citing JOSS papers
length(unique(citations$container.title))
length(unique(citations$issn))
```

## Most citing journals

```{r citation-top-journals, class.source = 'fold-hide', message = FALSE}
topcit <- citations %>% dplyr::group_by(container.title) %>%
    dplyr::summarize(nbr_citations_of_joss_papers = length(cited),
                     nbr_cited_joss_papers = length(unique(cited)),
                     nbr_citing_papers = length(unique(citing)),
                     nbr_selfcitations_of_joss_papers = sum(author_sc == "yes"),
                     fraction_selfcitations =
                         signif(nbr_selfcitations_of_joss_papers /
                                    nbr_citations_of_joss_papers, digits = 3)) %>%
    dplyr::arrange(desc(nbr_cited_joss_papers))
DT::datatable(topcit,
              escape = FALSE, rownames = FALSE,
              filter = list(position = 'top', clear = FALSE),
              options = list(scrollX = TRUE))
```

```{r citation-journals-plot, class.source = 'fold-hide'}
plotly::ggplotly(
    ggplot(topcit, aes(x = nbr_citations_of_joss_papers, y = nbr_cited_joss_papers,
                       label = container.title)) +
        geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") +
        geom_point(size = 3, alpha = 0.5) +
        theme_bw() +
        labs(caption = dcap, x = "Number of citations of JOSS papers",
             y = "Number of cited JOSS papers")
)
plotly::ggplotly(
    ggplot(topcit, aes(x = nbr_citations_of_joss_papers, y = nbr_cited_joss_papers,
                       label = container.title)) +
        geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") +
        geom_point(size = 3, alpha = 0.5) +
        theme_bw() +
        coord_cartesian(xlim = c(0, 100), ylim = c(0, 50)) +
        labs(caption = dcap, x = "Number of citations of JOSS papers",
             y = "Number of cited JOSS papers")
)
```

```{r}
write.table(topcit, file = "joss_submission_citations_byjournal.tsv",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
```


# Save object

The tibble object with all data collected above is serialized to a file that
can be downloaded and reused.

```{r save-data}
## Preview the collected data, then serialize the full tibble so that it can
## be downloaded and reused (it is published on the gh-pages branch).
head(papers) %>% as.data.frame()
saveRDS(papers, file = "joss_submission_analytics.rds")
```

To read the current version of this file directly from GitHub, use the
following code:

```{r, class.source = 'fold-show', eval = FALSE}
papers <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true")))
```

# Session info

```{r session-info}
sessionInfo()
```