├── .Rbuildignore
├── .github
│   ├── .gitignore
│   ├── dependabot.yaml
│   └── workflows
│       ├── R-CMD-check.yaml
│       └── pkgdown.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
│   ├── bind_rows_2.R
│   ├── cursor_crawl.R
│   ├── doi_lookup.R
│   ├── open_alex_export.R
│   ├── open_alex_restclient.R
│   ├── openalex-package.R
│   ├── rectangularize.R
│   └── utils-pipe.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── data-raw
│   └── DATASET.R
├── data
│   └── topics.rda
├── man
│   ├── openalex-package.Rd
│   ├── openalex_api.Rd
│   ├── openalex_attribution.Rd
│   ├── openalex_counts.Rd
│   ├── openalex_crawl.Rd
│   ├── openalex_doi_lookup.Rd
│   ├── openalex_flatten_long.Rd
│   ├── openalex_key.Rd
│   ├── openalex_kth_rawaff_query.Rd
│   ├── openalex_polite.Rd
│   ├── openalex_topics.Rd
│   ├── openalex_work.Rd
│   ├── openalex_works_created_since.Rd
│   ├── openalex_works_cursorcrawl.Rd
│   ├── openalex_works_export.Rd
│   ├── openalex_works_published_since.Rd
│   ├── openalex_works_updated_since.Rd
│   ├── openalex_write_duckdb.Rd
│   ├── pipe.Rd
│   ├── topics.Rd
│   └── wos_plaintext_for_diva.Rd
├── openalex.Rproj
└── tests
    ├── testthat.R
    └── testthat
        ├── test-crawl.R
        ├── test-cursorcrawl.R
        ├── test-dois.R
        ├── test-export.R
        ├── test-freeze.R
        └── test-open_alex_restclient.R

--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^LICENSE\.md$
2 | ^\.github$
3 | ^data-raw$
4 | ^_pkgdown\.yml$
5 | ^docs$
6 | ^pkgdown$
7 | ^.*\.Rproj$
8 | ^\.Rproj\.user$
9 | ^README\.Rmd$

--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html

--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "daily"

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'release'} 24 | 25 | env: 26 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 27 | R_KEEP_PKG_SOURCE: yes 28 | 29 | steps: 30 | - uses: actions/checkout@v4 31 | 32 | - uses: r-lib/actions/setup-pandoc@v2 33 | 34 | - uses: r-lib/actions/setup-r@v2 35 | with: 36 | r-version: ${{ matrix.config.r }} 37 | http-user-agent: ${{ matrix.config.http-user-agent }} 38 | use-public-rspm: true 39 | 40 | - uses: r-lib/actions/setup-r-dependencies@v2 41 | with: 42 | extra-packages: any::rcmdcheck 43 | needs: check 44 | 45 | - uses: r-lib/actions/check-r-package@v2 46 | with: 47 | upload-snapshots: true 48 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.7.3 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | docs 6 | 7 | /.quarto/ 8 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: openalex 2 | Title: Data from OpenAlex REST API 3 | Version: 0.0.5 4 | Authors@R: 5 | person(given = "Markus", 6 | family = "Skyttner", 7 | role = c("cre", "aut"), 8 | email = "markussk@kth.se") 9 | Description: The OpenAlex website provides open data on 10 | papers/works, venues, institutions and more around the world under the CC0 license. 11 | This R package provides some functions to access data from the OpenAlex REST API. 
12 | License: MIT + file LICENSE 13 | Encoding: UTF-8 14 | LazyData: true 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.3.2 17 | Imports: 18 | httr, 19 | magrittr, 20 | utils, 21 | dplyr, 22 | purrr, 23 | progress, 24 | jsonlite, 25 | tibble, 26 | tidyr, 27 | lubridate, 28 | DBI, 29 | duckdb, 30 | httr2, 31 | readr, 32 | RcppSimdJson, 33 | jqr 34 | Suggests: 35 | testthat (>= 3.0.0) 36 | Config/testthat/edition: 3 37 | URL: https://kth-library.github.io/openalex/ 38 | Depends: 39 | R (>= 2.10) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: openalex authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 openalex authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
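Taken together, the DESCRIPTION above and the NAMESPACE below suggest a minimal getting-started sketch (the work identifier is the example id used in the OpenAlex API documentation, not anything specific to this package):

library(openalex)
openalex_polite("you@example.com")   # join the polite pool for faster requests
w <- openalex_work("W2741809807")    # fetch a single work as a flattened tibble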
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export("%>%")
4 | export(openalex_api)
5 | export(openalex_attribution)
6 | export(openalex_counts)
7 | export(openalex_crawl)
8 | export(openalex_doi_lookup)
9 | export(openalex_flatten_long)
10 | export(openalex_key)
11 | export(openalex_kth_rawaff_query)
12 | export(openalex_polite)
13 | export(openalex_topics)
14 | export(openalex_work)
15 | export(openalex_works_created_since)
16 | export(openalex_works_published_since)
17 | export(openalex_works_updated_since)
18 | import(DBI)
19 | import(dplyr)
20 | import(duckdb)
21 | import(httr2)
22 | import(purrr)
23 | import(tidyr)
24 | importFrom(RcppSimdJson,fload)
25 | importFrom(RcppSimdJson,fminify)
26 | importFrom(RcppSimdJson,fparse)
27 | importFrom(dplyr,bind_cols)
28 | importFrom(dplyr,bind_rows)
29 | importFrom(dplyr,distinct)
30 | importFrom(dplyr,mutate)
31 | importFrom(dplyr,rename)
32 | importFrom(dplyr,select)
33 | importFrom(dplyr,starts_with)
34 | importFrom(dplyr,tibble)
35 | importFrom(httr,GET)
36 | importFrom(httr,content)
37 | importFrom(httr,http_type)
38 | importFrom(httr,modify_url)
39 | importFrom(httr,status_code)
40 | importFrom(httr,user_agent)
41 | importFrom(jqr,jq)
42 | importFrom(jsonlite,fromJSON)
43 | importFrom(lubridate,as_date)
44 | importFrom(lubridate,format_ISO8601)
45 | importFrom(magrittr,"%>%")
46 | importFrom(progress,progress_bar)
47 | importFrom(purrr,keep)
48 | importFrom(purrr,map)
49 | importFrom(purrr,map_df)
50 | importFrom(purrr,map_dfr)
51 | importFrom(purrr,pluck)
52 | importFrom(purrr,pmap)
53 | importFrom(purrr,possibly)
54 | importFrom(purrr,walk2)
55 | importFrom(readr,read_csv)
56 | importFrom(stats,setNames)
57 | importFrom(tibble,enframe)
58 | importFrom(tidyr,hoist)
59 | importFrom(tidyr,unnest)
60 | importFrom(tidyr,unnest_longer)
61 | importFrom(tidyr,unnest_wider)
62 | importFrom(utils,URLencode)
63 | importFrom(utils,tail)
64 |
--------------------------------------------------------------------------------
/R/bind_rows_2.R:
--------------------------------------------------------------------------------
1 | bind_rows2 <- function(l) {
2 |
3 |   # checks
4 |   stopifnot(is.list(l))
5 |
6 |   # get vars
7 |   v <- unlist(
8 |     lapply(unname(l), \(df) vapply(df, typeof, character(1)))
9 |   )
10 |   nm <- names(v)
11 |   nm0 <- unique(nm[duplicated(nm)])
12 |
13 |   # get list of columns with diff types in diff datasets
14 |   # TODO: using reduce with intersection here might make this all easier to read.
15 |   x <- stats::setNames(lapply(nm0, \(x) unique(v[nm == x])), nm0)
16 |   x0 <- x[!sapply(x, \(x) length(unique(x)) == 1)]
17 |
18 |   # Convert to highest in type hierarchy
19 |   type_hierarchy <- c("logical" = 1, "integer" = 2, "double" = 3, "character" = 4)
20 |   conv_funs <- lapply(x0, \(x)
21 |     switch(max(type_hierarchy[x]),
22 |       match.fun(as.logical),
23 |       match.fun(as.integer),
24 |       match.fun(as.double),
25 |       match.fun(as.character)
26 |     )
27 |   )
28 |   l1 <- lapply(l, \(df) {
29 |     f <- conv_funs[names(conv_funs) %in% names(df)]
30 |     for (i in seq_along(f)) {
31 |       df[[names(f[i])]] <- f[[i]](df[[names(f[i])]])
32 |     }
33 |     df
34 |   })
35 |
36 |   # bind rows and return
37 |   dplyr::bind_rows(l1)
38 | }
39 |
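A quick illustration (hypothetical data, not part of the package) of the problem bind_rows2() solves: dplyr::bind_rows() refuses to combine a column whose type differs between data frames, so the helper first coerces such columns to the highest type in the hierarchy above:

l <- list(
  data.frame(id = 1L, score = TRUE),
  data.frame(id = 2L, score = 0.5),
  data.frame(id = 3L, score = "n/a")
)
# dplyr::bind_rows(l) would error on 'score'; bind_rows2() unifies it to character
bind_rows2(l)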
--------------------------------------------------------------------------------
/R/cursor_crawl.R:
--------------------------------------------------------------------------------
1 | #' Crawl multipage responses from queries against the API
2 | #'
3 | #' Chunks and uses cursor based pagination to fetch works
4 | #' @param works_filter the works filter
5 | #' @param n_max_pages the maximum number of pages to fetch (50 per page)
6 | #' @return paths to downloaded files
7 | #' @importFrom RcppSimdJson fminify fparse fload
8 | #' @importFrom jqr jq
9 | openalex_works_cursorcrawl <- function(
10 |   works_filter,
11 |   n_max_pages = 5
12 | ) {
13 |
14 |   req_works <-
15 |     "https://api.openalex.org/" |>
16 |     httr2::request() |>
17 |     httr2::req_url_path("works")
18 |
19 |   # initially, cursor is set to "*"
20 |   q <- list(
21 |     filter = works_filter,
22 |     cursor = "*",
23 |     `per-page` = 50
24 |   )
25 |
26 |   # function to get works based on query params
27 |   fetch_works <- function(q) {
28 |     req_works |>
29 |       httr2::req_url_query(!!!q) |>
30 |       httr2::req_perform() |>
31 |       httr2::resp_body_string() |>
32 |       RcppSimdJson::fminify()
33 |   }
34 |
35 |   # get the first page of results
36 |   json_line <- fetch_works(q)
37 |
38 |   json_header <- function(j) {
39 |     j |> RcppSimdJson::fparse(query = "/meta", max_simplify_lvl = "list")
40 |   }
41 |
42 |   json_results <- function(j) {
43 |
44 |     #cmd <- sprintf("%s -c '.results[]' | %s -c 'del(..|.abstract_inverted_index?)'",
45 |     #  jq_binary, jq_binary)
46 |
47 |     #system(cmd, input = j, intern = TRUE) #|>
48 |     j |> jqr::jq(".results[] | del(..|.abstract_inverted_index?)")
49 |   }
50 |
51 |   #TODO: exclude abstract_inverted_index
52 |   # Using JSONPath: $.*[?(@.abstract_inverted_index == null)]
53 |
54 |   header <- json_line |> json_header()
55 |   results <- json_line |> json_results()
56 |
57 |   # page <-
58 |   #   json_line |>
59 |   #   RcppSimdJson::fparse("/results", max_simplify_lvl = "list") |>
60 |   #   (\(x) list(list(results = x)))()
61 |
62 |   #page |> openalex_works_to_tbls()
63 |
64 |   # compute total number of pages
65 |   h <- header
66 |   n_pages <- ceiling(h$count / h$per_page)
67 |
68 |   # begin the crawl
69 |   message("Retrieving ", min(n_max_pages, n_pages), " out of a total of ",
70 |     n_pages, " pages having a total record count of ", h$count,
71 |     ". Starting crawl...")
72 |
73 |   # iterate using a while loop
74 |   i <- 1
75 |   is_stopped <- FALSE
76 |   is_done <- n_pages <= 1
77 |   q$cursor <- h$next_cursor
78 |   td <- tempdir()
79 |   unlink(dir(td, pattern = "\\.json$", full.names = TRUE))
80 |   fn <- file.path(td, sprintf("%04i%s", i, ".json"))
81 |   readr::write_lines(results, fn)
82 |   #message("Wrote page ", i, " to ", fn, " and next cursor is ", q$cursor)
83 |   #readr::write_rds(page, file = fn)
84 |   #message("Cursor: ", q$cursor)
85 |
86 |   while (!is_done) {
87 |     i <- i + 1
88 |     if (i %% 100 == 0) cat("HUNDREDS_OF_PAGES!!!!\n") else {
89 |       if (i %% 10 == 0) cat(paste(i, q$cursor, "\n")) else cat(".")
90 |     }
91 |     next_page <- fetch_works(q)
92 |     #stopifnot(!is.null(next_page))
93 |     h <- json_header(next_page)
94 |     q$cursor <- h$next_cursor
95 |     fn <- file.path(td, sprintf("%04i%s", i, ".json"))
96 |     if (file.exists(fn)) unlink(fn)
97 |
98 |     results <- json_results(next_page)
99 |     #message("Batch: #", i, " ", length(results))
100 |     if (length(results) > 0) {
101 |       readr::write_lines(results, fn, append = TRUE)
102 |     }
103 |     is_stopped <- i >= min(n_max_pages, n_pages)
104 |     if (is_stopped)
105 |       message("Stopped, next cursor is: ", q$cursor)
106 |     is_done <- is.null(q$cursor) || is_stopped
107 |   }
108 |
109 |   filez <- dir(td, pattern = "\\.json$", full.names = TRUE)
110 |   message("\nDone, fetched ", length(filez), " pages of works, written to ", td)
111 |
112 |   return(filez)
113 |
114 | }
115 |
116 | jsonl_to_tbl <- function(fn) {
117 |   obj <- fn |> RcppSimdJson::fload(max_simplify_lvl = "list")
118 |   res <- list(results = obj)
119 |   res |> parse_work2()
120 | }
121 |
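A usage sketch for the cursor crawl above (openalex_works_cursorcrawl() is not exported, and the filter string is just a plausible example; parse_work2() is defined elsewhere in the package):

files <- openalex_works_cursorcrawl(
  works_filter = "authorships.institutions.lineage:i86987016,publication_year:2024",
  n_max_pages = 3
)
# each downloaded page is a JSONL file; parse them into tables
tbls <- files |> lapply(jsonl_to_tbl)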
--------------------------------------------------------------------------------
/R/doi_lookup.R:
--------------------------------------------------------------------------------
1 | split_chunks_of_n <- function(x, n)
2 |   split(x, ceiling(seq_along(x) / n))
3 |
4 | split_n_chunks <- function(x, n)
5 |   split(x, ceiling(seq_along(x) / (length(x) / n)))
6 |
7 | openalex_or <- function(x)
8 |   paste0(collapse = "|", x)
9 |
10 | doi_crawl <- function(dois) {
11 |
12 |   works <-
13 |     openalex_crawl("works", fmt = "object",
14 |       query = openalex_query(filter = paste0("doi:", dois))
15 |     )
16 |
17 |   lol <-
18 |     list(list(results = reduce(works |> map("results"), c)))
19 |
20 |   lol |> openalex_works_to_tbls()
21 |
22 | }
23 |
24 | doi_lookup_identifiers <- function(con, doi_filter) {
25 |
26 |   if (missing(con)) {
27 |     con <- duckdb::dbConnect(duckdb::duckdb())
28 |     DBI::dbSendQuery(con, "install json; load json; install httpfs; load httpfs;")
29 |     on.exit(duckdb::dbDisconnect(con, shutdown = TRUE))
30 |   }
31 |
32 |   sql <-
33 |     paste0("from (from read_json_auto('",
34 |       sprintf("https://api.openalex.org/works?filter=doi:%s&per-page=50&mailto=support@openalex.org", doi_filter),
35 |       "') select unnest(results) as r) select unnest(r.ids);")
36 |
37 |   DBI::dbGetQuery(con, sql) |> as_tibble()
38 | }
39 |
40 | #' Lookup DOIs using OpenAlex
41 | #'
42 | #' @param dois a character vector of DOIs
43 | #' @param resolution either "all" or "identifiers" to only return other related identifiers
44 | #' @return tibble(s)
45 | #' @export
46 | openalex_doi_lookup <- function(dois, resolution = c("all", "identifiers")) {
47 |   resolution <- match.arg(resolution)
48 |   dois <- unique(dois)
49 |
50 |   doi_filters <-
51 |     split_chunks_of_n(dois, 50) |>
52 |     map_chr(openalex_or)
53 |
54 |   doi_chunks <- switch(resolution,
55 |     "all" = {
56 |       doi_filters |>
57 |         purrr::map(doi_crawl, .progress = TRUE)
58 |     },
59 |     "identifiers" = {
60 |       doi_filters |>
61 |         map_dfr(\(x) doi_lookup_identifiers(doi_filter = x), .progress = TRUE)
62 |     }
63 |   )
64 |
65 |   doi_chunks
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/R/open_alex_export.R:
--------------------------------------------------------------------------------
1 | #' Use OpenAlex API for exporting data in tabular and wos formats
2 | #' @param q the query, for example "authorships.institutions.lineage:i86987016,authorships.institutions.lineage:!i4210161097,type:types/article,primary_location.source.type:source-types/journal|source-types/conference,publication_year:2023"
3 | #' @param fmt the export format, one of "csv" or "wos-plaintext"
4 | #' @param raw_string boolean to indicate whether a raw string should be returned
5 | #' @return a character vector with a raw string with the results from the export or a data frame
6 | #' @import httr2
7 | #' @importFrom dplyr bind_cols
8 | #' @importFrom readr read_csv
9 | openalex_works_export <- function(q, fmt = c("csv", "wos-plaintext"), raw_string = FALSE) {
10 |   fmt <- match.arg(fmt)
11 |   query <- list(filter = q)
12 |   query$format <- fmt
13 |   query$truncate <- "false"
14 |   query$api_key <- cfg()$key
15 |
16 |   ep <-
17 |     "https://export.openalex.org" |>
18 |     httr2::request() |>
19 |     httr2::req_url_path("works") |>
20 |     httr2::req_user_agent(cfg()$user_agent) |>
21 |     httr2::req_url_query(!!!query)
22 |
23 |   check_progress <- function() {
24 |     ep |> httr2::req_perform() |> httr2::resp_body_json() |> dplyr::bind_cols()
25 |   }
26 |
27 |   res <- check_progress()
28 |
29 |   message("Waiting for export to be generated ...\n")
30 |
31 |   while (res$status != "finished") {
32 |     Sys.sleep(5)
33 |     res <- check_progress()
34 |     message(sprintf("%0.1f%%", as.double(res$progress) * 100), " (", res$status, ")")
35 |   }
36 |
37 |   message("Export is ready, retrieving results.")
38 |   out <-
39 |     httr2::request(res$result_url) |>
40 |     httr2::req_perform() |>
41 |     httr2::resp_body_string()
42 |
43 |   message("Done, returning results")
44 |   if (raw_string) return(out)
45 |
46 |   res <- switch(fmt,
47 |     "csv" = {
48 |       out |> readr::read_csv(show_col_types = FALSE)
49 |     },
50 |     # "wos-plaintext" = {
51 |     #   out |> strsplit(split = "\n") |> unlist() |> read_wos_plaintext()
52 |     # },
53 |     "wos-plaintext" = {
54 |       out |> wos_plaintext_for_diva()
55 |     }
56 |   )
57 |   return(res)
58 | }
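A sketch of how the export endpoint above might be called (it polls until the export job reports status "finished"; an API key set via openalex_key() is assumed, and the filter reuses part of the example from the roxygen block):

works_2023 <- openalex_works_export(
  q = "authorships.institutions.lineage:i86987016,publication_year:2023",
  fmt = "csv"
)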
59 |
60 | #' Function which converts a wos_plaintext-string into a format
61 | #' which can be uploaded to DiVA, by adding ER tags
62 | #' (including a blank line) after each record
63 | #' @param x character string with "wos-plaintext" format as returned from OpenAlex export API endpoint
64 | #' @importFrom stats setNames
65 | #' @importFrom utils tail
66 | wos_plaintext_for_diva <- function(x) {
67 |   w <- x |> strsplit("\n") |> unlist()
68 |   i_header <- which(grepl("^FN|^VR", w))
69 |   #i_indented <- which(grepl("^\\s+", w))
70 |   i_eor <- which(grepl("^ER$", w))
71 |   i_blank <- which(nchar(w) == 0)
72 |
73 |   pt <- w[-c(i_eor, i_blank)] # TODO: should i_header rows be removed too?
74 |   i_record <- which(grepl("^PT\\s+", pt))
75 |   n_records <- length(i_record)
76 |   i_range <- data.frame(beg = i_record, end = c(tail(i_record, -1) - 1, length(pt)))
77 |   pt[i_range$end] <- pt[i_range$end] |> paste0("\nER\n")
78 |   paste0(collapse = "\n", pt)
79 | }
80 |
81 | #' Export the results from a crawl as a duckdb database file
82 | #' @param crawl the results from running the to_tbls fcn
83 | #' @param destdir the location to save the database file
84 | #' @return file path to the database file
85 | #' @importFrom purrr walk
86 | #' @import duckdb DBI
87 | openalex_write_duckdb <- function(crawl, destdir = NULL) {
88 |
89 |   if (!requireNamespace("duckdb", quietly = TRUE)) {
90 |     stop(
91 |       "Package \"duckdb\" must be installed to use this function.",
92 |       call. = FALSE
93 |     )
94 |   }
95 |
96 |   if (is.null(destdir)) {
97 |     destdir <- file.path(tempdir(check = TRUE), "openalex", "openalex.db")
98 |   }
99 |
100 |   message("Ensure existing dir: ", dirname(destdir))
101 |   if (!dir.exists(dirname(destdir))) {
102 |     is_created <- dir.create(dirname(destdir), showWarnings = TRUE)
103 |   } else {
104 |     message("Removing existing file ", destdir)
105 |     if (file.exists(destdir))
106 |       unlink(destdir)
107 |   }
108 |
109 |   drv <- duckdb::duckdb()
110 |   con <- duckdb::dbConnect(drv, dbdir = destdir)
111 |   on.exit(DBI::dbDisconnect(con, shutdown = TRUE))
112 |
113 |   crawl |> names() |>
114 |     purrr::walk(\(x) duckdb::duckdb_register(con, sprintf("view_%s", x), crawl |> getElement(x)))
115 |
116 |   toc <- DBI::dbListTables(con)
117 |   new_tbl <- gsub("^view_", "", toc)
118 |
119 |   sql_create_db <- sprintf("create table %s as from %s;", new_tbl, toc) |>
120 |     paste(collapse = "\n")
121 |
122 |   message("Creating duckdb file at ", destdir, " using sql ", sql_create_db)
123 |   result <- DBI::dbExecute(con, sql_create_db)
124 |   message("Result is ", result)
125 |
126 |   return(destdir)
127 |
128 | }
129 |
130 | openalex_fields <- function() {
131 |   paste0(
132 |     "abstract.search, abstract.search.no_stem, apc_list.currency, apc_list.provenance, ",
133 |     "apc_list.value, apc_list.value_usd, apc_paid.currency, apc_paid.provenance, apc_paid.value, ",
134 |     "apc_paid.value_usd, author.id, author.orcid, authors_count, ",
135 |     "authorships.affiliations.institution_ids, authorships.author.id, authorships.author.orcid, ",
136 |     "authorships.countries, authorships.institutions.continent, authorships.institutions.country_code, ",
137 |     "authorships.institutions.id, authorships.institutions.is_global_south, ",
138 |     "authorships.institutions.lineage, authorships.institutions.ror, authorships.institutions.type, ",
139 |     "authorships.is_corresponding, best_oa_location.is_accepted, best_oa_location.is_oa, ",
140 |     "best_oa_location.is_published, best_oa_location.landing_page_url, best_oa_location.license, ",
141 |     "best_oa_location.license_id, best_oa_location.source.host_organization, ",
142 |     "best_oa_location.source.host_organization_lineage, best_oa_location.source.id, ",
143 |     "best_oa_location.source.is_in_doaj, best_oa_location.source.is_oa, best_oa_location.source.issn, ",
144 |     "best_oa_location.source.type, best_oa_location.version, best_open_version, biblio.first_page, ",
145 |     "biblio.issue, biblio.last_page, biblio.volume, citation_normalized_percentile.is_in_top_10_percent, ",
146 |     "citation_normalized_percentile.is_in_top_1_percent, citation_normalized_percentile.value, ",
147 |     "cited_by, cited_by_count, cited_by_percentile_year.max, cited_by_percentile_year.min, ",
148 |     "cites, concept.id, concepts.id, 
concepts.wikidata, concepts_count, corresponding_author_ids, ", 149 | "corresponding_institution_ids, countries_distinct_count, datasets, default.search, ", 150 | "display_name, display_name.search, display_name.search.no_stem, doi, doi_starts_with, ", 151 | "from_created_date, from_publication_date, fulltext.search, fulltext_origin, fwci, ", 152 | "grants.award_id, grants.funder, has_abstract, has_doi, has_embeddings, has_fulltext, ", 153 | "has_oa_accepted_or_published_version, has_oa_submitted_version, has_old_authors, has_orcid, ", 154 | "has_pdf_url, has_pmcid, has_pmid, has_raw_affiliation_strings, has_references, ids.mag, ", 155 | "ids.openalex, ids.pmcid, ids.pmid, indexed_in, institution.id, institution_assertions.country_code, ", 156 | "institution_assertions.id, institution_assertions.lineage, institution_assertions.ror, ", 157 | "institution_assertions.type, institutions.continent, institutions.country_code, institutions.id, ", 158 | "institutions.is_global_south, institutions.ror, institutions.type, institutions_distinct_count, ", 159 | "is_corresponding, is_oa, is_paratext, is_retracted, journal, keyword.search, keywords.id, ", 160 | "language, locations.is_accepted, locations.is_oa, locations.is_published, locations.landing_page_url, ", 161 | "locations.license, locations.license_id, locations.source.has_issn, ", 162 | "locations.source.host_institution_lineage, locations.source.host_organization, ", 163 | "locations.source.host_organization_lineage, locations.source.id, ", 164 | "locations.source.is_core, locations.source.is_in_doaj, locations.source.is_oa, ", 165 | "locations.source.issn, locations.source.publisher_lineage, locations.source.type, ", 166 | "locations.version, locations_count, mag, mag_only, oa_status, ", 167 | "open_access.any_repository_has_fulltext, open_access.is_oa, open_access.oa_status, ", 168 | "openalex, openalex_id, pmcid, pmid, primary_location.is_accepted, primary_location.is_oa, ", 169 | "primary_location.is_published, primary_location.landing_page_url, primary_location.license, ", 170 | "primary_location.license_id, primary_location.source.has_issn, ", 171 | "primary_location.source.host_institution_lineage, primary_location.source.host_organization, ", 172 | "primary_location.source.host_organization_lineage, primary_location.source.id, ", 173 | "primary_location.source.is_core, primary_location.source.is_in_doaj, ", 174 | "primary_location.source.is_oa, primary_location.source.issn, ", 175 | "primary_location.source.publisher_lineage, primary_location.source.type, ", 176 | "primary_location.version, primary_topic.domain.id, primary_topic.field.id, primary_topic.id, ", 177 | "primary_topic.subfield.id, publication_date, publication_year, raw_affiliation_strings.search, ", 178 | "raw_author_name.search, referenced_works, referenced_works_count, related_to, ", 179 | "repository, semantic.search, sustainable_development_goals.id, sustainable_development_goals.score, ", 180 | "title.search, title.search.no_stem, title_and_abstract.search, title_and_abstract.search.no_stem, ", 181 | "to_created_date, to_publication_date, to_updated_date, topics.domain.id, topics.field.id, ", 182 | "topics.id, topics.subfield.id, topics_count, type, type_crossref, version" 183 | ) |> strsplit(split = ", ") |> unlist() 184 | } 185 | 186 | openalex_groupbys_default <- function() { c( 187 | "primary_location.source.type", 188 | "primary_location.source.id", 189 | "is_retracted", 190 | "primary_location.source.publisher_lineage", 191 | "open_access.oa_status", 192 | 
"best_oa_location.is_published", 193 | "best_oa_location.is_accepted", 194 | "best_oa_location.license", 195 | "authorships.institutions.type", 196 | "has_pmid", 197 | "has_orcid", 198 | "mag_only", 199 | "primary_location.source.is_in_doaj", 200 | "has_doi", 201 | "primary_location.source.is_oa", 202 | "open_access.any_repository_has_fulltext", 203 | "institutions.is_global_south", 204 | "primary_location.source.is_core", 205 | "corresponding_institution_ids", 206 | "corresponding_author_ids", 207 | "authorships.institutions.continent", 208 | "language", 209 | "keywords.id", 210 | "authorships.countries", 211 | "authorships.author.id", 212 | "sustainable_development_goals.id", 213 | "grants.funder", 214 | "primary_topic.subfield.id", 215 | "primary_topic.field.id", 216 | "primary_topic.domain.id", 217 | "primary_topic.id", 218 | "type", 219 | "authorships.institutions.lineage", 220 | "open_access.is_oa", 221 | "publication_year" 222 | )} 223 | 224 | openalex_filter_default <- function() { 225 | "authorships.institutions.lineage:i86987016,publication_year:2025" 226 | } 227 | 228 | openalex_groupbys <- function(q) { 229 | 230 | colname <- field <- colid <- i <- NULL 231 | 232 | csv <- 233 | q |> readr::read_lines() 234 | 235 | schema <- 236 | csv[1:2] |> 237 | strsplit(split = ",") |> 238 | setNames(c("field", "colname")) |> 239 | purrr::map(\(x) na_if(x, "")) |> 240 | tibble::as_tibble() |> 241 | tibble::rowid_to_column(var = "colid") |> 242 | tidyr::fill(any_of(c("field"))) |> 243 | dplyr::filter(!is.na(colname)) |> 244 | dplyr::group_by(field) |> 245 | dplyr::summarize(i = min(colid), j = max(colid), colnames = list(colname)) |> 246 | dplyr::arrange(-desc(i)) 247 | 248 | body <- 249 | csv[-c(1:2)] |> paste(collapse = "\n") 250 | 251 | all <- 252 | readr::read_csv(body, col_names = NULL, show_col_types = FALSE) 253 | 254 | parse_body <- function(field, i, j, colnames) { 255 | all |> 256 | select(c(i, j)) |> 257 | setNames(nm = unlist(colnames)) |> 258 | filter(!if_all(everything(), is.na)) 259 | #filter(if_all(\(x) all(is.na(x)))) #|> 260 | #list() |> setNames(nm = field) 261 | } 262 | 263 | tbls <- 264 | schema |> purrr::pmap(parse_body) |> 265 | setNames(nm = schema$field) |> 266 | map(\(x) x |> mutate(across(any_of(c("name")), as.character))) 267 | 268 | tbls 269 | 270 | } 271 | 272 | #' Counts from OpenAlex 273 | #' 274 | #' Aggregates/counts can be retrieved using the group_bys query parameter 275 | #' 276 | #' @param filter a set of filter criteria, see the defaults in openalex_filter_default() 277 | #' @param dimensions a set of grouping dimensions, see the defaults in openalex_groupbys_default() 278 | #' @return a list of tibbles 279 | #' @export 280 | #' @importFrom utils URLencode 281 | openalex_counts <- function( 282 | filter = openalex_filter_default(), 283 | dimensions = openalex_groupbys_default() 284 | ) { 285 | 286 | groupbys <- 287 | dimensions|> paste0(collapse = ",") |> utils::URLencode(reserved = TRUE) 288 | 289 | url <- paste0( 290 | openalex_api(), "works?group_bys=", groupbys, 291 | "&per_page=200&format=csv&mailto=team%40ourresearch.org", 292 | "&filter=", filter 293 | ) 294 | 295 | message("Requesting ", url) 296 | 297 | url |> openalex_groupbys() 298 | } 299 | 300 | read_page <- function(level = c("topics", "subfields", "fields", "domains"), page) { 301 | 302 | topic_page <- 303 | "https://api.openalex.org/%s?select=id,display_name,description,subfield,field,domain&per_page=200&page=%s" |> 304 | sprintf(level, page) |> 305 | jsonlite::fromJSON() 306 | 307 | tbl 
300 | read_page <- function(level = c("topics", "subfields", "fields", "domains"), page) {
301 |
302 |   topic_page <-
303 |     "https://api.openalex.org/%s?select=id,display_name,description,subfield,field,domain&per_page=200&page=%s" |>
304 |     sprintf(level, page) |>
305 |     jsonlite::fromJSON()
306 |
307 |   tbl <- topic_page$results |> tibble::as_tibble()
308 |   structure(tbl, meta = topic_page$meta)
309 |
310 | }
311 |
312 | openalex_level <- function(l) {
313 |
314 |   t <- read_page(level = l, page = 1)
315 |   np <- ceiling(attr(t, "meta")$count / 200)
316 |
317 |   ts <- NULL
318 |   if (np > 1) {
319 |     ts <- (2:np) |> map(\(x) read_page(level = l, page = x), .progress = TRUE)
320 |   }
321 |
322 |   t |> bind_rows(map_dfr(ts, bind_rows))
323 |
324 | }
325 |
326 | openalex_levels <- function() {
327 |
328 |   display_name <- NULL
329 |
330 |   topics <- openalex_level("topics")
331 |
332 |   four <- topics |> select(all_of(c("subfield", "field", "domain")))
333 |
334 |   topics |> rename(id_topic = id, topic = display_name) |> select(1:3) |> bind_cols(
335 |     four$subfield |> as_tibble() |> rename(id_subfield = id, subfield = display_name),
336 |     four$field |> as_tibble() |> rename(id_field = id, field = display_name),
337 |     four$domain |> as_tibble() |> rename(id_domain = id, domain = display_name)
338 |   ) |>
339 |     mutate(across(contains("id_"), \(x) gsub("https://openalex.org/", "", x)))
340 |
341 | }
342 |
343 | #' Topics
344 | #'
345 | #' Table of current topics, subfields, fields and domains used at OpenAlex
346 | #' @export
347 | openalex_topics <- function() {
348 |   openalex_levels()
349 | }
350 |
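openalex_topics() above pages through the /topics endpoint 200 records at a time and denormalizes each topic's subfield, field and domain into one wide table; a usage sketch:

topics <- openalex_topics()
topics |> dplyr::count(domain, field, sort = TRUE)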
--------------------------------------------------------------------------------
/R/open_alex_restclient.R:
--------------------------------------------------------------------------------
1 | #file.edit("~/.Renviron")
2 | #readRenviron("~/.Renviron")
3 |
4 | #' Enter the OpenAlex API polite pool for faster requests by providing an email
5 | #' @param email an email address, on the form "you@example.com" or "" to unset email
6 | #' @return a logical depending on whether email was set or unset
7 | #' @examples
8 | #' \dontrun{
9 | #' if(interactive()){
10 | #'  # to set
11 | #'  openalex_polite("you@example.com")
12 | #'  # to unset
13 | #'  openalex_polite("")
14 | #'  }
15 | #' }
16 | #' @export
17 | openalex_polite <- function(email) {
18 |
19 |   if (!nzchar(email)) {
20 |     message("Exiting from polite pool, email no longer provided in user agent header")
21 |     Sys.setenv("OPENALEX_USERAGENT" = "http://github.com/hadley/httr")
22 |     return(FALSE)
23 |   }
24 |
25 |   stopifnot(is.character(email), length(email) == 1)
26 |   re_email <- "^mailto:.*?@.*?\\..*?"
27 |   if (!grepl(re_email, email))
28 |     email <- paste0("mailto:", trimws(email))
29 |   stopifnot(grepl(re_email, email))
30 |
31 |   ua <- sprintf("http://github.com/hadley/httr (%s)", email)
32 |
33 |   if (Sys.getenv("OPENALEX_USERAGENT") != "") {
34 |     message("Hint: You can provide an email to enter the polite pool")
35 |     message("To have the setting stick persistently using .Renviron, do ...")
36 |     message('  file.edit("~/.Renviron")')
37 |     message(sprintf('  # and add a line OPENALEX_USERAGENT="%s"', ua))
38 |     message("Then reload settings for the R environment in the current session")
39 |     message('  readRenviron("~/.Renviron")')
40 |   }
41 |
42 |   message("Temporarily setting OPENALEX_USERAGENT envvar for this session to: ", ua)
43 |   Sys.setenv("OPENALEX_USERAGENT" = ua)
44 |   return(TRUE)
45 | }
46 |
47 | #' Use an API key for OpenAlex Premium Subscription
48 | #'
49 | #' This provides access to the latest data, fresher than what snapshots provide.
50 | #' It also enables faster requests and filtering on from_created_date and from_updated_date fields.
51 | #' @param key a premium subscription key
52 | #' @return a logical depending on whether key was set or unset
53 | #' @examplesIf interactive()
54 | #' openalex_key("my_secret_api_key")
55 | #' openalex_key("")
56 | #' @export
57 | #' @details
58 | #' Additional details...
59 | #'
60 | #'
61 | #'
62 | #'
63 | openalex_key <- function(key) {
64 |
65 |   if (!nzchar(key)) {
66 |     message("Unsetting premium subscription key")
67 |     Sys.setenv("OPENALEX_KEY" = "")
68 |     return(FALSE)
69 |   }
70 |
71 |   stopifnot(is.character(key), length(key) == 1)
72 |
73 |   re_key <- "[[:alnum:]]{22}"
74 |   stopifnot(grepl(re_key, key))
75 |
76 |   if (Sys.getenv("OPENALEX_KEY") != "") {
77 |     message("Hint: You can provide a premium subscription API key")
78 |     message("To have the setting stick persistently using .Renviron, do ...")
79 |     message('  file.edit("~/.Renviron")')
80 |     message(sprintf('  # and add a line OPENALEX_KEY="%s"', key))
81 |     message("Then reload settings for the R environment in the current session")
82 |     message('  readRenviron("~/.Renviron")')
83 |   }
84 |
85 |   message("Temporarily setting OPENALEX_KEY envvar for this session")
86 |   Sys.setenv("OPENALEX_KEY" = key)
87 |   return(TRUE)
88 | }
89 |
90 | cfg <- function() {
91 |
92 |   res <- list(
93 |     user_agent = "http://github.com/hadley/httr"
94 |   )
95 |
96 |   if (Sys.getenv("OPENALEX_USERAGENT") != "") {
97 |     res$user_agent <- Sys.getenv("OPENALEX_USERAGENT")
98 |   }
99 |
100 |   if (Sys.getenv("OPENALEX_KEY") != "") {
101 |     res$key <- Sys.getenv("OPENALEX_KEY")
102 |   }
103 |
104 |   return(res)
105 | }
106 |
107 | #' Endpoint used for requests to OpenAlex API
108 | #' @export
109 | openalex_api <- function() {
110 |   "https://api.openalex.org/"
111 | }
112 |
113 | #' Attribution
114 | #'
115 | #' Use this attribution whenever data from the API is publicly displayed
116 | #'
117 | #' @details OpenAlex provides a RESTful API for scholarly papers, authors,
118 | #' institutions, and more.
When publicly displaying data from the API, 119 | #' it is polite to point back to OpenAlex at https://openalex.org/ 120 | #' @export 121 | openalex_attribution <- function() { 122 | license <- "https://creativecommons.org/publicdomain/zero/1.0/" 123 | sprintf(paste0( 124 | "Data source: OpenAlex API at %s", "\n", 125 | "Data license agreement: %s"), 126 | openalex_api(), license 127 | ) 128 | } 129 | 130 | #' Retrieve work from OpenAlex REST API 131 | #' 132 | #' This function retrieves works given an identifier 133 | #' @param identifier string with identifier 134 | #' @param format one of "table" or "object" 135 | #' @param use_random logical to indicate whether to use random identifier, Default: FALSE 136 | #' @return as per format, either a tibble or an R object 137 | #' @examples 138 | #' \dontrun{ 139 | #' openalex_work(use_random = TRUE) 140 | #' } 141 | #' @export 142 | openalex_work <- function(identifier, format = "table", use_random = FALSE) { 143 | openalex_entity(identifier, entity = "works", format, use_random) 144 | } 145 | 146 | openalex_entity_enum <- function() 147 | c("works", "authors", "venues", "institutions", "concepts") 148 | 149 | #' @importFrom httr modify_url user_agent GET status_code http_type content 150 | #' @importFrom jsonlite fromJSON 151 | #' @importFrom tibble enframe 152 | #' @importFrom dplyr mutate 153 | openalex_entity <- function( 154 | identifier, 155 | entity = openalex_entity_enum(), 156 | format = c("table", "object", "raw", "tables"), 157 | use_random = FALSE, 158 | verbose = FALSE, 159 | query = NULL) { 160 | 161 | style <- match.arg(format) 162 | kind <- match.arg(entity) 163 | is_listing <- FALSE 164 | 165 | if (missing(identifier)) { 166 | if (use_random == FALSE && is.null(query)) { 167 | stop("Identifier is missing, please specify use_random to use a random id.") 168 | } else if (use_random == TRUE && is.null(query)) { 169 | identifier <- "random" 170 | } else { 171 | if (verbose == TRUE) message("This is a list request...") 172 | is_listing <- TRUE 173 | } 174 | } 175 | 176 | path <- if (!is_listing) sprintf("%s/%s", kind, identifier) else kind 177 | 178 | url <- httr::modify_url( 179 | openalex_api(), 180 | path = path, 181 | query = query #paste0("filter=", URLencode(query$filter)) #, "&sort=publication_date:desc") 182 | ) 183 | 184 | if (verbose == TRUE) message("Requesting url: ", url) 185 | 186 | ua <- httr::user_agent(cfg()$user_agent) 187 | res <- httr::GET(url, ua) 188 | 189 | if (httr::status_code(res) == 200) { 190 | 191 | if (httr::http_type(res) != "application/json") { 192 | stop("API did not return json", call. 
= FALSE) 193 | } 194 | 195 | if (style == "object") { 196 | data <- jsonlite::fromJSON( 197 | httr::content(res, as = "text", encoding = "utf-8"), 198 | simplifyVector = FALSE #, DataFrame = TRUE, flatten = TRUE 199 | ) 200 | data <- structure(data, meta = data$meta) 201 | } else if (style == "table") { 202 | payload <- httr::content(res, encoding = "utf-8") 203 | name <- NULL 204 | data <- 205 | tibble::enframe(unlist(payload)) %>% 206 | dplyr::mutate(name = gsub(".", "_", name, fixed = TRUE)) #%>% 207 | #dplyr::mutate(item_id = cumsum(name == "concepts_id")) %>% 208 | #dplyr::filter(item_id > 0) %>% 209 | #tidyr::pivot_wider(values_fn = function(x) paste0(x, collapse = ", ")) %>% 210 | #dplyr::rename_with(function(x) gsub("items_", "", x)) %>% 211 | #dplyr::mutate(across(.fns = function(x) readr::parse_guess(x, guess_integer = TRUE))) 212 | data <- structure(data, meta = payload$meta) 213 | } else if (style == "tables") { 214 | if (kind == "works") { 215 | payload <- httr::content(res, encoding = "utf-8") 216 | data <- payload$results |> parse_work() 217 | data <- structure(data, meta = payload$meta) 218 | } else { 219 | stop("Only works supported for now!") 220 | } 221 | } else if (style == "raw") { 222 | data <- res 223 | } 224 | 225 | #class(data) <- c("tbl_df", "tbl", "data.frame") 226 | return(data) 227 | } 228 | 229 | if (status_code(res) == 429) 230 | stop("HTTP status 429 Too Many Requests") 231 | 232 | if (status_code(res) == 403) { 233 | cr <- content(res) 234 | stop(cr$error, "\n\n", cr$message) 235 | } 236 | 237 | stop("HTTP status ", status_code(res)) 238 | 239 | } 240 | 241 | openalex_query <- function( 242 | filter=NULL, 243 | search=NULL, 244 | sort=NULL, 245 | page=NULL, 246 | cursor=NULL, 247 | verbose = FALSE) { 248 | 249 | # filter... use , to indicate AND 250 | #?filter=last_known_institution.country_code:US,cited_by_count:>0 251 | 252 | # search... add ".search" to a property 253 | #?filter=title.search:"intensive treatment of diabetes" 254 | 255 | # sort... By default, sort direction is ascending. 
You can reverse this by using sort:desc
256 |   # ?sort:display_name,cited_by_count,works_count,publication_date,relevance_score
257 |
258 |   # paging currently you can only use paging to read the first 10,000 results of any list
259 |   # ?page=1
260 |
261 |   q <- list(
262 |     filter = filter,
263 |     search = search,
264 |     sort = sort,
265 |     page = page,
266 |     cursor = cursor,
267 |     api_key = cfg()$key
268 |   ) |>
269 |     purrr::compact()
270 |
271 |   if (verbose)
272 |     message("Query is:\n\n",
273 |       paste0(collapse = "\n", utils::capture.output(print(q)))
274 |     )
275 |
276 |   return(q)
277 |
278 | }
279 |
280 | openalex_list <- function(entity, query, format = "object", verbose = FALSE) {
281 |   res <- openalex_entity(entity = entity, format = format, verbose = verbose, query = query)
282 |   attr(res, "page_count") <- ceiling(attr(res, "meta")$count / attr(res, "meta")$per_page)
283 |   return(res)
284 | }
285 |
286 | gs <- function(x, p, r) {
287 |   gsub(p, r, x, fixed = TRUE)
288 | }
289 |
290 | #' @importFrom purrr keep
291 | # support pipe
292 | tbl_from_slot <- function(x, slot)
293 |   x |> map(slot) |>
294 |     keep(.p = \(y) nrow(y) > 0) |>
295 |     bind_rows() |>
296 |     readr::type_convert() |>
297 |     suppressMessages() |>
298 |     mutate(across(where(is.character), \(x) x |> gs("https://openalex.org/", "")))
299 |
300 |
301 | #' Crawl multiple pages of results
302 | #'
303 | #' Iterates over paged results showing a progress bar
304 | #'
305 | #' @param entity one of the values in openalex_entity_enum()
306 | #' @param query an openalex_query object
307 | #' @param verbose boolean to indicate whether to output messages during process
308 | #' @param fmt the return format, one of "object" or "tables"
309 | #' @return R object with results matching the query
310 | #' @importFrom progress progress_bar
311 | #' @importFrom purrr possibly map_df map_dfr pmap
312 | #' @importFrom dplyr bind_rows
313 | #' @export
314 | openalex_crawl <- function(entity, query, verbose = FALSE, fmt = "object") {
315 |
316 |   q <- query
317 |
318 |   # if (use_cursor) {
319 |   #   q$cursor <- "*"
320 |   #   message("Using query:")
321 |   #   print(q)
322 |   # }
323 |
324 |   res <- openalex_list(entity, q, format = fmt, verbose = FALSE)
325 |   #q <- query
326 |   n_items <- attr(res, "meta")$count
327 |   pages <- 1:attr(res, "page_count")
328 |   #next_cursor <- attr(res, "meta")$next_cursor
329 |   #q$next_cursor <- next_cursor
330 |
331 |   # if (use_cursor == TRUE && is.null(next_cursor))
332 |   #   stop("Requested cursor paging, but no next_cursor found in response from API")
333 |
334 |   if (n_items <= 0) {
335 |     message("No results, returning empty list.")
336 |     return(list())
337 |   }
338 |
339 |   if (n_items > 1e4) {
340 |     stop("More than 10000 results; please use cursor paging instead, see openalex_works_cursorcrawl()")
341 |   }
342 |
343 |   if (verbose)
344 |     message("About to crawl a total of ", length(pages), " pages of results",
345 |       " with a total of ", n_items, " records.")
346 |
347 |   pb <- progress_bar$new(
348 |     format = "  open alex resolving [:bar] :percent eta: :eta",
349 |     total = length(pages), clear = FALSE, width = 60)
350 |
351 |   #TODO: fixme so this can run in parallel?
352 | iq <- q 353 | i <- 1 354 | entities <- purrr::possibly(quiet = FALSE, 355 | .f = function(x) { 356 | pb$tick() 357 | iq$page <- i 358 | #print(q) 359 | Sys.sleep(1 / 100) 360 | # if (use_cursor & !is.null(next_cursor)) { 361 | # iq$filter <- paste0(q$filter, "&cursor=", next_cursor) 362 | # print(iq) 363 | # } 364 | res <- openalex_list(entity, iq, format = fmt, verbose = FALSE) 365 | # if (use_cursor) { 366 | # next_cursor <<- attr(res, "meta")$next_cursor 367 | # } 368 | i <<- i + 1 369 | return(res) 370 | }, 371 | otherwise = list() #data.frame() 372 | ) 373 | 374 | if (fmt != "tables") { 375 | res <- pages |> map(entities, .progress = TRUE) 376 | #res |> pmap(c) 377 | return (res) 378 | } 379 | 380 | res <- 381 | pages |> map(entities, .progress = TRUE) 382 | 383 | #TODO: fix so that NOT THE SAME work ids are fetched!!!! 384 | #TODO: do not assume entity is work below 385 | 386 | list( 387 | work = res |> tbl_from_slot("work"), 388 | work_ids = res |> tbl_from_slot("work_ids"), 389 | work_concepts = res |> tbl_from_slot("work_concepts"), 390 | work_authorships_institutions = res |> tbl_from_slot("work_authorships_institutions"), 391 | work_abstract_inverted_index = res |> tbl_from_slot("work_abstract_inverted_index"), 392 | work_authorships_author = res |> tbl_from_slot("work_authorships_author"), 393 | work_biblio = res |> tbl_from_slot("work_biblio"), 394 | work_open_access = res |> tbl_from_slot("work_open_access"), 395 | work_host_venue = res |> tbl_from_slot("work_host_venue"), 396 | work_counts_by_year = res |> tbl_from_slot("work_counts_by_year"), 397 | work_related_works = res |> tbl_from_slot("work_related_works"), 398 | work_referenced_works = res |> tbl_from_slot("work_referenced_works") 399 | ) 400 | 401 | } 402 | 403 | #'Flatten R object from deserialized nested JSON object 404 | #' 405 | #'@param nestedlist a nested list of lists 406 | #'@return a tibble in long format 407 | #'@export 408 | #'@importFrom tibble enframe 409 | #'@importFrom dplyr mutate 410 | openalex_flatten_long <- function(nestedlist) { 411 | name <- NULL 412 | tibble::enframe(unlist(nestedlist)) %>% 413 | dplyr::mutate(name = gsub(".", "_", name, fixed = TRUE)) 414 | } 415 | 416 | openalex_autocomplete <- function( 417 | query, 418 | entity_type = openalex_entity_enum(), 419 | format = c("object", "table"), 420 | verbose = TRUE 421 | ) { 422 | 423 | #/autocomplete/?q= 424 | 425 | stopifnot(nchar(query) >= 1) 426 | 427 | style <- match.arg(format) 428 | entity <- match.arg(entity_type) 429 | path <- sprintf("autocomplete/%s", entity) 430 | 431 | url <- httr::modify_url( 432 | openalex_api(), 433 | path = path, 434 | query = list(q = query) 435 | ) 436 | 437 | if (verbose == TRUE) message("Requesting url: ", url) 438 | 439 | ua <- httr::user_agent(cfg()$user_agent) 440 | 441 | res <- httr::GET(url, ua) 442 | 443 | if (httr::status_code(res) == 200) { 444 | 445 | if (httr::http_type(res) != "application/json") { 446 | stop("API did not return json", call. 
= FALSE) 447 | } 448 | 449 | if (style == "object") { 450 | data <- jsonlite::fromJSON( 451 | httr::content(res, as = "text", encoding = "utf-8"), 452 | simplifyVector = FALSE #, DataFrame = TRUE, flatten = TRUE 453 | ) 454 | } else { 455 | name <- NULL 456 | data <- httr::content(res, encoding = "utf-8") %>% 457 | purrr::pluck("results") %>% 458 | dplyr::bind_rows() 459 | } 460 | 461 | #class(data) <- c("tbl_df", "tbl", "data.frame") 462 | return(data) 463 | } 464 | 465 | if (status_code(res) == 429) 466 | stop("HTTP status 429 Too Many Requests") 467 | 468 | stop("HTTP status ", status_code(res)) 469 | 470 | } 471 | 472 | #' Example query when searching raw affiliation strings 473 | #' 474 | #' This variant is specifically tailored for KTH, Royal Institute of Technology 475 | #' and includes some affiliation string variations which might be related. 476 | #' @export 477 | #' @return string with query 478 | openalex_kth_rawaff_query <- function() { 479 | # (roy AND inst AND tech) OR 480 | # "Roy. Inst. T" 481 | # (roy* AND tech* AND univ*)) AND (Sweden)) 482 | # paste0( 483 | # 'KTH OR (roy* AND inst* AND tech*) OR ', 484 | # '(alfven) OR (kung* AND tek* AND hog*) OR (kung* AND tek* AND h\\u00f6g*) OR ', 485 | # '(kgl AND tek* AND hog*) OR (kung* AND tek* AND hg*)' 486 | # ) 487 | 488 | '("KTH" OR 489 | 490 | (("roy inst" OR 491 | "royal in-stitute" OR 492 | "royal inititute" OR 493 | "royal institut" OR 494 | "royal institute" OR 495 | "royal institite" OR 496 | "royal institution" OR 497 | "royal institue" OR 498 | "royal insititu" OR 499 | "royal insitute" OR 500 | "royal inst" OR 501 | "royal inst." OR 502 | "royal intitute" OR 503 | "royal istitute" OR 504 | "royal lnstitute" OR 505 | "royal lnstitufe" OR 506 | "royal lnstltute") AND "tech") OR 507 | 508 | (("kgl" OR 509 | "kgl." OR 510 | "kungl" OR 511 | "kungl." OR 512 | "kungliga") AND "tekn") OR 513 | 514 | "r inst of technol" OR 515 | "r inst. of technol." OR 516 | "r. inst. of tech." OR 517 | "r. inst. of technol" OR 518 | "r. inst. of technol." 
OR 519 | "royal tech" OR 520 | "institute of technology stockholm" OR 521 | "royal of technology" OR 522 | "royal school of technology" OR 523 | "royal swedish institute of technology" OR 524 | "royal university of technology" OR 525 | "royal college of technology" OR 526 | "royalinstitute" OR 527 | "alfven" OR 528 | "alfv\u00e9n" OR 529 | "10044 stockholm" OR 530 | "100 44 stockholm") 531 | 532 | NOT 533 | 534 | ("khyber" OR 535 | "peshawar" OR 536 | "mcmaster")' 537 | 538 | } 539 | 540 | # There seems to be a way to fetch ngrams 541 | 542 | ## https://api.openalex.org/works/W3128409631/ngrams 543 | ## https://api.openalex.org/works/W2023271753/ngrams 544 | 545 | # Search UI for KTH 546 | 547 | ## https://explore.openalex.org/institutions/I86987016 548 | 549 | #' Recently published works based on query for matching raw affiliations 550 | #' @param raw_search_criteria raw affiliation string search criteria, 551 | #' by default openalex_kth_rawaff_query() 552 | #' @param since_days integer indicating days back from today 553 | #' @export 554 | #' @return list of tables with results 555 | openalex_works_published_since <- function( 556 | raw_search_criteria = openalex_kth_rawaff_query(), 557 | since_days = 7) { 558 | 559 | criteria_aff <- raw_search_criteria 560 | criteria_from <- format(Sys.Date() - since_days, "%Y-%m-%d") 561 | 562 | params <- paste0(collapse = ",", c( 563 | sprintf("raw_affiliation_strings.search:%s", criteria_aff), 564 | sprintf("from_publication_date:%s", criteria_from) 565 | ) 566 | ) 567 | 568 | openalex_crawl("works", fmt = "tables", verbose = TRUE, 569 | query = openalex_query( 570 | filter = params, 571 | verbose = FALSE 572 | ) 573 | ) 574 | 575 | } 576 | 577 | #' Recently updated works based on query for matching raw affiliations 578 | #' 579 | #' This function requires a premium subscription API key to be set. 580 | #' 581 | #' @param raw_search_criteria raw affiliation string search criteria, 582 | #' by default openalex_kth_rawaff_query() 583 | #' @param since_minutes integer indicating minutes since now 584 | #' @export 585 | #' @importFrom lubridate as_date format_ISO8601 586 | #' @return list of tables with results 587 | openalex_works_updated_since <- function( 588 | raw_search_criteria = openalex_kth_rawaff_query(), 589 | since_minutes) { 590 | 591 | if (is.null(cfg()$key)) 592 | stop("This function requires a Premium Subscription API key") 593 | 594 | criteria_aff <- raw_search_criteria 595 | 596 | #criteria_from <- "2024-01-15T08:02:55Z" #"2024-01-15T04:47:14.518460" 597 | criteria_from <- 598 | lubridate::as_datetime(Sys.time() - since_minutes * 60) |> 599 | lubridate::format_ISO8601(usetz = "Z") 600 | 601 | params <- paste0(collapse = ",", c( 602 | sprintf("raw_affiliation_strings.search:%s", criteria_aff), 603 | sprintf("from_updated_date:%s", criteria_from) 604 | ) 605 | ) 606 | 607 | openalex_crawl("works", fmt = "tables", verbose = TRUE, 608 | query = openalex_query( 609 | filter = params, 610 | verbose = FALSE 611 | ) 612 | ) 613 | 614 | } 615 | 616 | #' Recently created works based on query for matching raw affiliations 617 | #' 618 | #' This function requires a premium subscription API key to be set. 
619 | #'
620 | #' @param raw_search_criteria raw affiliation string search criteria,
621 | #' by default openalex_kth_rawaff_query()
622 | #' @param since_days integer indicating days back from today
623 | #' @export
624 | #' @importFrom lubridate as_date
625 | #' @return list of tables with results
626 | openalex_works_created_since <- function(
627 |   raw_search_criteria = openalex_kth_rawaff_query(),
628 |   since_days = 0) {
629 |
630 |   if (is.null(cfg()$key))
631 |     stop("This function requires a Premium Subscription API key")
632 |
633 |   criteria_aff <- raw_search_criteria
634 |
635 |   criteria_from <-
636 |     lubridate::as_date(Sys.Date() - since_days) |>
637 |     format("%Y-%m-%d")
638 |
639 |   params <- paste0(collapse = ",", c(
640 |     sprintf("raw_affiliation_strings.search:%s", criteria_aff),
641 |     sprintf("from_created_date:%s", criteria_from)
642 |   )
643 |   )
644 |
645 |   openalex_crawl("works", fmt = "tables", verbose = TRUE,
646 |     query = openalex_query(
647 |       filter = params,
648 |       verbose = FALSE
649 |     )
650 |   )
651 |
652 | }
653 |
654 | #' @import httr2
655 | openalex_aboutness <- function(title, abstract = NULL, verbose = FALSE, format = c("object", "tables")) {
656 |
657 |   # "https://api.openalex.org/text?title=type%201%20diabetes%20research%20for%20children
658 |   # https://groups.google.com/g/openalex-users/c/Df4dIA19adM
659 |
660 |   is_invalid <- function(x) nchar(x) < 20 | nchar(x) > 2000
661 |
662 |   if (is_invalid(title))
663 |     stop("Title must be between 20 and 2000 characters long")
664 |
665 |   if (!is.null(abstract) && is_invalid(abstract))
666 |     stop("Abstract, if provided, must be between 20 and 2000 characters long")
667 |
668 |   q <- purrr::compact(list(title = title, abstract = abstract))
669 |
670 |   req <-
671 |     httr2::request(openalex_api()) |>
672 |     httr2::req_url_path("text") |>
673 |     httr2::req_user_agent(cfg()$user_agent) |>
674 |     httr2::req_body_json(data = q)
675 |
676 |   if (verbose)
677 |     req <- req |> httr2::req_verbose()
678 |
679 |   resp <- req |> httr2::req_perform()
680 |
681 |   res <- switch(match.arg(format),
682 |     "object" = resp |> httr2::resp_body_json(),
683 |     "tables" = parse_resp_aboutness(resp |> httr2::resp_body_json())
684 |   )
685 |
686 |   return(res)
687 |
688 | }
689 |
690 | parse_topics <- function(topics) {
691 |
692 |   ones <-
693 |     topics |> map(\(x) purrr::discard_at(x, at = c("field", "domain", "subfield"))) |>
694 |     bind_rows()
695 |
696 |   manies <-
697 |     topics |> map(\(x) purrr::keep_at(x, at = c("field", "domain", "subfield")))
698 |
699 |   fsd <- bind_cols(
700 |     manies |> map("field") |> bind_rows() |> rename_with(\(x) paste0("field_", x)),
701 |     manies |> map("subfield") |> bind_rows() |> rename_with(\(x) paste0("subfield_", x)),
702 |     manies |> map("domain") |> bind_rows() |> rename_with(\(x) paste0("domain_", x))
703 |   )
704 |
705 |   bind_cols(ones, fsd)
706 |
707 | }
708 |
709 | parse_resp_aboutness <- function(resp) {
710 |
711 |   d <- resp
712 |
713 |   meta <-
714 |     d$meta |> bind_rows()
715 |
716 |   keywords <-
717 |     d$keywords |> bind_rows()
718 |
719 |   topics <-
720 |     d$topics |> parse_topics()
721 |
722 |   primary_topic <-
723 |     list(d$primary_topic) |> parse_topics()
724 |
725 |   concepts <-
726 |     bind_cols(
727 |       d$concepts |> bind_rows() |> select(-any_of("ancestors")),
728 |       d$concepts |> bind_rows() |> pull(any_of("ancestors")) |> map(bind_rows) |>
729 |         bind_rows() |> rename_with(.fn = \(x) paste0("ancestors_", x))
730 |     )
731 |
732 |   list(meta = meta, keywords = keywords, topics = topics, concepts = concepts)
733 |
734 | }
735 |
736
| 737 | 738 | openalex_filter_similar_topics <- function(work_identifier, granularity = c("topic", "domain", "field", "subfield")) { 739 | 740 | w <- openalex_work(work_identifier, format = "object") 741 | 742 | topic_id <- function(w, field_type) { 743 | 744 | f <- switch(field_type, 745 | topic = "topics.id", 746 | domain = "topics.domain.id", 747 | field = "topics.field.id", 748 | subfield = "topics.subfield.id" 749 | ) 750 | 751 | if (field_type == "topic") 752 | field_type <- NULL 753 | 754 | res <- 755 | w$topics |> map_chr(c(field_type, "id")) |> unique() |> 756 | gsub(pattern = "https://.*?/(.*?)$", replacement = "\\1") 757 | 758 | paste0(f, ":", paste0(collapse = "|", res)) 759 | } 760 | 761 | topics_filter <- function(w) { 762 | fields <- granularity 763 | topics <- fields |> map_chr(function(x) topic_id(w, x)) 764 | topics |> paste(collapse = ",") 765 | } 766 | 767 | topics_filter(w) 768 | 769 | } 770 | 771 | openalex_works_to_tbls <- function(works) { 772 | 773 | pw2 <- purrr::possibly(parse_work2, otherwise = NULL, quiet = FALSE) 774 | 775 | message("Converting record batches to tables...") 776 | tbls <- works |> map(pw2, .progress = TRUE) 777 | message("Done") 778 | 779 | message("Unifying and merging tables...") 780 | 781 | unify_slots <- function(tbls) { 782 | 783 | slotz <- map(tbls, names) |> unique() |> unlist() 784 | strip_prefix <- function(x) gsub("^https://.*?/(.*?)$", "\\1", x) 785 | strip_doi <- function(x) gsub("^https://doi.org/(.*?)$", "\\1", x) 786 | #message("Merging slots:\n", slotz |> paste0(collapse = "\n")) 787 | unify <- function(x) { 788 | 789 | tbls |> map(x) |> bind_rows() |> 790 | readr::type_convert(guess_integer = TRUE) |> 791 | suppressMessages() |> suppressWarnings() |> 792 | mutate(across(where(function(x) is.character(x)) & !any_of(c("doi")), .fns = strip_prefix)) |> 793 | mutate(across(any_of(c("doi")), .fns = strip_doi)) |> 794 | select(where(Negate(is.list))) 795 | } 796 | res <- slotz |> map(unify) |> setNames(nm = slotz) 797 | return (res) 798 | } 799 | 800 | out <- unify_slots(tbls) 801 | 802 | message("Done") 803 | return(out) 804 | } 805 | -------------------------------------------------------------------------------- /R/openalex-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | #' Topics 5 | #' 6 | #' Topics used by OpenAlex 7 | #' @format A data frame with 4516 rows and 9 variables: 8 | #' \describe{ 9 | #' \item{\code{id_topic}}{character the id for the topic} 10 | #' \item{\code{topic}}{character description of topic} 11 | #' \item{\code{description}}{character long form description of this topic cluster} 12 | #' \item{\code{id_subfield}}{character the id for the subfield of this topic} 13 | #' \item{\code{subfield}}{character description of the subfield} 14 | #' \item{\code{id_field}}{character the id of the field} 15 | #' \item{\code{field}}{character description of the field} 16 | #' \item{\code{id_domain}}{character the id of the domain} 17 | #' \item{\code{domain}}{character description of the domain} 18 | #'} 19 | #' @details DETAILS 20 | "topics" 21 | 22 | ## usethis namespace: start 23 | ## usethis namespace: end 24 | NULL 25 | -------------------------------------------------------------------------------- /R/rectangularize.R: -------------------------------------------------------------------------------- 1 | #' @importFrom dplyr rename bind_rows select tibble starts_with distinct 2 | #' @importFrom tidyr unnest unnest_wider hoist 
unnest_longer 3 | #' @importFrom purrr map_dfr map pluck 4 | parse_work <- function(chunk) { 5 | 6 | openalex <- NULL 7 | 8 | # TODO fix this? 9 | # if (length(lengths(chunk)) == 1) 10 | # chunk <- list(chunk) 11 | 12 | if (length(chunk) == 0) return(list()) 13 | 14 | work_ids <- 15 | chunk |> 16 | map_dfr("ids") |> 17 | rename(work_id = openalex) 18 | 19 | work_host_venue <- 20 | chunk |> 21 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "host_venue"))) |> 22 | bind_rows() #|> unnest(issn) 23 | # map_dfr(bind_rows) 24 | 25 | work_open_access <- 26 | chunk |> 27 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "open_access"))) |> 28 | map_dfr(bind_rows) 29 | 30 | work_biblio <- 31 | chunk |> 32 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "biblio"))) |> 33 | map_dfr(bind_rows) 34 | 35 | work_authorships <- 36 | chunk |> 37 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "authorships"))) 38 | 39 | work_authorships_author <- 40 | tibble(wa = work_authorships) |> 41 | hoist("wa", "work_id") |> 42 | unnest("wa") |> 43 | unnest_wider("wa", names_sep = "_") |> 44 | unnest_wider("wa_author") |> 45 | select(-starts_with(c("wa_institutions"))) 46 | # work_authorships |> 47 | # tibble(wa = .) |> 48 | # hoist("wa", "work_id") |> 49 | # unnest_wider("wa", names_sep = "_") |> 50 | # unnest_wider("wa_1") |> 51 | # unnest_longer("author") |> 52 | # pivot_wider(names_from = "author_id", values_from = "author") |> 53 | # select(-starts_with(c("wa_", "institutions"))) 54 | 55 | # map(function(x) tibble( 56 | # work_id = pluck(x, "work_id"), 57 | # author_position = pluck(x, 2, "author_position"), 58 | # author_id = pluck(x, 2, "author", "id"), 59 | # author_display_name = pluck(x, 2, "author", "display_name"), 60 | # author_orcid = pluck(x, 2, "author", "orcid") 61 | # )) |> 62 | # bind_rows() |> unnest(author) |> 63 | # unnest_wider("author") |> 64 | # rename(author_id = id, author_display_name = display_name) 65 | 66 | 67 | work_authorships_institutions <- 68 | tibble(wa = work_authorships) |> 69 | hoist("wa", "work_id") |> 70 | unnest("wa") |> 71 | unnest_wider("wa", names_sep = "_") |> 72 | select(-c("wa_author_position")) |> 73 | unnest("wa_institutions") |> 74 | unnest_wider("wa_institutions") |> 75 | select(-c("wa_author")) |> 76 | distinct() 77 | 78 | # work_authorships |> 79 | # map_dfr(function(x) tibble( 80 | # work_id = pluck(x, "work_id"), 81 | # raw_affiliation_string = pluck(x, "raw_affiliation_string"), 82 | # institutions = pluck(x, "institutions") 83 | # )) |> 84 | # map("institutions") 85 | # unnest_wider("institutions") |> 86 | # rename(institution_id = id, institution_display_name = display_name) |> 87 | # distinct() |> 88 | # filter(!is.na(raw_affiliation_string)) 89 | 90 | work_concepts <- 91 | chunk |> 92 | map(function(x) tibble(work_id = pluck(x, "id"), pluck(x, "concepts"))) |> 93 | map_dfr(bind_rows) |> unnest_wider(2) 94 | 95 | # work_mesh <- 96 | # chunk |> 97 | # map(function(x) tibble(work_id = pluck(x, "id"), pluck(x, "mesh"))) |> 98 | # map_dfr(bind_rows) 99 | 100 | aii_to_df <- function(x) { 101 | tibble(attr = names(x), val = x) |> 102 | unnest_wider("val", names_repair = function(x) paste0("i", seq_along(x) - 1)) 103 | } 104 | 105 | #abstract_inverted_index <- 106 | # chunk$abstract_inverted_index |> aii_to_df() 107 | 108 | abstract_inverted_index <- 109 | chunk |> 110 | map(function(x) tibble( 111 | work_id = pluck(x, "id"), 112 | aii_value = paste(collapse = " ", unlist(pluck(x, "abstract_inverted_index", .default = NA_integer_))), 113 | 
aii_key = paste(collapse = " ", unique(names(pluck(x, "abstract_inverted_index", .default = NA_character_)))) 114 | )) |> 115 | map_dfr(bind_rows) |> 116 | unnest_longer("aii_value") |> 117 | distinct() 118 | 119 | # abstract_inverted_index <- 120 | # chunk[1:20] |> 121 | # map_dfr(function(x) tibble(work_id = pluck(x, "id"), aii = pluck(x, "abstract_inverted_index"))) |> 122 | # bind_cols(aii_to_df(.$aii)) |> 123 | # select(!any_of("aii")) 124 | # 125 | # unnest_wider("aii", transform = function(x) aii_to_df(x)) 126 | 127 | work_counts_by_year <- 128 | chunk |> 129 | map(function(x) tibble(work_id = pluck(x, "id"), cby = pluck(x, "counts_by_year"))) |> 130 | map_dfr(bind_rows) |> 131 | unnest_wider("cby") 132 | 133 | work_related_works <- 134 | chunk |> 135 | map(function(x) tibble(work_id = pluck(x, "id"), related_works = pluck(x, "related_works"))) |> 136 | map_dfr(bind_rows) |> 137 | unnest_longer("related_works") 138 | 139 | work_referenced_works <- 140 | chunk |> 141 | map(function(x) tibble(work_id = pluck(x, "id"), referenced_works = pluck(x, "referenced_works"))) |> 142 | map_dfr(bind_rows) |> 143 | unnest_longer("referenced_works") 144 | 145 | 146 | work <- 147 | chunk |> map_dfr( 148 | function(x) tibble( 149 | id = pluck(x, "id"), 150 | doi = pluck(x, "doi"), 151 | display_name = pluck(x, "display_name"), 152 | title = pluck(x, "title"), 153 | publication_year = pluck(x, "publication_year"), 154 | publication_date = pluck(x, "publication_date"), 155 | type = pluck(x, "type"), 156 | cited_by_count = pluck(x, "cited_by_count"), 157 | is_retracted = pluck(x, "is_retracted"), 158 | is_paratext = pluck(x, "is_paratext"), 159 | updated_date = pluck(x, "updated_date"), 160 | cited_by_api_url = pluck(x, "cited_by_api_url"), 161 | created_date = pluck(x, "created_date") 162 | ) 163 | ) 164 | 165 | list( 166 | work = work, 167 | work_ids = work_ids, 168 | # work_mesh = work_mesh, 169 | work_concepts = work_concepts, 170 | work_authorships_institutions = work_authorships_institutions, 171 | work_abstract_inverted_index = abstract_inverted_index, 172 | work_authorships_author = work_authorships_author, 173 | work_biblio = work_biblio, 174 | work_open_access = work_open_access, 175 | work_host_venue = work_host_venue, 176 | work_counts_by_year = work_counts_by_year, 177 | work_related_works = work_related_works, 178 | work_referenced_works = work_referenced_works 179 | ) 180 | 181 | } 182 | 183 | #' @noRd 184 | #' @import tidyr dplyr purrr 185 | parse_work2 <- function(object) { 186 | 187 | name <- value <- work_id <- NULL 188 | 189 | unfwv <- function(l, field) { 190 | if (is.null(l$field)) return(tibble()) 191 | l |> map(\(x) keep_at(x, c("id", field))) |> 192 | enframe() |> 193 | unnest_wider(any_of("value")) |> 194 | tidyr::unnest_wider(any_of(field)) |> 195 | select(-any_of(c("name"))) 196 | } 197 | 198 | unfwvs <- function(l, field) { 199 | if (is.null(l$field)) return(tibble()) 200 | l |> map(\(x) keep_at(x, c("id", field))) |> 201 | enframe() |> 202 | unnest_wider(any_of("value")) |> 203 | tidyr::unnest_wider(any_of(field), names_sep = "_") |> 204 | select(-any_of(c("name"))) 205 | } 206 | 207 | unfw <- function(l, field) { 208 | if (is.null(l$field)) return(tibble()) 209 | l |> map(\(x) keep_at(x, c("id", field))) |> 210 | compact() |> 211 | map_df(tibble::as_tibble) |> 212 | tidyr::unnest_wider(any_of(field)) |> 213 | compact() 214 | } 215 | 216 | unfws <- function(l, field) { 217 | if (is.null(l$field)) return(tibble()) 218 | l |> map(\(x) keep_at(x, c("id", field))) |> 219 | 
compact() |> 220 | map_df(tibble::as_tibble) |> 221 | tidyr::unnest_wider(any_of(field), names_sep = "_") |> 222 | compact() 223 | } 224 | 225 | unfl <- function(l, field) { 226 | #has_field <- l |> map_lgl(\(x) field %in% names(x)) |> all() 227 | #if (!has_field) return(data.frame(0)) 228 | if (is.null(l$field)) return(tibble()) 229 | l |> map(\(x) keep_at(x, c("id", field))) |> 230 | compact() |> 231 | map_df(tibble::as_tibble) |> 232 | tidyr::unnest_longer(any_of(field)) |> 233 | compact() 234 | } 235 | 236 | pluck_with_id <- function(x, field) { 237 | if (!pluck_exists(x, field)) return (NULL) 238 | c(id = pluck(x, "id"), pluck(x, field)) 239 | } 240 | 241 | w <- object 242 | 243 | colz <- 244 | w$results |> 245 | map(\(x) tibble(cols = names(x), l = lengths(x)) |> 246 | tidyr::pivot_wider(names_from = "cols", values_from = "l") 247 | ) |> 248 | bind_rows() |> 249 | summarize(across(everything(), max)) |> 250 | ungroup() |> 251 | tidyr::pivot_longer(cols = everything()) 252 | 253 | one_to_one <- colz |> filter(value == 1, name != "versions") |> pull(name) 254 | 255 | # workz <- 256 | # w$results |> 257 | # map(\(x) x[one_to_one] |> compact() |> as_tibble()) |> 258 | # bind_rows() 259 | 260 | plf <- function(o, f) { 261 | l <- o |> map(\(x) purrr::pluck(x, f)) |> unlist() 262 | list(l) |> setNames(nm = f) 263 | } 264 | 265 | # TODO: remove keep_empty(?) 266 | wide <- enframe(w) |> unnest_longer(2, keep_empty = TRUE) |> unnest_wider(2) 267 | 268 | workz <- 269 | wide |> select(any_of(one_to_one)) 270 | 271 | ids <- 272 | wide |> select(work_id = id, any_of(c("ids"))) |> unnest_wider(any_of(c("ids"))) 273 | 274 | re_ids <- paste0( 275 | "(https://openalex.org/)|(https://doi.org/)|", 276 | "(https://pubmed.ncbi.nlm.nih.gov/)|(https://www.ncbi.nlm.nih.gov/pmc/articles/)|", 277 | "(https://www.wikidata.org/wiki/)|(https://orcid.org/)|(https://ror.org/)" 278 | ) 279 | 280 | fuw <- function(fields) { 281 | wide |> select(work_id = "id", any_of(c(fields))) |> 282 | unnest_wider(any_of(c(fields)), names_sep = "_") |> 283 | # unnest_wider(any_of(c(fields))) |> 284 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 285 | } 286 | 287 | authorships <- 288 | wide |> select(work_id = "id", any_of("authorships")) |> 289 | unnest_longer(2) |> unnest_wider(2) |> 290 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) 291 | 292 | authorships_affiliations_raw <- 293 | authorships |> 294 | select(any_of(c("work_id", "author_id", "raw_author_name", "raw_affiliation_strings"))) |> 295 | unnest_longer("raw_affiliation_strings") |> 296 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) 297 | 298 | authorships_affiliations <- 299 | authorships |> 300 | select(any_of(c("work_id", "affiliations", "author"))) |> 301 | unnest_longer(any_of("affiliations")) |> 302 | unnest_wider(any_of("affiliations"), names_sep = "_") |> 303 | unnest_longer(any_of("affiliations_institution_ids")) |> 304 | unnest_wider(any_of("author"), names_sep = "_") |> 305 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) |> 306 | distinct() 307 | 308 | authorships_authors <- 309 | authorships |> 310 | unnest_wider(any_of("author"), names_sep = "_") |> 311 | select(-any_of(c("institutions", "affiliations", "raw_author_name", "raw_affiliation_strings", "countries"))) |> 312 | # unnest_wider(any_of(c("countries")), names_sep = "_") |> 313 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) 314 | 315 | authorships_institutions <- 316 | authorships |> 317 | select(any_of(c("work_id", "institutions", 
"author"))) |> 318 | unnest_longer(any_of("institutions")) |> 319 | unnest_wider("institutions", names_sep = "_") |> 320 | unnest_longer("institutions_lineage") |> 321 | unnest_wider(any_of(c("author")), names_sep = "_") |> 322 | mutate(across(everything(), \(x) gsub(re_ids, "", x))) 323 | 324 | fields <- c( 325 | "ids", "open_access", "apc_list", "apc_paid", 326 | "citation_normalized_percentile", "cited_by_percentile_year", 327 | "biblio" 328 | ) 329 | 330 | fields <- fields[which(fields %in% unique(colz$name))] 331 | various <- fields |> map(fuw) |> set_names(fields) 332 | 333 | fields2 <- c("counts_by_year", "grants", "mesh") 334 | fields2 <- fields2[which(fields2 %in% unique(colz$name))] 335 | 336 | bcbr <- function(field) { 337 | w$results |> map_dfr(\(x) bind_cols(work_id = x$id, bind_rows(x |> getElement(field)))) |> 338 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 339 | } 340 | 341 | various2 <- fields2 |> map(bcbr) |> set_names(fields2) 342 | 343 | fields3 <- c( 344 | "sustainable_development_goals", 345 | "keywords", 346 | "concepts"# 347 | #"datasets" 348 | ) 349 | 350 | fields3 <- fields3[which(fields3 %in% unique(colz$name))] 351 | 352 | various3 <- 353 | fields3 |> map(bcbr) |> set_names(fields3) 354 | 355 | datasets <- 356 | wide |> select(id, datasets) |> unnest(datasets) |> unnest(datasets) |> 357 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 358 | 359 | fields4 <- c( 360 | "referenced_works", 361 | "related_works", 362 | "indexed_in", 363 | "corresponding_institution_ids", 364 | "corresponding_author_ids"#, 365 | # "abstract_inverted_index" 366 | ) 367 | 368 | fields4 <- fields4[which(fields4 %in% unique(colz$name))] 369 | 370 | bcbv <- function(field) { 371 | w$results |> map_dfr(\(x) bind_cols(work_id = x$id, rw = unlist(x |> getElement(field)))) |> 372 | setNames(nm = c("work_id", field)) |> 373 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 374 | } 375 | 376 | various4 <- 377 | fields4 |> map(bcbv)|> set_names(nm = fields4) 378 | 379 | aii_to_abstract <- function(aii) { 380 | 381 | value <- NULL 382 | 383 | abstract <- 384 | aii |> enframe() |> 385 | unnest_longer(any_of(c("value"))) |> 386 | arrange(-desc(value)) |> 387 | pull(any_of(c("name"))) |> 388 | paste0(collapse = " ") 389 | 390 | if (!nzchar(abstract)) 391 | return (NA_character_) 392 | 393 | return (abstract) 394 | 395 | } 396 | 397 | abstracts <- 398 | w$results |> 399 | map(function(x) tibble( 400 | work_id = pluck(x, "id"), 401 | abstract = aii_to_abstract(pluck(x, "abstract_inverted_index")) 402 | )) |> 403 | map_dfr(bind_rows) 404 | 405 | primary_location <- 406 | "primary_location" |> fuw() 407 | 408 | primary_location_source <- 409 | primary_location |> select(any_of(c("work_id", "primary_location_source"))) |> 410 | mutate(primary_location_source = map(primary_location_source, 411 | \(x) eval(parse(text = x)))) |> 412 | mutate(primary_location_source = map(primary_location_source, 413 | \(x) compact(x) |> enframe() |> pivot_wider())) |> #|> as_tibble())) |> 414 | #pull(primary_location_source) |> head(1) 415 | unnest(2) |> 416 | unnest_longer(any_of("issn")) |> 417 | unnest(any_of(everything())) |> 418 | unnest_wider(any_of(c("host_organization_lineage")), names_sep = "_") 419 | 420 | primary_location <- 421 | primary_location |> select(-any_of("primary_location_source")) 422 | 423 | primary_topic <- 424 | "primary_topic" |> fuw() |> 425 | mutate(across(any_of( 426 | c("primary_topic_subfield", "primary_topic_field", "primary_topic_domain")), 427 | \(y) 
y |> map(\(x) eval(parse(text = x)))) 428 | ) |> 429 | mutate(across(any_of( 430 | c("primary_topic_subfield", "primary_topic_field", "primary_topic_domain")), 431 | \(y) y |> map(\(x) compact(x) |> as_tibble())) 432 | ) |> 433 | unnest("primary_topic_subfield", names_sep = "_") |> 434 | unnest("primary_topic_field", names_sep = "_") |> 435 | unnest("primary_topic_domain", names_sep = "_") 436 | 437 | topics <- 438 | wide |> select(any_of(c("id", "topics"))) |> 439 | unnest(topics) |> 440 | unnest_wider(topics, names_sep = "_") |> 441 | unnest_wider(any_of("topics_field"), names_sep = "_") |> 442 | unnest_wider(any_of("topics_subfield"), names_sep = "_") |> 443 | unnest_wider(any_of("topics_domain"), names_sep = "_") |> 444 | compact() |> 445 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 446 | 447 | best_oa_location <- 448 | "best_oa_location" |> fuw() 449 | 450 | best_oa_location_source <- 451 | best_oa_location |> 452 | select(any_of(c("work_id", "best_oa_location_source"))) |> 453 | mutate(best_oa_location_source = map(best_oa_location_source, 454 | \(x) eval(parse(text = x)))) |> 455 | mutate(best_oa_location_source = map(best_oa_location_source, 456 | \(x) compact(x) |> enframe() |> pivot_wider())) |> #|> as_tibble())) |> 457 | unnest(2) |> 458 | unnest_longer(any_of("issn")) |> 459 | unnest(any_of(everything())) |> 460 | compact() |> 461 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 462 | 463 | best_oa_location <- 464 | best_oa_location |> select(-any_of(c("best_oa_location_source"))) 465 | 466 | locations <- 467 | wide |> select(any_of(c("id", "locations"))) |> 468 | unnest(any_of(c("locations"))) |> 469 | unnest_wider(any_of(c("locations"))) |> 470 | unnest_wider(any_of(c("source")), names_sep = "_") |> 471 | #w$results |> unfw("locations") |> 472 | #unnest_wider(any_of("source"), names_sep = "_") |> 473 | unnest_longer(any_of("source_issn")) |> 474 | unnest_longer(any_of(c("source_host_organization_lineage", "source_host_organization_lineage_names"))) |> 475 | compact() |> 476 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 477 | 478 | c( 479 | list(work = workz), 480 | list(abstracts = abstracts), 481 | list(authorships_affiliations_raw = authorships_affiliations_raw), 482 | list(authorships_affiliations = authorships_affiliations), 483 | list(authorships_authors = authorships_authors), 484 | list(authorships_institutions = authorships_institutions), 485 | list(datasets = datasets), 486 | various, various2, various3, various4, 487 | list( 488 | primary_location = primary_location, 489 | primary_location_source = primary_location_source, 490 | best_oa_location = best_oa_location, 491 | best_oa_location_source = best_oa_location_source, 492 | locations = locations, 493 | primary_topic = primary_topic, 494 | topics = topics 495 | ) 496 | ) 497 | 498 | } 499 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | #' @param lhs A value or the magrittr placeholder. 12 | #' @param rhs A function call using the magrittr semantics. 13 | #' @return The result of calling `rhs(lhs)`. 
14 | NULL
15 | -------------------------------------------------------------------------------- /README.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 | 
5 | 
6 | 
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 |   collapse = TRUE,
10 |   comment = "#>"
11 | )
12 | ```
13 | 
14 | # openalex
15 | 
16 | 
17 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
18 | [![R-CMD-check](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml)
19 | 
20 | 
21 | The goal of `openalex` is to provide access to data from [OpenAlex](https://openalex.org) - an open and comprehensive catalog of scholarly papers, authors, institutions and more ... - to R through the [Open Alex REST API](https://docs.openalex.org/api)...
22 | 
23 | ## Installation
24 | 
25 | You can install the current version of `openalex` from [GitHub](https://github.com/kth-library/openalex) with:
26 | 
27 | ``` r
28 | #install.packages("devtools")
29 | devtools::install_github("kth-library/openalex", dependencies = TRUE)
30 | ```
31 | 
32 | ## Example
33 | 
34 | This is a basic example which shows you how to get information for papers and authors:
35 | 
36 | ```{r example, eval=TRUE}
37 | 
38 | library(openalex)
39 | library(dplyr)
40 | suppressPackageStartupMessages(library(purrr))
41 | library(knitr)
42 | 
43 | iid <-
44 |   openalex:::openalex_autocomplete(
45 |     query = "Royal Institute of Technology",
46 |     entity_type = "institution",
47 |     format = "table") |>
48 |   head(1) |>
49 |   pull("id")
50 | 
51 | data <-
52 |   openalex_crawl(entity = "works", verbose = TRUE, fmt = "tables",
53 |     query = openalex:::openalex_query(filter =
54 |       sprintf("institutions.id:%s,publication_year:2025", iid)))
55 | 
56 | res <- data |> map(head)  # return only first six rows from each table
57 | 
58 | res
59 | ```
60 | 
61 | ## Rate limits and using an API key
62 | 
63 | By providing an email address you enter the "polite pool", which is subject to less aggressive rate limiting for API requests.
64 | 
65 | You can provide it in `~/.Renviron` by setting `OPENALEX_USERAGENT=http://github.com/hadley/httr (mailto:your_email@your_institution.org)`.
66 | 
67 | You can also set it just for the session by using the helper function `openalex_polite()` to temporarily set or unset the email used in the user agent string when making API requests:
68 | 
69 | ```{r polite}
70 | library(openalex)
71 | 
72 | # set an email to use for the session
73 | 
74 | openalex_polite("you@example.com")
75 | 
76 | # unset, and use default user agent string...
77 | 
78 | openalex_polite("")
79 | 
80 | ```
81 | 
82 | A premium subscription API key can be used by setting `OPENALEX_KEY=secret_premium_api_key` in your `.Renviron`, or temporarily in a session using:
83 | 
84 | ```{r premium, eval = FALSE}
85 | library(openalex)
86 | 
87 | # temporarily use a premium subscription API key
88 | openalex_key("secret_premium_api_key")
89 | 
90 | # unset to not use the premium subscription API key
91 | openalex_key("")
92 | 
93 | ```
94 | 
95 | This makes it possible to issue API calls that return the latest available records, for example based on recent creation dates or recent last-modification timestamps.
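Under the hood, the convenience functions used below simply combine a raw affiliation search with a `from_created_date` or `from_updated_date` filter. A manual equivalent built directly with `openalex_crawl()` might look like this (a sketch: the institution id and the date are placeholder values, and `OPENALEX_KEY` must be set):

``` r
# hypothetical one-off query: works created since a given date
openalex_crawl("works", fmt = "tables",
  query = openalex:::openalex_query(
    filter = "institutions.id:I86987016,from_created_date:2025-01-01"))
```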
96 | 97 | ```{r updates, eval = TRUE} 98 | 99 | # we do not require an API key for the publish date 100 | published_since_ <- openalex_works_published_since(since_days = 7) 101 | 102 | # but an API key is needed when using "from_created_date" and "from_updated_date" fields. 103 | created_since_7d <- openalex_works_created_since(since_days = 7) 104 | updated_since_1h <- openalex_works_updated_since(since_minutes = 60) 105 | 106 | # first few rows of each of these retrievals 107 | created_since_7d |> _$work_ids |> head() |> knitr::kable() 108 | updated_since_1h |> _$work_ids |> head() |> knitr::kable() 109 | 110 | ``` 111 | 112 | 113 | ## Data source attribution 114 | 115 | When data from `openalex` is displayed publicly, this attribution also needs to be displayed: 116 | 117 | ```{r attribution} 118 | library(openalex) 119 | openalex_attribution() 120 | ``` 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # openalex 5 | 6 | 7 | 8 | [![Lifecycle: 9 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) 10 | [![R-CMD-check](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml) 11 | 12 | 13 | The goal of `openalex` is to provide access to data from 14 | [OpenAlex](https://openalex.org) - an open and comprehensive catalog of 15 | scholarly papers, authors, institutions and more … - to R through the 16 | [Open Alex REST API](https://docs.openalex.org/api)… 17 | 18 | ## Installation 19 | 20 | You can install the current version of `openalex` from 21 | [GitHub](https://github.com/kth-library/openalex) with: 22 | 23 | ``` r 24 | #install.packages("devtools") 25 | devtools::install_github("kth-library/openalex", dependencies = TRUE) 26 | ``` 27 | 28 | ## Example 29 | 30 | This is a basic example which shows you how to get information for 31 | papers and authors: 32 | 33 | ``` r 34 | 35 | library(openalex) 36 | library(dplyr) 37 | #> 38 | #> Attaching package: 'dplyr' 39 | #> The following objects are masked from 'package:stats': 40 | #> 41 | #> filter, lag 42 | #> The following objects are masked from 'package:base': 43 | #> 44 | #> intersect, setdiff, setequal, union 45 | suppressPackageStartupMessages(library(purrr)) 46 | library(knitr) 47 | 48 | iid <- 49 | openalex:::openalex_autocomplete( 50 | query = "Royal Institute of Technology", 51 | entity_type = "institution", 52 | format = "table") |> 53 | head(1) |> 54 | pull("id") 55 | #> Requesting url: https://api.openalex.org/autocomplete/institutions?q=Royal%20Institute%20of%20Technology 56 | 57 | data <- 58 | openalex_crawl(entity = "works", verbose = TRUE, fmt = "tables", 59 | query = openalex:::openalex_query(filter = 60 | sprintf("institutions.id:%s,publication_year:2025", iid))) 61 | #> About to crawl a total of 11 pages of results with a total of 257 records. 
62 | #> ■■■■■■■■■ 27% | ETA: 3s 63 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 91% | ETA: 0s 64 | 65 | res <- data |> map(head) # return only first six rows from each table 66 | 67 | res 68 | #> $work 69 | #> # A tibble: 6 × 13 70 | #> id doi display_name title publication_year publication_date type 71 | #> 72 | #> 1 W4405989080 https:… Molecular b… Mole… 2025 2025-01-01 arti… 73 | #> 2 W4406001145 https:… Tracking th… Trac… 2025 2025-01-02 arti… 74 | #> 3 W4406016239 https:… DGCR2 targe… DGCR… 2025 2025-01-02 arti… 75 | #> 4 W4406082819 https:… Screening a… Scre… 2025 2025-01-05 arti… 76 | #> 5 W4406172907 https:… Static deep… Stat… 2025 2025-01-01 arti… 77 | #> 6 W4406435778 https:… Consistent,… Cons… 2025 2025-01-16 arti… 78 | #> # ℹ 6 more variables: cited_by_count , is_retracted , 79 | #> # is_paratext , updated_date , cited_by_api_url , 80 | #> # created_date 81 | #> 82 | #> $work_ids 83 | #> # A tibble: 6 × 3 84 | #> work_id doi pmid 85 | #> 86 | #> 1 W4405989080 https://doi.org/10.1016/j.cell.2024.11.036 https://pubmed.ncbi.… 87 | #> 2 W4406001145 https://doi.org/10.1038/s41467-024-55688-8 https://pubmed.ncbi.… 88 | #> 3 W4406016239 https://doi.org/10.1038/s41598-024-84574-y https://pubmed.ncbi.… 89 | #> 4 W4406082819 https://doi.org/10.1186/s12896-024-00926-6 https://pubmed.ncbi.… 90 | #> 5 W4406172907 https://doi.org/10.1063/5.0248856 91 | #> 6 W4406435778 https://doi.org/10.21468/scipostphyscodeb.45 92 | #> 93 | #> $work_concepts 94 | #> # A tibble: 6 × 6 95 | #> work_id id wikidata display_name level score 96 | #> 97 | #> 1 W4405989080 C86803240 https://www.wikidata.org/wiki… Biology 0 0.928 98 | #> 2 W4405989080 C170493617 https://www.wikidata.org/wiki… Receptor 2 0.615 99 | #> 3 W4405989080 C70721500 https://www.wikidata.org/wiki… Computation… 1 0.466 100 | #> 4 W4405989080 C12426560 https://www.wikidata.org/wiki… Basis (line… 2 0.455 101 | #> 5 W4405989080 C135285700 https://www.wikidata.org/wiki… G protein-c… 3 0.449 102 | #> 6 W4405989080 C95444343 https://www.wikidata.org/wiki… Cell biology 1 0.389 103 | #> 104 | #> $work_authorships_institutions 105 | #> # A tibble: 6 × 12 106 | #> work_id id display_name ror country_code type lineage wa_countries 107 | #> 108 | #> 1 W4405989080 I18067… University … http… US fund… 109 | #> 2 W4405989080 I18067… University … http… US fund… 110 | #> 3 W4405989080 I11402… University … http… US fund… 111 | #> 4 W4405989080 I42101… National In… http… US fund… 112 | #> 5 W4405989080 I28001… Science for… http… SE fund… 113 | #> 6 W4405989080 I86987… KTH Royal I… http… SE fund… 114 | #> # ℹ 4 more variables: wa_is_corresponding , wa_raw_author_name , 115 | #> # wa_raw_affiliation_strings , wa_affiliations 116 | #> 117 | #> $work_abstract_inverted_index 118 | #> # A tibble: 6 × 3 119 | #> work_id aii_value aii_key 120 | #> 121 | #> 1 W4405989080 122 | #> 2 W4406001145 0 1 8 15 19 23 44 60 81 86 90 99 106 123 127 138 149 166 … Regula… 123 | #> 3 W4406016239 124 | #> 4 W4406082819 0 1 2 252 279 3 26 69 77 114 118 134 142 164 193 198 211 … Abstra… 125 | #> 5 W4406172907 0 1 88 2 3 12 4 5 6 7 8 58 9 59 10 60 11 13 14 15 20 35 6… As com… 126 | #> 6 W4406435778 0 1 55 67 2 3 4 51 87 5 6 7 64 83 96 103 8 9 17 23 44 50 … Histog… 127 | #> 128 | #> $work_authorships_author 129 | #> # A tibble: 6 × 10 130 | #> work_id wa_author_position id display_name orcid wa_countries 131 | #> 132 | #> 1 W4405989080 first A5035848333 Matthew K. 
Howa… http… 133 | #> 2 W4405989080 middle A5106732468 Nick Hoppe http… 134 | #> 3 W4405989080 middle A5089436626 Xi‐Ping Huang http… 135 | #> 4 W4405989080 middle A5063488695 Darko Mitrovic http… 136 | #> 5 W4405989080 middle A5036507080 Christian B. Bi… http… 137 | #> 6 W4405989080 middle A5080561155 Christian B. Ma… http… 138 | #> # ℹ 4 more variables: wa_is_corresponding , wa_raw_author_name , 139 | #> # wa_raw_affiliation_strings , wa_affiliations 140 | #> 141 | #> $work_biblio 142 | #> # A tibble: 6 × 5 143 | #> work_id volume issue first_page last_page 144 | #> 145 | #> 1 W4405989080 NA 146 | #> 2 W4406001145 16 1 147 | #> 3 W4406016239 15 1 148 | #> 4 W4406082819 25 1 149 | #> 5 W4406172907 37 1 150 | #> 6 W4406435778 NA 151 | #> 152 | #> $work_open_access 153 | #> # A tibble: 6 × 5 154 | #> work_id is_oa oa_status oa_url any_repository_has_f…¹ 155 | #> 156 | #> 1 W4405989080 TRUE green https://doi.org/10.1101/20… TRUE 157 | #> 2 W4406001145 TRUE gold https://doi.org/10.1038/s4… FALSE 158 | #> 3 W4406016239 TRUE gold https://doi.org/10.1038/s4… FALSE 159 | #> 4 W4406082819 TRUE gold https://doi.org/10.1186/s1… FALSE 160 | #> 5 W4406172907 TRUE hybrid https://doi.org/10.1063/5.… FALSE 161 | #> 6 W4406435778 TRUE hybrid https://doi.org/10.21468/s… TRUE 162 | #> # ℹ abbreviated name: ¹​any_repository_has_fulltext 163 | #> 164 | #> $work_host_venue 165 | #> # A tibble: 6 × 1 166 | #> work_id 167 | #> 168 | #> 1 W4405989080 169 | #> 2 W4406001145 170 | #> 3 W4406016239 171 | #> 4 W4406082819 172 | #> 5 W4406172907 173 | #> 6 W4406435778 174 | #> 175 | #> $work_counts_by_year 176 | #> # A tibble: 6 × 3 177 | #> work_id year cited_by_count 178 | #> 179 | #> 1 W4405989080 2025 1 180 | #> 2 W4406001145 2025 1 181 | #> 3 W4406082819 2025 1 182 | #> 4 W4406172907 2025 1 183 | #> 5 W4406435778 2025 1 184 | #> 6 W4406435863 2025 1 185 | #> 186 | #> $work_related_works 187 | #> # A tibble: 6 × 2 188 | #> work_id related_works 189 | #> 190 | #> 1 W4405989080 W4391375266 191 | #> 2 W4405989080 W4224216382 192 | #> 3 W4405989080 W416861399 193 | #> 4 W4405989080 W3195483439 194 | #> 5 W4405989080 W3011298851 195 | #> 6 W4405989080 W2609050007 196 | #> 197 | #> $work_referenced_works 198 | #> # A tibble: 6 × 2 199 | #> work_id referenced_works 200 | #> 201 | #> 1 W4405989080 W1031578623 202 | #> 2 W4405989080 W1483147211 203 | #> 3 W4405989080 W1503765703 204 | #> 4 W4405989080 W1513618424 205 | #> 5 W4405989080 W1833104430 206 | #> 6 W4405989080 W189880865 207 | ``` 208 | 209 | ## Rate limits and using an API key 210 | 211 | By providing an email address you enter the “polite pool” which provides 212 | even less of rate limiting for API requests. 213 | 214 | You can provide it in `~/.Renviron` by setting 215 | `OPENALEX_USERAGENT=http://github.com/hadley/httr 216 | (mailto:your_email@your_institution.org)`. 217 | 218 | You can also set it just for the session by using a helper fcn 219 | `openalex_polite()` to temporarily set or unset the email used in the 220 | user agent string when making API requests: 221 | 222 | ``` r 223 | library(openalex) 224 | 225 | # set an email to use for the session 226 | 227 | openalex_polite("you@example.com") 228 | #> Hint: You can provide an email to enter the polite pool 229 | #> To have the setting stick persistently using .Renviron, do ... 
230 | #> file.edit("~/.Renviron") 231 | #> # and add a line OPENALEX_USERAGENT="http://github.com/hadley/httr (mailto:you@example.com)" 232 | #> Then reload settings for the R environment in the current session 233 | #> readRenviron("~/.Renviron") 234 | #> Temporarily setting OPENALEX_USERAGENT envvar for this session to: http://github.com/hadley/httr (mailto:you@example.com) 235 | #> [1] TRUE 236 | 237 | # unset, and use default user agent string... 238 | 239 | openalex_polite("") 240 | #> Exiting from polite pool, email no longer provided in user agent header 241 | #> [1] FALSE 242 | ``` 243 | 244 | A premium subscription API key can be used by setting 245 | `OPENALEX_KEY=secret_premium_api_key` in your `.Renviron`, or 246 | temporarily in a session using: 247 | 248 | ``` r 249 | library(openalex) 250 | 251 | # temporarily use a premium subscription API key 252 | openalex_key("secret_premium_api_key") 253 | 254 | # unset to not use the premium subscription API key 255 | openalex_key("") 256 | ``` 257 | 258 | This will make it possible to make API calls that return the latest 259 | available records, for example based on recent creation dates or recent 260 | last modification timestamps. 261 | 262 | ``` r 263 | 264 | # we do not require an API key for the publish date 265 | published_since_ <- openalex_works_published_since(since_days = 7) 266 | #> About to crawl a total of 1 pages of results with a total of 21 records. 267 | #> Warning: `type_convert()` only converts columns of type 'character'. 268 | #> - `df` has no columns of type 'character' 269 | 270 | # but an API key is needed when using "from_created_date" and "from_updated_date" fields. 271 | created_since_7d <- openalex_works_created_since(since_days = 7) 272 | #> About to crawl a total of 2 pages of results with a total of 44 records. 273 | #> Warning: `type_convert()` only converts columns of type 'character'. 274 | #> - `df` has no columns of type 'character' 275 | updated_since_1h <- openalex_works_updated_since(since_minutes = 60) 276 | #> About to crawl a total of 18 pages of results with a total of 442 records. 
277 | #> ■■■ 6% | ETA: 32s 278 | #> ■■■■ 11% | ETA: 35s 279 | #> ■■■■■■ 17% | ETA: 31s 280 | #> ■■■■■■■■■ 28% | ETA: 28s 281 | #> ■■■■■■■■■■■ 33% | ETA: 26s 282 | #> ■■■■■■■■■■■■■■ 44% | ETA: 21s 283 | #> ■■■■■■■■■■■■■■■■ 50% | ETA: 20s 284 | #> ■■■■■■■■■■■■■■■■■■ 56% | ETA: 17s 285 | #> ■■■■■■■■■■■■■■■■■■■■■ 67% | ETA: 13s 286 | #> ■■■■■■■■■■■■■■■■■■■■■■■ 72% | ETA: 11s 287 | #> ■■■■■■■■■■■■■■■■■■■■■■■■ 78% | ETA: 9s 288 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■ 83% | ETA: 7s 289 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 89% | ETA: 5s 290 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 94% | ETA: 2s 291 | 292 | # first few rows of each of these retrievals 293 | created_since_7d |> _$work_ids |> head() |> knitr::kable() 294 | ``` 295 | 296 | | work\_id | doi | pmid | 297 | | :---------- | :------------------------------------------- | :----------------------------------------- | 298 | | W4407223209 | | NA | 299 | | W4407212560 | | NA | 300 | | W4407347106 | | NA | 301 | | W4407235267 | | NA | 302 | | W4407203234 | | | 303 | | W4407277358 | | NA | 304 | 305 | ``` r 306 | updated_since_1h |> _$work_ids |> head() |> knitr::kable() 307 | ``` 308 | 309 | | work\_id | doi | mag | pmid | pmcid | 310 | | :---------- | :----------------------------------------------- | ---------: | :----------------------------------------- | :---- | 311 | | W2010417920 | | 2010417920 | | NA | 312 | | W2076716399 | | 2076716399 | NA | NA | 313 | | W2130306081 | | 2130306081 | NA | NA | 314 | | W2153980567 | | 2153980567 | NA | NA | 315 | | W2131628642 | | 2131628642 | NA | NA | 316 | | W2045503958 | | 2045503958 | NA | NA | 317 | 318 | ## Data source attribution 319 | 320 | When data from `openalex` is displayed publicly, this attribution also 321 | needs to be displayed: 322 | 323 | ``` r 324 | library(openalex) 325 | openalex_attribution() 326 | #> [1] "Data source: OpenAlex API at https://api.openalex.org/\nData license agreement: https://creativecommons.org/publicdomain/zero/1.0/" 327 | ``` 328 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://kth-library.github.io/openalex/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /data-raw/DATASET.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `DATASET` dataset goes here 2 | 3 | usethis::use_data(DATASET, overwrite = TRUE) 4 | -------------------------------------------------------------------------------- /data/topics.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KTH-Library/openalex/c8b22a4462be7e114898f79152ac9c1ce86ae1da/data/topics.rda -------------------------------------------------------------------------------- /man/openalex-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/openalex-package.R 3 | \docType{package} 4 | \name{openalex-package} 5 | \alias{openalex} 6 | \alias{openalex-package} 7 | \title{openalex: Data from OpenAlex REST API} 8 | \description{ 9 | The OpenAlex website provides open data on papers/works, venues, institutions and more around the world under the CC0 license. This R package provides some functions to access data from the OpenAlex REST API. 
10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://kth-library.github.io/openalex/} 15 | } 16 | 17 | } 18 | \author{ 19 | \strong{Maintainer}: Markus Skyttner \email{markussk@kth.se} 20 | 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/openalex_api.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_api} 4 | \alias{openalex_api} 5 | \title{Endpoint used for requests to OpenAlex API} 6 | \usage{ 7 | openalex_api() 8 | } 9 | \description{ 10 | Endpoint used for requests to OpenAlex API 11 | } 12 | -------------------------------------------------------------------------------- /man/openalex_attribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_attribution} 4 | \alias{openalex_attribution} 5 | \title{Attribution} 6 | \usage{ 7 | openalex_attribution() 8 | } 9 | \description{ 10 | Use this attribution whenever data from the API is publicly displayed 11 | } 12 | \details{ 13 | OpenAlex provides a RESTful API for scholarly papers, authors, 14 | institutions, and more. When publicly displaying data from the API, 15 | it is polite to point back to OpenAlex at https://openalex.org/ 16 | } 17 | -------------------------------------------------------------------------------- /man/openalex_counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_export.R 3 | \name{openalex_counts} 4 | \alias{openalex_counts} 5 | \title{Counts from OpenAlex} 6 | \usage{ 7 | openalex_counts( 8 | filter = openalex_filter_default(), 9 | dimensions = openalex_groupbys_default() 10 | ) 11 | } 12 | \arguments{ 13 | \item{filter}{a set of filter criteria, see the defaults in openalex_filter_default()} 14 | 15 | \item{dimensions}{a set of grouping dimensions, see the defaults in openalex_groupbys_default()} 16 | } 17 | \value{ 18 | a list of tibbles 19 | } 20 | \description{ 21 | Aggregates/counts can be retrieved using the group_bys query parameter 22 | } 23 | -------------------------------------------------------------------------------- /man/openalex_crawl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_crawl} 4 | \alias{openalex_crawl} 5 | \title{Crawl multiple pages of results} 6 | \usage{ 7 | openalex_crawl(entity, query, verbose = FALSE, fmt = "object") 8 | } 9 | \arguments{ 10 | \item{entity}{one of the values in openalex_entity_enum()} 11 | 12 | \item{query}{an openalex_query object} 13 | 14 | \item{verbose}{boolean to indicate whether to output messages during process} 15 | 16 | \item{fmt}{the return format, one of "object" or "tables"} 17 | } 18 | \value{ 19 | R object with results matching the query 20 | } 21 | \description{ 22 | Iterates over paged results showing a progress bar 23 | } 24 | -------------------------------------------------------------------------------- /man/openalex_doi_lookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by 
roxygen2: do not edit by hand 2 | % Please edit documentation in R/doi_lookup.R 3 | \name{openalex_doi_lookup} 4 | \alias{openalex_doi_lookup} 5 | \title{Lookup DOIs using OpenAlex} 6 | \usage{ 7 | openalex_doi_lookup(dois, resolution = c("all", "identifiers")) 8 | } 9 | \arguments{ 10 | \item{dois}{a character vector of DOIs} 11 | 12 | \item{resolution}{either "all" or "identifiers" to only return other related identifiers} 13 | } 14 | \value{ 15 | tibble(s) 16 | } 17 | \description{ 18 | Lookup DOIs using OpenAlex 19 | } 20 | -------------------------------------------------------------------------------- /man/openalex_flatten_long.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_flatten_long} 4 | \alias{openalex_flatten_long} 5 | \title{Flatten R object from deserialized nested JSON object} 6 | \usage{ 7 | openalex_flatten_long(nestedlist) 8 | } 9 | \arguments{ 10 | \item{nestedlist}{a nested list of lists} 11 | } 12 | \value{ 13 | a tibble in long format 14 | } 15 | \description{ 16 | Flatten R object from deserialized nested JSON object 17 | } 18 | -------------------------------------------------------------------------------- /man/openalex_key.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_key} 4 | \alias{openalex_key} 5 | \title{Use an API key for OpenAlex Premium Subscription} 6 | \usage{ 7 | openalex_key(key) 8 | } 9 | \arguments{ 10 | \item{key}{a premium subscription key} 11 | } 12 | \value{ 13 | a logical depending on whether key was set or unset 14 | } 15 | \description{ 16 | This provides access to the latest data, fresher than what snapshots provide. 17 | It also enables faster requests and filtering on from_created_date and from_updated_date fields. 18 | } 19 | \details{ 20 | Additional details... 21 | 22 | \url{https://github.com/ourresearch/openalex-api-tutorials/blob/main/notebooks/getting-started/premium.ipynb} 23 | \url{https://docs.openalex.org/api-entities/works/filter-works#from_created_date} 24 | \url{https://docs.openalex.org/api-entities/works/filter-works#from_updated_date} 25 | } 26 | \examples{ 27 | \dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} 28 | openalex_key("my_secret_api_key") 29 | openalex_key("") 30 | \dontshow{\}) # examplesIf} 31 | } 32 | -------------------------------------------------------------------------------- /man/openalex_kth_rawaff_query.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_kth_rawaff_query} 4 | \alias{openalex_kth_rawaff_query} 5 | \title{Example query when searching raw affiliation strings} 6 | \usage{ 7 | openalex_kth_rawaff_query() 8 | } 9 | \value{ 10 | string with query 11 | } 12 | \description{ 13 | This variant is specifically tailored for KTH, Royal Institute of Technology 14 | and includes some affiliation string variations which might be related. 
15 | }
16 | -------------------------------------------------------------------------------- /man/openalex_polite.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_polite}
4 | \alias{openalex_polite}
5 | \title{Enter the OpenAlex API polite pool for faster requests by providing an email}
6 | \usage{
7 | openalex_polite(email)
8 | }
9 | \arguments{
10 | \item{email}{an email address, of the form "you@example.com", or "" to unset the email}
11 | }
12 | \value{
13 | a logical depending on whether email was set or unset
14 | }
15 | \description{
16 | Enter the OpenAlex API polite pool for faster requests by providing an email
17 | }
18 | \examples{
19 | \dontrun{
20 | if(interactive()){
21 |  # to set
22 |  openalex_polite("you@example.com")
23 |  # to unset
24 |  openalex_polite("")
25 |  }
26 | }
27 | }
28 | -------------------------------------------------------------------------------- /man/openalex_topics.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_export.R
3 | \name{openalex_topics}
4 | \alias{openalex_topics}
5 | \title{Topics}
6 | \usage{
7 | openalex_topics()
8 | }
9 | \description{
10 | Table of current topics, subfields, fields and domains used at OpenAlex
11 | }
12 | -------------------------------------------------------------------------------- /man/openalex_work.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_work}
4 | \alias{openalex_work}
5 | \title{Retrieve work from OpenAlex REST API}
6 | \usage{
7 | openalex_work(identifier, format = "table", use_random = FALSE)
8 | }
9 | \arguments{
10 | \item{identifier}{string with identifier}
11 | 
12 | \item{format}{one of "table" or "object"}
13 | 
14 | \item{use_random}{logical to indicate whether to use a random identifier, Default: FALSE}
15 | }
16 | \value{
17 | as per format, either a tibble or an R object
18 | }
19 | \description{
20 | This function retrieves works given an identifier
21 | }
22 | \examples{
23 | \dontrun{
24 | openalex_work(use_random = TRUE)
25 | }
26 | }
27 | -------------------------------------------------------------------------------- /man/openalex_works_created_since.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_works_created_since}
4 | \alias{openalex_works_created_since}
5 | \title{Recently created works based on query for matching raw affiliations}
6 | \usage{
7 | openalex_works_created_since(
8 |   raw_search_criteria = openalex_kth_rawaff_query(),
9 |   since_days = 0
10 | )
11 | }
12 | \arguments{
13 | \item{raw_search_criteria}{raw affiliation string search criteria,
14 | by default openalex_kth_rawaff_query()}
15 | 
16 | \item{since_days}{integer indicating days back from today}
17 | }
18 | \value{
19 | list of tables with results
20 | }
21 | \description{
22 | This function requires a premium subscription API key to be set.
23 | }
24 | -------------------------------------------------------------------------------- /man/openalex_works_cursorcrawl.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cursor_crawl.R
3 | \name{openalex_works_cursorcrawl}
4 | \alias{openalex_works_cursorcrawl}
5 | \title{Crawl multipage responses from queries against the API}
6 | \usage{
7 | openalex_works_cursorcrawl(works_filter, n_max_pages = 5)
8 | }
9 | \arguments{
10 | \item{works_filter}{the works filter}
11 | 
12 | \item{n_max_pages}{the maximum number of pages to fetch (50 records per page)}
13 | }
14 | \value{
15 | paths to downloaded files
16 | }
17 | \description{
18 | Splits the request into chunks and uses cursor-based pagination to fetch works
19 | }
20 | -------------------------------------------------------------------------------- /man/openalex_works_export.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_export.R
3 | \name{openalex_works_export}
4 | \alias{openalex_works_export}
5 | \title{Use OpenAlex API for exporting data in tabular and wos formats}
6 | \usage{
7 | openalex_works_export(q, fmt = c("csv", "wos-plaintext"), raw_string = FALSE)
8 | }
9 | \arguments{
10 | \item{q}{the query, for example "authorships.institutions.lineage:i86987016,authorships.institutions.lineage:!i4210161097,type:types/article,primary_location.source.type:source-types/journal|source-types/conference,publication_year:2023"}
11 | 
12 | \item{fmt}{the export format, one of "csv", "wos-plaintext" or "wos-plaintext-diva"}
13 | 
14 | \item{raw_string}{boolean to indicate whether a raw string should be returned}
15 | }
16 | \value{
17 | a raw string (character vector) with the results from the export, or a data frame
18 | }
19 | \description{
20 | Use OpenAlex API for exporting data in tabular and wos formats
21 | }
22 | -------------------------------------------------------------------------------- /man/openalex_works_published_since.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_works_published_since}
4 | \alias{openalex_works_published_since}
5 | \title{Recently published works based on query for matching raw affiliations}
6 | \usage{
7 | openalex_works_published_since(
8 |   raw_search_criteria = openalex_kth_rawaff_query(),
9 |   since_days = 7
10 | )
11 | }
12 | \arguments{
13 | \item{raw_search_criteria}{raw affiliation string search criteria,
14 | by default openalex_kth_rawaff_query()}
15 | 
16 | \item{since_days}{integer indicating days back from today}
17 | }
18 | \value{
19 | list of tables with results
20 | }
21 | \description{
22 | Recently published works based on query for matching raw affiliations
23 | }
24 | -------------------------------------------------------------------------------- /man/openalex_works_updated_since.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_works_updated_since}
4 | \alias{openalex_works_updated_since}
5 | \title{Recently updated works based on query for matching raw affiliations}
6 | \usage{
7 | openalex_works_updated_since(
8 |   raw_search_criteria = 
openalex_kth_rawaff_query(),
9 |   since_minutes
10 | )
11 | }
12 | \arguments{
13 | \item{raw_search_criteria}{raw affiliation string search criteria,
14 | by default openalex_kth_rawaff_query()}
15 | 
16 | \item{since_minutes}{integer indicating minutes back from now}
17 | }
18 | \value{
19 | list of tables with results
20 | }
21 | \description{
22 | This function requires a premium subscription API key to be set.
23 | }
24 | -------------------------------------------------------------------------------- /man/openalex_write_duckdb.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_export.R
3 | \name{openalex_write_duckdb}
4 | \alias{openalex_write_duckdb}
5 | \title{Export the results from a crawl as a duckdb database file}
6 | \usage{
7 | openalex_write_duckdb(crawl, destdir = NULL)
8 | }
9 | \arguments{
10 | \item{crawl}{the results from running the openalex_works_to_tbls() function}
11 | 
12 | \item{destdir}{the location to save the database file}
13 | }
14 | \value{
15 | file path to the database file
16 | }
17 | \description{
18 | Export the results from a crawl as a duckdb database file
19 | }
20 | -------------------------------------------------------------------------------- /man/pipe.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils-pipe.R
3 | \name{\%>\%}
4 | \alias{\%>\%}
5 | \title{Pipe operator}
6 | \usage{
7 | lhs \%>\% rhs
8 | }
9 | \arguments{
10 | \item{lhs}{A value or the magrittr placeholder.}
11 | 
12 | \item{rhs}{A function call using the magrittr semantics.}
13 | }
14 | \value{
15 | The result of calling \code{rhs(lhs)}.
16 | }
17 | \description{
18 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 
19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/topics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/openalex-package.R 3 | \docType{data} 4 | \name{topics} 5 | \alias{topics} 6 | \title{Topics} 7 | \format{ 8 | A data frame with 4516 rows and 9 variables: 9 | \describe{ 10 | \item{\code{id_topic}}{character the id for the topic} 11 | \item{\code{topic}}{character description of topic} 12 | \item{\code{description}}{character long form description of this topic cluster} 13 | \item{\code{id_subfield}}{character the id for the subfield of this topic} 14 | \item{\code{subfield}}{character description of the subfield} 15 | \item{\code{id_field}}{character the id of the field} 16 | \item{\code{field}}{character description of the field} 17 | \item{\code{id_domain}}{character the id of the domain} 18 | \item{\code{domain}}{character description of the domain} 19 | } 20 | } 21 | \usage{ 22 | topics 23 | } 24 | \description{ 25 | Topics used by OpenAlex 26 | } 27 | \details{ 28 | DETAILS 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/wos_plaintext_for_diva.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_export.R 3 | \name{wos_plaintext_for_diva} 4 | \alias{wos_plaintext_for_diva} 5 | \title{Function which converts a wos_plaintext-string into a format 6 | which can be uploaded to DiVA, by adding ER tags 7 | (including a blank line) after each record} 8 | \usage{ 9 | wos_plaintext_for_diva(x) 10 | } 11 | \arguments{ 12 | \item{x}{character string with "wos-plaintext" format as returned from OpenAlex export API endpoint} 13 | } 14 | \description{ 15 | Function which converts a wos_plaintext-string into a format 16 | which can be uploaded to DiVA, by adding ER tags 17 | (including a blank line) after each record 18 | } 19 | -------------------------------------------------------------------------------- /openalex.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(openalex) 3 | 4 | test_check("openalex") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-crawl.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("crawl works (not cursor based) and results can be persisted in db", { 3 | 4 | skip_on_ci() 5 | 6 | my_filter <- paste0(collapse = ",", c( 7 | "authorships.institutions.lineage:i86987016", ## KTH 8 | "authorships.institutions.lineage:!i4210161097", ## 
Bolin Center
9 |     "authorships.institutions.lineage:!i119971240", ## NORDITA
10 |     "type:types/article",
11 |     "primary_location.source.type:source-types/journal|source-types/conference",
12 |     "publication_year:2025"
13 |   ))
14 | 
15 |   my_query <- openalex:::openalex_query(filter = my_filter)
16 |   works <- openalex_crawl("work", query = my_query, fmt = "object")
17 |   #readr::write_rds(works, "~/openalex-2023.rds")
18 | 
19 | 
20 |   # TODO: some error here!
21 |   library(purrr)
22 |   library(dplyr)
23 | 
24 |   lol <-
25 |     list(list(results = reduce(works |> map("results"), c)))
26 | 
27 |   my_works <-
28 |     lol |> openalex_works_to_tbls()
29 | 
30 |   is_valid <-
31 |     attr(works[[1]], "meta")$count == nrow(my_works$work)
32 | 
33 |   harvest <-
34 |     my_works |> map(\(x) x |> mutate(across(any_of(contains("id")),
35 |       \(y) gsub("https://openalex.org/", "", y, fixed = TRUE)))
36 |     )
37 | 
38 |   dump_path <- file.path(tempdir(), "openalex-2025.db")
39 |   harvest |> openalex_write_duckdb(dump_path)
40 |   message("Persisted dump at ", dump_path)
41 | 
42 |   expect_true(is_valid)
43 | 
44 | })
45 | 
46 | test_that("Similar topics can be retrieved given a work", {
47 | 
48 |   skip_on_ci()
49 | 
50 |   topics_filter <-
51 |     openalex_filter_similar_topics("W2168078104")
52 | 
53 |   my_filter <- list(filter = paste0(
54 |     # "publication_year:2024,",
55 |     "institution.id:I2799509149,",
56 |     topics_filter
57 |   ))
58 | 
59 |   works <- openalex_crawl("works", query = my_filter)
60 | 
61 |   lol <-
62 |     list(list(results = reduce(works |> map("results"), c)))
63 | 
64 |   my_works <-
65 |     lol |> openalex_works_to_tbls()
66 | 
67 |   is_valid <- my_works$work |> nrow() > 5
68 |   expect_true(is_valid)
69 | 
70 | })
71 | 
72 | test_that("Crawling several works related to a specific topic works", {
73 | 
74 |   skip_on_ci()
75 | 
76 |   q <-
77 |     list(
78 |       filter = paste0(collapse = ",", c(
79 |         "publication_year:2025",
80 |         "primary_topic.id:T10783"
81 |       ))
82 |     )
83 | 
84 |   works <- openalex_crawl("works", query = q, verbose = TRUE)
85 | 
86 |   message("JSON object size is ", format(object.size(works), "MB"))
87 | 
88 |   lol <-
89 |     list(list(results = reduce(works |> map("results"), c)))
90 | 
91 |   my_works <-
92 |     lol |> openalex_works_to_tbls()
93 | 
94 |   message("Tables object size is ", format(object.size(my_works), "MB"))
95 | 
96 |   message("Number of records: ", nrow(my_works$work))
97 | 
98 |   is_valid <-
99 |     attr(works[[1]], "meta")$count == nrow(my_works$work)
100 | 
101 |   #is_valid <- object.size(works) > 7000000
102 | 
103 |   expect_true(is_valid)
104 | 
105 | })
106 | 
107 | -------------------------------------------------------------------------------- /tests/testthat/test-cursorcrawl.R: --------------------------------------------------------------------------------
1 | test_that("cursor based paging for works works", {
2 | 
3 |   skip_on_ci()
4 | 
5 |   works_filter <- "publication_year:2015-2023,primary_topic.id:t10783"
6 | 
7 |   cc <-
8 |     works_filter |>
9 |     openalex_works_cursorcrawl(n_max_pages = 10)
10 | 
11 |   mydir <- unique(dirname(cc))
12 | 
13 | 
14 |   fns_max_size <- max(file.size(cc))
15 | 
16 |   read_jsonl <- function(fn) {
17 |     Sys.setenv("VROOM_CONNECTION_SIZE" = fns_max_size)
18 |     fn |> file() |> readr::read_lines() |>
19 |       RcppSimdJson::fparse(max_simplify_lvl = "list")
20 |   }
21 | 
22 |   ccs <- cc |> purrr::map(\(x) list(results = read_jsonl(x)))
23 | 
24 |   #object <- ccs[[1]]
25 |   #w |> openalex_works_to_tbls()
26 | 
27 |   res <-
28 |     cc |>
29 |     purrr::map(\(x) list(results = read_jsonl(x))) |>
30 |     openalex_works_to_tbls()
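  # res is a named list of tibbles (work, abstracts, authorships_authors, ...)
  # as assembled by parse_work2(); res$work holds one row per fetched work,
  # so 10 cursor pages of 50 records each should yield the 500 rows
  # asserted below.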
33 | 
34 |   is_valid <-
35 |     all(cc %in% dir(mydir, full.names = TRUE)) &
36 |     res$work |> nrow() == 500
37 | 
38 |   # TODO: Hmmm, what is a$abstract_inverted_index_v3?
39 |   # a <- jsonlite::stream_in(file(cc[1])) |> as_tibble()
40 | 
41 |   expect_true(is_valid)
42 | 
43 |   # cmd <- paste0("flatterer --force --nocsv --parquet -m works --id-prefix work -j ",
44 |   #   paste(collapse = " ", cc), " ", mydir, paste0(mydir, "/cursorcrawl"))
45 | 
46 |   # system(cmd)
47 | 
48 | })
--------------------------------------------------------------------------------
/tests/testthat/test-dois.R:
--------------------------------------------------------------------------------
1 | test_that("doi lookup works for 20 dois", {
2 | 
3 |   skip_on_ci()
4 | 
5 |   dois <- paste0("10.1016/j.aos.2023.101522, 10.1051/m2an/2024042, ",
6 |     "10.1016/j.heliyon.2024.e25125, 10.1145/3664476.3664508, ",
7 |     "10.23919/ECC64448.2024.10590962, 10.1109/TCNS.2023.3285863, ",
8 |     "10.23919/ECC64448.2024.10591128, 10.1007/s10570-023-05674-y, ",
9 |     "10.1109/APWC61918.2024.10701979, 10.1137/23M1587804, ",
10 |     "10.1109/FDL63219.2024.10673844, 10.1007/978-3-031-54776-8_12, ",
11 |     "10.1137/22M148968X, 10.1016/j.trc.2023.104454, 10.1108/ECON-10-2023-0163, ",
12 |     "10.1016/j.apenergy.2024.122690, 10.1038/s41467-023-44315-7, ",
13 |     "10.1109/TCI.2024.3463485, 10.1016/j.jobe.2024.110536, ",
14 |     "10.1007/s13721-024-00446-5") |>
15 |     strsplit(split = ", ") |> unlist()
16 | 
17 |   doi_filter <- function(dois) dois |> openalex_or()
18 | 
19 |   doi_filters <-
20 |     dois |>
21 |     split_chunks_of_n(50) |>
22 |     purrr::map(doi_filter)
23 | 
24 |   #doi_filters |> purrr::map(\(x) doi_lookup_identifiers(doi_filter = x))
25 | 
26 |   #ids <- openalex_doi_lookup(dois, "identifiers")
27 |   more <- openalex_doi_lookup(dois, "all")
28 | 
29 |   is_valid <-
30 |     #nrow(ids) == length(dois) &
31 |     nrow(more[[1]]$ids) == length(dois)
32 | 
33 |   expect_true(is_valid)
34 | 
35 | })
36 | 
--------------------------------------------------------------------------------
/tests/testthat/test-export.R:
--------------------------------------------------------------------------------
1 | test_that("export works", {
2 | 
3 |   skip()
4 | 
5 |   my_filter <- paste0(collapse = ",", c(
6 |     "authorships.institutions.lineage:i86987016",   ## KTH
7 |     "authorships.institutions.lineage:i4210161097", ## Bolin Center (some of these might be KTH pubs!)
8 |     "authorships.institutions.lineage:i119971240",  ## NORDITA (some of these, too!)
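9 |     # note: a lineage filter matches the given institution and any unit
10 |     # below it in OpenAlex's institution hierarchy, which is why works
11 |     # hosted by these centres can also be KTH works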
12 |     "authorships.institutions.lineage:i4210147696", ## THS Tekniska Högskolans Studentkår
13 |     "type:types/article",
14 |     "primary_location.source.type:source-types/journal|source-types/conference",
15 |     "publication_year:2025"
16 |   ))
17 | 
18 |   # TODO:
19 |   # for weekly import - include centres
20 |   # for retroactive import - exclude centres
21 | 
22 |   my_csv <- openalex_works_export(q = my_filter, fmt = "csv")
23 |   my_csv_string <- openalex_works_export(q = my_filter, fmt = "csv", raw_string = TRUE)
24 |   my_wos <- openalex_works_export(q = my_filter, fmt = "wos-plaintext")
25 |   my_wos_string <- openalex_works_export(q = my_filter, fmt = "wos-plaintext", raw_string = TRUE)
26 | 
27 |   is_valid <- nrow(my_csv) > 2000 && nchar(my_wos) > 0
28 |   expect_true(is_valid)
29 | })
30 | 
31 | test_that("export for diva in wos-plaintext format works", {
32 | 
33 |   skip_on_ci()
34 | 
35 |   my_filter <- paste0(collapse = ",", c(
36 |     sprintf("publication_year:%s", 2025),
37 |     sprintf("authorships.author.id:%s", "a5045975901") #,
38 |     # sprintf("raw_affiliation_strings.search:%s", openalex_kth_rawaff_query()),
39 |     # "authorships.institutions.lineage:!i86987016",   ## KTH
40 |     # "authorships.institutions.lineage:!i4210161097", ## Bolin Center (some of these might be KTH pubs!)
41 |     # "authorships.institutions.lineage:!i119971240",  ## NORDITA (some of these, too!)
42 |     # "authorships.institutions.lineage:!i4210147696"  ## THS Tekniska Högskolans Studentkår
43 |   ))
44 | 
45 |   gm <- openalex_works_export(q = my_filter, fmt = "wos-plaintext")
46 | 
47 |   #cat(gm)
48 | 
49 |   is_valid <-
50 |     (regmatches(gm, gregexpr("KTH", gm)) |> unlist()) |> length() > 10
51 | 
52 |   expect_true(is_valid)
53 | })
54 | 
55 | test_that("export of rawaff query for 2025 in wos-plaintext format for diva works", {
56 | 
57 |   skip_on_ci()
58 | 
59 |   my_filter <- paste0(collapse = ",", c(
60 |     sprintf("publication_year:%s", 2025),
61 |     sprintf("raw_affiliation_strings.search:%s", openalex_kth_rawaff_query()),
62 |     "authorships.institutions.lineage:!i86987016",   ## KTH
63 |     "authorships.institutions.lineage:!i4210161097", ## Bolin Center (some of these might be KTH pubs!)
64 |     "authorships.institutions.lineage:!i119971240",  ## NORDITA (some of these, too!)
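65 |     # a leading ! negates a filter value in the OpenAlex API, so these
66 |     # lineage filters exclude works already matched via the institution
67 |     # ids, in line with the retroactive-import TODO in the first test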
68 |     "authorships.institutions.lineage:!i4210147696"  ## THS Tekniska Högskolans Studentkår
69 |   ))
70 | 
71 |   extras <- openalex_works_export(q = my_filter, fmt = "wos-plaintext")
72 |   is_valid <- (regmatches(extras, gregexpr("ER", extras)) |> unlist()) |> length() > 0
73 |   expect_true(is_valid)
74 | })
75 | 
76 | test_that("export of rawaff query for 2025 in csv format works", {
77 | 
78 |   skip_on_ci()
79 | 
80 |   my_filter <- paste0(collapse = ",", c(
81 |     sprintf("publication_year:%s", 2025),
82 |     sprintf("raw_affiliation_strings.search:%s", openalex_kth_rawaff_query())
83 |   ))
84 | 
85 |   csv <- openalex_works_export(my_filter, "csv")
86 | 
87 |   #arrow::write_parquet(csv, "~/oa-2025-csv-export.parquet")
88 | 
89 |   is_valid <- nrow(csv) > 1
90 | 
91 |   expect_true(is_valid)
92 | 
93 | })
94 | 
95 | # TODO: Which conference papers in OpenAlex are linked to KTH (a DOI is not required)?
96 | # TODO: What do (autocomplete) searches on titles and conference names return?
97 | # Hypothesis: Scopus is strong on conferences, perhaps thanks to screen-scraping superpowers; how does OpenAlex compare?
98 | # TODO: How much "more" of this does one see with a Premium Key?
99 | 
100 | # tf <- openalex_counts(filter = "authorships.institutions.lineage:i86987016,publication_year:2025")
101 | 
102 | # tree <- tf[grepl("Topic", names(tf))]
103 | # topics <- openalex_topics()
104 | 
105 | # tree$`Primary Topic Domain Id` |>
106 | #   left_join(topics |> distinct(id_domain, domain), by = c(name = "domain")) |>
107 | #   rename(domain = "name")
108 | 
109 | # tree$`Primary Topic Field Id` |>
110 | #   left_join(topics |> distinct(id_field, field, id_domain, domain), by = c(name = "field")) |>
111 | #   rename(field = "name")
112 | 
113 | # tree$`Primary Topic Subfield Id` |>
114 | #   left_join(topics |> distinct(id_subfield, subfield, id_field, field, id_domain, domain), by = c(name = "subfield")) |>
115 | #   rename(subfield = "name")
116 | 
117 | # tree$`Primary Topic Id` |>
118 | #   left_join(topics, by = c(name = "topic")) |>
119 | #   rename(topic = "name")
120 | 
121 | # tt <-
122 | #   list(tree, names(tree)) |> purrr::pmap(\(x, y) x |> mutate(var = y) |> select(var, everything())) |>
123 | #   map_dfr(bind_rows) |>
124 | #   rename(display_name = name) |>
125 | #   left_join(topics, by = "display_name")
126 | 
127 | # proceed to make a treemap
--------------------------------------------------------------------------------
/tests/testthat/test-freeze.R:
--------------------------------------------------------------------------------
1 | test_that("converting some records to tables does not cause freeze", {
2 | 
3 |   skip_on_ci()
4 | 
5 |   works_filter <- paste0(
6 |     "publication_year:2015-2024,",
7 |     "primary_topic.subfield.id:subfields/3605,",
8 |     "authorships.countries:countries/se"
9 |   )
10 | 
11 |   cc <-
12 |     works_filter |>
13 |     openalex_works_cursorcrawl(n_max_pages = 28)
14 | 
15 |   mydir <- unique(dirname(cc))
16 | 
17 |   read_jsonl <- function(fn) {
18 |     fn |> file() |> readr::read_lines() |>
19 |       RcppSimdJson::fparse(max_simplify_lvl = "list")
20 |   }
21 | 
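22 |   # Each dump file holds one JSON work object per line, so read_lines()
23 |   # yields a character vector and fparse() parses every line into a plain
24 |   # R list, the shape openalex_works_to_tbls() expects under "results".
25 | 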
26 |   ccs <-
27 |     cc |> purrr::map(\(x) list(results = read_jsonl(x)))
28 | 
29 |   res <- ccs |> openalex_works_to_tbls()
30 | 
31 |   #object <- list(results = ccs[[4]]$results)
32 |   #object |> parse_work2()
33 | 
34 |   is_valid <-
35 |     all(cc %in% dir(mydir, full.names = TRUE)) &
36 |     res$work |> nrow() == ccs |> purrr::map_int(\(x) length(x$results)) |> sum()
37 | 
38 |   expect_true(is_valid)
39 | 
40 | })
41 | 
42 | test_that("converting some records to tables does not cause vroom error", {
43 | 
44 |   skip_on_ci()
45 | 
46 |   works_filter <- paste0(
47 |     "publication_year:2015-2024,",
48 |     "topics.subfield.id:subfields/3605,",
49 |     "authorships.countries:countries/se"
50 |   )
51 | 
52 |   cc <-
53 |     works_filter |>
54 |     openalex_works_cursorcrawl(n_max_pages = 27)
55 | 
56 |   mydir <- unique(dirname(cc))
57 |   fns_max_size <- max(file.size(cc))
58 | 
59 |   read_jsonl <- function(fn) {
60 |     Sys.setenv("VROOM_CONNECTION_SIZE" = fns_max_size)
61 |     fn |> file() |> readr::read_lines() |>
62 |       RcppSimdJson::fparse(max_simplify_lvl = "list")
63 |   }
64 | 
65 |   ccs <-
66 |     cc |> purrr::map(\(x) list(results = read_jsonl(x)))
67 | 
68 |   res <- ccs |> openalex_works_to_tbls()
69 | 
70 |   #ccs |> purrr::map(\(x) list(results = x) |> openalex_works_to_tbls())
71 | 
72 |   #i <- ceiling(0.77 * 26)
73 |   #x <- ccs[i]
74 |   #cc[i]
75 | 
76 |   #object <- list(results = ccs[[4]]$results)
77 |   #object |> parse_work2()
78 | 
79 |   is_valid <-
80 |     all(cc %in% dir(mydir, full.names = TRUE)) &
81 |     res$work |> nrow() == ccs |> purrr::map_int(\(x) length(x$results)) |> sum()
82 | 
83 |   expect_true(is_valid)
84 | })
85 | 
--------------------------------------------------------------------------------
/tests/testthat/test-open_alex_restclient.R:
--------------------------------------------------------------------------------
1 | # test_that("attribution works", {
2 | #   attribution <- capture.output(cat(openalex_attribution()))
3 | #   is_ok <- length(attribution) == 2 && nchar(attribution) > 0
4 | #   expect_true(is_ok)
5 | # })
6 | 
7 | test_that("fetching work works", {
8 | 
9 |   identifier <- "W1851956350"
10 | 
11 |   #is_ok <- identical(openalex_work(identifier), openalex_work(identifier))
12 |   expected_id <- paste0("https://openalex.org/", identifier)
13 | 
14 |   table_has_ok_id <-
15 |     subset(openalex_work(identifier), name == "id")$value ==
16 |     expected_id
17 | 
18 |   object_has_ok_id <-
19 |     openalex_work(identifier, format = "object")$ids$openalex ==
20 |     expected_id
21 | 
22 |   expect_true(table_has_ok_id && object_has_ok_id)
23 | 
24 | })
25 | 
26 | test_that("error 404 is returned when a work is not found", {
27 |   identifier <- "10.1038/nrn3241"
28 |   expect_error(openalex_work(identifier), "404")
29 | })
30 | 
31 | test_that("fetching random work works", {
32 |   random <- openalex_work(use_random = TRUE)
33 |   is_ok <- nrow(random) > 10
34 |   expect_true(is_ok)
35 | })
36 | 
37 | # openalex_entity_enum()
38 | #
39 | # openalex_list(entity = "works", query = openalex_query(page = 2))$meta
40 | # openalex_list(entity = "concepts", query = openalex_query(page = 2))$meta
41 | # openalex_list(entity = "institutions", query = openalex_query())$meta
42 | # openalex_list(entity = "venues", query = openalex_query())$meta
43 | # openalex_list(entity = "authors", query = openalex_query())$meta
44 | #
45 | # # works whose type is book
46 | # openalex_list("works", query = openalex_query(
47 | #   filter = "type:book")
48 | # )$meta
49 | #
50 | # # venues that host more than 1000 works:
51 | # openalex_list("venues", query = openalex_query(
52 | #   filter = "works_count:>1000")
53 | # )$meta
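54 | #
55 | # # a hypothetical extra example following the same pattern (country_code
56 | # # is an institutions filter): institutions based in Sweden
57 | # openalex_list("institutions", query = openalex_query(
58 | #   filter = "country_code:se")
59 | # )$meta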
60 | #
61 | # # US-based authors who have been cited at least once:
62 | # openalex_list("authors", query = openalex_query(
63 | #   filter = "last_known_institution.country_code:US,cited_by_count:>0")
64 | # )$meta
65 | #
66 | # # works matching a title search
67 | # openalex_list("works", query = openalex_query(
68 | #   filter = "title.search:'intensive treatment of diabetes'")
69 | # )$meta
70 | #
71 | # res <-
72 | #   openalex_crawl("works", query = openalex_query(
73 | #     filter = "title.search:'intensive treatment of diabetes'")
74 | #   )
75 | #
76 | # library(dplyr)
77 | # res %>% openalex_flatten_long() %>% count(name) %>% arrange(desc(n))
78 | 
79 | test_that("providing an email for the polite pool gives a faster response", {
80 | 
81 |   skip()
82 | 
83 |   # so the initial setting can be restored
84 |   initial <- Sys.getenv("OPENALEX_USERAGENT")
85 |   on.exit(Sys.setenv("OPENALEX_USERAGENT" = initial))
86 | 
87 |   # not polite
88 |   openalex_polite("")
89 |   tn <- system.time(
90 |     c1 <- openalex_crawl("works", verbose = TRUE,
91 |       query = openalex:::openalex_query(filter =
92 |         "institutions.id:I86987016,publication_year:2022"))
93 |   )[3]
94 | 
95 |   # polite
96 |   openalex_polite("markussk@kth.se")
97 |   tp <- system.time(
98 |     c2 <- openalex_crawl("works", verbose = TRUE,
99 |       query = openalex:::openalex_query(filter =
100 |         "institutions.id:I86987016,publication_year:2022"))
101 |   )[3]
102 | 
103 |   message("Polite time: ", tp)
104 |   message("Not polite time: ", tn)
105 | 
106 |   is_faster <- tp < tn
107 |   expect_true(is_faster)
108 | 
109 | })
110 | 
--------------------------------------------------------------------------------