├── .Rbuildignore
├── .github
│   ├── .gitignore
│   ├── dependabot.yaml
│   └── workflows
│       ├── R-CMD-check.yaml
│       └── pkgdown.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── R
│   ├── bind_rows_2.R
│   ├── cursor_crawl.R
│   ├── doi_lookup.R
│   ├── open_alex_export.R
│   ├── open_alex_restclient.R
│   ├── openalex-package.R
│   ├── rectangularize.R
│   └── utils-pipe.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── data-raw
│   └── DATASET.R
├── data
│   └── topics.rda
├── man
│   ├── openalex-package.Rd
│   ├── openalex_api.Rd
│   ├── openalex_attribution.Rd
│   ├── openalex_counts.Rd
│   ├── openalex_crawl.Rd
│   ├── openalex_doi_lookup.Rd
│   ├── openalex_flatten_long.Rd
│   ├── openalex_key.Rd
│   ├── openalex_kth_rawaff_query.Rd
│   ├── openalex_polite.Rd
│   ├── openalex_topics.Rd
│   ├── openalex_work.Rd
│   ├── openalex_works_created_since.Rd
│   ├── openalex_works_cursorcrawl.Rd
│   ├── openalex_works_export.Rd
│   ├── openalex_works_published_since.Rd
│   ├── openalex_works_updated_since.Rd
│   ├── openalex_write_duckdb.Rd
│   ├── pipe.Rd
│   ├── topics.Rd
│   └── wos_plaintext_for_diva.Rd
├── openalex.Rproj
└── tests
    ├── testthat.R
    └── testthat
        ├── test-crawl.R
        ├── test-cursorcrawl.R
        ├── test-dois.R
        ├── test-export.R
        ├── test-freeze.R
        └── test-open_alex_restclient.R

--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^LICENSE\.md$
2 | ^\.github$
3 | ^data-raw$
4 | ^_pkgdown\.yml$
5 | ^docs$
6 | ^pkgdown$
7 | ^.*\.Rproj$
8 | ^\.Rproj\.user$
9 | ^README\.Rmd$

--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html

--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "daily"

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'release'} 24 | 25 | env: 26 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 27 | R_KEEP_PKG_SOURCE: yes 28 | 29 | steps: 30 | - uses: actions/checkout@v4 31 | 32 | - uses: r-lib/actions/setup-pandoc@v2 33 | 34 | - uses: r-lib/actions/setup-r@v2 35 | with: 36 | r-version: ${{ matrix.config.r }} 37 | http-user-agent: ${{ matrix.config.http-user-agent }} 38 | use-public-rspm: true 39 | 40 | - uses: r-lib/actions/setup-r-dependencies@v2 41 | with: 42 | extra-packages: any::rcmdcheck 43 | needs: check 44 | 45 | - uses: r-lib/actions/check-r-package@v2 46 | with: 47 | upload-snapshots: true 48 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.7.3 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | docs 6 | 7 | /.quarto/ 8 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: openalex 2 | Title: Data from OpenAlex REST API 3 | Version: 0.0.5 4 | Authors@R: 5 | person(given = "Markus", 6 | family = "Skyttner", 7 | role = c("cre", "aut"), 8 | email = "markussk@kth.se") 9 | Description: The OpenAlex website provides open data on 10 | papers/works, venues, institutions and more around the world under the CC0 license. 11 | This R package provides some functions to access data from the OpenAlex REST API. 
12 | License: MIT + file LICENSE 13 | Encoding: UTF-8 14 | LazyData: true 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.3.2 17 | Imports: 18 | httr, 19 | magrittr, 20 | utils, 21 | dplyr, 22 | purrr, 23 | progress, 24 | jsonlite, 25 | tibble, 26 | tidyr, 27 | lubridate, 28 | DBI, 29 | duckdb, 30 | httr2, 31 | readr, 32 | RcppSimdJson, 33 | jqr 34 | Suggests: 35 | testthat (>= 3.0.0) 36 | Config/testthat/edition: 3 37 | URL: https://kth-library.github.io/openalex/ 38 | Depends: 39 | R (>= 2.10) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: openalex authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 openalex authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
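Taken together, the DESCRIPTION above and the NAMESPACE below suggest a minimal getting-started sketch (the work identifier is the example id used in the OpenAlex API documentation, not anything specific to this package):

library(openalex)
openalex_polite("you@example.com")   # join the polite pool for faster requests
w <- openalex_work("W2741809807")    # fetch a single work as a flattened tibble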
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export("%>%")
4 | export(openalex_api)
5 | export(openalex_attribution)
6 | export(openalex_counts)
7 | export(openalex_crawl)
8 | export(openalex_doi_lookup)
9 | export(openalex_flatten_long)
10 | export(openalex_key)
11 | export(openalex_kth_rawaff_query)
12 | export(openalex_polite)
13 | export(openalex_topics)
14 | export(openalex_work)
15 | export(openalex_works_created_since)
16 | export(openalex_works_published_since)
17 | export(openalex_works_updated_since)
18 | import(DBI)
19 | import(dplyr)
20 | import(duckdb)
21 | import(httr2)
22 | import(purrr)
23 | import(tidyr)
24 | importFrom(RcppSimdJson,fload)
25 | importFrom(RcppSimdJson,fminify)
26 | importFrom(RcppSimdJson,fparse)
27 | importFrom(dplyr,bind_cols)
28 | importFrom(dplyr,bind_rows)
29 | importFrom(dplyr,distinct)
30 | importFrom(dplyr,mutate)
31 | importFrom(dplyr,rename)
32 | importFrom(dplyr,select)
33 | importFrom(dplyr,starts_with)
34 | importFrom(dplyr,tibble)
35 | importFrom(httr,GET)
36 | importFrom(httr,content)
37 | importFrom(httr,http_type)
38 | importFrom(httr,modify_url)
39 | importFrom(httr,status_code)
40 | importFrom(httr,user_agent)
41 | importFrom(jqr,jq)
42 | importFrom(jsonlite,fromJSON)
43 | importFrom(lubridate,as_date)
44 | importFrom(lubridate,format_ISO8601)
45 | importFrom(magrittr,"%>%")
46 | importFrom(progress,progress_bar)
47 | importFrom(purrr,keep)
48 | importFrom(purrr,map)
49 | importFrom(purrr,map_df)
50 | importFrom(purrr,map_dfr)
51 | importFrom(purrr,pluck)
52 | importFrom(purrr,pmap)
53 | importFrom(purrr,possibly)
54 | importFrom(purrr,walk2)
55 | importFrom(readr,read_csv)
56 | importFrom(stats,setNames)
57 | importFrom(tibble,enframe)
58 | importFrom(tidyr,hoist)
59 | importFrom(tidyr,unnest)
60 | importFrom(tidyr,unnest_longer)
61 | importFrom(tidyr,unnest_wider)
62 | importFrom(utils,URLencode)
63 | importFrom(utils,tail)
64 |
--------------------------------------------------------------------------------
/R/bind_rows_2.R:
--------------------------------------------------------------------------------
1 | bind_rows2 <- function(l) {
2 |
3 |   # checks
4 |   stopifnot(is.list(l))
5 |
6 |   # get vars
7 |   v <- unlist(
8 |     lapply(unname(l), \(df) vapply(df, typeof, character(1)))
9 |   )
10 |   nm <- names(v)
11 |   nm0 <- unique(nm[duplicated(nm)])
12 |
13 |   # get list of columns with diff types in diff datasets
14 |   # TODO: using reduce with intersection here might make this all easier to read.
15 |   x <- stats::setNames(lapply(nm0, \(x) unique(v[nm == x])), nm0)
16 |   x0 <- x[!sapply(x, \(x) length(unique(x)) == 1)]
17 |
18 |   # Convert to highest in type hierarchy
19 |   type_hierarchy <- c("logical" = 1, "integer" = 2, "double" = 3, "character" = 4)
20 |   conv_funs <- lapply(x0, \(x)
21 |     switch(max(type_hierarchy[x]),
22 |       match.fun(as.logical),
23 |       match.fun(as.integer),
24 |       match.fun(as.double),
25 |       match.fun(as.character)
26 |     )
27 |   )
28 |   l1 <- lapply(l, \(df) {
29 |     f <- conv_funs[names(conv_funs) %in% names(df)]
30 |     for (i in seq_along(f)) {
31 |       df[[names(f[i])]] <- f[[i]](df[[names(f[i])]])
32 |     }
33 |     df
34 |   })
35 |
36 |   # bind rows and return
37 |   dplyr::bind_rows(l1)
38 | }
39 |
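A quick illustration (hypothetical data, not part of the package) of the problem bind_rows2() solves: dplyr::bind_rows() refuses to combine a column whose type differs between data frames, so the helper first coerces such columns to the highest type in the hierarchy above:

l <- list(
  data.frame(id = 1L, score = TRUE),
  data.frame(id = 2L, score = 0.5),
  data.frame(id = 3L, score = "n/a")
)
# dplyr::bind_rows(l) would error on 'score'; bind_rows2() unifies it to character
bind_rows2(l)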
--------------------------------------------------------------------------------
/R/cursor_crawl.R:
--------------------------------------------------------------------------------
1 | #' Crawl multipage responses from queries against the API
2 | #'
3 | #' Chunks and uses cursor based pagination to fetch works
4 | #' @param works_filter the works filter
5 | #' @param n_max_pages the maximum number of pages to fetch (50 per page)
6 | #' @return paths to downloaded files
7 | #' @importFrom RcppSimdJson fminify fparse fload
8 | #' @importFrom jqr jq
9 | openalex_works_cursorcrawl <- function(
10 |   works_filter,
11 |   n_max_pages = 5
12 | ) {
13 |
14 |   req_works <-
15 |     "https://api.openalex.org/" |>
16 |     httr2::request() |>
17 |     httr2::req_url_path("works")
18 |
19 |   # initially, cursor is set to "*"
20 |   q <- list(
21 |     filter = works_filter,
22 |     cursor = "*",
23 |     `per-page` = 50
24 |   )
25 |
26 |   # function to get works based on query params
27 |   fetch_works <- function(q) {
28 |     req_works |>
29 |       httr2::req_url_query(!!!q) |>
30 |       httr2::req_perform() |>
31 |       httr2::resp_body_string() |>
32 |       RcppSimdJson::fminify()
33 |   }
34 |
35 |   # get the first page of results
36 |   json_line <- fetch_works(q)
37 |
38 |   json_header <- function(j) {
39 |     j |> RcppSimdJson::fparse(query = "/meta", max_simplify_lvl = "list")
40 |   }
41 |
42 |   json_results <- function(j) {
43 |
44 |     #cmd <- sprintf("%s -c '.results[]' | %s -c 'del(..|.abstract_inverted_index?)'",
45 |     #  jq_binary, jq_binary)
46 |
47 |     #system(cmd, input = j, intern = TRUE) #|>
48 |     j |> jqr::jq(".results[] | del(..|.abstract_inverted_index?)")
49 |   }
50 |
51 |   #TODO: exclude abstract_inverted_index
52 |   # Using JSONPath: $.*[?(@.abstract_inverted_index == null)]
53 |
54 |   header <- json_line |> json_header()
55 |   results <- json_line |> json_results()
56 |
57 |   # page <-
58 |   #   json_line |>
59 |   #   RcppSimdJson::fparse("/results", max_simplify_lvl = "list") |>
60 |   #   (\(x) list(list(results = x)))()
61 |
62 |   #page |> openalex_works_to_tbls()
63 |
64 |   # compute total number of pages
65 |   h <- header
66 |   n_pages <- ceiling(h$count / h$per_page)
67 |
68 |   # begin the crawl
69 |   message("Retrieving ", min(n_max_pages, n_pages), " out of a total of ",
70 |     n_pages, " pages having a total record count of ", h$count,
71 |     ". Starting crawl...")
72 |
73 |   # iterate using a while loop
74 |   i <- 1
75 |   is_stopped <- FALSE
76 |   is_done <- n_pages <= 1
77 |   q$cursor <- h$next_cursor
78 |   td <- tempdir()
79 |   unlink(dir(td, pattern = "\\.json$", full.names = TRUE))
80 |   fn <- file.path(td, sprintf("%04i%s", i, ".json"))
81 |   readr::write_lines(results, fn)
82 |   #message("Wrote page ", i, " to ", fn, " and next cursor is ", q$cursor)
83 |   #readr::write_rds(page, file = fn)
84 |   #message("Cursor: ", q$cursor)
85 |
86 |   while (!is_done) {
87 |     i <- i + 1
88 |     if (i %% 100 == 0) cat("HUNDREDS_OF_PAGES!!!!\n") else {
89 |       if (i %% 10 == 0) cat(paste(i, q$cursor, "\n")) else cat(".")
90 |     }
91 |     next_page <- fetch_works(q)
92 |     #stopifnot(!is.null(next_page))
93 |     h <- json_header(next_page)
94 |     q$cursor <- h$next_cursor
95 |     fn <- file.path(td, sprintf("%04i%s", i, ".json"))
96 |     if (file.exists(fn)) unlink(fn)
97 |
98 |     results <- json_results(next_page)
99 |     #message("Batch: #", i, " ", length(results))
100 |     if (length(results) > 0) {
101 |       readr::write_lines(results, fn, append = TRUE)
102 |     }
103 |     is_stopped <- i >= min(n_max_pages, n_pages)
104 |     if (is_stopped)
105 |       message("Stopped, next cursor is: ", q$cursor)
106 |     is_done <- is.null(q$cursor) || is_stopped
107 |   }
108 |
109 |   filez <- dir(td, pattern = "\\.json$", full.names = TRUE)
110 |   message("\nDone, fetched ", length(filez), " pages of works, written to ", td)
111 |
112 |   return(filez)
113 |
114 | }
115 |
116 | jsonl_to_tbl <- function(fn) {
117 |   obj <- fn |> RcppSimdJson::fload(max_simplify_lvl = "list")
118 |   res <- list(results = obj)
119 |   res |> parse_work2()
120 | }
121 |
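A usage sketch for the cursor crawl above (openalex_works_cursorcrawl() is not exported, and the filter string is just a plausible example; parse_work2() is defined elsewhere in the package):

files <- openalex_works_cursorcrawl(
  works_filter = "authorships.institutions.lineage:i86987016,publication_year:2024",
  n_max_pages = 3
)
# each downloaded page is a JSONL file; parse them into tables
tbls <- files |> lapply(jsonl_to_tbl)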
--------------------------------------------------------------------------------
/R/doi_lookup.R:
--------------------------------------------------------------------------------
1 | split_chunks_of_n <- function(x, n)
2 |   split(x, ceiling(seq_along(x) / n))
3 |
4 | split_n_chunks <- function(x, n)
5 |   split(x, ceiling(seq_along(x) / (length(x) / n)))
6 |
7 | openalex_or <- function(x)
8 |   paste0(collapse = "|", x)
9 |
10 | doi_crawl <- function(dois) {
11 |
12 |   works <-
13 |     openalex_crawl("works", fmt = "object",
14 |       query = openalex_query(filter = paste0("doi:", dois))
15 |     )
16 |
17 |   lol <-
18 |     list(list(results = reduce(works |> map("results"), c)))
19 |
20 |   lol |> openalex_works_to_tbls()
21 |
22 | }
23 |
24 | doi_lookup_identifiers <- function(con, doi_filter) {
25 |
26 |   if (missing(con)) {
27 |     con <- duckdb::dbConnect(duckdb::duckdb())
28 |     DBI::dbSendQuery(con, "install json; load json; install httpfs; load httpfs;")
29 |     on.exit(duckdb::dbDisconnect(con, shutdown = TRUE))
30 |   }
31 |
32 |   sql <-
33 |     paste0("from (from read_json_auto('",
34 |       sprintf("https://api.openalex.org/works?filter=doi:%s&per-page=50&mailto=support@openalex.org", doi_filter),
35 |       "') select unnest(results) as r) select unnest(r.ids);")
36 |
37 |   DBI::dbGetQuery(con, sql) |> as_tibble()
38 | }
39 |
40 | #' Lookup DOIs using OpenAlex
41 | #'
42 | #' @param dois a character vector of DOIs
43 | #' @param resolution either "all" or "identifiers" to only return other related identifiers
44 | #' @return tibble(s)
45 | #' @export
46 | openalex_doi_lookup <- function(dois, resolution = c("all", "identifiers")) {
47 |   resolution <- match.arg(resolution)
48 |   dois <- unique(dois)
49 |
50 |   doi_filters <-
51 |     split_chunks_of_n(dois, 50) |>
52 |     map_chr(openalex_or)
53 |
54 |   doi_chunks <- switch(resolution,
55 |     "all" = {
56 |       doi_filters |>
57 |         purrr::map(doi_crawl, .progress = TRUE)
58 |     },
59 |     "identifiers" = {
60 |       doi_filters |>
61 |         map_dfr(\(x) doi_lookup_identifiers(doi_filter = x), .progress = TRUE)
62 |     }
63 |   )
64 |
65 |   doi_chunks
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/R/open_alex_export.R:
--------------------------------------------------------------------------------
1 | #' Use OpenAlex API for exporting data in tabular and wos formats
2 | #' @param q the query, for example "authorships.institutions.lineage:i86987016,authorships.institutions.lineage:!i4210161097,type:types/article,primary_location.source.type:source-types/journal|source-types/conference,publication_year:2023"
3 | #' @param fmt the export format, one of "csv" or "wos-plaintext"
4 | #' @param raw_string boolean to indicate whether a raw string should be returned
5 | #' @return a character vector with a raw string with the results from the export or a data frame
6 | #' @import httr2
7 | #' @importFrom dplyr bind_cols
8 | #' @importFrom readr read_csv
9 | openalex_works_export <- function(q, fmt = c("csv", "wos-plaintext"), raw_string = FALSE) {
10 |   fmt <- match.arg(fmt)
11 |   query <- list(filter = q)
12 |   query$format <- fmt
13 |   query$truncate <- "false"
14 |   query$api_key <- cfg()$key
15 |
16 |   ep <-
17 |     "https://export.openalex.org" |>
18 |     httr2::request() |>
19 |     httr2::req_url_path("works") |>
20 |     httr2::req_user_agent(cfg()$user_agent) |>
21 |     httr2::req_url_query(!!!query)
22 |
23 |   check_progress <- function() {
24 |     ep |> httr2::req_perform() |> httr2::resp_body_json() |> dplyr::bind_cols()
25 |   }
26 |
27 |   res <- check_progress()
28 |
29 |   message("Waiting for export to be generated ...\n")
30 |
31 |   while (res$status != "finished") {
32 |     Sys.sleep(5)
33 |     res <- check_progress()
34 |     message(sprintf("%0.1f%%", as.double(res$progress) * 100), " (", res$status, ")")
35 |   }
36 |
37 |   message("Export is ready, retrieving results.")
38 |   out <-
39 |     httr2::request(res$result_url) |>
40 |     httr2::req_perform() |>
41 |     httr2::resp_body_string()
42 |
43 |   message("Done, returning results")
44 |   if (raw_string) return(out)
45 |
46 |   res <- switch(fmt,
47 |     "csv" = {
48 |       out |> readr::read_csv(show_col_types = FALSE)
49 |     },
50 |     # "wos-plaintext" = {
51 |     #   out |> strsplit(split = "\n") |> unlist() |> read_wos_plaintext()
52 |     # },
53 |     "wos-plaintext" = {
54 |       out |> wos_plaintext_for_diva()
55 |     }
56 |   )
57 |   return(res)
58 | }
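A sketch of how the export endpoint above might be called (it polls until the export job reports status "finished"; an API key set via openalex_key() is assumed, and the filter reuses part of the example from the roxygen block):

works_2023 <- openalex_works_export(
  q = "authorships.institutions.lineage:i86987016,publication_year:2023",
  fmt = "csv"
)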
59 |
60 | #' Function which converts a wos_plaintext-string into a format
61 | #' which can be uploaded to DiVA, by adding ER tags
62 | #' (including a blank line) after each record
63 | #' @param x character string with "wos-plaintext" format as returned from OpenAlex export API endpoint
64 | #' @importFrom stats setNames
65 | #' @importFrom utils tail
66 | wos_plaintext_for_diva <- function(x) {
67 |   w <- x |> strsplit("\n") |> unlist()
68 |   i_header <- which(grepl("^FN|^VR", w))
69 |   #i_indented <- which(grepl("^\\s+", w))
70 |   i_eor <- which(grepl("^ER$", w))
71 |   i_blank <- which(nchar(w) == 0)
72 |
73 |   pt <- w[-c(i_eor, i_blank)] # TODO: should i_header rows be removed too?
74 |   i_record <- which(grepl("^PT\\s+", pt))
75 |   n_records <- length(i_record)
76 |   i_range <- data.frame(beg = i_record, end = c(tail(i_record, -1) - 1, length(pt)))
77 |   pt[i_range$end] <- pt[i_range$end] |> paste0("\nER\n")
78 |   paste0(collapse = "\n", pt)
79 | }
80 |
81 | #' Export the results from a crawl as a duckdb database file
82 | #' @param crawl the results from running the to_tbls fcn
83 | #' @param destdir the location to save the database file
84 | #' @return file path to the database file
85 | #' @importFrom purrr walk
86 | #' @import duckdb DBI
87 | openalex_write_duckdb <- function(crawl, destdir = NULL) {
88 |
89 |   if (!requireNamespace("duckdb", quietly = TRUE)) {
90 |     stop(
91 |       "Package \"duckdb\" must be installed to use this function.",
92 |       call. = FALSE
93 |     )
94 |   }
95 |
96 |   if (is.null(destdir)) {
97 |     destdir <- file.path(tempdir(check = TRUE), "openalex", "openalex.db")
98 |   }
99 |
100 |   message("Ensure existing dir: ", dirname(destdir))
101 |   if (!dir.exists(dirname(destdir))) {
102 |     is_created <- dir.create(dirname(destdir), showWarnings = TRUE)
103 |   } else {
104 |     message("Removing existing file ", destdir)
105 |     if (file.exists(destdir))
106 |       unlink(destdir)
107 |   }
108 |
109 |   drv <- duckdb::duckdb()
110 |   con <- duckdb::dbConnect(drv, dbdir = destdir)
111 |   on.exit(DBI::dbDisconnect(con, shutdown = TRUE))
112 |
113 |   crawl |> names() |>
114 |     purrr::walk(\(x) duckdb::duckdb_register(con, sprintf("view_%s", x), crawl |> getElement(x)))
115 |
116 |   toc <- DBI::dbListTables(con)
117 |   new_tbl <- gsub("^view_", "", toc)
118 |
119 |   sql_create_db <- sprintf("create table %s as from %s;", new_tbl, toc) |>
120 |     paste(collapse = "\n")
121 |
122 |   message("Creating duckdb file at ", destdir, " using sql ", sql_create_db)
123 |   result <- DBI::dbExecute(con, sql_create_db)
124 |   message("Result is ", result)
125 |
126 |   return(destdir)
127 |
128 | }
129 |
130 | openalex_fields <- function() {
131 |   paste0(
132 |     "abstract.search, abstract.search.no_stem, apc_list.currency, apc_list.provenance, ",
133 |     "apc_list.value, apc_list.value_usd, apc_paid.currency, apc_paid.provenance, apc_paid.value, ",
134 |     "apc_paid.value_usd, author.id, author.orcid, authors_count, ",
135 |     "authorships.affiliations.institution_ids, authorships.author.id, authorships.author.orcid, ",
136 |     "authorships.countries, authorships.institutions.continent, authorships.institutions.country_code, ",
137 |     "authorships.institutions.id, authorships.institutions.is_global_south, ",
138 |     "authorships.institutions.lineage, authorships.institutions.ror, authorships.institutions.type, ",
139 |     "authorships.is_corresponding, best_oa_location.is_accepted, best_oa_location.is_oa, ",
140 |     "best_oa_location.is_published, best_oa_location.landing_page_url, best_oa_location.license, ",
141 |     "best_oa_location.license_id, best_oa_location.source.host_organization, ",
142 |     "best_oa_location.source.host_organization_lineage, best_oa_location.source.id, ",
143 |     "best_oa_location.source.is_in_doaj, best_oa_location.source.is_oa, best_oa_location.source.issn, ",
144 |     "best_oa_location.source.type, best_oa_location.version, best_open_version, biblio.first_page, ",
145 |     "biblio.issue, biblio.last_page, biblio.volume, citation_normalized_percentile.is_in_top_10_percent, ",
146 |     "citation_normalized_percentile.is_in_top_1_percent, citation_normalized_percentile.value, ",
147 |     "cited_by, cited_by_count, cited_by_percentile_year.max, cited_by_percentile_year.min, ",
148 |     "cites, concept.id, concepts.id, 
concepts.wikidata, concepts_count, corresponding_author_ids, ", 149 | "corresponding_institution_ids, countries_distinct_count, datasets, default.search, ", 150 | "display_name, display_name.search, display_name.search.no_stem, doi, doi_starts_with, ", 151 | "from_created_date, from_publication_date, fulltext.search, fulltext_origin, fwci, ", 152 | "grants.award_id, grants.funder, has_abstract, has_doi, has_embeddings, has_fulltext, ", 153 | "has_oa_accepted_or_published_version, has_oa_submitted_version, has_old_authors, has_orcid, ", 154 | "has_pdf_url, has_pmcid, has_pmid, has_raw_affiliation_strings, has_references, ids.mag, ", 155 | "ids.openalex, ids.pmcid, ids.pmid, indexed_in, institution.id, institution_assertions.country_code, ", 156 | "institution_assertions.id, institution_assertions.lineage, institution_assertions.ror, ", 157 | "institution_assertions.type, institutions.continent, institutions.country_code, institutions.id, ", 158 | "institutions.is_global_south, institutions.ror, institutions.type, institutions_distinct_count, ", 159 | "is_corresponding, is_oa, is_paratext, is_retracted, journal, keyword.search, keywords.id, ", 160 | "language, locations.is_accepted, locations.is_oa, locations.is_published, locations.landing_page_url, ", 161 | "locations.license, locations.license_id, locations.source.has_issn, ", 162 | "locations.source.host_institution_lineage, locations.source.host_organization, ", 163 | "locations.source.host_organization_lineage, locations.source.id, ", 164 | "locations.source.is_core, locations.source.is_in_doaj, locations.source.is_oa, ", 165 | "locations.source.issn, locations.source.publisher_lineage, locations.source.type, ", 166 | "locations.version, locations_count, mag, mag_only, oa_status, ", 167 | "open_access.any_repository_has_fulltext, open_access.is_oa, open_access.oa_status, ", 168 | "openalex, openalex_id, pmcid, pmid, primary_location.is_accepted, primary_location.is_oa, ", 169 | "primary_location.is_published, primary_location.landing_page_url, primary_location.license, ", 170 | "primary_location.license_id, primary_location.source.has_issn, ", 171 | "primary_location.source.host_institution_lineage, primary_location.source.host_organization, ", 172 | "primary_location.source.host_organization_lineage, primary_location.source.id, ", 173 | "primary_location.source.is_core, primary_location.source.is_in_doaj, ", 174 | "primary_location.source.is_oa, primary_location.source.issn, ", 175 | "primary_location.source.publisher_lineage, primary_location.source.type, ", 176 | "primary_location.version, primary_topic.domain.id, primary_topic.field.id, primary_topic.id, ", 177 | "primary_topic.subfield.id, publication_date, publication_year, raw_affiliation_strings.search, ", 178 | "raw_author_name.search, referenced_works, referenced_works_count, related_to, ", 179 | "repository, semantic.search, sustainable_development_goals.id, sustainable_development_goals.score, ", 180 | "title.search, title.search.no_stem, title_and_abstract.search, title_and_abstract.search.no_stem, ", 181 | "to_created_date, to_publication_date, to_updated_date, topics.domain.id, topics.field.id, ", 182 | "topics.id, topics.subfield.id, topics_count, type, type_crossref, version" 183 | ) |> strsplit(split = ", ") |> unlist() 184 | } 185 | 186 | openalex_groupbys_default <- function() { c( 187 | "primary_location.source.type", 188 | "primary_location.source.id", 189 | "is_retracted", 190 | "primary_location.source.publisher_lineage", 191 | "open_access.oa_status", 192 | 
"best_oa_location.is_published", 193 | "best_oa_location.is_accepted", 194 | "best_oa_location.license", 195 | "authorships.institutions.type", 196 | "has_pmid", 197 | "has_orcid", 198 | "mag_only", 199 | "primary_location.source.is_in_doaj", 200 | "has_doi", 201 | "primary_location.source.is_oa", 202 | "open_access.any_repository_has_fulltext", 203 | "institutions.is_global_south", 204 | "primary_location.source.is_core", 205 | "corresponding_institution_ids", 206 | "corresponding_author_ids", 207 | "authorships.institutions.continent", 208 | "language", 209 | "keywords.id", 210 | "authorships.countries", 211 | "authorships.author.id", 212 | "sustainable_development_goals.id", 213 | "grants.funder", 214 | "primary_topic.subfield.id", 215 | "primary_topic.field.id", 216 | "primary_topic.domain.id", 217 | "primary_topic.id", 218 | "type", 219 | "authorships.institutions.lineage", 220 | "open_access.is_oa", 221 | "publication_year" 222 | )} 223 | 224 | openalex_filter_default <- function() { 225 | "authorships.institutions.lineage:i86987016,publication_year:2025" 226 | } 227 | 228 | openalex_groupbys <- function(q) { 229 | 230 | colname <- field <- colid <- i <- NULL 231 | 232 | csv <- 233 | q |> readr::read_lines() 234 | 235 | schema <- 236 | csv[1:2] |> 237 | strsplit(split = ",") |> 238 | setNames(c("field", "colname")) |> 239 | purrr::map(\(x) na_if(x, "")) |> 240 | tibble::as_tibble() |> 241 | tibble::rowid_to_column(var = "colid") |> 242 | tidyr::fill(any_of(c("field"))) |> 243 | dplyr::filter(!is.na(colname)) |> 244 | dplyr::group_by(field) |> 245 | dplyr::summarize(i = min(colid), j = max(colid), colnames = list(colname)) |> 246 | dplyr::arrange(-desc(i)) 247 | 248 | body <- 249 | csv[-c(1:2)] |> paste(collapse = "\n") 250 | 251 | all <- 252 | readr::read_csv(body, col_names = NULL, show_col_types = FALSE) 253 | 254 | parse_body <- function(field, i, j, colnames) { 255 | all |> 256 | select(c(i, j)) |> 257 | setNames(nm = unlist(colnames)) |> 258 | filter(!if_all(everything(), is.na)) 259 | #filter(if_all(\(x) all(is.na(x)))) #|> 260 | #list() |> setNames(nm = field) 261 | } 262 | 263 | tbls <- 264 | schema |> purrr::pmap(parse_body) |> 265 | setNames(nm = schema$field) |> 266 | map(\(x) x |> mutate(across(any_of(c("name")), as.character))) 267 | 268 | tbls 269 | 270 | } 271 | 272 | #' Counts from OpenAlex 273 | #' 274 | #' Aggregates/counts can be retrieved using the group_bys query parameter 275 | #' 276 | #' @param filter a set of filter criteria, see the defaults in openalex_filter_default() 277 | #' @param dimensions a set of grouping dimensions, see the defaults in openalex_groupbys_default() 278 | #' @return a list of tibbles 279 | #' @export 280 | #' @importFrom utils URLencode 281 | openalex_counts <- function( 282 | filter = openalex_filter_default(), 283 | dimensions = openalex_groupbys_default() 284 | ) { 285 | 286 | groupbys <- 287 | dimensions|> paste0(collapse = ",") |> utils::URLencode(reserved = TRUE) 288 | 289 | url <- paste0( 290 | openalex_api(), "works?group_bys=", groupbys, 291 | "&per_page=200&format=csv&mailto=team%40ourresearch.org", 292 | "&filter=", filter 293 | ) 294 | 295 | message("Requesting ", url) 296 | 297 | url |> openalex_groupbys() 298 | } 299 | 300 | read_page <- function(level = c("topics", "subfields", "fields", "domains"), page) { 301 | 302 | topic_page <- 303 | "https://api.openalex.org/%s?select=id,display_name,description,subfield,field,domain&per_page=200&page=%s" |> 304 | sprintf(level, page) |> 305 | jsonlite::fromJSON() 306 | 307 | tbl 
300 | read_page <- function(level = c("topics", "subfields", "fields", "domains"), page) {
301 |
302 |   topic_page <-
303 |     "https://api.openalex.org/%s?select=id,display_name,description,subfield,field,domain&per_page=200&page=%s" |>
304 |     sprintf(level, page) |>
305 |     jsonlite::fromJSON()
306 |
307 |   tbl <- topic_page$results |> tibble::as_tibble()
308 |   structure(tbl, meta = topic_page$meta)
309 |
310 | }
311 |
312 | openalex_level <- function(l) {
313 |
314 |   t <- read_page(level = l, page = 1)
315 |   np <- ceiling(attr(t, "meta")$count / 200)
316 |
317 |   ts <- NULL
318 |   if (np > 1) {
319 |     ts <- (2:np) |> map(\(x) read_page(level = l, page = x), .progress = TRUE)
320 |   }
321 |
322 |   t |> bind_rows(map_dfr(ts, bind_rows))
323 |
324 | }
325 |
326 | openalex_levels <- function() {
327 |
328 |   display_name <- NULL
329 |
330 |   topics <- openalex_level("topics")
331 |
332 |   four <- topics |> select(all_of(c("subfield", "field", "domain")))
333 |
334 |   topics |> rename(id_topic = id, topic = display_name) |> select(1:3) |> bind_cols(
335 |     four$subfield |> as_tibble() |> rename(id_subfield = id, subfield = display_name),
336 |     four$field |> as_tibble() |> rename(id_field = id, field = display_name),
337 |     four$domain |> as_tibble() |> rename(id_domain = id, domain = display_name)
338 |   ) |>
339 |     mutate(across(contains("id_"), \(x) gsub("https://openalex.org/", "", x)))
340 |
341 | }
342 |
343 | #' Topics
344 | #'
345 | #' Table of current topics, subfields, fields and domains used at OpenAlex
346 | #' @export
347 | openalex_topics <- function() {
348 |   openalex_levels()
349 | }
350 |
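openalex_topics() above pages through the /topics endpoint 200 records at a time and denormalizes each topic's subfield, field and domain into one wide table; a usage sketch:

topics <- openalex_topics()
topics |> dplyr::count(domain, field, sort = TRUE)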
--------------------------------------------------------------------------------
/R/open_alex_restclient.R:
--------------------------------------------------------------------------------
1 | #file.edit("~/.Renviron")
2 | #readRenviron("~/.Renviron")
3 |
4 | #' Enter the OpenAlex API polite pool for faster requests by providing an email
5 | #' @param email an email address, on the form "you@example.com" or "" to unset email
6 | #' @return a logical depending on whether email was set or unset
7 | #' @examples
8 | #' \dontrun{
9 | #' if(interactive()){
10 | #'  # to set
11 | #'  openalex_polite("you@example.com")
12 | #'  # to unset
13 | #'  openalex_polite("")
14 | #'  }
15 | #' }
16 | #' @export
17 | openalex_polite <- function(email) {
18 |
19 |   if (!nzchar(email)) {
20 |     message("Exiting from polite pool, email no longer provided in user agent header")
21 |     Sys.setenv("OPENALEX_USERAGENT" = "http://github.com/hadley/httr")
22 |     return(FALSE)
23 |   }
24 |
25 |   stopifnot(is.character(email), length(email) == 1)
26 |   re_email <- "^mailto:.*?@.*?\\..*?"
27 |   if (!grepl(re_email, email))
28 |     email <- paste0("mailto:", trimws(email))
29 |   stopifnot(grepl(re_email, email))
30 |
31 |   ua <- sprintf("http://github.com/hadley/httr (%s)", email)
32 |
33 |   if (Sys.getenv("OPENALEX_USERAGENT") != "") {
34 |     message("Hint: You can provide an email to enter the polite pool")
35 |     message("To have the setting stick persistently using .Renviron, do ...")
36 |     message('  file.edit("~/.Renviron")')
37 |     message(sprintf('  # and add a line OPENALEX_USERAGENT="%s"', ua))
38 |     message("Then reload settings for the R environment in the current session")
39 |     message('  readRenviron("~/.Renviron")')
40 |   }
41 |
42 |   message("Temporarily setting OPENALEX_USERAGENT envvar for this session to: ", ua)
43 |   Sys.setenv("OPENALEX_USERAGENT" = ua)
44 |   return(TRUE)
45 | }
46 |
47 | #' Use an API key for OpenAlex Premium Subscription
48 | #'
49 | #' This provides access to the latest data, fresher than what snapshots provide.
50 | #' It also enables faster requests and filtering on from_created_date and from_updated_date fields.
51 | #' @param key a premium subscription key
52 | #' @return a logical depending on whether key was set or unset
53 | #' @examplesIf interactive()
54 | #' openalex_key("my_secret_api_key")
55 | #' openalex_key("")
56 | #' @export
57 | #' @details
58 | #' Additional details...
59 | #'
60 | #'
61 | #'
62 | #'
63 | openalex_key <- function(key) {
64 |
65 |   if (!nzchar(key)) {
66 |     message("Unsetting premium subscription key")
67 |     Sys.setenv("OPENALEX_KEY" = "")
68 |     return(FALSE)
69 |   }
70 |
71 |   stopifnot(is.character(key), length(key) == 1)
72 |
73 |   re_key <- "[[:alnum:]]{22}"
74 |   stopifnot(grepl(re_key, key))
75 |
76 |   if (Sys.getenv("OPENALEX_KEY") != "") {
77 |     message("Hint: You can provide a premium subscription API key")
78 |     message("To have the setting stick persistently using .Renviron, do ...")
79 |     message('  file.edit("~/.Renviron")')
80 |     message(sprintf('  # and add a line OPENALEX_KEY="%s"', key))
81 |     message("Then reload settings for the R environment in the current session")
82 |     message('  readRenviron("~/.Renviron")')
83 |   }
84 |
85 |   message("Temporarily setting OPENALEX_KEY envvar for this session")
86 |   Sys.setenv("OPENALEX_KEY" = key)
87 |   return(TRUE)
88 | }
89 |
90 | cfg <- function() {
91 |
92 |   res <- list(
93 |     user_agent = "http://github.com/hadley/httr"
94 |   )
95 |
96 |   if (Sys.getenv("OPENALEX_USERAGENT") != "") {
97 |     res$user_agent <- Sys.getenv("OPENALEX_USERAGENT")
98 |   }
99 |
100 |   if (Sys.getenv("OPENALEX_KEY") != "") {
101 |     res$key <- Sys.getenv("OPENALEX_KEY")
102 |   }
103 |
104 |   return(res)
105 | }
106 |
107 | #' Endpoint used for requests to OpenAlex API
108 | #' @export
109 | openalex_api <- function() {
110 |   "https://api.openalex.org/"
111 | }
112 |
113 | #' Attribution
114 | #'
115 | #' Use this attribution whenever data from the API is publicly displayed
116 | #'
117 | #' @details OpenAlex provides a RESTful API for scholarly papers, authors,
118 | #' institutions, and more.
When publicly displaying data from the API, 119 | #' it is polite to point back to OpenAlex at https://openalex.org/ 120 | #' @export 121 | openalex_attribution <- function() { 122 | license <- "https://creativecommons.org/publicdomain/zero/1.0/" 123 | sprintf(paste0( 124 | "Data source: OpenAlex API at %s", "\n", 125 | "Data license agreement: %s"), 126 | openalex_api(), license 127 | ) 128 | } 129 | 130 | #' Retrieve work from OpenAlex REST API 131 | #' 132 | #' This function retrieves works given an identifier 133 | #' @param identifier string with identifier 134 | #' @param format one of "table" or "object" 135 | #' @param use_random logical to indicate whether to use random identifier, Default: FALSE 136 | #' @return as per format, either a tibble or an R object 137 | #' @examples 138 | #' \dontrun{ 139 | #' openalex_work(use_random = TRUE) 140 | #' } 141 | #' @export 142 | openalex_work <- function(identifier, format = "table", use_random = FALSE) { 143 | openalex_entity(identifier, entity = "works", format, use_random) 144 | } 145 | 146 | openalex_entity_enum <- function() 147 | c("works", "authors", "venues", "institutions", "concepts") 148 | 149 | #' @importFrom httr modify_url user_agent GET status_code http_type content 150 | #' @importFrom jsonlite fromJSON 151 | #' @importFrom tibble enframe 152 | #' @importFrom dplyr mutate 153 | openalex_entity <- function( 154 | identifier, 155 | entity = openalex_entity_enum(), 156 | format = c("table", "object", "raw", "tables"), 157 | use_random = FALSE, 158 | verbose = FALSE, 159 | query = NULL) { 160 | 161 | style <- match.arg(format) 162 | kind <- match.arg(entity) 163 | is_listing <- FALSE 164 | 165 | if (missing(identifier)) { 166 | if (use_random == FALSE && is.null(query)) { 167 | stop("Identifier is missing, please specify use_random to use a random id.") 168 | } else if (use_random == TRUE && is.null(query)) { 169 | identifier <- "random" 170 | } else { 171 | if (verbose == TRUE) message("This is a list request...") 172 | is_listing <- TRUE 173 | } 174 | } 175 | 176 | path <- if (!is_listing) sprintf("%s/%s", kind, identifier) else kind 177 | 178 | url <- httr::modify_url( 179 | openalex_api(), 180 | path = path, 181 | query = query #paste0("filter=", URLencode(query$filter)) #, "&sort=publication_date:desc") 182 | ) 183 | 184 | if (verbose == TRUE) message("Requesting url: ", url) 185 | 186 | ua <- httr::user_agent(cfg()$user_agent) 187 | res <- httr::GET(url, ua) 188 | 189 | if (httr::status_code(res) == 200) { 190 | 191 | if (httr::http_type(res) != "application/json") { 192 | stop("API did not return json", call. 
= FALSE) 193 | } 194 | 195 | if (style == "object") { 196 | data <- jsonlite::fromJSON( 197 | httr::content(res, as = "text", encoding = "utf-8"), 198 | simplifyVector = FALSE #, DataFrame = TRUE, flatten = TRUE 199 | ) 200 | data <- structure(data, meta = data$meta) 201 | } else if (style == "table") { 202 | payload <- httr::content(res, encoding = "utf-8") 203 | name <- NULL 204 | data <- 205 | tibble::enframe(unlist(payload)) %>% 206 | dplyr::mutate(name = gsub(".", "_", name, fixed = TRUE)) #%>% 207 | #dplyr::mutate(item_id = cumsum(name == "concepts_id")) %>% 208 | #dplyr::filter(item_id > 0) %>% 209 | #tidyr::pivot_wider(values_fn = function(x) paste0(x, collapse = ", ")) %>% 210 | #dplyr::rename_with(function(x) gsub("items_", "", x)) %>% 211 | #dplyr::mutate(across(.fns = function(x) readr::parse_guess(x, guess_integer = TRUE))) 212 | data <- structure(data, meta = payload$meta) 213 | } else if (style == "tables") { 214 | if (kind == "works") { 215 | payload <- httr::content(res, encoding = "utf-8") 216 | data <- payload$results |> parse_work() 217 | data <- structure(data, meta = payload$meta) 218 | } else { 219 | stop("Only works supported for now!") 220 | } 221 | } else if (style == "raw") { 222 | data <- res 223 | } 224 | 225 | #class(data) <- c("tbl_df", "tbl", "data.frame") 226 | return(data) 227 | } 228 | 229 | if (status_code(res) == 429) 230 | stop("HTTP status 429 Too Many Requests") 231 | 232 | if (status_code(res) == 403) { 233 | cr <- content(res) 234 | stop(cr$error, "\n\n", cr$message) 235 | } 236 | 237 | stop("HTTP status ", status_code(res)) 238 | 239 | } 240 | 241 | openalex_query <- function( 242 | filter=NULL, 243 | search=NULL, 244 | sort=NULL, 245 | page=NULL, 246 | cursor=NULL, 247 | verbose = FALSE) { 248 | 249 | # filter... use , to indicate AND 250 | #?filter=last_known_institution.country_code:US,cited_by_count:>0 251 | 252 | # search... add ".search" to a property 253 | #?filter=title.search:"intensive treatment of diabetes" 254 | 255 | # sort... By default, sort direction is ascending. 
You can reverse this by using sort:desc
256 |   # ?sort:display_name,cited_by_count,works_count,publication_date,relevance_score
257 |
258 |   # paging currently you can only use paging to read the first 10,000 results of any list
259 |   # ?page=1
260 |
261 |   q <- list(
262 |     filter = filter,
263 |     search = search,
264 |     sort = sort,
265 |     page = page,
266 |     cursor = cursor,
267 |     api_key = cfg()$key
268 |   ) |>
269 |     purrr::compact()
270 |
271 |   if (verbose)
272 |     message("Query is:\n\n",
273 |       paste0(collapse = "\n", utils::capture.output(print(q)))
274 |     )
275 |
276 |   return(q)
277 |
278 | }
279 |
280 | openalex_list <- function(entity, query, format = "object", verbose = FALSE) {
281 |   res <- openalex_entity(entity = entity, format = format, verbose = verbose, query = query)
282 |   attr(res, "page_count") <- ceiling(attr(res, "meta")$count / attr(res, "meta")$per_page)
283 |   return(res)
284 | }
285 |
286 | gs <- function(x, p, r) {
287 |   gsub(p, r, x, fixed = TRUE)
288 | }
289 |
290 | #' @importFrom purrr keep
291 | # support pipe
292 | tbl_from_slot <- function(x, slot)
293 |   x |> map(slot) |>
294 |     keep(.p = \(y) nrow(y) > 0) |>
295 |     bind_rows() |>
296 |     readr::type_convert() |>
297 |     suppressMessages() |>
298 |     mutate(across(where(is.character), \(x) x |> gs("https://openalex.org/", "")))
299 |
300 |
301 | #' Crawl multiple pages of results
302 | #'
303 | #' Iterates over paged results showing a progress bar
304 | #'
305 | #' @param entity one of the values in openalex_entity_enum()
306 | #' @param query an openalex_query object
307 | #' @param verbose boolean to indicate whether to output messages during process
308 | #' @param fmt the return format, one of "object" or "tables"
309 | #' @return R object with results matching the query
310 | #' @importFrom progress progress_bar
311 | #' @importFrom purrr possibly map_df map_dfr pmap
312 | #' @importFrom dplyr bind_rows
313 | #' @export
314 | openalex_crawl <- function(entity, query, verbose = FALSE, fmt = "object") {
315 |
316 |   q <- query
317 |
318 |   # if (use_cursor) {
319 |   #   q$cursor <- "*"
320 |   #   message("Using query:")
321 |   #   print(q)
322 |   # }
323 |
324 |   res <- openalex_list(entity, q, format = fmt, verbose = FALSE)
325 |   #q <- query
326 |   n_items <- attr(res, "meta")$count
327 |   pages <- 1:attr(res, "page_count")
328 |   #next_cursor <- attr(res, "meta")$next_cursor
329 |   #q$next_cursor <- next_cursor
330 |
331 |   # if (use_cursor == TRUE && is.null(next_cursor))
332 |   #   stop("Requested cursor paging, but no next_cursor found in response from API")
333 |
334 |   if (n_items <= 0) {
335 |     message("No results, returning empty list.")
336 |     return(list())
337 |   }
338 |
339 |   if (n_items > 1e4) {
340 |     stop("More than 10000 results; please use cursor paging instead, see openalex_works_cursorcrawl()")
341 |   }
342 |
343 |   if (verbose)
344 |     message("About to crawl a total of ", length(pages), " pages of results",
345 |       " with a total of ", n_items, " records.")
346 |
347 |   pb <- progress_bar$new(
348 |     format = "  open alex resolving [:bar] :percent eta: :eta",
349 |     total = length(pages), clear = FALSE, width = 60)
350 |
351 |   #TODO: fixme so this can run in parallel?
352 | iq <- q 353 | i <- 1 354 | entities <- purrr::possibly(quiet = FALSE, 355 | .f = function(x) { 356 | pb$tick() 357 | iq$page <- i 358 | #print(q) 359 | Sys.sleep(1 / 100) 360 | # if (use_cursor & !is.null(next_cursor)) { 361 | # iq$filter <- paste0(q$filter, "&cursor=", next_cursor) 362 | # print(iq) 363 | # } 364 | res <- openalex_list(entity, iq, format = fmt, verbose = FALSE) 365 | # if (use_cursor) { 366 | # next_cursor <<- attr(res, "meta")$next_cursor 367 | # } 368 | i <<- i + 1 369 | return(res) 370 | }, 371 | otherwise = list() #data.frame() 372 | ) 373 | 374 | if (fmt != "tables") { 375 | res <- pages |> map(entities, .progress = TRUE) 376 | #res |> pmap(c) 377 | return (res) 378 | } 379 | 380 | res <- 381 | pages |> map(entities, .progress = TRUE) 382 | 383 | #TODO: fix so that NOT THE SAME work ids are fetched!!!! 384 | #TODO: do not assume entity is work below 385 | 386 | list( 387 | work = res |> tbl_from_slot("work"), 388 | work_ids = res |> tbl_from_slot("work_ids"), 389 | work_concepts = res |> tbl_from_slot("work_concepts"), 390 | work_authorships_institutions = res |> tbl_from_slot("work_authorships_institutions"), 391 | work_abstract_inverted_index = res |> tbl_from_slot("work_abstract_inverted_index"), 392 | work_authorships_author = res |> tbl_from_slot("work_authorships_author"), 393 | work_biblio = res |> tbl_from_slot("work_biblio"), 394 | work_open_access = res |> tbl_from_slot("work_open_access"), 395 | work_host_venue = res |> tbl_from_slot("work_host_venue"), 396 | work_counts_by_year = res |> tbl_from_slot("work_counts_by_year"), 397 | work_related_works = res |> tbl_from_slot("work_related_works"), 398 | work_referenced_works = res |> tbl_from_slot("work_referenced_works") 399 | ) 400 | 401 | } 402 | 403 | #'Flatten R object from deserialized nested JSON object 404 | #' 405 | #'@param nestedlist a nested list of lists 406 | #'@return a tibble in long format 407 | #'@export 408 | #'@importFrom tibble enframe 409 | #'@importFrom dplyr mutate 410 | openalex_flatten_long <- function(nestedlist) { 411 | name <- NULL 412 | tibble::enframe(unlist(nestedlist)) %>% 413 | dplyr::mutate(name = gsub(".", "_", name, fixed = TRUE)) 414 | } 415 | 416 | openalex_autocomplete <- function( 417 | query, 418 | entity_type = openalex_entity_enum(), 419 | format = c("object", "table"), 420 | verbose = TRUE 421 | ) { 422 | 423 | #/autocomplete/?q= 424 | 425 | stopifnot(nchar(query) >= 1) 426 | 427 | style <- match.arg(format) 428 | entity <- match.arg(entity_type) 429 | path <- sprintf("autocomplete/%s", entity) 430 | 431 | url <- httr::modify_url( 432 | openalex_api(), 433 | path = path, 434 | query = list(q = query) 435 | ) 436 | 437 | if (verbose == TRUE) message("Requesting url: ", url) 438 | 439 | ua <- httr::user_agent(cfg()$user_agent) 440 | 441 | res <- httr::GET(url, ua) 442 | 443 | if (httr::status_code(res) == 200) { 444 | 445 | if (httr::http_type(res) != "application/json") { 446 | stop("API did not return json", call. 
= FALSE) 447 | } 448 | 449 | if (style == "object") { 450 | data <- jsonlite::fromJSON( 451 | httr::content(res, as = "text", encoding = "utf-8"), 452 | simplifyVector = FALSE #, DataFrame = TRUE, flatten = TRUE 453 | ) 454 | } else { 455 | name <- NULL 456 | data <- httr::content(res, encoding = "utf-8") %>% 457 | purrr::pluck("results") %>% 458 | dplyr::bind_rows() 459 | } 460 | 461 | #class(data) <- c("tbl_df", "tbl", "data.frame") 462 | return(data) 463 | } 464 | 465 | if (status_code(res) == 429) 466 | stop("HTTP status 429 Too Many Requests") 467 | 468 | stop("HTTP status ", status_code(res)) 469 | 470 | } 471 | 472 | #' Example query when searching raw affiliation strings 473 | #' 474 | #' This variant is specifically tailored for KTH, Royal Institute of Technology 475 | #' and includes some affiliation string variations which might be related. 476 | #' @export 477 | #' @return string with query 478 | openalex_kth_rawaff_query <- function() { 479 | # (roy AND inst AND tech) OR 480 | # "Roy. Inst. T" 481 | # (roy* AND tech* AND univ*)) AND (Sweden)) 482 | # paste0( 483 | # 'KTH OR (roy* AND inst* AND tech*) OR ', 484 | # '(alfven) OR (kung* AND tek* AND hog*) OR (kung* AND tek* AND h\\u00f6g*) OR ', 485 | # '(kgl AND tek* AND hog*) OR (kung* AND tek* AND hg*)' 486 | # ) 487 | 488 | '("KTH" OR 489 | 490 | (("roy inst" OR 491 | "royal in-stitute" OR 492 | "royal inititute" OR 493 | "royal institut" OR 494 | "royal institute" OR 495 | "royal institite" OR 496 | "royal institution" OR 497 | "royal institue" OR 498 | "royal insititu" OR 499 | "royal insitute" OR 500 | "royal inst" OR 501 | "royal inst." OR 502 | "royal intitute" OR 503 | "royal istitute" OR 504 | "royal lnstitute" OR 505 | "royal lnstitufe" OR 506 | "royal lnstltute") AND "tech") OR 507 | 508 | (("kgl" OR 509 | "kgl." OR 510 | "kungl" OR 511 | "kungl." OR 512 | "kungliga") AND "tekn") OR 513 | 514 | "r inst of technol" OR 515 | "r inst. of technol." OR 516 | "r. inst. of tech." OR 517 | "r. inst. of technol" OR 518 | "r. inst. of technol." 
OR 519 | "royal tech" OR 520 | "institute of technology stockholm" OR 521 | "royal of technology" OR 522 | "royal school of technology" OR 523 | "royal swedish institute of technology" OR 524 | "royal university of technology" OR 525 | "royal college of technology" OR 526 | "royalinstitute" OR 527 | "alfven" OR 528 | "alfv\u00e9n" OR 529 | "10044 stockholm" OR 530 | "100 44 stockholm") 531 | 532 | NOT 533 | 534 | ("khyber" OR 535 | "peshawar" OR 536 | "mcmaster")' 537 | 538 | } 539 | 540 | # There seems to be a way to fetch ngrams 541 | 542 | ## https://api.openalex.org/works/W3128409631/ngrams 543 | ## https://api.openalex.org/works/W2023271753/ngrams 544 | 545 | # Search UI for KTH 546 | 547 | ## https://explore.openalex.org/institutions/I86987016 548 | 549 | #' Recently published works based on query for matching raw affiliations 550 | #' @param raw_search_criteria raw affiliation string search criteria, 551 | #' by default openalex_kth_rawaff_query() 552 | #' @param since_days integer indicating days back from today 553 | #' @export 554 | #' @return list of tables with results 555 | openalex_works_published_since <- function( 556 | raw_search_criteria = openalex_kth_rawaff_query(), 557 | since_days = 7) { 558 | 559 | criteria_aff <- raw_search_criteria 560 | criteria_from <- format(Sys.Date() - since_days, "%Y-%m-%d") 561 | 562 | params <- paste0(collapse = ",", c( 563 | sprintf("raw_affiliation_strings.search:%s", criteria_aff), 564 | sprintf("from_publication_date:%s", criteria_from) 565 | ) 566 | ) 567 | 568 | openalex_crawl("works", fmt = "tables", verbose = TRUE, 569 | query = openalex_query( 570 | filter = params, 571 | verbose = FALSE 572 | ) 573 | ) 574 | 575 | } 576 | 577 | #' Recently updated works based on query for matching raw affiliations 578 | #' 579 | #' This function requires a premium subscription API key to be set. 580 | #' 581 | #' @param raw_search_criteria raw affiliation string search criteria, 582 | #' by default openalex_kth_rawaff_query() 583 | #' @param since_minutes integer indicating minutes since now 584 | #' @export 585 | #' @importFrom lubridate as_date format_ISO8601 586 | #' @return list of tables with results 587 | openalex_works_updated_since <- function( 588 | raw_search_criteria = openalex_kth_rawaff_query(), 589 | since_minutes) { 590 | 591 | if (is.null(cfg()$key)) 592 | stop("This function requires a Premium Subscription API key") 593 | 594 | criteria_aff <- raw_search_criteria 595 | 596 | #criteria_from <- "2024-01-15T08:02:55Z" #"2024-01-15T04:47:14.518460" 597 | criteria_from <- 598 | lubridate::as_datetime(Sys.time() - since_minutes * 60) |> 599 | lubridate::format_ISO8601(usetz = "Z") 600 | 601 | params <- paste0(collapse = ",", c( 602 | sprintf("raw_affiliation_strings.search:%s", criteria_aff), 603 | sprintf("from_updated_date:%s", criteria_from) 604 | ) 605 | ) 606 | 607 | openalex_crawl("works", fmt = "tables", verbose = TRUE, 608 | query = openalex_query( 609 | filter = params, 610 | verbose = FALSE 611 | ) 612 | ) 613 | 614 | } 615 | 616 | #' Recently created works based on query for matching raw affiliations 617 | #' 618 | #' This function requires a premium subscription API key to be set. 
619 | #'
620 | #' @param raw_search_criteria raw affiliation string search criteria,
621 | #' by default openalex_kth_rawaff_query()
622 | #' @param since_days integer indicating days back from today
623 | #' @export
624 | #' @importFrom lubridate as_date
625 | #' @return list of tables with results
626 | openalex_works_created_since <- function(
627 |   raw_search_criteria = openalex_kth_rawaff_query(),
628 |   since_days = 0) {
629 |
630 |   if (is.null(cfg()$key))
631 |     stop("This function requires a Premium Subscription API key")
632 |
633 |   criteria_aff <- raw_search_criteria
634 |
635 |   criteria_from <-
636 |     lubridate::as_date(Sys.Date() - since_days) |>
637 |     format("%Y-%m-%d")
638 |
639 |   params <- paste0(collapse = ",", c(
640 |     sprintf("raw_affiliation_strings.search:%s", criteria_aff),
641 |     sprintf("from_created_date:%s", criteria_from)
642 |   )
643 |   )
644 |
645 |   openalex_crawl("works", fmt = "tables", verbose = TRUE,
646 |     query = openalex_query(
647 |       filter = params,
648 |       verbose = FALSE
649 |     )
650 |   )
651 |
652 | }
653 |
654 | #' @import httr2
655 | openalex_aboutness <- function(title, abstract = NULL, verbose = FALSE, format = c("object", "tables")) {
656 |
657 |   # "https://api.openalex.org/text?title=type%201%20diabetes%20research%20for%20children
658 |   # https://groups.google.com/g/openalex-users/c/Df4dIA19adM
659 |
660 |   is_invalid <- function(x) nchar(x) < 20 | nchar(x) > 2000
661 |
662 |   if (is_invalid(title))
663 |     stop("Title must be between 20 and 2000 characters long")
664 |
665 |   if (!is.null(abstract) && is_invalid(abstract))
666 |     stop("Abstract, if provided, must be between 20 and 2000 characters long")
667 |
668 |   q <- purrr::compact(list(title = title, abstract = abstract))
669 |
670 |   req <-
671 |     httr2::request(openalex_api()) |>
672 |     httr2::req_url_path("text") |>
673 |     httr2::req_user_agent(cfg()$user_agent) |>
674 |     httr2::req_body_json(data = q)
675 |
676 |   if (verbose)
677 |     req <- req |> httr2::req_verbose()
678 |
679 |   resp <- req |> httr2::req_perform()
680 |
681 |   res <- switch(match.arg(format),
682 |     "object" = resp |> httr2::resp_body_json(),
683 |     "tables" = parse_resp_aboutness(resp |> httr2::resp_body_json())
684 |   )
685 |
686 |   return(res)
687 |
688 | }
689 |
690 | parse_topics <- function(topics) {
691 |
692 |   ones <-
693 |     topics |> map(\(x) purrr::discard_at(x, at = c("field", "domain", "subfield"))) |>
694 |     bind_rows()
695 |
696 |   manies <-
697 |     topics |> map(\(x) purrr::keep_at(x, at = c("field", "domain", "subfield")))
698 |
699 |   fsd <- bind_cols(
700 |     manies |> map("field") |> bind_rows() |> rename_with(\(x) paste0("field_", x)),
701 |     manies |> map("subfield") |> bind_rows() |> rename_with(\(x) paste0("subfield_", x)),
702 |     manies |> map("domain") |> bind_rows() |> rename_with(\(x) paste0("domain_", x))
703 |   )
704 |
705 |   bind_cols(ones, fsd)
706 |
707 | }
708 |
709 | parse_resp_aboutness <- function(resp) {
710 |
711 |   d <- resp
712 |
713 |   meta <-
714 |     d$meta |> bind_rows()
715 |
716 |   keywords <-
717 |     d$keywords |> bind_rows()
718 |
719 |   topics <-
720 |     d$topics |> parse_topics()
721 |
722 |   primary_topic <-
723 |     list(d$primary_topic) |> parse_topics()
724 |
725 |   concepts <-
726 |     bind_cols(
727 |       d$concepts |> bind_rows() |> select(-any_of("ancestors")),
728 |       d$concepts |> bind_rows() |> pull(any_of("ancestors")) |> map(bind_rows) |>
729 |         bind_rows() |> rename_with(.fn = \(x) paste0("ancestors_", x))
730 |     )
731 |
732 |   list(meta = meta, keywords = keywords, topics = topics, concepts = concepts)
733 |
734 | }
735 |
736
| 737 | 738 | openalex_filter_similar_topics <- function(work_identifier, granularity = c("topic", "domain", "field", "subfield")) { 739 | 740 | w <- openalex_work(work_identifier, format = "object") 741 | 742 | topic_id <- function(w, field_type) { 743 | 744 | f <- switch(field_type, 745 | topic = "topics.id", 746 | domain = "topics.domain.id", 747 | field = "topics.field.id", 748 | subfield = "topics.subfield.id" 749 | ) 750 | 751 | if (field_type == "topic") 752 | field_type <- NULL 753 | 754 | res <- 755 | w$topics |> map_chr(c(field_type, "id")) |> unique() |> 756 | gsub(pattern = "https://.*?/(.*?)$", replacement = "\\1") 757 | 758 | paste0(f, ":", paste0(collapse = "|", res)) 759 | } 760 | 761 | topics_filter <- function(w) { 762 | fields <- granularity 763 | topics <- fields |> map_chr(function(x) topic_id(w, x)) 764 | topics |> paste(collapse = ",") 765 | } 766 | 767 | topics_filter(w) 768 | 769 | } 770 | 771 | openalex_works_to_tbls <- function(works) { 772 | 773 | pw2 <- purrr::possibly(parse_work2, otherwise = NULL, quiet = FALSE) 774 | 775 | message("Converting record batches to tables...") 776 | tbls <- works |> map(pw2, .progress = TRUE) 777 | message("Done") 778 | 779 | message("Unifying and merging tables...") 780 | 781 | unify_slots <- function(tbls) { 782 | 783 | slotz <- map(tbls, names) |> unique() |> unlist() 784 | strip_prefix <- function(x) gsub("^https://.*?/(.*?)$", "\\1", x) 785 | strip_doi <- function(x) gsub("^https://doi.org/(.*?)$", "\\1", x) 786 | #message("Merging slots:\n", slotz |> paste0(collapse = "\n")) 787 | unify <- function(x) { 788 | 789 | tbls |> map(x) |> bind_rows() |> 790 | readr::type_convert(guess_integer = TRUE) |> 791 | suppressMessages() |> suppressWarnings() |> 792 | mutate(across(where(function(x) is.character(x)) & !any_of(c("doi")), .fns = strip_prefix)) |> 793 | mutate(across(any_of(c("doi")), .fns = strip_doi)) |> 794 | select(where(Negate(is.list))) 795 | } 796 | res <- slotz |> map(unify) |> setNames(nm = slotz) 797 | return (res) 798 | } 799 | 800 | out <- unify_slots(tbls) 801 | 802 | message("Done") 803 | return(out) 804 | } 805 | -------------------------------------------------------------------------------- /R/openalex-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | #' Topics 5 | #' 6 | #' Topics used by OpenAlex 7 | #' @format A data frame with 4516 rows and 9 variables: 8 | #' \describe{ 9 | #' \item{\code{id_topic}}{character the id for the topic} 10 | #' \item{\code{topic}}{character description of topic} 11 | #' \item{\code{description}}{character long form description of this topic cluster} 12 | #' \item{\code{id_subfield}}{character the id for the subfield of this topic} 13 | #' \item{\code{subfield}}{character description of the subfield} 14 | #' \item{\code{id_field}}{character the id of the field} 15 | #' \item{\code{field}}{character description of the field} 16 | #' \item{\code{id_domain}}{character the id of the domain} 17 | #' \item{\code{domain}}{character description of the domain} 18 | #'} 19 | #' @details DETAILS 20 | "topics" 21 | 22 | ## usethis namespace: start 23 | ## usethis namespace: end 24 | NULL 25 | -------------------------------------------------------------------------------- /R/rectangularize.R: -------------------------------------------------------------------------------- 1 | #' @importFrom dplyr rename bind_rows select tibble starts_with distinct 2 | #' @importFrom tidyr unnest unnest_wider hoist 
unnest_longer 3 | #' @importFrom purrr map_dfr map pluck 4 | parse_work <- function(chunk) { 5 | 6 | openalex <- NULL 7 | 8 | # TODO fix this? 9 | # if (length(lengths(chunk)) == 1) 10 | # chunk <- list(chunk) 11 | 12 | if (length(chunk) == 0) return(list()) 13 | 14 | work_ids <- 15 | chunk |> 16 | map_dfr("ids") |> 17 | rename(work_id = openalex) 18 | 19 | work_host_venue <- 20 | chunk |> 21 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "host_venue"))) |> 22 | bind_rows() #|> unnest(issn) 23 | # map_dfr(bind_rows) 24 | 25 | work_open_access <- 26 | chunk |> 27 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "open_access"))) |> 28 | map_dfr(bind_rows) 29 | 30 | work_biblio <- 31 | chunk |> 32 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "biblio"))) |> 33 | map_dfr(bind_rows) 34 | 35 | work_authorships <- 36 | chunk |> 37 | map(function(x) c(work_id = pluck(x, "id"), pluck(x, "authorships"))) 38 | 39 | work_authorships_author <- 40 | tibble(wa = work_authorships) |> 41 | hoist("wa", "work_id") |> 42 | unnest("wa") |> 43 | unnest_wider("wa", names_sep = "_") |> 44 | unnest_wider("wa_author") |> 45 | select(-starts_with(c("wa_institutions"))) 46 | # work_authorships |> 47 | # tibble(wa = .) |> 48 | # hoist("wa", "work_id") |> 49 | # unnest_wider("wa", names_sep = "_") |> 50 | # unnest_wider("wa_1") |> 51 | # unnest_longer("author") |> 52 | # pivot_wider(names_from = "author_id", values_from = "author") |> 53 | # select(-starts_with(c("wa_", "institutions"))) 54 | 55 | # map(function(x) tibble( 56 | # work_id = pluck(x, "work_id"), 57 | # author_position = pluck(x, 2, "author_position"), 58 | # author_id = pluck(x, 2, "author", "id"), 59 | # author_display_name = pluck(x, 2, "author", "display_name"), 60 | # author_orcid = pluck(x, 2, "author", "orcid") 61 | # )) |> 62 | # bind_rows() |> unnest(author) |> 63 | # unnest_wider("author") |> 64 | # rename(author_id = id, author_display_name = display_name) 65 | 66 | 67 | work_authorships_institutions <- 68 | tibble(wa = work_authorships) |> 69 | hoist("wa", "work_id") |> 70 | unnest("wa") |> 71 | unnest_wider("wa", names_sep = "_") |> 72 | select(-c("wa_author_position")) |> 73 | unnest("wa_institutions") |> 74 | unnest_wider("wa_institutions") |> 75 | select(-c("wa_author")) |> 76 | distinct() 77 | 78 | # work_authorships |> 79 | # map_dfr(function(x) tibble( 80 | # work_id = pluck(x, "work_id"), 81 | # raw_affiliation_string = pluck(x, "raw_affiliation_string"), 82 | # institutions = pluck(x, "institutions") 83 | # )) |> 84 | # map("institutions") 85 | # unnest_wider("institutions") |> 86 | # rename(institution_id = id, institution_display_name = display_name) |> 87 | # distinct() |> 88 | # filter(!is.na(raw_affiliation_string)) 89 | 90 | work_concepts <- 91 | chunk |> 92 | map(function(x) tibble(work_id = pluck(x, "id"), pluck(x, "concepts"))) |> 93 | map_dfr(bind_rows) |> unnest_wider(2) 94 | 95 | # work_mesh <- 96 | # chunk |> 97 | # map(function(x) tibble(work_id = pluck(x, "id"), pluck(x, "mesh"))) |> 98 | # map_dfr(bind_rows) 99 | 100 | aii_to_df <- function(x) { 101 | tibble(attr = names(x), val = x) |> 102 | unnest_wider("val", names_repair = function(x) paste0("i", seq_along(x) - 1)) 103 | } 104 | 105 | #abstract_inverted_index <- 106 | # chunk$abstract_inverted_index |> aii_to_df() 107 | 108 | abstract_inverted_index <- 109 | chunk |> 110 | map(function(x) tibble( 111 | work_id = pluck(x, "id"), 112 | aii_value = paste(collapse = " ", unlist(pluck(x, "abstract_inverted_index", .default = NA_integer_))), 113 | 
aii_key = paste(collapse = " ", unique(names(pluck(x, "abstract_inverted_index", .default = NA_character_)))) 114 | )) |> 115 | map_dfr(bind_rows) |> 116 | unnest_longer("aii_value") |> 117 | distinct() 118 | 119 | # abstract_inverted_index <- 120 | # chunk[1:20] |> 121 | # map_dfr(function(x) tibble(work_id = pluck(x, "id"), aii = pluck(x, "abstract_inverted_index"))) |> 122 | # bind_cols(aii_to_df(.$aii)) |> 123 | # select(!any_of("aii")) 124 | # 125 | # unnest_wider("aii", transform = function(x) aii_to_df(x)) 126 | 127 | work_counts_by_year <- 128 | chunk |> 129 | map(function(x) tibble(work_id = pluck(x, "id"), cby = pluck(x, "counts_by_year"))) |> 130 | map_dfr(bind_rows) |> 131 | unnest_wider("cby") 132 | 133 | work_related_works <- 134 | chunk |> 135 | map(function(x) tibble(work_id = pluck(x, "id"), related_works = pluck(x, "related_works"))) |> 136 | map_dfr(bind_rows) |> 137 | unnest_longer("related_works") 138 | 139 | work_referenced_works <- 140 | chunk |> 141 | map(function(x) tibble(work_id = pluck(x, "id"), referenced_works = pluck(x, "referenced_works"))) |> 142 | map_dfr(bind_rows) |> 143 | unnest_longer("referenced_works") 144 | 145 | 146 | work <- 147 | chunk |> map_dfr( 148 | function(x) tibble( 149 | id = pluck(x, "id"), 150 | doi = pluck(x, "doi"), 151 | display_name = pluck(x, "display_name"), 152 | title = pluck(x, "title"), 153 | publication_year = pluck(x, "publication_year"), 154 | publication_date = pluck(x, "publication_date"), 155 | type = pluck(x, "type"), 156 | cited_by_count = pluck(x, "cited_by_count"), 157 | is_retracted = pluck(x, "is_retracted"), 158 | is_paratext = pluck(x, "is_paratext"), 159 | updated_date = pluck(x, "updated_date"), 160 | cited_by_api_url = pluck(x, "cited_by_api_url"), 161 | created_date = pluck(x, "created_date") 162 | ) 163 | ) 164 | 165 | list( 166 | work = work, 167 | work_ids = work_ids, 168 | # work_mesh = work_mesh, 169 | work_concepts = work_concepts, 170 | work_authorships_institutions = work_authorships_institutions, 171 | work_abstract_inverted_index = abstract_inverted_index, 172 | work_authorships_author = work_authorships_author, 173 | work_biblio = work_biblio, 174 | work_open_access = work_open_access, 175 | work_host_venue = work_host_venue, 176 | work_counts_by_year = work_counts_by_year, 177 | work_related_works = work_related_works, 178 | work_referenced_works = work_referenced_works 179 | ) 180 | 181 | } 182 | 183 | #' @noRd 184 | #' @import tidyr dplyr purrr 185 | parse_work2 <- function(object) { 186 | 187 | name <- value <- work_id <- NULL 188 | 189 | unfwv <- function(l, field) { 190 | if (is.null(l$field)) return(tibble()) 191 | l |> map(\(x) keep_at(x, c("id", field))) |> 192 | enframe() |> 193 | unnest_wider(any_of("value")) |> 194 | tidyr::unnest_wider(any_of(field)) |> 195 | select(-any_of(c("name"))) 196 | } 197 | 198 | unfwvs <- function(l, field) { 199 | if (is.null(l$field)) return(tibble()) 200 | l |> map(\(x) keep_at(x, c("id", field))) |> 201 | enframe() |> 202 | unnest_wider(any_of("value")) |> 203 | tidyr::unnest_wider(any_of(field), names_sep = "_") |> 204 | select(-any_of(c("name"))) 205 | } 206 | 207 | unfw <- function(l, field) { 208 | if (is.null(l$field)) return(tibble()) 209 | l |> map(\(x) keep_at(x, c("id", field))) |> 210 | compact() |> 211 | map_df(tibble::as_tibble) |> 212 | tidyr::unnest_wider(any_of(field)) |> 213 | compact() 214 | } 215 | 216 | unfws <- function(l, field) { 217 | if (is.null(l$field)) return(tibble()) 218 | l |> map(\(x) keep_at(x, c("id", field))) |> 219 | 
compact() |> 220 | map_df(tibble::as_tibble) |> 221 | tidyr::unnest_wider(any_of(field), names_sep = "_") |> 222 | compact() 223 | } 224 | 225 | unfl <- function(l, field) { 226 | #has_field <- l |> map_lgl(\(x) field %in% names(x)) |> all() 227 | #if (!has_field) return(data.frame(0)) 228 | if (is.null(l$field)) return(tibble()) 229 | l |> map(\(x) keep_at(x, c("id", field))) |> 230 | compact() |> 231 | map_df(tibble::as_tibble) |> 232 | tidyr::unnest_longer(any_of(field)) |> 233 | compact() 234 | } 235 | 236 | pluck_with_id <- function(x, field) { 237 | if (!pluck_exists(x, field)) return (NULL) 238 | c(id = pluck(x, "id"), pluck(x, field)) 239 | } 240 | 241 | w <- object 242 | 243 | colz <- 244 | w$results |> 245 | map(\(x) tibble(cols = names(x), l = lengths(x)) |> 246 | tidyr::pivot_wider(names_from = "cols", values_from = "l") 247 | ) |> 248 | bind_rows() |> 249 | summarize(across(everything(), max)) |> 250 | ungroup() |> 251 | tidyr::pivot_longer(cols = everything()) 252 | 253 | one_to_one <- colz |> filter(value == 1, name != "versions") |> pull(name) 254 | 255 | # workz <- 256 | # w$results |> 257 | # map(\(x) x[one_to_one] |> compact() |> as_tibble()) |> 258 | # bind_rows() 259 | 260 | plf <- function(o, f) { 261 | l <- o |> map(\(x) purrr::pluck(x, f)) |> unlist() 262 | list(l) |> setNames(nm = f) 263 | } 264 | 265 | # TODO: remove keep_empty(?) 266 | wide <- enframe(w) |> unnest_longer(2, keep_empty = TRUE) |> unnest_wider(2) 267 | 268 | workz <- 269 | wide |> select(any_of(one_to_one)) 270 | 271 | ids <- 272 | wide |> select(work_id = id, any_of(c("ids"))) |> unnest_wider(any_of(c("ids"))) 273 | 274 | re_ids <- paste0( 275 | "(https://openalex.org/)|(https://doi.org/)|", 276 | "(https://pubmed.ncbi.nlm.nih.gov/)|(https://www.ncbi.nlm.nih.gov/pmc/articles/)|", 277 | "(https://www.wikidata.org/wiki/)|(https://orcid.org/)|(https://ror.org/)" 278 | ) 279 | 280 | fuw <- function(fields) { 281 | wide |> select(work_id = "id", any_of(c(fields))) |> 282 | unnest_wider(any_of(c(fields)), names_sep = "_") |> 283 | # unnest_wider(any_of(c(fields))) |> 284 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 285 | } 286 | 287 | authorships <- 288 | wide |> select(work_id = "id", any_of("authorships")) |> 289 | unnest_longer(2) |> unnest_wider(2) |> 290 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) 291 | 292 | authorships_affiliations_raw <- 293 | authorships |> 294 | select(any_of(c("work_id", "author_id", "raw_author_name", "raw_affiliation_strings"))) |> 295 | unnest_longer("raw_affiliation_strings") |> 296 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) 297 | 298 | authorships_affiliations <- 299 | authorships |> 300 | select(any_of(c("work_id", "affiliations", "author"))) |> 301 | unnest_longer(any_of("affiliations")) |> 302 | unnest_wider(any_of("affiliations"), names_sep = "_") |> 303 | unnest_longer(any_of("affiliations_institution_ids")) |> 304 | unnest_wider(any_of("author"), names_sep = "_") |> 305 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) |> 306 | distinct() 307 | 308 | authorships_authors <- 309 | authorships |> 310 | unnest_wider(any_of("author"), names_sep = "_") |> 311 | select(-any_of(c("institutions", "affiliations", "raw_author_name", "raw_affiliation_strings", "countries"))) |> 312 | # unnest_wider(any_of(c("countries")), names_sep = "_") |> 313 | mutate(across(contains("id"), \(x) gsub(re_ids, "", x))) 314 | 315 | authorships_institutions <- 316 | authorships |> 317 | select(any_of(c("work_id", "institutions", 
"author"))) |> 318 | unnest_longer(any_of("institutions")) |> 319 | unnest_wider("institutions", names_sep = "_") |> 320 | unnest_longer("institutions_lineage") |> 321 | unnest_wider(any_of(c("author")), names_sep = "_") |> 322 | mutate(across(everything(), \(x) gsub(re_ids, "", x))) 323 | 324 | fields <- c( 325 | "ids", "open_access", "apc_list", "apc_paid", 326 | "citation_normalized_percentile", "cited_by_percentile_year", 327 | "biblio" 328 | ) 329 | 330 | fields <- fields[which(fields %in% unique(colz$name))] 331 | various <- fields |> map(fuw) |> set_names(fields) 332 | 333 | fields2 <- c("counts_by_year", "grants", "mesh") 334 | fields2 <- fields2[which(fields2 %in% unique(colz$name))] 335 | 336 | bcbr <- function(field) { 337 | w$results |> map_dfr(\(x) bind_cols(work_id = x$id, bind_rows(x |> getElement(field)))) |> 338 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 339 | } 340 | 341 | various2 <- fields2 |> map(bcbr) |> set_names(fields2) 342 | 343 | fields3 <- c( 344 | "sustainable_development_goals", 345 | "keywords", 346 | "concepts"# 347 | #"datasets" 348 | ) 349 | 350 | fields3 <- fields3[which(fields3 %in% unique(colz$name))] 351 | 352 | various3 <- 353 | fields3 |> map(bcbr) |> set_names(fields3) 354 | 355 | datasets <- 356 | wide |> select(id, datasets) |> unnest(datasets) |> unnest(datasets) |> 357 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 358 | 359 | fields4 <- c( 360 | "referenced_works", 361 | "related_works", 362 | "indexed_in", 363 | "corresponding_institution_ids", 364 | "corresponding_author_ids"#, 365 | # "abstract_inverted_index" 366 | ) 367 | 368 | fields4 <- fields4[which(fields4 %in% unique(colz$name))] 369 | 370 | bcbv <- function(field) { 371 | w$results |> map_dfr(\(x) bind_cols(work_id = x$id, rw = unlist(x |> getElement(field)))) |> 372 | setNames(nm = c("work_id", field)) |> 373 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 374 | } 375 | 376 | various4 <- 377 | fields4 |> map(bcbv)|> set_names(nm = fields4) 378 | 379 | aii_to_abstract <- function(aii) { 380 | 381 | value <- NULL 382 | 383 | abstract <- 384 | aii |> enframe() |> 385 | unnest_longer(any_of(c("value"))) |> 386 | arrange(-desc(value)) |> 387 | pull(any_of(c("name"))) |> 388 | paste0(collapse = " ") 389 | 390 | if (!nzchar(abstract)) 391 | return (NA_character_) 392 | 393 | return (abstract) 394 | 395 | } 396 | 397 | abstracts <- 398 | w$results |> 399 | map(function(x) tibble( 400 | work_id = pluck(x, "id"), 401 | abstract = aii_to_abstract(pluck(x, "abstract_inverted_index")) 402 | )) |> 403 | map_dfr(bind_rows) 404 | 405 | primary_location <- 406 | "primary_location" |> fuw() 407 | 408 | primary_location_source <- 409 | primary_location |> select(any_of(c("work_id", "primary_location_source"))) |> 410 | mutate(primary_location_source = map(primary_location_source, 411 | \(x) eval(parse(text = x)))) |> 412 | mutate(primary_location_source = map(primary_location_source, 413 | \(x) compact(x) |> enframe() |> pivot_wider())) |> #|> as_tibble())) |> 414 | #pull(primary_location_source) |> head(1) 415 | unnest(2) |> 416 | unnest_longer(any_of("issn")) |> 417 | unnest(any_of(everything())) |> 418 | unnest_wider(any_of(c("host_organization_lineage")), names_sep = "_") 419 | 420 | primary_location <- 421 | primary_location |> select(-any_of("primary_location_source")) 422 | 423 | primary_topic <- 424 | "primary_topic" |> fuw() |> 425 | mutate(across(any_of( 426 | c("primary_topic_subfield", "primary_topic_field", "primary_topic_domain")), 427 | \(y) 
y |> map(\(x) eval(parse(text = x)))) 428 | ) |> 429 | mutate(across(any_of( 430 | c("primary_topic_subfield", "primary_topic_field", "primary_topic_domain")), 431 | \(y) y |> map(\(x) compact(x) |> as_tibble())) 432 | ) |> 433 | unnest("primary_topic_subfield", names_sep = "_") |> 434 | unnest("primary_topic_field", names_sep = "_") |> 435 | unnest("primary_topic_domain", names_sep = "_") 436 | 437 | topics <- 438 | wide |> select(any_of(c("id", "topics"))) |> 439 | unnest(topics) |> 440 | unnest_wider(topics, names_sep = "_") |> 441 | unnest_wider(any_of("topics_field"), names_sep = "_") |> 442 | unnest_wider(any_of("topics_subfield"), names_sep = "_") |> 443 | unnest_wider(any_of("topics_domain"), names_sep = "_") |> 444 | compact() |> 445 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 446 | 447 | best_oa_location <- 448 | "best_oa_location" |> fuw() 449 | 450 | best_oa_location_source <- 451 | best_oa_location |> 452 | select(any_of(c("work_id", "best_oa_location_source"))) |> 453 | mutate(best_oa_location_source = map(best_oa_location_source, 454 | \(x) eval(parse(text = x)))) |> 455 | mutate(best_oa_location_source = map(best_oa_location_source, 456 | \(x) compact(x) |> enframe() |> pivot_wider())) |> #|> as_tibble())) |> 457 | unnest(2) |> 458 | unnest_longer(any_of("issn")) |> 459 | unnest(any_of(everything())) |> 460 | compact() |> 461 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 462 | 463 | best_oa_location <- 464 | best_oa_location |> select(-any_of(c("best_oa_location_source"))) 465 | 466 | locations <- 467 | wide |> select(any_of(c("id", "locations"))) |> 468 | unnest(any_of(c("locations"))) |> 469 | unnest_wider(any_of(c("locations"))) |> 470 | unnest_wider(any_of(c("source")), names_sep = "_") |> 471 | #w$results |> unfw("locations") |> 472 | #unnest_wider(any_of("source"), names_sep = "_") |> 473 | unnest_longer(any_of("source_issn")) |> 474 | unnest_longer(any_of(c("source_host_organization_lineage", "source_host_organization_lineage_names"))) |> 475 | compact() |> 476 | mutate(across(-contains("url"), \(x) gsub(re_ids, "", x))) 477 | 478 | c( 479 | list(work = workz), 480 | list(abstracts = abstracts), 481 | list(authorships_affiliations_raw = authorships_affiliations_raw), 482 | list(authorships_affiliations = authorships_affiliations), 483 | list(authorships_authors = authorships_authors), 484 | list(authorships_institutions = authorships_institutions), 485 | list(datasets = datasets), 486 | various, various2, various3, various4, 487 | list( 488 | primary_location = primary_location, 489 | primary_location_source = primary_location_source, 490 | best_oa_location = best_oa_location, 491 | best_oa_location_source = best_oa_location_source, 492 | locations = locations, 493 | primary_topic = primary_topic, 494 | topics = topics 495 | ) 496 | ) 497 | 498 | } 499 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | #' @param lhs A value or the magrittr placeholder. 12 | #' @param rhs A function call using the magrittr semantics. 13 | #' @return The result of calling `rhs(lhs)`. 
14 | NULL
15 | -------------------------------------------------------------------------------- /README.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 | 
5 | 
6 | 
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 |   collapse = TRUE,
10 |   comment = "#>"
11 | )
12 | ```
13 | 
14 | # openalex
15 | 
16 | 
17 | [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
18 | [![R-CMD-check](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml)
19 | 
20 | 
21 | The goal of `openalex` is to provide access to data from [OpenAlex](https://openalex.org) - an open and comprehensive catalog of scholarly papers, authors, institutions and more ... - to R through the [Open Alex REST API](https://docs.openalex.org/api)...
22 | 
23 | ## Installation
24 | 
25 | You can install the current version of `openalex` from [GitHub](https://github.com/kth-library/openalex) with:
26 | 
27 | ``` r
28 | #install.packages("devtools")
29 | devtools::install_github("kth-library/openalex", dependencies = TRUE)
30 | ```
31 | 
32 | ## Example
33 | 
34 | This is a basic example which shows you how to get information for papers and authors:
35 | 
36 | ```{r example, eval=TRUE}
37 | 
38 | library(openalex)
39 | library(dplyr)
40 | suppressPackageStartupMessages(library(purrr))
41 | library(knitr)
42 | 
43 | iid <-
44 |   openalex:::openalex_autocomplete(
45 |     query = "Royal Institute of Technology",
46 |     entity_type = "institution",
47 |     format = "table") |>
48 |   head(1) |>
49 |   pull("id")
50 | 
51 | data <-
52 |   openalex_crawl(entity = "works", verbose = TRUE, fmt = "tables",
53 |     query = openalex:::openalex_query(filter =
54 |       sprintf("institutions.id:%s,publication_year:2025", iid)))
55 | 
56 | res <- data |> map(head)  # return only first six rows from each table
57 | 
58 | res
59 | ```
60 | 
61 | ## Rate limits and using an API key
62 | 
63 | By providing an email address you enter the "polite pool", which is subject to less aggressive rate limiting for API requests.
64 | 
65 | You can provide it in `~/.Renviron` by setting `OPENALEX_USERAGENT=http://github.com/hadley/httr (mailto:your_email@your_institution.org)`.
66 | 
67 | You can also set it just for the session by using the helper function `openalex_polite()` to temporarily set or unset the email used in the user agent string when making API requests:
68 | 
69 | ```{r polite}
70 | library(openalex)
71 | 
72 | # set an email to use for the session
73 | 
74 | openalex_polite("you@example.com")
75 | 
76 | # unset, and use default user agent string...
77 | 
78 | openalex_polite("")
79 | 
80 | ```
81 | 
82 | A premium subscription API key can be used by setting `OPENALEX_KEY=secret_premium_api_key` in your `.Renviron`, or temporarily in a session using:
83 | 
84 | ```{r premium, eval = FALSE}
85 | library(openalex)
86 | 
87 | # temporarily use a premium subscription API key
88 | openalex_key("secret_premium_api_key")
89 | 
90 | # unset to not use the premium subscription API key
91 | openalex_key("")
92 | 
93 | ```
94 | 
95 | This makes it possible to issue API calls that return the latest available records, for example based on recent creation dates or recent last-modification timestamps.
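Under the hood, the convenience functions used below simply combine a raw affiliation search with a `from_created_date` or `from_updated_date` filter. A manual equivalent built directly with `openalex_crawl()` might look like this (a sketch: the institution id and the date are placeholder values, and `OPENALEX_KEY` must be set):

``` r
# hypothetical one-off query: works created since a given date
openalex_crawl("works", fmt = "tables",
  query = openalex:::openalex_query(
    filter = "institutions.id:I86987016,from_created_date:2025-01-01"))
```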
96 | 97 | ```{r updates, eval = TRUE} 98 | 99 | # we do not require an API key for the publish date 100 | published_since_ <- openalex_works_published_since(since_days = 7) 101 | 102 | # but an API key is needed when using "from_created_date" and "from_updated_date" fields. 103 | created_since_7d <- openalex_works_created_since(since_days = 7) 104 | updated_since_1h <- openalex_works_updated_since(since_minutes = 60) 105 | 106 | # first few rows of each of these retrievals 107 | created_since_7d |> _$work_ids |> head() |> knitr::kable() 108 | updated_since_1h |> _$work_ids |> head() |> knitr::kable() 109 | 110 | ``` 111 | 112 | 113 | ## Data source attribution 114 | 115 | When data from `openalex` is displayed publicly, this attribution also needs to be displayed: 116 | 117 | ```{r attribution} 118 | library(openalex) 119 | openalex_attribution() 120 | ``` 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # openalex 5 | 6 | 7 | 8 | [![Lifecycle: 9 | experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) 10 | [![R-CMD-check](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/KTH-Library/openalex/actions/workflows/R-CMD-check.yaml) 11 | 12 | 13 | The goal of `openalex` is to provide access to data from 14 | [OpenAlex](https://openalex.org) - an open and comprehensive catalog of 15 | scholarly papers, authors, institutions and more … - to R through the 16 | [Open Alex REST API](https://docs.openalex.org/api)… 17 | 18 | ## Installation 19 | 20 | You can install the current version of `openalex` from 21 | [GitHub](https://github.com/kth-library/openalex) with: 22 | 23 | ``` r 24 | #install.packages("devtools") 25 | devtools::install_github("kth-library/openalex", dependencies = TRUE) 26 | ``` 27 | 28 | ## Example 29 | 30 | This is a basic example which shows you how to get information for 31 | papers and authors: 32 | 33 | ``` r 34 | 35 | library(openalex) 36 | library(dplyr) 37 | #> 38 | #> Attaching package: 'dplyr' 39 | #> The following objects are masked from 'package:stats': 40 | #> 41 | #> filter, lag 42 | #> The following objects are masked from 'package:base': 43 | #> 44 | #> intersect, setdiff, setequal, union 45 | suppressPackageStartupMessages(library(purrr)) 46 | library(knitr) 47 | 48 | iid <- 49 | openalex:::openalex_autocomplete( 50 | query = "Royal Institute of Technology", 51 | entity_type = "institution", 52 | format = "table") |> 53 | head(1) |> 54 | pull("id") 55 | #> Requesting url: https://api.openalex.org/autocomplete/institutions?q=Royal%20Institute%20of%20Technology 56 | 57 | data <- 58 | openalex_crawl(entity = "works", verbose = TRUE, fmt = "tables", 59 | query = openalex:::openalex_query(filter = 60 | sprintf("institutions.id:%s,publication_year:2025", iid))) 61 | #> About to crawl a total of 11 pages of results with a total of 257 records. 
62 | #> ■■■■■■■■■ 27% | ETA: 3s 63 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 91% | ETA: 0s 64 | 65 | res <- data |> map(head) # return only first six rows from each table 66 | 67 | res 68 | #> $work 69 | #> # A tibble: 6 × 13 70 | #> id doi display_name title publication_year publication_date type 71 | #> 72 | #> 1 W4405989080 https:… Molecular b… Mole… 2025 2025-01-01 arti… 73 | #> 2 W4406001145 https:… Tracking th… Trac… 2025 2025-01-02 arti… 74 | #> 3 W4406016239 https:… DGCR2 targe… DGCR… 2025 2025-01-02 arti… 75 | #> 4 W4406082819 https:… Screening a… Scre… 2025 2025-01-05 arti… 76 | #> 5 W4406172907 https:… Static deep… Stat… 2025 2025-01-01 arti… 77 | #> 6 W4406435778 https:… Consistent,… Cons… 2025 2025-01-16 arti… 78 | #> # ℹ 6 more variables: cited_by_count , is_retracted , 79 | #> # is_paratext , updated_date , cited_by_api_url , 80 | #> # created_date 81 | #> 82 | #> $work_ids 83 | #> # A tibble: 6 × 3 84 | #> work_id doi pmid 85 | #> 86 | #> 1 W4405989080 https://doi.org/10.1016/j.cell.2024.11.036 https://pubmed.ncbi.… 87 | #> 2 W4406001145 https://doi.org/10.1038/s41467-024-55688-8 https://pubmed.ncbi.… 88 | #> 3 W4406016239 https://doi.org/10.1038/s41598-024-84574-y https://pubmed.ncbi.… 89 | #> 4 W4406082819 https://doi.org/10.1186/s12896-024-00926-6 https://pubmed.ncbi.… 90 | #> 5 W4406172907 https://doi.org/10.1063/5.0248856 91 | #> 6 W4406435778 https://doi.org/10.21468/scipostphyscodeb.45 92 | #> 93 | #> $work_concepts 94 | #> # A tibble: 6 × 6 95 | #> work_id id wikidata display_name level score 96 | #> 97 | #> 1 W4405989080 C86803240 https://www.wikidata.org/wiki… Biology 0 0.928 98 | #> 2 W4405989080 C170493617 https://www.wikidata.org/wiki… Receptor 2 0.615 99 | #> 3 W4405989080 C70721500 https://www.wikidata.org/wiki… Computation… 1 0.466 100 | #> 4 W4405989080 C12426560 https://www.wikidata.org/wiki… Basis (line… 2 0.455 101 | #> 5 W4405989080 C135285700 https://www.wikidata.org/wiki… G protein-c… 3 0.449 102 | #> 6 W4405989080 C95444343 https://www.wikidata.org/wiki… Cell biology 1 0.389 103 | #> 104 | #> $work_authorships_institutions 105 | #> # A tibble: 6 × 12 106 | #> work_id id display_name ror country_code type lineage wa_countries 107 | #> 108 | #> 1 W4405989080 I18067… University … http… US fund… 109 | #> 2 W4405989080 I18067… University … http… US fund… 110 | #> 3 W4405989080 I11402… University … http… US fund… 111 | #> 4 W4405989080 I42101… National In… http… US fund… 112 | #> 5 W4405989080 I28001… Science for… http… SE fund… 113 | #> 6 W4405989080 I86987… KTH Royal I… http… SE fund… 114 | #> # ℹ 4 more variables: wa_is_corresponding , wa_raw_author_name , 115 | #> # wa_raw_affiliation_strings , wa_affiliations 116 | #> 117 | #> $work_abstract_inverted_index 118 | #> # A tibble: 6 × 3 119 | #> work_id aii_value aii_key 120 | #> 121 | #> 1 W4405989080 122 | #> 2 W4406001145 0 1 8 15 19 23 44 60 81 86 90 99 106 123 127 138 149 166 … Regula… 123 | #> 3 W4406016239 124 | #> 4 W4406082819 0 1 2 252 279 3 26 69 77 114 118 134 142 164 193 198 211 … Abstra… 125 | #> 5 W4406172907 0 1 88 2 3 12 4 5 6 7 8 58 9 59 10 60 11 13 14 15 20 35 6… As com… 126 | #> 6 W4406435778 0 1 55 67 2 3 4 51 87 5 6 7 64 83 96 103 8 9 17 23 44 50 … Histog… 127 | #> 128 | #> $work_authorships_author 129 | #> # A tibble: 6 × 10 130 | #> work_id wa_author_position id display_name orcid wa_countries 131 | #> 132 | #> 1 W4405989080 first A5035848333 Matthew K. 
Howa… http… 133 | #> 2 W4405989080 middle A5106732468 Nick Hoppe http… 134 | #> 3 W4405989080 middle A5089436626 Xi‐Ping Huang http… 135 | #> 4 W4405989080 middle A5063488695 Darko Mitrovic http… 136 | #> 5 W4405989080 middle A5036507080 Christian B. Bi… http… 137 | #> 6 W4405989080 middle A5080561155 Christian B. Ma… http… 138 | #> # ℹ 4 more variables: wa_is_corresponding , wa_raw_author_name , 139 | #> # wa_raw_affiliation_strings , wa_affiliations 140 | #> 141 | #> $work_biblio 142 | #> # A tibble: 6 × 5 143 | #> work_id volume issue first_page last_page 144 | #> 145 | #> 1 W4405989080 NA 146 | #> 2 W4406001145 16 1 147 | #> 3 W4406016239 15 1 148 | #> 4 W4406082819 25 1 149 | #> 5 W4406172907 37 1 150 | #> 6 W4406435778 NA 151 | #> 152 | #> $work_open_access 153 | #> # A tibble: 6 × 5 154 | #> work_id is_oa oa_status oa_url any_repository_has_f…¹ 155 | #> 156 | #> 1 W4405989080 TRUE green https://doi.org/10.1101/20… TRUE 157 | #> 2 W4406001145 TRUE gold https://doi.org/10.1038/s4… FALSE 158 | #> 3 W4406016239 TRUE gold https://doi.org/10.1038/s4… FALSE 159 | #> 4 W4406082819 TRUE gold https://doi.org/10.1186/s1… FALSE 160 | #> 5 W4406172907 TRUE hybrid https://doi.org/10.1063/5.… FALSE 161 | #> 6 W4406435778 TRUE hybrid https://doi.org/10.21468/s… TRUE 162 | #> # ℹ abbreviated name: ¹​any_repository_has_fulltext 163 | #> 164 | #> $work_host_venue 165 | #> # A tibble: 6 × 1 166 | #> work_id 167 | #> 168 | #> 1 W4405989080 169 | #> 2 W4406001145 170 | #> 3 W4406016239 171 | #> 4 W4406082819 172 | #> 5 W4406172907 173 | #> 6 W4406435778 174 | #> 175 | #> $work_counts_by_year 176 | #> # A tibble: 6 × 3 177 | #> work_id year cited_by_count 178 | #> 179 | #> 1 W4405989080 2025 1 180 | #> 2 W4406001145 2025 1 181 | #> 3 W4406082819 2025 1 182 | #> 4 W4406172907 2025 1 183 | #> 5 W4406435778 2025 1 184 | #> 6 W4406435863 2025 1 185 | #> 186 | #> $work_related_works 187 | #> # A tibble: 6 × 2 188 | #> work_id related_works 189 | #> 190 | #> 1 W4405989080 W4391375266 191 | #> 2 W4405989080 W4224216382 192 | #> 3 W4405989080 W416861399 193 | #> 4 W4405989080 W3195483439 194 | #> 5 W4405989080 W3011298851 195 | #> 6 W4405989080 W2609050007 196 | #> 197 | #> $work_referenced_works 198 | #> # A tibble: 6 × 2 199 | #> work_id referenced_works 200 | #> 201 | #> 1 W4405989080 W1031578623 202 | #> 2 W4405989080 W1483147211 203 | #> 3 W4405989080 W1503765703 204 | #> 4 W4405989080 W1513618424 205 | #> 5 W4405989080 W1833104430 206 | #> 6 W4405989080 W189880865 207 | ``` 208 | 209 | ## Rate limits and using an API key 210 | 211 | By providing an email address you enter the “polite pool” which provides 212 | even less of rate limiting for API requests. 213 | 214 | You can provide it in `~/.Renviron` by setting 215 | `OPENALEX_USERAGENT=http://github.com/hadley/httr 216 | (mailto:your_email@your_institution.org)`. 217 | 218 | You can also set it just for the session by using a helper fcn 219 | `openalex_polite()` to temporarily set or unset the email used in the 220 | user agent string when making API requests: 221 | 222 | ``` r 223 | library(openalex) 224 | 225 | # set an email to use for the session 226 | 227 | openalex_polite("you@example.com") 228 | #> Hint: You can provide an email to enter the polite pool 229 | #> To have the setting stick persistently using .Renviron, do ... 
230 | #> file.edit("~/.Renviron") 231 | #> # and add a line OPENALEX_USERAGENT="http://github.com/hadley/httr (mailto:you@example.com)" 232 | #> Then reload settings for the R environment in the current session 233 | #> readRenviron("~/.Renviron") 234 | #> Temporarily setting OPENALEX_USERAGENT envvar for this session to: http://github.com/hadley/httr (mailto:you@example.com) 235 | #> [1] TRUE 236 | 237 | # unset, and use default user agent string... 238 | 239 | openalex_polite("") 240 | #> Exiting from polite pool, email no longer provided in user agent header 241 | #> [1] FALSE 242 | ``` 243 | 244 | A premium subscription API key can be used by setting 245 | `OPENALEX_KEY=secret_premium_api_key` in your `.Renviron`, or 246 | temporarily in a session using: 247 | 248 | ``` r 249 | library(openalex) 250 | 251 | # temporarily use a premium subscription API key 252 | openalex_key("secret_premium_api_key") 253 | 254 | # unset to not use the premium subscription API key 255 | openalex_key("") 256 | ``` 257 | 258 | This will make it possible to make API calls that return the latest 259 | available records, for example based on recent creation dates or recent 260 | last modification timestamps. 261 | 262 | ``` r 263 | 264 | # we do not require an API key for the publish date 265 | published_since_ <- openalex_works_published_since(since_days = 7) 266 | #> About to crawl a total of 1 pages of results with a total of 21 records. 267 | #> Warning: `type_convert()` only converts columns of type 'character'. 268 | #> - `df` has no columns of type 'character' 269 | 270 | # but an API key is needed when using "from_created_date" and "from_updated_date" fields. 271 | created_since_7d <- openalex_works_created_since(since_days = 7) 272 | #> About to crawl a total of 2 pages of results with a total of 44 records. 273 | #> Warning: `type_convert()` only converts columns of type 'character'. 274 | #> - `df` has no columns of type 'character' 275 | updated_since_1h <- openalex_works_updated_since(since_minutes = 60) 276 | #> About to crawl a total of 18 pages of results with a total of 442 records. 
277 | #> ■■■ 6% | ETA: 32s 278 | #> ■■■■ 11% | ETA: 35s 279 | #> ■■■■■■ 17% | ETA: 31s 280 | #> ■■■■■■■■■ 28% | ETA: 28s 281 | #> ■■■■■■■■■■■ 33% | ETA: 26s 282 | #> ■■■■■■■■■■■■■■ 44% | ETA: 21s 283 | #> ■■■■■■■■■■■■■■■■ 50% | ETA: 20s 284 | #> ■■■■■■■■■■■■■■■■■■ 56% | ETA: 17s 285 | #> ■■■■■■■■■■■■■■■■■■■■■ 67% | ETA: 13s 286 | #> ■■■■■■■■■■■■■■■■■■■■■■■ 72% | ETA: 11s 287 | #> ■■■■■■■■■■■■■■■■■■■■■■■■ 78% | ETA: 9s 288 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■ 83% | ETA: 7s 289 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 89% | ETA: 5s 290 | #> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 94% | ETA: 2s 291 | 292 | # first few rows of each of these retrievals 293 | created_since_7d |> _$work_ids |> head() |> knitr::kable() 294 | ``` 295 | 296 | | work\_id | doi | pmid | 297 | | :---------- | :------------------------------------------- | :----------------------------------------- | 298 | | W4407223209 | | NA | 299 | | W4407212560 | | NA | 300 | | W4407347106 | | NA | 301 | | W4407235267 | | NA | 302 | | W4407203234 | | | 303 | | W4407277358 | | NA | 304 | 305 | ``` r 306 | updated_since_1h |> _$work_ids |> head() |> knitr::kable() 307 | ``` 308 | 309 | | work\_id | doi | mag | pmid | pmcid | 310 | | :---------- | :----------------------------------------------- | ---------: | :----------------------------------------- | :---- | 311 | | W2010417920 | | 2010417920 | | NA | 312 | | W2076716399 | | 2076716399 | NA | NA | 313 | | W2130306081 | | 2130306081 | NA | NA | 314 | | W2153980567 | | 2153980567 | NA | NA | 315 | | W2131628642 | | 2131628642 | NA | NA | 316 | | W2045503958 | | 2045503958 | NA | NA | 317 | 318 | ## Data source attribution 319 | 320 | When data from `openalex` is displayed publicly, this attribution also 321 | needs to be displayed: 322 | 323 | ``` r 324 | library(openalex) 325 | openalex_attribution() 326 | #> [1] "Data source: OpenAlex API at https://api.openalex.org/\nData license agreement: https://creativecommons.org/publicdomain/zero/1.0/" 327 | ``` 328 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://kth-library.github.io/openalex/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /data-raw/DATASET.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `DATASET` dataset goes here 2 | 3 | usethis::use_data(DATASET, overwrite = TRUE) 4 | -------------------------------------------------------------------------------- /data/topics.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KTH-Library/openalex/c8b22a4462be7e114898f79152ac9c1ce86ae1da/data/topics.rda -------------------------------------------------------------------------------- /man/openalex-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/openalex-package.R 3 | \docType{package} 4 | \name{openalex-package} 5 | \alias{openalex} 6 | \alias{openalex-package} 7 | \title{openalex: Data from OpenAlex REST API} 8 | \description{ 9 | The OpenAlex website provides open data on papers/works, venues, institutions and more around the world under the CC0 license. This R package provides some functions to access data from the OpenAlex REST API. 
10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://kth-library.github.io/openalex/} 15 | } 16 | 17 | } 18 | \author{ 19 | \strong{Maintainer}: Markus Skyttner \email{markussk@kth.se} 20 | 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/openalex_api.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_api} 4 | \alias{openalex_api} 5 | \title{Endpoint used for requests to OpenAlex API} 6 | \usage{ 7 | openalex_api() 8 | } 9 | \description{ 10 | Endpoint used for requests to OpenAlex API 11 | } 12 | -------------------------------------------------------------------------------- /man/openalex_attribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_attribution} 4 | \alias{openalex_attribution} 5 | \title{Attribution} 6 | \usage{ 7 | openalex_attribution() 8 | } 9 | \description{ 10 | Use this attribution whenever data from the API is publicly displayed 11 | } 12 | \details{ 13 | OpenAlex provides a RESTful API for scholarly papers, authors, 14 | institutions, and more. When publicly displaying data from the API, 15 | it is polite to point back to OpenAlex at https://openalex.org/ 16 | } 17 | -------------------------------------------------------------------------------- /man/openalex_counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_export.R 3 | \name{openalex_counts} 4 | \alias{openalex_counts} 5 | \title{Counts from OpenAlex} 6 | \usage{ 7 | openalex_counts( 8 | filter = openalex_filter_default(), 9 | dimensions = openalex_groupbys_default() 10 | ) 11 | } 12 | \arguments{ 13 | \item{filter}{a set of filter criteria, see the defaults in openalex_filter_default()} 14 | 15 | \item{dimensions}{a set of grouping dimensions, see the defaults in openalex_groupbys_default()} 16 | } 17 | \value{ 18 | a list of tibbles 19 | } 20 | \description{ 21 | Aggregates/counts can be retrieved using the group_bys query parameter 22 | } 23 | -------------------------------------------------------------------------------- /man/openalex_crawl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_crawl} 4 | \alias{openalex_crawl} 5 | \title{Crawl multiple pages of results} 6 | \usage{ 7 | openalex_crawl(entity, query, verbose = FALSE, fmt = "object") 8 | } 9 | \arguments{ 10 | \item{entity}{one of the values in openalex_entity_enum()} 11 | 12 | \item{query}{an openalex_query object} 13 | 14 | \item{verbose}{boolean to indicate whether to output messages during process} 15 | 16 | \item{fmt}{the return format, one of "object" or "tables"} 17 | } 18 | \value{ 19 | R object with results matching the query 20 | } 21 | \description{ 22 | Iterates over paged results showing a progress bar 23 | } 24 | -------------------------------------------------------------------------------- /man/openalex_doi_lookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by 
roxygen2: do not edit by hand 2 | % Please edit documentation in R/doi_lookup.R 3 | \name{openalex_doi_lookup} 4 | \alias{openalex_doi_lookup} 5 | \title{Lookup DOIs using OpenAlex} 6 | \usage{ 7 | openalex_doi_lookup(dois, resolution = c("all", "identifiers")) 8 | } 9 | \arguments{ 10 | \item{dois}{a character vector of DOIs} 11 | 12 | \item{resolution}{either "all" or "identifiers" to only return other related identifiers} 13 | } 14 | \value{ 15 | tibble(s) 16 | } 17 | \description{ 18 | Lookup DOIs using OpenAlex 19 | } 20 | -------------------------------------------------------------------------------- /man/openalex_flatten_long.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_flatten_long} 4 | \alias{openalex_flatten_long} 5 | \title{Flatten R object from deserialized nested JSON object} 6 | \usage{ 7 | openalex_flatten_long(nestedlist) 8 | } 9 | \arguments{ 10 | \item{nestedlist}{a nested list of lists} 11 | } 12 | \value{ 13 | a tibble in long format 14 | } 15 | \description{ 16 | Flatten R object from deserialized nested JSON object 17 | } 18 | -------------------------------------------------------------------------------- /man/openalex_key.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_key} 4 | \alias{openalex_key} 5 | \title{Use an API key for OpenAlex Premium Subscription} 6 | \usage{ 7 | openalex_key(key) 8 | } 9 | \arguments{ 10 | \item{key}{a premium subscription key} 11 | } 12 | \value{ 13 | a logical depending on whether key was set or unset 14 | } 15 | \description{ 16 | This provides access to the latest data, fresher than what snapshots provide. 17 | It also enables faster requests and filtering on from_created_date and from_updated_date fields. 18 | } 19 | \details{ 20 | Additional details... 21 | 22 | \url{https://github.com/ourresearch/openalex-api-tutorials/blob/main/notebooks/getting-started/premium.ipynb} 23 | \url{https://docs.openalex.org/api-entities/works/filter-works#from_created_date} 24 | \url{https://docs.openalex.org/api-entities/works/filter-works#from_updated_date} 25 | } 26 | \examples{ 27 | \dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} 28 | openalex_key("my_secret_api_key") 29 | openalex_key("") 30 | \dontshow{\}) # examplesIf} 31 | } 32 | -------------------------------------------------------------------------------- /man/openalex_kth_rawaff_query.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_restclient.R 3 | \name{openalex_kth_rawaff_query} 4 | \alias{openalex_kth_rawaff_query} 5 | \title{Example query when searching raw affiliation strings} 6 | \usage{ 7 | openalex_kth_rawaff_query() 8 | } 9 | \value{ 10 | string with query 11 | } 12 | \description{ 13 | This variant is specifically tailored for KTH, Royal Institute of Technology 14 | and includes some affiliation string variations which might be related. 
15 | }
16 | -------------------------------------------------------------------------------- /man/openalex_polite.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_polite}
4 | \alias{openalex_polite}
5 | \title{Enter the OpenAlex API polite pool for faster requests by providing an email}
6 | \usage{
7 | openalex_polite(email)
8 | }
9 | \arguments{
10 | \item{email}{an email address, of the form "you@example.com", or "" to unset the email}
11 | }
12 | \value{
13 | a logical depending on whether email was set or unset
14 | }
15 | \description{
16 | Enter the OpenAlex API polite pool for faster requests by providing an email
17 | }
18 | \examples{
19 | \dontrun{
20 | if(interactive()){
21 |  # to set
22 |  openalex_polite("you@example.com")
23 |  # to unset
24 |  openalex_polite("")
25 |  }
26 | }
27 | }
28 | -------------------------------------------------------------------------------- /man/openalex_topics.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_export.R
3 | \name{openalex_topics}
4 | \alias{openalex_topics}
5 | \title{Topics}
6 | \usage{
7 | openalex_topics()
8 | }
9 | \description{
10 | Table of current topics, subfields, fields and domains used at OpenAlex
11 | }
12 | -------------------------------------------------------------------------------- /man/openalex_work.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_work}
4 | \alias{openalex_work}
5 | \title{Retrieve work from OpenAlex REST API}
6 | \usage{
7 | openalex_work(identifier, format = "table", use_random = FALSE)
8 | }
9 | \arguments{
10 | \item{identifier}{string with identifier}
11 | 
12 | \item{format}{one of "table" or "object"}
13 | 
14 | \item{use_random}{logical to indicate whether to use a random identifier, Default: FALSE}
15 | }
16 | \value{
17 | as per format, either a tibble or an R object
18 | }
19 | \description{
20 | This function retrieves works given an identifier
21 | }
22 | \examples{
23 | \dontrun{
24 | openalex_work(use_random = TRUE)
25 | }
26 | }
27 | -------------------------------------------------------------------------------- /man/openalex_works_created_since.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_works_created_since}
4 | \alias{openalex_works_created_since}
5 | \title{Recently created works based on query for matching raw affiliations}
6 | \usage{
7 | openalex_works_created_since(
8 |   raw_search_criteria = openalex_kth_rawaff_query(),
9 |   since_days = 0
10 | )
11 | }
12 | \arguments{
13 | \item{raw_search_criteria}{raw affiliation string search criteria,
14 | by default openalex_kth_rawaff_query()}
15 | 
16 | \item{since_days}{integer indicating days back from today}
17 | }
18 | \value{
19 | list of tables with results
20 | }
21 | \description{
22 | This function requires a premium subscription API key to be set.
23 | }
24 | -------------------------------------------------------------------------------- /man/openalex_works_cursorcrawl.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cursor_crawl.R
3 | \name{openalex_works_cursorcrawl}
4 | \alias{openalex_works_cursorcrawl}
5 | \title{Crawl multipage responses from queries against the API}
6 | \usage{
7 | openalex_works_cursorcrawl(works_filter, n_max_pages = 5)
8 | }
9 | \arguments{
10 | \item{works_filter}{the works filter}
11 | 
12 | \item{n_max_pages}{the maximum number of pages to fetch (50 records per page)}
13 | }
14 | \value{
15 | paths to downloaded files
16 | }
17 | \description{
18 | Splits the request into chunks and uses cursor-based pagination to fetch works
19 | }
20 | -------------------------------------------------------------------------------- /man/openalex_works_export.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_export.R
3 | \name{openalex_works_export}
4 | \alias{openalex_works_export}
5 | \title{Use OpenAlex API for exporting data in tabular and wos formats}
6 | \usage{
7 | openalex_works_export(q, fmt = c("csv", "wos-plaintext"), raw_string = FALSE)
8 | }
9 | \arguments{
10 | \item{q}{the query, for example "authorships.institutions.lineage:i86987016,authorships.institutions.lineage:!i4210161097,type:types/article,primary_location.source.type:source-types/journal|source-types/conference,publication_year:2023"}
11 | 
12 | \item{fmt}{the export format, one of "csv", "wos-plaintext" or "wos-plaintext-diva"}
13 | 
14 | \item{raw_string}{boolean to indicate whether a raw string should be returned}
15 | }
16 | \value{
17 | a raw string (character vector) with the results from the export, or a data frame
18 | }
19 | \description{
20 | Use OpenAlex API for exporting data in tabular and wos formats
21 | }
22 | -------------------------------------------------------------------------------- /man/openalex_works_published_since.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_works_published_since}
4 | \alias{openalex_works_published_since}
5 | \title{Recently published works based on query for matching raw affiliations}
6 | \usage{
7 | openalex_works_published_since(
8 |   raw_search_criteria = openalex_kth_rawaff_query(),
9 |   since_days = 7
10 | )
11 | }
12 | \arguments{
13 | \item{raw_search_criteria}{raw affiliation string search criteria,
14 | by default openalex_kth_rawaff_query()}
15 | 
16 | \item{since_days}{integer indicating days back from today}
17 | }
18 | \value{
19 | list of tables with results
20 | }
21 | \description{
22 | Recently published works based on query for matching raw affiliations
23 | }
24 | -------------------------------------------------------------------------------- /man/openalex_works_updated_since.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_restclient.R
3 | \name{openalex_works_updated_since}
4 | \alias{openalex_works_updated_since}
5 | \title{Recently updated works based on query for matching raw affiliations}
6 | \usage{
7 | openalex_works_updated_since(
8 |   raw_search_criteria = 
openalex_kth_rawaff_query(),
9 |   since_minutes
10 | )
11 | }
12 | \arguments{
13 | \item{raw_search_criteria}{raw affiliation string search criteria,
14 | by default openalex_kth_rawaff_query()}
15 | 
16 | \item{since_minutes}{integer indicating minutes back from now}
17 | }
18 | \value{
19 | list of tables with results
20 | }
21 | \description{
22 | This function requires a premium subscription API key to be set.
23 | }
24 | -------------------------------------------------------------------------------- /man/openalex_write_duckdb.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/open_alex_export.R
3 | \name{openalex_write_duckdb}
4 | \alias{openalex_write_duckdb}
5 | \title{Export the results from a crawl as a duckdb database file}
6 | \usage{
7 | openalex_write_duckdb(crawl, destdir = NULL)
8 | }
9 | \arguments{
10 | \item{crawl}{the results from running the openalex_works_to_tbls() function}
11 | 
12 | \item{destdir}{the location to save the database file}
13 | }
14 | \value{
15 | file path to the database file
16 | }
17 | \description{
18 | Export the results from a crawl as a duckdb database file
19 | }
20 | -------------------------------------------------------------------------------- /man/pipe.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils-pipe.R
3 | \name{\%>\%}
4 | \alias{\%>\%}
5 | \title{Pipe operator}
6 | \usage{
7 | lhs \%>\% rhs
8 | }
9 | \arguments{
10 | \item{lhs}{A value or the magrittr placeholder.}
11 | 
12 | \item{rhs}{A function call using the magrittr semantics.}
13 | }
14 | \value{
15 | The result of calling \code{rhs(lhs)}.
16 | }
17 | \description{
18 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 
19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/topics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/openalex-package.R 3 | \docType{data} 4 | \name{topics} 5 | \alias{topics} 6 | \title{Topics} 7 | \format{ 8 | A data frame with 4516 rows and 9 variables: 9 | \describe{ 10 | \item{\code{id_topic}}{character the id for the topic} 11 | \item{\code{topic}}{character description of topic} 12 | \item{\code{description}}{character long form description of this topic cluster} 13 | \item{\code{id_subfield}}{character the id for the subfield of this topic} 14 | \item{\code{subfield}}{character description of the subfield} 15 | \item{\code{id_field}}{character the id of the field} 16 | \item{\code{field}}{character description of the field} 17 | \item{\code{id_domain}}{character the id of the domain} 18 | \item{\code{domain}}{character description of the domain} 19 | } 20 | } 21 | \usage{ 22 | topics 23 | } 24 | \description{ 25 | Topics used by OpenAlex 26 | } 27 | \details{ 28 | DETAILS 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/wos_plaintext_for_diva.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/open_alex_export.R 3 | \name{wos_plaintext_for_diva} 4 | \alias{wos_plaintext_for_diva} 5 | \title{Function which converts a wos_plaintext-string into a format 6 | which can be uploaded to DiVA, by adding ER tags 7 | (including a blank line) after each record} 8 | \usage{ 9 | wos_plaintext_for_diva(x) 10 | } 11 | \arguments{ 12 | \item{x}{character string with "wos-plaintext" format as returned from OpenAlex export API endpoint} 13 | } 14 | \description{ 15 | Function which converts a wos_plaintext-string into a format 16 | which can be uploaded to DiVA, by adding ER tags 17 | (including a blank line) after each record 18 | } 19 | -------------------------------------------------------------------------------- /openalex.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(openalex) 3 | 4 | test_check("openalex") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-crawl.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("crawl works (not cursor based) and results can be persisted in db", { 3 | 4 | skip_on_ci() 5 | 6 | my_filter <- paste0(collapse = ",", c( 7 | "authorships.institutions.lineage:i86987016", ## KTH 8 | "authorships.institutions.lineage:!i4210161097", ## 
Bolin Center
9 |     "authorships.institutions.lineage:!i119971240", ## NORDITA
10 |     "type:types/article",
11 |     "primary_location.source.type:source-types/journal|source-types/conference",
12 |     "publication_year:2025"
13 |   ))
14 | 
15 |   my_query <- openalex:::openalex_query(filter = my_filter)
16 |   works <- openalex_crawl("work", query = my_query, fmt = "object")
17 |   #readr::write_rds(works, "~/openalex-2023.rds")
18 | 
19 | 
20 |   # TODO: some error here!
21 |   library(purrr)
22 |   library(dplyr)
23 | 
24 |   lol <-
25 |     list(list(results = reduce(works |> map("results"), c)))
26 | 
27 |   my_works <-
28 |     lol |> openalex_works_to_tbls()
29 | 
30 |   is_valid <-
31 |     attr(works[[1]], "meta")$count == nrow(my_works$work)
32 | 
33 |   harvest <-
34 |     my_works |> map(\(x) x |> mutate(across(any_of(contains("id")),
35 |       \(y) gsub("https://openalex.org/", "", y, fixed = TRUE)))
36 |     )
37 | 
38 |   dump_path <- file.path(tempdir(), "openalex-2025.db")
39 |   harvest |> openalex_write_duckdb(dump_path)
40 |   message("Persisted dump at ", dump_path)
41 | 
42 |   expect_true(is_valid)
43 | 
44 | })
45 | 
46 | test_that("Similar topics can be retrieved given a work", {
47 | 
48 |   skip_on_ci()
49 | 
50 |   topics_filter <-
51 |     openalex_filter_similar_topics("W2168078104")
52 | 
53 |   my_filter <- list(filter = paste0(
54 |     # "publication_year:2024,",
55 |     "institution.id:I2799509149,",
56 |     topics_filter
57 |   ))
58 | 
59 |   works <- openalex_crawl("works", query = my_filter)
60 | 
61 |   lol <-
62 |     list(list(results = reduce(works |> map("results"), c)))
63 | 
64 |   my_works <-
65 |     lol |> openalex_works_to_tbls()
66 | 
67 |   is_valid <- my_works$work |> nrow() > 5
68 |   expect_true(is_valid)
69 | 
70 | })
71 | 
72 | test_that("Crawling several works related to a specific topic works", {
73 | 
74 |   skip_on_ci()
75 | 
76 |   q <-
77 |     list(
78 |       filter = paste0(collapse = ",", c(
79 |         "publication_year:2025",
80 |         "primary_topic.id:T10783"
81 |       ))
82 |     )
83 | 
84 |   works <- openalex_crawl("works", query = q, verbose = TRUE)
85 | 
86 |   message("JSON object size is ", format(object.size(works), "MB"))
87 | 
88 |   lol <-
89 |     list(list(results = reduce(works |> map("results"), c)))
90 | 
91 |   my_works <-
92 |     lol |> openalex_works_to_tbls()
93 | 
94 |   message("Tables object size is ", format(object.size(my_works), "MB"))
95 | 
96 |   message("Number of records: ", nrow(my_works$work))
97 | 
98 |   is_valid <-
99 |     attr(works[[1]], "meta")$count == nrow(my_works$work)
100 | 
101 |   #is_valid <- object.size(works) > 7000000
102 | 
103 |   expect_true(is_valid)
104 | 
105 | })
106 | 
107 | -------------------------------------------------------------------------------- /tests/testthat/test-cursorcrawl.R: --------------------------------------------------------------------------------
1 | test_that("cursor based paging for works works", {
2 | 
3 |   skip_on_ci()
4 | 
5 |   works_filter <- "publication_year:2015-2023,primary_topic.id:t10783"
6 | 
7 |   cc <-
8 |     works_filter |>
9 |     openalex_works_cursorcrawl(n_max_pages = 10)
10 | 
11 |   mydir <- unique(dirname(cc))
12 | 
13 | 
14 |   fns_max_size <- max(file.size(cc))
15 | 
16 |   read_jsonl <- function(fn) {
17 |     Sys.setenv("VROOM_CONNECTION_SIZE" = fns_max_size)
18 |     fn |> file() |> readr::read_lines() |>
19 |       RcppSimdJson::fparse(max_simplify_lvl = "list")
20 |   }
21 | 
22 |   ccs <- cc |> purrr::map(\(x) list(results = read_jsonl(x)))
23 | 
24 |   #object <- ccs[[1]]
25 |   #w |> openalex_works_to_tbls()
26 | 
27 |   res <-
28 |     cc |>
29 |     purrr::map(\(x) list(results = read_jsonl(x))) |>
30 |     openalex_works_to_tbls()
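  # res is a named list of tibbles (work, abstracts, authorships_authors, ...)
  # as assembled by parse_work2(); res$work holds one row per fetched work,
  # so 10 cursor pages of 50 records each should yield the 500 rows
  # asserted below.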
33 | 
34 |   is_valid <-
35 |     all(cc %in% dir(mydir, full.names = TRUE)) &
36 |     res$work |> nrow() == 500
37 | 
38 |   # TODO: Hmmm, what is a$abstract_inverted_index_v3?
39 |   # a <- jsonlite::stream_in(file(cc[1])) |> as_tibble()
40 | 
41 |   expect_true(is_valid)
42 | 
43 |   # cmd <- paste0("flatterer --force --nocsv --parquet -m works --id-prefix work -j ",
44 |   #   paste(collapse = " ", cc), " ", mydir, paste0(mydir, "/cursorcrawl"))
45 | 
46 |   # system(cmd)
47 | 
48 | })
--------------------------------------------------------------------------------
/tests/testthat/test-dois.R:
--------------------------------------------------------------------------------
1 | test_that("doi lookup works for 20 dois", {
2 | 
3 |   skip_on_ci()
4 | 
5 |   dois <- paste0("10.1016/j.aos.2023.101522, 10.1051/m2an/2024042, ",
6 |     "10.1016/j.heliyon.2024.e25125, 10.1145/3664476.3664508, ",
7 |     "10.23919/ECC64448.2024.10590962, 10.1109/TCNS.2023.3285863, ",
8 |     "10.23919/ECC64448.2024.10591128, 10.1007/s10570-023-05674-y, ",
9 |     "10.1109/APWC61918.2024.10701979, 10.1137/23M1587804, ",
10 |     "10.1109/FDL63219.2024.10673844, 10.1007/978-3-031-54776-8_12, ",
11 |     "10.1137/22M148968X, 10.1016/j.trc.2023.104454, 10.1108/ECON-10-2023-0163, ",
12 |     "10.1016/j.apenergy.2024.122690, 10.1038/s41467-023-44315-7, ",
13 |     "10.1109/TCI.2024.3463485, 10.1016/j.jobe.2024.110536, ",
14 |     "10.1007/s13721-024-00446-5") |>
15 |     strsplit(split = ", ") |> unlist()
16 | 
17 |   doi_filter <- function(dois) dois |> openalex_or()
18 | 
19 |   doi_filters <-
20 |     dois |>
21 |     split_chunks_of_n(50) |>
22 |     purrr::map(doi_filter)
23 | 
24 |   #doi_filters |> purrr::map(\(x) doi_lookup_identifiers(doi_filter = x))
25 | 
26 |   #ids <- openalex_doi_lookup(dois, "identifiers")
27 |   more <- openalex_doi_lookup(dois, "all")
28 | 
29 |   is_valid <-
30 |     #nrow(ids) == length(dois) &
31 |     nrow(more[[1]]$ids) == length(dois)
32 | 
33 |   expect_true(is_valid)
34 | 
35 | })
36 | 
--------------------------------------------------------------------------------
/tests/testthat/test-export.R:
--------------------------------------------------------------------------------
1 | test_that("export works", {
2 | 
3 |   skip()
4 | 
5 |   my_filter <- paste0(collapse = ",", c(
6 |     "authorships.institutions.lineage:i86987016",   ## KTH
7 |     "authorships.institutions.lineage:i4210161097", ## Bolin Center (some of these might be KTH pubs!)
8 |     "authorships.institutions.lineage:i119971240",  ## NORDITA (some of these, too!)
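9 |     # note: a lineage filter matches the given institution and any unit
10 |     # below it in OpenAlex's institution hierarchy, which is why works
11 |     # hosted by these centres can also be KTH works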
12 |     "authorships.institutions.lineage:i4210147696", ## THS Tekniska Högskolans Studentkår
13 |     "type:types/article",
14 |     "primary_location.source.type:source-types/journal|source-types/conference",
15 |     "publication_year:2025"
16 |   ))
17 | 
18 |   # TODO:
19 |   # for weekly import - include centres
20 |   # for retroactive import - exclude centres
21 | 
22 |   my_csv <- openalex_works_export(q = my_filter, fmt = "csv")
23 |   my_csv_string <- openalex_works_export(q = my_filter, fmt = "csv", raw_string = TRUE)
24 |   my_wos <- openalex_works_export(q = my_filter, fmt = "wos-plaintext")
25 |   my_wos_string <- openalex_works_export(q = my_filter, fmt = "wos-plaintext", raw_string = TRUE)
26 | 
27 |   is_valid <- nrow(my_csv) > 2000 && nchar(my_wos) > 0
28 |   expect_true(is_valid)
29 | })
30 | 
31 | test_that("export for diva in wos-plaintext format works", {
32 | 
33 |   skip_on_ci()
34 | 
35 |   my_filter <- paste0(collapse = ",", c(
36 |     sprintf("publication_year:%s", 2025),
37 |     sprintf("authorships.author.id:%s", "a5045975901") #,
38 |     # sprintf("raw_affiliation_strings.search:%s", openalex_kth_rawaff_query()),
39 |     # "authorships.institutions.lineage:!i86987016",   ## KTH
40 |     # "authorships.institutions.lineage:!i4210161097", ## Bolin Center (some of these might be KTH pubs!)
41 |     # "authorships.institutions.lineage:!i119971240",  ## NORDITA (some of these, too!)
42 |     # "authorships.institutions.lineage:!i4210147696"  ## THS Tekniska Högskolans Studentkår
43 |   ))
44 | 
45 |   gm <- openalex_works_export(q = my_filter, fmt = "wos-plaintext")
46 | 
47 |   #cat(gm)
48 | 
49 |   is_valid <-
50 |     (regmatches(gm, gregexpr("KTH", gm)) |> unlist()) |> length() > 10
51 | 
52 |   expect_true(is_valid)
53 | })
54 | 
55 | test_that("export of rawaff query for 2025 in wos-plaintext format for diva works", {
56 | 
57 |   skip_on_ci()
58 | 
59 |   my_filter <- paste0(collapse = ",", c(
60 |     sprintf("publication_year:%s", 2025),
61 |     sprintf("raw_affiliation_strings.search:%s", openalex_kth_rawaff_query()),
62 |     "authorships.institutions.lineage:!i86987016",   ## KTH
63 |     "authorships.institutions.lineage:!i4210161097", ## Bolin Center (some of these might be KTH pubs!)
64 |     "authorships.institutions.lineage:!i119971240",  ## NORDITA (some of these, too!)
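65 |     # a leading ! negates a filter value in the OpenAlex API, so these
66 |     # lineage filters exclude works already matched via the institution
67 |     # ids, in line with the retroactive-import TODO in the first test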
68 |     "authorships.institutions.lineage:!i4210147696"  ## THS Tekniska Högskolans Studentkår
69 |   ))
70 | 
71 |   extras <- openalex_works_export(q = my_filter, fmt = "wos-plaintext")
72 |   is_valid <- (regmatches(extras, gregexpr("ER", extras)) |> unlist()) |> length() > 0
73 |   expect_true(is_valid)
74 | })
75 | 
76 | test_that("export of rawaff query for 2025 in csv format works", {
77 | 
78 |   skip_on_ci()
79 | 
80 |   my_filter <- paste0(collapse = ",", c(
81 |     sprintf("publication_year:%s", 2025),
82 |     sprintf("raw_affiliation_strings.search:%s", openalex_kth_rawaff_query())
83 |   ))
84 | 
85 |   csv <- openalex_works_export(my_filter, "csv")
86 | 
87 |   #arrow::write_parquet(csv, "~/oa-2025-csv-export.parquet")
88 | 
89 |   is_valid <- nrow(csv) > 1
90 | 
91 |   expect_true(is_valid)
92 | 
93 | })
94 | 
95 | # TODO: Which conference papers in OpenAlex are linked to KTH (a DOI is not required)?
96 | # TODO: What do (autocomplete) searches on titles and conference names return?
97 | # Hypothesis: Scopus is strong on conferences, perhaps thanks to screen-scraping superpowers; how does OpenAlex compare?
98 | # TODO: How much "more" of this does one see with a Premium Key?
99 | 
100 | # tf <- openalex_counts(filter = "authorships.institutions.lineage:i86987016,publication_year:2025")
101 | 
102 | # tree <- tf[grepl("Topic", names(tf))]
103 | # topics <- openalex_topics()
104 | 
105 | # tree$`Primary Topic Domain Id` |>
106 | #   left_join(topics |> distinct(id_domain, domain), by = c(name = "domain")) |>
107 | #   rename(domain = "name")
108 | 
109 | # tree$`Primary Topic Field Id` |>
110 | #   left_join(topics |> distinct(id_field, field, id_domain, domain), by = c(name = "field")) |>
111 | #   rename(field = "name")
112 | 
113 | # tree$`Primary Topic Subfield Id` |>
114 | #   left_join(topics |> distinct(id_subfield, subfield, id_field, field, id_domain, domain), by = c(name = "subfield")) |>
115 | #   rename(subfield = "name")
116 | 
117 | # tree$`Primary Topic Id` |>
118 | #   left_join(topics, by = c(name = "topic")) |>
119 | #   rename(topic = "name")
120 | 
121 | # tt <-
122 | #   list(tree, names(tree)) |> purrr::pmap(\(x, y) x |> mutate(var = y) |> select(var, everything())) |>
123 | #   map_dfr(bind_rows) |>
124 | #   rename(display_name = name) |>
125 | #   left_join(topics, by = "display_name")
126 | 
127 | # proceed to make a treemap
--------------------------------------------------------------------------------
/tests/testthat/test-freeze.R:
--------------------------------------------------------------------------------
1 | test_that("converting some records to tables does not cause freeze", {
2 | 
3 |   skip_on_ci()
4 | 
5 |   works_filter <- paste0(
6 |     "publication_year:2015-2024,",
7 |     "primary_topic.subfield.id:subfields/3605,",
8 |     "authorships.countries:countries/se"
9 |   )
10 | 
11 |   cc <-
12 |     works_filter |>
13 |     openalex_works_cursorcrawl(n_max_pages = 28)
14 | 
15 |   mydir <- unique(dirname(cc))
16 | 
17 |   read_jsonl <- function(fn) {
18 |     fn |> file() |> readr::read_lines() |>
19 |       RcppSimdJson::fparse(max_simplify_lvl = "list")
20 |   }
21 | 
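22 |   # Each dump file holds one JSON work object per line, so read_lines()
23 |   # yields a character vector and fparse() parses every line into a plain
24 |   # R list, the shape openalex_works_to_tbls() expects under "results".
25 | 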
26 |   ccs <-
27 |     cc |> purrr::map(\(x) list(results = read_jsonl(x)))
28 | 
29 |   res <- ccs |> openalex_works_to_tbls()
30 | 
31 |   #object <- list(results = ccs[[4]]$results)
32 |   #object |> parse_work2()
33 | 
34 |   is_valid <-
35 |     all(cc %in% dir(mydir, full.names = TRUE)) &
36 |     res$work |> nrow() == ccs |> purrr::map_int(\(x) length(x$results)) |> sum()
37 | 
38 |   expect_true(is_valid)
39 | 
40 | })
41 | 
42 | test_that("converting some records to tables does not cause vroom error", {
43 | 
44 |   skip_on_ci()
45 | 
46 |   works_filter <- paste0(
47 |     "publication_year:2015-2024,",
48 |     "topics.subfield.id:subfields/3605,",
49 |     "authorships.countries:countries/se"
50 |   )
51 | 
52 |   cc <-
53 |     works_filter |>
54 |     openalex_works_cursorcrawl(n_max_pages = 27)
55 | 
56 |   mydir <- unique(dirname(cc))
57 |   fns_max_size <- max(file.size(cc))
58 | 
59 |   read_jsonl <- function(fn) {
60 |     Sys.setenv("VROOM_CONNECTION_SIZE" = fns_max_size)
61 |     fn |> file() |> readr::read_lines() |>
62 |       RcppSimdJson::fparse(max_simplify_lvl = "list")
63 |   }
64 | 
65 |   ccs <-
66 |     cc |> purrr::map(\(x) list(results = read_jsonl(x)))
67 | 
68 |   res <- ccs |> openalex_works_to_tbls()
69 | 
70 |   #ccs |> purrr::map(\(x) list(results = x) |> openalex_works_to_tbls())
71 | 
72 |   #i <- ceiling(0.77 * 26)
73 |   #x <- ccs[i]
74 |   #cc[i]
75 | 
76 |   #object <- list(results = ccs[[4]]$results)
77 |   #object |> parse_work2()
78 | 
79 |   is_valid <-
80 |     all(cc %in% dir(mydir, full.names = TRUE)) &
81 |     res$work |> nrow() == ccs |> purrr::map_int(\(x) length(x$results)) |> sum()
82 | 
83 |   expect_true(is_valid)
84 | })
85 | 
--------------------------------------------------------------------------------
/tests/testthat/test-open_alex_restclient.R:
--------------------------------------------------------------------------------
1 | # test_that("attribution works", {
2 | #   attribution <- capture.output(cat(openalex_attribution()))
3 | #   is_ok <- length(attribution) == 2 && nchar(attribution) > 0
4 | #   expect_true(is_ok)
5 | # })
6 | 
7 | test_that("fetching work works", {
8 | 
9 |   identifier <- "W1851956350"
10 | 
11 |   #is_ok <- identical(openalex_work(identifier), openalex_work(identifier))
12 |   expected_id <- paste0("https://openalex.org/", identifier)
13 | 
14 |   table_has_ok_id <-
15 |     subset(openalex_work(identifier), name == "id")$value ==
16 |     expected_id
17 | 
18 |   object_has_ok_id <-
19 |     openalex_work(identifier, format = "object")$ids$openalex ==
20 |     expected_id
21 | 
22 |   expect_true(table_has_ok_id && object_has_ok_id)
23 | 
24 | })
25 | 
26 | test_that("error 404 is returned when a work is not found", {
27 |   identifier <- "10.1038/nrn3241"
28 |   expect_error(openalex_work(identifier), "404")
29 | })
30 | 
31 | test_that("fetching random work works", {
32 |   random <- openalex_work(use_random = TRUE)
33 |   is_ok <- nrow(random) > 10
34 |   expect_true(is_ok)
35 | })
36 | 
37 | # openalex_entity_enum()
38 | #
39 | # openalex_list(entity = "works", query = openalex_query(page = 2))$meta
40 | # openalex_list(entity = "concepts", query = openalex_query(page = 2))$meta
41 | # openalex_list(entity = "institutions", query = openalex_query())$meta
42 | # openalex_list(entity = "venues", query = openalex_query())$meta
43 | # openalex_list(entity = "authors", query = openalex_query())$meta
44 | #
45 | # # works whose type is book
46 | # openalex_list("works", query = openalex_query(
47 | #   filter = "type:book")
48 | # )$meta
49 | #
50 | # # venues that host more than 1000 works:
51 | # openalex_list("venues", query = openalex_query(
52 | #   filter = "works_count:>1000")
53 | # )$meta
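54 | #
55 | # # a hypothetical extra example following the same pattern (country_code
56 | # # is an institutions filter): institutions based in Sweden
57 | # openalex_list("institutions", query = openalex_query(
58 | #   filter = "country_code:se")
59 | # )$meta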
60 | #
61 | # # US-based authors who have been cited at least once:
62 | # openalex_list("authors", query = openalex_query(
63 | #   filter = "last_known_institution.country_code:US,cited_by_count:>0")
64 | # )$meta
65 | #
66 | # # works matching a title search
67 | # openalex_list("works", query = openalex_query(
68 | #   filter = "title.search:'intensive treatment of diabetes'")
69 | # )$meta
70 | #
71 | # res <-
72 | #   openalex_crawl("works", query = openalex_query(
73 | #     filter = "title.search:'intensive treatment of diabetes'")
74 | #   )
75 | #
76 | # library(dplyr)
77 | # res %>% openalex_flatten_long() %>% count(name) %>% arrange(desc(n))
78 | 
79 | test_that("providing an email for the polite pool gives a faster response", {
80 | 
81 |   skip()
82 | 
83 |   # so the initial setting can be restored
84 |   initial <- Sys.getenv("OPENALEX_USERAGENT")
85 |   on.exit(Sys.setenv("OPENALEX_USERAGENT" = initial))
86 | 
87 |   # not polite
88 |   openalex_polite("")
89 |   tn <- system.time(
90 |     c1 <- openalex_crawl("works", verbose = TRUE,
91 |       query = openalex:::openalex_query(filter =
92 |         "institutions.id:I86987016,publication_year:2022"))
93 |   )[3]
94 | 
95 |   # polite
96 |   openalex_polite("markussk@kth.se")
97 |   tp <- system.time(
98 |     c2 <- openalex_crawl("works", verbose = TRUE,
99 |       query = openalex:::openalex_query(filter =
100 |         "institutions.id:I86987016,publication_year:2022"))
101 |   )[3]
102 | 
103 |   message("Polite time: ", tp)
104 |   message("Not polite time: ", tn)
105 | 
106 |   is_faster <- tp < tn
107 |   expect_true(is_faster)
108 | 
109 | })
110 | 
--------------------------------------------------------------------------------