├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── LICENSE_selected_R_tutorials ├── README.md ├── _config.yml ├── _toc.yml ├── images ├── UALIB_favicon.png └── UALIB_logo.png ├── requirements.txt └── src ├── about ├── contributing.rst ├── introduction.rst ├── license-reuse.rst └── tech-details.rst ├── overview ├── arxiv.rst ├── bea.rst ├── bls.rst ├── casc.rst ├── chronam.rst ├── college-scorecard.rst ├── congress.rst ├── crossref.rst ├── fdc.rst ├── geonames.rst ├── nasa-images.rst ├── nps.rst ├── nws.rst ├── openalex.rst ├── osf.rst ├── osm.rst ├── pubchem-periodic-table.rst ├── pubchem.rst ├── pubmed.rst ├── ror.rst ├── scopus.rst ├── sdirect.rst ├── sec-edgar.rst ├── speedrun.rst ├── springer.rst ├── stack-exchange.rst ├── us-census-geocoding.rst ├── us-census.rst ├── us-treasury.rst ├── usa-spending.rst ├── usgs-national-map.rst ├── wiley-tdm.rst ├── world-bank.rst ├── wos.rst └── z3950.rst ├── python ├── bea.ipynb ├── bls.ipynb ├── casc.ipynb ├── chronam.ipynb ├── college-scorecard.ipynb ├── congress.ipynb ├── crossref.ipynb ├── fdc.ipynb ├── geonames.ipynb ├── imgs │ ├── APOD_Image.png │ ├── Earth_Image1_recent.png │ └── Earth_Image_Stitched.png ├── nasa-images.ipynb ├── nps.ipynb ├── nws.ipynb ├── openalex.ipynb ├── osf.ipynb ├── osm.ipynb ├── pubchem-periodic-table.ipynb ├── pubchem.ipynb ├── pubmed.ipynb ├── ror.ipynb ├── scopus.ipynb ├── sdirect.ipynb ├── sec-edgar.ipynb ├── speedrun.ipynb ├── springer.ipynb ├── stack-exchange.ipynb ├── us-census-geocoding.ipynb ├── us-census.ipynb ├── us-treasury.ipynb ├── usa-spending.ipynb ├── usgs-national-map.ipynb ├── wiley-tdm.ipynb ├── world-bank.ipynb └── wos.ipynb ├── r-gpl3 ├── PubMedAPItut_files │ └── figure-html │ │ └── visual-1.png ├── R_Pubchem_Markdown_Adam_Edit_files │ └── figure-html │ │ ├── unnamed-chunk-10-1.png │ │ ├── unnamed-chunk-10-2.png │ │ ├── unnamed-chunk-10-3.png │ │ ├── unnamed-chunk-10-4.png │ │ ├── unnamed-chunk-10-5.png │ │ ├── unnamed-chunk-18-1.png │ │ ├── unnamed-chunk-18-2.png │ │ ├── unnamed-chunk-18-3.png │ │ ├── unnamed-chunk-18-4.png │ │ └── unnamed-chunk-18-5.png ├── US_Census_Data_in_R_files │ └── figure-html │ │ └── plot-popchg-1.png ├── pubchem.md ├── pubmed.md └── us-census.md ├── r ├── CASCommonChemR_files │ └── figure-html │ │ ├── Display-1.png │ │ ├── disp-list-1.png │ │ ├── disp-list-2.png │ │ ├── disp-list-3.png │ │ ├── disp-list-4.png │ │ ├── disp-list-5.png │ │ ├── hist-1.png │ │ └── kernel-1.png ├── Chronam_in_R_Adam_Vers_files │ └── figure-html │ │ ├── unnamed-chunk-12-1.png │ │ └── unnamed-chunk-21-1.png ├── College_Scorecard_R_files │ └── figure-html │ │ └── program-percentage-distribution.png ├── R_WorldBank_Markdown_files │ └── figure-html │ │ ├── unnamed-chunk-16-1.png │ │ ├── unnamed-chunk-17-1.png │ │ └── unnamed-chunk-18-1.png ├── SEC_EDGAR_API_R_files │ └── figure-html │ │ ├── unnamed-chunk-10-1.png │ │ └── unnamed-chunk-5-1.png ├── USA_Spending_R_files │ └── figure-html │ │ ├── unnamed-chunk-4-1.png │ │ ├── unnamed-chunk-6-1.png │ │ └── unnamed-chunk-9-1.png ├── arXiv_API_in_R_files │ └── figure-html │ │ ├── unnamed-chunk-3-1.png │ │ └── unnamed-chunk-6-1.png ├── arxiv.md ├── casc.md ├── chronam.md ├── college-scorecard.md ├── crossref.md ├── figure │ ├── Display-1.png │ ├── disp list-1.png │ ├── disp list-2.png │ ├── disp list-3.png │ ├── disp list-4.png │ ├── disp list-5.png │ ├── hist-1.png │ ├── kernel-1.png │ ├── plot popchg-1.png │ ├── unnamed-chunk-12-1.png │ ├── unnamed-chunk-16-1.png │ ├── unnamed-chunk-17-1.png │ ├── unnamed-chunk-18-1.png │ └── 
visual-1.png ├── output.json ├── sdirect.md ├── sec-edgar.md ├── usa-spending.md ├── wiley-tdm.md └── world-bank.md └── shell └── z3950.rst /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Jupyter Book 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: false 18 | 19 | jobs: 20 | deploy: 21 | environment: 22 | name: github-pages 23 | url: ${{ steps.deployment.outputs.page_url }} 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: Checkout Repository 27 | uses: actions/checkout@v2 28 | 29 | - name: Setup Python 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: '3.x' 33 | 34 | - name: Install Dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 38 | pip install jupyter-book 39 | 40 | - name: Build Jupyter Book HTML 41 | run: | 42 | jupyter-book build . 43 | 44 | - name: Setup Pages 45 | uses: actions/configure-pages@v3 46 | 47 | - name: Upload artifact 48 | uses: actions/upload-pages-artifact@v3 49 | with: 50 | path: './_build/html' 51 | 52 | - name: Deploy to GitHub Pages 53 | id: deployment 54 | uses: actions/deploy-pages@v4 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | .ipynb_checkpoints/ 3 | .env 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 The University of Alabama Libraries 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # University of Alabama Libraries Scholarly API Cookbook 2 | 3 | > [!IMPORTANT] 4 | > Please check the individual scholarly API documentation for current information on API usage and policies. 
5 | > 6 | > March 2025 - We have decided to no longer maintain the Matlab, Mathematica, Bash (except the Z39.50 Bash recipe) and C recipes and have removed them from the Scholarly API Cookbook. These archived recipes are in the [UA Libraries Scholarly API Cookbook Archive](https://github.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive). 7 | > 8 | > November 2024 - Some R code tutorials that were originally MIT Licensed, are now licensed under the GPL-3 License to comply with the licensing terms of dependent R libraries. 9 | 10 | The University of Alabama Libraries Scholarly API Cookbook is an open online book containing short scholarly API code examples (i.e., “recipes”) that demonstrate how to work with various scholarly web service APIs. It is part of the University of Alabama Libraries efforts to support Research Data Services. Read the book [here](https://ua-libraries-research-data-services.github.io/UALIB_ScholarlyAPI_Cookbook). 11 | 12 | ## License and Reuse 13 | 14 | Most of the code in this repository is licensed under the [MIT License](https://github.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/blob/main/LICENSE). 15 | 16 | The Python scripts in this repository are licensed under the MIT License. However, these scripts may rely on external libraries such as matplotlib, pandas, and others. These libraries are licensed under their own respective terms, and will need to be installed separately. Refer to the documentation of each library for installation instructions and licensing details. 17 | 18 | The Bash tutorials are licensed under the MIT License (Bash itself is licensed under the GNU General Public License). Some of the included scripts may rely on external tools such as curl, jq, YAZ, and gnuplot, each of which is licensed under its own terms. Users must obtain and install these tools separately. Refer to the documentation of each tool for installation instructions and licensing details. 19 | 20 | Lastly, most of the R tutorial scripts are MIT licensed, but some are licensed under the [GPL-3 License](https://github.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/blob/main/LICENSE_selected_R_tutorials) because they depend on GPL-licensed R libraries (refer to the documentation of each R library for installation instructions and licensing details). The R tutorials with GPL-3 licenses are indicated at the top of the respective files and organized separately in the folder `src/r-gpl3/`. 21 | 22 | We have endeavored to follow the appropriate terms and usage policies of each scholarly API, web service, and Z39.50 server. We have linked to the terms and policies where possible. Some database APIs may require a valid library subscription, institutional access, or individual account to use their services. Please be responsible when reusing these scripts and respect the API terms and usage policies (e.g., query limits, record downloads, data sharing restrictions). Data output snippets shown in this book are for demonstration purposes and are credited to the individual API or database service. The output generated from APIs or services remains subject to the terms and conditions of the respective provider. Some outputs (e.g., U.S. Government works) may be in the public domain, while others may require attribution or adherence to other conditions. 23 | 24 | If you reuse the code, attribution would be appreciated. 
Please link to the Cookbook and cite our manuscript: 25 | 26 | Link to Cookbook: https://ua-libraries-research-data-services.github.io/UALIB_ScholarlyAPI_Cookbook 27 | 28 | Citation: Scalfani, V. F.; Walker, K. W.; Simpson, L.; Fernandez, A. M.; Patel, V. D.; Ramig, A.; Gomes, C.; Moen, M. T.; Nguyen, A. M. Creating a Scholarly API Cookbook: Supporting Library Users with Programmatic Access to Information. Issues in Science and Technology Librarianship, 2023, No. 104. https://doi.org/10.29173/istl2766. 29 | 30 | ```bibtex 31 | @article{scalfani_creating_2023, 32 | title = {Creating a {Scholarly} {API} {Cookbook}: {Supporting} {Library} {Users} with {Programmatic} {Access} to {Information}}, 33 | issn = {1092-1206}, 34 | shorttitle = {Creating a {Scholarly} {API} {Cookbook}}, 35 | url = {https://journals.library.ualberta.ca/istl/index.php/istl/article/view/2766}, 36 | doi = {10.29173/istl2766}, 37 | abstract = {Scholarly web-based application programming interfaces (APIs) allow users to interact with information and data programmatically. Interacting with information programmatically allows users to create advanced information query workflows and quickly access machine-readable data for downstream computations. With the growing availability of scholarly APIs from open and commercial library databases, supporting access to information via an API has become a key support area for research data services in libraries. This article describes our efforts with supporting API access through the development of an online Scholarly API Cookbook. The Cookbook contains code recipes (i.e., tutorials) for getting started with 10 different scholarly APIs, including for example, Scopus, World Bank, and PubMed. API tutorials are available in Python, Bash, Matlab, and Mathematica. A tutorial for interacting with library catalog data programmatically via Z39.50 is also included, as traditional library catalog metadata is rarely available via an API. In addition to describing the Scholarly API Cookbook content, we discuss our experiences building a student research data services programming team, challenges we encountered, and ideas to improve the Cookbook. The University of Alabama Libraries Scholarly API Cookbook is freely available and hosted on GitHub. All code within the API Cookbook is licensed with the permissive MIT license, and as a result, users are free to reuse and adapt the code in their teaching and research.}, 38 | number = {104}, 39 | urldate = {2023-10-13}, 40 | journal = {Issues in Science and Technology Librarianship}, 41 | author = {Scalfani, Vincent F. and Walker, Kevin W. and Simpson, Lance and Fernandez, Avery M. and Patel, Vishank D. and Ramig, Anastasia and Gomes, Cyrus and Moen, Michael T. and Nguyen, Adam M.}, 42 | month = oct, 43 | year = {2023}, 44 | } 45 | ``` 46 | 47 | ## Archived Recipes 48 | 49 | We have decided to no longer maintain the Matlab, Mathematica, Bash (except the Z39.50 Bash recipe), and C recipes and have removed them from the Scholarly API Cookbook. These archived recipes are still in the [UA Libraries Scholarly API Cookbook Archive](https://github.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive). 50 | 51 | ### Archive License Information 52 | 53 | The code in the UA Libraries Scholarly API Cookbook Archive is licensed under the [MIT License](https://github.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/blob/main/LICENSE). This includes code written to be used with Wolfram Mathematica and MathWorks MATLAB. 
However, these proprietary software packages themselves are not covered under the MIT License, and users must have valid licenses for Mathematica and MATLAB to run the associated code. 54 | 55 | The Python scripts in this repository are licensed under the MIT License. However, these scripts may rely on external libraries such as matplotlib, pandas, and others. These libraries are licensed under their own respective terms, and will need to be installed separately. Refer to the documentation of each library for installation instructions and licensing details. 56 | 57 | The Bash tutorials are licensed under the MIT License (Bash itself is licensed under the GNU General Public License). Some of the included scripts may rely on external tools such as curl, jq, and gnuplot, each of which is licensed under its own terms. Users must obtain and install these tools separately. Refer to the documentation of each tool for installation instructions and licensing details. 58 | 59 | The C code in the archive is licensed under the MIT License. This repository provides only the source code, and users will need to compile the C programs to run them. Some of the C code depends on external libraries such as curl, jq, and YAZ, which are licensed under their own respective terms. These libraries will need to be obtained and installed separately by the user. 60 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | 3 | title: Scholarly API Cookbook 4 | author: The University of Alabama Libraries 5 | copyright: "2025" 6 | logo: images/UALIB_logo.png 7 | exclude_patterns: [readme.md, old, conversion_scripts] 8 | # Force re-execution of notebooks on each build. 
9 | # See https://jupyterbook.org/content/execute.html 10 | execute: 11 | execute_notebooks: off 12 | 13 | # Define the name of the latex output file for PDF builds 14 | latex: 15 | latex_documents: 16 | targetname: book.tex 17 | 18 | # Add a bibtex file so that we can create citations 19 | #bibtex_bibfiles: 20 | # - references.bib 21 | 22 | # Information about where the book exists on the web 23 | repository: 24 | url: https://github.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook # Online location of your book 25 | # path_to_book: # Optional path to your book, relative to the repository root 26 | branch: main # Which branch of the repository should be used when creating links (optional) 27 | 28 | # Add GitHub buttons to your book 29 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository 30 | html: 31 | favicon: images/UALIB_favicon.png 32 | use_issues_button: true 33 | use_repository_button: true 34 | 35 | launch_buttons: 36 | colab_url: "" 37 | binderhub_url: "" 38 | jupyterhub_url: "" 39 | -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | format: jb-book 3 | root: src/about/introduction 4 | parts: 5 | 6 | - caption: ARTICLE FULL-TEXT 7 | chapters: 8 | 9 | # Open Science Framework 10 | - file: src/overview/osf 11 | sections: 12 | - file: src/python/osf 13 | title: "...in Python" 14 | 15 | # ScienceDirect 16 | - file: src/overview/sdirect 17 | sections: 18 | - file: src/python/sdirect 19 | title: "...in Python" 20 | - file: src/r/sdirect 21 | title: "...in R" 22 | 23 | # Springer 24 | - file: src/overview/springer 25 | sections: 26 | - file: src/python/springer 27 | title: "...in Python" 28 | 29 | # Wiley TDM 30 | - file: src/overview/wiley-tdm 31 | sections: 32 | - file: src/python/wiley-tdm 33 | title: "...in Python" 34 | - file: src/r/wiley-tdm 35 | title: "...in R" 36 | 37 | 38 | - caption: BIBLIOGRAPHIC 39 | chapters: 40 | 41 | # arXiv 42 | - file: src/overview/arxiv 43 | sections: 44 | - file: src/r/arxiv 45 | title: "...in R" 46 | 47 | # Crossref 48 | - file: src/overview/crossref 49 | sections: 50 | - file: src/python/crossref 51 | title: "...in Python" 52 | - file: src/r/crossref 53 | title: "...in R" 54 | 55 | # OpenAlex 56 | - file: src/overview/openalex 57 | sections: 58 | - file: src/python/openalex 59 | title: "...in Python" 60 | 61 | # Research Organization Registry 62 | - file: src/overview/ror 63 | sections: 64 | - file: src/python/ror 65 | title: "...in Python" 66 | 67 | # Scopus 68 | - file: src/overview/scopus 69 | sections: 70 | - file: src/python/scopus 71 | title: "...in Python" 72 | 73 | # Web of Science 74 | - file: src/overview/wos 75 | sections: 76 | - file: src/python/wos 77 | title: "...in Python" 78 | 79 | 80 | - caption: BUSINESS 81 | chapters: 82 | 83 | # U.S. Bureau of Economic Analysis 84 | - file: src/overview/bea 85 | sections: 86 | - file: src/python/bea 87 | title: "...in Python" 88 | 89 | # U.S. Bureau of Labor Statistics 90 | - file: src/overview/bls 91 | sections: 92 | - file: src/python/bls 93 | title: "...in Python" 94 | 95 | # U.S. Securities and Exchange 96 | - file: src/overview/sec-edgar 97 | sections: 98 | - file: src/python/sec-edgar 99 | title: "...in Python" 100 | - file: src/r/sec-edgar 101 | title: "...in R" 102 | 103 | # U.S. 
Treasury 104 | - file: src/overview/us-treasury 105 | sections: 106 | - file: src/python/us-treasury 107 | title: "...in Python" 108 | 109 | # World Bank 110 | - file: src/overview/world-bank 111 | sections: 112 | - file: src/python/world-bank 113 | title: "...in Python" 114 | - file: src/r/world-bank 115 | title: "...in R" 116 | 117 | 118 | - caption: GENERAL 119 | chapters: 120 | 121 | # National Park Service 122 | - file: src/overview/nps 123 | sections: 124 | - file: src/python/nps 125 | title: "...in Python" 126 | 127 | # Speedrun.com 128 | - file: src/overview/speedrun 129 | sections: 130 | - file: src/python/speedrun 131 | title: "...in Python" 132 | 133 | # Stack Exchange 134 | - file: src/overview/stack-exchange 135 | sections: 136 | - file: src/python/stack-exchange 137 | title: "...in Python" 138 | 139 | # Z39.50 140 | - file: src/overview/z3950 141 | sections: 142 | - file: src/shell/z3950 143 | title: "...in Bash" 144 | 145 | 146 | - caption: GIS 147 | chapters: 148 | 149 | # GeoNames 150 | - file: src/overview/geonames 151 | sections: 152 | - file: src/python/geonames 153 | title: "...in Python" 154 | 155 | # OpenStreetMap 156 | - file: src/overview/osm 157 | sections: 158 | - file: src/python/osm 159 | title: "...in Python" 160 | 161 | # U.S. Census Geocoding 162 | - file: src/overview/us-census-geocoding 163 | sections: 164 | - file: src/python/us-census-geocoding 165 | title: "...in Python" 166 | 167 | # USGS National Map 168 | - file: src/overview/usgs-national-map 169 | sections: 170 | - file: src/python/usgs-national-map 171 | title: "...in Python" 172 | 173 | 174 | - caption: HUMANITIES 175 | chapters: 176 | 177 | # Chronicling America 178 | - file: src/overview/chronam 179 | sections: 180 | - file: src/python/chronam 181 | title: "...in Python" 182 | - file: src/r/chronam 183 | title: "...in R" 184 | 185 | 186 | - caption: SCIENTIFIC 187 | chapters: 188 | 189 | # CAS Common Chemistry 190 | - file: src/overview/casc 191 | sections: 192 | - file: src/python/casc 193 | title: "...in Python" 194 | - file: src/r/casc 195 | title: "...in R" 196 | 197 | # FoodData Central 198 | - file: src/overview/fdc 199 | sections: 200 | - file: src/python/fdc 201 | title: "...in Python" 202 | 203 | # NASA Images 204 | - file: src/overview/nasa-images 205 | sections: 206 | - file: src/python/nasa-images 207 | title: "...in Python" 208 | 209 | # National Weather Service 210 | - file: src/overview/nws 211 | sections: 212 | - file: src/python/nws 213 | title: "...in Python" 214 | 215 | # PubChem 216 | - file: src/overview/pubchem 217 | sections: 218 | - file: src/python/pubchem 219 | title: "...in Python" 220 | - file: src/r-gpl3/pubchem 221 | title: "...in R" 222 | 223 | # PubChem Periodic Table 224 | - file: src/overview/pubchem-periodic-table 225 | sections: 226 | - file: src/python/pubchem-periodic-table 227 | title: "...in Python" 228 | 229 | # PubMed 230 | - file: src/overview/pubmed 231 | sections: 232 | - file: src/python/pubmed 233 | title: "...in Python" 234 | - file: src/r-gpl3/pubmed 235 | title: "...in R" 236 | 237 | 238 | - caption: SOCIAL SCIENCES 239 | chapters: 240 | 241 | # College Scorecard 242 | - file: src/overview/college-scorecard 243 | sections: 244 | - file: src/python/college-scorecard 245 | title: "...in Python" 246 | - file: src/r/college-scorecard 247 | title: "...in R" 248 | 249 | # Congress.gov 250 | - file: src/overview/congress 251 | sections: 252 | - file: src/python/congress 253 | title: "...in Python" 254 | 255 | # U.S. 
Census Data 256 | - file: src/overview/us-census 257 | sections: 258 | - file: src/python/us-census 259 | title: "...in Python" 260 | - file: src/r-gpl3/us-census 261 | title: "...in R" 262 | 263 | # USA Spending 264 | - file: src/overview/usa-spending 265 | sections: 266 | - file: src/python/usa-spending 267 | title: "...in Python" 268 | - file: src/r/usa-spending 269 | title: "...in R" 270 | 271 | - caption: ABOUT 272 | chapters: 273 | - file: src/about/contributing 274 | - file: src/about/license-reuse 275 | - file: src/about/tech-details 276 | -------------------------------------------------------------------------------- /images/UALIB_favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/images/UALIB_favicon.png -------------------------------------------------------------------------------- /images/UALIB_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/images/UALIB_logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book 2 | matplotlib 3 | numpy 4 | -------------------------------------------------------------------------------- /src/about/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | %%%%%%%%%%%%%% 3 | 4 | How to Contribute 5 | ****************** 6 | 7 | This is an open source resource. Any contributions are welcome. If you find a 8 | mistake or have an idea, please let us know via the GitHub Issues Tracker. 9 | 10 | In addition, any general feedback is always welcome! 11 | -------------------------------------------------------------------------------- /src/about/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | %%%%%%%%%%%%%% 3 | 4 | What is this? 5 | ************* 6 | 7 | This is an open online book containing short scholarly API code examples (i.e., "recipes") 8 | that demonstrate how to work with various scholarly web service APIs. It is part of the University of 9 | Alabama Libraries efforts to support `Research Data Services`_. 10 | 11 | .. _Research Data Services: https://guides.lib.ua.edu/ResearchDataServices 12 | 13 | What should I be aware of before getting started? 14 | ************************************************* 15 | 16 | Before interacting with any scholarly APIs (or similar web service), it is very important to review the 17 | usage policies, which generally includes information such as query limits and data reuse policies. 18 | We have endeavored to follow all appropriate API usage policies in our examples and have linked 19 | to the specific API policies where possible. While some APIs are openly accessible and do 20 | not require special authentication, other scholarly APIs require that you are affiliated with a 21 | subscribing institution, and have registered for an API key to use for authentication in API queries. 22 | We have added instructions about any necessary authentication within the relevant code recipes. 23 | 24 | 25 | .. 
important:: 26 | 27 | In general, scholarly APIs are designed for the collection of small to medium 28 | sized datasets; that is, in the range of 100s or maybe a few thousand queries at most 29 | (varies with the API). If you need large bulk datasets, an API is likely not the method to use, and 30 | there may be bulk data downloads available from the database instead. 31 | 32 | If you decide that your use-case is appropriate for a scholarly API (or similar web service), here are a few good general practices 33 | to follow when working with any web API: 34 | 35 | 1. Read the API documentation and usage guidelines before starting. 36 | 2. Start with testing the behavior of the API using a single programmatic API request (i.e., not in a loop). 37 | 3. Add a 1 second delay between API requests when using a loop. 38 | 4. When using a loop to repeat API requests, start out with a small list, perhaps 3-5. 39 | 5. Cache the API returned data when testing. For example, if you are trying to parse the returned API data in a scripting workflow, save the returned data in a variable or to a file so that you do not need to repeat the API request unnecessarily for the downstream parsing or analysis. 40 | 41 | What kind of content is included? 42 | ********************************* 43 | 44 | The scope of this book is to provide short code examples related to the retrieval of data and information 45 | from scholarly APIs using several different programming languages. 46 | 47 | While there may be some introductory programming content in this book, the 48 | content is not meant to be a general introduction to programming. 49 | Instead, our aim with the Scholarly API Cookbook is to provide 50 | some short scripting based workflows for working with scholarly data and information APIs. 51 | For more general introductions to programming, we recommend searching the 52 | UA Libraries Scout database for programming books (e.g., `TI python`). 53 | 54 | .. seealso:: 55 | 56 | UA Libraries Workshop lessons and references therein for more general 57 | programming content [#ua_work]_. 58 | 59 | 60 | Which Programming Languages are Covered? 61 | **************************************** 62 | 63 | Currently, we have scholarly API code examples in Python and R (and a Z39.50 tutorial in Bash). 64 | For good luck, let's add ``Hello World!`` in each programming language: 65 | 66 | .. tab-set:: 67 | 68 | .. tab-item:: Python 69 | 70 | .. code-block:: python 71 | 72 | >>> print("Hello World!") 73 | 74 | .. tab-item:: R 75 | 76 | .. code-block:: r 77 | 78 | > print("Hello World!") 79 | 80 | Who is Creating the Content? 81 | **************************** 82 | 83 | The Scholarly API Cookbook content is authored by University of Alabama 84 | Libraries faculty and student assistants. Specific authors are noted on each 85 | tutorial or document page. 86 | 87 | .. rubric:: References 88 | 89 | .. [#ua_work] ``_ 90 | 91 | 92 | -------------------------------------------------------------------------------- /src/about/license-reuse.rst: -------------------------------------------------------------------------------- 1 | License and Reuse 2 | %%%%%%%%%%%%%%%%%% 3 | 4 | License and Reuse 5 | ****************** 6 | 7 | Most of the code in this Scholarly API Cookbook is licensed under the `MIT License `_. 8 | 9 | The Python scripts in this Scholarly API Cookbook are licensed under the MIT License. However, these scripts may rely on external libraries such as matplotlib, pandas, and others. 
These libraries are licensed under their own respective terms, and will need to be installed separately. Refer to the documentation of each library for installation instructions and licensing details. 10 | 11 | The Bash scripts are licensed under the MIT License (Bash itself is licensed under the GNU General Public License). Some of the included scripts may rely on external tools such as curl, jq, YAZ, and gnuplot, each of which is licensed under its own terms. Users must obtain and install these tools separately. Refer to the documentation of each tool for installation instructions and licensing details. 12 | 13 | Lastly, most of the R tutorial scripts are MIT licensed, but some are licensed under the `GPL-3 License `_ because they depend on GPL-licensed R libraries (refer to the documentation of each R library for installation instructions and licensing details). The R tutorials with GPL-3 licenses are indicated at the top of the respective files. 14 | 15 | .. important:: 16 | 17 | We have endeavored to follow the appropriate terms and usage policies of each scholarly API, web service, and Z39.50 server. We have linked to the terms and policies where possible. Some database APIs may require a valid library subscription, institutional access, or individual account to use their services. Please be responsible when reusing these scripts and respect the API terms and usage policies (e.g., query limits, record downloads, data sharing restrictions). Data output snippets shown in this book are for demonstration purposes and are credited to the individual API or database service. The output generated from APIs or services remains subject to the terms and conditions of the respective provider. Some outputs (e.g., U.S. Government works) may be in the public domain, while others may require attribution or adherence to other conditions. 18 | 19 | If you reuse the code, attribution would be appreciated. Please link to the Cookbook and cite our manuscript: 20 | 21 | Link to Cookbook: ``_ 22 | 23 | Citation: Scalfani, V. F.; Walker, K. W.; Simpson, L.; Fernandez, A. M.; Patel, V. D.; Ramig, A.; Gomes, C.; Moen, M. T.; Nguyen, A. M. Creating a Scholarly API Cookbook: Supporting Library Users with Programmatic Access to Information. *Issues in Science and Technology Librarianship*, **2023**, No. 104. ``_. 24 | 25 | .. code-block:: bibtex 26 | 27 | @article{scalfani_creating_2023, 28 | title = {Creating a {Scholarly} {API} {Cookbook}: {Supporting} {Library} {Users} with {Programmatic} {Access} to {Information}}, 29 | issn = {1092-1206}, 30 | shorttitle = {Creating a {Scholarly} {API} {Cookbook}}, 31 | url = {https://journals.library.ualberta.ca/istl/index.php/istl/article/view/2766}, 32 | doi = {10.29173/istl2766}, 33 | abstract = {Scholarly web-based application programming interfaces (APIs) allow users to interact with information and data programmatically. Interacting with information programmatically allows users to create advanced information query workflows and quickly access machine-readable data for downstream computations. With the growing availability of scholarly APIs from open and commercial library databases, supporting access to information via an API has become a key support area for research data services in libraries. This article describes our efforts with supporting API access through the development of an online Scholarly API Cookbook. 
The Cookbook contains code recipes (i.e., tutorials) for getting started with 10 different scholarly APIs, including for example, Scopus, World Bank, and PubMed. API tutorials are available in Python, Bash, Matlab, and Mathematica. A tutorial for interacting with library catalog data programmatically via Z39.50 is also included, as traditional library catalog metadata is rarely available via an API. In addition to describing the Scholarly API Cookbook content, we discuss our experiences building a student research data services programming team, challenges we encountered, and ideas to improve the Cookbook. The University of Alabama Libraries Scholarly API Cookbook is freely available and hosted on GitHub. All code within the API Cookbook is licensed with the permissive MIT license, and as a result, users are free to reuse and adapt the code in their teaching and research.}, 34 | number = {104}, 35 | urldate = {2023-10-13}, 36 | journal = {Issues in Science and Technology Librarianship}, 37 | author = {Scalfani, Vincent F. and Walker, Kevin W. and Simpson, Lance and Fernandez, Avery M. and Patel, Vishank D. and Ramig, Anastasia and Gomes, Cyrus and Moen, Michael T. and Nguyen, Adam M.}, 38 | month = oct, 39 | year = {2023}, 40 | } 41 | 42 | Scholarly API Cookbook Archive 43 | ****************************** 44 | 45 | We have decided to no longer maintain the Matlab, Mathematica, Bash (except the Z39.50 Bash recipe), and C recipes and have removed them from the Scholarly API Cookbook. These archived recipes are still in the `UA Libraries Scholarly API Cookbook Archive `_. 46 | 47 | The code in the UA Libraries Scholarly API Cookbook Archive is licensed under the `MIT License `_. This includes code written to be used with Wolfram Mathematica and MathWorks MATLAB. However, these proprietary software packages themselves are not covered under the MIT License, and users must have valid licenses for Mathematica and MATLAB to run the associated code. 48 | 49 | The Python scripts in this Scholarly API Cookbook are licensed under the MIT License. However, these scripts may rely on external libraries such as matplotlib, pandas, and others. These libraries are licensed under their own respective terms, and will need to be installed separately. Refer to the documentation of each library for installation instructions and licensing details. 50 | 51 | The Bash scripts are licensed under the MIT License (Bash itself is licensed under the GNU General Public License). Some of the included scripts may rely on external tools such as curl, jq, YAZ, and gnuplot, each of which is licensed under its own terms. Users must obtain and install these tools separately. Refer to the documentation of each tool for installation instructions and licensing details. 52 | 53 | The C code in the archive is licensed under the MIT License. This repository provides only the source code, and users will need to compile the C programs to run them. Some of the C code depends on external libraries such as curl, jq, and YAZ, which are licensed under their own respective terms. These libraries will need to be obtained and installed separately by the user. 54 | -------------------------------------------------------------------------------- /src/about/tech-details.rst: -------------------------------------------------------------------------------- 1 | Technical Details 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Technology and Software Used 5 | ***************************** 6 | 7 | 1. Python content is written in `Jupyter Notebooks`_. 8 | 2. 
R content is written in RMarkdown and exported to Markdown. 9 | 3. All other content is written in `reStructuredText`_. 10 | 4. Code testing is done locally. 11 | 5. `Jupyter Book`_ is used to compile and create the HTML files via an automated GitHub Workflow. This workflow builds the book and hosts the HTML content with `GitHub Actions`_. 12 | 13 | .. _Jupyter Notebooks: https://jupyter.org/ 14 | .. _reStructuredText: https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html 15 | .. _Jupyter Book: https://jupyterbook.org/intro.html 16 | .. _GitHub Actions: https://docs.github.com/en/actions 17 | -------------------------------------------------------------------------------- /src/overview/arxiv.rst: -------------------------------------------------------------------------------- 1 | arXiv 2 | %%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | ***************** 6 | 7 | The arXiv API provides a programmatically accessible interface to their extensive database of scientific papers on the arXiv website. It uses a RESTful interface and allows scholars to query and retrieve papers based on a variety of parameters. The API is free to use and does not require an API key. However, the API does have a rate limit of one request per three seconds [#arxiv1]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#arxiv1] ``_ -------------------------------------------------------------------------------- /src/overview/bea.rst: -------------------------------------------------------------------------------- 1 | U.S. Bureau of Economic Analysis 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The U.S. Bureau of Economic Analysis (BEA) API provides programmatic access to economic data published by the BEA. A UserID is required for this API, and a rate limit of 100 requests, 100 MB, and 30 errors per minute is enforced. 8 | 9 | See the BEA API documentation [#bea1]_ and BEA API user guide [#bea2]_ for more information on accessing the API. Please check the terms of use [#bea3]_ for more information on the usage of this API. 10 | 11 | *This product uses the Bureau of Economic Analysis (BEA) Data API but is not endorsed or certified by BEA.* 12 | 13 | .. rubric:: References 14 | 15 | .. [#bea1] ``_ 16 | 17 | .. [#bea2] ``_ 18 | 19 | .. [#bea3] ``_ 20 | 21 | -------------------------------------------------------------------------------- /src/overview/bls.rst: -------------------------------------------------------------------------------- 1 | U.S. Bureau of Labor Statistics 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Bureau of Labor Statistics Public Data API provides access to the data published by the BLS, which primarily concerns prices, employment, compensation, and productivity in the United States. Registration or API keys are not required for the v1.0 API, but registration is required to access the v2.0 API [#us_bls1]_. Additionally, users must follow their rate limits [#us_bls2]_. 8 | 9 | See the BLS terms of service for more information on how the API can be used [#us_bls3]_. 10 | 11 | .. rubric:: References 12 | 13 | .. [#us_bls1] ``_ 14 | 15 | .. [#us_bls2] ``_ 16 | 17 | .. 
[#us_bls3] ``_ 18 | 19 | -------------------------------------------------------------------------------- /src/overview/casc.rst: -------------------------------------------------------------------------------- 1 | CAS Common Chemistry 2 | %%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The CAS Common Chemistry API provides access to information on ~500,000 chemical substances from the CAS REGISTRY. Example API queries include the ability to search via chemical name, SMILES, and InChI [#casc1]_. Registration is required for access [#casc2]_. CAS Common Chemistry content is CC-BY-NC 4.0 licensed; for specific reuse questions, contact CAS [#casc3]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#casc1] ``_ 12 | 13 | .. [#casc2] ``_ 14 | 15 | .. [#casc3] ``_ 16 | -------------------------------------------------------------------------------- /src/overview/chronam.rst: -------------------------------------------------------------------------------- 1 | Chronicling America 2 | %%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Library of Congress Chronicling America API provides programmatic access to historic newspaper text and images. Registration is not required. See the Chronicling America API documentation for information about API specification, API policies, and example use-cases [#chronam1]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#chronam1] ``_ 12 | 13 | -------------------------------------------------------------------------------- /src/overview/college-scorecard.rst: -------------------------------------------------------------------------------- 1 | College Scorecard 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The College Scorecard API provides programmatic access to data on institutions of higher education published by the U.S. Department of Education. Registration for the API is required [#cs1]_. More information about the data available through the API can be found in the documentation [#cs2]_. 8 | 9 | See the Department of Education's website for more information on how the data from the API can be used [#cs3]_. 10 | 11 | .. rubric:: References 12 | 13 | .. [#cs1] ``_ 14 | 15 | .. [#cs2] ``_ 16 | 17 | .. [#cs3] ``_ 18 | -------------------------------------------------------------------------------- /src/overview/congress.rst: -------------------------------------------------------------------------------- 1 | Congress.gov 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | Congress.gov [#con1]_ is the official website for U.S. federal legislative information. 8 | 9 | The Congress API provides users access to a variety of information about the U.S. Congress. 10 | 11 | See the API documentation [#con2]_ for more information on using the API and their legal page [#con3]_ for more information on licensing. 12 | 13 | .. rubric:: References 14 | 15 | .. [#con1] ``_ 16 | 17 | .. [#con2] ``_ 18 | 19 | .. [#con3] ``_ -------------------------------------------------------------------------------- /src/overview/crossref.rst: -------------------------------------------------------------------------------- 1 | Crossref 2 | %%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | ***************** 6 | 7 | The Crossref API provides programmatic access to bibliographic document information and related metadata [#crossref1]_. Registration is not required. See the Crossref API documentation for examples, specific API policies, and data reuse information [#crossref2]_. 
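Because the Crossref REST API is open and returns JSON, a single-record lookup is a convenient first test before building a larger workflow. The short Python sketch below is one possible starting point: it assumes the ``requests`` library, uses the Cookbook manuscript's DOI as a sample query, and passes a ``mailto`` address (replace the placeholder with your own email), which Crossref recommends for its "polite" pool.

.. code-block:: python

    import requests

    # Sample lookup: metadata for a single DOI (the Cookbook manuscript's DOI is used here)
    doi = "10.29173/istl2766"
    url = f"https://api.crossref.org/works/{doi}"
    # Including a contact email is recommended etiquette for the Crossref API
    response = requests.get(url, params={"mailto": "your_email@example.com"}, timeout=30)
    response.raise_for_status()
    record = response.json()["message"]
    print(record.get("title"), record.get("container-title"))

The same pattern (build the request URL, check the status, then parse the returned JSON) carries over to the search endpoints used in the Python and R recipes.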
8 | 9 | .. rubric:: References 10 | 11 | .. [#crossref1] ``_ 12 | 13 | .. [#crossref2] ``_ 14 | -------------------------------------------------------------------------------- /src/overview/fdc.rst: -------------------------------------------------------------------------------- 1 | FoodData Central 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The FoodData Central API provides users access to a variety of information about food products and their nutritional content. 8 | 9 | See the API documentation for more information on using the API [#fdc1]_ . 10 | 11 | "U.S. Department of Agriculture, Agricultural Research Service. FoodData Central, 2019. fdc.nal.usda.gov.". 12 | 13 | "USDA FoodData Central data are in the public domain and they are not copyrighted. They are published under CC0 1.0 Universal (CC0 1.0)" [#fdc2]_ . 14 | 15 | .. rubric:: References 16 | 17 | .. [#fdc1] ``_ 18 | 19 | .. [#fdc2] ``_ -------------------------------------------------------------------------------- /src/overview/geonames.rst: -------------------------------------------------------------------------------- 1 | GeoNames 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The GeoNames API allows users to programmatically access the GeoNames database. Registration is required to access this API [#gn1]_ . 8 | 9 | See the API documentation [#gn2]_ for more information on accessing the API. The GeoNames API is licensed under the CC BY 4.0 Deed license, allowing users to share and adapt its data with attribution [#gn3]_ . 10 | 11 | .. rubric:: References 12 | 13 | .. [#gn1] ``_ 14 | 15 | .. [#gn2] ``_ 16 | 17 | .. [#gn3] ``_ 18 | -------------------------------------------------------------------------------- /src/overview/nasa-images.rst: -------------------------------------------------------------------------------- 1 | NASA Images 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The NASA Earth Polychromatic Imaging Camera (EPIC) API [#nasa1]_ provides the most recent images of Earth taken by the EPIC. 8 | 9 | The NASA Astronomy Picture of the Day (APOD) API [#nasa2]_ provides images of the universe taken by telescopes and other instruments. 10 | 11 | You can find more information about NASA APIs on their website [#nasa3]_ and information regarding the use of these images on their Images and Media page [#nasa4]_ . 12 | 13 | .. rubric:: References 14 | 15 | .. [#nasa1] ``_ 16 | 17 | .. [#nasa2] ``_ 18 | 19 | .. [#nasa3] ``_ 20 | 21 | .. [#nasa4] ``_ -------------------------------------------------------------------------------- /src/overview/nps.rst: -------------------------------------------------------------------------------- 1 | National Park Service 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The National Park Service (NPS) API contains pertinent information about national parks, monuments, and other sites managed by the NPS. An API key is required for this API, and registration can be found on the NPS website [#nps1]_ . Users are required to follow the rate limits of 1000 requests per hour [#nps2]_ . 8 | 9 | See the NPS API documentation [#nps3]_ for more information on accessing the API. Note that the data in the API "is generally considered in the public domain," according to the API's disclaimer [#nps4]_ . 10 | 11 | .. rubric:: References 12 | 13 | .. [#nps1] ``_ 14 | 15 | .. [#nps2] ``_ 16 | 17 | .. [#nps3] ``_ 18 | 19 | .. 
[#nps4] ``_ 20 | -------------------------------------------------------------------------------- /src/overview/nws.rst: -------------------------------------------------------------------------------- 1 | National Weather Service 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The National Weather Service (NWS) API provides programmatic access to forecasts and alerts published by the NWS. An API key is not required to access this API, but users are required to include a User Agent with all API requests and adhere to an unpublished rate limit (we recommend sending a maximum of 1 request per second). 8 | 9 | See the NWS API documentation [#nws1]_ for more information on accessing the API. Note that the documentation states, "All of the information presented via the API is intended to be open data, free to use for any purpose." 10 | 11 | .. rubric:: References 12 | 13 | .. [#nws1] ``_ 14 | -------------------------------------------------------------------------------- /src/overview/openalex.rst: -------------------------------------------------------------------------------- 1 | OpenAlex 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The OpenAlex API is an open catalog of the global research system created by the nonprofit OurResearch [#oa1]_ . An API key is not required for this API, but providing your email in requests will provide faster and more consistent response times. Users are required to follow the rate limits of 100000 requests per user per day and 10 requests per second. 8 | 9 | See the OpenAlex API documentation [#oa2]_ for more information on accessing the API. Note that the data in the API is licensed under the Creative Commons CC0 license [#oa3]_ , designating it as part of the public domain. See the OpenAlex Terms of Service [#oa4]_ for more information on how you can use this API. 10 | 11 | .. rubric:: References 12 | 13 | .. [#oa1] ``_ 14 | 15 | .. [#oa2] ``_ 16 | 17 | .. [#oa3] ``_ 18 | 19 | .. [#oa4] ``_ 20 | -------------------------------------------------------------------------------- /src/overview/osf.rst: -------------------------------------------------------------------------------- 1 | Open Science Framework 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | Open Science Framework (OSF) is an open-source service maintained by the Center for Open Science [#osf1]_ . The OSF API allows programmatic access to OSF data and files. 8 | 9 | A token is optional for using this API, but signup can be found on the OSF website [#osf2]_ . Unauthenticated users are limited to 100 requests per hour, but authenticated users are allowed 10,000 requests per day. 10 | 11 | See the OSF API documentation [#osf3]_ for more information on accessing the API. See the OSF API terms of use [#osf4]_ for more information on how you can use this API. 12 | 13 | .. rubric:: References 14 | 15 | .. [#osf1] ``_ 16 | 17 | .. [#osf2] ``_ 18 | 19 | .. [#osf3] ``_ 20 | 21 | .. [#osf4] ``_ 22 | -------------------------------------------------------------------------------- /src/overview/osm.rst: -------------------------------------------------------------------------------- 1 | OpenStreetMap 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | OpenStreetMap (OSM) [#osm1]_ is a worldwide open data mapping service supported by the OpenStreetMap Foundation (OSMF). 8 | 9 | The OSM Overpass API allows users to programmatically read data from OSM. 
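As a rough illustration of what an Overpass request looks like, the Python sketch below posts a small Overpass QL query to the public Overpass endpoint. The bounding box is an approximate area around Tuscaloosa, AL chosen only for demonstration, and the ``requests`` library is assumed; adjust the tag filter and coordinates for your own use case.

.. code-block:: python

    import requests

    # Overpass QL: find library nodes inside an approximate Tuscaloosa, AL bounding box
    query = """
    [out:json][timeout:25];
    node["amenity"="library"](33.15,-87.60,33.25,-87.45);
    out;
    """
    response = requests.post(
        "https://overpass-api.de/api/interpreter", data={"data": query}, timeout=60
    )
    response.raise_for_status()
    for element in response.json().get("elements", []):
        print(element["tags"].get("name", "Unnamed"), element["lat"], element["lon"])

Keep queries small and infrequent; the public Overpass instances are shared community resources.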
10 | 11 | See the API documentation [#osm2]_ for more information on accessing the API and the OSM Copyright and License Page [#osm3]_ for more information on the data's license. 12 | 13 | .. rubric:: References 14 | 15 | .. [#osm1] ``_ 16 | 17 | .. [#osm2] ``_ 18 | 19 | .. [#osm3] ``_ 20 | -------------------------------------------------------------------------------- /src/overview/pubchem-periodic-table.rst: -------------------------------------------------------------------------------- 1 | PubChem Periodic Table 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | PubChem is a database of chemical molecules and their properties. It is maintained by the National Center for Biotechnology Information (NCBI), a division of the National Library of Medicine (NLM) at the National Institutes of Health (NIH). PubChem is a key chemical information resource for scientists, students, and the general public. 8 | 9 | The PubChem Periodic Table API provides programmatic access to a machine-readable periodic table. An API key is not required for this API, but a rate limit of 5 requests per second is enforced. 10 | 11 | See the PubChem Periodic Table [#ppt1]_ and PubChem APIs documentation [#ppt2]_ for more information on accessing the API. Please check the terms of use [#ppt3]_ for more information on the usage of this API. 12 | 13 | .. rubric:: References 14 | 15 | .. [#ppt1] ``_ 16 | 17 | .. [#ppt2] ``_ 18 | 19 | .. [#ppt3] ``_ 20 | 21 | -------------------------------------------------------------------------------- /src/overview/pubchem.rst: -------------------------------------------------------------------------------- 1 | PubChem 2 | %%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | PubChem APIs allow programmatic access to search and retrieve small molecule and related data. Registration is not required. See the PubChem Docs for information about API specification, API policies, and example use-cases [#pubchem1]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#pubchem1] ``_ 12 | 13 | -------------------------------------------------------------------------------- /src/overview/pubmed.rst: -------------------------------------------------------------------------------- 1 | PubMed 2 | %%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | PubMed bibliographic and related NCBI information is programmatically accessible through the Entrez Programming Utilities API. Registration is not required, though registering may offer additional API features [#pubmed1]_. See the NCBI API Usage Guidelines [#pubmed2]_ and Data Usage Policy [#pubmed3]_. 8 | 9 | 10 | .. rubric:: References 11 | 12 | .. [#pubmed1] ``_ 13 | 14 | .. [#pubmed2] ``_ 15 | 16 | .. [#pubmed3] ``_ 17 | -------------------------------------------------------------------------------- /src/overview/ror.rst: -------------------------------------------------------------------------------- 1 | Research Organization Registry 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Research Organization Registry (ROR) API provides programmatic access to the registry's persistent identifiers for research organizations. An API key is not required for this API, but they do require you to follow their rate limit of 2000 requests per five-minute period. 8 | 9 | See the ROR API documentation [#ror1]_ for more information on accessing the API. 
Note that the data in the API is licensed under the Creative Commons CC0 license [#ror2]_ , designating it as part of the public domain. 10 | 11 | .. rubric:: References 12 | 13 | .. [#ror1] ``_ 14 | 15 | .. [#ror2] ``_ 16 | 17 | -------------------------------------------------------------------------------- /src/overview/scopus.rst: -------------------------------------------------------------------------------- 1 | Scopus 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Elsevier Scopus API allows programmatic access to search and retrieve Scopus record information including, for example, author information, bibliographic metadata, and citations. Registration is required. See the Elsevier Developer Portal for information about the Scopus API specification, policies, and allowed use-cases [#scopus1]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#scopus1] ``_ 12 | 13 | -------------------------------------------------------------------------------- /src/overview/sdirect.rst: -------------------------------------------------------------------------------- 1 | ScienceDirect 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Elsevier ScienceDirect APIs allow programmatic access to search and retrieve ScienceDirect 8 | metadata and article full-text. Registration is required. See the Elsevier Developer Portal for 9 | information about the ScienceDirect API specification, policies, and allowed use-cases [#SDirect1]_. 10 | Note that Elsevier has detailed policies regarding text and data mining [#SDirect2]_. 11 | Please check with your institution for their Text and Data Mining Agreement with Elsevier. 12 | 13 | .. rubric:: References 14 | 15 | .. [#SDirect1] ``_ 16 | .. [#SDirect2] ``_ 17 | -------------------------------------------------------------------------------- /src/overview/sec-edgar.rst: -------------------------------------------------------------------------------- 1 | U.S. Securities and Exchange 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The U.S. Securities and Exchange Commission EDGAR API provides access to public company filing data. Registration or API keys are not required, but they do require you to add a user agent in the requests [#us_sec1]_ and follow their rate limits [#us_sec2]_. 8 | 9 | See the U.S. SEC Developer Resources [#us_sec3]_ and the SEC Web Site Privacy and Security Policy for information about data reuse [#us_sec4]_. 10 | 11 | .. rubric:: References 12 | 13 | .. [#us_sec1] ``_ 14 | 15 | .. [#us_sec2] ``_ 16 | 17 | .. [#us_sec3] ``_ 18 | 19 | .. [#us_sec4] ``_ 20 | 21 | -------------------------------------------------------------------------------- /src/overview/speedrun.rst: -------------------------------------------------------------------------------- 1 | Speedrun.com 2 | %%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | ***************** 6 | 7 | The Speedrun.com API provides programmatic access to the video game speedrunning data hosted and compiled by Speedrun.com [#speedrun1]_ . Registration is not required, but including a user-agent in your API requests is recommended. See the Speedrun.com API documentation [#speedrun2]_ for more information about this API. 8 | 9 | The Speedrun.com API is licensed under the CC BY-NC 4.0 license [#speedrun3]_ . See the Speedrun.com terms of use [#speedrun4]_ for more information. 10 | 11 | .. rubric:: References 12 | 13 | .. [#speedrun1] ``_ 14 | 15 | .. [#speedrun2] ``_ 16 | 17 | .. [#speedrun3] ``_ 18 | 19 | .. 
[#speedrun4] ``_ 20 | -------------------------------------------------------------------------------- /src/overview/springer.rst: -------------------------------------------------------------------------------- 1 | Springer Nature 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Springer Nature API offers programmatic access to a vast array of metadata and full-text content from the Springer Nature publishing database. 8 | There are three different APIs, including an Open Access API, a Metadata API, and a Text and Data Mining API [#springer1]_. 9 | Access to some data might require a subscription or purchase. For detailed guidelines, usage policies, and access to the API's 10 | full capabilities, refer to the official Springer Nature documentation [#springer2]_. 11 | Review their general terms of service [#springer3]_, API terms [#springer4]_, and TDM reservation policies [#springer5]_. 12 | Please check with your institution for their Text and Data Mining Agreement with Springer Nature. 13 | 14 | .. rubric:: References 15 | 16 | .. [#springer1] ``_ 17 | 18 | .. [#springer2] ``_ 19 | 20 | .. [#springer3] ``_ 21 | 22 | .. [#springer4] ``_ 23 | 24 | .. [#springer5] ``_ 25 | -------------------------------------------------------------------------------- /src/overview/stack-exchange.rst: -------------------------------------------------------------------------------- 1 | Stack Exchange 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | Stack Exchange is a network of question-and-answer websites, each covering a specific topic, where questions, answers, and users are subject to a reputation award process. 8 | 9 | This API provides users with access to a variety of information about the Stack Exchange network. 10 | 11 | See the API documentation for more information on using the API [#stack1]_ . 12 | 13 | See the Stack Exchange API Terms of Use [#stack2]_ and the Terms of Service for more information on licensing [#stack3]_ . 14 | 15 | .. rubric:: References 16 | 17 | .. [#stack1] ``_ 18 | 19 | .. [#stack2] ``_ 20 | 21 | .. [#stack3] ``_ -------------------------------------------------------------------------------- /src/overview/us-census-geocoding.rst: -------------------------------------------------------------------------------- 1 | U.S. Census Geocoding 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The U.S. Census Geocoding Services API allows users to obtain geographic information for U.S. addresses. An API key is not required to access this API. 8 | 9 | See the API documentation [#uscg1]_ for more information on accessing the API. Please see the U.S. Census Bureau APIs terms of service [#uscg2]_ for specific information about API policies, data reuse, and allowed use-cases. 10 | 11 | .. rubric:: References 12 | 13 | .. [#uscg1] ``_ 14 | 15 | .. [#uscg2] ``_ 16 | -------------------------------------------------------------------------------- /src/overview/us-census.rst: -------------------------------------------------------------------------------- 1 | U.S. Census Data 2 | %%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The U.S. Census Bureau offers several APIs for accessing census data and related information. Registration is not required, but registering allows more queries and features [#uscensus1]_. See the U.S. 
Census Data API User Guide [#uscensus2]_ and Terms of Service [#uscensus3]_ for specific information about API policies, data reuse, and allowed use-cases. 8 | 9 | .. rubric:: References 10 | 11 | .. [#uscensus1] ``_ 12 | 13 | .. [#uscensus2] ``_ 14 | 15 | .. [#uscensus3] ``_ 16 | 17 | -------------------------------------------------------------------------------- /src/overview/us-treasury.rst: -------------------------------------------------------------------------------- 1 | U.S. Treasury 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The U.S. Department of the Treasury API provides information about federal finances [#ustreasury1]_. See the Dataset Search [#ustreasury2]_ and terms of use [#ustreasury3]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#ustreasury1] ``_ 12 | 13 | .. [#ustreasury2] ``_ 14 | 15 | .. [#ustreasury3] ``_ 16 | -------------------------------------------------------------------------------- /src/overview/usa-spending.rst: -------------------------------------------------------------------------------- 1 | USA Spending 2 | %%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The USA Spending API provides programmatic access to comprehensive U.S. government spending data, including spending on awards (e.g., federal contracts, grants, geographic breakdowns, agency breakdowns) and account-level, non-award spending such as federal employee compensation. The API is currently in V2, with V1 endpoints being deprecated. No registration is required to use the API. Visit the USA Spending API documentation for information on getting started, available endpoints, and background information [#usas1]_ . 8 | 9 | .. rubric:: References 10 | 11 | .. [#usas1] ``_ 12 | 13 | -------------------------------------------------------------------------------- /src/overview/usgs-national-map.rst: -------------------------------------------------------------------------------- 1 | USGS National Map 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The United States Geological Survey (USGS) [#usgs1]_ is a U.S. agency whose work spans the disciplines of biology, geography, geology, and hydrology. 8 | 9 | The USGS National Map [#usgs2]_ is a collaborative effort among the USGS and other federal, state, and local partners to provide a variety of topographic information. 10 | 11 | You can view or download data using the USGS National Map Viewer [#usgs3]_ or the USGS National Map Downloader [#usgs4]_ , respectively. 12 | 13 | We will use their API [#usgs5]_ to query and download data. 14 | 15 | See the USGS Copyrights and Credits [#usgs6]_ and their FAQ page [#usgs7]_ for more information. 16 | 17 | .. rubric:: References 18 | 19 | .. [#usgs1] ``_ 20 | 21 | .. [#usgs2] ``_ 22 | 23 | .. [#usgs3] ``_ 24 | 25 | .. [#usgs4] ``_ 26 | 27 | .. [#usgs5] ``_ 28 | 29 | .. [#usgs6] ``_ 30 | 31 | .. [#usgs7] ``_ -------------------------------------------------------------------------------- /src/overview/wiley-tdm.rst: -------------------------------------------------------------------------------- 1 | Wiley 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The Wiley Text and Data Mining (TDM) API allows users to retrieve full-text articles of Wiley content in PDF form. 8 | A token is required for access [#wtdm1]_ , and users are limited to 3 API requests per second. 9 | 10 | See the Wiley TDM documentation [#wtdm2]_ for more information on accessing the API. A minimal request sketch, mirroring the Python tutorial later in this cookbook, is shown below. 
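The endpoint URL and request header in the sketch match those used in the Wiley TDM Python tutorial elsewhere in this cookbook; the token value and DOI below are placeholders that must be replaced with your own.

.. code-block:: python

   import requests

   # Minimal sketch: download one full-text PDF from the Wiley TDM API.
   # 'YOUR_WILEY_TDM_TOKEN' and the DOI are placeholders, not real values.
   token = "YOUR_WILEY_TDM_TOKEN"
   doi = "10.xxxx/xxxxxx"
   url = f"https://api.wiley.com/onlinelibrary/tdm/v1/articles/{doi}"
   headers = {"Wiley-TDM-Client-Token": token}

   response = requests.get(url, headers=headers)
   if response.status_code == 200:
       with open("article.pdf", "wb") as f:
           f.write(response.content)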
11 | Please check with your institution for their Text and Data Mining Agreement with Wiley. 12 | 13 | .. rubric:: References 14 | 15 | .. [#wtdm1] ``_ 16 | 17 | .. [#wtdm2] ``_ 18 | 19 | -------------------------------------------------------------------------------- /src/overview/world-bank.rst: -------------------------------------------------------------------------------- 1 | World Bank 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | The World Bank Indicators API provides access to numerous time series of economic and related data, such as population, income, energy, and education information [#worldbank1]_. Registration is not required. See the World Bank Indicators API documentation [#worldbank2]_ and Terms of Use for specific information about data reuse policies [#worldbank3]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#worldbank1] ``_ 12 | 13 | .. [#worldbank2] ``_ 14 | 15 | .. [#worldbank3] ``_ 16 | -------------------------------------------------------------------------------- /src/overview/wos.rst: -------------------------------------------------------------------------------- 1 | Web of Science 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | Web of Science (WOS) is a service maintained by Clarivate [#wos1]_ . The WOS APIs allow programmatic access to WOS data. 8 | 9 | The WOS Starter API allows access to some of the data in the WOS database. See the WOS Starter documentation [#wos2]_ for more information on accessing the API. 10 | 11 | The WOS Expanded API allows access to more data from the WOS database. See the WOS Expanded documentation [#wos3]_ for more information on accessing the API. 12 | 13 | Please check with your institution on WOS API access. Registration is required to access the API. 14 | 15 | Please see the WOS API release notes [#wos4]_ to stay up-to-date on the APIs. See the WOS Terms of Use [#wos5]_ and Clarivate Product / Service Terms [#wos6]_ for more information on usage of this data. 16 | 17 | .. rubric:: References 18 | 19 | .. [#wos1] ``_ 20 | 21 | .. [#wos2] ``_ 22 | 23 | .. [#wos3] ``_ 24 | 25 | .. [#wos4] ``_ 26 | 27 | .. [#wos5] ``_ 28 | 29 | .. [#wos6] ``_ 30 | -------------------------------------------------------------------------------- /src/overview/z3950.rst: -------------------------------------------------------------------------------- 1 | Z39.50 2 | %%%%%%%%%%%%%%%%%%%%% 3 | 4 | Brief Overview 5 | **************** 6 | 7 | Z39.50 is a protocol and query language often used for interacting with library catalogs [#z3950_1]_. 8 | 9 | .. rubric:: References 10 | 11 | .. [#z3950_1] ``_ 12 | 13 | -------------------------------------------------------------------------------- /src/python/geonames.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GeoNames API in Python\n", 8 | "\n", 9 | "by Michael T. 
Moen\n", 10 | "\n", 11 | "Please see the following resources for more information on API usage:\n", 12 | "\n", 13 | "- Documentation\n", 14 | " - GeoNames\n", 15 | " - GeoNames API Documentation\n", 16 | "- Terms of Use\n", 17 | " - GeoNames API Terms of Use\n", 18 | "- Data Reuse\n", 19 | " - The GeoNames API Data is licensed under the Creative Commons' [CC 4.0 license](https://creativecommons.org/licenses/by/4.0/), allowing users to share and adapt the API's data for any purpose, as long as appropriate attribution is given.\n", 20 | "\n", 21 | "*These recipe examples were tested on March 7, 2025.*\n", 22 | "\n", 23 | "**_NOTE:_** The GeoNames API limits users to a maximum of 10000 credits per day and 1000 credits per hour. See [here](https://www.geonames.org/export/credits.html) for a list of how many credits a request to each endpoint uses." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Setup\n", 31 | "\n", 32 | "### Import Libraries\n", 33 | "\n", 34 | "The following external libraries need to be installed into your enviornment to run the code examples in this tutorial:\n", 35 | "\n", 36 | "- ipykernel\n", 37 | "- requests\n", 38 | "- python-dotenv" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 1, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import requests\n", 48 | "import os\n", 49 | "from dotenv import load_dotenv" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Import Username\n", 57 | "\n", 58 | "Users must register with GeoNames before accessing the GeoNames API. Sign up can be found here.\n", 59 | "\n", 60 | "We keep our username in a `.env` file and use the `dotenv` library to access it. If you would like to use this method, create a file named `.env` in the same directory as this notebook and add the following line to it:\n", 61 | "\n", 62 | "```text\n", 63 | "GEONAMES_API_USERNAME=PUT_YOUR_USERNAME_HERE\n", 64 | "```" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "load_dotenv()\n", 74 | "try:\n", 75 | " USERNAME = os.environ[\"GEONAMES_API_USERNAME\"]\n", 76 | "except KeyError:\n", 77 | " print(\"API key not found. Please set 'GEONAMES_API_USERNAME' in your .env file.\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## 1. Searching with a ZIP Code\n", 85 | "\n", 86 | "This example uses the `postalCodeSearchJSON` endpoint to find the coordinates of the the ZIP code 35401." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 10, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "200" 98 | ] 99 | }, 100 | "execution_count": 10, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "BASE_URL = f'https://secure.geonames.org/'\n", 107 | "\n", 108 | "endpoint = 'postalCodeSearchJSON'\n", 109 | "params = {\n", 110 | " 'postalcode': 35401, # Postal code to search\n", 111 | " 'countryBias': 'US', # Moves US results to the top of the results list\n", 112 | " 'username': USERNAME # Must include GeoNames username in all API calls\n", 113 | "}\n", 114 | "\n", 115 | "response = requests.get(f\"{BASE_URL}{endpoint}\", params=params)\n", 116 | "\n", 117 | "# Status code 200 indicates success\n", 118 | "response.status_code" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 11, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "{'adminCode2': '125',\n", 130 | " 'adminCode1': 'AL',\n", 131 | " 'adminName2': 'Tuscaloosa',\n", 132 | " 'lng': -87.562666,\n", 133 | " 'countryCode': 'US',\n", 134 | " 'postalCode': '35401',\n", 135 | " 'adminName1': 'Alabama',\n", 136 | " 'ISO3166-2': 'AL',\n", 137 | " 'placeName': 'Tuscaloosa',\n", 138 | " 'lat': 33.196891}" 139 | ] 140 | }, 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "top_result = response.json()['postalCodes'][0]\n", 148 | "top_result" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 12, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "(33.196891, -87.562666)" 160 | ] 161 | }, 162 | "execution_count": 12, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "latitude = top_result['lat']\n", 169 | "longitude = top_result['lng']\n", 170 | "latitude, longitude" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## 2. 
Searching with Queries\n", 178 | "\n", 179 | "Queries allow users to search for location at several different levels.\n", 180 | "\n", 181 | "### Searching for a City\n", 182 | "\n", 183 | "In this example, we search for a location using the query \"Tuscaloosa.\"" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 13, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "200" 195 | ] 196 | }, 197 | "execution_count": 13, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "endpoint = 'searchJSON'\n", 204 | "params = {\n", 205 | " 'q': 'Tuscaloosa', # Search query\n", 206 | " 'countryBias': 'US', # Moves US results to the top of the results list\n", 207 | " 'maxRows': 10, # Limit results to top 10\n", 208 | " 'username': USERNAME # Must include GeoNames username in all API calls\n", 209 | "}\n", 210 | "\n", 211 | "response = requests.get(f\"{BASE_URL}{endpoint}\", params=params)\n", 212 | "\n", 213 | "# Status code 200 indicates success\n", 214 | "response.status_code" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 14, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "{'adminCode1': 'AL',\n", 226 | " 'lng': '-87.56917',\n", 227 | " 'geonameId': 4094455,\n", 228 | " 'toponymName': 'Tuscaloosa',\n", 229 | " 'countryId': '6252001',\n", 230 | " 'fcl': 'P',\n", 231 | " 'population': 98332,\n", 232 | " 'countryCode': 'US',\n", 233 | " 'name': 'Tuscaloosa',\n", 234 | " 'fclName': 'city, village,...',\n", 235 | " 'adminCodes1': {'ISO3166_2': 'AL'},\n", 236 | " 'countryName': 'United States',\n", 237 | " 'fcodeName': 'seat of a second-order administrative division',\n", 238 | " 'adminName1': 'Alabama',\n", 239 | " 'lat': '33.20984',\n", 240 | " 'fcode': 'PPLA2'}" 241 | ] 242 | }, 243 | "execution_count": 14, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "# Display top result\n", 250 | "response.json()['geonames'][0]" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### Seaching for a Building\n", 258 | "\n", 259 | "In this example, we search for a location using the query \"Bruno Business Library.\"" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 15, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "200" 271 | ] 272 | }, 273 | "execution_count": 15, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "endpoint = 'searchJSON'\n", 280 | "params = {\n", 281 | " 'q': 'Bruno Business Library', # Search query\n", 282 | " 'countryBias': 'US', # Moves US results to the top of the results list\n", 283 | " 'maxRows': 10, # Limit results to top 10\n", 284 | " 'username': USERNAME # Must include GeoNames username in all API calls\n", 285 | "}\n", 286 | "\n", 287 | "response = requests.get(f'{BASE_URL}{endpoint}', params=params)\n", 288 | "\n", 289 | "# Status code 200 indicates success\n", 290 | "response.status_code" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 16, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "{'adminCode1': 'AL',\n", 302 | " 'lng': '-87.54925',\n", 303 | " 'geonameId': 11524498,\n", 304 | " 'toponymName': 'Angelo Bruno Business Library',\n", 305 | " 'countryId': '6252001',\n", 306 | " 'fcl': 'S',\n", 
307 | " 'population': 0,\n", 308 | " 'countryCode': 'US',\n", 309 | " 'name': 'Angelo Bruno Business Library',\n", 310 | " 'fclName': 'spot, building, farm',\n", 311 | " 'adminCodes1': {'ISO3166_2': 'AL'},\n", 312 | " 'countryName': 'United States',\n", 313 | " 'fcodeName': 'library',\n", 314 | " 'adminName1': 'Alabama',\n", 315 | " 'lat': '33.2111',\n", 316 | " 'fcode': 'LIBR'}" 317 | ] 318 | }, 319 | "execution_count": 16, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "# Display top result\n", 326 | "response.json()['geonames'][0]" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### Searching for an Island\n", 334 | "\n", 335 | "In this example, we use the query \"Martha's Vineyard.\"" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 20, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "200" 347 | ] 348 | }, 349 | "execution_count": 20, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "endpoint = 'searchJSON'\n", 356 | "params = {\n", 357 | " 'q': \"Martha's Vineyard\", # Search query\n", 358 | " 'countryBias': 'US', # Moves US results to the top of the results list\n", 359 | " 'maxRows': 10, # Limit results to top 10\n", 360 | " 'username': USERNAME # Must include GeoNames username in all API calls\n", 361 | "}\n", 362 | "\n", 363 | "response = requests.get(f'{BASE_URL}{endpoint}', params=params)\n", 364 | "\n", 365 | "# Status code 200 indicates success\n", 366 | "response.status_code" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 21, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "{'adminCode1': 'MA',\n", 378 | " 'lng': '-70.61265',\n", 379 | " 'geonameId': 4943237,\n", 380 | " 'toponymName': \"Martha's Vineyard Airport\",\n", 381 | " 'countryId': '6252001',\n", 382 | " 'fcl': 'S',\n", 383 | " 'population': 0,\n", 384 | " 'countryCode': 'US',\n", 385 | " 'name': \"Martha's Vineyard Airport\",\n", 386 | " 'fclName': 'spot, building, farm',\n", 387 | " 'adminCodes1': {'ISO3166_2': 'MA'},\n", 388 | " 'countryName': 'United States',\n", 389 | " 'fcodeName': 'airport',\n", 390 | " 'adminName1': 'Massachusetts',\n", 391 | " 'lat': '41.39016',\n", 392 | " 'fcode': 'AIRP'}" 393 | ] 394 | }, 395 | "execution_count": 21, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "# Display top result\n", 402 | "response.json()['geonames'][0]" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "Note that the result above is the data for Matha's Vineyard Airport. 
If we wish to find the data associated with the island, we can look at the `fcodeName` of the locations in the response:" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 22, 415 | "metadata": {}, 416 | "outputs": [ 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "Martha's Vineyard Airport airport\n", 422 | "Martha's Vineyard Island island\n", 423 | "Vineyard Haven populated place\n", 424 | "Martha's Vineyard Hospital hospital\n", 425 | "Martha's Vineyard Regional High School school\n", 426 | "Marthas Vineyard Campground camp(s)\n", 427 | "Martha's Vineyard Aero Light \n", 428 | "Martha's Vineyard State Forest forest(s)\n", 429 | "Martha's Vineyard Agricultural Society vineyard\n", 430 | "Martha's Vineyard State Forest forest(s)\n" 431 | ] 432 | } 433 | ], 434 | "source": [ 435 | "for location in response.json()['geonames']:\n", 436 | " print(f'{location['toponymName']:<40}{location['fcodeName']}')" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "## 3. Reverse Geocoding\n", 444 | "\n", 445 | "The `findNearbyPostalCodesJSON` endpoint can be used to find the ZIP code of a pair of coordinates." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 25, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "200" 457 | ] 458 | }, 459 | "execution_count": 25, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "endpoint = 'findNearbyPostalCodesJSON'\n", 466 | "params = {\n", 467 | " 'lat': 38.625189, # Search latitude\n", 468 | " 'lng': -90.187330, # Search longitude\n", 469 | " 'maxRows': 10, # Limit results to top 10\n", 470 | " 'username': USERNAME # Must include GeoNames username in all API calls\n", 471 | "}\n", 472 | "\n", 473 | "response = requests.get(f'{BASE_URL}{endpoint}', params=params)\n", 474 | "\n", 475 | "# Status code 200 indicates success\n", 476 | "response.status_code" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 26, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "ZIP | Distance (km)\n", 489 | "63102 | 0\n", 490 | "63188 | 0.94603\n", 491 | "63197 | 0.94603\n", 492 | "63180 | 0.94603\n", 493 | "63155 | 0.94603\n", 494 | "63169 | 0.94603\n", 495 | "63182 | 0.94603\n", 496 | "63150 | 0.94603\n", 497 | "63101 | 1.1038\n", 498 | "62202 | 2.64737\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "# Print 10 nearest ZIP codes\n", 504 | "print('ZIP | Distance (km)')\n", 505 | "for zip in response.json()['postalCodes']:\n", 506 | " print(f'{zip['postalCode']} | {zip['distance']}')" 507 | ] 508 | } 509 | ], 510 | "metadata": { 511 | "kernelspec": { 512 | "display_name": "api_env", 513 | "language": "python", 514 | "name": "python3" 515 | }, 516 | "language_info": { 517 | "codemirror_mode": { 518 | "name": "ipython", 519 | "version": 3 520 | }, 521 | "file_extension": ".py", 522 | "mimetype": "text/x-python", 523 | "name": "python", 524 | "nbconvert_exporter": "python", 525 | "pygments_lexer": "ipython3", 526 | "version": "3.13.1" 527 | }, 528 | "orig_nbformat": 4 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /src/python/imgs/APOD_Image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/python/imgs/APOD_Image.png -------------------------------------------------------------------------------- /src/python/imgs/Earth_Image1_recent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/python/imgs/Earth_Image1_recent.png -------------------------------------------------------------------------------- /src/python/imgs/Earth_Image_Stitched.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/python/imgs/Earth_Image_Stitched.png -------------------------------------------------------------------------------- /src/python/sdirect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "088987c6-311d-4677-9cc9-19ceeeb245b9", 6 | "metadata": {}, 7 | "source": [ 8 | "# ScienceDirect API in Python\n", 9 | "\n", 10 | "by Vincent F. Scalfani\n", 11 | "\n", 12 | "**ScienceDirect**: https://www.sciencedirect.com/\n", 13 | "\n", 14 | "**Elsevier Developer Portal:** https://dev.elsevier.com/\n", 15 | "\n", 16 | "**ScienceDirect APIs Specification:** https://dev.elsevier.com/sd_api_spec.html\n", 17 | "\n", 18 | "**Elsevier How to Guide: Text Mining:** https://dev.elsevier.com/tecdoc_text_mining.html\n", 19 | "\n", 20 | "Please check with your institution for their Text and Data Mining Agreement with Elsevier.\n", 21 | "\n", 22 | "These recipe examples use the Elsevier ScienceDirect Article (Full-Text) API. This tutorial content is intended to help facillitate academic research. Before continuing or reusing any of this code, please be aware of Elsevier’s API policies and appropiate use-cases, as for example, Elsevier has detailed policies regarding [text and data mining of Elsevier full-text content](https://dev.elsevier.com/text_mining.html). If you have copyright or other related text and data mining questions, please contact The University of Alabama Libraries.\n", 23 | "\n", 24 | "*These recipe examples were tested on February 12, 2025.*" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "f257ddd2-982a-4179-99c0-0b8d572ac57d", 30 | "metadata": {}, 31 | "source": [ 32 | "## Setup\n", 33 | "\n", 34 | "### Import Libraries" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "id": "9eeb06c9-31ed-463d-a39d-d0207e68a336", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import requests\n", 45 | "from time import sleep" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "61baba0a-06e0-4a83-bf96-da483ca02742", 51 | "metadata": {}, 52 | "source": [ 53 | "### Import API key\n", 54 | "\n", 55 | "As a good practice, do not display your API key in your computational notebook (to prevent accidental sharing). Save your API key to a separate python file, then import your key." 
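For example, the imported file can be as small as a single assignment. The file name (`api_key.py`) and variable name (`myAPIKey`) below simply mirror the `from api_key import myAPIKey` statement used in the next cell; the key string itself is a placeholder.

```python
# api_key.py -- keep this file out of version control (e.g., list it in .gitignore)
myAPIKey = "YOUR_ELSEVIER_API_KEY"  # placeholder; substitute your own Elsevier API key
```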
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "id": "1c7646f1-a700-4e00-a323-f2eea0e25768", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from api_key import myAPIKey" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "997bb204-db57-4730-addd-47abd59b67ef", 71 | "metadata": {}, 72 | "source": [ 73 | "### Identifier Note\n", 74 | "\n", 75 | "We will use DOIs as the article identifiers. See our Crossref and Scopus API tutorials for workflows on how to create lists of DOIs and identfiers for specific searches and journals. The Elsevier ScienceDirect Article (Full-Text) API also accepts other identifiers like Scopus IDs and PubMed IDs (see API specification documents linked above)." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "5554fdf8-98cd-4bf5-bb44-f16e30b859c8", 81 | "metadata": {}, 82 | "source": [ 83 | "## 1. Retrieve full-text XML of an article" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "c5f36bf5-50bb-4617-b75f-ccc1c0ea1964", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# For XML download\n", 94 | "elsevier_url = \"https://api.elsevier.com/content/article/doi/\"\n", 95 | "doi1 = '10.1016/j.tetlet.2017.07.080' # Example Tetrahedron Letters article\n", 96 | "fulltext1 = requests.get(elsevier_url + doi1 + \"?APIKey=\" + myAPIKey + \"&httpAccept=text/xml\")\n", 97 | "\n", 98 | "# Save to file\n", 99 | "with open('fulltext1.xml', 'w', encoding='utf-8') as outfile:\n", 100 | " outfile.write(fulltext1.text)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "56c376b1-411c-4f1b-b117-dd006fd74181", 106 | "metadata": {}, 107 | "source": [ 108 | "## 2. Retrieve plain text of an article" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "id": "e3dac0b9-41f5-4045-98d4-1c217c1ecd38", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# For simplified text download\n", 119 | "elsevier_url = \"https://api.elsevier.com/content/article/doi/\"\n", 120 | "doi2 = '10.1016/j.tetlet.2022.153680' # example Tetrahedron Letters article\n", 121 | "fulltext2 = requests.get(elsevier_url + doi2 + \"?APIKey=\" + myAPIKey + \"&httpAccept=text/plain\")\n", 122 | "\n", 123 | "# Save to file\n", 124 | "with open('fulltext2.txt', 'w', encoding='utf-8') as outfile:\n", 125 | " outfile.write(fulltext2.text)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "9bd40795-646f-4962-b11d-ca967c06e9cf", 131 | "metadata": {}, 132 | "source": [ 133 | "## 3. 
Retrieve full-text in a loop" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "id": "2f21c309-903e-4e40-bd95-d206928c91e5", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Make a list of 5 DOIs for testing\n", 144 | "dois = ['10.1016/j.tetlet.2018.10.031',\n", 145 | " '10.1016/j.tetlet.2018.10.033',\n", 146 | " '10.1016/j.tetlet.2018.10.034',\n", 147 | " '10.1016/j.tetlet.2018.10.038',\n", 148 | " '10.1016/j.tetlet.2018.10.041']" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 8, 154 | "id": "4be0c3e8-1931-450e-995e-06984b2218c1", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# Retrieve article full text for each DOI in a loop and save each article to a separate file.\n", 159 | "# Example shown for plain text, XML also works (replace 'plain' with 'xml')\n", 160 | "\n", 161 | "elsevier_url = \"https://api.elsevier.com/content/article/doi/\"\n", 162 | "for doi in dois:\n", 163 | " article = requests.get(elsevier_url + doi + \"?APIKey=\" + myAPIKey + \"&httpAccept=text/plain\") \n", 164 | " doi_name = doi.replace('/','_') # Can't save files with a '/' character on Linux\n", 165 | " with open(doi_name + '_plain_text.txt', 'w', encoding='utf-8') as outfile:\n", 166 | " outfile.write(article.text)\n", 167 | " sleep(1) # Pause for 1 second between API calls" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.11.9" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 5 192 | } 193 | -------------------------------------------------------------------------------- /src/python/springer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false 7 | }, 8 | "source": [ 9 | "# Springer Nature API in Python\n", 10 | "\n", 11 | "by Avery Fernandez and Vincent F. Scalfani\n", 12 | "\n", 13 | "These recipe examples use the Springer Nature Open Access API to retrieve metadata and full-text content. About 1.5 million full-text are available: https://dev.springernature.com/docs/api-endpoints/open-access/\n", 14 | "\n", 15 | "An API key is required to access the Springer Nature API, sign up can be found at https://dev.springernature.com/\n", 16 | "\n", 17 | "Code was tested on October 13, 2023. This tutorial content is intended to help facillitate academic research. Please check with your institution for their Text and Data Mining Agreement with Springer Nature. 
Before continuing or reusing any of this code, be aware of the Springer Nature Text and Data Mining Policies, Terms and Conditions, Terms for API Users, and TDM reservation policies:\n", 18 | "\n", 19 | "* https://www.springernature.com/gp/researchers/text-and-data-mining\n", 20 | "* https://www.springernature.com/gp/legal/general-terms-of-use/15067848\n", 21 | "* https://dev.springernature.com/terms-conditions\n", 22 | "* https://dev.springernature.com/tdm-reservation-policy/\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Setup\n", 30 | "\n", 31 | "### Import Libraries" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 16, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import requests\n", 43 | "from time import sleep\n", 44 | "from pprint import pprint" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "source": [ 53 | "### Import API Key\n", 54 | "\n", 55 | "We store our API key in a separate file for easy access and security." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 17, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "from key import api_key" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "source": [ 75 | "## 1. Retrieve full-text JATS XML of an article" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "source": [ 84 | "Before we can query, we must establish a few things:\n", 85 | "* **base_url**: The base url for the Springer API, more specifically the open access API with JATS format: https://jats.nlm.nih.gov/archiving/tag-library/1.1/index.html\n", 86 | "* **?q=doi:**: The query parameter, in this case we are searching for a DOI\n", 87 | "* **doi**: The DOI of the article\n", 88 | "* **openaccess:true**: This requests content through the openaccess API\n", 89 | "* **&api_key=**: This the text for the api key\n", 90 | "\n", 91 | "You can read more about the API parameters at https://dev.springernature.com/restfuloperations" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 18, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "base_url = 'https://api.springernature.com/openaccess/jats'\n", 103 | "\n", 104 | "# example DOI from SpringerOpen Brain Informatics\n", 105 | "doi = '\"10.1007/s40708-014-0001-z\"' # doi must be wrapped in double quotes" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 20, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "data = requests.get(f\"{base_url}?q=doi:{doi} openaccess:true&api_key={api_key}\")\n", 125 | "pprint(data) # Response 200 means that the response was successful" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 21, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "# Save to a file\n", 137 | "with open('fulltext.jats', 'w') as outfile:\n", 138 | " outfile.write(data.text)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "## 2. 
Retrieve full-text in a loop" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 22, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# Examples from SprigerOpen Brain Informatics\n", 155 | "\n", 156 | "dois = [\n", 157 | " '\"10.1007/s40708-014-0001-z\"',\n", 158 | " '\"10.1007/s40708-014-0002-y\"',\n", 159 | " '\"10.1007/s40708-014-0003-x\"',\n", 160 | " '\"10.1007/s40708-014-0004-9\"',\n", 161 | " '\"10.1007/s40708-014-0005-8\"',\n", 162 | "]" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 23, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "base_url = 'https://api.springernature.com/openaccess/jats'\n", 174 | "for doi in dois:\n", 175 | " data = requests.get(f\"{base_url}?q=doi:{doi} openaccess:true&api_key={api_key}\")\n", 176 | " sleep(1) # add a delay.\n", 177 | " doi_name = doi.replace('/', '_').replace('\"', '') # remove / and \" from doi\n", 178 | " with open(f'{doi_name}_jats_text.jats', 'w') as outfile:\n", 179 | " outfile.write(data.text)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## 3. Acquire and Parse Metadata\n", 187 | "\n", 188 | "We can also acquire only the metadata as JSON text." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 24, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "base_url = 'https://api.springernature.com/openaccess/json'\n", 198 | "doi = '\"10.1007/s40708-014-0001-z\"' # doi must be wrapped in double quotes\n", 199 | "data = requests.get(f\"{base_url}?q=doi:{doi} openaccess:true&api_key={api_key}\").json()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "We can now extract data out of `[\"records\"][0]`, where all the data is stored for the article" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 25, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "'This JSON was provided by Springer Nature'\n", 219 | "'doi:\"10.1007/s40708-014-0001-z\" openaccess:true'\n", 220 | "{'h1': 'Abstract',\n", 221 | " 'p': 'Big data is the term for a collection of datasets so huge and complex '\n", 222 | " 'that it becomes difficult to be processed using on-hand theoretical '\n", 223 | " 'models and technique tools. Brain big data is one of the most typical, '\n", 224 | " 'important big data collected using powerful equipments of functional '\n", 225 | " 'magnetic resonance imaging, multichannel electroencephalography, '\n", 226 | " 'magnetoencephalography, Positron emission tomography, near infrared '\n", 227 | " 'spectroscopic imaging, as well as other various devices. Granular '\n", 228 | " 'computing with multiple granular layers, referred to as multi-granular '\n", 229 | " 'computing (MGrC) for short hereafter, is an emerging computing paradigm '\n", 230 | " 'of information processing, which simulates the multi-granular '\n", 231 | " 'intelligent thinking model of human brain. It concerns the processing '\n", 232 | " 'of complex information entities called information granules, which '\n", 233 | " 'arise in the process of data abstraction and derivation of information '\n", 234 | " 'and even knowledge from data. 
This paper analyzes three basic '\n", 235 | " 'mechanisms of MGrC, namely granularity optimization, granularity '\n", 236 | " 'conversion, and multi-granularity joint computation, and discusses the '\n", 237 | " 'potential of introducing MGrC into intelligent processing of brain big '\n", 238 | " 'data.'}\n", 239 | "'10.1007/s40708-014-0001-z'\n", 240 | "'2014-09-06'\n", 241 | "'2015-01-30'\n", 242 | "'Brain Informatics'\n", 243 | "'Granular computing with multiple granular layers for brain big data processing'\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "# some examples:\n", 249 | "pprint(data[\"apiMessage\"])\n", 250 | "pprint(data[\"query\"])\n", 251 | "pprint(data[\"records\"][0][\"abstract\"])\n", 252 | "pprint(data[\"records\"][0][\"doi\"])\n", 253 | "pprint(data[\"records\"][0][\"onlineDate\"])\n", 254 | "pprint(data[\"records\"][0][\"printDate\"])\n", 255 | "pprint(data[\"records\"][0][\"publicationName\"])\n", 256 | "pprint(data[\"records\"][0][\"title\"])" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.11.0" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /src/python/us-census-geocoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# U.S. Census Geocoding API in Python\n", 8 | "\n", 9 | "by Michael T. Moen\n", 10 | "\n", 11 | "*This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau.*\n", 12 | "\n", 13 | "Please see the following resources for more information on API usage:\n", 14 | "\n", 15 | "- Documentation\n", 16 | " - U.S. Census Geocoding API Documentation\n", 17 | "- Terms of Use\n", 18 | " - U.S. Census Geocoding API Terms of Service\n", 19 | "\n", 20 | "*These recipe examples were tested on March 7, 2025.*" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Setup\n", 28 | "\n", 29 | "The following external libraries need to be installed into your enviornment to run the code examples in this tutorial:\n", 30 | "\n", 31 | "- ipykernel\n", 32 | "- requests" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import requests\n", 42 | "import csv\n", 43 | "from pprint import pprint" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 1. Address Lookup\n", 51 | "\n", 52 | "One of the main use cases of this API is finding the latitude and longitude of an address. In this example, we find the latitude and longitude of the Bruno Business Library at the University of Alabama.\n", 53 | "\n", 54 | "The API allows searching through two methods: `address` and `onelineaddress`. 
These methods are nearly identical, with the only difference being the format of the parameters passed to API.\n", 55 | "\n", 56 | "### Using `address` Search" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "200" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "BASE_URL = 'https://geocoding.geo.census.gov/geocoder/'\n", 77 | "return_type = 'locations'\n", 78 | "search_type = 'address'\n", 79 | "\n", 80 | "params = {\n", 81 | " # Specify the address to lookup with the following parameters\n", 82 | " 'street': '425 Stadium Dr',\n", 83 | " 'city': 'Tuscaloosa',\n", 84 | " 'state': 'AL',\n", 85 | " 'zip': 35401,\n", 86 | " # Specify the version of the locator to be searched\n", 87 | " 'benchmark': 'Public_AR_Current',\n", 88 | " # Specify that data should be returned in JSON format\n", 89 | " 'format': 'json'\n", 90 | "}\n", 91 | "\n", 92 | "response = requests.get(f'{BASE_URL}{return_type}/{search_type}', params=params)\n", 93 | "\n", 94 | "# Status code of 200 indicates success\n", 95 | "response.status_code" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "{'result': {'input': {'address': {'zip': '35401',\n", 107 | " 'city': 'Tuscaloosa',\n", 108 | " 'street': '425 Stadium Dr',\n", 109 | " 'state': 'AL'},\n", 110 | " 'benchmark': {'isDefault': True,\n", 111 | " 'benchmarkDescription': 'Public Address Ranges - Current Benchmark',\n", 112 | " 'id': '4',\n", 113 | " 'benchmarkName': 'Public_AR_Current'}},\n", 114 | " 'addressMatches': [{'tigerLine': {'side': 'L', 'tigerLineId': '636109874'},\n", 115 | " 'coordinates': {'x': -87.549700416257, 'y': 33.21105403378},\n", 116 | " 'addressComponents': {'zip': '35401',\n", 117 | " 'streetName': 'STADIUM',\n", 118 | " 'preType': '',\n", 119 | " 'city': 'TUSCALOOSA',\n", 120 | " 'preDirection': '',\n", 121 | " 'suffixDirection': '',\n", 122 | " 'fromAddress': '401',\n", 123 | " 'state': 'AL',\n", 124 | " 'suffixType': 'DR',\n", 125 | " 'toAddress': '499',\n", 126 | " 'suffixQualifier': '',\n", 127 | " 'preQualifier': ''},\n", 128 | " 'matchedAddress': '425 STADIUM DR, TUSCALOOSA, AL, 35401'}]}}" 129 | ] 130 | }, 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "response.json()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 5, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "(33.21105403378, -87.549700416257)" 149 | ] 150 | }, 151 | "execution_count": 5, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "latitude = response.json()['result']['addressMatches'][0]['coordinates']['y']\n", 158 | "longitude = response.json()['result']['addressMatches'][0]['coordinates']['x']\n", 159 | "\n", 160 | "# Display coordinates\n", 161 | "latitude, longitude" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### Using `onelineaddress` Search" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 6, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "200" 180 | ] 181 | }, 182 | "execution_count": 6, 183 | "metadata": {}, 184 | 
"output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "return_type = 'locations'\n", 189 | "search_type = 'onelineaddress'\n", 190 | "\n", 191 | "params = {\n", 192 | " # Specify the address to lookup with the parameters\n", 193 | " # Note that 'street' is required, and the other parameters are optional\n", 194 | " 'address': '425 Stadium Dr, Tuscaloosa, AL 35401',\n", 195 | " # Specify the version of the locator to be searched\n", 196 | " 'benchmark': 'Public_AR_Current',\n", 197 | " # Specify that data should be returned in JSON format\n", 198 | " 'format': 'json'\n", 199 | "}\n", 200 | "\n", 201 | "response = requests.get(f'{BASE_URL}{return_type}/{search_type}', params=params)\n", 202 | "\n", 203 | "# Status code of 200 indicates success\n", 204 | "response.status_code" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "(33.21105403378, -87.549700416257)" 216 | ] 217 | }, 218 | "execution_count": 7, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "latitude = response.json()['result']['addressMatches'][0]['coordinates']['y']\n", 225 | "longitude = response.json()['result']['addressMatches'][0]['coordinates']['x']\n", 226 | "\n", 227 | "# Display coordinates\n", 228 | "latitude, longitude" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## 2. Batch Address Lookup\n", 236 | "\n", 237 | "The U.S. Census Geocoding API also allows for batch geocoding with the submission of a CSV, TXT, DAT, XLS, or XLSX file. These files must be formatted with one record per line, where each record must be formatted as followed: Unique ID, Street address, City, State, ZIP. Users are limited to 10,000 records per batch file.\n", 238 | "\n", 239 | "This example uses the CSV file created below:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 8, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# Create list of addresses for the batch lookup\n", 249 | "# Note that each record must begin with a unique ID\n", 250 | "addresses = [\n", 251 | " ['1', '425 Stadium Dr', 'Tuscaloosa', 'AL', '35401'],\n", 252 | " ['2', '1600 Pennsylvania Avenue NW', 'Washington', 'DC', '20500'],\n", 253 | " ['3', '350 Fifth Avenue', 'New York', 'NY', '10118'],\n", 254 | " ['4', '660 Cannery Row', 'Monterey', 'CA', '93940'],\n", 255 | " ['5', '700 Clark Ave', 'St. 
Louis', 'MO', '63102']\n", 256 | "]\n", 257 | "\n", 258 | "# Export addresses to a CSV file\n", 259 | "input_filename = 'batch_addresses.csv'\n", 260 | "with open(input_filename, 'w', newline='') as f:\n", 261 | " csv_writer = csv.writer(f)\n", 262 | " csv_writer.writerows(addresses)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 9, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "200" 274 | ] 275 | }, 276 | "execution_count": 9, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "# Format parameters needed for POST request\n", 283 | "return_type = 'locations'\n", 284 | "params = {\n", 285 | " 'benchmark' : 'Public_AR_Current'\n", 286 | "}\n", 287 | "files = {\n", 288 | " 'addressFile': open(input_filename, \"rb\")\n", 289 | "}\n", 290 | "\n", 291 | "url = f'https://geocoding.geo.census.gov/geocoder/{return_type}/addressbatch'\n", 292 | "response = requests.post(url, data=params, files=files)\n", 293 | "\n", 294 | "# Status code of 200 indicates success\n", 295 | "response.status_code" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 10, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "['1', '425 Stadium Dr, Tuscaloosa, AL, 35401', 'Match', 'Exact', '425 STADIUM DR, TUSCALOOSA, AL, 35401', '-87.549700416257,33.211054033781', '636109874', 'L']\n", 308 | "['2', '1600 Pennsylvania Avenue NW, Washington, DC, 20500', 'Match', 'Exact', '1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500', '-77.036543957308,38.898690918656', '76225813', 'L']\n", 309 | "['3', '350 Fifth Avenue, New York, NY, 10118', 'Match', 'Exact', '350 5TH AVE, NEW YORK, NY, 10118', '-73.985077152891,40.747848600317', '59653473', 'L']\n", 310 | "['4', '660 Cannery Row, Monterey, CA, 93940', 'Match', 'Exact', '660 CANNERY ROW, MONTEREY, CA, 93940', '-121.901280304574,36.617235842516', '647390330', 'R']\n", 311 | "['5', '700 Clark Ave, St. Louis, MO, 63102', 'Match', 'Non_Exact', '700 CLARK AVE, SAINT LOUIS, MO, 63119', '-90.340369438036,38.602422417149', '100141071', 'R']\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "# Save content of response to a new CSV\n", 317 | "output_filename = 'geocoded_addresses.csv'\n", 318 | "with open(output_filename, 'wb') as f:\n", 319 | " f.write(response.content)\n", 320 | "\n", 321 | "# Printing contents of CSV for demonstation purposes\n", 322 | "with open(output_filename, newline='') as f:\n", 323 | " csv_reader = csv.reader(f)\n", 324 | " for row in csv_reader:\n", 325 | " print(row)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "Note that the last two columns of the above data are the TIGER/Line ID and TIGER/Line Side. For more information on these values, please see the U.S. Census TIGER/Line Geodatabase Documentation. However, this tutorial does not utilize any TIGER/Line data." 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## 3. Retrieving Additional Geographic Data\n", 340 | "\n", 341 | "The `geographies` return type allows for the retrieval of additional data associated for a given address or set of coordinates. 
The example below retrieves this data using the address of the Bruno Business Library at the University of Alabama.\n", 342 | "\n", 343 | "Note that the `geographies` return type requires the `vintage` parameter to be specified.\n", 344 | "\n", 345 | "Users may additionally include the `layers` parameter, which determines the types of geography data returned. For a list of all layers, see here." 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 15, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/plain": [ 356 | "200" 357 | ] 358 | }, 359 | "execution_count": 15, 360 | "metadata": {}, 361 | "output_type": "execute_result" 362 | } 363 | ], 364 | "source": [ 365 | "return_type = 'geographies'\n", 366 | "search_type = 'address'\n", 367 | "\n", 368 | "params = {\n", 369 | " # Specify the address to lookup with the following parameters\n", 370 | " 'street': '425 Stadium Dr',\n", 371 | " 'city': 'Tuscaloosa',\n", 372 | " 'state': 'AL',\n", 373 | " 'zip': 35401,\n", 374 | " # Specify the version of the locator to be searched\n", 375 | " 'benchmark': 'Public_AR_Current',\n", 376 | " # Specify the vintage\n", 377 | " 'vintage': 'Current_Current',\n", 378 | " # Specify what categories of geographic data to retrieve\n", 379 | " 'layers': 'all',\n", 380 | " # Specify that data should be returned in JSON format\n", 381 | " 'format': 'json'\n", 382 | "}\n", 383 | "\n", 384 | "response = requests.get(f'{BASE_URL}{return_type}/{search_type}', params=params)\n", 385 | "\n", 386 | "# Status code of 200 indicates success\n", 387 | "response.status_code" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "Note that the `geographies` return type returns all of the data that the `locations` return type does in addition to the geographies data." 
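For instance, once the geographies data has been returned, individual identifiers can be read directly from the nested dictionaries. The sketch below assumes the `response` object from the request above and that the 'Census Tracts' entries carry a `GEOID` field with the same layout as the 'Counties' entries shown further below.

```python
# Minimal sketch: pull county and census tract GEOIDs from the geographies data.
# Assumes 'response' is the geographies-return-type response retrieved above;
# the 'GEOID' field for 'Census Tracts' is assumed to mirror the 'Counties' entries.
match = response.json()['result']['addressMatches'][0]
county_geoid = match['geographies']['Counties'][0]['GEOID']
tract_geoid = match['geographies']['Census Tracts'][0]['GEOID']
print(county_geoid, tract_geoid)
```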
395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 16, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "{'addressComponents': {...},\n", 407 | " 'coordinates': {...},\n", 408 | " 'geographies': {...},\n", 409 | " 'matchedAddress': '425 STADIUM DR, TUSCALOOSA, AL, 35401',\n", 410 | " 'tigerLine': {...}}\n" 411 | ] 412 | } 413 | ], 414 | "source": [ 415 | "pprint(response.json()['result']['addressMatches'][0], depth=1)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "The geographies data contains the following categories:" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 17, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "{'119th Congressional Districts': [...],\n", 435 | " '2020 Census Blocks': [...],\n", 436 | " '2020 Census Public Use Microdata Areas': [...],\n", 437 | " '2020 Census ZIP Code Tabulation Areas': [...],\n", 438 | " '2024 State Legislative Districts - Lower': [...],\n", 439 | " '2024 State Legislative Districts - Upper': [...],\n", 440 | " 'Census Block Groups': [...],\n", 441 | " 'Census Divisions': [...],\n", 442 | " 'Census Regions': [...],\n", 443 | " 'Census Tracts': [...],\n", 444 | " 'Counties': [...],\n", 445 | " 'County Subdivisions': [...],\n", 446 | " 'Incorporated Places': [...],\n", 447 | " 'Metropolitan Statistical Areas': [...],\n", 448 | " 'States': [...],\n", 449 | " 'Unified School Districts': [...],\n", 450 | " 'Urban Areas': [...]}\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "pprint(response.json()['result']['addressMatches'][0]['geographies'], depth=1)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "As an example, this is how the Counties data is formatted." 
463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 18, 468 | "metadata": { 469 | "tags": [] 470 | }, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/plain": [ 475 | "[{'GEOID': '01125',\n", 476 | " 'CENTLAT': '+33.2894031',\n", 477 | " 'AREAWATER': '78666216',\n", 478 | " 'STATE': '01',\n", 479 | " 'BASENAME': 'Tuscaloosa',\n", 480 | " 'OID': '2759075608325',\n", 481 | " 'LSADC': '06',\n", 482 | " 'FUNCSTAT': 'A',\n", 483 | " 'INTPTLAT': '+33.2902197',\n", 484 | " 'NAME': 'Tuscaloosa County',\n", 485 | " 'OBJECTID': 3113,\n", 486 | " 'CENTLON': '-087.5250366',\n", 487 | " 'COUNTYCC': 'H1',\n", 488 | " 'COUNTYNS': '00161588',\n", 489 | " 'AREALAND': '3421017287',\n", 490 | " 'INTPTLON': '-087.5227834',\n", 491 | " 'MTFCC': 'G4020',\n", 492 | " 'COUNTY': '125'}]" 493 | ] 494 | }, 495 | "execution_count": 18, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "response.json()['result']['addressMatches'][0]['geographies']['Counties']" 502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "api_env", 508 | "language": "python", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 3 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython3", 521 | "version": "3.13.1" 522 | }, 523 | "orig_nbformat": 4 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 2 527 | } 528 | -------------------------------------------------------------------------------- /src/python/wiley-tdm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wiley Text and Data Mining (TDM) in Python\n", 8 | "\n", 9 | "by Michael T. Moen\n", 10 | "\n", 11 | "The Wiley Text and Data Mining (TDM) API allows users to retrieve the full-text articles of subscribed Wiley content in PDF form. TDM use is for non-commercial scholarly research, see terms and restrictions in below links.\n", 12 | "\n", 13 | "*This tutorial content is intended to help facilitate academic research. Please check your institution for their Text and Data Mining or related License Agreement with Wiley.*\n", 14 | "\n", 15 | "Please see the following resources for more information on API usage:\n", 16 | "\n", 17 | "- Documentation\n", 18 | " - Wiley Text and Data Mining\n", 19 | "- Terms\n", 20 | " - Wiley Text and Data Mining Agreement\n", 21 | "- Data Reuse\n", 22 | " - Wiley TDM Data Reuse (see sections 4 and 5 of Text and Data Mining Agreement)\n", 23 | "\n", 24 | "*These recipe examples were tested on April 4, 2025.*\n", 25 | "\n", 26 | "**_NOTE:_** The Wiley TDM API limits requests to a maximum of 3 requests per second." 
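One simple way to respect this limit when making many requests is to pause briefly after each call. The helper below is a minimal sketch of that idea (the examples later in this tutorial simply call `sleep(1)` between requests); the 0.4-second delay is a conservative choice, not a value taken from the Wiley documentation.

```python
import requests
from time import sleep

# Minimal sketch: pause ~0.4 s after each request so calls stay comfortably
# under the 3-requests-per-second limit. The delay value is an assumption.
def polite_get(url, headers=None):
    response = requests.get(url, headers=headers)
    sleep(0.4)
    return response
```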
27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Setup\n", 34 | "\n", 35 | "### Import Libraries\n", 36 | "\n", 37 | "The following external libraries need to be installed into your enviornment to run the code examples in this tutorial:\n", 38 | "\n", 39 | "- ipykernel\n", 40 | "- requests\n", 41 | "- python-dotenv\n", 42 | "\n", 43 | "We import the libraries used in this tutorial below:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import os\n", 53 | "import requests\n", 54 | "from time import sleep\n", 55 | "from dotenv import load_dotenv" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Import Text and Data Mining Token\n", 63 | "\n", 64 | "An token is required for text and data mining with Wiley. You can sign up for one here.\n", 65 | "\n", 66 | "We keep our token in a `.env` file and use the `dotenv` library to access it. If you would like to use this method, create a `.env` file and add the following line to it:\n", 67 | "\n", 68 | "```text\n", 69 | "WILEY_TDM_TOKEN=PUT_YOUR_TOKEN_HERE\n", 70 | "```" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "load_dotenv()\n", 80 | "try:\n", 81 | " WILEY_TDM_TOKEN = os.environ[\"WILEY_TDM_TOKEN\"]\n", 82 | "except KeyError:\n", 83 | " print(\"Token not found. Please set 'WILEY_TDM_TOKEN' in your .env file.\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## 1. Retrieve Full-Text of an Article\n", 91 | "\n", 92 | "The Wiley TDM API returns the full-text of an article as a PDF when given the article's DOI.\n", 93 | "\n", 94 | "In the first example, we download the full-text of the article with the DOI \"10.1002/net.22207\". This article was found on the Wiley Online Library." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "10.1002_net.22207.pdf downloaded successfully\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# DOI of article to download\n", 112 | "doi = '10.1002/net.22207'\n", 113 | "url = f'https://api.wiley.com/onlinelibrary/tdm/v1/articles/{doi}'\n", 114 | "headers = {\n", 115 | " \"Wiley-TDM-Client-Token\": WILEY_TDM_TOKEN\n", 116 | "}\n", 117 | "response = requests.get(url, headers=headers)\n", 118 | "\n", 119 | "# Download PDF if status code indicates success\n", 120 | "if response.status_code == 200:\n", 121 | " filename = f'{doi.replace('/', '_')}.pdf'\n", 122 | " with open(filename, 'wb') as file:\n", 123 | " file.write(response.content)\n", 124 | " print(f'{filename} downloaded successfully')\n", 125 | "else:\n", 126 | " print(f'Failed to download PDF. Status code: {response.status_code}')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## 2. 
Retrieve Full-Text of Multiple Articles\n", 134 | "\n", 135 | "In this example, we download 5 articles found in the Wiley Online Library:" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "10.1111_j.1467-8624.2010.01564.x.pdf downloaded successfully\n", 148 | "10.1111_1467-8624.00164.pdf downloaded successfully\n", 149 | "10.1111_cdev.12864.pdf downloaded successfully\n", 150 | "10.1111_j.1467-8624.2007.00995.x.pdf downloaded successfully\n", 151 | "10.1111_j.1467-8624.2010.01499.x.pdf downloaded successfully\n", 152 | "Failed to download PDF for 10.1111/j.1467-8624.2010.0149.x. Status code: 404\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "# DOIs of articles to download\n", 158 | "dois = [\n", 159 | " '10.1111/j.1467-8624.2010.01564.x',\n", 160 | " '10.1111/1467-8624.00164',\n", 161 | " '10.1111/cdev.12864',\n", 162 | " '10.1111/j.1467-8624.2007.00995.x',\n", 163 | " '10.1111/j.1467-8624.2010.01499.x',\n", 164 | " '10.1111/j.1467-8624.2010.0149.x' # Invalid DOI, will throw error\n", 165 | "]\n", 166 | "\n", 167 | "# Send an HTTP request for each DOI\n", 168 | "for doi in dois:\n", 169 | " url = f'https://api.wiley.com/onlinelibrary/tdm/v1/articles/{doi}'\n", 170 | " response = requests.get(url, headers=headers)\n", 171 | "\n", 172 | " # Download PDF if status code indicates success\n", 173 | " if response.status_code == 200:\n", 174 | " filename = f'{doi.replace('/', '_')}.pdf'\n", 175 | " with open(filename, 'wb') as file:\n", 176 | " file.write(response.content)\n", 177 | " print(f'{filename} downloaded successfully')\n", 178 | " else:\n", 179 | " print(f'Failed to download PDF for {doi.replace('%2f', '/')}.')\n", 180 | " print(f'Status code: {response.status_code}')\n", 181 | " sleep(1) # Wait 1 second to be nice on Wiley's servers" 182 | ] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "api_env", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.13.2" 202 | }, 203 | "orig_nbformat": 4 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 2 207 | } 208 | -------------------------------------------------------------------------------- /src/r-gpl3/PubMedAPItut_files/figure-html/visual-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/PubMedAPItut_files/figure-html/visual-1.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-2.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-3.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-4.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-10-5.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-2.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-3.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-4.png -------------------------------------------------------------------------------- /src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/R_Pubchem_Markdown_Adam_Edit_files/figure-html/unnamed-chunk-18-5.png -------------------------------------------------------------------------------- /src/r-gpl3/US_Census_Data_in_R_files/figure-html/plot-popchg-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r-gpl3/US_Census_Data_in_R_files/figure-html/plot-popchg-1.png -------------------------------------------------------------------------------- /src/r-gpl3/pubmed.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: \...in R 3 | output: 4 | html_document: 5 | keep_md: TRUE 6 | --- 7 | 8 | # PubMed API in R 9 | 10 | by Adam M. Nguyen 11 | 12 | The recipe examples were tested on Mar 24, 2023 13 | 14 | The easyPubMed R package provides programmatic access to PubMed data, enabling researchers to search, retrieve, and analyze biomedical literature efficiently. 15 | 16 | ### API Resources 17 | 18 | - **Documentation** 19 | - [Getting Started with easyPubMed Article](https://cran.r-project.org/web/packages/easyPubMed/vignettes/getting_started_with_easyPubMed.html) 20 | - [easyPubMed PDF Documentation](https://cran.r-project.org/web/packages/easyPubMed/easyPubMed.pdf) 21 | 22 | - **Tutorial License** 23 | - This tutorial uses the **easyPubMed** R library which is licensed as **GPL-3**. 24 | - As a result, this tutorial code is also licensed as **GPL-3**: 25 | [License Details](https://github.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/blob/main/LICENSE_selected_R_tutorials) 26 | 27 | 28 | ## Setup 29 | 30 | First let's install the easyPubMed package as well as load the library. 31 | If you do not already have the package installed, run the following command in your console: "install.packages("easyPubMed", repos = "http://cran.us.r-project.org")". 32 | 33 | ```r 34 | # easyPubMed library for accessing PubMed API 35 | library(easyPubMed) 36 | ``` 37 | 38 | ## 1. Querying PubMed API 39 | 40 | Below is an example query utilizing some valuable functions provided by the easyPubMed library as well as some important information for the API, if unfamiliar. 41 | 42 | These functions include: 43 | 44 | * get_pubmed_ids() 45 | 46 | * fetch_pubmed_data() 47 | 48 | **Note:** PubMed employs field tags to specify the nature of the associated string, for a comprehensive list of field tags visit: "https://pubmed.ncbi.nlm.nih.gov/help/#using-search-field-tags". Additionally, using PubMed tags will limit your search to the specified terms only. While querying PubMed, using the "get_pubmed_ids", it is allowable to provide no tags and the function will translate it for the user. 49 | 50 | Let's try querying Pubmed! Check comments for additional, step-by-step detail. 
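Before running the tagged query below, the following optional sketch illustrates the note above: a query without field tags is also accepted, and PubMed translates it into tagged form on the server. The query string here is an arbitrary example, and the 'Count' and 'QueryTranslation' elements come from the Entrez eSearch response that 'get_pubmed_ids()' parses; if 'QueryTranslation' is not present in your easyPubMed version, that line simply prints NULL.

```r
# Optional sketch (assumes library(easyPubMed) is loaded as above):
# an untagged query also works because PubMed translates it for you.
untagged_query <- 'aspirin cardiovascular'   # arbitrary example, no field tags
untagged_id <- get_pubmed_ids(untagged_query)

untagged_id$Count            # number of matching records reported by PubMed
untagged_id$QueryTranslation # PubMed's tagged translation of the query (if returned)
```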
51 | 52 | 53 | ```r 54 | example_query <- 'Ancestral population genomics using coalescence hidden Markov models and heuristic optimisation algorithms.[Title]' #State query in the format 'query[query tag]', can include AND and OR statements and a query tag is not required 55 | example_id <- get_pubmed_ids(example_query) #Stores a list of PMIDs(PubMed Identifications) satisfying the query 56 | 57 | example_id$IdList$Id 58 | ``` 59 | 60 | ``` 61 | ## [1] "25819138" 62 | ``` 63 | 64 | ```r 65 | example_xml <-fetch_pubmed_data(example_id, format = "xml") # Create xml output 66 | ``` 67 | Now we have successfully queried and stored the data of 'Ancestral population genomics using coalescence hidden Markov models and heuristic optimisation algorithms.[Title]' into the xml output. We will find that working with the xml output is advantageous due to its hierarchical structure. 68 | 69 | Next we will show an example of how we can obtain a list of authors from this query using simple R functions and the 'custom_grep()' function from easyPubMed. 70 | 71 | * custom_grep() retrieves data between the tags given 72 | 73 | 74 | ```r 75 | last_name_authors <- custom_grep(example_xml, "LastName", "/LastName") # retrieve last name 76 | forename_authors <- custom_grep(example_xml, "ForeName", "/ForeName") # retrieve forename 77 | 78 | example_authors <- rbind('Last Name'=last_name_authors, 'Forename'=forename_authors) # output example_authors dataframe for PMID 27933103 79 | example_authors 80 | ``` 81 | 82 | ``` 83 | ## [,1] [,2] 84 | ## Last Name "Cheng" "Mailund" 85 | ## Forename "Jade Yu" "Thomas" 86 | ``` 87 | 88 | 89 | ## 2. Querying for Multiple Sources 90 | 91 | Another convenience of using easyPubMed is whether requesting data from one article or multiple, it is the same process. The only change that must be made is changing the query, whether that be multiple PMIDs or an Author's name, as seen in the example below. 92 | 93 | 94 | ```r 95 | multi_example_query <- 'Vincent Scalfani[AU]' # All we need to change here is simply making more general query requests to PubMed. 96 | multi_example_id <- get_pubmed_ids(multi_example_query) #Stores a list of PMIDs satisfying the query 97 | 98 | multi_example_xml <-fetch_pubmed_data(multi_example_id, format = "xml") # XML format 99 | 100 | # To understand the structure of the XML output, try running the following line without the pound sign, i.e. uncomment 101 | # multi_example_xml 102 | 103 | # In the XML format we find Journal Titles to be between "Title" and "/Title" 104 | journals <- custom_grep(multi_example_xml, "Title", "/Title") #Retrieve Journal Titles 105 | ``` 106 | 107 | Similar to the previous example, now we have retrieved a list of Journal Titles Dr. Scalfani has published under from the articles available on PubMed. 108 | 109 | 110 | ```r 111 | journals 112 | ``` 113 | 114 | ``` 115 | ## [1] "Journal of cheminformatics" "Journal of cheminformatics" 116 | ## [3] "Science (New York, N.Y.)" "ACS macro letters" 117 | ``` 118 | ## 3. Looping Through a List of PMIDs 119 | 120 | In some use cases, a user may be interested in looping through a list of IDs to query data. Below we will show how one can do this. 121 | 122 | First, create an example list of PubMed IDs: 123 | 124 | ```r 125 | pmids = as.list(c(34813985, 34813932, 34813684, 34813661, 34813372, 34813140, 34813072)) 126 | ``` 127 | 128 | Next, let's begin querying through a for loop. 
Essentially what is happening is similar to in previous examples, but we are using the for command to iterate over each element of our 'pmid' list and then appending the results to our 'Titles' list. 129 | 130 | 131 | ```r 132 | # Creates empty list of titles 133 | Titles <- c() 134 | # Iterate through each listed pmid, retrieve XML formatted info, and retrieve list of Titles 135 | for (i in pmids) { 136 | join <- paste(i, '[pmid]') #join each element with [pmid] to specify 137 | id <- get_pubmed_ids(join) 138 | xml <- fetch_pubmed_data(id, format = "xml") 139 | Titles<-append(Titles,custom_grep(xml, "ArticleTitle", "/ArticleTitle")) 140 | Sys.sleep(1) 141 | } 142 | # Display list of titles 143 | Titles 144 | ``` 145 | 146 | ``` 147 | ## [1] "Mutation in RyR2-FKBP Binding site alters Ca2+ signaling modestly but increases \"arrhythmogenesis\" in human stem cells derived cardiomyocytes." 148 | ## [2] "M-CDC: Magnetic pull-down-assisted colorimetric method based on the CRISPR/Cas12a system." 149 | ## [3] "Naturally occurring UBIAD1 mutations differentially affect menaquinone biosynthesis and vitamin K-dependent carboxylation." 150 | ## [4] "Efficient visual screening of CRISPR/Cas9 genome editing in the nematode Pristionchus pacificus." 151 | ## [5] "Base Editing of Somatic Cells Using CRISPR-Cas9 in Drosophila." 152 | ## [6] "Mammalian Chemical Genomics towards Identifying Targets and Elucidating Modes-of-Action of Bioactive Compounds." 153 | ## [7] "CRISPR-Cas9 Editing of the Synthesis of Biodegradable Polyesters Polyhydroxyalkanaotes (PHA) in Pseudomonas putida KT2440." 154 | ``` 155 | ## 4. PubMed API Metadata Visualization 156 | 157 | In this example we're going to show how a user can create a visualization using the PubMed API, specifically a histogram to visualize publishing frequency of the query 'hydrogel drug AND clinical trial[pt]'. 158 | 159 | 160 | ```r 161 | # Libraries for creating network visual 162 | visual_query <- 'hydrogel drug AND clinical trial[pt]' # Example Query 163 | 164 | visual_id <- get_pubmed_ids(visual_query) #Stores a list of PMIDs satisfying the query 165 | 166 | visual_xml <-fetch_pubmed_data(visual_id, format = "xml") # XML format 167 | 168 | Year<- custom_grep(visual_xml, 'Year','/Year') #Retrieve Publication Years 169 | head(Year, n=10) #Display first 10 instances of the Year list 170 | ``` 171 | 172 | ``` 173 | ## [1] "2022" "2023" "2022" "2022" "2022" "2022" "2022" "2022" "2022" "2022" 174 | ``` 175 | 176 | ```r 177 | hist(as.numeric(Year),main= 'Freq. of Publications from PubMed "hydrogel drug AND clinical trial[pt]"',xlab='Year', breaks=40, col = 'plum') # Use base R function hist() to plot 178 | ``` 179 | 180 | ![](PubMedAPItut_files/figure-html/visual-1.png) 181 | -------------------------------------------------------------------------------- /src/r-gpl3/us-census.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: \...in R 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | # U.S. Census Data API in R 9 | 10 | by Adam M. Nguyen 11 | 12 | These recipe examples were tested on March 24, 2023. 13 | 14 | - **Documentation** 15 | - [censusapi Package Documentation (PDF)](https://cran.r-project.org/web/packages/censusapi/censusapi.pdf) 16 | - [U.S. Census API Documentation](https://www.census.gov/data/developers/about.html) 17 | - [U.S. Census Data Discovery Tool](https://api.census.gov/data.html) 18 | 19 | - **Terms** 20 | - See also the [U.S. 
Census API Terms of Service](https://www.census.gov/data/developers/about.html#terms) 21 | 22 | - **Attribution** 23 | - This tutorial uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau. 24 | 25 | - **Tutorial License** 26 | - This tutorial uses the **censusapi** R library which is licensed as **GPL-3**. 27 | - As a result, this tutorial code is also licensed as **GPL-3**: 28 | [License Details](https://github.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/blob/main/LICENSE_selected_R_tutorials) 29 | 30 | ## Setup 31 | 32 | ### API Key Information 33 | 34 | While an API key is not required to use the U.S. Census Data API, you may want to register for one, as the API is limited to 500 calls a day without a key. Sign-up is available here: https://api.census.gov/data/key_signup.html. 35 | 36 | If you are using this code, make sure to supply your own key as shown below. 37 | 38 | Here we use 'Sys.getenv()' to retrieve our API key from the environment variables. You can either do this by creating an [.Renviron file and storing your API Key](https://docs.posit.co/how-to-guides/pre-tasks/api-keys-renv/) or simply replacing "Sys.getenv('USCensusAPIKey')" with your API Key. 39 | 40 | ```r 41 | # Access .Renviron to get the U.S. Census API key 42 | user_key = Sys.getenv('USCensusAPIKey') # use Sys.getenv() to access .Renviron 43 | ``` 44 | 45 | ### Setup censusapi Package 46 | The censusapi package allows users to easily access U.S. Census data and metadata, including datasets such as the Decennial Census, American Community Survey, Small Area Health Insurance Estimates, Small Area Income and Poverty Estimates, Population Estimates and Projections, and more. In this tutorial, we will be using censusapi. 47 | 48 | If you haven't already, run "install.packages('censusapi')" in your R Console to install the US Census API package we will be using for this tutorial. 49 | 50 | First, let us load the required library, "censusapi". 51 | 52 | ```r 53 | library(censusapi) # Access censusapi library 54 | ``` 55 | ## 1. Get Population Estimates of Counties by State 56 | 57 | Our primary means of accessing the U.S. Census API will be the function "getCensus()". In this example, each line of code is commented to clarify what it does. 58 | 59 | The following example uses arguments including 'name' and 'vars'. To browse comprehensive lists of available datasets and variables, see the functions 'listCensusApis()' and 'makeVarlist()' in the censusapi documentation linked at the top of this article, or try the optional sketch below. 
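As mentioned above, 'listCensusApis()' and 'makeVarlist()' can be used to discover dataset names and variable codes before building a query. The short sketch below is one possible way to use them; the argument and column names follow the censusapi documentation, and the search term is just an example, so adjust it to your needs.

```r
# Optional discovery step (assumes library(censusapi) is loaded as above)

# List the available Census APIs and inspect a few identifying columns
apis <- listCensusApis()
head(apis[, c("title", "name", "vintage")])

# Search variable labels in the 2021 ACS 5-year subject tables for "total population"
pop_vars <- makeVarlist(name = "acs/acs5/subject",
                        vintage = 2021,
                        find = "total population",
                        varsearch = "label")
head(pop_vars)
```

With a dataset name and a variable code in hand, the main query below retrieves county-level population estimates with 'getCensus()'.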
60 | 61 | ```r 62 | your_state_code = '01' # Alabama FIPS Code 63 | 64 | # Retrieve county population estimates by state 65 | 66 | pop_estimates <- getCensus(name = "acs/acs5/subject", # The programmatic name of your dataset; see 'listCensusApis()' for options 67 | vars = c("NAME", "S0101_C01_001E"), #list of variables to get 68 | region = "county:*", #geography to get 69 | vintage = "2021",#year 70 | key=user_key#API key 71 | ) 72 | head(pop_estimates,n=10) #Display first entries of 'pop_estimates' 73 | ``` 74 | 75 | ``` 76 | ## state county NAME S0101_C01_001E 77 | ## 1 01 001 Autauga County, Alabama 58239 78 | ## 2 01 003 Baldwin County, Alabama 227131 79 | ## 3 01 005 Barbour County, Alabama 25259 80 | ## 4 01 007 Bibb County, Alabama 22412 81 | ## 5 01 009 Blount County, Alabama 58884 82 | ## 6 01 011 Bullock County, Alabama 10386 83 | ## 7 01 013 Butler County, Alabama 19181 84 | ## 8 01 015 Calhoun County, Alabama 116425 85 | ## 9 01 017 Chambers County, Alabama 34834 86 | ## 10 01 019 Cherokee County, Alabama 24975 87 | ``` 88 | 89 | The previous dataframe 'pop_estimates' includes counties from every state because of the wildcard '*' in the 'region' argument. Now we want to filter the dataset so we are left with only Alabama. Additionally, the US Census API utilizes codes for variables. To search for variables, use the function 'makeVarlist()'; additional usage information can be found in the censusapi package documentation PDF. 90 | 91 | 92 | ```r 93 | # Filter 94 | alabama_counties <- pop_estimates[pop_estimates$state == your_state_code,] 95 | 96 | # Extract population 97 | alabama_counties_populations <- data.frame(County = alabama_counties$NAME, Population = alabama_counties$S0101_C01_001E) 98 | 99 | # Print population 100 | head(alabama_counties_populations,n=10) #Display first entries of 'alabama_counties_populations' 101 | ``` 102 | 103 | ``` 104 | ## County Population 105 | ## 1 Autauga County, Alabama 58239 106 | ## 2 Baldwin County, Alabama 227131 107 | ## 3 Barbour County, Alabama 25259 108 | ## 4 Bibb County, Alabama 22412 109 | ## 5 Blount County, Alabama 58884 110 | ## 6 Bullock County, Alabama 10386 111 | ## 7 Butler County, Alabama 19181 112 | ## 8 Calhoun County, Alabama 116425 113 | ## 9 Chambers County, Alabama 34834 114 | ## 10 Cherokee County, Alabama 24975 115 | ``` 116 | 117 | Now we have successfully used the U.S. Census API to store population estimates from Alabama counties in the variable 'alabama_counties_populations'. 118 | 119 | 120 | ## 2. Get Population Estimates Over a Range of Years 121 | 122 | We can use code similar to the previous example, but we will loop through the population estimate datasets by year. 
123 | 124 | ```r 125 | # Define the range of years 126 | years <- c(2016:2021) 127 | 128 | # Create an empty data frame to store the population estimates 129 | pop_estimates_all <- data.frame() 130 | 131 | # Loop over the years 132 | for (year in years) { 133 | # Retrieve population estimates for Tuscaloosa County 134 | pop_estimates <- getCensus(name = "acs/acs5/subject", 135 | vars = c("NAME", "S0101_C01_001E"), 136 | region = "county:*", 137 | vintage = as.character(year), 138 | key= user_key) 139 | alabama <- pop_estimates[pop_estimates$state == your_state_code,] 140 | 141 | 142 | # Add the population estimate and year to the data frame 143 | pop_estimates_all <- rbind(pop_estimates_all, data.frame(Year = year, Population = alabama$S0101_C01_001E,Name= alabama$NAME)) 144 | } 145 | 146 | # Print the resulting data frame 147 | head(pop_estimates_all,n=10) 148 | ``` 149 | 150 | ``` 151 | ## Year Population Name 152 | ## 1 2016 21975 Monroe County, Alabama 153 | ## 2 2016 33433 Lawrence County, Alabama 154 | ## 3 2016 153947 Lee County, Alabama 155 | ## 4 2016 30239 Marion County, Alabama 156 | ## 5 2016 20042 Pickens County, Alabama 157 | ## 6 2016 13285 Sumter County, Alabama 158 | ## 7 2016 659096 Jefferson County, Alabama 159 | ## 8 2016 13287 Choctaw County, Alabama 160 | ## 9 2016 31573 Franklin County, Alabama 161 | ## 10 2016 20066 Marengo County, Alabama 162 | ``` 163 | 164 | ## 3. Plot Population Change 165 | 166 | We will use the data we retrieved in example 2 and then calculate and graph the percent change in population per county. 167 | 168 | ```r 169 | # Filter for the population in 2016 170 | pop_2016 <- pop_estimates_all[pop_estimates_all$Year == 2016, ] 171 | 172 | # Filter for the population in 2021 173 | pop_2021 <- pop_estimates_all[pop_estimates_all$Year == 2021, ] 174 | 175 | # Calculate the percent change in population 176 | pop_pct_change <- data.frame(County=pop_2021$Name,Pct_Change =round(((as.numeric( pop_2021$Population)-as.numeric(pop_2016$Population))/as.numeric(pop_2016$Population)),4)) # (pop_2021-pop_2016)/pop_2016 rounded to 5 digits 177 | 178 | # Next we're going to remove the 'County, Alabama' because it is repetitive. 179 | pop_pct_change[]<-lapply(pop_pct_change,function(x) (sub(' County, Alabama','',x))) 180 | 181 | head(pop_pct_change,n=10) 182 | ``` 183 | 184 | ``` 185 | ## County Pct_Change 186 | ## 1 Autauga 1.6502 187 | ## 2 Baldwin 5.7936 188 | ## 3 Barbour -0.8359 189 | ## 4 Bibb -0.2588 190 | ## 5 Blount 1.938 191 | ## 6 Bullock -0.2182 192 | ## 7 Butler -0.9709 193 | ## 8 Calhoun 7.7623 194 | ## 9 Chambers 0.1033 195 | ## 10 Cherokee 0.2446 196 | ``` 197 | Next we will create a plot of the percent change in population by county in Alabama from the years 2016 to 2021 using the package ggplot2. 
198 | 199 | ```r 200 | library(ggplot2) #library for creating graphics 201 | options(repr.plot.width = 100, repr.plot.height =2) 202 | ggplot(pop_pct_change, aes(x = reorder(pop_pct_change$County, as.numeric(pop_pct_change$Pct_Change)), y = as.numeric(pop_pct_change$Pct_Change))) + 203 | geom_point(orientation = 'y') + 204 | ylab("Percent Change in Population") + 205 | xlab("AL County") + 206 | theme_bw()+ 207 | theme( 208 | panel.grid.major.y = element_blank(), 209 | panel.grid.minor.y = element_blank(), 210 | panel.grid.major.x = element_line(colour = "grey80", linetype = "dashed"), 211 | axis.text.x = element_text(angle = 90, hjust = 1, vjust=.2, size= 7 ) 212 | )+ 213 | geom_hline(yintercept=0)+ 214 | ggtitle("Percent Change in Population by County from 2016 to 2021") 215 | ``` 216 | 217 | ![](US_Census_Data_in_R_files/figure-html/plot-popchg-1.png) 218 | -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/Display-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/Display-1.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/disp-list-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/disp-list-1.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/disp-list-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/disp-list-2.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/disp-list-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/disp-list-3.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/disp-list-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/disp-list-4.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/disp-list-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/disp-list-5.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/hist-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/hist-1.png -------------------------------------------------------------------------------- /src/r/CASCommonChemR_files/figure-html/kernel-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/CASCommonChemR_files/figure-html/kernel-1.png -------------------------------------------------------------------------------- /src/r/Chronam_in_R_Adam_Vers_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/Chronam_in_R_Adam_Vers_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /src/r/Chronam_in_R_Adam_Vers_files/figure-html/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/Chronam_in_R_Adam_Vers_files/figure-html/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /src/r/College_Scorecard_R_files/figure-html/program-percentage-distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/College_Scorecard_R_files/figure-html/program-percentage-distribution.png -------------------------------------------------------------------------------- /src/r/R_WorldBank_Markdown_files/figure-html/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/R_WorldBank_Markdown_files/figure-html/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /src/r/R_WorldBank_Markdown_files/figure-html/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/R_WorldBank_Markdown_files/figure-html/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /src/r/R_WorldBank_Markdown_files/figure-html/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/R_WorldBank_Markdown_files/figure-html/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /src/r/SEC_EDGAR_API_R_files/figure-html/unnamed-chunk-10-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/SEC_EDGAR_API_R_files/figure-html/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /src/r/SEC_EDGAR_API_R_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/SEC_EDGAR_API_R_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /src/r/USA_Spending_R_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/USA_Spending_R_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /src/r/USA_Spending_R_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/USA_Spending_R_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /src/r/USA_Spending_R_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/USA_Spending_R_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /src/r/arXiv_API_in_R_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/arXiv_API_in_R_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /src/r/arXiv_API_in_R_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/arXiv_API_in_R_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /src/r/arxiv.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: \...in R 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | # arXiv API in R 9 | 10 | by Adam M. Nguyen 11 | 12 | 13 | The arXiv API provides programmatic access to metadata about scholarly papers hosted on the arXiv preprint server, covering fields such as physics, mathematics, computer science, and more. This allows users to automate the discovery and retrieval of preprint information for research, text mining, or integration into academic workflows. 
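The tutorial below works through the aRxiv R package, which wraps this service. Purely for orientation, here is a minimal sketch of what a raw request to the arXiv API's Atom endpoint looks like; it assumes the httr package (not otherwise used in this tutorial), and the search term is just an example.

```r
# Minimal raw request to the arXiv API Atom endpoint (wrapped by the aRxiv package)
library(httr)

resp <- GET("http://export.arxiv.org/api/query",
            query = list(search_query = "ti:hydrodynamics",
                         start = 0,
                         max_results = 3))

# The response body is an Atom XML feed; preview the first few hundred characters
substr(content(resp, as = "text", encoding = "UTF-8"), 1, 400)
```

The remainder of this tutorial uses aRxiv functions, which handle request construction and response parsing for you.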
14 | 15 | ### API Resources 16 | 17 | - **Documentation** 18 | - [arXiv](https://arxiv.org/) 19 | - [arXiv API Access Information](https://info.arxiv.org/help/api/index.html) 20 | - [aRxiv: R Interface to the arXiv API](https://cran.r-project.org/web/packages/aRxiv/index.html) 21 | 22 | - **Terms** 23 | - [arXiv Terms of Use](https://info.arxiv.org/help/api/tou.html) 24 | 25 | *Acknowledgment: Thank you to arXiv for use of its open access interoperability* 26 | 27 | These recipe examples were tested on December 1, 2023. 28 | 29 | Hosted and maintained by Cornell University, arXiv is an open-access and free distribution service containing nearly 2.5 million scholarly articles in fields including physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics at the time of writing. In this tutorial, we will introduce how to use the API with some examples, but for larger bulk downloads of data from arXiv, we recommend [Kaggle's arxiv Dataset](https://www.kaggle.com/datasets/Cornell-University/arxiv/data), which is updated monthly with the full arXiv data set and metadata. 30 | 31 | ## Setup 32 | 33 | ### Load Library 34 | 35 | Run the following lines of code to load the 'aRxiv' library. If you have not already installed the package, first run 'install.packages('aRxiv')'. aRxiv is the package we will use to interface with the arXiv API. 36 | 37 | ```r 38 | # Load necessary libraries 39 | library(aRxiv) 40 | ``` 41 | 42 | ### Retrieving Categories 43 | 44 | Before we get started, a useful dataset provided by the aRxiv package is 'arxiv_cats'. It lists each arXiv subject classification's abbreviation and corresponding description. Categories are especially important in forming queries to the API, so we mention them here first. 45 | 46 | ```r 47 | # Here are the first 10 categories to showcase the dataset 48 | head(arxiv_cats,n=10) 49 | ``` 50 | 51 | ``` 52 | ## abbreviation description 53 | ## 1 stat.AP Statistics - Applications 54 | ## 2 stat.CO Statistics - Computation 55 | ## 3 stat.ML Statistics - Machine Learning 56 | ## 4 stat.ME Statistics - Methodology 57 | ## 5 stat.TH Statistics - Theory 58 | ## 6 q-bio.BM Quantitative Biology - Biomolecules 59 | ## 7 q-bio.CB Quantitative Biology - Cell Behavior 60 | ## 8 q-bio.GN Quantitative Biology - Genomics 61 | ## 9 q-bio.MN Quantitative Biology - Molecular Networks 62 | ## 10 q-bio.NC Quantitative Biology - Neurons and Cognition 63 | ``` 64 | 65 | ## 1. Basic Search 66 | 67 | The most broadly useful function in the package is likely 'arxiv_search()'. It programmatically searches the arXiv repository and returns 15 columns of information, including 'id', 'title', 'summary', and more. We will showcase this function by searching for papers with the term 'Hydrodynamics' in the title, then extracting the authors to see who has the most publications. 
68 | 69 | ```r 70 | # Search for Hydrodynamics papers 71 | hydrodynamic_search <- arxiv_search('ti:Hydrodynamics', batchsize =410, limit=10000, force = TRUE) 72 | ``` 73 | 74 | ``` 75 | ## retrieved batch 1 76 | ``` 77 | 78 | ``` 79 | ## retrieved batch 2 80 | ``` 81 | 82 | ``` 83 | ## retrieved batch 3 84 | ``` 85 | 86 | ``` 87 | ## retrieved batch 4 88 | ``` 89 | 90 | ``` 91 | ## retrieved batch 5 92 | ``` 93 | 94 | ``` 95 | ## retrieved batch 6 96 | ``` 97 | 98 | ``` 99 | ## retrieved batch 7 100 | ``` 101 | 102 | ``` 103 | ## retrieved batch 8 104 | ``` 105 | 106 | ``` 107 | ## retrieved batch 9 108 | ``` 109 | 110 | ``` 111 | ## retrieved batch 10 112 | ``` 113 | 114 | ```r 115 | # Extract out the authors 116 | authors <- hydrodynamic_search[, c('title', 'authors')] 117 | 118 | # Show first few entries 119 | head(authors) 120 | ``` 121 | 122 | ``` 123 | ## title 124 | ## 1 A finite model of two-dimensional ideal hydrodynamics 125 | ## 2 Hydrodynamic Stability Analysis of Burning Bubbles in Electroweak Theory\n and in QCD 126 | ## 3 Hydrodynamics of Relativistic Fireballs 127 | ## 4 Comparison of Spectral Method and Lattice Boltzmann Simulations of\n Two-Dimensional Hydrodynamics 128 | ## 5 Classical differential geometry and integrability of systems of\n hydrodynamic type 129 | ## 6 Hydrodynamic Spinodal Decomposition: Growth Kinetics and Scaling\n Functions 130 | ## authors 131 | ## 1 J. S. Dowker|A. Wolski 132 | ## 2 P. Huet|K. Kajantie|R. G. Leigh|B. -H. Liu|L. McLerran 133 | ## 3 Tsvi Piran|Amotz Shemi|Ramesh Narayan 134 | ## 4 D. O. Martinez|W. H. Matthaeus|S. Chen|D. C. Montgomery 135 | ## 5 S. P. Tsarev 136 | ## 6 F. J. Alexander|S. Chen|D. W. Grunau 137 | ``` 138 | 139 | ```r 140 | # Split the 'authors' column in a list of individuals 141 | author_lists <- strsplit(authors[,'authors'], split = "|", fixed = TRUE) 142 | 143 | # List Frequency of Author Occurrences 144 | co_freq <- table(unlist(author_lists)) 145 | 146 | # Order and Format as Data frame 147 | ordered_cofreq <- as.data.frame(co_freq[order(co_freq, decreasing = TRUE)]) 148 | 149 | # Here are the first highest publishers in Hydrodynamics as available by the arXiv repository 150 | head(ordered_cofreq) 151 | ``` 152 | 153 | ``` 154 | ## Var1 Freq 155 | ## 1 Radoslaw Ryblewski 31 156 | ## 2 Tetsufumi Hirano 31 157 | ## 3 Wojciech Florkowski 30 158 | ## 4 Volker Springel 29 159 | ## 5 Michael Strickland 28 160 | ## 6 T. Kodama 28 161 | ``` 162 | 163 | ### Visualization 164 | 165 | Additionally, we can create a visualization using the 'ggplot2' library. See the following code to see how to do so and what is produced. 166 | 167 | 168 | ```r 169 | library(ggplot2) 170 | # Visualize the top 20 highest publishers 171 | ggplot(head(ordered_cofreq,n=20), aes(x = Var1, y = Freq)) + 172 | geom_bar(stat = "identity", fill = "#D16103") + 173 | labs(x = "Author", y = "Number of Publications", title = "Top 20 Most Published Authors in Hydrodynamics in arXiv") + 174 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) # Rotate x-axis labels for readability 175 | ``` 176 | 177 | ![](arXiv_API_in_R_files/figure-html/unnamed-chunk-3-1.png) 178 | 179 | ## 2. Retrieving Number of Query Results 180 | 181 | Using the aRxiv package you can also retrieve counts of papers given some query. For example, we can see how many papers our previous 'Hydrodynamics' query returns. 182 | 183 | 184 | ```r 185 | # How many papers titles contain hydroynamics? 
186 | 187 | arxiv_count('ti:"hydrodynamics"') 188 | ``` 189 | 190 | ``` 191 | ## [1] 6385 192 | ``` 193 | We can also see how many HEP-th papers there are. 194 | 195 | 196 | ```r 197 | # How many papers fall under the HEP-th category? 198 | 199 | arxiv_count("cat: HEP-th") 200 | ``` 201 | 202 | ``` 203 | ## [1] 162439 204 | ``` 205 | And finally, we can see how many HEP-th papers have been published throughout the years. 206 | 207 | 208 | ```r 209 | # Create a vector of years we are interested in, 1991:2023 210 | years <- 1991:2023 211 | 212 | # Create empty vector to append annual counts to 213 | arxiv_counts <- c() 214 | 215 | # Loop through years to create list of counts per year 216 | for(year in years){ 217 | arxiv_counts <- c(arxiv_counts, arxiv_count(paste0('cat:HEP-th AND submittedDate:[',year,' TO ',year+1,']'))) 218 | } 219 | arxiv_counts_df <- as.data.frame(cbind(1991:2023,arxiv_counts)) 220 | # Simple base R plot of the data 221 | plot(arxiv_counts_df, main = 'Theoretical High Energy Physics Papers Published per Year', xlab = 'Year', ylab='Number of Papers') 222 | ``` 223 | 224 | ![](arXiv_API_in_R_files/figure-html/unnamed-chunk-6-1.png) 225 | 226 | ## 3. Proportion of Preprints in Hydrodynamics Papers 227 | 228 | arXiv's repository contains both electronic preprints and links to the published postprint (e.g., a version of record DOI). We will explore the proportion of preprints in the previous 'Hydrodynamics' query. This is possible because the 'doi' column returned by the query is empty for articles that do not have a DOI, i.e., preprints. 229 | 230 | ```r 231 | # Count the number of preprints by looking for empty 'doi' values 232 | hydrodynamic_preprint_count <- sum(hydrodynamic_search$doi == "") 233 | 234 | # Calculate a percentage of preprints 235 | percentage_preprints <- (hydrodynamic_preprint_count / nrow(hydrodynamic_search)) * 100 236 | 237 | paste0('The percentage of preprints is ',round(percentage_preprints, digits = 2),'%.') 238 | ``` 239 | 240 | ``` 241 | ## [1] "The percentage of preprints is 23.98%." 
242 | ``` 243 | -------------------------------------------------------------------------------- /src/r/figure/Display-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/Display-1.png -------------------------------------------------------------------------------- /src/r/figure/disp list-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/disp list-1.png -------------------------------------------------------------------------------- /src/r/figure/disp list-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/disp list-2.png -------------------------------------------------------------------------------- /src/r/figure/disp list-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/disp list-3.png -------------------------------------------------------------------------------- /src/r/figure/disp list-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/disp list-4.png -------------------------------------------------------------------------------- /src/r/figure/disp list-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/disp list-5.png -------------------------------------------------------------------------------- /src/r/figure/hist-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/hist-1.png -------------------------------------------------------------------------------- /src/r/figure/kernel-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/kernel-1.png -------------------------------------------------------------------------------- /src/r/figure/plot popchg-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/plot popchg-1.png -------------------------------------------------------------------------------- /src/r/figure/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /src/r/figure/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /src/r/figure/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /src/r/figure/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /src/r/figure/visual-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/UALIB_ScholarlyAPI_Cookbook/8bcdf5b57804d36745d113b3e586a47513622920/src/r/figure/visual-1.png -------------------------------------------------------------------------------- /src/r/output.json: -------------------------------------------------------------------------------- 1 | {"status":["ok"],"message-type":["work"],"message-version":["1.0.0"],"message":{"indexed":{"date-parts":[[2023,4,5]],"date-time":["2023-04-05T06:53:08Z"],"timestamp":[1680677588699]},"reference-count":[16],"publisher":["Springer Science and Business Media LLC"],"issue":["1"],"license":[{"content-version":"tdm","delay-in-days":0,"URL":"http://creativecommons.org/licenses/by/2.0","start.date-parts":[[2012,7,6]],"start.date-time":"2012-07-06T00:00:00Z","start.timestamp":1341532800000}],"content-domain":{"domain":[],"crossmark-restriction":[false]},"short-container-title":["J Cheminform"],"published-print":{"date-parts":[[2012,12]]},"DOI":["10.1186/1758-2946-4-12"],"type":["journal-article"],"created":{"date-parts":[[2012,7,6]],"date-time":["2012-07-06T12:14:34Z"],"timestamp":[1341576874000]},"source":["Crossref"],"is-referenced-by-count":[38],"title":["The Molecule Cloud - compact visualization of large collections of molecules"],"prefix":["10.1186"],"volume":["4"],"author":[{"given":"Peter","family":"Ertl","sequence":"first","affiliation":[]},{"given":"Bernhard","family":"Rohde","sequence":"additional","affiliation":[]}],"member":["297"],"published-online":{"date-parts":[[2012,7,6]]},"reference":[{"key":"336_CR1","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1007/s10822-011-9487-0","volume":"26","author":"E Martin","year":"2011","unstructured":"Martin E, Ertl P, Hunt P, Duca J, Lewis R: Gazing into the crystal ball; the future of computer-aided drug design. J Comp-Aided Mol Des. 
2011, 26: 77-79.","journal-title":"J Comp-Aided Mol Des"},{"key":"336_CR2","doi-asserted-by":"publisher","first-page":"2174","DOI":"10.1021/ci2001428","volume":"26","author":"SR Langdon","year":"2011","unstructured":"Langdon SR, Brown N, Blagg J: Scaffold diversity of exemplified medicinal chemistry space. J Chem Inf Model. 2011, 26: 2174-2185.","journal-title":"J Chem Inf Model"},{"key":"336_CR3","doi-asserted-by":"publisher","first-page":"8732","DOI":"10.1021/ja902302h","volume":"131","author":"LC Blum","year":"2009","unstructured":"Blum LC, Reymond J-C: 970 Million druglike small molecules for virtual screening in the chemical universe database GDB-13. J Am Chem Soc. 2009, 131: 8732-8733. 10.1021/ja902302h.","journal-title":"J Am Chem Soc"},{"key":"336_CR4","doi-asserted-by":"publisher","first-page":"156","DOI":"10.2174/157340908785747410","volume":"4","author":"J Dubois","year":"2008","unstructured":"Dubois J, Bourg S, Vrain C, Morin-Allory L: Collections of compounds - how to deal with them?. Cur Comp-Aided Drug Des. 2008, 4: 156-168. 10.2174/157340908785747410.","journal-title":"Cur Comp-Aided Drug Des"},{"key":"336_CR5","doi-asserted-by":"publisher","first-page":"322","DOI":"10.2174/157340908786786010","volume":"4","author":"JL Medina-Franco","year":"2008","unstructured":"Medina-Franco JL, Martinez-Mayorga K, Giulianotti MA, Houghten RA, Pinilla C: Visualization of the chemical space in drug discovery. Cur Comp-Aided Drug Des. 2008, 4: 322-333. 10.2174/157340908786786010.","journal-title":"Cur Comp-Aided Drug Des"},{"key":"336_CR6","doi-asserted-by":"publisher","first-page":"47","DOI":"10.1021/ci600338x","volume":"47","author":"A Schuffenhauer","year":"2007","unstructured":"Schuffenhauer A, Ertl P, Roggo S, Wetzel S, Koch MA, Waldmann H: The Scaffold Tree - visualization of the scaffold universe by hierarchical scaffold classification. J Chem Inf Model. 2007, 47: 47-58. 10.1021/ci600338x.","journal-title":"J Chem Inf Model"},{"key":"336_CR7","doi-asserted-by":"publisher","first-page":"366","DOI":"10.1002/minf.201000019","volume":"29","author":"S Langdon","year":"2010","unstructured":"Langdon S, Ertl P, Brown N: Bioisosteric replacement and scaffold hopping in lead generation and optimization. Mol Inf. 2010, 29: 366-385. 10.1002/minf.201000019.","journal-title":"Mol Inf"},{"key":"336_CR8","doi-asserted-by":"publisher","first-page":"4443","DOI":"10.1021/jo8001276","volume":"73","author":"AH Lipkus","year":"2008","unstructured":"Lipkus AH, Yuan Q, Lucas KA, Funk SA, Bartelt WF, Schenck RJ, Trippe AJ: Structural diversity of organic chemistry. A scaffold analysis of the CAS Registry. J Org Chem. 2008, 73: 4443-4451. 10.1021/jo8001276.","journal-title":"J Org Chem"},{"key":"336_CR9","unstructured":"mib 2010.10, Molinspiration Cheminformatics: \n http://www.molinspiration.com\n \n ,"},{"key":"336_CR10","unstructured":"Bernhard R: Avalon Cheminformatics Toolkit. \n http://sourceforge.net/projects/avalontoolkit/\n \n ,"},{"key":"336_CR11","doi-asserted-by":"publisher","first-page":"D255","DOI":"10.1093/nar/gkp965","volume":"38","author":"Y Wang","year":"2009","unstructured":"Wang Y, Bolton E, Dracheva S, Karapetyan K, Shoemaker BA, Suzek TO, Wang J, Xiao J, Zhang J, Bryant SH: An overview of the PubChem BioAssay resource. Nucleic Acids Res. 
2009, 38: D255-D266.","journal-title":"Nucleic Acids Res"},{"key":"336_CR12","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1021/ci049714+","volume":"45","author":"JJ Irwin","year":"2004","unstructured":"Irwin JJ, Shoichet BK: ZINC − a free database of commercially available compounds for virtual screening. J Chem Inf Model. 2004, 45: 177-182.","journal-title":"J Chem Inf Model"},{"key":"336_CR13","doi-asserted-by":"publisher","first-page":"D1100","DOI":"10.1093/nar/gkr777","volume":"40","author":"A Gaulton","year":"2012","unstructured":"Gaulton A, Bellis LJ, Bento AP, Chambers J, Davies M, Hersey A, Light Y, McGlinchey S, Michalovich D, Al-Lazikani B, Overington JP: ChEMBL: a large-scale bioactivity database for drug discovery. Nucleic Acids Res. 2012, 40: D1100-D1107. 10.1093/nar/gkr777.","journal-title":"Nucleic Acids Res"},{"key":"336_CR14","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1016/j.cbpa.2010.02.018","volume":"14","author":"ME Welsch","year":"2010","unstructured":"Welsch ME, Snyder SA, Stockwell BR: Privileged scaffolds for library design and drug discovery. Curr Opin Chem Biol. 2010, 14: 347-361. 10.1016/j.cbpa.2010.02.018.","journal-title":"Curr Opin Chem Biol"},{"key":"336_CR15","doi-asserted-by":"publisher","first-page":"374","DOI":"10.1021/ci0255782","volume":"43","author":"P Ertl","year":"2003","unstructured":"Ertl P: Cheminformatics analysis of organic substituents: Identification of the most common substituents, calculation of substituent properties, and automatic identification of drug-like bioisosteric groups. J Chem Inf Comp Sci. 2003, 43: 374-380. 10.1021/ci0255782.","journal-title":"J Chem Inf Comp Sci"},{"key":"336_CR16","unstructured":"TagCrowd: \n http://tagcrowd.com"}],"container-title":["Journal of Cheminformatics"],"original-title":[],"language":["en"],"link":[{"URL":"http://link.springer.com/content/pdf/10.1186/1758-2946-4-12.pdf","content-type":"application/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http://link.springer.com/article/10.1186/1758-2946-4-12/fulltext.html","content-type":"text/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http://link.springer.com/content/pdf/10.1186/1758-2946-4-12.pdf","content-type":"application/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,24]],"date-time":["2019-06-24T14:22:07Z"],"timestamp":[1561386127000]},"score":[1],"resource":{"primary":{"URL":["https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-4-12"]}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2012,7,6]]},"references-count":[16],"journal-issue":{"issue":["1"],"published-print":{"date-parts":[[2012,12]]}},"alternative-id":["336"],"URL":["http://dx.doi.org/10.1186/1758-2946-4-12"],"relation":{},"ISSN":["1758-2946"],"issn-type":[{"value":"1758-2946","type":"electronic"}],"subject":["Library and Information Sciences","Computer Graphics and Computer-Aided Design","Physical and Theoretical Chemistry","Computer Science Applications"],"published":{"date-parts":[[2012,7,6]]},"article-number":["12"]}} 2 | -------------------------------------------------------------------------------- /src/r/sdirect.md: -------------------------------------------------------------------------------- 1 | # ScienceDirect API in R 2 | 3 | by Michael T. 
Moen 4 | 5 | These recipe examples demonstrate how to use Elsevier’s [ScienceDirect API](https://dev.elsevier.com/) to retrieve full-text articles in various formats (XML, text). 6 | 7 | *This tutorial content is intended to help facilitate academic research. Please check your institution for their Text and Data Mining or related License Agreement with Elsevier.* 8 | 9 | - **Documentation** 10 | - [ScienceDirect API](https://dev.elsevier.com/) 11 | - [ScienceDirect API Documentation](https://dev.elsevier.com/sd_api_spec.html) 12 | 13 | - **Terms** 14 | - [ScienceDirect API Terms of Use](https://dev.elsevier.com/api_key_settings.html) 15 | 16 | - **Data Reuse** 17 | - [Elsevier Text & Data Mining](https://dev.elsevier.com/tecdoc_text_mining.html) 18 | 19 | > **Note:** See your institution's rate limit in the [ScienceDirect API Terms of Use](https://dev.elsevier.com/api_key_settings.html). 20 | 21 | 22 | *These recipe examples were tested on February 7, 2025.* 23 | 24 | ## Setup 25 | 26 | ### Import Libraries 27 | 28 | ```r 29 | library(httr) 30 | ``` 31 | 32 | ### Import API Key 33 | 34 | An API key is required to access the ScienceDirect API. Registration is available on the [Elsevier developer portal](https://dev.elsevier.com/). The key is imported from an environment variable below: 35 | 36 | ```r 37 | myAPIKey <- Sys.getenv("sciencedirect_key") 38 | ``` 39 | 40 | ### Identifier Note 41 | 42 | We will use DOIs as the article identifiers. See our Crossref and Scopus API tutorials for workflows on how to create lists of DOIs and identfiers for specific searches and journals. The Elsevier ScienceDirect Article (Full-Text) API also accepts other identifiers like Scopus IDs and PubMed IDs (see API specification documents linked above). 43 | 44 | ## 1. Retrieve full-text XML of an article 45 | 46 | ```r 47 | # For XML download 48 | elsevier_url <- "https://api.elsevier.com/content/article/doi/" 49 | doi1 <- '10.1016/j.tetlet.2017.07.080' # Example Tetrahedron Letters article 50 | fulltext1 <- GET(paste0(elsevier_url, doi1, "?APIKey=", myAPIKey, "&httpAccept=text/xml")) 51 | 52 | # Save to file 53 | writeLines(content(fulltext1, "text"), "fulltext1.xml") 54 | ``` 55 | 56 | ## 2. Retrieve plain text of an article 57 | 58 | ```r 59 | # For simplified text download 60 | doi2 <- '10.1016/j.tetlet.2022.153680' # Example Tetrahedron Letters article 61 | fulltext2 <- GET(paste0(elsevier_url, doi2, "?APIKey=", myAPIKey, "&httpAccept=text/plain")) 62 | 63 | # Save to file 64 | writeLines(content(fulltext2, "text"), "fulltext2.txt") 65 | ``` 66 | 67 | ## 3. Retrieve full-text in a loop 68 | 69 | ```r 70 | # Make a list of 5 DOIs for testing 71 | dois <- c('10.1016/j.tetlet.2018.10.031', 72 | '10.1016/j.tetlet.2018.10.033', 73 | '10.1016/j.tetlet.2018.10.034', 74 | '10.1016/j.tetlet.2018.10.038', 75 | '10.1016/j.tetlet.2018.10.041') 76 | ``` 77 | 78 | ```r 79 | for (doi in dois) { 80 | article <- GET(paste0(elsevier_url, doi, "?APIKey=", myAPIKey, "&httpAccept=text/plain")) 81 | doi_name <- gsub("/", "_", doi) 82 | writeLines(content(article, "text"), paste0(doi_name, "_plain_text.txt")) 83 | Sys.sleep(1) 84 | } 85 | ``` 86 | -------------------------------------------------------------------------------- /src/r/sec-edgar.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: \...in R 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | # SEC EDGAR API in R 9 | 10 | by Adam M. 
Nguyen 11 | 12 | These recipe examples were tested on December 1, 2023. 13 | 14 | The U.S. Securities and Exchange Commission (SEC) allows free public access to documents filed by publicly traded companies in the Electronic Data Gathering, Analysis, and Retrieval (EDGAR) system. 15 | 16 | Please see the following resources for more information on API usage: 17 | 18 | ### Documentation 19 | - [SEC EDGAR](https://www.sec.gov/search-filings) 20 | - [SEC EDGAR API Documentation](https://www.sec.gov/search-filings/edgar-application-programming-interfaces) 21 | - [SEC EDGAR Search Assistance](https://www.sec.gov/search-filings/edgar-search-assistance/accessing-edgar-data) 22 | - [List of All CIKs](https://www.sec.gov/Archives/edgar/cik-lookup-data.txt) 23 | 24 | ### Terms of Use 25 | - [SEC Website Policies](https://www.sec.gov/privacy#security) 26 | 27 | ### Data Reuse 28 | - [SEC Website Dissemination Policy](https://www.sec.gov/about/privacy-information#dissemination) 29 | 30 | ***NOTE:*** Sending more than 10 requests per second will result in a temporary IP ban. 31 | 32 | ## Setup 33 | 34 | ### Load libraries 35 | 36 | Run the following lines of code to load the ‘httr’ and ‘jsonlite’ libraries. If you have not already installed them, first run ‘install.packages(c(‘httr’,’jsonlite’))’. 37 | 38 | ```r 39 | # Load necessary libraries 40 | library(httr) 41 | library(jsonlite) 42 | ``` 43 | 44 | ### User Info 45 | 46 | The SEC EDGAR API requires you to provide your name and email when sending requests. Simply edit the following variables with your information. 47 | 48 | ```r 49 | # Designate your user info 50 | firstName <- "First" 51 | lastName <- "Last" 52 | email <- "Email@email.com" 53 | ``` 54 | 55 | Alternatively, you can designate environment variables ([click here to see how](https://support.posit.co/hc/en-us/articles/360047157094-Managing-R-with-Rprofile-Renviron-Rprofile-site-Renviron-site-rsession-conf-and-repos-conf)) to access your user information. 56 | 57 | ```r 58 | # Here we simply use the 'Sys.getenv()' function to grab the variables, first, last, and email 59 | firstName <- Sys.getenv("first") 60 | lastName <- Sys.getenv("last") 61 | email <- Sys.getenv("email") 62 | ``` 63 | 64 | ### SEC EDGAR Data Installation 65 | 66 | In addition to the publicly available API, SEC EDGAR data can also be accessed via a bulk data download, which is compiled nightly. This approach is advantageous when working with large datasets, since it does not require making many individual API calls. However, it requires about 15 GB of storage to install and is more difficult to keep up to date. 67 | 68 | To access this data, download the companyfacts.zip file under the ‘Bulk data’ heading at the bottom of [this page](https://www.sec.gov/edgar/sec-api-documentation). 69 | 70 | ## 1. Obtaining Marketing Expenses for Amazon 71 | 72 | To access the data from an individual company, we must first obtain its Central Index Key (CIK) value. These values can be obtained by searching for a company [here](https://www.sec.gov/edgar/search/#). Alternatively, you can find a list of all companies and their CIK values [here](https://www.sec.gov/Archives/edgar/cik-lookup-data.txt). 73 | 74 | For this section of the guide, we’ll use Amazon (AMZN) as an example, which has a CIK of 0001018724.
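If you prefer to look the CIK up programmatically rather than by hand, one option is the SEC's `company_tickers.json` ticker-to-CIK mapping file. The sketch below is illustrative only: the file URL, its field names (`cik_str`, `ticker`), and the `lookup_cik()` helper are assumptions based on the publicly posted mapping file and are not part of this tutorial's workflow, so verify them against the current SEC documentation. It reuses the user info variables defined in the Setup section.

```r
# Illustrative sketch: look up a CIK by ticker symbol using the SEC's
# company_tickers.json mapping file (assumed URL and field names)
lookup_cik <- function(ticker) {
  tickers_url <- "https://www.sec.gov/files/company_tickers.json"
  response <- GET(tickers_url,
                  add_headers("User-agent" = paste0(firstName, ",", lastName, ", ", email)))
  tickers <- fromJSON(rawToChar(response$content))

  # Each entry is a small record with 'cik_str', 'ticker', and 'title' fields
  for (entry in tickers) {
    if (toupper(entry$ticker) == toupper(ticker)) {
      # data.sec.gov endpoints expect the CIK zero-padded to 10 digits
      return(sprintf("%010d", as.integer(entry$cik_str)))
    }
  }
  NA_character_
}

# Example (not run): lookup_cik("AMZN") should match the CIK used below, "0001018724"
```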
75 | 76 | With this CIK, we can now build a URL for the /companyfacts/ endpoint: 77 | 78 | 79 | ```r 80 | # Define the Amazon CIK (Central Index Key) for the SEC EDGAR database 81 | cik <- "0001018724" # Amazon.com Inc. 82 | 83 | # Define the URL for the SEC EDGAR API 84 | base_url <- paste0("https://data.sec.gov/api/xbrl/companyfacts/CIK",cik,".json") 85 | 86 | # Query SEC EDGAR API 87 | amzn_data <- fromJSON(rawToChar(GET(url = base_url, add_headers("User-agent" = paste0(firstName,",",lastName,", ",email)))$content)) 88 | 89 | # Let's check the name of the company of the data retrieved 90 | amzn_data$entityName 91 | ``` 92 | 93 | ``` 94 | ## [1] "AMAZON.COM, INC." 95 | ``` 96 | 97 | Now that we've retrieved the Amazon's data, let's examine their marketing expenses. 98 | 99 | 100 | ```r 101 | # Retrieve marketing expenses in USD 102 | marketing_expenses <- amzn_data$facts$`us-gaap`$MarketingExpense$units$USD 103 | 104 | # Filter through marketing expenses to retrieve one cumulative value per Fiscal Year 105 | marketing_expenses_FY <- marketing_expenses[marketing_expenses$fp=='FY',] 106 | marketing_expenses_FY <- marketing_expenses_FY[!is.na(marketing_expenses_FY$frame),] 107 | 108 | # Marketing Expenses per Fiscal Year 109 | marketing_expenses_FY[c('frame', 'val')] 110 | ``` 111 | 112 | ``` 113 | ## frame val 114 | ## 1 CY2007 3.4400e+08 115 | ## 7 CY2008 4.8200e+08 116 | ## 19 CY2009 6.8000e+08 117 | ## 32 CY2010 1.0290e+09 118 | ## 45 CY2011 1.6300e+09 119 | ## 58 CY2012 2.4080e+09 120 | ## 71 CY2013 3.1330e+09 121 | ## 84 CY2014 4.3320e+09 122 | ## 97 CY2015 5.2540e+09 123 | ## 110 CY2016 7.2330e+09 124 | ## 123 CY2017 1.0069e+10 125 | ## 136 CY2018 1.3814e+10 126 | ## 149 CY2019 1.8878e+10 127 | ## 162 CY2020 2.2008e+10 128 | ## 174 CY2021 3.2551e+10 129 | ## 185 CY2022 4.2238e+10 130 | ``` 131 | 132 | One may be interested in the cumulative sum of the expenses over the years. 133 | 134 | 135 | ```r 136 | # Cumulative sum of marketing expenses over the years 137 | total_marketing_expenses <- sum(marketing_expenses_FY$val) 138 | 139 | # Let's take a look 140 | paste0("Amazon's Total Marketing Expenses: ", total_marketing_expenses, ' USD') 141 | ``` 142 | 143 | ``` 144 | ## [1] "Amazon's Total Marketing Expenses: 1.66083e+11 USD" 145 | ``` 146 | 147 | ### Marketing Expenses Visualization 148 | 149 | Rather than calculating the total marketing expenses documented in the API, let's visualize the marketing expenses by fiscal year using a box plot. 150 | 151 | ```r 152 | # Plot marketing expenses by fiscal year 153 | library(ggplot2) 154 | ggplot(data = marketing_expenses_FY, aes(x = as.numeric(substr(marketing_expenses_FY$frame, 3,6)), y = val))+ 155 | geom_bar(stat = "identity", fill = "#bcbddc", color = "black") + 156 | labs(x = "Fiscal Year", y = "Marketing Expenses (USD)", title = "Marketing Expenses by Fiscal Year") 157 | ``` 158 | 159 | ![](SEC_EDGAR_API_R_files/figure-html/unnamed-chunk-5-1.png) 160 | 161 | ## 2. Number of Shares Outstanding for Tesla 162 | 163 | For another use case, let’s look at the number of shares outstanding for Tesla, which the SEC defines as “Number of shares of common stock outstanding. Common stock represent the ownership interest in a corporation.” Much of the process is conveniently similar. 164 | 165 | 166 | ```r 167 | # Define the Tesla CIK 168 | cik <- "0001318605" # Tesla Inc. 
169 | 170 | # Define the URL for the SEC EDGAR API 171 | base_url <- paste0("https://data.sec.gov/api/xbrl/companyfacts/CIK",cik,".json") 172 | 173 | # Query API 174 | tesla_data <- fromJSON(rawToChar(GET(url = base_url, add_headers("User-agent" = paste0(firstName,",",lastName,", ",email)))$content)) 175 | 176 | # Check the name of the company of the data retrieved 177 | tesla_data$entityName 178 | ``` 179 | 180 | ``` 181 | ## [1] "Tesla, Inc." 182 | ``` 183 | 184 | ```r 185 | # Retrieve Shares Outstanding 186 | shares_outstanding <- tesla_data$facts$`us-gaap`$CommonStockSharesOutstanding$units$shares 187 | 188 | # Filter through marketing expenses to retrieve one cumulative value per Fiscal Year 189 | shares_outstanding_FY <- shares_outstanding[shares_outstanding$fp=='FY',] 190 | shares_outstanding_FY <- shares_outstanding_FY[!is.na(shares_outstanding_FY$frame),] 191 | ``` 192 | 193 | Let's see the FY and the corresponding value of shares outstanding 194 | 195 | 196 | ```r 197 | cbind(shares_outstanding_FY$fy,shares_outstanding_FY$val) 198 | ``` 199 | 200 | ``` 201 | ## [,1] [,2] 202 | ## [1,] 2011 94908370 203 | ## [2,] 2012 104530305 204 | ## [3,] 2013 114214274 205 | ## [4,] 2014 123090990 206 | ## [5,] 2015 125688000 207 | ## [6,] 2016 131425000 208 | ## [7,] 2017 161561000 209 | ## [8,] 2018 168797000 210 | ## [9,] 2019 173000000 211 | ## [10,] 2020 905000000 212 | ## [11,] 2021 960000000 213 | ## [12,] 2022 3100000000 214 | ``` 215 | ## 3. Comparing Total Assets of All Filing Companies 216 | 217 | The SEC EDGAR API also has an endpoint called /frames/ that returns the data from all companies for a given category and filing period. In this example, we’ll look at the total assets of all companies reported for Q1 2023. 218 | 219 | 220 | ```r 221 | # Specify query parameters 222 | category <- "Assets/USD" 223 | year <- "2023" 224 | quarter <- "1" 225 | 226 | # Define URL 227 | base_url <- paste0('https://data.sec.gov/api/xbrl/frames/us-gaap/',category,'/CY',year,'Q',quarter,'I.json') 228 | 229 | # Query API 230 | asset_data <- fromJSON(rawToChar(GET(url = base_url, add_headers("User-agent" = paste0(firstName,",",lastName,", ",email)))$content))$data 231 | 232 | # For this usecase we are only interested in the 'entityName' and 'val' columns so let's subset 233 | asset_data <- as.data.frame(cbind(asset_data$entityName, asset_data$val)) 234 | 235 | # Rename columns 236 | colnames(asset_data) <- c('Company', 'totalAssets') 237 | 238 | # Coerce the 'totalAssets' column to numeric 239 | asset_data$totalAssets <- as.numeric(asset_data$totalAssets) 240 | 241 | # Let's see how many entries were retrieved 242 | nrow(asset_data) 243 | ``` 244 | 245 | ``` 246 | ## [1] 6220 247 | ``` 248 | 249 | ```r 250 | # We can also see the structure of the data retrieved using the 'str()' function 251 | str(asset_data) 252 | ``` 253 | 254 | ``` 255 | ## 'data.frame': 6220 obs. of 2 variables: 256 | ## $ Company : chr "AAR CORP" "ABBOTT LABORATORIES" "WORLDS INC." "ACME UNITED CORP" ... 257 | ## $ totalAssets: num 1.67e+09 7.38e+10 8.07e+04 1.57e+08 3.72e+08 ... 258 | ``` 259 | 260 | ```r 261 | # Finally, let's see the first few entries of asset_data 262 | head(asset_data) 263 | ``` 264 | 265 | ``` 266 | ## Company totalAssets 267 | ## 1 AAR CORP 1673300000 268 | ## 2 ABBOTT LABORATORIES 73794000000 269 | ## 3 WORLDS INC. 80675 270 | ## 4 ACME UNITED CORP 157468000 271 | ## 5 ADAMS RESOURCES & ENERGY, INC. 
371563000 272 | ## 6 BK TECHNOLOGIES CORPORATION 50758000 273 | ``` 274 | 275 | ### Export to CSV 276 | 277 | Commonly users may want to export data into a comma seperated file (.csv), this may be achieved as follows: 278 | 279 | 280 | ```r 281 | # Export as a csv 282 | write.csv(asset_data, file = paste0('companies_by_total_assets_q',quarter,'_',year,'.csv')) 283 | ``` 284 | 285 | ### Total Assets of All Companies Histogram 286 | 287 | Since the total assets of all companies is a dataset that ranges from values as low as zero to those as large as 4.3 trillion, these values must be graphed logarithmically. Below, we take the log10 of the 'totalAssets' column, luckily R makes this very easy for us. 288 | 289 | 290 | ```r 291 | # Load the ggplot2 library 292 | library(ggplot2) 293 | 294 | # Plot Histogram of totalAssets with log10 transformation 295 | ggplot(asset_data, aes(x = log10(totalAssets))) + 296 | geom_histogram(bins = (10%%max(asset_data$totalAssets) +3), fill = "#756bb1", color = "black") + 297 | labs(title = "Companies by Total Assets Reported for Q1 2023 (Logarithmic)", 298 | x = "Assets (in 10^n USD)", 299 | y = "Number of Companies") 300 | ``` 301 | 302 | ``` 303 | ## Warning: Removed 30 rows containing non-finite values (`stat_bin()`). 304 | ``` 305 | 306 | ![](SEC_EDGAR_API_R_files/figure-html/unnamed-chunk-10-1.png) 307 | 308 | ## 4. Finding the Top 500 Companies by Revenue 309 | 310 | The Fortune 500 is a ranking of the top 500 companies by revenue, according to the data filed in their 10-K or a comparable form. In this example, we’ll look at only the revenues reported in the 10-K forms to construct a similar ranking of U.S. companies by revenue. 311 | 312 | 313 | ```r 314 | # Define query and parameters 315 | category <- 'Revenues/USD' 316 | year <- '2022' 317 | url <- paste0('https://data.sec.gov/api/xbrl/frames/us-gaap/',category,'/CY',year,'.json') 318 | 319 | # Query API 320 | data_retrieved <- fromJSON(rawToChar(GET(url = url, add_headers("User-agent" = paste0(firstName,",",lastName,", ",email)))$content))$data 321 | 322 | # Display number of results 323 | nrow(data_retrieved) 324 | ``` 325 | 326 | ``` 327 | ## [1] 2433 328 | ``` 329 | 330 | ```r 331 | # Grab only first 500 highest revenues 332 | top500_revenues <- head(data_retrieved[order(-data_retrieved$val), c('entityName', 'val')], n = 500) 333 | 334 | # Let's see the first 10 entries in the top500_revenues 335 | head(top500_revenues, n = 10) 336 | ``` 337 | 338 | ``` 339 | ## entityName val 340 | ## 214 WALMART INC. 6.11289e+11 341 | ## 72 Exxon Mobil Corporation 4.13680e+11 342 | ## 320 UnitedHealth Group Incorporated 3.24162e+11 343 | ## 128 CVS HEALTH CORP 3.22467e+11 344 | ## 776 BERKSHIRE HATHAWAY INC 3.02089e+11 345 | ## 188 Chevron Corp 2.46252e+11 346 | ## 909 CENCORA, INC. 2.38587e+11 347 | ## 562 COSTCO WHOLESALE CORP /NEW 2.26954e+11 348 | ## 294 Cardinal Health, Inc. 1.81364e+11 349 | ## 1992 The Cigna Group 1.80516e+11 350 | ``` 351 | 352 | ### Export to CSV 353 | 354 | 355 | ```r 356 | # Export to csv 357 | write.csv(top500_revenues, file = paste0('top_500_companies_by_revenue_fy',year,'.csv')) 358 | ``` 359 | -------------------------------------------------------------------------------- /src/r/usa-spending.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: \...in R 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | # USAspending API in R 9 | 10 | by Adam M. 
Nguyen 11 | 12 | Please see the following resources for more information on API usage: 13 | 14 | ### Documentation 15 | - [USAspending Website](https://www.usaspending.gov/) 16 | - [USAspending Documentation](https://api.usaspending.gov/) 17 | - [USAspending API](https://github.com/fedspendingtransparency/usaspending-api) 18 | 19 | ### Data Reuse 20 | - [USAspending Data Reuse](https://www.usaspending.gov/about#about-licensing) 21 | 22 | 23 | These recipe examples were tested on December 1, 2023. 24 | 25 | 26 | ## Setup 27 | 28 | Run the following lines of code to load the libraries ‘httr’ and ‘jsonlite’. If you have not done so already, additionally, before the ‘library()’ functions, run ‘install.packages(c(‘httr’,’jsonlite’))’. 29 | 30 | ```r 31 | library(httr) 32 | library(jsonlite) 33 | ``` 34 | ## 1. Get Agency Names and Toptier Codes 35 | 36 | To obtain data from the API, it'll be useful to have an object we can reference agency names and their toptier codes, the latter of which will be used to access subagency data. 37 | 38 | ```r 39 | # Set base url for API 40 | base_url <- 'https://api.usaspending.gov' 41 | 42 | # Define URL to obtain agency names and codes 43 | toptier_agencies_url <- paste0(base_url,'/api/v2/references/toptier_agencies/') 44 | 45 | # Query API using prepared URL and grab the results 46 | toptier_data <- fromJSON(rawToChar(GET(toptier_agencies_url)$content))$results 47 | 48 | # Let's check the first entry 49 | head(toptier_data, n=1) 50 | ``` 51 | 52 | ``` 53 | ## agency_id toptier_code abbreviation agency_name 54 | ## 1 1146 310 USAB Access Board 55 | ## congressional_justification_url active_fy active_fq outlay_amount 56 | ## 1 https://www.access-board.gov/cj 2023 4 9232761 57 | ## obligated_amount budget_authority_amount 58 | ## 1 8863661 11366459 59 | ## current_total_budget_authority_amount percentage_of_total_budget_authority 60 | ## 1 1.188986e+13 9.559789e-07 61 | ## agency_slug 62 | ## 1 access-board 63 | ``` 64 | 65 | ```r 66 | # Show total number agencies in data 67 | nrow(toptier_data) 68 | ``` 69 | 70 | ``` 71 | ## [1] 108 72 | ``` 73 | Now we can create a reference for agencies and their toptier codes, we call 'toptier_codes'. 74 | 75 | ```r 76 | toptier_codes <- toptier_data[c("agency_name", "toptier_code")] 77 | # Let's see the first 10 agencies and their toptier codes 78 | head(toptier_codes,n=10) 79 | ``` 80 | 81 | ``` 82 | ## agency_name 83 | ## 1 Access Board 84 | ## 2 Administrative Conference of the U.S. 85 | ## 3 Advisory Council on Historic Preservation 86 | ## 4 African Development Foundation 87 | ## 5 Agency for International Development 88 | ## 6 American Battle Monuments Commission 89 | ## 7 Appalachian Regional Commission 90 | ## 8 Armed Forces Retirement Home 91 | ## 9 Barry Goldwater Scholarship and Excellence In Education Foundation 92 | ## 10 Commission for the Preservation of America's Heritage Abroad 93 | ## toptier_code 94 | ## 1 310 95 | ## 2 302 96 | ## 3 306 97 | ## 4 166 98 | ## 5 072 99 | ## 6 074 100 | ## 7 309 101 | ## 8 084 102 | ## 9 313 103 | ## 10 321 104 | ``` 105 | Finally, let's test the data frame, 'toptier_codes', by obtaining the toptier code of an agency. 106 | 107 | 108 | ```r 109 | # Look up toptier code of specific agency, in this case Department of Transportation 110 | toptier_codes$toptier_code[toptier_codes$agency_name == "Department of Transportation"] 111 | ``` 112 | 113 | ``` 114 | ## [1] "069" 115 | ``` 116 | With these codes we can access subagency data. 117 | 118 | 119 | ## 2. 
Retrieving Data from Subagencies 120 | 121 | The 'toptier_codes' data frame we created contains every agency name in the USA Spending API. For this example we'll look at the total obligations of each subagency of the Department of Defense. 122 | 123 | 124 | ```r 125 | # Designate Desired Agency 126 | desired_agency_name <- 'Department of Defense' 127 | 128 | # Find toptier code 129 | desired_toptier_code <- toptier_codes$toptier_code[toptier_codes$agency_name == desired_agency_name] 130 | 131 | # Create URL to Query 132 | subagency_url <- paste0(base_url, '/api/v2/agency/', desired_toptier_code, '/sub_agency/?fiscal_year=2023') 133 | 134 | # Query API and grab Results 135 | subagency_data <- fromJSON(rawToChar(GET(subagency_url)$content))$results 136 | ``` 137 | ### Visualization: Pie Chart 138 | Let's try making a pie chart to visualize our data. Additionally, we will group the last four sub agencies to relieve clutter. 139 | 140 | 141 | ```r 142 | # Select Categories we'd like to collect into 'Other' 143 | last_four_rows <- tail(subagency_data, 4) 144 | 145 | # R is funny so we create a "better" as numeric function 146 | as_numeric_with_na <- function(x) { 147 | as.numeric(as.character(x)) 148 | } 149 | 150 | # Convert last four rows to numeric 151 | last_four_rows[, -1] <- lapply(last_four_rows[, -1], as_numeric_with_na) 152 | 153 | # Sum last four rows 154 | summed_values <- colSums(last_four_rows[, -1], na.rm = TRUE) 155 | 156 | # Collect summed values into "other_row" 157 | other_row <- c("other", as.character(summed_values)) 158 | 159 | # Remove last four rows 160 | subagency_data_removed <- head(subagency_data, -4) 161 | 162 | # Attach new "other_row" and rename it to 'Other' 163 | subagency_data_other <- rbind(subagency_data_removed,other_row) 164 | subagency_data_other$name[7] <- 'Other' 165 | 166 | # Make more fancy Colors 167 | custom_colors <- rainbow(length(subagency_data_other$total_obligations)) 168 | 169 | # Make new and improved pie chart 170 | pie(as.numeric(subagency_data_other$total_obligations), labels = paste0(subagency_data_other$abbreviation," (",round(100*as.numeric(subagency_data_other$total_obligations)/sum(as.numeric(subagency_data_other$total_obligations)),digits = 3),"%)"), main = "Subagency Obligations of the Department of Defense", col = custom_colors) 171 | 172 | # Make new and improved legend 173 | legend("topright", legend = subagency_data_other$abbreviation, fill = custom_colors) 174 | ``` 175 | 176 | ![](USA_Spending_R_files/figure-html/unnamed-chunk-4-1.png) 177 | 178 | 179 | ## 3. Acessing Fiscal Data Per Year 180 | 181 | Using the USA Spending API, we can also examine the annual budget of an agency 2017 and onward. 
182 | 183 | ```r 184 | # Specify Agency 185 | desired_agency_name <- "Department of Health and Human Services" 186 | 187 | # Store toptier code of specified agency using 'toptier_codes' df 188 | desired_toptier_code <- toptier_codes$toptier_code[toptier_codes$agency_name == desired_agency_name] 189 | 190 | # Create URL for accessing budgetary resources of specified agency 191 | budgetary_resources_url <- paste0(base_url,'/api/v2/agency/',desired_toptier_code,'/budgetary_resources/') 192 | 193 | # Query API 194 | budgetary_resources_data <- fromJSON(rawToChar(GET(budgetary_resources_url)$content))$agency_data_by_year 195 | 196 | # Format Collected data into a dataframe containing the Fiscal Year and Total Obligated 197 | budget_by_year <- as.data.frame(cbind('Year'=tail(budgetary_resources_data, n=6)$fiscal_year,'Total_Obligated'=tail(budgetary_resources_data, n=6)$agency_total_obligated)) # We use the tail function to select only the last 6 years in the dataframe, because 2023 does not contain the entire annual budget as of the time of writing 198 | ``` 199 | 200 | We can now use ggplot2 to create a bar chart for the collected budgetary data. 201 | 202 | 203 | ```r 204 | # Load ggplot2 library 205 | library(ggplot2) 206 | 207 | # Create Barplot of Total Budgetary Resources by Fiscal Year 208 | p <- ggplot(data = budget_by_year, aes(x = Year, y = Total_Obligated)) 209 | p + geom_bar(stat = "identity", fill = "plum") + 210 | labs(title = "Department of Health and Human Services Budgetary Resources", x = "Fiscal Year", y = "Total Budgetary Resources") + 211 | theme_minimal() 212 | ``` 213 | 214 | ![](USA_Spending_R_files/figure-html/unnamed-chunk-6-1.png) 215 | 216 | ## 4. Breaking Down Budget Categories 217 | 218 | The API can also be used to view the spending breakdown of a specific agency 219 | 220 | ```r 221 | # Specify Agency 222 | desired_agency_name <- "Department of the Interior" 223 | 224 | # Store toptier code of specified agency 225 | desired_toptier_code <- toptier_codes$toptier_code[toptier_codes$agency_name == desired_agency_name] 226 | 227 | # Store URL to view agency's spending breakdown 228 | obligations_by_category_url <- paste0(base_url,"/api/v2/agency/",desired_toptier_code, "/obligations_by_award_category/?fiscal_year=2023") 229 | 230 | # Query API 231 | obligations_by_category_data <- fromJSON(rawToChar(GET(obligations_by_category_url)$content)) 232 | 233 | # Select the total aggregated obligations for this particular agency 234 | total_aggregated_amount <- obligations_by_category_data$total_aggregated_amount 235 | 236 | # Store results of query 237 | obligations_by_category_data <- obligations_by_category_data$results 238 | obligations_by_category_data 239 | ``` 240 | 241 | ``` 242 | ## category aggregated_amount 243 | ## 1 contracts 7811857503 244 | ## 2 direct_payments 3311940758 245 | ## 3 grants 7198549492 246 | ## 4 idvs 3580836 247 | ## 5 loans 0 248 | ## 6 other 335594193 249 | ``` 250 | 251 | ```r 252 | # Let's remove the categories where 'aggregated_amount' = 0 253 | budget_breakdown <-obligations_by_category_data[obligations_by_category_data$aggregated_amount>0,] 254 | budget_breakdown 255 | ``` 256 | 257 | ``` 258 | ## category aggregated_amount 259 | ## 1 contracts 7811857503 260 | ## 2 direct_payments 3311940758 261 | ## 3 grants 7198549492 262 | ## 4 idvs 3580836 263 | ## 6 other 335594193 264 | ``` 265 | Similar to the previous example, let's create a bar chart to visualize this data. 
266 | 267 | ```r 268 | # Sort 'budget_breakdown' from greatest to least 'aggregated_amount' 269 | budget_breakdown_sorted <- budget_breakdown[order(-budget_breakdown$aggregated_amount), ] 270 | 271 | # Create bar chart using ggplot2 272 | ggplot(data = budget_breakdown_sorted, aes(x = reorder(category, -aggregated_amount), y = aggregated_amount)) + 273 | geom_bar(stat = "identity", fill = "plum") + 274 | labs(title = "Department of the Interior Budget Breakdown", 275 | x = "Category", 276 | y = "Aggregated Amount (USD)") + 277 | theme_minimal() + 278 | geom_text(aes(label = paste0(round(aggregated_amount / sum(budget_breakdown_sorted$aggregated_amount) * 100, 1), "%"), vjust = -0.5), size = 3) 279 | ``` 280 | 281 | ![](USA_Spending_R_files/figure-html/unnamed-chunk-9-1.png) 282 | -------------------------------------------------------------------------------- /src/r/wiley-tdm.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "wiley-tdm" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | # Wiley Text and Data Mining (TDM) in R 9 | 10 | by Michael T. Moen 11 | 12 | This tutorial is designed to support academic research. Please consult your institution’s library or legal office regarding its Text and Data Mining license agreement with Wiley. 13 | 14 | ### Documentation 15 | - [Wiley Text and Data Mining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) 16 | 17 | ### Terms of Use 18 | - [Wiley Text and Data Mining Agreement](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining#accordionHeader-3) 19 | 20 | ### Data Reuse 21 | - [Service Name] Data Reuse *(link to be provided by the service)* 22 | 23 | *These recipe examples were tested on February 12, 2025.* 24 | 25 | **_NOTE:_** The Wiley TDM API limits requests to a maximum of 3 requests per second. 26 | 27 | ## Setup 28 | 29 | ### Import Libraries 30 | 31 | This tutorial uses the following libraries: 32 | 33 | 34 | ``` r 35 | library(httr) 36 | ``` 37 | 38 | ### Text and Data Mining Token 39 | 40 | A token is required to access the Wiley TDM API. Sign up can be found [here](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining#accordionHeader-2). Import your token below: 41 | 42 | 43 | ``` r 44 | wiley_token <- Sys.getenv("wiley_token") 45 | 46 | # The token will be sent as a header in the API calls 47 | headers <- add_headers("Wiley-TDM-Client-Token" = wiley_token) 48 | ``` 49 | 50 | ## 1. Retrieve full-text of an article 51 | 52 | The Wiley TDM API returns the full-text of an article as a PDF when given the article's DOI. 53 | 54 | In the first example, we download the full-text of the article with the DOI "10.1002/net.22207". This article was found on the Wiley Online Library. 55 | 56 | 57 | ``` r 58 | # DOI to download 59 | doi <- "10.1002/net.22207" 60 | url <- paste0("https://api.wiley.com/onlinelibrary/tdm/v1/articles/", doi) 61 | 62 | response <- GET(url, headers) 63 | 64 | if (status_code(response) == 200) { 65 | # Download if status code indicates success 66 | filename <- paste0(gsub("/", "_", doi), ".pdf") 67 | writeBin(content(response, "raw"), filename) 68 | cat(paste0(filename, " downloaded successfully\n")) 69 | 70 | } else { 71 | # Print status code if unsuccessful 72 | cat(paste0("Failed to download PDF. Status code: ", status_code(response), "\n")) 73 | } 74 | ``` 75 | 76 | ``` 77 | ## 10.1002_net.22207.pdf downloaded successfully 78 | ``` 79 | 80 | ## 2. 
Retrieve full-text of multiple articles 81 | 82 | In this example, we download 5 articles found in the Wiley Online Library: 83 | 84 | 85 | ``` r 86 | # DOIs of articles to download 87 | dois <- c( 88 | "10.1111/j.1467-8624.2010.01564.x", 89 | "10.1111/1467-8624.00164", 90 | "10.1111/cdev.12864", 91 | "10.1111/j.1467-8624.2007.00995.x", 92 | "10.1111/j.1467-8624.2010.01499.x", 93 | "10.1111/j.1467-8624.2010.0149.x" # Invalid DOI, will throw error 94 | ) 95 | 96 | # Loop through DOIs and download each article 97 | for (doi in dois) { 98 | url <- paste0("https://api.wiley.com/onlinelibrary/tdm/v1/articles/", doi) 99 | response <- GET(url, headers) 100 | 101 | if (status_code(response) == 200) { 102 | # Download if status code indicates success 103 | filename <- paste0(gsub("/", "_", doi), ".pdf") 104 | writeBin(content(response, "raw"), filename) 105 | cat(paste0(filename, " downloaded successfully\n")) 106 | 107 | } else { 108 | # Print status code if unsuccessful 109 | cat(paste0("Failed to download PDF. Status code: ", status_code(response), "\n")) 110 | } 111 | 112 | # Wait 1 second to be nice to Wiley's servers 113 | Sys.sleep(1) 114 | } 115 | ``` 116 | 117 | ``` 118 | ## 10.1111_j.1467-8624.2010.01564.x.pdf downloaded successfully 119 | ## 10.1111_1467-8624.00164.pdf downloaded successfully 120 | ## 10.1111_cdev.12864.pdf downloaded successfully 121 | ## 10.1111_j.1467-8624.2007.00995.x.pdf downloaded successfully 122 | ## 10.1111_j.1467-8624.2010.01499.x.pdf downloaded successfully 123 | ## Failed to download PDF. Status code: 404 124 | ``` 125 | -------------------------------------------------------------------------------- /src/r/world-bank.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: \...in R 3 | output: 4 | html_document: 5 | keep_md: TRUE 6 | --- 7 | 8 | # World Bank API in R 9 | 10 | by Vishank Patel and Adam M. Nguyen 11 | 12 | The World Bank offers a suite of APIs providing access to a vast array of global development data, including economic indicators, project information, and more. These APIs enable users to programmatically retrieve data for analysis, application development, and research purposes 13 | 14 | ### Documentation 15 | - [World Bank Indicators API Documentation](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation) 16 | - [World Bank Data Catalog API Documentation](https://datahelpdesk.worldbank.org/knowledgebase/articles/1886698-data-catalog-api) 17 | - [World Bank Data Catalog](https://data.worldbank.org/) 18 | - [World Bank Development Best Practices](https://datahelpdesk.worldbank.org/knowledgebase/articles/902064-development-best-practices) 19 | 20 | ### Terms of Use 21 | - [World Bank Terms of Use](https://www.worldbank.org/en/about/legal/terms-and-conditions) 22 | - [World Bank Summary Terms of Use](https://data.worldbank.org/summary-terms-of-use) 23 | 24 | ### Data Reuse and Licensing 25 | - [World Bank Data Licensing and Terms of Use](https://www.worldbank.org/en/about/legal/terms-of-use-for-datasets) 26 | 27 | These recipe examples were tested on March 24, 2023. 28 | 29 | ## Setup 30 | 31 | ```r 32 | # Load Packages 33 | library(tidyverse) #ggplot2 34 | library(dplyr) #tibbles 35 | library(purrr) #turning into character 36 | library(httr) #GET() 37 | library(jsonlite) #converting to JSON 38 | 39 | # define root World Bank API 40 | urlRoot <- "https://api.worldbank.org/v2/" 41 | ``` 42 | 43 | ## 1. 
Get list of country iso2Codes and names 44 | 45 | For obtaining data from the World Bank API, it is helpful to first obtain a list of country codes and names. 46 | 47 | 48 | ```r 49 | countryURL <- paste0(urlRoot,"country?format=json&per_page=500") # Create URL we are querying 50 | 51 | raw_country_data <- GET(countryURL) # Use 'GET()' to retrieve info 52 | prelim_country_data <- # Reading Data to R 53 | fromJSON( # Converts JSON data to R objects 54 | rawToChar(raw_country_data$content), flatten = TRUE) # Reads raw 8 bit data to chars 55 | # To view try 'view(prelim_country_data)' 56 | country_data <- prelim_country_data[[2]] # Retrieve only country data frame 57 | country_data[1:10,1:5] # Display first 5 features of first 10 countries from country_data 58 | ``` 59 | 60 | ``` 61 | ## id iso2Code name capitalCity longitude 62 | ## 1 ABW AW Aruba Oranjestad -70.0167 63 | ## 2 AFE ZH Africa Eastern and Southern 64 | ## 3 AFG AF Afghanistan Kabul 69.1761 65 | ## 4 AFR A9 Africa 66 | ## 5 AFW ZI Africa Western and Central 67 | ## 6 AGO AO Angola Luanda 13.242 68 | ## 7 ALB AL Albania Tirane 19.8172 69 | ## 8 AND AD Andorra Andorra la Vella 1.5218 70 | ## 9 ARB 1A Arab World 71 | ## 10 ARE AE United Arab Emirates Abu Dhabi 54.3705 72 | ``` 73 | ### Extract Country Codes 74 | 75 | 76 | ```r 77 | countryIso2Code <- as.list(country_data$iso2Code) # Extract iso2Codes 78 | length(countryIso2Code)# Display Length 79 | ``` 80 | 81 | ``` 82 | ## [1] 299 83 | ``` 84 | 85 | ```r 86 | head(countryIso2Code,n=10) # Display first 10 87 | ``` 88 | 89 | ``` 90 | ## [[1]] 91 | ## [1] "AW" 92 | ## 93 | ## [[2]] 94 | ## [1] "ZH" 95 | ## 96 | ## [[3]] 97 | ## [1] "AF" 98 | ## 99 | ## [[4]] 100 | ## [1] "A9" 101 | ## 102 | ## [[5]] 103 | ## [1] "ZI" 104 | ## 105 | ## [[6]] 106 | ## [1] "AO" 107 | ## 108 | ## [[7]] 109 | ## [1] "AL" 110 | ## 111 | ## [[8]] 112 | ## [1] "AD" 113 | ## 114 | ## [[9]] 115 | ## [1] "1A" 116 | ## 117 | ## [[10]] 118 | ## [1] "AE" 119 | ``` 120 | 121 | ### Extract country names 122 | 123 | 124 | ```r 125 | countryName <- as.list(country_data$name) # Extract Country Names 126 | length(countryName)# Display Length 127 | ``` 128 | 129 | ``` 130 | ## [1] 299 131 | ``` 132 | 133 | ```r 134 | head(countryName,n=10) # Display first 10 Country names 135 | ``` 136 | 137 | ``` 138 | ## [[1]] 139 | ## [1] "Aruba" 140 | ## 141 | ## [[2]] 142 | ## [1] "Africa Eastern and Southern" 143 | ## 144 | ## [[3]] 145 | ## [1] "Afghanistan" 146 | ## 147 | ## [[4]] 148 | ## [1] "Africa" 149 | ## 150 | ## [[5]] 151 | ## [1] "Africa Western and Central" 152 | ## 153 | ## [[6]] 154 | ## [1] "Angola" 155 | ## 156 | ## [[7]] 157 | ## [1] "Albania" 158 | ## 159 | ## [[8]] 160 | ## [1] "Andorra" 161 | ## 162 | ## [[9]] 163 | ## [1] "Arab World" 164 | ## 165 | ## [[10]] 166 | ## [1] "United Arab Emirates" 167 | ``` 168 | 169 | ### Store Country Codes and Names together 170 | 171 | ```r 172 | countryIso2CodeName <- transpose(list(countryIso2Code,countryName)) 173 | length(countryIso2CodeName)# Display Length 174 | ``` 175 | 176 | ``` 177 | ## [1] 299 178 | ``` 179 | 180 | ```r 181 | head(countryIso2CodeName, n=10) 182 | ``` 183 | 184 | ``` 185 | ## [[1]] 186 | ## [[1]][[1]] 187 | ## [1] "AW" 188 | ## 189 | ## [[1]][[2]] 190 | ## [1] "Aruba" 191 | ## 192 | ## 193 | ## [[2]] 194 | ## [[2]][[1]] 195 | ## [1] "ZH" 196 | ## 197 | ## [[2]][[2]] 198 | ## [1] "Africa Eastern and Southern" 199 | ## 200 | ## 201 | ## [[3]] 202 | ## [[3]][[1]] 203 | ## [1] "AF" 204 | ## 205 | ## [[3]][[2]] 206 | ## [1] "Afghanistan" 207 | ## 208 | 
## 209 | ## [[4]] 210 | ## [[4]][[1]] 211 | ## [1] "A9" 212 | ## 213 | ## [[4]][[2]] 214 | ## [1] "Africa" 215 | ## 216 | ## 217 | ## [[5]] 218 | ## [[5]][[1]] 219 | ## [1] "ZI" 220 | ## 221 | ## [[5]][[2]] 222 | ## [1] "Africa Western and Central" 223 | ## 224 | ## 225 | ## [[6]] 226 | ## [[6]][[1]] 227 | ## [1] "AO" 228 | ## 229 | ## [[6]][[2]] 230 | ## [1] "Angola" 231 | ## 232 | ## 233 | ## [[7]] 234 | ## [[7]][[1]] 235 | ## [1] "AL" 236 | ## 237 | ## [[7]][[2]] 238 | ## [1] "Albania" 239 | ## 240 | ## 241 | ## [[8]] 242 | ## [[8]][[1]] 243 | ## [1] "AD" 244 | ## 245 | ## [[8]][[2]] 246 | ## [1] "Andorra" 247 | ## 248 | ## 249 | ## [[9]] 250 | ## [[9]][[1]] 251 | ## [1] "1A" 252 | ## 253 | ## [[9]][[2]] 254 | ## [1] "Arab World" 255 | ## 256 | ## 257 | ## [[10]] 258 | ## [[10]][[1]] 259 | ## [1] "AE" 260 | ## 261 | ## [[10]][[2]] 262 | ## [1] "United Arab Emirates" 263 | ``` 264 | 265 | Now we know the country iso2codes, which we can use to pull specific indicator data for countries. 266 | 267 | ## 2. Compile a Custom Indicator Dataset 268 | 269 | There are many availabe indicators: 270 | 271 | We wll select three indicators for this example: 272 | 273 | 1. Scientific and Technical Journal Article Data = [IP.JRN.ARTC.SC](https://data.worldbank.org/indicator/IP.JRN.ARTC.SC?view=chart) 274 | 275 | 2. Patent Applications, residents = [IP.PAT.RESD](https://data.worldbank.org/indicator/IP.PAT.RESD?view=chart) 276 | 277 | 3. GDP per capita (current US\$) Code = [NY.GDP.PCAP.CD](https://data.worldbank.org/indicator/NY.GDP.PCAP.CD?view=chart) 278 | 279 | Note that these three selected indictaors have a [CC-BY 4.0 license](https://datacatalog.worldbank.org/public-licenses#cc-by). We will compile this indicator data for the United States (US) and United Kingdom (GB) 280 | 281 | 282 | ```r 283 | indicators <- list("IP.JRN.ARTC.SC", "IP.PAT.RESD", "NY.GDP.PCAP.CD") 284 | ``` 285 | 286 | ### United States (US) 287 | 288 | #### Generate the web API URLs we need for U.S.: 289 | 290 | 291 | ```r 292 | # Create an Empty List 293 | us_api_url <- c() 294 | 295 | # Iterate through each indicator, appending to the base URL, creating a list of unique URLs 296 | for (indicator in indicators) { 297 | us_api_url <- append(x = us_api_url, 298 | values = paste(urlRoot,"country/US/indicator/",indicator,"?format=json&per_page=500",sep = "")) 299 | } 300 | # Display URLs 301 | us_api_url 302 | ``` 303 | 304 | ``` 305 | ## [1] "https://api.worldbank.org/v2/country/US/indicator/IP.JRN.ARTC.SC?format=json&per_page=500" 306 | ## [2] "https://api.worldbank.org/v2/country/US/indicator/IP.PAT.RESD?format=json&per_page=500" 307 | ## [3] "https://api.worldbank.org/v2/country/US/indicator/NY.GDP.PCAP.CD?format=json&per_page=500" 308 | ``` 309 | 310 | #### Retrieving Data 311 | 312 | 313 | ```r 314 | # Create an empty list for Indicator Data to be stored 315 | us_indicator_data <- list() 316 | # Iterate through URLs to collect and reformat data into lists 317 | for (url in us_api_url) { 318 | temp_data <- tibble(fromJSON(rawToChar(GET(url)$content), flatten = TRUE)[[2]]) 319 | us_indicator_data <- append(us_indicator_data,list(temp_data)) #making a list of tibbles 320 | } 321 | ``` 322 | #### Extracting Data 323 | 324 | 325 | ```r 326 | us_journal_data <- us_indicator_data[[1]][,c("date","value")] #the first element in us_indicator_data is regarding journal publications 327 | head(us_journal_data,n=10) 328 | ``` 329 | 330 | ``` 331 | ## # A tibble: 10 × 2 332 | ## date value 333 | ## 334 | ## 1 2021 NA 335 | ## 2 2020 NA 336 
| ## 3 2019 NA 337 | ## 4 2018 422808. 338 | ## 5 2017 432216. 339 | ## 6 2016 427265. 340 | ## 7 2015 429989. 341 | ## 8 2014 433192. 342 | ## 9 2013 429570. 343 | ## 10 2012 427997. 344 | ``` 345 | 346 | 347 | ```r 348 | us_patent_data <- us_indicator_data[[2]][,c("date","value")] # Takes all rows but 2nd column 349 | head(us_patent_data,n=10) 350 | ``` 351 | 352 | ``` 353 | ## # A tibble: 10 × 2 354 | ## date value 355 | ## 356 | ## 1 2021 NA 357 | ## 2 2020 269586 358 | ## 3 2019 285113 359 | ## 4 2018 285095 360 | ## 5 2017 293904 361 | ## 6 2016 295327 362 | ## 7 2015 288335 363 | ## 8 2014 285096 364 | ## 9 2013 287831 365 | ## 10 2012 268782 366 | ``` 367 | 368 | 369 | ```r 370 | us_GDP_data <- us_indicator_data[[3]][,c("date","value")] 371 | head(us_GDP_data) 372 | ``` 373 | 374 | ``` 375 | ## # A tibble: 6 × 2 376 | ## date value 377 | ## 378 | ## 1 2021 70249. 379 | ## 2 2020 63531. 380 | ## 3 2019 65120. 381 | ## 4 2018 62823. 382 | ## 5 2017 59908. 383 | ## 6 2016 57867. 384 | ``` 385 | 386 | 387 | ```r 388 | # Create a dataframe using retrieved data 389 | us_data <- as.data.frame(c(us_journal_data,us_patent_data[2],us_GDP_data[2]), 390 | col.names= c("Year","Scientific and Technical Journal Article Data", "Patent Applications, residents","GDP per capita (current US$) Code")) # Set column names 391 | head(us_data) 392 | ``` 393 | 394 | ``` 395 | ## Year Scientific.and.Technical.Journal.Article.Data 396 | ## 1 2021 NA 397 | ## 2 2020 NA 398 | ## 3 2019 NA 399 | ## 4 2018 422807.7 400 | ## 5 2017 432216.5 401 | ## 6 2016 427264.6 402 | ## Patent.Applications..residents GDP.per.capita..current.US...Code 403 | ## 1 NA 70248.63 404 | ## 2 269586 63530.63 405 | ## 3 285113 65120.39 406 | ## 4 285095 62823.31 407 | ## 5 293904 59907.75 408 | ## 6 295327 57866.74 409 | ``` 410 | 411 | ### United Kingdom (GB) 412 | 413 | Now we can repeat the same process to find the relevant information for the United Kingdom indicated by the country code "GB". As you will see, much of the code is the same, so if needed, reference the United States example. 
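If you would rather not repeat the code, the retrieval steps can be wrapped in a small helper function and called once per ISO2 country code. This is only a sketch of that refactor, reusing the `urlRoot` and `indicators` objects defined earlier (the `get_indicator_data()` name is illustrative); the rest of this section keeps the explicit step-by-step version.

```r
# Sketch of a reusable helper: fetch all indicators for one ISO2 country code
get_indicator_data <- function(country_code) {
  lapply(indicators, function(indicator) {
    url <- paste0(urlRoot, "country/", country_code, "/indicator/", indicator,
                  "?format=json&per_page=500")
    # Same parsing pattern as above: the second element holds the indicator data
    tibble(fromJSON(rawToChar(GET(url)$content), flatten = TRUE)[[2]])
  })
}

# Example (not run): uk_indicator_data <- get_indicator_data("GB")
```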
414 | 415 | 416 | 417 | ```r 418 | uk_api_url <- c() 419 | for (indicator in indicators) { 420 | uk_api_url <- append(x = uk_api_url, 421 | values = paste(urlRoot,"country/GB/indicator/",indicator,"?format=json&per_page=500",sep = "")) 422 | } 423 | uk_api_url 424 | ``` 425 | 426 | ``` 427 | ## [1] "https://api.worldbank.org/v2/country/GB/indicator/IP.JRN.ARTC.SC?format=json&per_page=500" 428 | ## [2] "https://api.worldbank.org/v2/country/GB/indicator/IP.PAT.RESD?format=json&per_page=500" 429 | ## [3] "https://api.worldbank.org/v2/country/GB/indicator/NY.GDP.PCAP.CD?format=json&per_page=500" 430 | ``` 431 | 432 | #### Retrieving Data 433 | 434 | 435 | ```r 436 | uk_indicator_data <- list() 437 | for (url in uk_api_url) { 438 | temp_data <- tibble(fromJSON(rawToChar(GET(url)$content), flatten = TRUE)[[2]]) 439 | uk_indicator_data <- append(uk_indicator_data,list(temp_data)) #making a list of tibbles 440 | } 441 | ``` 442 | #### Extracting Data 443 | 444 | ```r 445 | # Extract Data 446 | uk_journal_data <- uk_indicator_data[[1]][,c("date","value")] #takes all rows but only two columns 447 | uk_patent_data <- uk_indicator_data[[2]][,c("date","value")] 448 | uk_GDP_data <- uk_indicator_data[[3]][,c("date","value")] 449 | 450 | # Combine extracted data into a data frame 451 | uk_data <- as.data.frame(c(uk_journal_data,uk_patent_data[2],uk_GDP_data[2]),col.names= c("Year","Scientific and Technical Journal Article Data", "Patent Applications, residents","GDP per capita (current US$) Code")) 452 | head(uk_data) 453 | ``` 454 | 455 | ``` 456 | ## Year Scientific.and.Technical.Journal.Article.Data 457 | ## 1 2021 NA 458 | ## 2 2020 NA 459 | ## 3 2019 NA 460 | ## 4 2018 97680.90 461 | ## 5 2017 99128.72 462 | ## 6 2016 99366.17 463 | ## Patent.Applications..residents GDP.per.capita..current.US...Code 464 | ## 1 NA 46510.28 465 | ## 2 11990 40318.56 466 | ## 3 12061 42747.08 467 | ## 4 12865 43306.31 468 | ## 5 13301 40621.33 469 | ## 6 13876 41146.08 470 | ``` 471 | 472 | ## 3. Plot Indicator Data 473 | 474 | Create line plots of US/UK Number of Scientific and Technical Journal Articles and Patents by year. Upon inspecting the dataset, there are no values before the year 2000 and after 2018. Hence, we will slice our data for visualizations accordingly. 
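Rather than eyeballing the data, you can confirm which years actually have values before choosing the slice. A quick check on the journal data from section 2 (the hard-coded rows `[4:22]` used below correspond to this range):

```r
# Which years have non-missing journal article counts? (uses 'us_journal_data' from above)
available_years <- as.numeric(us_journal_data$date[!is.na(us_journal_data$value)])
range(available_years)
```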
475 | 476 | 477 | ```r 478 | # Plotting the Data 479 | 480 | # Part [4:22] corresponds to years 2000-2018 481 | journal_data <- tibble(dates= c(us_journal_data$date[4:22]), 482 | us_journals=c(us_journal_data$value[4:22]), 483 | uk_journals=c(uk_journal_data$value[4:22])) 484 | 485 | ggplot(journal_data, aes(x = dates))+ 486 | geom_line(aes(y = us_journals, color = "US IP", group=1))+ 487 | geom_point(aes(y = us_journals, color = "US IP"))+ 488 | geom_line(aes(y = uk_journals, color = "UK IP"), group=1)+ 489 | geom_point(aes(y = uk_journals, color = "UK IP"))+ 490 | labs(title = "US vs UK Journal Publications", 491 | x="Year", 492 | y="Publications")+ 493 | theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) 494 | ``` 495 | 496 | ![](R_WorldBank_Markdown_files/figure-html/unnamed-chunk-16-1.png) 497 | 498 | Similarly for the GDP data, 499 | 500 | 501 | ```r 502 | gdp_data <- tibble(dates= c(us_GDP_data$date[4:22]), 503 | us_gdp=c(us_GDP_data$value[4:22]), 504 | uk_gdp=c(uk_GDP_data$value[4:22])) 505 | 506 | ggplot(gdp_data, aes(x = dates))+ 507 | geom_line(aes(y = us_gdp, color = "US GDP", group=1))+ 508 | geom_point(aes(y = us_gdp, color = "US GDP"))+ 509 | geom_line(aes(y = uk_gdp, color = "UK GDP"), group=1)+ 510 | geom_point(aes(y = uk_gdp, color = "UK GDP"))+ 511 | labs(title = "US vs UK GDP", 512 | x="Year", 513 | y="GDP")+ 514 | theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) 515 | ``` 516 | 517 | ![](R_WorldBank_Markdown_files/figure-html/unnamed-chunk-17-1.png) 518 | 519 | Patents: 520 | 521 | 522 | ```r 523 | patent_data <- tibble(dates= as.numeric(c(us_patent_data$date[4:41])), 524 | us_patents=as.numeric(c(us_patent_data$value[4:41])), 525 | uk_patents=as.numeric(c(uk_patent_data$value[4:41]))) 526 | 527 | ggplot(patent_data, aes(x = dates))+ 528 | geom_line(aes(y = us_patents, color = "US Patents", group=1))+ 529 | geom_point(aes(y = us_patents, color = "US Patents"))+ 530 | geom_line(aes(y = uk_patents, color = "UK Patents"), group=1)+ 531 | geom_point(aes(y = uk_patents, color = "UK Patents"))+ 532 | labs(title = "US vs UK Patents", 533 | x="Year", 534 | y="Patents")+ 535 | theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)+ 536 | scale_x_continuous(breaks=seq(1980, 2020, by = 5)) 537 | ) 538 | ``` 539 | 540 | ![](R_WorldBank_Markdown_files/figure-html/unnamed-chunk-18-1.png) 541 | --------------------------------------------------------------------------------