├── shell
│   ├── imgs
│   │   └── CID_2734162.png
│   ├── sdirect.rst
│   ├── crossref.rst
│   ├── scopus.rst
│   ├── world-bank.rst
│   ├── chronam.rst
│   ├── us-census.rst
│   └── pubmed.rst
├── matlab
│   ├── imgs
│   │   ├── matlab_pc_im0.png
│   │   ├── matlab_pc_im1.png
│   │   ├── matlab_pc_im2.png
│   │   ├── matlab_pc_im3.png
│   │   ├── matlab_pc_im4.png
│   │   ├── matlab_pc_im5.png
│   │   ├── matlab_pc_im6.png
│   │   ├── matlab_pm_im0.png
│   │   ├── matlab_pm_im1.png
│   │   ├── matlab_casc_im0.png
│   │   ├── matlab_chronam_im0.png
│   │   ├── matlab_chronam_im1.png
│   │   ├── matlab_uscensus_im0.png
│   │   └── matlab_worldbank_im0.png
│   ├── sdirect.rst
│   ├── chronam.rst
│   ├── us-census.rst
│   ├── world-bank.rst
│   ├── pubmed.rst
│   ├── crossref.rst
│   └── scopus.rst
├── LICENSE
├── README.md
├── mathematica
│   └── sdirect.ipynb
└── c
    ├── wiley-tdm.ipynb
    └── sdirect.ipynb

--------------------------------------------------------------------------------
/shell/imgs/CID_2734162.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/shell/imgs/CID_2734162.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im0.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im1.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im2.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im3.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im4.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im5.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pc_im6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pc_im6.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pm_im0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pm_im0.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_pm_im1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_pm_im1.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_casc_im0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_casc_im0.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_chronam_im0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_chronam_im0.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_chronam_im1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_chronam_im1.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_uscensus_im0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_uscensus_im0.png
--------------------------------------------------------------------------------
/matlab/imgs/matlab_worldbank_im0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/main/matlab/imgs/matlab_worldbank_im0.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 University Libraries-Research Data Services
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/matlab/sdirect.rst:
--------------------------------------------------------------------------------
1 | ScienceDirect API in Matlab
2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 | 
4 | .. sectionauthor:: Vincent F. Scalfani
5 | 
6 | by Anastasia Ramig
7 | 
8 | **ScienceDirect**: https://www.sciencedirect.com/
9 | 
10 | **Elsevier Developer Portal:** https://dev.elsevier.com/
11 | 
12 | **ScienceDirect APIs Specification:** https://dev.elsevier.com/sd_api_spec.html
13 | 
14 | **Elsevier How to Guide: Text Mining:** https://dev.elsevier.com/tecdoc_text_mining.html
15 | 
16 | Please check with your institution for their Text and Data Mining Agreement with Elsevier.
17 | 
18 | These recipe examples use the Elsevier ScienceDirect Article (Full-Text) API with MATLAB R2022b. This tutorial content is intended to help facilitate academic research. Before continuing or reusing any of this code, please be aware of Elsevier's API policies and appropriate use-cases; for example, Elsevier has detailed policies regarding `text and data mining of Elsevier full-text content <https://dev.elsevier.com/text_mining.html>`_. If you have copyright or other related text and data mining questions, please contact The University of Alabama Libraries.
19 | 
20 | *These recipe examples were tested on November 22, 2022.*
21 | 
22 | Setup
23 | ======
24 | 
25 | Import API Key
26 | ---------------------------------
27 | 
28 | As a good practice, do not display your API key in your computational notebook (to prevent accidental sharing). Save your API key to a separate text file, then import your key.
29 | 
30 | .. code-block:: matlab
31 | 
32 |    %% import API key from file
33 |    myAPIKey = importdata("apikey.txt");
34 | 
35 | Identifier Note
36 | -----------------
37 | 
38 | We will use DOIs as the article identifiers. See our Crossref and Scopus API tutorials for workflows on how to create lists of DOIs and identifiers for specific searches and journals. The Elsevier ScienceDirect Article (Full-Text) API also accepts other identifiers like Scopus IDs and PubMed IDs (see API specification documents linked above).
39 | 
40 | 1. Retrieve full-text XML of an article
41 | =======================================
42 | 
43 | .. code-block:: matlab
44 | 
45 |    %% download XML text
46 |    elsevier_url = "https://api.elsevier.com/content/article/doi/";
47 |    doi1 = '10.1016/j.tetlet.2017.07.080'; %% example Tetrahedron Letters article
48 |    fulltext1 = {webread(elsevier_url + doi1 + "?APIKey=" + myAPIKey + "&httpAccept=text/xml")};
49 | 
50 |    %% save to file
51 |    writecell(fulltext1, "fulltext1.txt"); %% can change to .xml after writing
52 | 
53 | 
54 | 2. Retrieve plain text of an article
55 | ====================================
56 | 
57 | .. code-block:: matlab
58 | 
59 |    %% download simplified text
60 |    elsevier_url = "https://api.elsevier.com/content/article/doi/";
61 |    doi2 = "10.1016/j.tetlet.2022.153680";
62 |    fulltext2 = webread(elsevier_url + doi2 + "?APIKey=" + myAPIKey + "&httpAccept=text/plain");
63 | 
64 |    %% save to file
65 |    writematrix(fulltext2, "fulltext2.txt", "Delimiter", "\t");
66 | 
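Note: if your institution's Text and Data Mining entitlements do not cover a requested article, ``webread`` will raise an error. A minimal sketch of one way to guard against this (our own addition, reusing the variables defined above; the try/catch pattern mirrors the World Bank recipe in this archive):

.. code-block:: matlab

   %% hedged example: skip articles that error out (e.g., not entitled)
   try
       fulltext2 = webread(elsevier_url + doi2 + "?APIKey=" + myAPIKey + "&httpAccept=text/plain");
   catch ME
       disp("Request failed for " + doi2 + ": " + ME.message)
   end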
67 | 3. Retrieve full-text in a loop
68 | ===============================
69 | 
70 | Create an array of 5 DOIs for testing.
71 | 
72 | .. code-block:: matlab
73 | 
74 |    %% make a list of 5 DOIs for testing
75 |    dois = ["10.1016/j.tetlet.2018.10.031","10.1016/j.tetlet.2018.10.033",...
76 |        "10.1016/j.tetlet.2018.10.034","10.1016/j.tetlet.2018.10.038",...
77 |        "10.1016/j.tetlet.2018.10.041"];
78 | 
79 | Retrieve article full text for each DOI in a loop and save each article to a separate file. Example shown for plain text, XML also works (replace 'plain' with 'xml').
80 | 
81 | .. code-block:: matlab
82 | 
83 |    for i = 1:length(dois)
84 |        article = webread(elsevier_url + dois(i) + "?APIKey=" + myAPIKey + "&httpAccept=text/plain");
85 | 
86 |        %% replace '/' with '_' since you can't save files with a '/' character in Matlab
87 |        old = "/";
88 |        new = "_";
89 |        doi_name = replace(dois(i), old, new);
90 |        writematrix(article, (doi_name + "_plain_text.txt"), "Delimiter", "\t");
91 | 
92 |        %% pause for 1 second between API calls
93 |        pause(1)
94 |    end
95 | 

--------------------------------------------------------------------------------
/shell/sdirect.rst:
--------------------------------------------------------------------------------
1 | ScienceDirect API in Bash
2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 | 
4 | by Avery Fernandez
5 | 
6 | **ScienceDirect**: https://www.sciencedirect.com/
7 | 
8 | **Elsevier Developer Portal:** https://dev.elsevier.com/
9 | 
10 | **ScienceDirect APIs Specification:** https://dev.elsevier.com/sd_api_spec.html
11 | 
12 | **Elsevier How to Guide: Text Mining:** https://dev.elsevier.com/tecdoc_text_mining.html
13 | 
14 | Please check with your institution for their Text and Data Mining Agreement with Elsevier.
15 | 
16 | These recipe examples use the Elsevier ScienceDirect Article (Full-Text) API. This tutorial content is intended to help facilitate academic research. Before continuing or reusing any of this code, please be aware of Elsevier's API policies and appropriate use-cases; for example, Elsevier has detailed policies regarding `text and data mining of Elsevier full-text content <https://dev.elsevier.com/text_mining.html>`_. If you have copyright or other related text and data mining questions, please contact The University of Alabama Libraries.
17 | 
18 | *These recipe examples were tested on November 22, 2022.*
19 | 
20 | Setup
21 | ======
22 | 
23 | Program requirements
24 | --------------------
25 | 
26 | In order to run this code, you will need to first install `curl`_. curl is used to request the data from the API.
27 | 
28 | .. _curl: https://github.com/curl/curl
29 | 
30 | Create a variable for API Key
31 | ---------------------------------
32 | 
33 | Save your API key to a separate text file, then create a variable for your key. Avoid displaying your API key in your terminal (to prevent accidental sharing).
34 | 
35 | .. code-block:: shell
36 | 
37 |    apiKey=$(cat "apikey.txt")
38 | 
39 | Identifier Note
40 | -----------------
41 | 
42 | We will use DOIs as the article identifiers. See our Crossref and Scopus API tutorials for workflows on how to create lists of DOIs and identifiers for specific searches and journals. The Elsevier ScienceDirect Article (Full-Text) API also accepts other identifiers like Scopus IDs and PubMed IDs (see API specification documents linked above).
43 | 
44 | 1. Retrieve full-text XML of an article
45 | =======================================
46 | 
47 | .. code-block:: shell
48 | 
49 |    elsevier_url="https://api.elsevier.com/content/article/doi/"
50 |    doi1='10.1016/j.tetlet.2017.07.080' # example Tetrahedron Letters article
51 |    curl "$elsevier_url""$doi1"$"?APIKey=""$apiKey"$"&httpAccept=text/xml" > fulltext1.xml # save to file
52 | 
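If a download comes back empty, it can help to check the HTTP status code first. A minimal sketch (our own addition, not part of the original recipe; uses only standard curl options):

.. code-block:: shell

   # print the HTTP status code without saving the body (200 means OK)
   status=$(curl -s -o /dev/null -w "%{http_code}" "$elsevier_url""$doi1"$"?APIKey=""$apiKey"$"&httpAccept=text/xml")
   echo "$status"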
53 | 2. Retrieve plain text of an article
54 | ====================================
55 | 
56 | .. code-block:: shell
57 | 
58 |    elsevier_url="https://api.elsevier.com/content/article/doi/"
59 |    doi2='10.1016/j.tetlet.2022.153680' # example Tetrahedron Letters article
60 |    curl "$elsevier_url""$doi2"$"?APIKey=""$apiKey"$"&httpAccept=text/plain" > fulltext2.txt # save to file
61 | 
62 | 3. Retrieve full-text in a loop
63 | ===============================
64 | 
65 | Create an array of 5 DOIs for testing.
66 | 
67 | .. code-block:: shell
68 | 
69 |    declare -a dois=("10.1016/j.tetlet.2018.10.031" "10.1016/j.tetlet.2018.10.033" "10.1016/j.tetlet.2018.10.034" "10.1016/j.tetlet.2018.10.038" "10.1016/j.tetlet.2018.10.041")
70 | 
71 | Retrieve article full text for each DOI in a loop and save each article to a separate file. Example shown for plain text, XML also works (replace 'plain' with 'xml').
72 | 
73 | .. code-block:: shell
74 | 
75 |    elsevier_url="https://api.elsevier.com/content/article/doi/"
76 |    for doi in "${dois[@]}"
77 |    do
78 |        doi_name=$(echo "$doi" | sed 's/\//-/') # can't save files with a '/' character on Linux
79 |        echo "$doi_name"
80 |        curl "$elsevier_url""$doi"$"?APIKey=""$apiKey"$"&httpAccept=text/plain" > "$doi_name"$"_plain_text.txt"
81 |        sleep 1 # pause for 1 second between API calls
82 |    done
83 | 
84 | .. code-block:: shell
85 | 
86 |    ls
87 | 
88 | **Output:**
89 | 
90 | .. code-block:: shell
91 | 
92 |    10.1016-j.tetlet.2018.10.031_plain_text.txt
93 |    10.1016-j.tetlet.2018.10.033_plain_text.txt
94 |    10.1016-j.tetlet.2018.10.034_plain_text.txt
95 |    10.1016-j.tetlet.2018.10.038_plain_text.txt
96 |    10.1016-j.tetlet.2018.10.041_plain_text.txt
97 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # University of Alabama Libraries Scholarly API Cookbook Archive
2 | 
3 | > [!IMPORTANT]
4 | > This repository is a publicly available archive of recipes previously in the University of Alabama Scholarly API Cookbook. As of now, this archive includes Matlab, Mathematica, C, and Python recipes. We have also decided to archive all Bash recipes except for the Z39.50 Bash recipe. They are not being maintained nor updated. Please see the latest [University of Alabama Libraries Scholarly API Cookbook](https://ua-libraries-research-data-services.github.io/UALIB_ScholarlyAPI_Cookbook) for current tutorials. The links in the recipes are not being updated and may no longer be accurate. Please check the individual scholarly API documentation for current information on API usage and policies.
5 | 
6 | ## License and Reuse
7 | 
8 | Most of the code in this repository is licensed under the [MIT License](https://github.com/UA-Libraries-Research-Data-Services/Scholarly_API_Cookbook_Archive/blob/main/LICENSE). This includes code written to be used with Wolfram Mathematica and MathWorks MATLAB. However, these proprietary software packages themselves are not covered under the MIT License, and users must have valid licenses for Mathematica and MATLAB to run the associated code.
9 | 
10 | The Python scripts in this repository are licensed under the MIT License. However, these scripts may rely on external libraries such as matplotlib, pandas, and others. These libraries are licensed under their own respective terms, and will need to be installed separately. Refer to the documentation of each library for installation instructions and licensing details.
11 | 
12 | The C code in this repository is licensed under the MIT License.
This repository provides only the source code, and users will need to compile the C programs to run them. Some of the C code depends on external libraries such as curl, jq, and YAZ, which are licensed under their own respective terms. These libraries will need to be obtained and installed separately by the user. 13 | 14 | The Bash tutorials are licensed under the MIT License (Bash itself is licensed under the GNU General Public License). Some of the included scripts may rely on external tools such as curl, jq, YAZ, and gnuplot, each of which is licensed under its own terms. Users must obtain and install these tools separately. Refer to the documentation of each tool for installation instructions and licensing details. 15 | 16 | We have endeavored to follow the appropriate terms and usage policies of each scholarly API, web service, and Z39.50 server. We have linked to the terms and policies where possible. Some database APIs may require a valid library subscription, institutional access, or individual account to use their services. Please be responsible when reusing these scripts and respect the API terms and usage policies (e.g., query limits, record downloads, data sharing restrictions). Data output snippets shown in this book are for demonstration purposes and are credited to the individual API or database service. The output generated from APIs or services remains subject to the terms and conditions of the respective provider. Some outputs (e.g., U.S. Government works) may be in the public domain, while others may require attribution or adherence to other conditions. 17 | 18 | If you reuse the code, attribution would be appreciated. Please link to the Cookbook and cite our manuscript: 19 | 20 | Link to Cookbook: https://ua-libraries-research-data-services.github.io/UALIB_ScholarlyAPI_Cookbook 21 | 22 | Citation: Scalfani, V. F.; Walker, K. W.; Simpson, L.; Fernandez, A. M.; Patel, V. D.; Ramig, A.; Gomes, C.; Moen, M. T.; Nguyen, A. M. Creating a Scholarly API Cookbook: Supporting Library Users with Programmatic Access to Information. Issues in Science and Technology Librarianship, 2023, No. 104. https://doi.org/10.29173/istl2766. 23 | 24 | ```bibtex 25 | @article{scalfani_creating_2023, 26 | title = {Creating a {Scholarly} {API} {Cookbook}: {Supporting} {Library} {Users} with {Programmatic} {Access} to {Information}}, 27 | issn = {1092-1206}, 28 | shorttitle = {Creating a {Scholarly} {API} {Cookbook}}, 29 | url = {https://journals.library.ualberta.ca/istl/index.php/istl/article/view/2766}, 30 | doi = {10.29173/istl2766}, 31 | abstract = {Scholarly web-based application programming interfaces (APIs) allow users to interact with information and data programmatically. Interacting with information programmatically allows users to create advanced information query workflows and quickly access machine-readable data for downstream computations. With the growing availability of scholarly APIs from open and commercial library databases, supporting access to information via an API has become a key support area for research data services in libraries. This article describes our efforts with supporting API access through the development of an online Scholarly API Cookbook. The Cookbook contains code recipes (i.e., tutorials) for getting started with 10 different scholarly APIs, including for example, Scopus, World Bank, and PubMed. API tutorials are available in Python, Bash, Matlab, and Mathematica. 
A tutorial for interacting with library catalog data programmatically via Z39.50 is also included, as traditional library catalog metadata is rarely available via an API. In addition to describing the Scholarly API Cookbook content, we discuss our experiences building a student research data services programming team, challenges we encountered, and ideas to improve the Cookbook. The University of Alabama Libraries Scholarly API Cookbook is freely available and hosted on GitHub. All code within the API Cookbook is licensed with the permissive MIT license, and as a result, users are free to reuse and adapt the code in their teaching and research.}, 32 | number = {104}, 33 | urldate = {2023-10-13}, 34 | journal = {Issues in Science and Technology Librarianship}, 35 | author = {Scalfani, Vincent F. and Walker, Kevin W. and Simpson, Lance and Fernandez, Avery M. and Patel, Vishank D. and Ramig, Anastasia and Gomes, Cyrus and Moen, Michael T. and Nguyen, Adam M.}, 36 | month = oct, 37 | year = {2023}, 38 | } 39 | ``` 40 | -------------------------------------------------------------------------------- /matlab/chronam.rst: -------------------------------------------------------------------------------- 1 | Chronicling America API in Matlab 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | .. sectionauthor:: Vincent F. Scalfani 5 | 6 | by Anastasia Ramig 7 | 8 | **LOC Chronicling America API Documentation:** https://chroniclingamerica.loc.gov/about/api/ 9 | 10 | These recipe examples were tested on December 6, 2022 in MATLAB R2022b. 11 | 12 | **Attribution:** We thank **Professor Jessica Kincaid** (UA Libraries, Hoole Special Collections) 13 | for the use-cases. All data was collected from the Library of Congress, Chronicling America: Historic 14 | American Newspapers site, using the API. 15 | 16 | Note that the data from the *Alabama state intelligencer*, *The age-herald*, and the 17 | *Birmingham age-herald* were contributed to Chronicling America by The University of 18 | Alabama Libraries: https://chroniclingamerica.loc.gov/awardees/au/ 19 | 20 | 1. Basic API Request 21 | ============================ 22 | 23 | The Chronicling America API identifies newspapers and other records using LCCNs. 24 | We can query the API once we have the LCCN for the newspaper and even ask for 25 | particular issues and editions. For example, the following link lists newspapers 26 | published in the state of Alabama, from which the LCCN can be obtained: 27 | https://chroniclingamerica.loc.gov/newspapers/?state=Alabama 28 | 29 | Here is an example with the Alabama State Intelligencer: 30 | 31 | .. code-block:: matlab 32 | 33 | %% set up the API parameters and pull json data for the given LCCN 34 | api_url = "https://chroniclingamerica.loc.gov/lccn/"; 35 | lccn = "sn84023600"; 36 | q = webread(api_url + lccn + ".json") 37 | 38 | **Output:** 39 | 40 | .. code-block:: matlab 41 | 42 | q = struct with fields: 43 | place_of_publication: 'Tuskaloosa [sic], Ala.' 44 | lccn: 'sn84023600' 45 | start_year: '183?' 46 | place: {'Alabama--Tuscaloosa--Tuscaloosa'} 47 | name: 'Alabama State intelligencer. [volume]' 48 | publisher: 'T.M. Bradford' 49 | url: 'https://chroniclingamerica.loc.gov/lccn/sn84023600.json' 50 | end_year: '18??' 51 | issues: [] 52 | subject: [] 53 | 54 | .. code-block:: matlab 55 | 56 | %% extract the name from the search 57 | q.name 58 | 59 | **Output:** 60 | 61 | .. code-block:: matlab 62 | 63 | ans = 'Alabama State intelligencer. [volume]' 64 | 65 | .. 
code-block:: matlab
66 | 
67 |    %% extract the publisher from the search
68 |    q.publisher
69 | 
70 | **Output:**
71 | 
72 | .. code-block:: matlab
73 | 
74 |    ans = 'T.M. Bradford'
75 | 
76 | Moving on to another publication, we can get the 182nd page (seq-182) of the
77 | Evening Star newspaper published on November 19, 1961.
78 | 
79 | .. code-block:: matlab
80 | 
81 |    %% set up the API parameters and pull json data for the given LCCN
82 |    lccn2 = "sn83045462/1961-11-19/ed-1/seq-182";
83 |    q2 = webread(api_url + lccn2 + ".json");
84 | 
85 |    %% obtain the url for the pdf of the page
86 |    q_url = q2.pdf
87 | 
88 | **Output:**
89 | 
90 | .. code-block:: matlab
91 | 
92 |    q_url = 'https://chroniclingamerica.loc.gov/lccn/sn83045462/1961-11-19/ed-1/seq-182.pdf'
93 | 
94 | .. code-block:: matlab
95 | 
96 |    %% view the PDF in web browser
97 |    web(q_url)
98 | 
99 | 2. Frequency of "University of Alabama" mentions
100 | ====================================================
101 | 
102 | The URL below limits to searching newspapers in the state of Alabama and provides
103 | 500 results (as a demo) of “University of Alabama” mentions. Note that phrases
104 | can be searched by putting them inside parentheses for the query.
105 | 
106 | .. code-block:: matlab
107 | 
108 |    %% set up the API parameters and pull json data
109 |    api_url = "https://chroniclingamerica.loc.gov/search/pages/results/?state=Alabama&proxtext=(University%20of%20Alabama)&rows=500&format=json";
110 |    options = weboptions('Timeout', 30);
111 |    alabamaInfo = webread(api_url, options);
112 | 
113 |    %% find the size of the data structure
114 |    size(struct2table(alabamaInfo.items))
115 | 
116 | **Output:**
117 | 
118 | .. code-block:: matlab
119 | 
120 |    ans = 1x2
121 |    500 28
122 | 
123 | .. code-block:: matlab
124 | 
125 |    %% extract the years from the dates given
126 |    dates = {alabamaInfo.items.date};
127 |    datesList = {ones(length(alabamaInfo.items), 1)};
128 |    for i = 1:length(dates)
129 |        datesList{i} = str2double(dates{i}(1:4));
130 |    end
131 |    %% plot a histogram of the mentions according to decade
132 |    x = cell2mat(datesList);
133 |    edges = [1890 1900 1910 1920 1930];
134 |    histogram(x, edges)
135 |    xticks([1890, 1900, 1910, 1920]); %% label the decade bin edges
136 |    title("Mentions of University of Alabama by Decade");
137 |    xlabel("Decade");
138 |    ylabel("Mentions");
139 | 
140 | **Output:**
141 | 
142 | .. image:: imgs/matlab_chronam_im0.png
143 | 
144 | 3. Industrialization keywords frequency in the Birmingham Age-Herald
145 | =======================================================================
146 | 
147 | We will try to obtain the frequency of “Iron” on the front pages of the Birmingham Age-Herald newspapers
148 | from the year 1903 to 1949 (limited to first 500 rows for testing here).
149 | 
150 | .. code-block:: matlab
151 | 
152 |    %% set up the API parameters and pull json data for the given parameters
153 |    api_url = "https://chroniclingamerica.loc.gov/search/pages/results/?state=Alabama&lccn=sn85038485&dateFilterType=yearRange&date1=1903&date2=1949&sequence=1&andtext=Iron&rows=500&searchType=advanced&format=json";
154 |    ind = webread(api_url, options);
155 | 
156 | .. code-block:: matlab
157 | 
158 |    %% create a dataset of dates and format as datetimes
159 |    dates2 = {ind.items.date};
160 |    x2 = datetime(dates2, 'InputFormat', "yyyyMMdd");
161 | 
162 |    %% plot a histogram of mentions of iron by year
163 |    histogram(x2.Year, 'BinMethod', 'integers')
164 |    title("Iron Frequency in the Birmingham Age Herald");
165 |    xlabel("Year");
166 |    ylabel("Mentions");
167 | 
168 | .. image:: imgs/matlab_chronam_im1.png
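The queries above hardcode ``%20`` for spaces in the search phrase. A minimal sketch of building the query string from a plain phrase instead (our own helper code, not part of the original recipe):

.. code-block:: matlab

   %% hedged example: percent-encode spaces in a search phrase
   phrase = "University of Alabama";
   proxtext = replace(phrase, " ", "%20");
   api_url = "https://chroniclingamerica.loc.gov/search/pages/results/?state=Alabama&proxtext=(" + proxtext + ")&rows=500&format=json";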
169 | 
--------------------------------------------------------------------------------
/mathematica/sdirect.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "5bc5f701-85cd-48d8-a978-2cc44692a879",
6 |    "metadata": {
7 |     "id": "5bc5f701-85cd-48d8-a978-2cc44692a879"
8 |    },
9 |    "source": [
10 |     "# ScienceDirect API in Mathematica\n",
11 |     "\n",
12 |     "by Vishank Patel\n",
13 |     "\n",
14 |     "**ScienceDirect**: https://www.sciencedirect.com/\n",
15 |     "\n",
16 |     "**Elsevier Developer Portal:** https://dev.elsevier.com/\n",
17 |     "\n",
18 |     "**ScienceDirect APIs Specification:** https://dev.elsevier.com/sd_api_spec.html\n",
19 |     "\n",
20 |     "**Elsevier How to Guide: Text Mining:** https://dev.elsevier.com/tecdoc_text_mining.html\n",
21 |     "\n",
22 |     "Please check with your institution for their Text and Data Mining Agreement with Elsevier.\n",
23 |     "\n",
24 |     "These recipe examples use the Elsevier ScienceDirect Article (Full-Text) API and Mathematica 12.3. This tutorial content is intended to help facilitate academic research. Before continuing or reusing any of this code, please be aware of Elsevier’s API policies and appropriate use-cases; for example, Elsevier has detailed policies regarding [text and data mining of Elsevier full-text content](https://dev.elsevier.com/text_mining.html). If you have copyright or other related text and data mining questions, please contact The University of Alabama Libraries.\n",
25 |     "\n",
26 |     "*These recipe examples were tested on April 21, 2022.*"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "markdown",
31 |    "id": "04e7afdb-e2be-4724-95ca-dc2dbb4138d1",
32 |    "metadata": {
33 |     "id": "04e7afdb-e2be-4724-95ca-dc2dbb4138d1"
34 |    },
35 |    "source": [
36 |     "## Setup\n",
37 |     "\n",
38 |     "### API key\n",
39 |     "\n",
40 |     "After saving the API key in a text file on your computer:"
41 |    ]
42 |   },
43 |   {
44 |    "cell_type": "code",
45 |    "execution_count": null,
46 |    "id": "39a62220-a24e-4a0e-9e3a-b2aca4bb3bf5",
47 |    "metadata": {
48 |     "id": "39a62220-a24e-4a0e-9e3a-b2aca4bb3bf5",
49 |     "vscode": {
50 |      "languageId": "wolfram language"
51 |     }
52 |    },
53 |    "outputs": [],
54 |    "source": [
55 |     "myAPIKey = ToString[Import[\"INSERT PATH HERE\"]];"
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "markdown",
60 |    "id": "44c7ea3b-5de7-4a92-8a72-2565a837f427",
61 |    "metadata": {
62 |     "id": "44c7ea3b-5de7-4a92-8a72-2565a837f427"
63 |    },
64 |    "source": [
65 |     "### Identifier Note"
66 |    ]
67 |   },
68 |   {
69 |    "cell_type": "markdown",
70 |    "id": "579cdab2-e8d9-42a0-b2e1-2a8dfb8d7560",
71 |    "metadata": {
72 |     "id": "579cdab2-e8d9-42a0-b2e1-2a8dfb8d7560"
73 |    },
74 |    "source": [
75 |     "We will use DOIs as the article identifiers. See our Crossref and Scopus API tutorials for workflows on how to create lists of DOIs and identifiers for specific searches and journals. The Elsevier ScienceDirect Article (Full-Text) API also accepts other identifiers like Scopus IDs and PubMed IDs (see API specification documents linked above)."
76 |    ]
77 |   },
78 |   {
79 |    "cell_type": "markdown",
80 |    "id": "2250fbf4-6e9e-4c66-b7fb-4ef5c706d87e",
81 |    "metadata": {
82 |     "id": "2250fbf4-6e9e-4c66-b7fb-4ef5c706d87e"
83 |    },
84 |    "source": [
85 |     "## 1.
Retrieve full-text XML of an article" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "d40366d3-bffd-4b18-8256-8d13a3e2ec28", 91 | "metadata": { 92 | "id": "d40366d3-bffd-4b18-8256-8d13a3e2ec28" 93 | }, 94 | "source": [ 95 | "For XML Download:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "90d6c34d-f32d-4e5e-9971-3554132a77ec", 102 | "metadata": { 103 | "id": "90d6c34d-f32d-4e5e-9971-3554132a77ec", 104 | "vscode": { 105 | "languageId": "wolfram language" 106 | } 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "elsevierURL = \"https://api.elsevier.com/content/article/doi/\";\n", 111 | "doi1 = \"10.1016/j.tetlet.2017.07.080\"; (*example Tetrahedron Letters article*)\n", 112 | "\n", 113 | "searchURL1 = elsevierURL <> doi1 <> \"?APIKey=\" <> myAPIKey <> \"&httpAccept=text/xml\";\n", 114 | "fullText1 = Import[searchURL1];" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "83b4c2f2-7913-4f66-8767-9a8aacba23c0", 120 | "metadata": { 121 | "id": "83b4c2f2-7913-4f66-8767-9a8aacba23c0" 122 | }, 123 | "source": [ 124 | "## 2. Retrieve plain text of an article" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "806ebd16-6279-431c-bc92-01c452cab471", 130 | "metadata": { 131 | "id": "806ebd16-6279-431c-bc92-01c452cab471" 132 | }, 133 | "source": [ 134 | "For simplified text download:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "267e7822-2008-4c26-a29d-f849906f273b", 141 | "metadata": { 142 | "id": "267e7822-2008-4c26-a29d-f849906f273b", 143 | "vscode": { 144 | "languageId": "wolfram language" 145 | } 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "elsevierURL = \"https://api.elsevier.com/content/article/doi/\";\n", 150 | "doi2 = \"10.1016/j.tetlet.2022.153680\"; (*example Tetrahedron Letters article*)\n", 151 | "\n", 152 | "searchURL2 = elsevierURL <> doi2 <> \"?APIKey=\" <> myAPIKey <> \"&httpAccept=text/plain\";\n", 153 | "fulltext2 = Import[searchURL2];" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "e4ae3e31-5704-4faa-b4a7-cbabb8834fa3", 159 | "metadata": { 160 | "id": "e4ae3e31-5704-4faa-b4a7-cbabb8834fa3" 161 | }, 162 | "source": [ 163 | "## 3. Retrieve full-text in a loop" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "e4cba3d8-b7dc-4919-93d6-8e2c74390826", 169 | "metadata": { 170 | "id": "e4cba3d8-b7dc-4919-93d6-8e2c74390826" 171 | }, 172 | "source": [ 173 | "Make a list of 5 DOIs for testing" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "0035ba88-ed6d-4df2-8d59-fa97facba3fe", 180 | "metadata": { 181 | "id": "0035ba88-ed6d-4df2-8d59-fa97facba3fe", 182 | "vscode": { 183 | "languageId": "wolfram language" 184 | } 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "dois = {\"10.1016/j.tetlet.2018.10.031\", \n", 189 | " \"10.1016/j.tetlet.2018.10.033\", \"10.1016/j.tetlet.2018.10.034\", \n", 190 | " \"10.1016/j.tetlet.2018.10.038\", \"10.1016/j.tetlet.2018.10.041\"};" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "655e6939-3179-40e2-8a32-d73ad8532b01", 196 | "metadata": { 197 | "id": "655e6939-3179-40e2-8a32-d73ad8532b01" 198 | }, 199 | "source": [ 200 | "Retrieve article full text for each DOI in a loop and save each article to a separate file.\n", 201 | "Example shown for plain text, XML also works (replace 'plain' with 'xml')." 
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "id": "2c0c49d3-a384-459c-944f-ef02fade1bc7",
208 |    "metadata": {
209 |     "id": "2c0c49d3-a384-459c-944f-ef02fade1bc7",
210 |     "vscode": {
211 |      "languageId": "wolfram language"
212 |     }
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "For[i = 1, i <= Length[dois], i++,\n",
217 |     " article = Import[elsevierURL <> dois[[i]] <> \"?APIKey=\" <> myAPIKey <> \"&httpAccept=text/plain\"];\n",
218 |     " doiName = StringReplace[dois[[i]], \"/\" -> \"_\"] ; (*Can't save files with a '/' character on Linux*)\n",
219 |     " \n",
220 |     " Export[doiName <> \"_plain_text.txt\", article, \"Text\"];\n",
221 |     " Pause[1];\n",
222 |     " ]"
223 |    ]
224 |   }
225 |  ],
226 |  "metadata": {
227 |   "colab": {
228 |    "name": "mathematica_SDirect.ipynb",
229 |    "provenance": [],
230 |    "toc_visible": true
231 |   },
232 |   "kernelspec": {
233 |    "display_name": "Wolfram Language 12.3",
234 |    "language": "Wolfram Language",
235 |    "name": "wolframlanguage12.3"
236 |   },
237 |   "language_info": {
238 |    "codemirror_mode": "mathematica",
239 |    "file_extension": ".m",
240 |    "mimetype": "application/vnd.wolfram.m",
241 |    "name": "Wolfram Language",
242 |    "pygments_lexer": "mathematica",
243 |    "version": "12.0"
244 |   }
245 |  },
246 |  "nbformat": 4,
247 |  "nbformat_minor": 5
248 | }
249 | 
--------------------------------------------------------------------------------
/matlab/us-census.rst:
--------------------------------------------------------------------------------
1 | U.S. Census Data API in Matlab
2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 | 
4 | .. sectionauthor:: Vincent F. Scalfani
5 | 
6 | by Anastasia Ramig
7 | 
8 | **U.S. Census API documentation:** https://www.census.gov/data/developers/about.html
9 | 
10 | **U.S. Census Data Discovery Tool:** https://api.census.gov/data.html
11 | 
12 | These recipe examples were tested on December 12, 2022 in MATLAB R2022b.
13 | 
14 | See also the U.S. `Census API Terms of Service`_
15 | 
16 | .. _Census API Terms of Service: https://www.census.gov/data/developers/about/terms-of-service.html
17 | 
18 | **Attribution:** This tutorial uses the Census Bureau Data API but is not endorsed
19 | or certified by the Census Bureau.
20 | 
21 | API Key Information
22 | =========================
23 | 
24 | While an API key is not required to use the U.S. Census Data API, you may consider
25 | registering for an API key as the API is limited to 500 calls a day without a key.
26 | Sign up can be found here: https://api.census.gov/data/key_signup.html
27 | 
28 | If you want to add in your API key, save it to a text file named "apiKey.txt" in the
29 | query-parameter form "&key=YOUR_KEY" (the key is appended directly to the request URLs below), then import it in place of the empty string:
30 | 
31 | .. code-block:: matlab
32 | 
33 |    key = ""; %% with a key: key = importdata("apiKey.txt");
34 | 
35 | Note that this tutorial does not use an API key, so ``key`` is left as an empty string.
36 | 
37 | 1. Get population estimates of counties by state
38 | =======================================================
39 | 
40 | Note: includes Washington, D.C. and Puerto Rico
41 | 
42 | .. code-block:: matlab
43 | 
44 |    %% define root Census API
45 |    api = "https://api.census.gov/data/";
46 | 
47 |    %% define api url for the state ids
48 |    %% we will use the population estimates from the 2019 dataset
49 |    %% https://api.census.gov/data/2019/pep/population/examples.html
50 |    stateIds = api + "2019/pep/population?get=NAME&for=state:*" + key;
51 | 
52 |    %% read the url to get the data and find the length
53 |    stateIds = webread(stateIds);
54 | 
55 |    length(stateIds)
56 | 
57 | **Output:**
58 | 
59 | ..
code-block:: matlab 60 | 61 | ans = 53 62 | 63 | .. code-block:: matlab 64 | 65 | %% preallocate arrays for states and their id number 66 | stateList = {ones(length(stateIds), 1)}; 67 | numberList = {ones(length(stateIds), 1)}; 68 | 69 | %% create a list of each state and its id 70 | for i = 1:length(stateIds) 71 | stateList{i, 1} = stateIds{i, 1}{1, 1}; 72 | numberList{i, 1} = stateIds{i, 1}{2, 1}; 73 | end 74 | stateIdsArray = horzcat(stateList, numberList); 75 | stateIdsArray(1, :) = [] 76 | 77 | **Output:** 78 | 79 | .. code-block:: matlab 80 | 81 | stateIdsArray = 52x2 cell 82 | 'Alabama' '01' 83 | 'Alaska' '02' 84 | 'Arizona' '04' 85 | 'Arkansas' '05' 86 | 'California' '06' 87 | 'Colorado' '08' 88 | 'Delaware' '10' 89 | 'District of Columbia''11' 90 | 'Connecticut' '09' 91 | 'Florida' '12' 92 | . 93 | . 94 | . 95 | 96 | .. code-block:: matlab 97 | 98 | %% find a list of counties and their populations for each state and put it into a structure 99 | state = struct; 100 | for k = 1:length(stateIdsArray) 101 | stateName = strrep(stateIdsArray{k, 1}, " ", ""); 102 | stateNumber = stateIdsArray{k, 2}; 103 | stateData{k} = webread(api + "2019/pep/population?get=NAME,POP&for=county:*&in=state:" + stateNumber + key); 104 | pause(1) 105 | 106 | countiesList = {ones(length(stateData{1, k}), 1)}; 107 | popList = {ones(length(stateData{1, k}), 1)}; 108 | for i = 1:length(stateData{1, k}) 109 | countiesList{i, 1} = stateData{1, k}{i, 1}{1, 1}; 110 | popList{i, 1} = stateData{1, k}{i, 1}{2, 1}; 111 | end 112 | stateArray = horzcat(countiesList, popList); 113 | state.(stateName) = stateArray; 114 | end 115 | state.Alabama 116 | 117 | **Output:** 118 | 119 | .. code-block:: matlab 120 | 121 | ans = 68x2 cell 122 | 'NAME' 'POP' 123 | 'St. Clair County, Alabama' '89512' 124 | 'Cullman County, Alabama' '83768' 125 | 'Houston County, Alabama' '105882' 126 | 'Tuscaloosa County, Alabama''209355' 127 | 'Coffee County, Alabama' '52342' 128 | 'Chilton County, Alabama' '44428' 129 | 'Coosa County, Alabama' '10663' 130 | 'Etowah County, Alabama' '102268' 131 | 'Lamar County, Alabama' '13805' 132 | 133 | 2. Get population estimates over a range of years 134 | =================================================== 135 | 136 | We can use similar code as before, but now loop through different population estimate datasets by year. 137 | Here are the specific APIs used: 138 | 139 | Vintage 2015 Population Estimates: https://api.census.gov/data/2015/pep/population/examples.html 140 | 141 | Vintage 2016 Population Estimates: https://api.census.gov/data/2016/pep/population/examples.html 142 | 143 | Vintage 2017 Population Estimates: https://api.census.gov/data/2017/pep/population/examples.html 144 | 145 | .. code-block:: matlab 146 | 147 | %% create a structure of county and population data from 2015-2017 148 | stateYears = struct; 149 | for k = 1:length(stateData) 150 | for y = 2015:2017 151 | stateName = strrep(stateIdsArray{k, 1}, " ", ""); 152 | stateNumber = stateIdsArray{k, 2}; 153 | stateData{k} = webread(api + string(y) + "/pep/population?get=GEONAME,POP&for=county:*&in=state:" + stateNumber + key); 154 | pause(1) 155 | 156 | stateArray = {ones(length(stateData{1, k}), 1)}; 157 | for i = 1:length(stateData{1, k}) 158 | stateArray{i, 1} = stateData{1, k}{i, 1}{1, 1}; 159 | stateArray{i, 2} = stateData{1, k}{i, 1}{2, 1}; 160 | end 161 | year = "y" + string(y); 162 | stateYears.(stateName).(year) = stateArray; 163 | end 164 | end 165 | stateYears.Alabama.y2015 166 | 167 | **Output:** 168 | 169 | .. 
code-block:: matlab
170 | 
171 |    ans = 68×2 cell
172 |    'GEONAME'                                                            'POP'
173 |    'Baldwin County, Alabama, East South Central, South, United States'  '203709'
174 |    'Barbour County, Alabama, East South Central, South, United States'  '26489'
175 |    'Bibb County, Alabama, East South Central, South, United States'     '22583'
176 |    'Blount County, Alabama, East South Central, South, United States'   '57673'
177 |    'Bullock County, Alabama, East South Central, South, United States'  '10696'
178 |    'Butler County, Alabama, East South Central, South, United States'   '20154'
179 |    'Calhoun County, Alabama, East South Central, South, United States'  '115620'
180 |    'Chambers County, Alabama, East South Central, South, United States' '34123'
181 |    'Cherokee County, Alabama, East South Central, South, United States' '25859'
182 |    .
183 |    .
184 |    .
185 | 
186 | .. code-block:: matlab
187 | 
188 |    stateYears.Alabama.y2016
189 | 
190 | **Output:**
191 | 
192 | .. code-block:: matlab
193 | 
194 |    'GEONAME'                  'POP'
195 |    'Baldwin County, Alabama'  '208563'
196 |    'Barbour County, Alabama'  '25965'
197 |    'Bibb County, Alabama'     '22643'
198 |    'Blount County, Alabama'   '57704'
199 |    'Bullock County, Alabama'  '10362'
200 |    'Butler County, Alabama'   '19998'
201 |    'Calhoun County, Alabama'  '114611'
202 |    'Chambers County, Alabama' '33843'
203 |    'Cherokee County, Alabama' '25725'
204 |    .
205 |    .
206 |    .
207 | 
208 | .. code-block:: matlab
209 | 
210 |    stateYears.Alabama.y2017
211 | 
212 | **Output:**
213 | 
214 | .. code-block:: matlab
215 | 
216 |    'GEONAME'                   'POP'
217 |    'Conecuh County, Alabama'   '12469'
218 |    'Coosa County, Alabama'     '10754'
219 |    'Covington County, Alabama' '37092'
220 |    'Crenshaw County, Alabama'  '13871'
221 |    'Cullman County, Alabama'   '82755'
222 |    'Dale County, Alabama'      '49226'
223 |    'Dallas County, Alabama'    '39215'
224 |    'Autauga County, Alabama'   '55504'
225 |    'Baldwin County, Alabama'   '212628'
226 |    .
227 |    .
228 |    .
229 | 
230 | 3. Plot Population Change
231 | ============================
232 | 
233 | This data is based on the 2021 Population Estimates dataset:
234 | 
235 | https://api.census.gov/data/2021/pep/population/variables.html
236 | 
237 | The percentage change in population is from July 1, 2020 to July 1, 2021 for states
238 | (includes Washington, D.C. and Puerto Rico).
239 | 
240 | .. code-block:: matlab
241 | 
242 |    %% find the percentage population change for each state between 2020 and 2021
243 |    request = webread(api + "2021/pep/population?get=NAME,POP_2021,PPOPCHG_2021&for=state:*" + key);
244 |    for i = 1:length(request)
245 |        for k = 1:length(request{1, 1})
246 |            popChange{i, k} = request{i, 1}{k, 1};
247 |        end
248 |    end
249 |    popChange(1, :) = []
250 | 
251 | **Output:**
252 | 
253 | .. code-block:: matlab
254 | 
255 |    popChange = 52x4 cell
256 |    'Oklahoma'             '3986639' '0.6210955947'  '40'
257 |    'Nebraska'             '1963692' '0.1140479899'  '31'
258 |    'Hawaii'               '1441553' '-0.7134046100' '15'
259 |    'South Dakota'         '895376'  '0.9330412953'  '46'
260 |    'Tennessee'            '6975218' '0.7962146316'  '47'
261 |    'Nevada'               '3143991' '0.9608001873'  '32'
262 |    'New Mexico'           '2115877' '-0.0797613860' '35'
263 |    'Iowa'                 '3193079' '0.1383022195'  '19'
264 |    'Kansas'               '2934582' '-0.0442116160' '20'
265 |    'District of Columbia' '670050'  '-2.9043911470' '11'
266 |    .
267 |    .
268 |    .
269 | 
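Before plotting, it can be convenient to view these results as a MATLAB table. A minimal sketch (our own addition; the variable names follow the code above, and the column names match the requested Census variables):

.. code-block:: matlab

   %% hedged example: convert the cell array to a table for easier inspection
   popTable = cell2table(popChange, "VariableNames", ["NAME", "POP_2021", "PPOPCHG_2021", "state"]);
   popTable.PPOPCHG_2021 = str2double(popTable.PPOPCHG_2021);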
270 | .. code-block:: matlab
271 | 
272 |    %% create two datasets and make a scatter plot of the population change for each state
273 |    clear x
274 |    clear y
275 |    for i = 1:length(popChange)
276 |        x{i, 1} = str2double(popChange{i, 3});
277 |        y{i, 1} = popChange{i, 1};
278 |    end
279 |    f = figure;
280 |    plot(cell2mat(x), categorical(y), 'o','MarkerFaceColor','magenta','MarkerEdgeColor','none');
281 |    title("Population Change from 2020 to 2021");
282 |    xlabel("% Population Change");
283 |    ylabel("States (including Washington DC and Puerto Rico)");
284 |    f.Position = [680, 558, 560, 800];
285 | 
286 | **Output:**
287 | 
288 | .. image:: imgs/matlab_uscensus_im0.png
289 | 
--------------------------------------------------------------------------------
/shell/crossref.rst:
--------------------------------------------------------------------------------
1 | Crossref API in Bash
2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3 | 
4 | by Avery Fernandez
5 | 
6 | **Crossref API documentation:** https://api.crossref.org/swagger-ui/index.html
7 | 
8 | These recipe examples were tested on March 7, 2022 using Ubuntu 18.04.
9 | 
10 | *From our testing, we have found that the Crossref metadata across publishers and even journals can vary considerably. As a result, it can be easier to work with one journal at a time when using the Crossref API (e.g., particularly when trying to extract selected data from records).*
11 | 
12 | Program requirements
13 | =========================
14 | 
15 | In order to run this code, you will need to first install `curl`_ and `jq`_. curl is used to request the data from the API and jq is used to parse the JSON data.
16 | 
17 | .. _curl: https://github.com/curl/curl
18 | .. _jq: https://stedolan.github.io/jq/
19 | 
20 | 
21 | 1. Basic Crossref API call
22 | ===========================
23 | 
24 | Setup API parameters
25 | ---------------------
26 | 
27 | .. code-block:: shell
28 | 
29 |    base_url="https://api.crossref.org/works/"; email="your_email@ua.edu"; mailto="?mailto="$email; doi="10.1186/1758-2946-4-12"
30 | 
31 | .. note::
32 | 
33 |    The ``;`` allows us to enter multiple variable assignments on one line and the ``$`` allows for variable expansion.
34 | 
35 | Request data from Crossref API
36 | -------------------------------
37 | 
38 | If you want to view the returned JSON data directly, you can pipe the curl -s (silent option) output to jq:
39 | 
40 | .. code-block:: shell
41 | 
42 |    curl -s $base_url$doi$mailto | jq '.'
43 | 
44 | *Output not shown here*
45 | 
46 | However, for our case, we will redirect the output to a file named storage.json:
47 | 
48 | .. code-block:: shell
49 | 
50 |    curl $base_url$doi$mailto > storage.json
51 | 
52 | Select some specific data
53 | ---------------------------
54 | 
55 | For example, the container-title data, which contains the journal title:
56 | 
57 | .. code-block:: shell
58 | 
59 |    cat storage.json | jq '.["message"]["container-title"][0]'
60 | 
61 | **Output:**
62 | 
63 | .. code-block:: shell
64 | 
65 |    "Journal of Cheminformatics"
66 | 
67 | Get article title:
68 | 
69 | .. code-block:: shell
70 | 
71 |    cat storage.json | jq '.["message"]["title"][0]'
72 | 
73 | **Output:**
74 | 
75 | .. code-block:: shell
76 | 
77 |    "The Molecule Cloud - compact visualization of large collections of molecules"
78 | 
79 | Get article author names. First, check how many authors there are. One method is to use jq's built-in length function:
80 | 
81 | .. code-block:: shell
82 | 
83 |    cat storage.json | jq '.["message"]["author"] | length'
84 | 
85 | **Output:**
86 | 
87 | .. code-block:: shell
88 | 
89 |    2
90 | 
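As an aside, jq's string interpolation can print all of the author names in one pass, without a shell loop. A minimal alternative sketch (our own addition, not part of the original recipe):

.. code-block:: shell

   cat storage.json | jq -r '.message.author[] | "\(.given) \(.family)"'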
91 | Now we can incorporate the length into a for loop:
92 | 
93 | .. note::
94 | 
95 |    - The below for loop uses C syntax for looping range (e.g., ``for (( variable = 0; variable < range ; variable++ ))``).
96 |    - The first name and last name of the authors are concatenated together using ``$`` variable expansion.
97 |    - The ``tr -d '"'`` command removes extra quotes around the names.
98 | 
99 | .. code-block:: shell
100 | 
101 |    for (( i = 0; i < $(cat storage.json | jq '.["message"]["author"] | length'); i++ ))
102 |    do
103 |        name=$(cat storage.json | jq ".message.author[$i].given" | tr -d '"')$" "$(cat storage.json | jq ".message.author[$i].family" | tr -d '"');
104 |        echo $name;
105 |    done
106 | 
107 | **Output:**
108 | 
109 | .. code-block:: shell
110 | 
111 |    Peter Ertl
112 |    Bernhard Rohde
113 | 
114 | 
115 | Get bibliography references:
116 | 
117 | .. code-block:: shell
118 | 
119 |    cat storage.json | jq '.["message"]["reference"][].unstructured'
120 | 
121 | **Output:**
122 | 
123 | .. code-block:: shell
124 | 
125 |    "Martin E, Ertl P, Hunt P, Duca J, Lewis R: Gazing into the crystal ball; the future of computer-aided drug design. J Comp-Aided Mol Des. 2011, 26: 77-79."
126 |    "Langdon SR, Brown N, Blagg J: Scaffold diversity of exemplified medicinal chemistry space. J Chem Inf Model. 2011, 26: 2174-2185."
127 |    "Blum LC, Reymond J-C: 970 Million druglike small molecules for virtual screening in the chemical universe database GDB-13. J Am Chem Soc. 2009, 131: 8732-8733. 10.1021/ja902302h."
128 | 
129 | 
130 | 2. Crossref API call with a Loop
131 | =================================
132 | 
133 | Setup API parameters
134 | ---------------------
135 | 
136 | .. code-block:: shell
137 | 
138 |    base_url="https://api.crossref.org/works/"; email="your_email@ua.edu"; mailto="?mailto="$email
139 | 
140 | 
141 | Create a list of DOIs
142 | ----------------------
143 | 
144 | .. code-block:: shell
145 | 
146 |    doi_list=('10.1021/acsomega.1c03250' '10.1021/acsomega.1c05512' '10.1021/acsomega.8b01647' '10.1021/acsomega.1c04287' '10.1021/acsomega.8b01834')
147 | 
148 | Request metadata for each DOI from Crossref API and save to an array
149 | ---------------------------------------------------------------------
150 | 
151 | .. code-block:: shell
152 | 
153 |    declare -a my_array
154 |    for (( i = 0 ; i < ${#doi_list[@]} ; i++ )); do
155 |        my_array[$i]=$(curl $base_url${doi_list[$i]}$mailto)
156 |        sleep 1;
157 |    done
158 | 
159 | .. note::
160 | 
161 |    ``declare -a`` creates an array variable; ``${#doi_list[@]}`` returns length.
162 | 
163 | Select some specific data
164 | ---------------------------
165 | 
166 | Get article titles:
167 | 
168 | .. code-block:: shell
169 | 
170 |    for i in "${!my_array[@]}"
171 |    do
172 |        echo ${my_array[$i]} | jq '.["message"]["title"][0]'
173 |    done
174 | 
175 | .. note::
176 | 
177 |    ``"${!my_array[@]}"`` returns array range.
178 | 
179 | **Output:**
180 | 
181 | .. code-block:: shell
182 | 
183 |    "Navigating into the Chemical Space of Monoamine Oxidase Inhibitors by Artificial Intelligence and Cheminformatics Approach"
184 |    "Impact of Artificial Intelligence on Compound Discovery, Design, and Synthesis"
185 |    "How Precise Are Our Quantitative Structure–Activity Relationship Derived Predictions for New Query Chemicals?"
186 | "Applying Neuromorphic Computing Simulation in Band Gap Prediction and Chemical Reaction Classification" 187 | "QSPR Modeling of the Refractive Index for Diverse Polymers Using 2D Descriptors" 188 | 189 | Get all author affiliations for each article: 190 | 191 | .. code-block:: shell 192 | 193 | for i in "${!my_array[@]}" 194 | do 195 | echo ${my_array[$i]} | jq '.["message"]["author"][].affiliation[0].name' 196 | done 197 | 198 | .. code-block:: shell 199 | 200 | "Department of Pharmaceutical Chemistry and Analysis, Amrita School of Pharmacy, Amrita Vishwa Vidyapeetham, AIMS Health Sciences Campus, Kochi 682041, India" 201 | "Department of Pharmaceutical Chemistry and Analysis, Amrita School of Pharmacy, Amrita Vishwa Vidyapeetham, AIMS Health Sciences Campus, Kochi 682041, India" 202 | ... 203 | ... 204 | "Department of Chemical and Biomolecular Engineering, The Ohio State University, Columbus, Ohio 43210, United States" 205 | "Department of Chemical and Biomolecular Engineering, The Ohio State University, Columbus, Ohio 43210, United States" 206 | "Department of Pharmacoinformatics, National Institute of Pharmaceutical Educational and Research (NIPER), Chunilal Bhawan, 168, Manikata Main Road, 700054 Kolkata, India" 207 | "Department of Coatings and Polymeric Materials, North Dakota State University, Fargo, North Dakota 58108-6050, United States" 208 | "Drug Theoretics and Cheminformatics Laboratory, Division of Medicinal and Pharmaceutical Chemistry, Department of Pharmaceutical Technology, Jadavpur University, 700032 Kolkata, India" 209 | 210 | 211 | 3. Crossref API call for Journal information 212 | ============================================== 213 | 214 | Setup API parameters 215 | --------------------- 216 | 217 | We will use the issn for the journal *BMC Bioinformatics* as an example: 218 | 219 | .. code-block:: shell 220 | 221 | jbase_url="https://api.crossref.org/journals/"; email="your_email@ua.edu"; mailto="?mailto="$email; issn="1471-2105" 222 | 223 | 224 | Request journal data from crossref API 225 | --------------------------------------- 226 | 227 | .. code-block:: shell 228 | 229 | curl -s $jbase_url$issn$mailto | jq '.' 230 | 231 | *Output not shown here* 232 | 233 | 234 | 4. Crossref API - Get article DOIs for a journal 235 | ================================================= 236 | 237 | Setup API parameters 238 | --------------------- 239 | 240 | We will use the issn for the journal *BMC Bioinformatics* and year 2014 as an example: 241 | 242 | .. code-block:: shell 243 | 244 | jbase_url="https://api.crossref.org/journals/"; email="your_email@ua.edu"; mailto="&mailto="$email; issn="1471-2105"; journal_works2014="/works?filter=from-pub-date:2014,until-pub-date:2014&select=DOI" 245 | 246 | Request DOI data from Crossref API 247 | ----------------------------------- 248 | 249 | .. code-block:: shell 250 | 251 | curl -s $jbase_url$issn$journal_works2014$mailto | jq '.' 252 | 253 | **Output:** 254 | 255 | .. code-block:: shell 256 | 257 | { 258 | "status": "ok", 259 | "message-type": "work-list", 260 | "message-version": "1.0.0", 261 | "message": { 262 | "facets": {}, 263 | "total-results": 619, 264 | "items": [ 265 | { 266 | "DOI": "10.1186/1471-2105-15-84" 267 | }, 268 | { 269 | "DOI": "10.1186/1471-2105-15-94" 270 | }, 271 | { 272 | "DOI": "10.1186/1471-2105-15-172" 273 | }, 274 | { 275 | "DOI": "10.1186/1471-2105-15-106" 276 | }, 277 | { 278 | "DOI": "10.1186/1471-2105-15-s9-s12" 279 | 280 | ... 281 | ... 
282 | 
283 |        },
284 |        {
285 |          "DOI": "10.1186/1471-2105-15-266"
286 |        }
287 |      ],
288 |      "items-per-page": 20,
289 |      "query": {
290 |        "start-index": 0,
291 |        "search-terms": null
292 |      }
293 |    }
294 |    }
295 | 
296 | By default, 20 results are displayed. Crossref allows up to 1000 returned results using the rows parameter.
297 | To get all 619 results, we can increase the number of returned rows and save the JSON output to a file:
298 | 
299 | 
300 | .. code-block:: shell
301 | 
302 |    rows="&rows=700"
303 |    curl $jbase_url$issn$journal_works2014$rows$mailto > dois_save.json
304 | 
305 | Extract DOIs
306 | -----------------------------------
307 | 
308 | .. code-block:: shell
309 | 
310 |    cat dois_save.json | jq '.["message"]["items"][].DOI'
311 | 
312 | **Output:**
313 | 
314 | .. code-block:: shell
315 | 
316 |    "10.1186/1471-2105-15-84"
317 |    "10.1186/1471-2105-15-94"
318 |    "10.1186/1471-2105-15-172"
319 |    "10.1186/1471-2105-15-106"
320 |    "10.1186/1471-2105-15-s9-s12"
321 |    "10.1186/1471-2105-15-33"
322 |    "10.1186/1471-2105-15-s10-p33"
323 |    "10.1186/1471-2105-15-161"
324 |    "10.1186/1471-2105-15-278"
325 |    "10.1186/1471-2105-15-147"
326 |    "10.1186/1471-2105-15-s13-s3"
327 |    "10.1186/1471-2105-15-254"
328 |    "10.1186/1471-2105-15-s10-p24"
329 |    "10.1186/1471-2105-15-s10-p6"
330 |    "10.1186/s12859-014-0411-1"
331 |    ...
332 |    ...
333 | 
334 | .. code-block:: shell
335 | 
336 |    cat dois_save.json | jq '.["message"]["items"][].DOI' | wc -l
337 | 
338 | **Output:**
339 | 
340 | .. code-block:: shell
341 | 
342 |    619
343 | 
344 | **What if we have more than 1000 results in a single query?**
345 | 
346 | For example, if we wanted the DOIs from BMC Bioinformatics for years 2014 through 2016, we see that there are 1772 DOIs:
347 | 
348 | .. code-block:: shell
349 | 
350 |    journal_works2014_2016="/works?filter=from-pub-date:2014,until-pub-date:2016&select=DOI"
351 |    curl -s $jbase_url$issn$journal_works2014_2016$mailto | jq '.["message"]["total-results"]'
352 | 
353 | **Output:**
354 | 
355 | .. code-block:: shell
356 | 
357 |    1772
358 | 
359 | An additional parameter that we can use with the Crossref API is called “offset”. The offset option allows us to select sets of records and define a starting position (e.g., the first 1000, and then the second set of up to 1000).
360 | 
361 | .. code-block:: shell
362 | 
363 |    rows="&rows=1000"
364 | 
365 | .. code-block:: shell
366 | 
367 |    numResults=$(curl -s $jbase_url$issn$journal_works2014_2016$mailto | jq '.["message"]["total-results"]')
368 |    echo $numResults
369 | 
370 | **Output:**
371 | 
372 | .. code-block:: shell
373 | 
374 |    1772
375 | 
376 | .. code-block:: shell
377 | 
378 |    for (( n = 0; n < numResults; n+=1000)); do
379 |        curl -s $jbase_url$issn$journal_works2014_2016$rows$"&offset="$n$mailto | jq '.["message"]["items"][].DOI' >> dois_save2.txt
380 |        sleep 1;
381 |    done
382 | 
383 | .. code-block:: shell
384 | 
385 |    head dois_save2.txt
386 | 
387 | **Output:**
388 | 
389 | .. code-block:: shell
390 | 
391 |    "10.1186/1471-2105-15-84"
392 |    "10.1186/1471-2105-15-94"
393 |    "10.1186/1471-2105-16-s15-p11"
394 |    "10.1186/s12859-016-1335-8"
395 |    "10.1186/1471-2105-15-172"
396 |    "10.1186/s12859-015-0538-8"
397 |    "10.1186/1471-2105-15-106"
398 |    "10.1186/1471-2105-16-s15-p20"
399 |    "10.1186/1471-2105-15-s9-s12"
400 |    "10.1186/s12859-016-1202-7"
401 | 
402 | .. code-block:: shell
403 | 
404 |    cat dois_save2.txt | wc -l
405 | 
406 | **Output:**
407 | 
408 | .. code-block:: shell
409 | 
410 |    1772
411 | 
412 | 
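Note that Crossref limits how deeply the offset parameter can page into a result set. For very large queries, the Crossref documentation describes cursor-based deep paging instead: request ``cursor=*`` on the first call, then follow the returned ``next-cursor`` value. A minimal sketch of the idea (our own untested adaptation, not part of the original recipe; parameter and field names per the Crossref API documentation):

.. code-block:: shell

   cursor="*"
   while true; do
       page=$(curl -s $jbase_url$issn$journal_works2014_2016$rows$"&cursor="$cursor$mailto)
       count=$(echo "$page" | jq '.["message"]["items"] | length')
       if [ "$count" -eq 0 ]; then break; fi
       echo "$page" | jq '.["message"]["items"][].DOI' >> dois_cursor.txt
       # URL-encode the next cursor token before reusing it
       cursor=$(echo "$page" | jq -r '.["message"]["next-cursor"]' | tr -d '\n' | jq -sRr @uri)
       sleep 1
   done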
code-block:: shell 409 | 410 | 1772 411 | 412 | -------------------------------------------------------------------------------- /matlab/world-bank.rst: -------------------------------------------------------------------------------- 1 | World Bank API in Matlab 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | .. sectionauthor:: Vincent F. Scalfani 5 | 6 | by Vincent F. Scalfani 7 | 8 | See the `World Bank API documentation`_ 9 | 10 | .. _World Bank API documentation: https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation 11 | 12 | These recipe examples were tested on February 12, 2022. 13 | 14 | 1. Get list of country iso2Codes and names 15 | ============================================== 16 | 17 | For obtaining data from the World Bank API, it is helpful to first obtain a list of country codes and names. 18 | 19 | .. code-block:: matlab 20 | 21 | % define root WorldBank API 22 | api = 'https://api.worldbank.org/v2/'; 23 | 24 | % define api url for getting country code data 25 | country_url = [api 'country/?format=json&per_page=500']; 26 | 27 | % read the url and import data to Matlab as JSON data 28 | country_data = webread(country_url); 29 | 30 | % Extract out iso2code from countries data 31 | country_iso2Code = extractfield(country_data{2,1}, 'iso2Code'); 32 | 33 | % Transform data 34 | country_iso2Code = country_iso2Code'; 35 | 36 | % length of data 37 | length(country_iso2Code) 38 | ans = 299 39 | 40 | .. code-block:: matlab 41 | 42 | % display first 10 43 | disp(country_iso2Code(1:10,:)) 44 | {'AW'} 45 | {'ZH'} 46 | {'AF'} 47 | {'A9'} 48 | {'ZI'} 49 | {'AO'} 50 | {'AL'} 51 | {'AD'} 52 | {'1A'} 53 | {'AE'} 54 | 55 | .. code-block:: matlab 56 | 57 | % Extract out country names 58 | country_name = extractfield(country_data{2,1}, 'name'); 59 | 60 | % Transform data 61 | country_name = country_name'; 62 | 63 | % length of data 64 | length(country_name) 65 | ans = 299 66 | 67 | .. code-block:: matlab 68 | 69 | % display first 10 70 | disp(country_name(1:10,:)) 71 | {'Aruba' } 72 | {'Africa Eastern and Southern'} 73 | {'Afghanistan' } 74 | {'Africa' } 75 | {'Africa Western and Central' } 76 | {'Angola' } 77 | {'Albania' } 78 | {'Andorra' } 79 | {'Arab World' } 80 | {'United Arab Emirates' } 81 | 82 | .. code-block:: matlab 83 | 84 | % now combine country_iso2Code and country name 85 | country_iso2code_name = [country_iso2Code country_name]; 86 | disp(country_iso2code_name(1:10,:)) 87 | {'AW'} {'Aruba' } 88 | {'ZH'} {'Africa Eastern and Southern'} 89 | {'AF'} {'Afghanistan' } 90 | {'A9'} {'Africa' } 91 | {'ZI'} {'Africa Western and Central' } 92 | {'AO'} {'Angola' } 93 | {'AL'} {'Albania' } 94 | {'AD'} {'Andorra' } 95 | {'1A'} {'Arab World' } 96 | {'AE'} {'United Arab Emirates' } 97 | 98 | Now we know the country iso2Codes, which we can use to pull specific indicator data for countries. 99 | 100 | 2. Compile a Custom Indicator Dataset 101 | ========================================= 102 | 103 | There are many available indicators: https://data.worldbank.org/indicator 104 | 105 | We will select three indicators for this example: 106 | 107 | 1. Scientific and Technical Journal Article Data = `IP.JRN.ARTC.SC`_ 108 | 2. Patent Applications, residents = `IP.PAT.RESD`_ 109 | 3. GDP per capita (current US$) Code = `NY.GDP.PCAP.CD`_ 110 | 111 | 112 | Note that these three selected indicators have a `CC-BY 4.0 license`_. 113 | We will compile this indicator data for the United States (US) and United Kingdom (GB). 114 | 115 | ..
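Before compiling the full dataset, it can help to preview a single indicator request. The following sketch is an illustrative addition (not part of the original recipe); it assumes the `api` base URL variable defined in section 1:

.. code-block:: matlab

   % preview one indicator for one country (assumes `api` from section 1)
   gdp_url = [api 'country/US/indicator/NY.GDP.PCAP.CD/?format=json&per_page=500'];
   gdp_data = webread(gdp_url);

   % gdp_data{2,1} is a struct array of yearly records with date and value fields
   disp(gdp_data{2,1}(1).date)
   disp(gdp_data{2,1}(1).value)

..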
_IP.JRN.ARTC.SC: https://data.worldbank.org/indicator/IP.JRN.ARTC.SC?view=chart 116 | .. _IP.PAT.RESD: https://data.worldbank.org/indicator/IP.PAT.RESD?view=chart 117 | .. _NY.GDP.PCAP.CD: https://data.worldbank.org/indicator/NY.GDP.PCAP.CD?view=chart 118 | .. _CC-BY 4.0 license: https://datacatalog.worldbank.org/public-licenses#cc-by 119 | 120 | .. code-block:: matlab 121 | 122 | % define indicators 123 | indicators = {'IP.JRN.ARTC.SC','IP.PAT.RESD','NY.GDP.PCAP.CD'}; 124 | 125 | % generate the web API urls we need for U.S. 126 | for j = 1:length(indicators) 127 | US_api_URL{j} = [api 'country/US/indicator/' indicators{j} '/?format=json&per_page=500']; 128 | end 129 | 130 | % retrieve data 131 | for i = 1:length(US_api_URL) 132 | try 133 | US_indicator_data{i} = webread(char(US_api_URL(i))); 134 | 135 | % be kind to World Bank Servers, add a 1 second pause 136 | n = 1; 137 | pause(n) 138 | catch ME 139 | disp('not found') 140 | end 141 | end 142 | 143 | % generate web API urls we need for the UK (GB) 144 | for j = 1:length(indicators) 145 | UK_api_URL{j} = [api 'country/GB/indicator/' indicators{j} '/?format=json&per_page=500']; 146 | end 147 | 148 | % retrieve data 149 | 150 | for i = 1:length(UK_api_URL) 151 | try 152 | UK_indicator_data{i} = webread(char(UK_api_URL(i))); 153 | 154 | % be kind to World Bank Servers, add a 1 second pause 155 | n = 1; 156 | pause(n) 157 | catch ME 158 | disp('not found') 159 | end 160 | 161 | end 162 | 163 | % Extract the data and compile 164 | % N.B. we cannot use the Matlab extractfield function here because some of the values are empty cells 165 | 166 | % US Data compilation 167 | for k = 1:length(US_indicator_data{1,1}{2,1}) 168 | 169 | % Scientific and Technical Journal Article Data Indicator Code = IP.JRN.ARTC.SC 170 | US_IPja_date{k} = US_indicator_data{1,1}{2,1}(k).date; 171 | US_IPja_value{k} = US_indicator_data{1,1}{2,1}(k).value; 172 | 173 | % Patent Applications, residents = IP.PAT.RESD 174 | US_IPpat_date{k} = US_indicator_data{1,2}{2,1}(k).date; 175 | US_IPpat_value{k} = US_indicator_data{1,2}{2,1}(k).value; 176 | 177 | % GDP per capita (current US$) Code = NY.GDP.PCAP.CD 178 | US_NYGDP_date{k} = US_indicator_data{1,3}{2,1}(k).date; 179 | US_NYGDP_value{k} = US_indicator_data{1,3}{2,1}(k).value; 180 | 181 | end 182 | 183 | % compile 184 | US_data = [US_IPja_date; US_IPja_value; US_IPpat_date; US_IPpat_value;... 185 | US_NYGDP_date; US_NYGDP_value]; 186 | 187 | % transform 188 | US_data = US_data'; 189 | 190 | % length of data 191 | length(US_data) 192 | ans = 61 193 | 194 | .. code-block:: matlab 195 | 196 | % display first 10 rows 197 | disp(US_data(1:10,:)) 198 | {'2020'} {0×0 double } {'2020'} {0×0 double} {'2020'} {[6.3414e+04]} 199 | {'2019'} {0×0 double } {'2019'} {[ 285113]} {'2019'} {[6.5280e+04]} 200 | {'2018'} {[4.2281e+05]} {'2018'} {[ 285095]} {'2018'} {[6.3064e+04]} 201 | {'2017'} {[4.3222e+05]} {'2017'} {[ 293904]} {'2017'} {[6.0110e+04]} 202 | {'2016'} {[4.2726e+05]} {'2016'} {[ 295327]} {'2016'} {[5.8021e+04]} 203 | {'2015'} {[4.2999e+05]} {'2015'} {[ 288335]} {'2015'} {[5.6863e+04]} 204 | {'2014'} {[4.3319e+05]} {'2014'} {[ 285096]} {'2014'} {[5.5050e+04]} 205 | {'2013'} {[4.2957e+05]} {'2013'} {[ 287831]} {'2013'} {[5.3107e+04]} 206 | {'2012'} {[4.2800e+05]} {'2012'} {[ 268782]} {'2012'} {[5.1603e+04]} 207 | {'2011'} {[4.2396e+05]} {'2011'} {[ 247750]} {'2011'} {[4.9883e+04]} 208 | 209 | ..
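The per-field extraction above can also be written more compactly with `arrayfun`. The following sketch is an illustrative addition that assumes the `US_indicator_data` variable built above; `'UniformOutput', false` keeps empty indicator values as empty cells rather than raising an error:

.. code-block:: matlab

   % alternative extraction with arrayfun (assumes US_indicator_data from above)
   recs = US_indicator_data{1,1}{2,1};
   ja_dates  = arrayfun(@(r) r.date,  recs, 'UniformOutput', false);
   ja_values = arrayfun(@(r) r.value, recs, 'UniformOutput', false);

   % show the first three date/value pairs
   disp([ja_dates(1:3) ja_values(1:3)])

..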
code-block:: matlab 210 | 211 | % Convert the year data to numbers for plotting 212 | USyears = cellfun(@str2num, US_data(:,1),'UniformOutput', false); 213 | 214 | % since other year columns are the same, we can simplify this matrix 215 | US_data = [USyears US_data(:,2) US_data(:,4) US_data(:,6)]; 216 | 217 | % display first 10 rows 218 | disp(US_data(1:10,:)) 219 | {[2020]} {0×0 double } {0×0 double} {[6.3414e+04]} 220 | {[2019]} {0×0 double } {[ 285113]} {[6.5280e+04]} 221 | {[2018]} {[4.2281e+05]} {[ 285095]} {[6.3064e+04]} 222 | {[2017]} {[4.3222e+05]} {[ 293904]} {[6.0110e+04]} 223 | {[2016]} {[4.2726e+05]} {[ 295327]} {[5.8021e+04]} 224 | {[2015]} {[4.2999e+05]} {[ 288335]} {[5.6863e+04]} 225 | {[2014]} {[4.3319e+05]} {[ 285096]} {[5.5050e+04]} 226 | {[2013]} {[4.2957e+05]} {[ 287831]} {[5.3107e+04]} 227 | {[2012]} {[4.2800e+05]} {[ 268782]} {[5.1603e+04]} 228 | {[2011]} {[4.2396e+05]} {[ 247750]} {[4.9883e+04]} 229 | 230 | .. code-block:: matlab 231 | 232 | % convert empty cells [] to NaN (this is required for plotting) 233 | empties = cellfun('isempty',US_data); 234 | US_data(empties) = {NaN}; 235 | US_data = cell2mat(US_data); 236 | 237 | % display first 10 rows 238 | % column 1: year 239 | % column 2: Scientific and Technical Journal Article Data = IP.JRN.ARTC.SC 240 | % column 3: Patent Applications, residents = IP.PAT.RESD 241 | % column 4: GDP per capita (current US$) Code = NY.GDP.PCAP.CD 242 | 243 | disp(US_data(1:10,:)) 244 | 1.0e+05 * 245 | 246 | 0.0202 NaN NaN 0.6341 247 | 0.0202 NaN 2.8511 0.6528 248 | 0.0202 4.2281 2.8510 0.6306 249 | 0.0202 4.3222 2.9390 0.6011 250 | 0.0202 4.2726 2.9533 0.5802 251 | 0.0202 4.2999 2.8834 0.5686 252 | 0.0201 4.3319 2.8510 0.5505 253 | 0.0201 4.2957 2.8783 0.5311 254 | 0.0201 4.2800 2.6878 0.5160 255 | 0.0201 4.2396 2.4775 0.4988 256 | 257 | .. code-block:: matlab 258 | 259 | % UK Data Compilation (same workflow as above) 260 | for k = 1:length(UK_indicator_data{1, 1}{2, 1}) 261 | 262 | % Scientific and Technical Journal Article Data = IP.JRN.ARTC.SC 263 | UK_IPja_date{k} = UK_indicator_data{1, 1}{2, 1}(k).date; 264 | UK_IPja_value{k} = UK_indicator_data{1, 1}{2, 1}(k).value; 265 | 266 | % Patent Applications, residents = IP.PAT.RESD 267 | UK_IPpat_date{k} = UK_indicator_data{1, 2}{2, 1}(k).date; 268 | UK_IPpat_value{k} = UK_indicator_data{1, 2}{2, 1}(k).value; 269 | 270 | % GDP per capita (current US$) Code = NY.GDP.PCAP.CD 271 | UK_NYGDP_date{k} = UK_indicator_data{1, 3}{2, 1}(k).date; 272 | UK_NYGDP_value{k} = UK_indicator_data{1, 3}{2, 1}(k).value; 273 | 274 | end 275 | 276 | UK_data = [UK_IPja_date; UK_IPja_value; UK_IPpat_date; UK_IPpat_value;... 277 | UK_NYGDP_date; UK_NYGDP_value]; 278 | 279 | UK_data = UK_data'; 280 | 281 | % length of data 282 | length(UK_data) 283 | ans = 61 284 | 285 | .. 
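Before converting the UK data, a quick sanity check that both datasets cover the same years can save debugging later. This sketch is an illustrative addition; it assumes `US_data` has already been converted to a numeric matrix above, while `UK_data` is still a cell array at this point:

.. code-block:: matlab

   % compare year coverage (assumes US_data is numeric and UK_data is still a cell array)
   uk_years = cellfun(@str2double, UK_data(:,1));
   isequal(US_data(:,1), uk_years)
   % expect ans = 1 (logical) if the yearly records align row-for-row

..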
code-block:: matlab 286 | 287 | % we need to convert the year data to numbers for plotting 288 | UKyears = cellfun(@str2num, UK_data(:,1),'UniformOutput', false); 289 | 290 | % since other year columns are the same, we can simplify this matrix 291 | UK_data = [UKyears UK_data(:,2) UK_data(:,4) UK_data(:,6)]; 292 | 293 | % convert empty cells [] to NaN (this is required for plotting) 294 | empties = cellfun('isempty',UK_data); 295 | UK_data(empties) = {NaN}; 296 | UK_data = cell2mat(UK_data); 297 | 298 | % display first 10 rows 299 | % column 1: year 300 | % column 2: Scientific and Technical Journal Article Data = IP.JRN.ARTC.SC 301 | % column 3: Patent Applications, residents = IP.PAT.RESD 302 | % column 4: GDP per capita (current US$) Code = NY.GDP.PCAP.CD 303 | disp(UK_data(1:10,:)) 304 | 1.0e+04 * 305 | 306 | 0.2020 NaN NaN 4.1125 307 | 0.2019 NaN 1.2061 4.3070 308 | 0.2018 9.7681 1.2865 4.3647 309 | 0.2017 9.9129 1.3301 4.0858 310 | 0.2016 9.9366 1.3876 4.1500 311 | 0.2015 9.9616 1.4867 4.5405 312 | 0.2014 9.9385 1.5196 4.7787 313 | 0.2013 9.9228 1.4972 4.3714 314 | 0.2012 9.8145 1.5370 4.2687 315 | 0.2011 9.5820 1.5343 4.2285 316 | 317 | 3. Plot Indicator data 318 | ========================= 319 | 320 | Create a line plot of US/UK Number of Scientific and Technical Journal Articles and Patents by year 321 | 322 | 323 | .. code-block:: matlab 324 | 325 | % US/UK Number of Scientific and Technical Journal Articles and Patents (yleft) 326 | % GDP per capita (current US$) Code = NY.GDP.PCAP.CD (yright) 327 | 328 | figure 329 | box on 330 | hold on 331 | xlabel('Year') 332 | xlim([2000 2018]) 333 | ylim([1e+03 1e+06]) 334 | ylabel('Number of Scientific/Technical Journal Articles and Patents') 335 | yyaxis left 336 | plot(US_data(:,1),(US_data(:,2)+US_data(:,3)),'-.o',UK_data(:,1), (UK_data(:,2)+UK_data(:,3)),... 337 | '-.^','LineWidth', 3,'MarkerSize', 10) 338 | 339 | yyaxis right 340 | ylim([2e+04 7e+04]) 341 | ylabel('GDP per capita (current US$)') 342 | plot(US_data(:,1),US_data(:,4),':o',UK_data(:,1), UK_data(:,4),... 343 | ':^','LineWidth', 3,'MarkerSize', 10) 344 | 345 | legend('IP United States', 'IP United Kingdom', 'GDP United States','GDP United Kingdom') 346 | hold off 347 | 348 | % make figure wider 349 | set(gcf, 'Units', 'Normalized', 'OuterPosition', [0 0 1 1]); 350 | set(gca,'FontSize',20) 351 | 352 | .. image:: imgs/matlab_worldbank_im0.png 353 | :scale: 25% 354 | 355 | 356 | 357 | -------------------------------------------------------------------------------- /matlab/pubmed.rst: -------------------------------------------------------------------------------- 1 | PubMed API in Matlab 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | .. sectionauthor:: Vincent F. Scalfani 5 | 6 | by Anastasia Ramig 7 | 8 | These recipe examples were tested on November 23, 2022 in MATLAB R2022b. 9 | 10 | **NCBI Entrez Programming Utilities Documentation:** 11 | https://www.ncbi.nlm.nih.gov/books/NBK25501/ 12 | 13 | **Please see NCBI's Data Usage Policies and Disclaimers:** 14 | https://www.ncbi.nlm.nih.gov/home/about/policies/ 15 | 16 | 1. Basic PubMed API call 17 | ============================ 18 | 19 | For calling individual articles and publications, we will need to use this API URL: 20 | 21 | .. code-block:: matlab 22 | 23 | %% set up the API parameters 24 | summary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&"; 25 | 26 | The article we are requesting has PubMed ID: 27933103. 27 | 28 | **retmode** in the web API URL specifies the file format. 
In this example, we will use JSON. 29 | 30 | .. code-block:: matlab 31 | 32 | %% pull specific article data using its ID 33 | url = summary + "id=27933103&retmode=json"; 34 | data_call = webread(url); 35 | 36 | .. code-block:: matlab 37 | 38 | %% index to where the authors are stored 39 | index = data_call.result.x27933103.authors; 40 | disp(index) 41 | 42 | **Output:** 43 | 44 | .. code-block:: matlab 45 | 46 | 8×1 struct array with fields: 47 | 48 | name 49 | authtype 50 | clusterid 51 | 52 | .. code-block:: matlab 53 | 54 | %% index to pull out the list of author names 55 | names = {index(:).name}; 56 | disp(names) 57 | 58 | **Output:** 59 | 60 | .. code-block:: matlab 61 | 62 | Columns 1 through 5 63 | 64 | {'Scalfani VF'} {'Williams AJ'} {'Tkachenko V'} {'Karapetyan K'} {'Pshenichnov A'} 65 | 66 | Columns 6 through 8 67 | 68 | {'Hanson RM'} {'Liddie JM'} {'Bara JE'} 69 | 70 | 2. Request Data using a Loop 71 | ================================ 72 | 73 | First, create a list of PubMed IDs: 74 | 75 | .. code-block:: matlab 76 | 77 | %% create a list of PubMed IDs 78 | idList = [34813985, 34813932, 34813684, 34813661, 34813372, 34813140, 34813072]; 79 | 80 | We will use map containers to store IDs and associated data. MATLAB map containers work by storing 81 | a value and associating it with a specific key. We can establish a map container using: 82 | 83 | .. code-block:: matlab 84 | 85 | %% create an empty map container 86 | map = containers.Map; 87 | map 88 | 89 | **Output:** 90 | 91 | .. code-block:: matlab 92 | 93 | map = 94 | Map with properties: 95 | 96 | Count: 0 97 | KeyType: char 98 | ValueType: any 99 | 100 | .. code-block:: matlab 101 | 102 | %% create two structures and add each id and its corresponding search 103 | multiPapersKeys = {ones(length(idList), 1)}; 104 | multiPapersValues = {ones(length(idList), 1)}; 105 | for i=1:length(idList) 106 | url = summary + "id=" + string(idList(i)) + "&retmode=json"; 107 | multiPapersKeys{i} = idList(i); 108 | multiPapersValues{i} = webread(url); 109 | pause(1) 110 | end 111 | 112 | .. code-block:: matlab 113 | 114 | %% create a container of the search results and index to a specific article 115 | multiPapers = containers.Map(multiPapersKeys, multiPapersValues); 116 | multiPapers(34813985) 117 | 118 | **Output:** 119 | 120 | .. code-block:: matlab 121 | 122 | ans = struct with fields: 123 | header: [1×1 struct] 124 | result: [1×1 struct] 125 | 126 | .. code-block:: matlab 127 | 128 | %% create a new set of ids that are formatted with "x" for indexing 129 | xiSet = {ones(length(idList),1)}; 130 | for i=1:length(idList) 131 | xiSet{i} = "x" + idList(i); 132 | end 133 | 134 | %% get the title for each journal 135 | for i=1:length(idList) 136 | id = idList(i); 137 | displayResult = multiPapers(id).result.(xiSet{i}).source 138 | end 139 | 140 | **Output:** 141 | 142 | .. code-block:: matlab 143 | 144 | displayResult = 'Cell Calcium' 145 | displayResult = 'Methods' 146 | displayResult = 'FEBS J' 147 | displayResult = 'Dev Growth Differ' 148 | displayResult = 'CRISPR J' 149 | displayResult = 'Chembiochem' 150 | displayResult = 'Methods Mol Biol' 151 | 152 | 3. PubMed API Calls with Requests and Parameters 153 | =================================================== 154 | 155 | For searching for articles using search term(s), we will need to use this API URL: 156 | 157 | .. 
code-block:: matlab 158 | 159 | %% set the search url for the API 160 | search = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&"; 161 | 162 | When searching through articles, we are given a few ways of filtering the data. 163 | A list of all the available parameters for these requests can be found in the official NCBI documentation: 164 | 165 | https://www.ncbi.nlm.nih.gov/books/NBK25499/ 166 | 167 | We can specify the database by putting **db=** into the URL. We will be using the PubMed database. 168 | We can also use term to search data by adding **term=**. Just be sure to replace spaces with 169 | a + instead. We can, for example, use a query to search PubMed, such as "neuroscience intervention learning": 170 | 171 | .. code-block:: matlab 172 | 173 | %% search the API 174 | url = search + "term=neuroscience+intervention+learning&retmode=json"; 175 | data = webread(url); 176 | 177 | The number of returned IDs can be adjusted with the **retmax** parameter: 178 | 179 | .. code-block:: matlab 180 | 181 | %% limit the search to 25 articles and pull the list of ids 182 | url = search + "term=neuroscience+intervention+learning&retmax=25&retmode=json"; 183 | data = webread(url); 184 | disp(data.esearchresult.idlist) 185 | 186 | **Output:** 187 | 188 | .. code-block:: matlab 189 | 190 | {'36416175'} 191 | {'36415971'} 192 | {'36414247'} 193 | {'36414012'} 194 | {'36411719'} 195 | {'36411683'} 196 | {'36411673'} 197 | {'36409100'} 198 | {'36409046'} 199 | {'36408530'} 200 | {'36408399'} 201 | {'36408106'} 202 | {'36408061'} 203 | {'36405490'} 204 | {'36405191'} 205 | {'36405080'} 206 | {'36404677'} 207 | {'36404570'} 208 | {'36402843'} 209 | {'36402815'} 210 | {'36402739'} 211 | {'36402496'} 212 | {'36401545'} 213 | {'36399451'} 214 | {'36398842'} 215 | 216 | .. code-block:: matlab 217 | 218 | length(data.esearchresult.idlist) 219 | 220 | **Output:** 221 | 222 | .. code-block:: matlab 223 | 224 | ans = 25 225 | 226 | We can also use the query to search for an author. Add **[au]** after the name to specify it is an author. 227 | 228 | .. code-block:: matlab 229 | 230 | %% search articles by author name 231 | url = search+"term=Darwin[au]&retmode=json"; 232 | data = webread(url); 233 | data.esearchresult.count 234 | 235 | **Output:** 236 | 237 | .. code-block:: matlab 238 | 239 | ans = '603' 240 | 241 | We can also sort results using **usehistory=y**. This allows us to store the data for it to be sorted in the same API call. 242 | The addition of **sort=pub+date** will sort IDs by the publishing date. 243 | 244 | .. code-block:: matlab 245 | 246 | %% perform a search that is sorted by publication date 247 | url = search+"term=Coral+Reefs&retmode=json&usehistory=y&sort=pub+date"; 248 | data = webread(url); 249 | disp(data.esearchresult.idlist) 250 | 251 | **Output:** 252 | 253 | .. code-block:: matlab 254 | 255 | {'35341677'} 256 | {'36252668'} 257 | {'36183766'} 258 | {'36181819'} 259 | {'36055494'} 260 | {'35995149'} 261 | {'36409983'} 262 | {'36265239'} 263 | {'36179999'} 264 | {'36172974'} 265 | {'36168958'} 266 | {'36152066'} 267 | {'36150619'} 268 | {'36129389'} 269 | {'36106689'} 270 | {'36064010'} 271 | {'36054745'} 272 | {'35998799'} 273 | {'35980514'} 274 | {'35718641'} 275 | 276 | .. code-block:: matlab 277 | 278 | %% compare to unsorted 279 | url = search+"term=Coral+Reefs&retmode=json"; 280 | data = webread(url); 281 | disp(data.esearchresult.idlist) 282 | 283 | 284 | **Output:** 285 | 286 | .. 
code-block:: matlab 287 | 288 | {'36416762'} 289 | {'36415309'} 290 | {'36413112'} 291 | {'36409983'} 292 | {'36406938'} 293 | {'36405638'} 294 | {'36401956'} 295 | {'36401815'} 296 | {'36399057'} 297 | {'36395713'} 298 | {'36395226'} 299 | {'36389413'} 300 | {'36385270'} 301 | {'36383546'} 302 | {'36382375'} 303 | {'36379970'} 304 | {'36379169'} 305 | {'36372339'} 306 | {'36371949'} 307 | {'36371558'} 308 | 309 | We can also search based on publication type by adding **AND** into the search in the term: **term=+AND+filter[filterType]**. 310 | 311 | **[pt]** specifies that the filter type is the publication type. More filters can be found at: https://pubmed.ncbi.nlm.nih.gov/help/. 312 | 313 | .. code-block:: matlab 314 | 315 | %% search based on publication type 316 | url = search+"term=stem+cells+AND+clinical+trial[pt]&retmode=json"; 317 | data = webread(url) 318 | 319 | **Output:** 320 | 321 | .. code-block:: matlab 322 | 323 | data = struct with fields: 324 | header: [1×1 struct] 325 | esearchresult: [1×1 struct] 326 | 327 | 4. PubMed API Metadata Visualization 328 | =========================================== 329 | 330 | Frequency of Topic sortpubdate field 331 | ---------------------------------------- 332 | 333 | Extracting the sortpubdate field for a "hydrogel drug" search results, limited to publication type clinical trials: 334 | 335 | .. code-block:: matlab 336 | 337 | %% perform a search using the term "hydrogel drug" and print the list of ids 338 | url = search+"term=hydrogel+drug+AND+clinical+trial[pt]&sort=pub+date&retmax=500&retmode=json"; 339 | data = webread(url); 340 | ids = data.esearchresult.idlist; 341 | length(ids) 342 | 343 | **Output:** 344 | 345 | .. code-block:: matlab 346 | 347 | ans = 299 348 | 349 | .. code-block:: matlab 350 | 351 | %% create a list of publication dates 352 | pubDates = {ones(length(ids), 1)}; 353 | for i=1:length(ids) 354 | url = summary+"id="+string(ids{i})+"&retmode=json"; 355 | request = webread(url); 356 | pause(1) 357 | idNew = "x" + ids{i}; 358 | pubDates{i} = request.result.(idNew).sortpubdate; 359 | end 360 | pubDates{1:10} 361 | 362 | **Output:** 363 | 364 | .. code-block:: matlab 365 | 366 | ans = '2022/12/01 00:00' 367 | ans = '2022/10/19 00:00' 368 | ans = '2022/10/01 00:00' 369 | ans = '2022/10/01 00:00' 370 | ans = '2022/08/01 00:00' 371 | ans = '2022/06/01 00:00' 372 | ans = '2022/05/01 00:00' 373 | ans = '2022/04/01 00:00' 374 | ans = '2022/03/01 00:00' 375 | ans = '2022/01/21 00:00' 376 | 377 | .. code-block:: matlab 378 | 379 | length(pubDates) 380 | 381 | **Output:** 382 | 383 | .. code-block:: matlab 384 | 385 | ans = 299 386 | 387 | .. code-block:: matlab 388 | 389 | %% pull the year from each publication date 390 | datesList = {ones(length(pubDates), 1)}; 391 | for i = 1:length(pubDates) 392 | datesList{i} = str2double(pubDates{i}(1:4)); 393 | end 394 | disp(datesList(1:20)) %% show first 20 395 | 396 | **Output:** 397 | 398 | .. code-block:: matlab 399 | 400 | Columns 1 through 8 401 | 402 | {[2022]} {[2022]} {[2022]} {[2022]} {[2022]} {[2022]} {[2022]} {[2022]} 403 | 404 | Columns 9 through 16 405 | 406 | {[2022]} {[2022]} {[2022]} {[2022]} {[2022]} {[2021]} {[2021]} {[2021]} 407 | 408 | Columns 17 through 20 409 | 410 | {[2021]} {[2021]} {[2021]} {[2021]} 411 | 412 | .. 
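Before plotting, the same year information can be summarized as a count per year. This sketch is an illustrative addition that assumes the `datesList` cell array built above:

.. code-block:: matlab

   %% count publications per year (assumes datesList from above)
   years = cell2mat(datesList);
   uniqueYears = unique(years);
   counts = arrayfun(@(y) sum(years == y), uniqueYears);

   % two columns: year, number of publications
   disp([uniqueYears' counts'])

..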
code-block:: matlab 413 | 414 | %% plot a histogram of the publications according to the decade in which they were published 415 | x = cell2mat(datesList); 416 | 417 | f = figure; 418 | f.Position = [100 100 540 400]; 419 | f(1); 420 | edges = [1980 1985 1990 1995 2000 2005 2010 2015 2020]; 421 | histogram(x) 422 | 423 | **Output:** 424 | 425 | .. image:: imgs/matlab_pm_im0.png 426 | 427 | Frequency of Publication for an Author Search 428 | ------------------------------------------------- 429 | 430 | .. code-block:: matlab 431 | 432 | %% search all of the articles written by a certain author sorted by publication date 433 | url = search+"term=Reed+LK[au]&sort=pub+date&retmax=500&retmode=json"; 434 | data = webread(url); 435 | ids = data.esearchresult.idlist; 436 | length(ids) 437 | 438 | **Output:** 439 | 440 | .. code-block:: matlab 441 | 442 | ans = 55 443 | 444 | .. code-block:: matlab 445 | 446 | %% create a list of publication dates 447 | pubDates = {ones(length(ids), 1)}; 448 | for i=1:length(ids) 449 | url = summary+"id="+string(ids{i})+"&retmode=json"; 450 | request = webread(url); 451 | pause(1) 452 | idNew = "x" + ids{i}; 453 | pubDates{i} = request.result.(idNew).sortpubdate; 454 | end 455 | 456 | .. code-block:: matlab 457 | 458 | %% pull the year from each publication date 459 | datesList = {ones(length(pubDates), 1)}; 460 | for i = 1:length(pubDates) 461 | datesList{i} = str2double(pubDates{i}(1:4)); 462 | end 463 | disp(datesList(1:20)) 464 | 465 | **Output:** 466 | 467 | .. code-block:: matlab 468 | 469 | Columns 1 through 8 470 | 471 | {[2022]} {[2022]} {[2022]} {[2022]} {[2021]} {[2021]} {[2021]} {[2021]} 472 | 473 | Columns 9 through 16 474 | 475 | {[2021]} {[2020]} {[2020]} {[2020]} {[2020]} {[2020]} {[2019]} {[2019]} 476 | 477 | Columns 17 through 20 478 | 479 | {[2019]} {[2018]} {[2018]} {[2018]} 480 | 481 | 482 | .. code-block:: matlab 483 | 484 | %% plot a histogram of the articles according to the decade in which they were published 485 | x = cell2mat(datesList); 486 | 487 | f = figure; 488 | f.Position = [100 100 540 400]; 489 | xticks = ([1950 1960 1970 1980 1990 2000 2010 2020]); 490 | histogram(x) 491 | 492 | **Output:** 493 | 494 | .. image:: imgs/matlab_pm_im1.png 495 | 496 | -------------------------------------------------------------------------------- /shell/scopus.rst: -------------------------------------------------------------------------------- 1 | Scopus API in Bash 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | by Avery Fernandez 5 | 6 | These recipe examples use the Elsevier Scopus API. Code was tested and sample data downloaded from the Scopus 7 | API on April 7, 2022 via http://api.elsevier.com and http://www.scopus.com. This tutorial content is intended to help 8 | facilitate academic research. Before continuing or reusing any of this code, please be aware of 9 | Elsevier's `API policies and appropriate use-cases`_. You will also need to register for an API key 10 | in order to use the Scopus API. 11 | 12 | .. _API policies and appropriate use-cases: https://dev.elsevier.com/use_cases.html 13 | 14 | Setup 15 | ========== 16 | 17 | Program requirements 18 | -------------------- 19 | 20 | In order to run this code, you will need to first install `curl`_ and `jq`_. 21 | curl is used to request the data from the API and jq is used to parse the JSON data. 22 | 23 | .. _curl: https://github.com/curl/curl 24 | ..
_jq: https://stedolan.github.io/jq/ 25 | 26 | API Key Information 27 | ------------------- 28 | 29 | We will start by setting up the API key. Save your key in a text file in 30 | your current directory and import your key as follows: 31 | 32 | .. code-block:: shell 33 | 34 | apiKey=$(cat "apikey.txt") 35 | 36 | Set the url for the base API: 37 | 38 | .. code-block:: shell 39 | 40 | api="https://api.elsevier.com/content/search/scopus" 41 | 42 | 1. Get Author data 43 | ====================== 44 | 45 | Records for Author 46 | -------------------------------- 47 | 48 | .. code-block:: shell 49 | 50 | rawAuthorSearch=$(curl $api$"?query=AU-ID(55764087400)&apiKey=""$apiKey" | jq '.["search-results"]') 51 | 52 | Here is how to view the returned data (`[0]` to show first record): 53 | 54 | .. code-block:: shell 55 | 56 | echo "$rawAuthorSearch" | jq '.["entry"][0]' 57 | 58 | The Raw JSON file is converted into a dictionary/associative array, which can be queried using the keys listed below: 59 | 60 | .. code-block:: shell 61 | 62 | echo "$rawAuthorSearch" | jq '.["entry"][0] | keys' 63 | 64 | **Output:** 65 | 66 | .. code-block:: shell 67 | 68 | [ 69 | "@_fa", 70 | "affiliation", 71 | "citedby-count", 72 | "dc:creator", 73 | "dc:identifier", 74 | "dc:title", 75 | "eid", 76 | "link", 77 | "openaccess", 78 | "openaccessFlag", 79 | "prism:aggregationType", 80 | "prism:coverDate", 81 | "prism:coverDisplayDate", 82 | "prism:doi", 83 | "prism:eIssn", 84 | "prism:issn", 85 | "prism:issueIdentifier", 86 | "prism:pageRange", 87 | "prism:publicationName", 88 | "prism:url", 89 | "prism:volume", 90 | "source-id", 91 | "subtype", 92 | "subtypeDescription" 93 | ] 94 | 95 | Extracting all the DOIs from the author data: 96 | 97 | .. code-block:: shell 98 | 99 | echo "$rawAuthorSearch" | jq '.["entry"][]["prism:doi"]' 100 | 101 | **Output:** 102 | 103 | .. code-block:: shell 104 | 105 | "10.1021/acs.jchemed.1c00904" 106 | "10.5860/crln.82.9.428" 107 | "10.1021/acs.iecr.8b02573" 108 | "10.1021/acs.jchemed.6b00602" 109 | "10.5062/F4TD9VBX" 110 | "10.1021/acs.macromol.6b02005" 111 | "10.1186/s13321-016-0181-z" 112 | "10.1021/acs.chemmater.5b04431" 113 | "10.1021/acs.jchemed.5b00512" 114 | "10.1021/acs.jchemed.5b00375" 115 | "10.5860/crln.76.9.9384" 116 | "10.5860/crln.76.2.9259" 117 | "10.1021/ed400887t" 118 | "10.1016/j.acalib.2014.03.015" 119 | "10.5062/F4XS5SB9" 120 | "10.1021/ma300328u" 121 | "10.1021/mz200108a" 122 | "10.1021/ma201170y" 123 | "10.1021/ma200184u" 124 | "10.1021/cm102374t" 125 | 126 | Extract all titles: 127 | 128 | .. code-block:: shell 129 | 130 | echo "$rawAuthorSearch" | jq '.["entry"][]["dc:title"]' 131 | 132 | **Output:** 133 | 134 | .. 
code-block:: shell 135 | 136 | "Using NCBI Entrez Direct (EDirect) for Small Molecule Chemical Information Searching in a Unix Terminal" 137 | "Using the linux operating system full-time tips and experiences from a subject liaison librarian" 138 | "Analysis of the Frequency and Diversity of 1,3-Dialkylimidazolium Ionic Liquids Appearing in the Literature" 139 | "Rapid Access to Multicolor Three-Dimensional Printed Chemistry and Biochemistry Models Using Visualization and Three-Dimensional Printing Software Programs" 140 | "Text analysis of chemistry thesis and dissertation titles" 141 | "Phototunable Thermoplastic Elastomer Hydrogel Networks" 142 | "Programmatic conversion of crystal structures into 3D printable files using Jmol" 143 | "Dangling-End Double Networks: Tapping Hidden Toughness in Highly Swollen Thermoplastic Elastomer Hydrogels" 144 | "Replacing the Traditional Graduate Chemistry Literature Seminar with a Chemical Research Literacy Course" 145 | "3D Printed Block Copolymer Nanostructures" 146 | "Hypotheses in librarianship: Applying the scientific method" 147 | "Recruiting students to campus: Creating tangible and digital products in the academic library" 148 | "3D printed molecules and extended solid models for teaching symmetry and point groups" 149 | "Repurposing Space in a Science and Engineering Library: Considerations for a Successful Outcome" 150 | "A model for managing 3D printing services in academic libraries" 151 | "Morphological phase behavior of poly(RTIL)-containing diblock copolymer melts" 152 | "Network formation in an orthogonally self-assembling system" 153 | "Access to nanostructured hydrogel networks through photocured body-centered cubic block copolymer melts" 154 | "Synthesis and ordered phase separation of imidazolium-based alkyl-ionic diblock copolymers made via ROMP" 155 | "Thermally stable photocuring chemistry for selective morphological trapping in block copolymer melt systems" 156 | 157 | Citation information: 158 | 159 | .. code-block:: shell 160 | 161 | echo "$rawAuthorSearch" | jq '.["entry"][]["citedby-count"]' 162 | 163 | **Output:** 164 | 165 | .. code-block:: shell 166 | 167 | "0" 168 | "0" 169 | "17" 170 | "24" 171 | "4" 172 | "11" 173 | "20" 174 | "6" 175 | "10" 176 | "25" 177 | "0" 178 | "0" 179 | "97" 180 | "6" 181 | "34" 182 | "40" 183 | "31" 184 | "18" 185 | "45" 186 | "11" 187 | 188 | 2. Author Data in a Loop 189 | ========================== 190 | 191 | Number of Records for Author 192 | --------------------------------- 193 | 194 | Setup an array of Authors and their Scopus IDs: 195 | 196 | .. code-block:: shell 197 | 198 | declare -A names=( [36660678600]="Emy Decker" [57210944451]="Lindsey Lowry" [35783926100]="Karen Chapman" [56133961300]="Kevin Walker" [57194760730]="Sara Whitver" ) 199 | 200 | Find the number of records for each author: 201 | 202 | .. code-block:: shell 203 | 204 | declare -A numRecords 205 | for ids in "${!names[@]}"; 206 | do 207 | echo "$ids" 208 | AuthorData=$(curl $api"?query=AU-ID(""$ids"$")&apiKey=""$apiKey" | jq '.["search-results"]') 209 | echo "$AuthorData" 210 | numRecords[$ids]=$(echo "$AuthorData" | jq '.["opensearch:totalResults"]') 211 | sleep 1 212 | done 213 | 214 | for key in "${!numRecords[@]}"; 215 | do 216 | echo "$key"$": ""${numRecords["$key"]}" 217 | done 218 | 219 | **Output:** 220 | 221 | .. 
code-block:: shell 222 | 223 | 57210944451: "4" 224 | 56133961300: "8" 225 | 36660678600: "14" 226 | 35783926100: "29" 227 | 57194760730: "4" 228 | 229 | Download Record Data 230 | ------------------------ 231 | 232 | Let's say we want the DOIs and cited by counts in a csv file 233 | 234 | .. code-block:: shell 235 | 236 | truncate -s 0 authors.csv 237 | echo $"AuthorID,DOI,citedby" >> authors.csv 238 | for ids in "${!names[@]}"; 239 | do 240 | AuthorData=$(curl $api"?query=AU-ID(""$ids"$")&apiKey=""$apiKey" | jq '.["search-results"]') 241 | sleep 1 242 | length=$(echo "$AuthorData" | jq '.["entry"] | length') 243 | for (( i = 0 ; i < length ; i++)); 244 | do 245 | data=$(echo "$AuthorData" | jq ".entry[$i]") 246 | doi=$(echo "$data" | jq '.["prism:doi"]') 247 | cite=$(echo "$data" | jq '.["citedby-count"]') 248 | echo "${names["$ids"]}"$",""$doi"$",""$cite" >> authors.csv 249 | done 250 | done 251 | 252 | **Output:** 253 | 254 | .. code-block:: shell 255 | 256 | AuthorID,DOI,citedby 257 | Lindsey Lowry,"10.1080/1941126X.2021.1949153","1" 258 | Lindsey Lowry,"10.5860/lrts.65n1.4-13","0" 259 | Lindsey Lowry,"10.1080/00987913.2020.1733173","1" 260 | Lindsey Lowry,"10.1080/1941126X.2019.1634951","0" 261 | Kevin Walker,"10.1016/j.acalib.2021.102450","0" 262 | Kevin Walker,"10.1016/j.acalib.2020.102136","4" 263 | Kevin Walker,"10.1016/j.lisr.2019.100968","2" 264 | Kevin Walker,"10.1016/j.acalib.2019.02.013","10" 265 | Kevin Walker,"10.1027/1614-2241/a000166","2" 266 | ... 267 | ... 268 | 269 | Get the article titles: 270 | 271 | .. code-block:: shell 272 | 273 | for ids in "${!names[@]}"; 274 | do 275 | echo $"Author: ""${names["$ids"]}" 276 | AuthorData=$(curl -s $api"?query=AU-ID(""$ids"$")&apiKey=""$apiKey" | jq '.["search-results"]') # -s makes the download silent 277 | sleep 1 278 | length=$(echo "$AuthorData" | jq '.["entry"] | length') 279 | for (( i = 0 ; i < length ; i++)); 280 | do 281 | data=$(echo "$AuthorData" | jq ".entry[$i]") 282 | echo "$data" | jq '.["dc:title"]' 283 | done 284 | done 285 | 286 | **Output:** 287 | 288 | .. 
code-block:: shell 289 | 290 | Author: Lindsey Lowry 291 | "Exploring the evidence-base for electronic access troubleshooting: Where research meets practice" 292 | "Fighting an uphill battle: Troubleshooting assessment practices in academic libraries" 293 | "Where Do Our Problems Lie?: Comparing Rates of E-Access Problems Across Three Research Institutions" 294 | "Using LastPass to facilitate the gathering of usage statistics for e-resources: a case study" 295 | Author: Kevin Walker 296 | "Exploring adaptive boosting (AdaBoost) as a platform for the predictive modeling of tangible collection usage" 297 | "Assessing information literacy in first year writing" 298 | "Modeling time-to-trigger in library demand-driven acquisitions via survival analysis" 299 | "Application of adaptive boosting (AdaBoost) in demand-driven acquisition (DDA) prediction: A machine-learning approach" 300 | "Applying AdaBoost to Improve Diagnostic Accuracy: A Simulation Study" 301 | "Judging the Need for and Value of DDA in an Academic Research Library Setting" 302 | "Improving generalizability coefficient estimate accuracy: A way to incorporate auxiliary information" 303 | "Student Engagement in One-Shot Library Instruction" 304 | Author: Emy Decker 305 | "Launching chat service during the pandemic: inaugurating a new public service under emergency conditions" 306 | "Making Sense of the Lending Fill Rate in Interlibrary Loan: Investigating Causes for Low Fill Rates and Developing Potential Remedies" 307 | "Reaching academic library users during the COVID-19 pandemic: New and adapted approaches in access services" 308 | "Expediting the delivery of content to library users: When to buy versus when to borrow" 309 | ... 310 | ... 311 | 312 | 3. Get References via a Title Search 313 | ========================================== 314 | 315 | Number of Title Match Records 316 | --------------------------------- 317 | 318 | Search Scopus for all references containing 'ChemSpider' in the record title. 319 | 320 | All the data will be stored in individual **entry** locations. 321 | 322 | .. code-block:: shell 323 | 324 | query=$(curl "$api"$"?query=TITLE(ChemSpider)&apiKey=""$apiKey" | jq '.["search-results"]') 325 | echo "$query" | jq '.["entry"][0]' 326 | length=$(echo "$query" | jq '.["entry"] | length') 327 | 328 | Repeat this in a loop to get the number of Scopus records for each title search: 329 | 330 | .. code-block:: shell 331 | 332 | declare -a titles=("ChemSpider" "PubChem" "ChEMBL" "Reaxys" "SciFinder") 333 | declare -A storage 334 | for title in "${titles[@]}"; 335 | do 336 | storage["$title"]=$(curl "$api"$"?query=TITLE(""$title"$")&apiKey=""$apiKey" | jq '.["search-results"]') 337 | sleep 1 338 | done 339 | 340 | for title in "${!storage[@]}"; 341 | do 342 | search=$(echo "${storage["$title"]}" | jq '.["opensearch:totalResults"]') 343 | echo "$title"$": ""$search" 344 | done 345 | 346 | **Output:** 347 | 348 | .. code-block:: shell 349 | 350 | Reaxys: "8" 351 | PubChem: "83" 352 | SciFinder: "31" 353 | ChemSpider: "7" 354 | ChEMBL: "53" 355 | 356 | Title Match Record Data 357 | ----------------------------------- 358 | 359 | Create a csv of selected metadata: 360 | 361 | ..
code-block:: shell 362 | 363 | truncate -s 0 titles.csv 364 | echo $"Title,DOI,Article,Date" >> titles.csv 365 | for title in "${!storage[@]}"; 366 | do 367 | length=$(echo "${storage["$title"]}" | jq '.["entry"] | length') 368 | for (( i = 0 ; i < "$length" ; i++)); 369 | do 370 | data=$(echo "${storage["$title"]}" | jq ".entry[$i]" ) 371 | doi=$(echo "$data" | jq '.["prism:doi"]') 372 | articleTitle=$(echo "$data" | jq '.["dc:title"]') 373 | date=$(echo "$data" | jq '.["prism:coverDate"]') 374 | echo "$title"$",""$doi"$",""$articleTitle"$",""$date" >> titles.csv 375 | done 376 | done 377 | 378 | **Output:** 379 | 380 | .. code-block:: shell 381 | 382 | Title,DOI,Article,Date 383 | Reaxys,null,"Store unit files for bundling activities - Reaxys","2018-04-06" 384 | Reaxys,null,"Hybrid Retrosynthesis: Organic Synthesis using Reaxys and SciFinder","2015-01-01" 385 | Reaxys,null,"Comparisons of the most important chemistry databases - Scifinder program and reaxys database system","2014-01-30" 386 | Reaxys,"10.1021/bk-2014-1164.ch008","The making of reaxys - Towards unobstructed access to relevant chemistry information","2014-01-01" 387 | Reaxys,null,"A chemistry searcher compares CAS'S SciFinder and elsevier's reaxys","2013-09-01" 388 | Reaxys,null,"Od beilsteina do reaxys","2012-04-30" 389 | Reaxys,null,"Store unit files for bundling activities - Reaxys","2011-11-07" 390 | Reaxys,"10.1002/nadc.201179450","Beilstein and Gmelin combined in Reaxys","2011-04-01" 391 | PubChem,"10.1016/j.bioorg.2022.105648","Structure-based discovery of a specific SHP2 inhibitor with enhanced blood–brain barrier penetration from PubChem database","2022-04-01" 392 | -------------------------------------------------------------------------------- /matlab/crossref.rst: -------------------------------------------------------------------------------- 1 | Crossref API in Matlab 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | .. sectionauthor:: Vincent F. Scalfani 5 | 6 | by Anastasia Ramig 7 | 8 | **Crossref API documentation:** https://api.crossref.org/swagger-ui/index.html 9 | 10 | These recipe examples were tested on April 21, 2022 in MATLAB R2021a. 11 | 12 | *From our testing, we have found that the crossref metadata across publishers and even journals can vary considerably. As a result, it can be easier to work with one journal at a time when using the crossref API (e.g., particularly when trying to extract selected data from records).* 13 | 14 | 1. Basic Crossref API call 15 | ============================ 16 | 17 | Setup API parameters 18 | ---------------------- 19 | 20 | .. code-block:: matlab 21 | 22 | base_url = "https://api.crossref.org/works/"; %% the base url for api calls 23 | email = "your_email@ua.edu"; %% change this to your email 24 | mailto = "?mailto=" + email; 25 | options = weboptions('Timeout', 30); 26 | doi = "10.1186/1758-2946-4-12"; %% example 27 | 28 | Request data from Crossref API 29 | -------------------------------- 30 | 31 | .. code-block:: matlab 32 | 33 | api_data = webread(base_url + doi + mailto, options); 34 | disp(api_data.message) 35 | 36 | **Output:** 37 | 38 | .. 
code-block:: matlab 39 | 40 | indexed: [1×1 struct] 41 | reference_count: 16 42 | publisher: 'Springer Science and Business Media LLC' 43 | issue: '1' 44 | license: [1×1 struct] 45 | content_domain: [1×1 struct] 46 | short_container_title: {'J Cheminform'} 47 | published_print: [1×1 struct] 48 | DOI: '10.1186/1758-2946-4-12' 49 | type: 'journal-article' 50 | created: [1×1 struct] 51 | source: 'Crossref' 52 | is_referenced_by_count: 25 53 | title: {'The Molecule Cloud - compact visualization of large collections of molecules'} 54 | prefix: '10.1186' 55 | volume: '4' 56 | author: [2×1 struct] 57 | member: '297' 58 | published_online: [1×1 struct] 59 | reference: {16×1 cell} 60 | container_title: {'Journal of Cheminformatics'} 61 | original_title: [] 62 | language: 'en' 63 | link: [3×1 struct] 64 | deposited: [1×1 struct] 65 | score: 1 66 | resource: [1×1 struct] 67 | subtitle: [] 68 | short_title: [] 69 | issued: [1×1 struct] 70 | references_count: 16 71 | journal_issue: [1×1 struct] 72 | alternative_id: {'336'} 73 | URL: 'http://dx.doi.org/10.1186/1758-2946-4-12' 74 | relation: [1×1 struct] 75 | ISSN: {'1758-2946'} 76 | issn_type: [1×1 struct] 77 | subject: {4×1 cell} 78 | published: [1×1 struct] 79 | article_number: '12' 80 | 81 | 82 | Select some specific data 83 | ----------------------------- 84 | 85 | .. code-block:: matlab 86 | 87 | %% Get Journal title 88 | api_data.message.container_title 89 | 90 | **Output:** 91 | 92 | .. code-block:: matlab 93 | 94 | ans = 1×1 cell array 95 | {'Journal of Cheminformatics'} 96 | 97 | .. code-block:: matlab 98 | 99 | %% Get article title 100 | api_data.message.title 101 | 102 | **Output:** 103 | 104 | .. code-block:: matlab 105 | 106 | ans = 1×1 cell array 107 | {'The Molecule Cloud - compact visualization of large collections of molecules'} 108 | 109 | .. code-block:: matlab 110 | 111 | %% Get article author names 112 | names{1} = string(api_data.message.author(1).given) + " " + string(api_data.message.author(1).family); 113 | names{2} = string(api_data.message.author(2).given) + " " + string(api_data.message.author(2).family); 114 | disp(names) 115 | 116 | **Output:** 117 | 118 | .. code-block:: matlab 119 | 120 | {["Peter Ertl"]} {["Bernhard Rohde"]} 121 | 122 | .. code-block:: matlab 123 | 124 | %% get the bibliography references 125 | bib_refs = cell(1,length(api_data.message.reference)); % pre-allocate a cell array 126 | for ref = 1:length(api_data.message.reference) 127 | bib_refs{ref} = api_data.message.reference{ref}.unstructured; 128 | end 129 | %% display the first few references 130 | disp(bib_refs(1:5)) 131 | 132 | **Output:** 133 | 134 | .. code-block:: matlab 135 | 136 | Column 1 137 | 138 | {'Martin E, Ertl P, Hunt P, Duca J, Lewis R: Gazing into the crystal ball; the future of com…'} 139 | 140 | Column 2 141 | 142 | {'Langdon SR, Brown N, Blagg J: Scaffold diversity of exemplified medicinal chemistry space.…'} 143 | 144 | Column 3 145 | 146 | {'Blum LC, Reymond J-C: 970 Million druglike small molecules for virtual screening in the ch…'} 147 | 148 | Column 4 149 | 150 | {'Dubois J, Bourg S, Vrain C, Morin-Allory L: Collections of compounds - how to deal with th…'} 151 | 152 | Column 5 153 | 154 | {'Medina-Franco JL, Martinez-Mayorga K, Giulianotti MA, Houghten RA, Pinilla C: Visualizatio…'} 155 | 156 | 2. Crossref API call with a Loop 157 | ================================= 158 | 159 | Setup API parameters 160 | --------------------- 161 | 162 | .. 
code-block:: matlab 163 | 164 | base_url = "https://api.crossref.org/works/"; %% the base url for api calls 165 | email = "your_email@ua.edu"; %% change this to your email 166 | mailto = "?mailto=" + email; 167 | 168 | Create a list of DOIs 169 | ---------------------- 170 | 171 | .. code-block:: matlab 172 | 173 | %% Create a list of DOIs 174 | doi_list = ["10.1021/acsomega.1c03250",... 175 | "10.1021/acsomega.1c05512",... 176 | "10.1021/acsomega.8b01647",... 177 | "10.1021/acsomega.1c04287",... 178 | "10.1021/acsomega.8b01834"]; 179 | 180 | Request metadata for each DOI from Crossref API and save to a structure 181 | -------------------------------------------------------------------------- 182 | 183 | .. code-block:: matlab 184 | 185 | %% get data for each of the dois in the list 186 | doi_metadata = struct; 187 | for doi = 1:length(doi_list) 188 | doi_metadata.doi{doi} = webread(base_url + doi_list(doi) + mailto); 189 | pause(1) 190 | end 191 | doi_metadata 192 | 193 | **Output:** 194 | 195 | .. code-block:: matlab 196 | 197 | doi_metadata = struct with fields: 198 | doi: {[1×1 struct] [1×1 struct] [1×1 struct] [1×1 struct] [1×1 struct]} 199 | 200 | Select some specific data 201 | -------------------------- 202 | 203 | .. code-block:: matlab 204 | 205 | %% Create a table of information 206 | message_array = cell(1, length(doi_metadata.doi)); 207 | for i = 1:length(doi_metadata.doi) 208 | message_array{i} = doi_metadata.doi{1, i}; 209 | end 210 | message_table = cell2table(message_array); 211 | message_table = rows2vars(message_table); 212 | message_table.OriginalVariableNames = []; 213 | %% Get article titles 214 | titles = cell(1,height(message_table)); 215 | for m = 1:height(message_table) 216 | message = [message_table.Var1(m, 1).message]; 217 | titles(m) = message.title; 218 | end 219 | disp(titles) 220 | 221 | **Output:** 222 | 223 | .. code-block:: matlab 224 | 225 | Column 1 226 | 227 | {'Navigating into the Chemical Space of Monoamine Oxidase Inhibitors by Artificial Intellige…'} 228 | 229 | Column 2 230 | 231 | {'Impact of Artificial Intelligence on Compound Discovery, Design, and Synthesis'} 232 | 233 | Column 3 234 | 235 | {'How Precise Are Our Quantitative Structure–Activity Relationship Derived Predictions for N…'} 236 | 237 | Column 4 238 | 239 | {'Applying Neuromorphic Computing Simulation in Band Gap Prediction and Chemical Reaction Cl…'} 240 | 241 | Column 5 242 | 243 | {'QSPR Modeling of the Refractive Index for Diverse Polymers Using 2D Descriptors'} 244 | 245 | 3. Crossref API call for journal information 246 | ============================================== 247 | 248 | Setup API parameters 249 | --------------------- 250 | 251 | .. code-block:: matlab 252 | 253 | jbase_url = "https://api.crossref.org/journals/"; %% the base url for api calls 254 | email = "your_email@ua.edu"; %% change this to your email 255 | mailto = "?mailto=" + email; 256 | issn = "1471-2105"; %% issn for the journal BMC Bioinformatics 257 | 258 | Request journal data from Crossref API 259 | --------------------------------------- 260 | 261 | .. code-block:: matlab 262 | 263 | jour_data = webread(jbase_url + issn + mailto) 264 | 265 | **Output:** 266 | 267 | .. code-block:: matlab 268 | 269 | jour_data = struct with fields: 270 | status: 'ok' 271 | message_type: 'journal' 272 | message_version: '1.0.0' 273 | message: [1×1 struct] 274 | 275 | .. code-block:: matlab 276 | 277 | % get subjects 278 | disp({jour_data.message.subjects.name}) 279 | 280 | **Output:** 281 | 282 | .. 
code-block:: matlab 283 | 284 | Columns 1 through 3 285 | 286 | {'Applied Mathematics'} {'Computer Science Applications'} {'Molecular Biology'} 287 | 288 | Columns 4 through 5 289 | 290 | {'Biochemistry'} {'Structural Biology'} 291 | 292 | 4. Crossref API - Get article DOIs for a journal 293 | ================================================= 294 | 295 | Setup API parameters 296 | ---------------------- 297 | 298 | .. code-block:: matlab 299 | 300 | jbase_url = "https://api.crossref.org/journals/"; %% the base url for api calls 301 | email = "your_email@ua.edu"; %% Change this to be your email 302 | mailto = "&mailto=" + email; 303 | options = weboptions('Timeout', 60); 304 | issn = "1471-2105"; %% issn for the journal BMC Bioinformatics 305 | journal_works2014 = "/works?filter=from-pub-date:2014,until-pub-date:2014&select=DOI"; %% query to get DOIs for 2014 306 | 307 | Request DOI data from Crossref API 308 | ----------------------------------- 309 | 310 | .. code-block:: matlab 311 | 312 | doi_data = webread(jbase_url + issn + journal_works2014 + mailto, options) 313 | 314 | **Output:** 315 | 316 | .. code-block:: matlab 317 | 318 | doi_data = struct with fields: 319 | status: 'ok' 320 | message_type: 'work-list' 321 | message_version: '1.0.0' 322 | message: [1×1 struct] 323 | 324 | 325 | .. code-block:: matlab 326 | 327 | doi_data.message.total_results 328 | 329 | **Output:** 330 | 331 | .. code-block:: matlab 332 | 333 | ans = 334 | 619 335 | 336 | By default, 20 results are returned. Crossref allows up to 1000 returned results using the rows parameter. To get all 619 results, we can increase the number of returned rows. 337 | 338 | .. code-block:: matlab 339 | 340 | rows = "&rows=700"; 341 | weboptions('Timeout', 60); 342 | doi_data_all = webread(jbase_url + issn + journal_works2014 + rows + mailto, options); 343 | 344 | Extract DOIs 345 | -------------- 346 | 347 | .. code-block:: matlab 348 | 349 | dois_list = {doi_data_all.message.items.DOI} 350 | 351 | **Output:** 352 | 353 | .. code-block:: matlab 354 | 355 | dois_list = 1×619 cell 356 | '10.1186/1471-2105-15-158' '10.1186/1471-2105-15-106' '10.1186/1471-2105-15-268' '10.1186/1471-2105-15-248' ... 357 | 358 | 359 | What if we have more than 1000 results in a single query? For example, if we wanted the DOIs from BMC Bioinformatics for years 2014 through 2016? 360 | 361 | .. code-block:: matlab 362 | 363 | jbase_url = "https://api.crossref.org/journals/"; %% the base url for api calls 364 | email = "your_email@ua.edu"; %% Change this to be your email 365 | mailto = "&mailto=" + email; 366 | options = weboptions('Timeout', 60); 367 | issn = "1471-2105"; %% issn for the journal BMC Bioinformatics 368 | journal_works2014_2016 = "/works?filter=from-pub-date:2014,until-pub-date:2016&select=DOI"; %% query to get DOIs for 2014-2016 369 | doi_data2 = webread(jbase_url + issn + journal_works2014_2016 + mailto, options); 370 | 371 | .. code-block:: matlab 372 | 373 | doi_data2.message.total_results 374 | 375 | **Output:** 376 | 377 | .. code-block:: matlab 378 | 379 | ans = 380 | 1772 381 | 382 | Here we see that the total results is over 1000 (total results: 1772). An additional parameter that we can use with crossref API is called "offset". The offset option allows us to select sets of records and define a starting position (e.g., the first 1000, and then the second set of up to 1000). 383 | 384 | .. 
code-block:: matlab 385 | 386 | rows = "&rows=1000"; 387 | numResults = doi_data2.message.total_results; 388 | doi_list2 = cell(1,int16((numResults/1000)+1)); 389 | for n = 1:(int16((numResults/1000)+1)) 390 | query = webread(jbase_url + issn + journal_works2014_2016 + rows + "&offset=" + string((1000*(n-1))) + mailto, options); 391 | pause(1); 392 | doi_list2{n} = query; 393 | end 394 | 395 | .. code-block:: matlab 396 | 397 | %% concatenate the results into a cell array 398 | doi_list3 = [doi_list2{1,1}.message.items; doi_list2{1, 2}.message.items]; 399 | length(doi_list3) 400 | 401 | **Output:** 402 | 403 | .. code-block:: matlab 404 | 405 | ans = 406 | 1772 407 | 408 | .. code-block:: matlab 409 | 410 | % Show index results 1000-1020 411 | disp(struct2cell(doi_list3(1000:1020))) 412 | 413 | **Output:** 414 | 415 | .. code-block:: matlab 416 | 417 | Columns 1 through 2 418 | 419 | {'10.1186/1471-2105-15-139'} {'10.1186/s12859-015-0768-9'} 420 | 421 | Columns 3 through 4 422 | 423 | {'10.1186/1471-2105-15-s6-s1'} {'10.1186/1471-2105-15-157'} 424 | 425 | Columns 5 through 6 426 | 427 | {'10.1186/s12859-016-1246-8'} {'10.1186/s12859-016-1155-x'} 428 | 429 | Columns 7 through 8 430 | 431 | {'10.1186/s12859-014-0381-3'} {'10.1186/s12859-015-0725-7'} 432 | 433 | Columns 9 through 10 434 | 435 | {'10.1186/s12859-015-0465-8'} {'10.1186/s12859-014-0426-7'} 436 | 437 | Columns 11 through 12 438 | 439 | {'10.1186/s12859-016-1326-9'} {'10.1186/s12859-015-0636-7'} 440 | 441 | Columns 13 through 14 442 | 443 | {'10.1186/1471-2105-15-136'} {'10.1186/s12859-015-0789-4'} 444 | 445 | Columns 15 through 16 446 | 447 | {'10.1186/1471-2105-15-164'} {'10.1186/1471-2105-15-121'} 448 | 449 | Columns 17 through 18 450 | 451 | {'10.1186/s12859-016-1272-6'} {'10.1186/1471-2105-15-s13-s2'} 452 | 453 | Columns 19 through 20 454 | 455 | {'10.1186/s12859-015-0451-1'} {'10.1186/s12859-016-0929-5'} 456 | 457 | Column 21 458 | 459 | {'10.1186/s12859-016-1254-8'} 460 | 461 | -------------------------------------------------------------------------------- /c/wiley-tdm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Wiley Text and Data Mining (TDM) in C\n", 8 | "\n", 9 | "by Cyrus Gomes\n", 10 | "\n", 11 | "**Wiley TDM Terms of Use:** https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining\n", 12 | "\n", 13 | "Please check with your institution for their Text and Data Mining Agreement with Wiley. This tutorial content is intended to help facilitate academic research.\n", 14 | "\n", 15 | "The Wiley Text and Data Mining (TDM) API allows users to retrieve the full-text articles of Wiley content in PDF form.\n", 16 | "\n", 17 | "These recipe examples were tested on August 21, 2024.\n", 18 | "\n", 19 | "**_NOTE:_** The Wiley TDM API limits requests to a maximum of 3 requests per second."
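,
"\n",
"As a quick smoke test outside of C, the same endpoint can be called with curl (an illustrative sketch, not part of the original recipe; the `Wiley-TDM-Client-Token` header name and the `<DOI>` placeholder are assumptions to be checked against Wiley's TDM documentation):\n",
"\n",
"```bash\n",
"# Hypothetical check; replace <DOI> with a Wiley article DOI\n",
"curl -L -H \"Wiley-TDM-Client-Token: $(cat wiley_token.txt)\" \\\n",
"     \"https://api.wiley.com/onlinelibrary/tdm/v1/articles/<DOI>\" -o article.pdf\n",
"```"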
20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Setup" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "First, install the CURL and jq packages by typing the following command in the terminal:" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!sudo apt install curl jq" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Create a directory for the Wiley project:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 1, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "!mkdir Wiley" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Change to the newly created Wiley directory:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "%cd Wiley" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Create a variable for API Key\n", 82 | "\n", 83 | "\n", 84 | "### Text and Data Mining Token\n", 85 | "\n", 86 | "A token is required to access the Wiley TDM API. Sign up can be found [here](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining#accordionHeader-2). If creating a new account make sure to log in to access your wiley token. \n", 87 | "\n", 88 | "Make sure to input the wiley token in the C program below." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Create the key file\n", 98 | "!touch \"wiley_token.txt\"" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "We use the following command to access the key as Jupyter does not allow variable sharing for bash scripts." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# Input the key into the file by copy/paste or keying in manually\n", 115 | "# Read the key from the file\n", 116 | "!wiley_token=$(cat \"wiley_token.txt\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We use the `%%file` command to create the following makefile which will compile our program and create an executable." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "Writing makefile\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "%%file makefile\n", 141 | "\n", 142 | "# Set the variable CC to gcc, which is used to build the program\n", 143 | "CC=gcc\n", 144 | "\n", 145 | "# Enable debugging information and enable all compiler warnings\n", 146 | "CFLAGS=-g -Wall\n", 147 | "\n", 148 | "# Set the bin variable as the name of the binary file we are creating\n", 149 | "BIN=wiley\n", 150 | "\n", 151 | "# Create the binary file with the name we put\n", 152 | "all: $(BIN)\n", 153 | "\n", 154 | "# Map any file ending in .c to a binary executable. 
\n", 155 | "# \"$<\" represents the .c file and \"$@\" represents the target binary executable\n", 156 | "%: %.c\n", 157 | "\n", 158 | "\t# Compile the .c file using the gcc compiler with the CFLAGS and links \n", 159 | "\t# resulting binary with the CURL library\n", 160 | "\t$(CC) $(CFLAGS) $< -o $@ -lcurl\n", 161 | "\n", 162 | "# Clean target which removes specific files\n", 163 | "clean:\n", 164 | "\n", 165 | "\t# Remove the binary file and an \".dSYM\" (debug symbols for debugging) directories\n", 166 | "\t# the RM command used -r to remove directories and -f to force delete\n", 167 | "\t$(RM) -rf $(BIN) *.dSYM" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "This command is used again to create our .c file which contains the code for the program" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 6, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Writing ./wiley.c\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "%%file ./wiley.c\n", 192 | "\n", 193 | "#include \n", 194 | "#include \n", 195 | "#include \n", 196 | "#include \n", 197 | "\n", 198 | "// Callback function to write response data to a file\n", 199 | "size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {\n", 200 | " return fwrite(ptr, size, nmemb, stream);\n", 201 | "}\n", 202 | "\n", 203 | "// Function to replace characters in a string\n", 204 | "void replace_char(char *str, char find, char replace) {\n", 205 | " char *current_pos = strchr(str, find);\n", 206 | " while (current_pos) {\n", 207 | " *current_pos = replace;\n", 208 | " current_pos = strchr(current_pos, find);\n", 209 | " }\n", 210 | "}\n", 211 | "\n", 212 | "int main(int argc, char* argv[]) {\n", 213 | " // Default doi and header codes\n", 214 | " char doi[200] = {};\n", 215 | " char header[200] = {};\n", 216 | "\n", 217 | " // If there are enough arguments\n", 218 | " if (argc == 5) {\n", 219 | " // Check argument order\n", 220 | " if (strcmp(argv[1], \"-h\") == 0 && strcmp(argv[3], \"-d\") == 0) {\n", 221 | " strcat(header, argv[2]);\n", 222 | " strcat(doi, argv[4]);\n", 223 | " } else if (strcmp(argv[1], \"-d\") == 0 && strcmp(argv[3], \"-h\") == 0) {\n", 224 | " strcat(doi, argv[2]);\n", 225 | " strcat(header, argv[4]);\n", 226 | " } else {\n", 227 | " fprintf(stderr, \"Invalid argument order.\\n\");\n", 228 | " return -1;\n", 229 | " }\n", 230 | " } else {\n", 231 | " fprintf(stderr, \"Invalid number of arguments.\\n\");\n", 232 | " return -1;\n", 233 | " }\n", 234 | "\n", 235 | " // Construct URL\n", 236 | " char url[300];\n", 237 | " sprintf(url, \"https://api.wiley.com/onlinelibrary/tdm/v1/articles/%s\", doi);\n", 238 | "\n", 239 | " // Include token in header\n", 240 | " struct curl_slist *headers = NULL;\n", 241 | " headers = curl_slist_append(headers, header);\n", 242 | "\n", 243 | " // Initialize libcurl\n", 244 | " curl_global_init(CURL_GLOBAL_ALL);\n", 245 | " CURL *curl = curl_easy_init();\n", 246 | "\n", 247 | " // Set URL and headers\n", 248 | " curl_easy_setopt(curl, CURLOPT_URL, url);\n", 249 | " curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);\n", 250 | "\n", 251 | " // Follow redirects\n", 252 | " curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);\n", 253 | "\n", 254 | " // Generate file name\n", 255 | " char filename[300];\n", 256 | " strcpy(filename, doi);\n", 257 | " replace_char(filename, '/', '_');\n", 258 | " strcat(filename, \".pdf\");\n", 259 | "\n", 
260 | " // Open file for writing\n", 261 | " FILE *file = fopen(filename, \"wb\");\n", 262 | " if (!file) {\n", 263 | " fprintf(stderr, \"Failed to open file for writing\\n\");\n", 264 | " return 1;\n", 265 | " }\n", 266 | "\n", 267 | " // Set callback function to write response data to file\n", 268 | " curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);\n", 269 | " curl_easy_setopt(curl, CURLOPT_WRITEDATA, file);\n", 270 | "\n", 271 | " // Perform GET request\n", 272 | " CURLcode res = curl_easy_perform(curl);\n", 273 | "\n", 274 | " // Debugging: Print the response code\n", 275 | " long response_code;\n", 276 | " curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);\n", 277 | " printf(\"Response code: %ld\\n\", response_code);\n", 278 | "\n", 279 | " // Cleanup\n", 280 | " fclose(file);\n", 281 | " curl_slist_free_all(headers);\n", 282 | " curl_easy_cleanup(curl);\n", 283 | " curl_global_cleanup();\n", 284 | "\n", 285 | " if (res != CURLE_OK) {\n", 286 | " fprintf(stderr, \"Failed to download PDF: %s\\n\", curl_easy_strerror(res));\n", 287 | " return 1;\n", 288 | " }\n", 289 | "\n", 290 | " if (response_code != 200) {\n", 291 | " fprintf(stderr, \"PDF download failed: %s\\n\", filename);\n", 292 | " return 1;\n", 293 | " }\n", 294 | "\n", 295 | " printf(\"PDF downloaded successfully: %s\\n\", filename);\n", 296 | "\n", 297 | " return 0;\n", 298 | "}" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 7, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "# Compile the .c file using the gcc compiler with the CFLAGS and links \n", 311 | "# resulting binary with the CURL library\n", 312 | "gcc -g -Wall wiley.c -o wiley -lcurl\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "!make" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "## 1. Retrieve full-text of an article\n", 325 | "\n", 326 | "The Wiley TDM API returns the full-text of an article as a PDF when given the article's DOI.\n", 327 | "\n", 328 | "In the first example, we download the full-text of the article with the DOI \"10.1002/net.22207\". This article was found on the Wiley Online Library." 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 8, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "Response code: 200\n", 341 | "PDF downloaded successfully: 10.1002_net.22207.pdf\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "%%bash\n", 347 | "\n", 348 | "# DOI of article to download\n", 349 | "doi=\"10.1002/net.22207\"\n", 350 | "\n", 351 | "# Wiley token to be retrieved\n", 352 | "wiley_token=$(cat \"wiley_token.txt\")\n", 353 | "\n", 354 | "# Download PDF using wiley tool\n", 355 | "./wiley -d \"$doi\" -h \"Wiley-TDM-Client-Token: $wiley_token\"" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "## 2. 
Retrieve full-text of multiple articles\n", 363 | "\n", 364 | "In this example, we download 5 articles found in the Wiley Online Library:" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 9, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "name": "stdout", 374 | "output_type": "stream", 375 | "text": [ 376 | "Response code: 200\n", 377 | "PDF downloaded successfully: 10.1111_j.1467-8624.2010.01564.x.pdf\n", 378 | "Response code: 200\n", 379 | "PDF downloaded successfully: 10.1111_1467-8624.00164.pdf\n", 380 | "Response code: 200\n", 381 | "PDF downloaded successfully: 10.1111_cdev.12864.pdf\n", 382 | "Response code: 200\n", 383 | "PDF downloaded successfully: 10.1111_j.1467-8624.2007.00995.x.pdf\n", 384 | "Response code: 200\n", 385 | "PDF downloaded successfully: 10.1111_j.1467-8624.2010.01499.x.pdf\n" 386 | ] 387 | }, 388 | { 389 | "name": "stderr", 390 | "output_type": "stream", 391 | "text": [ 392 | "PDF download failed: 10.1111_j.1467-8624.2010.0149.x.pdf\n" 393 | ] 394 | }, 395 | { 396 | "name": "stdout", 397 | "output_type": "stream", 398 | "text": [ 399 | "Response code: 404\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "%%bash\n", 405 | "\n", 406 | "# DOIs of articles to download\n", 407 | "dois=(\n", 408 | " '10.1111/j.1467-8624.2010.01564.x'\n", 409 | " '10.1111/1467-8624.00164'\n", 410 | " '10.1111/cdev.12864'\n", 411 | " '10.1111/j.1467-8624.2007.00995.x'\n", 412 | " '10.1111/j.1467-8624.2010.01499.x'\n", 413 | " '10.1111/j.1467-8624.2010.0149.x' # Invalid DOI, will throw error\n", 414 | ")\n", 415 | "\n", 416 | "# Retrieve Wiley token from file\n", 417 | "wiley_token=$(cat \"wiley_token.txt\")\n", 418 | "\n", 419 | "# Iterate through each DOI\n", 420 | "for doi in \"${dois[@]}\"; do\n", 421 | " # Download PDF using Wiley tool\n", 422 | " ./wiley -d \"$doi\" -h \"Wiley-TDM-Client-Token: $wiley_token\"\n", 423 | " \n", 424 | " # Sleep for 1 second\n", 425 | " sleep 1\n", 426 | "done" 427 | ] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "cookbook-env", 433 | "language": "python", 434 | "name": "python3" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 3 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython3", 446 | "version": "3.12.1" 447 | }, 448 | "orig_nbformat": 4 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 2 452 | } 453 | -------------------------------------------------------------------------------- /shell/world-bank.rst: -------------------------------------------------------------------------------- 1 | World Bank API in Bash 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | by Avery Fernandez 5 | 6 | See the `World Bank API documentation`_. 7 | 8 | These recipe examples were tested on March 25, 2022 using GNOME Terminal in Ubuntu 18.04. 9 | 10 | .. _World Bank API documentation: https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation 11 | 12 | Program requirements 13 | ========================= 14 | 15 | In order to run this code, you will need to first install `curl`_, `jq`_, and `gnuplot`_. curl is used to request the data from the API, jq is used to parse the JSON data, and gnuplot is used to plot the data. 16 | 17 | .. _curl: https://github.com/curl/curl 18 | .. _jq: https://stedolan.github.io/jq/ 19 | .. _gnuplot: http://www.gnuplot.info/ 20 | 21 | 1. 
Get list of country iso2Codes and names 22 | =========================================== 23 | 24 | For obtaining data from the World Bank API, it is helpful to first obtain a list 25 | of country codes and names. 26 | 27 | First, define the root World Bank API and the API URL for obtaining country code data: 28 | 29 | .. code-block:: shell 30 | 31 | api="https://api.worldbank.org/v2/"; country_url=$api$"country/?format=json&per_page=500" 32 | 33 | .. note:: 34 | 35 | The ``;`` allows us to enter multiple variable assignments on one line and the ``$`` allows for variable expansion. 36 | 37 | Next, request and save the JSON data from the World Bank API: 38 | 39 | .. code-block:: shell 40 | 41 | country_data=$(curl $country_url) 42 | 43 | .. note:: 44 | 45 | If you want to instead quickly view a formatted output of the data, try the silent option (``-s``) in curl piped to jq as follows: ``curl -s $country_url | jq '.'`` 46 | 47 | Get the length of the data: 48 | 49 | .. code-block:: shell 50 | 51 | echo $country_data | jq '.[1] | length' 52 | 53 | **Output:** 54 | 55 | .. code-block:: shell 56 | 57 | 299 58 | 59 | View the first element: 60 | 61 | .. code-block:: shell 62 | 63 | echo $country_data | jq '.[1][0]' 64 | 65 | **Output:** 66 | 67 | .. code-block:: shell 68 | 69 | { 70 | "id": "ABW", 71 | "iso2Code": "AW", 72 | "name": "Aruba", 73 | "region": { 74 | "id": "LCN", 75 | "iso2code": "ZJ", 76 | "value": "Latin America & Caribbean " 77 | }, 78 | "adminregion": { 79 | "id": "", 80 | "iso2code": "", 81 | "value": "" 82 | }, 83 | "incomeLevel": { 84 | "id": "HIC", 85 | "iso2code": "XD", 86 | "value": "High income" 87 | }, 88 | "lendingType": { 89 | "id": "LNX", 90 | "iso2code": "XX", 91 | "value": "Not classified" 92 | }, 93 | "capitalCity": "Oranjestad", 94 | "longitude": "-70.0167", 95 | "latitude": "12.5167" 96 | } 97 | 98 | Next, extract the iso2Codes from the country_data: 99 | 100 | .. code-block:: shell 101 | 102 | declare -A country_iso2Code 103 | for (( i = 0; i < $(echo $country_data | jq '.[1] | length'); i++ )) 104 | do 105 | country=$(echo $country_data | jq ".[1][$i].name"); 106 | iso=$(echo $country_data | jq ".[1][$i].iso2Code"); 107 | echo $iso$" : "$country; 108 | country_iso2Code["$iso"]="$country"; 109 | done; 110 | 111 | **Output:** 112 | 113 | .. code-block:: shell 114 | 115 | "AW" : "Aruba" 116 | "ZH" : "Africa Eastern and Southern" 117 | "AF" : "Afghanistan" 118 | "A9" : "Africa" 119 | "ZI" : "Africa Western and Central" 120 | "AO" : "Angola" 121 | "AL" : "Albania" 122 | "AD" : "Andorra" 123 | "1A" : "Arab World" 124 | "AE" : "United Arab Emirates" 125 | ... 126 | ... 127 | ... 128 | 129 | .. note:: 130 | 131 | ``declare -A`` creates an associative array; ``country_iso2Code["$iso"]="$country"`` stores the iso variable and corresponding country name. 132 | 133 | Since we saved the iso2Codes and country names in the associative array, ``country_iso2Code``, it is also possible to loop through and display the data as follows: 134 | 135 | .. code-block:: shell 136 | 137 | for isos in "${!country_iso2Code[@]}"; do 138 | echo "$isos - ${country_iso2Code[$isos]}"; 139 | done 140 | 141 | *Output not shown here* 142 | 143 | .. note:: 144 | 145 | ``!`` selects individual indices of the associative array; ``@`` specifies all elements in the array. 146 | 147 | 
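If you only need the record for a single country, you can also request it directly by its iso2Code rather than downloading the full list. A minimal sketch, reusing the ``api`` root variable and the "US" code from the list above (output not shown):

.. code-block:: shell

    curl -s $api$"country/US?format=json" | jq '.[1][0].name'

2. 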
Compile a Custom Indicator Dataset 149 | ====================================== 150 | 151 | There are many available indicators: https://data.worldbank.org/indicator 152 | 153 | We will select three indicators for this example: 154 | 155 | 1. Scientific and Technical Journal Article Data = `IP.JRN.ARTC.SC`_ 156 | 157 | 2. Patent Applications, residents = `IP.PAT.RESD`_ 158 | 159 | 3. GDP per capita (current US$) Code = `NY.GDP.PCAP.CD`_ 160 | 161 | Note that these three selected indicators have a `CC-BY 4.0 license`_. 162 | 163 | We will compile this indicator data for the United States (US) and United Kingdom (GB). 164 | 165 | .. _IP.JRN.ARTC.SC: https://data.worldbank.org/indicator/IP.JRN.ARTC.SC?view=chart 166 | .. _IP.PAT.RESD: https://data.worldbank.org/indicator/IP.PAT.RESD?view=chart 167 | .. _NY.GDP.PCAP.CD: https://data.worldbank.org/indicator/NY.GDP.PCAP.CD?view=chart 168 | .. _CC-BY 4.0 license: https://datacatalog.worldbank.org/public-licenses#cc-by 169 | 170 | .. code-block:: shell 171 | 172 | indicators=('IP.JRN.ARTC.SC' 'IP.PAT.RESD' 'NY.GDP.PCAP.CD') 173 | 174 | Generate the web API URLs we need for U.S. and U.K. and retrieve the data. 175 | 176 | .. code-block:: shell 177 | 178 | api="https://api.worldbank.org/v2/" 179 | 180 | .. code-block:: shell 181 | 182 | declare -A US_indicator_data 183 | for indic in "${indicators[@]}" 184 | do 185 | US_indicator_data[$indic]=$(curl $api$"country/US/indicator/"$indic$"/?format=json&per_page=500") 186 | sleep 1; 187 | done 188 | 189 | .. code-block:: shell 190 | 191 | declare -A UK_indicator_data 192 | for indic in "${indicators[@]}" 193 | do 194 | UK_indicator_data[$indic]=$(curl $api$"country/GB/indicator/"$indic$"/?format=json&per_page=500") 195 | sleep 1; 196 | done 197 | 198 | Now we need to extract the data and compile for analysis. 199 | 200 | column 1: year 201 | 202 | column 2: Scientific and Technical Journal Article Data = ``IP.JRN.ARTC.SC`` 203 | 204 | column 3: Patent Applications, residents = ``IP.PAT.RESD`` 205 | 206 | column 4: GDP per capita (current US$) Code = ``NY.GDP.PCAP.CD`` 207 | 208 | U.S. data extraction: 209 | 210 | .. code-block:: shell 211 | 212 | declare -A US_data_JRN 213 | declare -A US_data_PAT 214 | declare -A US_data_NY 215 | for (( years = 0; years < $(echo ${US_indicator_data['IP.JRN.ARTC.SC']} | jq '.[1] | length'); years++ )) 216 | do 217 | year=$(echo ${US_indicator_data['IP.JRN.ARTC.SC']} | jq ".[1][$years].date" | tr -d '"') 218 | US_data_JRN[$year]=$(echo ${US_indicator_data['IP.JRN.ARTC.SC']} | jq ".[1][$years].value") 219 | US_data_PAT[$year]=$(echo ${US_indicator_data['IP.PAT.RESD']} | jq ".[1][$years].value") 220 | US_data_NY[$year]=$(echo ${US_indicator_data['NY.GDP.PCAP.CD']} | jq ".[1][$years].value") 221 | done; 222 | echo $'"year","IP.JRN.ARTC.SC","IP.PAT.RESD","NY.GDP.PCAP.CD"' >> US_data.csv 223 | for years in "${!US_data_JRN[@]}"; do 224 | echo $years$","${US_data_JRN[$years]}$","${US_data_PAT[$years]}$","${US_data_NY[$years]} | sed 's/null/NaN/g' >> US_data.csv 225 | done 226 | 227 | .. note:: 228 | 229 | ``sed 's/null/NaN/g'`` is used to replace missing data with NaN. 230 | 231 | .. code-block:: shell 232 | 233 | head US_data.csv 234 | 235 | **Output:** 236 | 237 | .. 
code-block:: shell 238 | 239 | "year","IP.JRN.ARTC.SC","IP.PAT.RESD","NY.GDP.PCAP.CD" 240 | 1979,NaN,NaN,11674.1818666548 241 | 1978,NaN,NaN,10564.9482220275 242 | 1973,NaN,NaN,6726.35895596695 243 | 1972,NaN,NaN,6094.01798986165 244 | 1971,NaN,NaN,5609.38259952519 245 | 1970,NaN,NaN,5234.2966662115 246 | 1977,NaN,NaN,9452.57651914511 247 | 1976,NaN,NaN,8592.25353727612 248 | 1975,NaN,NaN,7801.45666356443 249 | 250 | U.K. Data extraction: 251 | 252 | column 1: year 253 | 254 | column 2: Scientific and Technical Journal Article Data = ``IP.JRN.ARTC.SC`` 255 | 256 | column 3: Patent Applications, residents = ``IP.PAT.RESD`` 257 | 258 | column 4: GDP per capita (current US$) Code = ``NY.GDP.PCAP.CD`` 259 | 260 | .. code-block:: shell 261 | 262 | declare -A UK_data_JRN 263 | declare -A UK_data_PAT 264 | declare -A UK_data_NY 265 | for (( years = 0; years < $(echo ${UK_indicator_data['IP.JRN.ARTC.SC']} | jq '.[1] | length'); years++ )) 266 | do 267 | year=$(echo ${UK_indicator_data['IP.JRN.ARTC.SC']} | jq ".[1][$years].date" | tr -d '"') 268 | UK_data_JRN[$year]=$(echo ${UK_indicator_data['IP.JRN.ARTC.SC']} | jq ".[1][$years].value") 269 | UK_data_PAT[$year]=$(echo ${UK_indicator_data['IP.PAT.RESD']} | jq ".[1][$years].value") 270 | UK_data_NY[$year]=$(echo ${UK_indicator_data['NY.GDP.PCAP.CD']} | jq ".[1][$years].value") 271 | done; 272 | echo $'"year","IP.JRN.ARTC.SC","IP.PAT.RESD","NY.GDP.PCAP.CD"' >> UK_data.csv 273 | for years in "${!UK_data_JRN[@]}"; do 274 | echo "$years"$","${UK_data_JRN[$years]}$","${UK_data_PAT[$years]}$","${UK_data_NY[$years]} | sed 's/null/NaN/g' >> UK_data.csv 275 | done 276 | 277 | 278 | .. note:: 279 | 280 | ``sed 's/null/NaN/g'`` is used to replace missing data with NaN. 281 | 282 | .. code-block:: shell 283 | 284 | tail UK_data.csv 285 | 286 | **Output:** 287 | 288 | .. code-block:: shell 289 | 290 | 2003,75564.08,20426,34487.4675722539 291 | 1984,NaN,19093,8179.19444064991 292 | 2000,77244.9,22050,28223.0675706515 293 | 1985,NaN,19672,8652.21654247593 294 | 2001,73779.92,21423,27806.4488245133 295 | 1988,NaN,20536,15987.1680775688 296 | 1989,NaN,19732,16239.2821960944 297 | 2008,91357.74,16523,47549.3486286006 298 | 2009,93803.37,15985,38952.2110262455 299 | 2020,NaN,NaN,41059.1688090547 300 | 301 | 3. Plot Indicator data 302 | ======================= 303 | 304 | Create a line plot of US/UK Number of Scientific and Technical Journal Articles and Patents by year. 305 | 306 | .. code-block:: shell 307 | 308 | awk -F',' '{ print $1","$2+$3","$4; }' US_data.csv | sort -t"," -k1n,1 > US_sorted.csv 309 | awk -F',' '{ print $1","$2+$3","$4; }' UK_data.csv | sort -t"," -k1n,1 > UK_sorted.csv 310 | sed -i "1s/.*/'year','US Articles and Patents','US GDP'/" US_sorted.csv 311 | sed -i "1s/.*/'year','UK Articles and Patents','UK GDP'/" UK_sorted.csv 312 | 313 | .. note:: 314 | 315 | ``awk`` is combining the second column and third column into a single column; ``sort`` is to sort the data by the year; ``sed`` is to change the first row to accurately name the columns. 316 | 317 | .. code-block:: shell 318 | 319 | head US_sorted.csv 320 | 321 | **Output:** 322 | 323 | .. 
code-block:: shell 324 | 325 | 'year','US Articles and Patents','US GDP' 326 | 1960,nan,3007.12344537862 327 | 1961,nan,3066.56286916615 328 | 1962,nan,3243.84307754988 329 | 1963,nan,3374.51517105082 330 | 1964,nan,3573.94118474743 331 | 1965,nan,3827.52710972039 332 | 1966,nan,4146.31664631665 333 | 1967,nan,4336.42658722171 334 | 1968,nan,4695.92339043178 335 | 336 | Plot the data as an ascii plot: 337 | 338 | .. code-block:: shell 339 | 340 | gnuplot -e "set datafile separator ','; \ 341 | set datafile missing NaN; \ 342 | set key outside; \ 343 | set key autotitle columnhead; \ 344 | set term dumb size 130, 30; \ 345 | set xrange [2000:2018]; \ 346 | set ylabel 'First Y Units'; \ 347 | set xlabel 'Time'; \ 348 | set title 'US and UK data'; \ 349 | set y2tics nomirror; \ 350 | set ytics nomirror; \ 351 | set size 1,1; \ 352 | plot 'US_sorted.csv' using 1:2 with lines axis x1y1, '' using 1:3 with lines axis x1y2, \ 353 | 'UK_sorted.csv' using 1:2 with lines axis x1y1, '' using 1:3 with lines axis x1y2" 354 | 355 | **Output:** 356 | 357 | .. code-block:: shell 358 | 359 | US and UK data 360 | 361 | 800000 +--------------------------------------------------------------+ 65000 362 | | + + + + + + + + | 'US Articles and Patents' ******* 363 | | *****************##| 'US GDP' ####### 364 | 700000 |-+ *** ###+-| 60000 'UK Articles and Patents' $$$$$$$ 365 | | ******* ### | 'UK GDP' %%%%%%% 366 | | *********** #### | 367 | 600000 |-+ ******* ### +-| 55000 368 | | *** #### | 369 | | **** %%% ### | 370 | 500000 |******** % #### +-| 50000 371 | | %####%######## %%%% | 372 | 400000 |-+ ##% % %% % +-| 45000 373 | | ####%% % %%% % | 374 | | ##%%%% % %%%%%% % %%| 375 | 300000 |-+ ##%% % %% %%%%%+-| 40000 376 | | ### % %%%% | 377 | |######## % | 378 | 200000 |-+ % +-| 35000 379 | | %% | 380 | | %% | 381 | 100000 |$$$$$%%$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$| 30000 382 | |%%%%% | 383 | | + + + + + + + + | 384 | 0 +--------------------------------------------------------------+ 25000 385 | 2000 2002 2004 2006 2008 2010 2012 2014 2016 2018 386 | Time 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /shell/chronam.rst: -------------------------------------------------------------------------------- 1 | Chronicling America API in Bash 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | by Avery Fernandez 5 | 6 | **LOC Chronicling America API Documentation:** https://chroniclingamerica.loc.gov/about/api/ 7 | 8 | These recipe examples were tested on December 7, 2022 using GNOME Terminal in Ubuntu 18.04. 9 | 10 | **Attribution:** We thank **Professor Jessica Kincaid** (UA Libraries, Hoole Special Collections) 11 | for the use-cases. All data was collected from the Library of Congress, Chronicling America: Historic 12 | American Newspapers site, using the API. 13 | 14 | Note that the data from the *Alabama state intelligencer*, *The age-herald*, and the 15 | *Birmingham age-herald* were contributed to Chronicling America by The University of 16 | Alabama Libraries: https://chroniclingamerica.loc.gov/awardees/au/ 17 | 18 | Program requirements 19 | ========================= 20 | 21 | In order to run this code, you will need to first install `curl`_, `jq`_, and `gnuplot`_. 22 | curl is used to request the data from the API, jq is used to parse the JSON data, 23 | and gnuplot is used to plot the data. 24 | 25 | .. _curl: https://github.com/curl/curl 26 | .. _jq: https://stedolan.github.io/jq/ 27 | .. 
_gnuplot: http://www.gnuplot.info/ 28 | 29 | 1. Basic API request 30 | ============================= 31 | 32 | The Chronicling America API identifies newspapers and other records using LCCNs. 33 | We can query the API once we have the LCCN for the newspaper and even ask for 34 | particular issues and editions. For example, the following link lists newspapers 35 | published in the state of Alabama, from which the LCCN can be obtained: 36 | https://chroniclingamerica.loc.gov/newspapers/?state=Alabama 37 | 38 | Here is an example with the Alabama State Intelligencer: 39 | 40 | .. code-block:: shell 41 | 42 | api="https://chroniclingamerica.loc.gov/" 43 | request=$(curl -s $api$"lccn/sn84023600.json") 44 | echo $request | jq '.' 45 | 46 | **Output:** 47 | 48 | .. code-block:: shell 49 | 50 | { 51 | "place_of_publication": "Tuskaloosa [sic], Ala.", 52 | "lccn": "sn84023600", 53 | "start_year": "183?", 54 | "place": [ 55 | "Alabama--Tuscaloosa--Tuscaloosa" 56 | ], 57 | "name": "Alabama State intelligencer. [volume]", 58 | "publisher": "T.M. Bradford", 59 | "url": "https://chroniclingamerica.loc.gov/lccn/sn84023600.json", 60 | "end_year": "18??", 61 | "issues": [], 62 | "subject": [] 63 | } 64 | 65 | Indexing into the json output allows data to be extracted using key names as demonstrated below: 66 | 67 | .. code-block:: shell 68 | 69 | echo $request | jq '.["name"]' 70 | 71 | **Output:** 72 | 73 | .. code-block:: shell 74 | 75 | "Alabama State intelligencer. [volume]" 76 | 77 | .. code-block:: shell 78 | 79 | echo $request | jq '.["publisher"]' 80 | 81 | **Output:** 82 | 83 | .. code-block:: shell 84 | 85 | "T.M. Bradford" 86 | 87 | Moving on to another publication, we can get the 182nd page (seq-182) of the Evening Star 88 | newspaper published on November 19, 1961. 89 | 90 | .. code-block:: shell 91 | 92 | request=$(curl -s $api$"lccn/sn83045462/1961-11-19/ed-1/seq-182.json") 93 | echo $request | jq '.' 94 | 95 | **Output:** 96 | 97 | .. code-block:: shell 98 | 99 | { 100 | "jp2": "https://chroniclingamerica.loc.gov/lccn/sn83045462/1961-11-19/ed-1/seq-182.jp2", 101 | "sequence": 182, 102 | "text": "https://chroniclingamerica.loc.gov/lccn/sn83045462/1961-11-19/ed-1/seq-182/ocr.txt", 103 | "title": { 104 | "url": "https://chroniclingamerica.loc.gov/lccn/sn83045462.json", 105 | "name": "Evening star. [volume]" 106 | }, 107 | "pdf": "https://chroniclingamerica.loc.gov/lccn/sn83045462/1961-11-19/ed-1/seq-182.pdf", 108 | "ocr": "https://chroniclingamerica.loc.gov/lccn/sn83045462/1961-11-19/ed-1/seq-182/ocr.xml", 109 | "issue": { 110 | "url": "https://chroniclingamerica.loc.gov/lccn/sn83045462/1961-11-19/ed-1.json", 111 | "date_issued": "1961-11-19" 112 | } 113 | } 114 | 115 | Next, extract the URL for the PDF file and open it from the terminal. The `-L` 116 | option in curl allows for redirection to load the PDF: 117 | 118 | .. code-block:: shell 119 | 120 | url=$(echo $request | jq '.["pdf"]' | tr -d '"') 121 | curl $url -L --output outfile.pdf 122 | xdg-open outfile.pdf 123 | 124 | 2. Frequency of “University of Alabama” mentions 125 | ===================================================== 126 | 127 | The URL below limits to searching newspapers in the state of Alabama and provides 500 results of 128 | “University of Alabama” mentions. Note that phrases can be searched by putting them inside parentheses for the query. 129 | 130 | .. 
code-block:: shell 131 | 132 | api="https://chroniclingamerica.loc.gov/" 133 | request=$(curl -s $api$"search/pages/results/?state=Alabama&proxtext=(University%20of%20Alabama)&rows=500&format=json" | jq '.["items"]') 134 | 135 | Get the length of returned data: 136 | 137 | .. code-block:: shell 138 | 139 | length=$(echo "$request" | jq '. | length') 140 | echo "$length" 141 | 142 | **Output:** 143 | 144 | .. code-block:: shell 145 | 146 | 500 147 | 148 | Next, display the first record: 149 | 150 | .. code-block:: shell 151 | 152 | echo "$request" | jq '.[0]' 153 | 154 | **Output:** 155 | 156 | .. code-block:: shell 157 | 158 | { 159 | "sequence": 48, 160 | "county": [ 161 | "Jefferson" 162 | ], 163 | "edition": null, 164 | "frequency": "Daily", 165 | "id": "/lccn/sn85038485/1924-07-13/ed-1/seq-48/", 166 | "subject": [ 167 | "Alabama--Birmingham.--fast--(OCoLC)fst01204958", 168 | "Birmingham (Ala.)--Newspapers." 169 | ], 170 | "city": [ 171 | "Birmingham" 172 | ], 173 | "date": "19240713", 174 | "title": "The Birmingham age-herald. [volume]", 175 | "end_year": 1950, 176 | "note": [ 177 | "Also issued on microfilm from Bell & Howell, Micro Photo Div.; the Library of Congress, Photoduplication Service.", 178 | "Also published in a weekly ed.", 179 | "Archived issues are available in digital format from the Library of Congress Chronicling America online collection.", 180 | "Publication suspended with July 12, 1945 issue due to a printers' strike; resumed publication with Aug. 17, 1945 issue." 181 | ], 182 | "state": [ 183 | "Alabama" 184 | ], 185 | "section_label": "Tuscaloosa Section", 186 | "type": "page", 187 | "place_of_publication": "Birmingham, Ala.", 188 | "start_year": 1902, 189 | "edition_label": "", 190 | "publisher": "Age-Herald Co.", 191 | "language": [ 192 | "English" 193 | ], 194 | "alt_title": [ 195 | "Age-herald", 196 | "Birmingham news, the Birmingham age-herald" 197 | ], 198 | "lccn": "sn85038485", 199 | "country": "Alabama", 200 | "ocr_eng": "canes at the University .of Alabama\nMORGAN HALL -\nSMITH HALL\n' hi i ..mil w i 1»..IIgylUjAiU. '. n\njjiIi\n(ARCHITECTS* MODEL)\nCOMER. HALli\nMINING\n••tSgSB?\n* i v' y -4\n■Si ' 3>\nA GLIMP9E OF FRATERNITY ROW\nTHE GYMNASIUM\nTuscaloosa, Alabama\nADV.", 201 | "batch": "au_foster_ver01", 202 | "title_normal": "birmingham age-herald.", 203 | "url": "https://chroniclingamerica.loc.gov/lccn/sn85038485/1924-07-13/ed-1/seq-48.json", 204 | "place": [ 205 | "Alabama--Jefferson--Birmingham" 206 | ], 207 | "page": "8" 208 | } 209 | 210 | Loop through the records and extract the dates: 211 | 212 | .. code-block:: shell 213 | 214 | declare -a dates 215 | for (( i = 0 ; i < "$length" ; i++ )); 216 | do 217 | dates+=("$(echo "$request" | jq ".[$i].date" | tr -d '"')") 218 | done 219 | 220 | Check the length of dates: 221 | 222 | .. code-block:: shell 223 | 224 | echo "${#dates[@]}" 225 | 226 | **Output:** 227 | 228 | .. code-block:: shell 229 | 230 | 500 231 | 232 | Display the first 10 dates: 233 | 234 | .. code-block:: shell 235 | 236 | echo "${dates[@]:0:10}" 237 | 238 | **Output:** 239 | 240 | .. code-block:: shell 241 | 242 | 19240713 19180818 19240224 19160806 19130618 19240217 19140602 19120714 19220917 19170513 243 | 244 | 
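As a quick sanity check, you can tally how many of the 500 matches fall in each year using only the ``dates`` array built above (a minimal sketch; output not shown):

.. code-block:: shell

    for date in "${dates[@]}"; do echo "${date:0:4}"; done | sort | uniq -c | sort -rn | head

We'll do a bit of data transformation on the dates before plotting:

.. 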
code-block:: shell 247 | 248 | declare -a formattedDates 249 | for date in "${dates[@]}"; 250 | do 251 | year=$(echo "$date" | cut -c1-4) 252 | month=$(echo "$date" | cut -c5-6) 253 | day=$(echo "$date" | cut -c7-8) 254 | formatted=$year$"/"$month$"/"$day 255 | echo $'"'"$formatted"$'"' >> dates.csv 256 | formattedDates+=("$formatted") 257 | done 258 | echo "${formattedDates[@]:0:10}" 259 | 260 | **Output:** 261 | 262 | .. code-block:: shell 263 | 264 | 1924/07/13 1918/08/18 1924/02/24 1916/08/06 1913/06/18 1924/02/17 1914/06/02 1912/07/14 1922/09/17 1917/05/13 265 | 266 | Next, plot the data using gnuplot. 267 | See the `gnuplot documentation`_ for more information about the smooth frequency histogram. 268 | 269 | .. _gnuplot documentation: http://www.gnuplot.info/documentation.html 270 | 271 | .. code-block:: shell 272 | 273 | head dates.csv 274 | 275 | **Output:** 276 | 277 | .. code-block:: shell 278 | 279 | "1924/07/13" 280 | "1918/08/18" 281 | "1924/02/24" 282 | "1916/08/06" 283 | "1913/06/18" 284 | "1924/02/17" 285 | "1914/06/02" 286 | "1912/07/14" 287 | "1922/09/17" 288 | "1917/05/13" 289 | 290 | .. code-block:: shell 291 | 292 | cat graph.gnuplot 293 | 294 | **Output:** 295 | 296 | .. code-block:: Shell 297 | 298 | set datafile separator ',' 299 | binwidth=4 300 | set term dumb 301 | bin(x,width)=width*floor(x/width) 302 | plot 'dates.csv' using (bin($1,binwidth)):(1.0) smooth freq with boxes notitle 303 | 304 | .. code-block:: shell 305 | 306 | gnuplot -p graph.gnuplot 307 | 308 | **Output:** 309 | 310 | .. code-block:: Shell 311 | 312 | 120 +--------------------------------------------------------------------+ 313 | | + + + + + + | 314 | | *** | 315 | 100 |-+ * * *** +-| 316 | | * * * * | 317 | | * * * * | 318 | | * * * * | 319 | 80 |-+ * * * * +-| 320 | | * *** * | 321 | | *** * * * * | 322 | 60 |-+ * *** * * * +-| 323 | | * * * * * * | 324 | | *** * * * * * | 325 | 40 |-+ * * * * * * * +-| 326 | | * * * * * * * | 327 | | * * * * * * * | 328 | | * * * * * * ********** | 329 | 20 |-+ * * * * * * * * +-| 330 | | *** * * * * * * * | 331 | | + ***************** *+* * * * *+* * | 332 | 0 +--------------------------------------------------------------------+ 333 | 1820 1840 1860 1880 1900 1920 1940 1960 334 | 335 | 336 | 3. Industrialization keywords frequency in the Birmingham Age-Herald 337 | ========================================================================== 338 | 339 | We will try to obtain the frequency of “Iron” on the front pages of the Birmingham Age- herald newspapers from 340 | the year 1903 to 1949 (limited to first 500 rows for testing here). 341 | 342 | .. code-block:: shell 343 | 344 | api="https://chroniclingamerica.loc.gov/" 345 | request=$(curl "$api"$"search/pages/results/?state=Alabama&lccn=sn85038485&dateFilterType=yearRange&date1=1903&date2=1949&sequence=1&andtext=Iron&rows=500&searchType=advanced&format=json" | jq '.["items"]') 346 | 347 | .. code-block:: shell 348 | 349 | echo "$request" | jq '. | length' 350 | 351 | **Output:** 352 | 353 | .. code-block:: shell 354 | 355 | 500 356 | 357 | Extract the dates and do some formatting as shown before: 358 | 359 | .. code-block:: shell 360 | 361 | declare -a dates 362 | length=$(echo "$request" | jq '. 
| length') 363 | for (( i = 0 ; i < "$length" ; i++ )); 364 | do 365 | dates+=("$(echo "$request" | jq ".[$i].date" | tr -d '"')") 366 | done 367 | 368 | declare -a formattedDates 369 | for date in "${dates[@]}"; 370 | do 371 | year=$(echo "$date" | cut -c1-4) 372 | month=$(echo "$date" | cut -c5-6) 373 | day=$(echo "$date" | cut -c7-8) 374 | formatted=$year$"/"$month$"/"$day 375 | echo $'"'"$formatted"$'"' >> dates2.csv 376 | formattedDates+=("$formatted") 377 | done 378 | 379 | Check to make sure we have 500 dates: 380 | 381 | .. code-block:: shell 382 | 383 | cat dates2.csv | wc -l 384 | 385 | **Output:** 386 | 387 | .. code-block:: shell 388 | 389 | 500 390 | 391 | And plot the data: 392 | 393 | .. code-block:: shell 394 | 395 | cat graph.gnuplot 396 | 397 | **Output:** 398 | 399 | .. code-block:: shell 400 | 401 | set datafile separator ',' 402 | binwidth=2 403 | set term dumb 404 | bin(x,width)=width*floor(x/width) 405 | plot 'dates2.csv' using (bin($1,binwidth)):(1.0) smooth freq with boxes notitle 406 | 407 | .. code-block:: shell 408 | 409 | gnuplot -p graph.gnuplot 410 | 411 | **Output:** 412 | 413 | .. code-block:: shell 414 | 415 | 90 +---------------------------------------------------------------------+ 416 | | + + + + | 417 | 80 |-+ ******* +-| 418 | | * * | 419 | 70 |-+ ******* * * +-| 420 | | * * * * | 421 | | * ************ * | 422 | 60 |-+ * * * * ****** +-| 423 | | * * * * * * | 424 | 50 |-+ * * * * * * ****** +-| 425 | | * * * * * * * * | 426 | 40 |-+ * * * * * * * * +-| 427 | | * * * * * * * * | 428 | 30 |-+ * * * * * * * * +-| 429 | | ****** * * * * * ******* * * | 430 | | * * * * * * * * * ******* * | 431 | 20 |-+* * * * * * ******* * * * * +-| 432 | | * * * * * * * * * * * * | 433 | 10 |-+* * * * * * * * ****** * ****+-| 434 | | * * * * * + * * * * * + * * * | 435 | 0 +---------------------------------------------------------------------+ 436 | 1900 1905 1910 1915 1920 1925 437 | 438 | 439 | -------------------------------------------------------------------------------- /shell/us-census.rst: -------------------------------------------------------------------------------- 1 | U.S. Census API in Bash 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | by Avery Fernandez 5 | 6 | **U.S. Census API documentation:** https://www.census.gov/data/developers/about.html 7 | 8 | **U.S. Census Data Discovery Tool:** https://api.census.gov/data.html 9 | 10 | These recipe examples were tested on December 13, 2022 using GNOME Terminal in Ubuntu 18.04. 11 | 12 | See also the U.S. `Census API Terms of Service`_ 13 | 14 | .. _Census API Terms of Service: https://www.census.gov/data/developers/about/terms-of-service.html 15 | 16 | **Attribution:** This tutorial uses the Census Bureau Data API but is not endorsed 17 | or certified by the Census Bureau. 18 | 19 | Setup 20 | ===== 21 | 22 | Program requirements 23 | -------------------- 24 | 25 | In order to run this code, you will need to first install `curl`_, `jq`_, and `gnuplot`_. 26 | curl is used to request the data from the API, jq is used to parse the JSON data, 27 | and gnuplot is used to plot the data. 28 | 29 | .. _curl: https://github.com/curl/curl 30 | .. _jq: https://stedolan.github.io/jq/ 31 | .. _gnuplot: http://www.gnuplot.info/ 32 | 33 | API Key Information 34 | ------------------- 35 | 36 | While an API key is not required to use the U.S. Census Data API, you may consider 37 | registering for an API key as the API is limited to 500 calls a day without a key. 
38 | You can sign up here: https://api.census.gov/data/key_signup.html 39 | 40 | If you want to add in your API key, save the API key to a text file named "apikey.txt" and 41 | import it using the following code, which prepends the ``&key=`` URL parameter so that ``$key`` can be appended directly to the request URLs: 42 | 43 | .. code-block:: shell 44 | 45 | key="&key="$(cat "apikey.txt") 46 | 47 | Note that this tutorial does not use an API key. 48 | 49 | 1. Get population estimates of counties by state 50 | ======================================================= 51 | 52 | Note: includes Washington, D.C. and Puerto Rico 53 | 54 | We will use the population estimates from the 2019 dataset: 55 | https://api.census.gov/data/2019/pep/population/examples.html 56 | 57 | First, obtain a list of the state names and IDs from the API: 58 | 59 | .. code-block:: shell 60 | 61 | api='https://api.census.gov/data/' 62 | state_ids=$(curl -s $api$"2019/pep/population?get=NAME&for=state:*"$key) 63 | echo $state_ids 64 | 65 | **Output:** 66 | 67 | .. code-block:: shell 68 | 69 | [["NAME","state"], ["Alabama","01"], ["Alaska","02"], ["Arizona","04"], ["Arkansas","05"], ["California","06"], ["Colorado","08"], ["Delaware","10"], ["District of Columbia","11"], ["Connecticut","09"], ["Florida","12"], ["Georgia","13"], ["Idaho","16"], ["Hawaii","15"], ["Illinois","17"], ["Indiana","18"], ["Iowa","19"], ["Kansas","20"], ["Kentucky","21"], ["Louisiana","22"], ["Maine","23"], ["Maryland","24"], ["Massachusetts","25"], ["Michigan","26"], ["Minnesota","27"], ["Mississippi","28"], ["Missouri","29"], ["Montana","30"], ["Nebraska","31"], ["Nevada","32"], ["New Hampshire","33"], ["New Jersey","34"], ["New Mexico","35"], ["New York","36"], ["North Carolina","37"], ["North Dakota","38"], ["Ohio","39"], ["Oklahoma","40"], ["Oregon","41"], ["Pennsylvania","42"], ["Rhode Island","44"], ["South Carolina","45"], ["South Dakota","46"], ["Tennessee","47"], ["Texas","48"], ["Vermont","50"], ["Utah","49"], ["Virginia","51"], ["Washington","53"], ["West Virginia","54"], ["Wisconsin","55"], ["Wyoming","56"], ["Puerto Rico","72"]] 70 | 71 | Get the length: 72 | 73 | .. code-block:: shell 74 | 75 | echo $state_ids | jq '. | length' 76 | 77 | **Output:** 78 | 79 | .. code-block:: shell 80 | 81 | 53 82 | 83 | Next, loop through each state and obtain population data: 84 | 85 | .. code-block:: shell 86 | 87 | for (( i = 1; i < $(echo $state_ids | jq '. | length'); i++ )) 88 | do 89 | state=$(echo $state_ids | jq ".[$i][0]" | tr -d '"') 90 | stateID=$(echo $state_ids | jq ".[$i][1]" | tr -d '"') 91 | request=$(curl -s $api$"2019/pep/population?get=NAME,POP&for=county:*&in=state:"$stateID$key) 92 | sleep 1; 93 | for (( j = 1; j < $(echo $request | jq '. | length'); j++ )) 94 | do 95 | county=$(echo $request | jq ".[$j][0]" | tr -d '"' | cut -f1 -d",") 96 | population=$(echo $request | jq ".[$j][1]" | tr -d '"') 97 | echo $state$","$county$","$population >> state_populations.csv 98 | done 99 | done 100 | 101 | View the first 25 lines: 102 | 103 | .. code-block:: shell 104 | 105 | head -n25 state_populations.csv 106 | 107 | **Output:** 108 | 109 | .. code-block:: shell 110 | 111 | Alabama,St. Clair County,89512
112 | Alabama,Cullman County,83768 113 | Alabama,Houston County,105882 114 | Alabama,Tuscaloosa County,209355 115 | Alabama,Coffee County,52342 116 | Alabama,Chilton County,44428 117 | Alabama,Coosa County,10663 118 | Alabama,Etowah County,102268 119 | Alabama,Lamar County,13805 120 | Alabama,Butler County,19448 121 | Alabama,Walker County,63521 122 | Alabama,Greene County,8111 123 | Alabama,Bullock County,10101 124 | Alabama,Chambers County,33254 125 | Alabama,Monroe County,20733 126 | Alabama,Lawrence County,32924 127 | Alabama,Lee County,164542 128 | Alabama,Marion County,29709 129 | Alabama,Pickens County,19930 130 | Alabama,Sumter County,12427 131 | Alabama,Jefferson County,658573 132 | Alabama,Choctaw County,12589 133 | Alabama,Franklin County,31362 134 | Alabama,Marengo County,18863 135 | Alabama,Russell County,57961 136 | 137 | 2. Get population estimates over a range of years 138 | ==================================================== 139 | 140 | We can use similar code as before, but now loop through different population estimate datasets by year. 141 | Here are the specific APIs used: 142 | 143 | Vintage 2015 Population Estimates: https://api.census.gov/data/2015/pep/population/examples.html 144 | 145 | Vintage 2016 Population Estimates: https://api.census.gov/data/2016/pep/population/examples.html 146 | 147 | Vintage 2017 Population Estimates: https://api.census.gov/data/2017/pep/population/examples.html 148 | 
Vintage 2018 Population Estimates: https://api.census.gov/data/2018/pep/population/examples.html

149 | Note: includes Washington, D.C. and Puerto Rico. 150 | 151 | .. code-block:: shell 152 | 153 | for year in {2015..2018} 154 | do 155 | for (( i = 1; i < $(echo $state_ids | jq '. | length'); i++ )) 156 | do 157 | state=$(echo $state_ids | jq ".[$i][0]" | tr -d '"') 158 | stateID=$(echo $state_ids | jq ".[$i][1]" | tr -d '"') 159 | request=$(curl -s $api$year$"/pep/population?get=GEONAME,POP&for=county:*&in=state:"$stateID$key) 160 | sleep 1; 161 | for (( j = 1; j < $(echo $request | jq '. | length'); j++ )) 162 | do 163 | county=$(echo $request | jq ".[$j][0]" | tr -d '"' | cut -f1 -d",") 164 | population=$(echo $request | jq ".[$j][1]" | tr -d '"') 165 | echo $year","$state$","$county$","$population >> state_populations_over_years.csv 166 | done 167 | done 168 | done 169 | 170 | View the first 25 lines: 171 | 172 | .. code-block:: shell 173 | 174 | head -n25 state_populations_over_years.csv 175 | 176 | **Output:** 177 | 178 | .. code-block:: shell 179 | 180 | 2015,Alabama,Baldwin County,203709 181 | 2015,Alabama,Barbour County,26489 182 | 2015,Alabama,Bibb County,22583 183 | 2015,Alabama,Blount County,57673 184 | 2015,Alabama,Bullock County,10696 185 | 2015,Alabama,Butler County,20154 186 | 2015,Alabama,Calhoun County,115620 187 | 2015,Alabama,Chambers County,34123 188 | 2015,Alabama,Cherokee County,25859 189 | 2015,Alabama,Chilton County,43943 190 | 2015,Alabama,Choctaw County,13170 191 | 2015,Alabama,Clarke County,24675 192 | 2015,Alabama,Clay County,13555 193 | 2015,Alabama,Cleburne County,15018 194 | 2015,Alabama,Coffee County,51211 195 | 2015,Alabama,Colbert County,54354 196 | 2015,Alabama,Conecuh County,12672 197 | 2015,Alabama,Coosa County,10724 198 | 2015,Alabama,Covington County,37835 199 | 2015,Alabama,Autauga County,55347 200 | 2015,Alabama,Lawrence County,33115 201 | 2015,Alabama,Lee County,156993 202 | 2015,Alabama,Limestone County,91663 203 | 2015,Alabama,Lowndes County,10458 204 | 2015,Alabama,Macon County,19105 205 | 206 | 
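Before moving on, you can spot-check how a single county's estimate changes across the years in the combined file (a minimal sketch; pick any county present in your data, output not shown):

.. code-block:: shell

    grep ",Alabama,Tuscaloosa County," state_populations_over_years.csv

3. 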
Plot population change 207 | ================================ 208 | 209 | This data is based on the 2021 Population Estimates dataset: 210 | 211 | https://api.census.gov/data/2021/pep/population/variables.html 212 | 213 | The percentage change in population is from July 1, 2020 to July 1, 2021 for states 214 | (includes Washington, D.C. and Puerto Rico). 215 | 216 | .. code-block:: shell 217 | 218 | request=$(curl -s $api$"2021/pep/population?get=NAME,POP_2021,DENSITY_2021,PPOPCHG_2021&for=state:*"$key) 219 | for (( i = 1; i < $(echo $request | jq '. | length'); i++ )) 220 | do 221 | state=$(echo $request | jq ".[$i][0]" | tr -d '"') 222 | population=$(echo $request | jq ".[$i][1]" | tr -d '"') 223 | density=$(echo $request | jq ".[$i][2]" | tr -d '"') 224 | populationChange=$(echo $request | jq ".[$i][3]" | tr -d '"') 225 | echo ${state}$","$population$","$density$","$populationChange >> state_change.csv 226 | done 227 | 228 | Sort the data: 229 | 230 | .. code-block:: shell 231 | 232 | sort state_change.csv > state_change.sorted 233 | 234 | Create an associative array that replaces state name with abbreviation (note that keys containing spaces must be quoted): 235 | 236 | .. code-block:: shell 237 | 238 | declare -A abbreviation=( ["Puerto Rico"]=Pr ["Alabama"]=Al ["Alaska"]=Ak ["Arizona"]=Az ["Arkansas"]=Ar ["California"]=Ca ["Colorado"]=Co ["Connecticut"]=Ct ["Delaware"]=De ["District of Columbia"]=Dc ["Florida"]=Fl ["Georgia"]=Ga ["Hawaii"]=Hi ["Idaho"]=Id ["Illinois"]=Il ["Indiana"]=In ["Iowa"]=Ia ["Kansas"]=Ks ["Kentucky"]=Ky ["Louisiana"]=La ["Maine"]=Me ["Maryland"]=Md ["Massachusetts"]=Ma ["Michigan"]=Mi ["Minnesota"]=Mn ["Mississippi"]=Ms ["Missouri"]=Mo ["Montana"]=Mt ["Nebraska"]=Ne ["Nevada"]=Nv ["New Hampshire"]=Nh ["New Jersey"]=Nj ["New Mexico"]=Nm ["New York"]=Ny ["North Carolina"]=Nc ["North Dakota"]=Nd ["Ohio"]=Oh ["Oklahoma"]=Ok ["Oregon"]=Or ["Pennsylvania"]=Pa ["Rhode Island"]=Ri ["South Carolina"]=Sc ["South Dakota"]=Sd ["Tennessee"]=Tn ["Texas"]=Tx ["Utah"]=Ut ["Vermont"]=Vt ["Virginia"]=Va ["Washington"]=Wa ["West Virginia"]=Wv ["Wisconsin"]=Wi ["Wyoming"]=Wy ) 239 | 240 | Next, select only the population change and state abbreviation: 241 | 242 | .. code-block:: shell 243 | 244 | while IFS=, read -r field1 field2 field3 field4 245 | do 246 | state_abbreviation=${abbreviation[$field1]} 247 | echo "$state_abbreviation,$field4" >> abbreviation_data.csv 248 | done < state_change.sorted 249 | 250 | 
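As a quick check of the joined file before plotting, you can inspect the first few abbreviation/percent-change pairs (a minimal sketch; output not shown):

.. code-block:: shell

    head -n5 abbreviation_data.csv

Next, plot the data:

.. code-block:: shell

    gnuplot -p popChange.gnuplot

**Output:**

.. 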
code-block:: shell 259 | 260 | 261 | States Population Change from 2020 to 2021 262 | 263 | 3 +-------------------------------------------------------------------------------------------------------------------------------------------------------+ 264 | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | 265 | | | 266 | | | 267 | 2 |-+ +-| 268 | | A A | 269 | | | 270 | | A A A | 271 | 1 |-+ A A A A A +-| 272 | | A A A A | 273 | | A A A A | 274 | | A A A A A | 275 | 0 |-+A A A A A A A A A A A A+-| 276 | | A A A A | 277 | | A A A A A | 278 | | A A | 279 | -1 |-+ A +-| 280 | | | 281 | | A | 282 | | | 283 | -2 |-+ +-| 284 | | | 285 | | | 286 | | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | 287 | -3 +-------------------------------------------------------------------------------------------------------------------------------------------------------+ 288 | Al Ak Az Ar Ca Co Ct De Dc Fl Ga Hi Id Il In Ia Ks Ky La Me Md Ma Mi Mn Ms MoMt Ne Nv Nh Nj Nm Ny Nc Nd Oh Ok Or Pa Pr Ri Sc Sd Tn Tx Ut Vt Va Wa Wv Wi Wy 289 | 290 | 291 | Here is the gnuplot file: 292 | 293 | .. code-block:: shell 294 | 295 | cat popChange.gnuplot 296 | 297 | **Output:** 298 | 299 | .. code-block:: shell 300 | 301 | set datafile separator ',' 302 | set title 'States Population Change from 2020 to 2021' 303 | set term dumb size 160,30 304 | plot 'abbreviation_data.csv' using 2:xtic(1) notitle 305 | -------------------------------------------------------------------------------- /matlab/scopus.rst: -------------------------------------------------------------------------------- 1 | Scopus API in Matlab 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | .. sectionauthor:: Vincent F. Scalfani 5 | 6 | by Anastasia Ramig 7 | 8 | These recipe examples use the Elsevier Scopus API. 9 | Code was tested with MATLAB R2021b and sample data downloaded from the Scopus API on April 26, 2022 10 | via http://api.elsevier.com and http://www.scopus.com. This tutorial content is intended to help 11 | facillitate academic research. Before continuing or reusing any of this code, please be aware of 12 | Elsevier's `API policies and appropiate use-cases`_. You will also need to register for an API key 13 | in order to use the Scopus API. 14 | 15 | .. _API policies and appropiate use-cases: https://dev.elsevier.com/use_cases.html 16 | 17 | Setup 18 | ========= 19 | 20 | We will start by setting up the API key. Save your key in a text file in the 21 | same directory as the current Matlab folder and import your key as follows: 22 | 23 | .. code-block:: matlab 24 | 25 | %% import API key from file 26 | myAPIKey = importdata("apiKey.txt"); 27 | 28 | 1. Get Author Data 29 | ===================== 30 | 31 | Number of Records for Author 32 | ------------------------------- 33 | 34 | .. code-block:: matlab 35 | 36 | %% setup API information and pull data 37 | api_url = "https://api.elsevier.com/content/search/scopus?query="; 38 | author_id = "AU-ID(55764087400)&apiKey="; 39 | q = webread(api_url + author_id + myAPIKey) 40 | 41 | **Output:** 42 | 43 | .. code-block:: matlab 44 | 45 | q = struct with fields: 46 | search_results: [1x1 struct] 47 | 48 | .. 
code-block:: matlab 49 | 50 | %% create an array of ones to pre-allocate doi_list 51 | doi_list = {ones(length(q.search_results.entry), 1)}; 52 | 53 | %% create a list of dois from the data 54 | for i = 1:length(q.search_results.entry) 55 | doi_list{i} = q.search_results.entry{i,1}.prism_doi; 56 | end 57 | doi_list 58 | 59 | **Output:** 60 | 61 | .. code-block:: matlab 62 | 63 | doi_list = 1x20 cell 64 | '10.1021/acs.jchemed.1c00904''10.5860/crln.82.9.428''10.1021/acs.iecr.8b02573''10.1021/acs.jchemed.6b00602''10.5062/F4TD9VBX''10.1021/acs.macromol.6b02005''10.1186/s13321-016-0181-z''10.1021/acs.chemmater.5b04431''10.1021/acs.jchemed.5b00512''10.1021/acs.jchemed.5b00375''10.5860/crln.76.9.9384''10.5860/crln.76.2.9259''10.1021/ed400887t''10.1016/j.acalib.2014.03.015''10.5062/F4XS5SB9''10.1021/ma300328u''10.1021/mz200108a''10.1021/ma201170y''10.1021/ma200184u''10.1021/cm102374t' 65 | 66 | .. code-block:: matlab 67 | 68 | %% create an array of ones to pre-allocate titles_list 69 | titles_list = {ones(length(q.search_results.entry), 1)}; 70 | 71 | %% create a list of titles from the data 72 | for i = 1:length(q.search_results.entry) 73 | titles_list{i} = q.search_results.entry{i,1}.dc_title; 74 | end 75 | titles_list 76 | 77 | **Output:** 78 | 79 | .. code-block:: matlab 80 | 81 | titles_list = 1x20 cell 82 | 'Using NCBI Entrez Direct (EDirect) for Small Molecule Chemical Informati… 'Using the linux operating system full-time tips and experiences from a s… 'Analysis of the Frequency and Diversity of 1,3-Dialkylimidazolium Ionic … 'Rapid Access to Multicolor Three-Dimensional Printed Chemistry and Bioch… 'Text analysis of chemistry thesis and dissertation titles''Phototunable Thermoplastic Elastomer Hydrogel Networks''Programmatic conversion of crystal structures into 3D printable files us… 'Dangling-End Double Networks: Tapping Hidden Toughness in Highly Swollen… 'Replacing the Traditional Graduate Chemistry Literature Seminar with a C… '3D Printed Block Copolymer Nanostructures''Hypotheses in librarianship: Applying the scientific method''Recruiting students to campus: Creating tangible and digital products in… '3D printed molecules and extended solid models for teaching symmetry and… 'Repurposing Space in a Science and Engineering Library: Considerations f… 'A model for managing 3D printing services in academic libraries''Morphological phase behavior of poly(RTIL)-containing diblock copolymer … 'Network formation in an orthogonally self-assembling system''Access to nanostructured hydrogel networks through photocured body-cente… 'Synthesis and ordered phase separation of imidazolium-based alkyl-ionic … 'Thermally stable photocuring chemistry for selective morphological trapp… 83 | 84 | .. code-block:: matlab 85 | 86 | %% create an array of ones to pre-allocate citedby_count 87 | citedby_count = {ones(length(q.search_results.entry), 1)}; 88 | 89 | %% create a list of counts of how much each title was cited 90 | for i = 1:length(q.search_results.entry) 91 | citedby_count{i} = q.search_results.entry{i,1}.citedby_count; 92 | end 93 | citedby_count 94 | 95 | **Output:** 96 | 97 | .. code-block:: matlab 98 | 99 | citedby_count = 1x20 cell 100 | '0' '0' '17' '25' '5' '11' '20' '6' '10' '25' '0' '0' '98' '6' '34' '40' '31' '18' '45' '11' 101 | 102 | .. code-block:: matlab 103 | 104 | %% find the total number of cites 105 | citesTotal = str2double(citedby_count); 106 | totalCites = sum(citesTotal) 107 | 108 | **Output:** 109 | 110 | .. code-block:: matlab 111 | 112 | totalCites = 402 113 | 114 | 2. 
Get Author Data in a Loop 115 | ================================== 116 | 117 | Number of Records for Author 118 | ------------------------------ 119 | 120 | .. code-block:: matlab 121 | 122 | %% import author text data as a cell array 123 | authorList = importdata("authorData.txt") 124 | 125 | **Output:** 126 | 127 | .. code-block:: matlab 128 | 129 | authorList = 5x1 cell 130 | '{Emy Decker, 36660678600}' 131 | '{Lindsey Lowry, 57210944451}' 132 | '{Karen Chapman, 35783926100}' 133 | '{Kevin Walker, 56133961300}' 134 | '{Sara Whitver, 57194760730}' 135 | 136 | 137 | .. code-block:: matlab 138 | 139 | %% create a list of author names and delete the extra bracket from it 140 | authorList2 = cellfun(@(x) strsplit(x, ","), authorList, 'UniformOutput', false); 141 | for i = 1:length(authorList2) 142 | str = authorList2{i, 1}{1, 1}; 143 | old = "{"; 144 | new = ""; 145 | authorList2{i, 1}{1, 1} = replace(str, old, new); 146 | end 147 | 148 | %% extract the author ids 149 | author_ids = {ones(length(authorList2), 1)}; 150 | for i = 1:length(authorList2) 151 | pat = digitsPattern; 152 | author_ids{i} = extract(authorList2{i, 1}{1, 2}, pat); 153 | end 154 | 155 | .. code-block:: matlab 156 | 157 | %% preallocate an array for the number of records 158 | numRecords = {ones(length(author_ids), 1)}; 159 | 160 | %% find the number of records for each author and add it to the author list 161 | for i = 1:length(numRecords{1, 1}) 162 | q1 = webread(api_url + "AU-ID(" + author_ids{1, i} + ")&apiKey=" + myAPIKey); 163 | numRecords{i} = length(q1.search_results.entry); 164 | pause(1) 165 | authorList2{i, 1}{1, 3} = numRecords{i}; 166 | end 167 | disp(cell2table(authorList2)) 168 | 169 | **Output:** 170 | 171 | .. code-block:: matlab 172 | 173 | authorList2 174 | ________________________________________________ 175 | 176 | {'Emy Decker' } {' 36660678600}'} {[14]} 177 | {'Lindsey Lowry'} {' 57210944451}'} {[ 4]} 178 | {'Karen Chapman'} {' 35783926100}'} {[25]} 179 | {'Kevin Walker' } {' 56133961300}'} {[ 8]} 180 | {'Sara Whitver' } {' 57194760730}'} {[ 4]} 181 | 182 | Get Record Data 183 | ------------------- 184 | 185 | .. code-block:: matlab 186 | 187 | clear info 188 | %% extract the dois and cites for each author 189 | for i = 1:length(author_ids) 190 | q_records = webread(api_url + "AU-ID(" + author_ids{1, i}+")&apiKey=" + myAPIKey); 191 | n = length(q_records.search_results.entry); 192 | 193 | %% preallocate cell array for the dois and cites 194 | doiList = cell(1, length(author_ids)); 195 | citeList = cell(1, length(author_ids)); 196 | for k = 1:n 197 | try 198 | doiList{1, i}{k, 1} = q_records.search_results.entry{k, 1}.prism_doi; 199 | citeList{1, i}{k, 1} = q_records.search_results.entry{k, 1}.citedby_count; 200 | catch 201 | end 202 | end 203 | pause(1) 204 | 205 | %% add the dois and cites to an overall information array 206 | info{1, 1}{1, i} = doiList{1, i}; 207 | info{2, 1}{1, i} = citeList{1, i}; 208 | end 209 | 210 | %% create arrays for the dois and cites 211 | dois = {}; 212 | cites = {}; 213 | for i = 1:width(info{1, 1}) 214 | dois = vertcat(dois, info{1, 1}{1, i}); 215 | cites = vertcat(cites, info{2, 1}{1, i}); 216 | end 217 | 218 | .. 
code-block:: matlab 219 | 220 | %% create a combined array 221 | authorArray = horzcat(dois, cites); 222 | nameArray = {}; 223 | 224 | %% create an array of author names 225 | for i = 1:(length(numRecords)) 226 | nameLength = int16(numRecords{i}); 227 | authorName = cellstr(repmat(authorList2{i, 1}{1, 1}, nameLength, 1)); 228 | nameArray = vertcat(nameArray, authorName); 229 | end 230 | 231 | %% add the author names to the informational array 232 | authorArray = horzcat(authorArray, nameArray) 233 | 234 | **Output:** 235 | 236 | .. code-block:: matlab 237 | 238 | authorArray = 55x3 cell 239 | 1 2 3 240 | 1 '10.1108/RSR-08-2021-0051' '0' 'Emy Decker' 241 | 2 '10.1080/1072303X.2021.1929642' '0' 'Emy Decker' 242 | 3 '10.1080/15367967.2021.1900740' '8' 'Emy Decker' 243 | 4 '10.1080/15367967.2020.1826951' '0' 'Emy Decker' 244 | 5 '10.1080/10691316.2020.1781725' '0' 'Emy Decker' 245 | 6 '10.1145/3347709.3347805' '0' 'Emy Decker' 246 | 7 '10.4018/978-1-5225-5631-2.ch09' '3' 'Emy Decker' 247 | ... 248 | ... 249 | ... 250 | 251 | Save Record Data to a file 252 | ------------------------------- 253 | 254 | .. code-block:: matlab 255 | 256 | %% save the search for each author to a mat file 257 | for author = 1:length(author_ids) 258 | authorName = authorList2{author, 1}{1, 1}; 259 | q2 = webread(api_url + "AU-ID" + "(" + author_ids{1, author} + ")&apiKey=" + myAPIKey); 260 | pause(1) 261 | filename = authorName + ".mat"; 262 | save(filename, "q2"); 263 | end 264 | 265 | .. code-block:: matlab 266 | 267 | %% save the author arrays to individual text files 268 | for i = 1:(length(numRecords)) 269 | clear individualAuthorData; 270 | individualDois = info{1, 1}{1, i}; 271 | individualCites = info{2, 1}{1, i}; 272 | 273 | nameLength = int16(numRecords{i}); 274 | authorName = cellstr(repmat(authorList2{i, 1}{1, 1}, nameLength, 1)); 275 | 276 | individualAuthorData = horzcat(individualDois, individualCites); 277 | individualAuthorData = horzcat(individualAuthorData, authorName); 278 | 279 | writecell(individualAuthorData, (authorList2{i, 1}{1, 1} + ".txt"), "Delimiter", "\t"); 280 | end 281 | 282 | 3. Get References via a Title Search 283 | ===================================== 284 | 285 | Number of Title Match Records 286 | ---------------------------------- 287 | 288 | Search Scopus for all references containing "ChemSpider" in the record title. 289 | 290 | .. code-block:: matlab 291 | 292 | %% set up the API information 293 | api_url = "https://api.elsevier.com/content/search/scopus?query="; 294 | title_search = "TITLE(ChemSpider)&apiKey="; 295 | 296 | %% find the information for ChemSpider and get the total number of results 297 | q3 = webread(api_url + title_search + myAPIKey); 298 | q3.search_results.opensearch_totalResults 299 | 300 | Repeat this in a loop to get the number of Scopus records for each title search. 301 | 302 | .. 
code-block:: matlab 303 | 304 | %% create a list of titles 305 | titleList = ["ChemSpider", "PubChem", "ChEMBL", "Reaxys", "SciFinder"]; 306 | length(titleList) 307 | 308 | %% create an array of ones to pre-allocate numRecordsTitle 309 | clear numRecordsTitle 310 | numRecordsTitle = {ones(length(titleList), 1)}; 311 | 312 | %% obtain the number of records for each title in the list and create an array 313 | for i = 1:length(titleList) 314 | qt = webread(api_url + "TITLE(" + titleList(i) + ")&apiKey=" + myAPIKey); 315 | numt = qt.search_results.opensearch_totalResults; 316 | numRecordsTitle{1, i}{1, 1} = titleList(i); 317 | numRecordsTitle{1, i}{1, 2} = numt; 318 | pause(1) 319 | end 320 | 321 | Download Title Match Record Data 322 | ------------------------------------ 323 | 324 | Download records and create a list of selected metadata. 325 | 326 | .. code-block:: matlab 327 | 328 | %% create a list of titles and preallocate an array 329 | titleList = ["ChemSpider", "PubChem", "ChEMBL", "Reaxys", "SciFinder"]; 330 | scopusTitleData = {ones(length(titleList), 1)}; 331 | %% find the dois, titles, and dates for each title in the list and put them into an array 332 | for t = 1:length(titleList) 333 | qt = webread(api_url + "TITLE(" + titleList(t) + ")&apiKey=" + myAPIKey); 334 | n = length(qt.search_results.entry); 335 | doiTitles = cell(1, length(titleList)); 336 | titles = cell(1, length(titleList)); 337 | dates = cell(1, length(titleList)); 338 | for k = 1:n 339 | try 340 | doiTitles{1, t}{k, 1} = qt.search_results.entry{k, 1}.prism_doi; 341 | titles{1, t}{k, 1} = qt.search_results.entry{k, 1}.dc_title; 342 | dates{1, t}{k, 1} = qt.search_results.entry{k, 1}.prism_coverDate; 343 | catch 344 | end 345 | end 346 | pause(1) 347 | infoTitles{1, 1}{1, t} = doiTitles{1, t}; 348 | infoTitles{2, 1}{1, t} = titles{1, t}; 349 | infoTitles{3, 1}{1, t} = dates{1, t}; 350 | end 351 | 352 | .. code-block:: matlab 353 | 354 | %% create an overall array of the information found above 355 | titleDois = {}; 356 | titlesFinal = {}; 357 | datesFinal = {}; 358 | for t = 1:width(info{1, 1}) 359 | titleDois = vertcat(titleDois, infoTitles{1, 1}{1, t}); 360 | titlesFinal = vertcat(titlesFinal, infoTitles{2, 1}{1, t}); 361 | datesFinal = vertcat(datesFinal, infoTitles{3, 1}{1, t}); 362 | end 363 | titleArray = horzcat(titleDois, titlesFinal); 364 | titleArray = horzcat(titleArray, datesFinal); 365 | %% create an array of names and add it to the overall array 366 | titlesNameArray = {}; 367 | for t = 1:length(titleList) 368 | nameLength = length(infoTitles{1, 1}{1, t}); 369 | titlesAuthorName = cellstr(repmat(titleList(t), nameLength, 1)); 370 | titlesNameArray = vertcat(titlesNameArray, titlesAuthorName); 371 | end 372 | titleArray = horzcat(titleArray, titlesNameArray) 373 | 374 | **Output:** 375 | 376 | .. 
321 | Download Title Match Record Data 322 | ------------------------------------ 323 | 324 | Download records and create a list of selected metadata. 325 | 326 | .. code-block:: matlab 327 | 328 | %% create a list of titles and preallocate an array 329 | titleList = ["ChemSpider", "PubChem", "ChEMBL", "Reaxys", "SciFinder"]; 330 | scopusTitleData = {ones(length(titleList), 1)}; 331 | %% find the dois, titles, and dates for each title in the list and put them into an array 332 | for t = 1:length(titleList) 333 | qt = webread(api_url + "TITLE(" + titleList(t) + ")&apiKey=" + myAPIKey); 334 | n = length(qt.search_results.entry); 335 | doiTitles = cell(1, length(titleList)); 336 | titles = cell(1, length(titleList)); 337 | dates = cell(1, length(titleList)); 338 | for k = 1:n 339 | try 340 | doiTitles{1, t}{k, 1} = qt.search_results.entry{k, 1}.prism_doi; 341 | titles{1, t}{k, 1} = qt.search_results.entry{k, 1}.dc_title; 342 | dates{1, t}{k, 1} = qt.search_results.entry{k, 1}.prism_coverDate; 343 | catch 344 | end 345 | end 346 | pause(1) 347 | infoTitles{1, 1}{1, t} = doiTitles{1, t}; 348 | infoTitles{2, 1}{1, t} = titles{1, t}; 349 | infoTitles{3, 1}{1, t} = dates{1, t}; 350 | end 351 | 352 | .. code-block:: matlab 353 | 354 | %% create an overall array of the information found above 355 | titleDois = {}; 356 | titlesFinal = {}; 357 | datesFinal = {}; 358 | for t = 1:width(infoTitles{1, 1}) 359 | titleDois = vertcat(titleDois, infoTitles{1, 1}{1, t}); 360 | titlesFinal = vertcat(titlesFinal, infoTitles{2, 1}{1, t}); 361 | datesFinal = vertcat(datesFinal, infoTitles{3, 1}{1, t}); 362 | end 363 | titleArray = horzcat(titleDois, titlesFinal); 364 | titleArray = horzcat(titleArray, datesFinal); 365 | %% create an array of names and add it to the overall array 366 | titlesNameArray = {}; 367 | for t = 1:length(titleList) 368 | nameLength = length(infoTitles{1, 1}{1, t}); 369 | titlesAuthorName = cellstr(repmat(titleList(t), nameLength, 1)); 370 | titlesNameArray = vertcat(titlesNameArray, titlesAuthorName); 371 | end 372 | titleArray = horzcat(titleArray, titlesNameArray) 373 | 374 | **Output:** 375 | 376 | .. code-block:: matlab 377 | 378 | titleArray = 88x4 cell 379 | 1 2 3 4 380 | 1 '10.1039/c5np90022k' 'Editorial: ChemSpider-a tool for Natural Products research' '2015-08-01' 'ChemSpider' 381 | 2 '10.1021/bk-2013-1128.ch020' 'ChemSpider: How a free community resource of data can support the teaching of nmr spectroscopy' '2013-01-01' 'ChemSpider' 382 | 3 '10.1007/s13361-011-0265-y' 'Identification of "known unknowns" utilizing accurate mass data and chemspider' '2012-01-01' 'ChemSpider' 383 | 4 '10.1002/9781118026038.ch22' 'Chemspider: A Platform for Crowdsourced Collaboration to Curate Data Derived From Public Compound Databases' '2011-05-03' 'ChemSpider' 384 | 5 '10.1021/ed100697w' 'Chemspider: An online chemical information resource' '2010-11-01' 'ChemSpider' 385 | 6 '10.1016/j.bioorg.2022.105648' 'Structure-based discovery of a specific SHP2 inhibitor with enhanced blood–brain barrier penetration from PubChem database' '2022-04-01' 'PubChem' 386 | 7 '10.1016/j.jmb.2022.167514' 'PubChem Protein, Gene, Pathway, and Taxonomy Data Collections: Bridging Biology and Chemistry through Target-Centric Views of PubChem Data' '2022-01-01' 'PubChem' 387 | 8 '10.1007/s40011-021-01335-x' 'Identification a Novel Inhibitor for Aldo–Keto Reductase 1 C3 by Virtual Screening of PubChem Database' '2022-01-01' 'PubChem' 388 | 9 '10.1007/978-1-0716-2067-0_27' 'Plant Reactome and PubChem: The Plant Pathway and (Bio)Chemical Entity Knowledgebases' '2022-01-01' 'PubChem' 389 | 10 '10.1016/j.molstruc.2021.130968' '3CLpro and PLpro affinity, a docking study to fight COVID19 based on 900 compounds from PubChem and literature. Are there new drugs to be found?' '2021-12-05' 'PubChem' 390 | 11 '10.1093/glycob/cwab078' 'Enhancing the interoperability of glycan data flow between ChEBI, PubChem and GlyGen' '2021-11-01' 'PubChem' 391 | ... 392 | ... 393 | ... 394 | 395 | -------------------------------------------------------------------------------- /c/sdirect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "088987c6-311d-4677-9cc9-19ceeeb245b9", 6 | "metadata": {}, 7 | "source": [ 8 | "# ScienceDirect API in C\n", 9 | "\n", 10 | "by Cyrus Gomes\n", 11 | "\n", 12 | "**ScienceDirect**: https://www.sciencedirect.com/\n", 13 | "\n", 14 | "**Elsevier Developer Portal:** https://dev.elsevier.com/\n", 15 | "\n", 16 | "**ScienceDirect APIs Specification:** https://dev.elsevier.com/sd_api_spec.html\n", 17 | "\n", 18 | "**Elsevier How to Guide: Text Mining:** https://dev.elsevier.com/tecdoc_text_mining.html\n", 19 | "\n", 20 | "Please check with your institution for their Text and Data Mining Agreement with Elsevier.\n", 21 | "\n", 22 | "These recipe examples use the Elsevier ScienceDirect Article (Full-Text) API. This tutorial content is intended to help facilitate academic research. Before continuing or reusing any of this code, please be aware of Elsevier’s API policies and appropriate use-cases; for example, Elsevier has detailed policies regarding [text and data mining of Elsevier full-text content](https://dev.elsevier.com/text_mining.html). 
If you have copyright or other related text and data mining questions, please contact The University of Alabama Libraries.\n", 23 | "\n", 24 | "*These recipe examples were tested February 2024.*" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "f257ddd2-982a-4179-99c0-0b8d572ac57d", 30 | "metadata": {}, 31 | "source": [ 32 | "## Setup" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "ea50676e", 38 | "metadata": {}, 39 | "source": [ 40 | "First, install the curl and jq packages by typing the following command in the terminal:" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "e70df78e", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "!sudo apt install curl jq" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "01bdb3e7", 56 | "metadata": {}, 57 | "source": [ 58 | "Then, we create the Science_Direct directory that will hold our project files:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "id": "462ac307", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "!mkdir Science_Direct" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "49c44a5e", 74 | "metadata": {}, 75 | "source": [ 76 | "Finally, we change into the directory we created:" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "d9c9efaf", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "%cd Science_Direct" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "61baba0a-06e0-4a83-bf96-da483ca02742", 92 | "metadata": {}, 93 | "source": [ 94 | "### Create a variable for API Key\n", 95 | "\n", 96 | "Save your API key to a separate text file, then create a variable for your key. Avoid displaying your API key in your terminal (to prevent accidental sharing). " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "c85dc169", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# Create the key file\n", 107 | "!touch \"apiKey.txt\"" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "cc466078", 113 | "metadata": {}, 114 | "source": [ 115 | "We use the following command to access the key, since Jupyter does not allow variable sharing for bash scripts." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "3842b766", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# Read the key from the file\n", 126 | "!apiKey=$(cat \"apiKey.txt\")" 127 | ] 128 | },
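{
 "cell_type": "markdown",
 "id": "key-permissions-note",
 "metadata": {},
 "source": [
  "Optionally, we can restrict the key file's permissions so that other users on a shared machine cannot read it (a minimal sketch of one simple precaution):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "key-permissions-code",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Sketch: make the key file readable and writable by the owner only\n",
  "!chmod 600 \"apiKey.txt\""
 ]
},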
129 | { 130 | "cell_type": "markdown", 131 | "id": "997bb204-db57-4730-addd-47abd59b67ef", 132 | "metadata": {}, 133 | "source": [ 134 | "### Identifier Note\n", 135 | "\n", 136 | "We will use DOIs as the article identifiers. See our Crossref and Scopus API tutorials for workflows on how to create lists of DOIs and identifiers for specific searches and journals. The Elsevier ScienceDirect Article (Full-Text) API also accepts other identifiers like Scopus IDs and PubMed IDs (see API specification documents linked above)." 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "7efb081b", 142 | "metadata": {}, 143 | "source": [ 144 | "### Create an executable for API calls\n", 145 | "\n", 146 | "First, we can initialize a folder for all the project files and change to that directory:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "2b9fa9fc", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "!mkdir api_results" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "530852eb", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "%cd api_results" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "7403abb3", 172 | "metadata": {}, 173 | "source": [ 174 | "We utilize the `%%file` command to create the following makefile, which will compile our program and create an executable." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 1, 180 | "id": "437a955c", 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Writing makefile\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "%%file makefile\n", 193 | "\n", 194 | "# Set the variable CC to gcc, which is used to build the program\n", 195 | "CC=gcc\n", 196 | "\n", 197 | "# Enable debugging information and enable all compiler warnings\n", 198 | "CFLAGS=-g -Wall\n", 199 | "\n", 200 | "# Set the bin variable as the name of the binary file we are creating\n", 201 | "BIN=sDirect_data\n", 202 | "\n", 203 | "# Create the binary file with the name we put\n", 204 | "all: $(BIN)\n", 205 | "\n", 206 | "# Map any file ending in .c to a binary executable. \n", 207 | "# \"$<\" represents the .c file and \"$@\" represents the target binary executable\n", 208 | "%: %.c\n", 209 | "\n", 210 | "\t# Compile the .c file using the gcc compiler with the CFLAGS and link the\n", 211 | "\t# resulting binary with the CURL library\n", 212 | "\t$(CC) $(CFLAGS) $< -o $@ -lcurl\n", 213 | "\n", 214 | "# Clean target which removes specific files\n", 215 | "clean:\n", 216 | "\n", 217 | "\t# Remove the binary file and any \".dSYM\" (debug symbol) directories;\n", 218 | "\t# the RM command uses -r to remove directories and -f to force delete\n", 219 | "\t$(RM) -rf $(BIN) *.dSYM\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "64046fbe", 225 | "metadata": {}, 226 | "source": [ 227 | "This command is used again to create our .c file, which contains the code for the program." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 4, 233 | "id": "41946be2", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "Overwriting sDirect_data.c\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "%%file sDirect_data.c\n", 246 | "\n", 247 | "#include <stdio.h>\n", 248 | "#include <stdlib.h>\n", 249 | "#include <string.h>\n", 250 | "#include <curl/curl.h>\n", 251 | "\n", 252 | "/*CURL program that retrieves Science Direct data from\n", 253 | " https://api.elsevier.com/content/article/doi/ */\n", 254 | "\n", 255 | "int main (int argc, char* argv[]) {\n", 256 | " \n", 257 | " // If arguments are invalid then return\n", 258 | " if (argc < 2) { \n", 259 | " printf(\"Error. Please try again correctly. 
(./sDirect_data -doi [doi] -key [key])\\n\");\n", 260 | " return -1;\n", 261 | " }\n", 262 | " \n", 263 | " // Initialize the CURL HTTP connection\n", 264 | " CURL *curl = curl_easy_init();\n", 265 | "\n", 266 | " // Bits of the URL that are joined together later \n", 267 | " char api[] = \"https://api.elsevier.com/content/article/doi/\"; \n", 268 | " char url[1000];\n", 269 | " char label1[] = \"?APIKey=\";\n", 270 | " char label2[] = \"&httpAccept=text/xml\";\n", 271 | " char doi[] = \"10.1016/j.tetlet.2017.07.080\";\n", 272 | "\n", 273 | " // Check if CURL initialization is a success or not\n", 274 | " if (!curl) { \n", 275 | " fprintf(stderr, \"init failed\\n\");\n", 276 | " return EXIT_FAILURE;\n", 277 | " }\n", 278 | " \n", 279 | " /* Here are different ways of calling the program in the\n", 280 | " command line and integrating doi and parameter fields.*/\n", 281 | "\n", 282 | " // Has the -doi flag: ./sDirect_data -doi\n", 283 | " if ((argc==2) && (strcmp(argv[1],\"-doi\")==0)) {\n", 284 | " \n", 285 | " // Combine the API and default DOI to produce a functioning URL\n", 286 | " sprintf(url, \"%s%s\", api, doi); \n", 287 | " \n", 288 | " }\n", 289 | " \n", 290 | " // Has the -doi flag and field: ./sDirect_data -doi [doi]\n", 291 | " else if ((argc==3) && (strcmp(argv[1],\"-doi\")==0)) {\n", 292 | " \n", 293 | " // Combine the API and custom DOI\n", 294 | " sprintf(url, \"%s%s\", api, argv[2]); \n", 295 | " \n", 296 | " }\n", 297 | " \n", 298 | " // Has the -doi and -key flags and the key field: ./sDirect_data -doi -key [key]\n", 299 | " else if ((argc==4) && (strcmp(argv[2],\"-key\")==0) && (strcmp(argv[1],\"-doi\")==0)) {\n", 300 | " \n", 301 | " // Combine the API, default DOI, and key to produce a functioning URL\n", 302 | " sprintf(url, \"%s%s%s%s%s\", api, doi, label1, argv[3], label2); \n", 303 | " \n", 304 | " }\n", 305 | " \n", 306 | " // Has the -key and -doi flags and the key and doi field: ./sDirect_data -key [key] -doi [doi]\n", 307 | " else if ((argc==5) && (strcmp(argv[1],\"-key\")==0) && (strcmp(argv[3],\"-doi\")==0)) {\n", 308 | " \n", 309 | " // Combine the API, custom DOI, and key to produce the URL\n", 310 | " sprintf(url, \"%s%s%s%s%s\", api, argv[4], label1, argv[2], label2); \n", 311 | " \n", 312 | " }\n", 313 | " \n", 314 | " // Has the -doi and -key flags and the doi and key field: ./sDirect_data -doi [doi] -key [key]\n", 315 | " else if ((argc==5) && (strcmp(argv[3],\"-key\")==0)) {\n", 316 | " \n", 317 | " // Combine the API, custom DOI, and key to produce the URL\n", 318 | " sprintf(url, \"%s%s%s%s%s\", api, argv[2], label1, argv[4], label2); \n", 319 | " \n", 320 | " }\n", 321 | " \n", 322 | " // If the arguments are invalid then return\n", 323 | " else { \n", 324 | " printf(\"./sDirect_data -doi [doi] -key [key]\\n\"); \n", 325 | " curl_easy_cleanup(curl);\n", 326 | " return 0;\n", 327 | " } \n", 328 | "\n", 329 | " // Set the URL to which the HTTP request will be sent\n", 330 | " // First parameter is the initialized curl HTTP request, second is the option to be set, and third is the value to be set\n", 331 | " curl_easy_setopt(curl, CURLOPT_URL, url);\n", 332 | "\n", 333 | " // Perform the HTTP request and store the outcome\n", 334 | " CURLcode result = curl_easy_perform(curl);\n", 335 | "\n", 336 | " // If the result is not retrieved then output an error\n", 337 | " if (result != CURLE_OK) { \n", 338 | " fprintf(stderr, \"download problem: %s\\n\", curl_easy_strerror(result));\n", 339 | " }\n", 340 | "\n", 341 | " // Deallocate memory for the CURL connection\n", 342 | " curl_easy_cleanup(curl); \n", 343 | " return EXIT_SUCCESS;\n", 344 | "}" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 5, 350 | "id": "504d29df", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "gcc -g -Wall sDirect_data.c -o sDirect_data -lcurl\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "!make" 363 | ] 364 | },
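{
 "cell_type": "markdown",
 "id": "usage-check-note",
 "metadata": {},
 "source": [
  "As a quick check that the build succeeded, we can run the executable with no arguments; the program then prints the usage message defined in the argument handling above (a minimal sketch):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "usage-check-code",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Sketch: running without arguments prints the usage string from the C program\n",
  "!./sDirect_data"
 ]
},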
365 | { 366 | "cell_type": "markdown", 367 | "id": "5554fdf8-98cd-4bf5-bb44-f16e30b859c8", 368 | "metadata": {}, 369 | "source": [ 370 | "## 1. Retrieve full-text XML of an article\n", 371 | "\n", 372 | "This example downloads an XML file with the article full-text by calling the API. The DOI used in this example comes from a *Tetrahedron Letters* article:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 14, 378 | "id": "97826b88", 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "%%bash\n", 383 | "\n", 384 | "# Store the key in the key variable\n", 385 | "key=$(cat apiKey.txt)\n", 386 | "\n", 387 | "# -key [key] can also be used to input the key to the program\n", 388 | "# ./sDirect_data -doi \"$doi\" -key \"$key\"\n", 389 | "\n", 390 | "# Call the program using a doi and assign it to a variable\n", 391 | "fulltext1=$(./sDirect_data -doi \"10.1016/j.tetlet.2017.07.080\")\n", 392 | "\n", 393 | "# Save the output to fulltext1.xml\n", 394 | "echo \"$fulltext1\" > fulltext1.xml" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "id": "56c376b1-411c-4f1b-b117-dd006fd74181", 400 | "metadata": {}, 401 | "source": [ 402 | "## 2. Retrieve plain text of an article\n", 403 | "\n", 404 | "This example downloads a text file with the article full-text by calling the API. The DOI used in this example comes from a *Tetrahedron Letters* article:" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 16, 410 | "id": "24642a97", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "%%bash\n", 415 | "\n", 416 | "# Store the key in the key variable\n", 417 | "key=$(cat apiKey.txt)\n", 418 | "\n", 419 | "# -key [key] can also be used to input the key to the program\n", 420 | "# ./sDirect_data -doi \"$doi\" -key \"$key\"\n", 421 | "\n", 422 | "# Call the program using a doi and assign it to a variable\n", 423 | "fulltext2=$(./sDirect_data -doi \"10.1016/j.tetlet.2022.153680\")\n", 424 | "\n", 425 | "# Save the output to fulltext2.txt\n", 426 | "echo \"$fulltext2\" > fulltext2.txt" 427 | ] 428 | },
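{
 "cell_type": "markdown",
 "id": "preview-output-note",
 "metadata": {},
 "source": [
  "To confirm that the retrieval worked, we can preview the beginning of the saved file (a quick optional check, sketched with `head`):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "preview-output-code",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Sketch: show the first 300 bytes of the downloaded plain text\n",
  "!head -c 300 fulltext2.txt"
 ]
},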
429 | { 430 | "cell_type": "markdown", 431 | "id": "9bd40795-646f-4962-b11d-ca967c06e9cf", 432 | "metadata": {}, 433 | "source": [ 434 | "## 3. Retrieve full-text in a loop\n", 435 | "\n", 436 | "This example retrieves the full-text for a list of articles given their DOIs. The articles are downloaded as plain text, and the example DOIs are from *Tetrahedron Letters* articles." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 5, 442 | "id": "3d6d8776", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "%%bash\n", 447 | "\n", 448 | "# List of 5 DOIs for testing (bash array elements are separated by whitespace, not commas)\n", 449 | "dois=('10.1016/j.tetlet.2018.10.031'\n", 450 | " '10.1016/j.tetlet.2018.10.033'\n", 451 | " '10.1016/j.tetlet.2018.10.034'\n", 452 | " '10.1016/j.tetlet.2018.10.038'\n", 453 | " '10.1016/j.tetlet.2018.10.041')\n", 454 | "\n", 455 | "# Store the key in the key variable\n", 456 | "key=$(cat apiKey.txt)\n", 457 | "\n", 458 | "# Loop through the list of DOIs\n", 459 | "for doi in \"${dois[@]}\"; do\n", 460 | " \n", 461 | " # Can't save files with a '/' character on Linux\n", 462 | " filename=$(echo \"$doi\" | tr '/' '_')\n", 463 | " \n", 464 | " # Concatenate \"_plain_text.txt\" to the filename\n", 465 | " filename=\"${filename}_plain_text.txt\"\n", 466 | " \n", 467 | " # -key [key] can also be used to input the key to the program\n", 468 | " # ./sDirect_data -doi \"$doi\" -key \"$key\"\n", 469 | " \n", 470 | " # Call the program using a DOI and assign it to a variable\n", 471 | " article=$(./sDirect_data -doi \"$doi\")\n", 472 | " \n", 473 | " # Save the output to the .txt file (the extension is already part of $filename)\n", 474 | " echo \"$article\" > \"$filename\"\n", 475 | "\n", 476 | "done" 477 | ] 478 | } 479 | ], 480 | "metadata": { 481 | "kernelspec": { 482 | "display_name": "Python 3 (ipykernel)", 483 | "language": "python", 484 | "name": "python3" 485 | }, 486 | "language_info": { 487 | "codemirror_mode": { 488 | "name": "ipython", 489 | "version": 3 490 | }, 491 | "file_extension": ".py", 492 | "mimetype": "text/x-python", 493 | "name": "python", 494 | "nbconvert_exporter": "python", 495 | "pygments_lexer": "ipython3", 496 | "version": "3.10.12" 497 | } 498 | }, 499 | "nbformat": 4, 500 | "nbformat_minor": 5 501 | } 502 | -------------------------------------------------------------------------------- /shell/pubmed.rst: -------------------------------------------------------------------------------- 1 | PubMed API in Bash 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | 4 | by Avery Fernandez and Vincent F. Scalfani 5 | 6 | These recipe examples were tested on November 23, 2022 using GNOME Terminal in Ubuntu 18.04. 7 | 8 | **NCBI Entrez Programming Utilities documentation:** https://www.ncbi.nlm.nih.gov/books/NBK25501/ 9 | 10 | **Please see NCBI’s Data Usage Policies and Disclaimers:** https://www.ncbi.nlm.nih.gov/home/about/policies/ 11 | 12 | .. note:: 13 | 14 | This tutorial uses ``curl`` and ``jq`` for interacting with the PubMed API. You may also be interested in using the `NCBI EDirect command line program `_. We have workshop materials for EDirect with PubMed in our `UALIB Workshops repository `_. 15 | 16 | Program requirements 17 | ========================= 18 | 19 | In order to run this code, you will need to first install `curl`_, `jq`_, and `gnuplot`_. curl is used to request the data from the API, jq is used to parse the JSON data, and gnuplot is used to plot the data. 20 | 21 | .. _curl: https://github.com/curl/curl 22 | .. _jq: https://stedolan.github.io/jq/ 23 | .. _gnuplot: http://www.gnuplot.info/ 24 | 25 | 1. Basic PubMed API call 26 | ============================= 27 | 28 | For calling individual articles and publications, we will need to use this API URL: 29 | 30 | .. code-block:: shell 31 | 32 | summary='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&' 33 | 
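A note on rate limits before we begin: NCBI asks users to send no more than 3 E-utilities requests per second without an API key (10 per second with one), which is why the loops in this tutorial include ``sleep 1``. As a sketch, an NCBI API key (assumed here to be saved in a file named ``ncbiKey.txt``) can be supplied with the ``api_key`` parameter:

.. code-block:: shell

   # Sketch: append an NCBI API key to raise the allowed request rate
   ncbiKey=$(cat ncbiKey.txt)
   url=$summary$'id=27933103&retmode=json&api_key='$ncbiKey
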
34 | Request data from PubMed API 35 | ------------------------------- 36 | 37 | The article we are requesting has PubMed ID: 27933103. **retmode** in the web API URL specifies the file format; in this example, we will use JSON. 38 | 39 | .. code-block:: shell 40 | 41 | url=$summary$'id=27933103&retmode=json' 42 | data_call=$(curl -s $url) 43 | echo $data_call 44 | 45 | 46 | **Output:** 47 | 48 | .. code-block:: shell 49 | 50 | {"header":{"type":"esummary","version":"0.3"},"result":{"uids":["27933103"],"27933103":{"uid":"27933103","pubdate":"2016","epubdate":"2016 Nov 23","source":"J Cheminform","authors":[{"name":"Scalfani VF","authtype":"Author","clusterid":""},{"name":"Williams AJ","authtype":"Author","clusterid":""},{"name":"Tkachenko V","authtype":"Author","clusterid":""},{"name":"Karapetyan K","authtype":"Author","clusterid":""},{"name":"Pshenichnov A","authtype":"Author","clusterid":""},{"name":"Hanson RM","authtype":"Author","clusterid":""},{"name":"Liddie JM","authtype":"Author","clusterid":""},{"name":"Bara JE","authtype":"Author","clusterid":""}],"lastauthor":"Bara JE","title":"Programmatic conversion of crystal structures into 3D printable files using Jmol.","sorttitle":"programmatic conversion of crystal structures into 3d printable files using jmol","volume":"8","issue":"","pages":"66","lang":["eng"],"nlmuniqueid":"101516718","issn":"1758-2946","essn":"1758-2946","pubtype":["Journal Article"],"recordstatus":"PubMed","pubstatus":"258","articleids":[{"idtype":"pubmed","idtypen":1,"value":"27933103"},{"idtype":"pmc","idtypen":8,"value":"PMC5122160"},{"idtype":"pmcid","idtypen":5,"value":"pmc-id: PMC5122160;"},{"idtype":"doi","idtypen":3,"value":"10.1186/s13321-016-0181-z"},{"idtype":"pii","idtypen":4,"value":"181"}],"history":[{"pubstatus":"received","date":"2016/08/15 00:00"},{"pubstatus":"accepted","date":"2016/11/16 00:00"},{"pubstatus":"entrez","date":"2016/12/10 06:00"},{"pubstatus":"pubmed","date":"2016/12/10 06:00"},{"pubstatus":"medline","date":"2016/12/10 06:01"}],"references":[],"attributes":["Has Abstract"],"pmcrefcount":33,"fulljournalname":"Journal of cheminformatics","elocationid":"","doctype":"citation","srccontriblist":[],"booktitle":"","medium":"","edition":"","publisherlocation":"","publishername":"","srcdate":"","reportnumber":"","availablefromurl":"","locationlabel":"","doccontriblist":[],"docdate":"","bookname":"","chapter":"","sortpubdate":"2016/11/23 00:00","sortfirstauthor":"Scalfani VF","vernaculartitle":""}}} 51 | 52 | 53 | .. note:: 54 | 55 | The silent option (``-s``) for curl was used to hide the progress output. 56 | 57 | Let's extract the authors of the paper: 58 | 59 | .. code-block:: shell 60 | 61 | echo $data_call | jq '.["result"]["27933103"]["authors"][]["name"]' 62 | 63 | 64 | **Output:** 65 | 66 | .. code-block:: shell 67 | 68 | "Scalfani VF" 69 | "Williams AJ" 70 | "Tkachenko V" 71 | "Karapetyan K" 72 | "Pshenichnov A" 73 | "Hanson RM" 74 | "Liddie JM" 75 | "Bara JE" 76 | 77 | 2. Request data using a loop 78 | ============================ 79 | 80 | First, create an array of PubMed IDs: 81 | 82 | .. code-block:: shell 83 | 84 | idList=('34813985' '34813932' '34813684' '34813661' '34813372' '34813140' '34813072') 85 | 86 | We can loop through the ``idList`` as follows: 87 | 88 | .. code-block:: shell 89 | 90 | for id in "${idList[@]}" 91 | do 92 | echo $id 93 | done 94 | 95 | **Output:** 96 | 97 | .. 
code-block:: shell 98 | 99 | 34813985 100 | 34813932 101 | 34813684 102 | 34813661 103 | 34813372 104 | 34813140 105 | 34813072 106 | 107 | For storing data when looping through the IDs, we can use associative arrays. For example: 108 | 109 | .. code-block:: shell 110 | 111 | declare -A myarray 112 | myarray["34813985"]="data1" 113 | myarray["34813932"]="data2" 114 | echo ${myarray["34813985"]} 115 | echo ${myarray["34813932"]} 116 | 117 | **Output:** 118 | 119 | .. code-block:: shell 120 | 121 | data1 122 | data2 123 | 124 | For extracting specific data from the returned PubMed data, we will use jq with the ``--arg`` option, which allows us to pass data into the jq environment, such as an ID variable: 125 | 126 | 127 | .. code-block:: shell 128 | 129 | data=$(curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=34813072&retmode=json") 130 | 131 | 132 | .. code-block:: shell 133 | 134 | echo $data | jq '.["result"]["34813072"]' 135 | 136 | or, alternatively: 137 | 138 | .. code-block:: shell 139 | 140 | id="34813072" 141 | echo $data | jq --arg location "$id" '.["result"][$location]' 142 | 143 | **Output:** 144 | 145 | .. code-block:: shell 146 | 147 | { 148 | "uid": "34813072", 149 | "pubdate": "2022", 150 | "epubdate": "", 151 | "source": "Methods Mol Biol", 152 | "authors": [ 153 | { 154 | "name": "Liu S", 155 | "authtype": "Author", 156 | "clusterid": "" 157 | }, 158 | { 159 | "name": "Narancic T", 160 | "authtype": "Author", 161 | "clusterid": "" 162 | }, 163 | { 164 | "name": "Davis C", 165 | "authtype": "Author", 166 | "clusterid": "" 167 | }, 168 | { 169 | "name": "O'Connor KE", 170 | "authtype": "Author", 171 | "clusterid": "" 172 | } 173 | ], 174 | "lastauthor": "O'Connor KE", 175 | "title": "CRISPR-Cas9 Editing of the Synthesis of Biodegradable Polyesters Polyhydroxyalkanaotes (PHA) in Pseudomonas putida KT2440.", 176 | "sorttitle": "crispr cas9 editing of the synthesis of biodegradable polyesters polyhydroxyalkanaotes pha in pseudomonas putida kt2440", 177 | "volume": "2397", 178 | "issue": "", 179 | "pages": "341-358", 180 | "lang": [ 181 | "eng" 182 | ], 183 | "nlmuniqueid": "9214969", 184 | "issn": "1064-3745", 185 | "essn": "1940-6029", 186 | "pubtype": [ 187 | "Journal Article" 188 | ], 189 | "recordstatus": "PubMed - indexed for MEDLINE", 190 | "pubstatus": "4", 191 | "articleids": [ 192 | { 193 | "idtype": "pubmed", 194 | "idtypen": 1, 195 | "value": "34813072" 196 | }, 197 | { 198 | "idtype": "doi", 199 | "idtypen": 3, 200 | "value": "10.1007/978-1-0716-1826-4_17" 201 | } 202 | ], 203 | "history": [ 204 | { 205 | "pubstatus": "entrez", 206 | "date": "2021/11/23 12:28" 207 | }, 208 | { 209 | "pubstatus": "pubmed", 210 | "date": "2021/11/24 06:00" 211 | }, 212 | { 213 | "pubstatus": "medline", 214 | "date": "2022/01/27 06:00" 215 | } 216 | ], 217 | "references": [], 218 | "attributes": [ 219 | "Has Abstract" 220 | ], 221 | "pmcrefcount": "", 222 | "fulljournalname": "Methods in molecular biology (Clifton, N.J.)", 223 | "elocationid": "doi: 10.1007/978-1-0716-1826-4_17", 224 | "doctype": "citation", 225 | "srccontriblist": [], 226 | "booktitle": "", 227 | "medium": "", 228 | "edition": "", 229 | "publisherlocation": "", 230 | "publishername": "", 231 | "srcdate": "", 232 | "reportnumber": "", 233 | "availablefromurl": "", 234 | "locationlabel": "", 235 | "doccontriblist": [], 236 | "docdate": "", 237 | "bookname": "", 238 | "chapter": "", 239 | "sortpubdate": "2022/01/01 00:00", 240 | "sortfirstauthor": "Liu S", 241 | "vernaculartitle": "" 242 
| } 243 | 244 | Finally, we can extract specific elements, such as the journal title (source). 245 | 246 | .. code-block:: shell 247 | 248 | id="34813072" 249 | echo $data | jq --arg location "$id" '.["result"][$location]["source"]' 250 | 251 | 252 | **Output:** 253 | 254 | .. code-block:: shell 255 | 256 | "Methods Mol Biol" 257 | 258 | Now, combine these steps to loop through the list of IDs and extract the journal titles: 259 | 260 | .. code-block:: shell 261 | 262 | idList=('34813985' '34813932' '34813684' '34813661' '34813372' '34813140' '34813072') 263 | declare -A multiPapers 264 | for ids in "${idList[@]}" 265 | do 266 | multiPapers[$ids]=$(curl -s $summary$'id='$ids$'&retmode=json') 267 | sleep 1 268 | done 269 | for ids in "${idList[@]}" 270 | do 271 | echo ${multiPapers[$ids]} | jq --arg location "$ids" '.result[$location]["source"]' 272 | done 273 | 274 | **Output:** 275 | 276 | .. code-block:: shell 277 | 278 | "Cell Calcium" 279 | "Methods" 280 | "FEBS J" 281 | "Dev Growth Differ" 282 | "CRISPR J" 283 | "Chembiochem" 284 | "Methods Mol Biol" 285 | 286 | 3. PubMed API Calls with Requests and Parameters 287 | ========================================================= 288 | 289 | For searching for articles, we will need to use this API URL: 290 | 291 | .. code-block:: shell 292 | 293 | search='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&' 294 | 295 | When searching through articles, we are given a few ways of filtering the data. A list of all the available parameters for these requests can be found in the official NCBI documentation: 296 | 297 | https://www.ncbi.nlm.nih.gov/books/NBK25499/ 298 | 299 | We can specify the database by putting ``db=`` into the URL. We will be using the PubMed database. We can also search for a term by adding ``term=``. Just be sure to replace spaces with a ``+``. We can, for example, use a query to search PubMed, such as “neuroscience intervention learning”: 300 | 301 | .. code-block:: shell 302 | 303 | url=$search$"term=neuroscience+intervention+learning&retmode=json" 304 | data=$(curl -s $url) 305 | 306 | We can also use the query to search for an author. 307 | 308 | We will add ``[au]`` after the name to specify that it is an author: 309 | 310 | .. code-block:: shell 311 | 312 | url=$search$"term=Darwin[au]&retmode=json" 313 | data=$(curl -s $url) 314 | echo $data 315 | 316 | **Output:** 317 | 318 | .. code-block:: shell 319 | 320 | {"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"603","retmax":"20","retstart":"0","idlist":["36374290","36370080","36363931","36342372","36315101","36254119","36164491","36102812","36100038","36098658","36082519","35993699","35916364","35834740","35732810","35719898","35714393","35513308","35507730","35475719"],"translationset":[],"querytranslation":"Darwin[Author]"}} 321 | 322 | 323 | The number of returned IDs can be adjusted with the ``retmax`` parameter: 324 | 325 | 326 | .. code-block:: shell 327 | 328 | url=$search$"term=Darwin[au]&retmax=30&retmode=json" 329 | data=$(curl -s $url) 330 | echo $data | jq '.["esearchresult"]["idlist"]' 331 | 332 | **Output:** 333 | 334 | .. code-block:: shell 335 | 336 | [ 337 | "36374290", 338 | "36370080", 339 | "36363931", 340 | "36342372", 341 | "36315101", 342 | "36254119", 343 | "36164491", 344 | "36102812", 345 | "36100038", 346 | "36098658", 347 | "36082519", 348 | "35993699", 349 | "35916364", 350 | "35834740", 351 | "35732810", 352 | "35719898", 353 | "35714393", 354 | "35513308", 355 | "35507730", 356 | "35475719", 357 | "35414258", 358 | "35301788", 359 | "35293777", 360 | "35122809", 361 | "35100046", 362 | "35073334", 363 | "35038915", 364 | "35034540", 365 | "34927345", 366 | "34923869" 367 | ] 368 | 
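Note that the ``esummary`` endpoint used earlier also accepts a comma-separated list of IDs, so summaries for several records can be retrieved in a single request. A minimal sketch using the IDs returned above (``$summary`` is the esummary URL defined in section 1):

.. code-block:: shell

   # Sketch: fetch summaries for all of the returned IDs in one esummary call
   ids=$(echo $data | jq -r '.["esearchresult"]["idlist"] | join(",")')
   batch=$(curl -s $summary$'id='$ids$'&retmode=json')
   echo $batch | jq '.["result"]["uids"] | length'
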
369 | We can get the number of IDs after a bit of cleanup with ``tr`` and ``awk``: 370 | 371 | .. code-block:: shell 372 | 373 | echo $data | jq '.["esearchresult"]["idlist"]' | tr -d ' "[],' | awk 'NF' | wc -l 374 | 375 | **Output:** 376 | 377 | .. code-block:: shell 378 | 379 | 30 380 | 381 | We can sort results using **usehistory=y**, which stores the search results on the NCBI history server so that they can be sorted within the same API call. The addition of **sort=pub+date** will sort the IDs by publication date. 382 | 383 | .. code-block:: shell 384 | 385 | url=$search$"term=Coral+Reefs&retmode=json&usehistory=y&sort=pub+date" 386 | data=$(curl -s $url) 387 | 388 | 389 | We can also search based on publication type by adding **AND** into the search term: **term=<query>+AND+filter[filterType]**. 390 | 391 | **[pt]** specifies that the filter type is the publication type. More filters can be found at: https://pubmed.ncbi.nlm.nih.gov/help/. 392 | 393 | 394 | .. code-block:: shell 395 | 396 | url=$search$"term=stem+cells+AND+clinical+trial[pt]&retmode=json" 397 | data=$(curl -s $url) 398 | sleep 1 399 | echo $data 400 | 401 | 402 | 4. PubMed API Metadata Visualization 403 | =========================================== 404 | 405 | Frequency of topic sortpubdate field 406 | ----------------------------------------- 407 | 408 | Extracting the sortpubdate field from the results of a “hydrogel drug” search, limited to the clinical trial publication type: 409 | 410 | .. code-block:: shell 411 | 412 | search='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&' 413 | url=$search$"term=hydrogel+drug+AND+clinical+trial[pt]&sort=pub+date&retmax=500&retmode=json" 414 | data=$(curl -s $url) 415 | 416 | Get the length of results: 417 | 418 | .. code-block:: shell 419 | 420 | echo $data | jq '.["esearchresult"]["idlist"] | length' 421 | 422 | **Output:** 423 | 424 | .. code-block:: shell 425 | 426 | 299 427 | 428 | Next, loop through each ID and get the sortpubdate field. Note that this sortpubdate field may not necessarily be equivalent to a publication date: 429 | 430 | .. code-block:: shell 431 | 432 | declare -a idList 433 | for (( id = 0; id < $(echo $data | jq '.["esearchresult"]["idlist"] | length'); id++ )) 434 | do 435 | idList+=($(echo $data | jq ".esearchresult.idlist[$id]" | tr -d '"')) 436 | done 437 | 438 | Get the length of the array: 439 | 440 | .. code-block:: shell 441 | 442 | echo ${#idList[@]} 443 | 444 | **Output:** 445 | 446 | .. code-block:: shell 447 | 448 | 299 449 | 450 | Show the first 10 IDs: 451 | 452 | .. code-block:: shell 453 | 454 | echo ${idList[@]:0:10} 455 | 456 | **Output:** 457 | 458 | .. code-block:: shell 459 | 460 | 36203046 36261491 35830550 34653384 35556170 35413602 35041809 34915741 34695615 35062896 461 | 462 | Now, loop through each ID, get the sortpubdate, and save it to a file. Note: this will take a few minutes: 463 | 464 | .. 
code-block:: shell 465 | 466 | summary='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&' 467 | for ids in ${idList[@]} 468 | do 469 | url=$summary$"id="$ids$"&retmode=json" 470 | data=$(curl -s $url) 471 | sleep 1 472 | echo $data | jq --arg location "$ids" '.["result"][$location]["sortpubdate"]' >> pubDates.csv 473 | done 474 | 475 | Finally, plot the data using gnuplot. See the `gnuplot documentation`_ for more information about the smooth frequency histogram. 476 | 477 | .. _gnuplot documentation: http://www.gnuplot.info/documentation.html 478 | 479 | .. code-block:: shell 480 | 481 | gnuplot -e "set datafile separator ','; \ 482 | set title 'sortpubdate'; 483 | set term dumb; 484 | binwidth=2; \ 485 | bin(val)=binwidth*floor(val/binwidth); \ 486 | plot 'pubDates.csv' using (bin(column(1))):(1.0) smooth frequency with boxes notitle" 487 | 488 | **Output:** 489 | 490 | .. code-block:: shell 491 | 492 | sortpubdate 493 | 494 | 35 +---------------------------------------------------------------------+ 495 | | + + + + + +**** + + | 496 | | * * | 497 | 30 |-+ * **** +-| 498 | | **** * * ***** | 499 | 25 |-+ * **** * * * * +-| 500 | | * * **** * * * * **** | 501 | | * * * **** * * **** * | 502 | 20 |-+ * * * * * * * * * * +-| 503 | | * * * * * * * * * * | 504 | 15 |-+ * * * * * * * * * * +-| 505 | | * * * * * * * * * ** | 506 | | * * * * * * * * * ** | 507 | 10 |-+ * * * * * * * * * ** +-| 508 | | **** **** ******* * * * * * * * * ** | 509 | 5 |-+ * **** ******** * * * * * * * * * * ** +-| 510 | | * * * * * * * * * * * * * * * * ** | 511 | |************** +* * * * *+ * * * * * +* * * * *+ ** | 512 | 0 +---------------------------------------------------------------------+ 513 | 1980 1985 1990 1995 2000 2005 2010 2015 2020 2025 514 | 515 | 516 | Frequency of publication for an author search 517 | ----------------------------------------------- 518 | 519 | .. code-block:: shell 520 | 521 | search='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&' 522 | url=$search$"term=Reed+LK[au]&sort=pub+date&retmax=500&retmode=json" 523 | data=$(curl -s $url) 524 | 525 | Next, create the list of IDs: 526 | 527 | .. code-block:: shell 528 | 529 | declare -a idList 530 | for (( id = 0; id < $(echo $data | jq '.["esearchresult"]["idlist"] | length'); id++ )) 531 | do 532 | idList+=($(echo $data | jq ".esearchresult.idlist[$id]" | tr -d '"')) 533 | done 534 | 535 | Get the length of the array: 536 | 537 | .. code-block:: shell 538 | 539 | echo ${#idList[@]} 540 | 541 | **Output:** 542 | 543 | .. code-block:: shell 544 | 545 | 55 546 | 547 | Next, collect the sortpubdate for each ID: 548 | 549 | .. code-block:: shell 550 | 551 | summary='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&' 552 | for ids in ${idList[@]} 553 | do 554 | url=$summary$"id="$ids$"&retmode=json" 555 | data=$(curl -s $url) 556 | sleep 1 557 | echo $data | jq --arg location "$ids" '.["result"][$location]["sortpubdate"]' >> pubDates2.csv 558 | done 559 | 560 | Plot the data: 561 | 562 | .. code-block:: shell 563 | 564 | gnuplot -e "set datafile separator ','; \ 565 | set title 'sortpubdate'; 566 | set term dumb; 567 | binwidth=3; \ 568 | bin(val)=binwidth*floor(val/binwidth); \ 569 | plot 'pubDates2.csv' using (bin(column(1))):(1.0) smooth frequency with boxes notitle" 570 | 571 | 572 | **Output:** 573 | 574 | .. 
code-block:: shell 575 | 576 | sortpubdate 577 | 578 | 16 +---------------------------------------------------------------------+ 579 | | + + + + + + + *** + | 580 | 14 |-+ * * +-| 581 | | * **** | 582 | 12 |-+ * * * +-| 583 | | * * * | 584 | | * * * | 585 | 10 |-+ * * * +-| 586 | | * * * | 587 | 8 |-+ *** * * * +-| 588 | | * * * * * | 589 | 6 |-+ * * * * * +-| 590 | | ***** * *** * * | 591 | 4 |-+ * * * * * * ** +-| 592 | | * * * * * * ** | 593 | | * * * * * * ** | 594 | 2 |-+ *************** * * * * * ** +-| 595 | |********************************* + * * **** * * +** | 596 | 0 +---------------------------------------------------------------------+ 597 | 1940 1950 1960 1970 1980 1990 2000 2010 2020 2030 598 | 599 | 600 | --------------------------------------------------------------------------------