├── data
│   ├── .gitignore
│   ├── .DS_Store
│   └── Data_folder_README.md
├── tableau-visualization
│   ├── tableau-screenshots
│   │   ├── .gitignore
│   │   ├── details.png
│   │   ├── publish.png
│   │   ├── edit colors.png
│   │   ├── edit group.png
│   │   ├── go to sheet.png
│   │   ├── make a copy.png
│   │   ├── create extract.png
│   │   ├── copied dashboard.png
│   │   ├── data source tab.png
│   │   ├── edit filter org2.png
│   │   ├── new data source.png
│   │   ├── missing directory.png
│   │   ├── show sheets as tabs.png
│   │   ├── groups after editing.png
│   │   ├── single value list filter.png
│   │   └── summary dashboard in full visualization.png
│   ├── dashboard-documentation.md
│   └── customizing-dashboard.md
├── network-visualization
│   └── network-script
├── README.md
├── Rorcid_Crossref_Authors_Existing_List.R
└── Rorcid_Crossref_Authors.R
/data/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/data/.DS_Store
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/details.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/publish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/publish.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/edit colors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/edit colors.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/edit group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/edit group.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/go to sheet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/go to sheet.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/make a copy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/make a copy.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/create extract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/create extract.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/copied dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/copied dashboard.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/data source tab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/data source tab.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/edit filter org2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/edit filter org2.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/new data source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/new data source.png
--------------------------------------------------------------------------------
/data/Data_folder_README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Data_folder_README"
3 | output: html_document
4 | ---
5 |
6 | This folder will be empty until you run the code, so do not panic if you don't see anything.
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/missing directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/missing directory.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/show sheets as tabs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/show sheets as tabs.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/groups after editing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/groups after editing.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/single value list filter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/single value list filter.png
--------------------------------------------------------------------------------
/tableau-visualization/tableau-screenshots/summary dashboard in full visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/summary dashboard in full visualization.png
--------------------------------------------------------------------------------
/tableau-visualization/dashboard-documentation.md:
--------------------------------------------------------------------------------
1 | # Tableau dashboard documentation
2 | ## Major decisions
3 | * **Date filter:** Did not use a date filter because it causes issues due to how the data are pulled. For example, if there is even one DOI from 2020 and all of the rest of the DOIs are from 2021, using the date filter makes it appear as if the data consist of records from 2020 and 2021 – which is technically true, but very easy to misinterpret. It makes more sense to have users manually add a date.
4 | * **Field names:** Did not change any of the field names in Tableau so that the data can be replaced without any issues directly from the data pull. If you change any of the field names, whether in Tableau or the data source, you will have to make sure that the names are consistent between Tableau and the data source.
5 | * **How the dashboard is counting:** The total numbers are calculated as the distinct count of DOIs from the home/anchor institution. This means a paper with two collaborating authors is counted the same number of times (once) as a paper with ten collaborating authors, because the DOI itself represents the collaboration, not the number of authors. The same logic applies to the number of collaborating cities.
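For reference, the same distinct-count logic can be sketched in R (illustrative only; the `orcid_data` frame below is a made-up stand-in for the data pull, though **DOI** and **Org2** are real field names):

```r
library(dplyr)

# toy rows standing in for the data pull: one DOI shared by two collaborating
# institutions, one DOI with a single collaborating institution
orcid_data <- tibble(
  DOI  = c("10.1/a", "10.1/a", "10.2/b"),
  Org2 = c("Org X", "Org Y", "Org Z")
)

# each DOI counts once, however many collaborators it has
orcid_data %>% summarise(total_collaborations = n_distinct(DOI))  # 2
```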
6 | ## Groups
7 | **Institution grouping:** Each institution has to create two groups from the **Org2** field: one that is ONLY their institution, one that is ONLY other institutions. That makes it possible to filter out their own institution from the data.
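Conceptually (continuing the toy `orcid_data` frame from the sketch above, with Temple University standing in as the home institution), the two groups partition **Org2** like this:

```r
# every value of Org2 falls into exactly one of the two groups
org2_group <- ifelse(orcid_data$Org2 == "Temple University",
                     "Temple University Only",
                     "Excluding Temple University")
```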
8 | ## Measures
9 | **City2**, **DOI**, and **Orcid1** have all been duplicated and converted to measures in order to be able to get the counts of those fields.
10 | ## Maps/geocoding
11 | * The ***latitude*** and ***longitude*** fields are automatically generated by Tableau based on the geographic information in the data. For Tableau to be able to read and geocode the data, the information must be broken down into individual components: city, state, country.
12 | * Sometimes, Tableau doesn’t know how to code a city. You can manually resolve this issue. On the map, in the bottom right corner, it’ll say **[x number] Unknown**. Follow the instructions in the [Edit Unknown or Ambiguous Locations](https://help.tableau.com/current/pro/desktop/en-us/maps_editlocation.htm) resource to edit these locations.
13 | * In the data pull, the columns for state are called **Region1** (home institution) and **Region2** (collaborating institution).
14 | ## Further data cleaning
15 | Consider using a tool such as OpenRefine to further clean the dataset.
16 |
--------------------------------------------------------------------------------
/network-visualization/network-script:
--------------------------------------------------------------------------------
1 | # Network visualization script for rorcid output
2 | # Uses the NetSciX Workshop code by Katherine Ognyanova, www.kateto.net
3 | # with slight modifications/tailoring for ORCID data
4 | # For more information, contact LYRASIS at orcidus@lyrasis.org
5 |
6 | # Before running this script, make sure your data are in a
7 | # workable format (nodes and edges)
8 | # Follow data structure in workshop materials:
9 | # https://www.kateto.net/wp-content/uploads/2016/01/NetSciX_2016_Workshop.pdf
10 | # Name nodes and edges files "nodes.csv" and "edges.csv"
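# Illustrative column layout (assumed to match the workshop example data;
# adjust to your own files):
#   nodes.csv: id, name, ...           one row per node
#   edges.csv: from, to, weight, type  one row per tie; the aggregate() step
#                                      below assumes column 3 is the weight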
11 |
12 | # Install the package "igraph"
13 | # The package (www.igraph.org) is maintained by Gabor Csardi and Tamas Nepusz.
14 |
15 | install.packages("igraph")
16 |
17 | # Load the igraph package
18 |
19 | library(igraph)
20 |
21 | # Set the working directory to the folder containing the nodes and edges files:
22 |
23 | setwd("C:/Users/folder")
24 |
25 | # Load the datasets
26 |
27 | nodes <- read.csv("nodes.csv", header=T, as.is=T)
28 | links <- read.csv("edges.csv", header=T, as.is=T)
29 |
30 | # Examine the data:
31 | head(nodes)
32 | head(links)
33 | nrow(nodes); length(unique(nodes$id))
34 | nrow(links); nrow(unique(links[,c("from", "to")]))
35 |
36 | # Collapse multiple links of the same type between the same two nodes
37 | # by summing their weights, using aggregate() by "from", "to", & "type":
38 | links <- aggregate(links[,3], links[,-3], sum)
39 | links <- links[order(links$from, links$to),]
40 | colnames(links)[4] <- "weight"
41 | rownames(links) <- NULL
42 |
43 | # Converting the data to an igraph object:
44 | # We use graph_from_data_frame(), which takes two data frames: 'd' and 'vertices'.
45 | # 'd' describes the edges of the network - it should start with two columns
46 | # containing the source and target node IDs for each network tie.
47 | # 'vertices' should start with a column of node IDs.
48 | # Any additional columns in either data frame are interpreted as attributes.
49 |
50 | net <- graph_from_data_frame(d=links, vertices=nodes, directed=T)
51 |
52 | # Examine the resulting object:
53 | class(net)
54 | net
55 |
56 | # We can look at the nodes, edges, and their attributes:
57 | E(net)
58 | V(net)
59 |
60 | plot(net, edge.arrow.size=.4, vertex.label=NA)
61 |
62 | # Removing loops from the graph:
63 | net <- simplify(net, remove.multiple = F, remove.loops = T)
64 |
65 | # Extract data frames describing nodes and edges:
66 | as_data_frame(net, what="edges")
67 | as_data_frame(net, what="vertices")
68 |
69 | # Plotting with igraph: node options (starting with 'vertex.') and edge options
70 | # (starting with 'edge.'). A list of options is available:
71 | ?igraph.plotting
72 |
73 | # We can set the node & edge options in two ways: here we specify them
74 | # directly in the plot() function; they can also be set as graph attributes.
75 |
76 | # Plot with curved edges (edge.curved=.1) and reduce arrow size:
77 | plot(net, edge.arrow.size=.4, edge.curved=.1)
78 |
79 | # Set node color to orange and the border color to hex #555555
80 | plot(net, edge.arrow.size=.2, edge.curved=0,
81 | vertex.color="orange", vertex.frame.color="#555555")
82 |
83 | # Other layouts to experiment with
84 |
85 | # Randomly placed vertices
86 | l <- layout_randomly(net)
87 | plot(net, layout=l)
88 |
89 | # Circle layout
90 | l <- layout_in_circle(net)
91 | plot(net, layout=l)
92 |
93 | # 3D sphere layout
94 | l <- layout_on_sphere(net)
95 | plot(net, layout=l)
96 |
97 | # By default, igraph uses a layout called layout_nicely which selects
98 | # an appropriate layout algorithm based on the properties of the graph.
99 |
100 | # Check out all available layouts in igraph:
101 | ?igraph::layout_
102 |
103 | # R and igraph offer interactive plotting, mostly helpful for small networks
104 | # If your institution has more than a couple hundred collaborations,
105 | # stick with one of the static layouts above instead
106 |
107 | tkid <- tkplot(net) #tkid is the id of the tkplot
108 | l <- tkplot.getcoords(tkid) # grab the coordinates from tkplot
109 |
110 | # Shortest distance to the center indicates higher weight/higher number
111 | # of collaborations
112 |
113 | tk_close(tkid, window.close = T)
114 | plot(net, layout=l)
115 |
--------------------------------------------------------------------------------
/tableau-visualization/customizing-dashboard.md:
--------------------------------------------------------------------------------
1 | # Customizing your own Tableau dashboard
2 | Take the steps below to build your own Tableau dashboard. For questions about Tableau, check out the [Tableau Community Forums](https://community.tableau.com/s/topic/0TO4T000000QF9sWAG/tableau-public). If you run into issues with this Tableau dashboard, contact Lyrasis ORCID US Community support for assistance. Be sure to also review the [dashboard documentation](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/tableau-visualization/dashboard-documentation.md). If you run into challenges authoring on the web (in your browser), consider installing the Tableau Public application on your device ([installers](https://help.tableau.com/current/desktopdeploy/en-us/desktop_deploy_download.htm)).
3 | 1. Request a data pull from ORCID US Community support, or pull your own data using the R script in this repository. **Do not change any of the variable names.** Doing so will cause Tableau to be unable to recognize the variables.
4 | 2. Make sure that the name of the CSV from the data pull is **orcid-data**.
5 | 3. [Create a Tableau Public account](https://public.tableau.com/desktop/signup_unification.html) for yourself, your institution, or your department. Be sure to check in with your local IT department to ask about any possible restrictions or rules around creating an account.
6 | 4. While logged in to your Tableau Public account, navigate to the template dashboard featured on the [ORCID US Community Tableau Public profile](https://public.tableau.com/app/profile/orcid.us.community/viz/Lyrasis-CollaborationsDashboardTemplate/Fullvisualization).
7 | 5. Using the menu in the top right of the dashboard, click on the icon with two overlapping rectangles (**Make a copy**).
8 | 
9 | 6. This is what you should see at this point:
10 |
11 | 
12 |
13 | 7. In the top right corner, click on the blue **Publish As...** button, rename the dashboard as needed, and then click on the blue **Publish** button.
14 | 8. In the bottom left corner, click on **Data Source**.
15 | 9. You will see an error message: *The directory is missing or has been moved: Replace it with another file?*
16 | 10. Click **Yes**.
17 | 11. Drag and drop or upload your institution’s data file from its location on your device. Tableau will replace the existing data file with this file. This may take several minutes, depending on the file size.
18 | 12. Once the new data has been added, in the top right corner, click on **Create extract**, then click on **Create extract** again in the pop-up. Depending on the file size, this may take anywhere from several seconds to several minutes.
19 |
20 | 
21 |
22 | 13. Click on the **Summary dashboard** tab at the bottom.
23 | 14. At the top of the dashboard, double click on the title. Replace **[Organization name]**, **[time period from data pull]**, and **Data pulled on [date]** with your organization name, the time period requested from the data pull, and the date of the data pull, then click **OK** (make sure to delete the brackets after replacing the text). As a reminder, the script pulls data from January 1 of the year requested to the present date of the data pull.
24 | 15. Scroll down to the **Highest number of collaborations with the following institutions:** visualization. At the bottom of the visualization, double click the caption underneath the bars to fill in the text between the brackets with your own institution’s data.
25 | 16. Click anywhere on the **Highest number of collaborations with the following institutions:** visualization. Four small icons will appear on the right of the gray box around the visualization. Click on the small square with the arrow popping out (second icon down, called **Go to Sheet**). You can use this feature to navigate to individual visualizations from the dashboards.
26 |
27 | 
28 |
29 | 17. On the **Marks** card (one of the left-hand panes), click on **Color** to change the color of the bars. You can use the other features in the **Marks** card to edit the colors, sizes, and other aesthetic features of the visualizations.
30 |
31 | 
32 |
33 | 18. Click on the **Collaborations** tab, then click anywhere on the **Collaborations Map** visualization. Click on the **Go to Sheet** icon that appears on the top right corner of the visualization to go to the visualization.
34 | 19. On the left-hand side, under the **Data pane** that contains a long list of variables, click on the small white triangle to the right of **Org2 (group)**. Then, click on **Edit Group…**
35 |
36 | 
37 |
38 | 20. A list of institutions should pop up. If there are any intra-institution collaborations in the dataset (for example, a Temple University researcher who collaborated and published with another Temple University researcher), find your institution and click on it to highlight it, then click on **Group**. Label this group as **[Your institution name] Only**. Be sure not to edit the field name at the top (**Org2 (group)**). If there are no intra-institution collaborations in the dataset, delete this filter by clicking on the **x** in the top right when this filter is selected in the dashboard.
39 | 21. Follow the same steps to select all of the other institutions (including **Null**) and group them. Label that group as **Excluding [your institution name]**.
40 |
41 | 
42 |
43 | 22. Close the pop-up.
44 | 23. At the bottom, click on the **Collaborations** tab.
45 | 24. Immediately below the first gray bar divider, click on the filter. On the right-hand side of the gray box that appears around the filter, click on the small white triangle, then select **Single Value (list)**.
46 |
47 | 
48 |
49 | 25. Click on the **Individual search** tab. Double click on the title to change the **[time period from data pull]** to the time period for your dataset and **Data pulled on [date]** to add the date of the data pull.
50 | 26. At the bottom, click on the **Why can't I find my ORCID iD?** tab. Double click on the text to add the appropriate contact information for ORCID support at your institution.
51 | 27. Make any other customizations to the dashboard. For support with using Tableau, refer to the Tableau Resources section.
52 | 28. In the top right corner, click on **Publish** or **Publish as…** to save the dashboard to your Tableau Public profile. Note that any filters you have selected will be saved once you publish the workbook, so clear any filters that you do not want to be set as the default and make sure you are on the **Full visualization** tab.
53 |
54 | 
55 |
56 | 29. The **Full visualization** tab of the dashboard contains all of the dashboard tabs in a neater, guided format. To display only the **Full visualization** tab, navigate to the published version of the dashboard, then click on the **settings** in the top right corner (indicated by the gear icon). Deselect **Show Sheets** to show only the **Full visualization** tab.
57 |
58 | 
59 |
60 | 30. You can also decide if you want this dashboard to be visible on your profile and if you want to allow others to download or make a copy of your visualization in the **settings**.
61 | 31. Scroll down to the **Details** section. Click on the **pencil icon** to the right of **Details** to edit the details for the dashboard, such as the title and description. Click on **Save Changes** when you’re finished with your edits.
62 |
63 | 
64 |
65 | 32. Review your visualization for any accessibility issues using [accessibility resources](https://github.com/lyrasis/ORCID-Data-Visualization#tableau-and-accessibility-resources). As a general note, any changes that you make to the dashboard as an **editor** will persist. Any changes that you make to the dashboard as a **viewer** (e.g., using a filter, excluding values) can be reset by refreshing the dashboard.
66 |
67 | If you run into any issues with these steps, refer to the Tableau resources below or reach out to Lyrasis ORCID US Community support for further assistance.
68 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Exploring Publication Collaborations using ORCID and DOI data
2 | ## Project Summary
3 | The resources available in this GitHub repository can be used to create a visualization of publication collaboration activity, based on public data from researchers' ORCID records and Crossref DOI publication metadata. The R script in this repository can be used to retrieve information about publishing collaborations between researchers at a home organization and other organizations across the globe. The resulting CSV file can then be loaded into a [Tableau Public](https://public.tableau.com/app/discover) dashboard to create a collaboration map and additional views to explore the data further. This project was made possible by a 2022 partnership between the [ORCID US Community](https://orcidus.lyrasis.org/) (administered by Lyrasis) and the Drexel University [LEADING program](https://mrc.cci.drexel.edu/leading/). For more information, including full instructions for how to use the tools in this repository, please [see our website](https://orcidus.lyrasis.org/data-visualization/).
4 | ## Retrieving the data
5 | We recommend using [RStudio](https://posit.co/) to run the [R script](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/Rorcid_Crossref_Authors.R), which will produce the data needed to create the visualization. The script is designed to:
6 |
7 | * Retrieve ORCID iDs for researchers who have a current, publicly visible employment affiliation for a home institution on their ORCID record
8 | * Unpack the publicly visible works information present on each ORCID record
9 | * Retrieve Crossref DOI metadata for every work that has a Crossref DOI included on the ORCID work citation
10 | * Unpack list of co-authors included in the Crossref DOI metadata for each work
11 | * Retrieve ORCID iD for every co-author, if available
12 | * Check current employment affiliation on the ORCID record of every co-author
13 | * Get location information for the co-authors’ institutions
14 | * Repackage data into CSV file containing home author ORCID iDs, co-author ORCID iDs and institutional affiliations/geographic location, and publication DOIs
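In code terms, the heart of that pipeline is a handful of rorcid and rcrossref calls. Below is a simplified sketch only; the example institution and example DOIs are placeholders, and the real logic (column handling, co-author lookups, cleanup) lives in the script itself:

```r
library(rorcid)
library(rcrossref)

# 1. find ORCID iDs with an employment affiliation at the home institution
hits <- rorcid::orcid(query = 'affiliation-org-name:"Temple University"')

# 2. unpack the works on each record, keeping items with a Crossref DOI
works <- rorcid::orcid_works(hits$`orcid-identifier.path`)

# 3. retrieve Crossref metadata (including the co-author list) for each DOI
meta <- rcrossref::cr_works(dois = c("10.1000/example1", "10.1000/example2"))
```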
15 |
16 | You can use the green “code” button above to download a ZIP file, which will contain the R script as well as a folder labeled “data,” which is where the resulting CSV file will be saved. Or, you can download just the R script and create your own “data” folder separately.
17 |
18 | Before you get started, you will need to gather the following information for your organization, based on how broad or narrow you want your search to be:
19 |
20 | * Organization Name(s)
21 | * Organization Email Domain(s)
22 | * Organization ROR ID (search the [ROR registry](https://ror.org/search))
23 | * Organization GRID ID (often included as an "other identifier" in the [ROR registry](https://ror.org/search))
24 | * Organization Ringgold ID(s) (to find the Ringgold ID(s) for your organization, you can create a guest account at [https://ido.ringgold.com/register](https://ido.ringgold.com/register), or you can email orcidus@lyrasis.org and we can find your Ringgold ID for you)
25 |
26 | For help retrieving any of this information, contact orcidus@lyrasis.org.
27 |
28 | Open the R script file in RStudio. The script contains a number of comments and instructions. Comments are indicated by a hashtag (#) preceding the comment text. Any line with a hashtag in front will not be run as a command. Lines with no hashtag will be run as commands when entered in RStudio.
29 |
30 | The first time you run the script, there are a few things you will need to do to get set up. Once these things are done, you should not have to do them again:
31 |
32 | * Install the packages listed in the script by un-commenting, or removing the hashtags from, the install commands (see the example after this list). Once installed, you should not have to install the packages again, so in future sessions you can leave the hashtags in. However, you will still need to load the packages each time you run the script.
33 | * Get your [ORCID Public API keys](https://info.orcid.org/documentation/features/public-api/) (Client ID and Client Secret). You need to have an ORCID iD account in order to get Public API keys. If you don’t have an ORCID iD account, you can register for one, for free, at [https://orcid.org/register](https://orcid.org/register). Follow the instructions in the script.
34 | * Get your ORCID API bearer token in RStudio. Follow the instructions provided in the script.
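For example, un-commenting an install command just means deleting the leading hashtag:

```r
# before (commented out; will not run):
#install.packages('dplyr')

# after (runs once to install the package):
install.packages('dplyr')
```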
35 |
36 | Next, you will plug in your unique values to perform the search and data retrieval process. You will need to enter the following values (follow the instructions in the script):
37 |
38 | * Your working directory = the file path leading to your “data” folder where the final CSV output will be stored. The path needs to look something like this: /Users/rabun/Desktop/ORCID-Data-Visualization-main/data
39 | * The year you want to start searching for publications
40 | * Ringgold ID, GRID ID, ROR ID, Email domain, and Organization name - this allows the script to start by finding all ORCID records that have one or more of these values present in the Employment section of individuals’ ORCID records. For example:
41 | * ringgold_id <- "14703"
42 | * grid_id <- "grid.253554.0"
43 | * ror_id <- "https://ror.org/04v097707"
44 | * email_domain <- "@csuci.edu"
45 | * organization_name <- "California State University, Channel Islands"
46 |
47 | Note that if you want to search for multiple organization names (and thus have multiple different identifiers) and multiple email domains, there is a section of the script that provides the option to set multiple values for the search (see below).
48 |
49 | * Keyword = a word that is unique to your institution and that will serve to narrow the search results to just your organization. For example, “Temple” could be the keyword if searching for results from Temple University. If your institution has common words in the name, you may want to use the entire organization name as the keyword. For example, the keyword “New” would not be helpful for a search for “The New School” because multiple organizations have the word “New” in the name.
50 | * Geographic information for your organization, including city, state, and country. For example:
51 | * anchor_org<-"California State University, Channel Islands"
52 | * anchor_city<-"Camarillo"
53 | * anchor_region<-"CA"
54 | * anchor_country<-"US"
55 |
56 | At this point, the script provides two options for creating the initial query:
57 | 1) Run the search based on the values that you already entered
58 | 2) If you want to search for multiple campuses or specific departments at your organization, you will need to enter those additional values and go from there.
59 |
60 | Now you can continue to run the commands and follow the instructions within the script.
61 |
62 | Note that the script has various sections, and there are opportunities for you to export the data so far after each section so you can write it back in later without having to run the whole script again. This can be helpful if you get interrupted or if you don’t have time to run the whole script in one sitting.
63 |
64 | Note that there is one more part of the script, in the “get employment data” section, where you will have two options:
65 | 1) Accept all of the organization names returned by the initial query
66 | 2) View and edit the list of organization names to be included in the search results.
67 |
68 | Continue to follow the instructions and run the script commands, until you get to the end, where the final CSV data file will be exported to your data folder.
69 | Once you have the CSV output, you may want to check and clean organization names and city names using [OpenRefine](https://openrefine.org/). This can be helpful for collapsing multiple variations of the same organization name (misspellings, extra spaces, etc.), and for filling in any city information that may be missing or incorrect.
70 |
71 | ## Considerations and context for the data
72 | **Data errors:** The data pulled using the R script are imperfect and contain gaps, as well as user and machine errors. The numbers in the data pull are not definitive. The data pulled for your institution are a snapshot for a specific period of time and may change as researchers obtain/update their ORCID profiles and continue to publish.
73 |
74 | Some examples of data errors that may exist in the data are:
75 | * Missing ORCID iDs
76 | * Missing geographic information that leads to missing data points on the collaborations map
77 | * Typos in the institution name or city/country that lead to missing or erroneous ORCID iDs included in the data pulls
78 |
79 | It’s important to highlight that these data shouldn’t be used to evaluate or compare researchers against one another, because the data are not perfect and do not give a full picture of collaborations and impact. The resources in this repository provide just one angle through which to approach this information.
80 |
81 | **Collaboration counting:** In the data pull, collaborations are counted separately for each home author. For example, if 2 researchers at Temple (the home institution) author a paper with researchers from the University of Texas, each Temple author is credited with 1 collaboration within Temple and 1 collaboration with UT. In other words, for the home institution as a whole, it’s counted as 2 collaborations within Temple and 2 collaborations with UT. In the Tableau dashboard, however, each DOI is counted as one collaboration for the institution.
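A tiny worked example of the two counting approaches (toy data frame; illustrative column names):

```r
# one paper: two Temple home authors, co-authored with UT
pull <- data.frame(
  home_author = c("Temple author 1", "Temple author 2"),
  DOI         = "10.1000/xyz",
  Org2        = "University of Texas"
)

nrow(pull)               # 2 -> script-level count: one collaboration per home author
length(unique(pull$DOI)) # 1 -> dashboard count: distinct DOIs
```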
82 |
83 | **Current or previous institutions:** The data pull for each home author covers their entire career, and the script pulls the current institution for each collaborating author. This reduces blanks, which are more common when trying to pinpoint affiliation at the time of DOI minting, because ORCID profiles often lack historical employment entries. It also avoids potential discrepancies between the date of DOI minting and the date of publication (which is sometimes blank), and it treats home and collaborating authors the same in terms of counting.
84 |
85 | **Dates:** You may see discrepancies between the DOI issue date and the publication date due to different DOI issuing processes. There may be a lag between when the DOI was issued and the publication date according to the journal, depending on the publisher’s workflow. The date used in this script is the DOI issue date, which allows for fewer blanks in the data. This is an area of future improvement for this project.
86 | ## Customizing your own Tableau dashboard
87 | Once you have the CSV output for your search, you can load your data into Tableau to create your visualization. Refer to the [Customizing your own Tableau dashboard](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/tableau-visualization/customizing-dashboard.md) page. Be sure to also review the [dashboard documentation](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/tableau-visualization/dashboard-documentation.md).
88 | ## Why can't I find my ORCID iD?
89 | For the “individual search” tab within the Tableau dashboard, if you or another individual is having trouble finding your/their ORCID iD in the data pull or the search, here are a few things you may want to check:
90 | ### 1. Do you have an ORCID profile set up?
91 | If you have not yet created an ORCID iD, please visit www.orcid.org to set up your ORCID profile. ORCID sets up a persistent digital identifier (also called an ORCID iD) to distinguish you from other researchers.
92 | ### 2. Did you set up your ORCID iD after the data pull?
93 | If you set up your ORCID iD after the data supporting this dashboard were pulled, then your ORCID iD will not show up in the dashboard until the data have been pulled again.
94 | ### 3. Is all of the information in your ORCID profile accurate?
95 | Take a moment to verify that your current institution and location are accurately listed in your ORCID profile -- typos happen! If you work remotely for an institution, you will have to list the institution's primary location in order to show up in the data. If you correct any information in your ORCID profile after the data supporting this dashboard were pulled, then your ORCID iD will not show up in the dashboard until the data have been pulled again.
96 | ### 4. Still not sure?
97 | Reach out to your campus ORCID administrator or Lyrasis for further troubleshooting.
98 | ## Tableau and accessibility resources
99 | * [Tableau, A Beginner’s Guide to Tableau Public](https://www.tableau.com/blog/beginners-guide-tableau-public)
100 | * [Authoring for Accessibility – Tableau](https://onlinehelp.tableau.com/current/pro/desktop/en-us/accessibility_create_view.htm)
101 | * [Tableau maps: Edit Unknown or Ambiguous Locations](https://help.tableau.com/current/pro/desktop/en-us/maps_editlocation.htm)
102 | * [Tableau Community Forums](https://community.tableau.com/welcome)
103 | * [Tableau Reference Guide](http://www.tableaureferenceguide.com/)
104 | * [Financial Times "Visual Vocabulary: Tableau Edition"](http://www.vizwiz.com/2018/07/visual-vocabulary.html)
105 | * [OneNumber, Tableau for Beginners](https://onenumber.biz/blog-1/2022/5/2/tableau-for-beginners-connect-to-data)
106 | ## Questions and support
107 | For any questions or support, or to provide feedback, please contact Lyrasis ORCID US Community support at orcidus@lyrasis.org.
108 | ## Usage License
109 | [Collaboration Data Visualization](https://github.com/lyrasis/ORCID-Data-Visualization) © 2022 by [Lyrasis](https://orcidus.lyrasis.org/data-visualization/) is licensed under [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/?ref=chooser-v1)
110 |
--------------------------------------------------------------------------------
/Rorcid_Crossref_Authors_Existing_List.R:
--------------------------------------------------------------------------------
1 | # Script by Olivia Given Castello, adapted by Sheila Rabun, based on: https://ciakovx.github.io/rorcid.html
2 | # and 04-rcrossref_metadata.R at https://github.com/ciakovx/fsci2022/tree/main/code
3 | # Retrieves ORCID profile and Crossref metadata for authors from an existing list of ORCID iDs,
4 | # since a given year, paired with that of the co-authors with whom they collaborated.
5 |
6 | # Install and load packages -----------------------------------------------
7 |
8 | # you will need to install these packages first, using the following
9 | # if you've already installed them, skip this step
10 | #install.packages('dplyr')
11 | #install.packages('tibble')
12 | #install.packages('tidyr')
13 | #install.packages('purrr')
14 | #install.packages('readr')
15 | #install.packages('stringr')
16 | #install.packages('jsonlite')
17 | #install.packages('lubridate')
18 | #install.packages('ggplot2')
19 | #install.packages('httr')
20 | #install.packages('forcats')
21 | #install.packages('rorcid')
22 | #install.packages('usethis')
23 | #install.packages('anytime')
24 | #install.packages('janitor')
25 | #install.packages('glue')
26 | #install.packages('remotes')
27 | #install.packages("ropensci/rcrossref")
28 | #install.packages('roadoi')
29 | #install.packages('inops')
30 | #install.packages("data.table")
31 | #install.packages("ropensci/geonames")
32 |
33 | # load the packages
34 | library(dplyr)
35 | library(tibble)
36 | library(tidyr)
37 | library(purrr)
38 | library(readr)
39 | library(stringr)
40 | library(jsonlite)
41 | library(lubridate)
42 | library(ggplot2)
43 | library(httr)
44 | library(forcats)
45 | library(usethis)
46 | library(anytime)
47 | library(janitor)
48 | library(glue)
49 | library(rorcid)
50 | library(rcrossref)
51 | library(roadoi)
52 | library(inops)
53 |
54 | # remove all objects from the environment to start with a clean slate
55 | rm(list = ls())
56 |
57 | # Set up orcid / crossref in R environment ------------------------------------------------------------
58 |
59 | # if you've already done these steps and set up your bearer token in RStudio
60 | # you can skip to the next section: "set some variables and build the query"
61 |
62 | # 1. If you haven’t done so already, create an ORCID account at https://orcid.org/signin.
63 | # 2. In the upper right corner, click your name, then in the drop-down menu, click Developer Tools. Note: In order to access Developer Tools, you must verify your email address.
64 | # 3. If you have not already verified your email address, you will be prompted to do so at this point.
65 | # 4. Click the “Register for the free ORCID public API” button
66 | # 5. Review and agree to the terms of service when prompted.
67 | # 6. Add your name in the Name field, https://www.orcid.org in the Your Website URL field, “Getting public API key” in Description field, and https://www.orcid.org in the redirect URI field. Click the diskette button to save.
68 | # 7. A gray box will appear including your Client ID and Client Secret. In the below code chunk, copy and paste the client ID and the client secret respectively.
69 | # 8. Make sure to leave the quotation marks (e.g. orcid_client_id <- "APP-FDFJKDSLF320SDFF" and orcid_client_secret <- "c8e987sa-0b9c-82ed-91as-1112b24234e").
70 |
71 | # copy/paste your client ID from https://orcid.org/developer-tools
72 | orcid_client_id <- "PASTE MY CLIENT ID HERE"
73 |
74 | # copy/paste your client secret from https://orcid.org/developer-tools
75 | orcid_client_secret <- "PASTE MY CLIENT SECRET HERE"
76 |
77 | # This gets a /read-public scope access token
78 | orcid_request <- POST(url = "https://orcid.org/oauth/token",
79 | config = add_headers(`Accept` = "application/json",
80 | `Content-Type` = "application/x-www-form-urlencoded"),
81 | body = list(grant_type = "client_credentials",
82 | scope = "/read-public",
83 | client_id = orcid_client_id,
84 | client_secret = orcid_client_secret),
85 | encode = "form")
86 |
87 | # parse the API request with content
88 | orcid_response <- content(orcid_request)
89 |
90 | # run the following code
91 | print(orcid_response$access_token)
92 |
93 | #You will see a string of text print out in your R console.
94 | # Copy that string to the clipboard
95 | # so we can save the token to our R environment
96 | # Run this code:
97 | usethis::edit_r_environ()
98 |
99 | # A new window will open in RStudio.
100 | # In this separate R environment page, type the following (except the pound sign):
101 | # ORCID_TOKEN="my-token"
102 | # replace 'my-token' with the access_token you just copied.
103 | # Then press enter to create a new line.
104 | # while we are here, we'll add in our rcrossref credentials
105 | # type crossref_email="name@example.com", using your own email address.
106 | # press enter to create a new line, and leave it blank.
107 | # Press Ctrl + S (Mac: Cmd + S) to save this information to your R environment and close the window.
108 | # You won't see anything happen here because it is just saving the page.
109 |
110 | # Click Session > Restart R. Your token should now be saved to your R environment.
111 |
112 | # You will now need to rerun all of the library() commands above, then return to this line.
113 |
114 | #You can confirm this worked by calling orcid_auth(), and it will print the token
115 | rorcid::orcid_auth()
116 |
117 |
118 | # set some variables and build the query --------------------------------------------------------
119 |
120 | # set the working directory where this script is
121 | # a folder called "data" is also expected to be in this directory
122 | # example: setwd("C:/Users/rabun/OneDrive - LYRASIS/Documents/RsearchResults")
123 | setwd("PASTE WORKING DIRECTORY HERE")
124 |
125 | # set the time period of interest: this script will compile collaboration data since Jan 1 of this year.
126 | # replace the YYYY with a 4 digit year.
127 | # the more years of data desired, the longer some portions of this script will take to run
128 | my_year <- YYYY
129 |
130 | # set the institution's main location information (for use when precise location info is blank)
131 | # example:
132 | # anchor_org<-"The Gordon and Betty Moore Foundation"
133 | # anchor_city<-"Palo Alto"
134 | # anchor_region<-"CA"
135 | # anchor_country<-"US"
136 | anchor_org<-"Organization Name"
137 | anchor_city<-"City"
138 | anchor_region<-"State"
139 | anchor_country<-"Country"
140 |
141 | # read in your list of existing ORCID iDs - it should be a csv file named my_orcids_data.csv and should be saved within the "data" folder in your working directory. It should be formatted with three columns:
142 | # first column should be titled orcid_identifier_uri and should contain the full ORCID iD URL for each person (example: https://orcid.org/0000-0002-0375-8429)
143 | # second column should be titled orcid_identifier_path and should contain just the 16-digit ORCID iD for each person (example: 0000-0002-0375-8429)
144 | # third column should be titled orcid_identifier_host and should contain the ORCID host name for each person (example: orcid.org)
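# Illustrative example of my_orcids_data.csv (made-up values):
#   orcid_identifier_uri,orcid_identifier_path,orcid_identifier_host
#   https://orcid.org/0000-0002-0375-8429,0000-0002-0375-8429,orcid.org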
145 | my_orcids_data <- read_csv("./data/my_orcids_data.csv", col_types = cols(.default = "c"))
146 |
147 |
148 | # get employment data -----------------------------------------------------
149 |
150 | # get the employments from the orcid_identifier_path column
151 | ##### TIME: be patient, this may take a long time (e.g. for Temple University's data [~3500 IDs], this took ~8 minutes)
152 | my_employment <- rorcid::orcid_employments(my_orcids_data$orcid_identifier_path)
153 |
154 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
155 | #to_write<-toJSON(my_employment, na="null")
156 | #write(to_write,"./data/employment.json")
157 |
158 | # read it back in, if necessary
159 | #my_employment <- read_json("./data/employment.json", simplifyVector = TRUE)
160 | ##### WRITE/READ JSON
161 |
162 | # extract the employment data and mutate the dates
163 | my_employment_data <- my_employment %>%
164 | purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>%
165 | purrr::flatten_dfr() %>%
166 | janitor::clean_names() %>%
167 | dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000),
168 | employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000),
169 | employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000))
170 |
171 | # clean up the column names
172 | names(my_employment_data) <- names(my_employment_data) %>%
173 | stringr::str_replace(., "employment_summary_", "") %>%
174 | stringr::str_replace(., "source_source_", "") %>%
175 | stringr::str_replace(., "organization_disambiguated_", "")
176 |
177 | # view the unique institutions in the organization names columns
178 | # keep in mind this will include all institutions a person has in their employments section
179 | my_organizations <- my_employment_data %>%
180 | group_by(organization_name) %>%
181 | count() %>%
182 | arrange(desc(n))
183 |
184 | # view the variation in organization names by looking at my_organizations (will open a new tab)
185 | # view(my_organizations)
186 |
187 | # Note that this will give you employment records only.
188 | # In other words, each row represents a single employment record for an individual.
189 | # the name_value variable refers specifically to the name of the person or system
190 | # that wrote the record, NOT the name of the individual.
191 |
192 | # To get that, you must first get all the unique ORCID iDs from the dataset:
193 |
194 | # There is no distinct value identifying the orcid ID of the person.
195 | # The orcid_path value corresponds to the path of the person who added the employment record (which is usually, but not always, the same)
196 | # Therefore you have to strip out the ORCID iD from the 'path' variable first, put it in its own variable, and use that
197 | # We do this using str_sub from the stringr package
198 | # While we are at it, we can select and reorder the columns we want to keep
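# e.g. path = "/0000-0002-0375-8429/employment/12345" (illustrative)
#      str_sub(path, 2, 20) -> "0000-0002-0375-8429"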
199 | current_employment_all <- my_employment_data %>%
200 | mutate(orcid_identifier = str_sub(path, 2, 20)) %>%
201 | select(any_of(c("orcid_identifier",
202 | "organization_name",
203 | "organization_address_city",
204 | "organization_address_region",
205 | "organization_address_country",
206 | "organization_identifier",
207 | "organization_disambiguated_organization_identifier",
208 | "organization_disambiguation_source",
209 | "department_name",
210 | "role_title",
211 | "url_value",
212 | "display_index",
213 | "visibility",
214 | "created_date_value",
215 | "start_date_year_value",
216 | "start_date_month_value",
217 | "start_date_day_value",
218 | "end_date_year_value",
219 | "end_date_month_value",
220 | "end_date_day_value")))
221 |
222 | # next, create a new vector unique_orcids that includes only unique ORCID iDs from our filtered dataset.
223 | unique_orcids <- unique(current_employment_all$orcid_identifier) %>%
224 | na.omit(.) %>%
225 | as.character()
226 |
227 | # then run the following expression to get all biographical information for those iDs.
228 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~1.5 minutes)
229 | my_orcid_person <- rorcid::orcid_person(unique_orcids)
230 |
231 | # then we construct a data frame from the response.
232 | # See more at https://ciakovx.github.io/rorcid.html#Getting_the_data_into_a_data_frame for this.
233 | my_orcid_person_data <- my_orcid_person %>% {
234 | dplyr::tibble(
235 | given_name = purrr::map_chr(., purrr::pluck, "name", "given-names", "value", .default=NA_character_),
236 |     created_date = purrr::map_chr(., purrr::pluck, "name", "created-date", "value", .default=NA_character_),
237 |     last_modified_date = purrr::map_chr(., purrr::pluck, "name", "last-modified-date", "value", .default=NA_character_),
238 | family_name = purrr::map_chr(., purrr::pluck, "name", "family-name", "value", .default=NA_character_),
239 | credit_name = purrr::map_chr(., purrr::pluck, "name", "credit-name", "value", .default=NA_character_),
240 | other_names = purrr::map(., purrr::pluck, "other-names", "other-name", "content", .default=NA_character_),
241 | orcid_identifier_path = purrr::map_chr(., purrr::pluck, "name", "path", .default = NA_character_),
242 | biography = purrr::map_chr(., purrr::pluck, "biography", "content", .default=NA_character_),
243 | researcher_urls = purrr::map(., purrr::pluck, "researcher-urls", "researcher-url", .default=NA_character_),
244 | emails = purrr::map(., purrr::pluck, "emails", "email", "email", .default=NA_character_),
245 | keywords = purrr::map(., purrr::pluck, "keywords", "keyword", "content", .default=NA_character_),
246 | external_ids = purrr::map(., purrr::pluck, "external-identifiers", "external-identifier", .default=NA_character_))
247 | } %>%
248 | dplyr::mutate(created_date = anytime::anydate(as.double(created_date)/1000),
249 | last_modified_date = anytime::anydate(as.double(last_modified_date)/1000))
250 |
251 | # Join it back with the employment records so that the employment data now includes organization city, region, country
252 | orcid_person_employment_join <- my_orcid_person_data %>%
253 | left_join(current_employment_all, by = c("orcid_identifier_path" = "orcid_identifier"))
254 |
255 | ##### WRITE/READ CSV uncomment to save this data and read it back in later
256 | #write_csv(orcid_person_employment_join, "./data/orcid_employment_file.csv")
257 |
258 | # read it back in, if necessary
259 | #orcid_person_employment_join <- read_csv("./data/orcid_employment_file.csv", col_types = cols(.default = "c"))
260 | ##### WRITE/READ CSV
261 |
262 |
263 | # get works data -----------------------------------------------------
264 |
265 | # create a vector of unique, unduplicated ORCID IDs from that file
266 | my_orcids <- orcid_person_employment_join %>%
267 | filter(!duplicated(orcid_identifier_path)) %>%
268 | pull(orcid_identifier_path) %>%
269 | na.omit() %>%
270 | as.character()
271 |
272 | # Call the orcid_works function to collect all works associated with each ID
273 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~2.5 minutes)
274 | my_works <- rorcid::orcid_works(my_orcids)
275 |
276 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
277 | #to_write<-toJSON(my_works, na="null")
278 | #write(to_write,"./data/my_works.json")
279 |
280 | # read it back in, if necessary
281 | #my_works <- read_json("./data/my_works.json", simplifyVector = TRUE)
282 | ##### WRITE/READ JSON
283 |
284 | # turn the JSON file into a unique data frame by looping through the file,
285 | # extracting ("pluck") each object and binding the rows together (this is the "_dfr" part of map_dfr)
286 | # then clean column names
287 | # and convert the dates from Unix time to yyyy-mm-dd
288 | my_works_data <- my_works %>%
289 | purrr::map_dfr(pluck, "works") %>%
290 | janitor::clean_names() %>%
291 | dplyr::mutate(created_date_value = anytime::anydate(created_date_value/1000),
292 | last_modified_date_value = anytime::anydate(last_modified_date_value/1000))
293 |
294 | # we only want to keep works that have an external identifier
295 | # (specifically, a DOI), so we first filter to keep only objects that have an external_id value
296 | # then unnest those: in other words expand to include a row for every work + external id value
297 | # (a single work might be linked to a DOI, a PubMed ID, an ISSN, etc.)
298 | my_works_externalIDs <- my_works_data %>%
299 | dplyr::filter(!purrr::map_lgl(external_ids_external_id, purrr::is_empty)) %>%
300 | tidyr::unnest(external_ids_external_id) %>%
301 | clean_names()
302 |
303 | # From those unnested external IDs, we want to keep only those with a DOI, as that is the
304 | # value we'll use to look up the items in Crossref.
305 | # We then select a few relevant columns, and finally create a new column DOI that takes the external_id_value column
306 | # and coerces it to lower case, and the orcid_identifier column which strips out the ORCID ID
307 | # from the path variable.
308 | dois <- my_works_externalIDs %>%
309 | filter(external_id_type == "doi") %>%
310 | select(type, path, title_title_value, external_id_type, external_id_value, external_id_relationship,
311 | url_value, publication_date_year_value, publication_date_month_value, publication_date_day_value,
312 | journal_title_value) %>%
313 | mutate(doi = tolower(external_id_value),
314 | orcid_identifier = str_sub(path, 2, 20))
315 |
316 | # there are some duplicated values here: we can't just look at duplicate DOIs because some of these
317 | # works were co-authored, and we want to keep that data (i.e. unique orcid + doi combinations)
318 | # This function will let you look at observations where both the orcid ID and the DOI are duplicated in
319 | # case you want to review them more closely.
320 | # In our case below, we just keep the first appearance of a unique orcid + doi combination and discard
321 | # all subsequent ones.
322 | dupes <- dois %>%
323 | get_dupes(orcid_identifier, doi)
324 |
325 | # Here we are preparing the orcid dataset for merging to publications.
326 | # We keep only Orcid ID, first name and last name, remove duplicates, and rename orcid_identifier
327 | orcid_empl_merge <- orcid_person_employment_join %>%
328 | select(orcid_identifier_path, given_name, family_name) %>%
329 | filter(!duplicated(orcid_identifier_path)) %>%
330 | rename(orcid_identifier = orcid_identifier_path)
331 |
332 | # Finally, we remove the duplicates by creating a new variable that is a combination of
333 | # the orcid ID and the DOI, and keeping only the first instance. We then join that to our
334 | # cleaned orcid ID file and write to csv
335 | dois_unduped <- dois %>%
336 | mutate(orcid_doi = paste0(orcid_identifier, doi)) %>%
337 | filter(!duplicated(orcid_doi)) %>%
338 | left_join(orcid_empl_merge, by = "orcid_identifier")
339 |
340 | ##### WRITE/READ CSV uncomment to save this data and read it back in later
341 | #write_csv(dois_unduped, "./data/orcid_dois.csv")
342 |
343 | # read it back in, if necessary
344 | #dois_unduped <- read_csv("./data/orcid_dois.csv")
345 | ##### WRITE/READ CSV
346 |
347 |
348 | # get CrossRef data -----------------------------------------------------
349 |
350 | # We start by subsetting our unduped dois to include only since the year that we want
351 | # this is the year of publication according to the ORCID profile works data
352 | dois_since_year <- dois_unduped %>%
353 | filter(publication_date_year_value >= my_year)
354 |
355 | # This will loop through the column of dois and perform a function that
356 | # prints the doi (this allows you to ensure it's progressing)
357 | # there will be warning messages for any DOIs not found at CrossRef
358 | ##### TIME This will take a long time for large datasets (e.g. for Temple University's 2022 data [800+ DOIs], this took ~6 minutes)
359 | metadata_since_year <- map(dois_since_year$doi, function(z) {
360 | print(z)
361 | o <- cr_works(dois = z)
362 | return(o)
363 | })
364 |
365 | ##### Code improvement
366 | # Here we could create a similar function that queries DataCite for metadata on the ones that weren't found in CR
367 | # Also rather than DOIs SINCE a given year, it might be desired to retrieve data on DOIs from a discrete year,
368 | # or from a time period with specific start and end dates.
369 | ##### Code improvement
370 |
371 |
372 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
373 | #write_file_path = paste0("./data/metadata_",my_year,".json")
374 | #to_write<-toJSON(metadata_since_year, pretty=TRUE, na="null")
375 | #write(to_write,write_file_path)
376 |
377 | # read it back in, if necessary
378 | #metadata_since_year <- read_json(write_file_path, simplifyVector = TRUE)
379 | ##### WRITE/READ JSON
380 |
381 | # This will loop through each result, extract ("pluck") the object called "data"
382 | # bind it together into a dataframe (the "dfr" part of map_dfr)
383 | # clean the names up and filter to remove any duplicates
384 | metadata_since_year_df <- metadata_since_year %>%
385 |   map_dfr(., pluck, "data") %>%
386 | clean_names() %>%
387 | filter(!duplicated(doi))
388 |
389 | # We next want to prepare our orcid data frame to merge to the crossref data by selecting only the relevant columns.
390 | # Rows with no CrossRef data (e.g. DOIs issued by DataCite) are still present here
391 | # anything published in an earlier time frame will be removed
392 | orcid_merge <- dois_since_year %>%
393 | select(orcid_identifier, doi, given_name, family_name)
394 |
395 | # select relevant columns
396 | cr_merge <- metadata_since_year_df %>%
397 | select(any_of(c("doi",
398 | "title",
399 | "published_print",
400 | "published_online",
401 | "issued",
402 | "container_title",
403 | "issn",
404 | "volume",
405 | "issue",
406 | "page",
407 | "publisher",
408 | "language",
409 | "isbn",
410 | "url",
411 | "type",
412 | "subject",
413 | "reference_count",
414 | "is_referenced_by_count",
416 | "alternative_id",
417 | "author",
418 | "pdf_url")))
419 |
420 | # CrossRef metadata was retrieved for Works on the ORCID profile with publication year >= my_year
421 | # however the DOI issued date may be earlier than my_year, could be NA, or may have missing month or day info
422 | # if an issued date from CrossRef is NA, we will fill it in as my_year-01-01
423 | # if issued is a partial date, we fill in with January 1, or the 1st of the month
424 | # so that in Tableau they will render properly as dates
425 | jan1date<-paste0(my_year,"-01-01")
426 | cr_merge$issued<-cr_merge$issued %>% replace_na(jan1date)
427 | cr_merge <- cr_merge %>% add_column(issued2 = "", .after = "issued")
428 | cr_merge <- cr_merge %>%
429 | mutate(
430 | issued2 = if_else(
431 |       condition = nchar(str_trim(issued)) == 7,
432 | true = paste0(issued,"-01"),
433 | false = issued
434 | )
435 | ) %>%
436 | mutate(
437 | issued2 = if_else(
438 |       condition = nchar(str_trim(issued)) == 4,
439 | true = paste0(issued,"-01-01"),
440 | false = issued2
441 | )
442 | )
443 | cr_merge$issued<-cr_merge$issued2
444 | cr_merge <- cr_merge %>% select(-(issued2))
445 |
446 |
447 | # build an author ORCID ID reference table -----------------------------------------------------
448 | # building a dataframe of full author names with their ORCID IDs will help us fill in blanks later
449 |
450 | # start with the orcid_person_employment_join dataframe of employment data for home authors
451 | # create a fullname identifier for the home author that is stripped of punctuation and whitespace
452 | orcid_person_employment_join$fullname <- with(orcid_person_employment_join, paste(given_name,family_name))
453 | orcid_person_employment_join$fullname <- str_replace_all(orcid_person_employment_join$fullname, "[^[:alnum:]]", " ")
454 | orcid_person_employment_join$fullname<-str_replace_all(orcid_person_employment_join$fullname, fixed(" "), "")
455 |
456 | # select relevant columns
457 | master_names <- orcid_person_employment_join %>%
458 | select(any_of(c("fullname",
459 | "orcid_identifier_path",
460 | "department_name",
461 | "organization_name",
462 | "organization_address_city",
463 | "organization_address_region",
464 | "organization_address_country"
465 | )))
466 | master_names <- master_names[!duplicated(master_names$orcid_identifier_path),]
467 |
468 | # get the credit_name, an alternate version of their name and make a row for that
469 | credit_names <- orcid_person_employment_join %>%
470 | filter(!is.na(credit_name)) %>%
471 | select(any_of(c("credit_name",
472 | "orcid_identifier_path",
473 | "department_name",
474 | "organization_name",
475 | "organization_address_city",
476 | "organization_address_region",
477 | "organization_address_country"
478 | ))) %>%
479 | rename(fullname = credit_name)
480 |
481 | # strip the fullname identifier of punctuation and whitespace
482 | credit_names$fullname <- str_replace_all(credit_names$fullname, "[^[:alnum:]]", " ")
483 | credit_names$fullname<-str_replace_all(credit_names$fullname, fixed(" "), "")
484 |
485 | # remove duplicate rows
486 | credit_names <- credit_names[!duplicated(credit_names$orcid_identifier_path),]
487 |
488 | # concatenate these two data frames to start our author ORCID ID reference table
489 | names_df <- rbind(master_names,credit_names)
490 |
491 |
492 | # get co-author information -----------------------------------------------------
493 |
494 | # The authors for each DOI in the cr_merge dataframe are in a nested list.
495 | # In order to collect information about them, we must unnest the list.
496 | # Then we will build a list of home author / co-author pairs and try to fill in any unknown ORCID
497 | # and location info about the co-authors
498 |
499 | # unnest the author list for each DOI
500 | what_auths <- cr_merge %>% unnest(author)
501 |
502 | # left join this DOI authors list to our list of home authors by DOI
503 | # this gives us a df where there is an individual row for each home author and co-author on a DOI
504 | authlist_all <- what_auths %>%
505 | left_join(orcid_merge, by = "doi")
506 |
507 | # when multiple home authors have collaborated on a DOI there will be several sets of
508 | # rows for that DOI in the data frame - one set for each home author
509 | # we keep these because we're counting each home author and all their collaborations, including within institution
510 |
511 | # we do want to remove rows produced by the join where the home author (orcid_identifier) is
512 | # the same as the co-author (ORCID) - so where orcid_identifier = str_sub(ORCID , 18, 37)
513 | # AND where the home author / co-author names are exactly the same
514 | # this will miss slight variations in names when there is no ORCID ID on the Crossref record (e.g. Bradley Baker vs. Bradley J. Baker)
515 |
516 | # add some columns to authlist_all to help with this deduplicating
517 | authlist_all$orcid_coauth <- with(authlist_all,
518 | ifelse(is.na(ORCID),'',str_sub(ORCID , 18, 37))
519 | )
520 |
521 | # fullname identifier for the home author, stripped of punctuation and whitespace
522 | authlist_all$anchorfullname <- with(authlist_all, paste(given_name,family_name))
523 | authlist_all$anchorfullname <- str_replace_all(authlist_all$anchorfullname, "[^[:alnum:]]", " ")
524 | authlist_all$anchorfullname<-str_replace_all(authlist_all$anchorfullname, fixed(" "), "")
525 |
526 | # fullname identifier for the co-author, stripped of punctuation and whitespace
527 | authlist_all$coauthfullname <- with(authlist_all, paste(given,family))
528 | authlist_all$coauthfullname <- str_replace_all(authlist_all$coauthfullname, "[^[:alnum:]]", " ")
529 | authlist_all$coauthfullname<-str_replace_all(authlist_all$coauthfullname, fixed(" "), "")
530 |
531 | ## create a new df with the identical entries removed
532 | authlist_nodups <- subset(authlist_all, (orcid_identifier != orcid_coauth))
533 | authlist_nodups <- subset(authlist_nodups, (anchorfullname != coauthfullname))
534 |
535 | # next it would be good to fill in ORCID if there is a co-author name variation that
536 | # we are already aware of and logged in names_df, our author ORCID ID reference table
537 | # when there are author name variations that we are not aware of, and there is no ORCID ID
538 | # there is just no way to resolve them, so the occasional row where home author and co-author are the same will persist
539 |
540 | ##### Code improvement
541 | # there are many times when we could try to fill in info from the author ORCID ID reference table
542 | # in order to keep refining the data. so it would be good to take this code out and
543 | # put it in a function that we could just call here instead of re-running similar lines of code
544 | ##### Code improvement
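# A hedged sketch of that helper (hypothetical name fill_coauthor_orcids); it wraps
# the join / fill / re-deduplicate steps that follow, so later passes could call it
# instead of repeating similar lines:
#fill_coauthor_orcids <- function(authlist, ref_df) {
#  joined <- left_join(authlist, ref_df, by = c("coauthfullname" = "fullname"))
#  joined[joined$orcid_coauth == "", "orcid_coauth"] <-
#    joined[joined$orcid_coauth == "", "orcid_identifier_path"]
#  joined <- joined %>% mutate_at("orcid_coauth", ~replace_na(., ""))
#  subset(joined, orcid_identifier != orcid_coauth)
#}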
545 |
546 | #### TIME: These joins hang a bit if the lists are very large (e.g. for Temple University's 2022 data [>2700 names], all these joins took ~10 seconds)
547 | # left join to add ORCIDs from our reference table to the author list
548 | my_join <- left_join(authlist_nodups,names_df,by=c("coauthfullname" = "fullname"))
549 |
550 | # fill in the joined ORCID where orcid_coauth is blank
551 | my_join[ my_join$orcid_coauth == "", "orcid_coauth" ] <- my_join[ my_join$orcid_coauth == "", "orcid_identifier_path" ]
552 |
553 | # this reintroduces NA values into the data frame, so replace those with blanks again
554 | my_join <- my_join %>%
555 | mutate_at('orcid_coauth', ~replace_na(.,""))
556 |
557 | # do another pass to eliminate rows with the same anchor author and co-author ORCID from the ones we just filled in
558 | authlist_nodups <- subset(my_join, (orcid_identifier != orcid_coauth))
559 |
560 |
561 | # now that we tried to fill in co-author ORCID IDs we can also fill in
562 | # co-author current affiliations and location information that we have in the reference table names_df
563 |
564 | # but we have to use a version of the names_df where orcid is unique
565 | orcid_df <- names_df
566 |
567 | # remove duplicate orcid rows
568 | orcid_df <- orcid_df[!duplicated(orcid_df$orcid_identifier_path),]
569 |
570 | my_join <- left_join(authlist_nodups,orcid_df,by=c("orcid_coauth" = "orcid_identifier_path"))
571 |
572 | # fill in the joined location fields where any co-author locations are blank
573 | my_join <- my_join %>%
574 | mutate(department_name.x = coalesce(department_name.x,department_name.y),
575 | organization_name.x = coalesce(organization_name.x,organization_name.y),
576 | organization_address_city.x = coalesce(organization_address_city.x,organization_address_city.y),
577 | organization_address_region.x = coalesce(organization_address_region.x,organization_address_region.y),
578 | organization_address_country.x = coalesce(organization_address_country.x,organization_address_country.y)
579 | )
580 |
581 | # drop some columns we don't need
582 | authlist_nodups <- subset(my_join, select = -c(orcid_identifier_path,department_name.y,organization_name.y, organization_address_city.y, organization_address_region.y, organization_address_country.y))
583 |
584 | # now we have authlist_nodups, a dataframe where there is a row for every co-author on a DOI except for the home author duplicate (ideally),
585 | # and each row also includes the home author's name and ORCID ID, and as much info about the co-author as we have so far
586 |
587 |
588 | # build the output file -----------------------------------------------------
589 |
590 | # we eventually want to output a CSV with these columns:
591 | # fname1, lname1, orcid1, affiliation1, org1, city1, region1, country1, fname2, lname2, orcid2, affiliation2, org2, city2, region2, country2, DOI
592 |
593 | # create a dataframe with the columns we need
594 | co_authors <- authlist_nodups %>%
595 | select(any_of(c("doi",
596 | "issued",
597 | "given_name",
598 | "family_name",
599 | "orcid_identifier",
600 | "given",
601 | "family",
602 | "orcid_coauth",
603 | "affiliation.name",
604 | "organization_name.x",
605 | "organization_address_city.x",
606 | "organization_address_region.x",
607 | "organization_address_country.x"
608 | )))
609 |
610 | # rename some columns
611 | co_authors <- co_authors %>%
612 | rename(
613 | fname1 = given_name,
614 | lname1 = family_name,
615 | orcid1 = orcid_identifier,
616 | fname2 = given,
617 | lname2 = family,
618 | orcid2 = orcid_coauth,
619 | affiliation2 = affiliation.name,
620 | org2 = organization_name.x,
621 | city2 = organization_address_city.x,
622 | region2 = organization_address_region.x,
623 | country2 = organization_address_country.x
624 | )
625 |
626 | # add in columns of home author affiliation and location info
627 | # join the info in from our orcid_df reference table
628 | co_authors <- left_join(co_authors,orcid_df,by=c("orcid1" = "orcid_identifier_path"))
629 |
630 | # rename the joined affiliation and location fields for the home author
631 | co_authors <- co_authors %>%
632 | rename(
633 | affiliation1 = department_name,
634 | org1 = organization_name,
635 | city1 = organization_address_city,
636 | region1 = organization_address_region,
637 | country1 = organization_address_country
638 | )
639 |
640 | # move the columns around
641 | co_authors <- co_authors %>% relocate(affiliation1, org1, city1, region1, country1, .after = orcid1)
642 |
643 | # fill in with static values if there are blanks -- there really shouldn't be any but just in case
644 | co_authors$org1[co_authors$org1 == "" | co_authors$org1 == " " | is.na(co_authors$org1)]<- anchor_org
645 | co_authors$city1[co_authors$city1 == "" | co_authors$city1 == " " | is.na(co_authors$city1)]<- anchor_city
646 | co_authors$region1[co_authors$region1 == "" | co_authors$region1 == " " | is.na(co_authors$region1)]<- anchor_region
647 | co_authors$country1[co_authors$country1 == "" | co_authors$country1 == " " | is.na(co_authors$country1)]<- anchor_country
648 |
649 |
650 | # though we might have filled in a few pieces of co-author info for some of the co-authors from the same institution above,
651 | # we still need city, region, and country for many of the co-authors. We can try to retrieve this if we have the co-author's ORCID ID
652 | # we'll make a unique list of co-authors who have ORCID IDs and get their CURRENT affiliation
653 | # we chose to get their current affiliation because this is the same way we treat home authors
654 | # (they are a home author because of their current affiliation,
655 | # even though they may have published a DOI in the past when affiliated with a different organization)
656 | co_auth_ids <- co_authors$orcid2
657 | co_auth_ids_unduped <- unique(co_auth_ids[co_auth_ids != ""])
658 |
659 | # if a value in co_auth_ids_unduped gives an error when you try to generate my_co_auths_employment below
660 | # (like that it is locked and cannot be edited)
661 | # remove it from the list by filling in the problem ORCID ID (format XXXX-XXXX-XXXX-XXXX), uncommenting, and running this statement
662 | # then try to generate my_co_auths_employment again
663 | #co_auth_ids_unduped <- co_auth_ids_unduped[ co_auth_ids_unduped != "enter problem ORCID ID here in format XXXX-XXXX-XXXX-XXXX"]
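# if several IDs fail, a vectorized variant (problem_ids is a hypothetical vector)
# removes them all in one pass:
#problem_ids <- c("XXXX-XXXX-XXXX-XXXX", "YYYY-YYYY-YYYY-YYYY")
#co_auth_ids_unduped <- co_auth_ids_unduped[!co_auth_ids_unduped %in% problem_ids]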
664 |
665 | # get the co-authors employment data from their ORCID profile
666 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's 2022 data [>850 IDs], this took ~2 minutes)
667 | my_co_auths_employment <- rorcid::orcid_employments(co_auth_ids_unduped)
668 |
669 | ##### JSON
670 | # you can write the file to json if you want to work with it outside of R
671 | #to_write<-toJSON(my_co_auths_employment, na="null")
672 | #write(to_write,"./data/co_auths_employment.json")
673 |
674 | # read it back in, if necessary
675 | #my_co_auths_employment <- read_json("./data/co_auths_employment.json", simplifyVector = TRUE)
676 | ##### JSON
677 |
678 | # extract the employment data and mutate the dates
679 | my_co_auths_employment_data <- my_co_auths_employment %>%
680 | purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>%
681 | purrr::flatten_dfr() %>%
682 | janitor::clean_names() %>%
683 | dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000),
684 | employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000),
685 | employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000))
686 |
687 | # clean up column names
688 | names(my_co_auths_employment_data) <- names(my_co_auths_employment_data) %>%
689 | stringr::str_replace(., "employment_summary_", "") %>%
690 | stringr::str_replace(., "source_source_", "") %>%
691 | stringr::str_replace(., "organization_disambiguated_", "")
692 |
693 | # some rows have orcid_path = NA; for these, put the ORCID ID back using a substring of path
694 | my_co_auths_employment_data <- my_co_auths_employment_data %>%
695 | mutate(orcid_path = coalesce(orcid_path,substring(path,2,20)))
696 |
697 | # get the co-authors' current affiliations
698 | # this will miss co-authors who have no current employment line (with no end date) in their ORCID profile
699 | my_co_auths_employment_data_filtered_current <- my_co_auths_employment_data %>%
700 | dplyr::filter(is.na(end_date_year_value))
701 |
702 | # some co-authors may have multiple "current" affiliations
703 | # separate out those with no start date year value and those that do have start dates
704 | my_co_auths_current_emp_nodate <- subset(my_co_auths_employment_data_filtered_current, is.na(start_date_year_value))
705 | my_co_auths_current_emp_date <- subset(my_co_auths_employment_data_filtered_current, !is.na(start_date_year_value))
706 |
707 | # for those with a start date, choose the row with the most recent year
708 | latest_dates <- my_co_auths_current_emp_date %>%
709 | group_by(orcid_path) %>%
710 | slice(which.max(start_date_year_value)) %>%
711 | arrange(start_date_year_value)
712 |
713 | co_auths_latest_emp <- rbind(my_co_auths_current_emp_nodate,latest_dates)
714 |
715 | # there will STILL be duplicates because of people with a mix of undated and dated ORCID profile employment entries,
716 | # group again and use the latest entry date
717 | co_auths_very_latest_emp <- co_auths_latest_emp %>%
718 | group_by(orcid_path) %>%
719 | slice(which.max(created_date_value)) %>%
720 | arrange(created_date_value)
721 |
722 | # be double sure that we removed duplicate orcid rows
723 | co_auths_very_latest_emp <- co_auths_very_latest_emp[!duplicated(co_auths_very_latest_emp$orcid_path),]
724 |
725 | # for the co-authors that had ORCID profiles and for whom we now have a current employment data point, join them back to the co_authors dataframe
726 | co_authors_full_info <- left_join(co_authors,co_auths_very_latest_emp,by=c("orcid2" = "orcid_path"))
727 |
728 | # If org2, city2, region2, country2 were NA in the dataframe we are building for output, fill them in from the joined table fields
729 | co_authors_full_info <- co_authors_full_info %>%
730 | mutate(org2 = coalesce(org2,organization_name),
731 | city2 = coalesce(city2,organization_address_city),
732 | region2 = coalesce(region2,organization_address_region),
733 | country2 = coalesce(country2,organization_address_country)
734 | )
735 |
736 | # drop some columns we don't need
737 | co_authors_full_info <- co_authors_full_info %>% select(doi:country2)
738 |
739 | ##### Code improvement
740 | # from here you could do yet ANOTHER round of recording co-author fullnames and ORCID IDs to the reference dataframe,
741 | # then fill in blanks in the full_info df
742 | # when the code that does that is pulled out into its own function, that won't take a lot of space to do
743 | ##### Code improvement
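# A rough, untested sketch of that extra round of recording names (columns other
# than fullname and ORCID are left for bind_rows to fill with NA):
#extra_names <- co_authors_full_info %>%
#  filter(orcid2 != "") %>%
#  transmute(fullname = str_replace_all(paste(fname2, lname2), "[^[:alnum:]]", ""),
#            orcid_identifier_path = orcid2) %>%
#  distinct()
#names_df <- bind_rows(names_df, extra_names)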
744 |
745 | # get rid of NA values
746 | co_authors_full_info[is.na(co_authors_full_info)] <- ""
747 |
748 |
749 | # clean up US state names so they produce single locations on the Tableau map
750 | # set up a dataframe of state names and abbreviations
751 | states_df<- data.frame(state.abb, state.name, paste0(state.name,'US'))
752 | colnames(states_df) <- c('abb','name','id')
753 |
754 | # left join the correct state abbreviation for only US states with the full state name spelled out
755 | # starting with the home authors' region1
756 | co_authors_full_info$state1<-with(co_authors_full_info,paste0(region1,country1))
757 | co_authors_full_info <- left_join(co_authors_full_info,states_df,by=c("state1" = "id"))
758 |
759 | # overwrite the full state names with the abbreviations where they occur
760 | co_authors_full_info$region1 <- ifelse(is.na(co_authors_full_info$abb), co_authors_full_info$region1, co_authors_full_info$abb )
761 |
762 | # drop the joined columns
763 | co_authors_full_info <- co_authors_full_info %>% select(doi:country2)
764 |
765 | # do the same for the region2, the co_authors' US state names
766 | co_authors_full_info$state2<-with(co_authors_full_info,paste0(region2,country2))
767 | co_authors_full_info <- left_join(co_authors_full_info,states_df,by=c("state2" = "id"))
768 | co_authors_full_info$region2 <- ifelse(is.na(co_authors_full_info$abb), co_authors_full_info$region2, co_authors_full_info$abb )
769 | co_authors_full_info <- co_authors_full_info %>% select(doi:country2)
770 |
771 |
772 | # write it to a csv to be visualized
773 | write_csv(co_authors_full_info, "./data/orcid-data.csv")
774 |
775 | # Ta da, you should now have a data file to visualize in Tableau
776 |
777 | # Before uploading to Tableau, consider cleaning your data file, either manually or using a tool
778 | # like Open Refine (https://openrefine.org/). It will improve the visualization if wordings and spellings
779 | # are standardized, particularly in the organization (org1, org2) and city name (city1, city2) fields.
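# Optional: a quick in-R way to spot spelling variants worth standardizing, using
# only columns already present in the output file:
#co_authors_full_info %>% count(org2, sort = TRUE) %>% view()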
780 |
--------------------------------------------------------------------------------
/Rorcid_Crossref_Authors.R:
--------------------------------------------------------------------------------
1 | # Script by Olivia Given Castello, based on: https://ciakovx.github.io/rorcid.html
2 | # and 04-rcrossref_metadata.R at https://github.com/ciakovx/fsci2022/tree/main/code
3 | # Retrieves ORCID profile and Crossref metadata for authors from a given institution,
4 | # since a given year, paired with that of the co-authors with whom they collaborated.
5 |
6 | # Install and load packages -----------------------------------------------
7 |
8 | # you will need to install these packages first, using the following
9 | # if you've already installed them, skip this step
10 | #install.packages('dplyr')
11 | #install.packages('tibble')
12 | #install.packages('tidyr')
13 | #install.packages('purrr')
14 | #install.packages('readr')
15 | #install.packages('stringr')
16 | #install.packages('jsonlite')
17 | #install.packages('lubridate')
18 | #install.packages('ggplot2')
19 | #install.packages('httr')
20 | #install.packages('forcats')
21 | #install.packages('rorcid')
22 | #install.packages('usethis')
23 | #install.packages('anytime')
24 | #install.packages('janitor')
25 | #install.packages('glue')
26 | #install.packages('remotes')
27 | #install.packages('rcrossref')
28 | #install.packages('roadoi')
29 | #install.packages('inops')
30 | #install.packages("rdatacite")
31 | #install.packages("data.table")
32 | #install.packages('geonames')
33 |
34 | # load the packages
35 | library(dplyr)
36 | library(tibble)
37 | library(tidyr)
38 | library(purrr)
39 | library(readr)
40 | library(stringr)
41 | library(jsonlite)
42 | library(lubridate)
43 | library(ggplot2)
44 | library(httr)
45 | library(forcats)
46 | library(usethis)
47 | library(anytime)
48 | library(janitor)
49 | library(glue)
50 | library(rorcid)
51 | library(rcrossref)
52 | library(roadoi)
53 | library(inops)
54 |
55 | # remove all objects from the environment to start with a clean slate
56 | rm(list = ls())
57 |
58 | # Set up orcid / crossref in R environment ------------------------------------------------------------
59 |
60 | # if you've already done these steps and set up your bearer token in RStudio
61 | # you can skip to the next section: "set some variables and build the query"
62 |
63 | # 1. If you haven’t done so already, create an ORCID account at https://orcid.org/signin.
64 | # 2. In the upper right corner, click your name, then in the drop-down menu, click Developer Tools. Note: In order to access Developer Tools, you must verify your email address.
65 | # 3. If you have not already verified your email address, you will be prompted to do so at this point.
66 | # 4. Click the “Register for the free ORCID public API” button
67 | # 5. Review and agree to the terms of service when prompted.
68 | # 6. Add your name in the Name field, https://www.orcid.org in the Your Website URL field, “Getting public API key” in Description field, and https://www.orcid.org in the redirect URI field. Click the diskette button to save.
69 | # 7. A gray box will appear including your Client ID and Client Secret. In the below code chunk, copy and paste the client ID and the client secret respectively.
70 | # 8. Make sure to leave the quotation marks (e.g. orcid_client_id <- "APP-FDFJKDSLF320SDFF" and orcid_client_secret <- "c8e987sa-0b9c-82ed-91as-1112b24234e").
71 |
72 | # copy/paste your client ID from https://orcid.org/developer-tools
73 | orcid_client_id <- "PASTE MY CLIENT ID HERE"
74 |
75 | # copy/paste your client secret from https://orcid.org/developer-tools
76 | orcid_client_secret <- "PASTE MY CLIENT SECRET HERE"
77 |
78 | # This gets a /read-public scope access token
79 | orcid_request <- POST(url = "https://orcid.org/oauth/token",
80 | config = add_headers(`Accept` = "application/json",
81 | `Content-Type` = "application/x-www-form-urlencoded"),
82 | body = list(grant_type = "client_credentials",
83 | scope = "/read-public",
84 | client_id = orcid_client_id,
85 | client_secret = orcid_client_secret),
86 | encode = "form")
87 |
88 | # parse the API request with content
89 | orcid_response <- content(orcid_request)
90 |
91 | # run the following code
92 | print(orcid_response$access_token)
93 |
94 | #You will see a string of text print out in your R console.
95 | # Copy that string to the clipboard
96 | # so we can save the token to our R environment
97 | # Run this code:
98 | usethis::edit_r_environ()
99 |
100 | # A new window will open in RStudio.
101 | # In this separate R environment page, type the following (except the pound sign):
102 | # ORCID_TOKEN="my-token"
103 | # replace 'my-token' with the access_token you just copied.
104 | # Then press enter to create a new line.
105 | # while we are here, we'll add in our rcrossref credentials
106 | # type crossref_email="name@example.com", using your own email address.
107 | # press enter to create a new line, and leave it blank.
108 | # Press Ctrl + S (Mac: Cmd + S) to save this information to your R environment and close the window.
109 | # You won't see anything happen here because it is just saving the page.
110 |
111 | # Click Session > Restart R. Your token should now be saved to your R environment.
112 |
113 | # You will now need to rerun all the packages ("library()" commands) above, then return to this line.
114 |
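# Optional sanity check (base R, not in the original walkthrough): confirm both
# values are visible to the new session
#Sys.getenv("ORCID_TOKEN")
#Sys.getenv("crossref_email")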
115 | #You can confirm this worked by calling orcid_auth(), and it will print the token
116 | rorcid::orcid_auth()
117 |
118 |
119 | # set some variables and build the query --------------------------------------------------------
120 |
121 | # set the working directory where this script is
122 | # a folder called "data" is also expected to be in this directory
123 | setwd("PASTE YOUR WORKING DIRECTORY HERE")
124 |
125 | # set the time period of interest: this script will compile collaboration data since Jan 1 of the year you set.
126 | # replace the YYYY with a 4 digit year.
127 | # the more years of data desired, the longer some portions of this script will take to run
128 | my_year = YYYY;
129 |
130 | # set the home institution identifiers
131 | ringgold_id <- "enter your institution's ringgold"
132 | grid_id <- "enter your institution's grid ID"
133 | ror_id <- "enter your institution's ROR ID"
134 | # leave the @ off the email domain, if you want to catch subdomains (e.g. @tuj.temple.edu)
135 | email_domain <- "enter your institution's email domain"
136 | organization_name <- "enter your organization's name"
137 |
138 | # Set a short name keyword here that you will use to filter for ORCID records from the home institution later
139 | # Keep it short, like the state name (e.g. Oklahoma).
140 | # If you are adding more than one keyword, separate them by a pipe (|)
141 | my_org_keyword = "enter your institution's keyword"
142 |
143 | # set the institution's main location information (for use when precise location info is blank)
144 | anchor_org<-"enter your institution's name"
145 | anchor_city<-"enter your institution's city"
146 | anchor_region<-"enter your institution's state"
147 | anchor_country<-"enter your institution's country"
148 |
149 | # create the query
150 | # decide between these two choices:
151 | # 1. to construct a simple query with the ringgold, grid, ROR ids, email domain, and organization name set above
152 | # run this:
153 | my_query <- glue('ringgold-org-id:', ringgold_id,
154 | ' OR grid-org-id:', grid_id,
155 | ' OR ror-org-id:"', ror_id,
156 | '" OR email:*', email_domain,
157 | ' OR affiliation-org-name:"', organization_name, '"')
158 |
159 | # OR 2. to customize a more complicated query with multiple ringgold, grid, ROR ids, email domains, or organization names
160 | # specify which data you want to pull following this example.
161 | # keep in mind that ROR ID and organization name are strings and need double quotes inside the
162 | # single quotes used here for concatenation
163 | # replace these example lines from Temple University carefully with ones you are interested in
164 | my_query <- glue('ringgold-org-id:', '6558',
165 | ' OR ringgold-org-id:', '43297',
166 | ' OR ringgold-org-id:', '83908',
167 | ' OR grid-org-id:', 'grid.264727.2',
168 | ' OR grid-org-id:', 'grid.469246.b',
169 | ' OR grid-org-id:', 'grid.460938.0',
170 | ' OR ror-org-id:"', 'https://ror.org/00kx1jb78',
171 | '" OR ror-org-id:"', 'https://ror.org/04zzmzt85',
172 | '" OR ror-org-id:"', 'https://ror.org/03savr706',
173 | '" OR email:*', '@temple.edu',
174 | ' OR email:*', '@tuj.temple.edu',
175 | ' OR affiliation-org-name:"', 'Temple University',
176 | '" OR affiliation-org-name:"', 'Temple Ambler',
177 | '" OR affiliation-org-name:"', 'Temple Japan', '"')
178 |
179 | # get the counts
180 | ##### TIME: this may hang a bit if the institution has many ORCID ID holders (e.g. for Temple University's data [~3500 IDs], this took a few seconds)
181 | orcid_count <- base::attr(rorcid::orcid(query = my_query),
182 | "found")
183 |
184 | # create the page vector
185 | my_pages <- seq(from = 0, to = orcid_count, by = 200)
186 |
187 | # get the ORCID iDs
188 | my_orcids <- purrr::map(
189 | my_pages,
190 | function(page) {
191 | print(page)
192 | my_orcids <- rorcid::orcid(query = my_query,
193 | rows = 200,
194 | start = page)
195 | return(my_orcids)
196 | })
197 |
198 | # put the ORCID iDs into a single tibble
199 | my_orcids_data <- my_orcids %>%
200 | map_dfr(., as_tibble) %>%
201 | janitor::clean_names()
202 |
203 | ##### WRITE/READ CSV uncomment to save this data and read it back in later
204 | #write_csv(my_orcids_data, "./data/my_orcids_data.csv")
205 |
206 | # read it back in, if necessary
207 | #my_orcids_data <- read_csv("./data/my_orcids_data.csv", col_types = cols(.default = "c"))
208 | ##### WRITE/READ CSV
209 |
210 |
211 | # get employment data -----------------------------------------------------
212 |
213 | # get the employments from the orcid_identifier_path column
214 | ##### TIME: be patient, this may take a long time (e.g. for Temple University's data [~3500 IDs], this took ~8 minutes)
215 | my_employment <- rorcid::orcid_employments(my_orcids_data$orcid_identifier_path)
216 |
217 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
218 | #to_write<-toJSON(my_employment, na="null")
219 | #write(to_write,"./data/employment.json")
220 |
221 | # read it back in, if necessary
222 | #my_employment <- read_json("./data/employment.json", simplifyVector = TRUE)
223 | ##### WRITE/READ JSON
224 |
225 | # extract the employment data and mutate the dates
226 | my_employment_data <- my_employment %>%
227 | purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>%
228 | purrr::flatten_dfr() %>%
229 | janitor::clean_names() %>%
230 | dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000),
231 | employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000),
232 | employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000))
233 |
234 | # clean up the column names
235 | names(my_employment_data) <- names(my_employment_data) %>%
236 | stringr::str_replace(., "employment_summary_", "") %>%
237 | stringr::str_replace(., "source_source_", "") %>%
238 | stringr::str_replace(., "organization_disambiguated_", "")
239 |
240 | # view the unique institutions in the organization names columns
241 | # keep in mind this will include all institutions a person has in their employments section
242 | my_organizations <- my_employment_data %>%
243 | group_by(organization_name) %>%
244 | count() %>%
245 | arrange(desc(n))
246 |
247 | # filter it with a keyword or set of keywords
248 | # this is the short keyword, or piped set of keywords, set at the top of the script
249 | my_organizations_filtered <- my_organizations %>%
250 | filter(str_detect(organization_name, my_org_keyword))
251 |
252 | # view the variation in organization names by looking at my_organizations_filtered (will open a new tab)
253 | view(my_organizations_filtered)
254 |
255 | # filter the dataset to include only the institutions you want
256 | # decide between these two choices:
257 | # 1. to accept any organization listed in my_organizations_filtered, run this:
258 | my_employment_data_filtered <- my_employment_data %>%
259 |   dplyr::filter(organization_name %in% my_organizations_filtered$organization_name)
260 |
261 | # OR 2. to specify which organization name variations to include, copy and paste them here
262 | # following this example. As you can see there may be messiness in hand-entered organization names.
263 | # replace these example names with the ones you are interested in from your my_organizations_filtered list
264 | my_employment_data_filtered <- my_employment_data %>%
265 | dplyr::filter(organization_name == "Temple University"
266 | | organization_name == "Temple University "
267 | | organization_name == "Temple University Fox School of Business and Management"
268 | | organization_name == "Temple University, Japan"
269 | | organization_name == "Temple University Japan"
270 | | organization_name == "Temple University - Ambler Campus")
271 |
272 | # finally, filter to include only people who have NA as the end date
273 | my_employment_data_filtered_current <- my_employment_data_filtered %>%
274 | dplyr::filter(is.na(end_date_year_value))
275 |
276 |
277 | # Note that this will give you employment records only.
278 | # In other words, each row represents a single employment record for an individual.
279 | # the name_value variable refers specifically to the name of the person or system
280 | # that wrote the record, NOT the name of the individual.
281 |
282 | # To get that, you must first get all the unique ORCID iDs from the dataset:
283 |
284 | # There is no distinct value identifying the orcid ID of the person.
285 | # The orcid_path value corresponds to the path of the person who added the employment record (which is usually, but not always, the same)
286 | # Therefore you have to strip out the ORCID iD from the 'path' variable first, put it in its own column, and use that
287 | # We do this using str_sub from the stringr package
288 | # While we are at it, we can select and reorder the columns we want to keep
289 | current_employment_all <- my_employment_data_filtered_current %>%
290 | mutate(orcid_identifier = str_sub(path, 2, 20)) %>%
291 | select(any_of(c("orcid_identifier",
292 | "organization_name",
293 | "organization_address_city",
294 | "organization_address_region",
295 | "organization_address_country",
296 | "organization_identifier",
297 | "organization_disambiguated_organization_identifier",
298 | "organization_disambiguation_source",
299 | "department_name",
300 | "role_title",
301 | "url_value",
302 | "display_index",
303 | "visibility",
304 | "created_date_value",
305 | "start_date_year_value",
306 | "start_date_month_value",
307 | "start_date_day_value",
308 | "end_date_year_value",
309 | "end_date_month_value",
310 | "end_date_day_value")))
311 |
312 | # next, create a new vector unique_orcids that includes only unique ORCID iDs from our filtered dataset.
313 | unique_orcids <- unique(current_employment_all$orcid_identifier) %>%
314 | na.omit(.) %>%
315 | as.character()
316 |
317 | # then run the following expression to get all biographical information for those iDs.
318 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~1.5 minutes)
319 | my_orcid_person <- rorcid::orcid_person(unique_orcids)
320 |
321 | # then we construct a data frame from the response.
322 | # See more at https://ciakovx.github.io/rorcid.html#Getting_the_data_into_a_data_frame for this.
323 | my_orcid_person_data <- my_orcid_person %>% {
324 | dplyr::tibble(
325 | given_name = purrr::map_chr(., purrr::pluck, "name", "given-names", "value", .default=NA_character_),
326 |     created_date = purrr::map_chr(., purrr::pluck, "name", "created-date", "value", .default=NA_character_),
327 |     last_modified_date = purrr::map_chr(., purrr::pluck, "name", "last-modified-date", "value", .default=NA_character_),
328 | family_name = purrr::map_chr(., purrr::pluck, "name", "family-name", "value", .default=NA_character_),
329 | credit_name = purrr::map_chr(., purrr::pluck, "name", "credit-name", "value", .default=NA_character_),
330 | other_names = purrr::map(., purrr::pluck, "other-names", "other-name", "content", .default=NA_character_),
331 | orcid_identifier_path = purrr::map_chr(., purrr::pluck, "name", "path", .default = NA_character_),
332 | biography = purrr::map_chr(., purrr::pluck, "biography", "content", .default=NA_character_),
333 | researcher_urls = purrr::map(., purrr::pluck, "researcher-urls", "researcher-url", .default=NA_character_),
334 | emails = purrr::map(., purrr::pluck, "emails", "email", "email", .default=NA_character_),
335 | keywords = purrr::map(., purrr::pluck, "keywords", "keyword", "content", .default=NA_character_),
336 | external_ids = purrr::map(., purrr::pluck, "external-identifiers", "external-identifier", .default=NA_character_))
337 | } %>%
338 | dplyr::mutate(created_date = anytime::anydate(as.double(created_date)/1000),
339 | last_modified_date = anytime::anydate(as.double(last_modified_date)/1000))
340 |
341 | # Join it back with the employment records so that the employment data now includes organization city, region, country
342 | orcid_person_employment_join <- my_orcid_person_data %>%
343 | left_join(current_employment_all, by = c("orcid_identifier_path" = "orcid_identifier"))
344 |
345 | ##### WRITE/READ CSV uncomment to save this data and read it back in later
346 | #write_csv(orcid_person_employment_join, "./data/orcid_employment_file.csv")
347 |
348 | # read it back in, if necessary
349 | #orcid_person_employment_join <- read_csv("./data/orcid_employment_file.csv", col_types = cols(.default = "c"))
350 | ##### WRITE/READ CSV
351 |
352 |
353 | # get works data -----------------------------------------------------
354 |
355 | # create a vector of unique, unduplicated ORCID IDs from that file
356 | my_orcids <- orcid_person_employment_join %>%
357 | filter(!duplicated(orcid_identifier_path)) %>%
358 | pull(orcid_identifier_path) %>%
359 | na.omit() %>%
360 | as.character()
361 |
362 | # Call the orcid_works function to collect all works associated with each ID
363 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~2.5 minutes)
364 | my_works <- rorcid::orcid_works(my_orcids)
365 |
366 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
367 | #to_write<-toJSON(my_works, na="null")
368 | #write(to_write,"./data/my_works.json")
369 |
370 | # read it back in, if necessary
371 | #my_works <- read_json("./data/my_works.json", simplifyVector = TRUE)
372 | ##### WRITE/READ JSON
373 |
374 | # turn the JSON file into a single data frame by looping through the file,
375 | # extracting ("pluck") the "works" object, and binding the rows together (this is the "_dfr" part of map_dfr),
376 | # then clean column names
377 | # and convert the dates from Unix time to yyyy-mm-dd
378 | my_works_data <- my_works %>%
379 | purrr::map_dfr(pluck, "works") %>%
380 | janitor::clean_names() %>%
381 | dplyr::mutate(created_date_value = anytime::anydate(created_date_value/1000),
382 | last_modified_date_value = anytime::anydate(last_modified_date_value/1000))
383 |
384 | # we only want to keep works that have an external identifier
385 | # (specifically, a DOI), so we first filter to keep only objects that have an external_id value
386 | # then unnest those: in other words expand to include a row for every work + external id value
387 | # (for example, one work might be linked to a DOI, a PubMed ID, an ISSN, etc.)
388 | my_works_externalIDs <- my_works_data %>%
389 | dplyr::filter(!purrr::map_lgl(external_ids_external_id, purrr::is_empty)) %>%
390 | tidyr::unnest(external_ids_external_id) %>%
391 | clean_names()
392 |
393 | # From those unnested external IDs, we want to keep only those with a DOI, as that is the
394 | # value we'll use to look up the items in Crossref.
395 | # We then select a few relevant columns, and finally create a new column DOI that takes the external_id_value column
396 | # and coerces it to lower case, and the orcid_identifier column which strips out the ORCID ID
397 | # from the path variable.
398 | dois <- my_works_externalIDs %>%
399 | filter(external_id_type == "doi") %>%
400 | select(type, path, title_title_value, external_id_type, external_id_value, external_id_relationship,
401 | url_value, publication_date_year_value, publication_date_month_value, publication_date_day_value,
402 | journal_title_value) %>%
403 | mutate(doi = tolower(external_id_value),
404 | orcid_identifier = str_sub(path, 2, 20))
405 |
406 | # there are some duplicated values here: we can't just look at duplicate DOIs because some of these
407 | # works were co-authored, and we want to keep that data (i.e. unique orcid + doi combinations)
408 | # This function will let you look at observations where both the orcid ID and the DOI are duplicated in
409 | # case you want to review them more closely.
410 | # In our case below, we just keep the first appearance of a unique orcid + doi combination and discard
411 | # all subsequent ones.
412 | dupes <- dois %>%
413 | get_dupes(orcid_identifier, doi)
414 |
415 | # Here we are preparing the orcid dataset for merging to publications.
416 | # We keep only the ORCID ID, first name, and last name, remove duplicates, and rename orcid_identifier
417 | orcid_empl_merge <- orcid_person_employment_join %>%
418 | select(orcid_identifier_path, given_name, family_name) %>%
419 | filter(!duplicated(orcid_identifier_path)) %>%
420 | rename(orcid_identifier = orcid_identifier_path)
421 |
422 | # Finally, we remove the duplicates by creating a new variable that is a combination of
423 | # the orcid ID and the DOI, and keeping only the first instance. We then join that to our
424 | # cleaned orcid ID file and write to csv
425 | dois_unduped <- dois %>%
426 | mutate(orcid_doi = paste0(orcid_identifier, doi)) %>%
427 | filter(!duplicated(orcid_doi)) %>%
428 | left_join(orcid_empl_merge, by = "orcid_identifier")
429 |
430 | ##### WRITE/READ CSV uncomment to save this data and read it back in later
431 | #write_csv(dois_unduped, "./data/orcid_dois.csv")
432 |
433 | # read it back in, if necessary
434 | #dois_unduped <- read_csv("./data/orcid_dois.csv")
435 | ##### WRITE/READ CSV
436 |
437 |
438 | # get CrossRef data -----------------------------------------------------
439 |
440 | # We start by subsetting our unduped DOIs to include only those published since the year we want
441 | # this is the year of publication according to the ORCID profile works data
442 | dois_since_year <- dois_unduped %>%
443 | filter(publication_date_year_value >= my_year)
444 |
445 | # This will loop through the column of dois and perform a function that
446 | # prints the doi (this allows you to ensure it's progressing)
447 | # there will be warning messages for any DOIs not found at CrossRef
448 | ##### TIME: This will take a long time for large datasets (e.g. for Temple University's 2022 data [800+ DOIs], this took ~6 minutes)
449 | metadata_since_year <- map(dois_since_year$doi, function(z) {
450 | print(z)
451 | o <- cr_works(dois = z)
452 | return(o)
453 | })
454 |
455 | ##### Code improvement
456 | # Here we could create a similar function that queries DataCite for metadata on the DOIs that weren't found in Crossref.
457 | # Also, rather than DOIs SINCE a given year, you may want to retrieve data on DOIs from a single year,
458 | # or from a time period with specific start and end dates.
459 | ##### Code improvement
460 |
461 |
462 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
463 | #write_file_path = paste0("./data/metadata_",my_year,".json")
464 | #to_write<-toJSON(metadata_since_year, pretty=TRUE, na="null")
465 | #write(to_write,write_file_path)
466 |
467 | # read it back in, if necessary
468 | #metadata_since_year <- read_json(write_file_path, simplifyVector = TRUE)
469 | ##### WRITE/READ JSON
470 |
471 | # This will loop through each result, extract ("pluck") the object called "data"
472 | # bind it together into a dataframe (the "dfr" part of map_dfr)
473 | # clean the names up and filter to remove any duplicates
474 | metadata_since_year_df <- metadata_since_year %>%
475 |   map_dfr(., pluck, "data") %>%
476 | clean_names() %>%
477 | filter(!duplicated(doi))
478 |
479 | # We next want to prepare our orcid data frame to merge to the crossref data by selecting only the relevant columns.
480 | # Rows with no CrossRef data (e.g. DOIs issued by DataCite) are still present here
481 | # anything published in an earlier time frame will be removed
482 | orcid_merge <- dois_since_year %>%
483 | select(orcid_identifier, doi, given_name, family_name)
484 |
485 | # select relevant columns
486 | cr_merge <- metadata_since_year_df %>%
487 | select(any_of(c("doi",
488 | "title",
489 | "published_print",
490 | "published_online",
491 | "issued",
492 | "container_title",
493 | "issn",
494 | "volume",
495 | "issue",
496 | "page",
497 | "publisher",
498 | "language",
499 | "isbn",
500 | "url",
501 | "type",
502 | "subject",
503 | "reference_count",
504 | "is_referenced_by_count",
506 | "alternative_id",
507 | "author",
508 | "pdf_url")))
509 |
510 | # CrossRef metadata was retrieved for Works on the ORCID profile with publication year >= my_year
511 | # however the DOI issued date may be earlier than my_year, could be NA, or may have missing month or day info
512 | # if an issued date from CrossRef is NA, we will fill it in as my_year-01-01
513 | # if issued is a partial date, we fill in with January 1, or the 1st of the month
514 | # so that in Tableau they will render properly as dates
515 | jan1date<-paste0(my_year,"-01-01")
516 | cr_merge$issued<-cr_merge$issued %>% replace_na(jan1date)
517 | cr_merge <- cr_merge %>% add_column(issued2 = "", .after = "issued")
518 | cr_merge <- cr_merge %>%
519 | mutate(
520 | issued2 = if_else(
521 |       condition = nchar(str_trim(issued)) == 7,
522 | true = paste0(issued,"-01"),
523 | false = issued
524 | )
525 | ) %>%
526 | mutate(
527 | issued2 = if_else(
528 |       condition = nchar(str_trim(issued)) == 4,
529 | true = paste0(issued,"-01-01"),
530 | false = issued2
531 | )
532 | )
533 | cr_merge$issued<-cr_merge$issued2
534 | cr_merge <- cr_merge %>% select(-(issued2))
535 |
536 |
537 | # build an author ORCID ID reference table -----------------------------------------------------
538 | # building a dataframe of full author names with their ORCID IDs will help us fill in blanks later
539 |
540 | # start with the orcid_person_employment_join dataframe of employment data for home authors
541 | # create a fullname identifier for the home author that is stripped of punctuation and whitespace
542 | orcid_person_employment_join$fullname <- with(orcid_person_employment_join, paste(given_name,family_name))
543 | orcid_person_employment_join$fullname <- str_replace_all(orcid_person_employment_join$fullname, "[^[:alnum:]]", " ")
544 | orcid_person_employment_join$fullname<-str_replace_all(orcid_person_employment_join$fullname, fixed(" "), "")
545 |
546 | # select relevant columns
547 | master_names <- orcid_person_employment_join %>%
548 | select(any_of(c("fullname",
549 | "orcid_identifier_path",
550 | "department_name",
551 | "organization_name",
552 | "organization_address_city",
553 | "organization_address_region",
554 | "organization_address_country"
555 | )))
556 | master_names <- master_names[!duplicated(master_names$orcid_identifier_path),]
557 |
558 | # get the credit_name, an alternate version of their name and make a row for that
559 | credit_names <- orcid_person_employment_join %>%
560 | filter(!is.na(credit_name)) %>%
561 | select(any_of(c("credit_name",
562 | "orcid_identifier_path",
563 | "department_name",
564 | "organization_name",
565 | "organization_address_city",
566 | "organization_address_region",
567 | "organization_address_country"
568 | ))) %>%
569 | rename(fullname = credit_name)
570 |
571 | # strip the fullname identifier of punctuation and whitespace
572 | credit_names$fullname <- str_replace_all(credit_names$fullname, "[^[:alnum:]]", " ")
573 | credit_names$fullname<-str_replace_all(credit_names$fullname, fixed(" "), "")
574 |
575 | # remove duplicate rows
576 | credit_names <- credit_names[!duplicated(credit_names$orcid_identifier_path),]
577 |
578 | # concatenate these two data frames to start our author ORCID ID reference table
579 | names_df <- rbind(master_names,credit_names)
580 |
581 |
582 | # get co-author information -----------------------------------------------------
583 |
584 | # The authors for each DOI in the cr_merge dataframe are in a nested list.
585 | # In order to collect information about them, we must unnest the list.
586 | # Then we will build a list of home author / co-author pairs and try to fill in any unknown ORCID
587 | # and location info about the co-authors
588 |
589 | # unnest the author list for each DOI
590 | what_auths <- cr_merge %>% unnest(author)
591 |
592 | # left join this DOI authors list to our list of home authors by DOI
593 | # this gives us a df where there is an individual row for each home author and co-author on a DOI
594 | authlist_all <- what_auths %>%
595 | left_join(orcid_merge, by = "doi")
596 |
597 | # when multiple home authors have collaborated on a DOI there will be several sets of
598 | # rows for that DOI in the data frame - one set for each home author
599 | # we keep these because we're counting each home author and all their collaborations, including within institution
600 |
601 | # we do want to remove rows produced by the join where the home author (orcid_identifier) is
602 | # the same as the co-author (ORCID) - so where orcid_identifier = str_sub(ORCID , 18, 37)
603 | # AND where the home author / co-author names are exactly the same
604 | # this will miss slight variations in names when there is no ORCID ID on the Crossref record (e.g. Bradley Baker vs. Bradley J. Baker)
605 |
606 | # add some columns to authlist_all to help with this deduplicating
607 | authlist_all$orcid_coauth <- with(authlist_all,
608 | ifelse(is.na(ORCID),'',str_sub(ORCID , 18, 37))
609 | )
610 |
611 | # fullname identifier for the home author, stripped of punctuation and whitespace
612 | authlist_all$anchorfullname <- with(authlist_all, paste(given_name,family_name))
613 | authlist_all$anchorfullname <- str_replace_all(authlist_all$anchorfullname, "[^[:alnum:]]", " ")
614 | authlist_all$anchorfullname<-str_replace_all(authlist_all$anchorfullname, fixed(" "), "")
615 |
616 | # fullname identifier for the co-author, stripped of punctuation and whitespace
617 | authlist_all$coauthfullname <- with(authlist_all, paste(given,family))
618 | authlist_all$coauthfullname <- str_replace_all(authlist_all$coauthfullname, "[^[:alnum:]]", " ")
619 | authlist_all$coauthfullname<-str_replace_all(authlist_all$coauthfullname, fixed(" "), "")
620 |
621 | ## create a new df with the identical entries removed
622 | authlist_nodups <- subset(authlist_all, (orcid_identifier != orcid_coauth))
623 | authlist_nodups <- subset(authlist_nodups, (anchorfullname != coauthfullname))
624 |
625 | # next it would be good to fill in ORCID if there is a co-author name variation that
626 | # we are already aware of and logged in names_df, our author ORCID ID reference table
627 | # when there are author name variations that we are not aware of, and there is no ORCID ID
628 | # there is just no way to resolve them, so the occasional row where home author and co-author are the same will persist
629 |
630 | ##### Code improvement
631 | # there are many times when we could try to fill in info from the author ORCID ID reference table
632 | # in order to keep refining the data. so it would be good to take this code out and
633 | # put it in a function that we could just call here instead of re-running similar lines of code
634 | ##### Code improvement
635 |
636 | #### TIME: These joins hang a bit if the lists are very large (e.g. for Temple University's 2022 data [>2700 names], all these joins took ~10 seconds)
637 | # left join to add ORCIDs from our reference table to the author list
638 | my_join <- left_join(authlist_nodups,names_df,by=c("coauthfullname" = "fullname"))
639 |
640 | # fill in the joined ORCID where orcid_coauth is blank
641 | my_join[ my_join$orcid_coauth == "", "orcid_coauth" ] <- my_join[ my_join$orcid_coauth == "", "orcid_identifier_path" ]
642 |
643 | # this reintroduces NA values into the data frame, so replace those with blanks again
644 | my_join <- my_join %>%
645 | mutate_at('orcid_coauth', ~replace_na(.,""))
646 |
647 | # do another pass to eliminate rows with the same anchor author and co-author ORCID from the ones we just filled in
648 | authlist_nodups <- subset(my_join, (orcid_identifier != orcid_coauth))
649 |
650 |
651 | # now that we tried to fill in co-author ORCID IDs we can also fill in
652 | # co-author current affiliations and location information that we have in the reference table names_df
653 |
654 | # but we have to use a version of the names_df where orcid is unique
655 | orcid_df <- names_df
656 |
657 | # remove duplicate orcid rows
658 | orcid_df <- orcid_df[!duplicated(orcid_df$orcid_identifier_path),]
659 |
660 | my_join <- left_join(authlist_nodups,orcid_df,by=c("orcid_coauth" = "orcid_identifier_path"))
661 |
662 | # fill in the joined location fields where any co-author locations are blank
663 | my_join <- my_join %>%
664 | mutate(department_name.x = coalesce(department_name.x,department_name.y),
665 | organization_name.x = coalesce(organization_name.x,organization_name.y),
666 | organization_address_city.x = coalesce(organization_address_city.x,organization_address_city.y),
667 | organization_address_region.x = coalesce(organization_address_region.x,organization_address_region.y),
668 | organization_address_country.x = coalesce(organization_address_country.x,organization_address_country.y)
669 | )
670 |
671 | # drop some columns we don't need
672 | authlist_nodups <- subset(my_join, select = -c(orcid_identifier_path,department_name.y,organization_name.y, organization_address_city.y, organization_address_region.y, organization_address_country.y))
673 |
674 | # now we have authlist_nodups, a dataframe where there is a row for every co-author on a DOI except for the home author duplicate (ideally),
675 | # and each row also includes the home author's name and ORCID ID, and as much info about the co-author as we have so far
676 |
677 |
678 | # build the output file -----------------------------------------------------
679 |
680 | # we eventually want to output a CSV with these columns:
681 | # fname1, lname1, orcid1, affiliation1, org1, city1, region1, country1, fname2, lname2, orcid2, affiliation2, org2, city2, region2, country2, DOI
682 |
683 | # create a dataframe with the columns we need
684 | co_authors <- authlist_nodups %>%
685 | select(any_of(c("doi",
686 | "issued",
687 | "given_name",
688 | "family_name",
689 | "orcid_identifier",
690 | "given",
691 | "family",
692 | "orcid_coauth",
693 | "affiliation.name",
694 | "organization_name.x",
695 | "organization_address_city.x",
696 | "organization_address_region.x",
697 | "organization_address_country.x"
698 | )))
699 |
700 | # rename some columns
701 | co_authors <- co_authors %>%
702 | rename(
703 | fname1 = given_name,
704 | lname1 = family_name,
705 | orcid1 = orcid_identifier,
706 | fname2 = given,
707 | lname2 = family,
708 | orcid2 = orcid_coauth,
709 | affiliation2 = affiliation.name,
710 | org2 = organization_name.x,
711 | city2 = organization_address_city.x,
712 | region2 = organization_address_region.x,
713 | country2 = organization_address_country.x
714 | )
715 |
716 | # add in columns of home author affiliation and location info
717 | # join the info in from our orcid_df reference table
718 | co_authors <- left_join(co_authors,orcid_df,by=c("orcid1" = "orcid_identifier_path"))
719 |
720 | # rename the joined affiliation and location fields for the home author
721 | co_authors <- co_authors %>%
722 | rename(
723 | affiliation1 = department_name,
724 | org1 = organization_name,
725 | city1 = organization_address_city,
726 | region1 = organization_address_region,
727 | country1 = organization_address_country
728 | )
729 |
730 | # move the columns around
731 | co_authors <- co_authors %>% relocate(affiliation1, org1, city1, region1, country1, .after = orcid1)
732 |
733 | # fill in with static values if there are blanks -- there really shouldn't be any, but just in case
734 | co_authors$org1[co_authors$org1 == "" | co_authors$org1 == " " | is.na(co_authors$org1)]<- anchor_org
735 | co_authors$city1[co_authors$city1 == "" | co_authors$city1 == " " | is.na(co_authors$city1)]<- anchor_city
736 | co_authors$region1[co_authors$region1 == "" | co_authors$region1 == " " | is.na(co_authors$region1)]<- anchor_region
737 | co_authors$country1[co_authors$country1 == "" | co_authors$country1 == " " | is.na(co_authors$country1)]<- anchor_country
738 |
739 |
740 | # though we might have filled in a few pieces of co-author info for some of the co-authors from the same institution above,
741 | # we still need city, region, and country for many of the co-authors. we can try to retrieve this if we have the co-author's ORCID ID
742 | # we'll make a unique list of co-authors who have ORCID IDs and get their CURRENT affiliation
743 | # we chose to get their current affiliation because this is the same way we treat home authors
744 | # (they are a home author because of their current affiliation,
745 | # even though they may have published a DOI in the past when affiliated with a different organization)
746 | co_auth_ids <- co_authors$orcid2
747 | co_auth_ids_unduped <- unique(co_auth_ids[co_auth_ids != ""])
748 |
749 | # if a value in co_auth_ids_unduped gives an error when you try to generate my_co_auths_employment below
750 | # (e.g. because the record is locked and cannot be edited),
751 | # remove it from the list by filling in the problem ORCID ID (format XXXX-XXXX-XXXX-XXXX), uncommenting, and running this statement
752 | # then try to generate my_co_auths_employment again
753 | #co_auth_ids_unduped <- co_auth_ids_unduped[ co_auth_ids_unduped != "enter problem ORCID ID here in format XXXX-XXXX-XXXX-XXXX"]
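# As an alternative to removing problem IDs by hand, a hedged sketch: wrap each
# request with purrr::possibly() so a single failing profile returns NULL instead
# of aborting the whole batch. Note this issues one request per ID, so it will be
# slower than the single batched call below (a sketch under assumptions, untested here):
#safe_employments <- purrr::possibly(rorcid::orcid_employments, otherwise = NULL)
#my_co_auths_employment <- co_auth_ids_unduped %>%
#  purrr::map(safe_employments) %>%   # one request per ORCID ID
#  purrr::compact() %>%               # drop the NULLs from failed IDs
#  purrr::flatten()                   # back to one named list keyed by ORCID ID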
754 |
755 | # get the co-authors' employment data from their ORCID profiles
756 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's 2022 data [>850 IDs], this took ~2 minutes)
757 | my_co_auths_employment <- rorcid::orcid_employments(co_auth_ids_unduped)
758 |
759 | ##### JSON
760 | # you can write the file to json if you want to work with it outside of R
761 | #to_write<-toJSON(my_co_auths_employment, na="null")
762 | #write(to_write,"./data/co_auths_employment.json")
763 |
764 | # read it back in, if necessary
765 | #my_co_auths_employment <- read_json("./data/co_auths_employment.json", simplifyVector = TRUE)
766 | ##### JSON
767 |
768 | # extract the employment data and mutate the dates
769 | my_co_auths_employment_data <- my_co_auths_employment %>%
770 | purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>%
771 | purrr::flatten_dfr() %>%
772 | janitor::clean_names() %>%
773 | dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000),
774 | employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000),
775 | employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000))
776 |
777 | # clean up column names
778 | names(my_co_auths_employment_data) <- names(my_co_auths_employment_data) %>%
779 | stringr::str_replace(., "employment_summary_", "") %>%
780 | stringr::str_replace(., "source_source_", "") %>%
781 | stringr::str_replace(., "organization_disambiguated_", "")
782 |
783 | # some rows have orcid_path = NA; for these, put the ORCID ID back using a substring of path
784 | my_co_auths_employment_data <- my_co_auths_employment_data %>%
785 | mutate(orcid_path = coalesce(orcid_path,substring(path,2,20)))
786 |
787 | # get the co-authors' current affiliations
788 | # this will miss co-authors who have no current employment line (with no end date) in their ORCID profile
789 | my_co_auths_employment_data_filtered_current <- my_co_auths_employment_data %>%
790 | dplyr::filter(is.na(end_date_year_value))
791 |
792 | # some co-authors may have multiple "current" affiliations
793 | # separate out those with no start date year value from those that do have start dates
794 | my_co_auths_current_emp_nodate <- subset(my_co_auths_employment_data_filtered_current, is.na(start_date_year_value))
795 | my_co_auths_current_emp_date <- subset(my_co_auths_employment_data_filtered_current, !is.na(start_date_year_value))
796 |
797 | # for those with a start date, choose the row with the most recent year
798 | latest_dates <- my_co_auths_current_emp_date %>%
799 | group_by(orcid_path) %>%
800 | slice(which.max(start_date_year_value)) %>%
801 | arrange(start_date_year_value)
802 |
803 | co_auths_latest_emp <- rbind(my_co_auths_current_emp_nodate,latest_dates)
804 |
805 | # there will STILL be duplicates because of people with a mix of undated and dated ORCID profile employment entries,
806 | # so group again and use the latest entry date
807 | co_auths_very_latest_emp <- co_auths_latest_emp %>%
808 | group_by(orcid_path) %>%
809 | slice(which.max(created_date_value)) %>%
810 | arrange(created_date_value)
811 |
812 | # be doubly sure that we removed duplicate orcid rows
813 | co_auths_very_latest_emp <- co_auths_very_latest_emp[!duplicated(co_auths_very_latest_emp$orcid_path),]
814 |
815 | # for the co-authors that had ORCID profiles and for whom we now have a current employment data point, join them back to the co_authors dataframe
816 | co_authors_full_info <- left_join(co_authors,co_auths_very_latest_emp,by=c("orcid2" = "orcid_path"))
817 |
818 | # If org2, city2, region2, or country2 are NA in the dataframe we are building for output, fill them from the joined table fields
819 | co_authors_full_info <- co_authors_full_info %>%
820 | mutate(org2 = coalesce(org2,organization_name),
821 | city2 = coalesce(city2,organization_address_city),
822 | region2 = coalesce(region2,organization_address_region),
823 | country2 = coalesce(country2,organization_address_country)
824 | )
825 |
826 | # drop some columns we don't need
827 | co_authors_full_info <- co_authors_full_info %>% select(doi:country2)
828 |
829 | ##### Code improvement
830 | # from here you could do yet ANOTHER round of recording co-author fullnames and ORCID IDs to the reference dataframe,
831 | # then fill in blanks in the full_info df
832 | # once the code that does that is pulled out into its own function, it won't take much space to do (see the sketch below)
833 | ##### Code improvement
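# e.g. (hypothetical, assuming the helper sketched after the first dedupe pass,
# plus a companion that appends newly seen fullname/ORCID pairs to names_df;
# both names are placeholders, not functions that exist in this script):
#names_df <- record_new_names(co_authors_full_info, names_df)                  # hypothetical helper
#co_authors_full_info <- fill_coauthor_blanks(co_authors_full_info, names_df)  # hypothetical helper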
834 |
835 | # get rid of NA values
836 | co_authors_full_info[is.na(co_authors_full_info)] <- ""
837 |
838 |
839 | # clean up US state names so they produce single locations on the Tableau map
840 | # set up a dataframe of state names and abbreviations
841 | states_df<- data.frame(state.abb, state.name, paste0(state.name,'US'))
842 | colnames(states_df) <- c('abb','name','id')
843 |
844 | # left join the correct state abbreviation onto rows where a US state's full name is spelled out
845 | # starting with the home authors' region1
846 | co_authors_full_info$state1<-with(co_authors_full_info,paste0(region1,country1))
847 | co_authors_full_info <- left_join(co_authors_full_info,states_df,by=c("state1" = "id"))
848 |
849 | # overwrite the full state names with the abbreviations where they occur
850 | co_authors_full_info$region1 <- ifelse(is.na(co_authors_full_info$abb), co_authors_full_info$region1, co_authors_full_info$abb )
851 |
852 | # drop the joined columns
853 | co_authors_full_info <- co_authors_full_info %>% select(doi:country2)
854 |
855 | # do the same for region2, the co-authors' US state names
856 | co_authors_full_info$state2<-with(co_authors_full_info,paste0(region2,country2))
857 | co_authors_full_info <- left_join(co_authors_full_info,states_df,by=c("state2" = "id"))
858 | co_authors_full_info$region2 <- ifelse(is.na(co_authors_full_info$abb), co_authors_full_info$region2, co_authors_full_info$abb )
859 | co_authors_full_info <- co_authors_full_info %>% select(doi:country2)
860 |
861 |
862 | # write it to a csv to be visualized
863 | write_csv(co_authors_full_info, "./data/orcid-data.csv")
864 |
865 | # Ta da, you should now have a data file to visualize in Tableau
866 |
867 | # Before uploading to Tableau, consider cleaning your data file, either manually or using a tool
868 | # like OpenRefine (https://openrefine.org/). It will improve the visualization if wordings and spellings
869 | # are standardized, particularly in the organization (org1, org2) and city name (city1, city2) fields.
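# A light-touch pass inside R can catch some of this before export -- a minimal
# sketch using dplyr::across() and stringr::str_squish() (OpenRefine's clustering
# of near-duplicate spellings goes well beyond this):
#co_authors_full_info <- co_authors_full_info %>%
#  mutate(across(c(org1, org2, city1, city2),
#                ~stringr::str_squish(.)))  # trim and collapse stray whitespace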
870 |
--------------------------------------------------------------------------------