├── data ├── .gitignore ├── .DS_Store └── Data_folder_README.md ├── tableau-visualization ├── tableau-screenshots │ ├── .gitignore │ ├── details.png │ ├── publish.png │ ├── edit colors.png │ ├── edit group.png │ ├── go to sheet.png │ ├── make a copy.png │ ├── create extract.png │ ├── copied dashboard.png │ ├── data source tab.png │ ├── edit filter org2.png │ ├── new data source.png │ ├── missing directory.png │ ├── show sheets as tabs.png │ ├── groups after editing.png │ ├── single value list filter.png │ └── summary dashboard in full visualization.png ├── dashboard-documentation.md └── customizing-dashboard.md ├── network-visualization └── network-script ├── README.md ├── Rorcid_Crossref_Authors_Existing_List.R └── Rorcid_Crossref_Authors.R /data/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/data/.DS_Store -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/details.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/publish.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/publish.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/edit colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/edit colors.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/edit group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/edit group.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/go to sheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/go to sheet.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/make a copy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/make a copy.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/create extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/create extract.png -------------------------------------------------------------------------------- 
/tableau-visualization/tableau-screenshots/copied dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/copied dashboard.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/data source tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/data source tab.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/edit filter org2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/edit filter org2.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/new data source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/new data source.png -------------------------------------------------------------------------------- /data/Data_folder_README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data_folder_README" 3 | output: html_document 4 | --- 5 | 6 | This file will be empty until you run the code so do not panic if you don't see anything. 
-------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/missing directory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/missing directory.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/show sheets as tabs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/show sheets as tabs.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/groups after editing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/groups after editing.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/single value list filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/single value list filter.png -------------------------------------------------------------------------------- /tableau-visualization/tableau-screenshots/summary dashboard in full visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lyrasis/ORCID-Data-Visualization/HEAD/tableau-visualization/tableau-screenshots/summary dashboard in full visualization.png -------------------------------------------------------------------------------- 
/tableau-visualization/dashboard-documentation.md: -------------------------------------------------------------------------------- 1 | # Tableau dashboard documentation 2 | ## Major decisions 3 | * **Date filter:** Did not use a date filter because it causes issues due to how data are pulled. For example, if there is even one DOI from 2020, even if all of the rest of the DOIs are from 2021, using the date filter makes it appear as if the data consist of records from 2020 and 2021 – which is technically true, but very easy to misinterpret. Makes more sense to have users manually add a date. 4 | * **Field names:** Did not change any of the field names in Tableau so that the data can be replaced without any issues directly from the data pull. If you change any of the field names, whether in Tableau or the data source, you will have to make sure that the names are consistent between Tableau and the data source. 5 | * **How the dashboard is counting:** The total numbers are being calculated by the distinct count of DOI from the home/anchor institution. What this means is that a paper with two collaborating authors is counted the same number of times (once) as a paper with ten collaborating authors. That is because the DOI itself is a collaboration, not the number of authors. This logic also applies to the number of collaborating cities. 6 | ## Groups 7 | **Institution grouping:** Each institution has to create two groups from the **Org2** field: one that is ONLY their institution, one that is ONLY other institutions. That makes it possible to filter out their own institution from the data. 8 | ## Measures 9 | **City2**, **DOI**, and **Orcid1** have all been duplicated and converted to measures in order to be able to get the counts of those fields. 10 | ## Maps/geocoding 11 | * The ***latitude*** and ***longitude*** fields are automatically generated by Tableau based on the geographic information in the data. 
For Tableau to be able to read and geocode the data, the information must be broken down into individual components: city, state, country. 12 | * Sometimes, Tableau doesn’t know how to code a city. You can manually resolve this issue. On the map, in the bottom right corner, it’ll say **[x number] Unknown**. Follow the instructions in the [Edit Unknown or Ambiguous Locations](https://help.tableau.com/current/pro/desktop/en-us/maps_editlocation.htm) resource to edit these locations. 13 | * In the data pull, the columns for state are called **Region1** (home institution) and **Region2** (collaborating institution). 14 | ## Further data cleaning 15 | Consider using a tool such as OpenRefine to further clean the dataset. 16 | -------------------------------------------------------------------------------- /network-visualization/network-script: -------------------------------------------------------------------------------- 1 | # Network visualization script for rorcid output 2 | # Uses the NetSciX Workshop code by Katherine Ognyanova, www.kateto.net 3 | # with slight modifications/tailoring for ORCID data 4 | # For more information, contact LYRASIS at orcidus@lyrasis.org 5 | 6 | # Before running this script, make sure your data are in a 7 | # workable format (nodes and edges) 8 | # Follow data structure in workshop materials: 9 | # https://www.kateto.net/wp-content/uploads/2016/01/NetSciX_2016_Workshop.pdf 10 | # Name nodes and edges files "nodes.csv" and "edges.csv" 11 | 12 | # Install the package "igraph" 13 | # The package (www.igraph.org) is maintained by Gabor Csardi and Tamas Nepusz. 
14 | 15 | install.packages("igraph") 16 | 17 | # Load the igraph package' 18 | 19 | library(igraph) 20 | 21 | # Set the working directory to the folder containing the nodes and edges files: 22 | 23 | setwd("C:/Users/folder") 24 | 25 | # Load the datasets 26 | 27 | nodes <- read.csv("nodes.csv", header=T, as.is=T) 28 | links <- read.csv("edges.csv", header=T, as.is=T) 29 | 30 | # Examine the data: 31 | head(nodes) 32 | head(links) 33 | nrow(nodes); length(unique(nodes$id)) 34 | nrow(links); nrow(unique(links[,c("from", "to")])) 35 | 36 | # Collapse multiple links of the same type between the same two nodes 37 | # by summing their weights, using aggregate() by "from", "to", & "type": 38 | links <- aggregate(links[,3], links[,-3], sum) 39 | links <- links[order(links$from, links$to),] 40 | colnames(links)[4] <- "weight" 41 | rownames(links) <- NULL 42 | 43 | # Converting the data to an igraph object: 44 | # The graph.data.frame function, which takes two data frames: 'd' and 'vertices'. 45 | # 'd' describes the edges of the network - it should start with two columns 46 | # containing the source and target node IDs for each network tie. 47 | # 'vertices' should start with a column of node IDs. 48 | # Any additional columns in either data frame are interpreted as attributes. 49 | 50 | net <- graph_from_data_frame(d=links, vertices=nodes, directed=T) 51 | 52 | # Examine the resulting object: 53 | class(net) 54 | net 55 | 56 | # We can look at the nodes, edges, and their attributes: 57 | E(net) 58 | V(net) 59 | 60 | plot(net, edge.arrow.size=.4,vertex.label=NA) 61 | 62 | # Removing loops from the graph: 63 | net <- simplify(net, remove.multiple = F, remove.loops = T) 64 | 65 | # Extract data frames describing nodes and edges: 66 | as_data_frame(net, what="edges") 67 | as_data_frame(net, what="vertices") 68 | 69 | # Plotting with igraph: node options (starting with 'vertex.') and edge options 70 | # (starting with 'edge.'). 
A list of options is available: 71 | ?igraph.plotting 72 | 73 | # We can set the node & edge options in two ways - one is to specify 74 | # them in the plot() function, as we are doing below. 75 | 76 | # Plot with curved edges (edge.curved=.1) and reduce arrow size: 77 | plot(net, edge.arrow.size=.4, edge.curved=.1) 78 | 79 | # Set node color to orange and the border color to hex #555555 80 | plot(net, edge.arrow.size=.2, edge.curved=0, 81 | vertex.color="orange", vertex.frame.color="#555555") 82 | 83 | # Other layouts to experiment with 84 | 85 | # Randomly placed vertices 86 | l <- layout_randomly(net) 87 | plot(net, layout=l) 88 | 89 | # Circle layout 90 | l <- layout_in_circle(net) 91 | plot(net, layout=l) 92 | 93 | # 3D sphere layout 94 | l <- layout_on_sphere(net) 95 | plot(net, layout=l) 96 | 97 | # By default, igraph uses a layout called layout_nicely which selects 98 | # an appropriate layout algorithm based on the properties of the graph. 99 | 100 | # Check out all available layouts in igraph: 101 | ?igraph::layout_ 102 | 103 | # R and igraph offer interactive plotting, mostly helpful for small networks 104 | # If your institution has more than a couple hundred collaborations, 105 | # consider a different layout from the ones above 106 | 107 | tkid <- tkplot(net) #tkid is the id of the tkplot 108 | l <- tkplot.getcoords(tkid) # grab the coordinates from tkplot 109 | 110 | # Shortest distance to the center indicates higher weight/higher number 111 | # of collaborations 112 | 113 | tk_close(tkid, window.close = T) 114 | plot(net, layout=l) 115 | -------------------------------------------------------------------------------- /tableau-visualization/customizing-dashboard.md: -------------------------------------------------------------------------------- 1 | # Customizing your own Tableau dashboard 2 | Take the steps below to build your own Tableau dashboard. 
For questions about Tableau, check out the [Tableau Community Forums](https://community.tableau.com/s/topic/0TO4T000000QF9sWAG/tableau-public). If you run into issues with this Tableau dashboard, contact Lyrasis ORCID US Community support for assistance. Be sure to also review the [dashboard documentation](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/tableau-visualization/dashboard-documentation.md). If you run into challenges authoring on the web (in your browser), consider installing the Tableau Public application on your device ([installers](https://help.tableau.com/current/desktopdeploy/en-us/desktop_deploy_download.htm)). 3 | 1. Request a data pull from ORCID US Community support, or pull your own data using the R script in this repository. **Do not change any of the variable names.** Doing so will cause Tableau to be unable to recognize the variables. 4 | 2. Make sure that the name of the CSV from the data pull is **orcid-data** 5 | 3. [Create a Tableau Public account](https://public.tableau.com/desktop/signup_unification.html) for yourself, your institution, or your department. Be sure to check in with your local IT department to ask about any possible restrictions or rules around creating an account. 6 | 4. While logged in to your Tableau Public account, navigate to the template dashboard featured on the [ORCID US Community Tableau Public profile](https://public.tableau.com/app/profile/orcid.us.community/viz/Lyrasis-CollaborationsDashboardTemplate/Fullvisualization). 7 | 5. Using the menu in the top right of the dashboard, click on the icon with two overlapping rectangles (**Make a copy**). 8 |

Make a copy button in Tableau.

9 | 6. What you should be seeing at this point... 10 | 11 |

Tableau in browser edit view for collaborations dashboard.

12 | 13 | 7. In the top right corner, click on the blue **Publish As...** button, rename the dashboard as needed, and then click on the blue **Publish** button. 14 | 8. In the bottom left corner, click on **Data Source**. 15 | 9. You will see an error message: *The directory is missing or has been moved: Replace it with another file?* 16 | 10. Click **Yes**. 17 | 11. Drag and drop or upload your institution’s data file from its location on your device. Tableau will replace the existing data file with this file. This may take several minutes, depending on the file size. 18 | 12. Once the new data has been added, in the top right corner, click on **Create extract**, then click on **Create extract** again in the pop-up. Depending on the file size, this may take anywhere from several seconds to several minutes. 19 | 20 |

Location of create extract option in Tableau.

21 | 22 | 13. Click on the **Summary dashboard** tab at the bottom. 23 | 14. At the top of the dashboard, double click on the title. Replace the **[Organization name]** and **[time period from data pull]** and **Data pulled on [date]** with your organization name, the time period requested from the data pull, and the date of the data pull, then click **OK** (make sure to delete the brackets after replacing the text). As a reminder, the script pulls data from January 1 of the year requested to the present date of the data pull. 24 | 15. Scroll down to the **Highest number of collaborations with the following institutions:** visualization. At the bottom of the visualization, double click the caption underneath the bars to fill in the text between the brackets with your own institution’s data. 25 | 16. Click anywhere on the **Highest number of collaborations with the following institutions:** visualization. Four small icons will appear on the right of the gray box around the visualization. Click on the small square with the arrow popping out (second icon down, called **Go to Sheet**). You can use this feature to navigate to individual visualizations from the dashboards. 26 | 27 |

Demonstration of Go to Sheet icon in Tableau, found on right-hand side of visualizations.

28 | 29 | 17. On the **Marks** card (one of the left-hand panes), click on **Color** to change the color of the bars. You can use the other features in the **Marks** card to edit the colors, sizes, and other aesthetic features of the visualizations. 30 | 31 |

The Marks card in Tableau controls color selection, size, and other aesthetics.

32 | 33 | 18. Click on the **Collaborations** tab, then click anywhere on the **Collaborations Map** visualization. Click on the **Go to Sheet** icon that appears on the top right corner of the visualization to go to the visualization. 34 | 19. On the left-hand side, under the **Data pane** that contains a long list of variables, click on the small white triangle to the right of **Org2 (group)**. Then, click on **Edit Group…** 35 | 36 |

In Tableau, Edit Group can be found by clicking on a dimension.

37 | 38 | 20. A list of institutions should pop up. If there are any intra-institution collaborations in the dataset (for example, a Temple University researcher who collaborated and published with another Temple University researcher), find your institution and click on it to highlight, then click on **Group**. Label this group as **[Your institution name] Only**. Be sure not to edit the field name at the top (**Org2(group)**). If there are no intra-institution collaborations in the dataset, delete this filter by clicking on the **x** in the top right when this filter is selected in the dashboard. 39 | 21. Follow the same steps to select all of the other institutions (including **Null**) and group them. Label that group as **Excluding [your institution name]**. 40 | 41 |

In Tableau, groups can be edited by selecting and labeling groups of data.

42 | 43 | 22. Close the pop-up. 44 | 23. At the bottom, click on the **Collaborations** tab. 45 | 24. Immediately below the first gray bar divider, click on the filter. On the right-hand side of the gray box that appears around the filter, click on the small white triangle, then select **Single Value (list)**. 46 | 47 |

Filters can be edited to a single value list by clicking on the white arrow in Tableau.

48 | 49 | 25. Click on the **Individual search** tab. Double click on the title to change the **[time period from data pull]** to the time period for your dataset and **Data pulled on [date]** to add the date of the data pull. 50 | 26. At the bottom, click on the **Why can't I find my ORCID iD?** tab. Double click on the text to add the appropriate contact information for ORCID support at your institution. 51 | 27. Make any other customizations to the dashboard. For support with using Tableau, refer to the Tableau Resources section. 52 | 28. In the top right corner, click on **Publish** or **Publish as…** to save the dashboard to your Tableau Public profile. Note that any filters you have selected will be saved once you publish the workbook, so clear any filters that you do not want to be set as the default and make sure you are on the **Full visualization** tab. 53 | 54 |

A Tableau dashboard in Tableau Public can be published by clicking on the Publish button.

55 | 56 | 29. The **Full visualization** tab of the dashboard contains all of the dashboard tabs in a neater, guided format. To only display the **Full visualization** tab, navigate to the published version of the dashboard, then click on the **settings** in the top right corner (indicated by the gear icon). Deselect Show Sheets to only show the **Full visualization** tab. 57 | 58 |

Tableau settings allow for profile visibility, sheets visibility, and access customization.

59 | 60 | 30. You can also decide if you want this dashboard to be visible on your profile and if you want to allow others to download or make a copy of your visualization in the **settings**. 61 | 31. Scroll down to the **Details** section. Click on the **pencil icon** to the right of **Details** to edit the details for the dashboard, such as the title and description. Click on **Save Changes** when you’re finished with your edits. 62 | 63 |

Tableau details allow for title, description, inspiration, and external links to be shared with Tableau Public dashboards.

64 | 65 | 32. Review your visualization for any accessibility issues using [accessibility resources](https://github.com/lyrasis/ORCID-Data-Visualization#tableau-and-accessibility-resources). As a general note, any changes that you make to the dashboard as an **editor** will persist. Any changes that you make to the dashboard as a **viewer** (e.g., using a filter, excluding values) can be reset by refreshing the dashboard. 66 | 67 | If you run into any issues with these steps, refer to the Tableau resources below or reach out to Lyrasis ORCID US Community support for further assistance. 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Exploring Publication Collaborations using ORCID and DOI data 2 | ## Project Summary 3 | The resources available in this Github repository can be used to create a visualization of publication collaboration activity, based on public data from researchers' ORCID records and Crossref DOI publication metadata. The R script in this repository can be used to retrieve information about publishing collaborations between researchers at a home organization and other organizations across the globe. The resulting CSV file can then be loaded into a [Tableau Public](https://public.tableau.com/app/discover) dashboard to create a collaboration map and additional views to explore the data further. This project was made possible by a 2022 partnership between the [ORCID US Community](https://orcidus.lyrasis.org/) (administered by Lyrasis) and the Drexel University [LEADING program](https://mrc.cci.drexel.edu/leading/). For more information, including full instructions for how to use the tools in this repository, please [see our website](https://orcidus.lyrasis.org/data-visualization/). 
4 | ## Retrieving the data 5 | We recommend using [R Studio](https://posit.co/) to run the [R script](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/Rorcid_Crossref_Authors.R), which will result in the data needed to create the visualization. The script is designed to: 6 | 7 | * Retrieve ORCID iDs for researchers who have a current, publicly visible employment affiliation for a home institution on their ORCID record 8 | * Unpack the publicly visible works information present on each ORCID record 9 | * Retrieve Crossref DOI metadata for every work that has a Crossref DOI included on the ORCID work citation 10 | * Unpack list of co-authors included in the Crossref DOI metadata for each work 11 | * Retrieve ORCID iD for every co-author, if available 12 | * Check current employment affiliation on the ORCID record of every co-author 13 | * Get location information for the co-authors’ institutions 14 | * Repackage data into CSV file containing home author ORCID iDs, co-author ORCID iDs and institutional affiliations/geographic location, and publication DOIs 15 | 16 | You can use the green “code” button above to download a ZIP file, which will contain the R script as well as a folder labeled “data,” which is where the resulting CSV file will be saved. Or, you can download just the R script and create your own “data” folder separately. 
17 | 18 | Before you get started, you will need to gather the following information for your organization, based on how broad or narrow you want your search to be: 19 | 20 | * Organization Name(s) 21 | * Organization Email Domain(s) 22 | * Organization ROR ID (search the [ROR registry](https://ror.org/search)) 23 | * Organization GRID ID (often included as an "other identifier" in the [ROR registry](https://ror.org/search)) 24 | * Organization Ringgold ID(s) (to find the Ringgold ID(s) for your organization, you can create a guest account at [https://ido.ringgold.com/register](https://ido.ringgold.com/register), or you can email orcidus@lyrasis.org and we can find your Ringgold ID for you) 25 | 26 | For help retrieving any of this information, contact orcidus@lyrasis.org. 27 | 28 | Open the R script file in RStudio. The script contains a number of comments and instructions. Comments are indicated by the presence of a hashtag (#) proceeding the comment text. Any lines of text with a hashtag in front will not be run as commands. Lines of text with no hashtag will be run as a command when entered in R Studio. 29 | 30 | The first time you run the script, there are a few things you will need to do to get set up. Once these things are done, you should not have to do them again: 31 | 32 | * Install the packages listed in the script by un-commenting, or removing the hashtags from, the install commands. Once installed, you should not have to install the packages again, so in future sessions you can leave the hashtags in. However, you will still need to load the packages each time you run the script. 33 | * Get your [ORCID Public API keys](https://info.orcid.org/documentation/features/public-api/) (Client ID and Client Secret). You need to have an ORCID iD account in order to get Public API keys. If you don’t have an ORCID iD account, you can register for one, for free, at [https://orcid.org/register](https://orcid.org/register). Follow the instructions in the script. 
34 | * Get your ORCID API bearer token in RStudio. Follow the instructions provided in the script. 35 | 36 | Next, you will plug in your unique values to perform the search and data retrieval process. You will need to enter the following values (follow the instructions in the script): 37 | 38 | * Your working directory = the file path leading to your “data” folder where the final CSV output will be stored. The path needs to look something like this: /Users/rabun/Desktop/ORCID-Data-Visualization-main/data 39 | * The year you want to start searching for publications 40 | * Ringgold ID, GRID ID, ROR ID, Email domain, and Organization name - this allows the script to start by finding all ORCID records that have one or more of these values present in the Employment section of individuals’ ORCID records. For example: 41 | * ringgold_id <- "14703" 42 | * grid_id <- "grid.253554.0" 43 | * ror_id <- "https://ror.org/04v097707" 44 | * email_domain <- "@csuci.edu" 45 | * organization_name <- "California State University, Channel Islands" 46 | 47 | Note that if you want to search for multiple organization names (and thus have multiple different identifiers) and multiple email domains, there is a section of the script that provides the option to set multiple values for the search (see below). 48 | 49 | * Keyword = a word that is unique to your institution, that will serve to narrow the search results for just your organization. For example “Temple” could be the keyword if searching for results from Temple University. If your institution has common words in the name, you may want to use the entire organization name as the keyword. For example the keyword “New” would not be helpful for a search for “The New School” because multiple organizations have the word “New” in the name. 50 | * Geographic information for your organization, including city, state, and country. 
For example: 51 | * anchor_org<-"California State University, Channel Islands" 52 | * anchor_city<-"Camarillo" 53 | * anchor_region<-"CA" 54 | * anchor_country<-"US" 55 | 56 | At this point, the script provides two options for creating the initial query: 57 | 1) Run the search based on the values that you already entered 58 | 2) If you want to search for multiple campuses or specific departments at your organization, you will need to enter those additional values and go from there. 59 | 60 | Now you can continue to run the commands and follow the instructions within the script. 61 | 62 | Note that the script has various sections, and there are opportunities for you to export the data so far after each section so you can write it back in later without having to run the whole script again. This can be helpful if you get interrupted or if you don’t have time to run the whole script in one sitting. 63 | 64 | Note that there is one more part of the script, in the “get employment data” section, where you will have two options: 65 | 1) Accept all of the organization names returned by the initial query 66 | 2) View and edit the list of organization names to be included in the search results. 67 | 68 | Continue to follow the instructions and run the script commands, until you get to the end, where the final CSV data file will be exported to your data folder. 69 | Once you have the CSV output, you may want to check and clean organization names and city names using [Open Refine](https://openrefine.org/). This can be helpful for collapsing multiple variations of the same organization name (mis-spellings, extra spaces, etc.), and for filling in any city information that may be missing or incorrect. 70 | 71 | ## Considerations and context for the data 72 | **Data errors:** The data pulled using the R script are imperfect and contain gaps, as well as user and machine errors. The numbers in the data pull are not definitive. 
The data pulled for your institution are a snapshot for a specific period of time and may change as researchers obtain/update their ORCID profiles and continue to publish. 73 | 74 | Some examples of data errors that may exist in the data are: 75 | * Missing ORCID iDs 76 | * Missing geographic information that leads to missing data points on the collaborations map 77 | * Typos in the institution name or city/country that lead to missing or erroneous ORCID iDs included in the data pulls 78 | 79 | It’s important to highlight that this data shouldn’t be used to evaluate or compare researchers against one another because the data are not perfect and do not give a full picture of collaborations and impact. The resources in this repository provide just one angle through which to approach this information. 80 | 81 | **Collaboration counting:** In the data pull, collaborations are counted by iterating through each home author and counting the collaborations again. For example, if 2 researchers at Temple (home institution) author a paper with researchers from the University of Texas, this counts as 1 collaboration within Temple and 1 collaboration with UT for each Temple author. In other words, for the home institution as a whole, it’s counted as 2 collaborations within Temple and 2 collaborations with UT. However, in the Tableau dashboard, each DOI is counted as one collaboration for the institution. 82 | 83 | **Current or previous institutions:** The data pulled for each author also looks at their entire careers. The script also pulls the current institution for collaborating authors. This reduces blanks which are greater when trying to pinpoint affiliation at the time of DOI minting because of lack of historical employment entries in ORCID profiles. This also avoids potential discrepancies with date of DOI minting and date of publication, which is sometimes blank. This also treats both authors the same in terms of counting. 
84 | 85 | **Dates**: You may see discrepancies in the DOI issuing date and publication date due to different DOI issuing processes. There may be a lag time between when the DOI was issued and the publication date according to the journal. This may also depend on the publisher’s workflow. The date used in this script is the DOI issue date. This allowed for fewer blanks in the data. This is an area of future improvement for this project. 86 | ## Customizing your own Tableau dashboard 87 | Once you have the CSV output for your search, you can load your data into Tableau to create your visualization. Refer to the [Customizing your own Tableau dashboard](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/tableau-visualization/customizing-dashboard.md) page. Be sure to also review the [dashboard documentation](https://github.com/lyrasis/ORCID-Data-Visualization/blob/main/tableau-visualization/dashboard-documentation.md). 88 | ## Why can't I find my ORCID iD? 89 | For the “individual search” tab within the Tableau dashboard, if you or another individual is having trouble finding your/their ORCID iD in the data pull or the search, here are a few things you may want to check: 90 | ### 1. Do you have an ORCID profile set up? 91 | If you have not yet created an ORCID iD, please visit www.orcid.org to set up your ORCID profile. ORCID sets up a persistent digital identifier (also called an ORCID iD) to distinguish you from other researchers. 92 | ### 2. Did you set up your ORCID iD after the data pull? 93 | If you set up your ORCID iD after the data supporting this dashboard were pulled, then your ORCID iD will not show up in the dashboard until the data has been pulled again. 94 | ### 3. Is all of the information in your ORCID profile accurate? 95 | Take a moment to verify that your current institution and location are accurately listed in your ORCID profile -- typos happen!
If you work remotely for an institution, you will have to list the institution's primary location in order to show up in the data. If you correct any information in your ORCID profile after the data supporting this dashboard were pulled, then your ORCID iD will not show up in the dashboard until the data has been pulled again. 96 | ### 4. Still not sure? 97 | Reach out to your campus ORCID administrator or Lyrasis for further troubleshooting. 98 | ## Tableau and accessibility resources 99 | * [Tableau, A Beginner’s Guide to Tableau Public](https://www.tableau.com/blog/beginners-guide-tableau-public) 100 | * [Authoring for Accessibility – Tableau](https://onlinehelp.tableau.com/current/pro/desktop/en-us/accessibility_create_view.htm) 101 | * [Tableau maps: Edit Unknown or Ambiguous Locations](https://help.tableau.com/current/pro/desktop/en-us/maps_editlocation.htm) 102 | * [Tableau Community Forums](https://community.tableau.com/welcome) 103 | * [Tableau Reference Guide](http://www.tableaureferenceguide.com/) 104 | * [Financial Times "Visual Vocabulary: Tableau Edition"](http://www.vizwiz.com/2018/07/visual-vocabulary.html) 105 | * [OneNumber, Tableau for Beginners](https://onenumber.biz/blog-1/2022/5/2/tableau-for-beginners-connect-to-data) 106 | ## Questions and support 107 | For any questions or support, or to provide feedback, please contact Lyrasis ORCID US Community support at orcidus@lyrasis.org. 
108 | ## Usage License 109 | [Collaboration Data Visualization](https://github.com/lyrasis/ORCID-Data-Visualization) © 2022 by [Lyrasis](https://orcidus.lyrasis.org/data-visualization/) is licensed under [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/?ref=chooser-v1) 110 | -------------------------------------------------------------------------------- /Rorcid_Crossref_Authors_Existing_List.R: -------------------------------------------------------------------------------- 1 | # Script by Olivia Given Castello, adapted by Sheila Rabun, based on: https://ciakovx.github.io/rorcid.html 2 | # and 04-rcrossref_metadata.R at https://github.com/ciakovx/fsci2022/tree/main/code 3 | # Retrieves ORCID profile and Crossref metadata for authors from an existing list of ORCID iDs, 4 | # since a given year, paired with that of the co-authors with whom they collaborated. 5 | 6 | # Install and load packages ----------------------------------------------- 7 | 8 | # you will need to install these packages first, using the following 9 | # if you've already installed them, skip this step 10 | #install.packages('dplyr') 11 | #install.packages('tibble') 12 | #install.packages('tidyr') 13 | #install.packages('purrr') 14 | #install.packages('readr') 15 | #install.packages('stringr') 16 | #install.packages('jsonlite') 17 | #install.packages('lubridate') 18 | #install.packages('ggplot2') 19 | #install.packages('httr') 20 | #install.packages('forcats') 21 | #install.packages('rorcid') 22 | #install.packages('usethis') 23 | #install.packages('anytime') 24 | #install.packages('janitor') 25 | #install.packages('glue') 26 | #install.packages('remotes') 27 | #install.packages("ropensci/rcrossref") 28 | #install.packages('roadoi') 29 | #install.packages('inops') 30 | #install.packages("data.table") 31 | #install.packages("ropensci/geonames") 32 | 33 | # load the packages 34 | library(dplyr) 35 | library(tibble) 36 | library(tidyr) 37 | library(purrr) 38 | library(readr) 39 | 
library(stringr)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(httr)
library(forcats)
library(usethis)
library(anytime)
library(janitor)
library(glue)
library(rorcid)
library(rcrossref)
library(roadoi)
library(inops)

# remove all objects from the environment to start with a clean slate
rm(list = ls())

# Set up orcid / crossref in R environment ------------------------------------------------------------

# if you've already done these steps and set up your bearer token in RStudio
# you can skip to the next section: "set some variables and build the query"

# 1. If you haven't done so already, create an ORCID account at https://orcid.org/signin.
# 2. In the upper right corner, click your name, then in the drop-down menu, click Developer Tools. Note: In order to access Developer Tools, you must verify your email address.
# 3. If you have not already verified your email address, you will be prompted to do so at this point.
# 4. Click the "Register for the free ORCID public API" button
# 5. Review and agree to the terms of service when prompted.
# 6. Add your name in the Name field, https://www.orcid.org in the Your Website URL field, "Getting public API key" in Description field, and https://www.orcid.org in the redirect URI field. Click the diskette button to save.
# 7. A gray box will appear including your Client ID and Client Secret. In the below code chunk, copy and paste the client ID and the client secret respectively.
# 8. Make sure to leave the quotation marks (e.g. orcid_client_id <- "APP-FDFJKDSLF320SDFF" and orcid_client_secret <- "c8e987sa-0b9c-82ed-91as-1112b24234e").

# copy/paste your client ID from https://orcid.org/developer-tools
orcid_client_id <- "PASTE MY CLIENT ID HERE"

# copy/paste your client secret from https://orcid.org/developer-tools
orcid_client_secret <- "PASTE MY CLIENT SECRET HERE"

# Request a /read-public scope access token from the ORCID OAuth token endpoint
orcid_request <- POST(url = "https://orcid.org/oauth/token",
                      config = add_headers(`Accept` = "application/json",
                                           `Content-Type` = "application/x-www-form-urlencoded"),
                      body = list(grant_type = "client_credentials",
                                  scope = "/read-public",
                                  client_id = orcid_client_id,
                                  client_secret = orcid_client_secret),
                      encode = "form")

# parse the API response body
orcid_response <- content(orcid_request)

# print the access token so it can be copied to the clipboard
print(orcid_response$access_token)

# You will see a string of text print out in your R console.
# Copy that string to the clipboard
# so we can save the token to our R environment.
# Run this code:
usethis::edit_r_environ()

# A new window will open in RStudio.
# In this separate R environment page, type the following (except the pound sign):
# ORCID_TOKEN="my-token"
# replace 'my-token' with the access_token you just copied.
# Then press enter to create a new line.
# while we are here, we'll add in our rcrossref credentials
# type crossref_email="name@example.com", using your own email address.
# press enter to create a new line, and leave it blank.
# Press Ctrl + S (Mac: Cmd + S) to save this information to your R environment and close the window.
# You won't see anything happen here because it is just saving the page.

# Click Session > Restart R. Your token should now be saved to your R environment.

# You will now need to rerun all the library() commands above, then return to this line.
# You can confirm this worked by calling orcid_auth(), and it will print the token
rorcid::orcid_auth()


# set some variables and build the query --------------------------------------------------------

# set the working directory where this script is
# a folder called "data" is also expected to be in this directory
# example: setwd("C:/Users/rabun/OneDrive - LYRASIS/Documents/RsearchResults")
setwd("PASTE WORKING DIRECTORY HERE")

# set the time period of interest: this script will compile collaboration data since Jan 1 of this year.
# replace the YYYY with a 4 digit year.
# the more years of data desired, the longer some portions of this script will take to run
my_year = YYYY;

# set the institution's main location information (for use when precise location info is blank)
# example:
# anchor_org<-"The Gordon and Betty Moore Foundation"
# anchor_city<-"Palo Alto"
# anchor_region<-"CA"
# anchor_country<-"US"
anchor_org<-"Organization Name"
anchor_city<-"City"
anchor_region<-"State"
anchor_country<-"Country"

# read in your list of existing ORCID iDs - it should be a csv file named my_orcids_data
# (make sure you save as .csv) saved within the "data" folder in your working directory.
# It should be formatted with three columns:
# first column: orcid_identifier_uri - the full ORCID iD URL for each person (example: https://orcid.org/0000-0002-0375-8429)
# second column: orcid_identifier_path - just the 16 digit ORCID number for each person (example: 0000-0002-0375-8429)
# third column: orcid_identifier_host - the host portion of the ORCID iD URI (normally orcid.org)
my_orcids_data <- read_csv("./data/my_orcids_data.csv", col_types = cols(.default = "c"))


# get employment data -----------------------------------------------------

# get the employments from the orcid_identifier_path column
##### TIME: be patient, this may take a long time (e.g. for Temple University's data [~3500 IDs], this took ~8 minutes)
my_employment <- rorcid::orcid_employments(my_orcids_data$orcid_identifier_path)

##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
#to_write<-toJSON(my_employment, na="null")
#write(to_write,"./data/employment.json")

# read it back in, if necessary (NOTE: path corrected to match the write path above)
#my_employment <- read_json("./data/employment.json", simplifyVector = TRUE)
##### WRITE/READ JSON

# extract the employment data and mutate the dates:
# pluck each record's "affiliation-group" -> "summaries", bind the rows together,
# clean the names, and convert Unix-epoch millisecond timestamps to Date
my_employment_data <- my_employment %>%
  purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>%
  purrr::flatten_dfr() %>%
  janitor::clean_names() %>%
  dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000),
                employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000),
                employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000))

# clean up the column names by dropping repetitive prefixes
names(my_employment_data) <- names(my_employment_data) %>%
  stringr::str_replace(., "employment_summary_", "") %>%
  stringr::str_replace(., "source_source_", "") %>%
  stringr::str_replace(., "organization_disambiguated_", "")

# count the unique institutions in the organization names column
# keep in mind this will include all institutions a person has in their employments section
my_organizations <- my_employment_data %>%
  group_by(organization_name) %>%
  count() %>%
  arrange(desc(n))

# view the variation in organization names by looking at my_organizations (will open a new tab)
# view(my_organizations)

# Note that this will give you employment records only.
# In other words, each row represents a single employment record for an individual.
# the name_value variable refers specifically to the name of the person or system
# that wrote the record, NOT the name of the individual.

# To get that, you must first get all the unique ORCID iDs from the dataset:

# There is no distinct value identifying the orcid ID of the person.
# The orcid_path value corresponds to the path of the person who added the
# employment record (which is usually, but not always, the same person).
# Therefore we strip the ORCID iD out of the 'path' variable into its own
# column using str_sub, and select/reorder the columns we want to keep.
current_employment_all <- my_employment_data %>%
  mutate(orcid_identifier = str_sub(path, 2, 20)) %>%
  select(any_of(c("orcid_identifier",
                  "organization_name",
                  "organization_address_city",
                  "organization_address_region",
                  "organization_address_country",
                  "organization_identifier",
                  "organization_disambiguated_organization_identifier",
                  "organization_disambiguation_source",
                  "department_name",
                  "role_title",
                  "url_value",
                  "display_index",
                  "visibility",
                  "created_date_value",
                  "start_date_year_value",
                  "start_date_month_value",
                  "start_date_day_value",
                  "end_date_year_value",
                  "end_date_month_value",
                  "end_date_day_value")))

# next, create a vector of only unique ORCID iDs from our filtered dataset
unique_orcids <- unique(current_employment_all$orcid_identifier) %>%
  na.omit(.) %>%
  as.character()

# get all biographical information for those iDs
##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~1.5 minutes)
my_orcid_person <- rorcid::orcid_person(unique_orcids)

# Construct a data frame from the response.
# See https://ciakovx.github.io/rorcid.html#Getting_the_data_into_a_data_frame
# FIX: last_modified_date previously plucked "created-date" a second time;
# it now reads the name record's "last-modified-date" element.
# FIX: created_date's .default is NA_character_ to match map_chr()'s return type.
my_orcid_person_data <- my_orcid_person %>% {
  dplyr::tibble(
    given_name = purrr::map_chr(., purrr::pluck, "name", "given-names", "value", .default=NA_character_),
    created_date = purrr::map_chr(., purrr::pluck, "name", "created-date", "value", .default=NA_character_),
    last_modified_date = purrr::map_chr(., purrr::pluck, "name", "last-modified-date", "value", .default=NA_character_),
    family_name = purrr::map_chr(., purrr::pluck, "name", "family-name", "value", .default=NA_character_),
    credit_name = purrr::map_chr(., purrr::pluck, "name", "credit-name", "value", .default=NA_character_),
    other_names = purrr::map(., purrr::pluck, "other-names", "other-name", "content", .default=NA_character_),
    orcid_identifier_path = purrr::map_chr(., purrr::pluck, "name", "path", .default = NA_character_),
    biography = purrr::map_chr(., purrr::pluck, "biography", "content", .default=NA_character_),
    researcher_urls = purrr::map(., purrr::pluck, "researcher-urls", "researcher-url", .default=NA_character_),
    emails = purrr::map(., purrr::pluck, "emails", "email", "email", .default=NA_character_),
    keywords = purrr::map(., purrr::pluck, "keywords", "keyword", "content", .default=NA_character_),
    external_ids = purrr::map(., purrr::pluck, "external-identifiers", "external-identifier", .default=NA_character_))
} %>%
  # dates arrive as Unix-epoch milliseconds; convert to Date
  dplyr::mutate(created_date = anytime::anydate(as.double(created_date)/1000),
                last_modified_date = anytime::anydate(as.double(last_modified_date)/1000))

# Join back with the employment records so the person data now also includes
# organization city, region, country
orcid_person_employment_join <- my_orcid_person_data %>%
  left_join(current_employment_all, by = c("orcid_identifier_path" = "orcid_identifier"))

##### WRITE/READ CSV uncomment to save this data and read it back in later
#write_csv(orcid_person_employment_join, "./data/orcid_employment_file.csv")

# read it back in, if necessary
#orcid_person_employment_join <- read_csv("./data/orcid_employment_file.csv", col_types = cols(.default = "c"))
##### WRITE/READ CSV


# get works data -----------------------------------------------------

# create a vector of unique, unduplicated ORCID iDs from that file
my_orcids <- orcid_person_employment_join %>%
  filter(!duplicated(orcid_identifier_path)) %>%
  pull(orcid_identifier_path) %>%
  na.omit() %>%
  as.character()

# collect all works associated with each iD
##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~2.5 minutes)
my_works <- rorcid::orcid_works(my_orcids)

##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
#to_write<-toJSON(my_works, na="null")
#write(to_write,"./data/my_works.json")

# read it back in, if necessary
#my_works <- read_json("./data/my_works.json", simplifyVector = TRUE)
##### WRITE/READ JSON

# Loop through the response, pluck the "works" object, bind the rows together
# (the "_dfr" part of map_dfr), clean column names, and convert the dates
# from Unix time to yyyy-mm-dd
my_works_data <- my_works %>%
  purrr::map_dfr(pluck, "works") %>%
  janitor::clean_names() %>%
  dplyr::mutate(created_date_value = anytime::anydate(created_date_value/1000),
                last_modified_date_value = anytime::anydate(last_modified_date_value/1000))

# Keep only works that have an external identifier, then unnest so there is a
# row for every work + external id value
# (one work might be linked to a DOI, a PubMed ID, an ISSN, etc.)
my_works_externalIDs <- my_works_data %>%
  dplyr::filter(!purrr::map_lgl(external_ids_external_id, purrr::is_empty)) %>%
  tidyr::unnest(external_ids_external_id) %>%
  clean_names()

# Keep only rows with a DOI (the value used to look up items in Crossref),
# select the relevant columns, lower-case the DOI, and strip the ORCID iD
# out of the path variable.
dois <- my_works_externalIDs %>%
  filter(external_id_type == "doi") %>%
  select(type, path, title_title_value, external_id_type, external_id_value, external_id_relationship,
         url_value, publication_date_year_value, publication_date_month_value, publication_date_day_value,
         journal_title_value) %>%
  mutate(doi = tolower(external_id_value),
         orcid_identifier = str_sub(path, 2, 20))

# Co-authored works legitimately share a DOI, so duplicates are judged on the
# unique orcid + doi combination. Inspect dupes to review them more closely;
# below we keep only the first appearance of each combination.
dupes <- dois %>%
  get_dupes(orcid_identifier, doi)
# Prepare the orcid dataset for merging to publications:
# keep only ORCID iD, first and last name, remove duplicates, and rename
# orcid_identifier_path to match the dois dataframe
orcid_empl_merge <- orcid_person_employment_join %>%
  select(orcid_identifier_path, given_name, family_name) %>%
  filter(!duplicated(orcid_identifier_path)) %>%
  rename(orcid_identifier = orcid_identifier_path)

# Remove duplicates by keeping only the first instance of each
# orcid + doi combination, then join to the cleaned ORCID iD file
dois_unduped <- dois %>%
  mutate(orcid_doi = paste0(orcid_identifier, doi)) %>%
  filter(!duplicated(orcid_doi)) %>%
  left_join(orcid_empl_merge, by = "orcid_identifier")

##### WRITE/READ CSV uncomment to save this data and read it back in later
#write_csv(dois_unduped, "./data/orcid_dois.csv")

# read it back in, if necessary
#dois_unduped <- read_csv("./data/orcid_dois.csv")
##### WRITE/READ CSV


# get CrossRef data -----------------------------------------------------

# Subset the unduped DOIs to publication year >= my_year
# (year of publication according to the ORCID profile works data)
dois_since_year <- dois_unduped %>%
  filter(publication_date_year_value >= my_year)

# Loop through the DOIs and fetch Crossref metadata for each;
# each DOI is printed so you can watch progress.
# There will be warning messages for any DOIs not found at CrossRef.
##### TIME This will take a long time for large datasets (e.g. for Temple University's 2022 data [800+ DOIs], this took ~6 minutes)
metadata_since_year <- map(dois_since_year$doi, function(z) {
  print(z)
  o <- cr_works(dois = z)
  return(o)
})

##### Code improvement
# Here we could create a similar function that queries DataCite for metadata on the ones that weren't found in CR
# Also rather than DOIs SINCE a given year, it might be desired to retrieve data on DOIs from a discrete year,
# or from a time period with specific start and end dates.
##### Code improvement


##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
#write_file_path = paste0("./data/metadata_",my_year,".json")
#to_write<-toJSON(metadata_since_year, pretty=TRUE, na="null")
#write(to_write,write_file_path)

# read it back in, if necessary
#metadata_since_year <- read_json(write_file_path, simplifyVector = TRUE)
##### WRITE/READ JSON

# Extract the "data" element of each result, bind into one data frame
# (the "dfr" part of map_dfr), clean the names, and drop duplicate DOIs.
# FIX: use purrr's extractor shorthand map_dfr(., "data") — the previous
# map_dfr(., pluck("data")) only worked because pluck("data") returns "data".
metadata_since_year_df <- metadata_since_year %>%
  map_dfr(., "data") %>%
  clean_names() %>%
  filter(!duplicated(doi))

# Prepare our orcid data frame to merge to the crossref data by selecting only
# the relevant columns.
# Rows with no CrossRef data (like issued from DataCite) are still present here;
# anything published in an earlier time frame has already been removed.
orcid_merge <- dois_since_year %>%
  select(orcid_identifier, doi, given_name, family_name)

# select relevant columns (FIX: duplicate "subject" entry removed)
cr_merge <- metadata_since_year_df %>%
  select(any_of(c("doi",
                  "title",
                  "published_print",
                  "published_online",
                  "issued",
                  "container_title",
                  "issn",
                  "volume",
                  "issue",
                  "page",
                  "publisher",
                  "language",
                  "isbn",
                  "url",
                  "type",
                  "subject",
                  "reference_count",
                  "is_referenced_by_count",
                  "alternative_id",
                  "author",
                  "pdf_url")))

# CrossRef metadata was retrieved for works with publication year >= my_year;
# however the DOI issued date may be earlier than my_year, could be NA, or may
# have missing month/day info. Normalize so Tableau renders them as dates:
#   NA        -> my_year-01-01
#   "YYYY"    -> YYYY-01-01
#   "YYYY-MM" -> YYYY-MM-01
# FIX: trimws() (base R) replaces trim(), which base R does not provide — the
# old call only resolved to glue::trim() because glue happened to be attached.
# The case_when() also removes the previous temporary issued2 column dance.
jan1date<-paste0(my_year,"-01-01")
cr_merge$issued<-cr_merge$issued %>% replace_na(jan1date)
cr_merge <- cr_merge %>%
  mutate(
    issued = case_when(
      nchar(trimws(issued)) == 7 ~ paste0(issued, "-01"),
      nchar(trimws(issued)) == 4 ~ paste0(issued, "-01-01"),
      TRUE ~ issued
    )
  )


# build an author ORCID ID reference table -----------------------------------------------------
# it will help us fill in blanks later
# Build the author ORCID iD reference table: a lookup of "squashed" author
# names -> ORCID iD plus current affiliation, built from the home-author
# employment data. It is used later to fill in blanks about co-authors.

# Helper: squash a display name into a bare alphanumeric key by turning
# punctuation into spaces and then deleting every space.
squash_name <- function(display_name) {
  display_name %>%
    str_replace_all("[^[:alnum:]]", " ") %>%
    str_replace_all(fixed(" "), "")
}

# add a squashed fullname key (given + family name) for each home author
orcid_person_employment_join$fullname <-
  squash_name(paste(orcid_person_employment_join$given_name,
                    orcid_person_employment_join$family_name))

# affiliation columns carried along with each name key
ref_cols <- c("orcid_identifier_path",
              "department_name",
              "organization_name",
              "organization_address_city",
              "organization_address_region",
              "organization_address_country")

# one row per home author, keyed on the plain given + family name
master_names <- orcid_person_employment_join %>%
  select(any_of(c("fullname", ref_cols))) %>%
  filter(!duplicated(orcid_identifier_path))

# one additional row per author for their ORCID "credit name", when present
credit_names <- orcid_person_employment_join %>%
  filter(!is.na(credit_name)) %>%
  select(any_of(c("credit_name", ref_cols))) %>%
  rename(fullname = credit_name) %>%
  mutate(fullname = squash_name(fullname)) %>%
  filter(!duplicated(orcid_identifier_path))

# stack both name variants into the reference table
names_df <- rbind(master_names, credit_names)


# get co-author information -----------------------------------------------------

# The authors for each DOI in the cr_merge dataframe are a nested list.
# Unnest it so there is one row per (DOI, author), then attach the home
# authors by DOI so each home author / co-author pair gets its own row.
what_auths <- cr_merge %>% unnest(author)

# individual row for each home author and co-author on a DOI
authlist_all <- what_auths %>%
  left_join(orcid_merge, by = "doi")

# when multiple home authors have collaborated on a DOI there will be several
# sets of rows for that DOI - one set for each home author. We keep these
# because we count each home author and all their collaborations, including
# within the institution.
# We DO want to remove rows produced by the join where the home author
# (orcid_identifier) is the same as the co-author (ORCID) - i.e. where
# orcid_identifier == str_sub(ORCID, 18, 37) - AND where the home author and
# co-author names are exactly the same. This will miss slight name variations
# when there is no ORCID iD on the Crossref record (e.g. Bradley Baker vs. Bradley J. Baker)
# add some columns to authlist_all to help with this deduplicating

# co-author ORCID iD, stripped out of the full ORCID URL ("" when absent)
authlist_all$orcid_coauth <- with(authlist_all,
                                  ifelse(is.na(ORCID),'',str_sub(ORCID , 18, 37))
)

# fullname identifier for the home author, stripped of punctuation and whitespace
authlist_all$anchorfullname <- with(authlist_all, paste(given_name,family_name))
authlist_all$anchorfullname <- str_replace_all(authlist_all$anchorfullname, "[^[:alnum:]]", " ")
authlist_all$anchorfullname<-str_replace_all(authlist_all$anchorfullname, fixed(" "), "")

# fullname identifier for the co-author, stripped of punctuation and whitespace
authlist_all$coauthfullname <- with(authlist_all, paste(given,family))
authlist_all$coauthfullname <- str_replace_all(authlist_all$coauthfullname, "[^[:alnum:]]", " ")
authlist_all$coauthfullname<-str_replace_all(authlist_all$coauthfullname, fixed(" "), "")

## create a new df with the identical entries removed:
## rows where the home author is also the "co-author", by ORCID iD or by name
authlist_nodups <- subset(authlist_all, (orcid_identifier != orcid_coauth))
authlist_nodups <- subset(authlist_nodups, (anchorfullname != coauthfullname))

# next, fill in the ORCID iD when a co-author name variation is one that we
# are already aware of and logged in names_df, our author ORCID ID reference table.
# When there are author name variations that we are not aware of, and there is
# no ORCID iD, there is just no way to resolve them, so the occasional row
# where home author and co-author are the same person will persist.

##### Code improvement
# there are many times when we could try to fill in info from the author ORCID ID reference table
# in order to keep refining the data. so it would be good to take this code out and
# put it in a function that we could just call here instead of re-running similar lines of code
##### Code improvement

#### TIME: These joins hang a bit if the lists are very large (e.g. for Temple University's 2022 data [>2700 names], all these joins took ~10 seconds)
# left join to add ORCIDs from our reference table to the author list
my_join <- left_join(authlist_nodups,names_df,by=c("coauthfullname" = "fullname"))

# fill in the joined ORCID where orcid_coauth is blank
my_join[ my_join$orcid_coauth == "", "orcid_coauth" ] <- my_join[ my_join$orcid_coauth == "", "orcid_identifier_path" ]

# this reintroduces NA values into the data frame, so replace those with blanks again
my_join <- my_join %>%
  mutate_at('orcid_coauth', ~replace_na(.,""))

# do another pass to eliminate rows with the same anchor author and co-author ORCID from the ones we just filled in
authlist_nodups <- subset(my_join, (orcid_identifier != orcid_coauth))


# now that we tried to fill in co-author ORCID IDs we can also fill in
# co-author current affiliations and location information that we have in the reference table names_df

# but we have to use a version of the names_df where orcid is unique
orcid_df <- names_df

# remove duplicate orcid rows
orcid_df <- orcid_df[!duplicated(orcid_df$orcid_identifier_path),]

my_join <- left_join(authlist_nodups,orcid_df,by=c("orcid_coauth" = "orcid_identifier_path"))

# fill in the joined location fields where any co-author locations are blank
# (.x columns came from the author list, .y columns from the reference table)
my_join <- my_join %>%
  mutate(department_name.x = coalesce(department_name.x,department_name.y),
         organization_name.x = coalesce(organization_name.x,organization_name.y),
         organization_address_city.x = coalesce(organization_address_city.x,organization_address_city.y),
         organization_address_region.x = coalesce(organization_address_region.x,organization_address_region.y),
         organization_address_country.x = coalesce(organization_address_country.x,organization_address_country.y)
  )

# drop the .y reference-table columns and the join key we no longer need
authlist_nodups <- subset(my_join, select = -c(orcid_identifier_path,department_name.y,organization_name.y, organization_address_city.y, organization_address_region.y, organization_address_country.y))

# now we have authlist_nodups, a dataframe where there is a row for every co-author on a DOI except for the home author duplicate (ideally),
# and each row also includes the home author's name and ORCID ID, and as much info about the co-author as we have so far


# build the output file -----------------------------------------------------

# we eventually want to output a CSV with these columns:
# fname1, lname1, orcid1, affiliation1, org1, city1, region1, country1, fname2, lname2, orcid2, affiliation2, org2, city2, region2, country2, DOI

# create a dataframe with the columns we need
co_authors <- authlist_nodups %>%
  select(any_of(c("doi",
                  "issued",
                  "given_name",
                  "family_name",
                  "orcid_identifier",
                  "given",
                  "family",
                  "orcid_coauth",
                  "affiliation.name",
                  "organization_name.x",
                  "organization_address_city.x",
                  "organization_address_region.x",
                  "organization_address_country.x"
  )))

# rename some columns (suffix 1 = home author, 2 = co-author)
co_authors <- co_authors %>%
  rename(
    fname1 = given_name,
    lname1 = family_name,
    orcid1 = orcid_identifier,
    fname2 = given,
    lname2 = family,
    orcid2 = orcid_coauth,
    affiliation2 = affiliation.name,
    org2 = organization_name.x,
    city2 = organization_address_city.x,
    region2 = organization_address_region.x,
    country2 = organization_address_country.x
  )

# add in
columns of home author info affiliation and location info 627 | # join the info in from our orcid_df reference table 628 | co_authors <- left_join(co_authors,orcid_df,by=c("orcid1" = "orcid_identifier_path")) 629 | 630 | # rename the joined affiliation and location fields for the home author 631 | co_authors <- co_authors %>% 632 | rename( 633 | affiliation1 = department_name, 634 | org1 = organization_name, 635 | city1 = organization_address_city, 636 | region1 = organization_address_region, 637 | country1 = organization_address_country 638 | ) 639 | 640 | # move the columns around 641 | co_authors <- co_authors %>% relocate(affiliation1, org1, city1, region1, country1, .after = orcid1) 642 | 643 | # fill in with static values if there are blanks -- there realy shouldn't be any but just in case 644 | co_authors$org1[co_authors$org1 == "" | co_authors$org1 == " " | is.na(co_authors$org1)]<- anchor_org 645 | co_authors$city1[co_authors$city1 == "" | co_authors$city1 == " " | is.na(co_authors$city1)]<- anchor_city 646 | co_authors$region1[co_authors$region1 == "" | co_authors$region1 == " " | is.na(co_authors$region1)]<- anchor_region 647 | co_authors$country1[co_authors$country1 == "" | co_authors$country1 == " " | is.na(co_authors$country1)]<- anchor_country 648 | 649 | 650 | # though we might have filled in a few pieces of co-author info for some of the co-authors from the same institution above, 651 | # we stil need city, region, and country for many of the co-authors. 
we can try to retrive this if we have the co-authors ORCID ID 652 | # we'll make a unique list of co-author's who have ORCID IDs and get their CURRENT affiliation 653 | # we chose to get their current affiliation because this is the same way we treat home authors 654 | # (they are a home author because of their current affiliation, 655 | # even though they may have published a DOI in the past when affiliated with a different organization) 656 | co_auth_ids <- co_authors$orcid2 657 | co_auth_ids_unduped <- unique(co_auth_ids[co_auth_ids != ""]) 658 | 659 | # if a value in co_auth_ids_unduped gives an error when you try to generate my_co_auths_employment below 660 | # (like that it is locked and cannot be edited) 661 | # remove it from the list by filling in the problem ORCID ID (format XXXX-XXXX-XXXX-XXXX), uncommenting, and running this statement 662 | # then try to generate my_co_auths_employment again 663 | #co_auth_ids_unduped <- co_auth_ids_unduped[ co_auth_ids_unduped != "enter problem ORCID ID here in format XXXX-XXXX-XXXX-XXXX"] 664 | 665 | # get the co-authors employment data from their ORCID profile 666 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. 
for Temple University's 2022 data [>850 IDs], this took ~2 minutes) 667 | my_co_auths_employment <- rorcid::orcid_employments(co_auth_ids_unduped) 668 | 669 | ##### JSON 670 | # you can write the file to json if you want to work with it outside of R 671 | #to_write<-toJSON(my_co_auths_employment, na="null") 672 | #write(to_write,"./data/co_auths_employment.json") 673 | 674 | # read it back in, if necessary 675 | #my_co_auths_employment <- read_json("./data/co_auths_employment.json", simplifyVector = TRUE) 676 | ##### JSON 677 | 678 | # extract the employment data and mutate the dates 679 | my_co_auths_employment_data <- my_co_auths_employment %>% 680 | purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>% 681 | purrr::flatten_dfr() %>% 682 | janitor::clean_names() %>% 683 | dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000), 684 | employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000), 685 | employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000)) 686 | 687 | # clean up column names 688 | names(my_co_auths_employment_data) <- names(my_co_auths_employment_data) %>% 689 | stringr::str_replace(., "employment_summary_", "") %>% 690 | stringr::str_replace(., "source_source_", "") %>% 691 | stringr::str_replace(., "organization_disambiguated_", "") 692 | 693 | # some rows have orcid_path = NA, for these put the ORCID ID back with substring of path 694 | my_co_auths_employment_data <- my_co_auths_employment_data %>% 695 | mutate(orcid_path = coalesce(orcid_path,substring(path,2,20))) 696 | 697 | # get the co-authors' current affiliations 698 | # this will miss co-authors who have no current employment line (with no end date) in their ORCID profile 699 | my_co_auths_employment_data_filtered_current <- my_co_auths_employment_data %>% 700 | dplyr::filter(is.na(end_date_year_value)) 701 | 702 | # some co-authors may 
have multiple "current" affiliations 703 | # seperate out those with no start date year value and those that do have start dates 704 | my_co_auths_current_emp_nodate <- subset(my_co_auths_employment_data_filtered_current, is.na(start_date_year_value)) 705 | my_co_auths_current_emp_date <- subset(my_co_auths_employment_data_filtered_current, !is.na(start_date_year_value)) 706 | 707 | # for those with a start date, choose the row with the most recent year 708 | latest_dates <- my_co_auths_current_emp_date %>% 709 | group_by(orcid_path) %>% 710 | slice(which.max(start_date_year_value)) %>% 711 | arrange(start_date_year_value) 712 | 713 | co_auths_latest_emp <- rbind(my_co_auths_current_emp_nodate,latest_dates) 714 | 715 | # there will STILL be duplicates because of people with a mix of undated and dated ORCID profile employment entries, 716 | # group again and use the latest entry date 717 | co_auths_very_latest_emp <- co_auths_latest_emp %>% 718 | group_by(orcid_path) %>% 719 | slice(which.max(created_date_value)) %>% 720 | arrange(created_date_value) 721 | 722 | # be double sure that we removed duplicate orcid rows 723 | co_auths_very_latest_emp <- co_auths_very_latest_emp[!duplicated(co_auths_very_latest_emp$orcid_path),] 724 | 725 | # for the co-authors that had ORCID profiles and for whom we now have a current employment data point, join them back to the co_authors dataframe 726 | co_authors_full_info <- left_join(co_authors,co_auths_very_latest_emp,by=c("orcid2" = "orcid_path")) 727 | 728 | # If org2, city2, region2, country2 had been NA in the dataframe we are building to output, fill from the joined table fields 729 | co_authors_full_info <- co_authors_full_info %>% 730 | mutate(org2 = coalesce(org2,organization_name), 731 | city2 = coalesce(city2,organization_address_city), 732 | region2 = coalesce(region2,organization_address_region), 733 | country2 = coalesce(country2,organization_address_country) 734 | ) 735 | 736 | # drop some columns we don't need 737 | 
# keep only the output columns (doi through country2), dropping the joined employment columns
co_authors_full_info <- co_authors_full_info %>% select(doi:country2)

##### Code improvement
# from here you could do yet ANOTHER round of recording co-author fullnames and ORCID IDs to the reference dataframe,
# then fill in blanks in the full_info df
# when the code that does that is pulled out into its own function, that won't take a lot of space to do
##### Code improvement

# get rid of NA values
co_authors_full_info[is.na(co_authors_full_info)] <- ""


# clean up US state names so they produce single locations on the Tableau map
# set up a dataframe of state names and abbreviations
# (the id column is the full state name followed by "US", used as the join key)
states_df <- data.frame(state.abb, state.name, paste0(state.name, 'US'))
colnames(states_df) <- c('abb', 'name', 'id')

# Replace spelled-out US state names with their two-letter abbreviations in one
# region/country column pair, then drop the helper columns introduced by the join.
# This logic previously appeared twice (once for region1, once for region2);
# it is factored into a single helper with identical behavior.
abbreviate_us_states <- function(df, region_col, country_col) {
  # build the join key matching states_df$id: full state name + country code
  df$state_key <- paste0(df[[region_col]], df[[country_col]])
  # left join the correct state abbreviation for only US states with the full state name spelled out
  df <- left_join(df, states_df, by = c("state_key" = "id"))
  # overwrite the full state names with the abbreviations where they occur
  df[[region_col]] <- ifelse(is.na(df$abb), df[[region_col]], df$abb)
  # drop the joined columns (state_key, abb, name), keeping only the output columns
  df %>% select(doi:country2)
}

# starting with the home authors' region1, then the co-authors' region2
co_authors_full_info <- abbreviate_us_states(co_authors_full_info, "region1", "country1")
co_authors_full_info <- abbreviate_us_states(co_authors_full_info, "region2", "country2")


# write it to a csv to be visualized
# write the finished co-author dataset where the Tableau workflow expects it
write_csv(co_authors_full_info, "./data/orcid-data.csv")

# Ta da, you should now have a data file to visualize in Tableau

# Before uploading to Tableau, consider cleaning your data file, either manually or using a tool
# like Open Refine (https://openrefine.org/). It will improve the visualization if wordings and spellings
# are standardized, particularly in the organization (org1, org2) and city name (city1, city2) fields.


# ===========================================================================
# File: Rorcid_Crossref_Authors.R
# ===========================================================================

# Script by Olivia Given Castello, based on: https://ciakovx.github.io/rorcid.html
# and 04-rcrossref_metadata.R at https://github.com/ciakovx/fsci2022/tree/main/code
# Retrieves ORCID profile and Crossref metadata for authors from a given institution,
# since a given year, paired with that of the co-authors with whom they collaborated.
# Install and load packages -----------------------------------------------

# you will need to install these packages first, using the following
# if you've already installed them, skip this step
#install.packages('dplyr')
#install.packages('tibble')
#install.packages('tidyr')
#install.packages('purrr')
#install.packages('readr')
#install.packages('stringr')
#install.packages('jsonlite')
#install.packages('lubridate')
#install.packages('ggplot2')
#install.packages('httr')
#install.packages('forcats')
#install.packages('rorcid')
#install.packages('usethis')
#install.packages('anytime')
#install.packages('janitor')
#install.packages('glue')
#install.packages('remotes')
# FIX: install.packages() cannot install from a GitHub "owner/repo" slug;
# the GitHub-hosted packages must be installed with remotes::install_github()
#remotes::install_github("ropensci/rcrossref")
#install.packages('roadoi')
#install.packages('inops')
#install.packages("rdatacite")
#install.packages("data.table")
#remotes::install_github("ropensci/geonames")

# load the packages
library(dplyr)
library(tibble)
library(tidyr)
library(purrr)
library(readr)
library(stringr)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(httr)
library(forcats)
library(usethis)
library(anytime)
library(janitor)
library(glue)
library(rorcid)
library(rcrossref)
library(roadoi)
library(inops)

# remove all objects from the environment to start with a clean slate
# (deliberate in this interactive tutorial script; avoid in code that is source()d)
rm(list = ls())

# Set up orcid / crossref in R environment ------------------------------------------------------------

# if you've already done these steps and set up your bearer token in RStudio
# you can skip to the next section: "set some variables and build the query"

# 1. If you haven’t done so already, create an ORCID account at https://orcid.org/signin.
# 2. In the upper right corner, click your name, then in the drop-down menu, click Developer Tools. Note: In order to access Developer Tools, you must verify your email address.
# 3. If you have not already verified your email address, you will be prompted to do so at this point.
# 4. Click the “Register for the free ORCID public API” button
# 5. Review and agree to the terms of service when prompted.
# 6. Add your name in the Name field, https://www.orcid.org in the Your Website URL field, “Getting public API key” in Description field, and https://www.orcid.org in the redirect URI field. Click the diskette button to save.
# 7. A gray box will appear including your Client ID and Client Secret. In the below code chunk, copy and paste the client ID and the client secret respectively.
# 8. Make sure to leave the quotation marks (e.g. orcid_client_id <- "APP-FDFJKDSLF320SDFF" and orcid_client_secret <- "c8e987sa-0b9c-82ed-91as-1112b24234e").

# copy/paste your client ID from https://orcid.org/developer-tools
orcid_client_id <- "PASTE MY CLIENT ID HERE"

# copy/paste your client secret from https://orcid.org/developer-tools
orcid_client_secret <- "PASTE MY CLIENT SECRET HERE"

# This gets a /read-public scope access token from the ORCID OAuth endpoint
orcid_request <- POST(url = "https://orcid.org/oauth/token",
                      config = add_headers(`Accept` = "application/json",
                                           `Content-Type` = "application/x-www-form-urlencoded"),
                      body = list(grant_type = "client_credentials",
                                  scope = "/read-public",
                                  client_id = orcid_client_id,
                                  client_secret = orcid_client_secret),
                      encode = "form")

# parse the API response body
orcid_response <- content(orcid_request)

# run the following code
print(orcid_response$access_token)

# You will see a string of text print out in your R console.
# Copy that string to the clipboard
# so we can save the token to our R environment
# Run this code:
usethis::edit_r_environ()

# A new window will open in RStudio.
# In this separate R environment page, type the following (except the pound sign):
# ORCID_TOKEN="my-token"
# replace 'my-token' with the access_token you just copied.
# Then press enter to create a new line.
# while we are here, we'll add in our rcrossref credentials
# type crossref_email="name@example.com", using your own email address.
# press enter to create a new line, and leave it blank.
# Press Ctrl + S (Mac: Cmd + S) to save this information to your R environment and close the window.
# You won't see anything happen here because it is just saving the page.

# Click Session > Restart R. Your token should now be saved to your R environment.

# You will now need to rerun all the packages ("library()" commands) above, then return to this line.

# You can confirm this worked by calling orcid_auth(), and it will print the token
rorcid::orcid_auth()


# set some variables and build the query --------------------------------------------------------

# set the working directory where this script is
# a folder called "data" is also expected to be in this directory
setwd("PASTE YOUR WORKING DIRECTORY HERE")

# set the time period of interest: this script will compile collaboration data since Jan 1 of this year.
# replace the YYYY with a 4 digit year.
# the more years of data desired, the longer some portions of this script will take to run
my_year <- YYYY

# set the home institution identifiers
ringgold_id <- "enter your institution's ringgold"
grid_id <- "enter your institution's grid ID"
ror_id <- "enter your institution's ROR ID"
# leave the @ off the email domain, if you want to catch subdomains (e.g. @tuj.temple.edu)
email_domain <- "enter your institution's email domain"
organization_name <- "enter your organization's name"

# Set a short name key word here that you will use to filter for ORCID records from the home institution later
# Keep it short, like the state name (e.g. Oklahoma).
# If you are adding more than one keyword, separate them by a pipe (|)
my_org_keyword <- "enter your institution's keyword"

# set the institution's main location information (for use when precise location info is blank)
anchor_org <- "enter your institution's name"
anchor_city <- "enter your institution's city"
anchor_region <- "enter your institution's state"
anchor_country <- "enter your institution's country"

# create the query
# decide between these two choices:
# 1. to construct a simple query with the ringgold, grid, ROR ids, email domain, an organization name set above
# run this:
my_query <- glue('ringgold-org-id:', ringgold_id,
                 ' OR grid-org-id:', grid_id,
                 ' OR ror-org-id:"', ror_id,
                 '" OR email:*', email_domain,
                 ' OR affiliation-org-name:"', organization_name, '"')

# OR 2. to customize a more complicated query with multiple ringgold, grid, ROR ids, email domains, or organization names
# specify which data you want to pull following this example.
# keep in mind that ROR ID and organization name are strings and need double quotes inside the
# single quotes used here for concatenation
# replace these example lines from Temple University carefully with ones you are interested in
my_query <- glue('ringgold-org-id:', '6558',
                 ' OR ringgold-org-id:', '43297',
                 ' OR ringgold-org-id:', '83908',
                 ' OR grid-org-id:', 'grid.264727.2',
                 ' OR grid-org-id:', 'grid.469246.b',
                 ' OR grid-org-id:', 'grid.460938.0',
                 ' OR ror-org-id:"', 'https://ror.org/00kx1jb78',
                 '" OR ror-org-id:"', 'https://ror.org/04zzmzt85',
                 '" OR ror-org-id:"', 'https://ror.org/03savr706',
                 '" OR email:*', '@temple.edu',
                 ' OR email:*', '@tuj.temple.edu',
                 ' OR affiliation-org-name:"', 'Temple University',
                 '" OR affiliation-org-name:"', 'Temple Ambler',
                 '" OR affiliation-org-name:"', 'Temple Japan', '"')

# get the counts
##### TIME: this may hang a bit if institution has many ORCID ID holders (e.g. for Temple University's data [~3500 IDs], this took a few seconds)
orcid_count <- base::attr(rorcid::orcid(query = my_query),
                          "found")

# create the page vector (the ORCID API returns at most 200 records per request)
my_pages <- seq(from = 0, to = orcid_count, by = 200)

# get the ORCID iDs, one page of 200 at a time
my_orcids <- purrr::map(
  my_pages,
  function(page) {
    print(page)  # progress indicator
    my_orcids <- rorcid::orcid(query = my_query,
                               rows = 200,
                               start = page)
    return(my_orcids)
  })

# put the ORCID iDs into a single tibble
my_orcids_data <- my_orcids %>%
  map_dfr(., as_tibble) %>%
  janitor::clean_names()

##### WRITE/READ CSV uncomment to save this data and read it back in later
#write_csv(my_orcids_data, "./data/my_orcids_data.csv")

# read it back in, if necessary
#my_orcids_data <- read_csv("./data/my_orcids_data.csv", col_types = cols(.default = "c"))
##### WRITE/READ CSV


# get employment data -----------------------------------------------------

# get the employments from the orcid_identifier_path column
##### TIME: be patient, this may take a long time (e.g. for Temple University's data [~3500 IDs], this took ~8 minutes)
my_employment <- rorcid::orcid_employments(my_orcids_data$orcid_identifier_path)

##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
#to_write<-toJSON(my_employment, na="null")
#write(to_write,"./data/employment.json")

# read it back in, if necessary
# (FIX: the read path now matches the write path above; it previously pointed at ./data/processed/)
#my_employment <- read_json("./data/employment.json", simplifyVector = TRUE)
##### WRITE/READ JSON

# extract the employment data and mutate the dates (ORCID returns Unix milliseconds, hence /1000)
my_employment_data <- my_employment %>%
  purrr::map(., purrr::pluck, "affiliation-group", "summaries") %>%
  purrr::flatten_dfr() %>%
  janitor::clean_names() %>%
  dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date/1000),
                employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value/1000),
                employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value/1000))

# clean up the column names
names(my_employment_data) <- names(my_employment_data) %>%
  stringr::str_replace(., "employment_summary_", "") %>%
  stringr::str_replace(., "source_source_", "") %>%
  stringr::str_replace(., "organization_disambiguated_", "")

# view the unique institutions in the organization names columns
# keep in mind this will include all institutions a person has in their employments section
my_organizations <- my_employment_data %>%
  group_by(organization_name) %>%
  count() %>%
  arrange(desc(n))

# filter it with a keyword or set of keywords
# this is the short keyword, or piped set of keywords, set at the top of the script
my_organizations_filtered <- my_organizations %>%
  filter(str_detect(organization_name, my_org_keyword))

# view the variation in organization names by looking at my_organizations_filtered (will open a new tab)
view(my_organizations_filtered)

# filter the dataset to include only the institutions you want
# decide between these two choices:
# 1. to accept any organization listed in my_organizations_filtered, run this:
# (a stray trailing "[]" on the column subset was removed - it was a no-op)
my_employment_data_filtered <- my_employment_data %>%
  dplyr::filter(organization_name %in% my_organizations_filtered$organization_name)

# OR 2. to specify which organization name variations to include, copy and paste them here
# following this example. As you can see there may be messiness in hand-entered organization names.
# replace these example names with the ones you are interested in from your my_organizations_filtered list
my_employment_data_filtered <- my_employment_data %>%
  dplyr::filter(organization_name == "Temple University"
                | organization_name == "Temple University "
                | organization_name == "Temple University Fox School of Business and Management"
                | organization_name == "Temple University, Japan"
                | organization_name == "Temple University Japan"
                | organization_name == "Temple University - Ambler Campus")

# finally, filter to include only people who have NA as the end date
my_employment_data_filtered_current <- my_employment_data_filtered %>%
  dplyr::filter(is.na(end_date_year_value))


# Note that this will give you employment records only.
# In other words, each row represents a single employment record for an individual.
# the name_value variable refers specifically to the name of the person or system
# that wrote the record, NOT the name of the individual.

# To get that, you must first get all the unique ORCID iDs from the dataset:

# There is no distinct value identifying the orcid ID of the person.
# The orcid_path value corresponds to the path of the person who added the employment record
# (which is usually, but not always, the same person)
# Therefore you have to strip out the ORCID iD from the 'path' variable first, put it in its own column, and use it
# We do this using str_sub from the stringr package
# (path looks like "/XXXX-XXXX-XXXX-XXXX/...", so characters 2-20 are the bare 19-character iD)
# While we are at it, we can select and reorder the columns we want to keep
# (any_of() keeps only the listed columns that actually exist, so absent columns do not error)
current_employment_all <- my_employment_data_filtered_current %>%
  mutate(orcid_identifier = str_sub(path, 2, 20)) %>%
  select(any_of(c("orcid_identifier",
                  "organization_name",
                  "organization_address_city",
                  "organization_address_region",
                  "organization_address_country",
                  "organization_identifier",
                  "organization_disambiguated_organization_identifier",
                  "organization_disambiguation_source",
                  "department_name",
                  "role_title",
                  "url_value",
                  "display_index",
                  "visibility",
                  "created_date_value",
                  "start_date_year_value",
                  "start_date_month_value",
                  "start_date_day_value",
                  "end_date_year_value",
                  "end_date_month_value",
                  "end_date_day_value")))

# next, create a new vector unique_orcids that includes only unique ORCID iDs from our filtered dataset
# (na.omit drops records where no iD could be extracted; as.character strips the na.omit attributes)
unique_orcids <- unique(current_employment_all$orcid_identifier) %>%
  na.omit(.) %>%
  as.character()

# then run the following expression to get all biographical information for those iDs.
##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~1.5 minutes)
my_orcid_person <- rorcid::orcid_person(unique_orcids)

# then we construct a data frame from the response.
# See more at https://ciakovx.github.io/rorcid.html#Getting_the_data_into_a_data_frame for this.
323 | my_orcid_person_data <- my_orcid_person %>% { 324 | dplyr::tibble( 325 | given_name = purrr::map_chr(., purrr::pluck, "name", "given-names", "value", .default=NA_character_), 326 | created_date = purrr::map_chr(., purrr::pluck, "name", "created-date", "value", .default=NA_integer_), 327 | last_modified_date = purrr::map_chr(., purrr::pluck, "name", "created-date", "value", .default=NA_character_), 328 | family_name = purrr::map_chr(., purrr::pluck, "name", "family-name", "value", .default=NA_character_), 329 | credit_name = purrr::map_chr(., purrr::pluck, "name", "credit-name", "value", .default=NA_character_), 330 | other_names = purrr::map(., purrr::pluck, "other-names", "other-name", "content", .default=NA_character_), 331 | orcid_identifier_path = purrr::map_chr(., purrr::pluck, "name", "path", .default = NA_character_), 332 | biography = purrr::map_chr(., purrr::pluck, "biography", "content", .default=NA_character_), 333 | researcher_urls = purrr::map(., purrr::pluck, "researcher-urls", "researcher-url", .default=NA_character_), 334 | emails = purrr::map(., purrr::pluck, "emails", "email", "email", .default=NA_character_), 335 | keywords = purrr::map(., purrr::pluck, "keywords", "keyword", "content", .default=NA_character_), 336 | external_ids = purrr::map(., purrr::pluck, "external-identifiers", "external-identifier", .default=NA_character_)) 337 | } %>% 338 | dplyr::mutate(created_date = anytime::anydate(as.double(created_date)/1000), 339 | last_modified_date = anytime::anydate(as.double(last_modified_date)/1000)) 340 | 341 | # Join it back with the employment records so that the employment data now includes organization city, region, country 342 | orcid_person_employment_join <- my_orcid_person_data %>% 343 | left_join(current_employment_all, by = c("orcid_identifier_path" = "orcid_identifier")) 344 | 345 | ##### WRITE/READ CSV uncomment to save this data and read it back in later 346 | #write_csv(orcid_person_employment_join, 
"./data/orcid_employment_file.csv") 347 | 348 | # read it back in, if necessary 349 | #orcid_person_employment_join <- read_csv("./data/orcid_employment_file.csv", col_types = cols(.default = "c")) 350 | ##### WRITE/READ CSV 351 | 352 | 353 | # get works data ----------------------------------------------------- 354 | 355 | # create a vector of unique, unduplicated ORCID IDs from that file 356 | my_orcids <- orcid_person_employment_join %>% 357 | filter(!duplicated(orcid_identifier_path)) %>% 358 | pull(orcid_identifier_path) %>% 359 | na.omit() %>% 360 | as.character() 361 | 362 | # Call the orcid_works function to collect all works associated with each ID 363 | ##### TIME: This may take anywhere from a few seconds to a few minutes (e.g. for Temple University's data [~700 IDs], this took ~2.5 minutes) 364 | my_works <- rorcid::orcid_works(my_orcids) 365 | 366 | ##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later 367 | #to_write<-toJSON(my_works, na="null") 368 | #write(to_write,"./data/my_works.json") 369 | 370 | # read it back in, if necessary 371 | #my_works <- read_json("./data/my_works.json", simplifyVector = TRUE) 372 | ##### WRITE/READ JSON 373 | 374 | # turn the JSON file into a unique data frame by looping through the file, 375 | # extracting ("pluck") the object, bind the rows together with(this is the "_dfr" part of map_dfr) 376 | # then clean column names 377 | # and convert the dates from Unix time to yyyy-mm-dd 378 | my_works_data <- my_works %>% 379 | purrr::map_dfr(pluck, "works") %>% 380 | janitor::clean_names() %>% 381 | dplyr::mutate(created_date_value = anytime::anydate(created_date_value/1000), 382 | last_modified_date_value = anytime::anydate(last_modified_date_value/1000)) 383 | 384 | # we only want to keep works that have an external identifier 385 | # (specifically, a DOI), so we first filter to keep only objects that have an external_id value 386 | # then unnest those: in other words expand to 
# Keep only works that carry at least one external identifier, then unnest so
# that there is one row per work + external-id pair (a single work may be
# linked to a DOI, a PubMed ID, an ISSN, etc.)
my_works_externalIDs <- my_works_data %>%
  dplyr::filter(!purrr::map_lgl(external_ids_external_id, purrr::is_empty)) %>%
  tidyr::unnest(external_ids_external_id) %>%
  clean_names()

# From the unnested identifiers keep only the DOI-type ones -- the DOI is the
# key we will use to look the items up in Crossref. Keep the relevant columns
# and derive two new ones:
#   doi              -- the external_id_value coerced to lower case
#   orcid_identifier -- the bare ORCID ID stripped out of the path variable
dois <- my_works_externalIDs %>%
  filter(external_id_type == "doi") %>%
  select(type, path, title_title_value, external_id_type, external_id_value,
         external_id_relationship, url_value, publication_date_year_value,
         publication_date_month_value, publication_date_day_value,
         journal_title_value) %>%
  mutate(doi = tolower(external_id_value),
         orcid_identifier = str_sub(path, 2, 20))

# There are some duplicated rows here. We cannot simply drop duplicate DOIs,
# because co-authored works legitimately repeat a DOI across authors -- what
# must be unique is the orcid + doi combination. This table collects rows
# where BOTH the ORCID ID and the DOI are duplicated, in case you want to
# review them more closely; below we simply keep the first appearance of each
# unique orcid + doi combination and discard the rest.
dupes <- dois %>%
  get_dupes(orcid_identifier, doi)

# Next we prepare the orcid dataset for merging to publications.
# Keep only the ORCID ID, first name, and last name, drop duplicate IDs
# (first occurrence wins), and rename the ID column for the join.
orcid_empl_merge <- orcid_person_employment_join %>%
  select(orcid_identifier_path, given_name, family_name) %>%
  distinct(orcid_identifier_path, .keep_all = TRUE) %>%
  rename(orcid_identifier = orcid_identifier_path)

# Remove duplicates by building a combined orcid + doi key and keeping only
# its first instance, then join the cleaned name columns back on.
dois_unduped <- dois %>%
  mutate(orcid_doi = paste0(orcid_identifier, doi)) %>%
  filter(!duplicated(orcid_doi)) %>%
  left_join(orcid_empl_merge, by = "orcid_identifier")

##### WRITE/READ CSV uncomment to save this data and read it back in later
#write_csv(dois_unduped, "./data/orcid_dois.csv")

# read it back in, if necessary
#dois_unduped <- read_csv("./data/orcid_dois.csv")
##### WRITE/READ CSV


# get CrossRef data -----------------------------------------------------

# Subset the unduped DOIs to publication year >= my_year; this is the year of
# publication according to the ORCID profile works data.
dois_since_year <- dois_unduped %>%
  filter(publication_date_year_value >= my_year)

# The loop below walks the DOI column, printing each DOI so you can watch it
# progress; warning messages appear for any DOIs not found at CrossRef.
##### TIME This will take a long time for large datasets
# (e.g. for Temple University's 2022 data [800+ DOIs], this took ~6 minutes)
metadata_since_year <- map(dois_since_year$doi, function(z) {
  # print each DOI so you can confirm the loop is progressing;
  # cr_works() warns for any DOI not found at CrossRef
  print(z)
  cr_works(dois = z)
})

##### Code improvement
# Here we could create a similar function that queries DataCite for metadata on the ones that weren't found in CR
# Also, rather than DOIs SINCE a given year, it might be desirable to retrieve data on DOIs from a discrete year,
# or from a time period with specific start and end dates.
##### Code improvement


##### WRITE/READ JSON uncomment to work with this data outside of R or read it back in later
#write_file_path = paste0("./data/metadata_", my_year, ".json")
#to_write <- toJSON(metadata_since_year, pretty = TRUE, na = "null")
#write(to_write, write_file_path)

# read it back in, if necessary
#metadata_since_year <- read_json(write_file_path, simplifyVector = TRUE)
##### WRITE/READ JSON

# Loop through each result, extract ("pluck") the object called "data",
# bind into one data frame (the "dfr" part of map_dfr), clean the names,
# and drop duplicate DOIs.
# FIX: the original `map_dfr(., pluck("data"))` only worked by accident --
# `pluck("data")` evaluates to the string "data", which map_dfr then happens
# to interpret as an extractor. Pass the extractor explicitly instead.
metadata_since_year_df <- metadata_since_year %>%
  map_dfr(pluck, "data") %>%
  clean_names() %>%
  filter(!duplicated(doi))

# We next want to prepare our orcid data frame to merge to the crossref data
# by selecting only the relevant columns.
# Rows with no CrossRef data (like issued from DataCite) are still present
# here; anything published in an earlier time frame has been removed.
orcid_merge <- dois_since_year %>%
  select(orcid_identifier, doi, given_name, family_name)

# Keep only the relevant CrossRef columns; any_of() tolerates columns that a
# given result set happens not to include.
# FIX: "subject" was listed twice; the duplicate entry is removed.
cr_merge <- metadata_since_year_df %>%
  select(any_of(c("doi", "title", "published_print", "published_online",
                  "issued", "container_title", "issn", "volume", "issue",
                  "page", "publisher", "language", "isbn", "url", "type",
                  "subject", "reference_count", "is_referenced_by_count",
                  "alternative_id", "author", "pdf_url")))

# CrossRef metadata was retrieved for works with publication year >= my_year,
# but the DOI "issued" date may be earlier than my_year, may be NA, or may be
# missing month/day information. Normalize so Tableau renders them as dates:
#   NA        -> my_year-01-01
#   "yyyy"    -> yyyy-01-01
#   "yyyy-mm" -> yyyy-mm-01
# FIX: the original called trim(), which is not defined in base R or stringr
# (it would error unless some other package, e.g. gdata, supplied it);
# use base trimws() instead.
jan1date <- paste0(my_year, "-01-01")
cr_merge <- cr_merge %>%
  mutate(issued = replace_na(issued, jan1date),
         issued = case_when(
           nchar(trimws(issued)) == 7 ~ paste0(issued, "-01"),
           nchar(trimws(issued)) == 4 ~ paste0(issued, "-01-01"),
           TRUE ~ issued
         ))


# build an author ORCID ID reference table -----------------------------------------------------
# it will help us fill in blanks later
# as we build a dataframe of full author names paired with their ORCID IDs

# Start from orcid_person_employment_join, the employment data for home
# authors. Build a normalized "fullname" key for each home author: every
# non-alphanumeric character becomes a space and all spaces are then removed,
# so only alphanumeric characters survive and name variants become comparable.
orcid_person_employment_join <- orcid_person_employment_join %>%
  mutate(fullname = paste(given_name, family_name) %>%
           str_replace_all("[^[:alnum:]]", " ") %>%
           str_replace_all(fixed(" "), ""))

# select relevant columns; one row per ORCID ID (first occurrence wins)
master_names <- orcid_person_employment_join %>%
  select(any_of(c("fullname",
                  "orcid_identifier_path",
                  "department_name",
                  "organization_name",
                  "organization_address_city",
                  "organization_address_region",
                  "organization_address_country"))) %>%
  filter(!duplicated(orcid_identifier_path))

# credit_name is an alternate version of an author's name -- give each one its
# own row, normalized under the same "fullname" key
credit_names <- orcid_person_employment_join %>%
  filter(!is.na(credit_name)) %>%
  select(any_of(c("credit_name",
                  "orcid_identifier_path",
                  "department_name",
                  "organization_name",
                  "organization_address_city",
                  "organization_address_region",
                  "organization_address_country"))) %>%
  rename(fullname = credit_name) %>%
  mutate(fullname = fullname %>%
           str_replace_all("[^[:alnum:]]", " ") %>%
           str_replace_all(fixed(" "), "")) %>%
  filter(!duplicated(orcid_identifier_path))

# stack the two frames: this starts our author ORCID ID reference table
names_df <- rbind(master_names, credit_names)


# get co-author information -----------------------------------------------------

# The authors for each DOI in cr_merge sit in a nested list-column. To collect
# information about them we unnest that list, then build home-author /
# co-author pairs and later try to fill in any unknown ORCID and location
# info for the co-authors.

# one row per DOI + author
what_auths <- cr_merge %>% unnest(author)

# left join the home-author list by DOI: each resulting row pairs one home
# author with one author (possibly themselves) on that DOI
authlist_all <- what_auths %>%
  left_join(orcid_merge, by = "doi")

# When multiple home authors collaborated on a DOI there will be one set of
# rows per home author. We keep these, because we count each home author and
# all their collaborations, including within-institution ones.

# We DO want to remove join-produced rows where the home author
# (orcid_identifier) is the same person as the co-author -- i.e. where
# orcid_identifier == str_sub(ORCID, 18, 37) -- AND rows where the home author
# and co-author names are exactly the same. This misses slight name variations
# when the CrossRef record carries no ORCID ID (e.g. Bradley Baker vs. Bradley J. Baker)
# Add helper columns to authlist_all for this de-duplication.

# co-author ORCID ID, extracted from the ORCID URL (blank when absent)
authlist_all$orcid_coauth <- ifelse(is.na(authlist_all$ORCID), "",
                                    str_sub(authlist_all$ORCID, 18, 37))

# Normalized fullname keys (punctuation and whitespace stripped) for the home
# ("anchor") author and for the co-author: every non-alphanumeric character
# becomes a space, then all spaces are removed.
authlist_all <- authlist_all %>%
  mutate(anchorfullname = paste(given_name, family_name) %>%
           str_replace_all("[^[:alnum:]]", " ") %>%
           str_replace_all(fixed(" "), ""),
         coauthfullname = paste(given, family) %>%
           str_replace_all("[^[:alnum:]]", " ") %>%
           str_replace_all(fixed(" "), ""))

# New df with the identical home-author/co-author rows removed, matching
# either on ORCID or on the normalized name
authlist_nodups <- authlist_all %>%
  filter(orcid_identifier != orcid_coauth,
         anchorfullname != coauthfullname)

# Next it would be good to fill in the ORCID when a co-author name variation
# is one we already logged in names_df, our author ORCID ID reference table.
# When a name variation is unknown to us and there is no ORCID ID, there is
# simply no way to resolve it, so the occasional row where home author and
# co-author are the same person will persist.

##### Code improvement
# There are many points where we could fill in info from the author ORCID ID
# reference table to keep refining the data,
# so it would be good to take that code out into a function we could simply
# call here, instead of re-running similar lines of code.
##### Code improvement

#### TIME: These joins hang a bit if the lists are very large (e.g. for Temple University's 2022 data [>2700 names], all these joins took ~10 seconds)
# left join to add ORCIDs from our reference table to the author list,
# matching on the normalized co-author fullname
my_join <- left_join(authlist_nodups, names_df, by = c("coauthfullname" = "fullname"))

# Where orcid_coauth is blank, fill it from the joined reference-table ORCID.
# The join can supply NA (no reference match), so coalesce that back to "".
# FIX: replaces the superseded mutate_at() plus a separate base-R
# subassignment with a single mutate(); behavior is unchanged.
my_join <- my_join %>%
  mutate(orcid_coauth = if_else(orcid_coauth == "",
                                coalesce(orcid_identifier_path, ""),
                                orcid_coauth))

# another pass to drop rows where the home author and co-author ORCID now
# match, using the IDs we just filled in
authlist_nodups <- subset(my_join, (orcid_identifier != orcid_coauth))


# Now that co-author ORCID IDs are filled in where possible, we can also fill
# in co-author current affiliation and location info from the reference table.
# For that we need a version of names_df where each ORCID appears only once.
orcid_df <- names_df

# remove duplicate orcid rows
orcid_df <- orcid_df[!duplicated(orcid_df$orcid_identifier_path), ]

my_join <- left_join(authlist_nodups, orcid_df, by = c("orcid_coauth" = "orcid_identifier_path"))

# fill the joined (.y) location fields into any blank co-author (.x) fields
my_join <- my_join %>%
  mutate(department_name.x = coalesce(department_name.x, department_name.y),
         organization_name.x = coalesce(organization_name.x, organization_name.y),
         organization_address_city.x = coalesce(organization_address_city.x, organization_address_city.y),
         organization_address_region.x = coalesce(organization_address_region.x, organization_address_region.y),
         organization_address_country.x = coalesce(organization_address_country.x, organization_address_country.y))

# drop the joined columns we no longer need
authlist_nodups <- subset(my_join,
                          select = -c(orcid_identifier_path, department_name.y,
                                      organization_name.y, organization_address_city.y,
                                      organization_address_region.y, organization_address_country.y))

# authlist_nodups now has a row for every co-author on a DOI (minus the home
# author duplicate, ideally); each row also carries the home author's name and
# ORCID ID, plus as much info about the co-author as we have so far.


# build the output file -----------------------------------------------------

# The CSV we eventually want has these columns:
# fname1, lname1, orcid1, affiliation1, org1, city1, region1, country1,
# fname2, lname2, orcid2, affiliation2, org2, city2, region2, country2, DOI

# keep the columns we need
co_authors <- authlist_nodups %>%
  select(any_of(c("doi", "issued",
                  "given_name", "family_name", "orcid_identifier",
                  "given", "family", "orcid_coauth",
                  "affiliation.name",
                  "organization_name.x",
                  "organization_address_city.x",
                  "organization_address_region.x",
                  "organization_address_country.x")))

# rename to the output naming scheme (1 = home author, 2 = co-author)
co_authors <- co_authors %>%
  rename(fname1 = given_name,
         lname1 = family_name,
         orcid1 = orcid_identifier,
         fname2 = given,
         lname2 = family,
         orcid2 = orcid_coauth,
         affiliation2 = affiliation.name,
         org2 = organization_name.x,
         city2 = organization_address_city.x,
         region2 = organization_address_region.x,
         country2 = organization_address_country.x)
# Add the home author's affiliation and location info by joining the orcid_df
# reference table on the home author's ORCID, then rename the joined fields
# and move them next to the home author's ID column.
co_authors <- co_authors %>%
  left_join(orcid_df, by = c("orcid1" = "orcid_identifier_path")) %>%
  rename(affiliation1 = department_name,
         org1 = organization_name,
         city1 = organization_address_city,
         region1 = organization_address_region,
         country1 = organization_address_country) %>%
  relocate(affiliation1, org1, city1, region1, country1, .after = orcid1)

# Backstop: home-author location fields really shouldn't be blank, but fill
# any blanks or NAs with the static anchor values just in case.
is_blank <- function(x) x == "" | x == " " | is.na(x)
co_authors$org1[is_blank(co_authors$org1)] <- anchor_org
co_authors$city1[is_blank(co_authors$city1)] <- anchor_city
co_authors$region1[is_blank(co_authors$region1)] <- anchor_region
co_authors$country1[is_blank(co_authors$country1)] <- anchor_country


# Although we may have filled in a few pieces of co-author info above (for
# co-authors from the home institution), many co-authors still lack city,
# region, and country.
# We can try to retrieve the missing location info when we have a co-author's
# ORCID ID. Build a unique list of co-author ORCID IDs and fetch each one's
# CURRENT affiliation. We use the current affiliation because that is how home
# authors are treated (they are home authors by virtue of their current
# affiliation, even if a DOI was published while affiliated elsewhere).
co_auth_ids <- co_authors$orcid2
co_auth_ids_unduped <- unique(co_auth_ids[co_auth_ids != ""])

# If a value in co_auth_ids_unduped raises an error when generating
# my_co_auths_employment below (e.g. a record that is locked and cannot be
# edited), remove it from the list: fill in the problem ORCID ID
# (format XXXX-XXXX-XXXX-XXXX), uncomment, and run this statement, then try
# generating my_co_auths_employment again.
#co_auth_ids_unduped <- co_auth_ids_unduped[ co_auth_ids_unduped != "enter problem ORCID ID here in format XXXX-XXXX-XXXX-XXXX"]

# get the co-authors' employment data from their ORCID profiles
##### TIME: This may take anywhere from a few seconds to a few minutes
# (e.g. for Temple University's 2022 data [>850 IDs], this took ~2 minutes)
my_co_auths_employment <- rorcid::orcid_employments(co_auth_ids_unduped)

##### JSON
# you can write the file to json if you want to work with it outside of R
#to_write <- toJSON(my_co_auths_employment, na = "null")
#write(to_write, "./data/co_auths_employment.json")

# read it back in, if necessary
#my_co_auths_employment <- read_json("./data/co_auths_employment.json", simplifyVector = TRUE)
##### JSON

# Pull the employment summaries out of each profile, flatten them into one
# data frame, tidy the column names, and convert the Unix-millisecond dates
# to yyyy-mm-dd.
my_co_auths_employment_data <- my_co_auths_employment %>%
  purrr::map(purrr::pluck, "affiliation-group", "summaries") %>%
  purrr::flatten_dfr() %>%
  janitor::clean_names() %>%
  dplyr::mutate(employment_summary_end_date = anytime::anydate(employment_summary_end_date / 1000),
                employment_summary_created_date_value = anytime::anydate(employment_summary_created_date_value / 1000),
                employment_summary_last_modified_date_value = anytime::anydate(employment_summary_last_modified_date_value / 1000))

# strip the machine-generated prefixes from the column names
names(my_co_auths_employment_data) <- names(my_co_auths_employment_data) %>%
  stringr::str_replace("employment_summary_", "") %>%
  stringr::str_replace("source_source_", "") %>%
  stringr::str_replace("organization_disambiguated_", "")

# some rows have orcid_path = NA; for these, restore the ORCID ID from a
# substring of the path
my_co_auths_employment_data <- my_co_auths_employment_data %>%
  mutate(orcid_path = coalesce(orcid_path, substring(path, 2, 20)))

# Keep the co-authors' current affiliations, i.e. rows with no end date.
# This misses co-authors whose ORCID profile has no open-ended (current)
# employment line.
my_co_auths_employment_data_filtered_current <- my_co_auths_employment_data %>%
  dplyr::filter(is.na(end_date_year_value))
# Some co-authors have multiple "current" affiliations. Separate the rows
# without a start-date year from those with one...
my_co_auths_current_emp_nodate <- my_co_auths_employment_data_filtered_current %>%
  filter(is.na(start_date_year_value))
my_co_auths_current_emp_date <- my_co_auths_employment_data_filtered_current %>%
  filter(!is.na(start_date_year_value))

# ...and among the dated rows keep, per person, the one with the most recent
# start year
latest_dates <- my_co_auths_current_emp_date %>%
  group_by(orcid_path) %>%
  slice(which.max(start_date_year_value)) %>%
  arrange(start_date_year_value)

co_auths_latest_emp <- rbind(my_co_auths_current_emp_nodate, latest_dates)

# Duplicates can STILL remain for people with a mix of undated and dated
# employment entries; group again and keep the most recently created entry.
co_auths_very_latest_emp <- co_auths_latest_emp %>%
  group_by(orcid_path) %>%
  slice(which.max(created_date_value)) %>%
  arrange(created_date_value)

# be doubly sure that duplicate orcid rows are gone
co_auths_very_latest_emp <- co_auths_very_latest_emp[!duplicated(co_auths_very_latest_emp$orcid_path), ]

# For the co-authors that had ORCID profiles and now have a current-employment
# data point, join that info back onto the co_authors dataframe.
co_authors_full_info <- left_join(co_authors, co_auths_very_latest_emp, by = c("orcid2" = "orcid_path"))

# where org2 / city2 / region2 / country2 were NA, fill from the joined fields
co_authors_full_info <- co_authors_full_info %>%
  mutate(org2 = coalesce(org2, organization_name),
         city2 = coalesce(city2, organization_address_city),
         region2 = coalesce(region2, organization_address_region),
         country2 = coalesce(country2, organization_address_country))

# drop some columns we don't need
co_authors_full_info <- co_authors_full_info %>% select(doi:country2)

##### Code improvement
# From here you could do yet ANOTHER round of recording co-author fullnames
# and ORCID IDs into the reference dataframe, then fill in blanks in the
# full_info df. Once that logic is pulled out into its own function, doing so
# won't take much space.
##### Code improvement

# blank out any remaining NA values
co_authors_full_info[is.na(co_authors_full_info)] <- ""


# Clean up US state names so they produce single locations on the Tableau map.
# Lookup table: full state name + "US" -> two-letter abbreviation.
states_df <- data.frame(abb = state.abb,
                        name = state.name,
                        id = paste0(state.name, "US"))

# Home authors first: build a region+country key (only US states with the full
# state name spelled out will match), overwrite matched full names with the
# abbreviation, then drop the helper columns via the doi:country2 selection.
co_authors_full_info$state1 <- paste0(co_authors_full_info$region1, co_authors_full_info$country1)
co_authors_full_info <- left_join(co_authors_full_info, states_df, by = c("state1" = "id"))
co_authors_full_info$region1 <- ifelse(is.na(co_authors_full_info$abb),
                                       co_authors_full_info$region1,
                                       co_authors_full_info$abb)
co_authors_full_info <- co_authors_full_info %>% select(doi:country2)

# same treatment for region2, the co-authors' US state names
co_authors_full_info$state2 <- paste0(co_authors_full_info$region2, co_authors_full_info$country2)
co_authors_full_info <- left_join(co_authors_full_info, states_df, by = c("state2" = "id"))
co_authors_full_info$region2 <- ifelse(is.na(co_authors_full_info$abb),
                                       co_authors_full_info$region2,
                                       co_authors_full_info$abb)
co_authors_full_info <- co_authors_full_info %>% select(doi:country2)


# write it to a csv to be visualized
write_csv(co_authors_full_info, "./data/orcid-data.csv")

# Ta-da: you should now have a data file ready to visualize in Tableau.

# Before uploading to Tableau, consider cleaning the data file, either
# manually or with a tool like OpenRefine (https://openrefine.org/).
# Standardizing wordings and spellings -- particularly in the organization
# (org1, org2) and city name (city1, city2) fields -- will improve the
# visualization.