├── .github └── workflows │ ├── black.yaml │ ├── bot.yaml │ ├── codemeta2cff.yml │ ├── iga.yaml │ └── pypi-publish.yaml ├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── Uploading_dataset_to_CaltechDATA.ipynb ├── accept.py ├── caltechdata_api ├── __init__.py ├── caltechdata_edit.py ├── caltechdata_write.py ├── cli-documentation-for-users.md ├── cli.py ├── customize_schema.py ├── download_file.py ├── get_files.py ├── get_metadata.py ├── md_to_json.py ├── pictures-documentation │ ├── Interact CLI Step 1(a).png │ ├── Interact CLI Step 1(b).png │ ├── Interact CLI Step 1(c) Test Instance.png │ ├── Interact CLI Step 1(d) Test Instance.png │ ├── Interact CLI Step 2(a).png │ ├── Interact CLI Step 2(b).png │ ├── Interact CLI Step 2(c).png │ ├── README.md │ ├── Step 1.png │ ├── Step 2.png │ ├── Step 3(a).png │ ├── Step 3(b).png │ ├── Step 3(c).png │ ├── Step 4(a).png │ ├── Step 4(b).png │ ├── Step 5.png │ ├── Step 6(a).png │ └── Step 6(b).png ├── utils.py ├── vocabularies.yaml └── vocabularies │ ├── .DS_Store │ ├── date_types.yaml │ ├── description_types.yaml │ ├── identifier_types.yaml │ ├── licenses.csv │ ├── relation_types.yaml │ ├── resource_types.yaml │ ├── roles.yaml │ └── title_types.yaml ├── codemeta.json ├── completed_dois.json ├── edit.py ├── edit_osn.py ├── example.json ├── example_custom.json ├── excluded_dois.json ├── fix_names.py ├── inspect_dois.py ├── logo.gif ├── new_ids.json ├── outdated ├── README.md ├── add_doi_minting_date.py ├── caltechdata_multipart.py ├── edit_all.py ├── edit_all_geo.py ├── edit_all_github.py ├── edit_all_tccon.py ├── edit_files.py ├── edit_tccon.py ├── example_download_and_upload.ipynb ├── get_geo.py ├── test.py ├── test_community.py ├── test_file.py ├── unembargo.py ├── update_thesis_file.py └── write_pilot_phase1.py ├── process_tomograms.py ├── pyproject.toml ├── rdm.json ├── run-tests.sh ├── setup.cfg ├── setup.py ├── templates └── README.md ├── tests ├── bot.py ├── bot_yaml.py ├── conftest.py 
├── data │ ├── caltechdata │ │ ├── 1171.json │ │ ├── 1235.json │ │ ├── 1250.json │ │ ├── 1259.json │ │ ├── 1300.json │ │ ├── 210.json │ │ ├── 266.json │ │ ├── 267.json │ │ ├── 268.json │ │ ├── 283.json │ │ ├── 293.json │ │ ├── 301.json │ │ └── 970.json │ ├── datacite43 │ │ ├── 4yxbs-4mj38.json │ │ ├── asjw8-cd908.json │ │ ├── b2jqz-qdw65.json │ │ ├── cgkcc-ymk88.json │ │ ├── d7mk4-f8t44.json │ │ ├── dks9f-mj878.json │ │ ├── ep884-g0v97.json │ │ ├── f40da-hww21.json │ │ ├── fbdqe-hez98.json │ │ ├── hevaf-20f84.json │ │ ├── hhg7x-hgm42.json │ │ ├── kxjgj-tfk18.json │ │ ├── kxtar-bm759.json │ │ ├── n0y4x-xx706.json │ │ ├── n13wc-zwc92.json │ │ ├── nbtw5-37m55.json │ │ ├── rmzp9-9yx96.json │ │ ├── t15w6-x9q23.json │ │ └── wbty9-bqy29.json │ └── invalid_datacite43 │ │ ├── invalid_metadata_1.json │ │ ├── invalid_metadata_10.json │ │ ├── invalid_metadata_2.json │ │ ├── invalid_metadata_3.json │ │ ├── invalid_metadata_4.json │ │ ├── invalid_metadata_5.json │ │ ├── invalid_metadata_6.json │ │ ├── invalid_metadata_7.json │ │ ├── invalid_metadata_8.json │ │ ├── invalid_metadata_9.json │ │ ├── missing_creators.json │ │ ├── missing_publisher.json │ │ ├── multiple_errors.json │ │ └── type_error_creators.json ├── helpers.py ├── test_download.py ├── test_rdm.py └── test_unit.py ├── token.bash ├── tomogram_ids.json ├── write.py ├── write_authors.py └── write_hte.py /.github/workflows/black.yaml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: psf/black@stable 11 | -------------------------------------------------------------------------------- /.github/workflows/bot.yaml: -------------------------------------------------------------------------------- 1 | name: Bot validation 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | validate-metadata: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - 
uses: actions/checkout@v4 11 | with: 12 | fetch-depth: 0 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.x' 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install pytest requests s3fs cryptography 23 | pip install . 24 | 25 | - name: Run against CaltechData Test system 26 | env: 27 | RDMTOK: ${{ secrets.CALTECHDATA_TOKEN }} 28 | run: | 29 | cd tests 30 | pytest test_unit.py 31 | pytest test_rdm.py 32 | - name: Run Metadata Validation Test and RDM 33 | env: 34 | RDMTOK: ${{ secrets.CALTECHDATA_TOKEN }} 35 | run: | 36 | cd tests 37 | python bot_yaml.py 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/codemeta2cff.yml: -------------------------------------------------------------------------------- 1 | name: CodeMeta2CFF 2 | run-name: Run CodeMeta2CFF after ${{github.event_name}} by ${{github.actor}} 3 | 4 | on: 5 | push: 6 | paths: ['codemeta.json'] 7 | workflow_dispatch: 8 | inputs: 9 | reason: 10 | description: 'Reason' 11 | required: false 12 | default: 'Manual trigger' 13 | 14 | jobs: 15 | CodeMeta2CFF: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | - name: Convert CFF 21 | uses: caltechlibrary/codemeta2cff@main 22 | - name: Install jq for JSON parsing 23 | run: sudo apt-get install -y jq 24 | - name: Parse and update setup.cfg 25 | run: | 26 | # Extract values from codemeta.json 27 | NAME=$(jq -r '.name' codemeta.json) 28 | VERSION=$(jq -r '.version' codemeta.json) 29 | AUTHORS=$(jq -r '[.author[] | .givenName + " " + .familyName] | join(", ")' codemeta.json) 30 | AUTHOR_EMAILS=$(jq -r '[.author[] | .email // empty] | join(", ")' codemeta.json) 31 | DESCRIPTION=$(jq -r '.description' codemeta.json) 32 | URL=$(jq -r '.codeRepository // .url' codemeta.json) 33 | 34 | # Update setup.cfg fields 35 | sed -i "s/^name = .*/name = $NAME/" setup.cfg 36 |
sed -i "s/^version = .*/version = $VERSION/" setup.cfg 37 | sed -i "s/^author = .*/author = $AUTHORS/" setup.cfg 38 | sed -i "s/^author_email = .*/author_email = $AUTHOR_EMAILS/" setup.cfg 39 | sed -i "s/^description = .*/description = $DESCRIPTION/" setup.cfg 40 | sed -i "s|^url = .*|url = $URL|" setup.cfg 41 | - name: Commit CFF 42 | uses: EndBug/add-and-commit@v9 43 | with: 44 | message: 'Add updated CITATION.cff and setup.cfg from codemeta.json file' 45 | add: '["setup.cfg", "CITATION.cff"]' 46 | -------------------------------------------------------------------------------- /.github/workflows/iga.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | INVENIO_SERVER: https://data.caltech.edu 3 | 4 | # These variables are IGA options. Please see the docs for info. 5 | draft: false 6 | all_assets: false 7 | all_metadata: false 8 | community: none 9 | parent_record: "6qhkm-7n074" 10 | debug: false 11 | 12 | # This variable is a setting for post-archiving CodeMeta file updates. 13 | # If you don't have a CodeMeta file, you can remove the add_doi_codemeta 14 | # and Coremeta2CFF jobs at the bottom of this file. 
15 | ref: main 16 | 17 | # ╭────────────────────────────────────────────╮ 18 | # │ The rest of this file should be left as-is │ 19 | # ╰────────────────────────────────────────────╯ 20 | 21 | name: InvenioRDM GitHub Archiver 22 | on: 23 | release: 24 | types: [published] 25 | workflow_dispatch: 26 | inputs: 27 | release_tag: 28 | description: The release tag (empty = latest) 29 | parent_record: 30 | description: ID of parent record (for versioning) 31 | community: 32 | description: Name of InvenioRDM community (if any) 33 | draft: 34 | description: Mark the record as a draft 35 | type: boolean 36 | all_assets: 37 | description: Attach all GitHub assets 38 | type: boolean 39 | all_metadata: 40 | description: Include additional GitHub metadata 41 | type: boolean 42 | debug: 43 | description: Print debug info in the GitHub log 44 | type: boolean 45 | 46 | run-name: Archive ${{inputs.release_tag || 'latest release'}} in InvenioRDM 47 | jobs: 48 | run_iga: 49 | name: Send to ${{needs.get_repository.outputs.server}} 50 | runs-on: ubuntu-latest 51 | needs: get_repository 52 | outputs: 53 | record_doi: ${{steps.iga.outputs.record_doi}} 54 | steps: 55 | - uses: caltechlibrary/iga@v1 56 | id: iga 57 | with: 58 | INVENIO_SERVER: ${{env.INVENIO_SERVER}} 59 | INVENIO_TOKEN: ${{secrets.INVENIO_TOKEN}} 60 | all_assets: ${{github.event.inputs.all_assets || env.all_assets}} 61 | all_metadata: ${{github.event.inputs.all_metadata || env.all_metadata}} 62 | debug: ${{github.event.inputs.debug || env.debug}} 63 | draft: ${{github.event.inputs.draft || env.draft}} 64 | community: ${{github.event.inputs.community || env.community}} 65 | parent_record: ${{github.event.inputs.parent_record || env.parent_record}} 66 | release_tag: ${{github.event.inputs.release_tag || 'latest'}} 67 | get_repository: 68 | name: Get repository name 69 | runs-on: ubuntu-latest 70 | outputs: 71 | server: ${{steps.parse.outputs.host}} 72 | steps: 73 | - name: Extract name from INVENIO_SERVER 74 | id: parse 75 | 
run: echo "host=$(cut -d'/' -f3 <<< ${{env.INVENIO_SERVER}} | cut -d':' -f1)" >> $GITHUB_OUTPUT 76 | add_doi_codemeta: 77 | name: "Add ${{needs.run_iga.outputs.record_doi}} to codemeta.json" 78 | needs: run_iga 79 | runs-on: ubuntu-latest 80 | steps: 81 | - name: Checkout 82 | uses: actions/checkout@v4 83 | with: 84 | ref: ${{ env.ref }} 85 | - name: Install sde 86 | run: pip install sde 87 | - name: Add DOI to CodeMeta File 88 | run: sde identifier ${{needs.run_iga.outputs.record_doi}} codemeta.json 89 | - name: Commit CFF 90 | uses: EndBug/add-and-commit@v9 91 | with: 92 | message: 'Add DOI to codemeta.json file' 93 | add: 'codemeta.json' 94 | CodeMeta2CFF: 95 | runs-on: ubuntu-latest 96 | needs: add_doi_codemeta 97 | steps: 98 | - name: Checkout 99 | uses: actions/checkout@v4 100 | with: 101 | ref: ${{ env.ref }} 102 | - name: Convert CFF 103 | uses: caltechlibrary/codemeta2cff@main 104 | - name: Commit CFF 105 | uses: EndBug/add-and-commit@v9 106 | with: 107 | message: 'Add updated CITATION.cff from codemeta.json file' 108 | add: 'CITATION.cff' 109 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build-n-publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.9 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install setuptools wheel 21 | - name: Build package 22 | run: | 23 | python setup.py sdist bdist_wheel 24 | - name: Publish 25 | uses: pypa/gh-action-pypi-publish@v1.3.1 26 | with: 27 | user: __token__ 28 | password: ${{ secrets.pypi_token }} 29 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | data/ 4 | caltechdata_api.egg-info/ 5 | caltechdata_api/__pycache__/ 6 | tests/__pycache__/ 7 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: caltechdata_api 4 | authors: 5 | - family-names: Morrell 6 | given-names: Thomas E 7 | orcid: https://orcid.org/0000-0001-9266-5146 8 | - family-names: Bhattarai 9 | given-names: Rohan 10 | orcid: https://orcid.org/0009-0007-0323-4733 11 | - family-names: Won 12 | given-names: Elizabeth 13 | orcid: https://orcid.org/0009-0002-2450-6471 14 | - family-names: Abakah 15 | given-names: Alexander A 16 | orcid: https://orcid.org/0009-0003-5640-6691 17 | abstract: Python wrapper for CaltechDATA API. 18 | repository-code: "https://github.com/caltechlibrary/caltechdata_api" 19 | type: software 20 | doi: 10.22002/3gdk4-j5504 21 | version: 1.10.0 22 | license-url: "https://data.caltech.edu/license" 23 | keywords: 24 | - GitHub 25 | - metadata 26 | - software 27 | - InvenioRDM 28 | date-released: 2025-04-07 29 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Contributor Covenant Code of Conduct 2 | ==================================== 3 | 4 | ## Our Pledge 5 | 6 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
7 | 8 | ## Our Standards 9 | 10 | Examples of behavior that contributes to creating a positive environment include: 11 | 12 | * Using welcoming and inclusive language 13 | * Being respectful of differing viewpoints and experiences 14 | * Gracefully accepting constructive criticism 15 | * Focusing on what is best for the community 16 | * Showing empathy towards other community members 17 | 18 | Examples of unacceptable behavior by participants include: 19 | 20 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 21 | * Trolling, insulting/derogatory comments, and personal or political attacks 22 | * Public or private harassment 23 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 24 | * Other conduct which could reasonably be considered inappropriate in a professional setting 25 | 26 | ## Our Responsibilities 27 | 28 | Project contributors are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 29 | 30 | Project contributors have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 31 | 32 | ## Scope 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project contributors. 
35 | 36 | ## Enforcement 37 | 38 | If a contributor engages in harassing behaviour, the project organizers may take any action they deem appropriate, including warning the offender or expelling them from online forums, online project resources, face-to-face meetings, or any other project-related activity or resource. 39 | 40 | If you are being harassed, notice that someone else is being harassed, or have any other concerns, please contact a member of the project team immediately. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 41 | 42 | ## Attribution 43 | 44 | Portions of this Code of Conduct were adapted from Electron's [Contributor Covenant Code of Conduct](https://github.com/electron/electron/blob/master/CODE_OF_CONDUCT.md), which itself was adapted from the [Contributor Covenant](http://contributor-covenant.org/version/1/4), version 1.4. 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Caltech 2 | All rights not granted herein are expressly reserved by Caltech. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CaltechDATA API Python Library 2 | 3 | [![DOI](https://img.shields.io/badge/dynamic/json.svg?label=DOI&query=$.pids.doi.identifier&uri=https://data.caltech.edu/api/records/wfjr5-kw507/versions/latest)](https://data.caltech.edu/records/wfjr5-kw507/latest) 4 | 5 | The `caltechdata_api` Python library provides a convenient interface for interacting with the CaltechDATA API. It allows users to write files, create DataCite 4 standard JSON records, edit existing records, and retrieve metadata from the CaltechDATA repository. 
6 | 7 | ## Features 8 | 9 | ### Writing and Editing Records 10 | - `caltechdata_write`: Writes files and a DataCite 4 standard JSON record to the CaltechDATA repository. 11 | - `caltechdata_edit`: Edits existing records in CaltechDATA. 12 | 13 | ### Metadata Operations 14 | - `get_metadata`: Retrieves metadata from CaltechDATA records. 15 | 16 | ## Requirements 17 | 18 | - Python 3.6+ 19 | 20 | ## Installation 21 | 22 | Install the library via pip: 23 | 24 | ```shell 25 | pip install caltechdata_api 26 | ``` 27 | 28 | ## Examples 29 | 30 | There are some example python scripts in the GitHub repository. 31 | 32 | ### Create a record: 33 | 34 | ```shell 35 | python write.py example.json -fnames logo.gif 36 | # Output: pbkn6-m9y63 (unique identifier) 37 | ``` 38 | > The response will be the unique identifier for the record. You can put this at 39 | the end of a url to visit the record (e.g. 40 | https://data.caltechlibrary.dev/records/pbkn6-m9y63) 41 | 42 | ### Edit a record 43 | Make changes to the example.json file to see a change) 44 | ``` 45 | python edit.py example.json -id pbkn6-m9y63 46 | 10.33569/pbkn6-m9y63 47 | ``` 48 | > The response is the DOI for the record, which includes the unique identifier 49 | for the record in the default configuration. 50 | 51 | ## Using Custom DOIs 52 | Some groups have worked with the library to create custom DOIs. These can be 53 | passed in the metadata like: 54 | 55 | ```shell 56 | python write.py example_custom.json -fnames logo.gif 57 | m6zxz-p4j22 58 | ``` 59 | 60 | And then you can edit with 61 | ``` 62 | python edit.py example_custom.json -id m6zxz-p4j22 63 | 10.5281/inveniordm.1234 64 | ``` 65 | 66 | This returns the custom DOI of the record if it is successful. 67 | 68 | 69 | ## Setup and Authentication 70 | 71 | 1. Acquire a personal access token from your CaltechDATA account (found under "Applications" at the top right of your screen). 72 | 2. Copy the token to a file named token.bash. 73 | 3. 
Load the token in the command line using source token.bash. 74 | 75 | ## Note on Testing 76 | 77 | Only test your application on the test repository (`data.caltechlibrary.dev`). Testing the API on the public 78 | repository will generate junk records that are annoying to delete. 79 | 80 | ## Using the Command Line Interface 81 | 82 | If you would like to interact with the CaltechDATA API using the Command line Interface (CLI), please [see the detailed documentation](https://caltechlibrary.github.io/caltechdata_api/caltechdata_api/cli-documentation-for-users). 83 | -------------------------------------------------------------------------------- /accept.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | from caltechdata_api import caltechdata_accept 3 | 4 | parser = argparse.ArgumentParser( 5 | description="Accept records to a community in the CaltechDATA repository" 6 | ) 7 | parser.add_argument("ids", nargs="*", help="CaltechDATA IDs") 8 | args = parser.parse_args() 9 | 10 | # Get access token set as environment variable with source token.bash 11 | token = os.environ["RDMTOK"] 12 | 13 | production = True 14 | 15 | caltechdata_accept( 16 | args.ids, 17 | token, 18 | production, 19 | ) 20 | print("Completed") 21 | -------------------------------------------------------------------------------- /caltechdata_api/__init__.py: -------------------------------------------------------------------------------- 1 | from .caltechdata_write import ( 2 | caltechdata_write, 3 | write_files_rdm, 4 | add_file_links, 5 | send_to_community, 6 | ) 7 | from .caltechdata_edit import ( 8 | caltechdata_edit, 9 | caltechdata_unembargo, 10 | caltechdata_accept, 11 | ) 12 | from .customize_schema import customize_schema, validate_metadata 13 | from .get_metadata import get_metadata 14 | from .download_file import download_file, download_url 15 | from .utils import humanbytes 16 | from .md_to_json import parse_readme_to_json 17 | 
-------------------------------------------------------------------------------- /caltechdata_api/download_file.py: -------------------------------------------------------------------------------- 1 | import requests, argparse 2 | from tqdm.auto import tqdm 3 | 4 | 5 | def download_url(doi, media_type=None): 6 | """Get a download link for a file listed in the media API for a DataCite DOI""" 7 | api_url = "https://api.datacite.org/dois/" + doi + "/media" 8 | r = requests.get(api_url).json() 9 | data = r["data"] 10 | if media_type == None: 11 | url = data[0]["attributes"]["url"] 12 | else: 13 | for media in data: 14 | if media["attributes"]["mediaType"] == media_type: 15 | url = media["attributes"] 16 | return url 17 | 18 | 19 | def download_file(doi, fname=None, media_type=None): 20 | """Download a file listed in the media API for a DataCite DOI""" 21 | url = download_url(doi, media_type) 22 | r = requests.get(url, stream=True) 23 | # Set file name 24 | if fname == None: 25 | fname = doi.replace("/", "-") 26 | # Download file with progress bar 27 | if r.status_code == 403: 28 | print("File Unavailable") 29 | if "content-length" not in r.headers: 30 | print("Did not get file") 31 | else: 32 | with open(fname, "wb") as f: 33 | total_length = int(r.headers.get("content-length")) 34 | pbar = tqdm(total=int(total_length / 1024), unit="B") 35 | for chunk in r.iter_content(chunk_size=1024): 36 | if chunk: 37 | pbar.update() 38 | f.write(chunk) 39 | return fname 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser( 44 | description="download_file queries the DaiaCite Media API\ 45 | and downloads the file associated with a DOI" 46 | ) 47 | parser.add_argument( 48 | "dois", 49 | nargs="+", 50 | help="The DOI for files to be downloaded", 51 | ) 52 | parser.add_argument( 53 | "-fname", default=None, help="File name to be used for downloaded file" 54 | ) 55 | parser.add_argument( 56 | "-media_type", default=None, help="File (media) type to be 
downloaded" 57 | ) 58 | 59 | args = parser.parse_args() 60 | 61 | for doi in args.dois: 62 | download_file(doi, args.fname, args.media_type) 63 | -------------------------------------------------------------------------------- /caltechdata_api/get_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import requests 3 | 4 | 5 | def get_files(idv, production=True): 6 | # Returns file block 7 | 8 | if production == True: 9 | api_url = "https://data.caltech.edu/api/records/" 10 | else: 11 | api_url = "https://data.caltechlibrary.dev/api/records/" 12 | 13 | r = requests.get(api_url + str(idv) + "/files") 14 | r_data = r.json() 15 | if "message" in r_data: 16 | raise AssertionError( 17 | "id " 18 | + str(idv) 19 | + " expected http status 200, got " 20 | + str(r.status_code) 21 | + " " 22 | + r_data["message"] 23 | ) 24 | if not "entries" in r_data: 25 | raise AssertionError("expected as entries property in response, got " + r_data) 26 | return r_data["entries"] 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser( 31 | description="get_files queries the caltechDATA (Invenio 3) API\ 32 | and returns file information" 33 | ) 34 | parser.add_argument( 35 | "ids", 36 | metavar="ID", 37 | type=str, 38 | nargs="+", 39 | help="The CaltechDATA ID for each record of interest", 40 | ) 41 | parser.add_argument("-test", dest="production", action="store_false") 42 | 43 | args = parser.parse_args() 44 | 45 | production = args.production 46 | 47 | for idv in args.ids: 48 | metadata = get_files(idv, production) 49 | print(metadata) 50 | -------------------------------------------------------------------------------- /caltechdata_api/get_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import json 4 | import os 5 | 6 | import requests 7 | from datacite import schema43 8 | 9 | 10 | def get_metadata( 11 | idv, 12 | 
production=True, 13 | validate=True, 14 | emails=False, 15 | schema="43", 16 | token=False, 17 | authors=False, 18 | ): 19 | # Returns just DataCite metadata or DataCite metadata with emails 20 | 21 | if production == True: 22 | if authors: 23 | url = "https://authors.library.caltech.edu/api/records/" 24 | else: 25 | url = "https://data.caltech.edu/api/records/" 26 | verify = True 27 | else: 28 | if authors: 29 | url = "https://authors.caltechlibrary.dev/api/records/" 30 | else: 31 | url = "https://data.caltechlibrary.dev/api/records/" 32 | verify = True 33 | 34 | if authors: 35 | headers = { 36 | "accept": "application/json", 37 | } 38 | validate = False 39 | else: 40 | headers = { 41 | "accept": "application/vnd.datacite.datacite+json", 42 | } 43 | 44 | if token: 45 | headers["Authorization"] = "Bearer %s" % token 46 | 47 | response = requests.get(url + idv, headers=headers, verify=verify) 48 | if response.status_code != 200: 49 | raise Exception(response.text) 50 | else: 51 | metadata = response.json() 52 | 53 | if validate: 54 | if schema == "43": 55 | try: 56 | assert schema43.validate(metadata) 57 | except AssertionError: 58 | v = schema43.validator.validate(metadata) 59 | errors = sorted(v.iter_errors(instance), key=lambda e: e.path) 60 | for error in errors: 61 | print(error.message) 62 | 63 | return metadata 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser( 68 | description="get_metadata queries the caltechDATA (Invenio 3) API\ 69 | and returns DataCite-compatable metadata" 70 | ) 71 | parser.add_argument( 72 | "ids", 73 | metavar="ID", 74 | type=str, 75 | nargs="+", 76 | help="The CaltechDATA ID for each record of interest", 77 | ) 78 | parser.add_argument("-test", dest="production", action="store_false") 79 | parser.add_argument("-authors", dest="authors", action="store_true") 80 | parser.add_argument("-xml", dest="save_xml", action="store_true") 81 | parser.add_argument( 82 | "-skip_validate", 83 | dest="skip_validate", 84 
| action="store_true", 85 | help="skip validation of metadata", 86 | ) 87 | parser.add_argument("-schema", default="43", help="Schema Version") 88 | 89 | args = parser.parse_args() 90 | 91 | production = args.production 92 | schema = args.schema 93 | authors = args.authors 94 | skip_validate = args.skip_validate 95 | if skip_validate: 96 | validate = False 97 | else: 98 | validate = True 99 | 100 | for idv in args.ids: 101 | metadata = get_metadata( 102 | idv, production, validate, schema=schema, authors=authors 103 | ) 104 | outfile = open(str(idv) + ".json", "w") 105 | outfile.write(json.dumps(metadata, indent=4)) 106 | outfile.close() 107 | if args.save_xml == True: 108 | xml = schema40.tostring(metadata) 109 | outfile = open(str(idv) + ".xml", "w", encoding="utf8") 110 | outfile.write(xml) 111 | -------------------------------------------------------------------------------- /caltechdata_api/md_to_json.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import requests 4 | 5 | 6 | class ReadmeFormatException(Exception): 7 | """Custom exception for errors in the README format.""" 8 | 9 | 10 | def camel_case(s): 11 | """Converts a string to camelCase.""" 12 | s = re.sub(r"(\s|_|-)+", " ", s).title().replace(" ", "") 13 | return s[0].lower() + s[1:] if s else "" 14 | 15 | 16 | def expand_special_keys(key, value): 17 | """Expand special keys into their structured format (affiliation, nameIdentifiers).""" 18 | if key == "affiliation": 19 | if "ror.org" not in value: 20 | raise ValueError("Affiliation Identifier is not a ROR") 21 | ror = value.split("ror.org/")[1].split("]")[0] 22 | response = requests.get(f"https://api.ror.org/organizations/{ror}").json() 23 | return [ 24 | { 25 | "affiliationIdentifier": ror, 26 | "affiliationIdentifierScheme": "ROR", 27 | "name": response["name"], 28 | } 29 | ] 30 | elif key == "nameIdentifiers": 31 | orcid = value.split("orcid.org/")[1].split("]")[0] 32 | return [ 
33 | { 34 | "nameIdentifier": orcid, 35 | "nameIdentifierScheme": "ORCID", 36 | "schemeUri": f"https://orcid.org/{value}", 37 | } 38 | ] 39 | return value 40 | 41 | 42 | def parse_readme_to_json(readme_path): 43 | try: 44 | with open(readme_path, "r") as file: 45 | lines = file.read().split("\n") 46 | except IOError as e: 47 | raise ReadmeFormatException(f"Failed to open or read the file: {e}") 48 | 49 | json_data = {} 50 | current_section = None 51 | current_object = {} 52 | 53 | title_line = lines.pop(0) 54 | if title_line.startswith("#") == False: 55 | raise ValueError('README.md needs to start with "# Title"') 56 | else: 57 | json_data["titles"] = [{"title": title_line.replace("# ", "")}] 58 | 59 | contributors = [] 60 | identifiers = [] 61 | item_list = [] 62 | 63 | section_pattern = re.compile(r"^##\s+(.*)$") 64 | key_value_pattern = re.compile(r"^-\s+(.*?):\s+(.*)$") 65 | link_pattern = re.compile(r"\[.*?\]\((.*?)\)") 66 | 67 | for line_number, line in enumerate(lines, 1): 68 | if not line.strip(): 69 | if item_list and current_section: 70 | json_data[current_section] = item_list 71 | item_list = [] 72 | elif current_object and current_section: 73 | if current_section == "types": 74 | json_data[current_section] = current_object 75 | elif len(current_object) == 1: 76 | key, value = next(iter(current_object.items())) 77 | if key in ["language", "publicationYear", "publisher", "version"]: 78 | json_data[current_section] = value 79 | else: 80 | json_data[current_section].append(current_object) 81 | elif current_section in ["creators", "contributors"]: 82 | contributors.append(current_object) 83 | current_object = {} 84 | elif current_section == "identifiers": 85 | identifiers.append(current_object) 86 | current_object = {} 87 | else: 88 | json_data[current_section].append(current_object) 89 | current_object = {} 90 | continue 91 | 92 | section_match = section_pattern.match(line) 93 | if section_match: 94 | if item_list: 95 | json_data[current_section] = 
item_list 96 | elif current_object: 97 | if current_section in json_data: 98 | if isinstance(json_data[current_section], list): 99 | json_data[current_section].append(current_object) 100 | elif isinstance(json_data[current_section], dict): 101 | json_data[current_section].update(current_object) 102 | else: 103 | json_data[current_section] = ( 104 | [current_object] 105 | if current_section != "types" 106 | else current_object 107 | ) 108 | current_object = {} 109 | 110 | elif contributors and current_section in ["creators", "contributors"]: 111 | json_data[current_section] = contributors 112 | contributors = [] 113 | elif identifiers and current_section == "identifiers": 114 | json_data[current_section] = identifiers 115 | identifiers = [] 116 | 117 | elif current_section and current_object: 118 | if current_section == "types": 119 | json_data[current_section] = current_object 120 | elif len(current_object) == 1: 121 | key, value = next(iter(current_object.items())) 122 | if key in ["language", "publicationYear", "publisher", "version"]: 123 | json_data[current_section].append(value) 124 | else: 125 | json_data[current_section].append(current_object) 126 | else: 127 | json_data[current_section].append(current_object) 128 | current_object = {} 129 | current_section = camel_case(section_match.group(1)) 130 | json_data[current_section] = [] if current_section != "types" else {} 131 | continue 132 | 133 | key_value_match = key_value_pattern.match(line) 134 | if key_value_match and current_section: 135 | key, value = key_value_match.groups() 136 | key = camel_case(key) 137 | 138 | if key in ["affiliation", "nameIdentifiers"]: 139 | value = expand_special_keys(key, value) 140 | elif ( 141 | key == "nameType" 142 | and current_object 143 | and current_section in ["creators", "contributors"] 144 | ): 145 | contributors.append(current_object) 146 | current_object = {} 147 | elif current_section in ["subjects"]: 148 | item_list.append({key: value}) 149 | elif current_section 
== "dates": 150 | if key == "date": 151 | current_object["date"] = value 152 | elif key == "dateType": 153 | current_object["dateType"] = value 154 | item_list.append(current_object) 155 | current_object = {} 156 | else: 157 | link_match = link_pattern.search(value) 158 | if link_match: 159 | value = link_match.group(1) 160 | 161 | current_object[key] = value 162 | 163 | elif line.strip() and not section_match: 164 | raise ReadmeFormatException( 165 | f"Incorrect format detected at line {line_number}: {line}" 166 | ) 167 | 168 | if contributors and current_section in ["creators", "contributors"]: 169 | json_data[current_section] = contributors 170 | elif identifiers and current_section == "identifiers": 171 | json_data[current_section] = identifiers 172 | elif current_section and current_object: 173 | if current_section == "types": 174 | json_data[current_section] = current_object 175 | elif len(current_object) == 1: 176 | key, value = next(iter(current_object.items())) 177 | if key in ["language", "publicationYear", "publisher", "version"]: 178 | json_data[current_section].append(value) 179 | else: 180 | json_data[current_section].append(current_object) 181 | else: 182 | json_data[current_section].append(current_object) 183 | 184 | return json_data 185 | 186 | 187 | if __name__ == "__main__": 188 | readme_path = "/Users/elizabethwon/downloads/exampleREADME.md" 189 | try: 190 | json_data = parse_readme_to_json(readme_path) 191 | output_json_path = "output1.json" 192 | with open(output_json_path, "w") as json_file: 193 | json.dump(json_data, json_file, indent=4) 194 | print(f"Converted JSON saved to {output_json_path}") 195 | except ReadmeFormatException as e: 196 | print(f"Error parsing README file: {e}") 197 | -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 1(a).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 1(a).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 1(b).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 1(b).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 1(c) Test Instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 1(c) Test Instance.png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 1(d) Test Instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 1(d) Test Instance.png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 2(a).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 2(a).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 2(b).png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 2(b).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Interact CLI Step 2(c).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Interact CLI Step 2(c).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/README.md: -------------------------------------------------------------------------------- 1 | This subfolder is created to store the pictures for documentation 2 | -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 1.png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 2.png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 3(a).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 3(a).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 3(b).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 3(b).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 3(c).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 3(c).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 4(a).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 4(a).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 4(b).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/caltechdata_api/pictures-documentation/Step 4(b).png -------------------------------------------------------------------------------- /caltechdata_api/pictures-documentation/Step 5.png: -------------------------------------------------------------------------------- 
# Public domain by Mitch McMabers

from typing import List, Union

METRIC_LABELS: List[str] = ["B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
BINARY_LABELS: List[str] = ["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"]
PRECISION_OFFSETS: List[float] = [0.5, 0.05, 0.005, 0.0005]  # PREDEFINED FOR SPEED.
PRECISION_FORMATS: List[str] = [
    "{}{:.0f} {}",
    "{}{:.1f} {}",
    "{}{:.2f} {}",
    "{}{:.3f} {}",
]  # PREDEFINED FOR SPEED.


def humanbytes(num: Union[int, float], metric: bool = True, precision: int = 1) -> str:
    """
    Human-readable formatting of bytes, using binary (powers of 1024)
    or metric (powers of 1000) representation.
    """

    assert isinstance(num, (int, float)), "num must be an int or float"
    assert isinstance(metric, bool), "metric must be a bool"
    assert (
        isinstance(precision, int) and 0 <= precision <= 3
    ), "precision must be an int (range 0-3)"

    labels = METRIC_LABELS if metric else BINARY_LABELS
    step = 1000 if metric else 1024
    # A unit is only kept while the value, once rounded for display, would
    # still print below the step: e.g. 1023.96 B at one decimal would render
    # as "1024.0 B", so it must be promoted to the next unit instead.
    cutoff = step - PRECISION_OFFSETS[precision]

    sign = "-" if num < 0 else ""
    value = abs(num)  # track the sign separately; format it back in at the end

    for label in labels[:-1]:
        if value < cutoff:
            break
        # Each division pushes the accumulated floating-point drift further
        # below the displayed digits, so it never shows up in the output.
        value /= step
    else:
        # Ran out of units: report in the largest one, however large.
        label = labels[-1]

    return PRECISION_FORMATS[precision].format(sign, value, label)


if __name__ == "__main__":
    # Visual smoke test; note metric (powers of 1000) is the default.
    print(humanbytes(2251799813685247))  # "2.3 PB"
    print(humanbytes(2000000000000000, True))  # "2.0 PB"
    print(humanbytes(1099511627776))  # "1.1 TB"
    print(humanbytes(1000000000000, True))  # "1.0 TB"
    print(humanbytes(1000000000, True))  # "1.0 GB"
    print(humanbytes(4318498233, precision=3))  # "4.318 GB"
    print(humanbytes(4318498233, True, 3))  # "4.318 GB"
    print(humanbytes(-4318498233, precision=2))  # "-4.32 GB"
-------------------------------------------------------------------------------- 1 | - id: accepted 2 | props: 3 | datacite: Accepted 4 | title: 5 | en: Accepted 6 | - id: available 7 | props: 8 | datacite: Available 9 | title: 10 | en: Available 11 | - id: collected 12 | props: 13 | datacite: Collected 14 | title: 15 | en: Collected 16 | - id: copyrighted 17 | props: 18 | datacite: Copyrighted 19 | title: 20 | en: Copyrighted 21 | - id: created 22 | props: 23 | datacite: Created 24 | title: 25 | en: Created 26 | - id: issued 27 | props: 28 | datacite: Issued 29 | title: 30 | en: Issued 31 | - id: other 32 | props: 33 | datacite: Other 34 | title: 35 | en: Other 36 | - id: submitted 37 | props: 38 | datacite: Submitted 39 | title: 40 | en: Submitted 41 | - id: updated 42 | props: 43 | datacite: Updated 44 | title: 45 | en: Updated 46 | - id: valid 47 | props: 48 | datacite: Valid 49 | title: 50 | en: Valid 51 | - id: withdrawn 52 | props: 53 | datacite: Withdrawn 54 | title: 55 | en: Withdrawn 56 | -------------------------------------------------------------------------------- /caltechdata_api/vocabularies/description_types.yaml: -------------------------------------------------------------------------------- 1 | - id: abstract 2 | props: 3 | datacite: Abstract 4 | title: 5 | en: Abstract 6 | - id: methods 7 | props: 8 | datacite: Methods 9 | title: 10 | en: Methods 11 | - id: series-information 12 | props: 13 | datacite: SeriesInformation 14 | title: 15 | en: Series information 16 | - id: table-of-contents 17 | props: 18 | datacite: TableOfContents 19 | title: 20 | en: Table of contents 21 | - id: technical-info 22 | props: 23 | datacite: TechnicalInfo 24 | title: 25 | en: Technical info 26 | - id: other 27 | props: 28 | datacite: Other 29 | title: 30 | en: Other 31 | # Not really a datacite mapping, but needed to support passing custom types 32 | - id: files 33 | props: 34 | datacite: files 35 | 
-------------------------------------------------------------------------------- /caltechdata_api/vocabularies/identifier_types.yaml: -------------------------------------------------------------------------------- 1 | - id: ark 2 | props: 3 | datacite: ARK 4 | title: 5 | en: ARK 6 | - id: arxiv 7 | props: 8 | datacite: arXiv 9 | title: 10 | en: arXiv 11 | - id: bibcode 12 | props: 13 | datacite: bibcode 14 | title: 15 | en: Bibcode 16 | - id: doi 17 | props: 18 | datacite: DOI 19 | title: 20 | en: DOI 21 | - id: ean13 22 | props: 23 | datacite: EAN13 24 | title: 25 | en: EAN13 26 | - id: eissn 27 | props: 28 | datacite: EISSN 29 | title: 30 | en: EISSN 31 | - id: handle 32 | props: 33 | datacite: Handle 34 | title: 35 | en: Handle 36 | - id: igsn 37 | props: 38 | datacite: IGSN 39 | title: 40 | en: IGSN 41 | - id: isbn 42 | props: 43 | datacite: ISBN 44 | title: 45 | en: ISBN 46 | - id: issn 47 | props: 48 | datacite: ISSN 49 | title: 50 | en: ISSN 51 | - id: istc 52 | props: 53 | datacite: ISTC 54 | title: 55 | en: ISTC 56 | - id: lissn 57 | props: 58 | datacite: LISSN 59 | title: 60 | en: LISSN 61 | - id: lsid 62 | props: 63 | datacite: LSID 64 | title: 65 | en: LSID 66 | - id: pmid 67 | props: 68 | datacite: PMID 69 | title: 70 | en: PMID 71 | - id: purl 72 | props: 73 | datacite: PURL 74 | title: 75 | en: PURL 76 | - id: upc 77 | props: 78 | datacite: UPC 79 | title: 80 | en: UPC 81 | - id: url 82 | props: 83 | datacite: URL 84 | title: 85 | en: URL 86 | - id: urn 87 | props: 88 | datacite: URN 89 | title: 90 | en: URN 91 | - id: w3id 92 | props: 93 | datacite: w3id 94 | title: 95 | en: W3ID 96 | - id: cdid 97 | props: 98 | datacite: cdid 99 | title: 100 | en: CALTECHDATA_ID 101 | - id: tiltid 102 | props: 103 | datacite: tiltid 104 | title: 105 | en: TILT_SERIES_ID 106 | - id: dsa-110-id 107 | props: 108 | datacite: dsa-110-id 109 | title: 110 | en: DSA_110_ID 111 | -------------------------------------------------------------------------------- 
/caltechdata_api/vocabularies/relation_types.yaml: -------------------------------------------------------------------------------- 1 | - id: iscitedby 2 | props: 3 | datacite: IsCitedBy 4 | title: 5 | en: Is cited by 6 | - id: cites 7 | props: 8 | datacite: Cites 9 | title: 10 | en: Cites 11 | - id: issupplementto 12 | props: 13 | datacite: IsSupplementTo 14 | title: 15 | en: Is supplement to 16 | - id: issupplementedby 17 | props: 18 | datacite: IsSupplementedBy 19 | title: 20 | en: Is supplemented by 21 | - id: iscontinuedby 22 | props: 23 | datacite: IsContinuedBy 24 | title: 25 | en: Is continued by 26 | - id: continues 27 | props: 28 | datacite: Continues 29 | title: 30 | en: Continues 31 | - id: isdescribedby 32 | props: 33 | datacite: IsDescribedBy 34 | title: 35 | en: Is described by 36 | - id: describes 37 | props: 38 | datacite: Describes 39 | title: 40 | en: Describes 41 | - id: hasversion 42 | props: 43 | datacite: HasVersion 44 | title: 45 | en: Has version 46 | - id: isversionof 47 | props: 48 | datacite: IsVersionOf 49 | title: 50 | en: Is version of 51 | - id: isnewversionof 52 | props: 53 | datacite: IsNewVersionOf 54 | title: 55 | en: Is new version of 56 | - id: ispreviousversionof 57 | props: 58 | datacite: IsPreviousVersionOf 59 | title: 60 | en: Is previous version of 61 | - id: ispartof 62 | props: 63 | datacite: IsPartOf 64 | title: 65 | en: Is part of 66 | - id: haspart 67 | props: 68 | datacite: HasPart 69 | title: 70 | en: HasPart 71 | - id: isreferencedby 72 | props: 73 | datacite: IsReferencedBy 74 | title: 75 | en: Is referenced by 76 | - id: references 77 | props: 78 | datacite: References 79 | title: 80 | en: References 81 | - id: isdocumentedby 82 | props: 83 | datacite: IsDocumentedBy 84 | title: 85 | en: Is documented by 86 | - id: documents 87 | props: 88 | datacite: Documents 89 | title: 90 | en: Documents 91 | - id: iscompiledby 92 | props: 93 | datacite: IsCompiledBy 94 | title: 95 | en: Is compiled by 96 | - id: compiles 97 
| props: 98 | datacite: Compiles 99 | title: 100 | en: Compiles 101 | - id: isvariantformof 102 | props: 103 | datacite: IsVariantFormOf 104 | title: 105 | en: Is variant form of 106 | - id: isoriginalformof 107 | props: 108 | datacite: IsOriginalFormOf 109 | title: 110 | en: Is original form of 111 | - id: isidenticalto 112 | props: 113 | datacite: IsIdenticalTo 114 | title: 115 | en: Is identical to 116 | - id: isreviewedby 117 | props: 118 | datacite: IsReviewedBy 119 | title: 120 | en: Is reviewed by 121 | - id: reviews 122 | props: 123 | datacite: Reviews 124 | title: 125 | en: Reviews 126 | - id: isderivedfrom 127 | props: 128 | datacite: IsDerivedFrom 129 | title: 130 | en: Is derived from 131 | - id: issourceof 132 | props: 133 | datacite: IsSourceOf 134 | title: 135 | en: Is source of 136 | - id: isrequiredby 137 | props: 138 | datacite: IsRequiredBy 139 | title: 140 | en: Is required by 141 | - id: requires 142 | props: 143 | datacite: Requires 144 | title: 145 | en: Requires 146 | - id: isobsoletedby 147 | props: 148 | datacite: IsObsoletedBy 149 | title: 150 | en: Is obsoleted by 151 | - id: obsoletes 152 | props: 153 | datacite: Obsoletes 154 | title: 155 | en: Obsoletes 156 | -------------------------------------------------------------------------------- /caltechdata_api/vocabularies/roles.yaml: -------------------------------------------------------------------------------- 1 | - id: contactperson 2 | props: 3 | datacite: ContactPerson 4 | title: 5 | en: Contact person 6 | - id: datacollector 7 | props: 8 | datacite: DataCollector 9 | title: 10 | en: Data collector 11 | - id: datacurator 12 | props: 13 | datacite: DataCurator 14 | title: 15 | en: Data curator 16 | - id: datamanager 17 | props: 18 | datacite: DataManager 19 | title: 20 | en: Data manager 21 | - id: distributor 22 | props: 23 | datacite: Distributor 24 | title: 25 | en: Distributor 26 | - id: editor 27 | props: 28 | datacite: Editor 29 | title: 30 | en: Editor 31 | - id: 
hostinginstitution 32 | props: 33 | datacite: HostingInstitution 34 | title: 35 | en: Hosting institution 36 | - id: producer 37 | props: 38 | datacite: Producer 39 | title: 40 | en: Producer 41 | - id: projectleader 42 | props: 43 | datacite: ProjectLeader 44 | title: 45 | en: Project leader 46 | - id: projectmanager 47 | props: 48 | datacite: ProjectManager 49 | title: 50 | en: Project manager 51 | - id: projectmember 52 | props: 53 | datacite: ProjectMember 54 | title: 55 | en: Project member 56 | - id: registrationagency 57 | props: 58 | datacite: RegistrationAgency 59 | title: 60 | en: Registration agency 61 | - id: registrationauthority 62 | props: 63 | datacite: RegistrationAuthority 64 | title: 65 | en: Registration authority 66 | - id: relatedperson 67 | props: 68 | datacite: RelatedPerson 69 | title: 70 | en: Related person 71 | - id: researcher 72 | props: 73 | datacite: Researcher 74 | title: 75 | en: Researcher 76 | - id: researchgroup 77 | props: 78 | datacite: ResearchGroup 79 | title: 80 | en: Research group 81 | - id: rightsholder 82 | props: 83 | datacite: RightsHolder 84 | title: 85 | en: Rights holder 86 | - id: sponsor 87 | props: 88 | datacite: Sponsor 89 | title: 90 | en: Sponsor 91 | - id: supervisor 92 | props: 93 | datacite: Supervisor 94 | title: 95 | en: Supervisor 96 | - id: workpackageleader 97 | props: 98 | datacite: WorkPackageLeader 99 | title: 100 | en: Work package leader 101 | - id: other 102 | props: 103 | datacite: Other 104 | title: 105 | en: Other 106 | -------------------------------------------------------------------------------- /caltechdata_api/vocabularies/title_types.yaml: -------------------------------------------------------------------------------- 1 | - id: alternative-title 2 | props: 3 | datacite: AlternativeTitle 4 | title: 5 | en: Alternative title 6 | - id: subtitle 7 | props: 8 | datacite: Subtitle 9 | title: 10 | en: Subtitle 11 | - id: translated-title 12 | props: 13 | datacite: TranslatedTitle 14 | title: 
15 | en: Translated title 16 | - id: other 17 | props: 18 | datacite: Other 19 | title: 20 | en: Other 21 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://doi.org/10.5063/schema/codemeta-2.0", 3 | "@type": "SoftwareSourceCode", 4 | "description": "Python wrapper for CaltechDATA API.", 5 | "name": "caltechdata_api", 6 | "codeRepository": "https://github.com/caltechlibrary/caltechdata_api", 7 | "issueTracker": "https://github.com/caltechlibrary/caltechdata_api/issues", 8 | "license": "https://data.caltech.edu/license", 9 | "version": "1.10.0", 10 | "author": [ 11 | { 12 | "@type": "Person", 13 | "givenName": "Thomas E", 14 | "familyName": "Morrell", 15 | "affiliation": { 16 | "@type": "Organization", 17 | "name": "Caltech Library" 18 | }, 19 | "email": "tmorrell@caltech.edu", 20 | "@id": "https://orcid.org/0000-0001-9266-5146" 21 | }, 22 | { 23 | "@type": "Person", 24 | "givenName": "Rohan", 25 | "familyName": "Bhattarai", 26 | "affiliation": { 27 | "@type": "Organization", 28 | "name": "Caltech" 29 | }, 30 | "@id": "https://orcid.org/0009-0007-0323-4733" 31 | }, 32 | { 33 | "@type": "Person", 34 | "givenName": "Elizabeth", 35 | "familyName": "Won", 36 | "affiliation": { 37 | "@type": "Organization", 38 | "name": "Caltech" 39 | }, 40 | "@id": "https://orcid.org/0009-0002-2450-6471" 41 | }, 42 | { 43 | "@type": "Person", 44 | "givenName": "Alexander A", 45 | "familyName": "Abakah", 46 | "affiliation": { 47 | "@type": "Organization", 48 | "name": "Caltech Library" 49 | }, 50 | "email": "aabakah@caltech.edu", 51 | "@id": "https://orcid.org/0009-0003-5640-6691" 52 | } 53 | ], 54 | "developmentStatus": "active", 55 | "downloadUrl": "https://github.com/caltechlibrary/caltechdata_api/archive/1.9.1.zip", 56 | "keywords": [ 57 | "GitHub", 58 | "metadata", 59 | "software", 60 | "InvenioRDM" 61 | ], 62 | "maintainer": 
# --- edit.py -----------------------------------------------------------------
# Edit an existing CaltechDATA record: update metadata and/or attach files.

import argparse, os, json
from caltechdata_api import caltechdata_edit

parser = argparse.ArgumentParser(
    description="Write files and a DataCite 4 standard json record\
 to CaltechDATA repository"
)
parser.add_argument(
    "json_file",
    nargs="?",
    default=None,
    help="file name for json DataCite metadata file",
)
parser.add_argument("-id", help="CaltechDATA IDs")
parser.add_argument("-fnames", nargs="*", help="New Files")
parser.add_argument("-flinks", nargs="*", help="New File Links")
parser.add_argument("-schema", default="43", help="Metadata Schema")
parser.add_argument("-authors", action="store_true", help="Edit CaltechAUTHORS")
args = parser.parse_args()

# Get access token set as environment variable with source token.bash
token = os.environ["RDMTOK"]

if args.json_file:
    # BUG FIX: use a context manager so the handle is closed even on a
    # JSON parse error (the old code never closed the file).
    with open(args.json_file, "r") as metaf:
        metadata = json.load(metaf)
else:
    metadata = {}

production = True
publish = True

response = caltechdata_edit(
    args.id,
    metadata,
    token,
    args.fnames,
    production,
    args.schema,
    publish,
    args.flinks,
    authors=args.authors,
)
print(response)

# --- edit_osn.py -------------------------------------------------------------
# Edit a CaltechDATA record by attaching links to OSN-stored pilot files.

import argparse, os, json
import s3fs, requests
from datacite import schema43
from caltechdata_api import caltechdata_edit, get_metadata


parser = argparse.ArgumentParser(
    description="Edits a CaltechDATA record by adding OSN-stored pilot files"
)
parser.add_argument("folder", nargs=1, help="Folder")
parser.add_argument("-id", nargs=1, help="")

args = parser.parse_args()

# Get access token as environment variable
token = os.environ["RDMTOK"]

endpoint = "https://renc.osn.xsede.org/"

# Get metadata and files from bucket
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})

folder = args.folder[0]

path = "ini210004tommorrell/" + folder

idv = args.id[0]
try:
    metadata = get_metadata(idv, schema="43")
except Exception:
    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit.  The record may still be a draft, so fall back to
    # fetching the draft metadata directly from the API.
    url = "https://data.caltech.edu/api/records/"

    headers = {
        "accept": "application/vnd.datacite.datacite+json",
        "Authorization": "Bearer %s" % token,
    }

    response = requests.get(url + idv + "/draft", headers=headers)
    if response.status_code != 200:
        raise Exception(response.text)
    metadata = response.json()


def collect_links(entry, depth=0, max_depth=3):
    """Recursively gather file URLs under *entry* in the bucket.

    A path segment containing "." is treated as a file; anything reached at
    *max_depth* is linked as-is.  This reproduces the original copy-pasted
    three-level nested traversal in one documented helper.
    Returns a flat list of endpoint URLs.
    """
    name = entry.split("/")[-1]
    if "." in name or depth == max_depth:
        return [endpoint + entry]
    links = []
    for child in s3.glob(entry + "/*"):
        links.extend(collect_links(child, depth + 1, max_depth))
    return links


# Find the files
file_links = []
for link in s3.glob(path + "/*"):
    file_links.extend(collect_links(link))

production = True

response = caltechdata_edit(
    idv, metadata, token, [], production, "43", publish=False, file_links=file_links
)
print(response)
{ 56 | "date": "2012-05-22/2016-12-21", 57 | "dateType": "Collected" 58 | } 59 | ], 60 | "descriptions": [ 61 | { 62 | "description": "Description", 63 | "descriptionType": "Abstract" 64 | } 65 | ], 66 | "formats": [ 67 | "format" 68 | ], 69 | "fundingReferences": [ 70 | { 71 | "awardTitle": "Measurement of Column-Averaged CO2", 72 | "funderName": "National Aeronautics and Space Administration", 73 | "funderIdentifierType": "GRID", 74 | "funderIdentifier": "grid.238252.c", 75 | "awardNumber": "NAG5-12247" 76 | } 77 | ], 78 | "geoLocations": [ 79 | { 80 | "geoLocationPlace": "Place Name", 81 | "geoLocationPoint": { 82 | "pointLatitude": "34.138", 83 | "pointLongitude": "-118.1258" 84 | } 85 | } 86 | ], 87 | "language": "eng", 88 | "publicationYear": "2017", 89 | "publisher": "Publisher", 90 | "relatedIdentifiers": [ 91 | { 92 | "relatedIdentifier": "http://www.url.org/", 93 | "relatedIdentifierType": "URL", 94 | "relationType": "IsPartOf" 95 | }, 96 | { 97 | "relatedIdentifier": "10.5072/FK2", 98 | "relatedIdentifierType": "DOI", 99 | "relationType": "IsDocumentedBy" 100 | } 101 | ], 102 | "types": { 103 | "resourceTypeGeneral": "Dataset", 104 | "resourceType": "Dataset" 105 | }, 106 | "rightsList": [ 107 | { 108 | "rights": "Rights Name", 109 | "rightsURI": "Rights List" 110 | } 111 | ], 112 | "subjects": [ 113 | { 114 | "subject": "subject1" 115 | }, 116 | { 117 | "subject": "subject2" 118 | } 119 | ], 120 | "titles": [ 121 | { 122 | "title": "Title" 123 | }, 124 | { 125 | "title": "Alternative Title", 126 | "titleType": "AlternativeTitle" 127 | } 128 | ], 129 | "version": "0", 130 | "schemaVersion": "http://datacite.org/schema/kernel-4" 131 | } 132 | -------------------------------------------------------------------------------- /example_custom.json: -------------------------------------------------------------------------------- 1 | { 2 | "identifiers": [ 3 | {"identifier": "10.5281/inveniordm.1234", "identifierType": "DOI"}, 4 | {"identifier": 
"1924MNRAS..84..308E", "identifierType": "bibcode"} 5 | ], 6 | "contributors": [ 7 | { 8 | "nameType": "Personal", 9 | "affiliation": [ 10 | { 11 | "name": "DataCitea", 12 | "affiliationIdentifier": "https://ror.org/04wxnsj81", 13 | "affiliationIdentifierScheme": "ROR" 14 | } 15 | ], 16 | "name": "Contributor Name", 17 | "familyName": "Family Name", 18 | "givenName": "Given Name", 19 | "contributorType": "ContactPerson", 20 | "nameIdentifiers": [ 21 | { 22 | "nameIdentifier": "0000-0002-1825-0097", 23 | "nameIdentifierScheme": "ORCID", 24 | "schemeUri": "https://orcid.org/" 25 | } 26 | ] 27 | } 28 | ], 29 | "creators": [ 30 | { 31 | "nameType": "Personal", 32 | "affiliation": [ 33 | { 34 | "name": "DataCite", 35 | "affiliationIdentifier": "https://ror.org/04wxnsj81", 36 | "affiliationIdentifierScheme": "ROR" 37 | } 38 | ], 39 | "name": "Name", 40 | "familyName": "Family Name", 41 | "givenName": "Given Name", 42 | "nameIdentifiers": [ 43 | { 44 | "nameIdentifier": "0000-0002-1825-0097", 45 | "nameIdentifierScheme": "ORCID", 46 | "schemeUri": "https://orcid.org/" 47 | } 48 | ] 49 | } 50 | ], 51 | "dates": [ 52 | { 53 | "date": "2014-10-01", 54 | "dateType": "Created" 55 | }, 56 | { 57 | "date": "2012-05-22/2016-12-21", 58 | "dateType": "Collected" 59 | } 60 | ], 61 | "descriptions": [ 62 | { 63 | "description": "Description", 64 | "descriptionType": "Abstract" 65 | } 66 | ], 67 | "formats": [ 68 | "format" 69 | ], 70 | "fundingReferences": [ 71 | { 72 | "awardTitle": "Measurement of Column-Averaged CO2", 73 | "funderName": "National Aeronautics and Space Administration", 74 | "funderIdentifierType": "GRID", 75 | "funderIdentifier": "grid.238252.c", 76 | "awardNumber": "NAG5-12247" 77 | } 78 | ], 79 | "geoLocations": [ 80 | { 81 | "geoLocationPlace": "Place Name", 82 | "geoLocationPoint": { 83 | "pointLatitude": "34.138", 84 | "pointLongitude": "-118.1258" 85 | } 86 | } 87 | ], 88 | "language": "eng", 89 | "publicationYear": "2017", 90 | "publisher": "Publisher", 91 | 
"relatedIdentifiers": [ 92 | { 93 | "relatedIdentifier": "http://www.url.org/", 94 | "relatedIdentifierType": "URL", 95 | "relationType": "IsPartOf" 96 | }, 97 | { 98 | "relatedIdentifier": "10.5072/FK2", 99 | "relatedIdentifierType": "DOI", 100 | "relationType": "IsDocumentedBy" 101 | } 102 | ], 103 | "types": { 104 | "resourceTypeGeneral": "Dataset", 105 | "resourceType": "Dataset" 106 | }, 107 | "rightsList": [ 108 | { 109 | "rights": "Rights Name", 110 | "rightsURI": "Rights List" 111 | } 112 | ], 113 | "subjects": [ 114 | { 115 | "subject": "subject1" 116 | }, 117 | { 118 | "subject": "subject2" 119 | } 120 | ], 121 | "titles": [ 122 | { 123 | "title": "Title" 124 | }, 125 | { 126 | "title": "Alternative Title", 127 | "titleType": "AlternativeTitle" 128 | } 129 | ], 130 | "version": "0", 131 | "schemaVersion": "http://datacite.org/schema/kernel-4" 132 | } 133 | -------------------------------------------------------------------------------- /excluded_dois.json: -------------------------------------------------------------------------------- 1 | [] 2 | -------------------------------------------------------------------------------- /fix_names.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import math 3 | from progressbar import progressbar 4 | from caltechdata_api import caltechdata_edit 5 | 6 | 7 | def fix_name(metadata, fixed): 8 | for name in metadata: 9 | if name["nameType"] == "Personal": 10 | if "givenName" not in name: 11 | fixed = True 12 | given = name["name"].split(",")[1] 13 | name["givenName"] = given.strip() 14 | return metadata, fixed 15 | 16 | 17 | url = 'https://data.caltech.edu/api/records?q=-metadata.related_identifiers.identifier%3A"10.25989%2Fes8t-kswe"' 18 | 19 | headers = { 20 | "accept": "application/vnd.datacite.datacite+json", 21 | } 22 | 23 | response = requests.get(f"{url}&search_type=scan&scroll=5m") 24 | 25 | total = response.json()["hits"]["total"] 26 | pages = 
math.ceil(int(total) / 1000) 27 | hits = [] # [{'id':'a7f64-a8k10'}] 28 | print(total) 29 | for c in progressbar(range(1, pages + 1)): 30 | chunkurl = f"{url}&sort=newest&size=1000&page={c}" 31 | response = requests.get(chunkurl) 32 | response = response.json() 33 | hits += response["hits"]["hits"] 34 | 35 | 36 | url = "https://data.caltech.edu/api/records" 37 | 38 | for h in progressbar(hits): 39 | idv = str(h["id"]) 40 | 41 | response = requests.get(f"{url}/{idv}", headers=headers) 42 | if response.status_code != 200: 43 | print(response.text) 44 | exit() 45 | else: 46 | fixed = False 47 | metadata = response.json() 48 | metadata["creators"], fixed = fix_name(metadata["creators"], fixed) 49 | if "contributors" in metadata: 50 | metadata["contributors"], fixed = fix_name(metadata["contributors"], fixed) 51 | if fixed: 52 | print(idv) 53 | caltechdata_edit(idv, metadata, production=True, publish=True) 54 | -------------------------------------------------------------------------------- /inspect_dois.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import math 3 | from progressbar import progressbar 4 | from caltechdata_api import caltechdata_edit 5 | 6 | 7 | def fix_name(metadata, fixed): 8 | for name in metadata: 9 | if name["nameType"] == "Personal": 10 | if "givenName" not in name: 11 | fixed = True 12 | given = name["name"].split(",")[1] 13 | name["givenName"] = given.strip() 14 | return metadata, fixed 15 | 16 | 17 | url = 'https://data.caltech.edu/api/records?q=-metadata.related_identifiers.identifier%3A"10.25989%2Fes8t-kswe"' 18 | 19 | headers = { 20 | "accept": "application/vnd.datacite.datacite+json", 21 | } 22 | 23 | response = requests.get(f"{url}&search_type=scan&scroll=5m") 24 | 25 | total = response.json()["hits"]["total"] 26 | pages = math.ceil(int(total) / 1000) 27 | hits = [] 28 | print(total) 29 | for c in progressbar(range(1, pages + 1)): 30 | chunkurl = 
f"{url}&sort=newest&size=1000&page={c}" 31 | response = requests.get(chunkurl) 32 | response = response.json() 33 | hits += response["hits"]["hits"] 34 | 35 | 36 | url = "https://data.caltech.edu/api/records" 37 | 38 | for h in progressbar(hits): 39 | idv = str(h["id"]) 40 | 41 | doi = h["pids"]["doi"] 42 | 43 | if "client" not in doi: 44 | if "10.22002/" in doi["identifier"]: 45 | response = requests.get(f"{url}/{idv}", headers=headers) 46 | if response.status_code != 200: 47 | print(response.text) 48 | exit() 49 | else: 50 | metadata = response.json() 51 | print(idv) 52 | caltechdata_edit(idv, metadata, production=True, publish=True) 53 | -------------------------------------------------------------------------------- /logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/caltechlibrary/caltechdata_api/89cb1bfb6513c61458e257a66e5e6c6aaf222195/logo.gif -------------------------------------------------------------------------------- /outdated/README.md: -------------------------------------------------------------------------------- 1 | # caltechdata_api outdated functions 2 | 3 | These functions have yet to be updated to the InvenioRDM version of 4 | CaltechDATA. Many will be updated in the future, but for now they are available 5 | here for reference. 6 | 7 | 8 | Get geographic metadata from CaltechDATA with WKT representations in a csv file. 9 | You can import this to a GIS program like QGIS 10 | using a delimited text import and projection epsg:4326. You'll have to do one 11 | import for Geometry type Point and another for Geometry type Polygon. 
import os, requests
from progressbar import progressbar
from caltechdata_api import get_metadata, caltechdata_edit


def get_datacite_dates(prefix):
    """Get submitted (registration) dates for DataCite DOIs with a specific prefix.

    Pages through the DataCite REST API using cursor pagination
    (page[cursor]/page[size], following links.next).

    Args:
        prefix: DOI prefix to query (e.g. "10.14291").

    Returns:
        (doi_dates, doi_urls): dicts keyed by DOI, mapping to the
        registration date (YYYY-MM-DD) and the registered URL respectively.
    """
    doi_dates = {}
    doi_urls = {}
    url = (
        "https://api.datacite.org/dois?query=prefix:"
        + prefix
        + "&page[cursor]=1&page[size]=500"
    )
    next_link = url
    meta = requests.get(next_link).json()["meta"]
    for _ in progressbar(range(meta["totalPages"])):
        data = requests.get(next_link).json()
        for doi in data["data"]:
            # "registered" is an ISO timestamp; keep only the date portion.
            date = doi["attributes"]["registered"].split("T")[0]
            doi_dates[doi["id"]] = date
            doi_urls[doi["id"]] = doi["attributes"]["url"]
        if "next" in data["links"]:
            next_link = data["links"]["next"]
        else:
            # No further pages: stop now. Previously next_link was set to
            # None and the next loop iteration would call requests.get(None).
            break
    return doi_dates, doi_urls


token = os.environ["TINDTOK"]

doi_dates, doi_urls = get_datacite_dates("10.14291")
for doi in doi_urls:
    if "data.caltech.edu" in doi_urls[doi]:
        caltech_id = doi_urls[doi].split("/")[-1]
        # Skip a fixed set of records that must not be touched.
        if caltech_id not in ["252", "253", "254", "255"]:
            metadata = get_metadata(caltech_id, emails=True)
            print(caltech_id)
            for date in metadata["dates"]:
                if date["dateType"] == "Issued":
                    print(date["date"], doi_dates[doi])
                    date["date"] = doi_dates[doi]
            response = caltechdata_edit(token, caltech_id, metadata, production=True)
            print(response)
import json
import os

from requests import session

import boto3
from caltechdata_api import customize_schema


def send_s3(filepath, token, production=False):
    """Upload a local file to the TIND S3 staging area.

    Requests signed-upload credentials from TIND, uploads the file with
    boto3, then asks TIND for the server-side MD5 of the stored object.

    Args:
        filepath: path of the local file to upload.
        token: TIND API bearer token.
        production: use data.caltech.edu when True, the sandbox otherwise.

    Returns:
        dict with "url" (bucket key), "filename", "md5" and "size", in the
        shape expected by the TIND record-creation API.
    """
    if production == True:
        s3surl = "https://data.caltech.edu/tindfiles/sign_s3/"
        chkurl = "https://data.caltech.edu/tindfiles/md5_s3"
    else:
        s3surl = "https://cd-sandbox.tind.io/tindfiles/sign_s3/"
        chkurl = "https://cd-sandbox.tind.io/tindfiles/md5_s3"

    headers = {"Authorization": "Bearer %s" % token}

    c = session()

    response = c.get(s3surl, headers=headers)
    jresp = response.json()
    data = jresp["data"]

    bucket = jresp["bucket"]
    key = data["fields"]["key"]
    policy = data["fields"]["policy"]
    aid = data["fields"]["AWSAccessKeyId"]
    signature = data["fields"]["signature"]
    url = data["url"]

    print(filepath)
    infile = open(filepath, "rb")
    # seek(0, 2) returns the offset at end-of-file, i.e. the file size.
    size = infile.seek(0, 2)
    infile.seek(0, 0)  # reset at beginning

    # BUG FIX: this previously called boto.client("s3"), but the module
    # imports boto3 — the call raised NameError before any upload happened.
    s3 = boto3.client("s3")
    s3.upload_file(filepath, bucket, key)

    response = c.get(chkurl + "/" + bucket + "/" + key, headers=headers)
    print(response)
    exit()

    # NOTE(review): everything below is unreachable because of the exit()
    # above; kept for reference (signed-POST upload path using the
    # credentials fetched earlier).
    s3headers = {
        "Host": bucket + ".s3.amazonaws.com",
        "Date": "date",
        "x-amz-acl": "public-read",
        "Access-Control-Allow-Origin": "*",
    }

    form = (
        ("key", key),
        ("acl", "public-read"),
        ("AWSAccessKeyID", aid),
        ("policy", policy),
        ("signature", signature),
        ("file", infile),
    )

    c = session()
    response = c.post(url, files=form, headers=s3headers)
    if response.text:
        raise Exception(response.text)

    response = c.get(chkurl + "/" + bucket + "/" + key, headers=headers)
    md5 = response.json()["md5"]
    filename = filepath.split("/")[-1]

    fileinfo = {"url": key, "filename": filename, "md5": md5, "size": size}

    return fileinfo


def caltechdata_write(metadata, token, files=[], production=False):
    """Create a new CaltechDATA (TIND) record with optional attached files.

    Args:
        metadata: DataCite-style metadata dict (run through customize_schema).
        token: TIND API bearer token.
        files: file path or list of file paths to upload via send_s3.
        production: use data.caltech.edu when True, the sandbox otherwise.

    Returns:
        The raw response text from the TIND record-creation endpoint.
    """
    # If files is a string - change to single value array
    if isinstance(files, str) == True:
        files = [files]

    fileinfo = []

    for f in files:
        fileinfo.append(send_s3(f, token, production))

    if production == True:
        url = "https://data.caltech.edu/submit/api/create/"
    else:
        url = "https://cd-sandbox.tind.io/submit/api/create/"

    headers = {"Authorization": "Bearer %s" % token, "Content-type": "application/json"}

    newdata = customize_schema.customize_schema(metadata)
    newdata["files"] = fileinfo
    if "doi" not in newdata:
        # We want tind to generate the identifier
        newdata["final_actions"] = [
            {"type": "create_doi", "parameters": {"type": "records", "field": "doi"}}
        ]

    dat = json.dumps({"record": newdata})

    c = session()
    response = c.post(url, headers=headers, data=dat)
    return response.text
json.load(metaf) 19 | 20 | production = False 21 | 22 | ids = range(1, 717) 23 | response = caltechdata_edit(token, ids, metadata, args.fnames, {"pdf"}, production) 24 | print(response) 25 | -------------------------------------------------------------------------------- /outdated/edit_all_geo.py: -------------------------------------------------------------------------------- 1 | import argparse, os, json, requests, csv, dataset 2 | from caltechdata_api import caltechdata_edit, decustomize_schema 3 | 4 | # Get access token from TIND sed as environment variable with source token.bash 5 | token = os.environ["TINDTOK"] 6 | 7 | collection = "data/CaltechTHESIS.ds" 8 | 9 | production = True 10 | 11 | if production == True: 12 | url = "https://data.caltech.edu/api/records" 13 | else: 14 | url = "https://cd-sandbox.tind.io/api/records" 15 | 16 | response = requests.get(url + "/?size=1000&q=subjects:gps") 17 | hits = response.json() 18 | 19 | # Set up dictionary of links between resolver and thesis IDs 20 | available = os.path.isfile("data/record_list.csv") 21 | if available == False: 22 | print("You need to run update_thesis_file.py") 23 | exit() 24 | else: 25 | record_list = {} 26 | reader = csv.reader(open("data/record_list.csv")) 27 | for row in reader: 28 | record_list[row[1]] = row[0] 29 | 30 | for h in hits["hits"]["hits"]: 31 | rid = str(h["id"]) 32 | print(rid) 33 | record = decustomize_schema(h["metadata"], True) 34 | if "relatedIdentifiers" in record: 35 | for r in record["relatedIdentifiers"]: 36 | if ( 37 | r["relationType"] == "IsSupplementTo" 38 | and r["relatedIdentifierType"] == "URL" 39 | ): 40 | idv = record_list[r["relatedIdentifier"]] 41 | thesis_metadata, err = dataset.read(collection, idv) 42 | pub_date = thesis_metadata["date"] 43 | dates = [{"date": pub_date, "dateType": "Issued"}] 44 | for date in record["dates"]: 45 | if date["dateType"] == "Issued": 46 | dates.append({"date": date["date"], "dateType": "Updated"}) 47 | elif date["dateType"] == 
"Updated": 48 | pass 49 | elif date["dateType"] != "Submitted": 50 | dates.append(date) 51 | print(dates) 52 | metadata = {"dates": dates} 53 | response = caltechdata_edit(token, rid, metadata, {}, {}, production) 54 | print(response) 55 | -------------------------------------------------------------------------------- /outdated/edit_all_github.py: -------------------------------------------------------------------------------- 1 | import argparse, os, json, requests 2 | from caltechdata_api import caltechdata_edit, decustomize_schema 3 | 4 | # Get access token from TIND sed as environment variable with source token.bash 5 | token = os.environ["TINDTOK"] 6 | 7 | production = True 8 | 9 | if production == True: 10 | url = "https://data.caltech.edu/api/records" 11 | else: 12 | url = "https://cd-sandbox.tind.io/api/records" 13 | 14 | response = requests.get(url + "/?size=2000&q=cal_resource_type=software") 15 | hits = response.json() 16 | 17 | for h in hits["hits"]["hits"]: 18 | rid = h["id"] 19 | print(rid) 20 | record = decustomize_schema(h["metadata"], True) 21 | replace = False 22 | # to_update =\ 23 | # [288,269,295,291,279,284,266,281,286,278,280,293,283,287,210,274,276,290,300,285,270,268,267,302,744,282,272,289] 24 | # if rid in to_update: 25 | # Find just GitHub records by title 26 | if "/" in record["titles"][0]["title"]: 27 | add = True 28 | for s in record["subjects"]: 29 | subject = s["subject"] 30 | if subject == "Github": 31 | add = False 32 | if subject == "GitHub": 33 | add = False 34 | if subject == "Bitbucket": 35 | add = False 36 | if add == True: 37 | record["subjects"].append({"subject": "GitHub"}) 38 | print(record["titles"][0]["title"]) 39 | response = caltechdata_edit(token, rid, record, {}, {}, production) 40 | print(response) 41 | -------------------------------------------------------------------------------- /outdated/edit_all_tccon.py: -------------------------------------------------------------------------------- 1 | import argparse, 
os, json, requests 2 | from caltechdata_api import caltechdata_edit, decustomize_schema 3 | 4 | # Get access token from TIND sed as environment variable with source token.bash 5 | token = os.environ["TINDTOK"] 6 | 7 | production = True 8 | 9 | if production == True: 10 | url = "https://data.caltech.edu/api/records" 11 | else: 12 | url = "https://cd-sandbox.tind.io/api/records" 13 | 14 | response = requests.get(url + "/?size=1000&q=subjects:TCCON") 15 | hits = response.json() 16 | 17 | wiki1 = "https://tccon-wiki.caltech.edu/Network_Policy/Data_Use_Policy/Data_Description" 18 | new1 = "https://tccon-wiki.caltech.edu/Main/DataDescription" 19 | wiki2 = "https://tccon-wiki.caltech.edu/Sites" 20 | new2 = "https://tccon-wiki.caltech.edu/Main/TCCONSites" 21 | site = "http://tccondata.org/" 22 | new3 = "https://tccondata.org" 23 | exsite = "http://tccondata.org" 24 | 25 | for h in hits["hits"]["hits"]: 26 | rid = h["id"] 27 | print(rid) 28 | record = decustomize_schema(h["metadata"], True) 29 | updated = {} 30 | if "relatedIdentifiers" in record: 31 | for related in record["relatedIdentifiers"]: 32 | if related["relatedIdentifier"] == wiki1: 33 | related["relatedIdentifier"] = new1 34 | if related["relatedIdentifier"] == wiki2: 35 | related["relatedIdentifier"] = new2 36 | if related["relatedIdentifier"] == site: 37 | related["relatedIdentifier"] = new3 38 | if related["relatedIdentifier"] == exsite: 39 | related["relatedIdentifier"] = new3 40 | updated["relatedIdentifiers"] = record["relatedIdentifiers"] 41 | response = caltechdata_edit(rid, updated, token, {}, {}, production) 42 | print(response) 43 | -------------------------------------------------------------------------------- /outdated/edit_files.py: -------------------------------------------------------------------------------- 1 | import argparse, os, json 2 | from caltechdata_api import caltechdata_edit 3 | 4 | parser = argparse.ArgumentParser( 5 | description="Write files and a DataCite 4 standard json record\ 
6 | to CaltechDATA repository" 7 | ) 8 | parser.add_argument("-ids", nargs="*", help="CaltechDATA IDs") 9 | parser.add_argument("-fnames", nargs="*", help="New Files") 10 | parser.add_argument("-delete", nargs="*", help="Files To Delete") 11 | args = parser.parse_args() 12 | 13 | # Get access token from TIND sed as environment variable with source token.bash 14 | token = os.environ["TINDTOK"] 15 | 16 | production = True 17 | 18 | print(args.delete) 19 | 20 | response = caltechdata_edit(token, args.ids, {}, args.fnames, args.delete, production) 21 | print(response) 22 | -------------------------------------------------------------------------------- /outdated/edit_tccon.py: -------------------------------------------------------------------------------- 1 | import sys, os, json, requests 2 | from caltechdata_api import caltechdata_edit, decustomize_schema 3 | 4 | # USAGE: python edit_tccon.py tccon.ggg2014.darwin01.R0.json 269 0 griffith@uow.edu.au 5 | 6 | # Get access token from TIND sed as environment variable with source token.bash 7 | token = os.environ["TINDTOK"] 8 | 9 | production = True 10 | 11 | if production == True: 12 | url = "https://data.caltech.edu/api/records" 13 | else: 14 | url = "https://cd-sandbox.tind.io/api/records" 15 | 16 | response = requests.get(url + "/?size=1000&q=subjects:TCCON") 17 | hits = response.json() 18 | 19 | infile = open(sys.argv[1], "r") 20 | record = json.load(infile) 21 | 22 | rid = sys.argv[2] 23 | 24 | group = {"contributorName": "TCCON", "contributorType": "ResearchGroup"} 25 | new = "" 26 | for c in record["contributors"]: 27 | print(c["contributorType"]) 28 | if c["contributorType"] == "HostingInstitution": 29 | print("YES") 30 | c["contributorName"] = "California Institute of Techonolgy, Pasadena, CA (US)" 31 | c["nameIdentifiers"] = [ 32 | {"nameIdentifier": "grid.20861.3d", "nameIdentifierScheme": "GRID"} 33 | ] 34 | v = record["contributors"] 35 | v.append(group) 36 | contact = record["creators"][int(sys.argv[3])] 37 
| contact["contributorName"] = contact.pop("creatorName") 38 | contact["contributorEmail"] = sys.argv[4] 39 | contact["contributorType"] = "ContactPerson" 40 | v.append(contact) 41 | new = {"contributors": v} 42 | print(new) 43 | response = caltechdata_edit(token, rid, new, {}, {}, production) 44 | print(response) 45 | -------------------------------------------------------------------------------- /outdated/example_download_and_upload.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from caltechdata_api import download_file, caltechdata_write\n", 10 | "import json" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "application/vnd.jupyter.widget-view+json": { 21 | "model_id": "0f9182c455d94474ae1845c7047b4e0a", 22 | "version_major": 2, 23 | "version_minor": 0 24 | }, 25 | "text/plain": [ 26 | "HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15990.0), HTML(value='')))" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "output_type": "display_data" 31 | } 32 | ], 33 | "source": [ 34 | "#By default will download to file named 10.22002-D1.1098\n", 35 | "#Can provide filename of interest using fname option\n", 36 | "filen = download_file('10.22002/D1.1098')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Incorrect access token: \n" 49 | ] 50 | }, 51 | { 52 | "ename": "UnboundLocalError", 53 | "evalue": "local variable 'jresp' referenced before assignment", 54 | "output_type": "error", 55 | "traceback": [ 56 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 57 | "\u001b[0;31mUnboundLocalError\u001b[0m 
Traceback (most recent call last)", 58 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mproduction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcaltechdata_write\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproduction\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'43'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 59 | "\u001b[0;32m~/Documents/caltechdata_api/caltechdata_api/caltechdata_write.py\u001b[0m in \u001b[0;36mcaltechdata_write\u001b[0;34m(metadata, token, files, production, schema)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 90\u001b[0;31m \u001b[0mfileinfo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msend_s3\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 91\u001b[0m 
\u001b[0mnewdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"files\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfileinfo\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 60 | "\u001b[0;32m~/Documents/caltechdata_api/caltechdata_api/caltechdata_write.py\u001b[0m in \u001b[0;36msend_s3\u001b[0;34m(filepath, token, production)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Incorrect access token: {response}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m \u001b[0mbucket\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjresp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"bucket\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"fields\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"key\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0mpolicy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"fields\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"policy\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 61 | "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'jresp' referenced before assignment" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "#Now write a file to CaltechDATA test instance (cd-sandbox.tind.io)\n", 67 | "\n", 68 | "token = 'TOKEN'\n", 69 | "\n", 70 | "metaf = open('example43.json', 'r')\n", 71 | "metadata = json.load(metaf)\n", 72 | "filen = 'logo.gif'\n", 73 | "\n", 74 | "production = False\n", 75 | "\n", 76 | "response = caltechdata_write(metadata, 
token, filen, production, schema='43')\n", 77 | "print(response)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.8.5" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /outdated/get_geo.py: -------------------------------------------------------------------------------- 1 | import os, json, csv, argparse 2 | import requests 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser( 6 | description="get_metadata queries the caltechDATA (Invenio 3) API\ 7 | and returns DataCite-compatable metadata" 8 | ) 9 | parser.add_argument("output", help="Output file name") 10 | parser.add_argument("-keywords", nargs="*") 11 | 12 | args = parser.parse_args() 13 | 14 | url = "https://data.caltech.edu/api/records/?size=5000" 15 | 16 | search = "" 17 | if args.keywords: 18 | for key in args.keywords: 19 | if search == "": 20 | search = f'&q=subjects:"{key}"' 21 | else: 22 | search = search + f'+"{key}"' 23 | url = url + search 24 | 25 | response = requests.get(url) 26 | hits = response.json() 27 | 28 | outfile = open(args.output, "w") 29 | writer = csv.writer(outfile) 30 | writer.writerow(["wkt", "name", "year", "doi"]) 31 | 32 | for h in hits["hits"]["hits"]: 33 | metadata = decustomize_schema(h["metadata"]) 34 | if "geoLocations" in metadata: 35 | doi = "https://doi.org/" + metadata["identifier"]["identifier"] 36 | title = 
metadata["titles"][0]["title"].split(":")[0] 37 | geo = metadata["geoLocations"] 38 | year = metadata["publicationYear"] 39 | for g in geo: 40 | if "geoLocationBox" in g: 41 | box = g["geoLocationBox"] 42 | p1 = f"{box['eastBoundLongitude']} {box['northBoundLatitude']}" 43 | p2 = f"{box['westBoundLongitude']} {box['northBoundLatitude']}" 44 | p3 = f"{box['westBoundLongitude']} {box['southBoundLatitude']}" 45 | p4 = f"{box['eastBoundLongitude']} {box['southBoundLatitude']}" 46 | wkt = f"POLYGON (({p1}, {p2}, {p3}, {p4}, {p1}))" 47 | writer.writerow([wkt, title, year, doi]) 48 | 49 | if "geoLocationPoint" in g: 50 | point = g["geoLocationPoint"] 51 | wkt = f"POINT ({point['pointLongitude']} {point['pointLatitude']})" 52 | writer.writerow([wkt, title, year, doi]) 53 | -------------------------------------------------------------------------------- /outdated/test.py: -------------------------------------------------------------------------------- 1 | from datacite import schema43 2 | import io, json 3 | from os.path import dirname, join 4 | 5 | 6 | def load_json_path(path): 7 | """Helper method for loading a JSON example file from a path.""" 8 | path_base = dirname(__file__) 9 | with io.open(join(path_base, path), encoding="utf-8") as file: 10 | content = file.read() 11 | return json.loads(content) 12 | 13 | 14 | metadata = load_json_path("example43.json") 15 | 16 | valid = schema43.validate(metadata) 17 | if valid == False: 18 | v = schema43.validator.validate(metadata) 19 | errors = sorted(v.iter_errors(instance), key=lambda e: e.path) 20 | for error in errors: 21 | print(error.message) 22 | -------------------------------------------------------------------------------- /outdated/test_community.py: -------------------------------------------------------------------------------- 1 | import requests, os 2 | 3 | token = os.environ["RDMTOK"] 4 | 5 | url = "https://data.caltechlibrary.dev/" 6 | 7 | headers = { 8 | "Authorization": "Bearer %s" % token, 9 | "Content-type": 
"application/json", 10 | } 11 | 12 | data = {"payload": {"content": "I want this record to be in!", "format": "html"}} 13 | 14 | result = requests.post( 15 | url + "/api/records/cxc6m-bef55/draft/actions/submit-review", 16 | headers=headers, 17 | json=data, 18 | ) 19 | 20 | print(result.status_code) 21 | print(result.text) 22 | # if result.status_code != 201: 23 | # print(result.text) 24 | # exit() 25 | -------------------------------------------------------------------------------- /outdated/test_file.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | from requests import session 3 | from caltechdata_api import customize_schema 4 | 5 | # fileinfo = [ {"url": , "filename": filename, "md5": md5, "size": size}] 6 | 7 | token = os.environ["TINDTOK"] 8 | 9 | metaf = open("test_file.json", "r") 10 | metadata = json.load(metaf) 11 | 12 | url = "https://cd-sandbox.tind.io/submit/api/create/" 13 | 14 | headers = {"Authorization": "Bearer %s" % token, "Content-type": "application/json"} 15 | 16 | newdata = customize_schema(metadata) 17 | # if "doi" not in newdata: 18 | # # We want tind to generate the identifier 19 | # newdata["final_actions"] = [ 20 | # {"type": "create_doi", "parameters": {"type": "records", "field": "doi"}} 21 | # ] 22 | 23 | dat = json.dumps({"record": newdata}) 24 | 25 | c = session() 26 | response = c.post(url, headers=headers, data=dat) 27 | print(response.text) 28 | -------------------------------------------------------------------------------- /outdated/unembargo.py: -------------------------------------------------------------------------------- 1 | import argparse, os, json 2 | from caltechdata_api import caltechdata_unembargo 3 | 4 | parser = argparse.ArgumentParser( 5 | description="Write files and a DataCite 4 standard json record\ 6 | to CaltechDATA repository" 7 | ) 8 | parser.add_argument("-ids", nargs="*", help="CaltechDATA IDs") 9 | args = parser.parse_args() 10 | 11 | # Get 
access token from TIND set as environment variable with source token.bash 12 | token = os.environ["TINDTOK"] 13 | 14 | production = False 15 | 16 | response = caltechdata_unembargo(token, args.ids, production) 17 | print(response) 18 | -------------------------------------------------------------------------------- /outdated/update_thesis_file.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, json, csv 2 | import dataset 3 | from ames.harvesters import get_caltechfeed 4 | 5 | if os.path.isdir("data") == False: 6 | os.mkdir("data") 7 | os.chdir("data") 8 | 9 | get_caltechfeed("thesis") 10 | 11 | record_list = {} 12 | collection = "CaltechTHESIS.ds" 13 | keys = dataset.keys(collection) 14 | count = 0 15 | for k in keys: 16 | count = count + 1 17 | if count % 100 == 0: 18 | print(count) 19 | metadata, err = dataset.read(collection, k) 20 | if err != "": 21 | print("Error on read ", err) 22 | exit() 23 | if metadata != {}: 24 | if "official_url" in metadata: 25 | record_list[k] = metadata["official_url"] 26 | else: 27 | print("Missing URL", metadata) 28 | else: 29 | print("Bad Record: " + k) 30 | print(metadata) 31 | with open("record_list.csv", "w") as f: 32 | w = csv.writer(f) 33 | w.writerows(record_list.items()) 34 | -------------------------------------------------------------------------------- /outdated/write_pilot_phase1.py: -------------------------------------------------------------------------------- 1 | import argparse, os, json 2 | import s3fs 3 | from datacite import schema43 4 | from caltechdata_api import caltechdata_write 5 | 6 | parser = argparse.ArgumentParser( 7 | description="Adds S3-stored pilot files and a DataCite 4.3 standard json record\ 8 | to CaltechDATA repository" 9 | ) 10 | parser.add_argument("folder", nargs=1, help="Folder") 11 | parser.add_argument( 12 | "json_file", nargs=1, help="file name for json DataCite metadata file" 13 | ) 14 | 15 | args = parser.parse_args() 16 | 
17 | # Get access token as environment variable 18 | token = os.environ["TINDTOK"] 19 | 20 | endpoint = "https://renc.osn.xsede.org/" 21 | 22 | # Get metadata and files from bucket 23 | s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint}) 24 | 25 | 26 | path = "ini210004tommorrell/" + args.folder[0] + "/" 27 | meta_path = path + args.json_file[0] 28 | metaf = s3.open(meta_path, "rb") 29 | metadata = json.load(metaf) 30 | 31 | # Find the files 32 | files = s3.glob(path + "/*.nc") 33 | 34 | description_string = f"Files available via S3 at {endpoint}{path}
" 35 | for link in files: 36 | fname = link.split("/")[-1] 37 | link = endpoint + link 38 | description_string += f"""{fname} 40 | Download
""" 41 | 42 | metadata["descriptions"].append( 43 | {"description": description_string, "descriptionType": "Other"} 44 | ) 45 | 46 | # valid = schema43.validate(metadata) 47 | # if not valid: 48 | # v = schema43.validator.validate(metadata) 49 | # errors = sorted(v.iter_errors(instance), key=lambda e: e.path) 50 | # for error in errors: 51 | # print(error.message) 52 | # exit() 53 | 54 | print(metadata) 55 | 56 | production = True 57 | 58 | response = caltechdata_write(metadata, token, [], production, "43") 59 | print(response) 60 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /rdm.json: -------------------------------------------------------------------------------- 1 | { 2 | "pids": { 3 | }, 4 | "metadata": { 5 | "resource_type": {"id": "dataset"}, 6 | "creators": [ 7 | { 8 | "person_or_org": { 9 | "name": "Nielsen, Lars Holm", 10 | "type": "personal", 11 | "given_name": "Lars Holm", 12 | "family_name": "Nielsen", 13 | "identifiers": [ 14 | {"scheme": "orcid", "identifier": "0000-0001-8135-3489"} 15 | ] 16 | }, 17 | "affiliations": [{"name": "free-text"}] 18 | } 19 | ], 20 | "title": "InvenioRDM", 21 | "additional_titles": [ 22 | { 23 | "title": "a research data management platform", 24 | "type": {"id": "subtitle"}, 25 | "lang": {"id": "eng"} 26 | } 27 | ], 28 | "publisher": "InvenioRDM", 29 | "publication_date": "2018/2020-09", 30 | "subjects": [ 31 | {"subject": "custom"} 32 | ], 33 | "contributors": [ 34 | { 35 | "person_or_org": { 36 | "name": "Nielsen, Lars Holm", 37 | "type": "personal", 38 | "given_name": "Lars Holm", 39 | "family_name": "Nielsen", 40 | "identifiers": [ 41 | {"scheme": "orcid", "identifier": "0000-0001-8135-3489"} 42 | 
] 43 | }, 44 | "role": {"id": "other"} 45 | } 46 | ], 47 | "dates": [ 48 | {"date": "1939/1945", "type": {"id": "other"}, "description": "A date"} 49 | ], 50 | "languages": [{"id": "dan"}, {"id": "eng"}], 51 | "identifiers": [{"identifier": "1924MNRAS..84..308E", "scheme": "bibcode"}], 52 | "related_identifiers": [ 53 | { 54 | "identifier": "10.1234/foo.bar", 55 | "scheme": "doi", 56 | "relation_type": {"id": "iscitedby"}, 57 | "resource_type": {"id": "dataset"} 58 | } 59 | ], 60 | "sizes": ["11 pages"], 61 | "formats": ["application/pdf"], 62 | "version": "v1.0", 63 | "rights": [ 64 | { 65 | "title": {"en": "A custom license"}, 66 | "description": {"en": "A description"}, 67 | "link": "https://customlicense.org/licenses/by/4.0/" 68 | }, 69 | {"id": "cc-by-4.0"} 70 | ], 71 | "description": "

A description

with HTML tags

", 72 | "additional_descriptions": [ 73 | { 74 | "description": "Bla bla bla", 75 | "type": {"id": "methods"}, 76 | "lang": {"id": "eng"} 77 | } 78 | ], 79 | "locations": { 80 | "features": [ 81 | { 82 | "geometry": { 83 | "type": "Point", 84 | "coordinates": [-32.94682, -60.63932] 85 | }, 86 | "place": "test location place", 87 | "description": "test location description", 88 | "identifiers": [ 89 | {"identifier": "12345abcde", "scheme": "wikidata"}, 90 | {"identifier": "12345abcde", "scheme": "geonames"} 91 | ] 92 | } 93 | ] 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | pytest tests -vv 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0","wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [metadata] 6 | name = caltechdata_api 7 | version = 1.10.0 8 | author = Thomas E Morrell, Rohan Bhattarai, Elizabeth Won, Alexander A Abakah 9 | author_email = tmorrell@caltech.edu, aabakah@caltech.edu 10 | description = Python wrapper for CaltechDATA API. 
11 | long_description = file: README.md 12 | long_description_content_type = text/markdown 13 | url = https://github.com/caltechlibrary/caltechdata_api 14 | license = MIT 15 | classifiers = 16 | License :: OSI Approved :: MIT License 17 | Programming Language :: Python :: 3 18 | Programming Language :: Python :: 3.7 19 | Programming Language :: Python :: 3.8 20 | Programming Language :: Python :: 3.9 21 | Programming Language :: Python :: 3.10 22 | Programming Language :: Python :: Implementation :: CPython 23 | Operating System :: OS Independent 24 | 25 | [options] 26 | packages = find: 27 | python_requires = >=3.6.0 28 | install_requires = 29 | requests 30 | datacite>1.1.0 31 | tqdm>=4.62.3 32 | pyyaml 33 | s3fs 34 | cryptography 35 | s3cmd 36 | include_package_data = True 37 | 38 | [options.packages.find] 39 | exclude = tests 40 | 41 | [options.package_data] 42 | caltechdata_api = vocabularies.yaml, vocabularies/* 43 | 44 | [options.entry_points] 45 | console_scripts = 46 | caltechdata_api=caltechdata_api.cli:main 47 | 48 | [tool:pytest] 49 | addopts = --verbose 50 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /templates/README.md: -------------------------------------------------------------------------------- 1 | # This is the title of your submission to CaltechDATA 2 | 3 | ## Creators 4 | - Name Type: Personal 5 | - Affiliation: [https://ror.org/04wxnsj81](https://ror.org/04wxnsj81) 6 | - Name: Name 7 | - Family Name: Family Name 8 | - Given Name: Given Name 9 | - Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097) 10 | 11 | - Name Type: Personal 12 | - Affiliation: [https://ror.org/04wxnsj81](https://ror.org/04wxnsj81) 13 | - Name: Name2 14 | - Family 
Name: Family Name 2 15 | - Given Name: Given Name 2 16 | - Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097) 17 | 18 | ## Descriptions 19 | - Description: Description 20 | - Description Type: Abstract 21 | 22 | ## Types 23 | - Resource Type General: Dataset 24 | - Resource Type: Dataset 25 | 26 | ## Rights List 27 | - Rights: Creative Commons Zero v1.0 Universal 28 | - Rights URI: https://creativecommons.org/publicdomain/zero/1.0/legalcode 29 | 30 | ## Publication Year 31 | - Publication Year: 2024 32 | 33 | ## Publisher 34 | - Publisher: CaltechDATA 35 | 36 | ## Dates 37 | - Date: 2014-10-01 38 | - Date Type: Created 39 | - Date: 2012-05-22/2016-12-21 40 | - Date Type: Collected 41 | 42 | ## Subjects 43 | - Subject: subject1 44 | - Subject: subject2 45 | 46 | ## Funding References 47 | - Award Title: Measurement of Column-Averaged CO2 48 | - Funder Name: National Aeronautics and Space Administration 49 | - Funder Identifier Type: ROR 50 | - Funder Identifier: https://ror.org/027ka1x80 51 | - Award Number: NAG5-12247 52 | 53 | ## Related Identifiers 54 | - Related Identifier: [http://www.url.org/](http://www.url.org/) 55 | - Related Identifier Type: URL 56 | - Relation Type: IsPartOf 57 | - Related Identifier: 10.5072/FK2 58 | - Related Identifier Type: DOI 59 | - Relation Type: IsDocumentedBy 60 | 61 | ## Version 62 | - Version: 1 63 | 64 | ## Identifiers 65 | - Identifier: 1924MNRAS..84..308E 66 | - Identifier Type: bibcode 67 | 68 | ## Contributors 69 | - Name Type: Personal 70 | - Affiliation: [https://ror.org/04wxnsj81](https://ror.org/04wxnsj81) 71 | - Name: Contributor Name 72 | - Family Name: Family Name 73 | - Given Name: Given Name 74 | - Contributor Type: ContactPerson 75 | - Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097) 76 | -------------------------------------------------------------------------------- /tests/bot.py: 
-------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | from unittest.mock import patch 4 | import sys 5 | import os 6 | import json 7 | import requests 8 | from datetime import datetime 9 | import pytest 10 | from customize_schema import validate_metadata as validator43 # Import validator 11 | 12 | 13 | class CaltechDataTester: 14 | def __init__(self): 15 | self.test_dir = "caltech_test_data" 16 | self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 17 | if not os.path.exists(self.test_dir): 18 | os.makedirs(self.test_dir) 19 | 20 | # Create test data directory with timestamp 21 | self.test_run_dir = os.path.join(self.test_dir, f"test_run_{self.timestamp}") 22 | os.makedirs(self.test_run_dir) 23 | 24 | # Initialize logging 25 | self.log_file = os.path.join(self.test_run_dir, "test_log.txt") 26 | 27 | def log(self, message): 28 | """Log message to both console and file""" 29 | print(message) 30 | with open(self.log_file, "a") as f: 31 | f.write(f"{datetime.now()}: {message}\n") 32 | 33 | def create_test_files(self): 34 | """Create necessary test files""" 35 | # Create a dummy CSV file 36 | csv_path = os.path.join(self.test_run_dir, "test_data.csv") 37 | with open(csv_path, "w") as f: 38 | f.write("date,temperature,humidity\n") 39 | f.write("2023-01-01,25.5,60\n") 40 | f.write("2023-01-02,26.0,62\n") 41 | f.write("2023-01-03,24.8,65\n") 42 | 43 | self.log(f"Created test CSV file: {csv_path}") 44 | return csv_path 45 | 46 | def generate_test_responses(self): 47 | """Generate test responses for CLI prompts""" 48 | return { 49 | "Do you want to create or edit a CaltechDATA record? (create/edit): ": "create", 50 | "Do you want to use metadata from an existing file or create new metadata? 
(existing/create): ": "create", 51 | "Enter the title of the dataset: ": f"Test Dataset {self.timestamp}", 52 | "Enter the abstract or description of the dataset: ": "This is an automated test dataset containing sample climate data for validation purposes.", 53 | "Enter the number corresponding to the desired license: ": "1", 54 | "Enter your ORCID identifier: ": "0000-0002-1825-0097", 55 | "How many funding entries do you want to provide? ": "1", 56 | "Enter the award number for funding: ": "NSF-1234567", 57 | "Enter the award title for funding: ": "Automated Testing Grant", 58 | "Enter the funder ROR (https://ror.org): ": "021nxhr62", 59 | "Do you want to upload or link data files? (upload/link/n): ": "upload", 60 | "Enter the filename to upload as a supporting file (or 'n' to finish): ": "test_data.csv", 61 | "Do you want to add more files? (y/n): ": "n", 62 | "Do you want to send this record to CaltechDATA? (y/n): ": "y", 63 | } 64 | 65 | def extract_record_id(self, output_text): 66 | """Extract record ID from CLI output""" 67 | try: 68 | for line in output_text.split("\n"): 69 | if "uploads/" in line: 70 | return line.strip().split("/")[-1] 71 | except Exception as e: 72 | self.log(f"Error extracting record ID: {e}") 73 | return None 74 | 75 | def download_and_validate_record(self, record_id): 76 | """Download and validate the record""" 77 | try: 78 | # Wait for record to be available 79 | time.sleep(5) 80 | 81 | # Download metadata 82 | url = f"https://data.caltechlibrary.dev/records/{record_id}/export/datacite-json" 83 | response = requests.get(url) 84 | response.raise_for_status() 85 | 86 | # Save metadata 87 | json_path = os.path.join(self.test_run_dir, f"{record_id}.json") 88 | with open(json_path, "w") as f: 89 | json.dump(response.json(), f, indent=2) 90 | 91 | self.log(f"Downloaded metadata to: {json_path}") 92 | 93 | # Validate metadata using the imported validator 94 | validation_errors = validator43(response.json()) 95 | 96 | if validation_errors: 
97 | self.log("❌ Validation errors found:") 98 | for error in validation_errors: 99 | self.log(f" - {error}") 100 | return False 101 | else: 102 | self.log("✅ Validation passed successfully") 103 | return True 104 | 105 | except Exception as e: 106 | self.log(f"Error in download and validation: {e}") 107 | return False 108 | 109 | def run_test_submission(self): 110 | """Run the complete test submission process""" 111 | try: 112 | self.log("Starting test submission process...") 113 | 114 | # Create test files 115 | test_csv = self.create_test_files() 116 | 117 | # Generate responses 118 | responses = self.generate_test_responses() 119 | 120 | # Setup output capture 121 | class OutputCapture: 122 | def __init__(self): 123 | self.output = [] 124 | 125 | def write(self, text): 126 | self.output.append(text) 127 | sys.__stdout__.write(text) 128 | 129 | def flush(self): 130 | pass 131 | 132 | def get_output(self): 133 | return "".join(self.output) 134 | 135 | output_capture = OutputCapture() 136 | sys.stdout = output_capture 137 | 138 | # Mock input and run CLI 139 | def mock_input(prompt): 140 | self.log(f"Prompt: {prompt}") 141 | if prompt in responses: 142 | response = responses[prompt] 143 | self.log(f"Response: {response}") 144 | return response 145 | return "" 146 | 147 | with patch("builtins.input", side_effect=mock_input): 148 | try: 149 | import cli 150 | 151 | cli.main() 152 | except Exception as e: 153 | self.log(f"Error during CLI execution: {e}") 154 | return False 155 | 156 | # Restore stdout 157 | sys.stdout = sys.__stdout__ 158 | 159 | # Get output and extract record ID 160 | cli_output = output_capture.get_output() 161 | record_id = self.extract_record_id(cli_output) 162 | 163 | if not record_id: 164 | self.log("Failed to extract record ID") 165 | return False 166 | 167 | self.log(f"Successfully created record with ID: {record_id}") 168 | 169 | # Validate the record 170 | return self.download_and_validate_record(record_id) 171 | 172 | except Exception as 
e: 173 | self.log(f"Error in test submission: {e}") 174 | return False 175 | finally: 176 | # Cleanup 177 | if os.path.exists(test_csv): 178 | os.remove(test_csv) 179 | self.log("Test files cleaned up") 180 | 181 | 182 | def main(): 183 | tester = CaltechDataTester() 184 | 185 | success = tester.run_test_submission() 186 | 187 | if success: 188 | tester.log("\n🎉 Test submission and validation completed successfully!") 189 | else: 190 | tester.log("\n❌ Test submission or validation failed - check logs for details") 191 | 192 | tester.log(f"\nTest logs available at: {tester.log_file}") 193 | 194 | 195 | if __name__ == "__main__": 196 | main() 197 | -------------------------------------------------------------------------------- /tests/bot_yaml.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | from unittest.mock import patch 4 | import sys 5 | import os 6 | import json 7 | import requests 8 | from datetime import datetime 9 | import pytest 10 | import importlib.util 11 | import traceback 12 | 13 | 14 | class CaltechDataTester: 15 | def __init__(self): 16 | # Use GitHub Actions environment or create a local test directory 17 | self.test_dir = os.environ.get( 18 | "GITHUB_WORKSPACE", os.path.join(os.getcwd(), "caltech_test_data") 19 | ) 20 | self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 21 | 22 | # Ensure test directory exists 23 | os.makedirs(self.test_dir, exist_ok=True) 24 | 25 | # Create test run directory 26 | self.test_run_dir = os.path.join(self.test_dir, f"test_run_{self.timestamp}") 27 | os.makedirs(self.test_run_dir, exist_ok=True) 28 | 29 | # Initialize logging 30 | self.log_file = os.path.join(self.test_run_dir, "test_log.txt") 31 | 32 | def log(self, message): 33 | """Log message to both console and file""" 34 | print(message) 35 | with open(self.log_file, "a") as f: 36 | f.write(f"{datetime.now()}: {message}\n") 37 | 38 | def create_test_files(self): 39 | """Create 
necessary test files""" 40 | csv_path = os.path.join(self.test_run_dir, "test_data.csv") 41 | with open(csv_path, "w") as f: 42 | f.write("date,temperature,humidity\n") 43 | f.write("2023-01-01,25.5,60\n") 44 | f.write("2023-01-02,26.0,62\n") 45 | f.write("2023-01-03,24.8,65\n") 46 | 47 | self.log(f"Created test CSV file: {csv_path}") 48 | return csv_path 49 | 50 | def import_cli_module(self): 51 | """Dynamically import cli module from the correct path""" 52 | cli_path = os.path.join( 53 | os.environ.get("GITHUB_WORKSPACE", os.getcwd()), "caltechdata_api", "cli.py" 54 | ) 55 | spec = importlib.util.spec_from_file_location("cli", cli_path) 56 | cli_module = importlib.util.module_from_spec(spec) 57 | spec.loader.exec_module(cli_module) 58 | return cli_module 59 | 60 | def generate_test_responses(self): 61 | """Generate test responses for CLI prompts""" 62 | return { 63 | "What would you like to do? (create/edit/profile/exit): ": "create", 64 | "Do you want to use metadata from an existing file or create new metadata? (existing/create): ": "create", 65 | "Enter the title of the dataset: ": f"Test Dataset {self.timestamp}", 66 | "Enter the abstract or description of the dataset: ": "This is an automated test dataset containing sample climate data for validation purposes.", 67 | "Enter the number corresponding to the desired license: ": "1", 68 | "Use saved profile? (y/n): ": "n", 69 | "Enter your ORCID identifier: ": os.environ.get( 70 | "TEST_ORCID", "0000-0002-1825-0097" 71 | ), 72 | "How many funding entries do you want to provide? ": "1", 73 | "Enter the award number for funding: ": "NSF-1234567", 74 | "Enter the award title for funding: ": "Automated Testing Grant", 75 | "Enter the funder ROR (https://ror.org): ": "021nxhr62", 76 | "Do you want to upload or link data files? (upload/link/n): ": "upload", 77 | "Enter the filename to upload as a supporting file (or 'n' to finish): ": "test_data.csv", 78 | "Do you want to add more files? 
(y/n): ": "n", 79 | "Do you want to send this record to CaltechDATA? (y/n): ": "y", 80 | } 81 | 82 | def run_test_submission(self): 83 | """Run the complete test submission process""" 84 | try: 85 | self.log("Starting test submission process...") 86 | 87 | # Create test files 88 | test_csv = self.create_test_files() 89 | 90 | # Dynamically import cli module 91 | cli_module = self.import_cli_module() 92 | 93 | # Generate responses 94 | responses = self.generate_test_responses() 95 | 96 | # Setup output capture 97 | class OutputCapture: 98 | def __init__(self): 99 | self.output = [] 100 | 101 | def write(self, text): 102 | self.output.append(text) 103 | sys.__stdout__.write(text) 104 | 105 | def flush(self): 106 | pass 107 | 108 | def get_output(self): 109 | return "".join(self.output) 110 | 111 | output_capture = OutputCapture() 112 | sys.stdout = output_capture 113 | 114 | # Mock input and run CLI 115 | def mock_input(prompt): 116 | self.log(f"Prompt: {prompt}") 117 | if prompt in responses: 118 | response = responses[prompt] 119 | self.log(f"Response: {response}") 120 | return response 121 | return "" 122 | 123 | with patch("builtins.input", side_effect=mock_input): 124 | # Use -test flag to use test mode 125 | sys.argv = [sys.argv[0], "-test"] 126 | cli_module.main() 127 | 128 | # Restore stdout 129 | sys.stdout = sys.__stdout__ 130 | 131 | return True 132 | 133 | except Exception as e: 134 | self.log(f"Error in test submission: {e}") 135 | traceback.print_exc() 136 | return False 137 | finally: 138 | # Cleanup 139 | if "test_csv" in locals() and os.path.exists(test_csv): 140 | os.remove(test_csv) 141 | self.log("Test files cleaned up") 142 | 143 | 144 | def main(): 145 | tester = CaltechDataTester() 146 | 147 | success = tester.run_test_submission() 148 | 149 | if success: 150 | tester.log("\n🎉 Test submission completed successfully!") 151 | sys.exit(0) 152 | else: 153 | tester.log("\n❌ Test submission failed - check logs for details") 154 | sys.exit(1) 155 | 
156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /tests/data/caltechdata/1235.json: -------------------------------------------------------------------------------- 1 | { 2 | "created": "2019-04-29T20:13:59.728273+00:00", 3 | "id": 1235, 4 | "links": { 5 | "self": "http://data.caltech.edu/api/record/1235" 6 | }, 7 | "metadata": { 8 | "_form_uuid": "beae3039-29ed-4e20-bd21-6ed6e994afa5", 9 | "alternateIdentifiers": [ 10 | { 11 | "alternateIdentifier": "1235", 12 | "alternateIdentifierType": "CaltechDATA_Identifier" 13 | } 14 | ], 15 | "authors": [ 16 | { 17 | "authorAffiliation": [ 18 | "Caltech Library" 19 | ], 20 | "authorIdentifiers": [ 21 | { 22 | "authorIdentifier": "0000-0001-9266-5146", 23 | "authorIdentifierScheme": "ORCID" 24 | } 25 | ], 26 | "authorName": "Morrell, Thomas E" 27 | } 28 | ], 29 | "control_number": "1235", 30 | "descriptions": [ 31 | { 32 | "descriptionType": "Abstract", 33 | "descriptionValue": "First included in ames, this notebook dynamically shows how many records are in CaltechDATA and where they come from (GitHub, Deposit Form, or API). This repository is set to work with MyBinder so you can easily reproduce the plot and include new records. " 34 | }, 35 | { 36 | "descriptionType": "Other", 37 | "descriptionValue": "
Cite this record as:
Morrell, T. E. (2019, April 29). caltechlibrary/caltechdata_usage: First release of CaltechDATA Usage notebook (Version v0.0.1). CaltechDATA. https://doi.org/10.22002/d1.1235
or choose a different citation style.
Download Citation
" 38 | }, 39 | { 40 | "descriptionType": "Other", 41 | "descriptionValue": "
Unique Views: 4
Unique Downloads: 1
between April 29, 2019 and July 02, 2020
More info on how stats are collected
" 42 | } 43 | ], 44 | "doi": "10.22002/D1.1235", 45 | "electronic_location_and_access": [ 46 | { 47 | "access_method": "HTTP", 48 | "electronic_name": [ 49 | "caltechlibrary_caltechdata_usage-v0.0.1.zip" 50 | ], 51 | "embargo_status": "{{embargo_status}}", 52 | "file_size": "87521", 53 | "uniform_resource_identifier": "https://data.caltech.edu/tindfiles/serve/9762705a-1de2-4f2c-9553-5150ba6e98e1/" 54 | } 55 | ], 56 | "files": [ 57 | { 58 | "id": "9762705a-1de2-4f2c-9553-5150ba6e98e1", 59 | "path": "https://data.caltech.edu/tindfiles/serve/9762705a-1de2-4f2c-9553-5150ba6e98e1/" 60 | } 61 | ], 62 | "id": "1235", 63 | "owners": [ 64 | 2 65 | ], 66 | "pid_value": "1235", 67 | "publicationDate": "2019-04-29", 68 | "publishers": [ 69 | { 70 | "publisherName": "CaltechDATA" 71 | } 72 | ], 73 | "relatedIdentifiers": [ 74 | { 75 | "relatedIdentifier": "https://github.com/caltechlibrary/caltechdata_usage/releases/tag/v0.0.1", 76 | "relatedIdentifierRelation": "IsIdenticalTo", 77 | "relatedIdentifierScheme": "URL" 78 | } 79 | ], 80 | "relevantDates": [ 81 | { 82 | "relevantDateType": "Issued", 83 | "relevantDateValue": "2019-04-29" 84 | } 85 | ], 86 | "resourceType": { 87 | "resourceTypeGeneral": "Software" 88 | }, 89 | "rightsList": { 90 | "rights": "license", 91 | "rightsURI": "https://data.caltech.edu/license" 92 | }, 93 | "subjects": [ 94 | "CaltechDATA", 95 | "reporitory", 96 | "usage", 97 | "Jupyter", 98 | "GitHub" 99 | ], 100 | "title": "caltechlibrary/caltechdata_usage: First release of CaltechDATA Usage notebook", 101 | "version": "v0.0.1" 102 | }, 103 | "updated": "2020-07-02T20:40:41.944666+00:00" 104 | } -------------------------------------------------------------------------------- /tests/data/caltechdata/1250.json: -------------------------------------------------------------------------------- 1 | { 2 | "created": "2019-06-19T16:27:52.460707+00:00", 3 | "id": 1250, 4 | "links": { 5 | "self": "http://data.caltech.edu/api/record/1250" 6 | }, 7 | "metadata": { 8 
| "_form_uuid": "beae3039-29ed-4e20-bd21-6ed6e994afa5", 9 | "alternateIdentifiers": [ 10 | { 11 | "alternateIdentifier": "1250", 12 | "alternateIdentifierType": "CaltechDATA_Identifier" 13 | } 14 | ], 15 | "authors": [ 16 | { 17 | "authorAffiliation": [ 18 | "Caltech Library" 19 | ], 20 | "authorIdentifiers": [ 21 | { 22 | "authorIdentifier": "0000-0001-9266-5146", 23 | "authorIdentifierScheme": "ORCID" 24 | } 25 | ], 26 | "authorName": "Morrell, Thomas E" 27 | } 28 | ], 29 | "control_number": "1250", 30 | "descriptions": [ 31 | { 32 | "descriptionType": "Abstract", 33 | "descriptionValue": "This release includes two months more data and has some dependency updates." 34 | }, 35 | { 36 | "descriptionType": "Other", 37 | "descriptionValue": "Jupyter notebooks highlighting usage of CaltechDATA" 38 | }, 39 | { 40 | "descriptionType": "Other", 41 | "descriptionValue": "
Click to run this software:
" 42 | }, 43 | { 44 | "descriptionType": "Other", 45 | "descriptionValue": "
Cite this record as:
Morrell, T. E. (2019, June 19). caltechlibrary/caltechdata_usage: Jupyter notebook with visualization of submissions to CaltechDATA (Version v0.0.2). CaltechDATA. https://doi.org/10.22002/d1.1250
or choose a different citation style.
Download Citation
" 46 | }, 47 | { 48 | "descriptionType": "Other", 49 | "descriptionValue": "
Unique Views: 85
Unique Downloads: 2
between June 19, 2019 and July 02, 2020
More info on how stats are collected
" 50 | } 51 | ], 52 | "doi": "10.22002/D1.1250", 53 | "electronic_location_and_access": [ 54 | { 55 | "access_method": "HTTP", 56 | "electronic_name": [ 57 | "caltechlibrary_caltechdata_usage-v0.0.2.zip" 58 | ], 59 | "embargo_status": "{{embargo_status}}", 60 | "file_size": "90421", 61 | "uniform_resource_identifier": "https://data.caltech.edu/tindfiles/serve/45bc4db6-7d54-4cb7-b98a-ad9de15b0e29/" 62 | } 63 | ], 64 | "files": [ 65 | { 66 | "id": "45bc4db6-7d54-4cb7-b98a-ad9de15b0e29", 67 | "path": "https://data.caltech.edu/tindfiles/serve/45bc4db6-7d54-4cb7-b98a-ad9de15b0e29/" 68 | } 69 | ], 70 | "id": "1250", 71 | "owners": [ 72 | 2 73 | ], 74 | "pid_value": "1250", 75 | "publicationDate": "2019-06-19", 76 | "publishers": [ 77 | { 78 | "publisherName": "CaltechDATA" 79 | } 80 | ], 81 | "relatedIdentifiers": [ 82 | { 83 | "relatedIdentifier": "https://github.com/caltechlibrary/caltechdata_usage/releases/tag/v0.0.2", 84 | "relatedIdentifierRelation": "IsIdenticalTo", 85 | "relatedIdentifierScheme": "URL" 86 | } 87 | ], 88 | "relevantDates": [ 89 | { 90 | "relevantDateType": "Issued", 91 | "relevantDateValue": "2019-06-19" 92 | } 93 | ], 94 | "resourceType": { 95 | "resourceTypeGeneral": "Software" 96 | }, 97 | "rightsList": { 98 | "rights": "license", 99 | "rightsURI": "https://data.caltech.edu/license" 100 | }, 101 | "subjects": [ 102 | "CaltechDATA", 103 | "reporitory", 104 | "usage", 105 | "Jupyter", 106 | "GitHub" 107 | ], 108 | "title": "caltechlibrary/caltechdata_usage: Jupyter notebook with visualization of submissions to CaltechDATA", 109 | "version": "v0.0.2" 110 | }, 111 | "updated": "2020-07-02T20:38:38.355371+00:00" 112 | } -------------------------------------------------------------------------------- /tests/data/caltechdata/1259.json: -------------------------------------------------------------------------------- 1 | { 2 | "created": "2019-07-16T22:53:14.663052+00:00", 3 | "id": 1259, 4 | "links": { 5 | "self": 
"http://data.caltech.edu/api/record/1259" 6 | }, 7 | "metadata": { 8 | "_form_uuid": "beae3039-29ed-4e20-bd21-6ed6e994afa5", 9 | "alternateIdentifiers": [ 10 | { 11 | "alternateIdentifier": "1259", 12 | "alternateIdentifierType": "CaltechDATA_Identifier" 13 | } 14 | ], 15 | "authors": [ 16 | { 17 | "authorAffiliation": [ 18 | "Caltech Library" 19 | ], 20 | "authorIdentifiers": [ 21 | { 22 | "authorIdentifier": "0000-0001-9266-5146", 23 | "authorIdentifierScheme": "ORCID" 24 | } 25 | ], 26 | "authorName": "Morrell, Thomas E" 27 | } 28 | ], 29 | "control_number": "1259", 30 | "descriptions": [ 31 | { 32 | "descriptionType": "Abstract", 33 | "descriptionValue": "This release includes a new notebook that determines the use of ORCID iDs across Caltech Library DOIs. It also updates all notebooks to use the latest version of ames and streamlines dependencies." 34 | }, 35 | { 36 | "descriptionType": "Other", 37 | "descriptionValue": "Jupyter notebooks highlighting usage of CaltechDATA" 38 | }, 39 | { 40 | "descriptionType": "Other", 41 | "descriptionValue": "
Click to run this software:
" 42 | }, 43 | { 44 | "descriptionType": "Other", 45 | "descriptionValue": "
Cite this record as:
Morrell, T. E. (2019, July 16). caltechlibrary/caltechdata_usage: Addition of ORCID analysis notebook and update for new ames version (Version v0.1.0). CaltechDATA. https://doi.org/10.22002/d1.1259
or choose a different citation style.
Download Citation
" 46 | }, 47 | { 48 | "descriptionType": "Other", 49 | "descriptionValue": "
Unique Views: 86
Unique Downloads: 1
between July 16, 2019 and July 02, 2020
More info on how stats are collected
" 50 | } 51 | ], 52 | "doi": "10.22002/D1.1259", 53 | "electronic_location_and_access": [ 54 | { 55 | "access_method": "HTTP", 56 | "electronic_name": [ 57 | "caltechlibrary_caltechdata_usage-v0.1.0.zip" 58 | ], 59 | "embargo_status": "{{embargo_status}}", 60 | "file_size": "101985", 61 | "uniform_resource_identifier": "https://data.caltech.edu/tindfiles/serve/96b518fe-8f28-4ff6-9a4c-d5cc59f4644e/" 62 | } 63 | ], 64 | "files": [ 65 | { 66 | "id": "96b518fe-8f28-4ff6-9a4c-d5cc59f4644e", 67 | "path": "https://data.caltech.edu/tindfiles/serve/96b518fe-8f28-4ff6-9a4c-d5cc59f4644e/" 68 | } 69 | ], 70 | "id": "1259", 71 | "owners": [ 72 | 2 73 | ], 74 | "pid_value": "1259", 75 | "publicationDate": "2019-07-16", 76 | "publishers": [ 77 | { 78 | "publisherName": "CaltechDATA" 79 | } 80 | ], 81 | "relatedIdentifiers": [ 82 | { 83 | "relatedIdentifier": "https://github.com/caltechlibrary/caltechdata_usage/releases/tag/v0.1.0", 84 | "relatedIdentifierRelation": "IsIdenticalTo", 85 | "relatedIdentifierScheme": "URL" 86 | } 87 | ], 88 | "relevantDates": [ 89 | { 90 | "relevantDateType": "Issued", 91 | "relevantDateValue": "2019-07-16" 92 | } 93 | ], 94 | "resourceType": { 95 | "resourceTypeGeneral": "Software" 96 | }, 97 | "rightsList": { 98 | "rights": "license", 99 | "rightsURI": "https://data.caltech.edu/license" 100 | }, 101 | "subjects": [ 102 | "CaltechDATA", 103 | "reporitory", 104 | "usage", 105 | "Jupyter", 106 | "GitHub" 107 | ], 108 | "title": "caltechlibrary/caltechdata_usage: Addition of ORCID analysis notebook and update for new ames version", 109 | "version": "v0.1.0" 110 | }, 111 | "updated": "2020-07-02T20:37:16.358971+00:00" 112 | } -------------------------------------------------------------------------------- /tests/data/caltechdata/293.json: -------------------------------------------------------------------------------- 1 | { 2 | "created": "2017-09-13T18:03:12.224037+00:00", 3 | "id": 293, 4 | "links": { 5 | "self": 
"http://data.caltech.edu/api/record/293" 6 | }, 7 | "metadata": { 8 | "_form_uuid": "beae3039-29ed-4e20-bd21-6ed6e994afa5", 9 | "_oai": { 10 | "id": "oai:data.caltech.edu:recid/293", 11 | "updated": "2017-09-13T18:03:12Z" 12 | }, 13 | "access_right": "open", 14 | "alternateIdentifiers": [ 15 | { 16 | "alternateIdentifier": "293", 17 | "alternateIdentifierType": "CaltechDATA_Identifier" 18 | } 19 | ], 20 | "authors": [ 21 | { 22 | "authorAffiliation": [ 23 | "TCCON Consortium" 24 | ], 25 | "authorName": "Total Carbon Column Observing Network (TCCON) Team" 26 | } 27 | ], 28 | "contributors": [ 29 | { 30 | "contributorAffiliation": [ 31 | "California Institute of Technology, Pasadena, CA, U.S.A." 32 | ], 33 | "contributorEmail": "dwunch@atmosp.physics.utoronto.ca", 34 | "contributorIdentifiers": [ 35 | { 36 | "contributorIdentifier": "0000-0002-4924-0377", 37 | "contributorIdentifierScheme": "ORCID" 38 | } 39 | ], 40 | "contributorName": "Wunch, Debra", 41 | "contributorType": "ContactPerson" 42 | }, 43 | { 44 | "contributorAffiliation": [ 45 | "California Institute of Technology, Pasadena, CA (US)" 46 | ], 47 | "contributorEmail": "wennberg@caltech.edu", 48 | "contributorIdentifiers": [ 49 | { 50 | "contributorIdentifier": "0000-0002-6126-3854", 51 | "contributorIdentifierScheme": "ORCID" 52 | } 53 | ], 54 | "contributorName": "Wennberg, P. O. ", 55 | "contributorType": "ContactPerson" 56 | }, 57 | { 58 | "contributorAffiliation": [ 59 | "Centre for Atmospheric Chemistry, School of Chemistry, University of Wollongong, Wollongong, NSW (AU)" 60 | ], 61 | "contributorEmail": "griffith@uow.edu.au", 62 | "contributorIdentifiers": [ 63 | { 64 | "contributorIdentifier": " 0000-0002-7986-1924", 65 | "contributorIdentifierScheme": "ORCID" 66 | } 67 | ], 68 | "contributorName": "Griffith, D. 
W.T.", 69 | "contributorType": "ContactPerson" 70 | }, 71 | { 72 | "contributorAffiliation": [ 73 | " Institute of Environmental Physics, University of Bremen, Bremen (DE), Centre for Atmospheric Chemistry, School of Chemistry, University of Wollongong, Wollongong, NSW (AU) " 74 | ], 75 | "contributorEmail": "n_deutscher@iup.physik.uni-bremen.de", 76 | "contributorIdentifiers": [ 77 | { 78 | "contributorIdentifier": "0000-0002-2906-2577", 79 | "contributorIdentifierScheme": "ORCID" 80 | } 81 | ], 82 | "contributorName": "Deutscher, N. M.", 83 | "contributorType": "ContactPerson" 84 | }, 85 | { 86 | "contributorAffiliation": [ 87 | "Max Planck Institute for Biogeochemistry, Jena (DE)" 88 | ], 89 | "contributorEmail": "dfeist@bgc-jena.mpg.de", 90 | "contributorIdentifiers": [ 91 | { 92 | "contributorIdentifier": "0000-0002-5890-6687", 93 | "contributorIdentifierScheme": "ORCID" 94 | } 95 | ], 96 | "contributorName": "Feist, D. G.", 97 | "contributorType": "ContactPerson" 98 | }, 99 | { 100 | "contributorAffiliation": [ 101 | "Institute of Environmental Physics, University of Bremen, Bremen (DE)" 102 | ], 103 | "contributorEmail": "jnotholt@iup.physik.uni-bremen.de", 104 | "contributorIdentifiers": [ 105 | { 106 | "contributorIdentifier": "0000-0002-3324-885X", 107 | "contributorIdentifierScheme": "ORCID" 108 | } 109 | ], 110 | "contributorName": "Notholt, J.", 111 | "contributorType": "ContactPerson" 112 | } 113 | ], 114 | "control_number": "293", 115 | "descriptions": [ 116 | { 117 | "descriptionType": "Other", 118 | "descriptionValue": "The Total Carbon Column Observing Network (TCCON) is a network of ground-based Fourier Transform Spectrometers that record direct solar absorption spectra of the atmosphere in the near-infrared. From these spectra, accurate and precise column-averaged abundances of atmospheric constituents including CO2, CH4, N2O, HF, CO, H2O, and HDO, are retrieved. This is the 2014 data release." 
119 | }, 120 | { 121 | "descriptionType": "Other", 122 | "descriptionValue": "
Unique Views: 953
Unique Downloads: 98
between September 13, 2017 and July 02, 2020
More info on how stats are collected
" 123 | }, 124 | { 125 | "descriptionType": "Other", 126 | "descriptionValue": "
Cite this record as:
Total Carbon Column Observing Network (TCCON) Team. (2017). 2014 TCCON Data Release (Version GGG2014) [Data set]. CaltechDATA. https://doi.org/10.14291/TCCON.GGG2014
or choose a different citation style.
Download Citation
" 127 | } 128 | ], 129 | "doi": "10.14291/TCCON.GGG2014", 130 | "electronic_location_and_access": [ 131 | { 132 | "access_method": "HTTP", 133 | "electronic_name": [ 134 | "tccon.latest.public.tgz" 135 | ], 136 | "embargo_status": "open", 137 | "file_size": "236307805", 138 | "uniform_resource_identifier": "https://data.caltech.edu/tindfiles/serve/d739803e-d069-4a4e-9070-a85ed6ddb07d/" 139 | }, 140 | { 141 | "access_method": "HTTP", 142 | "electronic_name": [ 143 | "LICENSE.txt" 144 | ], 145 | "embargo_status": "open", 146 | "file_size": "11436", 147 | "uniform_resource_identifier": "https://data.caltech.edu/tindfiles/serve/24d2401d-d2b7-42e1-83b1-1ee01839d84d/" 148 | } 149 | ], 150 | "files": [ 151 | { 152 | "id": "d739803e-d069-4a4e-9070-a85ed6ddb07d", 153 | "path": "https://data.caltech.edu/tindfiles/serve/d739803e-d069-4a4e-9070-a85ed6ddb07d/" 154 | }, 155 | { 156 | "id": "24d2401d-d2b7-42e1-83b1-1ee01839d84d", 157 | "path": "https://data.caltech.edu/tindfiles/serve/24d2401d-d2b7-42e1-83b1-1ee01839d84d/" 158 | } 159 | ], 160 | "format": [ 161 | ".tgz", 162 | ".nc" 163 | ], 164 | "id": "293", 165 | "language": "eng", 166 | "license": "other-license", 167 | "owners": [ 168 | 2 169 | ], 170 | "pid_value": "293", 171 | "publicationDate": "2017-09-13", 172 | "publishers": { 173 | "publisherName": "CaltechDATA" 174 | }, 175 | "relatedIdentifiers": [ 176 | { 177 | "relatedIdentifier": "10.14291/TCCON.GGG2014.DOCUMENTATION.R0/1221662", 178 | "relatedIdentifierRelation": "IsDocumentedBy", 179 | "relatedIdentifierScheme": "DOI" 180 | }, 181 | { 182 | "relatedIdentifier": "https://tccon-wiki.caltech.edu/Network_Policy/Data_Use_Policy/Data_Description", 183 | "relatedIdentifierRelation": "IsDocumentedBy", 184 | "relatedIdentifierScheme": "URL" 185 | }, 186 | { 187 | "relatedIdentifier": "https://tccon-wiki.caltech.edu/Sites", 188 | "relatedIdentifierRelation": "IsDocumentedBy", 189 | "relatedIdentifierScheme": "DOI" 190 | } 191 | ], 192 | "relevantDates": [ 193 | { 194 | 
"relevantDateType": "Updated", 195 | "relevantDateValue": "2020-07-01" 196 | }, 197 | { 198 | "relevantDateType": "Submitted", 199 | "relevantDateValue": "2017-09-13" 200 | } 201 | ], 202 | "resourceType": { 203 | "resourceTypeGeneral": "Dataset" 204 | }, 205 | "rightsList": { 206 | "rights": "TCCON Data Use Policy", 207 | "rightsURI": "https://data.caltech.edu/tindfiles/serve/24d2401d-d2b7-42e1-83b1-1ee01839d84d/" 208 | }, 209 | "subjects": [ 210 | "atmospheric trace gases", 211 | " CO2", 212 | " CH4", 213 | " CO", 214 | " N2O", 215 | " column-averaged dry-air mole fractions", 216 | " remote sensing", 217 | " FTIR spectroscopy", 218 | " TCCON" 219 | ], 220 | "title": "2014 TCCON Data Release", 221 | "version": "GGG2014" 222 | }, 223 | "updated": "2020-07-03T10:56:19.627073+00:00" 224 | } -------------------------------------------------------------------------------- /tests/data/datacite43/4yxbs-4mj38.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Law", 12 | "givenName": "Casey", 13 | "name": "Casey Law", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0002-4119-9963", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | } 22 | ], 23 | "dates": [ 24 | { 25 | "date": "2023", 26 | "dateType": "Issued" 27 | }, 28 | { 29 | "date": "2023-04-21", 30 | "dateType": "Created" 31 | } 32 | ], 33 | "descriptions": [ 34 | { 35 | "description": "Data associated with DSA-110 candidate transient. Each filterbank is saved at maximum native resolution (32.7 microseconds, 30.4 kHz) and contains ~0.669696 seconds (20480 samples) around the burst across the full DSA-110 187 MHz (6144 channels) frequency band. 
The Stokes parameters have been calibrated using observations of the 3C48 and 3C286 Very Large Array (VLA) calibrators as described in Sherman et al., 2024 (https://doi.org/10.3847/1538-4357/ad275e ; see Appendices D and E). Note that there may be minute differences between the data contained here and that reported in Sherman et al., 2024 due to being calibrated at maximum time resolution, rather than being downsampled first.", 36 | "descriptionType": "Abstract" 37 | } 38 | ], 39 | "formats": [ 40 | "png" 41 | ], 42 | "fundingReferences": [ 43 | { 44 | "funderIdentifier": "grid.431093.c", 45 | "funderIdentifierType": "GRID", 46 | "funderName": "National Science Foundation" 47 | } 48 | ], 49 | "geoLocations": [ 50 | { 51 | "geoLocationPlace": "OVRO", 52 | "geoLocationPoint": { 53 | "pointLatitude": 37.2339, 54 | "pointLongitude": -118.282 55 | } 56 | } 57 | ], 58 | "identifiers": [ 59 | { 60 | "identifier": "10.22002/4yxbs-4mj38", 61 | "identifierType": "DOI" 62 | }, 63 | { 64 | "identifier": "oai:data.caltech.edu:4yxbs-4mj38", 65 | "identifierType": "oai" 66 | }, 67 | { 68 | "identifier": "220506aabd", 69 | "identifierType": "dsa-110-id" 70 | }, 71 | { 72 | "identifier": "byyt8-y6a26", 73 | "identifierType": "cdid" 74 | } 75 | ], 76 | "publicationYear": "2023", 77 | "publisher": "Caltech Data", 78 | "relatedIdentifiers": [ 79 | { 80 | "relatedIdentifier": "http://deepsynoptic.org", 81 | "relatedIdentifierType": "URL", 82 | "relationType": "IsDocumentedBy" 83 | } 84 | ], 85 | "rightsList": [ 86 | { 87 | "rights": "cc-by-4.0" 88 | } 89 | ], 90 | "schemaVersion": "http://datacite.org/schema/kernel-4", 91 | "subjects": [ 92 | { 93 | "subject": "OVRO" 94 | }, 95 | { 96 | "subject": "Astrophysics" 97 | }, 98 | { 99 | "subject": "Fast Radio Bursts" 100 | } 101 | ], 102 | "titles": [ 103 | { 104 | "title": "DSA-110 Data for Candidate Fast Radio Burst 220506aabd" 105 | } 106 | ], 107 | "types": { 108 | "resourceType": "", 109 | "resourceTypeGeneral": "Dataset" 110 | }, 111 
| "version": "2.0" 112 | } -------------------------------------------------------------------------------- /tests/data/datacite43/asjw8-cd908.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "familyName": "Sloan", 5 | "givenName": "Julia", 6 | "name": "Sloan, Julia", 7 | "nameIdentifiers": [ 8 | { 9 | "nameIdentifier": "0000-0003-0200-063X", 10 | "nameIdentifierScheme": "ORCID" 11 | } 12 | ], 13 | "nameType": "Personal" 14 | } 15 | ], 16 | "dates": [ 17 | { 18 | "date": "2024-10-25", 19 | "dateType": "Issued" 20 | }, 21 | { 22 | "date": "2024-10-31", 23 | "dateInformation": "Correct file added", 24 | "dateType": "Updated" 25 | } 26 | ], 27 | "descriptions": [ 28 | { 29 | "description": "This artifact contains two datasets: one corresponding to a simulation solving Richards Equation in clay, and another solving it in sand. These experiments were conducted in Gordon Bonan's \"Climate Change and Terrestrial Ecosystem Modeling\" textbook, Chapter 8 supplementary program 1.\nFull citation: Bonan, Gordon. Climate Change and Terrestrial Ecosystem Modeling. 
Cambridge University Press, 2019.", 30 | "descriptionType": "Abstract" 31 | } 32 | ], 33 | "identifiers": [ 34 | { 35 | "identifier": "10.22002/asjw8-cd908", 36 | "identifierType": "DOI" 37 | }, 38 | { 39 | "identifier": "oai:data.caltech.edu:asjw8-cd908", 40 | "identifierType": "oai" 41 | } 42 | ], 43 | "publicationYear": "2024", 44 | "publisher": "CaltechDATA", 45 | "rightsList": [ 46 | { 47 | "rights": "Creative Commons Zero v1.0 Universal", 48 | "rightsIdentifier": "cc0-1.0", 49 | "rightsIdentifierScheme": "spdx", 50 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 51 | } 52 | ], 53 | "schemaVersion": "http://datacite.org/schema/kernel-4", 54 | "titles": [ 55 | { 56 | "title": "Bonan 2019 Richards Eqn Data" 57 | } 58 | ], 59 | "types": { 60 | "resourceType": "", 61 | "resourceTypeGeneral": "Dataset" 62 | } 63 | } -------------------------------------------------------------------------------- /tests/data/datacite43/b2jqz-qdw65.json: -------------------------------------------------------------------------------- 1 | { 2 | "contributors": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "name": "Department of Biological Sciences, Tata Institute of Fundamental Research, Mumbai, Maharashtra, India" 7 | } 8 | ], 9 | "contributorType": "ContactPerson", 10 | "familyName": "Koushika", 11 | "givenName": "Sandhya P.", 12 | "name": "Koushika, Sandhya P.", 13 | "nameIdentifiers": [ 14 | { 15 | "nameIdentifier": "0000-0002-1742-7356", 16 | "nameIdentifierScheme": "ORCID" 17 | } 18 | ], 19 | "nameType": "Personal" 20 | } 21 | ], 22 | "creators": [ 23 | { 24 | "affiliation": [ 25 | { 26 | "name": "Department of Biological Sciences, Tata Institute of Fundamental Research, Mumbai, Maharashtra, India" 27 | } 28 | ], 29 | "familyName": "Vasudevan", 30 | "givenName": "Amruta", 31 | "name": "Vasudevan, Amruta", 32 | "nameIdentifiers": [ 33 | { 34 | "nameIdentifier": "0000-0002-5777-9508", 35 | "nameIdentifierScheme": "ORCID" 36 | } 37 | ], 38 | "nameType": 
"Personal" 39 | } 40 | ], 41 | "dates": [ 42 | { 43 | "date": "2024-06-25", 44 | "dateType": "Issued" 45 | }, 46 | { 47 | "date": "2024-06-24", 48 | "dateType": "Accepted" 49 | } 50 | ], 51 | "descriptions": [ 52 | { 53 | "description": "Raw data for figure 1", 54 | "descriptionType": "Abstract" 55 | } 56 | ], 57 | "fundingReferences": [ 58 | { 59 | "funderName": "The authors gratefully acknowledge support from the Department of Atomic Energy, Government of India (DAE) grants 12-R\\&D-IMS-5.02-0202 and 1303/2/2019/R\\&DII/DAE/2079 (dated 11.02.2020 to S.P.K.), the Howard Hughes Medical Institute (HHMI) International Early Career Scientist (IECS) grant 55007425 (to S.P.K.), CSIR (to S.P.K.), and funding from the PRISM project at the Institute of Mathematical Sciences (to S.P.K.) for research costs. The authors gratefully acknowledge salary support from TIFR-DAE (for A.V.)." 60 | } 61 | ], 62 | "identifiers": [ 63 | { 64 | "identifier": "10.22002/b2jqz-qdw65", 65 | "identifierType": "DOI" 66 | }, 67 | { 68 | "identifier": "oai:data.caltech.edu:b2jqz-qdw65", 69 | "identifierType": "oai" 70 | } 71 | ], 72 | "language": "eng", 73 | "publicationYear": "2024", 74 | "publisher": "CaltechDATA", 75 | "relatedIdentifiers": [ 76 | { 77 | "relatedIdentifier": "10.17912/micropub.biology.001204", 78 | "relatedIdentifierType": "DOI", 79 | "relationType": "IsPartOf", 80 | "resourceTypeGeneral": "Text" 81 | } 82 | ], 83 | "rightsList": [ 84 | { 85 | "rights": "Creative Commons Attribution 4.0 International", 86 | "rightsIdentifier": "cc-by-4.0", 87 | "rightsIdentifierScheme": "spdx", 88 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 89 | } 90 | ], 91 | "schemaVersion": "http://datacite.org/schema/kernel-4", 92 | "subjects": [ 93 | { 94 | "subject": "c. 
elegans" 95 | } 96 | ], 97 | "titles": [ 98 | { 99 | "title": "Dataset: Physical presence of chemical synapses is necessary for turning behavior of anterograde synaptic vesicles at the branch point of PLM neurons in C. elegans" 100 | } 101 | ], 102 | "types": { 103 | "resourceType": "", 104 | "resourceTypeGeneral": "Dataset" 105 | }, 106 | "version": "1.0" 107 | } -------------------------------------------------------------------------------- /tests/data/datacite43/d7mk4-f8t44.json: -------------------------------------------------------------------------------- 1 | { 2 | "contributors": [ 3 | { 4 | "contributorType": "DataCurator", 5 | "familyName": "Civilini", 6 | "givenName": "Francesco", 7 | "name": "Civilini, Francesco", 8 | "nameIdentifiers": [ 9 | { 10 | "nameIdentifier": "0000-0003-0669-0404", 11 | "nameIdentifierScheme": "ORCID" 12 | } 13 | ], 14 | "nameType": "Personal" 15 | }, 16 | { 17 | "affiliation": [ 18 | { 19 | "affiliationIdentifier": "05dxps055", 20 | "affiliationIdentifierScheme": "ROR", 21 | "name": "California Institute of Technology" 22 | } 23 | ], 24 | "contributorType": "Other", 25 | "familyName": "Husker", 26 | "givenName": "Allen", 27 | "name": "Husker, Allen", 28 | "nameIdentifiers": [ 29 | { 30 | "nameIdentifier": "0000-0003-1139-0502", 31 | "nameIdentifierScheme": "ORCID" 32 | } 33 | ], 34 | "nameType": "Personal" 35 | }, 36 | { 37 | "contributorType": "Other", 38 | "familyName": "Weber", 39 | "givenName": "Renee", 40 | "name": "Weber, Renee", 41 | "nameIdentifiers": [ 42 | { 43 | "nameIdentifier": "0000-0002-1649-483X", 44 | "nameIdentifierScheme": "ORCID" 45 | } 46 | ], 47 | "nameType": "Personal" 48 | } 49 | ], 50 | "creators": [ 51 | { 52 | "familyName": "Civilini", 53 | "givenName": "Francesco", 54 | "name": "Civilini, Francesco", 55 | "nameIdentifiers": [], 56 | "nameType": "Personal" 57 | } 58 | ], 59 | "dates": [ 60 | { 61 | "date": "2024-10-24", 62 | "dateType": "Issued" 63 | } 64 | ], 65 | "descriptions": [ 66 | { 67 | 
"description": "This dataset contains data from the Lunar Seismic Profiling Experiment as well as results from the JGR: Planets publication \"Thermal moonquake characterization and cataloging using frequency-based algorithms and stochastic gradient descent\".\u00a0\nThe code to compute the results can be found in the GitHub here:\nhttps://github.com/civilinifr/thermal_mq_analysis\nalso published through Zenodo here:\nhttp://doi.org/10.5281/zenodo.8025056\n\u00a0\nv2: Fixed files in LSPE_sac_hourly.zip to contain correct start and end times.\u00a0\n\u00a0\nIncludes:\n\nThermal moonquake catalog of Grade-A LSPE events\nDaily ASCII files in units of decompressed volts (filenames daily_ascii_YYYYMMDD_YYYYMMDD.zip)\nHourly SAC files in units of decompressed volts (filename LSPE_sac_hourly.zip)\nHourly SAC files in units of nm/s (filenames LSPE_sac_hourly_phys_p1.zip and LSPE_sac_hourly_phys_p2.zip)\nAnalysis results from the manuscript (filename lunar_output.zip)", 68 | "descriptionType": "Abstract" 69 | } 70 | ], 71 | "identifiers": [ 72 | { 73 | "identifier": "10.22002/d7mk4-f8t44", 74 | "identifierType": "DOI" 75 | }, 76 | { 77 | "identifier": "oai:data.caltech.edu:d7mk4-f8t44", 78 | "identifierType": "oai" 79 | } 80 | ], 81 | "publicationYear": "2024", 82 | "publisher": "CaltechDATA", 83 | "rightsList": [ 84 | { 85 | "rights": "Creative Commons Zero v1.0 Universal", 86 | "rightsIdentifier": "cc0-1.0", 87 | "rightsIdentifierScheme": "spdx", 88 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 89 | } 90 | ], 91 | "schemaVersion": "http://datacite.org/schema/kernel-4", 92 | "titles": [ 93 | { 94 | "title": "Apollo 17 Lunar Seismic Profiling Experiment Seismic Data and Thermal Moonquake Catalog" 95 | } 96 | ], 97 | "types": { 98 | "resourceType": "", 99 | "resourceTypeGeneral": "Dataset" 100 | }, 101 | "version": "v2.0" 102 | } -------------------------------------------------------------------------------- 
/tests/data/datacite43/dks9f-mj878.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Gray", 12 | "givenName": "Robert M.", 13 | "name": "Gray, Robert M.", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0001-5980-8774", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | }, 22 | { 23 | "affiliation": [ 24 | { 25 | "affiliationIdentifier": "05dxps055", 26 | "affiliationIdentifierScheme": "ROR", 27 | "name": "California Institute of Technology" 28 | } 29 | ], 30 | "familyName": "Liu", 31 | "givenName": "Mingchen", 32 | "name": "Liu, Mingchen", 33 | "nameIdentifiers": [], 34 | "nameType": "Personal" 35 | }, 36 | { 37 | "affiliation": [ 38 | { 39 | "affiliationIdentifier": "05dxps055", 40 | "affiliationIdentifierScheme": "ROR", 41 | "name": "California Institute of Technology" 42 | } 43 | ], 44 | "familyName": "Zhou", 45 | "givenName": "Selina", 46 | "name": "Zhou, Selina", 47 | "nameIdentifiers": [], 48 | "nameType": "Personal" 49 | } 50 | ], 51 | "dates": [ 52 | { 53 | "date": "2024-07-20", 54 | "dateType": "Issued" 55 | } 56 | ], 57 | "descriptions": [ 58 | { 59 | "description": "Data and processing code corresponding to the manuscript, \"Quadratic-soliton-enhanced mid-IR molecular sensing.\"", 60 | "descriptionType": "Abstract" 61 | } 62 | ], 63 | "identifiers": [ 64 | { 65 | "identifier": "10.22002/dks9f-mj878", 66 | "identifierType": "DOI" 67 | }, 68 | { 69 | "identifier": "oai:data.caltech.edu:dks9f-mj878", 70 | "identifierType": "oai" 71 | } 72 | ], 73 | "publicationYear": "2024", 74 | "publisher": "CaltechDATA", 75 | "relatedIdentifiers": [ 76 | { 77 | "relatedIdentifier": "arXiv:2301.07826", 78 | "relatedIdentifierType": "arXiv", 79 | "relationType": 
"IsDescribedBy", 80 | "resourceTypeGeneral": "Text" 81 | } 82 | ], 83 | "rightsList": [ 84 | { 85 | "rights": "Creative Commons Attribution 4.0 International", 86 | "rightsIdentifier": "cc-by-4.0", 87 | "rightsIdentifierScheme": "spdx", 88 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 89 | } 90 | ], 91 | "schemaVersion": "http://datacite.org/schema/kernel-4", 92 | "titles": [ 93 | { 94 | "title": "Data for \"Quadratic-soliton-enhanced mid-IR molecular sensing\"" 95 | } 96 | ], 97 | "types": { 98 | "resourceType": "", 99 | "resourceTypeGeneral": "Dataset" 100 | } 101 | } -------------------------------------------------------------------------------- /tests/data/datacite43/ep884-g0v97.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "familyName": "Chen", 5 | "givenName": "Zibo", 6 | "name": "Chen, Zibo", 7 | "nameIdentifiers": [], 8 | "nameType": "Personal" 9 | } 10 | ], 11 | "dates": [ 12 | { 13 | "date": "2024-06-19", 14 | "dateType": "Issued" 15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "identifier": "10.22002/ep884-g0v97", 20 | "identifierType": "DOI" 21 | }, 22 | { 23 | "identifier": "oai:data.caltech.edu:ep884-g0v97", 24 | "identifierType": "oai" 25 | } 26 | ], 27 | "publicationYear": "2024", 28 | "publisher": "CaltechDATA", 29 | "rightsList": [ 30 | { 31 | "rights": "Creative Commons Zero v1.0 Universal", 32 | "rightsIdentifier": "cc0-1.0", 33 | "rightsIdentifierScheme": "spdx", 34 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 35 | } 36 | ], 37 | "schemaVersion": "http://datacite.org/schema/kernel-4", 38 | "titles": [ 39 | { 40 | "title": "A synthetic protein-level neural network in mammalian cells" 41 | } 42 | ], 43 | "types": { 44 | "resourceType": "", 45 | "resourceTypeGeneral": "Dataset" 46 | }, 47 | "version": "2.0" 48 | } -------------------------------------------------------------------------------- 
/tests/data/datacite43/f40da-hww21.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "035a68863", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "United States Geological Survey" 9 | }, 10 | { 11 | "affiliationIdentifier": "05dxps055", 12 | "affiliationIdentifierScheme": "ROR", 13 | "name": "California Institute of Technology" 14 | } 15 | ], 16 | "familyName": "Atterholt", 17 | "givenName": "James", 18 | "name": "Atterholt, James", 19 | "nameIdentifiers": [ 20 | { 21 | "nameIdentifier": "0000-0003-1603-5518", 22 | "nameIdentifierScheme": "ORCID" 23 | } 24 | ], 25 | "nameType": "Personal" 26 | }, 27 | { 28 | "affiliation": [ 29 | { 30 | "affiliationIdentifier": "05dxps055", 31 | "affiliationIdentifierScheme": "ROR", 32 | "name": "California Institute of Technology" 33 | } 34 | ], 35 | "familyName": "Wilding", 36 | "givenName": "John", 37 | "name": "Wilding, John", 38 | "nameIdentifiers": [ 39 | { 40 | "nameIdentifier": "0000-0002-0914-2078", 41 | "nameIdentifierScheme": "ORCID" 42 | } 43 | ], 44 | "nameType": "Personal" 45 | }, 46 | { 47 | "affiliation": [ 48 | { 49 | "affiliationIdentifier": "05dxps055", 50 | "affiliationIdentifierScheme": "ROR", 51 | "name": "California Institute of Technology" 52 | } 53 | ], 54 | "familyName": "Ross", 55 | "givenName": "Zachary", 56 | "name": "Ross, Zachary", 57 | "nameIdentifiers": [ 58 | { 59 | "nameIdentifier": "0000-0002-6343-8400", 60 | "nameIdentifierScheme": "ORCID" 61 | } 62 | ], 63 | "nameType": "Personal" 64 | } 65 | ], 66 | "dates": [ 67 | { 68 | "date": "2025-01-01", 69 | "dateType": "Issued" 70 | }, 71 | { 72 | "date": "2025-01-01", 73 | "dateType": "Available" 74 | } 75 | ], 76 | "descriptions": [ 77 | { 78 | "description": "There are two catalogs in this dataset produced for the study by Atterholt, Wilding, & Ross (2025):\n\nA relocated earthquake hypocenter location catalog made using 
PhaseNO (Sun et al., 2023) for phase picking, GaMMA (Zhu et al., 2022) for phase association, HypoSVI (Smith et al., 2021) for absolute location, and GrowClust (Trugman & Shearer, 2017) for cross-correlation-based relative relocation.\nA corresponding earthquake moment tensor catalog made using the picks from the hypocenter catalog and the Bayesian framework outlined in Wilding & Ross (2024).\nCitations:\n\nAtterholt, J., Wilding, J. D., & Ross., Z. E. (2025). The evolution of fault orientation in the 2019 Ridgecrest earthquake sequence with a new long-term catalogue of seismicity and moment tensors. Geophysical Journal International, 240(3), 1579\u20131592. https://doi.org/10.1093/gji/ggaf001\nSun, H., Ross, Z. E., Zhu, W., & Azizzadenesheli, K. (2023). Phase Neural Operator for Multi\u2010Station Picking of Seismic Arrivals. Geophysical Research Letters, 50(24), e2023GL106434. https://doi.org/10.1029/2023GL106434\nZhu, W., McBrearty, I. W., Mousavi, S. M., Ellsworth, W. L., & Beroza, G. C. (2022). Earthquake Phase Association Using a Bayesian Gaussian Mixture Model. Journal of Geophysical Research: Solid Earth, 127(5), e2021JB023249. https://doi.org/10.1029/2021JB023249\nSmith, J. D., Ross, Z. E., Azizzadenesheli, K., & Muir, J. B. (2021). HypoSVI: Hypocentre inversion with Stein variational inference and physics informed neural networks. Geophysical Journal International, 228(1), 698\u2013710. https://doi.org/10.1093/gji/ggab309\nTrugman, D. T., & Shearer, P. M. (2017). GrowClust: A Hierarchical Clustering Algorithm for Relative Earthquake Relocation, with Application to the Spanish Springs and Sheldon, Nevada, Earthquake Sequences. Seismological Research Letters, 88(2A), 379\u2013391. 
https://doi.org/10.1785/0220160188", 79 | "descriptionType": "Abstract" 80 | } 81 | ], 82 | "fundingReferences": [ 83 | { 84 | "awardNumber": "1745301", 85 | "awardTitle": "Graduate Research Fellowship", 86 | "funderIdentifier": "grid.431093.c", 87 | "funderIdentifierType": "GRID", 88 | "funderName": "National Science Foundation" 89 | } 90 | ], 91 | "identifiers": [ 92 | { 93 | "identifier": "10.22002/f40da-hww21", 94 | "identifierType": "DOI" 95 | }, 96 | { 97 | "identifier": "oai:data.caltech.edu:f40da-hww21", 98 | "identifierType": "oai" 99 | } 100 | ], 101 | "publicationYear": "2025", 102 | "publisher": "CaltechDATA", 103 | "rightsList": [ 104 | { 105 | "rights": "Creative Commons Zero v1.0 Universal", 106 | "rightsIdentifier": "cc0-1.0", 107 | "rightsIdentifierScheme": "spdx", 108 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 109 | } 110 | ], 111 | "schemaVersion": "http://datacite.org/schema/kernel-4", 112 | "titles": [ 113 | { 114 | "title": "2019 Ridgecrest Earthquake Sequence Long-Term Hypocenter and Moment Tensor Catalog" 115 | } 116 | ], 117 | "types": { 118 | "resourceType": "", 119 | "resourceTypeGeneral": "Dataset" 120 | } 121 | } -------------------------------------------------------------------------------- /tests/data/datacite43/hevaf-20f84.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Law", 12 | "givenName": "Casey", 13 | "name": "Casey Law", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0002-4119-9963", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | } 22 | ], 23 | "dates": [ 24 | { 25 | "date": "2024", 26 | "dateType": "Issued" 27 | }, 28 | { 29 | "date": "2024-04-25", 30 | "dateType": "Created" 31 | } 
32 | ], 33 | "descriptions": [ 34 | { 35 | "description": "Data associated with DSA-110 candidate transient.", 36 | "descriptionType": "Abstract" 37 | } 38 | ], 39 | "formats": [ 40 | "png" 41 | ], 42 | "fundingReferences": [ 43 | { 44 | "funderIdentifier": "grid.431093.c", 45 | "funderIdentifierType": "GRID", 46 | "funderName": "National Science Foundation" 47 | } 48 | ], 49 | "geoLocations": [ 50 | { 51 | "geoLocationPlace": "OVRO", 52 | "geoLocationPoint": { 53 | "pointLatitude": 37.2339, 54 | "pointLongitude": -118.282 55 | } 56 | } 57 | ], 58 | "identifiers": [ 59 | { 60 | "identifier": "10.25800/t9jd-fh86", 61 | "identifierType": "DOI" 62 | }, 63 | { 64 | "identifier": "oai:data.caltech.edu:hevaf-20f84", 65 | "identifierType": "oai" 66 | }, 67 | { 68 | "identifier": "221116aaab", 69 | "identifierType": "dsa-110-id" 70 | }, 71 | { 72 | "identifier": "hevaf-20f84", 73 | "identifierType": "cdid" 74 | } 75 | ], 76 | "publicationYear": "2024", 77 | "publisher": "Caltech Data", 78 | "relatedIdentifiers": [ 79 | { 80 | "relatedIdentifier": "http://deepsynoptic.org", 81 | "relatedIdentifierType": "URL", 82 | "relationType": "IsDocumentedBy" 83 | } 84 | ], 85 | "rightsList": [ 86 | { 87 | "rights": "cc-by-4.0" 88 | } 89 | ], 90 | "schemaVersion": "http://datacite.org/schema/kernel-4", 91 | "subjects": [ 92 | { 93 | "subject": "OVRO" 94 | }, 95 | { 96 | "subject": "Astrophysics" 97 | }, 98 | { 99 | "subject": "Fast Radio Bursts" 100 | } 101 | ], 102 | "titles": [ 103 | { 104 | "title": "DSA-110 Data for Candidate Fast Radio Burst 221116aaab" 105 | } 106 | ], 107 | "types": { 108 | "resourceType": "", 109 | "resourceTypeGeneral": "Dataset" 110 | }, 111 | "version": "0.1" 112 | } -------------------------------------------------------------------------------- /tests/data/datacite43/hhg7x-hgm42.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": 
"05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Atterholt", 12 | "givenName": "James", 13 | "name": "Atterholt, James", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0003-1603-5518", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | }, 22 | { 23 | "affiliation": [ 24 | { 25 | "affiliationIdentifier": "05dxps055", 26 | "affiliationIdentifierScheme": "ROR", 27 | "name": "California Institute of Technology" 28 | } 29 | ], 30 | "familyName": "Zhan", 31 | "givenName": "Zhongwen", 32 | "name": "Zhan, Zhongwen", 33 | "nameIdentifiers": [ 34 | { 35 | "nameIdentifier": "0000-0002-5586-2607", 36 | "nameIdentifierScheme": "ORCID" 37 | } 38 | ], 39 | "nameType": "Personal" 40 | } 41 | ], 42 | "dates": [ 43 | { 44 | "date": "2024-06-26", 45 | "dateType": "Issued" 46 | } 47 | ], 48 | "descriptions": [ 49 | { 50 | "description": "These are h5 files that contain events with PmP observations used in the publication \"Fine Scale Southern California Moho Structure Uncovered with Distributed Acoustic Sensing.\" Events are descriptively named.", 51 | "descriptionType": "Abstract" 52 | } 53 | ], 54 | "fundingReferences": [ 55 | { 56 | "awardNumber": "1848166", 57 | "awardTitle": "CAREER:Potential of fiber acoustic sensing in the next-generation seismic networks", 58 | "funderIdentifier": "grid.431093.c", 59 | "funderIdentifierType": "GRID", 60 | "funderName": "National Science Foundation" 61 | }, 62 | { 63 | "funderIdentifier": "grid.452959.6", 64 | "funderIdentifierType": "GRID", 65 | "funderName": "Gordon and Betty Moore Foundation" 66 | } 67 | ], 68 | "identifiers": [ 69 | { 70 | "identifier": "10.22002/hhg7x-hgm42", 71 | "identifierType": "DOI" 72 | }, 73 | { 74 | "identifier": "oai:data.caltech.edu:hhg7x-hgm42", 75 | "identifierType": "oai" 76 | } 77 | ], 78 | "publicationYear": "2024", 79 | "publisher": "CaltechDATA", 80 | "rightsList": [ 81 
| { 82 | "rights": "Creative Commons Zero v1.0 Universal", 83 | "rightsIdentifier": "cc0-1.0", 84 | "rightsIdentifierScheme": "spdx", 85 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 86 | } 87 | ], 88 | "schemaVersion": "http://datacite.org/schema/kernel-4", 89 | "titles": [ 90 | { 91 | "title": "Catalog of Events with PmP Phase" 92 | } 93 | ], 94 | "types": { 95 | "resourceType": "", 96 | "resourceTypeGeneral": "Dataset" 97 | } 98 | } -------------------------------------------------------------------------------- /tests/data/datacite43/kxjgj-tfk18.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Ding", 12 | "givenName": "Xiaozhe", 13 | "name": "Ding, Xiaozhe", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0002-0267-0791", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | } 22 | ], 23 | "dates": [ 24 | { 25 | "date": "2024-04-16", 26 | "dateType": "Issued" 27 | } 28 | ], 29 | "descriptions": [ 30 | { 31 | "description": "Raw data for Ding, X., Chen, X., Sullivan, E.E., Shay, T.F., and Gradinaru, V. (2024). Fast, accurate ranking of engineered proteins by target binding propensity using structure modeling. Molecular Therapy. 
https://doi.org/10.1016/j.ymthe.2024.04.003", 32 | "descriptionType": "Abstract" 33 | } 34 | ], 35 | "identifiers": [ 36 | { 37 | "identifier": "10.22002/kxjgj-tfk18", 38 | "identifierType": "DOI" 39 | }, 40 | { 41 | "identifier": "oai:data.caltech.edu:kxjgj-tfk18", 42 | "identifierType": "oai" 43 | } 44 | ], 45 | "publicationYear": "2024", 46 | "publisher": "CaltechDATA", 47 | "relatedIdentifiers": [ 48 | { 49 | "relatedIdentifier": "10.1016/j.ymthe.2024.04.003", 50 | "relatedIdentifierType": "DOI", 51 | "relationType": "IsSupplementTo", 52 | "resourceTypeGeneral": "Text" 53 | } 54 | ], 55 | "rightsList": [ 56 | { 57 | "rights": "Creative Commons Zero v1.0 Universal", 58 | "rightsIdentifier": "cc0-1.0", 59 | "rightsIdentifierScheme": "spdx", 60 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 61 | } 62 | ], 63 | "schemaVersion": "http://datacite.org/schema/kernel-4", 64 | "titles": [ 65 | { 66 | "title": "Data for Fast, accurate ranking of engineered proteins by target-binding propensity using structure modeling" 67 | } 68 | ], 69 | "types": { 70 | "resourceType": "", 71 | "resourceTypeGeneral": "Dataset" 72 | } 73 | } -------------------------------------------------------------------------------- /tests/data/datacite43/kxtar-bm759.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Bird", 12 | "givenName": "Eli", 13 | "name": "Bird, Eli", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0002-9428-0650", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | }, 22 | { 23 | "affiliation": [ 24 | { 25 | "affiliationIdentifier": "05dxps055", 26 | "affiliationIdentifierScheme": "ROR", 27 | "name": "California Institute of Technology" 28 | } 29 
| ], 30 | "familyName": "Zhan", 31 | "givenName": "Zhongwen", 32 | "name": "Zhan, Zhongwen", 33 | "nameIdentifiers": [ 34 | { 35 | "nameIdentifier": "0000-0002-5586-2607", 36 | "nameIdentifierScheme": "ORCID" 37 | } 38 | ], 39 | "nameType": "Personal" 40 | } 41 | ], 42 | "dates": [ 43 | { 44 | "date": "2024-09-23", 45 | "dateType": "Issued" 46 | } 47 | ], 48 | "identifiers": [ 49 | { 50 | "identifier": "10.22002/kxtar-bm759", 51 | "identifierType": "DOI" 52 | }, 53 | { 54 | "identifier": "oai:data.caltech.edu:kxtar-bm759", 55 | "identifierType": "oai" 56 | } 57 | ], 58 | "publicationYear": "2024", 59 | "publisher": "CaltechDATA", 60 | "rightsList": [ 61 | { 62 | "rights": "Creative Commons Zero v1.0 Universal", 63 | "rightsIdentifier": "cc0-1.0", 64 | "rightsIdentifierScheme": "spdx", 65 | "rightsUri": "https://creativecommons.org/publicdomain/zero/1.0/legalcode" 66 | } 67 | ], 68 | "schemaVersion": "http://datacite.org/schema/kernel-4", 69 | "titles": [ 70 | { 71 | "title": "Ambient Noise Cross-Correlation Data associated with Constraining Dike Opening Models With Seismic Velocity Changes Associated with the 2023-2024 Eruption Sequence on the Reykjanes Peninsula\" by Bird et al." 
72 | } 73 | ], 74 | "types": { 75 | "resourceType": "", 76 | "resourceTypeGeneral": "Dataset" 77 | } 78 | } -------------------------------------------------------------------------------- /tests/data/datacite43/n0y4x-xx706.json: -------------------------------------------------------------------------------- 1 | { 2 | "contributors": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "name": "Department of Biological Sciences, California State University, Sacramento" 7 | } 8 | ], 9 | "contributorType": "ContactPerson", 10 | "familyName": "Gleason", 11 | "givenName": "Lani U.", 12 | "name": "Gleason, Lani U.", 13 | "nameIdentifiers": [], 14 | "nameType": "Personal" 15 | } 16 | ], 17 | "creators": [ 18 | { 19 | "affiliation": [ 20 | { 21 | "name": "Department of Biological Sciences, California State University, Sacramento" 22 | } 23 | ], 24 | "familyName": "Gleason", 25 | "givenName": "Lani U.", 26 | "name": "Gleason, Lani U.", 27 | "nameIdentifiers": [], 28 | "nameType": "Personal" 29 | } 30 | ], 31 | "dates": [ 32 | { 33 | "date": "2025-01-24", 34 | "dateType": "Issued" 35 | }, 36 | { 37 | "date": "2025-01-22", 38 | "dateType": "Accepted" 39 | } 40 | ], 41 | "descriptions": [ 42 | { 43 | "description": "Genes identified to be significantly differentially expressed for each component of the Venn diagram in Figure 1B. The annotation information, RPKM expression value for each individual in each of the four treatments, and the average RPKM expression value per treatment are provided for each gene.", 44 | "descriptionType": "Abstract" 45 | } 46 | ], 47 | "fundingReferences": [ 48 | { 49 | "funderName": "This work was supported by a California State University, Sacramento Research & Creative Activity (RCA) Award and a Biological Sciences Genes to Ecosystems (G2E) Award to Lani Gleason. The California State University (CSU) Council on Ocean Affairs, Science, and Technology (COAST) also provided an Undergraduate Research Award to support Hanna Franklin." 
50 | } 51 | ], 52 | "identifiers": [ 53 | { 54 | "identifier": "10.22002/n0y4x-xx706", 55 | "identifierType": "DOI" 56 | }, 57 | { 58 | "identifier": "oai:data.caltech.edu:n0y4x-xx706", 59 | "identifierType": "oai" 60 | } 61 | ], 62 | "language": "eng", 63 | "publicationYear": "2025", 64 | "publisher": "CaltechDATA", 65 | "relatedIdentifiers": [ 66 | { 67 | "relatedIdentifier": "10.17912/micropub.biology.001473", 68 | "relatedIdentifierType": "DOI", 69 | "relationType": "IsPartOf", 70 | "resourceTypeGeneral": "Text" 71 | } 72 | ], 73 | "rightsList": [ 74 | { 75 | "rights": "Creative Commons Attribution 4.0 International", 76 | "rightsIdentifier": "cc-by-4.0", 77 | "rightsIdentifierScheme": "spdx", 78 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 79 | } 80 | ], 81 | "schemaVersion": "http://datacite.org/schema/kernel-4", 82 | "subjects": [ 83 | { 84 | "subject": "haliotis rufescens" 85 | } 86 | ], 87 | "titles": [ 88 | { 89 | "title": "Dataset: Heat Stress, Starvation, and Heat Stress Plus Starvation Cause Unique Transcriptomic Responses in the Economically Important Red Abalone Haliotis rufescens" 90 | } 91 | ], 92 | "types": { 93 | "resourceType": "", 94 | "resourceTypeGeneral": "Dataset" 95 | }, 96 | "version": "1.0" 97 | } -------------------------------------------------------------------------------- /tests/data/datacite43/n13wc-zwc92.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Silevitch", 12 | "givenName": "Daniel", 13 | "name": "Silevitch, Daniel", 14 | "nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0002-6347-3513", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | }, 22 | { 23 | "affiliation": [ 24 | { 25 | 
"affiliationIdentifier": "05dxps055", 26 | "affiliationIdentifierScheme": "ROR", 27 | "name": "California Institute of Technology" 28 | } 29 | ], 30 | "familyName": "Armstrong", 31 | "givenName": "Stephen", 32 | "name": "Armstrong, Stephen", 33 | "nameIdentifiers": [], 34 | "nameType": "Personal" 35 | } 36 | ], 37 | "dates": [ 38 | { 39 | "date": "2025-01-30", 40 | "dateType": "Issued" 41 | }, 42 | { 43 | "date": "2025-01-30", 44 | "dateType": "Submitted" 45 | } 46 | ], 47 | "descriptions": [ 48 | { 49 | "description": "Magnetic susceptibility and specific heat data for LiErF4.\u00a0", 50 | "descriptionType": "Abstract" 51 | } 52 | ], 53 | "identifiers": [ 54 | { 55 | "identifier": "10.22002/n13wc-zwc92", 56 | "identifierType": "DOI" 57 | }, 58 | { 59 | "identifier": "oai:data.caltech.edu:n13wc-zwc92", 60 | "identifierType": "oai" 61 | } 62 | ], 63 | "publicationYear": "2025", 64 | "publisher": "CaltechDATA", 65 | "rightsList": [ 66 | { 67 | "rights": "Creative Commons Attribution 4.0 International", 68 | "rightsIdentifier": "cc-by-4.0", 69 | "rightsIdentifierScheme": "spdx", 70 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 71 | } 72 | ], 73 | "schemaVersion": "http://datacite.org/schema/kernel-4", 74 | "subjects": [ 75 | { 76 | "subject": "Physical sciences" 77 | } 78 | ], 79 | "titles": [ 80 | { 81 | "title": "LiErF4 susceptibility and specific heat" 82 | } 83 | ], 84 | "types": { 85 | "resourceType": "", 86 | "resourceTypeGeneral": "Dataset" 87 | } 88 | } -------------------------------------------------------------------------------- /tests/data/datacite43/nbtw5-37m55.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "affiliationIdentifier": "05dxps055", 7 | "affiliationIdentifierScheme": "ROR", 8 | "name": "California Institute of Technology" 9 | } 10 | ], 11 | "familyName": "Law", 12 | "givenName": "Casey", 13 | "name": "Casey Law", 14 | 
"nameIdentifiers": [ 15 | { 16 | "nameIdentifier": "0000-0002-4119-9963", 17 | "nameIdentifierScheme": "ORCID" 18 | } 19 | ], 20 | "nameType": "Personal" 21 | } 22 | ], 23 | "dates": [ 24 | { 25 | "date": "2024", 26 | "dateType": "Issued" 27 | }, 28 | { 29 | "date": "2024-04-12", 30 | "dateType": "Created" 31 | } 32 | ], 33 | "descriptions": [ 34 | { 35 | "description": "Data associated with DSA-110 candidate transient.", 36 | "descriptionType": "Abstract" 37 | } 38 | ], 39 | "formats": [ 40 | "png" 41 | ], 42 | "fundingReferences": [ 43 | { 44 | "funderIdentifier": "grid.431093.c", 45 | "funderIdentifierType": "GRID", 46 | "funderName": "National Science Foundation" 47 | } 48 | ], 49 | "geoLocations": [ 50 | { 51 | "geoLocationPlace": "OVRO", 52 | "geoLocationPoint": { 53 | "pointLatitude": 37.2339, 54 | "pointLongitude": -118.282 55 | } 56 | } 57 | ], 58 | "identifiers": [ 59 | { 60 | "identifier": "10.25800/3ghe-8e93", 61 | "identifierType": "DOI" 62 | }, 63 | { 64 | "identifier": "oai:data.caltech.edu:nbtw5-37m55", 65 | "identifierType": "oai" 66 | }, 67 | { 68 | "identifier": "231120aabi", 69 | "identifierType": "dsa-110-id" 70 | }, 71 | { 72 | "identifier": "nbtw5-37m55", 73 | "identifierType": "cdid" 74 | } 75 | ], 76 | "publicationYear": "2024", 77 | "publisher": "Caltech Data", 78 | "relatedIdentifiers": [ 79 | { 80 | "relatedIdentifier": "http://deepsynoptic.org", 81 | "relatedIdentifierType": "URL", 82 | "relationType": "IsDocumentedBy" 83 | } 84 | ], 85 | "rightsList": [ 86 | { 87 | "rights": "cc-by-4.0" 88 | } 89 | ], 90 | "schemaVersion": "http://datacite.org/schema/kernel-4", 91 | "subjects": [ 92 | { 93 | "subject": "OVRO" 94 | }, 95 | { 96 | "subject": "Astrophysics" 97 | }, 98 | { 99 | "subject": "Fast Radio Bursts" 100 | } 101 | ], 102 | "titles": [ 103 | { 104 | "title": "DSA-110 Data for Candidate Fast Radio Burst 231120aabi" 105 | } 106 | ], 107 | "types": { 108 | "resourceType": "", 109 | "resourceTypeGeneral": "Dataset" 110 | } 111 | } 
-------------------------------------------------------------------------------- /tests/data/datacite43/rmzp9-9yx96.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "familyName": "Devey", 5 | "givenName": "Sean", 6 | "name": "Devey, Sean", 7 | "nameIdentifiers": [ 8 | { 9 | "nameIdentifier": "0000-0002-8937-939X", 10 | "nameIdentifierScheme": "ORCID" 11 | } 12 | ], 13 | "nameType": "Personal" 14 | } 15 | ], 16 | "dates": [ 17 | { 18 | "date": "2024-06-07", 19 | "dateType": "Issued" 20 | }, 21 | { 22 | "date": "2023-11-09", 23 | "dateInformation": "DPIV, CTA data collection", 24 | "dateType": "Collected" 25 | }, 26 | { 27 | "date": "2023-10-22", 28 | "dateInformation": "No FMS DPIV data collected", 29 | "dateType": "Collected" 30 | }, 31 | { 32 | "date": "2023-11-16", 33 | "dateInformation": "Cylinder wake dye visualizations collected", 34 | "dateType": "Collected" 35 | }, 36 | { 37 | "date": "2024-05-13", 38 | "dateInformation": "Shear layer dye visualization", 39 | "dateType": "Collected" 40 | }, 41 | { 42 | "date": "2024-06-07", 43 | "dateInformation": "date of upload", 44 | "dateType": "Submitted" 45 | } 46 | ], 47 | "descriptions": [ 48 | { 49 | "description": "DPIV, CTA measurements and dye visualizations demonstrating flow quality of the Free-surface, Low turbulence, Optically accessible, Water TUnnel in a Box (FLOWTUB) developed at GALCIT 2022-2024.", 50 | "descriptionType": "Abstract" 51 | } 52 | ], 53 | "fundingReferences": [ 54 | { 55 | "awardNumber": "DGE-1745301", 56 | "awardTitle": "Graduate Research Fellowships Program (GRFP)", 57 | "funderIdentifier": "grid.431093.c", 58 | "funderIdentifierType": "GRID", 59 | "funderName": "National Science Foundation" 60 | } 61 | ], 62 | "identifiers": [ 63 | { 64 | "identifier": "10.22002/rmzp9-9yx96", 65 | "identifierType": "DOI" 66 | }, 67 | { 68 | "identifier": "oai:data.caltech.edu:rmzp9-9yx96", 69 | "identifierType": "oai" 70 | } 71 | ], 
72 | "language": "eng", 73 | "publicationYear": "2024", 74 | "publisher": "CaltechDATA", 75 | "rightsList": [ 76 | { 77 | "rights": "Creative Commons Attribution 4.0 International", 78 | "rightsIdentifier": "cc-by-4.0", 79 | "rightsIdentifierScheme": "spdx", 80 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 81 | } 82 | ], 83 | "schemaVersion": "http://datacite.org/schema/kernel-4", 84 | "titles": [ 85 | { 86 | "title": "FLOWTUB Water Tunnel Validation Data" 87 | } 88 | ], 89 | "types": { 90 | "resourceType": "", 91 | "resourceTypeGeneral": "Dataset" 92 | }, 93 | "version": "1" 94 | } -------------------------------------------------------------------------------- /tests/data/datacite43/t15w6-x9q23.json: -------------------------------------------------------------------------------- 1 | { 2 | "contributors": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "name": "National Eye Institute, National Institutes of Health, Bethesda, Maryland, United States" 7 | }, 8 | { 9 | "name": "Intramural Research Program, National Library of Medicine, National Institutes of Health, Bethesda, Maryland, United States" 10 | } 11 | ], 12 | "contributorType": "ContactPerson", 13 | "familyName": "Pal", 14 | "givenName": "Soumitra", 15 | "name": "Pal, Soumitra", 16 | "nameIdentifiers": [ 17 | { 18 | "nameIdentifier": "0000-0003-4840-3944", 19 | "nameIdentifierScheme": "ORCID" 20 | } 21 | ], 22 | "nameType": "Personal" 23 | } 24 | ], 25 | "creators": [ 26 | { 27 | "affiliation": [ 28 | { 29 | "name": "National Eye Institute, National Institutes of Health, Bethesda, Maryland, United States" 30 | }, 31 | { 32 | "name": "Intramural Research Program, National Library of Medicine, National Institutes of Health, Bethesda, Maryland, United States" 33 | } 34 | ], 35 | "familyName": "Pal", 36 | "givenName": "Soumitra", 37 | "name": "Pal, Soumitra", 38 | "nameIdentifiers": [ 39 | { 40 | "nameIdentifier": "0000-0003-4840-3944", 41 | "nameIdentifierScheme": "ORCID" 42 | } 43 | ], 44 | 
"nameType": "Personal" 45 | } 46 | ], 47 | "dates": [ 48 | { 49 | "date": "2025-02-21", 50 | "dateType": "Issued" 51 | } 52 | ], 53 | "descriptions": [ 54 | { 55 | "description": "This MS-Excel workbook contains spreadsheets detailing the FCA datasets, clustering resolutions, and the results of our analysis at both the cell and cluster levels.", 56 | "descriptionType": "Abstract" 57 | } 58 | ], 59 | "fundingReferences": [ 60 | { 61 | "funderName": "This research was supported in part by the Intramural Research Program of the National Institutes of Health, USA: The National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK) Grant No. ZIADK015600 to B.O. and National Library of Medicine (NLM) Grant No. LM200887 to T.M.P." 62 | } 63 | ], 64 | "identifiers": [ 65 | { 66 | "identifier": "10.22002/t15w6-x9q23", 67 | "identifierType": "DOI" 68 | }, 69 | { 70 | "identifier": "oai:data.caltech.edu:t15w6-x9q23", 71 | "identifierType": "oai" 72 | } 73 | ], 74 | "language": "eng", 75 | "publicationYear": "2025", 76 | "publisher": "CaltechDATA", 77 | "relatedIdentifiers": [ 78 | { 79 | "relatedIdentifier": "10.17912/micropub.biology.001501", 80 | "relatedIdentifierType": "DOI", 81 | "relationType": "IsPartOf", 82 | "resourceTypeGeneral": "Text" 83 | } 84 | ], 85 | "rightsList": [ 86 | { 87 | "rights": "Creative Commons Attribution 4.0 International", 88 | "rightsIdentifier": "cc-by-4.0", 89 | "rightsIdentifierScheme": "spdx", 90 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 91 | } 92 | ], 93 | "schemaVersion": "http://datacite.org/schema/kernel-4", 94 | "subjects": [ 95 | { 96 | "subject": "drosophila" 97 | } 98 | ], 99 | "titles": [ 100 | { 101 | "title": "Dataset: Cell-Type Specific Variation in X-Chromosome Dosage Compensation in Drosophila" 102 | } 103 | ], 104 | "types": { 105 | "resourceType": "", 106 | "resourceTypeGeneral": "Dataset" 107 | }, 108 | "version": "1.0" 109 | } 
-------------------------------------------------------------------------------- /tests/data/datacite43/wbty9-bqy29.json: -------------------------------------------------------------------------------- 1 | { 2 | "contributors": [ 3 | { 4 | "affiliation": [ 5 | { 6 | "name": "Molecular Biophysics & Biochemistry, Yale University, New Haven, Connecticut, United States" 7 | }, 8 | { 9 | "name": "Cell Biology, Yale University School of Medicine" 10 | } 11 | ], 12 | "contributorType": "ContactPerson", 13 | "familyName": "Berro", 14 | "givenName": "Julien", 15 | "name": "Berro, Julien", 16 | "nameIdentifiers": [ 17 | { 18 | "nameIdentifier": "0000-0002-9560-8646", 19 | "nameIdentifierScheme": "ORCID" 20 | } 21 | ], 22 | "nameType": "Personal" 23 | } 24 | ], 25 | "creators": [ 26 | { 27 | "affiliation": [ 28 | { 29 | "name": "Molecular Biophysics & Biochemistry, Yale University, New Haven, Connecticut, United States" 30 | }, 31 | { 32 | "name": "Cell Biology, Yale University School of Medicine" 33 | } 34 | ], 35 | "familyName": "Berro", 36 | "givenName": "Julien", 37 | "name": "Berro, Julien", 38 | "nameIdentifiers": [ 39 | { 40 | "nameIdentifier": "0000-0002-9560-8646", 41 | "nameIdentifierScheme": "ORCID" 42 | } 43 | ], 44 | "nameType": "Personal" 45 | } 46 | ], 47 | "dates": [ 48 | { 49 | "date": "2024-05-08", 50 | "dateType": "Issued" 51 | }, 52 | { 53 | "date": "2024-05-03", 54 | "dateType": "Accepted" 55 | } 56 | ], 57 | "descriptions": [ 58 | { 59 | "description": "Primers used in this study", 60 | "descriptionType": "Abstract" 61 | } 62 | ], 63 | "fundingReferences": [ 64 | { 65 | "funderName": "This work was partly supported by the National Institutes of Health (R01 GM11563601)." 
66 | } 67 | ], 68 | "identifiers": [ 69 | { 70 | "identifier": "10.22002/wbty9-bqy29", 71 | "identifierType": "DOI" 72 | }, 73 | { 74 | "identifier": "oai:data.caltech.edu:wbty9-bqy29", 75 | "identifierType": "oai" 76 | } 77 | ], 78 | "language": "eng", 79 | "publicationYear": "2024", 80 | "publisher": "CaltechDATA", 81 | "relatedIdentifiers": [ 82 | { 83 | "relatedIdentifier": "10.17912/micropub.biology.001191", 84 | "relatedIdentifierType": "DOI", 85 | "relationType": "IsPartOf", 86 | "resourceTypeGeneral": "Text" 87 | } 88 | ], 89 | "rightsList": [ 90 | { 91 | "rights": "Creative Commons Attribution 4.0 International", 92 | "rightsIdentifier": "cc-by-4.0", 93 | "rightsIdentifierScheme": "spdx", 94 | "rightsUri": "https://creativecommons.org/licenses/by/4.0/legalcode" 95 | } 96 | ], 97 | "schemaVersion": "http://datacite.org/schema/kernel-4", 98 | "subjects": [ 99 | { 100 | "subject": "s. pombe" 101 | } 102 | ], 103 | "titles": [ 104 | { 105 | "title": "Dataset: CRISPR-Cas9 editing efficiency in fission yeast is not limited by homology search and is improved by combining gap-repair with fluoride selection" 106 | } 107 | ], 108 | "types": { 109 | "resourceType": "", 110 | "resourceTypeGeneral": "Dataset" 111 | }, 112 | "version": "1.0" 113 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [ 3 | { 4 | "name": "John Doe" 5 | } 6 | ], 7 | "publisher": "Caltech", 8 | "publicationYear": "2023", 9 | "types": { 10 | "resourceTypeGeneral": "Dataset" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | "name": 
"John Doe" 10 | } 11 | ], 12 | "version": 1, 13 | "publisher": "Caltech", 14 | "publicationYear": "2023", 15 | "types": { 16 | "resourceTypeGeneral": "Dataset" 17 | } 18 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [], 3 | "creators": [ 4 | { 5 | "name": "John Doe" 6 | } 7 | ], 8 | "publisher": "Caltech", 9 | "publicationYear": "2023", 10 | "types": { 11 | "resourceTypeGeneral": "Dataset" 12 | } 13 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "publisher": "Caltech", 8 | "publicationYear": "2023", 9 | "types": { 10 | "resourceTypeGeneral": "Dataset" 11 | } 12 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_4.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | "name": "John Doe" 10 | } 11 | ], 12 | "contributors": [ 13 | {} 14 | ], 15 | "publisher": "Caltech", 16 | "publicationYear": "2023", 17 | "types": { 18 | "resourceTypeGeneral": "Dataset" 19 | } 20 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | "name": "John Doe" 10 | } 11 | ], 12 | "descriptions": [ 13 | { 14 | "description": "Sample Description" 15 | } 16 | ], 17 | "publisher": "Caltech", 18 
| "publicationYear": "2023", 19 | "types": { 20 | "resourceTypeGeneral": "Dataset" 21 | } 22 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_6.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | "name": "John Doe" 10 | } 11 | ], 12 | "fundingReferences": [ 13 | { 14 | "funderIdentifier": "1234" 15 | } 16 | ], 17 | "publisher": "Caltech", 18 | "publicationYear": "2023", 19 | "types": { 20 | "resourceTypeGeneral": "Dataset" 21 | } 22 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_7.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | "name": "John Doe" 10 | } 11 | ], 12 | "identifiers": [ 13 | {} 14 | ], 15 | "publisher": "Caltech", 16 | "publicationYear": "2023", 17 | "types": { 18 | "resourceTypeGeneral": "Dataset" 19 | } 20 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_8.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | "name": "John Doe" 10 | } 11 | ], 12 | "dates": [ 13 | {} 14 | ], 15 | "publisher": "Caltech", 16 | "publicationYear": "2023", 17 | "types": { 18 | "resourceTypeGeneral": "Dataset" 19 | } 20 | } -------------------------------------------------------------------------------- /tests/data/invalid_datacite43/invalid_metadata_9.json: -------------------------------------------------------------------------------- 1 | { 2 | "titles": [ 3 | { 4 | "title": "Sample Title" 5 | } 6 | ], 7 | "creators": [ 8 | { 9 | 
"name": "John Doe" 10 | } 11 | ], 12 | "publicationYear": "2023", 13 | "types": { 14 | "resourceTypeGeneral": "Dataset" 15 | } 16 | } -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of DataCite. 4 | # 5 | # Copyright (C) 2015, 2016 CERN. 6 | # 7 | # DataCite is free software; you can redistribute it and/or modify it 8 | # under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 10 | 11 | """Test helpers.""" 12 | 13 | from __future__ import absolute_import, print_function 14 | 15 | import io 16 | import json 17 | import os 18 | from os.path import dirname, join 19 | 20 | 21 | def load_json_path(path): 22 | """Helper method for loading a JSON example file from a path.""" 23 | path_base = dirname(__file__) 24 | with io.open(join(path_base, path), encoding="utf-8") as file: 25 | content = file.read() 26 | return json.loads(content) 27 | 28 | 29 | def write_json_path(path, metadata): 30 | """Helper method for writing a JSON example file to a path.""" 31 | path_base = dirname(__file__) 32 | path_full = join(path_base, path) 33 | print(path_full) 34 | print(metadata) 35 | with io.open(path_full, "w", encoding="utf-8") as file: 36 | json.dump(metadata, file) 37 | -------------------------------------------------------------------------------- /tests/test_download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of caltechdata_api. 4 | # 5 | # Copyright (C) 2020 Caltech. 6 | # 7 | # caltechdata_api is free software; you can redistribute it and/or modify it 8 | # under the terms of the Revised BSD License; see LICENSE file for 9 | # more details. 
10 | 11 | """Tests for format transformations.""" 12 | 13 | import pytest 14 | 15 | from caltechdata_api import download_url, download_file 16 | 17 | 18 | @pytest.mark.skip(reason="works, don't want to do unnecessary downloads") 19 | def test_download(): 20 | """Test that downloads from the DataCite Media API work.""" 21 | example_doi = "10.22002/D1.1098" 22 | expected_url = ( 23 | "https://data.caltech.edu/tindfiles/serve/293d37c5-73f2-4016-bcd5-76cf353ff9d8/" 24 | ) 25 | assert expected_url == download_url(example_doi) 26 | filen = download_file(example_doi) 27 | assert filen == "10.22002-D1.1098" 28 | -------------------------------------------------------------------------------- /tests/test_rdm.py: -------------------------------------------------------------------------------- 1 | from caltechdata_api import ( 2 | customize_schema, 3 | caltechdata_write, 4 | caltechdata_edit, 5 | get_metadata, 6 | ) 7 | import json 8 | import os 9 | 10 | 11 | def test_datacite_rdm_conversion(full_datacite43_record, full_rdm_record): 12 | 13 | # Remove DOI from full_datacite43_record 14 | # since it's prcessed by caltechdata_write or caltechdata_edit 15 | identifiers = [] 16 | for identifier in full_datacite43_record["identifiers"]: 17 | if identifier["identifierType"] != "DOI": 18 | identifiers.append(identifier) 19 | full_datacite43_record["identifiers"] = identifiers 20 | 21 | converted = customize_schema(full_datacite43_record, schema="43") 22 | 23 | assert converted == full_rdm_record 24 | 25 | 26 | def test_datacite_rdm_create_edit(full_datacite43_record): 27 | env_token = os.environ.get("RDMTOK") 28 | 29 | # Remove DOI from full_datacite43_record 30 | # since we want the test system to create one 31 | identifiers = [] 32 | for identifier in full_datacite43_record["identifiers"]: 33 | if identifier["identifierType"] != "DOI": 34 | identifiers.append(identifier) 35 | full_datacite43_record["identifiers"] = identifiers 36 | 37 | recid = caltechdata_write( 38 | 
def test_datacite_rdm_create_edit(full_datacite43_record):
    """Round-trip against the test server: create a record, then edit it twice."""
    env_token = os.environ.get("RDMTOK")

    # Drop any DOI identifier so the test system mints a fresh one.
    full_datacite43_record["identifiers"] = [
        ident
        for ident in full_datacite43_record["identifiers"]
        if ident["identifierType"] != "DOI"
    ]

    # First create: metadata only, no files.
    recid = caltechdata_write(
        full_datacite43_record,
        schema="43",
        production=False,
        publish=True,
        token=env_token,
    )
    assert len(recid) == 11

    # Second create: same metadata with a file attached.
    recid = caltechdata_write(
        full_datacite43_record,
        schema="43",
        production=False,
        files=["helpers.py"],
        publish=True,
        token=env_token,
    )
    assert len(recid) == 11

    # Metadata-only edit keeps the record but must return a test-prefix DOI.
    full_datacite43_record["publisher"] = "Edited"
    doi = caltechdata_edit(
        recid,
        full_datacite43_record,
        schema="43",
        production=False,
        publish=True,
        token=env_token,
    )
    assert doi.startswith("10.33569")

    # Validate is false until geolocation points are fixed/we move to 4.6
    new_metadata = get_metadata(recid, production=False, validate=False)
    assert new_metadata["publisher"] == "Edited"

    # Edit with a new file: a new version (and therefore a new DOI) is minted.
    full_datacite43_record["publisher"] = "Again!"
    new_doi = caltechdata_edit(
        recid,
        full_datacite43_record,
        files=["helpers.py"],
        schema="43",
        production=False,
        publish=True,
        token=env_token,
    )
    assert new_doi != doi

    recid = new_doi.split("/")[1]

    # Validate is false until geolocation points are fixed/we move to 4.6
    new_metadata = get_metadata(recid, production=False, validate=False)
    assert new_metadata["publisher"] == "Again!"
# Function to get all JSON files in the directory
def get_all_json_files(directory):
    """Return the full path of every .json file directly inside *directory*."""
    json_paths = []
    for entry in os.listdir(directory):
        if entry.endswith(".json"):
            json_paths.append(os.path.join(directory, entry))
    return json_paths
@pytest.mark.parametrize(
    "missing_field_file",
    [
        {
            # These fixtures live in data/invalid_datacite43/, not data/;
            # building the path with DATACITE43_DIR made the existence
            # check below fail, so every case was silently skipped.
            "file": os.path.join(INVALID_DATACITE43_DIR, "missing_creators.json"),
            "missing_field": "creators",
        },
        {
            "file": os.path.join(INVALID_DATACITE43_DIR, "missing_titles.json"),
            "missing_field": "titles",
        },
    ],
)
def test_missing_required_fields(missing_field_file):
    """Test that JSON files missing required fields fail validation."""
    print(
        f"\nTesting missing field: {missing_field_file['missing_field']} in file: {missing_field_file['file']}"
    )

    # Skip the test if the file doesn't exist
    if not os.path.exists(missing_field_file["file"]):
        pytest.skip(f"Test file not found: {missing_field_file['file']}")

    json_data = load_json_path(missing_field_file["file"])
    with pytest.raises(
        ValueError,
        match=f"Missing required metadata field: {missing_field_file['missing_field']}",
    ):
        validator43(json_data)
def test_multiple_errors():
    """Test JSON file with multiple issues to check all errors are raised."""
    # This fixture lives in data/invalid_datacite43/, not data/; the old
    # DATACITE43_DIR-based path never existed, so the test always skipped.
    multiple_errors_file = os.path.join(INVALID_DATACITE43_DIR, "multiple_errors.json")

    # Skip the test if the file doesn't exist
    if not os.path.exists(multiple_errors_file):
        pytest.skip(f"Test file not found: {multiple_errors_file}")

    json_data = load_json_path(multiple_errors_file)
    with pytest.raises(ValueError, match="Multiple validation errors"):
        validator43(json_data)
"""Upload a DataCite 4.x JSON record (plus optional files) to the
CaltechDATA test instance via caltechdata_write."""

import argparse
import json
import os

from caltechdata_api import caltechdata_write

parser = argparse.ArgumentParser(
    description="Write files and a DataCite 4 standard json record\
    to CaltechDATA repository"
)
parser.add_argument(
    "json_file", nargs=1, help="file name for json DataCite metadata file"
)
parser.add_argument("-fnames", nargs="*", help="New Files")
parser.add_argument("-schema", default="43", help="Metadata Schema")

args = parser.parse_args()

# Get access token as environment variable
token = os.environ["RDMTOK"]

# Context manager closes the metadata file promptly; the original handle
# was opened with open() and never closed.
with open(args.json_file[0], "r") as metaf:
    metadata = json.load(metaf)

production = False
publish = True

response = caltechdata_write(
    metadata, token, args.fnames, production, args.schema, publish
)
print(response)
"""Upload a DataCite 4.x JSON record (plus optional files) to the
CaltechDATA production instance as an unpublished draft in the
CaltechAUTHORS community."""

import argparse
import json
import os

from caltechdata_api import caltechdata_write

parser = argparse.ArgumentParser(
    description="Write files and a DataCite 4 standard json record\
    to CaltechDATA repository"
)
parser.add_argument(
    "json_file", nargs=1, help="file name for json DataCite metadata file"
)
parser.add_argument("-fnames", nargs="*", help="New Files")
parser.add_argument("-schema", default="43", help="Metadata Schema")

args = parser.parse_args()

# Get access token as environment variable
token = os.environ["RDMTOK"]

# Context manager closes the metadata file promptly; the original handle
# was opened with open() and never closed.
with open(args.json_file[0], "r") as metaf:
    metadata = json.load(metaf)

production = True
publish = False
authors = True
# CaltechAUTHORS community id on the production instance.
community = "669e5e57-7d9e-4d19-8ab5-9c6158562fb3"

response = caltechdata_write(
    metadata,
    token,
    args.fnames,
    production,
    args.schema,
    publish,
    community=community,
    authors=authors,
)
print(response)
# Records already uploaded (completed) or deliberately skipped (excluded)
# are tracked in JSON files so the script can resume after interruption.
with open("completed_dois.json", "r") as infile:
    completed = json.load(infile)

for doi in completed:
    if doi in records:
        records.remove(doi)
    else:
        # Completed DOI that no longer appears in the bucket listing.
        print(doi)

with open("excluded_dois.json", "r") as infile:
    excluded = json.load(infile)

for doi in excluded:
    records.remove(doi)

for record in tqdm(records):
    base = record.split("/")[1]
    meta_path = path + base + "/metadata.json"
    metadata = None
    files = s3.ls(path + base)
    if len(files) == 0:
        # Nothing to upload; persist the exclusion so future runs skip it.
        excluded.append(record)
        print(f"No files available {record}")
        with open("excluded_dois.json", "w") as outfile:
            json.dump(excluded, outfile)
    else:
        try:
            metaf = s3.open(meta_path, "rb")
            metadata = json.load(metaf)
        except Exception:  # narrowed from a bare `except:`
            print(files)
            excluded.append(record)
            print(f"Missing metadata {record}")
            # Persist the exclusion BEFORE exiting; the original called
            # exit() first, which made this write unreachable.
            with open("excluded_dois.json", "w") as outfile:
                json.dump(excluded, outfile)
            exit()

    if metadata:
        metadata["identifiers"] = [{"identifier": record, "identifierType": "DOI"}]

        # Collect direct S3 links for the zip file or files.
        file_links = []
        for link in s3.glob(path + base + "/*.zip"):
            file_links.append(endpoint + link)

        metadata["types"] = {"resourceType": "", "resourceTypeGeneral": "Dataset"}
        metadata["schemaVersion"] = "http://datacite.org/schema/kernel-4"
        metadata["publicationYear"] = str(metadata["publicationYear"])
        metadata["rightsList"] = [
            {
                "rights": "cc-by-sa-4.0",
                "rightsUri": "http://creativecommons.org/licenses/by-sa/4.0/",
            }
        ]
        # Relations shared by every record in this collection.
        static = [
            {
                "relatedIdentifier": "10.25989/es8t-kswe",
                "relationType": "IsPartOf",
                "relatedIdentifierType": "DOI",
            },
            {
                "relatedIdentifier": "10.1038/s41524-019-0216-x",
                "relationType": "IsDocumentedBy",
                "relatedIdentifierType": "DOI",
            },
        ]
        if "relatedIdentifiers" in metadata:
            metadata["relatedIdentifiers"] += static
        else:
            metadata["relatedIdentifiers"] = static
        metadata["fundingReferences"] = [
            {
                "funderName": "Office of Science of the U.S. Department of Energy",
                "awardTitle": "Energy Innovation Hub Renewal - Fuels from Sunlight",
                "awardNumber": "DE-SC0004993",
            }
        ]

        if "descriptions" not in metadata:
            metadata["descriptions"] = [
                {"description": abstract, "descriptionType": "Abstract"}
            ]
        else:
            # Unexpected: the source metadata already carries descriptions;
            # stop so a human can decide how to merge them.
            print(metadata["descriptions"])
            exit()

        # Drop empty-list fields, which fail DataCite validation.
        for meta in metadata.copy():
            if metadata[meta] == []:
                metadata.pop(meta)
        # .get() guards against records without a contributors field.
        for contributor in metadata.get("contributors", []):
            if contributor["affiliation"] == []:
                contributor.pop("affiliation")
        new_cre = []
        for creator in metadata["creators"]:
            if creator["affiliation"] == []:
                creator.pop("affiliation")
            if creator["name"] != "Contributors":
                new_cre.append(creator)
        metadata["creators"] = new_cre

        doi = metadata["doi"].lower()
        # Fields DataCite returns on read but rejects on write.
        unnecessary = [
            "id",
            "doi",
            "container",
            "providerId",
            "clientId",
            "agency",
            "state",
        ]
        for un in unnecessary:
            if un in metadata:
                metadata.pop(un)
        if "dates" in metadata:
            for d in metadata["dates"]:
                d["date"] = str(d["date"])
        valid = schema43.validate(metadata)
        if not valid:
            # The original called iter_errors on an undefined name
            # (`instance`) after validator.validate() had already raised;
            # iterate the errors for `metadata` directly instead.
            errors = sorted(
                schema43.validator.iter_errors(metadata), key=lambda e: e.path
            )
            for error in errors:
                print(error.message)
            exit()

        # pop with a default: not every record carries a language field.
        metadata.pop("language", None)
        community = "d0de1569-0a01-498f-b6bd-4bc75d54012f"

        production = True

        new_id = caltechdata_write(
            metadata,
            schema="43",
            publish=True,
            production=True,
            file_links=file_links,
            s3=s3,
            community=community,
        )
        url = f"https://data.caltech.edu/records/{new_id}"

        # Register a brand-new DOI, or repoint an existing one at the record.
        result = requests.get(f"https://api.datacite.org/dois/{doi}")
        if result.status_code != 200:
            doi = datacite.public_doi(doi=record, metadata=metadata, url=url)
        else:
            doi = datacite.update_doi(doi=record, metadata=metadata, url=url)["doi"]
        completed.append(doi)
        with open("completed_dois.json", "w") as outfile:
            json.dump(completed, outfile)