├── .gitbook.yaml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── create-a-story.md │ └── feature_request.md └── workflows │ ├── documentation.yaml │ ├── python-publish.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── foundry-black.png ├── foundry-black.svg ├── foundry-dark.png ├── foundry-dark.svg ├── foundry-light.png ├── foundry-light.svg ├── foundry-logo-4.pptx ├── foundry-logo.pptx ├── foundry-ml.png ├── foundry-purple.png ├── foundry-white.png ├── foundry-white.svg ├── foundry.png └── foundry.svg ├── diagram.svg ├── docs ├── .gitbook │ └── assets │ │ ├── foundry-overview.png │ │ ├── foundry-purple (1).png │ │ ├── foundry-purple (2).png │ │ ├── foundry-purple (3).png │ │ ├── foundry-purple.png │ │ ├── foundry.png │ │ ├── image (1).png │ │ ├── image (2).png │ │ ├── image.png │ │ ├── screen-shot-2021-07-15-at-10.00.38-am.png │ │ └── screen-shot-2021-07-15-at-10.05.40-am.png ├── README.md ├── SUMMARY.md ├── command-line-interface.md ├── concepts │ ├── foundry-benchmarks.md │ ├── foundry-data-packages.md │ ├── foundry-datasets.md │ ├── foundry-models-and-functions.md │ └── overview.md ├── examples.md ├── foundry-package-foundry_test-1.1-documentation-html-autogeneration.md ├── foundry.auth.md ├── foundry.foundry.md ├── foundry.foundry_cache.md ├── foundry.foundry_dataset.md ├── foundry.https_download.md ├── foundry.https_upload.md ├── foundry.loaders.md ├── foundry.loaders.tf_wrapper.md ├── foundry.loaders.torch_wrapper.md ├── foundry.md ├── foundry.models.md ├── foundry.utils.md ├── how-to-contribute │ ├── code_of_conduct.md │ └── contributing.md ├── publishing-datasets.md ├── publishing-models.md ├── publishing │ ├── publishing-datasets.md │ └── publishing-models.md ├── sphinx-autogenerated-documentation.md └── support │ └── troubleshooting.md ├── examples ├── README.md ├── atom-position-finding │ ├── .ipynb_checkpoints │ │ └── atom_position_finding-checkpoint.ipynb │ ├── atom_position_finding.ipynb │ └── requirements.txt ├── bandgap │ ├── bandgap_demo.ipynb │ ├── foundry.json │ └── requirements.txt ├── dendrite-segmentation │ ├── dendrite_segmentation.ipynb │ ├── foundry.json │ └── requirements.txt ├── g4mp2-solvation │ └── g4mp2_solvation_demo.ipynb ├── oqmd │ ├── foundry.json │ ├── oqmd.ipynb │ └── requirements.txt ├── publishing-guides │ ├── data │ │ └── iris.csv │ └── dataset_publishing.ipynb ├── qmc_ml │ └── qmc_ml.ipynb ├── work_in_progress │ └── PACBEDCNN-thickness-mistilt │ │ └── PACBEDCNN_thickness_mistilt.ipynb └── zeolite │ ├── .ipynb_checkpoints │ └── zeolite_demo-checkpoint.ipynb │ ├── requirements.txt │ └── zeolite_demo.ipynb ├── foundry ├── __init__.py ├── auth.py ├── foundry.py ├── foundry_cache.py ├── foundry_dataset.py ├── https_download.py ├── https_upload.py ├── jsonschema_models │ ├── __init__.py │ ├── dc_model.py │ └── project_model.py ├── loaders │ ├── __init__.py │ ├── tf_wrapper.py │ └── torch_wrapper.py ├── models.py └── utils.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── test-requirements.txt ├── test.py └── tests ├── README.md ├── __init__.py ├── data ├── __init__.py ├── https_test │ └── test_data.json └── tmp_data.json ├── test_data.py ├── test_data ├── elwood_md_v1.2 │ └── MD_properties.csv └── test_dataset │ └── elwood.hdf5 ├── test_foundry.py ├── test_foundry_cache.py ├── test_foundry_components.py ├── test_foundry_dataset.py └── test_https_download.py /.gitbook.yaml: -------------------------------------------------------------------------------- 1 | root: ./docs/ 2 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/create-a-story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Create a Story 3 | about: Suggest a user-centered feature, told as a Story 4 | title: My Story 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | _Short description_ 11 | 12 | # Assumptions: 13 | 1. 14 | 2. 15 | 16 | # Acceptance Criteria 17 | Given..., when..., then... 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: build api documentation 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - closed 7 | push: 8 | branches: 9 | - "*" 10 | 11 | jobs: 12 | build_documentation: 13 | if: github.event.pull_request.merged == true 14 | name: generate api markdown docs 15 | runs-on: ubuntu-latest 16 | env: 17 | CLIENT_ID: ${{ secrets.CLIENT_ID }} 18 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }} 19 | steps: 20 | - name: Check out repo's default branch 21 | uses: actions/checkout@v3 22 | - name: Setup python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.10' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r requirements.txt 30 | pip install lazydocs 31 | - name: Build docs from docstrings 32 | continue-on-error: true 33 | run: | 34 | lazydocs --output-path="docs" --overview-file="README.md" --src-base-url="https://github.com/MLMI2-CSSI/foundry/tree/main" . 35 | - name: Commit files 36 | run: | 37 | echo ${{ github.ref }} 38 | git add . 39 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" 40 | git config --local user.name "github-actions[bot]" 41 | git commit -m "CI: Automated documentation build" -a || exit 0 42 | git push origin ${{ github.event.pull_request.base.ref }} 43 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - dev 7 | - main 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 20 14 | strategy: 15 | matrix: 16 | python-version: ["3.9", "3.10", "3.11", "3.12"] 17 | 18 | env: 19 | CLIENT_ID: ${{ secrets.CLIENT_ID }} 20 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }} 21 | name: build 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | cache: 'pip' 29 | 30 | - name: Globus auth 31 | run: 'echo "$GLOBUS_CONFIG" > ~/.globus-native-apps.cfg' 32 | shell: bash 33 | env: 34 | GLOBUS_CONFIG: "${{ secrets.GLOBUS_CONFIG }}" 35 | 36 | - 
name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | pip install -r requirements.txt 40 | pip install -r test-requirements.txt 41 | 42 | - name: Lint with flake8 43 | run: | 44 | # stop the build if there are any flake8 errors 45 | flake8 foundry 46 | 47 | - name: Test with pytest 48 | run: | 49 | pytest -s -v tests/ --cov=./foundry --cov-report=xml 50 | - name: Upload coverage to Codecov 51 | run: | 52 | curl -Os https://uploader.codecov.io/v0.1.0_4653/linux/codecov 53 | 54 | chmod +x codecov 55 | ./codecov -t ${{ secrets.CODECOV_TOKEN }} 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */build/* 2 | *.DS_STORE 3 | *.pyc 4 | *.idea 5 | */foundry_ml.egg-info/* 6 | globus_creds 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 The University of Chicago 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | [![PyPI](https://img.shields.io/pypi/v/foundry_ml.svg)](https://pypi.python.org/pypi/foundry_ml) 8 | [![Tests](https://github.com/MLMI2-CSSI/foundry/actions/workflows/tests.yml/badge.svg)](https://github.com/MLMI2-CSSI/foundry/actions/workflows/tests.yml) 9 | [![Publish](https://github.com/MLMI2-CSSI/foundry/actions/workflows/python-publish.yml/badge.svg)](https://github.com/MLMI2-CSSI/foundry/actions/workflows/python-publish.yml) 10 | [![NSF-1931306](https://img.shields.io/badge/NSF-1931306-blue)](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1931306&HistoricalAwards=false) 11 | [](https://ai-materials-and-chemistry.gitbook.io/foundry/) 12 | 13 | 14 | Foundry-ML simplifies the discovery and usage of ML-ready datasets in materials science and chemistry, providing a simple API to access even complex datasets. 15 | * Load ML-ready data with just a few lines of code 16 | * Work with datasets in local or cloud environments
17 | * Publish your own datasets with Foundry to promote community usage 18 | * (in progress) Run published ML models without hassle 19 | 20 | Learn more and see our available datasets on [Foundry-ML.org](https://foundry-ml.org/) 21 | 22 | 23 | 24 | # Documentation 25 | Information on how to install and use Foundry is available in our documentation [here](https://ai-materials-and-chemistry.gitbook.io/foundry/v/docs/). 26 | 27 | DLHub documentation for model publication and running information can be found [here](https://dlhub-sdk.readthedocs.io/en/latest/servable-publication.html). 28 | 29 | # Quick Start 30 | Install Foundry-ML via command line with: 31 | `pip install foundry_ml` 32 | 33 | You can use the following code to import and instantiate Foundry-ML, then load a dataset. 34 | 35 | ```python 36 | from foundry import Foundry 37 | f = Foundry(index="mdf") 38 | 39 | 40 | f = f.load("10.18126/e73h-3w6n", globus=True) 41 | ``` 42 | *NOTE*: If you run locally and don't want to install the [Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal), just set `globus=False`. 43 | 44 | If running this code in a notebook, a table of metadata for the dataset will appear: 45 | 46 | metadata 47 | 48 | We can load the data with `f.load_data()`, specifying splits such as `train` for different segments of the dataset, and then visualize it with matplotlib (the snippet below assumes `import matplotlib.pyplot as plt`). 49 | 50 | ```python 51 | res = f.load_data() 52 | 53 | imgs = res['train']['input']['imgs'] 54 | desc = res['train']['input']['metadata'] 55 | coords = res['train']['target']['coords'] 56 | 57 | n_images = 3 58 | offset = 150 59 | key_list = list(res['train']['input']['imgs'].keys())[0+offset:n_images+offset] 60 | 61 | fig, axs = plt.subplots(1, n_images, figsize=(20,20)) 62 | for i in range(n_images): 63 | axs[i].imshow(imgs[key_list[i]]) 64 | axs[i].scatter(coords[key_list[i]][:,0], coords[key_list[i]][:,1], s = 20, c = 'r', alpha=0.5) 65 | ``` 66 | Screen Shot 2022-10-20 at 2 22 43 PM 67 | 68 | [See full examples](./examples) 69 | 70 | # How to Cite 71 | If you find Foundry-ML useful, please cite the following [paper](https://doi.org/10.21105/joss.05467): 72 | 73 | ``` 74 | @article{Schmidt2024, 75 | doi = {10.21105/joss.05467}, 76 | url = {https://doi.org/10.21105/joss.05467}, 77 | year = {2024}, publisher = {The Open Journal}, 78 | volume = {9}, 79 | number = {93}, 80 | pages = {5467}, 81 | author = {Kj Schmidt and Aristana Scourtas and Logan Ward and Steve Wangen and Marcus Schwarting and Isaac Darling and Ethan Truelove and Aadit Ambadkar and Ribhav Bose and Zoa Katok and Jingrui Wei and Xiangguo Li and Ryan Jacobs and Lane Schultz and Doyeon Kim and Michael Ferris and Paul M. Voyles and Dane Morgan and Ian Foster and Ben Blaiszik}, 82 | title = {Foundry-ML - Software and Services to Simplify Access to Machine Learning Datasets in Materials Science}, journal = {Journal of Open Source Software} 83 | } 84 | ``` 85 | 86 | # Contributing 87 | Foundry is an Open Source project and we encourage contributions from the community. To contribute, please fork from the `main` branch and open a Pull Request back against `main`. A member of our team will review your PR shortly. 
88 | 89 | ## Developer notes 90 | In order to enforce consistency with external schemas for the metadata and datacite structures ([contained in the MDF data schema repository](https://github.com/materials-data-facility/data-schemas)), the `dc_model.py` and `project_model.py` pydantic data models (found in the `foundry/jsonschema_models` folder) were generated using the [datamodel-code-generator](https://github.com/koxudaxi/datamodel-code-generator/) tool. To comply with flake8 linting, the `--use-annotated` flag was passed so that regex patterns in `dc_model.py` are specified using pydantic's `Annotated` type rather than the soon-to-be-deprecated `constr` type. The command used to run datamodel-code-generator looks like: 91 | ``` 92 | datamodel-codegen --input dc.json --output dc_model.py --use-annotated 93 | ``` 94 | 95 | # Primary Support 96 | This work was supported by the National Science Foundation under NSF Award Number: 1931306 "Collaborative Research: Framework: Machine Learning Materials Innovation Infrastructure". 97 | 98 | # Other Support 99 | Foundry-ML brings together many components in the materials data ecosystem, including [MAST-ML](https://mastmldocs.readthedocs.io/en/latest/), the [Data and Learning Hub for Science](https://www.dlhub.org) (DLHub), and the [Materials Data Facility](https://materialsdatafacility.org) (MDF). 100 | 101 | ## MAST-ML 102 | This work was supported by the National Science Foundation (NSF) SI2 award No. 1148011 and DMREF award number DMR-1332851. 103 | 104 | ## The Data and Learning Hub for Science (DLHub) 105 | This material is based upon work supported by Laboratory Directed Research and Development (LDRD) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357. 106 | https://www.dlhub.org 107 | 108 | ## The Materials Data Facility 109 | This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design (CHiMaD)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design (CHiMaD). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design (IMaD): Leverage, Innovate, and Disseminate". 
110 | https://www.materialsdatafacility.org 111 | -------------------------------------------------------------------------------- /assets/foundry-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-black.png -------------------------------------------------------------------------------- /assets/foundry-black.svg: -------------------------------------------------------------------------------- 1 | FOUNDRY-MLDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-dark.png -------------------------------------------------------------------------------- /assets/foundry-dark.svg: -------------------------------------------------------------------------------- 1 | FOUNDRYDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-light.png -------------------------------------------------------------------------------- /assets/foundry-light.svg: -------------------------------------------------------------------------------- 1 | FOUNDRYDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry-logo-4.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-logo-4.pptx -------------------------------------------------------------------------------- /assets/foundry-logo.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-logo.pptx -------------------------------------------------------------------------------- /assets/foundry-ml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-ml.png -------------------------------------------------------------------------------- /assets/foundry-purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-purple.png -------------------------------------------------------------------------------- /assets/foundry-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-white.png -------------------------------------------------------------------------------- /assets/foundry-white.svg: -------------------------------------------------------------------------------- 1 | FOUNDRY-MLDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry.png -------------------------------------------------------------------------------- /assets/foundry.svg: -------------------------------------------------------------------------------- 1 | FOUNDRYDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-overview.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (1).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple (2).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (2).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple (3).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (3).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image (1).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image (2).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image (2).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image.png -------------------------------------------------------------------------------- 
/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.00.38-am.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.00.38-am.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Foundry 2 | 3 | ![](.gitbook/assets/foundry-purple%20%283%29.png) 4 | 5 | ## What is Foundry? 6 | 7 | Foundry is a Python package that simplifies the discovery and usage of machine-learning-ready datasets in materials science and chemistry. Foundry provides software tools that make it easy to load these datasets and work with them in local or cloud environments. Further, Foundry provides a dataset specification and defined curation flows that allow users to create new datasets for the community to use through this same interface. 8 | 9 | ## Installation 10 | 11 | Foundry can be installed on any operating system with Python, using pip: 12 | 13 | ```text 14 | pip install foundry-ml 15 | ``` 16 | 17 | ### Globus 18 | 19 | Foundry uses the Globus platform for authentication, search, and to optimize some data transfer operations. Follow the steps below to get set up. 20 | 21 | * [Create a free account.](https://app.globus.org) You can create one with your institutional credentials or with free IDs \(GlobusID, Google, ORCID, etc.\). 22 | * [Set up a Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal) _**\(optional\)**_. Some Foundry capabilities will work more efficiently when GCP is set up. 23 | 24 | ## Project Support 25 | 26 | This work was supported by the National Science Foundation under NSF Award Number: 1931306 "Collaborative Research: Framework: Machine Learning Materials Innovation Infrastructure". 27 | 28 | ### Other Support 29 | 30 | Foundry brings together many components in the materials data ecosystem, including MAST-ML, the Data and Learning Hub for Science \(DLHub\), and The Materials Data Facility \(MDF\). 31 | 32 | #### MAST-ML 33 | 34 | This work was supported by the National Science Foundation \(NSF\) SI2 award No. 1148011 and DMREF award number DMR-1332851. 35 | 36 | #### The Data and Learning Hub for Science \(DLHub\) 37 | 38 | This material is based upon work supported by Laboratory Directed Research and Development \(LDRD\) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357. [https://www.dlhub.org](https://www.dlhub.org) 39 | 40 | #### The Materials Data Facility 41 | 42 | This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design \(CHiMaD\)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design \(CHiMaD\). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design \(IMaD\): Leverage, Innovate, and Disseminate". [https://www.materialsdatafacility.org](https://www.materialsdatafacility.org) 43 | 44 | 
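As a quick sanity check after installing and setting up Globus, the short sketch below creates a client and lists the available datasets. This is a hedged example: the `index="mdf"` argument mirrors the main README, and the `no_browser`/`no_local_server` flags are only needed on cloud resources \(see the examples page\).

```python
from foundry import Foundry

# On a laptop with a browser, the default auth flow is fine; on cloud
# resources (Colab, Binder), pass no_browser=True, no_local_server=True.
f = Foundry(index="mdf")

# Returns a pandas DataFrame summarizing the available Foundry datasets
print(f.list())
```
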
-------------------------------------------------------------------------------- /docs/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | 3 | * [Getting started with Foundry](README.md) 4 | 5 | ## How to contribute 6 | 7 | * [Contribution Process](how-to-contribute/contributing.md) 8 | * [Contributor Covenant](how-to-contribute/code_of_conduct.md) 9 | 10 | --- 11 | 12 | * [Sphinx Autogenerated documentation - markdown](sphinx-autogenerated-documentation.md) 13 | * [foundry package — Foundry\_test 1.1 documentation - HTML AUTOGENERATION](foundry-package-foundry_test-1.1-documentation-html-autogeneration.md) 14 | 15 | -------------------------------------------------------------------------------- /docs/command-line-interface.md: -------------------------------------------------------------------------------- 1 | # Command Line Interface \(CLI\) 2 | 3 | The Foundry command line interface \(CLI\) allows users to build their data environment from the command line using a specification file. This is the data analog to how `pip` or `conda` allow users to build a software environment from software specification files. 4 | 5 | ## Installation 6 | 7 | ```text 8 | pip install foundry-ml-cli 9 | ``` 10 | 11 | ### CLI Options 12 | 13 | **`--file`** : \(string\) the name of the specification file to build. _Default: "./foundry.json"_ 14 | 15 | **`--globus`** : \(bool\) If True, uses Globus to download the files, otherwise HTTPS. _Default: False_ 16 | 17 | **`--interval`** : \(int\) Time in seconds between polling operations to check transfer status. _Default: 3_ 18 | 19 | **`--verbose`** : \(bool\) If True, print out more logging information to the console. _Default: False_ 20 | 21 | ## Example Usage 22 | 23 | In a folder containing a file named `foundry.json`: 24 | 25 | ```text 26 | /foundry.json 27 | 28 | 29 | $ foundry 30 | ``` 31 | 32 | This is the same as running: 33 | 34 | ```text 35 | /foundry.json 36 | 37 | 38 | $ foundry --file=foundry.json --globus=False --interval=3 --verbose=False 39 | ``` 40 | 41 | -------------------------------------------------------------------------------- /docs/concepts/foundry-benchmarks.md: -------------------------------------------------------------------------------- 1 | # Foundry Benchmarks 2 | 3 | -------------------------------------------------------------------------------- /docs/concepts/foundry-data-packages.md: -------------------------------------------------------------------------------- 1 | # Foundry Data Packages 2 | 3 | Foundry Data Packages provide a logical and portable way to specify and collect data for analyses. From a data package, a user can easily build a local data environment matching the data package, as sketched below. 
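As a hedged sketch of that workflow in Python \(rather than the CLI\), the `build` method documented in the autogenerated API reference accepts a specification as a dict or a relative filename; the `./foundry.json` path and the option values below are placeholders, not fixed requirements.

```python
from foundry import Foundry

f = Foundry()

# Build a local data environment from a data package specification file.
# globus=False downloads over HTTPS; interval is the polling time in seconds.
f = f.build("./foundry.json", globus=False, interval=3)
```
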
4 | 5 | ## Data Package Specification Fields 6 | 7 | **`name`** : \(string\) A name for the data package 8 | 9 | **`version`** : \(string\) A version of the form <major>.<minor>.<sub>, e.g., "1.2.0" 10 | 11 | **`description`** : \(string\) A short description of the data package and its intended use 12 | 13 | **`tags`** : \(list\) A list of tag strings associated with the data package 14 | 15 | **`dependencies`** : \(list\) A list of dependency objects associated with the data package 16 | 17 | **`private`** : \(bool\) Whether the data package is to be registered in a public data package index 18 | 19 | ### Dependency Objects 20 | 21 | **`identifier`** : \(string\) Unique identifier for the dataset 22 | 23 | **`version`** : \(string\) The version of the dataset to use 24 | 25 | **`provider`** : \(string\) The dataset provider. _Currently only "MDF" is supported_ 26 | 27 | ```javascript 28 | { 29 | "identifier": "_test_foundry_mp_bandgap_v1.1", 30 | "version": "1.1", 31 | "provider": "MDF" 32 | } 33 | ``` 34 | 35 | ## Example Specification 36 | 37 | ```javascript 38 | { 39 | "name": "Band Gap Analysis", 40 | "version": "1.0.0", 41 | "description": "Datasets for band gap uber model generation", 42 | "private": true, 43 | "dependencies": [{ 44 | "name": "_test_foundry_experimental_bandgap_v1.1", 45 | "version": "1.1", 46 | "provider": "MDF" 47 | }, 48 | { 49 | "name": "_test_foundry_mp_bandgap_v1.1", 50 | "version": "1.1", 51 | "provider": "MDF" 52 | }, 53 | { 54 | "name": "_test_foundry_oqmd_bandgap_v1.1", 55 | "version": "1.1", 56 | "provider": "MDF" 57 | }, 58 | { 59 | "name": "_test_foundry_assorted_computational_bandgap_v1.1", 60 | "version": "1.1", 61 | "provider": "MDF" 62 | } 63 | ] 64 | } 65 | ``` 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/concepts/foundry-datasets.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Describes the metadata for each Foundry dataset 3 | --- 4 | 5 | # Foundry Datasets 6 | 7 | Foundry Datasets are composed of two key components, [_**data**_](foundry-datasets.md#data) and descriptive [_**metadata**_](foundry-datasets.md#describing-datasets-with-metadata). In order to make the data easily consumable, _**data**_ \(consisting of files\) should be assembled following the supported structures. The _**metadata**_ description allows tracking of high-level information \(e.g., authors, associated institutions, licenses, data location\) and also information on how to operate on the datasets \(e.g., how to load the data, training/test splits\). 8 | 9 | ### **Data** 10 | 11 | ### Example - Record-Based Data 12 | 13 | #### **Tabular Data** 14 | 15 | For tabular data, columns should represent the different keys of the data, and rows should represent individual records. 16 | 17 | {% hint style="info" %} 18 | Supported tabular data types currently include JSON, CSV, and XLSX. 19 | {% endhint %} 20 | 21 | In this example, we showcase how to describe a JSON record-based dataset where each record is a valid JSON object in a JSON list or a line in a line-delimited JSON file. 22 | 23 | | **feature\_1** | **feature\_2** | **material\_type** | **band\_gap** | 24 | | :--- | :--- | :--- | :--- | 25 | | 0.10 | 0.52 | 1 | 1.40 | 26 | | 0.34 | 0.910 | 0 | 0.73 | 27 | | ... | ... | ... | ... |
 28 | 29 | For this example dataset the `Key` object could be: 30 | 31 | ```javascript 32 | { 33 | "short_name": "oqmd-bandgaps", 34 | "data_type": "tabular", 35 | "task_type": ["supervised"], 36 | "domain": ["materials science"], 37 | "n_items": 29197, 38 | "splits": [{ 39 | "type": "train", 40 | "path": "foundry_dataframe.json", 41 | "label": "train" 42 | }], 43 | "keys": [{ 44 | "key": ["reference"], 45 | "type": "input", 46 | "units": "", 47 | "description": "source publication of the bandgap value" 48 | }, { 49 | "key": ["icsd_id"], 50 | "type": "input", 51 | "units": "", 52 | "description": "corresponding id in ICSD of this compound" 53 | }, { 54 | "key": ["structure"], 55 | "type": "input", 56 | "units": "", 57 | "description": "the structure of this compound" 58 | }, { 59 | "key": ["composition"], 60 | "type": "input", 61 | "units": "", 62 | "description": "reduced composition of this compound" 63 | }, { 64 | "key": ["comments"], 65 | "type": "input", 66 | "units": "", 67 | "description": "Additional information about this bandgap measurement" 68 | }, { 69 | "key": ["bandgap type"], 70 | "type": "input", 71 | "units": "", 72 | "description": "the type of the bandgap, e.g., direct or indirect" 73 | }, { 74 | "key": ["comp method"], 75 | "type": "input", 76 | "units": "", 77 | "description": "functional used to calculate the bandgap" 78 | }, { 79 | "key": ["space group"], 80 | "type": "input", 81 | "units": "", 82 | "description": "the space group of this compound" 83 | }, 84 | { 85 | "key": ["bandgap value (eV)"], 86 | "type": "target", 87 | "units": "eV", 88 | "description": "value of the bandgap" 89 | } 90 | ] 91 | } 92 | ``` 93 | 94 | **TODO** 95 | 96 | ```text 97 | "keys":[{ 98 | "key": "feature_1", 99 | "type": "input", 100 | "units": None, 101 | "description": "This is feature 1" 102 | },{ 103 | "key": "feature_2", 104 | "type": "input", 105 | "units": None, 106 | "description": "This is feature 2" 107 | },{ 108 | "key": "material_type", 109 | "type": "input", 110 | "units": None, 111 | "description": "This is the material type", 112 | "labels":["perovskite","not perovskite"] 113 | },{ 114 | "key": "band_gap", 115 | "type": "target", 116 | "units": "eV", 117 | "description": "This is the simulated band gap in eV" 118 | } 119 | ] 120 | ``` 121 | 122 | {% hint style="info" %} 123 | This tabular data file should be saved in the base directory as **`foundry_dataframe.json`**. 124 | {% endhint %} 125 | 126 | * Write general pandas reader to try csv, JSON, xlsx for opening 127 | 128 | #### Hierarchical Data 129 | 130 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case, features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be: 131 | 132 | ```javascript 133 | { 134 | "short_name": "segmentation-dev", 135 | "data_type": "hdf5", 136 | "task_type": ["unsupervised", "segmentation"], 137 | "domain": ["materials science", "chemistry"], 138 | "n_items": 100, 139 | "splits": [{ 140 | "type": "train", 141 | "path": "foundry.hdf5", 142 | "label": "train" 143 | }], 144 | "keys": [{ 145 | "key": ["train/input"], 146 | "type": "input", 147 | "description": "input, unlabeled images" 148 | }, { 149 | "key": ["train/output"], 150 | "type": "target", 151 | "description": "target, labeled images" 152 | }] 153 | } 154 | ``` 155 | 156 | ```text 157 | "keys":[{ 158 | "key": "/data/arr1", 159 | "type": "input", 160 | "units": None, 161 | "description": "This is an array containing input data" 162 | },{ 163 | "key": "/other_data/arr2", 164 | "type": "input", 165 | "units": None, 166 | "description": "This is another array containing input data" 167 | },{ 168 | "key": "/data/band_gaps", 169 | "type": "target", 170 | "units": "eV", 171 | "description": "This is the simulated band gap in eV" 172 | } 173 | ] 174 | ``` 175 | 
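To make the `/` notation concrete, the minimal sketch below reads those keys directly with `h5py`. This is an illustration only: it assumes a local `foundry.hdf5` laid out as in the example above, and the variable names are placeholders.

```python
import h5py

# Each Foundry key is a path into the HDF5 hierarchy, so key strings
# from the metadata can be used directly as h5py indices.
with h5py.File("foundry.hdf5", "r") as f:
    inputs = f["train/input"][...]    # an "input"-type key
    targets = f["train/output"][...]  # a "target"-type key
    print(inputs.shape, targets.shape)
```
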
176 | ## Descriptive Metadata 177 | 178 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). This metadata captures high-level bibliographic information \(e.g., authors, associated institutions, licenses, and data location\). Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema. 179 | 180 | **Keys \(object\):** Key objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Key objects have the following properties: 181 | 182 | * **`key (str)`** A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\) 183 | * **`type (str)`** The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_ 184 | * **`units (str)[optional]`** The scientific units associated with a key. _Default: None_ 185 | * **`description (str)[optional]`** A free-text description of the key. _Default: None_ 186 | * **`labels (list of str)[optional]`:** A list of strings mapped to integers in a key column 187 | 188 | **short\_name \(str\):** Short name is a unique name associated with this dataset that makes loading and referencing it easier. 189 | 190 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. _Options \["tabular","hdf5"\]_
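At publication time, this metadata is passed to the client as a plain dictionary. The hedged sketch below follows the `publish` signature from the autogenerated API reference; the metadata mirrors the `foundry` block shown next, and the title, authors, and Globus endpoint URL are placeholders.

```python
from foundry import Foundry

f = Foundry()

# Metadata assembled per the field descriptions above
foundry_metadata = {
    "keys": [
        {"key": ["feature_1"], "type": "input", "units": "",
         "description": "This is an input"},
        {"key": ["band_gap"], "type": "target", "units": "eV",
         "description": "This is the simulated band gap in eV"},
    ],
    "short_name": "my_short_name",
    "type": "tabular",
}

# data_source is a URL for a Globus endpoint holding the files (placeholder here)
res = f.publish(foundry_metadata, data_source="<globus-endpoint-url>",
                title="Example Band Gap Dataset", authors=["Nunez, Victoria"])
# res contains a source_id that can be tracked with f.check_status(...)
```
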
 191 | 192 | ```text 193 | "foundry": { 194 | "dc": {}, 195 | "keys": [{ 196 | "type": "input", 197 | "name": "feature_1", 198 | "units": "", 199 | "description": "This is an input" 200 | }, 201 | { 202 | "type": "target", 203 | "name": "band_gap", 204 | "units": "eV", 205 | "description": "This is the simulated band gap in eV", 206 | "labels": [] 207 | } 208 | ], 209 | "short_name": "my_short_name", 210 | "type": "tabular" 211 | } 212 | ``` 213 | 214 | -------------------------------------------------------------------------------- /docs/concepts/foundry-models-and-functions.md: -------------------------------------------------------------------------------- 1 | # Foundry Models and Functions 2 | 3 | ## Foundry Model Providers 4 | 5 | Currently Foundry supports models and functions provided via the [Data and Learning Hub for Science \(DLHub\)](https://www.dlhub.org)/[FuncX](https://www.funcx.org). 6 | 7 | -------------------------------------------------------------------------------- /docs/concepts/overview.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | TODO: 4 | 5 | * Change the code snippet in the image 6 | * Write the text :\) 7 | 8 | ![](../.gitbook/assets/foundry-overview.png) 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Python 2 | 3 | ## Scientific Examples 4 | 5 | [Check out our example notebooks](https://github.com/MLMI2-CSSI/foundry/tree/master/examples) for how to load or publish datasets using Foundry. 6 | 7 | ## Quickstart 8 | 9 | ### Creating a Foundry Client 10 | 11 | The Foundry client provides access to all of the methods described here for listing, loading, and publishing datasets and models. The code below will create a Foundry client. 12 | 13 | ```python 14 | from foundry import Foundry 15 | f = Foundry() 16 | ``` 17 | 18 | {% hint style="success" %} 19 | If you are running your script on cloud resources \(e.g. Google Colab, Binder\), see [Using Foundry on Cloud Computing Resources](examples.md#using-foundry-on-cloud-computing-resources). 20 | {% endhint %} 21 | 22 | ### Listing Datasets 23 | 24 | To show all available Foundry datasets, you can use the Foundry `list()` method as follows. The method returns a pandas DataFrame with details on the available datasets. 25 | 26 | ```python 27 | f.list() 28 | ``` 29 | 30 | ### Loading Datasets 31 | 32 | The Foundry client can be used to access datasets using a `source_id`, e.g. here `"_test_foundry_fashion_mnist_v1.1"`. You can retrieve the `source_id` from the [`list()` method](examples.md#listing-datasets). 33 | 34 | ```python 35 | from foundry import Foundry 36 | f = Foundry() 37 | f = f.load("_test_foundry_fashion_mnist_v1.1") 38 | ``` 39 | 40 | This will remotely load the metadata \(e.g., data location, data keys, etc.\) and download the data to local storage if it is not already cached. Data can be downloaded via HTTPS without additional setup, or more optimally with a Globus endpoint [set up](https://www.globus.org/globus-connect-personal) on your machine. 41 | 42 | Once the data are accessible locally, access them with the `load_data()` method, which loads data from a specific split defined for the dataset; here we use `train`. 
43 | 44 | ```python 45 | res = f.load_data() 46 | X,y = res['train'] 47 | ``` 48 | 49 | The data are then usable within the `X` and `y` variables. This full example can be found in [`/examples/fashion-mnist/`](https://github.com/MLMI2-CSSI/foundry/tree/master/examples/fashion-mnist). 50 | 51 | ## Using Foundry on Cloud Computing Resources 52 | 53 | Foundry works with common cloud computing providers \(e.g., the NSF sponsored Jetstream and Google Colab\). On these resources, simply add the following arguments to use a cloud-compatible authentication flow. 54 | 55 | ```python 56 | f = Foundry(no_browser=True, no_local_server=True) 57 | ``` 58 | 59 | When downloading data, add the following argument to download via HTTPS. 60 | 61 | {% hint style="info" %} 62 | This method may be slow for large datasets and datasets with many files 63 | {% endhint %} 64 | 65 | ```python 66 | f.load(globus=False) 67 | X, y = f.load_data() 68 | ``` 69 | 70 | -------------------------------------------------------------------------------- /docs/foundry-package-foundry_test-1.1-documentation-html-autogeneration.md: -------------------------------------------------------------------------------- 1 | # foundry package — Foundry\_test 1.1 documentation - HTML AUTOGENERATION 2 | 3 | ## foundry.foundry module[¶]() 4 | 5 | _class_ foundry.foundry.Foundry\(_no\_browser=False_, _no\_local\_server=False_, _search\_index='mdf-test'_, _\*_, _dc: Dict = {}_, _mdf: Dict = {}_, _dataset:_ [_foundry.models.FoundryDataset_]() _= {}_, _config:_ [_foundry.models.FoundryConfig_]() _= FoundryConfig\(dataframe\_file='foundry\_dataframe.json', data\_file='foundry.hdf5', metadata\_file='foundry\_metadata.json', destination\_endpoint=None, local=False, metadata\_key='foundry', organization='foundry', local\_cache\_dir='./data'\)_, _dlhub\_client: Any = None_, _forge\_client: Any = None_, _connect\_client: Any = None_, _xtract\_tokens: Any = None_\)[¶]() 6 | 7 | Bases: [`foundry.models.FoundryMetadata`]() 8 | 9 | Foundry Client Base Class TODO: ——- Add Docstring build\(_spec_, _globus=False_, _interval=3_, _file=False_\)[¶]() 10 | 11 | Build a Foundry Data Package :param spec: dict or str \(relative filename\) of the data package specification :type spec: multiple :param globus: if True use Globus to fetch datasets :type globus: bool :param interval: Polling interval on checking task status in seconds. :type interval: int :param type: One of “file” or None :type type: strReturns 12 | 13 | **\(Foundry\)**Return type 14 | 15 | self: for chaining check\_model\_status\(_res_\)[¶]() 16 | 17 | Check status of model or function publication to DLHub 18 | 19 | TODO: currently broken on DLHub side of things check\_status\(_source\_id_, _short=False_, _raw=False_\)[¶]() 20 | 21 | Check the status of your submission.Parameters 22 | 23 | * **source\_id** \(_str_\) – The `source_id` \(`source_name` + version information\) of the submission to check. Returned in the `res` result from `publish()` via MDF Connect Client. 24 | * **short** \(_bool_\) – When `False`, will print a status summary containing all of the status steps for the dataset. When `True`, will print a short finished/processing message, useful for checking many datasets’ status at once. **Default:** `False` 25 | * **raw** \(_bool_\) – When `False`, will print a nicely-formatted status summary. When `True`, will return the full status result. For direct human consumption, `False` is recommended. 
**Default:** `False` 26 | 27 | Returns 28 | 29 | The full status result.Return type 30 | 31 | If `raw` is `True`, _dict_ collect\_dataframes\(_packages=\[\]_\)[¶]() 32 | 33 | Collect dataframes of local data packages :param packages: List of packages to collect, defaults to all :type packages: listReturns 34 | 35 | **\(tuple\)**Return type 36 | 37 | Tuple of X\(pandas.DataFrame\), y\(pandas.DataFrame\) configure\(_\*\*kwargs_\)[¶]() 38 | 39 | Set Foundry config :keyword file: Path to the file containing :kwtype file: str :keyword \(default: self.config.metadata\_file\) 40 | 41 | dataframe\_file \(str\): filename for the dataframe file default:”foundry\_dataframe.json” data\_file \(str\): : filename for the data file default:”foundry.hdf5” destination\_endpoint \(str\): Globus endpoint UUID where Foundry data should move local\_cache\_dir \(str\): Where to place collected data default:”./data”Returns 42 | 43 | **\(Foundry\)**Return type 44 | 45 | self: for chaining connect\_client_: Any_[¶]() describe\_model\(\)[¶]() dlhub\_client_: Any_[¶]() download\(_globus=True_, _verbose=False_, _\*\*kwargs_\)[¶]() 46 | 47 | Download a Foundry dataset :param globus: if True, use Globus to download the data else try HTTPS :type globus: bool :param verbose: if True print out debug information during the download :type verbose: boolReturns 48 | 49 | **\(Foundry\)**Return type 50 | 51 | self: for chaining forge\_client_: Any_[¶]() get\_keys\(_type_, _as\_object=False_\)[¶]() 52 | 53 | Get keys for a Foundry datasetParameters 54 | 55 | * **type** \(_str_\) – The type of key to be returned e.g., “input”, “target” 56 | * **as\_object** \(_bool_\) – When `False`, will return a list of keys in as strings When `True`, will return the full key objects **Default:** `False` 57 | 58 | Returns: \(list\) String representations of keys or if `as_object` 59 | 60 | is False otherwise returns the full key objects. get\_packages\(_paths=False_\)[¶]() 61 | 62 | Get available local data packagesParameters 63 | 64 | **paths** \(_bool_\) – If True return paths in addition to package, if False return package name onlyReturns 65 | 66 | **\(list\)**Return type 67 | 68 | List describing local Foundry packages list\(\)[¶]() 69 | 70 | List available Foundry data packagesReturns 71 | 72 | **\(pandas.DataFrame\)**Return type 73 | 74 | DataFrame with summary list of Foundry data packages including name, title, and publication year load\(_name_, _download=True_, _globus=True_, _verbose=False_, _metadata=None_, _\*\*kwargs_\)[¶]() 75 | 76 | Load the metadata for a Foundry dataset into the client :param name: Name of the foundry dataset :type name: str :param download: If True, download the data associated with the package \(default is True\) :type download: bool :param globus: If True, download using Globus, otherwise https :type globus: bool :param verbose: If True print additional debug information :type verbose: bool :param metadata: **For debug purposes.** A search result analog to prepopulate metadata. :type metadata: dictKeyword Arguments 77 | 78 | **interval** \(_int_\) – How often to poll Globus to check if transfers are completeReturnsReturn type 79 | 80 | self load\_data\(_source\_id=None_, _globus=True_\)[¶]() 81 | 82 | Load in the data associated with the prescribed dataset 83 | 84 | Tabular Data Type: Data are arranged in a standard data frame stored in self.dataframe\_file. 
The contents are read, and 85 | 86 | File Data Type: <<Add desc>> 87 | 88 | For more complicated data structures, users should subclass Foundry and override the load\_data functionParameters 89 | 90 | * **inputs** \(_list_\) – List of strings for input columns 91 | * **targets** \(_list_\) – List of strings for output columns 92 | 93 | Returns ——-s 94 | 95 | > \(tuple\): Tuple of X, y values 96 | 97 | publish\(_foundry\_metadata_, _data\_source_, _title_, _authors_, _update=False_, _publication\_year=None_, _\*\*kwargs_\)[¶]() 98 | 99 | Submit a dataset for publication :param foundry\_metadata: Dict of metadata describing data package :type foundry\_metadata: dict :param data\_source: Url for Globus endpoint :type data\_source: string :param title: Title of data package :type title: string :param authors: List of data package author names e.g., Jack Black 100 | 101 | > or Nunez, Victoria 102 | 103 | Parameters 104 | 105 | * **update** \(_bool_\) – True if this is an update to a prior data package \(default: self.config.metadata\_file\) 106 | * **publication\_year** \(_int_\) – Year of dataset publication. If None, will be set to the current calendar year by MDF Connect Client. \(default: $current\_year\) 107 | 108 | Keyword Arguments 109 | 110 | * **affiliations** \([_list_]()\) – List of author affiliations 111 | * **tags** \([_list_]()\) – List of tags to apply to the data package 112 | * **short\_name** \(_string_\) – Shortened/abbreviated name of the data package 113 | * **publisher** \(_string_\) – Data publishing entity \(e.g. MDF, Zenodo, etc.\) 114 | 115 | Returns 116 | 117 | **\(dict\) MDF Connect Response** – of dataset. Contains source\_id, which can be used to check the status of the submissionReturn type 118 | 119 | Response from MDF Connect to allow tracking publish\_model\(_options_\)[¶]() 120 | 121 | Submit a model or function for publication :param options: dict of all possible optionsOptions keys: 122 | 123 | title \(req\) authors \(req\) short\_name \(req\) servable\_type \(req\) \(“static method”, “class method”, “keras”, “pytorch”, “tensorflow”, “sklearn”\) affiliations domains abstract references requirements \(dict of library:version keypairs\) module \(if Python method\) function \(if Python method\) inputs \(not needed for TF\) \(dict of options\) outputs \(not needed for TF\) methods \(e.g. 
research methods\) DOI publication\_year \(advanced\) version \(advanced\) visibility \(dict of users and groups, each a list\) funding reference rights 124 | 125 | TODO: alternate identifier \(to add an identifier of this artifact in another service\) add file add directory add files run\(_name_, _inputs_, _\*\*kwargs_\)[¶]() 126 | 127 | Run a model on dataParameters 128 | 129 | * **name** \(_str_\) – DLHub model name 130 | * **inputs** – Data to send to DLHub as inputs \(should be JSON serializable\) 131 | 132 | ReturnsReturn type 133 | 134 | Returns results after invocation via the DLHub service 135 | 136 | * Pass [\*\*]()kwargs through to DLHub client and document kwargs 137 | 138 | xtract\_tokens_: Any_[¶]() 139 | 140 | ## foundry.models module[¶]() 141 | 142 | _class_ foundry.models.FoundryConfig\(_\*_, _dataframe\_file: str = 'foundry\_dataframe.json'_, _data\_file: str = 'foundry.hdf5'_, _metadata\_file: str = 'foundry\_metadata.json'_, _destination\_endpoint: str = None_, _local: bool = False_, _metadata\_key: str = 'foundry'_, _organization: str = 'foundry'_, _local\_cache\_dir: str = './data'_\)[¶]() 143 | 144 | Bases: `pydantic.main.BaseModel` 145 | 146 | Foundry Configuration Configuration information for Foundry DatasetParameters 147 | 148 | * **dataframe\_file** \(_str_\) – Filename to read dataframe contents from 149 | * **metadata\_file** \(_str_\) – Filename to read metadata contents from defaults to reading for MDF Discover 150 | * **destination\_endpoint** \(_str_\) – Globus endpoint ID to transfer data to \(defaults to local GCP installation\) 151 | * **local\_cache\_dir** \(_str_\) – Path to local Foundry package cache 152 | 153 | data\_file_: Optional\[str\]_[¶]() dataframe\_file_: Optional\[str\]_[¶]() destination\_endpoint_: Optional\[str\]_[¶]() local_: Optional\[bool\]_[¶]() metadata\_file_: Optional\[str\]_[¶]() metadata\_key_: Optional\[str\]_[¶]() organization_: Optional\[str\]_[¶]() _class_ foundry.models.FoundryDataset\(_\*_, _keys: List\[_[_foundry.models.FoundryKey_]()_\] = None_, _splits: List\[_[_foundry.models.FoundrySplit_]()_\] = None_, _type:_ [_foundry.models.FoundryDatasetType_]() _= None_, _short\_name: str = ''_, _dataframe: Any = None_\)[¶]() 154 | 155 | Bases: `pydantic.main.BaseModel` 156 | 157 | Foundry Dataset Schema for Foundry Datasets. 
This includes specifications of inputs, outputs, type, version, and more _class_ Config[¶]() 158 | 159 | Bases: `object` arbitrary\_types\_allowed _= True_[¶]() dataframe_: Optional\[Any\]_[¶]() keys_: List\[_[_foundry.models.FoundryKey_]()_\]_[¶]() short\_name_: Optional\[str\]_[¶]() splits_: Optional\[List\[_[_foundry.models.FoundrySplit_]()_\]\]_[¶]() type_:_ [_foundry.models.FoundryDatasetType_]()[¶]() _class_ foundry.models.FoundryDatasetType\(_value_\)[¶]() 160 | 161 | Bases: `enum.Enum` 162 | 163 | Foundry Dataset Types Enumeration of the possible Foundry dataset types files _= 'files'_[¶]() hdf5 _= 'hdf5'_[¶]() other _= 'other'_[¶]() tabular _= 'tabular'_[¶]() _class_ foundry.models.FoundryKey\(_\*_, _key: List\[str\] = \[\]_, _type: str = ''_, _filter: str = ''_, _units: str = ''_, _description: str = ''_, _classes: List\[_[_foundry.models.FoundryKeyClass_]()_\] = None_\)[¶]() 164 | 165 | Bases: `pydantic.main.BaseModel` classes_: Optional\[List\[_[_foundry.models.FoundryKeyClass_]()_\]\]_[¶]() description_: Optional\[str\]_[¶]() filter_: Optional\[str\]_[¶]() key_: List\[str\]_[¶]() type_: str_[¶]() units_: Optional\[str\]_[¶]() _class_ foundry.models.FoundryKeyClass\(_\*_, _label: str = ''_, _name: str = ''_\)[¶]() 166 | 167 | Bases: `pydantic.main.BaseModel` label_: str_[¶]() name_: str_[¶]() _class_ foundry.models.FoundryMetadata\(_\*_, _dc: Dict = {}_, _mdf: Dict = {}_, _dataset:_ [_foundry.models.FoundryDataset_]() _= {}_, _config:_ [_foundry.models.FoundryConfig_]() _= FoundryConfig\(dataframe\_file='foundry\_dataframe.json', data\_file='foundry.hdf5', metadata\_file='foundry\_metadata.json', destination\_endpoint=None, local=False, metadata\_key='foundry', organization='foundry', local\_cache\_dir='./data'\)_\)[¶]() 168 | 169 | Bases: `pydantic.main.BaseModel` _class_ Config[¶]() 170 | 171 | Bases: `object` arbitrary\_types\_allowed _= True_[¶]() config_:_ [_foundry.models.FoundryConfig_]()[¶]() dataset_:_ [_foundry.models.FoundryDataset_]()[¶]() dc_: Optional\[Dict\]_[¶]() mdf_: Optional\[Dict\]_[¶]() _class_ foundry.models.FoundrySpecification\(_\*_, _name: str = ''_, _version: str = ''_, _description: str = ''_, _private: bool = False_, _dependencies: Any = None_\)[¶]() 172 | 173 | Bases: `pydantic.main.BaseModel` 174 | 175 | Pydantic base class for interacting with the Foundry data package specification The specification provides a way to group datasets and manage versions add\_dependency\(_name_, _version_\)[¶]() clear\_dependencies\(\)[¶]() dependencies_: Any_[¶]() description_: str_[¶]() name_: str_[¶]() private_: bool_[¶]() remove\_duplicate\_dependencies\(\)[¶]() version_: str_[¶]() _class_ foundry.models.FoundrySpecificationDataset\(_\*_, _name: str = None_, _provider: str = 'MDF'_, _version: str = None_\)[¶]() 176 | 177 | Bases: `pydantic.main.BaseModel` 178 | 179 | Pydantic base class for datasets within the Foundry data package specification name_: Optional\[str\]_[¶]() provider_: Optional\[str\]_[¶]() version_: Optional\[str\]_[¶]() _class_ foundry.models.FoundrySplit\(_\*_, _type: str = ''_, _path: str = ''_, _label: str = ''_\)[¶]() 180 | 181 | Bases: `pydantic.main.BaseModel` label_: Optional\[str\]_[¶]() path_: Optional\[str\]_[¶]() type_: str_[¶]() 182 | 183 | ## foundry.xtract\_method module[¶]() 184 | 185 | -------------------------------------------------------------------------------- /docs/foundry.auth.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.auth` 6 | Utilities 
related to storing authentication credentials 7 | 8 | 9 | 10 | --- 11 | 12 | 13 | 14 | ## class `PubAuths` 15 | Collection of the authorizers needed for publication 16 | 17 | 18 | 19 | **Attributes:** 20 | 21 | - `transfer_client`: Client with credentials to perform transfers 22 | - `auth_client_openid`: Client with permissions to get user IDs 23 | - `endpoint_auth_clients`: Mapping between endpoint ID and client that can authorize access to it 24 | 25 | 26 | 27 | ### method `__init__` 28 | 29 | ```python 30 | __init__( 31 | transfer_client: TransferClient, 32 | auth_client_openid: AuthClient, 33 | endpoint_auth_clients: Dict[str, AuthClient] 34 | ) → None 35 | ``` 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | --- 48 | 49 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 50 | -------------------------------------------------------------------------------- /docs/foundry.foundry_cache.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.foundry_cache` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `FoundryCache` 17 | The FoundryCache manages the local storage of FoundryDataset objects 18 | 19 | 20 | 21 | ### method `__init__` 22 | 23 | ```python 24 | __init__( 25 | forge_client: Forge, 26 | transfer_client: Any, 27 | use_globus, 28 | interval, 29 | parallel_https, 30 | verbose, 31 | local_cache_dir: str = None 32 | ) 33 | ``` 34 | 35 | Initializes a FoundryCache object. 36 | 37 | 38 | 39 | **Args:** 40 | 41 | - `forge_client` (Forge): The Forge client object. 42 | - `transfer_client` (Any): The transfer client object. 43 | - `use_globus` (bool): Flag indicating whether to use Globus for downloading. 44 | - `interval` (int): How long to wait between checks of the Globus transfer status. 45 | - `parallel_https` (int): Number of threads to use for downloading via HTTP. 46 | - `verbose` (bool): Flag indicating whether to produce more debug messages. 47 | - `local_cache_dir` (str, optional): The local cache directory. Defaults to None. If not specified, falls back to either the environment variable 'FOUNDRY_LOCAL_CACHE_DIR' or './data/'. 48 | 49 | 50 | 51 | 52 | --- 53 | 54 | 55 | 56 | ### method `clear_cache` 57 | 58 | ```python 59 | clear_cache(dataset_name: str = None) 60 | ``` 61 | 62 | Deletes locally stored datasets 63 | 64 | 65 | 66 | **Arguments:** 67 | 68 | - `dataset_name` (str): Optional name of a specific dataset. If omitted, all datasets will be erased 69 | 70 | --- 71 | 72 | 73 | 74 | ### method `download_to_cache` 75 | 76 | ```python 77 | download_to_cache(dataset_name: str, splits: List[Split] = None) 78 | ``` 79 | 80 | Checks if the data is downloaded, and if not, downloads the data from source to local storage. 81 | 82 | 83 | 84 | **Args:** 85 | 86 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 87 | - `splits` (List[FoundrySplit], optional): List of splits in the dataset. Defaults to None. 88 | 89 | 90 | 91 | **Returns:** 92 | 93 | - `FoundryCache`: The FoundryCache object. 94 | 95 | --- 96 | 97 | 98 | 99 | ### method `download_via_globus` 100 | 101 | ```python 102 | download_via_globus(dataset_name: str) 103 | ``` 104 | 105 | Downloads selected dataset over Globus. 106 | 107 | 108 | 109 | **Args:** 110 | 111 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 
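Taken together, the methods above support a simple workflow: construct a cache, download a dataset, validate the local copy, and clear it again. The following is a minimal, illustrative sketch; in typical use the `Foundry` client constructs the `FoundryCache` for you, and the default `Forge()` instantiation and the dataset name here are assumptions for demonstration.

```python
from mdf_forge import Forge
from foundry.foundry_cache import FoundryCache

# Assumed setup: a default Forge client; no transfer client is needed
# when use_globus=False, since files are fetched over HTTPS instead.
cache = FoundryCache(
    forge_client=Forge(),
    transfer_client=None,
    use_globus=False,   # download via parallel HTTPS threads
    interval=10,        # seconds between Globus transfer status checks
    parallel_https=4,   # number of HTTPS download threads
    verbose=False,
    local_cache_dir="./data",
)

# Download (if not already cached), verify, then remove a dataset
cache.download_to_cache("_test_example_iris_v1.1")
if cache.validate_local_dataset_storage("_test_example_iris_v1.1"):
    cache.clear_cache(dataset_name="_test_example_iris_v1.1")
```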
112 | 113 | --- 114 | 115 | 116 | 117 | ### method `download_via_http` 118 | 119 | ```python 120 | download_via_http(dataset_name: str) 121 | ``` 122 | 123 | Downloads selected dataset from MDF over HTTP. 124 | 125 | **Args:** 126 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 127 | 128 | --- 129 | 130 | 131 | 132 | ### method `get_keys` 133 | 134 | ```python 135 | get_keys( 136 | foundry_schema: FoundrySchema, 137 | type: str = None, 138 | as_object: bool = False 139 | ) 140 | ``` 141 | 142 | Get keys for a Foundry dataset 143 | 144 | 145 | 146 | **Arguments:** 147 | 148 | - `foundry_schema` (FoundrySchema): The schema from MDF that contains the keys 149 | - `type` (str): The type of key to be returned, e.g., "input", "target" 150 | - `as_object` (bool): When ``False``, returns a list of keys as strings; when ``True``, returns the full key objects. **Default:** ``False`` 151 | 152 | Returns: (list) String representations of the keys if ``as_object`` is ``False``; otherwise the full key objects. 153 | 154 | --- 155 | 156 | 157 | ### method `load_as_dict` 158 | 159 | ```python 160 | load_as_dict( 161 | split: str, 162 | dataset_name: str, 163 | foundry_schema: FoundrySchema, 164 | as_hdf5: bool 165 | ) 166 | ``` 167 | 168 | Load the data associated with the specified dataset and return it as a labeled dictionary of tuples. 169 | 170 | 171 | 172 | **Args:** 173 | 174 | - `split` (str): Split to load the data from. 175 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 176 | - `foundry_schema` (FoundrySchema, optional): FoundrySchema object. Defaults to None. 177 | - `as_hdf5` (bool, optional): If True and dataset is in HDF5 format, keep data in HDF5 format. Defaults to False. 178 | 179 | 180 | 181 | **Returns:** 182 | 183 | - `dict`: A labeled dictionary of tuples containing the loaded data. 184 | 185 | --- 186 | 187 | 188 | 189 | ### method `load_as_tensorflow` 190 | 191 | ```python 192 | load_as_tensorflow( 193 | split: str, 194 | dataset_name: str, 195 | foundry_schema: FoundrySchema, 196 | as_hdf5: bool 197 | ) 198 | ``` 199 | 200 | Convert Foundry Dataset to a Tensorflow Sequence 201 | 202 | 203 | 204 | **Arguments:** 205 | 206 | - `split` (string): Split to create Tensorflow Sequence on. **Default:** ``None`` 207 | 208 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split 209 | 210 | --- 211 | 212 | 213 | 214 | ### method `load_as_torch` 215 | 216 | ```python 217 | load_as_torch( 218 | split: str, 219 | dataset_name: str, 220 | foundry_schema: FoundrySchema, 221 | as_hdf5: bool 222 | ) 223 | ``` 224 | 225 | Convert Foundry Dataset to a PyTorch Dataset 226 | 227 | 228 | 229 | **Arguments:** 230 | 231 | - `split` (string): Split to create PyTorch Dataset on. **Default:** ``None`` 232 | 233 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split 234 | 235 | --- 236 | 237 | 238 | 239 | ### method `validate_local_dataset_storage` 240 | 241 | ```python 242 | validate_local_dataset_storage(dataset_name: str, splits: List[Split] = None) 243 | ``` 244 | 245 | Verifies that the local storage location exists and all expected files are present. 246 | 247 | 248 | 249 | **Args:** 250 | 251 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 252 | - `splits` (List[FoundrySplit], optional): Labels of splits to be loaded. Defaults to None. 
255 | 256 | 257 | 258 | **Returns:** 259 | 260 | - `bool`: True if the dataset exists and contains all the desired files; False otherwise. 261 | 262 | 263 | 264 | 265 | --- 266 | 267 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 268 | -------------------------------------------------------------------------------- /docs/foundry.foundry_dataset.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.foundry_dataset` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `FoundryDataset` 17 | Representation of an individual dataset. Provides access to metadata as well as functions to instantiate data into memory in different formats. 18 | 19 | 20 | 21 | **Args:** 22 | 23 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF) 24 | - `datacite_entry` (FoundryDatacite): Datacite entry for the dataset 25 | - `foundry_schema` (FoundrySchema): Schema for the dataset 26 | - `foundry_cache` (FoundryCache): Cache for the dataset 27 | 28 | Desired functions: 29 | - Get as pandas 30 | - Get as tensorflow dataset 31 | - Get as pytorch dataset 32 | - Get file list 33 | - Set metadata 34 | - Attach datafiles 35 | - Validate against schema 36 | - Get citation 37 | 38 | 39 | 40 | ### method `__init__` 41 | 42 | ```python 43 | __init__( 44 | dataset_name: str, 45 | datacite_entry: FoundryDatacite, 46 | foundry_schema: FoundrySchema, 47 | foundry_cache: FoundryCache = None 48 | ) 49 | ``` 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | --- 59 | 60 | 61 | 62 | ### method `add_data` 63 | 64 | ```python 65 | add_data(local_data_path: str = None, globus_data_source: str = None) 66 | ``` 67 | 68 | Add data to the dataset. User must provide the location of the data as either a `globus_data_source` or `local_data_path`. 69 | 70 | 71 | 72 | **Arguments:** 73 | 74 | - `local_data_path` (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the data on their own Globus endpoint. User must choose either `globus_data_source` or `local_data_path` to publish their data. 75 | - `globus_data_source` (str): URL path for a data folder on a Globus endpoint; the URL can be obtained through the Globus Web UI or SDK. If None, the user must specify a 'local_data_path' pointing to the location of the data on their local machine. User must choose either `globus_data_source` or `local_data_path` to publish their data. 76 | 77 | --- 78 | 79 | 80 | 81 | ### method `clean_dc_dict` 82 | 83 | ```python 84 | clean_dc_dict() 85 | ``` 86 | 87 | Clean the Datacite dictionary of None values 88 | 89 | --- 90 | 91 | 92 | 93 | ### method `clear_dataset_cache` 94 | 95 | ```python 96 | clear_dataset_cache() 97 | ``` 98 | 99 | Deletes the cached data for this specific dataset 100 | 101 | --- 102 | 103 | 104 | 105 | ### method `delete_none` 106 | 107 | ```python 108 | delete_none(_dict) 109 | ``` 110 | 111 | Delete None values recursively from all of the dictionaries 112 | 113 | --- 114 | 115 | 116 | 117 | ### method `get_as_dict` 118 | 119 | ```python 120 | get_as_dict(split: str = None, as_hdf5: bool = False) 121 | ``` 122 | 123 | Returns the data from the dataset as a dictionary 124 | 125 | 126 | 127 | **Arguments:** 128 | 129 | - `split` (string): Split to create dataset on. **Default:** ``None`` 
130 | 131 | 132 | Returns: (dict) Dictionary of all the data from the specified split 133 | 134 | --- 135 | 136 | 137 | 138 | ### method `get_as_tensorflow` 139 | 140 | ```python 141 | get_as_tensorflow(split: str = None) 142 | ``` 143 | 144 | Convert Foundry Dataset to a Tensorflow Sequence 145 | 146 | 147 | 148 | **Arguments:** 149 | 150 | - `split` (string): Split to create Tensorflow Sequence on. **Default:** ``None`` 151 | 152 | 153 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split 154 | 155 | --- 156 | 157 | 158 | 159 | ### method `get_as_torch` 160 | 161 | ```python 162 | get_as_torch(split: str = None) 163 | ``` 164 | 165 | Returns the data from the dataset as a TorchDataset 166 | 167 | 168 | 169 | **Arguments:** 170 | 171 | - `split` (string): Split to create PyTorch Dataset on. **Default:** ``None`` 172 | 173 | 174 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split 175 | 176 | --- 177 | 178 | 179 | 180 | ### method `get_citation` 181 | 182 | ```python 183 | get_citation() → str 184 | ``` 185 | 186 | 187 | 188 | 189 | 190 | --- 191 | 192 | 211 | 212 | 213 | 214 | 215 | ### method `validate_metadata` 216 | 217 | ```python 218 | validate_metadata(metadata) 219 | ``` 220 | 221 | Validate the JSON message against the FoundryDataset model 222 | 223 | 224 | 225 | **Arguments:** 226 | 227 | - `metadata` (dict): Metadata information provided by the user. 228 | 229 | 230 | 231 | **Raises:** 232 | 233 | - `ValidationError`: if the metadata supplied by the user does not meet the specification of a FoundryDataset object. 234 | 235 | 236 | 237 | 238 | --- 239 | 240 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 241 | -------------------------------------------------------------------------------- /docs/foundry.https_download.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.https_download` 6 | Methods to download files from a Globus endpoint 7 | 8 | 9 | --- 10 | 11 | 12 | 13 | ## function `recursive_ls` 14 | 15 | ```python 16 | recursive_ls(tc: TransferClient, ep: str, path: str, max_depth: int = 3) 17 | ``` 18 | 19 | Find all files in a Globus directory recursively 20 | 21 | 22 | 23 | **Args:** 24 | 25 | - `tc`: TransferClient authorized to access the directory 26 | - `ep`: Endpoint on which the files reside 27 | - `path`: Path to the files being downloaded 28 | - `max_depth`: Maximum recurse depth 29 | 30 | 31 | 32 | **Yields:** 33 | Dictionaries describing the location of the files. 
Each includes at least 34 | - `"name"`: Name of the file 35 | - `"path"`: Absolute path to the file's location 36 | 37 | 38 | --- 39 | 40 | 41 | 42 | ## function `download_file` 43 | 44 | ```python 45 | download_file(item, base_directory, https_config, timeout=1800) 46 | ``` 47 | 48 | Download a file to disk 49 | 50 | 51 | 52 | **Args:** 53 | 54 | - `item`: Dictionary defining the path to the file 55 | - `base_directory`: Base directory for storing downloaded files 56 | - `https_config`: Configuration defining the URL of the server and the name of the dataset 57 | - `timeout`: Timeout for the download request in seconds (default: 1800) 58 | 59 | 60 | 61 | 62 | --- 63 | 64 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 65 | -------------------------------------------------------------------------------- /docs/foundry.https_upload.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.https_upload` 6 | Private utility methods to upload files and/or folders to Globus using HTTPS instead of Globus Transfer. 7 | 8 | 9 | --- 10 | 11 | 12 | 13 | ## function `upload_to_endpoint` 14 | 15 | ```python 16 | upload_to_endpoint( 17 | auths: PubAuths, 18 | local_data_path: str, 19 | endpoint_id: str = '82f1b5c6-6e9b-11e5-ba47-22000b92c6ec', 20 | dest_parent: str = None, 21 | dest_child: str = None 22 | ) → Tuple[str, str] 23 | ``` 24 | 25 | Upload local data to a Globus endpoint using HTTPS PUT requests. Data can be a folder or an individual file. 26 | 27 | **Args:** 28 | 29 | - `auths` (PubAuths): Dataclass of authorizers needed for upload. Includes `transfer_client`, `auth_client_openid`, and `endpoint_auth_clients`, which is a Dict of `endpoint_id`: AuthClient mappings. 30 | - `local_data_path` (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is transferred to MDF. 31 | - `endpoint_id` (str): Globus endpoint ID to upload the data to. Default is the NCSA endpoint. Must match an `endpoint_id` authorized in `auths.endpoint_auth_clients`. 32 | 33 | **Returns:** 34 | (str) Globus data source URL: URL pointing to the data on the Globus endpoint 35 | 36 | 37 | 38 | 39 | --- 40 | 41 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 42 | -------------------------------------------------------------------------------- /docs/foundry.loaders.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.loaders` 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | --- 15 | 16 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 17 | -------------------------------------------------------------------------------- /docs/foundry.loaders.tf_wrapper.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.loaders.tf_wrapper` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `TensorflowSequence` 17 | Foundry Dataset Converted to Tensorflow Format 18 | 19 | 20 | 21 | ### method `__init__` 22 | 23 | ```python 24 | __init__(inputs, targets) 25 | ``` 26 | 27 | 28 | 29 | 30 | 31 | 32 | --- 33 | 34 | #### property max_queue_size 35 | 36 | 37 | 38 | 39 | 40 | --- 41 | 42 | #### property num_batches 43 | 44 | Number of batches in the PyDataset. 
45 | 46 | 47 | 48 | **Returns:** 49 | The number of batches in the PyDataset or `None` to indicate that the dataset is infinite. 50 | 51 | --- 52 | 53 | #### property use_multiprocessing 54 | 55 | 56 | 57 | 58 | 59 | --- 60 | 61 | #### property workers 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | --- 73 | 74 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 75 | -------------------------------------------------------------------------------- /docs/foundry.loaders.torch_wrapper.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.loaders.torch_wrapper` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `TorchDataset` 17 | Foundry Dataset Converted to Pytorch Format 18 | 19 | 20 | 21 | ### method `__init__` 22 | 23 | ```python 24 | __init__(inputs, targets) 25 | ``` 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | --- 38 | 39 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 40 | -------------------------------------------------------------------------------- /docs/foundry.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry` 6 | 7 | 8 | 9 | 10 | **Global Variables** 11 | --------------- 12 | - **auth** 13 | - **https_download** 14 | - **jsonschema_models** 15 | - **models** 16 | - **utils** 17 | - **foundry_cache** 18 | - **foundry_dataset** 19 | - **https_upload** 20 | - **foundry** 21 | 22 | 23 | 24 | 25 | --- 26 | 27 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 28 | -------------------------------------------------------------------------------- /docs/foundry.models.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.models` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `FoundrySpecificationDataset` 17 | Pydantic base class for datasets within the Foundry data package specification 18 | 19 | 20 | --- 21 | 22 | #### property model_extra 23 | 24 | Get extra fields set during validation. 25 | 26 | 27 | 28 | **Returns:** 29 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 30 | 31 | --- 32 | 33 | #### property model_fields_set 34 | 35 | Returns the set of fields that have been explicitly set on this model instance. 36 | 37 | 38 | 39 | **Returns:** 40 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 41 | 42 | 43 | 44 | 45 | --- 46 | 47 | 48 | 49 | ## class `FoundrySpecification` 50 | Pydantic base class for interacting with the Foundry data package specification The specification provides a way to group datasets and manage versions 51 | 52 | 53 | --- 54 | 55 | #### property model_extra 56 | 57 | Get extra fields set during validation. 58 | 59 | 60 | 61 | **Returns:** 62 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 63 | 64 | --- 65 | 66 | #### property model_fields_set 67 | 68 | Returns the set of fields that have been explicitly set on this model instance. 69 | 70 | 71 | 72 | **Returns:** 73 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 
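To make the specification concrete, here is a minimal sketch that builds a data package using this class's fields and the dependency-management methods documented just below. The dataset names are borrowed from the bandgap example package elsewhere in this repository; treating `dependencies` as a name-to-version mapping follows that example and is an assumption.

```python
from foundry.models import FoundrySpecification

# Group several datasets into one versioned data package
spec = FoundrySpecification(
    name="Band Gap Analysis",
    version="1.0.0",
    description="Datasets for band gap uber model generation",
    private=True,
    dependencies={},  # assumed: mapping of dataset name -> version
)
spec.add_dependency(name="_test_foundry_mp_bandgap_v1.1", version="1.1")
spec.add_dependency(name="_test_foundry_oqmd_bandgap_v1.1", version="1.1")
spec.add_dependency(name="_test_foundry_oqmd_bandgap_v1.1", version="1.1")

# Prune the repeated entry before the package is used
spec.remove_duplicate_dependencies()
```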
74 | 75 | 76 | 77 | --- 78 | 79 | 80 | 81 | ### method `add_dependency` 82 | 83 | ```python 84 | add_dependency(name: str, version: str) 85 | ``` 86 | 87 | 88 | 89 | 90 | 91 | --- 92 | 93 | 94 | 95 | ### method `clear_dependencies` 96 | 97 | ```python 98 | clear_dependencies() 99 | ``` 100 | 101 | 102 | 103 | 104 | 105 | --- 106 | 107 | 108 | 109 | ### method `model_dump` 110 | 111 | ```python 112 | model_dump() 113 | ``` 114 | 115 | 116 | 117 | 118 | 119 | --- 120 | 121 | 122 | 123 | ### method `remove_duplicate_dependencies` 124 | 125 | ```python 126 | remove_duplicate_dependencies() 127 | ``` 128 | 129 | 130 | 131 | 132 | 133 | 134 | --- 135 | 136 | 137 | 138 | ## class `FoundryDatasetType` 139 | Foundry Dataset Types Enumeration of the possible Foundry dataset types 140 | 141 | 142 | 143 | 144 | 145 | --- 146 | 147 | 148 | 149 | ## class `FoundrySchema` 150 | A model for the Foundry schema based on the FoundryModel (project_model.py) class. 151 | 152 | 153 | 154 | ### method `__init__` 155 | 156 | ```python 157 | __init__(project_dict: Dict[str, Any]) 158 | ``` 159 | 160 | 161 | 162 | 163 | 164 | 165 | --- 166 | 167 | #### property model_extra 168 | 169 | Get extra fields set during validation. 170 | 171 | 172 | 173 | **Returns:** 174 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 175 | 176 | --- 177 | 178 | #### property model_fields_set 179 | 180 | Returns the set of fields that have been explicitly set on this model instance. 181 | 182 | 183 | 184 | **Returns:** 185 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 186 | 187 | 188 | 189 | 190 | --- 191 | 192 | 193 | 194 | ## class `FoundryDatacite` 195 | A model for the Datacite schema based on the Datacite (dc_model.py) class. 196 | 197 | 198 | 199 | ### method `__init__` 200 | 201 | ```python 202 | __init__(datacite_dict: Dict[str, Any], **kwargs) 203 | ``` 204 | 205 | 206 | 207 | 208 | 209 | 210 | --- 211 | 212 | #### property model_extra 213 | 214 | Get extra fields set during validation. 215 | 216 | 217 | 218 | **Returns:** 219 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 220 | 221 | --- 222 | 223 | #### property model_fields_set 224 | 225 | Returns the set of fields that have been explicitly set on this model instance. 226 | 227 | 228 | 229 | **Returns:** 230 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 231 | 232 | 233 | 234 | 235 | --- 236 | 237 | 238 | 239 | ## class `FoundryBase` 240 | Configuration information for Foundry instance 241 | 242 | 243 | --- 244 | 245 | #### property model_extra 246 | 247 | Get extra fields set during validation. 248 | 249 | 250 | 251 | **Returns:** 252 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 253 | 254 | --- 255 | 256 | #### property model_fields_set 257 | 258 | Returns the set of fields that have been explicitly set on this model instance. 259 | 260 | 261 | 262 | **Returns:** 263 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 
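`FoundrySchema` and `FoundryDatacite` (documented above) are both constructed from plain dictionaries. The sketch below is illustrative only: the abbreviated dictionary contents are assumptions, and real records must carry the full Foundry project and DataCite metadata required by the underlying models.

```python
from foundry.models import FoundryDatacite, FoundrySchema

# Illustrative, abbreviated metadata; real entries include all
# required DataCite and Foundry project fields.
dc = FoundryDatacite({
    "titles": [{"title": "Example iris dataset"}],
    "creators": [{"creatorName": "Scourtas, A."}],
})

schema = FoundrySchema({
    "short_name": "example_AS_iris",
    "data_type": "tabular",
    "keys": [{"key": ["sepal_length"], "type": "input"}],
})
```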
264 | 265 | 266 | 267 | --- 268 | 269 | 270 | 271 | ### method `model_dump` 272 | 273 | ```python 274 | model_dump() 275 | ``` 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | --- 285 | 286 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 287 | -------------------------------------------------------------------------------- /docs/foundry.utils.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.utils` 6 | 7 | 8 | 9 | 10 | 11 | --- 12 | 13 | 14 | 15 | ## function `is_pandas_pytable` 16 | 17 | ```python 18 | is_pandas_pytable(group) 19 | ``` 20 | 21 | 22 | 23 | 24 | 25 | 26 | --- 27 | 28 | 29 | 30 | ## function `is_doi` 31 | 32 | ```python 33 | is_doi(string: str) 34 | ``` 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | --- 44 | 45 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 46 | -------------------------------------------------------------------------------- /docs/how-to-contribute/code_of_conduct.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Read our pledge and Code of Conduct for contributing 3 | --- 4 | 5 | # Contributor Covenant 6 | 7 | ## Our Pledge 8 | 9 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 10 | 11 | ## Our Standards 12 | 13 | Examples of behavior that contributes to creating a positive environment include: 14 | 15 | * Using welcoming and inclusive language 16 | * Being respectful of differing viewpoints and experiences 17 | * Gracefully accepting constructive criticism 18 | * Focusing on what is best for the community 19 | * Showing empathy towards other community members 20 | 21 | Examples of unacceptable behavior by participants include: 22 | 23 | * The use of sexualized language or imagery and unwelcome sexual attention or 24 | 25 | advances 26 | 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | 31 | address, without explicit permission 32 | 33 | * Other conduct which could reasonably be considered inappropriate in a 34 | 35 | professional setting 36 | 37 | ## Our Responsibilities 38 | 39 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 42 | 43 | ## Scope 44 | 45 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 
Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 46 | 47 | ## Enforcement 48 | 49 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at foundry@uchicago.edu. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 50 | 51 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 52 | 53 | ## Attribution 54 | 55 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) 56 | 57 | For answers to common questions about this code of conduct, see [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq) 58 | 59 | -------------------------------------------------------------------------------- /docs/how-to-contribute/contributing.md: -------------------------------------------------------------------------------- 1 | # Contribution Process 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, email, or any other method with the owners of this repository before making a change. 4 | 5 | Please note we have a code of conduct, please follow it in all your interactions with the project. 6 | 7 | ## Contributing code 8 | 9 | If you have improvements to Foundry, send us your pull requests! For those just getting started, Github has a [how to](https://help.github.com/articles/using-pull-requests/). 10 | 11 | If you want to contribute, start working through the Foundry codebase, navigate to the [Github "issues" tab](https://github.com/MLMI2-CSSI/foundry/issues) and start looking through interesting issues. If you are not sure of where to start, then start by trying one of the smaller/easier issues here i.e. [issues with the "good first issue" label](https://github.com/MLMI2-CSSI/foundry/labels/good%20first%20issue). These are issues that we believe are particularly well suited for outside contributions. If you want to help out, but not alone, use the issue comment thread to coordinate. 12 | 13 | ### General guidelines and philosophy for contribution 14 | 15 | * Include unit tests when you contribute new features, as they help to a\) 16 | 17 | prove that your code works correctly, and b\) guard against future breaking 18 | 19 | changes to lower the maintenance cost. 20 | 21 | * Bug fixes also generally require unit tests, because the presence of bugs 22 | 23 | usually indicates insufficient test coverage. 24 | 25 | * Keep API compatibility in mind when you change code in Foundry, 26 | * When you contribute a new feature to Foundry, the maintenance burden is 27 | 28 | \(by default\) transferred to the Foundry team. 
This means that the benefit 29 | 30 | of the contribution must be compared against the cost of maintaining the 31 | 32 | feature. 33 | 34 | * Tests should follow [testing best practices](https://www..org/community/contribute/tests) 35 | 36 | guide. 37 | 38 | ## Pull Request Process 39 | 40 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 41 | 42 | build. 43 | 44 | 2. Update the README.md with details of changes to the interface, this includes new environment 45 | 46 | variables, exposed ports, useful file locations and container parameters. 47 | 48 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 49 | 50 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). 51 | 52 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 53 | 54 | do not have permission to do that, you may request the second reviewer to merge it for you. 55 | 56 | -------------------------------------------------------------------------------- /docs/publishing-datasets.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish datasets 3 | --- 4 | 5 | # Publishing Datasets 6 | 7 | In order to publish datasets, the datasets must 1\) adhere to specified Foundry dataset shapes \([see here](publishing-datasets.md#shaping-datasets)\), and 2\) be described with required information \([see here](publishing-datasets.md#describing-datasets)\). Together, the dataset shape and description enable researchers to reuse the datasets more easily. 8 | 9 | ## Examples 10 | 11 | [Skip to the publication example notebook.](https://github.com/MLMI2-CSSI/foundry/blob/master/examples/foundry_publication_example.ipynb) 12 | 13 | ## Shaping Datasets 14 | 15 | For a general dataset to be translated into a usable Foundry dataset, it should follow one of the prescribed shapes. It should also be described by a Key object, which provides a mapping that allows Foundry to read data from the underlying data structure into usable Python objects \([see Describing Datasets](publishing-datasets.md#describing-datasets) for more info\). 16 | 17 | ### **Tabular Data** 18 | 19 | Tabular data should include in a form where columns represent the different keys of the data and rows represent individual entries. 20 | 21 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap | 22 | | :--- | :--- | :--- | :--- | 23 | | 0.10 | 0.52 | 1 | 1.40 | 24 | | 0.34 | 0.910 | 0 | 0.73 | 25 | | ... | ... | ... 
26 | 27 | For this example dataset the `Key` object could be: 28 | 29 | ```text 30 | "keys":[{ 31 | "key": "feature_1", 32 | "type": "input", 33 | "units": None, 34 | "description": "This is feature 1" 35 | },{ 36 | "key": "feature_2", 37 | "type": "input", 38 | "units": None, 39 | "description": "This is feature 2" 40 | },{ 41 | "key": "material_type", 42 | "type": "input", 43 | "units": None, 44 | "description": "This is the material type", 45 | "labels":["perovskite","not perovskite"] 46 | },{ 47 | "key": "band_gap", 48 | "type": "target", 49 | "units": "eV", 50 | "description": "This is the simulated band gap in eV" 51 | } 52 | ] 53 | ``` 54 | 55 | {% hint style="info" %} 56 | `This tabular data file should be saved in the base directory as` **`foundry_dataframe.json`** 57 | {% endhint %} 58 | 59 | ### Hierarchical Data 60 | 61 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be: 62 | 63 | ```text 64 | "keys":[{ 65 | "key": "/data/arr1", 66 | "type": "input", 67 | "units": None, 68 | "description": "This is an array containing input data" 69 | },{ 70 | "key": "/other_data/arr2", 71 | "type": "input", 72 | "units": None, 73 | "description": "This is another array containing input data" 74 | },{ 75 | "key": "/data/band_gaps", 76 | "type": "target", 77 | "units": "eV", 78 | "description": "This is the simulated band gap in eV" 79 | } 80 | ] 81 | ``` 82 | 83 | ## Describing Datasets 84 | 85 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema. 86 | 87 | **Keys \(object\):** Key objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Key objects have the following properties 88 | 89 | * **`key (str)`**: A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\) 90 | * **`type (str)`**: The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_ 91 | * **`units (str) [optional]`**: The scientific units associated with a key. _Default: None_ 92 | * **`description (str) [optional]`**: A free text description of the key. _Default: None_ 93 | * **`labels (list[str]) [optional]`**: A list of strings mapped to integers in a key column 94 | 95 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading easier. 96 | 97 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. 
_Options \["tabular","hdf5"\]_ 98 | 99 | ```text 100 | "foundry": { 101 | "dc": {}, 102 | "keys": [{ 103 | "type": "input", 104 | "name": "feature_1", 105 | "units": "", 106 | "description": "This is an input" 107 | }, 108 | { 109 | "type": "target", 110 | "name": "band_gap", 111 | "units": "eV", 112 | "description": "blah blah", 113 | "labels": [] 114 | } 115 | ], 116 | "short_name": "my_short_name", 117 | "type": "tabular" 118 | } 119 | ``` 120 | 121 | ## Publishing 122 | 123 | {% hint style="info" %} 124 | Before continuing, be sure that you have 1\) signed up for a [free Globus account](https://app.globus.org) and 2\) [joined this Globus group](https://app.globus.org/groups/cc192dca-3751-11e8-90c1-0a7c735d220a/about). 125 | {% endhint %} 126 | 127 | Once your dataset is in the proper shape, and you have created the associated metadata structure, you can publish to Foundry! 128 | 129 | Currently, you can publish any dataset you have stored on a Globus endpoint or Google Drive. In the following, assume your [previously defined metadata](publishing-datasets.md#describing-datasets) are stored in `metadata`: 130 | 131 | ```python 132 | from foundry import Foundry 133 | 134 | # Globus endpoint URL where your dataset is located 135 | data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry%2F_test_blaiszik_foundry_iris_v1.2%2F" 136 | 137 | # full title of dataset 138 | title = "Scourtas example iris dataset" 139 | 140 | # authors to list 141 | authors = ["A Scourtas", "B Blaiszik"] 142 | 143 | # shorthand title (optional) 144 | short_name = "example_AS_iris" 145 | 146 | # affiliations of authors (optional) 147 | affiliations = ["Globus Labs, UChicago"] 148 | 149 | # publisher of the data (optional) 150 | publisher = "Materials Data Facility" 151 | 152 | # publication year (optional) 153 | publication_year = 2021 154 | 155 | 156 | f = Foundry() 157 | res = f.publish(metadata, data_source, title, authors, short_name=short_name) 158 | ``` 159 | 160 | The `publish()` method returns a result object that you can inspect for information about the state of the publication. For the above publication, `res` would have the format: 161 | 162 | ```python 163 | {'error': None, 164 | 'source_id': '_test_example_iris_v1.1', 165 | 'status_code': 202, 166 | 'success': True} 167 | ``` 168 | 169 | 170 | 171 | ## Future Work 172 | 173 | * Add support for wildcard key type specifications 174 | * Add link to example publication 175 | 176 | -------------------------------------------------------------------------------- /docs/publishing-models.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish models 3 | --- 4 | 5 | # Publishing Models 6 | 7 | In addition to datasets, you can publish models \(or even individual Python methods\) to Foundry and run them in the cloud! 
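While the sections below are still being filled in, the overall flow mirrors the dataset case: describe the model with metadata, then hand it to `publish_model()`. A minimal, illustrative sketch follows; the field values are placeholders, and the exact `servable` layout depends on the model type (see the Keras example on the Troubleshooting page).

```python
from foundry import Foundry

f = Foundry()

# Describe the fitted model; values here are placeholders
options = {
    "title": "Example band gap regressor",
    "short_name": "bandgap-reg-1",
    "authors": ["Scientist, Awesome"],
    "servable": {
        "type": "keras",              # model framework
        "model_path": "bandgap.hdf5"  # path to the fitted model file
    }
}
res = f.publish_model(options)
```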
8 | 9 | ## Examples 10 | 11 | Model publication example notebook coming soon 12 | 13 | ## Model Types 14 | 15 | You can publish any of the following types of models or functions to Foundry: 16 | 17 | * [Scikit-Learn models](publishing-models.md#scikit-learn-models) 18 | * [Tensorflow 1 & 2 models](publishing-models.md#tensorflow-1-and-2-models) 19 | * [Keras models](publishing-models.md#keras-models) 20 | * [PyTorch models](publishing-models.md#keras-models) 21 | * [Class methods \(advanced use\)](publishing-models.md#class-methods) 22 | * [Static methods \(advanced use\)](publishing-models.md#static-methods) 23 | 24 | ### Scikit-Learn models 25 | 26 | ### Tensorflow 1 & 2 models 27 | 28 | ### Keras models 29 | 30 | ### PyTorch models 31 | 32 | ### Class methods 33 | 34 | ### Static methods 35 | 36 | ## Data Types for Inputs and Targets 37 | 38 | ## Describing Models 39 | 40 | Before you can publish a model, you need to describe it using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). 41 | 42 | ## Publishing 43 | 44 | ## Future Work 45 | 46 | -------------------------------------------------------------------------------- /docs/publishing/publishing-datasets.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish datasets 3 | --- 4 | 5 | # Publishing Datasets 6 | 7 | In order to publish datasets, the datasets must 1\) adhere to specified Foundry dataset shapes \([see here](publishing-datasets.md#shaping-datasets)\), and 2\) be described with required information \([see here](publishing-datasets.md#describing-datasets)\). Together, the dataset shape and description enable researchers to reuse the datasets more easily. 8 | 9 | ## Examples 10 | 11 | [Skip to the publication example notebook.](https://github.com/MLMI2-CSSI/foundry/blob/master/examples/foundry_publication_example.ipynb) 12 | 13 | ## Shaping Datasets 14 | 15 | For a general dataset to be translated into a usable Foundry dataset, it should follow one of the prescribed shapes. It should also be described by a `Key` object, which provides a mapping that allows Foundry to read data from the underlying data structure into usable Python objects \([see Describing Datasets](publishing-datasets.md#describing-datasets) for more info\). 16 | 17 | ### **Tabular Data** 18 | 19 | Tabular data should include in a form where columns represent the different keys of the data and rows represent individual entries. 20 | 21 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap | 22 | | :--- | :--- | :--- | :--- | 23 | | 0.10 | 0.52 | 1 | 1.40 | 24 | | 0.34 | 0.910 | 0 | 0.73 | 25 | | ... | ... | ... 
26 | 27 | For this example dataset the `keys` list could be: 28 | 29 | ```text 30 | "keys":[{ 31 | "key": "feature_1", 32 | "type": "input", 33 | "units": None, 34 | "description": "This is feature 1" 35 | },{ 36 | "key": "feature_2", 37 | "type": "input", 38 | "units": None, 39 | "description": "This is feature 2" 40 | },{ 41 | "key": "material_type", 42 | "type": "input", 43 | "units": None, 44 | "description": "This is the material type", 45 | "labels":["perovskite","not perovskite"] 46 | },{ 47 | "key": "band_gap", 48 | "type": "target", 49 | "units": "eV", 50 | "description": "This is the simulated band gap in eV" 51 | } 52 | ] 53 | ``` 54 | 55 | {% hint style="info" %} 56 | `Don't forget to specify the tabular data file in the submitted metadata` 57 | {% endhint %} 58 | 59 | ### Hierarchical Data 60 | 61 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be: 62 | 63 | ```text 64 | "keys":[{ 65 | "key": "/data/arr1", 66 | "type": "input", 67 | "units": None, 68 | "description": "This is an array containing input data" 69 | },{ 70 | "key": "/other_data/arr2", 71 | "type": "input", 72 | "units": None, 73 | "description": "This is another array containing input data" 74 | },{ 75 | "key": "/data/band_gaps", 76 | "type": "target", 77 | "units": "eV", 78 | "description": "This is the simulated band gap in eV" 79 | } 80 | ] 81 | ``` 82 | 83 | ## Describing Datasets 84 | 85 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema. 86 | 87 | **Keys \(list\[Key\]\):** `Key` objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Individual `Key` objects have the following properties 88 | 89 | * **`key (str)`**: A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\) 90 | * **`type (str)`**: The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_ 91 | * **`units (str) [optional]`**: The scientific units associated with a key. _Default: None_ 92 | * **`description (str) [optional]`**: A free text description of the key. _Default: None_ 93 | * **`labels (list[str]) [optional]`**: A list of strings mapped to integers in a key column 94 | 95 | **Splits \(list\[Split\]\):** `Split` objects provide a way for users to specify which data should be included as test, train, or other user-defined splits. Individual `Split` objects have the following properties 96 | 97 | * **`type (str)`**: The type of split \(e.g., train, test\) 98 | * **`path (str)`**: The full filepath to the dataset file or directory that contains the split 99 | * **`label (str)`**: A label to assign to this split 100 | 101 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading easier. 102 | 103 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. 
_Options \["tabular","hdf5"\]_ 104 | 105 | ```text 106 | "foundry": { 107 | "dc": {}, 108 | "keys": [{ 109 | "type": "input", 110 | "name": "feature_1", 111 | "units": "", 112 | "description": "This is an input" 113 | }, 114 | { 115 | "type": "target", 116 | "name": "band_gap", 117 | "units": "eV", 118 | "description": "blah blah", 119 | "labels": [] 120 | } 121 | ], 122 | "short_name": "my_short_name", 123 | "data_type": "tabular" 124 | } 125 | ``` 126 | 127 | ## Publishing 128 | 129 | {% hint style="info" %} 130 | Before continuing, be sure that you have 1\) signed up for a [free Globus account](https://app.globus.org) and 2\) [joined this Globus group](https://app.globus.org/groups/cc192dca-3751-11e8-90c1-0a7c735d220a/about). 131 | {% endhint %} 132 | 133 | Once your dataset is in the proper shape, and you have created the associated metadata structure, you can publish to Foundry! An example is shown below. 134 | 135 | ```text 136 | "foundry": { 137 | "dc": {}, 138 | "keys": [{ 139 | "type": "input", 140 | "name": "feature_1", 141 | "units": "", 142 | "description": "This is an input" 143 | }, 144 | { 145 | "type": "target", 146 | "name": "band_gap", 147 | "units": "eV", 148 | "description": "blah blah", 149 | "labels": [] 150 | } 151 | ], 152 | "short_name": "my_short_name", 153 | "data_type": "tabular" 154 | } 155 | ``` 156 | 157 | Currently, you can publish any dataset you have stored on a Globus endpoint or Google Drive. In the following, assume your [previously defined metadata](publishing-datasets.md#describing-datasets) are stored in `metadata`: 158 | 159 | ```python 160 | from foundry import Foundry 161 | 162 | # Globus endpoint URL where your dataset is located 163 | data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry%2F_test_blaiszik_foundry_iris_v1.2%2F" 164 | 165 | # full title of dataset 166 | title = "Scourtas example iris dataset" 167 | 168 | # authors to list 169 | authors = ["A. Scourtas", "B. Blaiszik"] 170 | 171 | # shorthand title (optional) 172 | short_name = "example_AS_iris" 173 | 174 | # affiliations of authors (optional) 175 | affiliations = ["Globus Labs, UChicago"] 176 | 177 | # publisher of the data (optional) 178 | publisher = "Materials Data Facility" 179 | 180 | # publication year (optional) 181 | publication_year = 2021 182 | 183 | 184 | f = Foundry() 185 | res = f.publish(metadata, data_source, title, authors, short_name=short_name) 186 | ``` 187 | 188 | The `publish()` method returns a result object that you can inspect for information about the state of the publication. For the above publication, `res` would have the format: 189 | 190 | ```python 191 | {'error': None, 192 | 'source_id': '_test_example_iris_v1.1', 193 | 'status_code': 202, 194 | 'success': True} 195 | ``` 196 | 197 | 198 | 199 | ## Future Work 200 | 201 | * Add support for wildcard key type specifications 202 | * Add link to example publication 203 | 204 | -------------------------------------------------------------------------------- /docs/publishing/publishing-models.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish models 3 | --- 4 | 5 | # Publishing Models 6 | 7 | In addition to datasets, you can publish models \(or even individual Python methods\) to Foundry and run them in the cloud! 
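Once a model is published, it can be invoked through the DLHub service via `run()`, which takes the DLHub model name and JSON-serializable inputs (see the package reference earlier in these docs). A hedged sketch, assuming `run()` is exposed on the `Foundry` client as in that reference; the model name and inputs below are placeholders:

```python
from foundry import Foundry

f = Foundry()

# `name` is the DLHub model name; `inputs` must be JSON serializable
predictions = f.run("my_group/bandgap-reg-1", inputs=[[0.10, 0.52, 1]])
```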
8 | 9 | ## Examples 10 | 11 | Model publication example notebook coming soon 12 | 13 | ## Model Types 14 | 15 | You can publish any of the following types of models or functions to Foundry: 16 | 17 | * [Scikit-Learn models](publishing-models.md#scikit-learn-models) 18 | * [Tensorflow 1 & 2 models](publishing-models.md#tensorflow-1-and-2-models) 19 | * [Keras models](publishing-models.md#keras-models) 20 | * [PyTorch models](publishing-models.md#keras-models) 21 | * [Class methods \(advanced use\)](publishing-models.md#class-methods) 22 | * [Static methods \(advanced use\)](publishing-models.md#static-methods) 23 | 24 | ### Scikit-Learn models 25 | 26 | ### Tensorflow 1 & 2 models 27 | 28 | ### Keras models 29 | 30 | ### PyTorch models 31 | 32 | ### Class methods 33 | 34 | ### Static methods 35 | 36 | ## Data Types for Inputs and Targets 37 | 38 | ## Describing Models 39 | 40 | Before you can publish a model, you need to describe it using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). 41 | 42 | ## Publishing 43 | 44 | ## Future Work 45 | 46 | -------------------------------------------------------------------------------- /docs/support/troubleshooting.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Common pitfalls and issues and how to solve them 3 | --- 4 | 5 | # Troubleshooting 6 | 7 | ### Issues with loading or publishing Keras or Tensorflow models 8 | 9 | ![A common error that arises when there is a Keras or Tensorflow version mismatch](../.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png) 10 | 11 | There is a difference between the older, plain Keras package installed via `import keras`, and the currently maintained and up-to-date Keras package installed via `from tensorflow import keras`. Currently, the DLHub SDK \(which Foundry uses under-the-hood to publish, pull, and run models and functions\) uses whichever version of Keras you have installed. 12 | 13 | Errors can arise when `tf.keras` is used in one part of the model pipeline, but plain `keras` is used in another. 14 | 15 | If you have both versions of Keras installed \(which can be the case in common container environments, such as Google Colab\), DLHub will default to the plain Keras version, in case the user wants to use that with the newest version of Tensorflow. To override this functionality and use the Tensorflow Keras instead when publishing your model, pass the `force_tf_keras = True`option to `publish_model()`. 16 | 17 | ```python 18 | # Assume our fitted model is '7-fi-1.hdf5'. 19 | # Create the metadata for the model 20 | import os 21 | 22 | options_keras = { 23 | "title": "Bandgap-7-fidelity-MP-JARVIS-1", 24 | "short_name": "7-fi-1", 25 | "authors": ["Scientist, Awesome"], 26 | "servable": { 27 | "type": "keras", 28 | "model_path": "7-fi-1.hdf5", 29 | "custom_objects": {"softplus2": softplus2, 30 | "MEGNetLayer": MEGNetLayer, 31 | "Set2Set": Set2Set}, 32 | "force_tf_keras": True 33 | } 34 | } 35 | res = f.publish_model(options_keras) 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples using Foundry 2 | If you're wondering how to get started with Foundry or want to see it in action, you're in the right place! 3 | 4 | Each notebook walks through instantiating Foundry, loading data from Foundry, and working with the data in different ways. 
Some notebooks also use machine learning models with the data. 5 | 6 | Each folder contains a notebook and `requirements.txt` file. The notebooks can be run locally (using the `requirements.txt`) or in [Google Colab](https://colab.research.google.com/). 7 | 8 | If you have any trouble with the notebooks, please check our [documentation](https://ai-materials-and-chemistry.gitbook.io/foundry/v/docs/) or create an issue on the repo. 9 | -------------------------------------------------------------------------------- /examples/atom-position-finding/requirements.txt: -------------------------------------------------------------------------------- 1 | foundry_ml 2 | matplotlib 3 | -------------------------------------------------------------------------------- /examples/bandgap/foundry.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Band Gap Analysis", 3 | "version": "1.0.0", 4 | "description": "Datasets for band gap uber model generation", 5 | "private":true, 6 | "dependencies":{ 7 | "_test_foundry_experimental_bandgap_v1.1": "1.1", 8 | "_test_foundry_mp_bandgap_v1.1":"1.1", 9 | "_test_foundry_oqmd_bandgap_v1.1":"1.1", 10 | "_test_foundry_assorted_computational_bandgap_v1.1":"1.1" 11 | } 12 | } -------------------------------------------------------------------------------- /examples/bandgap/requirements.txt: -------------------------------------------------------------------------------- 1 | pymatgen 2 | matminer 3 | pandas 4 | matplotlib 5 | scikit-learn 6 | foundry_ml 7 | -------------------------------------------------------------------------------- /examples/dendrite-segmentation/foundry.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Dendrite Segmentation", 3 | "version": "1.0.0", 4 | "description": "Semantic Segmentation of Dendrites via Machine Learning", 5 | "private":true, 6 | "dependencies":{ 7 | "_test_foundry_stan_dendrite_segmentation_v1.1": "1.1" 8 | } 9 | } -------------------------------------------------------------------------------- /examples/dendrite-segmentation/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | foundry_ml 3 | scikit-image 4 | tensorflow 5 | keras-unet 6 | opencv-python 7 | -------------------------------------------------------------------------------- /examples/oqmd/foundry.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OQMD Data Analysis", 3 | "version": "1.0.0", 4 | "description": "Creating dataframe and metadata for OQMD dataset", 5 | "private":true, 6 | "dependencies":{ 7 | "_test_foundry_oqmd_v1.1": "1.1" 8 | } 9 | } -------------------------------------------------------------------------------- /examples/oqmd/requirements.txt: -------------------------------------------------------------------------------- 1 | foundry_ml 2 | pandas 3 | -------------------------------------------------------------------------------- /examples/publishing-guides/data/iris.csv: -------------------------------------------------------------------------------- 1 | # Data from: https://archive.ics.uci.edu/ml/datasets/Iris 2 | sepal_length,sepal_width,petal_length,petal_width,species 3 | 5.1,3.5,1.4,0.2,setosa 4 | 4.9,3.0,1.4,0.2,setosa 5 | 4.7,3.2,1.3,0.2,setosa 6 | 4.6,3.1,1.5,0.2,setosa 7 | 5.0,3.6,1.4,0.2,setosa 8 | 5.4,3.9,1.7,0.4,setosa 9 | 4.6,3.4,1.4,0.3,setosa 10 | 5.0,3.4,1.5,0.2,setosa 11 | 4.4,2.9,1.4,0.2,setosa 12 | 
4.9,3.1,1.5,0.1,setosa 13 | 5.4,3.7,1.5,0.2,setosa 14 | 4.8,3.4,1.6,0.2,setosa 15 | 4.8,3.0,1.4,0.1,setosa 16 | 4.3,3.0,1.1,0.1,setosa 17 | 5.8,4.0,1.2,0.2,setosa 18 | 5.7,4.4,1.5,0.4,setosa 19 | 5.4,3.9,1.3,0.4,setosa 20 | 5.1,3.5,1.4,0.3,setosa 21 | 5.7,3.8,1.7,0.3,setosa 22 | 5.1,3.8,1.5,0.3,setosa 23 | 5.4,3.4,1.7,0.2,setosa 24 | 5.1,3.7,1.5,0.4,setosa 25 | 4.6,3.6,1.0,0.2,setosa 26 | 5.1,3.3,1.7,0.5,setosa 27 | 4.8,3.4,1.9,0.2,setosa 28 | 5.0,3.0,1.6,0.2,setosa 29 | 5.0,3.4,1.6,0.4,setosa 30 | 5.2,3.5,1.5,0.2,setosa 31 | 5.2,3.4,1.4,0.2,setosa 32 | 4.7,3.2,1.6,0.2,setosa 33 | 4.8,3.1,1.6,0.2,setosa 34 | 5.4,3.4,1.5,0.4,setosa 35 | 5.2,4.1,1.5,0.1,setosa 36 | 5.5,4.2,1.4,0.2,setosa 37 | 4.9,3.1,1.5,0.1,setosa 38 | 5.0,3.2,1.2,0.2,setosa 39 | 5.5,3.5,1.3,0.2,setosa 40 | 4.9,3.1,1.5,0.1,setosa 41 | 4.4,3.0,1.3,0.2,setosa 42 | 5.1,3.4,1.5,0.2,setosa 43 | 5.0,3.5,1.3,0.3,setosa 44 | 4.5,2.3,1.3,0.3,setosa 45 | 4.4,3.2,1.3,0.2,setosa 46 | 5.0,3.5,1.6,0.6,setosa 47 | 5.1,3.8,1.9,0.4,setosa 48 | 4.8,3.0,1.4,0.3,setosa 49 | 5.1,3.8,1.6,0.2,setosa 50 | 4.6,3.2,1.4,0.2,setosa 51 | 5.3,3.7,1.5,0.2,setosa 52 | 5.0,3.3,1.4,0.2,setosa 53 | 7.0,3.2,4.7,1.4,versicolor 54 | 6.4,3.2,4.5,1.5,versicolor 55 | 6.9,3.1,4.9,1.5,versicolor 56 | 5.5,2.3,4.0,1.3,versicolor 57 | 6.5,2.8,4.6,1.5,versicolor 58 | 5.7,2.8,4.5,1.3,versicolor 59 | 6.3,3.3,4.7,1.6,versicolor 60 | 4.9,2.4,3.3,1.0,versicolor 61 | 6.6,2.9,4.6,1.3,versicolor 62 | 5.2,2.7,3.9,1.4,versicolor 63 | 5.0,2.0,3.5,1.0,versicolor 64 | 5.9,3.0,4.2,1.5,versicolor 65 | 6.0,2.2,4.0,1.0,versicolor 66 | 6.1,2.9,4.7,1.4,versicolor 67 | 5.6,2.9,3.6,1.3,versicolor 68 | 6.7,3.1,4.4,1.4,versicolor 69 | 5.6,3.0,4.5,1.5,versicolor 70 | 5.8,2.7,4.1,1.0,versicolor 71 | 6.2,2.2,4.5,1.5,versicolor 72 | 5.6,2.5,3.9,1.1,versicolor 73 | 5.9,3.2,4.8,1.8,versicolor 74 | 6.1,2.8,4.0,1.3,versicolor 75 | 6.3,2.5,4.9,1.5,versicolor 76 | 6.1,2.8,4.7,1.2,versicolor 77 | 6.4,2.9,4.3,1.3,versicolor 78 | 6.6,3.0,4.4,1.4,versicolor 79 | 6.8,2.8,4.8,1.4,versicolor 80 | 6.7,3.0,5.0,1.7,versicolor 81 | 6.0,2.9,4.5,1.5,versicolor 82 | 5.7,2.6,3.5,1.0,versicolor 83 | 5.5,2.4,3.8,1.1,versicolor 84 | 5.5,2.4,3.7,1.0,versicolor 85 | 5.8,2.7,3.9,1.2,versicolor 86 | 6.0,2.7,5.1,1.6,versicolor 87 | 5.4,3.0,4.5,1.5,versicolor 88 | 6.0,3.4,4.5,1.6,versicolor 89 | 6.7,3.1,4.7,1.5,versicolor 90 | 6.3,2.3,4.4,1.3,versicolor 91 | 5.6,3.0,4.1,1.3,versicolor 92 | 5.5,2.5,4.0,1.3,versicolor 93 | 5.5,2.6,4.4,1.2,versicolor 94 | 6.1,3.0,4.6,1.4,versicolor 95 | 5.8,2.6,4.0,1.2,versicolor 96 | 5.0,2.3,3.3,1.0,versicolor 97 | 5.6,2.7,4.2,1.3,versicolor 98 | 5.7,3.0,4.2,1.2,versicolor 99 | 5.7,2.9,4.2,1.3,versicolor 100 | 6.2,2.9,4.3,1.3,versicolor 101 | 5.1,2.5,3.0,1.1,versicolor 102 | 5.7,2.8,4.1,1.3,versicolor 103 | 6.3,3.3,6.0,2.5,virginica 104 | 5.8,2.7,5.1,1.9,virginica 105 | 7.1,3.0,5.9,2.1,virginica 106 | 6.3,2.9,5.6,1.8,virginica 107 | 6.5,3.0,5.8,2.2,virginica 108 | 7.6,3.0,6.6,2.1,virginica 109 | 4.9,2.5,4.5,1.7,virginica 110 | 7.3,2.9,6.3,1.8,virginica 111 | 6.7,2.5,5.8,1.8,virginica 112 | 7.2,3.6,6.1,2.5,virginica 113 | 6.5,3.2,5.1,2.0,virginica 114 | 6.4,2.7,5.3,1.9,virginica 115 | 6.8,3.0,5.5,2.1,virginica 116 | 5.7,2.5,5.0,2.0,virginica 117 | 5.8,2.8,5.1,2.4,virginica 118 | 6.4,3.2,5.3,2.3,virginica 119 | 6.5,3.0,5.5,1.8,virginica 120 | 7.7,3.8,6.7,2.2,virginica 121 | 7.7,2.6,6.9,2.3,virginica 122 | 6.0,2.2,5.0,1.5,virginica 123 | 6.9,3.2,5.7,2.3,virginica 124 | 5.6,2.8,4.9,2.0,virginica 125 | 7.7,2.8,6.7,2.0,virginica 126 | 6.3,2.7,4.9,1.8,virginica 127 | 6.7,3.3,5.7,2.1,virginica 128 | 
7.2,3.2,6.0,1.8,virginica 129 | 6.2,2.8,4.8,1.8,virginica 130 | 6.1,3.0,4.9,1.8,virginica 131 | 6.4,2.8,5.6,2.1,virginica 132 | 7.2,3.0,5.8,1.6,virginica 133 | 7.4,2.8,6.1,1.9,virginica 134 | 7.9,3.8,6.4,2.0,virginica 135 | 6.4,2.8,5.6,2.2,virginica 136 | 6.3,2.8,5.1,1.5,virginica 137 | 6.1,2.6,5.6,1.4,virginica 138 | 7.7,3.0,6.1,2.3,virginica 139 | 6.3,3.4,5.6,2.4,virginica 140 | 6.4,3.1,5.5,1.8,virginica 141 | 6.0,3.0,4.8,1.8,virginica 142 | 6.9,3.1,5.4,2.1,virginica 143 | 6.7,3.1,5.6,2.4,virginica 144 | 6.9,3.1,5.1,2.3,virginica 145 | 5.8,2.7,5.1,1.9,virginica 146 | 6.8,3.2,5.9,2.3,virginica 147 | 6.7,3.3,5.7,2.5,virginica 148 | 6.7,3.0,5.2,2.3,virginica 149 | 6.3,2.5,5.0,1.9,virginica 150 | 6.5,3.0,5.2,2.0,virginica 151 | 6.2,3.4,5.4,2.3,virginica 152 | 5.9,3.0,5.1,1.8,virginica 153 | -------------------------------------------------------------------------------- /examples/zeolite/requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | matplotlib 3 | foundry_ml -------------------------------------------------------------------------------- /foundry/__init__.py: -------------------------------------------------------------------------------- 1 | from .foundry import Foundry # noqa F401 (import unused) 2 | from . import models # noqa F401 (import unused) 3 | from . import https_download # noqa F401 (import unused) 4 | from . import https_upload # noqa F401 (import unused) 5 | from .foundry_dataset import FoundryDataset # noqa F401 (import unused) 6 | -------------------------------------------------------------------------------- /foundry/auth.py: -------------------------------------------------------------------------------- 1 | """Utilities related to storing authentication credentials""" 2 | 3 | from dataclasses import dataclass 4 | from typing import Dict 5 | 6 | from globus_sdk import TransferClient, AuthClient 7 | 8 | 9 | @dataclass 10 | class PubAuths: 11 | """Collection of the authorizers needed for publication 12 | 13 | Attributes: 14 | transfer_client: Client with credentials to perform transfers 15 | auth_client_openid: Client with permissions to get users IDs 16 | endpoint_auth_clients: Mapping between endpoint ID and client that can authorize access to it 17 | """ 18 | 19 | transfer_client: TransferClient 20 | auth_client_openid: AuthClient 21 | endpoint_auth_clients: Dict[str, AuthClient] 22 | -------------------------------------------------------------------------------- /foundry/foundry_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import html 5 | from json2table import convert 6 | 7 | from pydantic import ValidationError 8 | 9 | from .foundry_cache import FoundryCache 10 | from .models import FoundrySchema, FoundryDatacite 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class FoundryDataset(): 17 | """Representation of an individual dataset. 18 | Provides access to metadata as well as functions to 19 | instantiate data into memory in different formats. 
20 | 21 | Args: 22 | dataset_name (str): Name of the dataset (equivalent to source_id in MDF) 23 | datacite_entry (FoundryDatacite): Datacite entry for the dataset 24 | foundry_schema (FoundrySchema): Schema for the dataset 25 | foundry_cache (FoundryCache): Cache for the dataset 26 | 27 | Desired functions: 28 | - Get as pandas 29 | - Get as tensorflow dataset 30 | - Get as pytorch dataset 31 | - Get file list 32 | - Set metadata 33 | - Attach datafiles 34 | - Validate against schema 35 | - Get citation 36 | """ 37 | 38 | def __init__(self, 39 | dataset_name: str, 40 | datacite_entry: FoundryDatacite, 41 | foundry_schema: FoundrySchema, 42 | foundry_cache: FoundryCache = None): 43 | 44 | self.dataset_name = dataset_name 45 | try: 46 | self.dc = FoundryDatacite(datacite_entry) 47 | self.foundry_schema = FoundrySchema(foundry_schema) 48 | except Exception as e: 49 | raise Exception('there was a problem creating the dataset: ', e) 50 | self._foundry_cache = foundry_cache 51 | 52 | def get_as_dict(self, split: str = None, as_hdf5: bool = False): 53 | """Returns the data from the dataset as a dictionary 54 | 55 | Arguments: 56 | split (string): Split to create dataset on. 57 | **Default:** ``None`` 58 | 59 | Returns: (dict) Dictionary of all the data from the specified split 60 | 61 | """ 62 | return self._foundry_cache.load_as_dict(split, 63 | self.dataset_name, 64 | self.foundry_schema, 65 | as_hdf5) 66 | load = get_as_dict 67 | 68 | def get_as_torch(self, split: str = None): 69 | """Returns the data from the dataset as a TorchDataset 70 | 71 | Arguments: 72 | split (string): Split to create PyTorch Dataset on. 73 | **Default:** ``None`` 74 | 75 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split 76 | 77 | """ 78 | 79 | return self._foundry_cache.load_as_torch(split, 80 | self.dataset_name, 81 | self.foundry_schema) 82 | 83 | def get_as_tensorflow(self, split: str = None): 84 | """Convert Foundry Dataset to a Tensorflow Sequence 85 | 86 | Arguments: 87 | split (string): Split to create Tensorflow Sequence on. 88 | **Default:** ``None`` 89 | 90 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split 91 | 92 | """ 93 | return self._foundry_cache.load_as_tensorflow(split, 94 | self.dataset_name, 95 | self.foundry_schema) 96 | 97 | def _repr_html_(self) -> str: 98 | """Format the Foundry object for notebook rendering as HTML output 99 | 100 | Args: 101 | self (Foundry) 102 | 103 | Returns: 104 | buf (str): buffer containing the HTML to render 105 | """ 106 | if not self.dc: 107 | buf = str(self) 108 | else: 109 | title = self.dc.titles[0].title 110 | authors = [creator['creatorName'] 111 | for creator in self.dc.creators] 112 | authors = '; '.join(authors) 113 | DOI = "DOI: " + self.dc.identifier.identifier.root 114 | 115 | buf = f'
<h2>{title}</h2><p>{authors}</p><p>{DOI}</p>' 116 | 117 | buf = f'{buf}
<h3>Dataset</h3>
{convert(json.loads(self.foundry_schema.json()))}' 118 | return buf 119 | 120 | def _format_creators(self): 121 | creators_list = [] 122 | for creator in self.dc.creators: 123 | affiliations = creator.get('affiliations', []) 124 | if affiliations: 125 | affiliations_str = ', '.join(html.escape(aff) for aff in affiliations) 126 | creators_list.append(f"{html.escape(creator['creatorName'])} ({affiliations_str})") 127 | else: 128 | creators_list.append(f"{html.escape(creator['creatorName'])}") 129 | return '; '.join(creators_list) 130 | 131 | def _format_subjects(self): 132 | return ', '.join([html.escape(subject.subject) for subject in self.dc.subjects]) if self.dc.subjects else 'No subjects available' 133 | 134 | def get_citation(self) -> str: 135 | subjects = [subject.subject for subject in self.dc.subjects] 136 | doi_str = f"doi = {{{self.dc.identifier.identifier.root}}}" 137 | url_str = f"url = {{https://doi.org/{self.dc.identifier.identifier.root}}}" 138 | author_str = f"author = {{{' and '.join([creator['creatorName'] for creator in self.dc.creators])}}}" 139 | title_str = f"title = {{{self.dc.titles[0].title}}}" 140 | keywords_str = f"keywords = {{{', '.join(subjects)}}}" 141 | publisher_str = f"publisher = {{{self.dc.publisher}}}" 142 | year_str = f"year = {{{self.dc.publicationYear}}}" 143 | bibtex = os.linesep.join([doi_str, url_str, 144 | author_str, title_str, 145 | keywords_str, publisher_str, 146 | year_str]) 147 | bibtex = f"@misc{{https://doi.org/{self.dc.identifier.identifier.root}{os.linesep}{bibtex}}}" 148 | return bibtex 149 | 150 | def validate_metadata(self, metadata): 151 | """Validate the JSON message against the FoundryDataset model 152 | 153 | Arguments: 154 | metadata (dict): Metadata information provided by the user. 155 | 156 | Raises: 157 | ValidationError: if metadata supplied by user does not meet the specificiation of a 158 | FoundryDataset object. 159 | 160 | """ 161 | try: 162 | FoundryDataset(**metadata) 163 | logger.debug("Metadata validation successful!") 164 | except ValidationError as e: 165 | logger.error("Metadata validation failed!") 166 | for error in e.errors(): 167 | field_name = ".".join([item for item in error['loc'] if isinstance(item, str)]) 168 | error_description = error['msg'] 169 | error_message = f"""There is an issue validating the metadata for the field '{field_name}': 170 | The error message returned is: '{error_description}'.""" 171 | logger.error(error_message) 172 | raise e 173 | 174 | def add_data(self, local_data_path: str = None, globus_data_source: str = None): 175 | """Add data to the dataset. User must provide the location of the data as 176 | either a `globus_data_source` or `local_data_path`. 177 | 178 | Arguments: 179 | local_data_path (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT 180 | request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is 181 | transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the 182 | data on their own Globus endpoint. User must choose either `globus_data_source` or `local_data_path` to 183 | publish their data. 184 | globus_data_source (str): Url path for a data folder on a Globus endpoint; url can be obtained through 185 | the Globus Web UI or SDK. If None, the user must specify an 'local_data_path' pointing to the location 186 | of the data on their local machine. User must choose either `globus_data_source` or `local_data_path` to 187 | publish their data. 
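        A minimal sketch of the two mutually exclusive call patterns (on a
        hypothetical `dataset` instance; the path and URL below are illustrative
        placeholders, not real data locations):

            dataset.add_data(local_data_path="./data/my_dataset")
            # or, for data that already lives on a Globus endpoint:
            dataset.add_data(globus_data_source="https://app.globus.org/file-manager?origin_id=...&origin_path=/my_dataset/")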
188 | 189 | """ 190 | if local_data_path is None and globus_data_source is None: 191 | raise ValueError("User must provide either a path to the data on their local machine or a URL to the data " 192 | "on their Globus endpoint.") 193 | 194 | if local_data_path is not None and globus_data_source is not None: 195 | raise ValueError("User must choose either `globus_data_source` or `local_data_path`, not both.") 196 | 197 | if globus_data_source is not None: 198 | self._globus_data_source = globus_data_source 199 | if hasattr(self, '_local_data_path'): 200 | delattr(self, '_local_data_path') 201 | elif local_data_path is not None: 202 | if os.path.isdir(local_data_path) or os.path.isfile(local_data_path): 203 | self._local_data_path = local_data_path 204 | if hasattr(self, '_globus_data_source'): 205 | delattr(self, '_globus_data_source') 206 | else: 207 | raise ValueError("The path provided does not exist or is not a file or directory.") 208 | 209 | def clear_dataset_cache(self): 210 | """Deletes the cached data for this specific datset""" 211 | self._foundry_cache.clear_cache(self.dataset_name) 212 | 213 | def clean_dc_dict(self): 214 | """Clean the Datacite dictionary of None values""" 215 | print(json.loads(self.dc.json())) 216 | return self.delete_none(json.loads(self.dc.json())) 217 | 218 | def delete_none(self, _dict): 219 | """Delete None values recursively from all of the dictionaries""" 220 | for key, value in list(_dict.items()): 221 | if isinstance(value, dict): 222 | self.delete_none(value) 223 | elif value is None: 224 | del _dict[key] 225 | elif isinstance(value, list): 226 | for v_i in value: 227 | if isinstance(v_i, dict): 228 | self.delete_none(v_i) 229 | 230 | return _dict 231 | -------------------------------------------------------------------------------- /foundry/https_download.py: -------------------------------------------------------------------------------- 1 | """Methods to download files from a Globus endpoint 2 | """ 3 | 4 | 5 | import os 6 | from collections import deque 7 | 8 | import requests 9 | from globus_sdk import TransferClient 10 | 11 | 12 | def recursive_ls(tc: TransferClient, ep: str, path: str, max_depth: int = 3): 13 | """Find all files in a Globus directory recursively 14 | 15 | Args: 16 | tc: TransferClient authorized to access the directory 17 | ep: Endpoint on which the files reside 18 | path: Path to the files being downloaded 19 | max_depth: Maximum recurse depth 20 | 21 | Yields: 22 | Dictionaries describing the location of the files. 
Each includes at least 23 | "name": Name of the file 24 | "path": Absolute path to the file's location 25 | """ 26 | queue = deque() 27 | queue.append((path, "", 0)) 28 | yield from _get_files(tc, ep, queue, max_depth) 29 | 30 | 31 | def _get_files(tc, ep, queue, max_depth): 32 | while queue: 33 | abs_path, rel_path, depth = queue.pop() 34 | path_prefix = rel_path + "/" if rel_path else "" 35 | 36 | res = tc.operation_ls(ep, path=abs_path) 37 | 38 | if depth < max_depth: 39 | queue.extend( 40 | ( 41 | res["path"] + item["name"], 42 | path_prefix + item["name"], 43 | depth + 1, 44 | ) 45 | for item in res["DATA"] 46 | if item["type"] == "dir" 47 | ) 48 | for item in res["DATA"]: 49 | if item["type"] == 'file': 50 | item["name"] = path_prefix + item["name"] 51 | item["path"] = abs_path.replace('/~/', '/') 52 | yield item 53 | 54 | 55 | def download_file(item, base_directory, https_config, timeout=1800): 56 | """Download a file to disk 57 | 58 | Args: 59 | item: Dictionary defining the path to the file 60 | base_directory: Base directory for storing downloaded files 61 | https_config: Configuration defining the URL of the server and the name of the dataset 62 | timeout: Timeout for the download request in seconds (default: 1800) 63 | """ 64 | base_url = https_config['base_url'].rstrip('/') 65 | path = item.get('path', '').strip('/') 66 | 67 | # Extracting the name and subdirectory from the item 68 | name = item.get('name', '') 69 | subdirectory = name.split('/')[0] if '/' in name else '' 70 | 71 | # Avoid duplication of subdirectory in path 72 | if subdirectory and path.endswith(subdirectory): 73 | full_path = f"{path}/{name.split('/', 1)[-1]}".strip('/') 74 | else: 75 | full_path = '/'.join([path, name]).strip('/') 76 | 77 | url = f"{base_url}/{full_path}" 78 | 79 | # build destination path for data file 80 | destination = os.path.join(base_directory, https_config['source_id'], item['name']) 81 | parent_path = os.path.split(destination)[0] 82 | 83 | # if parent directories don't exist, create them 84 | if not os.path.exists(parent_path): 85 | os.makedirs(parent_path, exist_ok=True) 86 | 87 | try: 88 | with requests.get(url, stream=True, timeout=timeout) as response: 89 | response.raise_for_status() 90 | 91 | downloaded_size = 0 92 | print(f"\rStarting Download of: {url}") 93 | 94 | with open(destination, "wb") as f: 95 | for chunk in response.iter_content(chunk_size=8192): 96 | if chunk: 97 | f.write(chunk) 98 | downloaded_size += len(chunk) 99 | # Calculate and print the download progress 100 | print(f"\rDownloading... {downloaded_size/(1 << 20):,.2f} MB", end="") 101 | return destination 102 | 103 | except requests.exceptions.RequestException as e: 104 | print(f"Error downloading file: {e}") 105 | except IOError as e: 106 | print(f"Error writing file to disk: {e}") 107 | 108 | return {destination + " status": True} 109 | -------------------------------------------------------------------------------- /foundry/https_upload.py: -------------------------------------------------------------------------------- 1 | """Private utility methods to upload files and/or folders to Globus using HTTPS instead of Globus Transfer. 
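A rough usage sketch, assuming the Globus clients have already been authorized
through the usual globus_sdk login flow (`tc`, `openid_client`, `gcs_client`, and
`endpoint_id` below are placeholders, not working values):

    auths = PubAuths(transfer_client=tc,
                     auth_client_openid=openid_client,
                     endpoint_auth_clients={endpoint_id: gcs_client})
    globus_data_source = upload_to_endpoint(auths, "./my_dataset", endpoint_id=endpoint_id)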
2 | """ 3 | 4 | import logging 5 | import os 6 | import urllib 7 | from requests import put, Response 8 | from typing import Any, Tuple, Dict, List 9 | from uuid import uuid4 10 | 11 | from globus_sdk import AuthClient, TransferClient, TransferAPIError 12 | 13 | from .auth import PubAuths 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def upload_to_endpoint(auths: PubAuths, local_data_path: str, endpoint_id: str = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec", 20 | dest_parent: str = None, dest_child: str = None) -> Tuple[str, str]: 21 | """Upload local data to a Globus endpoint using HTTPS PUT requests. Data can be a folder or an individual file. 22 | Args: 23 | auths (PubAuths): Dataclass of authorizers needed for upload. Includes `transfer_client`, `auth_client_openid`, 24 | and `endpoint_auth_clients`, which is a Dict of `endpoint_id`:AuthClient mappings. 25 | local_data_path (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT 26 | request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is 27 | transferred to MDF. 28 | endpoint_id (str): Globus endpoint ID to upload the data to. Default is NCSA endpoint. Must match the 29 | `endpoint_id` auth'd in `auths.auth_client_gcs`. 30 | 31 | Returns 32 | ------- 33 | (str) Globus data source URL: URL pointing to the data on the Globus endpoint 34 | """ 35 | # define upload destination 36 | dest_path = _create_dest_folder(auths.transfer_client, endpoint_id, parent_dir=dest_parent, child_dir=dest_child) 37 | # upload data to endpoint 38 | globus_data_source = _https_upload(auths.transfer_client, auths.endpoint_auth_clients, local_data_path=local_data_path, 39 | dest_path=dest_path, endpoint_id=endpoint_id) 40 | return globus_data_source 41 | 42 | 43 | def _create_dest_folder(transfer_client: TransferClient, endpoint_id: str, parent_dir: str = None, 44 | child_dir: str = None) -> str: 45 | """Create a destination folder for the data on a Globus endpoint 46 | Args: 47 | transfer_client (TransferClient): Globus client authorized for Globus Transfers (ie moving data on endpoint, 48 | adding/removing folders, etc). 49 | endpoint_id (str): A UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 50 | the SDK. 51 | parent_dir (str): Set to "/tmp" when default is None. The parent directory that all publications via HTTPS 52 | will be written to. 53 | child_dir (str): Set to a random UUID when default is None. The child directory that the data will be 54 | written to. 
55 | Returns 56 | ------- 57 | (str): Path on Globus endpoint to write to 58 | """ 59 | # use a random UUID for each dataset publication, unless specified otherwise 60 | if child_dir is None: 61 | child_dir = uuid4() # the publication ID forms the name of the child directory 62 | if parent_dir is None: 63 | parent_dir = "/tmp" 64 | dest_path = os.path.join(parent_dir, str(child_dir)) # NOTE: must start and end with "/" 65 | 66 | try: 67 | transfer_client.operation_mkdir(endpoint_id=endpoint_id, path=dest_path) 68 | except TransferAPIError as e: 69 | raise IOError(f"Error from Globus API while creating destination folder: {e.message}") from e 70 | return dest_path 71 | 72 | 73 | def _https_upload(transfer_client: TransferClient, endpoint_auth_clients: Dict[str, AuthClient], local_data_path: str, 74 | dest_path: str = "/tmp", endpoint_id: str = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec") -> str: 75 | """Upload a dataset via HTTPS to a Globus endpoint 76 | Args: 77 | transfer_client (TransferClient): Globus client authorized for Globus Transfers (ie moving data on endpoint, 78 | adding/removing folders, etc). 79 | endpoint_auth_clients (Dict[str, AuthClient]): Dict of `endpoint_id` : `AuthClient` keypairs. AuthClients used 80 | for Globus Auth functionality within endpoint-specific scopes using Globus Connect Server (ie accessing 81 | or altering data on a specific endpoint). 82 | local_data_path (str): The path to the local data to upload. Can be relative or absolute. 83 | dest_path (str): The path to the destination folder on the Globus endpoint. Default is "/tmp". 84 | endpoint_id (str): A UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 85 | the SDK. Default is the NCSA UUID "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec". 86 | Returns 87 | ------- 88 | (str): Globus data source URL (ie the URL that points to the data on a Globus endpoint) 89 | """ 90 | 91 | # get URL for Globus endpoint location 92 | endpoint = transfer_client.get_endpoint(endpoint_id) # gets info for NCSA endpoint 93 | https_base_url = endpoint["https_server"] 94 | 95 | # Submit data (folders of files or an independent file) to be written to endpoint 96 | if os.path.isdir(local_data_path): 97 | _upload_folder(transfer_client, endpoint_auth_clients, local_data_path, https_base_url, dest_path, endpoint_id) 98 | elif os.path.isfile(local_data_path): 99 | _upload_file(endpoint_auth_clients[endpoint_id], local_data_path, https_base_url, dest_path) 100 | else: 101 | raise IOError(f"Data path '{local_data_path}' is of unknown type") 102 | 103 | # return the data source URL for publication to MDF 104 | return _make_globus_link(endpoint_id, dest_path) 105 | 106 | 107 | def _upload_folder(transfer_client: TransferClient, endpoint_auth_clients: Dict[str, AuthClient], local_data_path: str, 108 | https_base_url: str, parent_dest_path: str, endpoint_id: str) -> List[Dict[str, Any]]: 109 | """Upload a folder to a Globus endpoint using HTTPS 110 | Args: 111 | transfer_client (TransferClient): Globus client authorized for Globus Transfers (ie moving data on endpoint, 112 | adding/removing folders, etc). 113 | endpoint_auth_clients (Dict[str, AuthClient]): Dict of `endpoint_id` : `AuthClient` keypairs. AuthClients used 114 | for Globus Auth functionality within endpoint-specific scopes using Globus Connect Server (ie accessing 115 | or altering data on a specific endpoint). 116 | local_data_path (str): The path to the local data to upload. Can be relative or absolute. 
117 | https_base_url (str): The URL for a given Globus endpoint. 118 | parent_dest_path (str): The path to the parent folder to be written to on the given endpoint. The contents 119 | of "local_data_path" will be written here, including subdirectories. 120 | endpoint_id (str): The UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 121 | the SDK. This must be the same endpoint pointed to by the https_base_url. 122 | Returns 123 | ------- 124 | (list): A list of Response objects (the `requests` HTTPS response object from a PUT request) 125 | """ 126 | results = [] 127 | # initialize destination path as the parent destination path 128 | dest_path = parent_dest_path 129 | 130 | # walk through each child directory in the designated local data folder 131 | for root, _, files in os.walk(local_data_path): 132 | # update destination path if we have walked into a child directory 133 | if root != local_data_path: 134 | # get the child directory relative path 135 | subpath = os.path.relpath(root, local_data_path) 136 | # update destination path to include child directories (ie subpaths) 137 | dest_path = os.path.join(parent_dest_path, subpath) 138 | # create child directories on endpoint 139 | try: 140 | transfer_client.operation_mkdir(endpoint_id=endpoint_id, path=dest_path) 141 | except TransferAPIError as e: 142 | raise IOError(f"Error while creating child directory {dest_path}: {e.message}") from e 143 | # get local path to file to upload 144 | for filename in files: 145 | filepath = os.path.join(root, filename) 146 | # upload file to destination path on endpoint 147 | result = _upload_file(endpoint_auth_clients[endpoint_id], filepath, https_base_url, dest_path) 148 | results.append(result) 149 | return results 150 | 151 | 152 | def _upload_file(auth_client_gcs: AuthClient, filepath: str, https_base_url: str, dest_path: str) -> Response: 153 | """Upload an individual file to a Globus endpoint specified in 'auth_client_gcs' using HTTPS PUT 154 | Args: 155 | auth_client_gcs (AuthClient): Globus client authorized for Globus Auth functionality within an endpoint-specific 156 | scope using Globus Connect Server (ie accessing or altering data on a specific endpoint). 157 | filepath (str): The path to the local file to upload. 158 | https_base_url (str): The URL for a given Globus endpoint. 159 | dest_path (str): The path to the folder to be written to on the given endpoint. 
160 | Returns 161 | ------- 162 | (Response): The `requests` HTTPS response object from a PUT request 163 | """ 164 | # Get the authorization header token (string for the headers dict) for HTTPS upload 165 | header = auth_client_gcs.authorizer.get_authorization_header() 166 | 167 | # get Globus endpoint path to write to 168 | filename = os.path.split(filepath)[1] 169 | # need to strip out leading "/" in dest_path for join to work 170 | endpoint_dest = os.path.join(https_base_url, dest_path.lstrip("/"), filename) 171 | 172 | # upload via HTTPS as arbitrary binary content type 173 | with open(filepath, "rb") as f: 174 | reply = put( 175 | endpoint_dest, 176 | data=f, 177 | headers={"Authorization": header, "Content-Type": "application/octet-stream"} 178 | ) 179 | if reply.status_code != 200: 180 | raise IOError(f"Error on HTTPS PUT, got response {reply.status_code}: {reply.text}") 181 | # Return the response 182 | return reply 183 | 184 | 185 | def _make_globus_link(endpoint_id: str, path: str) -> str: 186 | """Create the Globus data source URL for a given datapath on an endpoint 187 | Args: 188 | endpoint_id (str): The UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 189 | the SDK. 190 | path (str): The path to the dataset folder on the given endpoint. 191 | Returns 192 | ------- 193 | (str): The Globus data source URL (ie the URL that points to the data on a Globus endpoint) 194 | """ 195 | # make sure the path has the "/"s encoded properly for a URL 196 | safe_path = urllib.parse.quote(path, safe="*") 197 | link = f"https://app.globus.org/file-manager?origin_id={endpoint_id}&origin_path={safe_path}" 198 | return link 199 | -------------------------------------------------------------------------------- /foundry/jsonschema_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/foundry/jsonschema_models/__init__.py -------------------------------------------------------------------------------- /foundry/jsonschema_models/project_model.py: -------------------------------------------------------------------------------- 1 | # generated by datamodel-codegen: 2 | # filename: projects.json 3 | # timestamp: 2024-02-20T22:47:19+00:00 4 | 5 | from __future__ import annotations 6 | 7 | from typing import Any, Dict, List, Optional, Union 8 | 9 | from pydantic import BaseModel, Extra, Field 10 | 11 | 12 | class Nanomfg(BaseModel): 13 | class Config: 14 | extra = Extra.forbid 15 | 16 | base_pressure: Optional[float] = None 17 | carbon_source: Optional[str] = None 18 | catalyst: Optional[str] = None 19 | grain_size: Optional[str] = None 20 | max_temperature: Optional[float] = None 21 | orientation: Optional[str] = None 22 | sample_surface_area: Optional[float] = None 23 | sample_thickness: Optional[float] = None 24 | 25 | 26 | class RedoxPotential(BaseModel): 27 | class Config: 28 | extra = Extra.forbid 29 | 30 | S0: Optional[float] = Field( 31 | None, description='Redox potential for electronic ground state S0. Units: eV' 32 | ) 33 | S1: Optional[float] = Field( 34 | None, 35 | description='Redox potential for electronicically excited state S1. Units: eV', 36 | ) 37 | T1: Optional[float] = Field( 38 | None, 39 | description='Redox potential for electronically excited state T1. 
Units: eV', 40 | ) 41 | 42 | 43 | class DipoleMoment(BaseModel): 44 | class Config: 45 | extra = Extra.forbid 46 | 47 | S0: Optional[float] = Field( 48 | None, description='Dipole moment for electronic ground state S0. Units: Debye' 49 | ) 50 | S1: Optional[float] = Field( 51 | None, 52 | description='Dipole moment for electronicically excited state S1. Units: Debye', 53 | ) 54 | T1: Optional[float] = Field( 55 | None, 56 | description='Dipole moment for electronically excited state T1. Units: Debye', 57 | ) 58 | 59 | 60 | class Field00(BaseModel): 61 | class Config: 62 | extra = Extra.forbid 63 | 64 | S1: Optional[float] = Field( 65 | None, description='0_0 energy for electronicically excited state S1. Units: eV' 66 | ) 67 | T1: Optional[float] = Field( 68 | None, description='0_0 energy for electronically excited state T1. Units: eV' 69 | ) 70 | 71 | 72 | class Verde(BaseModel): 73 | class Config: 74 | extra = Extra.forbid 75 | 76 | vertical_excitation_energy: Optional[float] = Field( 77 | None, description='Vertical excitation energy. Units: eV' 78 | ) 79 | ionization_potential: Optional[float] = Field( 80 | None, description='Ionization potential. Units: eV' 81 | ) 82 | redox_potential: Optional[RedoxPotential] = Field( 83 | None, description='Redox potentials of the molecule. Units: eV' 84 | ) 85 | dipole_moment: Optional[DipoleMoment] = Field( 86 | None, description='Dipole moment of the molecule. Units: Debye' 87 | ) 88 | field_0_0: Optional[Field00] = Field( 89 | None, 90 | alias='0_0', 91 | description='0-0 transition energies of the molecule. Units: eV', 92 | ) 93 | 94 | 95 | class Split(BaseModel): 96 | class Config: 97 | extra = Extra.forbid 98 | 99 | type: Optional[str] = Field( 100 | None, 101 | description='The kind of partition of the dataset (train, test, validation, etc)', 102 | ) 103 | path: Optional[str] = Field( 104 | None, description='The full filepath to the dataset file or directory' 105 | ) 106 | label: Optional[str] = Field(None, description='A label to assign to this split') 107 | 108 | 109 | class Classes(BaseModel): 110 | class Config: 111 | extra = Extra.forbid 112 | 113 | label: Optional[str] = Field(None, description='The label that exists in the data') 114 | name: Optional[str] = Field(None, description='The name the label maps onto.') 115 | 116 | 117 | class Key(BaseModel): 118 | class Config: 119 | extra = Extra.forbid 120 | 121 | key: Optional[List[str]] = Field( 122 | None, 123 | description='Column or header name for tabular data, key/path for HDF5 data', 124 | ) 125 | type: Optional[str] = Field(None, description='Whether input or target') 126 | filter: Optional[str] = Field(None, description='How apply the defined key') 127 | description: Optional[str] = Field( 128 | None, description='Free text description of the key' 129 | ) 130 | units: Optional[str] = Field(None, description='The units associated with the key') 131 | classes: Optional[Union[List[Dict[str, Any]], Classes]] = None 132 | 133 | 134 | class Foundry(BaseModel): 135 | class Config: 136 | extra = Extra.forbid 137 | 138 | short_name: Optional[str] = None 139 | data_type: Optional[str] = Field( 140 | None, description='The kind of data in the dataset, e.g. tabular, json, hdf5' 141 | ) 142 | task_type: Optional[List[str]] = Field( 143 | None, 144 | description='The type of task. e.g., supervised, unsupervised, generative.', 145 | ) 146 | domain: Optional[List[str]] = Field( 147 | None, 148 | description='The domain of applicability. 
e.g., materials science, chemistry, machine vision', 149 | ) 150 | n_items: Optional[float] = Field( 151 | None, 152 | description='The number of total items in the dataset including all splits.', 153 | ) 154 | splits: Optional[List[Split]] = Field( 155 | None, 156 | description='Define all partitions of the dataset (train, test, validation, etc.)', 157 | ) 158 | keys: Optional[List[Key]] = Field( 159 | None, description='Keys describing how to load the data' 160 | ) 161 | 162 | 163 | class Projects(BaseModel): 164 | class Config: 165 | extra = Extra.forbid 166 | 167 | nanomfg: Optional[Nanomfg] = Field(None, description='Project block for NanoMFG.') 168 | verde: Optional[Verde] = Field(None, description='VERDE calculation outputs') 169 | foundry: Optional[Foundry] = Field( 170 | None, description='Project block for Foundry datasets.' 171 | ) 172 | 173 | 174 | class ProjectsBlock(BaseModel): 175 | projects: Optional[Projects] = Field( 176 | None, description='External organization-defined block.' 177 | ) 178 | -------------------------------------------------------------------------------- /foundry/loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/foundry/loaders/__init__.py -------------------------------------------------------------------------------- /foundry/loaders/tf_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.keras.utils import Sequence 3 | 4 | 5 | class TensorflowSequence(Sequence): 6 | """Foundry Dataset Converted to Tensorflow Format""" 7 | 8 | def __init__(self, inputs, targets): 9 | self.inputs = inputs 10 | self.targets = targets 11 | 12 | def __len__(self): 13 | return len(self.inputs[0]) 14 | 15 | def __getitem__(self, idx): 16 | item = {"input": [], "target": []} 17 | 18 | for input in self.inputs: 19 | item["input"].append(np.array(input[idx])) 20 | item["input"] = np.array(item["input"]) 21 | 22 | for target in self.targets: 23 | item["target"].append(np.array(target[idx])) 24 | item["target"] = np.array(item["target"]) 25 | 26 | return item 27 | -------------------------------------------------------------------------------- /foundry/loaders/torch_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class TorchDataset(Dataset): 6 | """Foundry Dataset Converted to Pytorch Format""" 7 | 8 | def __init__(self, inputs, targets): 9 | self.inputs = inputs 10 | self.targets = targets 11 | 12 | def __len__(self): 13 | return len(self.inputs[0]) 14 | 15 | def __getitem__(self, idx): 16 | item = {"input": [], "target": []} 17 | 18 | # adds the correct item at index idx from each input from self.inputs to the item dictionary 19 | for input in self.inputs: 20 | item["input"].append(np.array(input[idx])) 21 | item["input"] = np.array(item["input"]) 22 | 23 | # adds the correct item at index idx from each target from self.targets to the item dictionary 24 | for target in self.targets: 25 | item["target"].append(np.array(target[idx])) 26 | item["target"] = np.array(item["target"]) 27 | 28 | return item 29 | -------------------------------------------------------------------------------- /foundry/models.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import json 3 | from 
json2table import convert 4 | import logging 5 | import pandas as pd 6 | from pydantic import BaseModel, Field, Extra, ValidationError 7 | from typing import Optional, Any, Dict 8 | 9 | from .jsonschema_models.dc_model import Dc1 as DataciteModel 10 | from .jsonschema_models.project_model import Foundry as FoundryModel 11 | 12 | logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Classes for Foundry Data Package Specification 17 | class FoundrySpecificationDataset(BaseModel): 18 | """Pydantic base class for datasets within the Foundry data package specification""" 19 | 20 | name: Optional[str] = None 21 | provider: Optional[str] = Field(default="MDF") 22 | version: Optional[str] = None 23 | 24 | 25 | class FoundrySpecification(BaseModel): 26 | """Pydantic base class for interacting with the Foundry data package specification 27 | The specification provides a way to group datasets and manage versions 28 | """ 29 | 30 | name: str = Field(default="") 31 | version: str = Field(default="") 32 | description: str = Field(default="") 33 | private: bool = Field(default=False) 34 | dependencies: Dict[str, str] = Field(default_factory=dict) 35 | 36 | def add_dependency(self, name: str, version: str): 37 | self.dependencies[name] = version 38 | 39 | def remove_duplicate_dependencies(self): 40 | deps = [{"name": key, "version": self.dependencies[key]} 41 | for key in self.dependencies] 42 | df = pd.DataFrame.from_records(deps) 43 | self.clear_dependencies() 44 | for _, row in df.drop_duplicates().iterrows(): 45 | self.add_dependency(name=row["name"], version=row["version"]) 46 | 47 | def clear_dependencies(self): 48 | self.dependencies.clear() 49 | 50 | def model_dump(self): 51 | return json.loads(self.model_dump_json()) 52 | 53 | def _repr_html_(self): 54 | buf = f'
<h3>Data Requirements - {self.name}</h3>
' 55 | buf = buf + convert(self.model_dump()) 56 | return buf 57 | 58 | 59 | class FoundryDatasetType(Enum): 60 | """Foundry Dataset Types 61 | Enumeration of the possible Foundry dataset types 62 | """ 63 | 64 | tabular = "tabular" 65 | files = "files" 66 | hdf5 = "hdf5" 67 | other = "other" 68 | 69 | 70 | class FoundrySchema(FoundryModel): 71 | """ 72 | A model for the Foundry schema based on the FoundryModel (project_model.py) class. 73 | """ 74 | 75 | def __init__(self, project_dict: Dict[str, Any]): 76 | try: 77 | super().__init__(**project_dict) 78 | except ValidationError as e: 79 | print("FoundrySchema validation failed!") 80 | for error in e.errors(): 81 | field_name = ".".join([str(item) for item in error['loc']]) 82 | error_description = error['msg'] 83 | error_message = f"""There is an issue validating the entry for the field '{field_name}': 84 | The error message returned is: '{error_description}'. 85 | The description for this field is: '{FoundryModel.model_json_schema()['properties'][field_name]['description']}'""" 86 | print(error_message) 87 | raise e 88 | 89 | 90 | class FoundryDatacite(DataciteModel): 91 | """ 92 | A model for the Datacite schema based on the Datacite (dc_model.py) class. 93 | """ 94 | def __init__(self, datacite_dict: Dict[str, Any], **kwargs): 95 | try: 96 | dc_dict = datacite_dict.copy() 97 | if 'identifier' in dc_dict: 98 | if isinstance(dc_dict['identifier'], dict) and 'identifier' in dc_dict['identifier']: 99 | if isinstance(dc_dict['identifier']['identifier'], dict) and '__root__' in dc_dict['identifier']['identifier']: 100 | dc_dict['identifier']['identifier'] = dc_dict['identifier']['identifier']['__root__'] 101 | super().__init__(**dc_dict, **kwargs) 102 | except ValidationError as e: 103 | print("Datacite validation failed!") 104 | for error in e.errors(): 105 | field_name = ".".join(str(loc) for loc in error["loc"]) 106 | error_description = error['msg'] 107 | error_message = f"""There is an issue validating the entry for the field '{field_name}': 108 | The error message returned is: '{error_description}'. 
109 | The description is: '{self.model_json_schema()['properties'].get(field_name, {}).get('description', 'No description available')}'""" 110 | print(error_message) 111 | raise e 112 | 113 | 114 | class FoundryBase(BaseModel): 115 | """ 116 | Configuration information for Foundry instance 117 | """ 118 | 119 | dataframe_file: Optional[str] = Field(default="foundry_dataframe.json") 120 | data_file: Optional[str] = Field(default="foundry.hdf5") 121 | metadata_file: Optional[str] = Field(default="foundry_metadata.json") 122 | destination_endpoint: Optional[str] = None 123 | local: Optional[bool] = Field(default=False) 124 | local_cache_dir: str = Field(default="./data") 125 | metadata_key: Optional[str] = Field(default="foundry") 126 | organization: Optional[str] = Field(default="Foundry") 127 | 128 | class Config: 129 | extra = Extra.allow 130 | 131 | def model_dump(self): 132 | return json.loads(self.model_dump_json()) 133 | 134 | def _repr_html_(self): 135 | return convert(self.model_dump()) 136 | -------------------------------------------------------------------------------- /foundry/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def is_pandas_pytable(group): 5 | if 'axis0' in group.keys() and 'axis1' in group.keys(): 6 | return True 7 | else: 8 | return False 9 | 10 | 11 | def is_doi(string: str): 12 | if string.startswith('10.') or string.startswith('https://doi.org/'): 13 | return True 14 | else: 15 | return False 16 | 17 | 18 | def _read_json(path_to_file, lines=False): 19 | """Read JSON file 20 | 21 | Arguments: 22 | path_to_file (string): Path to JSON file 23 | 24 | Returns: (dict) JSON file contents 25 | """ 26 | df = pd.read_json(path_to_file, lines=lines) 27 | return df 28 | 29 | 30 | def _read_csv(path_to_file): 31 | """Read CSV file 32 | 33 | Arguments: 34 | path_to_file (string): Path to CSV file 35 | 36 | Returns: (dict) CSV file contents 37 | """ 38 | return pd.read_csv(path_to_file) 39 | 40 | 41 | def _read_excel(path_to_file): 42 | return pd.read_excel(path_to_file) 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | globus-sdk>=3,<4 2 | dlhub_sdk>=2.1.0 3 | requests>=2.18.4 4 | tqdm>=4.19.4 5 | six>=1.11.0 6 | h5py>=2.10.0 7 | numpy>=1.15.4 8 | pandas>=0.23.4 9 | scikit-learn>=1.0 10 | pydantic>=2.7.2 11 | mdf_forge>=0.8.0 12 | mdf-connect-client>=0.5.0 13 | json2table>=1.1.5 14 | torch>=1.8.0 15 | tensorflow>=2 16 | tqdm>=4.64 17 | openpyxl>=3.1.0 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git,*.egg* 3 | max-line-length = 150 4 | per-file-ignores = 5 | foundry/jsonschema_models/*:E501 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | packages = (setuptools.find_packages(),) 6 | setuptools.setup( 7 | name="foundry_ml", 8 | version="1.0.4", 9 | author="""Aristana Scourtas, KJ Schmidt, Isaac Darling, Aadit Ambadkar, Braeden Cullen, 10 | Imogen Foster, Ribhav Bose, Zoa Katok, Ethan Truelove, Ian Foster, Ben Blaiszik""", 11 | author_email="blaiszik@uchicago.edu", 12 | 
packages=setuptools.find_packages(), 13 | description="Package to support simplified application of machine learning models to datasets in materials science", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | install_requires=[ 17 | "mdf_forge>=0.8.0", 18 | "globus-sdk>=3,<4", 19 | "dlhub_sdk>=1.0.0", 20 | "numpy>=1.15.4", 21 | "pandas>=0.23.4", 22 | "pydantic>=2.7.2", 23 | "mdf_connect_client>=0.5.0", 24 | "h5py>=2.10.0", 25 | "json2table", 26 | "openpyxl>=3.1.0" 27 | ], 28 | python_requires=">=3.7", 29 | classifiers=[ 30 | "Development Status :: 3 - Alpha", 31 | "Intended Audience :: Science/Research", 32 | "License :: OSI Approved :: MIT License", 33 | "Natural Language :: English", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Topic :: Scientific/Engineering", 37 | ], 38 | keywords=[], 39 | license="MIT License", 40 | url="https://github.com/MLMI2-CSSI/foundry", 41 | ) 42 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.4 2 | pytest-cov>=2.12 3 | pytest-mock 4 | flake8 5 | jsonschema 6 | mock 7 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/test.py -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | *NOTE: These tests are still in development, and should not be expected to properly cover all test cases yet.* 2 | ## Foundry Tests 3 | This directory contains the tests for the `materials_foundry` package. 4 | 5 | ### Running the tests 6 | Python 3 must be installed. Go to https://www.python.org/downloads/ to download Python 3. 7 | Pytest must also be installed. To do this, run `pip install pytest`. 8 | After Pytest is installed, the tests can be executed by running `pytest` in this directory. 9 | 10 | ### About the tests 11 | These tests cover the basic and advanced functionality of the `materials_foundry` package. They test each function to check that operations succeed with expected values, error with invalid values, and respect parameters appropriately. 
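
For example, a minimal session from inside this `tests/` directory might look like the following (assuming you want the pinned test dependencies from the repository's `test-requirements.txt`):

```
pip install -r ../test-requirements.txt
pytest
```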
12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/tests/data/__init__.py -------------------------------------------------------------------------------- /tests/data/https_test/test_data.json: -------------------------------------------------------------------------------- 1 | "[{\"A\":0.5325016729,\"B\":0.4869994072,\"C\":0.269408499,\"D\":0.3621738306},{\"A\":0.2304825119,\"B\":0.5481912504,\"C\":0.4014871558,\"D\":0.3603053727},{\"A\":0.3507889192,\"B\":0.6226744491,\"C\":0.6091377546,\"D\":0.5165489581},{\"A\":0.5798596097,\"B\":0.7200900352,\"C\":0.9342783399,\"D\":0.0540093822},{\"A\":0.3108050843,\"B\":0.9773936853,\"C\":0.4801914079,\"D\":0.1787148388},{\"A\":0.2084711872,\"B\":0.5751155582,\"C\":0.2584603695,\"D\":0.4499059913},{\"A\":0.4759231155,\"B\":0.8092009236,\"C\":0.3072478517,\"D\":0.8837572165},{\"A\":0.2480077693,\"B\":0.4552477195,\"C\":0.9647205731,\"D\":0.2536659951},{\"A\":0.1634613944,\"B\":0.4766800168,\"C\":0.369663194,\"D\":0.9961761176},{\"A\":0.5346664051,\"B\":0.194493705,\"C\":0.4651741797,\"D\":0.7293780026},{\"A\":0.663231418,\"B\":0.4826377804,\"C\":0.5241409718,\"D\":0.1599210151},{\"A\":0.6838886399,\"B\":0.4712653511,\"C\":0.5581199413,\"D\":0.3226088546},{\"A\":0.5353026817,\"B\":0.7982157423,\"C\":0.5292073255,\"D\":0.8607700998},{\"A\":0.7164381489,\"B\":0.8843576476,\"C\":0.8875950376,\"D\":0.3435581081},{\"A\":0.664698152,\"B\":0.8538449824,\"C\":0.8392907092,\"D\":0.3113757281},{\"A\":0.6479963522,\"B\":0.9540320749,\"C\":0.5325504287,\"D\":0.8182171859},{\"A\":0.8211351808,\"B\":0.4439015651,\"C\":0.6269342472,\"D\":0.4122693317},{\"A\":0.6679538246,\"B\":0.7390494918,\"C\":0.1759150727,\"D\":0.1302475247},{\"A\":0.8377472214,\"B\":0.273212617,\"C\":0.2663859412,\"D\":0.5964344124},{\"A\":0.9082325183,\"B\":0.3137189069,\"C\":0.9204916523,\"D\":0.6981099323},{\"A\":0.0569704461,\"B\":0.5887289259,\"C\":0.9691034132,\"D\":0.864492329},{\"A\":0.8733863986,\"B\":0.5010284784,\"C\":0.8753387807,\"D\":0.2446379967},{\"A\":0.9960436279,\"B\":0.4441227796,\"C\":0.4813238954,\"D\":0.2405431943},{\"A\":0.9743124513,\"B\":0.2431975581,\"C\":0.1700467831,\"D\":0.5107508473},{\"A\":0.2659344468,\"B\":0.2298763219,\"C\":0.1679702902,\"D\":0.2976868866},{\"A\":0.4967479732,\"B\":0.8507496604,\"C\":0.1298286001,\"D\":0.0696243588},{\"A\":0.5011896631,\"B\":0.7902353379,\"C\":0.862998181,\"D\":0.2236992358},{\"A\":0.1630314398,\"B\":0.9551477474,\"C\":0.9418805628,\"D\":0.2297807846},{\"A\":0.4375272337,\"B\":0.4738682907,\"C\":0.7732553673,\"D\":0.9564487668},{\"A\":0.6313980088,\"B\":0.704010717,\"C\":0.5371476407,\"D\":0.6924966076},{\"A\":0.3307278087,\"B\":0.6970454132,\"C\":0.1032596315,\"D\":0.2564419892},{\"A\":0.4353581681,\"B\":0.6941255237,\"C\":0.0394112344,\"D\":0.4991979497},{\"A\":0.3901813924,\"B\":0.3581649708,\"C\":0.0696682115,\"D\":0.1097029952},{\"A\":0.6259185992,\"B\":0.8903284037,\"C\":0.5605368861,\"D\":0.5762697392},{\"A\":0.6068870699,\"B\":0.2540924395,\"C\":0.1086194342,\"D\":0.6033022401},{\"A\":0.31231
36578,\"B\":0.2788511046,\"C\":0.4036056534,\"D\":0.5723193255},{\"A\":0.1021738989,\"B\":0.9994909756,\"C\":0.9832543715,\"D\":0.9885624128},{\"A\":0.6785393932,\"B\":0.5788494481,\"C\":0.5004145535,\"D\":0.5129020263},{\"A\":0.4616303134,\"B\":0.2699233081,\"C\":0.1211274991,\"D\":0.2856023533},{\"A\":0.7471981109,\"B\":0.8964269606,\"C\":0.5273349805,\"D\":0.519780483},{\"A\":0.2398561128,\"B\":0.6637693729,\"C\":0.0713046133,\"D\":0.1965824806},{\"A\":0.1438138313,\"B\":0.8077715814,\"C\":0.5987424102,\"D\":0.8796307444},{\"A\":0.2099913033,\"B\":0.0014793221,\"C\":0.6282096248,\"D\":0.3344606287},{\"A\":0.2172346949,\"B\":0.8055518737,\"C\":0.7020053655,\"D\":0.2734407306},{\"A\":0.2919147983,\"B\":0.8325704254,\"C\":0.6602685898,\"D\":0.9023679814},{\"A\":0.5941575402,\"B\":0.03341885,\"C\":0.1900654781,\"D\":0.5003254697},{\"A\":0.0577672223,\"B\":0.5857695682,\"C\":0.7401711144,\"D\":0.2932093813},{\"A\":0.0441340405,\"B\":0.9699508666,\"C\":0.5688421424,\"D\":0.8265708757},{\"A\":0.5671337446,\"B\":0.1391732202,\"C\":0.8557361973,\"D\":0.95843871},{\"A\":0.7818036893,\"B\":0.7559001038,\"C\":0.9803074287,\"D\":0.5550912458},{\"A\":0.7764158003,\"B\":0.6901683866,\"C\":0.7134621122,\"D\":0.3412987906},{\"A\":0.5184392059,\"B\":0.3561712456,\"C\":0.9341091567,\"D\":0.4326865978},{\"A\":0.7478769752,\"B\":0.985482101,\"C\":0.7739615326,\"D\":0.0101972409},{\"A\":0.4681250259,\"B\":0.1031996448,\"C\":0.5414294345,\"D\":0.4261483639},{\"A\":0.1752046752,\"B\":0.1991262091,\"C\":0.677077356,\"D\":0.8059928892},{\"A\":0.7081022399,\"B\":0.5430350946,\"C\":0.4756771947,\"D\":0.2051436153},{\"A\":0.7008949898,\"B\":0.4885963791,\"C\":0.2012325862,\"D\":0.4846756182},{\"A\":0.6803875318,\"B\":0.3302139274,\"C\":0.2008848379,\"D\":0.8565463434},{\"A\":0.8215943867,\"B\":0.4007808246,\"C\":0.849984323,\"D\":0.7148225175},{\"A\":0.984314214,\"B\":0.3315224115,\"C\":0.2374709671,\"D\":0.7303792807},{\"A\":0.1544605924,\"B\":0.9114949507,\"C\":0.8431437169,\"D\":0.3078082636},{\"A\":0.4466543896,\"B\":0.7093552302,\"C\":0.0139362347,\"D\":0.6832623102},{\"A\":0.335121826,\"B\":0.017851436,\"C\":0.8177046429,\"D\":0.0868433873},{\"A\":0.3241821741,\"B\":0.352863243,\"C\":0.0782754953,\"D\":0.9690912676},{\"A\":0.6525125608,\"B\":0.6431642124,\"C\":0.2455876297,\"D\":0.4893561674},{\"A\":0.5009426045,\"B\":0.144012049,\"C\":0.9115418398,\"D\":0.6228395399},{\"A\":0.707871306,\"B\":0.088211523,\"C\":0.9318696511,\"D\":0.8569612714},{\"A\":0.4605842335,\"B\":0.5185556048,\"C\":0.9262782515,\"D\":0.7801799842},{\"A\":0.7730264146,\"B\":0.6774950976,\"C\":0.6291621329,\"D\":0.2161067579},{\"A\":0.7916767193,\"B\":0.212605389,\"C\":0.3408662965,\"D\":0.5176147758},{\"A\":0.7025679202,\"B\":0.0851145247,\"C\":0.1140933764,\"D\":0.7167199974},{\"A\":0.9758435877,\"B\":0.6312956197,\"C\":0.4144884024,\"D\":0.5930438643},{\"A\":0.1998500366,\"B\":0.079361875,\"C\":0.5949318443,\"D\":0.0516943591},{\"A\":0.9872288449,\"B\":0.2349307202,\"C\":0.5721875354,\"D\":0.1446492501},{\"A\":0.8438844171,\"B\":0.4464399495,\"C\":0.3444058338,\"D\":0.4428694837},{\"A\":0.3778647292,\"B\":0.3380841433,\"C\":0.5285966196,\"D\":0.0594458212},{\"A\":0.7635524601,\"B\":0.6418464458,\"C\":0.7063874264,\"D\":0.1375340887},{\"A\":0.0092013791,\"B\":0.4869340269,\"C\":0.7725304702,\"D\":0.6235075271},{\"A\":0.0774261649,\"B\":0.5042933554,\"C\":0.7095936633,\"D\":0.4012486987},{\"A\":0.9215810197,\"B\":0.0154472261,\"C\":0.2654161552,\"D\":0.3247884855},{\"A\":0.393765934,\"B\":0.481696696,\"C\":0.8731618709,\"D\":0.758867711},{
\"A\":0.4745454185,\"B\":0.0666828682,\"C\":0.2043568046,\"D\":0.7433732038},{\"A\":0.7717461404,\"B\":0.112668368,\"C\":0.8416286193,\"D\":0.0254580519},{\"A\":0.3025883997,\"B\":0.3437180802,\"C\":0.2369378307,\"D\":0.8939787727},{\"A\":0.3484336427,\"B\":0.3910067643,\"C\":0.0953904485,\"D\":0.3651110205},{\"A\":0.2110935156,\"B\":0.4636447284,\"C\":0.9283017709,\"D\":0.0105194739},{\"A\":0.7394658063,\"B\":0.3301475445,\"C\":0.9340923108,\"D\":0.5463739846},{\"A\":0.2309639435,\"B\":0.5585589948,\"C\":0.2390889386,\"D\":0.9918534523},{\"A\":0.9987103314,\"B\":0.0906074135,\"C\":0.872042477,\"D\":0.2227486275},{\"A\":0.1443925385,\"B\":0.0679658547,\"C\":0.0935615945,\"D\":0.840750406},{\"A\":0.0943707276,\"B\":0.9048926926,\"C\":0.2245580652,\"D\":0.3529395385},{\"A\":0.4725164841,\"B\":0.8412799321,\"C\":0.6457602779,\"D\":0.0037145716},{\"A\":0.4405258389,\"B\":0.085446825,\"C\":0.2832068609,\"D\":0.5431120155},{\"A\":0.4215098076,\"B\":0.6496444076,\"C\":0.8175635963,\"D\":0.6856483039},{\"A\":0.6176578205,\"B\":0.9106344265,\"C\":0.8360707837,\"D\":0.4640646558},{\"A\":0.123294081,\"B\":0.1690605358,\"C\":0.8352657504,\"D\":0.5192123665},{\"A\":0.455456584,\"B\":0.1734614195,\"C\":0.385524538,\"D\":0.8626150199},{\"A\":0.5662762499,\"B\":0.6534325729,\"C\":0.8660321272,\"D\":0.8059199064},{\"A\":0.7471802655,\"B\":0.6614166044,\"C\":0.3731012478,\"D\":0.6285728953},{\"A\":0.2486054865,\"B\":0.8962429634,\"C\":0.5485535981,\"D\":0.6525742063}]" -------------------------------------------------------------------------------- /tests/data/tmp_data.json: -------------------------------------------------------------------------------- 1 | "[{\"A\":0.5325016729,\"B\":0.4869994072,\"C\":0.269408499,\"D\":0.3621738306},{\"A\":0.2304825119,\"B\":0.5481912504,\"C\":0.4014871558,\"D\":0.3603053727},{\"A\":0.3507889192,\"B\":0.6226744491,\"C\":0.6091377546,\"D\":0.5165489581},{\"A\":0.5798596097,\"B\":0.7200900352,\"C\":0.9342783399,\"D\":0.0540093822},{\"A\":0.3108050843,\"B\":0.9773936853,\"C\":0.4801914079,\"D\":0.1787148388},{\"A\":0.2084711872,\"B\":0.5751155582,\"C\":0.2584603695,\"D\":0.4499059913},{\"A\":0.4759231155,\"B\":0.8092009236,\"C\":0.3072478517,\"D\":0.8837572165},{\"A\":0.2480077693,\"B\":0.4552477195,\"C\":0.9647205731,\"D\":0.2536659951},{\"A\":0.1634613944,\"B\":0.4766800168,\"C\":0.369663194,\"D\":0.9961761176},{\"A\":0.5346664051,\"B\":0.194493705,\"C\":0.4651741797,\"D\":0.7293780026},{\"A\":0.663231418,\"B\":0.4826377804,\"C\":0.5241409718,\"D\":0.1599210151},{\"A\":0.6838886399,\"B\":0.4712653511,\"C\":0.5581199413,\"D\":0.3226088546},{\"A\":0.5353026817,\"B\":0.7982157423,\"C\":0.5292073255,\"D\":0.8607700998},{\"A\":0.7164381489,\"B\":0.8843576476,\"C\":0.8875950376,\"D\":0.3435581081},{\"A\":0.664698152,\"B\":0.8538449824,\"C\":0.8392907092,\"D\":0.3113757281},{\"A\":0.6479963522,\"B\":0.9540320749,\"C\":0.5325504287,\"D\":0.8182171859},{\"A\":0.8211351808,\"B\":0.4439015651,\"C\":0.6269342472,\"D\":0.4122693317},{\"A\":0.6679538246,\"B\":0.7390494918,\"C\":0.1759150727,\"D\":0.1302475247},{\"A\":0.8377472214,\"B\":0.273212617,\"C\":0.2663859412,\"D\":0.5964344124},{\"A\":0.9082325183,\"B\":0.3137189069,\"C\":0.9204916523,\"D\":0.6981099323},{\"A\":0.0569704461,\"B\":0.5887289259,\"C\":0.9691034132,\"D\":0.864492329},{\"A\":0.8733863986,\"B\":0.5010284784,\"C\":0.8753387807,\"D\":0.2446379967},{\"A\":0.9960436279,\"B\":0.4441227796,\"C\":0.4813238954,\"D\":0.2405431943},{\"A\":0.9743124513,\"B\":0.2431975581,\"C\":0.1700467831,\"D\":0.5107508473},{\"A\":0.2659344468
,\"B\":0.2298763219,\"C\":0.1679702902,\"D\":0.2976868866},{\"A\":0.4967479732,\"B\":0.8507496604,\"C\":0.1298286001,\"D\":0.0696243588},{\"A\":0.5011896631,\"B\":0.7902353379,\"C\":0.862998181,\"D\":0.2236992358},{\"A\":0.1630314398,\"B\":0.9551477474,\"C\":0.9418805628,\"D\":0.2297807846},{\"A\":0.4375272337,\"B\":0.4738682907,\"C\":0.7732553673,\"D\":0.9564487668},{\"A\":0.6313980088,\"B\":0.704010717,\"C\":0.5371476407,\"D\":0.6924966076},{\"A\":0.3307278087,\"B\":0.6970454132,\"C\":0.1032596315,\"D\":0.2564419892},{\"A\":0.4353581681,\"B\":0.6941255237,\"C\":0.0394112344,\"D\":0.4991979497},{\"A\":0.3901813924,\"B\":0.3581649708,\"C\":0.0696682115,\"D\":0.1097029952},{\"A\":0.6259185992,\"B\":0.8903284037,\"C\":0.5605368861,\"D\":0.5762697392},{\"A\":0.6068870699,\"B\":0.2540924395,\"C\":0.1086194342,\"D\":0.6033022401},{\"A\":0.3123136578,\"B\":0.2788511046,\"C\":0.4036056534,\"D\":0.5723193255},{\"A\":0.1021738989,\"B\":0.9994909756,\"C\":0.9832543715,\"D\":0.9885624128},{\"A\":0.6785393932,\"B\":0.5788494481,\"C\":0.5004145535,\"D\":0.5129020263},{\"A\":0.4616303134,\"B\":0.2699233081,\"C\":0.1211274991,\"D\":0.2856023533},{\"A\":0.7471981109,\"B\":0.8964269606,\"C\":0.5273349805,\"D\":0.519780483},{\"A\":0.2398561128,\"B\":0.6637693729,\"C\":0.0713046133,\"D\":0.1965824806},{\"A\":0.1438138313,\"B\":0.8077715814,\"C\":0.5987424102,\"D\":0.8796307444},{\"A\":0.2099913033,\"B\":0.0014793221,\"C\":0.6282096248,\"D\":0.3344606287},{\"A\":0.2172346949,\"B\":0.8055518737,\"C\":0.7020053655,\"D\":0.2734407306},{\"A\":0.2919147983,\"B\":0.8325704254,\"C\":0.6602685898,\"D\":0.9023679814},{\"A\":0.5941575402,\"B\":0.03341885,\"C\":0.1900654781,\"D\":0.5003254697},{\"A\":0.0577672223,\"B\":0.5857695682,\"C\":0.7401711144,\"D\":0.2932093813},{\"A\":0.0441340405,\"B\":0.9699508666,\"C\":0.5688421424,\"D\":0.8265708757},{\"A\":0.5671337446,\"B\":0.1391732202,\"C\":0.8557361973,\"D\":0.95843871},{\"A\":0.7818036893,\"B\":0.7559001038,\"C\":0.9803074287,\"D\":0.5550912458},{\"A\":0.7764158003,\"B\":0.6901683866,\"C\":0.7134621122,\"D\":0.3412987906},{\"A\":0.5184392059,\"B\":0.3561712456,\"C\":0.9341091567,\"D\":0.4326865978},{\"A\":0.7478769752,\"B\":0.985482101,\"C\":0.7739615326,\"D\":0.0101972409},{\"A\":0.4681250259,\"B\":0.1031996448,\"C\":0.5414294345,\"D\":0.4261483639},{\"A\":0.1752046752,\"B\":0.1991262091,\"C\":0.677077356,\"D\":0.8059928892},{\"A\":0.7081022399,\"B\":0.5430350946,\"C\":0.4756771947,\"D\":0.2051436153},{\"A\":0.7008949898,\"B\":0.4885963791,\"C\":0.2012325862,\"D\":0.4846756182},{\"A\":0.6803875318,\"B\":0.3302139274,\"C\":0.2008848379,\"D\":0.8565463434},{\"A\":0.8215943867,\"B\":0.4007808246,\"C\":0.849984323,\"D\":0.7148225175},{\"A\":0.984314214,\"B\":0.3315224115,\"C\":0.2374709671,\"D\":0.7303792807},{\"A\":0.1544605924,\"B\":0.9114949507,\"C\":0.8431437169,\"D\":0.3078082636},{\"A\":0.4466543896,\"B\":0.7093552302,\"C\":0.0139362347,\"D\":0.6832623102},{\"A\":0.335121826,\"B\":0.017851436,\"C\":0.8177046429,\"D\":0.0868433873},{\"A\":0.3241821741,\"B\":0.352863243,\"C\":0.0782754953,\"D\":0.9690912676},{\"A\":0.6525125608,\"B\":0.6431642124,\"C\":0.2455876297,\"D\":0.4893561674},{\"A\":0.5009426045,\"B\":0.144012049,\"C\":0.9115418398,\"D\":0.6228395399},{\"A\":0.707871306,\"B\":0.088211523,\"C\":0.9318696511,\"D\":0.8569612714},{\"A\":0.4605842335,\"B\":0.5185556048,\"C\":0.9262782515,\"D\":0.7801799842},{\"A\":0.7730264146,\"B\":0.6774950976,\"C\":0.6291621329,\"D\":0.2161067579},{\"A\":0.7916767193,\"B\":0.212605389,\"C\":0.3408662965,\"D\":0.5176147758},{\"A
\":0.7025679202,\"B\":0.0851145247,\"C\":0.1140933764,\"D\":0.7167199974},{\"A\":0.9758435877,\"B\":0.6312956197,\"C\":0.4144884024,\"D\":0.5930438643},{\"A\":0.1998500366,\"B\":0.079361875,\"C\":0.5949318443,\"D\":0.0516943591},{\"A\":0.9872288449,\"B\":0.2349307202,\"C\":0.5721875354,\"D\":0.1446492501},{\"A\":0.8438844171,\"B\":0.4464399495,\"C\":0.3444058338,\"D\":0.4428694837},{\"A\":0.3778647292,\"B\":0.3380841433,\"C\":0.5285966196,\"D\":0.0594458212},{\"A\":0.7635524601,\"B\":0.6418464458,\"C\":0.7063874264,\"D\":0.1375340887},{\"A\":0.0092013791,\"B\":0.4869340269,\"C\":0.7725304702,\"D\":0.6235075271},{\"A\":0.0774261649,\"B\":0.5042933554,\"C\":0.7095936633,\"D\":0.4012486987},{\"A\":0.9215810197,\"B\":0.0154472261,\"C\":0.2654161552,\"D\":0.3247884855},{\"A\":0.393765934,\"B\":0.481696696,\"C\":0.8731618709,\"D\":0.758867711},{\"A\":0.4745454185,\"B\":0.0666828682,\"C\":0.2043568046,\"D\":0.7433732038},{\"A\":0.7717461404,\"B\":0.112668368,\"C\":0.8416286193,\"D\":0.0254580519},{\"A\":0.3025883997,\"B\":0.3437180802,\"C\":0.2369378307,\"D\":0.8939787727},{\"A\":0.3484336427,\"B\":0.3910067643,\"C\":0.0953904485,\"D\":0.3651110205},{\"A\":0.2110935156,\"B\":0.4636447284,\"C\":0.9283017709,\"D\":0.0105194739},{\"A\":0.7394658063,\"B\":0.3301475445,\"C\":0.9340923108,\"D\":0.5463739846},{\"A\":0.2309639435,\"B\":0.5585589948,\"C\":0.2390889386,\"D\":0.9918534523},{\"A\":0.9987103314,\"B\":0.0906074135,\"C\":0.872042477,\"D\":0.2227486275},{\"A\":0.1443925385,\"B\":0.0679658547,\"C\":0.0935615945,\"D\":0.840750406},{\"A\":0.0943707276,\"B\":0.9048926926,\"C\":0.2245580652,\"D\":0.3529395385},{\"A\":0.4725164841,\"B\":0.8412799321,\"C\":0.6457602779,\"D\":0.0037145716},{\"A\":0.4405258389,\"B\":0.085446825,\"C\":0.2832068609,\"D\":0.5431120155},{\"A\":0.4215098076,\"B\":0.6496444076,\"C\":0.8175635963,\"D\":0.6856483039},{\"A\":0.6176578205,\"B\":0.9106344265,\"C\":0.8360707837,\"D\":0.4640646558},{\"A\":0.123294081,\"B\":0.1690605358,\"C\":0.8352657504,\"D\":0.5192123665},{\"A\":0.455456584,\"B\":0.1734614195,\"C\":0.385524538,\"D\":0.8626150199},{\"A\":0.5662762499,\"B\":0.6534325729,\"C\":0.8660321272,\"D\":0.8059199064},{\"A\":0.7471802655,\"B\":0.6614166044,\"C\":0.3731012478,\"D\":0.6285728953},{\"A\":0.2486054865,\"B\":0.8962429634,\"C\":0.5485535981,\"D\":0.6525742063}]" -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | datacite_data = {'identifier': {'identifier': '10.xx/xx', 'identifierType': 'DOI'}, 5 | 'rightsList': [{'rights': 'CC-BY 4.0'}], 6 | 'creators': [{'creatorName': 'Brown, C', 7 | 'familyName': 'Brown', 8 | 'givenName': 'C'}, 9 | {'creatorName': 'Van Pelt, L', 10 | 'familyName': 'Van Pelt', 11 | 'givenName': 'L'}], 12 | 'subjects': [{'subject': 'blockheads'}, 13 | {'subject': 'foundry'}, 14 | {'subject': 'test_data'}], 15 | 'publicationYear': 2024, 16 | 'publisher': 'Materials Data Facility', 17 | 'dates': [{'date': '2024-08-03', 'dateType': 'Accepted'}], 18 | 'titles': [{'title': "You're a Good man, Charlie Brown"}], 19 | 'resourceType': {'resourceTypeGeneral': 'Dataset', 20 | 'resourceType': 'Dataset'}} 21 | 22 | 23 | valid_metadata = {"keys": [ 24 | { 25 | "key": ["sepal length (cm)"], 26 | "type": "input", 27 | "units": "cm", 28 | "description": "sepal length in Charlie Brown's zig-zag style" 29 | }, 30 | { 31 | "key": ["sepal width (cm)"], 32 | "type": "input", 33 | "units": "cm", 
34 | "description": "sepal width in Snoopy's flying ace mode" 35 | }, 36 | { 37 | "key": ["petal length (cm)"], 38 | "type": "input", 39 | "units": "cm", 40 | "description": "petal length in Linus's security blanket units" 41 | }, 42 | { 43 | "key": ["petal width (cm)"], 44 | "type": "input", 45 | "units": "cm", 46 | "description": "petal width in Lucy's psychiatric advice scale" 47 | }, 48 | { 49 | "key": ["y"], 50 | "type": "output", 51 | "units": "", 52 | "description": "flower type", 53 | "classes": [ 54 | { 55 | "label": "0", 56 | "name": "setosa" 57 | }, 58 | { 59 | "label": "1", 60 | "name": "versicolor" 61 | }, 62 | { 63 | "label": "2", 64 | "name": "virginica" 65 | } 66 | ] 67 | } 68 | ], 69 | "splits": [ 70 | {"label": "train", "path": "train_snoopy.json", "type": "train"}, 71 | {"label": "test", "path": "test_woodstock.json", "type": "test"} 72 | ], 73 | "short_name": "peanuts_iris_{:.0f}".format(datetime.now().timestamp()), 74 | "data_type": "tabular", 75 | "task_type": ["unsupervised", "generative"], 76 | "domain": ["comics", "nostalgia"], 77 | "n_items": 1000 78 | } 79 | 80 | 81 | invalid_metadata = {"oranges": [ 82 | { 83 | "key": ["sepal length (cm)"], 84 | "type": "input", 85 | "units": "cm", 86 | "description": 10 87 | }, 88 | { 89 | "key": ["sepal width (cm)"], 90 | "type": "input", 91 | "units": "cm", 92 | "description": "sepal width in unit(cm)" 93 | }, 94 | { 95 | "key": ["petal length (cm)"], 96 | "type": "input", 97 | "units": "cm", 98 | "description": "petal length in unit(cm)" 99 | }, 100 | { 101 | "key": ["petal width (cm)"], 102 | "type": "input", 103 | "units": "cm", 104 | "description": "petal width in unit(cm)" 105 | }, 106 | { 107 | "key": ["y"], 108 | "type": "output", 109 | "units": "", 110 | "description": "flower type", 111 | "classes": [ 112 | { 113 | "label": "0", 114 | "name": "setosa" 115 | }, 116 | { 117 | "label": "1", 118 | "name": "versicolor" 119 | }, 120 | { 121 | "label": "2", 122 | "name": "virginica" 123 | } 124 | ] 125 | } 126 | ], 127 | 'splits': [ 128 | {'label': 'train', 'path': 'train.json', 'type': 'train'}, 129 | {'label': 'test', 'path': 'test.json', 'type': 'test'} 130 | ], 131 | "short_name": "example_AS_iris_test_{:.0f}".format(datetime.now().timestamp()), 132 | "data_type": "tabular", 133 | 'task_type': ['unsupervised', 'generative'], 134 | 'domain': ['materials science', 'chemistry'], 135 | 'n_items': 1000 136 | } 137 | -------------------------------------------------------------------------------- /tests/test_data/test_dataset/elwood.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/tests/test_data/test_dataset/elwood.hdf5 -------------------------------------------------------------------------------- /tests/test_foundry_cache.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | import pandas as pd 4 | import pytest 5 | from unittest.mock import MagicMock 6 | 7 | from . 
import test_foundry_dataset 8 | from foundry.jsonschema_models.project_model import Split as FoundrySplit, \ 9 | Key as FoundryKey 10 | from foundry.foundry_cache import FoundryCache 11 | from foundry.models import FoundrySchema 12 | 13 | 14 | @pytest.fixture 15 | def mock_foundry_cache(): 16 | cache_dir = str(Path(__file__).parent) + '/test_data' 17 | cache = FoundryCache(forge_client=MagicMock(), 18 | transfer_client=MagicMock(), 19 | local_cache_dir=cache_dir, 20 | use_globus=False, 21 | interval=10, 22 | parallel_https=4, 23 | verbose=False) 24 | return cache 25 | 26 | 27 | @pytest.fixture 28 | def mock_nonexistent_foundry_cache(): 29 | cache_dir = str(Path(__file__).parent) + '/cheeseballs' 30 | cache = FoundryCache(forge_client=MagicMock(), 31 | transfer_client=MagicMock(), 32 | local_cache_dir=cache_dir, 33 | use_globus=False, 34 | interval=10, 35 | parallel_https=4, 36 | verbose=False) 37 | return cache 38 | 39 | 40 | def test_validate_local_dataset_storage_exists(mock_foundry_cache): 41 | cache = mock_foundry_cache 42 | dataset_name = "elwood_md_v1.2" 43 | assert cache.validate_local_dataset_storage(dataset_name) is True 44 | 45 | 46 | def test_validate_local_dataset_storage_missing_files(mock_foundry_cache): 47 | cache = mock_foundry_cache 48 | dataset_name = "elwood_md_v1.2" 49 | # Create a split with a missing file 50 | splits = [ 51 | FoundrySplit(path="file1.csv", type="train"), 52 | FoundrySplit(path="file2.csv", type="test"), 53 | FoundrySplit(path="file3.csv", type="validation") 54 | ] 55 | 56 | assert cache.validate_local_dataset_storage(dataset_name, splits) is False 57 | 58 | 59 | def test_validate_local_dataset_storage_complete(mock_foundry_cache): 60 | cache = mock_foundry_cache 61 | dataset_name = "elwood_md_v1.2" 62 | 63 | assert cache.validate_local_dataset_storage(dataset_name) is True 64 | 65 | 66 | def test_validate_local_dataset_storage_not_present(mock_nonexistent_foundry_cache): 67 | cache = mock_nonexistent_foundry_cache 68 | dataset_name = "test_dataset" 69 | 70 | assert cache.validate_local_dataset_storage(dataset_name) is False 71 | 72 | 73 | @pytest.fixture 74 | def mock_tabular_foundry_source_id(): 75 | source_id = "elwood_md_v1.2" 76 | yield source_id 77 | 78 | 79 | @pytest.fixture 80 | def mock_tabular_foundry_schema(): 81 | foundry_schema = json.loads('{"short_name": "elwood_properties", "data_type": "tabular", "task_type": ["unsupervised", "generative", "supervised"], "domain": ["materials science", "chemistry", "simulation"], "n_items": 410.0, "splits": [{"type": "train", "path": "MD_properties.csv", "label": "train"}], "keys": [{"key": ["SMILES"], "type": "input", "filter": null, "description": "Canonical SMILES string of molecule", "units": "arb", "classes": null}, {"key": ["E_coh (MPa)"], "type": "target", "filter": null, "description": "Simulated cohesive energy (in MPa)", "units": "MPa", "classes": null}, {"key": ["T_g (K)"], "type": "target", "filter": null, "description": "Simulated glass transition temperature (in Kelvin)", "units": "Kelvin", "classes": null}, {"key": ["R_gyr (A^2)"], "type": "target", "filter": null, "description": "Simulated squared radius of gyration (in Angstroms^2)", "units": "Angstrom^2", "classes": null}, {"key": ["Densities (kg/m^3)"], "type": "target", "filter": null, "description": "Simulated density (in kg/m^3)", "units": "kg/m^3", "classes": null}]}') 82 | yield foundry_schema 83 | 84 | 85 | @pytest.fixture 86 | def mock_hdf5_foundry_source_id(): 87 | source_id = "test_dataset" 88 | yield source_id 89 | 90 | 91 
| @pytest.fixture
92 | def mock_hdf5_foundry_schema():
93 |     foundry_schema = json.loads('{"short_name": "elwood_properties", "data_type": "hdf5", "task_type": ["unsupervised", "generative", "supervised"], "domain": ["materials science", "chemistry", "simulation"], "n_items": 410.0, "splits": [{"type": "train", "path": "MD_properties.csv", "label": "train"}], "keys": [{"key": ["SMILES"], "type": "input", "filter": null, "description": "Canonical SMILES string of molecule", "units": "arb", "classes": null}, {"key": ["E_coh (MPa)"], "type": "target", "filter": null, "description": "Simulated cohesive energy (in MPa)", "units": "MPa", "classes": null}, {"key": ["T_g (K)"], "type": "target", "filter": null, "description": "Simulated glass transition temperature (in Kelvin)", "units": "Kelvin", "classes": null}, {"key": ["R_gyr (A^2)"], "type": "target", "filter": null, "description": "Simulated squared radius of gyration (in Angstroms^2)", "units": "Angstrom^2", "classes": null}, {"key": ["Densities (kg/m^3)"], "type": "target", "filter": null, "description": "Simulated density (in kg/m^3)", "units": "kg/m^3", "classes": null}]}')
94 |     yield foundry_schema
95 | 
96 | 
97 | @pytest.fixture
98 | def mock_read_functions():
99 |     # NOTE: the patch block below is commented out, so this fixture is currently a no-op; tests that request it read the real files under test_data/.
100 |     def mock_read_csv(file_path):
101 |         # Mock _read_csv() to return a DataFrame with minimal example data
102 |         data = {'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']}
103 |         return pd.DataFrame(data)
104 | 
105 |     def mock_read_json(file_path):
106 |         # Mock _read_json() to return a DataFrame with minimal example data
107 |         data = {'Column1': [4, 5, 6], 'Column2': ['D', 'E', 'F']}
108 |         return pd.DataFrame(data)
109 | 
110 |     def mock_read_excel(file_path):
111 |         # Mock _read_excel() to return a DataFrame with minimal example data
112 |         data = {'Column1': [7, 8, 9], 'Column2': ['G', 'H', 'I']}
113 |         return pd.DataFrame(data)
114 | 
115 |     # # Patch the _read_csv(), _read_json(), and _read_excel() functions with the mock functions
116 |     # with patch('foundry.foundry_cache._read_csv', MagicMock(side_effect=mock_read_csv)):
117 |     #     with patch('foundry.foundry_cache._read_json', MagicMock(side_effect=mock_read_json)):
118 |     #         with patch('foundry.foundry_cache._read_excel', MagicMock(side_effect=mock_read_excel)):
119 |     #             yield
120 | 
121 | 
122 | def test_load_data_with_globus(mock_foundry_cache,
123 |                                mock_tabular_foundry_source_id,
124 |                                mock_tabular_foundry_schema):
125 |     cache = mock_foundry_cache
126 |     source_id = mock_tabular_foundry_source_id
127 |     foundry_schema = FoundrySchema(mock_tabular_foundry_schema)
128 |     cache._load_data(foundry_schema,
129 |                      file="MD_properties.csv",
130 |                      source_id=source_id,
131 |                      as_hdf5=False)
132 |     # Add assertions here
133 | 
134 | 
135 | def test_load_data_with_hdf5(mock_foundry_cache,
136 |                              mock_hdf5_foundry_schema,
137 |                              mock_read_functions,
138 |                              mock_hdf5_foundry_source_id):
139 |     cache = mock_foundry_cache
140 |     source_id = mock_hdf5_foundry_source_id
141 |     foundry_schema = FoundrySchema(mock_hdf5_foundry_schema)
142 |     cache._load_data(foundry_schema,
143 |                      file="elwood.hdf5",
144 |                      source_id=source_id,
145 |                      as_hdf5=True)
146 |     # Add assertions here
147 | 
148 | 
149 | def test_load_data_with_globus_2(mock_foundry_cache,
150 |                                  mock_tabular_foundry_schema,
151 |                                  mock_read_functions,
152 |                                  mock_tabular_foundry_source_id):
153 |     cache = mock_foundry_cache
154 |     source_id = mock_tabular_foundry_source_id
155 |     foundry_schema = FoundrySchema(mock_tabular_foundry_schema)
156 |     cache._load_data(foundry_schema,
157 |                      file="MD_properties.csv",
158 | 
source_id=source_id, 159 | as_hdf5=False) 160 | # Add assertions here 161 | 162 | 163 | def test_load_data_with_source_id(mock_foundry_cache, 164 | mock_tabular_foundry_schema, 165 | mock_read_functions, 166 | mock_hdf5_foundry_source_id): 167 | cache = mock_foundry_cache 168 | foundry_schema = FoundrySchema(mock_tabular_foundry_schema) 169 | with pytest.raises(Exception) as exc_info: 170 | cache._load_data(foundry_schema, 171 | file="MD_properties.csv", 172 | source_id="12345", 173 | as_hdf5=False) 174 | err = exc_info.value 175 | assert isinstance(err, FileNotFoundError) 176 | -------------------------------------------------------------------------------- /tests/test_foundry_components.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | import os 4 | 5 | from mdf_connect_client import MDFConnectClient 6 | from mdf_forge import Forge 7 | import mdf_toolbox 8 | import pandas as pd 9 | 10 | from foundry import foundry 11 | 12 | is_gha = os.getenv("GITHUB_ACTIONS") 13 | client_id = os.getenv("CLIENT_ID") 14 | client_secret = os.getenv("CLIENT_SECRET") 15 | 16 | 17 | @pytest.fixture 18 | def auths(): 19 | services = [ 20 | "data_mdf", 21 | "mdf_connect", 22 | "search", 23 | "petrel", 24 | "transfer", 25 | "openid", 26 | "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all", # funcx 27 | "https://auth.globus.org/scopes/f10a69a9-338c-4e5b-baa1-0dc92359ab47/https", # Eagle HTTPS 28 | "https://auth.globus.org/scopes/82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/https", # NCSA HTTPS 29 | "https://auth.globus.org/scopes/d31d4f5d-be37-4adc-a761-2f716b7af105/action_all", # Globus Search Lambda 30 | ] 31 | 32 | if is_gha: 33 | auths = mdf_toolbox.confidential_login(client_id=client_id, 34 | client_secret=client_secret, 35 | services=services, make_clients=True) 36 | 37 | search_auth = mdf_toolbox.confidential_login(client_id=client_id, 38 | client_secret=client_secret, 39 | services=["search"], make_clients=False) 40 | else: 41 | auths = mdf_toolbox.login(services=services, make_clients=True) 42 | search_auth = mdf_toolbox.login(services=["search"], make_clients=False) 43 | 44 | auths['search_authorizer'] = search_auth['search'] 45 | 46 | yield auths 47 | 48 | 49 | @pytest.fixture() 50 | def testing_data_dir(): 51 | return str(Path(__file__).parent) + '/test_data' 52 | 53 | 54 | @pytest.fixture 55 | def elwood_data(): 56 | test_dataset_name = "elwood_md_v1.2" 57 | test_doi = "10.18126/8p6m-e135" 58 | expected_title = "Project Elwood: MD Simulated Monomer Properties" 59 | yield test_dataset_name, test_doi, expected_title 60 | 61 | 62 | @pytest.fixture 63 | def iris_data(): 64 | pub_test_dataset = "_test_foundry_iris_dev_v2.1" 65 | pub_expected_title = "Iris Dataset" 66 | yield pub_test_dataset, pub_expected_title 67 | 68 | 69 | # FoundryCache testing 70 | 71 | def test_loading_as_dict(auths, elwood_data, testing_data_dir): 72 | # test loading the dataset from a local (static) copy 73 | test_dataset_name, test_doi, expected_title = elwood_data 74 | 75 | f = foundry.Foundry(authorizers=auths, local_cache_dir=testing_data_dir) 76 | search_results = f.search(test_dataset_name, as_list=True) 77 | elwood_data = search_results[0].get_as_dict() 78 | X, y = elwood_data['train'] 79 | 80 | assert len(X) > 1 81 | assert isinstance(X, pd.DataFrame) 82 | assert len(y) > 1 83 | assert isinstance(y, pd.DataFrame) 84 | 85 | 86 | def test_foundry_init(auths, elwood_data): 87 | test_dataset_name, test_doi, expected_title = 
elwood_data 88 | 89 | f = foundry.Foundry(authorizers=auths) 90 | assert isinstance(f.forge_client, Forge) 91 | assert isinstance(f.connect_client, MDFConnectClient) 92 | 93 | def test_search(auths, elwood_data): 94 | test_dataset_name, test_doi, expected_title = elwood_data 95 | 96 | f = foundry.Foundry(authorizers=auths) 97 | q = "Elwood" 98 | ds = f.search(q) 99 | 100 | assert isinstance(ds, pd.DataFrame) 101 | assert len(ds) > 0 102 | 103 | dataset = ds.iloc[0].FoundryDataset 104 | 105 | # assert ds.iloc[0]['name'] is not None 106 | assert dataset.dc.titles[0].title == expected_title 107 | 108 | # assert ds.iloc[0]['source_id'] is not None 109 | assert dataset.dataset_name == test_dataset_name 110 | 111 | # assert ds.iloc[0]['year'] is not None 112 | assert dataset.dc.publicationYear is not None 113 | 114 | 115 | def test_search_as_list(auths, elwood_data): 116 | auths = auths 117 | test_dataset_name, test_doi, expected_title = elwood_data 118 | 119 | f = foundry.Foundry(authorizers=auths) 120 | q = "Elwood" 121 | ds = f.search(q, as_list=True) 122 | 123 | assert isinstance(ds, list) 124 | assert len(ds) > 0 125 | 126 | dataset = ds[0] 127 | 128 | # assert ds.iloc[0]['name'] is not None 129 | assert dataset.dc.titles[0].title == expected_title 130 | 131 | # assert ds.iloc[0]['source_id'] is not None 132 | assert dataset.dataset_name == test_dataset_name 133 | 134 | # assert ds.iloc[0]['year'] is not None 135 | assert dataset.dc.publicationYear is not None 136 | 137 | 138 | def test_search_limit(auths, elwood_data): 139 | f = foundry.Foundry(authorizers=auths) 140 | ds = f.search(limit=10) 141 | 142 | assert isinstance(ds, pd.DataFrame) 143 | assert len(ds) == 10 144 | 145 | 146 | @pytest.mark.skipif(bool(is_gha), reason="pytest.raises seems to cause issues in GHA?") 147 | def test_search_no_results(): 148 | f = foundry.Foundry() 149 | 150 | with pytest.raises(Exception) as exc_info: 151 | f.search('chewbacca') 152 | 153 | err = exc_info.value 154 | assert hasattr(err, '__cause__') 155 | -------------------------------------------------------------------------------- /tests/test_foundry_dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | 4 | from foundry import foundry 5 | from tests.test_data import datacite_data, valid_metadata 6 | 7 | 8 | def test_dataset_instantiation(): 9 | ds = foundry.FoundryDataset(dataset_name='peanuts', 10 | foundry_schema=valid_metadata, 11 | datacite_entry=datacite_data) 12 | 13 | assert ds.foundry_schema is not None 14 | 15 | 16 | def test_dataset_instantiation_broken_dc(): 17 | broken_datacite = datacite_data.copy() 18 | broken_datacite.pop('creators') 19 | with pytest.raises(Exception) as exc_info: 20 | foundry.FoundryDataset(dataset_name='peanuts', 21 | foundry_schema=valid_metadata, 22 | datacite_entry=broken_datacite) 23 | print(f'ERROR: {exc_info.value}') 24 | assert "field required" in str(exc_info.value) 25 | 26 | 27 | def test_add_non_existent_data_to_dataset(): 28 | ds = foundry.FoundryDataset(dataset_name='peanuts', 29 | foundry_schema=valid_metadata, 30 | datacite_entry=datacite_data) 31 | 32 | with pytest.raises(ValueError) as exc_info: 33 | ds.add_data(local_data_path='./test_data/iris.csv') 34 | print(f'ERROR: {exc_info.value}') 35 | assert "local path" in str(exc_info.value) 36 | 37 | 38 | def test_add_data_folder_to_dataset(): 39 | ds = foundry.FoundryDataset(dataset_name='peanuts', 40 | foundry_schema=valid_metadata, 41 | 
datacite_entry=datacite_data)
42 |     dir_path = str(Path(__file__).parent) + '/test_data/test_dataset'
43 |     ds.add_data(local_data_path=dir_path)
44 |     assert hasattr(ds, '_local_data_path')
45 | 
46 | 
47 | def test_add_data_file_to_dataset():
48 |     ds = foundry.FoundryDataset(dataset_name='peanuts',
49 |                                 foundry_schema=valid_metadata,
50 |                                 datacite_entry=datacite_data)
51 |     file_path = str(Path(__file__).parent) + '/test_data/test_dataset/elwood.hdf5'
52 |     ds.add_data(local_data_path=file_path)
53 |     assert hasattr(ds, '_local_data_path')
54 | -------------------------------------------------------------------------------- /tests/test_https_download.py: --------------------------------------------------------------------------------
1 | # import os
2 | # import requests
3 | # import mock
4 | 
5 | # from foundry.https_download import download_file
6 | 
7 | 
8 | # def test_download_file(tmp_path):
9 | #     item = {
10 | #         "path": tmp_path,
11 | #         "name": "example_file.txt"
12 | #     }
13 | #     data_directory = tmp_path
14 | #     https_config = {
15 | #         "base_url": "https://example.com/",
16 | #         "source_id": "12345"
17 | #     }
18 | 
19 | #     # Mock the requests.get function to return a response with content
20 | #     with mock.patch.object(requests, "get") as mock_get:
21 | #         mock_get.return_value.content = b"Example file content"
22 | 
23 | #         # Call the function
24 | #         result = download_file(item, data_directory, https_config)
25 | 
26 | #     # Assert that the file was downloaded and written correctly
27 | #     assert os.path.exists(str(tmp_path) + "/12345/example_file.txt")
28 | #     with open(str(tmp_path) + "/12345/example_file.txt", "rb") as f:
29 | #         assert f.read() == b"Example file content"
30 | 
31 | #     # Assert that the result is as expected
32 | #     assert result == {str(tmp_path) + "/12345/example_file.txt status": True}
33 | 
34 | 
35 | # def test_download_file_with_existing_directories(tmp_path):
36 | #     temp_path_to_file = str(tmp_path) + '/file'
37 | #     os.mkdir(temp_path_to_file)
38 | #     temp_path_to_data = str(tmp_path) + '/data'
39 | #     os.mkdir(temp_path_to_data)
40 | 
41 | #     item = {
42 | #         "path": temp_path_to_file,
43 | #         "name": "example_file.txt"
44 | #     }
45 | #     data_directory = temp_path_to_data
46 | #     https_config = {
47 | #         "base_url": "https://example.com/",
48 | #         "source_id": "12345"
49 | #     }
50 | 
51 | #     # Create the parent directories
52 | #     os.makedirs(temp_path_to_data + "/12345")
53 | 
54 | #     # Mock the requests.get function to return a response with content
55 | #     with mock.patch.object(requests, "get") as mock_get:
56 | #         mock_get.return_value.content = b"Example file content"
57 | 
58 | #         # Call the function
59 | #         result = download_file(item, data_directory, https_config)
60 | 
61 | #     # Assert that the file was downloaded and written correctly
62 | #     assert os.path.exists(temp_path_to_data + "/12345/example_file.txt")
63 | #     with open(temp_path_to_data + "/12345/example_file.txt", "rb") as f:
64 | #         assert f.read() == b"Example file content"
65 | 
66 | #     # Assert that the result is as expected
67 | #     assert result == {temp_path_to_data + "/12345/example_file.txt status": True}
68 | --------------------------------------------------------------------------------
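
The two tests in tests/test_https_download.py above were disabled along with their third-party mock dependency. Below is a minimal revival using only the standard library. It assumes, per the commented-out code and not verified against the current API, that foundry.https_download.download_file(item, data_directory, https_config) still exists with that signature and writes the payload to <data_directory>/<source_id>/<name>.

import os
from unittest import mock

import requests

from foundry.https_download import download_file


def test_download_file_writes_mocked_content(tmp_path):
    # Hypothetical revival of the disabled test above; download_file's
    # signature and output path are assumptions carried over from the
    # commented-out original.
    item = {"path": str(tmp_path), "name": "example_file.txt"}
    https_config = {"base_url": "https://example.com/", "source_id": "12345"}

    # Stub the network call so no real HTTP request is made.
    with mock.patch.object(requests, "get") as mock_get:
        mock_get.return_value.content = b"Example file content"
        download_file(item, str(tmp_path), https_config)

    # The file should land under <data_directory>/<source_id>/<name>.
    expected = os.path.join(str(tmp_path), "12345", "example_file.txt")
    assert os.path.exists(expected)
    with open(expected, "rb") as f:
        assert f.read() == b"Example file content"

Using stdlib unittest.mock here removes the extra mock dependency from test-requirements.txt; mock.patch.object(requests, "get") behaves identically in both packages.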
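
Relatedly, the mock_read_functions fixture in tests/test_foundry_cache.py performs no patching because its nested `with patch(...)` block is commented out. One way it could be re-enabled is sketched below, using contextlib.ExitStack to avoid the triple-nested `with`. This assumes foundry.foundry_cache still exposes module-level _read_csv, _read_json, and _read_excel helpers, an assumption taken solely from the commented-out patch targets.

from contextlib import ExitStack
from unittest.mock import MagicMock, patch

import pandas as pd
import pytest


@pytest.fixture
def mock_read_functions():
    # Sketch only: the patch targets are assumed to be module-level names in
    # foundry.foundry_cache, as the original commented-out block implies.
    def frame(first_col, second_col):
        # Minimal example data, mirroring the original mock readers
        return pd.DataFrame({'Column1': first_col, 'Column2': second_col})

    with ExitStack() as stack:
        # Each patch is reverted automatically when the fixture finalizes.
        stack.enter_context(patch('foundry.foundry_cache._read_csv',
                                  MagicMock(return_value=frame([1, 2, 3], ['A', 'B', 'C']))))
        stack.enter_context(patch('foundry.foundry_cache._read_json',
                                  MagicMock(return_value=frame([4, 5, 6], ['D', 'E', 'F']))))
        stack.enter_context(patch('foundry.foundry_cache._read_excel',
                                  MagicMock(return_value=frame([7, 8, 9], ['G', 'H', 'I']))))
        yield

With the patching active, the test_load_data_* tests that request this fixture would exercise the mocked readers instead of the real files under tests/test_data, which is presumably what the fixture was originally written to do.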