├── .gitbook.yaml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── create-a-story.md │ └── feature_request.md └── workflows │ ├── documentation.yaml │ ├── python-publish.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── foundry-black.png ├── foundry-black.svg ├── foundry-dark.png ├── foundry-dark.svg ├── foundry-light.png ├── foundry-light.svg ├── foundry-logo-4.pptx ├── foundry-logo.pptx ├── foundry-ml.png ├── foundry-purple.png ├── foundry-white.png ├── foundry-white.svg ├── foundry.png └── foundry.svg ├── diagram.svg ├── docs ├── .gitbook │ └── assets │ │ ├── foundry-overview.png │ │ ├── foundry-purple (1).png │ │ ├── foundry-purple (2).png │ │ ├── foundry-purple (3).png │ │ ├── foundry-purple.png │ │ ├── foundry.png │ │ ├── image (1).png │ │ ├── image (2).png │ │ ├── image.png │ │ ├── screen-shot-2021-07-15-at-10.00.38-am.png │ │ └── screen-shot-2021-07-15-at-10.05.40-am.png ├── README.md ├── SUMMARY.md ├── command-line-interface.md ├── concepts │ ├── foundry-benchmarks.md │ ├── foundry-data-packages.md │ ├── foundry-datasets.md │ ├── foundry-models-and-functions.md │ └── overview.md ├── examples.md ├── foundry-package-foundry_test-1.1-documentation-html-autogeneration.md ├── foundry.auth.md ├── foundry.foundry.md ├── foundry.foundry_cache.md ├── foundry.foundry_dataset.md ├── foundry.https_download.md ├── foundry.https_upload.md ├── foundry.loaders.md ├── foundry.loaders.tf_wrapper.md ├── foundry.loaders.torch_wrapper.md ├── foundry.md ├── foundry.models.md ├── foundry.utils.md ├── how-to-contribute │ ├── code_of_conduct.md │ └── contributing.md ├── publishing-datasets.md ├── publishing-models.md ├── publishing │ ├── publishing-datasets.md │ └── publishing-models.md ├── sphinx-autogenerated-documentation.md └── support │ └── troubleshooting.md ├── examples ├── README.md ├── atom-position-finding │ ├── .ipynb_checkpoints │ │ └── atom_position_finding-checkpoint.ipynb │ ├── atom_position_finding.ipynb │ └── requirements.txt ├── bandgap │ ├── bandgap_demo.ipynb │ ├── foundry.json │ └── requirements.txt ├── dendrite-segmentation │ ├── dendrite_segmentation.ipynb │ ├── foundry.json │ └── requirements.txt ├── g4mp2-solvation │ └── g4mp2_solvation_demo.ipynb ├── oqmd │ ├── foundry.json │ ├── oqmd.ipynb │ └── requirements.txt ├── publishing-guides │ ├── data │ │ └── iris.csv │ └── dataset_publishing.ipynb ├── qmc_ml │ └── qmc_ml.ipynb ├── work_in_progress │ └── PACBEDCNN-thickness-mistilt │ │ └── PACBEDCNN_thickness_mistilt.ipynb └── zeolite │ ├── .ipynb_checkpoints │ └── zeolite_demo-checkpoint.ipynb │ ├── requirements.txt │ └── zeolite_demo.ipynb ├── foundry ├── __init__.py ├── auth.py ├── foundry.py ├── foundry_cache.py ├── foundry_dataset.py ├── https_download.py ├── https_upload.py ├── jsonschema_models │ ├── __init__.py │ ├── dc_model.py │ └── project_model.py ├── loaders │ ├── __init__.py │ ├── tf_wrapper.py │ └── torch_wrapper.py ├── models.py └── utils.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── test-requirements.txt ├── test.py └── tests ├── README.md ├── __init__.py ├── data ├── __init__.py ├── https_test │ └── test_data.json └── tmp_data.json ├── test_data.py ├── test_data ├── elwood_md_v1.2 │ └── MD_properties.csv └── test_dataset │ └── elwood.hdf5 ├── test_foundry.py ├── test_foundry_cache.py ├── test_foundry_components.py ├── test_foundry_dataset.py └── test_https_download.py /.gitbook.yaml: -------------------------------------------------------------------------------- 1 | root: ./docs/ 2 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/create-a-story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Create a Story 3 | about: Suggest a user-centered feature, told as a Story 4 | title: My Story 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | _Short description_ 11 | 12 | # Assumptions: 13 | 1. 14 | 2. 15 | 16 | # Acceptance Criteria 17 | Given..., when..., then... 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yaml: -------------------------------------------------------------------------------- 1 | name: build api documentation 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - closed 7 | push: 8 | branches: 9 | - "*" 10 | 11 | jobs: 12 | build_documentation: 13 | if: github.event.pull_request.merged == true 14 | name: generate api markdown docs 15 | runs-on: ubuntu-latest 16 | env: 17 | CLIENT_ID: ${{ secrets.CLIENT_ID }} 18 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }} 19 | steps: 20 | - name: Check out repo's default branch 21 | uses: actions/checkout@v3 22 | - name: Setup python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.10' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r requirements.txt 30 | pip install lazydocs 31 | - name: Build docs from docstrings 32 | continue-on-error: true 33 | run: | 34 | lazydocs --output-path="docs" --overview-file="README.md" --src-base-url="https://github.com/MLMI2-CSSI/foundry/tree/main" . 35 | - name: Commit files 36 | run: | 37 | echo ${{ github.ref }} 38 | git add . 39 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" 40 | git config --local user.name "github-actions[bot]" 41 | git commit -m "CI: Automated documentation build" -a || exit 0 42 | git push origin ${{ github.event.pull_request.base.ref }} 43 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - dev 7 | - main 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 20 14 | strategy: 15 | matrix: 16 | python-version: ["3.9", "3.10", "3.11", "3.12"] 17 | 18 | env: 19 | CLIENT_ID: ${{ secrets.CLIENT_ID }} 20 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }} 21 | name: build 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | cache: 'pip' 29 | 30 | - name: Globus auth 31 | run: 'echo "$GLOBUS_CONFIG" > ~/.globus-native-apps.cfg' 32 | shell: bash 33 | env: 34 | GLOBUS_CONFIG: "${{ secrets.GLOBUS_CONFIG }}" 35 | 36 | - 
name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | pip install -r requirements.txt 40 | pip install -r test-requirements.txt 41 | 42 | - name: Lint with flake8 43 | run: | 44 | # stop the build if there are any flake8 errors 45 | flake8 foundry 46 | 47 | - name: Test with pytest 48 | run: | 49 | pytest -s -v tests/ --cov=./foundry --cov-report=xml 50 | - name: Upload coverage to Codecov 51 | run: | 52 | curl -Os https://uploader.codecov.io/v0.1.0_4653/linux/codecov 53 | 54 | chmod +x codecov 55 | ./codecov -t ${{ secrets.CODECOV_TOKEN }} 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */build/* 2 | *.DS_STORE 3 | *.pyc 4 | *.idea 5 | */foundry_ml.egg-info/* 6 | globus_creds 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 The University of Chicago 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | [![PyPI](https://img.shields.io/pypi/v/foundry_ml.svg)](https://pypi.python.org/pypi/foundry_ml) 8 | [![Tests](https://github.com/MLMI2-CSSI/foundry/actions/workflows/tests.yml/badge.svg)](https://github.com/MLMI2-CSSI/foundry/actions/workflows/tests.yml) 9 | [![Publish](https://github.com/MLMI2-CSSI/foundry/actions/workflows/python-publish.yml/badge.svg)](https://github.com/MLMI2-CSSI/foundry/actions/workflows/python-publish.yml) 10 | [![NSF-1931306](https://img.shields.io/badge/NSF-1931306-blue)](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1931306&HistoricalAwards=false) 11 | [](https://ai-materials-and-chemistry.gitbook.io/foundry/) 12 | 13 | 14 | Foundry-ML simplifies the discovery and usage of ML-ready datasets in materials science and chemistry, providing a simple API to access even complex datasets. 15 | * Load ML-ready data with just a few lines of code 16 | * Work with datasets in local or cloud environments
17 | * Publish your own datasets with Foundry to promote community usage 18 | * (in progress) Run published ML models without hassle 19 | 20 | Learn more and see our available datasets on [Foundry-ML.org](https://foundry-ml.org/) 21 | 22 | 23 | 24 | # Documentation 25 | Information on how to install and use Foundry is available in our documentation [here](https://ai-materials-and-chemistry.gitbook.io/foundry/v/docs/). 26 | 27 | DLHub documentation for model publication and running information can be found [here](https://dlhub-sdk.readthedocs.io/en/latest/servable-publication.html). 28 | 29 | # Quick Start 30 | Install Foundry-ML via command line with: 31 | `pip install foundry_ml` 32 | 33 | You can use the following code to import and instantiate Foundry-ML, then load a dataset. 34 | 35 | ```python 36 | from foundry import Foundry 37 | f = Foundry(index="mdf") 38 | 39 | 40 | f = f.load("10.18126/e73h-3w6n", globus=True) 41 | ``` 42 | *NOTE*: If you run locally and don't want to install the [Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal), just set `globus=False`. 43 | 44 | If running this code in a notebook, a table of metadata for the dataset will appear: 45 | 46 | metadata 47 | 48 | We can load the data with `f.load_data()`, specifying splits such as `train` for different segments of the dataset, and then visualize it with matplotlib (the snippet below assumes `import matplotlib.pyplot as plt`). 49 | 50 | ```python 51 | res = f.load_data() 52 | 53 | imgs = res['train']['input']['imgs'] 54 | desc = res['train']['input']['metadata'] 55 | coords = res['train']['target']['coords'] 56 | 57 | n_images = 3 58 | offset = 150 59 | key_list = list(res['train']['input']['imgs'].keys())[0+offset:n_images+offset] 60 | 61 | fig, axs = plt.subplots(1, n_images, figsize=(20,20)) 62 | for i in range(n_images): 63 | axs[i].imshow(imgs[key_list[i]]) 64 | axs[i].scatter(coords[key_list[i]][:,0], coords[key_list[i]][:,1], s = 20, c = 'r', alpha=0.5) 65 | ``` 66 | Screen Shot 2022-10-20 at 2 22 43 PM 67 | 68 | [See full examples](./examples) 69 | 70 | # How to Cite 71 | If you find Foundry-ML useful, please cite the following [paper](https://doi.org/10.21105/joss.05467): 72 | 73 | ``` 74 | @article{Schmidt2024, 75 | doi = {10.21105/joss.05467}, 76 | url = {https://doi.org/10.21105/joss.05467}, 77 | year = {2024}, publisher = {The Open Journal}, 78 | volume = {9}, 79 | number = {93}, 80 | pages = {5467}, 81 | author = {Kj Schmidt and Aristana Scourtas and Logan Ward and Steve Wangen and Marcus Schwarting and Isaac Darling and Ethan Truelove and Aadit Ambadkar and Ribhav Bose and Zoa Katok and Jingrui Wei and Xiangguo Li and Ryan Jacobs and Lane Schultz and Doyeon Kim and Michael Ferris and Paul M. Voyles and Dane Morgan and Ian Foster and Ben Blaiszik}, 82 | title = {Foundry-ML - Software and Services to Simplify Access to Machine Learning Datasets in Materials Science}, journal = {Journal of Open Source Software} 83 | } 84 | ``` 85 | 86 | # Contributing 87 | Foundry is an Open Source project and we encourage contributions from the community. To contribute, please fork from the `main` branch and open a Pull Request back against `main`. A member of our team will review your PR shortly. 
88 | 89 | ## Developer notes 90 | In order to enforce consistency with external schemas for the metadata and datacite structures ([contained in the MDF data schema repository](https://github.com/materials-data-facility/data-schemas)), the `dc_model.py` and `project_model.py` pydantic data models (found in the `foundry/jsonschema_models` folder) were generated using the [datamodel-code-generator](https://github.com/koxudaxi/datamodel-code-generator/) tool. To comply with flake8 linting, the `--use-annotated` flag was passed so that regex patterns in `dc_model.py` are specified using pydantic's `Annotated` type rather than the soon-to-be-deprecated `constr` type. The command used to run datamodel-code-generator looks like: 91 | ``` 92 | datamodel-codegen --input dc.json --output dc_model.py --use-annotated 93 | ``` 94 | 95 | # Primary Support 96 | This work was supported by the National Science Foundation under NSF Award Number: 1931306 "Collaborative Research: Framework: Machine Learning Materials Innovation Infrastructure". 97 | 98 | # Other Support 99 | Foundry-ML brings together many components in the materials data ecosystem, including [MAST-ML](https://mastmldocs.readthedocs.io/en/latest/), the [Data and Learning Hub for Science](https://www.dlhub.org) (DLHub), and the [Materials Data Facility](https://materialsdatafacility.org) (MDF). 100 | 101 | ## MAST-ML 102 | This work was supported by the National Science Foundation (NSF) SI2 award No. 1148011 and DMREF award number DMR-1332851. 103 | 104 | ## The Data and Learning Hub for Science (DLHub) 105 | This material is based upon work supported by Laboratory Directed Research and Development (LDRD) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357. 106 | https://www.dlhub.org 107 | 108 | ## The Materials Data Facility 109 | This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design (CHiMaD)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design (CHiMaD). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design (IMaD): Leverage, Innovate, and Disseminate". 
110 | https://www.materialsdatafacility.org 111 | -------------------------------------------------------------------------------- /assets/foundry-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-black.png -------------------------------------------------------------------------------- /assets/foundry-black.svg: -------------------------------------------------------------------------------- 1 | FOUNDRY-MLDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-dark.png -------------------------------------------------------------------------------- /assets/foundry-dark.svg: -------------------------------------------------------------------------------- 1 | FOUNDRYDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-light.png -------------------------------------------------------------------------------- /assets/foundry-light.svg: -------------------------------------------------------------------------------- 1 | FOUNDRYDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry-logo-4.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-logo-4.pptx -------------------------------------------------------------------------------- /assets/foundry-logo.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-logo.pptx -------------------------------------------------------------------------------- /assets/foundry-ml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-ml.png -------------------------------------------------------------------------------- /assets/foundry-purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-purple.png -------------------------------------------------------------------------------- /assets/foundry-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-white.png -------------------------------------------------------------------------------- /assets/foundry-white.svg: -------------------------------------------------------------------------------- 1 | FOUNDRY-MLDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /assets/foundry.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry.png -------------------------------------------------------------------------------- /assets/foundry.svg: -------------------------------------------------------------------------------- 1 | FOUNDRYDATA, MODELS, SCIENCE -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-overview.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (1).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple (2).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (2).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple (3).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (3).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry-purple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/foundry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image (1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image (1).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image (2).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image (2).png -------------------------------------------------------------------------------- /docs/.gitbook/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image.png -------------------------------------------------------------------------------- 
/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.00.38-am.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.00.38-am.png -------------------------------------------------------------------------------- /docs/.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Foundry 2 | 3 | ![](.gitbook/assets/foundry-purple%20%283%29.png) 4 | 5 | ## What is Foundry? 6 | 7 | Foundry is a Python package that simplifies the discovery and usage of machine-learning-ready datasets in materials science and chemistry. Foundry provides software tools that make it easy to load these datasets and work with them in local or cloud environments. Further, Foundry provides a dataset specification and defined curation flows that allow users to create new datasets for the community to use through this same interface. 8 | 9 | ## Installation 10 | 11 | Foundry can be installed on any operating system with Python, using pip: 12 | 13 | ```text 14 | pip install foundry-ml 15 | ``` 16 | 17 | ### Globus 18 | 19 | Foundry uses the Globus platform for authentication, search, and to optimize some data transfer operations. Follow the steps below to get set up. 20 | 21 | * [Create a free account.](https://app.globus.org) You can create one with your institutional credentials or with free IDs \(GlobusID, Google, ORCID, etc.\). 22 | * [Set up a Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal) _**\(optional\)**_. Some Foundry capabilities will work more efficiently when GCP is set up. 23 | 24 | ## Project Support 25 | 26 | This work was supported by the National Science Foundation under NSF Award Number: 1931306 "Collaborative Research: Framework: Machine Learning Materials Innovation Infrastructure". 27 | 28 | ### Other Support 29 | 30 | Foundry brings together many components in the materials data ecosystem, including MAST-ML, the Data and Learning Hub for Science \(DLHub\), and The Materials Data Facility \(MDF\). 31 | 32 | #### MAST-ML 33 | 34 | This work was supported by the National Science Foundation \(NSF\) SI2 award No. 1148011 and DMREF award number DMR-1332851. 35 | 36 | #### The Data and Learning Hub for Science \(DLHub\) 37 | 38 | This material is based upon work supported by Laboratory Directed Research and Development \(LDRD\) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357. [https://www.dlhub.org](https://www.dlhub.org) 39 | 40 | #### The Materials Data Facility 41 | 42 | This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design \(CHiMaD\)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design \(CHiMaD\). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design \(IMaD\): Leverage, Innovate, and Disseminate". [https://www.materialsdatafacility.org](https://www.materialsdatafacility.org) 43 | 44 | 
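As a quick sanity check after installing and setting up Globus, the short sketch below creates a client and lists the available datasets. This is a hedged example: the `index="mdf"` argument mirrors the main README, and the `no_browser`/`no_local_server` flags are only needed on cloud resources \(see the examples page\).

```python
from foundry import Foundry

# On a laptop with a browser, the default auth flow is fine; on cloud
# resources (Colab, Binder), pass no_browser=True, no_local_server=True.
f = Foundry(index="mdf")

# Returns a pandas DataFrame summarizing the available Foundry datasets
print(f.list())
```
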
-------------------------------------------------------------------------------- /docs/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | 3 | * [Getting started with Foundry](README.md) 4 | 5 | ## How to contribute 6 | 7 | * [Contribution Process](how-to-contribute/contributing.md) 8 | * [Contributor Covenant](how-to-contribute/code_of_conduct.md) 9 | 10 | --- 11 | 12 | * [Sphinx Autogenerated documentation - markdown](sphinx-autogenerated-documentation.md) 13 | * [foundry package — Foundry\_test 1.1 documentation - HTML AUTOGENERATION](foundry-package-foundry_test-1.1-documentation-html-autogeneration.md) 14 | 15 | -------------------------------------------------------------------------------- /docs/command-line-interface.md: -------------------------------------------------------------------------------- 1 | # Command Line Interface \(CLI\) 2 | 3 | The Foundry command line interface \(CLI\) allows users to build their data environment from the command line using a specification file. This is the data analog to how `pip` or `conda` allow users to build a software environment from software specification files. 4 | 5 | ## Installation 6 | 7 | ```text 8 | pip install foundry-ml-cli 9 | ``` 10 | 11 | ### CLI Options 12 | 13 | **`--file`** : \(string\) the name of the specification file to build. _Default: "./foundry.json"_ 14 | 15 | **`--globus`** : \(bool\) If True, uses Globus to download the files, otherwise HTTPS. _Default: False_ 16 | 17 | **`--interval`** : \(int\) Time in seconds between polling operations to check transfer status. _Default: 3_ 18 | 19 | **`--verbose`** : \(bool\) If True, print out more logging information to the console. _Default: False_ 20 | 21 | ## Example Usage 22 | 23 | In a folder containing a file named `foundry.json`: 24 | 25 | ```text 26 | /foundry.json 27 | 28 | 29 | $ foundry 30 | ``` 31 | 32 | This is the same as running: 33 | 34 | ```text 35 | /foundry.json 36 | 37 | 38 | $ foundry --file=foundry.json --globus=False --interval=3 --verbose=False 39 | ``` 40 | 41 | -------------------------------------------------------------------------------- /docs/concepts/foundry-benchmarks.md: -------------------------------------------------------------------------------- 1 | # Foundry Benchmarks 2 | 3 | -------------------------------------------------------------------------------- /docs/concepts/foundry-data-packages.md: -------------------------------------------------------------------------------- 1 | # Foundry Data Packages 2 | 3 | Foundry Data Packages provide a logical and portable way to specify and collect data for analyses. From a data package, a user can easily build a local data environment matching the data package, as sketched below. 
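As a hedged sketch of that workflow in Python \(rather than the CLI\), the `build` method documented in the autogenerated API reference accepts a specification as a dict or a relative filename; the `./foundry.json` path and the option values below are placeholders, not fixed requirements.

```python
from foundry import Foundry

f = Foundry()

# Build a local data environment from a data package specification file.
# globus=False downloads over HTTPS; interval is the polling time in seconds.
f = f.build("./foundry.json", globus=False, interval=3)
```
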
4 | 5 | ## Data Package Specification Fields 6 | 7 | **`name`** : \(string\) A name for the data package 8 | 9 | **`version`** : \(string\) A version of the form <major>.<minor>.<sub>, e.g., "1.2.0" 10 | 11 | **`description`** : \(string\) A short description of the data package and its intended use 12 | 13 | **`tags`** : \(list\) A list of tag strings associated with the data package 14 | 15 | **`dependencies`** : \(list\) A list of dependency objects associated with the data package 16 | 17 | **`private`** : \(bool\) Whether the data package is to be registered in a public data package index 18 | 19 | ### Dependency Objects 20 | 21 | **`identifier`** : \(string\) Unique identifier for the dataset 22 | 23 | **`version`** : \(string\) The version of the dataset to use 24 | 25 | **`provider`** : \(string\) The dataset provider. _Currently only "MDF" is supported_ 26 | 27 | ```javascript 28 | { 29 | "identifier": "_test_foundry_mp_bandgap_v1.1", 30 | "version": "1.1", 31 | "provider": "MDF" 32 | } 33 | ``` 34 | 35 | ## Example Specification 36 | 37 | ```javascript 38 | { 39 | "name": "Band Gap Analysis", 40 | "version": "1.0.0", 41 | "description": "Datasets for band gap uber model generation", 42 | "private": true, 43 | "dependencies": [{ 44 | "name": "_test_foundry_experimental_bandgap_v1.1", 45 | "version": "1.1", 46 | "provider": "MDF" 47 | }, 48 | { 49 | "name": "_test_foundry_mp_bandgap_v1.1", 50 | "version": "1.1", 51 | "provider": "MDF" 52 | }, 53 | { 54 | "name": "_test_foundry_oqmd_bandgap_v1.1", 55 | "version": "1.1", 56 | "provider": "MDF" 57 | }, 58 | { 59 | "name": "_test_foundry_assorted_computational_bandgap_v1.1", 60 | "version": "1.1", 61 | "provider": "MDF" 62 | } 63 | ] 64 | } 65 | ``` 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/concepts/foundry-datasets.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Describes the metadata for each Foundry dataset 3 | --- 4 | 5 | # Foundry Datasets 6 | 7 | Foundry Datasets are composed of two key components, [_**data**_](foundry-datasets.md#data) and descriptive [_**metadata**_](foundry-datasets.md#describing-datasets-with-metadata). In order to make the data easily consumable, _**data**_ \(consisting of files\) should be assembled following the supported structures. The _**metadata**_ description allows tracking of high-level information \(e.g., authors, associated institutions, licenses, data location\) and also information on how to operate on the datasets \(e.g., how to load the data, training/test splits\). 8 | 9 | ### **Data** 10 | 11 | ### Example - Record-Based Data 12 | 13 | #### **Tabular Data** 14 | 15 | For tabular data, columns should represent the different keys of the data, and rows should represent individual records. 16 | 17 | {% hint style="info" %} 18 | Supported tabular data types currently include JSON, CSV, and XLSX. 19 | {% endhint %} 20 | 21 | In this example, we showcase how to describe a JSON record-based dataset where each record is a valid JSON object in a JSON list or a line in a line-delimited JSON file. 22 | 23 | | **feature\_1** | **feature\_2** | **material\_type** | **band\_gap** | 24 | | :--- | :--- | :--- | :--- | 25 | | 0.10 | 0.52 | 1 | 1.40 | 26 | | 0.34 | 0.910 | 0 | 0.73 | 27 | | ... | ... | ... | ... |
 28 | 29 | For this example dataset the `Key` object could be: 30 | 31 | ```javascript 32 | { 33 | "short_name": "oqmd-bandgaps", 34 | "data_type": "tabular", 35 | "task_type": ["supervised"], 36 | "domain": ["materials science"], 37 | "n_items": 29197, 38 | "splits": [{ 39 | "type": "train", 40 | "path": "foundry_dataframe.json", 41 | "label": "train" 42 | }], 43 | "keys": [{ 44 | "key": ["reference"], 45 | "type": "input", 46 | "units": "", 47 | "description": "source publication of the bandgap value" 48 | }, { 49 | "key": ["icsd_id"], 50 | "type": "input", 51 | "units": "", 52 | "description": "corresponding id in ICSD of this compound" 53 | }, { 54 | "key": ["structure"], 55 | "type": "input", 56 | "units": "", 57 | "description": "the structure of this compound" 58 | }, { 59 | "key": ["composition"], 60 | "type": "input", 61 | "units": "", 62 | "description": "reduced composition of this compound" 63 | }, { 64 | "key": ["comments"], 65 | "type": "input", 66 | "units": "", 67 | "description": "Additional information about this bandgap measurement" 68 | }, { 69 | "key": ["bandgap type"], 70 | "type": "input", 71 | "units": "", 72 | "description": "the type of the bandgap, e.g., direct or indirect" 73 | }, { 74 | "key": ["comp method"], 75 | "type": "input", 76 | "units": "", 77 | "description": "functional used to calculate the bandgap" 78 | }, { 79 | "key": ["space group"], 80 | "type": "input", 81 | "units": "", 82 | "description": "the space group of this compound" 83 | }, 84 | { 85 | "key": ["bandgap value (eV)"], 86 | "type": "target", 87 | "units": "eV", 88 | "description": "value of the bandgap" 89 | } 90 | ] 91 | } 92 | ``` 93 | 94 | **TODO** 95 | 96 | ```text 97 | "keys":[{ 98 | "key": "feature_1", 99 | "type": "input", 100 | "units": None, 101 | "description": "This is feature 1" 102 | },{ 103 | "key": "feature_2", 104 | "type": "input", 105 | "units": None, 106 | "description": "This is feature 2" 107 | },{ 108 | "key": "material_type", 109 | "type": "input", 110 | "units": None, 111 | "description": "This is the material type", 112 | "labels":["perovskite","not perovskite"] 113 | },{ 114 | "key": "band_gap", 115 | "type": "target", 116 | "units": "eV", 117 | "description": "This is the simulated band gap in eV" 118 | } 119 | ] 120 | ``` 121 | 122 | {% hint style="info" %} 123 | This tabular data file should be saved in the base directory as **`foundry_dataframe.json`**. 124 | {% endhint %} 125 | 126 | * Write general pandas reader to try csv, JSON, xlsx for opening 127 | 128 | #### Hierarchical Data 129 | 130 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case, features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be: 131 | 132 | ```javascript 133 | { 134 | "short_name": "segmentation-dev", 135 | "data_type": "hdf5", 136 | "task_type": ["unsupervised", "segmentation"], 137 | "domain": ["materials science", "chemistry"], 138 | "n_items": 100, 139 | "splits": [{ 140 | "type": "train", 141 | "path": "foundry.hdf5", 142 | "label": "train" 143 | }], 144 | "keys": [{ 145 | "key": ["train/input"], 146 | "type": "input", 147 | "description": "input, unlabeled images" 148 | }, { 149 | "key": ["train/output"], 150 | "type": "target", 151 | "description": "target, labeled images" 152 | }] 153 | } 154 | ``` 155 | 156 | ```text 157 | "keys":[{ 158 | "key": "/data/arr1", 159 | "type": "input", 160 | "units": None, 161 | "description": "This is an array containing input data" 162 | },{ 163 | "key": "/other_data/arr2", 164 | "type": "input", 165 | "units": None, 166 | "description": "This is another array containing input data" 167 | },{ 168 | "key": "/data/band_gaps", 169 | "type": "target", 170 | "units": "eV", 171 | "description": "This is the simulated band gap in eV" 172 | } 173 | ] 174 | ``` 175 | 
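To make the `/` notation concrete, the minimal sketch below reads those keys directly with `h5py`. This is an illustration only: it assumes a local `foundry.hdf5` laid out as in the example above, and the variable names are placeholders.

```python
import h5py

# Each Foundry key is a path into the HDF5 hierarchy, so key strings
# from the metadata can be used directly as h5py indices.
with h5py.File("foundry.hdf5", "r") as f:
    inputs = f["train/input"][...]    # an "input"-type key
    targets = f["train/output"][...]  # a "target"-type key
    print(inputs.shape, targets.shape)
```
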
176 | ## Descriptive Metadata 177 | 178 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). This metadata captures high-level bibliographic information \(e.g., authors, associated institutions, licenses, and data location\). Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema. 179 | 180 | **Keys \(object\):** Key objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Key objects have the following properties: 181 | 182 | * **`key (str)`** A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\) 183 | * **`type (str)`** The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_ 184 | * **`units (str)[optional]`** The scientific units associated with a key. _Default: None_ 185 | * **`description (str)[optional]`** A free-text description of the key. _Default: None_ 186 | * **`labels (list of str)[optional]`:** A list of strings mapped to integers in a key column 187 | 188 | **short\_name \(str\):** Short name is a unique name associated with this dataset that makes loading and referencing it easier. 189 | 190 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. _Options \["tabular","hdf5"\]_
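At publication time, this metadata is passed to the client as a plain dictionary. The hedged sketch below follows the `publish` signature from the autogenerated API reference; the metadata mirrors the `foundry` block shown next, and the title, authors, and Globus endpoint URL are placeholders.

```python
from foundry import Foundry

f = Foundry()

# Metadata assembled per the field descriptions above
foundry_metadata = {
    "keys": [
        {"key": ["feature_1"], "type": "input", "units": "",
         "description": "This is an input"},
        {"key": ["band_gap"], "type": "target", "units": "eV",
         "description": "This is the simulated band gap in eV"},
    ],
    "short_name": "my_short_name",
    "type": "tabular",
}

# data_source is a URL for a Globus endpoint holding the files (placeholder here)
res = f.publish(foundry_metadata, data_source="<globus-endpoint-url>",
                title="Example Band Gap Dataset", authors=["Nunez, Victoria"])
# res contains a source_id that can be tracked with f.check_status(...)
```
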
 191 | 192 | ```text 193 | "foundry": { 194 | "dc": {}, 195 | "keys": [{ 196 | "type": "input", 197 | "name": "feature_1", 198 | "units": "", 199 | "description": "This is an input" 200 | }, 201 | { 202 | "type": "target", 203 | "name": "band_gap", 204 | "units": "eV", 205 | "description": "This is the simulated band gap in eV", 206 | "labels": [] 207 | } 208 | ], 209 | "short_name": "my_short_name", 210 | "type": "tabular" 211 | } 212 | ``` 213 | 214 | -------------------------------------------------------------------------------- /docs/concepts/foundry-models-and-functions.md: -------------------------------------------------------------------------------- 1 | # Foundry Models and Functions 2 | 3 | ## Foundry Model Providers 4 | 5 | Currently Foundry supports models and functions provided via the [Data and Learning Hub for Science \(DLHub\)](https://www.dlhub.org)/[FuncX](https://www.funcx.org). 6 | 7 | -------------------------------------------------------------------------------- /docs/concepts/overview.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | TODO: 4 | 5 | * Change the code snippet in the image 6 | * Write the text :\) 7 | 8 | ![](../.gitbook/assets/foundry-overview.png) 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Python 2 | 3 | ## Scientific Examples 4 | 5 | [Check out our example notebooks](https://github.com/MLMI2-CSSI/foundry/tree/master/examples) for how to load or publish datasets using Foundry. 6 | 7 | ## Quickstart 8 | 9 | ### Creating a Foundry Client 10 | 11 | The Foundry client provides access to all of the methods described here for listing, loading, and publishing datasets and models. The code below will create a Foundry client. 12 | 13 | ```python 14 | from foundry import Foundry 15 | f = Foundry() 16 | ``` 17 | 18 | {% hint style="success" %} 19 | If you are running your script on cloud resources \(e.g. Google Colab, Binder\), see [Using Foundry on Cloud Computing Resources](examples.md#using-foundry-on-cloud-computing-resources). 20 | {% endhint %} 21 | 22 | ### Listing Datasets 23 | 24 | To show all available Foundry datasets, you can use the Foundry `list()` method as follows. The method returns a pandas DataFrame with details on the available datasets. 25 | 26 | ```python 27 | f.list() 28 | ``` 29 | 30 | ### Loading Datasets 31 | 32 | The Foundry client can be used to access datasets using a `source_id`, e.g. here `"_test_foundry_fashion_mnist_v1.1"`. You can retrieve the `source_id` from the [`list()` method](examples.md#listing-datasets). 33 | 34 | ```python 35 | from foundry import Foundry 36 | f = Foundry() 37 | f = f.load("_test_foundry_fashion_mnist_v1.1") 38 | ``` 39 | 40 | This will remotely load the metadata \(e.g., data location, data keys, etc.\) and download the data to local storage if it is not already cached. Data can be downloaded via HTTPS without additional setup, or more optimally with a Globus endpoint [set up](https://www.globus.org/globus-connect-personal) on your machine. 41 | 42 | Once the data are accessible locally, access them with the `load_data()` method, which loads data from a specific split defined for the dataset; here we use `train`. 
43 | 44 | ```python 45 | res = f.load_data() 46 | X,y = res['train'] 47 | ``` 48 | 49 | The data are then usable within the `X` and `y` variables. This full example can be found in [`/examples/fashion-mnist/`](https://github.com/MLMI2-CSSI/foundry/tree/master/examples/fashion-mnist). 50 | 51 | ## Using Foundry on Cloud Computing Resources 52 | 53 | Foundry works with common cloud computing providers \(e.g., the NSF sponsored Jetstream and Google Colab\). On these resources, simply add the following arguments to use a cloud-compatible authentication flow. 54 | 55 | ```python 56 | f = Foundry(no_browser=True, no_local_server=True) 57 | ``` 58 | 59 | When downloading data, add the following argument to download via HTTPS. 60 | 61 | {% hint style="info" %} 62 | This method may be slow for large datasets and datasets with many files 63 | {% endhint %} 64 | 65 | ```python 66 | f.load(globus=False) 67 | X, y = f.load_data() 68 | ``` 69 | 70 | -------------------------------------------------------------------------------- /docs/foundry-package-foundry_test-1.1-documentation-html-autogeneration.md: -------------------------------------------------------------------------------- 1 | # foundry package — Foundry\_test 1.1 documentation - HTML AUTOGENERATION 2 | 3 | ## foundry.foundry module[¶]() 4 | 5 | _class_ foundry.foundry.Foundry\(_no\_browser=False_, _no\_local\_server=False_, _search\_index='mdf-test'_, _\*_, _dc: Dict = {}_, _mdf: Dict = {}_, _dataset:_ [_foundry.models.FoundryDataset_]() _= {}_, _config:_ [_foundry.models.FoundryConfig_]() _= FoundryConfig\(dataframe\_file='foundry\_dataframe.json', data\_file='foundry.hdf5', metadata\_file='foundry\_metadata.json', destination\_endpoint=None, local=False, metadata\_key='foundry', organization='foundry', local\_cache\_dir='./data'\)_, _dlhub\_client: Any = None_, _forge\_client: Any = None_, _connect\_client: Any = None_, _xtract\_tokens: Any = None_\)[¶]() 6 | 7 | Bases: [`foundry.models.FoundryMetadata`]() 8 | 9 | Foundry Client Base Class TODO: ——- Add Docstring build\(_spec_, _globus=False_, _interval=3_, _file=False_\)[¶]() 10 | 11 | Build a Foundry Data Package :param spec: dict or str \(relative filename\) of the data package specification :type spec: multiple :param globus: if True use Globus to fetch datasets :type globus: bool :param interval: Polling interval on checking task status in seconds. :type interval: int :param type: One of “file” or None :type type: strReturns 12 | 13 | **\(Foundry\)**Return type 14 | 15 | self: for chaining check\_model\_status\(_res_\)[¶]() 16 | 17 | Check status of model or function publication to DLHub 18 | 19 | TODO: currently broken on DLHub side of things check\_status\(_source\_id_, _short=False_, _raw=False_\)[¶]() 20 | 21 | Check the status of your submission.Parameters 22 | 23 | * **source\_id** \(_str_\) – The `source_id` \(`source_name` + version information\) of the submission to check. Returned in the `res` result from `publish()` via MDF Connect Client. 24 | * **short** \(_bool_\) – When `False`, will print a status summary containing all of the status steps for the dataset. When `True`, will print a short finished/processing message, useful for checking many datasets’ status at once. **Default:** `False` 25 | * **raw** \(_bool_\) – When `False`, will print a nicely-formatted status summary. When `True`, will return the full status result. For direct human consumption, `False` is recommended. 
**Default:** `False` 26 | 27 | Returns 28 | 29 | The full status result.Return type 30 | 31 | If `raw` is `True`, _dict_ collect\_dataframes\(_packages=\[\]_\)[¶]() 32 | 33 | Collect dataframes of local data packages :param packages: List of packages to collect, defaults to all :type packages: listReturns 34 | 35 | **\(tuple\)**Return type 36 | 37 | Tuple of X\(pandas.DataFrame\), y\(pandas.DataFrame\) configure\(_\*\*kwargs_\)[¶]() 38 | 39 | Set Foundry config :keyword file: Path to the file containing :kwtype file: str :keyword \(default: self.config.metadata\_file\) 40 | 41 | dataframe\_file \(str\): filename for the dataframe file default:”foundry\_dataframe.json” data\_file \(str\): : filename for the data file default:”foundry.hdf5” destination\_endpoint \(str\): Globus endpoint UUID where Foundry data should move local\_cache\_dir \(str\): Where to place collected data default:”./data”Returns 42 | 43 | **\(Foundry\)**Return type 44 | 45 | self: for chaining connect\_client_: Any_[¶]() describe\_model\(\)[¶]() dlhub\_client_: Any_[¶]() download\(_globus=True_, _verbose=False_, _\*\*kwargs_\)[¶]() 46 | 47 | Download a Foundry dataset :param globus: if True, use Globus to download the data else try HTTPS :type globus: bool :param verbose: if True print out debug information during the download :type verbose: boolReturns 48 | 49 | **\(Foundry\)**Return type 50 | 51 | self: for chaining forge\_client_: Any_[¶]() get\_keys\(_type_, _as\_object=False_\)[¶]() 52 | 53 | Get keys for a Foundry datasetParameters 54 | 55 | * **type** \(_str_\) – The type of key to be returned e.g., “input”, “target” 56 | * **as\_object** \(_bool_\) – When `False`, will return a list of keys in as strings When `True`, will return the full key objects **Default:** `False` 57 | 58 | Returns: \(list\) String representations of keys or if `as_object` 59 | 60 | is False otherwise returns the full key objects. get\_packages\(_paths=False_\)[¶]() 61 | 62 | Get available local data packagesParameters 63 | 64 | **paths** \(_bool_\) – If True return paths in addition to package, if False return package name onlyReturns 65 | 66 | **\(list\)**Return type 67 | 68 | List describing local Foundry packages list\(\)[¶]() 69 | 70 | List available Foundry data packagesReturns 71 | 72 | **\(pandas.DataFrame\)**Return type 73 | 74 | DataFrame with summary list of Foundry data packages including name, title, and publication year load\(_name_, _download=True_, _globus=True_, _verbose=False_, _metadata=None_, _\*\*kwargs_\)[¶]() 75 | 76 | Load the metadata for a Foundry dataset into the client :param name: Name of the foundry dataset :type name: str :param download: If True, download the data associated with the package \(default is True\) :type download: bool :param globus: If True, download using Globus, otherwise https :type globus: bool :param verbose: If True print additional debug information :type verbose: bool :param metadata: **For debug purposes.** A search result analog to prepopulate metadata. :type metadata: dictKeyword Arguments 77 | 78 | **interval** \(_int_\) – How often to poll Globus to check if transfers are completeReturnsReturn type 79 | 80 | self load\_data\(_source\_id=None_, _globus=True_\)[¶]() 81 | 82 | Load in the data associated with the prescribed dataset 83 | 84 | Tabular Data Type: Data are arranged in a standard data frame stored in self.dataframe\_file. 
The contents are read, and 85 | 86 | File Data Type: <<Add desc>> 87 | 88 | For more complicated data structures, users should subclass Foundry and override the load\_data functionParameters 89 | 90 | * **inputs** \(_list_\) – List of strings for input columns 91 | * **targets** \(_list_\) – List of strings for output columns 92 | 93 | Returns ——-s 94 | 95 | > \(tuple\): Tuple of X, y values 96 | 97 | publish\(_foundry\_metadata_, _data\_source_, _title_, _authors_, _update=False_, _publication\_year=None_, _\*\*kwargs_\)[¶]() 98 | 99 | Submit a dataset for publication :param foundry\_metadata: Dict of metadata describing data package :type foundry\_metadata: dict :param data\_source: Url for Globus endpoint :type data\_source: string :param title: Title of data package :type title: string :param authors: List of data package author names e.g., Jack Black 100 | 101 | > or Nunez, Victoria 102 | 103 | Parameters 104 | 105 | * **update** \(_bool_\) – True if this is an update to a prior data package \(default: self.config.metadata\_file\) 106 | * **publication\_year** \(_int_\) – Year of dataset publication. If None, will be set to the current calendar year by MDF Connect Client. \(default: $current\_year\) 107 | 108 | Keyword Arguments 109 | 110 | * **affiliations** \([_list_]()\) – List of author affiliations 111 | * **tags** \([_list_]()\) – List of tags to apply to the data package 112 | * **short\_name** \(_string_\) – Shortened/abbreviated name of the data package 113 | * **publisher** \(_string_\) – Data publishing entity \(e.g. MDF, Zenodo, etc.\) 114 | 115 | Returns 116 | 117 | **\(dict\) MDF Connect Response** – of dataset. Contains source\_id, which can be used to check the status of the submissionReturn type 118 | 119 | Response from MDF Connect to allow tracking publish\_model\(_options_\)[¶]() 120 | 121 | Submit a model or function for publication :param options: dict of all possible optionsOptions keys: 122 | 123 | title \(req\) authors \(req\) short\_name \(req\) servable\_type \(req\) \(“static method”, “class method”, “keras”, “pytorch”, “tensorflow”, “sklearn”\) affiliations domains abstract references requirements \(dict of library:version keypairs\) module \(if Python method\) function \(if Python method\) inputs \(not needed for TF\) \(dict of options\) outputs \(not needed for TF\) methods \(e.g. 
research methods\) DOI publication\_year \(advanced\) version \(advanced\) visibility \(dict of users and groups, each a list\) funding reference rights 124 | 125 | TODO: alternate identifier \(to add an identifier of this artifact in another service\) add file add directory add files run\(_name_, _inputs_, _\*\*kwargs_\)[¶]() 126 | 127 | Run a model on dataParameters 128 | 129 | * **name** \(_str_\) – DLHub model name 130 | * **inputs** – Data to send to DLHub as inputs \(should be JSON serializable\) 131 | 132 | ReturnsReturn type 133 | 134 | Returns results after invocation via the DLHub service 135 | 136 | * Pass [\*\*]()kwargs through to DLHub client and document kwargs 137 | 138 | xtract\_tokens_: Any_[¶]() 139 | 140 | ## foundry.models module[¶]() 141 | 142 | _class_ foundry.models.FoundryConfig\(_\*_, _dataframe\_file: str = 'foundry\_dataframe.json'_, _data\_file: str = 'foundry.hdf5'_, _metadata\_file: str = 'foundry\_metadata.json'_, _destination\_endpoint: str = None_, _local: bool = False_, _metadata\_key: str = 'foundry'_, _organization: str = 'foundry'_, _local\_cache\_dir: str = './data'_\)[¶]() 143 | 144 | Bases: `pydantic.main.BaseModel` 145 | 146 | Foundry Configuration Configuration information for Foundry DatasetParameters 147 | 148 | * **dataframe\_file** \(_str_\) – Filename to read dataframe contents from 149 | * **metadata\_file** \(_str_\) – Filename to read metadata contents from defaults to reading for MDF Discover 150 | * **destination\_endpoint** \(_str_\) – Globus endpoint ID to transfer data to \(defaults to local GCP installation\) 151 | * **local\_cache\_dir** \(_str_\) – Path to local Foundry package cache 152 | 153 | data\_file_: Optional\[str\]_[¶]() dataframe\_file_: Optional\[str\]_[¶]() destination\_endpoint_: Optional\[str\]_[¶]() local_: Optional\[bool\]_[¶]() metadata\_file_: Optional\[str\]_[¶]() metadata\_key_: Optional\[str\]_[¶]() organization_: Optional\[str\]_[¶]() _class_ foundry.models.FoundryDataset\(_\*_, _keys: List\[_[_foundry.models.FoundryKey_]()_\] = None_, _splits: List\[_[_foundry.models.FoundrySplit_]()_\] = None_, _type:_ [_foundry.models.FoundryDatasetType_]() _= None_, _short\_name: str = ''_, _dataframe: Any = None_\)[¶]() 154 | 155 | Bases: `pydantic.main.BaseModel` 156 | 157 | Foundry Dataset Schema for Foundry Datasets. 
This includes specifications of inputs, outputs, type, version, and more _class_ Config[¶]() 158 | 159 | Bases: `object` arbitrary\_types\_allowed _= True_[¶]() dataframe_: Optional\[Any\]_[¶]() keys_: List\[_[_foundry.models.FoundryKey_]()_\]_[¶]() short\_name_: Optional\[str\]_[¶]() splits_: Optional\[List\[_[_foundry.models.FoundrySplit_]()_\]\]_[¶]() type_:_ [_foundry.models.FoundryDatasetType_]()[¶]() _class_ foundry.models.FoundryDatasetType\(_value_\)[¶]() 160 | 161 | Bases: `enum.Enum` 162 | 163 | Foundry Dataset Types Enumeration of the possible Foundry dataset types files _= 'files'_[¶]() hdf5 _= 'hdf5'_[¶]() other _= 'other'_[¶]() tabular _= 'tabular'_[¶]() _class_ foundry.models.FoundryKey\(_\*_, _key: List\[str\] = \[\]_, _type: str = ''_, _filter: str = ''_, _units: str = ''_, _description: str = ''_, _classes: List\[_[_foundry.models.FoundryKeyClass_]()_\] = None_\)[¶]() 164 | 165 | Bases: `pydantic.main.BaseModel` classes_: Optional\[List\[_[_foundry.models.FoundryKeyClass_]()_\]\]_[¶]() description_: Optional\[str\]_[¶]() filter_: Optional\[str\]_[¶]() key_: List\[str\]_[¶]() type_: str_[¶]() units_: Optional\[str\]_[¶]() _class_ foundry.models.FoundryKeyClass\(_\*_, _label: str = ''_, _name: str = ''_\)[¶]() 166 | 167 | Bases: `pydantic.main.BaseModel` label_: str_[¶]() name_: str_[¶]() _class_ foundry.models.FoundryMetadata\(_\*_, _dc: Dict = {}_, _mdf: Dict = {}_, _dataset:_ [_foundry.models.FoundryDataset_]() _= {}_, _config:_ [_foundry.models.FoundryConfig_]() _= FoundryConfig\(dataframe\_file='foundry\_dataframe.json', data\_file='foundry.hdf5', metadata\_file='foundry\_metadata.json', destination\_endpoint=None, local=False, metadata\_key='foundry', organization='foundry', local\_cache\_dir='./data'\)_\)[¶]() 168 | 169 | Bases: `pydantic.main.BaseModel` _class_ Config[¶]() 170 | 171 | Bases: `object` arbitrary\_types\_allowed _= True_[¶]() config_:_ [_foundry.models.FoundryConfig_]()[¶]() dataset_:_ [_foundry.models.FoundryDataset_]()[¶]() dc_: Optional\[Dict\]_[¶]() mdf_: Optional\[Dict\]_[¶]() _class_ foundry.models.FoundrySpecification\(_\*_, _name: str = ''_, _version: str = ''_, _description: str = ''_, _private: bool = False_, _dependencies: Any = None_\)[¶]() 172 | 173 | Bases: `pydantic.main.BaseModel` 174 | 175 | Pydantic base class for interacting with the Foundry data package specification The specification provides a way to group datasets and manage versions add\_dependency\(_name_, _version_\)[¶]() clear\_dependencies\(\)[¶]() dependencies_: Any_[¶]() description_: str_[¶]() name_: str_[¶]() private_: bool_[¶]() remove\_duplicate\_dependencies\(\)[¶]() version_: str_[¶]() _class_ foundry.models.FoundrySpecificationDataset\(_\*_, _name: str = None_, _provider: str = 'MDF'_, _version: str = None_\)[¶]() 176 | 177 | Bases: `pydantic.main.BaseModel` 178 | 179 | Pydantic base class for datasets within the Foundry data package specification name_: Optional\[str\]_[¶]() provider_: Optional\[str\]_[¶]() version_: Optional\[str\]_[¶]() _class_ foundry.models.FoundrySplit\(_\*_, _type: str = ''_, _path: str = ''_, _label: str = ''_\)[¶]() 180 | 181 | Bases: `pydantic.main.BaseModel` label_: Optional\[str\]_[¶]() path_: Optional\[str\]_[¶]() type_: str_[¶]() 182 | 183 | ## foundry.xtract\_method module[¶]() 184 | 185 | -------------------------------------------------------------------------------- /docs/foundry.auth.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.auth` 6 | Utilities 
related to storing authentication credentials 7 | 8 | 9 | 10 | --- 11 | 12 | 13 | 14 | ## class `PubAuths` 15 | Collection of the authorizers needed for publication 16 | 17 | 18 | 19 | **Attributes:** 20 | 21 | - `transfer_client`: Client with credentials to perform transfers 22 | - `auth_client_openid`: Client with permissions to get user IDs 23 | - `endpoint_auth_clients`: Mapping between endpoint ID and client that can authorize access to it 24 | 25 | 26 | 27 | ### method `__init__` 28 | 29 | ```python 30 | __init__( 31 | transfer_client: TransferClient, 32 | auth_client_openid: AuthClient, 33 | endpoint_auth_clients: Dict[str, AuthClient] 34 | ) → None 35 | ``` 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | --- 48 | 49 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 50 | -------------------------------------------------------------------------------- /docs/foundry.foundry_cache.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.foundry_cache` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `FoundryCache` 17 | The FoundryCache manages the local storage of FoundryDataset objects 18 | 19 | 20 | 21 | ### method `__init__` 22 | 23 | ```python 24 | __init__( 25 | forge_client: Forge, 26 | transfer_client: Any, 27 | use_globus, 28 | interval, 29 | parallel_https, 30 | verbose, 31 | local_cache_dir: str = None 32 | ) 33 | ``` 34 | 35 | Initializes a FoundryCache object. 36 | 37 | 38 | 39 | **Args:** 40 | 41 | - `forge_client` (Forge): The Forge client object. 42 | - `transfer_client` (Any): The transfer client object. 43 | - `use_globus` (bool): Flag indicating whether to use Globus for downloading. 44 | - `interval` (int): How long to wait between checks of the Globus transfer status. 45 | - `parallel_https` (int): Number of threads to use for downloading via HTTP. 46 | - `verbose` (bool): Flag indicating whether to produce more debug messages. 47 | - `local_cache_dir` (str, optional): The local cache directory. Defaults to None. If not specified, falls back to either the environment variable 'FOUNDRY_LOCAL_CACHE_DIR' or './data/'. 48 | 49 | 50 | 51 | 52 | --- 53 | 54 | 55 | 56 | ### method `clear_cache` 57 | 58 | ```python 59 | clear_cache(dataset_name: str = None) 60 | ``` 61 | 62 | Deletes locally stored datasets 63 | 64 | 65 | 66 | **Arguments:** 67 | 68 | - `dataset_name` (str): Optional name of a specific dataset. If omitted, all datasets will be erased 69 | 70 | --- 71 | 72 | 73 | 74 | ### method `download_to_cache` 75 | 76 | ```python 77 | download_to_cache(dataset_name: str, splits: List[Split] = None) 78 | ``` 79 | 80 | Checks if the data is downloaded, and if not, downloads the data from source to local storage. 81 | 82 | 83 | 84 | **Args:** 85 | 86 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 87 | - `splits` (List[FoundrySplit], optional): List of splits in the dataset. Defaults to None. 88 | 89 | 90 | 91 | **Returns:** 92 | 93 | - `FoundryCache`: The FoundryCache object. 94 | 95 | --- 96 | 97 | 98 | 99 | ### method `download_via_globus` 100 | 101 | ```python 102 | download_via_globus(dataset_name: str) 103 | ``` 104 | 105 | Downloads selected dataset over Globus. 106 | 107 | 108 | 109 | **Args:** 110 | 111 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 
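Taken together, the methods above support a simple workflow: construct a cache, download a dataset, validate the local copy, and clear it again. The following is a minimal, illustrative sketch; in typical use the `Foundry` client constructs the `FoundryCache` for you, and the default `Forge()` instantiation and the dataset name here are assumptions for demonstration.

```python
from mdf_forge import Forge
from foundry.foundry_cache import FoundryCache

# Assumed setup: a default Forge client; no transfer client is needed
# when use_globus=False, since files are fetched over HTTPS instead.
cache = FoundryCache(
    forge_client=Forge(),
    transfer_client=None,
    use_globus=False,   # download via parallel HTTPS threads
    interval=10,        # seconds between Globus transfer status checks
    parallel_https=4,   # number of HTTPS download threads
    verbose=False,
    local_cache_dir="./data",
)

# Download (if not already cached), verify, then remove a dataset
cache.download_to_cache("_test_example_iris_v1.1")
if cache.validate_local_dataset_storage("_test_example_iris_v1.1"):
    cache.clear_cache(dataset_name="_test_example_iris_v1.1")
```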
112 | 113 | --- 114 | 115 | 116 | 117 | ### method `download_via_http` 118 | 119 | ```python 120 | download_via_http(dataset_name: str) 121 | ``` 122 | 123 | Downloads selected dataset from MDF over HTTP. 124 | 125 | **Args:** 126 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 127 | 128 | --- 129 | 130 | 131 | 132 | ### method `get_keys` 133 | 134 | ```python 135 | get_keys( 136 | foundry_schema: FoundrySchema, 137 | type: str = None, 138 | as_object: bool = False 139 | ) 140 | ``` 141 | 142 | Get keys for a Foundry dataset 143 | 144 | 145 | 146 | **Arguments:** 147 | 148 | - `foundry_schema` (FoundrySchema): The schema from MDF that contains the keys 149 | - `type` (str): The type of key to be returned, e.g., "input", "target" 150 | - `as_object` (bool): When ``False``, returns a list of keys as strings; when ``True``, returns the full key objects. **Default:** ``False`` 151 | 152 | Returns: (list) String representations of the keys if ``as_object`` is ``False``; otherwise the full key objects. 153 | 154 | --- 155 | 156 | 157 | ### method `load_as_dict` 158 | 159 | ```python 160 | load_as_dict( 161 | split: str, 162 | dataset_name: str, 163 | foundry_schema: FoundrySchema, 164 | as_hdf5: bool 165 | ) 166 | ``` 167 | 168 | Load the data associated with the specified dataset and return it as a labeled dictionary of tuples. 169 | 170 | 171 | 172 | **Args:** 173 | 174 | - `split` (str): Split to load the data from. 175 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 176 | - `foundry_schema` (FoundrySchema, optional): FoundrySchema object. Defaults to None. 177 | - `as_hdf5` (bool, optional): If True and dataset is in HDF5 format, keep data in HDF5 format. Defaults to False. 178 | 179 | 180 | 181 | **Returns:** 182 | 183 | - `dict`: A labeled dictionary of tuples containing the loaded data. 184 | 185 | --- 186 | 187 | 188 | 189 | ### method `load_as_tensorflow` 190 | 191 | ```python 192 | load_as_tensorflow( 193 | split: str, 194 | dataset_name: str, 195 | foundry_schema: FoundrySchema, 196 | as_hdf5: bool 197 | ) 198 | ``` 199 | 200 | Convert Foundry Dataset to a Tensorflow Sequence 201 | 202 | 203 | 204 | **Arguments:** 205 | 206 | - `split` (string): Split to create Tensorflow Sequence on. **Default:** ``None`` 207 | 208 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split 209 | 210 | --- 211 | 212 | 213 | 214 | ### method `load_as_torch` 215 | 216 | ```python 217 | load_as_torch( 218 | split: str, 219 | dataset_name: str, 220 | foundry_schema: FoundrySchema, 221 | as_hdf5: bool 222 | ) 223 | ``` 224 | 225 | Convert Foundry Dataset to a PyTorch Dataset 226 | 227 | 228 | 229 | **Arguments:** 230 | 231 | - `split` (string): Split to create PyTorch Dataset on. **Default:** ``None`` 232 | 233 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split 234 | 235 | --- 236 | 237 | 238 | 239 | ### method `validate_local_dataset_storage` 240 | 241 | ```python 242 | validate_local_dataset_storage(dataset_name: str, splits: List[Split] = None) 243 | ``` 244 | 245 | Verifies that the local storage location exists and all expected files are present. 246 | 247 | 248 | 249 | **Args:** 250 | 251 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF). 252 | - `splits` (List[FoundrySplit], optional): Labels of splits to be loaded. Defaults to None. 
255 | 256 | 257 | 258 | **Returns:** 259 | 260 | - `bool`: True if the dataset exists and contains all the desired files; False otherwise. 261 | 262 | 263 | 264 | 265 | --- 266 | 267 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 268 | -------------------------------------------------------------------------------- /docs/foundry.foundry_dataset.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.foundry_dataset` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `FoundryDataset` 17 | Representation of an individual dataset. Provides access to metadata as well as functions to instantiate data into memory in different formats. 18 | 19 | 20 | 21 | **Args:** 22 | 23 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF) 24 | - `datacite_entry` (FoundryDatacite): Datacite entry for the dataset 25 | - `foundry_schema` (FoundrySchema): Schema for the dataset 26 | - `foundry_cache` (FoundryCache): Cache for the dataset 27 | 28 | Desired functions: 29 | - Get as pandas 30 | - Get as tensorflow dataset 31 | - Get as pytorch dataset 32 | - Get file list 33 | - Set metadata 34 | - Attach datafiles 35 | - Validate against schema 36 | - Get citation 37 | 38 | 39 | 40 | ### method `__init__` 41 | 42 | ```python 43 | __init__( 44 | dataset_name: str, 45 | datacite_entry: FoundryDatacite, 46 | foundry_schema: FoundrySchema, 47 | foundry_cache: FoundryCache = None 48 | ) 49 | ``` 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | --- 59 | 60 | 61 | 62 | ### method `add_data` 63 | 64 | ```python 65 | add_data(local_data_path: str = None, globus_data_source: str = None) 66 | ``` 67 | 68 | Add data to the dataset. User must provide the location of the data as either a `globus_data_source` or `local_data_path`. 69 | 70 | 71 | 72 | **Arguments:** 73 | 74 | - `local_data_path` (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the data on their own Globus endpoint. User must choose either `globus_data_source` or `local_data_path` to publish their data. 75 | - `globus_data_source` (str): URL path for a data folder on a Globus endpoint; the URL can be obtained through the Globus Web UI or SDK. If None, the user must specify a 'local_data_path' pointing to the location of the data on their local machine. User must choose either `globus_data_source` or `local_data_path` to publish their data. 76 | 77 | --- 78 | 79 | 80 | 81 | ### method `clean_dc_dict` 82 | 83 | ```python 84 | clean_dc_dict() 85 | ``` 86 | 87 | Clean the Datacite dictionary of None values 88 | 89 | --- 90 | 91 | 92 | 93 | ### method `clear_dataset_cache` 94 | 95 | ```python 96 | clear_dataset_cache() 97 | ``` 98 | 99 | Deletes the cached data for this specific dataset 100 | 101 | --- 102 | 103 | 104 | 105 | ### method `delete_none` 106 | 107 | ```python 108 | delete_none(_dict) 109 | ``` 110 | 111 | Delete None values recursively from all of the dictionaries 112 | 113 | --- 114 | 115 | 116 | 117 | ### method `get_as_dict` 118 | 119 | ```python 120 | get_as_dict(split: str = None, as_hdf5: bool = False) 121 | ``` 122 | 123 | Returns the data from the dataset as a dictionary 124 | 125 | 126 | 127 | **Arguments:** 128 | 129 | - `split` (string): Split to create dataset on. **Default:** ``None`` 
130 | 131 | 132 | Returns: (dict) Dictionary of all the data from the specified split 133 | 134 | --- 135 | 136 | 137 | 138 | ### method `get_as_tensorflow` 139 | 140 | ```python 141 | get_as_tensorflow(split: str = None) 142 | ``` 143 | 144 | Convert Foundry Dataset to a Tensorflow Sequence 145 | 146 | 147 | 148 | **Arguments:** 149 | 150 | - `split` (string): Split to create Tensorflow Sequence on. **Default:** ``None`` 151 | 152 | 153 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split 154 | 155 | --- 156 | 157 | 158 | 159 | ### method `get_as_torch` 160 | 161 | ```python 162 | get_as_torch(split: str = None) 163 | ``` 164 | 165 | Returns the data from the dataset as a TorchDataset 166 | 167 | 168 | 169 | **Arguments:** 170 | 171 | - `split` (string): Split to create PyTorch Dataset on. **Default:** ``None`` 172 | 173 | 174 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split 175 | 176 | --- 177 | 178 | 179 | 180 | ### method `get_citation` 181 | 182 | ```python 183 | get_citation() → str 184 | ``` 185 | 186 | 187 | 188 | 189 | 190 | --- 191 | 192 | 211 | 212 | 213 | 214 | 215 | ### method `validate_metadata` 216 | 217 | ```python 218 | validate_metadata(metadata) 219 | ``` 220 | 221 | Validate the JSON message against the FoundryDataset model 222 | 223 | 224 | 225 | **Arguments:** 226 | 227 | - `metadata` (dict): Metadata information provided by the user. 228 | 229 | 230 | 231 | **Raises:** 232 | 233 | - `ValidationError`: if the metadata supplied by the user does not meet the specification of a FoundryDataset object. 234 | 235 | 236 | 237 | 238 | --- 239 | 240 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 241 | -------------------------------------------------------------------------------- /docs/foundry.https_download.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.https_download` 6 | Methods to download files from a Globus endpoint 7 | 8 | 9 | --- 10 | 11 | 12 | 13 | ## function `recursive_ls` 14 | 15 | ```python 16 | recursive_ls(tc: TransferClient, ep: str, path: str, max_depth: int = 3) 17 | ``` 18 | 19 | Find all files in a Globus directory recursively 20 | 21 | 22 | 23 | **Args:** 24 | 25 | - `tc`: TransferClient authorized to access the directory 26 | - `ep`: Endpoint on which the files reside 27 | - `path`: Path to the files being downloaded 28 | - `max_depth`: Maximum recurse depth 29 | 30 | 31 | 32 | **Yields:** 33 | Dictionaries describing the location of the files. 
Each includes at least 34 | - `"name"`: Name of the file 35 | - `"path"`: Absolute path to the file's location 36 | 37 | 38 | --- 39 | 40 | 41 | 42 | ## function `download_file` 43 | 44 | ```python 45 | download_file(item, base_directory, https_config, timeout=1800) 46 | ``` 47 | 48 | Download a file to disk 49 | 50 | 51 | 52 | **Args:** 53 | 54 | - `item`: Dictionary defining the path to the file 55 | - `base_directory`: Base directory for storing downloaded files 56 | - `https_config`: Configuration defining the URL of the server and the name of the dataset 57 | - `timeout`: Timeout for the download request in seconds (default: 1800) 58 | 59 | 60 | 61 | 62 | --- 63 | 64 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 65 | -------------------------------------------------------------------------------- /docs/foundry.https_upload.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.https_upload` 6 | Private utility methods to upload files and/or folders to Globus using HTTPS instead of Globus Transfer. 7 | 8 | 9 | --- 10 | 11 | 12 | 13 | ## function `upload_to_endpoint` 14 | 15 | ```python 16 | upload_to_endpoint( 17 | auths: PubAuths, 18 | local_data_path: str, 19 | endpoint_id: str = '82f1b5c6-6e9b-11e5-ba47-22000b92c6ec', 20 | dest_parent: str = None, 21 | dest_child: str = None 22 | ) → Tuple[str, str] 23 | ``` 24 | 25 | Upload local data to a Globus endpoint using HTTPS PUT requests. Data can be a folder or an individual file. 26 | 27 | **Args:** 28 | 29 | - `auths` (PubAuths): Dataclass of authorizers needed for upload. Includes `transfer_client`, `auth_client_openid`, and `endpoint_auth_clients`, which is a Dict of `endpoint_id`: AuthClient mappings. 30 | - `local_data_path` (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is transferred to MDF. 31 | - `endpoint_id` (str): Globus endpoint ID to upload the data to. Default is the NCSA endpoint. Must match an `endpoint_id` authorized in `auths.endpoint_auth_clients`. 32 | 33 | **Returns:** 34 | (str) Globus data source URL: URL pointing to the data on the Globus endpoint 35 | 36 | 37 | 38 | 39 | --- 40 | 41 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 42 | -------------------------------------------------------------------------------- /docs/foundry.loaders.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.loaders` 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | --- 15 | 16 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 17 | -------------------------------------------------------------------------------- /docs/foundry.loaders.tf_wrapper.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.loaders.tf_wrapper` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `TensorflowSequence` 17 | Foundry Dataset Converted to Tensorflow Format 18 | 19 | 20 | 21 | ### method `__init__` 22 | 23 | ```python 24 | __init__(inputs, targets) 25 | ``` 26 | 27 | 28 | 29 | 30 | 31 | 32 | --- 33 | 34 | #### property max_queue_size 35 | 36 | 37 | 38 | 39 | 40 | --- 41 | 42 | #### property num_batches 43 | 44 | Number of batches in the PyDataset. 
45 | 46 | 47 | 48 | **Returns:** 49 | The number of batches in the PyDataset or `None` to indicate that the dataset is infinite. 50 | 51 | --- 52 | 53 | #### property use_multiprocessing 54 | 55 | 56 | 57 | 58 | 59 | --- 60 | 61 | #### property workers 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | --- 73 | 74 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 75 | -------------------------------------------------------------------------------- /docs/foundry.loaders.torch_wrapper.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.loaders.torch_wrapper` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `TorchDataset` 17 | Foundry Dataset Converted to Pytorch Format 18 | 19 | 20 | 21 | ### method `__init__` 22 | 23 | ```python 24 | __init__(inputs, targets) 25 | ``` 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | --- 38 | 39 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 40 | -------------------------------------------------------------------------------- /docs/foundry.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry` 6 | 7 | 8 | 9 | 10 | **Global Variables** 11 | --------------- 12 | - **auth** 13 | - **https_download** 14 | - **jsonschema_models** 15 | - **models** 16 | - **utils** 17 | - **foundry_cache** 18 | - **foundry_dataset** 19 | - **https_upload** 20 | - **foundry** 21 | 22 | 23 | 24 | 25 | --- 26 | 27 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 28 | -------------------------------------------------------------------------------- /docs/foundry.models.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.models` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 | 15 | 16 | ## class `FoundrySpecificationDataset` 17 | Pydantic base class for datasets within the Foundry data package specification 18 | 19 | 20 | --- 21 | 22 | #### property model_extra 23 | 24 | Get extra fields set during validation. 25 | 26 | 27 | 28 | **Returns:** 29 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 30 | 31 | --- 32 | 33 | #### property model_fields_set 34 | 35 | Returns the set of fields that have been explicitly set on this model instance. 36 | 37 | 38 | 39 | **Returns:** 40 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 41 | 42 | 43 | 44 | 45 | --- 46 | 47 | 48 | 49 | ## class `FoundrySpecification` 50 | Pydantic base class for interacting with the Foundry data package specification The specification provides a way to group datasets and manage versions 51 | 52 | 53 | --- 54 | 55 | #### property model_extra 56 | 57 | Get extra fields set during validation. 58 | 59 | 60 | 61 | **Returns:** 62 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 63 | 64 | --- 65 | 66 | #### property model_fields_set 67 | 68 | Returns the set of fields that have been explicitly set on this model instance. 69 | 70 | 71 | 72 | **Returns:** 73 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 
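To make the specification concrete, here is a minimal sketch that builds a data package using this class's fields and the dependency-management methods documented just below. The dataset names are borrowed from the bandgap example package elsewhere in this repository; treating `dependencies` as a name-to-version mapping follows that example and is an assumption.

```python
from foundry.models import FoundrySpecification

# Group several datasets into one versioned data package
spec = FoundrySpecification(
    name="Band Gap Analysis",
    version="1.0.0",
    description="Datasets for band gap uber model generation",
    private=True,
    dependencies={},  # assumed: mapping of dataset name -> version
)
spec.add_dependency(name="_test_foundry_mp_bandgap_v1.1", version="1.1")
spec.add_dependency(name="_test_foundry_oqmd_bandgap_v1.1", version="1.1")
spec.add_dependency(name="_test_foundry_oqmd_bandgap_v1.1", version="1.1")

# Prune the repeated entry before the package is used
spec.remove_duplicate_dependencies()
```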
74 | 75 | 76 | 77 | --- 78 | 79 | 80 | 81 | ### method `add_dependency` 82 | 83 | ```python 84 | add_dependency(name: str, version: str) 85 | ``` 86 | 87 | 88 | 89 | 90 | 91 | --- 92 | 93 | 94 | 95 | ### method `clear_dependencies` 96 | 97 | ```python 98 | clear_dependencies() 99 | ``` 100 | 101 | 102 | 103 | 104 | 105 | --- 106 | 107 | 108 | 109 | ### method `model_dump` 110 | 111 | ```python 112 | model_dump() 113 | ``` 114 | 115 | 116 | 117 | 118 | 119 | --- 120 | 121 | 122 | 123 | ### method `remove_duplicate_dependencies` 124 | 125 | ```python 126 | remove_duplicate_dependencies() 127 | ``` 128 | 129 | 130 | 131 | 132 | 133 | 134 | --- 135 | 136 | 137 | 138 | ## class `FoundryDatasetType` 139 | Foundry Dataset Types Enumeration of the possible Foundry dataset types 140 | 141 | 142 | 143 | 144 | 145 | --- 146 | 147 | 148 | 149 | ## class `FoundrySchema` 150 | A model for the Foundry schema based on the FoundryModel (project_model.py) class. 151 | 152 | 153 | 154 | ### method `__init__` 155 | 156 | ```python 157 | __init__(project_dict: Dict[str, Any]) 158 | ``` 159 | 160 | 161 | 162 | 163 | 164 | 165 | --- 166 | 167 | #### property model_extra 168 | 169 | Get extra fields set during validation. 170 | 171 | 172 | 173 | **Returns:** 174 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 175 | 176 | --- 177 | 178 | #### property model_fields_set 179 | 180 | Returns the set of fields that have been explicitly set on this model instance. 181 | 182 | 183 | 184 | **Returns:** 185 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 186 | 187 | 188 | 189 | 190 | --- 191 | 192 | 193 | 194 | ## class `FoundryDatacite` 195 | A model for the Datacite schema based on the Datacite (dc_model.py) class. 196 | 197 | 198 | 199 | ### method `__init__` 200 | 201 | ```python 202 | __init__(datacite_dict: Dict[str, Any], **kwargs) 203 | ``` 204 | 205 | 206 | 207 | 208 | 209 | 210 | --- 211 | 212 | #### property model_extra 213 | 214 | Get extra fields set during validation. 215 | 216 | 217 | 218 | **Returns:** 219 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 220 | 221 | --- 222 | 223 | #### property model_fields_set 224 | 225 | Returns the set of fields that have been explicitly set on this model instance. 226 | 227 | 228 | 229 | **Returns:** 230 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 231 | 232 | 233 | 234 | 235 | --- 236 | 237 | 238 | 239 | ## class `FoundryBase` 240 | Configuration information for Foundry instance 241 | 242 | 243 | --- 244 | 245 | #### property model_extra 246 | 247 | Get extra fields set during validation. 248 | 249 | 250 | 251 | **Returns:** 252 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`. 253 | 254 | --- 255 | 256 | #### property model_fields_set 257 | 258 | Returns the set of fields that have been explicitly set on this model instance. 259 | 260 | 261 | 262 | **Returns:** 263 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults. 
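`FoundrySchema` and `FoundryDatacite` (documented above) are both constructed from plain dictionaries. The sketch below is illustrative only: the abbreviated dictionary contents are assumptions, and real records must carry the full Foundry project and DataCite metadata required by the underlying models.

```python
from foundry.models import FoundryDatacite, FoundrySchema

# Illustrative, abbreviated metadata; real entries include all
# required DataCite and Foundry project fields.
dc = FoundryDatacite({
    "titles": [{"title": "Example iris dataset"}],
    "creators": [{"creatorName": "Scourtas, A."}],
})

schema = FoundrySchema({
    "short_name": "example_AS_iris",
    "data_type": "tabular",
    "keys": [{"key": ["sepal_length"], "type": "input"}],
})
```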
264 | 265 | 266 | 267 | --- 268 | 269 | 270 | 271 | ### method `model_dump` 272 | 273 | ```python 274 | model_dump() 275 | ``` 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | --- 285 | 286 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 287 | -------------------------------------------------------------------------------- /docs/foundry.utils.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # module `foundry.utils` 6 | 7 | 8 | 9 | 10 | 11 | --- 12 | 13 | 14 | 15 | ## function `is_pandas_pytable` 16 | 17 | ```python 18 | is_pandas_pytable(group) 19 | ``` 20 | 21 | 22 | 23 | 24 | 25 | 26 | --- 27 | 28 | 29 | 30 | ## function `is_doi` 31 | 32 | ```python 33 | is_doi(string: str) 34 | ``` 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | --- 44 | 45 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._ 46 | -------------------------------------------------------------------------------- /docs/how-to-contribute/code_of_conduct.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Read our pledge and Code of Conduct for contributing 3 | --- 4 | 5 | # Contributor Covenant 6 | 7 | ## Our Pledge 8 | 9 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 10 | 11 | ## Our Standards 12 | 13 | Examples of behavior that contributes to creating a positive environment include: 14 | 15 | * Using welcoming and inclusive language 16 | * Being respectful of differing viewpoints and experiences 17 | * Gracefully accepting constructive criticism 18 | * Focusing on what is best for the community 19 | * Showing empathy towards other community members 20 | 21 | Examples of unacceptable behavior by participants include: 22 | 23 | * The use of sexualized language or imagery and unwelcome sexual attention or 24 | 25 | advances 26 | 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | 31 | address, without explicit permission 32 | 33 | * Other conduct which could reasonably be considered inappropriate in a 34 | 35 | professional setting 36 | 37 | ## Our Responsibilities 38 | 39 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 42 | 43 | ## Scope 44 | 45 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 
Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 46 | 47 | ## Enforcement 48 | 49 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at foundry@uchicago.edu. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 50 | 51 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 52 | 53 | ## Attribution 54 | 55 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) 56 | 57 | For answers to common questions about this code of conduct, see [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq) 58 | 59 | -------------------------------------------------------------------------------- /docs/how-to-contribute/contributing.md: -------------------------------------------------------------------------------- 1 | # Contribution Process 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, email, or any other method with the owners of this repository before making a change. 4 | 5 | Please note we have a code of conduct, please follow it in all your interactions with the project. 6 | 7 | ## Contributing code 8 | 9 | If you have improvements to Foundry, send us your pull requests! For those just getting started, Github has a [how to](https://help.github.com/articles/using-pull-requests/). 10 | 11 | If you want to contribute, start working through the Foundry codebase, navigate to the [Github "issues" tab](https://github.com/MLMI2-CSSI/foundry/issues) and start looking through interesting issues. If you are not sure of where to start, then start by trying one of the smaller/easier issues here i.e. [issues with the "good first issue" label](https://github.com/MLMI2-CSSI/foundry/labels/good%20first%20issue). These are issues that we believe are particularly well suited for outside contributions. If you want to help out, but not alone, use the issue comment thread to coordinate. 12 | 13 | ### General guidelines and philosophy for contribution 14 | 15 | * Include unit tests when you contribute new features, as they help to a\) 16 | 17 | prove that your code works correctly, and b\) guard against future breaking 18 | 19 | changes to lower the maintenance cost. 20 | 21 | * Bug fixes also generally require unit tests, because the presence of bugs 22 | 23 | usually indicates insufficient test coverage. 24 | 25 | * Keep API compatibility in mind when you change code in Foundry, 26 | * When you contribute a new feature to Foundry, the maintenance burden is 27 | 28 | \(by default\) transferred to the Foundry team. 
This means that the benefit 29 | 30 | of the contribution must be compared against the cost of maintaining the 31 | 32 | feature. 33 | 34 | * Tests should follow [testing best practices](https://www..org/community/contribute/tests) 35 | 36 | guide. 37 | 38 | ## Pull Request Process 39 | 40 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 41 | 42 | build. 43 | 44 | 2. Update the README.md with details of changes to the interface, this includes new environment 45 | 46 | variables, exposed ports, useful file locations and container parameters. 47 | 48 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 49 | 50 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). 51 | 52 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 53 | 54 | do not have permission to do that, you may request the second reviewer to merge it for you. 55 | 56 | -------------------------------------------------------------------------------- /docs/publishing-datasets.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish datasets 3 | --- 4 | 5 | # Publishing Datasets 6 | 7 | In order to publish datasets, the datasets must 1\) adhere to specified Foundry dataset shapes \([see here](publishing-datasets.md#shaping-datasets)\), and 2\) be described with required information \([see here](publishing-datasets.md#describing-datasets)\). Together, the dataset shape and description enable researchers to reuse the datasets more easily. 8 | 9 | ## Examples 10 | 11 | [Skip to the publication example notebook.](https://github.com/MLMI2-CSSI/foundry/blob/master/examples/foundry_publication_example.ipynb) 12 | 13 | ## Shaping Datasets 14 | 15 | For a general dataset to be translated into a usable Foundry dataset, it should follow one of the prescribed shapes. It should also be described by a Key object, which provides a mapping that allows Foundry to read data from the underlying data structure into usable Python objects \([see Describing Datasets](publishing-datasets.md#describing-datasets) for more info\). 16 | 17 | ### **Tabular Data** 18 | 19 | Tabular data should include in a form where columns represent the different keys of the data and rows represent individual entries. 20 | 21 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap | 22 | | :--- | :--- | :--- | :--- | 23 | | 0.10 | 0.52 | 1 | 1.40 | 24 | | 0.34 | 0.910 | 0 | 0.73 | 25 | | ... | ... | ... 
26 | 27 | For this example dataset the `Key` object could be: 28 | 29 | ```text 30 | "keys":[{ 31 | "key": "feature_1", 32 | "type": "input", 33 | "units": None, 34 | "description": "This is feature 1" 35 | },{ 36 | "key": "feature_2", 37 | "type": "input", 38 | "units": None, 39 | "description": "This is feature 2" 40 | },{ 41 | "key": "material_type", 42 | "type": "input", 43 | "units": None, 44 | "description": "This is the material type", 45 | "labels":["perovskite","not perovskite"] 46 | },{ 47 | "key": "band_gap", 48 | "type": "target", 49 | "units": "eV", 50 | "description": "This is the simulated band gap in eV" 51 | } 52 | ] 53 | ``` 54 | 55 | {% hint style="info" %} 56 | `This tabular data file should be saved in the base directory as` **`foundry_dataframe.json`** 57 | {% endhint %} 58 | 59 | ### Hierarchical Data 60 | 61 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be: 62 | 63 | ```text 64 | "keys":[{ 65 | "key": "/data/arr1", 66 | "type": "input", 67 | "units": None, 68 | "description": "This is an array containing input data" 69 | },{ 70 | "key": "/other_data/arr2", 71 | "type": "input", 72 | "units": None, 73 | "description": "This is another array containing input data" 74 | },{ 75 | "key": "/data/band_gaps", 76 | "type": "target", 77 | "units": "eV", 78 | "description": "This is the simulated band gap in eV" 79 | } 80 | ] 81 | ``` 82 | 83 | ## Describing Datasets 84 | 85 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema. 86 | 87 | **Keys \(object\):** Key objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Key objects have the following properties 88 | 89 | * **`key (str)`**: A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\) 90 | * **`type (str)`**: The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_ 91 | * **`units (str) [optional]`**: The scientific units associated with a key. _Default: None_ 92 | * **`description (str) [optional]`**: A free text description of the key. _Default: None_ 93 | * **`labels (list[str]) [optional]`**: A list of strings mapped to integers in a key column 94 | 95 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading easier. 96 | 97 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. 
_Options \["tabular","hdf5"\]_ 98 | 99 | ```text 100 | "foundry": { 101 | "dc": {}, 102 | "keys": [{ 103 | "type": "input", 104 | "name": "feature_1", 105 | "units": "", 106 | "description": "This is an input" 107 | }, 108 | { 109 | "type": "target", 110 | "name": "band_gap", 111 | "units": "eV", 112 | "description": "blah blah", 113 | "labels": [] 114 | } 115 | ], 116 | "short_name": "my_short_name", 117 | "type": "tabular" 118 | } 119 | ``` 120 | 121 | ## Publishing 122 | 123 | {% hint style="info" %} 124 | Before continuing, be sure that you have 1\) signed up for a [free Globus account](https://app.globus.org) and 2\) [joined this Globus group](https://app.globus.org/groups/cc192dca-3751-11e8-90c1-0a7c735d220a/about). 125 | {% endhint %} 126 | 127 | Once your dataset is in the proper shape, and you have created the associated metadata structure, you can publish to Foundry! 128 | 129 | Currently, you can publish any dataset you have stored on a Globus endpoint or Google Drive. In the following, assume your [previously defined metadata](publishing-datasets.md#describing-datasets) are stored in `metadata`: 130 | 131 | ```python 132 | from foundry import Foundry 133 | 134 | # Globus endpoint URL where your dataset is located 135 | data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry%2F_test_blaiszik_foundry_iris_v1.2%2F" 136 | 137 | # full title of dataset 138 | title = "Scourtas example iris dataset" 139 | 140 | # authors to list 141 | authors = ["A Scourtas", "B Blaiszik"] 142 | 143 | # shorthand title (optional) 144 | short_name = "example_AS_iris" 145 | 146 | # affiliations of authors (optional) 147 | affiliations = ["Globus Labs, UChicago"] 148 | 149 | # publisher of the data (optional) 150 | publisher = "Materials Data Facility" 151 | 152 | # publication year (optional) 153 | publication_year = 2021 154 | 155 | 156 | f = Foundry() 157 | res = f.publish(metadata, data_source, title, authors, short_name=short_name) 158 | ``` 159 | 160 | The `publish()` method returns a result object that you can inspect for information about the state of the publication. For the above publication, `res` would have the format: 161 | 162 | ```python 163 | {'error': None, 164 | 'source_id': '_test_example_iris_v1.1', 165 | 'status_code': 202, 166 | 'success': True} 167 | ``` 168 | 169 | 170 | 171 | ## Future Work 172 | 173 | * Add support for wildcard key type specifications 174 | * Add link to example publication 175 | 176 | -------------------------------------------------------------------------------- /docs/publishing-models.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish models 3 | --- 4 | 5 | # Publishing Models 6 | 7 | In addition to datasets, you can publish models \(or even individual Python methods\) to Foundry and run them in the cloud! 
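While the sections below are still being filled in, the overall flow mirrors the dataset case: describe the model with metadata, then hand it to `publish_model()`. A minimal, illustrative sketch follows; the field values are placeholders, and the exact `servable` layout depends on the model type (see the Keras example on the Troubleshooting page).

```python
from foundry import Foundry

f = Foundry()

# Describe the fitted model; values here are placeholders
options = {
    "title": "Example band gap regressor",
    "short_name": "bandgap-reg-1",
    "authors": ["Scientist, Awesome"],
    "servable": {
        "type": "keras",              # model framework
        "model_path": "bandgap.hdf5"  # path to the fitted model file
    }
}
res = f.publish_model(options)
```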
8 | 9 | ## Examples 10 | 11 | Model publication example notebook coming soon 12 | 13 | ## Model Types 14 | 15 | You can publish any of the following types of models or functions to Foundry: 16 | 17 | * [Scikit-Learn models](publishing-models.md#scikit-learn-models) 18 | * [Tensorflow 1 & 2 models](publishing-models.md#tensorflow-1-and-2-models) 19 | * [Keras models](publishing-models.md#keras-models) 20 | * [PyTorch models](publishing-models.md#keras-models) 21 | * [Class methods \(advanced use\)](publishing-models.md#class-methods) 22 | * [Static methods \(advanced use\)](publishing-models.md#static-methods) 23 | 24 | ### Scikit-Learn models 25 | 26 | ### Tensorflow 1 & 2 models 27 | 28 | ### Keras models 29 | 30 | ### PyTorch models 31 | 32 | ### Class methods 33 | 34 | ### Static methods 35 | 36 | ## Data Types for Inputs and Targets 37 | 38 | ## Describing Models 39 | 40 | Before you can publish a model, you need to describe it using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). 41 | 42 | ## Publishing 43 | 44 | ## Future Work 45 | 46 | -------------------------------------------------------------------------------- /docs/publishing/publishing-datasets.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish datasets 3 | --- 4 | 5 | # Publishing Datasets 6 | 7 | In order to publish datasets, the datasets must 1\) adhere to specified Foundry dataset shapes \([see here](publishing-datasets.md#shaping-datasets)\), and 2\) be described with required information \([see here](publishing-datasets.md#describing-datasets)\). Together, the dataset shape and description enable researchers to reuse the datasets more easily. 8 | 9 | ## Examples 10 | 11 | [Skip to the publication example notebook.](https://github.com/MLMI2-CSSI/foundry/blob/master/examples/foundry_publication_example.ipynb) 12 | 13 | ## Shaping Datasets 14 | 15 | For a general dataset to be translated into a usable Foundry dataset, it should follow one of the prescribed shapes. It should also be described by a `Key` object, which provides a mapping that allows Foundry to read data from the underlying data structure into usable Python objects \([see Describing Datasets](publishing-datasets.md#describing-datasets) for more info\). 16 | 17 | ### **Tabular Data** 18 | 19 | Tabular data should include in a form where columns represent the different keys of the data and rows represent individual entries. 20 | 21 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap | 22 | | :--- | :--- | :--- | :--- | 23 | | 0.10 | 0.52 | 1 | 1.40 | 24 | | 0.34 | 0.910 | 0 | 0.73 | 25 | | ... | ... | ... 
26 | 27 | For this example dataset the `keys` list could be: 28 | 29 | ```text 30 | "keys":[{ 31 | "key": "feature_1", 32 | "type": "input", 33 | "units": None, 34 | "description": "This is feature 1" 35 | },{ 36 | "key": "feature_2", 37 | "type": "input", 38 | "units": None, 39 | "description": "This is feature 2" 40 | },{ 41 | "key": "material_type", 42 | "type": "input", 43 | "units": None, 44 | "description": "This is the material type", 45 | "labels":["perovskite","not perovskite"] 46 | },{ 47 | "key": "band_gap", 48 | "type": "target", 49 | "units": "eV", 50 | "description": "This is the simulated band gap in eV" 51 | } 52 | ] 53 | ``` 54 | 55 | {% hint style="info" %} 56 | `Don't forget to specify the tabular data file in the submitted metadata` 57 | {% endhint %} 58 | 59 | ### Hierarchical Data 60 | 61 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be: 62 | 63 | ```text 64 | "keys":[{ 65 | "key": "/data/arr1", 66 | "type": "input", 67 | "units": None, 68 | "description": "This is an array containing input data" 69 | },{ 70 | "key": "/other_data/arr2", 71 | "type": "input", 72 | "units": None, 73 | "description": "This is another array containing input data" 74 | },{ 75 | "key": "/data/band_gaps", 76 | "type": "target", 77 | "units": "eV", 78 | "description": "This is the simulated band gap in eV" 79 | } 80 | ] 81 | ``` 82 | 83 | ## Describing Datasets 84 | 85 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema. 86 | 87 | **Keys \(list\[Key\]\):** `Key` objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Individual `Key` objects have the following properties 88 | 89 | * **`key (str)`**: A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\) 90 | * **`type (str)`**: The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_ 91 | * **`units (str) [optional]`**: The scientific units associated with a key. _Default: None_ 92 | * **`description (str) [optional]`**: A free text description of the key. _Default: None_ 93 | * **`labels (list[str]) [optional]`**: A list of strings mapped to integers in a key column 94 | 95 | **Splits \(list\[Split\]\):** `Split` objects provide a way for users to specify which data should be included as test, train, or other user-defined splits. Individual `Split` objects have the following properties 96 | 97 | * **`type (str)`**: The type of split \(e.g., train, test\) 98 | * **`path (str)`**: The full filepath to the dataset file or directory that contains the split 99 | * **`label (str)`**: A label to assign to this split 100 | 101 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading easier. 102 | 103 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. 
_Options \["tabular","hdf5"\]_ 104 | 105 | ```text 106 | "foundry": { 107 | "dc": {}, 108 | "keys": [{ 109 | "type": "input", 110 | "name": "feature_1", 111 | "units": "", 112 | "description": "This is an input" 113 | }, 114 | { 115 | "type": "target", 116 | "name": "band_gap", 117 | "units": "eV", 118 | "description": "blah blah", 119 | "labels": [] 120 | } 121 | ], 122 | "short_name": "my_short_name", 123 | "data_type": "tabular" 124 | } 125 | ``` 126 | 127 | ## Publishing 128 | 129 | {% hint style="info" %} 130 | Before continuing, be sure that you have 1\) signed up for a [free Globus account](https://app.globus.org) and 2\) [joined this Globus group](https://app.globus.org/groups/cc192dca-3751-11e8-90c1-0a7c735d220a/about). 131 | {% endhint %} 132 | 133 | Once your dataset is in the proper shape, and you have created the associated metadata structure, you can publish to Foundry! An example is shown below. 134 | 135 | ```text 136 | "foundry": { 137 | "dc": {}, 138 | "keys": [{ 139 | "type": "input", 140 | "name": "feature_1", 141 | "units": "", 142 | "description": "This is an input" 143 | }, 144 | { 145 | "type": "target", 146 | "name": "band_gap", 147 | "units": "eV", 148 | "description": "blah blah", 149 | "labels": [] 150 | } 151 | ], 152 | "short_name": "my_short_name", 153 | "data_type": "tabular" 154 | } 155 | ``` 156 | 157 | Currently, you can publish any dataset you have stored on a Globus endpoint or Google Drive. In the following, assume your [previously defined metadata](publishing-datasets.md#describing-datasets) are stored in `metadata`: 158 | 159 | ```python 160 | from foundry import Foundry 161 | 162 | # Globus endpoint URL where your dataset is located 163 | data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry%2F_test_blaiszik_foundry_iris_v1.2%2F" 164 | 165 | # full title of dataset 166 | title = "Scourtas example iris dataset" 167 | 168 | # authors to list 169 | authors = ["A. Scourtas", "B. Blaiszik"] 170 | 171 | # shorthand title (optional) 172 | short_name = "example_AS_iris" 173 | 174 | # affiliations of authors (optional) 175 | affiliations = ["Globus Labs, UChicago"] 176 | 177 | # publisher of the data (optional) 178 | publisher = "Materials Data Facility" 179 | 180 | # publication year (optional) 181 | publication_year = 2021 182 | 183 | 184 | f = Foundry() 185 | res = f.publish(metadata, data_source, title, authors, short_name=short_name) 186 | ``` 187 | 188 | The `publish()` method returns a result object that you can inspect for information about the state of the publication. For the above publication, `res` would have the format: 189 | 190 | ```python 191 | {'error': None, 192 | 'source_id': '_test_example_iris_v1.1', 193 | 'status_code': 202, 194 | 'success': True} 195 | ``` 196 | 197 | 198 | 199 | ## Future Work 200 | 201 | * Add support for wildcard key type specifications 202 | * Add link to example publication 203 | 204 | -------------------------------------------------------------------------------- /docs/publishing/publishing-models.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Information on how to publish models 3 | --- 4 | 5 | # Publishing Models 6 | 7 | In addition to datasets, you can publish models \(or even individual Python methods\) to Foundry and run them in the cloud! 
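Once a model is published, it can be invoked through the DLHub service via `run()`, which takes the DLHub model name and JSON-serializable inputs (see the package reference earlier in these docs). A hedged sketch, assuming `run()` is exposed on the `Foundry` client as in that reference; the model name and inputs below are placeholders:

```python
from foundry import Foundry

f = Foundry()

# `name` is the DLHub model name; `inputs` must be JSON serializable
predictions = f.run("my_group/bandgap-reg-1", inputs=[[0.10, 0.52, 1]])
```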
8 | 9 | ## Examples 10 | 11 | Model publication example notebook coming soon 12 | 13 | ## Model Types 14 | 15 | You can publish any of the following types of models or functions to Foundry: 16 | 17 | * [Scikit-Learn models](publishing-models.md#scikit-learn-models) 18 | * [Tensorflow 1 & 2 models](publishing-models.md#tensorflow-1-and-2-models) 19 | * [Keras models](publishing-models.md#keras-models) 20 | * [PyTorch models](publishing-models.md#keras-models) 21 | * [Class methods \(advanced use\)](publishing-models.md#class-methods) 22 | * [Static methods \(advanced use\)](publishing-models.md#static-methods) 23 | 24 | ### Scikit-Learn models 25 | 26 | ### Tensorflow 1 & 2 models 27 | 28 | ### Keras models 29 | 30 | ### PyTorch models 31 | 32 | ### Class methods 33 | 34 | ### Static methods 35 | 36 | ## Data Types for Inputs and Targets 37 | 38 | ## Describing Models 39 | 40 | Before you can publish a model, you need to describe it using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). 41 | 42 | ## Publishing 43 | 44 | ## Future Work 45 | 46 | -------------------------------------------------------------------------------- /docs/support/troubleshooting.md: -------------------------------------------------------------------------------- 1 | --- 2 | description: Common pitfalls and issues and how to solve them 3 | --- 4 | 5 | # Troubleshooting 6 | 7 | ### Issues with loading or publishing Keras or Tensorflow models 8 | 9 | ![A common error that arises when there is a Keras or Tensorflow version mismatch](../.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png) 10 | 11 | There is a difference between the older, plain Keras package installed via `import keras`, and the currently maintained and up-to-date Keras package installed via `from tensorflow import keras`. Currently, the DLHub SDK \(which Foundry uses under-the-hood to publish, pull, and run models and functions\) uses whichever version of Keras you have installed. 12 | 13 | Errors can arise when `tf.keras` is used in one part of the model pipeline, but plain `keras` is used in another. 14 | 15 | If you have both versions of Keras installed \(which can be the case in common container environments, such as Google Colab\), DLHub will default to the plain Keras version, in case the user wants to use that with the newest version of Tensorflow. To override this functionality and use the Tensorflow Keras instead when publishing your model, pass the `force_tf_keras = True`option to `publish_model()`. 16 | 17 | ```python 18 | # Assume our fitted model is '7-fi-1.hdf5'. 19 | # Create the metadata for the model 20 | import os 21 | 22 | options_keras = { 23 | "title": "Bandgap-7-fidelity-MP-JARVIS-1", 24 | "short_name": "7-fi-1", 25 | "authors": ["Scientist, Awesome"], 26 | "servable": { 27 | "type": "keras", 28 | "model_path": "7-fi-1.hdf5", 29 | "custom_objects": {"softplus2": softplus2, 30 | "MEGNetLayer": MEGNetLayer, 31 | "Set2Set": Set2Set}, 32 | "force_tf_keras": True 33 | } 34 | } 35 | res = f.publish_model(options_keras) 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples using Foundry 2 | If you're wondering how to get started with Foundry or want to see it in action, you're in the right place! 3 | 4 | Each notebook walks through instantiating Foundry, loading data from Foundry, and working with the data in different ways. 
Some notebooks also use machine learning models with the data. 5 | 6 | Each folder contains a notebook and `requirements.txt` file. The notebooks can be run locally (using the `requirements.txt`) or in [Google Colab](https://colab.research.google.com/). 7 | 8 | If you have any trouble with the notebooks, please check our [documentation](https://ai-materials-and-chemistry.gitbook.io/foundry/v/docs/) or create an issue on the repo. 9 | -------------------------------------------------------------------------------- /examples/atom-position-finding/requirements.txt: -------------------------------------------------------------------------------- 1 | foundry_ml 2 | matplotlib 3 | -------------------------------------------------------------------------------- /examples/bandgap/foundry.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Band Gap Analysis", 3 | "version": "1.0.0", 4 | "description": "Datasets for band gap uber model generation", 5 | "private":true, 6 | "dependencies":{ 7 | "_test_foundry_experimental_bandgap_v1.1": "1.1", 8 | "_test_foundry_mp_bandgap_v1.1":"1.1", 9 | "_test_foundry_oqmd_bandgap_v1.1":"1.1", 10 | "_test_foundry_assorted_computational_bandgap_v1.1":"1.1" 11 | } 12 | } -------------------------------------------------------------------------------- /examples/bandgap/requirements.txt: -------------------------------------------------------------------------------- 1 | pymatgen 2 | matminer 3 | pandas 4 | matplotlib 5 | scikit-learn 6 | foundry_ml 7 | -------------------------------------------------------------------------------- /examples/dendrite-segmentation/foundry.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Dendrite Segmentation", 3 | "version": "1.0.0", 4 | "description": "Semantic Segmentation of Dendrites via Machine Learning", 5 | "private":true, 6 | "dependencies":{ 7 | "_test_foundry_stan_dendrite_segmentation_v1.1": "1.1" 8 | } 9 | } -------------------------------------------------------------------------------- /examples/dendrite-segmentation/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | foundry_ml 3 | scikit-image 4 | tensorflow 5 | keras-unet 6 | opencv-python 7 | -------------------------------------------------------------------------------- /examples/oqmd/foundry.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OQMD Data Analysis", 3 | "version": "1.0.0", 4 | "description": "Creating dataframe and metadata for OQMD dataset", 5 | "private":true, 6 | "dependencies":{ 7 | "_test_foundry_oqmd_v1.1": "1.1" 8 | } 9 | } -------------------------------------------------------------------------------- /examples/oqmd/requirements.txt: -------------------------------------------------------------------------------- 1 | foundry_ml 2 | pandas 3 | -------------------------------------------------------------------------------- /examples/publishing-guides/data/iris.csv: -------------------------------------------------------------------------------- 1 | # Data from: https://archive.ics.uci.edu/ml/datasets/Iris 2 | sepal_length,sepal_width,petal_length,petal_width,species 3 | 5.1,3.5,1.4,0.2,setosa 4 | 4.9,3.0,1.4,0.2,setosa 5 | 4.7,3.2,1.3,0.2,setosa 6 | 4.6,3.1,1.5,0.2,setosa 7 | 5.0,3.6,1.4,0.2,setosa 8 | 5.4,3.9,1.7,0.4,setosa 9 | 4.6,3.4,1.4,0.3,setosa 10 | 5.0,3.4,1.5,0.2,setosa 11 | 4.4,2.9,1.4,0.2,setosa 12 | 
4.9,3.1,1.5,0.1,setosa 13 | 5.4,3.7,1.5,0.2,setosa 14 | 4.8,3.4,1.6,0.2,setosa 15 | 4.8,3.0,1.4,0.1,setosa 16 | 4.3,3.0,1.1,0.1,setosa 17 | 5.8,4.0,1.2,0.2,setosa 18 | 5.7,4.4,1.5,0.4,setosa 19 | 5.4,3.9,1.3,0.4,setosa 20 | 5.1,3.5,1.4,0.3,setosa 21 | 5.7,3.8,1.7,0.3,setosa 22 | 5.1,3.8,1.5,0.3,setosa 23 | 5.4,3.4,1.7,0.2,setosa 24 | 5.1,3.7,1.5,0.4,setosa 25 | 4.6,3.6,1.0,0.2,setosa 26 | 5.1,3.3,1.7,0.5,setosa 27 | 4.8,3.4,1.9,0.2,setosa 28 | 5.0,3.0,1.6,0.2,setosa 29 | 5.0,3.4,1.6,0.4,setosa 30 | 5.2,3.5,1.5,0.2,setosa 31 | 5.2,3.4,1.4,0.2,setosa 32 | 4.7,3.2,1.6,0.2,setosa 33 | 4.8,3.1,1.6,0.2,setosa 34 | 5.4,3.4,1.5,0.4,setosa 35 | 5.2,4.1,1.5,0.1,setosa 36 | 5.5,4.2,1.4,0.2,setosa 37 | 4.9,3.1,1.5,0.1,setosa 38 | 5.0,3.2,1.2,0.2,setosa 39 | 5.5,3.5,1.3,0.2,setosa 40 | 4.9,3.1,1.5,0.1,setosa 41 | 4.4,3.0,1.3,0.2,setosa 42 | 5.1,3.4,1.5,0.2,setosa 43 | 5.0,3.5,1.3,0.3,setosa 44 | 4.5,2.3,1.3,0.3,setosa 45 | 4.4,3.2,1.3,0.2,setosa 46 | 5.0,3.5,1.6,0.6,setosa 47 | 5.1,3.8,1.9,0.4,setosa 48 | 4.8,3.0,1.4,0.3,setosa 49 | 5.1,3.8,1.6,0.2,setosa 50 | 4.6,3.2,1.4,0.2,setosa 51 | 5.3,3.7,1.5,0.2,setosa 52 | 5.0,3.3,1.4,0.2,setosa 53 | 7.0,3.2,4.7,1.4,versicolor 54 | 6.4,3.2,4.5,1.5,versicolor 55 | 6.9,3.1,4.9,1.5,versicolor 56 | 5.5,2.3,4.0,1.3,versicolor 57 | 6.5,2.8,4.6,1.5,versicolor 58 | 5.7,2.8,4.5,1.3,versicolor 59 | 6.3,3.3,4.7,1.6,versicolor 60 | 4.9,2.4,3.3,1.0,versicolor 61 | 6.6,2.9,4.6,1.3,versicolor 62 | 5.2,2.7,3.9,1.4,versicolor 63 | 5.0,2.0,3.5,1.0,versicolor 64 | 5.9,3.0,4.2,1.5,versicolor 65 | 6.0,2.2,4.0,1.0,versicolor 66 | 6.1,2.9,4.7,1.4,versicolor 67 | 5.6,2.9,3.6,1.3,versicolor 68 | 6.7,3.1,4.4,1.4,versicolor 69 | 5.6,3.0,4.5,1.5,versicolor 70 | 5.8,2.7,4.1,1.0,versicolor 71 | 6.2,2.2,4.5,1.5,versicolor 72 | 5.6,2.5,3.9,1.1,versicolor 73 | 5.9,3.2,4.8,1.8,versicolor 74 | 6.1,2.8,4.0,1.3,versicolor 75 | 6.3,2.5,4.9,1.5,versicolor 76 | 6.1,2.8,4.7,1.2,versicolor 77 | 6.4,2.9,4.3,1.3,versicolor 78 | 6.6,3.0,4.4,1.4,versicolor 79 | 6.8,2.8,4.8,1.4,versicolor 80 | 6.7,3.0,5.0,1.7,versicolor 81 | 6.0,2.9,4.5,1.5,versicolor 82 | 5.7,2.6,3.5,1.0,versicolor 83 | 5.5,2.4,3.8,1.1,versicolor 84 | 5.5,2.4,3.7,1.0,versicolor 85 | 5.8,2.7,3.9,1.2,versicolor 86 | 6.0,2.7,5.1,1.6,versicolor 87 | 5.4,3.0,4.5,1.5,versicolor 88 | 6.0,3.4,4.5,1.6,versicolor 89 | 6.7,3.1,4.7,1.5,versicolor 90 | 6.3,2.3,4.4,1.3,versicolor 91 | 5.6,3.0,4.1,1.3,versicolor 92 | 5.5,2.5,4.0,1.3,versicolor 93 | 5.5,2.6,4.4,1.2,versicolor 94 | 6.1,3.0,4.6,1.4,versicolor 95 | 5.8,2.6,4.0,1.2,versicolor 96 | 5.0,2.3,3.3,1.0,versicolor 97 | 5.6,2.7,4.2,1.3,versicolor 98 | 5.7,3.0,4.2,1.2,versicolor 99 | 5.7,2.9,4.2,1.3,versicolor 100 | 6.2,2.9,4.3,1.3,versicolor 101 | 5.1,2.5,3.0,1.1,versicolor 102 | 5.7,2.8,4.1,1.3,versicolor 103 | 6.3,3.3,6.0,2.5,virginica 104 | 5.8,2.7,5.1,1.9,virginica 105 | 7.1,3.0,5.9,2.1,virginica 106 | 6.3,2.9,5.6,1.8,virginica 107 | 6.5,3.0,5.8,2.2,virginica 108 | 7.6,3.0,6.6,2.1,virginica 109 | 4.9,2.5,4.5,1.7,virginica 110 | 7.3,2.9,6.3,1.8,virginica 111 | 6.7,2.5,5.8,1.8,virginica 112 | 7.2,3.6,6.1,2.5,virginica 113 | 6.5,3.2,5.1,2.0,virginica 114 | 6.4,2.7,5.3,1.9,virginica 115 | 6.8,3.0,5.5,2.1,virginica 116 | 5.7,2.5,5.0,2.0,virginica 117 | 5.8,2.8,5.1,2.4,virginica 118 | 6.4,3.2,5.3,2.3,virginica 119 | 6.5,3.0,5.5,1.8,virginica 120 | 7.7,3.8,6.7,2.2,virginica 121 | 7.7,2.6,6.9,2.3,virginica 122 | 6.0,2.2,5.0,1.5,virginica 123 | 6.9,3.2,5.7,2.3,virginica 124 | 5.6,2.8,4.9,2.0,virginica 125 | 7.7,2.8,6.7,2.0,virginica 126 | 6.3,2.7,4.9,1.8,virginica 127 | 6.7,3.3,5.7,2.1,virginica 128 | 
7.2,3.2,6.0,1.8,virginica 129 | 6.2,2.8,4.8,1.8,virginica 130 | 6.1,3.0,4.9,1.8,virginica 131 | 6.4,2.8,5.6,2.1,virginica 132 | 7.2,3.0,5.8,1.6,virginica 133 | 7.4,2.8,6.1,1.9,virginica 134 | 7.9,3.8,6.4,2.0,virginica 135 | 6.4,2.8,5.6,2.2,virginica 136 | 6.3,2.8,5.1,1.5,virginica 137 | 6.1,2.6,5.6,1.4,virginica 138 | 7.7,3.0,6.1,2.3,virginica 139 | 6.3,3.4,5.6,2.4,virginica 140 | 6.4,3.1,5.5,1.8,virginica 141 | 6.0,3.0,4.8,1.8,virginica 142 | 6.9,3.1,5.4,2.1,virginica 143 | 6.7,3.1,5.6,2.4,virginica 144 | 6.9,3.1,5.1,2.3,virginica 145 | 5.8,2.7,5.1,1.9,virginica 146 | 6.8,3.2,5.9,2.3,virginica 147 | 6.7,3.3,5.7,2.5,virginica 148 | 6.7,3.0,5.2,2.3,virginica 149 | 6.3,2.5,5.0,1.9,virginica 150 | 6.5,3.0,5.2,2.0,virginica 151 | 6.2,3.4,5.4,2.3,virginica 152 | 5.9,3.0,5.1,1.8,virginica 153 | -------------------------------------------------------------------------------- /examples/zeolite/requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | matplotlib 3 | foundry_ml -------------------------------------------------------------------------------- /foundry/__init__.py: -------------------------------------------------------------------------------- 1 | from .foundry import Foundry # noqa F401 (import unused) 2 | from . import models # noqa F401 (import unused) 3 | from . import https_download # noqa F401 (import unused) 4 | from . import https_upload # noqa F401 (import unused) 5 | from .foundry_dataset import FoundryDataset # noqa F401 (import unused) 6 | -------------------------------------------------------------------------------- /foundry/auth.py: -------------------------------------------------------------------------------- 1 | """Utilities related to storing authentication credentials""" 2 | 3 | from dataclasses import dataclass 4 | from typing import Dict 5 | 6 | from globus_sdk import TransferClient, AuthClient 7 | 8 | 9 | @dataclass 10 | class PubAuths: 11 | """Collection of the authorizers needed for publication 12 | 13 | Attributes: 14 | transfer_client: Client with credentials to perform transfers 15 | auth_client_openid: Client with permissions to get users IDs 16 | endpoint_auth_clients: Mapping between endpoint ID and client that can authorize access to it 17 | """ 18 | 19 | transfer_client: TransferClient 20 | auth_client_openid: AuthClient 21 | endpoint_auth_clients: Dict[str, AuthClient] 22 | -------------------------------------------------------------------------------- /foundry/foundry_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import html 5 | from json2table import convert 6 | 7 | from pydantic import ValidationError 8 | 9 | from .foundry_cache import FoundryCache 10 | from .models import FoundrySchema, FoundryDatacite 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class FoundryDataset(): 17 | """Representation of an individual dataset. 18 | Provides access to metadata as well as functions to 19 | instantiate data into memory in different formats. 
20 | 21 | Args: 22 | dataset_name (str): Name of the dataset (equivalent to source_id in MDF) 23 | datacite_entry (FoundryDatacite): Datacite entry for the dataset 24 | foundry_schema (FoundrySchema): Schema for the dataset 25 | foundry_cache (FoundryCache): Cache for the dataset 26 | 27 | Desired functions: 28 | - Get as pandas 29 | - Get as tensorflow dataset 30 | - Get as pytorch dataset 31 | - Get file list 32 | - Set metadata 33 | - Attach datafiles 34 | - Validate against schema 35 | - Get citation 36 | """ 37 | 38 | def __init__(self, 39 | dataset_name: str, 40 | datacite_entry: FoundryDatacite, 41 | foundry_schema: FoundrySchema, 42 | foundry_cache: FoundryCache = None): 43 | 44 | self.dataset_name = dataset_name 45 | try: 46 | self.dc = FoundryDatacite(datacite_entry) 47 | self.foundry_schema = FoundrySchema(foundry_schema) 48 | except Exception as e: 49 | raise Exception('there was a problem creating the dataset: ', e) 50 | self._foundry_cache = foundry_cache 51 | 52 | def get_as_dict(self, split: str = None, as_hdf5: bool = False): 53 | """Returns the data from the dataset as a dictionary 54 | 55 | Arguments: 56 | split (string): Split to create dataset on. 57 | **Default:** ``None`` 58 | 59 | Returns: (dict) Dictionary of all the data from the specified split 60 | 61 | """ 62 | return self._foundry_cache.load_as_dict(split, 63 | self.dataset_name, 64 | self.foundry_schema, 65 | as_hdf5) 66 | load = get_as_dict 67 | 68 | def get_as_torch(self, split: str = None): 69 | """Returns the data from the dataset as a TorchDataset 70 | 71 | Arguments: 72 | split (string): Split to create PyTorch Dataset on. 73 | **Default:** ``None`` 74 | 75 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split 76 | 77 | """ 78 | 79 | return self._foundry_cache.load_as_torch(split, 80 | self.dataset_name, 81 | self.foundry_schema) 82 | 83 | def get_as_tensorflow(self, split: str = None): 84 | """Convert Foundry Dataset to a Tensorflow Sequence 85 | 86 | Arguments: 87 | split (string): Split to create Tensorflow Sequence on. 88 | **Default:** ``None`` 89 | 90 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split 91 | 92 | """ 93 | return self._foundry_cache.load_as_tensorflow(split, 94 | self.dataset_name, 95 | self.foundry_schema) 96 | 97 | def _repr_html_(self) -> str: 98 | """Format the Foundry object for notebook rendering as HTML output 99 | 100 | Args: 101 | self (Foundry) 102 | 103 | Returns: 104 | buf (str): buffer containing the HTML to render 105 | """ 106 | if not self.dc: 107 | buf = str(self) 108 | else: 109 | title = self.dc.titles[0].title 110 | authors = [creator['creatorName'] 111 | for creator in self.dc.creators] 112 | authors = '; '.join(authors) 113 | DOI = "DOI: " + self.dc.identifier.identifier.root 114 | 115 | buf = f'
<h2>{title}</h2><p>{authors}</p><p>{DOI}</p>' 116 | 117 | buf = f'{buf}
<h3>Dataset</h3>
{convert(json.loads(self.foundry_schema.json()))}' 118 | return buf 119 | 120 | def _format_creators(self): 121 | creators_list = [] 122 | for creator in self.dc.creators: 123 | affiliations = creator.get('affiliations', []) 124 | if affiliations: 125 | affiliations_str = ', '.join(html.escape(aff) for aff in affiliations) 126 | creators_list.append(f"{html.escape(creator['creatorName'])} ({affiliations_str})") 127 | else: 128 | creators_list.append(f"{html.escape(creator['creatorName'])}") 129 | return '; '.join(creators_list) 130 | 131 | def _format_subjects(self): 132 | return ', '.join([html.escape(subject.subject) for subject in self.dc.subjects]) if self.dc.subjects else 'No subjects available' 133 | 134 | def get_citation(self) -> str: 135 | subjects = [subject.subject for subject in self.dc.subjects] 136 | doi_str = f"doi = {{{self.dc.identifier.identifier.root}}}" 137 | url_str = f"url = {{https://doi.org/{self.dc.identifier.identifier.root}}}" 138 | author_str = f"author = {{{' and '.join([creator['creatorName'] for creator in self.dc.creators])}}}" 139 | title_str = f"title = {{{self.dc.titles[0].title}}}" 140 | keywords_str = f"keywords = {{{', '.join(subjects)}}}" 141 | publisher_str = f"publisher = {{{self.dc.publisher}}}" 142 | year_str = f"year = {{{self.dc.publicationYear}}}" 143 | bibtex = os.linesep.join([doi_str, url_str, 144 | author_str, title_str, 145 | keywords_str, publisher_str, 146 | year_str]) 147 | bibtex = f"@misc{{https://doi.org/{self.dc.identifier.identifier.root}{os.linesep}{bibtex}}}" 148 | return bibtex 149 | 150 | def validate_metadata(self, metadata): 151 | """Validate the JSON message against the FoundryDataset model 152 | 153 | Arguments: 154 | metadata (dict): Metadata information provided by the user. 155 | 156 | Raises: 157 | ValidationError: if metadata supplied by user does not meet the specificiation of a 158 | FoundryDataset object. 159 | 160 | """ 161 | try: 162 | FoundryDataset(**metadata) 163 | logger.debug("Metadata validation successful!") 164 | except ValidationError as e: 165 | logger.error("Metadata validation failed!") 166 | for error in e.errors(): 167 | field_name = ".".join([item for item in error['loc'] if isinstance(item, str)]) 168 | error_description = error['msg'] 169 | error_message = f"""There is an issue validating the metadata for the field '{field_name}': 170 | The error message returned is: '{error_description}'.""" 171 | logger.error(error_message) 172 | raise e 173 | 174 | def add_data(self, local_data_path: str = None, globus_data_source: str = None): 175 | """Add data to the dataset. User must provide the location of the data as 176 | either a `globus_data_source` or `local_data_path`. 177 | 178 | Arguments: 179 | local_data_path (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT 180 | request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is 181 | transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the 182 | data on their own Globus endpoint. User must choose either `globus_data_source` or `local_data_path` to 183 | publish their data. 184 | globus_data_source (str): Url path for a data folder on a Globus endpoint; url can be obtained through 185 | the Globus Web UI or SDK. If None, the user must specify an 'local_data_path' pointing to the location 186 | of the data on their local machine. User must choose either `globus_data_source` or `local_data_path` to 187 | publish their data. 
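        A minimal sketch of the two mutually exclusive call patterns (on a
        hypothetical `dataset` instance; the path and URL below are illustrative
        placeholders, not real data locations):

            dataset.add_data(local_data_path="./data/my_dataset")
            # or, for data that already lives on a Globus endpoint:
            dataset.add_data(globus_data_source="https://app.globus.org/file-manager?origin_id=...&origin_path=/my_dataset/")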
188 | 189 | """ 190 | if local_data_path is None and globus_data_source is None: 191 | raise ValueError("User must provide either a path to the data on their local machine or a URL to the data " 192 | "on their Globus endpoint.") 193 | 194 | if local_data_path is not None and globus_data_source is not None: 195 | raise ValueError("User must choose either `globus_data_source` or `local_data_path`, not both.") 196 | 197 | if globus_data_source is not None: 198 | self._globus_data_source = globus_data_source 199 | if hasattr(self, '_local_data_path'): 200 | delattr(self, '_local_data_path') 201 | elif local_data_path is not None: 202 | if os.path.isdir(local_data_path) or os.path.isfile(local_data_path): 203 | self._local_data_path = local_data_path 204 | if hasattr(self, '_globus_data_source'): 205 | delattr(self, '_globus_data_source') 206 | else: 207 | raise ValueError("The path provided does not exist or is not a file or directory.") 208 | 209 | def clear_dataset_cache(self): 210 | """Deletes the cached data for this specific datset""" 211 | self._foundry_cache.clear_cache(self.dataset_name) 212 | 213 | def clean_dc_dict(self): 214 | """Clean the Datacite dictionary of None values""" 215 | print(json.loads(self.dc.json())) 216 | return self.delete_none(json.loads(self.dc.json())) 217 | 218 | def delete_none(self, _dict): 219 | """Delete None values recursively from all of the dictionaries""" 220 | for key, value in list(_dict.items()): 221 | if isinstance(value, dict): 222 | self.delete_none(value) 223 | elif value is None: 224 | del _dict[key] 225 | elif isinstance(value, list): 226 | for v_i in value: 227 | if isinstance(v_i, dict): 228 | self.delete_none(v_i) 229 | 230 | return _dict 231 | -------------------------------------------------------------------------------- /foundry/https_download.py: -------------------------------------------------------------------------------- 1 | """Methods to download files from a Globus endpoint 2 | """ 3 | 4 | 5 | import os 6 | from collections import deque 7 | 8 | import requests 9 | from globus_sdk import TransferClient 10 | 11 | 12 | def recursive_ls(tc: TransferClient, ep: str, path: str, max_depth: int = 3): 13 | """Find all files in a Globus directory recursively 14 | 15 | Args: 16 | tc: TransferClient authorized to access the directory 17 | ep: Endpoint on which the files reside 18 | path: Path to the files being downloaded 19 | max_depth: Maximum recurse depth 20 | 21 | Yields: 22 | Dictionaries describing the location of the files. 
Each includes at least 23 | "name": Name of the file 24 | "path": Absolute path to the file's location 25 | """ 26 | queue = deque() 27 | queue.append((path, "", 0)) 28 | yield from _get_files(tc, ep, queue, max_depth) 29 | 30 | 31 | def _get_files(tc, ep, queue, max_depth): 32 | while queue: 33 | abs_path, rel_path, depth = queue.pop() 34 | path_prefix = rel_path + "/" if rel_path else "" 35 | 36 | res = tc.operation_ls(ep, path=abs_path) 37 | 38 | if depth < max_depth: 39 | queue.extend( 40 | ( 41 | res["path"] + item["name"], 42 | path_prefix + item["name"], 43 | depth + 1, 44 | ) 45 | for item in res["DATA"] 46 | if item["type"] == "dir" 47 | ) 48 | for item in res["DATA"]: 49 | if item["type"] == 'file': 50 | item["name"] = path_prefix + item["name"] 51 | item["path"] = abs_path.replace('/~/', '/') 52 | yield item 53 | 54 | 55 | def download_file(item, base_directory, https_config, timeout=1800): 56 | """Download a file to disk 57 | 58 | Args: 59 | item: Dictionary defining the path to the file 60 | base_directory: Base directory for storing downloaded files 61 | https_config: Configuration defining the URL of the server and the name of the dataset 62 | timeout: Timeout for the download request in seconds (default: 1800) 63 | """ 64 | base_url = https_config['base_url'].rstrip('/') 65 | path = item.get('path', '').strip('/') 66 | 67 | # Extracting the name and subdirectory from the item 68 | name = item.get('name', '') 69 | subdirectory = name.split('/')[0] if '/' in name else '' 70 | 71 | # Avoid duplication of subdirectory in path 72 | if subdirectory and path.endswith(subdirectory): 73 | full_path = f"{path}/{name.split('/', 1)[-1]}".strip('/') 74 | else: 75 | full_path = '/'.join([path, name]).strip('/') 76 | 77 | url = f"{base_url}/{full_path}" 78 | 79 | # build destination path for data file 80 | destination = os.path.join(base_directory, https_config['source_id'], item['name']) 81 | parent_path = os.path.split(destination)[0] 82 | 83 | # if parent directories don't exist, create them 84 | if not os.path.exists(parent_path): 85 | os.makedirs(parent_path, exist_ok=True) 86 | 87 | try: 88 | with requests.get(url, stream=True, timeout=timeout) as response: 89 | response.raise_for_status() 90 | 91 | downloaded_size = 0 92 | print(f"\rStarting Download of: {url}") 93 | 94 | with open(destination, "wb") as f: 95 | for chunk in response.iter_content(chunk_size=8192): 96 | if chunk: 97 | f.write(chunk) 98 | downloaded_size += len(chunk) 99 | # Calculate and print the download progress 100 | print(f"\rDownloading... {downloaded_size/(1 << 20):,.2f} MB", end="") 101 | return destination 102 | 103 | except requests.exceptions.RequestException as e: 104 | print(f"Error downloading file: {e}") 105 | except IOError as e: 106 | print(f"Error writing file to disk: {e}") 107 | 108 | return {destination + " status": True} 109 | -------------------------------------------------------------------------------- /foundry/https_upload.py: -------------------------------------------------------------------------------- 1 | """Private utility methods to upload files and/or folders to Globus using HTTPS instead of Globus Transfer. 
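A rough usage sketch, assuming the Globus clients have already been authorized
through the usual globus_sdk login flow (`tc`, `openid_client`, `gcs_client`, and
`endpoint_id` below are placeholders, not working values):

    auths = PubAuths(transfer_client=tc,
                     auth_client_openid=openid_client,
                     endpoint_auth_clients={endpoint_id: gcs_client})
    globus_data_source = upload_to_endpoint(auths, "./my_dataset", endpoint_id=endpoint_id)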
2 | """ 3 | 4 | import logging 5 | import os 6 | import urllib 7 | from requests import put, Response 8 | from typing import Any, Tuple, Dict, List 9 | from uuid import uuid4 10 | 11 | from globus_sdk import AuthClient, TransferClient, TransferAPIError 12 | 13 | from .auth import PubAuths 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def upload_to_endpoint(auths: PubAuths, local_data_path: str, endpoint_id: str = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec", 20 | dest_parent: str = None, dest_child: str = None) -> Tuple[str, str]: 21 | """Upload local data to a Globus endpoint using HTTPS PUT requests. Data can be a folder or an individual file. 22 | Args: 23 | auths (PubAuths): Dataclass of authorizers needed for upload. Includes `transfer_client`, `auth_client_openid`, 24 | and `endpoint_auth_clients`, which is a Dict of `endpoint_id`:AuthClient mappings. 25 | local_data_path (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT 26 | request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is 27 | transferred to MDF. 28 | endpoint_id (str): Globus endpoint ID to upload the data to. Default is NCSA endpoint. Must match the 29 | `endpoint_id` auth'd in `auths.auth_client_gcs`. 30 | 31 | Returns 32 | ------- 33 | (str) Globus data source URL: URL pointing to the data on the Globus endpoint 34 | """ 35 | # define upload destination 36 | dest_path = _create_dest_folder(auths.transfer_client, endpoint_id, parent_dir=dest_parent, child_dir=dest_child) 37 | # upload data to endpoint 38 | globus_data_source = _https_upload(auths.transfer_client, auths.endpoint_auth_clients, local_data_path=local_data_path, 39 | dest_path=dest_path, endpoint_id=endpoint_id) 40 | return globus_data_source 41 | 42 | 43 | def _create_dest_folder(transfer_client: TransferClient, endpoint_id: str, parent_dir: str = None, 44 | child_dir: str = None) -> str: 45 | """Create a destination folder for the data on a Globus endpoint 46 | Args: 47 | transfer_client (TransferClient): Globus client authorized for Globus Transfers (ie moving data on endpoint, 48 | adding/removing folders, etc). 49 | endpoint_id (str): A UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 50 | the SDK. 51 | parent_dir (str): Set to "/tmp" when default is None. The parent directory that all publications via HTTPS 52 | will be written to. 53 | child_dir (str): Set to a random UUID when default is None. The child directory that the data will be 54 | written to. 
55 | Returns 56 | ------- 57 | (str): Path on Globus endpoint to write to 58 | """ 59 | # use a random UUID for each dataset publication, unless specified otherwise 60 | if child_dir is None: 61 | child_dir = uuid4() # the publication ID forms the name of the child directory 62 | if parent_dir is None: 63 | parent_dir = "/tmp" 64 | dest_path = os.path.join(parent_dir, str(child_dir)) # NOTE: must start and end with "/" 65 | 66 | try: 67 | transfer_client.operation_mkdir(endpoint_id=endpoint_id, path=dest_path) 68 | except TransferAPIError as e: 69 | raise IOError(f"Error from Globus API while creating destination folder: {e.message}") from e 70 | return dest_path 71 | 72 | 73 | def _https_upload(transfer_client: TransferClient, endpoint_auth_clients: Dict[str, AuthClient], local_data_path: str, 74 | dest_path: str = "/tmp", endpoint_id: str = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec") -> str: 75 | """Upload a dataset via HTTPS to a Globus endpoint 76 | Args: 77 | transfer_client (TransferClient): Globus client authorized for Globus Transfers (ie moving data on endpoint, 78 | adding/removing folders, etc). 79 | endpoint_auth_clients (Dict[str, AuthClient]): Dict of `endpoint_id` : `AuthClient` keypairs. AuthClients used 80 | for Globus Auth functionality within endpoint-specific scopes using Globus Connect Server (ie accessing 81 | or altering data on a specific endpoint). 82 | local_data_path (str): The path to the local data to upload. Can be relative or absolute. 83 | dest_path (str): The path to the destination folder on the Globus endpoint. Default is "/tmp". 84 | endpoint_id (str): A UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 85 | the SDK. Default is the NCSA UUID "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec". 86 | Returns 87 | ------- 88 | (str): Globus data source URL (ie the URL that points to the data on a Globus endpoint) 89 | """ 90 | 91 | # get URL for Globus endpoint location 92 | endpoint = transfer_client.get_endpoint(endpoint_id) # gets info for NCSA endpoint 93 | https_base_url = endpoint["https_server"] 94 | 95 | # Submit data (folders of files or an independent file) to be written to endpoint 96 | if os.path.isdir(local_data_path): 97 | _upload_folder(transfer_client, endpoint_auth_clients, local_data_path, https_base_url, dest_path, endpoint_id) 98 | elif os.path.isfile(local_data_path): 99 | _upload_file(endpoint_auth_clients[endpoint_id], local_data_path, https_base_url, dest_path) 100 | else: 101 | raise IOError(f"Data path '{local_data_path}' is of unknown type") 102 | 103 | # return the data source URL for publication to MDF 104 | return _make_globus_link(endpoint_id, dest_path) 105 | 106 | 107 | def _upload_folder(transfer_client: TransferClient, endpoint_auth_clients: Dict[str, AuthClient], local_data_path: str, 108 | https_base_url: str, parent_dest_path: str, endpoint_id: str) -> List[Dict[str, Any]]: 109 | """Upload a folder to a Globus endpoint using HTTPS 110 | Args: 111 | transfer_client (TransferClient): Globus client authorized for Globus Transfers (ie moving data on endpoint, 112 | adding/removing folders, etc). 113 | endpoint_auth_clients (Dict[str, AuthClient]): Dict of `endpoint_id` : `AuthClient` keypairs. AuthClients used 114 | for Globus Auth functionality within endpoint-specific scopes using Globus Connect Server (ie accessing 115 | or altering data on a specific endpoint). 116 | local_data_path (str): The path to the local data to upload. Can be relative or absolute. 
117 | https_base_url (str): The URL for a given Globus endpoint. 118 | parent_dest_path (str): The path to the parent folder to be written to on the given endpoint. The contents 119 | of "local_data_path" will be written here, including subdirectories. 120 | endpoint_id (str): The UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 121 | the SDK. This must be the same endpoint pointed to by the https_base_url. 122 | Returns 123 | ------- 124 | (list): A list of Response objects (the `requests` HTTPS response object from a PUT request) 125 | """ 126 | results = [] 127 | # initialize destination path as the parent destination path 128 | dest_path = parent_dest_path 129 | 130 | # walk through each child directory in the designated local data folder 131 | for root, _, files in os.walk(local_data_path): 132 | # update destination path if we have walked into a child directory 133 | if root != local_data_path: 134 | # get the child directory relative path 135 | subpath = os.path.relpath(root, local_data_path) 136 | # update destination path to include child directories (ie subpaths) 137 | dest_path = os.path.join(parent_dest_path, subpath) 138 | # create child directories on endpoint 139 | try: 140 | transfer_client.operation_mkdir(endpoint_id=endpoint_id, path=dest_path) 141 | except TransferAPIError as e: 142 | raise IOError(f"Error while creating child directory {dest_path}: {e.message}") from e 143 | # get local path to file to upload 144 | for filename in files: 145 | filepath = os.path.join(root, filename) 146 | # upload file to destination path on endpoint 147 | result = _upload_file(endpoint_auth_clients[endpoint_id], filepath, https_base_url, dest_path) 148 | results.append(result) 149 | return results 150 | 151 | 152 | def _upload_file(auth_client_gcs: AuthClient, filepath: str, https_base_url: str, dest_path: str) -> Response: 153 | """Upload an individual file to a Globus endpoint specified in 'auth_client_gcs' using HTTPS PUT 154 | Args: 155 | auth_client_gcs (AuthClient): Globus client authorized for Globus Auth functionality within an endpoint-specific 156 | scope using Globus Connect Server (ie accessing or altering data on a specific endpoint). 157 | filepath (str): The path to the local file to upload. 158 | https_base_url (str): The URL for a given Globus endpoint. 159 | dest_path (str): The path to the folder to be written to on the given endpoint. 
160 | Returns 161 | ------- 162 | (Response): The `requests` HTTPS response object from a PUT request 163 | """ 164 | # Get the authorization header token (string for the headers dict) for HTTPS upload 165 | header = auth_client_gcs.authorizer.get_authorization_header() 166 | 167 | # get Globus endpoint path to write to 168 | filename = os.path.split(filepath)[1] 169 | # need to strip out leading "/" in dest_path for join to work 170 | endpoint_dest = os.path.join(https_base_url, dest_path.lstrip("/"), filename) 171 | 172 | # upload via HTTPS as arbitrary binary content type 173 | with open(filepath, "rb") as f: 174 | reply = put( 175 | endpoint_dest, 176 | data=f, 177 | headers={"Authorization": header, "Content-Type": "application/octet-stream"} 178 | ) 179 | if reply.status_code != 200: 180 | raise IOError(f"Error on HTTPS PUT, got response {reply.status_code}: {reply.text}") 181 | # Return the response 182 | return reply 183 | 184 | 185 | def _make_globus_link(endpoint_id: str, path: str) -> str: 186 | """Create the Globus data source URL for a given datapath on an endpoint 187 | Args: 188 | endpoint_id (str): The UUID designating the exact Globus endpoint. Can be obtained via the Globus Web UI or 189 | the SDK. 190 | path (str): The path to the dataset folder on the given endpoint. 191 | Returns 192 | ------- 193 | (str): The Globus data source URL (ie the URL that points to the data on a Globus endpoint) 194 | """ 195 | # make sure the path has the "/"s encoded properly for a URL 196 | safe_path = urllib.parse.quote(path, safe="*") 197 | link = f"https://app.globus.org/file-manager?origin_id={endpoint_id}&origin_path={safe_path}" 198 | return link 199 | -------------------------------------------------------------------------------- /foundry/jsonschema_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/foundry/jsonschema_models/__init__.py -------------------------------------------------------------------------------- /foundry/jsonschema_models/project_model.py: -------------------------------------------------------------------------------- 1 | # generated by datamodel-codegen: 2 | # filename: projects.json 3 | # timestamp: 2024-02-20T22:47:19+00:00 4 | 5 | from __future__ import annotations 6 | 7 | from typing import Any, Dict, List, Optional, Union 8 | 9 | from pydantic import BaseModel, Extra, Field 10 | 11 | 12 | class Nanomfg(BaseModel): 13 | class Config: 14 | extra = Extra.forbid 15 | 16 | base_pressure: Optional[float] = None 17 | carbon_source: Optional[str] = None 18 | catalyst: Optional[str] = None 19 | grain_size: Optional[str] = None 20 | max_temperature: Optional[float] = None 21 | orientation: Optional[str] = None 22 | sample_surface_area: Optional[float] = None 23 | sample_thickness: Optional[float] = None 24 | 25 | 26 | class RedoxPotential(BaseModel): 27 | class Config: 28 | extra = Extra.forbid 29 | 30 | S0: Optional[float] = Field( 31 | None, description='Redox potential for electronic ground state S0. Units: eV' 32 | ) 33 | S1: Optional[float] = Field( 34 | None, 35 | description='Redox potential for electronicically excited state S1. Units: eV', 36 | ) 37 | T1: Optional[float] = Field( 38 | None, 39 | description='Redox potential for electronically excited state T1. 
Units: eV', 40 | ) 41 | 42 | 43 | class DipoleMoment(BaseModel): 44 | class Config: 45 | extra = Extra.forbid 46 | 47 | S0: Optional[float] = Field( 48 | None, description='Dipole moment for electronic ground state S0. Units: Debye' 49 | ) 50 | S1: Optional[float] = Field( 51 | None, 52 | description='Dipole moment for electronicically excited state S1. Units: Debye', 53 | ) 54 | T1: Optional[float] = Field( 55 | None, 56 | description='Dipole moment for electronically excited state T1. Units: Debye', 57 | ) 58 | 59 | 60 | class Field00(BaseModel): 61 | class Config: 62 | extra = Extra.forbid 63 | 64 | S1: Optional[float] = Field( 65 | None, description='0_0 energy for electronicically excited state S1. Units: eV' 66 | ) 67 | T1: Optional[float] = Field( 68 | None, description='0_0 energy for electronically excited state T1. Units: eV' 69 | ) 70 | 71 | 72 | class Verde(BaseModel): 73 | class Config: 74 | extra = Extra.forbid 75 | 76 | vertical_excitation_energy: Optional[float] = Field( 77 | None, description='Vertical excitation energy. Units: eV' 78 | ) 79 | ionization_potential: Optional[float] = Field( 80 | None, description='Ionization potential. Units: eV' 81 | ) 82 | redox_potential: Optional[RedoxPotential] = Field( 83 | None, description='Redox potentials of the molecule. Units: eV' 84 | ) 85 | dipole_moment: Optional[DipoleMoment] = Field( 86 | None, description='Dipole moment of the molecule. Units: Debye' 87 | ) 88 | field_0_0: Optional[Field00] = Field( 89 | None, 90 | alias='0_0', 91 | description='0-0 transition energies of the molecule. Units: eV', 92 | ) 93 | 94 | 95 | class Split(BaseModel): 96 | class Config: 97 | extra = Extra.forbid 98 | 99 | type: Optional[str] = Field( 100 | None, 101 | description='The kind of partition of the dataset (train, test, validation, etc)', 102 | ) 103 | path: Optional[str] = Field( 104 | None, description='The full filepath to the dataset file or directory' 105 | ) 106 | label: Optional[str] = Field(None, description='A label to assign to this split') 107 | 108 | 109 | class Classes(BaseModel): 110 | class Config: 111 | extra = Extra.forbid 112 | 113 | label: Optional[str] = Field(None, description='The label that exists in the data') 114 | name: Optional[str] = Field(None, description='The name the label maps onto.') 115 | 116 | 117 | class Key(BaseModel): 118 | class Config: 119 | extra = Extra.forbid 120 | 121 | key: Optional[List[str]] = Field( 122 | None, 123 | description='Column or header name for tabular data, key/path for HDF5 data', 124 | ) 125 | type: Optional[str] = Field(None, description='Whether input or target') 126 | filter: Optional[str] = Field(None, description='How apply the defined key') 127 | description: Optional[str] = Field( 128 | None, description='Free text description of the key' 129 | ) 130 | units: Optional[str] = Field(None, description='The units associated with the key') 131 | classes: Optional[Union[List[Dict[str, Any]], Classes]] = None 132 | 133 | 134 | class Foundry(BaseModel): 135 | class Config: 136 | extra = Extra.forbid 137 | 138 | short_name: Optional[str] = None 139 | data_type: Optional[str] = Field( 140 | None, description='The kind of data in the dataset, e.g. tabular, json, hdf5' 141 | ) 142 | task_type: Optional[List[str]] = Field( 143 | None, 144 | description='The type of task. e.g., supervised, unsupervised, generative.', 145 | ) 146 | domain: Optional[List[str]] = Field( 147 | None, 148 | description='The domain of applicability. 
e.g., materials science, chemistry, machine vision', 149 | ) 150 | n_items: Optional[float] = Field( 151 | None, 152 | description='The number of total items in the dataset including all splits.', 153 | ) 154 | splits: Optional[List[Split]] = Field( 155 | None, 156 | description='Define all partitions of the dataset (train, test, validation, etc.)', 157 | ) 158 | keys: Optional[List[Key]] = Field( 159 | None, description='Keys describing how to load the data' 160 | ) 161 | 162 | 163 | class Projects(BaseModel): 164 | class Config: 165 | extra = Extra.forbid 166 | 167 | nanomfg: Optional[Nanomfg] = Field(None, description='Project block for NanoMFG.') 168 | verde: Optional[Verde] = Field(None, description='VERDE calculation outputs') 169 | foundry: Optional[Foundry] = Field( 170 | None, description='Project block for Foundry datasets.' 171 | ) 172 | 173 | 174 | class ProjectsBlock(BaseModel): 175 | projects: Optional[Projects] = Field( 176 | None, description='External organization-defined block.' 177 | ) 178 | -------------------------------------------------------------------------------- /foundry/loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/foundry/loaders/__init__.py -------------------------------------------------------------------------------- /foundry/loaders/tf_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.keras.utils import Sequence 3 | 4 | 5 | class TensorflowSequence(Sequence): 6 | """Foundry Dataset Converted to Tensorflow Format""" 7 | 8 | def __init__(self, inputs, targets): 9 | self.inputs = inputs 10 | self.targets = targets 11 | 12 | def __len__(self): 13 | return len(self.inputs[0]) 14 | 15 | def __getitem__(self, idx): 16 | item = {"input": [], "target": []} 17 | 18 | for input in self.inputs: 19 | item["input"].append(np.array(input[idx])) 20 | item["input"] = np.array(item["input"]) 21 | 22 | for target in self.targets: 23 | item["target"].append(np.array(target[idx])) 24 | item["target"] = np.array(item["target"]) 25 | 26 | return item 27 | -------------------------------------------------------------------------------- /foundry/loaders/torch_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class TorchDataset(Dataset): 6 | """Foundry Dataset Converted to Pytorch Format""" 7 | 8 | def __init__(self, inputs, targets): 9 | self.inputs = inputs 10 | self.targets = targets 11 | 12 | def __len__(self): 13 | return len(self.inputs[0]) 14 | 15 | def __getitem__(self, idx): 16 | item = {"input": [], "target": []} 17 | 18 | # adds the correct item at index idx from each input from self.inputs to the item dictionary 19 | for input in self.inputs: 20 | item["input"].append(np.array(input[idx])) 21 | item["input"] = np.array(item["input"]) 22 | 23 | # adds the correct item at index idx from each target from self.targets to the item dictionary 24 | for target in self.targets: 25 | item["target"].append(np.array(target[idx])) 26 | item["target"] = np.array(item["target"]) 27 | 28 | return item 29 | -------------------------------------------------------------------------------- /foundry/models.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import json 3 | from 
json2table import convert 4 | import logging 5 | import pandas as pd 6 | from pydantic import BaseModel, Field, Extra, ValidationError 7 | from typing import Optional, Any, Dict 8 | 9 | from .jsonschema_models.dc_model import Dc1 as DataciteModel 10 | from .jsonschema_models.project_model import Foundry as FoundryModel 11 | 12 | logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING) 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Classes for Foundry Data Package Specification 17 | class FoundrySpecificationDataset(BaseModel): 18 | """Pydantic base class for datasets within the Foundry data package specification""" 19 | 20 | name: Optional[str] = None 21 | provider: Optional[str] = Field(default="MDF") 22 | version: Optional[str] = None 23 | 24 | 25 | class FoundrySpecification(BaseModel): 26 | """Pydantic base class for interacting with the Foundry data package specification 27 | The specification provides a way to group datasets and manage versions 28 | """ 29 | 30 | name: str = Field(default="") 31 | version: str = Field(default="") 32 | description: str = Field(default="") 33 | private: bool = Field(default=False) 34 | dependencies: Dict[str, str] = Field(default_factory=dict) 35 | 36 | def add_dependency(self, name: str, version: str): 37 | self.dependencies[name] = version 38 | 39 | def remove_duplicate_dependencies(self): 40 | deps = [{"name": key, "version": self.dependencies[key]} 41 | for key in self.dependencies] 42 | df = pd.DataFrame.from_records(deps) 43 | self.clear_dependencies() 44 | for _, row in df.drop_duplicates().iterrows(): 45 | self.add_dependency(name=row["name"], version=row["version"]) 46 | 47 | def clear_dependencies(self): 48 | self.dependencies.clear() 49 | 50 | def model_dump(self): 51 | return json.loads(self.model_dump_json()) 52 | 53 | def _repr_html_(self): 54 | buf = f'
<h3>Data Requirements - {self.name}</h3>
' 55 | buf = buf + convert(self.model_dump()) 56 | return buf 57 | 58 | 59 | class FoundryDatasetType(Enum): 60 | """Foundry Dataset Types 61 | Enumeration of the possible Foundry dataset types 62 | """ 63 | 64 | tabular = "tabular" 65 | files = "files" 66 | hdf5 = "hdf5" 67 | other = "other" 68 | 69 | 70 | class FoundrySchema(FoundryModel): 71 | """ 72 | A model for the Foundry schema based on the FoundryModel (project_model.py) class. 73 | """ 74 | 75 | def __init__(self, project_dict: Dict[str, Any]): 76 | try: 77 | super().__init__(**project_dict) 78 | except ValidationError as e: 79 | print("FoundrySchema validation failed!") 80 | for error in e.errors(): 81 | field_name = ".".join([str(item) for item in error['loc']]) 82 | error_description = error['msg'] 83 | error_message = f"""There is an issue validating the entry for the field '{field_name}': 84 | The error message returned is: '{error_description}'. 85 | The description for this field is: '{FoundryModel.model_json_schema()['properties'][field_name]['description']}'""" 86 | print(error_message) 87 | raise e 88 | 89 | 90 | class FoundryDatacite(DataciteModel): 91 | """ 92 | A model for the Datacite schema based on the Datacite (dc_model.py) class. 93 | """ 94 | def __init__(self, datacite_dict: Dict[str, Any], **kwargs): 95 | try: 96 | dc_dict = datacite_dict.copy() 97 | if 'identifier' in dc_dict: 98 | if isinstance(dc_dict['identifier'], dict) and 'identifier' in dc_dict['identifier']: 99 | if isinstance(dc_dict['identifier']['identifier'], dict) and '__root__' in dc_dict['identifier']['identifier']: 100 | dc_dict['identifier']['identifier'] = dc_dict['identifier']['identifier']['__root__'] 101 | super().__init__(**dc_dict, **kwargs) 102 | except ValidationError as e: 103 | print("Datacite validation failed!") 104 | for error in e.errors(): 105 | field_name = ".".join(str(loc) for loc in error["loc"]) 106 | error_description = error['msg'] 107 | error_message = f"""There is an issue validating the entry for the field '{field_name}': 108 | The error message returned is: '{error_description}'. 
109 | The description is: '{self.model_json_schema()['properties'].get(field_name, {}).get('description', 'No description available')}'""" 110 | print(error_message) 111 | raise e 112 | 113 | 114 | class FoundryBase(BaseModel): 115 | """ 116 | Configuration information for Foundry instance 117 | """ 118 | 119 | dataframe_file: Optional[str] = Field(default="foundry_dataframe.json") 120 | data_file: Optional[str] = Field(default="foundry.hdf5") 121 | metadata_file: Optional[str] = Field(default="foundry_metadata.json") 122 | destination_endpoint: Optional[str] = None 123 | local: Optional[bool] = Field(default=False) 124 | local_cache_dir: str = Field(default="./data") 125 | metadata_key: Optional[str] = Field(default="foundry") 126 | organization: Optional[str] = Field(default="Foundry") 127 | 128 | class Config: 129 | extra = Extra.allow 130 | 131 | def model_dump(self): 132 | return json.loads(self.model_dump_json()) 133 | 134 | def _repr_html_(self): 135 | return convert(self.model_dump()) 136 | -------------------------------------------------------------------------------- /foundry/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def is_pandas_pytable(group): 5 | if 'axis0' in group.keys() and 'axis1' in group.keys(): 6 | return True 7 | else: 8 | return False 9 | 10 | 11 | def is_doi(string: str): 12 | if string.startswith('10.') or string.startswith('https://doi.org/'): 13 | return True 14 | else: 15 | return False 16 | 17 | 18 | def _read_json(path_to_file, lines=False): 19 | """Read JSON file 20 | 21 | Arguments: 22 | path_to_file (string): Path to JSON file 23 | 24 | Returns: (dict) JSON file contents 25 | """ 26 | df = pd.read_json(path_to_file, lines=lines) 27 | return df 28 | 29 | 30 | def _read_csv(path_to_file): 31 | """Read CSV file 32 | 33 | Arguments: 34 | path_to_file (string): Path to CSV file 35 | 36 | Returns: (dict) CSV file contents 37 | """ 38 | return pd.read_csv(path_to_file) 39 | 40 | 41 | def _read_excel(path_to_file): 42 | return pd.read_excel(path_to_file) 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | globus-sdk>=3,<4 2 | dlhub_sdk>=2.1.0 3 | requests>=2.18.4 4 | tqdm>=4.19.4 5 | six>=1.11.0 6 | h5py>=2.10.0 7 | numpy>=1.15.4 8 | pandas>=0.23.4 9 | scikit-learn>=1.0 10 | pydantic>=2.7.2 11 | mdf_forge>=0.8.0 12 | mdf-connect-client>=0.5.0 13 | json2table>=1.1.5 14 | torch>=1.8.0 15 | tensorflow>=2 16 | tqdm>=4.64 17 | openpyxl>=3.1.0 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git,*.egg* 3 | max-line-length = 150 4 | per-file-ignores = 5 | foundry/jsonschema_models/*:E501 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | packages = (setuptools.find_packages(),) 6 | setuptools.setup( 7 | name="foundry_ml", 8 | version="1.0.4", 9 | author="""Aristana Scourtas, KJ Schmidt, Isaac Darling, Aadit Ambadkar, Braeden Cullen, 10 | Imogen Foster, Ribhav Bose, Zoa Katok, Ethan Truelove, Ian Foster, Ben Blaiszik""", 11 | author_email="blaiszik@uchicago.edu", 12 | 
packages=setuptools.find_packages(), 13 | description="Package to support simplified application of machine learning models to datasets in materials science", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | install_requires=[ 17 | "mdf_forge>=0.8.0", 18 | "globus-sdk>=3,<4", 19 | "dlhub_sdk>=1.0.0", 20 | "numpy>=1.15.4", 21 | "pandas>=0.23.4", 22 | "pydantic>=2.7.2", 23 | "mdf_connect_client>=0.5.0", 24 | "h5py>=2.10.0", 25 | "json2table", 26 | "openpyxl>=3.1.0" 27 | ], 28 | python_requires=">=3.7", 29 | classifiers=[ 30 | "Development Status :: 3 - Alpha", 31 | "Intended Audience :: Science/Research", 32 | "License :: OSI Approved :: MIT License", 33 | "Natural Language :: English", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Topic :: Scientific/Engineering", 37 | ], 38 | keywords=[], 39 | license="MIT License", 40 | url="https://github.com/MLMI2-CSSI/foundry", 41 | ) 42 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.4 2 | pytest-cov>=2.12 3 | pytest-mock 4 | flake8 5 | jsonschema 6 | mock 7 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/test.py -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | *NOTE: These tests are still in development, and should not be expected to properly cover all test cases yet.* 2 | ## Foundry Tests 3 | This directory contains the tests for the `materials_foundry` package. 4 | 5 | ### Running the tests 6 | Python 3 must be installed. Go to https://www.python.org/downloads/ to download Python 3. 7 | Pytest must also be installed. To do this, run `pip install pytest`. 8 | After Pytest is installed, the tests can be executed by running `pytest` in this directory. 9 | 10 | ### About the tests 11 | These tests cover the basic and advanced functionality of the `materials_foundry` package. They test each function to check that operations succeed with expected values, error with invalid values, and respect parameters appropriately. 
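
For example, a minimal session from inside this `tests/` directory might look like the following (assuming you want the pinned test dependencies from the repository's `test-requirements.txt`):

```
pip install -r ../test-requirements.txt
pytest
```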
12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/tests/data/__init__.py -------------------------------------------------------------------------------- /tests/data/https_test/test_data.json: -------------------------------------------------------------------------------- 1 | "[{\"A\":0.5325016729,\"B\":0.4869994072,\"C\":0.269408499,\"D\":0.3621738306},{\"A\":0.2304825119,\"B\":0.5481912504,\"C\":0.4014871558,\"D\":0.3603053727},{\"A\":0.3507889192,\"B\":0.6226744491,\"C\":0.6091377546,\"D\":0.5165489581},{\"A\":0.5798596097,\"B\":0.7200900352,\"C\":0.9342783399,\"D\":0.0540093822},{\"A\":0.3108050843,\"B\":0.9773936853,\"C\":0.4801914079,\"D\":0.1787148388},{\"A\":0.2084711872,\"B\":0.5751155582,\"C\":0.2584603695,\"D\":0.4499059913},{\"A\":0.4759231155,\"B\":0.8092009236,\"C\":0.3072478517,\"D\":0.8837572165},{\"A\":0.2480077693,\"B\":0.4552477195,\"C\":0.9647205731,\"D\":0.2536659951},{\"A\":0.1634613944,\"B\":0.4766800168,\"C\":0.369663194,\"D\":0.9961761176},{\"A\":0.5346664051,\"B\":0.194493705,\"C\":0.4651741797,\"D\":0.7293780026},{\"A\":0.663231418,\"B\":0.4826377804,\"C\":0.5241409718,\"D\":0.1599210151},{\"A\":0.6838886399,\"B\":0.4712653511,\"C\":0.5581199413,\"D\":0.3226088546},{\"A\":0.5353026817,\"B\":0.7982157423,\"C\":0.5292073255,\"D\":0.8607700998},{\"A\":0.7164381489,\"B\":0.8843576476,\"C\":0.8875950376,\"D\":0.3435581081},{\"A\":0.664698152,\"B\":0.8538449824,\"C\":0.8392907092,\"D\":0.3113757281},{\"A\":0.6479963522,\"B\":0.9540320749,\"C\":0.5325504287,\"D\":0.8182171859},{\"A\":0.8211351808,\"B\":0.4439015651,\"C\":0.6269342472,\"D\":0.4122693317},{\"A\":0.6679538246,\"B\":0.7390494918,\"C\":0.1759150727,\"D\":0.1302475247},{\"A\":0.8377472214,\"B\":0.273212617,\"C\":0.2663859412,\"D\":0.5964344124},{\"A\":0.9082325183,\"B\":0.3137189069,\"C\":0.9204916523,\"D\":0.6981099323},{\"A\":0.0569704461,\"B\":0.5887289259,\"C\":0.9691034132,\"D\":0.864492329},{\"A\":0.8733863986,\"B\":0.5010284784,\"C\":0.8753387807,\"D\":0.2446379967},{\"A\":0.9960436279,\"B\":0.4441227796,\"C\":0.4813238954,\"D\":0.2405431943},{\"A\":0.9743124513,\"B\":0.2431975581,\"C\":0.1700467831,\"D\":0.5107508473},{\"A\":0.2659344468,\"B\":0.2298763219,\"C\":0.1679702902,\"D\":0.2976868866},{\"A\":0.4967479732,\"B\":0.8507496604,\"C\":0.1298286001,\"D\":0.0696243588},{\"A\":0.5011896631,\"B\":0.7902353379,\"C\":0.862998181,\"D\":0.2236992358},{\"A\":0.1630314398,\"B\":0.9551477474,\"C\":0.9418805628,\"D\":0.2297807846},{\"A\":0.4375272337,\"B\":0.4738682907,\"C\":0.7732553673,\"D\":0.9564487668},{\"A\":0.6313980088,\"B\":0.704010717,\"C\":0.5371476407,\"D\":0.6924966076},{\"A\":0.3307278087,\"B\":0.6970454132,\"C\":0.1032596315,\"D\":0.2564419892},{\"A\":0.4353581681,\"B\":0.6941255237,\"C\":0.0394112344,\"D\":0.4991979497},{\"A\":0.3901813924,\"B\":0.3581649708,\"C\":0.0696682115,\"D\":0.1097029952},{\"A\":0.6259185992,\"B\":0.8903284037,\"C\":0.5605368861,\"D\":0.5762697392},{\"A\":0.6068870699,\"B\":0.2540924395,\"C\":0.1086194342,\"D\":0.6033022401},{\"A\":0.31231
36578,\"B\":0.2788511046,\"C\":0.4036056534,\"D\":0.5723193255},{\"A\":0.1021738989,\"B\":0.9994909756,\"C\":0.9832543715,\"D\":0.9885624128},{\"A\":0.6785393932,\"B\":0.5788494481,\"C\":0.5004145535,\"D\":0.5129020263},{\"A\":0.4616303134,\"B\":0.2699233081,\"C\":0.1211274991,\"D\":0.2856023533},{\"A\":0.7471981109,\"B\":0.8964269606,\"C\":0.5273349805,\"D\":0.519780483},{\"A\":0.2398561128,\"B\":0.6637693729,\"C\":0.0713046133,\"D\":0.1965824806},{\"A\":0.1438138313,\"B\":0.8077715814,\"C\":0.5987424102,\"D\":0.8796307444},{\"A\":0.2099913033,\"B\":0.0014793221,\"C\":0.6282096248,\"D\":0.3344606287},{\"A\":0.2172346949,\"B\":0.8055518737,\"C\":0.7020053655,\"D\":0.2734407306},{\"A\":0.2919147983,\"B\":0.8325704254,\"C\":0.6602685898,\"D\":0.9023679814},{\"A\":0.5941575402,\"B\":0.03341885,\"C\":0.1900654781,\"D\":0.5003254697},{\"A\":0.0577672223,\"B\":0.5857695682,\"C\":0.7401711144,\"D\":0.2932093813},{\"A\":0.0441340405,\"B\":0.9699508666,\"C\":0.5688421424,\"D\":0.8265708757},{\"A\":0.5671337446,\"B\":0.1391732202,\"C\":0.8557361973,\"D\":0.95843871},{\"A\":0.7818036893,\"B\":0.7559001038,\"C\":0.9803074287,\"D\":0.5550912458},{\"A\":0.7764158003,\"B\":0.6901683866,\"C\":0.7134621122,\"D\":0.3412987906},{\"A\":0.5184392059,\"B\":0.3561712456,\"C\":0.9341091567,\"D\":0.4326865978},{\"A\":0.7478769752,\"B\":0.985482101,\"C\":0.7739615326,\"D\":0.0101972409},{\"A\":0.4681250259,\"B\":0.1031996448,\"C\":0.5414294345,\"D\":0.4261483639},{\"A\":0.1752046752,\"B\":0.1991262091,\"C\":0.677077356,\"D\":0.8059928892},{\"A\":0.7081022399,\"B\":0.5430350946,\"C\":0.4756771947,\"D\":0.2051436153},{\"A\":0.7008949898,\"B\":0.4885963791,\"C\":0.2012325862,\"D\":0.4846756182},{\"A\":0.6803875318,\"B\":0.3302139274,\"C\":0.2008848379,\"D\":0.8565463434},{\"A\":0.8215943867,\"B\":0.4007808246,\"C\":0.849984323,\"D\":0.7148225175},{\"A\":0.984314214,\"B\":0.3315224115,\"C\":0.2374709671,\"D\":0.7303792807},{\"A\":0.1544605924,\"B\":0.9114949507,\"C\":0.8431437169,\"D\":0.3078082636},{\"A\":0.4466543896,\"B\":0.7093552302,\"C\":0.0139362347,\"D\":0.6832623102},{\"A\":0.335121826,\"B\":0.017851436,\"C\":0.8177046429,\"D\":0.0868433873},{\"A\":0.3241821741,\"B\":0.352863243,\"C\":0.0782754953,\"D\":0.9690912676},{\"A\":0.6525125608,\"B\":0.6431642124,\"C\":0.2455876297,\"D\":0.4893561674},{\"A\":0.5009426045,\"B\":0.144012049,\"C\":0.9115418398,\"D\":0.6228395399},{\"A\":0.707871306,\"B\":0.088211523,\"C\":0.9318696511,\"D\":0.8569612714},{\"A\":0.4605842335,\"B\":0.5185556048,\"C\":0.9262782515,\"D\":0.7801799842},{\"A\":0.7730264146,\"B\":0.6774950976,\"C\":0.6291621329,\"D\":0.2161067579},{\"A\":0.7916767193,\"B\":0.212605389,\"C\":0.3408662965,\"D\":0.5176147758},{\"A\":0.7025679202,\"B\":0.0851145247,\"C\":0.1140933764,\"D\":0.7167199974},{\"A\":0.9758435877,\"B\":0.6312956197,\"C\":0.4144884024,\"D\":0.5930438643},{\"A\":0.1998500366,\"B\":0.079361875,\"C\":0.5949318443,\"D\":0.0516943591},{\"A\":0.9872288449,\"B\":0.2349307202,\"C\":0.5721875354,\"D\":0.1446492501},{\"A\":0.8438844171,\"B\":0.4464399495,\"C\":0.3444058338,\"D\":0.4428694837},{\"A\":0.3778647292,\"B\":0.3380841433,\"C\":0.5285966196,\"D\":0.0594458212},{\"A\":0.7635524601,\"B\":0.6418464458,\"C\":0.7063874264,\"D\":0.1375340887},{\"A\":0.0092013791,\"B\":0.4869340269,\"C\":0.7725304702,\"D\":0.6235075271},{\"A\":0.0774261649,\"B\":0.5042933554,\"C\":0.7095936633,\"D\":0.4012486987},{\"A\":0.9215810197,\"B\":0.0154472261,\"C\":0.2654161552,\"D\":0.3247884855},{\"A\":0.393765934,\"B\":0.481696696,\"C\":0.8731618709,\"D\":0.758867711},{
\"A\":0.4745454185,\"B\":0.0666828682,\"C\":0.2043568046,\"D\":0.7433732038},{\"A\":0.7717461404,\"B\":0.112668368,\"C\":0.8416286193,\"D\":0.0254580519},{\"A\":0.3025883997,\"B\":0.3437180802,\"C\":0.2369378307,\"D\":0.8939787727},{\"A\":0.3484336427,\"B\":0.3910067643,\"C\":0.0953904485,\"D\":0.3651110205},{\"A\":0.2110935156,\"B\":0.4636447284,\"C\":0.9283017709,\"D\":0.0105194739},{\"A\":0.7394658063,\"B\":0.3301475445,\"C\":0.9340923108,\"D\":0.5463739846},{\"A\":0.2309639435,\"B\":0.5585589948,\"C\":0.2390889386,\"D\":0.9918534523},{\"A\":0.9987103314,\"B\":0.0906074135,\"C\":0.872042477,\"D\":0.2227486275},{\"A\":0.1443925385,\"B\":0.0679658547,\"C\":0.0935615945,\"D\":0.840750406},{\"A\":0.0943707276,\"B\":0.9048926926,\"C\":0.2245580652,\"D\":0.3529395385},{\"A\":0.4725164841,\"B\":0.8412799321,\"C\":0.6457602779,\"D\":0.0037145716},{\"A\":0.4405258389,\"B\":0.085446825,\"C\":0.2832068609,\"D\":0.5431120155},{\"A\":0.4215098076,\"B\":0.6496444076,\"C\":0.8175635963,\"D\":0.6856483039},{\"A\":0.6176578205,\"B\":0.9106344265,\"C\":0.8360707837,\"D\":0.4640646558},{\"A\":0.123294081,\"B\":0.1690605358,\"C\":0.8352657504,\"D\":0.5192123665},{\"A\":0.455456584,\"B\":0.1734614195,\"C\":0.385524538,\"D\":0.8626150199},{\"A\":0.5662762499,\"B\":0.6534325729,\"C\":0.8660321272,\"D\":0.8059199064},{\"A\":0.7471802655,\"B\":0.6614166044,\"C\":0.3731012478,\"D\":0.6285728953},{\"A\":0.2486054865,\"B\":0.8962429634,\"C\":0.5485535981,\"D\":0.6525742063}]" -------------------------------------------------------------------------------- /tests/data/tmp_data.json: -------------------------------------------------------------------------------- 1 | "[{\"A\":0.5325016729,\"B\":0.4869994072,\"C\":0.269408499,\"D\":0.3621738306},{\"A\":0.2304825119,\"B\":0.5481912504,\"C\":0.4014871558,\"D\":0.3603053727},{\"A\":0.3507889192,\"B\":0.6226744491,\"C\":0.6091377546,\"D\":0.5165489581},{\"A\":0.5798596097,\"B\":0.7200900352,\"C\":0.9342783399,\"D\":0.0540093822},{\"A\":0.3108050843,\"B\":0.9773936853,\"C\":0.4801914079,\"D\":0.1787148388},{\"A\":0.2084711872,\"B\":0.5751155582,\"C\":0.2584603695,\"D\":0.4499059913},{\"A\":0.4759231155,\"B\":0.8092009236,\"C\":0.3072478517,\"D\":0.8837572165},{\"A\":0.2480077693,\"B\":0.4552477195,\"C\":0.9647205731,\"D\":0.2536659951},{\"A\":0.1634613944,\"B\":0.4766800168,\"C\":0.369663194,\"D\":0.9961761176},{\"A\":0.5346664051,\"B\":0.194493705,\"C\":0.4651741797,\"D\":0.7293780026},{\"A\":0.663231418,\"B\":0.4826377804,\"C\":0.5241409718,\"D\":0.1599210151},{\"A\":0.6838886399,\"B\":0.4712653511,\"C\":0.5581199413,\"D\":0.3226088546},{\"A\":0.5353026817,\"B\":0.7982157423,\"C\":0.5292073255,\"D\":0.8607700998},{\"A\":0.7164381489,\"B\":0.8843576476,\"C\":0.8875950376,\"D\":0.3435581081},{\"A\":0.664698152,\"B\":0.8538449824,\"C\":0.8392907092,\"D\":0.3113757281},{\"A\":0.6479963522,\"B\":0.9540320749,\"C\":0.5325504287,\"D\":0.8182171859},{\"A\":0.8211351808,\"B\":0.4439015651,\"C\":0.6269342472,\"D\":0.4122693317},{\"A\":0.6679538246,\"B\":0.7390494918,\"C\":0.1759150727,\"D\":0.1302475247},{\"A\":0.8377472214,\"B\":0.273212617,\"C\":0.2663859412,\"D\":0.5964344124},{\"A\":0.9082325183,\"B\":0.3137189069,\"C\":0.9204916523,\"D\":0.6981099323},{\"A\":0.0569704461,\"B\":0.5887289259,\"C\":0.9691034132,\"D\":0.864492329},{\"A\":0.8733863986,\"B\":0.5010284784,\"C\":0.8753387807,\"D\":0.2446379967},{\"A\":0.9960436279,\"B\":0.4441227796,\"C\":0.4813238954,\"D\":0.2405431943},{\"A\":0.9743124513,\"B\":0.2431975581,\"C\":0.1700467831,\"D\":0.5107508473},{\"A\":0.2659344468
,\"B\":0.2298763219,\"C\":0.1679702902,\"D\":0.2976868866},{\"A\":0.4967479732,\"B\":0.8507496604,\"C\":0.1298286001,\"D\":0.0696243588},{\"A\":0.5011896631,\"B\":0.7902353379,\"C\":0.862998181,\"D\":0.2236992358},{\"A\":0.1630314398,\"B\":0.9551477474,\"C\":0.9418805628,\"D\":0.2297807846},{\"A\":0.4375272337,\"B\":0.4738682907,\"C\":0.7732553673,\"D\":0.9564487668},{\"A\":0.6313980088,\"B\":0.704010717,\"C\":0.5371476407,\"D\":0.6924966076},{\"A\":0.3307278087,\"B\":0.6970454132,\"C\":0.1032596315,\"D\":0.2564419892},{\"A\":0.4353581681,\"B\":0.6941255237,\"C\":0.0394112344,\"D\":0.4991979497},{\"A\":0.3901813924,\"B\":0.3581649708,\"C\":0.0696682115,\"D\":0.1097029952},{\"A\":0.6259185992,\"B\":0.8903284037,\"C\":0.5605368861,\"D\":0.5762697392},{\"A\":0.6068870699,\"B\":0.2540924395,\"C\":0.1086194342,\"D\":0.6033022401},{\"A\":0.3123136578,\"B\":0.2788511046,\"C\":0.4036056534,\"D\":0.5723193255},{\"A\":0.1021738989,\"B\":0.9994909756,\"C\":0.9832543715,\"D\":0.9885624128},{\"A\":0.6785393932,\"B\":0.5788494481,\"C\":0.5004145535,\"D\":0.5129020263},{\"A\":0.4616303134,\"B\":0.2699233081,\"C\":0.1211274991,\"D\":0.2856023533},{\"A\":0.7471981109,\"B\":0.8964269606,\"C\":0.5273349805,\"D\":0.519780483},{\"A\":0.2398561128,\"B\":0.6637693729,\"C\":0.0713046133,\"D\":0.1965824806},{\"A\":0.1438138313,\"B\":0.8077715814,\"C\":0.5987424102,\"D\":0.8796307444},{\"A\":0.2099913033,\"B\":0.0014793221,\"C\":0.6282096248,\"D\":0.3344606287},{\"A\":0.2172346949,\"B\":0.8055518737,\"C\":0.7020053655,\"D\":0.2734407306},{\"A\":0.2919147983,\"B\":0.8325704254,\"C\":0.6602685898,\"D\":0.9023679814},{\"A\":0.5941575402,\"B\":0.03341885,\"C\":0.1900654781,\"D\":0.5003254697},{\"A\":0.0577672223,\"B\":0.5857695682,\"C\":0.7401711144,\"D\":0.2932093813},{\"A\":0.0441340405,\"B\":0.9699508666,\"C\":0.5688421424,\"D\":0.8265708757},{\"A\":0.5671337446,\"B\":0.1391732202,\"C\":0.8557361973,\"D\":0.95843871},{\"A\":0.7818036893,\"B\":0.7559001038,\"C\":0.9803074287,\"D\":0.5550912458},{\"A\":0.7764158003,\"B\":0.6901683866,\"C\":0.7134621122,\"D\":0.3412987906},{\"A\":0.5184392059,\"B\":0.3561712456,\"C\":0.9341091567,\"D\":0.4326865978},{\"A\":0.7478769752,\"B\":0.985482101,\"C\":0.7739615326,\"D\":0.0101972409},{\"A\":0.4681250259,\"B\":0.1031996448,\"C\":0.5414294345,\"D\":0.4261483639},{\"A\":0.1752046752,\"B\":0.1991262091,\"C\":0.677077356,\"D\":0.8059928892},{\"A\":0.7081022399,\"B\":0.5430350946,\"C\":0.4756771947,\"D\":0.2051436153},{\"A\":0.7008949898,\"B\":0.4885963791,\"C\":0.2012325862,\"D\":0.4846756182},{\"A\":0.6803875318,\"B\":0.3302139274,\"C\":0.2008848379,\"D\":0.8565463434},{\"A\":0.8215943867,\"B\":0.4007808246,\"C\":0.849984323,\"D\":0.7148225175},{\"A\":0.984314214,\"B\":0.3315224115,\"C\":0.2374709671,\"D\":0.7303792807},{\"A\":0.1544605924,\"B\":0.9114949507,\"C\":0.8431437169,\"D\":0.3078082636},{\"A\":0.4466543896,\"B\":0.7093552302,\"C\":0.0139362347,\"D\":0.6832623102},{\"A\":0.335121826,\"B\":0.017851436,\"C\":0.8177046429,\"D\":0.0868433873},{\"A\":0.3241821741,\"B\":0.352863243,\"C\":0.0782754953,\"D\":0.9690912676},{\"A\":0.6525125608,\"B\":0.6431642124,\"C\":0.2455876297,\"D\":0.4893561674},{\"A\":0.5009426045,\"B\":0.144012049,\"C\":0.9115418398,\"D\":0.6228395399},{\"A\":0.707871306,\"B\":0.088211523,\"C\":0.9318696511,\"D\":0.8569612714},{\"A\":0.4605842335,\"B\":0.5185556048,\"C\":0.9262782515,\"D\":0.7801799842},{\"A\":0.7730264146,\"B\":0.6774950976,\"C\":0.6291621329,\"D\":0.2161067579},{\"A\":0.7916767193,\"B\":0.212605389,\"C\":0.3408662965,\"D\":0.5176147758},{\"A
\":0.7025679202,\"B\":0.0851145247,\"C\":0.1140933764,\"D\":0.7167199974},{\"A\":0.9758435877,\"B\":0.6312956197,\"C\":0.4144884024,\"D\":0.5930438643},{\"A\":0.1998500366,\"B\":0.079361875,\"C\":0.5949318443,\"D\":0.0516943591},{\"A\":0.9872288449,\"B\":0.2349307202,\"C\":0.5721875354,\"D\":0.1446492501},{\"A\":0.8438844171,\"B\":0.4464399495,\"C\":0.3444058338,\"D\":0.4428694837},{\"A\":0.3778647292,\"B\":0.3380841433,\"C\":0.5285966196,\"D\":0.0594458212},{\"A\":0.7635524601,\"B\":0.6418464458,\"C\":0.7063874264,\"D\":0.1375340887},{\"A\":0.0092013791,\"B\":0.4869340269,\"C\":0.7725304702,\"D\":0.6235075271},{\"A\":0.0774261649,\"B\":0.5042933554,\"C\":0.7095936633,\"D\":0.4012486987},{\"A\":0.9215810197,\"B\":0.0154472261,\"C\":0.2654161552,\"D\":0.3247884855},{\"A\":0.393765934,\"B\":0.481696696,\"C\":0.8731618709,\"D\":0.758867711},{\"A\":0.4745454185,\"B\":0.0666828682,\"C\":0.2043568046,\"D\":0.7433732038},{\"A\":0.7717461404,\"B\":0.112668368,\"C\":0.8416286193,\"D\":0.0254580519},{\"A\":0.3025883997,\"B\":0.3437180802,\"C\":0.2369378307,\"D\":0.8939787727},{\"A\":0.3484336427,\"B\":0.3910067643,\"C\":0.0953904485,\"D\":0.3651110205},{\"A\":0.2110935156,\"B\":0.4636447284,\"C\":0.9283017709,\"D\":0.0105194739},{\"A\":0.7394658063,\"B\":0.3301475445,\"C\":0.9340923108,\"D\":0.5463739846},{\"A\":0.2309639435,\"B\":0.5585589948,\"C\":0.2390889386,\"D\":0.9918534523},{\"A\":0.9987103314,\"B\":0.0906074135,\"C\":0.872042477,\"D\":0.2227486275},{\"A\":0.1443925385,\"B\":0.0679658547,\"C\":0.0935615945,\"D\":0.840750406},{\"A\":0.0943707276,\"B\":0.9048926926,\"C\":0.2245580652,\"D\":0.3529395385},{\"A\":0.4725164841,\"B\":0.8412799321,\"C\":0.6457602779,\"D\":0.0037145716},{\"A\":0.4405258389,\"B\":0.085446825,\"C\":0.2832068609,\"D\":0.5431120155},{\"A\":0.4215098076,\"B\":0.6496444076,\"C\":0.8175635963,\"D\":0.6856483039},{\"A\":0.6176578205,\"B\":0.9106344265,\"C\":0.8360707837,\"D\":0.4640646558},{\"A\":0.123294081,\"B\":0.1690605358,\"C\":0.8352657504,\"D\":0.5192123665},{\"A\":0.455456584,\"B\":0.1734614195,\"C\":0.385524538,\"D\":0.8626150199},{\"A\":0.5662762499,\"B\":0.6534325729,\"C\":0.8660321272,\"D\":0.8059199064},{\"A\":0.7471802655,\"B\":0.6614166044,\"C\":0.3731012478,\"D\":0.6285728953},{\"A\":0.2486054865,\"B\":0.8962429634,\"C\":0.5485535981,\"D\":0.6525742063}]" -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | datacite_data = {'identifier': {'identifier': '10.xx/xx', 'identifierType': 'DOI'}, 5 | 'rightsList': [{'rights': 'CC-BY 4.0'}], 6 | 'creators': [{'creatorName': 'Brown, C', 7 | 'familyName': 'Brown', 8 | 'givenName': 'C'}, 9 | {'creatorName': 'Van Pelt, L', 10 | 'familyName': 'Van Pelt', 11 | 'givenName': 'L'}], 12 | 'subjects': [{'subject': 'blockheads'}, 13 | {'subject': 'foundry'}, 14 | {'subject': 'test_data'}], 15 | 'publicationYear': 2024, 16 | 'publisher': 'Materials Data Facility', 17 | 'dates': [{'date': '2024-08-03', 'dateType': 'Accepted'}], 18 | 'titles': [{'title': "You're a Good man, Charlie Brown"}], 19 | 'resourceType': {'resourceTypeGeneral': 'Dataset', 20 | 'resourceType': 'Dataset'}} 21 | 22 | 23 | valid_metadata = {"keys": [ 24 | { 25 | "key": ["sepal length (cm)"], 26 | "type": "input", 27 | "units": "cm", 28 | "description": "sepal length in Charlie Brown's zig-zag style" 29 | }, 30 | { 31 | "key": ["sepal width (cm)"], 32 | "type": "input", 33 | "units": "cm", 
34 | "description": "sepal width in Snoopy's flying ace mode" 35 | }, 36 | { 37 | "key": ["petal length (cm)"], 38 | "type": "input", 39 | "units": "cm", 40 | "description": "petal length in Linus's security blanket units" 41 | }, 42 | { 43 | "key": ["petal width (cm)"], 44 | "type": "input", 45 | "units": "cm", 46 | "description": "petal width in Lucy's psychiatric advice scale" 47 | }, 48 | { 49 | "key": ["y"], 50 | "type": "output", 51 | "units": "", 52 | "description": "flower type", 53 | "classes": [ 54 | { 55 | "label": "0", 56 | "name": "setosa" 57 | }, 58 | { 59 | "label": "1", 60 | "name": "versicolor" 61 | }, 62 | { 63 | "label": "2", 64 | "name": "virginica" 65 | } 66 | ] 67 | } 68 | ], 69 | "splits": [ 70 | {"label": "train", "path": "train_snoopy.json", "type": "train"}, 71 | {"label": "test", "path": "test_woodstock.json", "type": "test"} 72 | ], 73 | "short_name": "peanuts_iris_{:.0f}".format(datetime.now().timestamp()), 74 | "data_type": "tabular", 75 | "task_type": ["unsupervised", "generative"], 76 | "domain": ["comics", "nostalgia"], 77 | "n_items": 1000 78 | } 79 | 80 | 81 | invalid_metadata = {"oranges": [ 82 | { 83 | "key": ["sepal length (cm)"], 84 | "type": "input", 85 | "units": "cm", 86 | "description": 10 87 | }, 88 | { 89 | "key": ["sepal width (cm)"], 90 | "type": "input", 91 | "units": "cm", 92 | "description": "sepal width in unit(cm)" 93 | }, 94 | { 95 | "key": ["petal length (cm)"], 96 | "type": "input", 97 | "units": "cm", 98 | "description": "petal length in unit(cm)" 99 | }, 100 | { 101 | "key": ["petal width (cm)"], 102 | "type": "input", 103 | "units": "cm", 104 | "description": "petal width in unit(cm)" 105 | }, 106 | { 107 | "key": ["y"], 108 | "type": "output", 109 | "units": "", 110 | "description": "flower type", 111 | "classes": [ 112 | { 113 | "label": "0", 114 | "name": "setosa" 115 | }, 116 | { 117 | "label": "1", 118 | "name": "versicolor" 119 | }, 120 | { 121 | "label": "2", 122 | "name": "virginica" 123 | } 124 | ] 125 | } 126 | ], 127 | 'splits': [ 128 | {'label': 'train', 'path': 'train.json', 'type': 'train'}, 129 | {'label': 'test', 'path': 'test.json', 'type': 'test'} 130 | ], 131 | "short_name": "example_AS_iris_test_{:.0f}".format(datetime.now().timestamp()), 132 | "data_type": "tabular", 133 | 'task_type': ['unsupervised', 'generative'], 134 | 'domain': ['materials science', 'chemistry'], 135 | 'n_items': 1000 136 | } 137 | -------------------------------------------------------------------------------- /tests/test_data/test_dataset/elwood.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/tests/test_data/test_dataset/elwood.hdf5 -------------------------------------------------------------------------------- /tests/test_foundry_cache.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | import pandas as pd 4 | import pytest 5 | from unittest.mock import MagicMock 6 | 7 | from . 
import test_foundry_dataset 8 | from foundry.jsonschema_models.project_model import Split as FoundrySplit, \ 9 | Key as FoundryKey 10 | from foundry.foundry_cache import FoundryCache 11 | from foundry.models import FoundrySchema 12 | 13 | 14 | @pytest.fixture 15 | def mock_foundry_cache(): 16 | cache_dir = str(Path(__file__).parent) + '/test_data' 17 | cache = FoundryCache(forge_client=MagicMock(), 18 | transfer_client=MagicMock(), 19 | local_cache_dir=cache_dir, 20 | use_globus=False, 21 | interval=10, 22 | parallel_https=4, 23 | verbose=False) 24 | return cache 25 | 26 | 27 | @pytest.fixture 28 | def mock_nonexistent_foundry_cache(): 29 | cache_dir = str(Path(__file__).parent) + '/cheeseballs' 30 | cache = FoundryCache(forge_client=MagicMock(), 31 | transfer_client=MagicMock(), 32 | local_cache_dir=cache_dir, 33 | use_globus=False, 34 | interval=10, 35 | parallel_https=4, 36 | verbose=False) 37 | return cache 38 | 39 | 40 | def test_validate_local_dataset_storage_exists(mock_foundry_cache): 41 | cache = mock_foundry_cache 42 | dataset_name = "elwood_md_v1.2" 43 | assert cache.validate_local_dataset_storage(dataset_name) is True 44 | 45 | 46 | def test_validate_local_dataset_storage_missing_files(mock_foundry_cache): 47 | cache = mock_foundry_cache 48 | dataset_name = "elwood_md_v1.2" 49 | # Create a split with a missing file 50 | splits = [ 51 | FoundrySplit(path="file1.csv", type="train"), 52 | FoundrySplit(path="file2.csv", type="test"), 53 | FoundrySplit(path="file3.csv", type="validation") 54 | ] 55 | 56 | assert cache.validate_local_dataset_storage(dataset_name, splits) is False 57 | 58 | 59 | def test_validate_local_dataset_storage_complete(mock_foundry_cache): 60 | cache = mock_foundry_cache 61 | dataset_name = "elwood_md_v1.2" 62 | 63 | assert cache.validate_local_dataset_storage(dataset_name) is True 64 | 65 | 66 | def test_validate_local_dataset_storage_not_present(mock_nonexistent_foundry_cache): 67 | cache = mock_nonexistent_foundry_cache 68 | dataset_name = "test_dataset" 69 | 70 | assert cache.validate_local_dataset_storage(dataset_name) is False 71 | 72 | 73 | @pytest.fixture 74 | def mock_tabular_foundry_source_id(): 75 | source_id = "elwood_md_v1.2" 76 | yield source_id 77 | 78 | 79 | @pytest.fixture 80 | def mock_tabular_foundry_schema(): 81 | foundry_schema = json.loads('{"short_name": "elwood_properties", "data_type": "tabular", "task_type": ["unsupervised", "generative", "supervised"], "domain": ["materials science", "chemistry", "simulation"], "n_items": 410.0, "splits": [{"type": "train", "path": "MD_properties.csv", "label": "train"}], "keys": [{"key": ["SMILES"], "type": "input", "filter": null, "description": "Canonical SMILES string of molecule", "units": "arb", "classes": null}, {"key": ["E_coh (MPa)"], "type": "target", "filter": null, "description": "Simulated cohesive energy (in MPa)", "units": "MPa", "classes": null}, {"key": ["T_g (K)"], "type": "target", "filter": null, "description": "Simulated glass transition temperature (in Kelvin)", "units": "Kelvin", "classes": null}, {"key": ["R_gyr (A^2)"], "type": "target", "filter": null, "description": "Simulated squared radius of gyration (in Angstroms^2)", "units": "Angstrom^2", "classes": null}, {"key": ["Densities (kg/m^3)"], "type": "target", "filter": null, "description": "Simulated density (in kg/m^3)", "units": "kg/m^3", "classes": null}]}') 82 | yield foundry_schema 83 | 84 | 85 | @pytest.fixture 86 | def mock_hdf5_foundry_source_id(): 87 | source_id = "test_dataset" 88 | yield source_id 89 | 90 | 91 
| @pytest.fixture
92 | def mock_hdf5_foundry_schema():
93 |     foundry_schema = json.loads('{"short_name": "elwood_properties", "data_type": "hdf5", "task_type": ["unsupervised", "generative", "supervised"], "domain": ["materials science", "chemistry", "simulation"], "n_items": 410.0, "splits": [{"type": "train", "path": "MD_properties.csv", "label": "train"}], "keys": [{"key": ["SMILES"], "type": "input", "filter": null, "description": "Canonical SMILES string of molecule", "units": "arb", "classes": null}, {"key": ["E_coh (MPa)"], "type": "target", "filter": null, "description": "Simulated cohesive energy (in MPa)", "units": "MPa", "classes": null}, {"key": ["T_g (K)"], "type": "target", "filter": null, "description": "Simulated glass transition temperature (in Kelvin)", "units": "Kelvin", "classes": null}, {"key": ["R_gyr (A^2)"], "type": "target", "filter": null, "description": "Simulated squared radius of gyration (in Angstroms^2)", "units": "Angstrom^2", "classes": null}, {"key": ["Densities (kg/m^3)"], "type": "target", "filter": null, "description": "Simulated density (in kg/m^3)", "units": "kg/m^3", "classes": null}]}')
94 |     yield foundry_schema
95 | 
96 | 
97 | @pytest.fixture
98 | def mock_read_functions():
99 |     # NOTE: the patch block below is commented out, so this fixture is currently a no-op; tests that request it read the real files under test_data/.
100 |     def mock_read_csv(file_path):
101 |         # Mock _read_csv() to return a DataFrame with minimal example data
102 |         data = {'Column1': [1, 2, 3], 'Column2': ['A', 'B', 'C']}
103 |         return pd.DataFrame(data)
104 | 
105 |     def mock_read_json(file_path):
106 |         # Mock _read_json() to return a DataFrame with minimal example data
107 |         data = {'Column1': [4, 5, 6], 'Column2': ['D', 'E', 'F']}
108 |         return pd.DataFrame(data)
109 | 
110 |     def mock_read_excel(file_path):
111 |         # Mock _read_excel() to return a DataFrame with minimal example data
112 |         data = {'Column1': [7, 8, 9], 'Column2': ['G', 'H', 'I']}
113 |         return pd.DataFrame(data)
114 | 
115 |     # # Patch the _read_csv(), _read_json(), and _read_excel() functions with the mock functions
116 |     # with patch('foundry.foundry_cache._read_csv', MagicMock(side_effect=mock_read_csv)):
117 |     #     with patch('foundry.foundry_cache._read_json', MagicMock(side_effect=mock_read_json)):
118 |     #         with patch('foundry.foundry_cache._read_excel', MagicMock(side_effect=mock_read_excel)):
119 |     #             yield
120 | 
121 | 
122 | def test_load_data_with_globus(mock_foundry_cache,
123 |                                mock_tabular_foundry_source_id,
124 |                                mock_tabular_foundry_schema):
125 |     cache = mock_foundry_cache
126 |     source_id = mock_tabular_foundry_source_id
127 |     foundry_schema = FoundrySchema(mock_tabular_foundry_schema)
128 |     cache._load_data(foundry_schema,
129 |                      file="MD_properties.csv",
130 |                      source_id=source_id,
131 |                      as_hdf5=False)
132 |     # Add assertions here
133 | 
134 | 
135 | def test_load_data_with_hdf5(mock_foundry_cache,
136 |                              mock_hdf5_foundry_schema,
137 |                              mock_read_functions,
138 |                              mock_hdf5_foundry_source_id):
139 |     cache = mock_foundry_cache
140 |     source_id = mock_hdf5_foundry_source_id
141 |     foundry_schema = FoundrySchema(mock_hdf5_foundry_schema)
142 |     cache._load_data(foundry_schema,
143 |                      file="elwood.hdf5",
144 |                      source_id=source_id,
145 |                      as_hdf5=True)
146 |     # Add assertions here
147 | 
148 | 
149 | def test_load_data_with_globus_2(mock_foundry_cache,
150 |                                  mock_tabular_foundry_schema,
151 |                                  mock_read_functions,
152 |                                  mock_tabular_foundry_source_id):
153 |     cache = mock_foundry_cache
154 |     source_id = mock_tabular_foundry_source_id
155 |     foundry_schema = FoundrySchema(mock_tabular_foundry_schema)
156 |     cache._load_data(foundry_schema,
157 |                      file="MD_properties.csv",
158 | 
source_id=source_id, 159 | as_hdf5=False) 160 | # Add assertions here 161 | 162 | 163 | def test_load_data_with_source_id(mock_foundry_cache, 164 | mock_tabular_foundry_schema, 165 | mock_read_functions, 166 | mock_hdf5_foundry_source_id): 167 | cache = mock_foundry_cache 168 | foundry_schema = FoundrySchema(mock_tabular_foundry_schema) 169 | with pytest.raises(Exception) as exc_info: 170 | cache._load_data(foundry_schema, 171 | file="MD_properties.csv", 172 | source_id="12345", 173 | as_hdf5=False) 174 | err = exc_info.value 175 | assert isinstance(err, FileNotFoundError) 176 | -------------------------------------------------------------------------------- /tests/test_foundry_components.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | import os 4 | 5 | from mdf_connect_client import MDFConnectClient 6 | from mdf_forge import Forge 7 | import mdf_toolbox 8 | import pandas as pd 9 | 10 | from foundry import foundry 11 | 12 | is_gha = os.getenv("GITHUB_ACTIONS") 13 | client_id = os.getenv("CLIENT_ID") 14 | client_secret = os.getenv("CLIENT_SECRET") 15 | 16 | 17 | @pytest.fixture 18 | def auths(): 19 | services = [ 20 | "data_mdf", 21 | "mdf_connect", 22 | "search", 23 | "petrel", 24 | "transfer", 25 | "openid", 26 | "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all", # funcx 27 | "https://auth.globus.org/scopes/f10a69a9-338c-4e5b-baa1-0dc92359ab47/https", # Eagle HTTPS 28 | "https://auth.globus.org/scopes/82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/https", # NCSA HTTPS 29 | "https://auth.globus.org/scopes/d31d4f5d-be37-4adc-a761-2f716b7af105/action_all", # Globus Search Lambda 30 | ] 31 | 32 | if is_gha: 33 | auths = mdf_toolbox.confidential_login(client_id=client_id, 34 | client_secret=client_secret, 35 | services=services, make_clients=True) 36 | 37 | search_auth = mdf_toolbox.confidential_login(client_id=client_id, 38 | client_secret=client_secret, 39 | services=["search"], make_clients=False) 40 | else: 41 | auths = mdf_toolbox.login(services=services, make_clients=True) 42 | search_auth = mdf_toolbox.login(services=["search"], make_clients=False) 43 | 44 | auths['search_authorizer'] = search_auth['search'] 45 | 46 | yield auths 47 | 48 | 49 | @pytest.fixture() 50 | def testing_data_dir(): 51 | return str(Path(__file__).parent) + '/test_data' 52 | 53 | 54 | @pytest.fixture 55 | def elwood_data(): 56 | test_dataset_name = "elwood_md_v1.2" 57 | test_doi = "10.18126/8p6m-e135" 58 | expected_title = "Project Elwood: MD Simulated Monomer Properties" 59 | yield test_dataset_name, test_doi, expected_title 60 | 61 | 62 | @pytest.fixture 63 | def iris_data(): 64 | pub_test_dataset = "_test_foundry_iris_dev_v2.1" 65 | pub_expected_title = "Iris Dataset" 66 | yield pub_test_dataset, pub_expected_title 67 | 68 | 69 | # FoundryCache testing 70 | 71 | def test_loading_as_dict(auths, elwood_data, testing_data_dir): 72 | # test loading the dataset from a local (static) copy 73 | test_dataset_name, test_doi, expected_title = elwood_data 74 | 75 | f = foundry.Foundry(authorizers=auths, local_cache_dir=testing_data_dir) 76 | search_results = f.search(test_dataset_name, as_list=True) 77 | elwood_data = search_results[0].get_as_dict() 78 | X, y = elwood_data['train'] 79 | 80 | assert len(X) > 1 81 | assert isinstance(X, pd.DataFrame) 82 | assert len(y) > 1 83 | assert isinstance(y, pd.DataFrame) 84 | 85 | 86 | def test_foundry_init(auths, elwood_data): 87 | test_dataset_name, test_doi, expected_title = 
elwood_data 88 | 89 | f = foundry.Foundry(authorizers=auths) 90 | assert isinstance(f.forge_client, Forge) 91 | assert isinstance(f.connect_client, MDFConnectClient) 92 | 93 | def test_search(auths, elwood_data): 94 | test_dataset_name, test_doi, expected_title = elwood_data 95 | 96 | f = foundry.Foundry(authorizers=auths) 97 | q = "Elwood" 98 | ds = f.search(q) 99 | 100 | assert isinstance(ds, pd.DataFrame) 101 | assert len(ds) > 0 102 | 103 | dataset = ds.iloc[0].FoundryDataset 104 | 105 | # assert ds.iloc[0]['name'] is not None 106 | assert dataset.dc.titles[0].title == expected_title 107 | 108 | # assert ds.iloc[0]['source_id'] is not None 109 | assert dataset.dataset_name == test_dataset_name 110 | 111 | # assert ds.iloc[0]['year'] is not None 112 | assert dataset.dc.publicationYear is not None 113 | 114 | 115 | def test_search_as_list(auths, elwood_data): 116 | auths = auths 117 | test_dataset_name, test_doi, expected_title = elwood_data 118 | 119 | f = foundry.Foundry(authorizers=auths) 120 | q = "Elwood" 121 | ds = f.search(q, as_list=True) 122 | 123 | assert isinstance(ds, list) 124 | assert len(ds) > 0 125 | 126 | dataset = ds[0] 127 | 128 | # assert ds.iloc[0]['name'] is not None 129 | assert dataset.dc.titles[0].title == expected_title 130 | 131 | # assert ds.iloc[0]['source_id'] is not None 132 | assert dataset.dataset_name == test_dataset_name 133 | 134 | # assert ds.iloc[0]['year'] is not None 135 | assert dataset.dc.publicationYear is not None 136 | 137 | 138 | def test_search_limit(auths, elwood_data): 139 | f = foundry.Foundry(authorizers=auths) 140 | ds = f.search(limit=10) 141 | 142 | assert isinstance(ds, pd.DataFrame) 143 | assert len(ds) == 10 144 | 145 | 146 | @pytest.mark.skipif(bool(is_gha), reason="pytest.raises seems to cause issues in GHA?") 147 | def test_search_no_results(): 148 | f = foundry.Foundry() 149 | 150 | with pytest.raises(Exception) as exc_info: 151 | f.search('chewbacca') 152 | 153 | err = exc_info.value 154 | assert hasattr(err, '__cause__') 155 | -------------------------------------------------------------------------------- /tests/test_foundry_dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | 4 | from foundry import foundry 5 | from tests.test_data import datacite_data, valid_metadata 6 | 7 | 8 | def test_dataset_instantiation(): 9 | ds = foundry.FoundryDataset(dataset_name='peanuts', 10 | foundry_schema=valid_metadata, 11 | datacite_entry=datacite_data) 12 | 13 | assert ds.foundry_schema is not None 14 | 15 | 16 | def test_dataset_instantiation_broken_dc(): 17 | broken_datacite = datacite_data.copy() 18 | broken_datacite.pop('creators') 19 | with pytest.raises(Exception) as exc_info: 20 | foundry.FoundryDataset(dataset_name='peanuts', 21 | foundry_schema=valid_metadata, 22 | datacite_entry=broken_datacite) 23 | print(f'ERROR: {exc_info.value}') 24 | assert "field required" in str(exc_info.value) 25 | 26 | 27 | def test_add_non_existent_data_to_dataset(): 28 | ds = foundry.FoundryDataset(dataset_name='peanuts', 29 | foundry_schema=valid_metadata, 30 | datacite_entry=datacite_data) 31 | 32 | with pytest.raises(ValueError) as exc_info: 33 | ds.add_data(local_data_path='./test_data/iris.csv') 34 | print(f'ERROR: {exc_info.value}') 35 | assert "local path" in str(exc_info.value) 36 | 37 | 38 | def test_add_data_folder_to_dataset(): 39 | ds = foundry.FoundryDataset(dataset_name='peanuts', 40 | foundry_schema=valid_metadata, 41 | 
datacite_entry=datacite_data)
42 |     dir_path = str(Path(__file__).parent) + '/test_data/test_dataset'
43 |     ds.add_data(local_data_path=dir_path)
44 |     assert hasattr(ds, '_local_data_path')
45 | 
46 | 
47 | def test_add_data_file_to_dataset():
48 |     ds = foundry.FoundryDataset(dataset_name='peanuts',
49 |                                 foundry_schema=valid_metadata,
50 |                                 datacite_entry=datacite_data)
51 |     file_path = str(Path(__file__).parent) + '/test_data/test_dataset/elwood.hdf5'
52 |     ds.add_data(local_data_path=file_path)
53 |     assert hasattr(ds, '_local_data_path')
54 | -------------------------------------------------------------------------------- /tests/test_https_download.py: --------------------------------------------------------------------------------
1 | # import os
2 | # import requests
3 | # import mock
4 | 
5 | # from foundry.https_download import download_file
6 | 
7 | 
8 | # def test_download_file(tmp_path):
9 | #     item = {
10 | #         "path": tmp_path,
11 | #         "name": "example_file.txt"
12 | #     }
13 | #     data_directory = tmp_path
14 | #     https_config = {
15 | #         "base_url": "https://example.com/",
16 | #         "source_id": "12345"
17 | #     }
18 | 
19 | #     # Mock the requests.get function to return a response with content
20 | #     with mock.patch.object(requests, "get") as mock_get:
21 | #         mock_get.return_value.content = b"Example file content"
22 | 
23 | #         # Call the function
24 | #         result = download_file(item, data_directory, https_config)
25 | 
26 | #     # Assert that the file was downloaded and written correctly
27 | #     assert os.path.exists(str(tmp_path) + "/12345/example_file.txt")
28 | #     with open(str(tmp_path) + "/12345/example_file.txt", "rb") as f:
29 | #         assert f.read() == b"Example file content"
30 | 
31 | #     # Assert that the result is as expected
32 | #     assert result == {str(tmp_path) + "/12345/example_file.txt status": True}
33 | 
34 | 
35 | # def test_download_file_with_existing_directories(tmp_path):
36 | #     temp_path_to_file = str(tmp_path) + '/file'
37 | #     os.mkdir(temp_path_to_file)
38 | #     temp_path_to_data = str(tmp_path) + '/data'
39 | #     os.mkdir(temp_path_to_data)
40 | 
41 | #     item = {
42 | #         "path": temp_path_to_file,
43 | #         "name": "example_file.txt"
44 | #     }
45 | #     data_directory = temp_path_to_data
46 | #     https_config = {
47 | #         "base_url": "https://example.com/",
48 | #         "source_id": "12345"
49 | #     }
50 | 
51 | #     # Create the parent directories
52 | #     os.makedirs(temp_path_to_data + "/12345")
53 | 
54 | #     # Mock the requests.get function to return a response with content
55 | #     with mock.patch.object(requests, "get") as mock_get:
56 | #         mock_get.return_value.content = b"Example file content"
57 | 
58 | #         # Call the function
59 | #         result = download_file(item, data_directory, https_config)
60 | 
61 | #     # Assert that the file was downloaded and written correctly
62 | #     assert os.path.exists(temp_path_to_data + "/12345/example_file.txt")
63 | #     with open(temp_path_to_data + "/12345/example_file.txt", "rb") as f:
64 | #         assert f.read() == b"Example file content"
65 | 
66 | #     # Assert that the result is as expected
67 | #     assert result == {temp_path_to_data + "/12345/example_file.txt status": True}
68 | --------------------------------------------------------------------------------
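
The two tests in tests/test_https_download.py above were disabled along with their third-party mock dependency. Below is a minimal revival using only the standard library. It assumes, per the commented-out code and not verified against the current API, that foundry.https_download.download_file(item, data_directory, https_config) still exists with that signature and writes the payload to <data_directory>/<source_id>/<name>.

import os
from unittest import mock

import requests

from foundry.https_download import download_file


def test_download_file_writes_mocked_content(tmp_path):
    # Hypothetical revival of the disabled test above; download_file's
    # signature and output path are assumptions carried over from the
    # commented-out original.
    item = {"path": str(tmp_path), "name": "example_file.txt"}
    https_config = {"base_url": "https://example.com/", "source_id": "12345"}

    # Stub the network call so no real HTTP request is made.
    with mock.patch.object(requests, "get") as mock_get:
        mock_get.return_value.content = b"Example file content"
        download_file(item, str(tmp_path), https_config)

    # The file should land under <data_directory>/<source_id>/<name>.
    expected = os.path.join(str(tmp_path), "12345", "example_file.txt")
    assert os.path.exists(expected)
    with open(expected, "rb") as f:
        assert f.read() == b"Example file content"

Using stdlib unittest.mock here removes the extra mock dependency from test-requirements.txt; mock.patch.object(requests, "get") behaves identically in both packages.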
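
Relatedly, the mock_read_functions fixture in tests/test_foundry_cache.py performs no patching because its nested `with patch(...)` block is commented out. One way it could be re-enabled is sketched below, using contextlib.ExitStack to avoid the triple-nested `with`. This assumes foundry.foundry_cache still exposes module-level _read_csv, _read_json, and _read_excel helpers, an assumption taken solely from the commented-out patch targets.

from contextlib import ExitStack
from unittest.mock import MagicMock, patch

import pandas as pd
import pytest


@pytest.fixture
def mock_read_functions():
    # Sketch only: the patch targets are assumed to be module-level names in
    # foundry.foundry_cache, as the original commented-out block implies.
    def frame(first_col, second_col):
        # Minimal example data, mirroring the original mock readers
        return pd.DataFrame({'Column1': first_col, 'Column2': second_col})

    with ExitStack() as stack:
        # Each patch is reverted automatically when the fixture finalizes.
        stack.enter_context(patch('foundry.foundry_cache._read_csv',
                                  MagicMock(return_value=frame([1, 2, 3], ['A', 'B', 'C']))))
        stack.enter_context(patch('foundry.foundry_cache._read_json',
                                  MagicMock(return_value=frame([4, 5, 6], ['D', 'E', 'F']))))
        stack.enter_context(patch('foundry.foundry_cache._read_excel',
                                  MagicMock(return_value=frame([7, 8, 9], ['G', 'H', 'I']))))
        yield

With the patching active, the test_load_data_* tests that request this fixture would exercise the mocked readers instead of the real files under tests/test_data, which is presumably what the fixture was originally written to do.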