├── .gitbook.yaml
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── create-a-story.md
│ └── feature_request.md
└── workflows
│ ├── documentation.yaml
│ ├── python-publish.yml
│ └── tests.yml
├── .gitignore
├── LICENSE
├── README.md
├── assets
├── foundry-black.png
├── foundry-black.svg
├── foundry-dark.png
├── foundry-dark.svg
├── foundry-light.png
├── foundry-light.svg
├── foundry-logo-4.pptx
├── foundry-logo.pptx
├── foundry-ml.png
├── foundry-purple.png
├── foundry-white.png
├── foundry-white.svg
├── foundry.png
└── foundry.svg
├── diagram.svg
├── docs
├── .gitbook
│ └── assets
│ │ ├── foundry-overview.png
│ │ ├── foundry-purple (1).png
│ │ ├── foundry-purple (2).png
│ │ ├── foundry-purple (3).png
│ │ ├── foundry-purple.png
│ │ ├── foundry.png
│ │ ├── image (1).png
│ │ ├── image (2).png
│ │ ├── image.png
│ │ ├── screen-shot-2021-07-15-at-10.00.38-am.png
│ │ └── screen-shot-2021-07-15-at-10.05.40-am.png
├── README.md
├── SUMMARY.md
├── command-line-interface.md
├── concepts
│ ├── foundry-benchmarks.md
│ ├── foundry-data-packages.md
│ ├── foundry-datasets.md
│ ├── foundry-models-and-functions.md
│ └── overview.md
├── examples.md
├── foundry-package-foundry_test-1.1-documentation-html-autogeneration.md
├── foundry.auth.md
├── foundry.foundry.md
├── foundry.foundry_cache.md
├── foundry.foundry_dataset.md
├── foundry.https_download.md
├── foundry.https_upload.md
├── foundry.loaders.md
├── foundry.loaders.tf_wrapper.md
├── foundry.loaders.torch_wrapper.md
├── foundry.md
├── foundry.models.md
├── foundry.utils.md
├── how-to-contribute
│ ├── code_of_conduct.md
│ └── contributing.md
├── publishing-datasets.md
├── publishing-models.md
├── publishing
│ ├── publishing-datasets.md
│ └── publishing-models.md
├── sphinx-autogenerated-documentation.md
└── support
│ └── troubleshooting.md
├── examples
├── README.md
├── atom-position-finding
│ ├── .ipynb_checkpoints
│ │ └── atom_position_finding-checkpoint.ipynb
│ ├── atom_position_finding.ipynb
│ └── requirements.txt
├── bandgap
│ ├── bandgap_demo.ipynb
│ ├── foundry.json
│ └── requirements.txt
├── dendrite-segmentation
│ ├── dendrite_segmentation.ipynb
│ ├── foundry.json
│ └── requirements.txt
├── g4mp2-solvation
│ └── g4mp2_solvation_demo.ipynb
├── oqmd
│ ├── foundry.json
│ ├── oqmd.ipynb
│ └── requirements.txt
├── publishing-guides
│ ├── data
│ │ └── iris.csv
│ └── dataset_publishing.ipynb
├── qmc_ml
│ └── qmc_ml.ipynb
├── work_in_progress
│ └── PACBEDCNN-thickness-mistilt
│ │ └── PACBEDCNN_thickness_mistilt.ipynb
└── zeolite
│ ├── .ipynb_checkpoints
│ └── zeolite_demo-checkpoint.ipynb
│ ├── requirements.txt
│ └── zeolite_demo.ipynb
├── foundry
├── __init__.py
├── auth.py
├── foundry.py
├── foundry_cache.py
├── foundry_dataset.py
├── https_download.py
├── https_upload.py
├── jsonschema_models
│ ├── __init__.py
│ ├── dc_model.py
│ └── project_model.py
├── loaders
│ ├── __init__.py
│ ├── tf_wrapper.py
│ └── torch_wrapper.py
├── models.py
└── utils.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── test-requirements.txt
├── test.py
└── tests
├── README.md
├── __init__.py
├── data
├── __init__.py
├── https_test
│ └── test_data.json
└── tmp_data.json
├── test_data.py
├── test_data
├── elwood_md_v1.2
│ └── MD_properties.csv
└── test_dataset
│ └── elwood.hdf5
├── test_foundry.py
├── test_foundry_cache.py
├── test_foundry_components.py
├── test_foundry_dataset.py
└── test_https_download.py
/.gitbook.yaml:
--------------------------------------------------------------------------------
1 | root: ./docs/
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/create-a-story.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Create a Story
3 | about: Suggest a user-centered feature, told as a Story
4 | title: My Story
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | _Short description_
11 |
12 | # Assumptions:
13 | 1.
14 | 2.
15 |
16 | # Acceptance Criteria
17 | Given..., when..., then...
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/documentation.yaml:
--------------------------------------------------------------------------------
1 | name: build api documentation
2 |
3 | on:
4 | pull_request:
5 | types:
6 | - closed
7 | push:
8 | branches:
9 | - "*"
10 |
11 | jobs:
12 | build_documentation:
13 | if: github.event.pull_request.merged == true
14 | name: generate api markdown docs
15 | runs-on: ubuntu-latest
16 | env:
17 | CLIENT_ID: ${{ secrets.CLIENT_ID }}
18 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}
19 | steps:
20 | - name: Check out repo's default branch
21 | uses: actions/checkout@v3
22 | - name: Setup python
23 | uses: actions/setup-python@v4
24 | with:
25 | python-version: '3.10'
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install -r requirements.txt
30 | pip install lazydocs
31 | - name: Build docs from docstrings
32 | continue-on-error: true
33 | run: |
34 | lazydocs --output-path="docs" --overview-file="README.md" --src-base-url="https://github.com/MLMI2-CSSI/foundry/tree/main" .
35 | - name: Commit files
36 | run: |
37 | echo ${{ github.ref }}
38 | git add .
39 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
40 | git config --local user.name "github-actions[bot]"
41 | git commit -m "CI: Automated documentation build" -a || exit 0
42 | git push origin ${{ github.event.pull_request.base.ref }}
43 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | release:
8 | types: [created]
9 |
10 | jobs:
11 | deploy:
12 |
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python
18 | uses: actions/setup-python@v4
19 | with:
20 | python-version: '3.x'
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install setuptools wheel twine
25 | - name: Build and publish
26 | env:
27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 | run: |
30 | python setup.py sdist bdist_wheel
31 | twine upload dist/*
32 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - dev
7 | - main
8 |
9 | jobs:
10 |
11 | build:
12 | runs-on: ubuntu-latest
13 | timeout-minutes: 20
14 | strategy:
15 | matrix:
16 | python-version: ["3.9", "3.10", "3.11", "3.12"]
17 |
18 | env:
19 | CLIENT_ID: ${{ secrets.CLIENT_ID }}
20 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }}
21 | name: build
22 | steps:
23 | - uses: actions/checkout@v4
24 | - name: Set up Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | cache: 'pip'
29 |
30 | - name: Globus auth
31 | run: 'echo "$GLOBUS_CONFIG" > ~/.globus-native-apps.cfg'
32 | shell: bash
33 | env:
34 | GLOBUS_CONFIG: "${{ secrets.GLOBUS_CONFIG }}"
35 |
36 | - name: Install dependencies
37 | run: |
38 | python -m pip install --upgrade pip
39 | pip install -r requirements.txt
40 | pip install -r test-requirements.txt
41 |
42 | - name: Lint with flake8
43 | run: |
44 | # stop the build if there are any flake8 errors
45 | flake8 foundry
46 |
47 | - name: Test with pytest
48 | run: |
49 | pytest -s -v tests/ --cov=./foundry --cov-report=xml
50 | - name: Upload coverage to Codecov
51 | run: |
52 | curl -Os https://uploader.codecov.io/v0.1.0_4653/linux/codecov
53 |
54 | chmod +x codecov
55 | ./codecov -t ${{ secrets.CODECOV_TOKEN }}
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | */build/*
2 | *.DS_STORE
3 | *.pyc
4 | *.idea
5 | */foundry_ml.egg-info/*
6 | globus_creds
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 The University of Chicago
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
5 | [Foundry-ML documentation](https://ai-materials-and-chemistry.gitbook.io/foundry/)
12 |
13 |
14 | Foundry-ML simplifies the discovery and usage of ML-ready datasets in materials science and chemistry, providing a simple API to access even complex datasets.
15 | * Load ML-ready data with just a few lines of code
16 | * Work with datasets in local or cloud environments.
17 | * Publish your own datasets with Foundry to promote community usage
18 | * (in progress) Run published ML models without hassle
19 |
20 | Learn more and see our available datasets on [Foundry-ML.org](https://foundry-ml.org/)
21 |
22 |
23 |
24 | # Documentation
25 | Information on how to install and use Foundry is available in our documentation [here](https://ai-materials-and-chemistry.gitbook.io/foundry/v/docs/).
26 |
27 | DLHub documentation for model publication and running information can be found [here](https://dlhub-sdk.readthedocs.io/en/latest/servable-publication.html).
28 |
29 | # Quick Start
30 | Install Foundry-ML via command line with:
31 | `pip install foundry_ml`
32 |
33 | You can use the following code to import and instantiate Foundry-ML, then load a dataset.
34 |
35 | ```python
36 | from foundry import Foundry
37 | f = Foundry(index="mdf")
38 |
39 |
40 | f = f.load("10.18126/e73h-3w6n", globus=True)
41 | ```
42 | *NOTE*: If you run locally and don't want to install the [Globus Connect Personal endpoint](https://www.globus.org/globus-connect-personal), just set `globus=False`.
43 |
44 | If running this code in a notebook, a table of metadata for the dataset will appear:
45 |
46 |
47 |
48 | We can load the data with `f.load_data()`, specifying splits such as `train` for different segments of the dataset, and then use matplotlib to visualize it.
49 |
50 | ```python
51 | import matplotlib.pyplot as plt
   | 
   | res = f.load_data()
52 |
53 | imgs = res['train']['input']['imgs']
54 | desc = res['train']['input']['metadata']
55 | coords = res['train']['target']['coords']
56 |
57 | n_images = 3
58 | offset = 150
59 | key_list = list(res['train']['input']['imgs'].keys())[0+offset:n_images+offset]
60 |
61 | fig, axs = plt.subplots(1, n_images, figsize=(20,20))
62 | for i in range(n_images):
63 | axs[i].imshow(imgs[key_list[i]])
64 | axs[i].scatter(coords[key_list[i]][:,0], coords[key_list[i]][:,1], s = 20, c = 'r', alpha=0.5)
65 | ```
66 |
67 |
68 | [See full examples](./examples)
69 |
70 | # How to Cite
71 | If you find Foundry-ML useful, please cite the following [paper](https://doi.org/10.21105/joss.05467)
72 |
73 | ```
74 | @article{Schmidt2024,
75 | doi = {10.21105/joss.05467},
76 | url = {https://doi.org/10.21105/joss.05467},
77 | year = {2024},
   | publisher = {The Open Journal},
78 | volume = {9},
79 | number = {93},
80 | pages = {5467},
81 | author = {Kj Schmidt and Aristana Scourtas and Logan Ward and Steve Wangen and Marcus Schwarting and Isaac Darling and Ethan Truelove and Aadit Ambadkar and Ribhav Bose and Zoa Katok and Jingrui Wei and Xiangguo Li and Ryan Jacobs and Lane Schultz and Doyeon Kim and Michael Ferris and Paul M. Voyles and Dane Morgan and Ian Foster and Ben Blaiszik},
82 | title = {Foundry-ML - Software and Services to Simplify Access to Machine Learning Datasets in Materials Science},
   | journal = {Journal of Open Source Software}
83 | }
84 | ```
85 |
86 | # Contributing
87 | Foundry is an open source project and we encourage contributions from the community. To contribute, please fork the repository from the `main` branch and open a Pull Request against `main`. A member of our team will review your PR shortly.
88 |
89 | ## Developer notes
90 | To enforce consistency with external schemas for the metadata and datacite structures ([contained in the MDF data schema repository](https://github.com/materials-data-facility/data-schemas)), the `dc_model.py` and `project_model.py` pydantic data models (found in the `foundry/jsonschema_models` folder) were generated using the [datamodel-code-generator](https://github.com/koxudaxi/datamodel-code-generator/) tool. To ensure compliance with flake8 linting, the `--use-annotated` flag was passed so that regex patterns in `dc_model.py` are specified using pydantic's `Annotated` type rather than the soon-to-be-deprecated `constr` type. The command used to run datamodel-code-generator looks like:
91 | ```
92 | datamodel-codegen --input dc.json --output dc_model.py --use-annotated
93 | ```
94 |
95 | # Primary Support
96 | This work was supported by the National Science Foundation under NSF Award Number: 1931306 "Collaborative Research: Framework: Machine Learning Materials Innovation Infrastructure".
97 |
98 | # Other Support
99 | Foundry-ML brings together many components in the materials data ecosystem, including [MAST-ML](https://mastmldocs.readthedocs.io/en/latest/), the [Data and Learning Hub for Science](https://www.dlhub.org) (DLHub), and the [Materials Data Facility](https://materialsdatafacility.org) (MDF).
100 |
101 | ## MAST-ML
102 | This work was supported by the National Science Foundation (NSF) SI2 award No. 1148011 and DMREF award number DMR-1332851.
103 |
104 | ## The Data and Learning Hub for Science (DLHub)
105 | This material is based upon work supported by Laboratory Directed Research and Development (LDRD) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357.
106 | https://www.dlhub.org
107 |
108 | ## The Materials Data Facility
109 | This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design (CHiMaD)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design (CHiMaD). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design (IMaD): Leverage, Innovate, and Disseminate".
110 | https://www.materialsdatafacility.org
111 |
--------------------------------------------------------------------------------
/assets/foundry-black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-black.png
--------------------------------------------------------------------------------
/assets/foundry-black.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/foundry-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-dark.png
--------------------------------------------------------------------------------
/assets/foundry-dark.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/foundry-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-light.png
--------------------------------------------------------------------------------
/assets/foundry-light.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/foundry-logo-4.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-logo-4.pptx
--------------------------------------------------------------------------------
/assets/foundry-logo.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-logo.pptx
--------------------------------------------------------------------------------
/assets/foundry-ml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-ml.png
--------------------------------------------------------------------------------
/assets/foundry-purple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-purple.png
--------------------------------------------------------------------------------
/assets/foundry-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry-white.png
--------------------------------------------------------------------------------
/assets/foundry-white.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/foundry.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/assets/foundry.png
--------------------------------------------------------------------------------
/assets/foundry.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/.gitbook/assets/foundry-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-overview.png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/foundry-purple (1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (1).png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/foundry-purple (2).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (2).png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/foundry-purple (3).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple (3).png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/foundry-purple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry-purple.png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/foundry.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/foundry.png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/image (1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image (1).png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/image (2).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image (2).png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/image.png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.00.38-am.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.00.38-am.png
--------------------------------------------------------------------------------
/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MLMI2-CSSI/foundry/0c028b81cdb4dcf6660e0c412e5a3f98f20b716d/docs/.gitbook/assets/screen-shot-2021-07-15-at-10.05.40-am.png
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Getting started with Foundry
2 |
3 | 
4 |
5 | ## What is Foundry?
6 |
7 | Foundry is a Python package that simplifies the discovery and usage of machine-learning-ready datasets in materials science and chemistry. Foundry provides software tools that make it easy to load these datasets and work with them in local or cloud environments. Further, Foundry provides a dataset specification and defined curation flows that allow users to create new datasets for the community to use through this same interface.
8 |
9 | ## Installation
10 |
11 | Foundry can be installed on any operating system with Python using pip:
12 |
13 | ```text
14 | pip install foundry-ml
15 | ```
16 |
17 | ### Globus
18 |
19 | Foundry uses the Globus platform for authentication, search, and to optimize some data transfer operations. Follow the steps below to get set up.
20 |
21 | * [Create a free account](https://app.globus.org) with your institutional credentials or with free IDs \(GlobusID, Google, ORCID, etc.\).
22 | * [Set up a Globus Connect Personal endpoint ](https://www.globus.org/globus-connect-personal)_**\(optional\)**_. While this step is optional, some Foundry capabilities will work more efficiently when using GCP.
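   | 
   | A quick way to confirm that everything is set up is to instantiate the client and list the available datasets. This is a minimal sketch \(the first run triggers a one-time Globus login flow in your browser\):
   | 
   | ```python
   | from foundry import Foundry
   | 
   | # Instantiating the client starts the Globus authentication flow
   | f = Foundry()
   | 
   | # If this prints a table of available datasets, authentication worked
   | print(f.list())
   | ```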
23 |
24 | ## Project Support
25 |
26 | This work was supported by the National Science Foundation under NSF Award Number: 1931306 "Collaborative Research: Framework: Machine Learning Materials Innovation Infrastructure".
27 |
28 | ### Other Support
29 |
30 | Foundry brings together many components in the materials data ecosystem, including MAST-ML, the Data and Learning Hub for Science \(DLHub\), and the Materials Data Facility \(MDF\).
31 |
32 | #### MAST-ML
33 |
34 | This work was supported by the National Science Foundation \(NSF\) SI2 award No. 1148011 and DMREF award number DMR-1332851.
35 |
36 | #### The Data and Learning Hub for Science \(DLHub\)
37 |
38 | This material is based upon work supported by Laboratory Directed Research and Development \(LDRD\) funding from Argonne National Laboratory, provided by the Director, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357. [https://www.dlhub.org](https://www.dlhub.org)
39 |
40 | #### The Materials Data Facility
41 |
42 | This work was performed under financial assistance award 70NANB14H012 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the [Center for Hierarchical Material Design \(CHiMaD\)](http://chimad.northwestern.edu). This work was performed under the following financial assistance award 70NANB19H005 from U.S. Department of Commerce, National Institute of Standards and Technology as part of the Center for Hierarchical Materials Design \(CHiMaD\). This work was also supported by the National Science Foundation as part of the [Midwest Big Data Hub](http://midwestbigdatahub.org) under NSF Award Number: 1636950 "BD Spokes: SPOKE: MIDWEST: Collaborative: Integrative Materials Design \(IMaD\): Leverage, Innovate, and Disseminate". [https://www.materialsdatafacility.org](https://www.materialsdatafacility.org)
43 |
44 |
--------------------------------------------------------------------------------
/docs/SUMMARY.md:
--------------------------------------------------------------------------------
1 | # Table of contents
2 |
3 | * [Getting started with Foundry](README.md)
4 |
5 | ## How to contribute
6 |
7 | * [Contribution Process](how-to-contribute/contributing.md)
8 | * [Contributor Covenant](how-to-contribute/code_of_conduct.md)
9 |
10 | ---
11 |
12 | * [Sphinx Autogenerated documentation - markdown](sphinx-autogenerated-documentation.md)
13 | * [foundry package — Foundry\_test 1.1 documentation - HTML AUTOGENERATION](foundry-package-foundry_test-1.1-documentation-html-autogeneration.md)
14 |
15 |
--------------------------------------------------------------------------------
/docs/command-line-interface.md:
--------------------------------------------------------------------------------
1 | # Command Line Interface \(CLI\)
2 |
3 | The Foundry command line interface \(CLI\) allows users to build their data environment from the command line using a specification file. This is the data analog of how `pip` or `conda` allow users to build a software environment from software specification files.
4 |
5 | ## Installation
6 |
7 | ```text
8 | pip install foundry-ml-cli
9 | ```
10 |
11 | ### CLI Options
12 |
13 | **`--file`** : \(string\) the name of the specification file to build. _Default: "./foundry.json"_
14 |
15 | **`--globus`** : \(bool\) If True, uses Globus to download the files, otherwise HTTPS. _Default: False_
16 |
17 | **`--interval`** : \(int\) Time in seconds between polling operations to check transfer status. _Default: 3_
18 |
19 | **`--verbose`** : \(bool\) If True, print out more logging information to the console. _Default: False_
20 |
21 | ## Example Usage
22 |
23 | In a folder containing a file named `foundry.json`:
24 |
25 | ```text
26 | /foundry.json
27 |
28 |
29 | $ foundry
30 | ```
31 |
32 | This is the same as running
33 |
34 | ```text
35 | /foundry.json
36 |
37 |
38 | $ foundry --file=foundry.json --globus=False --interval=3 --verbose=False
39 | ```
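   | 
   | The `foundry.json` file follows the Foundry data package specification \(see [Foundry Data Packages](concepts/foundry-data-packages.md)\). A minimal example, with illustrative values, might look like:
   | 
   | ```javascript
   | {
   |   "name": "My Analysis",
   |   "version": "1.0.0",
   |   "description": "Datasets needed for my analysis",
   |   "private": false,
   |   "dependencies": [{
   |     "name": "_test_foundry_mp_bandgap_v1.1",
   |     "version": "1.1",
   |     "provider": "MDF"
   |   }]
   | }
   | ```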
40 |
41 |
--------------------------------------------------------------------------------
/docs/concepts/foundry-benchmarks.md:
--------------------------------------------------------------------------------
1 | # Foundry Benchmarks
2 |
3 |
--------------------------------------------------------------------------------
/docs/concepts/foundry-data-packages.md:
--------------------------------------------------------------------------------
1 | # Foundry Data Packages
2 |
3 | Foundry Data Packages provide a logical and portable way to specify and collect data for analyses. From a data package, a user can easily build a local data environment that matches the data package.
4 |
5 | ## Data Package Specification Fields
6 |
7 | **`name`** : \(string\) A name for the data package
8 |
9 | **`version`** : \(string\) A version of the form `<major>.<minor>.<sub>`, e.g., "1.2.0"
10 |
11 | **`description`** : \(string\) A short description of the data package and its intended use
12 |
13 | **`tags`** : \(list\) A list of tag strings associated with the data package
14 |
15 | **`dependencies`** : \(list\) A list of dependency objects associated with the data package
16 |
17 | **`private`** : \(bool\) Whether the data package is private; private packages are not registered in a public data package index
18 |
19 | ### Dependency Objects
20 |
21 | **`identifier`** : \(string\) Unique identifier for the dataset
22 |
23 | **`version`** : \(string\) The version of the dataset to use
24 |
25 | **`provider`** : \(string\) The dataset provider. _Currently only "MDF" is supported_
26 |
27 | ```javascript
28 | {
29 | "identifier": "_test_foundry_mp_bandgap_v1.1",
30 | "version": "1.1",
31 | "provider": "MDF"
32 | }
33 | ```
34 |
35 | ## Example Specification
36 |
37 | ```javascript
38 | {
39 | "name": "Band Gap Analysis",
40 | "version": "1.0.0",
41 | "description": "Datasets for band gap uber model generation",
42 | "private": true,
43 | "dependencies": [{
44 | "name": "_test_foundry_experimental_bandgap_v1.1",
45 | "version": "1.1",
46 | "provider": "MDF"
47 | },
48 | {
49 | "name": "_test_foundry_mp_bandgap_v1.1",
50 | "version": "1.1",
51 | "provider": "MDF"
52 | },
53 | {
54 | "name": "_test_foundry_oqmd_bandgap_v1.1",
55 | "version": "1.1",
56 | "provider": "MDF"
57 | },
58 | {
59 | "name": "_test_foundry_assorted_computational_bandgap_v1.1",
60 | "version": "1.1",
61 | "provider": "MDF"
62 | }
63 | ]
64 | }
65 | ```
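   | 
   | As a sketch, a specification file like the one above can be parsed and validated programmatically \(assuming the `FoundrySpecification` pydantic model in `foundry.models` matches this schema\):
   | 
   | ```python
   | import json
   | 
   | from foundry.models import FoundrySpecification
   | 
   | # Parse and validate a data package specification file
   | with open("foundry.json") as fh:
   |     spec = FoundrySpecification(**json.load(fh))
   | 
   | print(spec.name, spec.version)
   | ```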
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/docs/concepts/foundry-datasets.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Describes the metadata provided for each Foundry dataset
3 | ---
4 |
5 | # Foundry Datasets
6 |
7 | Foundry Datasets are composed of two key components, [_**data**_](foundry-datasets.md#data) and descriptive [_**metadata**_](foundry-datasets.md#describing-datasets-with-metadata). To make the data easily consumable, _**data**_ \(consisting of files\) should be assembled following the supported structures. The _**metadata**_ description allows tracking of high-level information \(e.g., authors, associated institutions, licenses, data location\), and also information on how to operate on the datasets \(e.g., how to load the data, training/test splits\).
8 |
9 | ### **Data**
10 |
11 | ### Example - Record-Based Data
12 |
13 | #### **Tabular Data**
14 |
15 | For tabular data, columns represent the different keys of the data, and rows represent individual records.
16 |
17 | {% hint style="info" %}
18 | Supported tabular data types currently include JSON, csv, and xlsx.
19 | {% endhint %}
20 |
21 | In this example, we showcase how to describe a JSON record-based dataset where each record is a valid JSON object in a JSON list or a line in a JSON line delimited file.
22 |
23 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap |
24 | | :--- | :--- | :--- | :--- |
25 | | 0.10 | 0.52 | 1 | 1.40 |
26 | | 0.34 | 0.910 | 0 | 0.73 |
27 | | ... | ... | ... | |
28 |
29 | For this example dataset the `Key` object could be:
30 |
31 | ```javascript
32 | {
33 | "short_name": "oqmd-bandgaps",
34 | "data_type": "tabular",
35 | "task_type": ["supervised"],
36 | "domain": ["materials science"],
37 | "n_items": 29197,
38 | "splits": [{
39 | "type": "train",
40 | "path": "foundry_dataframe.json",
41 | "label": "train"
42 | }],
43 | "keys": [{
44 | "key": ["reference"],
45 | "type": "input",
46 | "units": "",
47 | "description": "source publication of the bandgap value"
48 | }, {
49 | "key": ["icsd_id"],
50 | "type": "input",
51 | "units": "",
52 | "description": "corresponding id in ICSD of this compound"
53 | }, {
54 | "key": ["structure"],
55 | "type": "input",
56 | "units": "",
57 | "description": "the structure of this compound"
58 | }, {
59 | "key": ["composition"],
60 | "type": "input",
61 | "units": "",
62 | "description": "reduced composition of this compound"
63 | }, {
64 | "key": ["comments"],
65 | "type": "input",
66 | "units": "",
67 | "description": "Additional information about this bandgap measurement"
68 | }, {
69 | "key": ["bandgap type"],
70 | "type": "input",
71 | "units": "",
72 | "description": "the type of the bandgap, e.g., direct or indirect"
73 | }, {
74 | "key": ["comp method"],
75 | "type": "input",
76 | "units": "",
77 | "description": "functional used to calculate the bandgap"
78 | }, {
79 | "key": ["space group"],
80 | "type": "input",
81 | "units": "",
82 | "description": "the space group of this compound"
83 | },
84 | {
85 | "key": ["bandgap value (eV)"],
86 | "type": "output",
87 | "units": "eV",
88 | "description": "value of the bandgap"
89 | }
90 | ]
91 | }
92 | ```
93 |
94 | **TODO**
95 |
96 | ```text
97 | "keys":[{
98 | "key": "feature_1",
99 | "type": "input",
100 | "units": None,
101 | "description": "This is feature 1"
102 | },{
103 | "key": "feature_2",
104 | "type": "input",
105 | "units": None,
106 | "description": "This is feature 2"
107 | },{
108 | "key": "material_type",
109 | "type": "input",
110 | "units": None,
111 | "description": "This is the material type",
112 | "labels":["perovskite","not perovskite"]
113 | },{
114 | "key": "band_gap",
115 | "type": "target",
116 | "units": "eV",
117 | "description": "This is the simulated band gap in eV"
118 | }
119 | ]
120 | ```
121 |
122 | {% hint style="info" %}
123 | This tabular data file should be saved in the base directory as **`foundry_dataframe.json`**
124 | {% endhint %}
125 |
126 | * Write general pandas reader to try csv, JSON, xlsx for opening
127 |
128 | #### Hierarchical Data
129 |
130 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case, features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in arrays stored at `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be:
131 |
132 | ```javascript
133 | {
134 | "short_name": "segmentation-dev",
135 | "data_type": "hdf5",
136 | "task_type": ["unsupervised", "segmentation"],
137 | "domain": ["materials science", "chemistry"],
138 | "n_items": 100,
139 | "splits": [{
140 | "type": "train",
141 | "path": "foundry.hdf5",
142 | "label": "train"
143 | }],
144 | "keys": [{
145 | "key": ["train/input"],
146 | "type": "input",
147 | "description": "input, unlabeled images"
148 | }, {
149 | "key": ["train/output"],
150 | "type": "target",
151 | "description": "target, labeled images"
152 | }]
153 | }
154 | ```
155 |
156 | ```text
157 | "keys":[{
158 | "key": "/data/arr1",
159 | "type": "input",
160 | "units": None,
161 | "description": "This is an array containing input data"
162 | },{
163 | "key": "/other_data/arr2",
164 | "type": "input",
165 | "units": None,
166 | "description": "This is an another array containing input data"
167 | },{
168 | "key": "/data/band_gaps",
169 | "type": "target",
170 | "units": "eV",
171 | "description": "This is the simulated band gap in eV"
172 | }
173 | ]
174 | ```
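   | 
   | To illustrate how these `/`-notation keys map onto an HDF5 file, here is a minimal sketch using `h5py` \(file and key names taken from the example above\):
   | 
   | ```python
   | import h5py
   | 
   | input_keys = ["/data/arr1", "/other_data/arr2"]
   | target_key = "/data/band_gaps"
   | 
   | with h5py.File("foundry.hdf5", "r") as f:
   |     X = [f[key][()] for key in input_keys]  # read each input array
   |     y = f[target_key][()]                   # read the target array
   | ```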
175 |
176 | ## Descriptive Metadata
177 |
178 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). This metadata captures high-level bibliographic information \(e.g., authors, titles, identifiers\). Many of these fields have helper functions in the SDK to make it easier to match the DataCite schema.
179 |
180 | **Keys \(object\):** Key objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Key objects have the following properties:
181 |
182 | * **`key (str)`** A name mapping to a column name \(e.g., for csv files\) or key within a data structure \(e.g., for HDF5 files\)
183 | * **`type (str)`** The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_
184 | * **`units (str) [optional]`** The scientific units associated with a key. _Default: None_
185 | * **`description (str) [optional]`** A free-text description of the key. _Default: None_
186 | * **`labels (list of str) [optional]`** A list of strings mapped to integers in a key column
187 |
188 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading and referencing easier.
189 |
190 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. _Options \["tabular","hdf5"\]_
191 |
192 | ```text
193 | "foundry": {
194 | "dc": {},
195 | "keys": [{
196 | "type": "input",
197 | "name": "feature_1",
198 | "units": "",
199 | "description": "This is an input"
200 | },
201 | {
202 | "type": "target",
203 | "name": "band_gap",
204 | "units": "eV",
205 | "description": "blah blah",
206 | "labels": []
207 | }
208 | ],
209 | "short_name": "my_short_name",
210 | "type": "tabular"
211 | }
212 | ```
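   | 
   | As a sketch of how a client can use this metadata to split a tabular file into inputs and targets \(column names taken from the example above\):
   | 
   | ```python
   | import pandas as pd
   | 
   | # Read the record-based JSON dataframe file
   | df = pd.read_json("foundry_dataframe.json")
   | 
   | # Use the key metadata to separate inputs from targets
   | X = df[["feature_1"]]
   | y = df["band_gap"]
   | ```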
213 |
214 |
--------------------------------------------------------------------------------
/docs/concepts/foundry-models-and-functions.md:
--------------------------------------------------------------------------------
1 | # Foundry Models and Functions
2 |
3 | ## Foundry Model Providers
4 |
5 | Currently, Foundry supports models and functions provided via the [Data and Learning Hub for Science \(DLHub\)](https://www.dlhub.org)/[FuncX](https://www.funcx.org).
6 |
7 |
--------------------------------------------------------------------------------
/docs/concepts/overview.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | TODO:
4 |
5 | * Change the code snippet in the image
6 | * Write the text :\)
7 |
8 | 
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/docs/examples.md:
--------------------------------------------------------------------------------
1 | # Getting Started with Python
2 |
3 | ## Scientific Examples
4 |
5 | [Check out our example notebooks](https://github.com/MLMI2-CSSI/foundry/tree/master/examples) for how to load or publish datasets using Foundry.
6 |
7 | ## Quickstart
8 |
9 | ### Creating a Foundry Client
10 |
11 | The Foundry client provides access to all of the methods described here for listing, loading, and publishing datasets and models. The code below will create a Foundry client:
12 |
13 | ```python
14 | from foundry import Foundry
15 | f = Foundry()
16 | ```
17 |
18 | {% hint style="success" %}
19 | If you are running your script on cloud resources \(e.g. Google Colab, Binder\), see [Using Foundry on Cloud Computing Resources](examples.md#using-foundry-on-cloud-computing-resources).
20 | {% endhint %}
21 |
22 | ### Listing Datasets
23 |
24 | To show all available Foundry datasets, you can use the Foundry `list()` method as follows. The method returns a pandas DataFrame with details on the available datasets.
25 |
26 | ```python
27 | f.list()
28 | ```
29 |
30 | ### Loading Datasets
31 |
32 | The Foundry client can be used to access datasets using a `source_id`, e.g. here `"_test_foundry_fashion_mnist_v1.1"`. You can retrieve the `source_id` from the [`list()` method](examples.md#listing-datasets).
33 |
34 | ```python
35 | from foundry import Foundry
36 | f = Foundry()
37 | f = f.load("_test_foundry_fashion_mnist_v1.1")
38 | ```
39 |
40 | This will remotely load the metadata \(e.g., data location, data keys, etc.\) and download the data to local storage if it is not already cached. Data can be downloaded via HTTPS without additional setup or more optimally with a Globus endpoint [set up](https://www.globus.org/globus-connect-personal) on your machine.
41 |
42 | Once the data are accessible locally, access them with the `load_data()` method. `load_data()` lets you load data from a specific split defined for the dataset; here we use `train`.
43 |
44 | ```python
45 | res = f.load_data()
46 | X,y = res['train']
47 | ```
48 |
49 | The data are then usable within the `X` and `y` variables. This full example can be found in [`/examples/fashion-mnist/`](https://github.com/MLMI2-CSSI/foundry/tree/master/examples/fashion-mnist).
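   | 
   | For instance \(a sketch, assuming the `train` split of this image dataset returns arrays of images and labels\), a sample can be inspected directly:
   | 
   | ```python
   | import matplotlib.pyplot as plt
   | 
   | # Display the first training image with its label
   | plt.imshow(X[0], cmap="gray")
   | plt.title(f"label: {y[0]}")
   | plt.show()
   | ```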
50 |
51 | ## Using Foundry on Cloud Computing Resources
52 |
53 | Foundry works with common cloud computing providers \(e.g., the NSF sponsored Jetstream and Google Colab\). On these resources, simply add the following arguments to use a cloud-compatible authentication flow.
54 |
55 | ```python
56 | f = Foundry(no_browser=True, no_local_server=True)
57 | ```
58 |
59 | When downloading data, add the following argument to download via HTTPS.
60 |
61 | {% hint style="info" %}
62 | This method may be slow for large datasets and datasets with many files
63 | {% endhint %}
64 |
65 | ```python
66 | f = f.load("_test_foundry_fashion_mnist_v1.1", globus=False)
67 | X, y = f.load_data()
68 | ```
69 |
70 |
--------------------------------------------------------------------------------
/docs/foundry-package-foundry_test-1.1-documentation-html-autogeneration.md:
--------------------------------------------------------------------------------
1 | # foundry package — Foundry\_test 1.1 documentation - HTML AUTOGENERATION
2 |
3 | ## foundry.foundry module
4 | 
5 | ### _class_ `foundry.foundry.Foundry`
6 | 
7 | `Foundry(no_browser=False, no_local_server=False, search_index='mdf-test', *, dc: Dict = {}, mdf: Dict = {}, dataset: foundry.models.FoundryDataset = {}, config: foundry.models.FoundryConfig = FoundryConfig(dataframe_file='foundry_dataframe.json', data_file='foundry.hdf5', metadata_file='foundry_metadata.json', destination_endpoint=None, local=False, metadata_key='foundry', organization='foundry', local_cache_dir='./data'), dlhub_client: Any = None, forge_client: Any = None, connect_client: Any = None, xtract_tokens: Any = None)`
8 | 
9 | Bases: `foundry.models.FoundryMetadata`
10 | 
11 | Foundry Client Base Class. TODO: Add docstring.
12 | 
13 | Attributes: `connect_client: Any`, `dlhub_client: Any`, `forge_client: Any`, `xtract_tokens: Any`
14 | 
15 | #### `build(spec, globus=False, interval=3, file=False)`
16 | 
17 | Build a Foundry Data Package.
18 | 
19 | * **spec** \(dict or str\) – dict or relative filename of the data package specification
20 | * **globus** \(bool\) – if True, use Globus to fetch datasets
21 | * **interval** \(int\) – polling interval on checking task status, in seconds
22 | * **file** \(str\) – one of "file" or None
23 | 
24 | **Returns:** \(Foundry\) self: for chaining
25 | 
26 | #### `check_model_status(res)`
27 | 
28 | Check status of model or function publication to DLHub.
29 | 
30 | TODO: currently broken on the DLHub side of things.
31 | 
32 | #### `check_status(source_id, short=False, raw=False)`
33 | 
34 | Check the status of your submission.
35 | 
36 | * **source\_id** \(str\) – The `source_id` \(`source_name` + version information\) of the submission to check. Returned in the `res` result from `publish()` via MDF Connect Client.
37 | * **short** \(bool\) – When `False`, will print a status summary containing all of the status steps for the dataset. When `True`, will print a short finished/processing message, useful for checking many datasets' statuses at once. **Default:** `False`
38 | * **raw** \(bool\) – When `False`, will print a nicely formatted status summary. When `True`, will return the full status result. For direct human consumption, `False` is recommended. **Default:** `False`
39 | 
40 | **Returns:** \(dict\) The full status result, if `raw` is `True`.
41 | 
42 | #### `collect_dataframes(packages=[])`
43 | 
44 | Collect dataframes of local data packages.
45 | 
46 | * **packages** \(list\) – List of packages to collect; defaults to all
47 | 
48 | **Returns:** \(tuple\) Tuple of X \(pandas.DataFrame\) and y \(pandas.DataFrame\)
49 | 
50 | #### `configure(**kwargs)`
51 | 
52 | Set the Foundry config.
53 | 
54 | * **file** \(str\) – Path to the metadata file \(default: `self.config.metadata_file`\)
55 | * **dataframe\_file** \(str\) – Filename for the dataframe file \(default: "foundry\_dataframe.json"\)
56 | * **data\_file** \(str\) – Filename for the data file \(default: "foundry.hdf5"\)
57 | * **destination\_endpoint** \(str\) – Globus endpoint UUID where Foundry data should move
58 | * **local\_cache\_dir** \(str\) – Where to place collected data \(default: "./data"\)
59 | 
60 | **Returns:** \(Foundry\) self: for chaining
61 | 
62 | #### `describe_model()`
63 | 
64 | #### `download(globus=True, verbose=False, **kwargs)`
65 | 
66 | Download a Foundry dataset.
67 | 
68 | * **globus** \(bool\) – if True, use Globus to download the data, else try HTTPS
69 | * **verbose** \(bool\) – if True, print out debug information during the download
70 | 
71 | **Returns:** \(Foundry\) self: for chaining
72 | 
73 | #### `get_keys(type, as_object=False)`
74 | 
75 | Get keys for a Foundry dataset.
76 | 
77 | * **type** \(str\) – The type of key to be returned, e.g., "input" or "target"
78 | * **as\_object** \(bool\) – When `False`, will return a list of keys as strings. When `True`, will return the full key objects. **Default:** `False`
79 | 
80 | **Returns:** \(list\) String representations of keys if `as_object` is `False`, otherwise the full key objects.
81 | 
82 | #### `get_packages(paths=False)`
83 | 
84 | Get available local data packages.
85 | 
86 | * **paths** \(bool\) – If True, return paths in addition to package names; if False, return package names only
87 | 
88 | **Returns:** \(list\) List describing local Foundry packages
89 | 
90 | #### `list()`
91 | 
92 | List available Foundry data packages.
93 | 
94 | **Returns:** \(pandas.DataFrame\) DataFrame with a summary list of Foundry data packages, including name, title, and publication year
95 | 
96 | #### `load(name, download=True, globus=True, verbose=False, metadata=None, **kwargs)`
97 | 
98 | Load the metadata for a Foundry dataset into the client.
99 | 
100 | * **name** \(str\) – Name of the Foundry dataset
101 | * **download** \(bool\) – If True, download the data associated with the package \(default: True\)
102 | * **globus** \(bool\) – If True, download using Globus, otherwise HTTPS
103 | * **verbose** \(bool\) – If True, print additional debug information
104 | * **metadata** \(dict\) – **For debug purposes.** A search result analog to prepopulate metadata.
105 | 
106 | Keyword arguments:
107 | 
108 | * **interval** \(int\) – How often to poll Globus to check if transfers are complete
109 | 
110 | **Returns:** self
111 | 
112 | #### `load_data(source_id=None, globus=True)`
113 | 
114 | Load in the data associated with the prescribed dataset.
115 | 
116 | Tabular Data Type: Data are arranged in a standard data frame stored in `self.dataframe_file`. The contents are read, and
117 | 
118 | File Data Type: <<Add desc>>
119 | 
120 | For more complicated data structures, users should subclass Foundry and override the `load_data` function.
121 | 
122 | * **inputs** \(list\) – List of strings for input columns
123 | * **targets** \(list\) – List of strings for output columns
124 | 
125 | **Returns:** \(tuple\) Tuple of X, y values
126 | 
127 | #### `publish(foundry_metadata, data_source, title, authors, update=False, publication_year=None, **kwargs)`
128 | 
129 | Submit a dataset for publication.
130 | 
131 | * **foundry\_metadata** \(dict\) – Dict of metadata describing the data package
132 | * **data\_source** \(string\) – URL for the Globus endpoint
133 | * **title** \(string\) – Title of the data package
134 | * **authors** \(list\) – List of data package author names, e.g., "Jack Black" or "Nunez, Victoria"
135 | * **update** \(bool\) – True if this is an update to a prior data package \(default: `self.config.metadata_file`\)
136 | * **publication\_year** \(int\) – Year of dataset publication. If None, will be set to the current calendar year by MDF Connect Client. \(default: $current\_year\)
137 | 
138 | Keyword arguments:
139 | 
140 | * **affiliations** \(list\) – List of author affiliations
141 | * **tags** \(list\) – List of tags to apply to the data package
142 | * **short\_name** \(string\) – Shortened/abbreviated name of the data package
143 | * **publisher** \(string\) – Data publishing entity \(e.g., MDF, Zenodo, etc.\)
144 | 
145 | **Returns:** \(dict\) MDF Connect Response – response from MDF Connect, to allow tracking of the dataset. Contains `source_id`, which can be used to check the status of the submission.
146 | 
147 | #### `publish_model(options)`
148 | 
149 | Submit a model or function for publication.
150 | 
151 | * **options** \(dict\) – dict of all possible options
152 | 
153 | Options keys: title \(req\), authors \(req\), short\_name \(req\), servable\_type \(req\) \("static method", "class method", "keras", "pytorch", "tensorflow", "sklearn"\), affiliations, domains, abstract, references, requirements \(dict of library:version key pairs\), module \(if Python method\), function \(if Python method\), inputs \(not needed for TF; dict of options\), outputs \(not needed for TF\), methods \(e.g., research methods\), DOI, publication\_year \(advanced\), version \(advanced\), visibility \(dict of users and groups, each a list\), funding reference, rights
154 | 
155 | TODO: alternate identifier \(to add an identifier of this artifact in another service\), add file, add directory, add files.
156 | 
157 | #### `run(name, inputs, **kwargs)`
158 | 
159 | Run a model on data.
160 | 
161 | * **name** \(str\) – DLHub model name
162 | * **inputs** – Data to send to DLHub as inputs \(should be JSON serializable\)
163 | 
164 | **Returns:** Results after invocation via the DLHub service
165 | 
166 | TODO: pass `**kwargs` through to the DLHub client and document kwargs.
167 | 
168 | ## foundry.models module
169 | 
170 | ### _class_ `foundry.models.FoundryConfig(*, dataframe_file: str = 'foundry_dataframe.json', data_file: str = 'foundry.hdf5', metadata_file: str = 'foundry_metadata.json', destination_endpoint: str = None, local: bool = False, metadata_key: str = 'foundry', organization: str = 'foundry', local_cache_dir: str = './data')`
171 | 
172 | Bases: `pydantic.main.BaseModel`
173 | 
174 | Foundry Configuration. Configuration information for a Foundry Dataset.
175 | 
176 | * **dataframe\_file** \(str\) – Filename to read dataframe contents from
177 | * **metadata\_file** \(str\) – Filename to read metadata contents from; defaults to reading from MDF Discover
178 | * **destination\_endpoint** \(str\) – Globus endpoint ID to transfer data to \(defaults to local GCP installation\)
179 | * **local\_cache\_dir** \(str\) – Path to the local Foundry package cache
180 | 
181 | Fields: `data_file: Optional[str]`, `dataframe_file: Optional[str]`, `destination_endpoint: Optional[str]`, `local: Optional[bool]`, `metadata_file: Optional[str]`, `metadata_key: Optional[str]`, `organization: Optional[str]`
182 | 
183 | ### _class_ `foundry.models.FoundryDataset(*, keys: List[FoundryKey] = None, splits: List[FoundrySplit] = None, type: FoundryDatasetType = None, short_name: str = '', dataframe: Any = None)`
184 | 
185 | Bases: `pydantic.main.BaseModel`
186 | 
187 | Foundry Dataset. Schema for Foundry Datasets; includes specifications of inputs, outputs, type, version, and more.
188 | 
189 | Fields: `keys: List[FoundryKey]`, `splits: Optional[List[FoundrySplit]]`, `type: FoundryDatasetType`, `short_name: Optional[str]`, `dataframe: Optional[Any]`. The inner `Config` class sets `arbitrary_types_allowed = True`.
190 | 
191 | ### _class_ `foundry.models.FoundryDatasetType(value)`
192 | 
193 | Bases: `enum.Enum`
194 | 
195 | Foundry Dataset Types. Enumeration of the possible Foundry dataset types: `files = 'files'`, `hdf5 = 'hdf5'`, `other = 'other'`, `tabular = 'tabular'`.
196 | 
197 | ### _class_ `foundry.models.FoundryKey(*, key: List[str] = [], type: str = '', filter: str = '', units: str = '', description: str = '', classes: List[FoundryKeyClass] = None)`
198 | 
199 | Bases: `pydantic.main.BaseModel`
200 | 
201 | Fields: `key: List[str]`, `type: str`, `filter: Optional[str]`, `units: Optional[str]`, `description: Optional[str]`, `classes: Optional[List[FoundryKeyClass]]`
202 | 
203 | ### _class_ `foundry.models.FoundryKeyClass(*, label: str = '', name: str = '')`
204 | 
205 | Bases: `pydantic.main.BaseModel`
206 | 
207 | Fields: `label: str`, `name: str`
208 | 
209 | ### _class_ `foundry.models.FoundryMetadata(*, dc: Dict = {}, mdf: Dict = {}, dataset: FoundryDataset = {}, config: FoundryConfig = FoundryConfig(dataframe_file='foundry_dataframe.json', data_file='foundry.hdf5', metadata_file='foundry_metadata.json', destination_endpoint=None, local=False, metadata_key='foundry', organization='foundry', local_cache_dir='./data'))`
210 | 
211 | Bases: `pydantic.main.BaseModel`
212 | 
213 | Fields: `dc: Optional[Dict]`, `mdf: Optional[Dict]`, `dataset: FoundryDataset`, `config: FoundryConfig`. The inner `Config` class sets `arbitrary_types_allowed = True`.
214 | 
215 | ### _class_ `foundry.models.FoundrySpecification(*, name: str = '', version: str = '', description: str = '', private: bool = False, dependencies: Any = None)`
216 | 
217 | Bases: `pydantic.main.BaseModel`
218 | 
219 | Pydantic base class for interacting with the Foundry data package specification. The specification provides a way to group datasets and manage versions.
220 | 
221 | Methods: `add_dependency(name, version)`, `clear_dependencies()`, `remove_duplicate_dependencies()`. Fields: `name: str`, `version: str`, `description: str`, `private: bool`, `dependencies: Any`
222 | 
223 | ### _class_ `foundry.models.FoundrySpecificationDataset(*, name: str = None, provider: str = 'MDF', version: str = None)`
224 | 
225 | Bases: `pydantic.main.BaseModel`
226 | 
227 | Pydantic base class for datasets within the Foundry data package specification.
228 | 
229 | Fields: `name: Optional[str]`, `provider: Optional[str]`, `version: Optional[str]`
230 | 
231 | ### _class_ `foundry.models.FoundrySplit(*, type: str = '', path: str = '', label: str = '')`
232 | 
233 | Bases: `pydantic.main.BaseModel`
234 | 
235 | Fields: `type: str`, `path: Optional[str]`, `label: Optional[str]`
236 | 
237 | ## foundry.xtract\_method module
238 | 
--------------------------------------------------------------------------------
/docs/foundry.auth.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.auth`
6 | Utilities related to storing authentication credentials
7 |
8 |
9 |
10 | ---
11 |
12 |
13 |
14 | ## class `PubAuths`
15 | Collection of the authorizers needed for publication
16 |
17 |
18 |
19 | **Attributes:**
20 |
21 | - `transfer_client`: Client with credentials to perform transfers
22 | - `auth_client_openid`: Client with permissions to get user IDs
23 | - `endpoint_auth_clients`: Mapping between endpoint ID and client that can authorize access to it
24 |
25 |
26 |
27 | ### method `__init__`
28 |
29 | ```python
30 | __init__(
31 | transfer_client: TransferClient,
32 | auth_client_openid: AuthClient,
33 | endpoint_auth_clients: Dict[str, AuthClient]
34 | ) → None
35 | ```
36 |
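As a rough sketch of how these pieces fit together (the authorizers and endpoint ID below are placeholders you would obtain from a Globus login flow, not real values):

```python
from globus_sdk import AuthClient, TransferClient

from foundry.auth import PubAuths

# `transfer_authorizer`, `openid_authorizer`, and `gcs_authorizer` are
# placeholder globus_sdk authorizers obtained from a prior login flow.
auths = PubAuths(
    transfer_client=TransferClient(authorizer=transfer_authorizer),
    auth_client_openid=AuthClient(authorizer=openid_authorizer),
    endpoint_auth_clients={"<endpoint-id>": AuthClient(authorizer=gcs_authorizer)},
)
```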
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 | ---
48 |
49 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
50 |
--------------------------------------------------------------------------------
/docs/foundry.foundry_cache.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.foundry_cache`
6 |
7 |
8 |
9 |
10 |
11 |
12 | ---
13 |
14 |
15 |
16 | ## class `FoundryCache`
17 | The FoundryCache manages the local storage of FoundryDataset objects
18 |
19 |
20 |
21 | ### method `__init__`
22 |
23 | ```python
24 | __init__(
25 | forge_client: Forge,
26 | transfer_client: Any,
27 | use_globus,
28 | interval,
29 | parallel_https,
30 | verbose,
31 | local_cache_dir: str = None
32 | )
33 | ```
34 |
35 | Initializes a FoundryCache object.
36 |
37 |
38 |
39 | **Args:**
40 |
41 | - `forge_client` (Forge): The Forge client object.
42 | - `transfer_client` (Any): The transfer client object.
43 | - `use_globus` (bool): Flag indicating whether to use Globus for downloading.
44 | - `interval` (int): How often to wait before checking Globus transfer status.
45 | - `parallel_https` (int): Number of threads to use for downloading via HTTP.
46 | - `verbose` (bool): Flag indicating whether to produce more debug messages.
47 | - `local_cache_dir` (str, optional): The local cache directory. Defaults to None. If not specified, defaults to either the environmental variable 'FOUNDRY_LOCAL_CACHE_DIR' or './data/'.
48 |
49 |
50 |
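A minimal construction sketch, assuming an already-authenticated MDF `Forge` client (most users get a cache indirectly through `Foundry` rather than building one by hand):

```python
from mdf_forge import Forge

from foundry.foundry_cache import FoundryCache

cache = FoundryCache(
    forge_client=Forge("mdf"),  # assumes a prior Globus login
    transfer_client=None,       # only needed when use_globus=True
    use_globus=False,           # download over HTTPS instead of Globus
    interval=10,                # seconds between transfer status checks
    parallel_https=4,           # worker threads for HTTPS downloads
    verbose=False,
    local_cache_dir="./data",
)
```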
51 |
52 | ---
53 |
54 |
55 |
56 | ### method `clear_cache`
57 |
58 | ```python
59 | clear_cache(dataset_name: str = None)
60 | ```
61 |
62 | Deletes locally stored datasets
63 |
64 |
65 |
66 | **Arguments:**
67 |
68 | - `dataset_name` (str): Optional name of a specific dataset to delete. If omitted, all datasets will be erased
69 |
70 | ---
71 |
72 |
73 |
74 | ### method `download_to_cache`
75 |
76 | ```python
77 | download_to_cache(dataset_name: str, splits: List[Split] = None)
78 | ```
79 |
80 | Checks if the data is downloaded, and if not, downloads the data from source to local storage.
81 |
82 |
83 |
84 | **Args:**
85 |
86 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF).
87 | - `splits` (List[FoundrySplit], optional): List of splits in the dataset. Defaults to None.
88 |
89 |
90 |
91 | **Returns:**
92 |
93 | - `FoundryCache`: The FoundryCache object.
94 |
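For example, continuing the construction sketch above (the dataset name is illustrative):

```python
# No-op if the files are already present and valid locally
cache.download_to_cache("_test_example_iris_v1.1")
```
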
95 | ---
96 |
97 |
98 |
99 | ### method `download_via_globus`
100 |
101 | ```python
102 | download_via_globus(dataset_name: str)
103 | ```
104 |
105 | Downloads selected dataset over Globus.
106 |
107 |
108 |
109 | **Args:**
110 |
111 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF).
112 |
113 | ---
114 |
115 |
116 |
117 | ### method `download_via_http`
118 |
119 | ```python
120 | download_via_http(dataset_name: str)
121 | ```
122 |
123 | Downloads selected dataset from MDF over HTTP.
124 |
125 | **Args:**
126 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF).
127 |
128 | ---
129 |
130 |
131 |
132 | ### method `get_keys`
133 |
134 | ```python
135 | get_keys(
136 | foundry_schema: FoundrySchema,
137 | type: str = None,
138 | as_object: bool = False
139 | )
140 | ```
141 |
142 | Get keys for a Foundry dataset
143 |
144 |
145 |
146 | **Arguments:**
147 |
148 | - `foundry_schema` (FoundrySchema): The schema from MDF that contains the keys
149 | - `type` (str): The type of key to be returned e.g., "input", "target"
150 | - `as_object` (bool): When ``False``, returns a list of keys as strings; when ``True``, returns the full key objects. **Default:** ``False``
151 | Returns: (list) String representations of the keys if ``as_object`` is ``False``; otherwise the full key objects.
152 |
153 | ---
154 |
155 |
156 |
157 | ### method `load_as_dict`
158 |
159 | ```python
160 | load_as_dict(
161 | split: str,
162 | dataset_name: str,
163 | foundry_schema: FoundrySchema,
164 | as_hdf5: bool
165 | )
166 | ```
167 |
168 | Load the data associated with the specified dataset and return it as a labeled dictionary of tuples.
169 |
170 |
171 |
172 | **Args:**
173 |
174 | - `split` (str): Split to load the data from.
175 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF).
176 | - `foundry_schema` (FoundrySchema, optional): FoundrySchema object. Defaults to None.
177 | - `as_hdf5` (bool, optional): If True and dataset is in HDF5 format, keep data in HDF5 format. Defaults to False.
178 |
179 |
180 |
181 | **Returns:**
182 |
183 | - `dict`: A labeled dictionary of tuples containing the loaded data.
184 |
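A usage sketch, assuming `cache` is a `FoundryCache` and `schema` is the dataset's `FoundrySchema` (names are illustrative):

```python
data = cache.load_as_dict(
    split="train",
    dataset_name="_test_example_iris_v1.1",
    foundry_schema=schema,
    as_hdf5=False,
)
inputs, targets = data["train"]  # labeled dictionary of tuples, per above
```
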
185 | ---
186 |
187 |
188 |
189 | ### method `load_as_tensorflow`
190 |
191 | ```python
192 | load_as_tensorflow(
193 | split: str,
194 | dataset_name: str,
195 | foundry_schema: FoundrySchema,
196 | as_hdf5: bool
197 | )
198 | ```
199 |
200 | Convert Foundry Dataset to a Tensorflow Sequence
201 |
202 |
203 |
204 | **Arguments:**
205 |
206 | - `split` (string): Split to create the Tensorflow Sequence from. **Default:** ``None``
207 | 
208 |
209 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
210 |
211 | ---
212 |
213 |
214 |
215 | ### method `load_as_torch`
216 |
217 | ```python
218 | load_as_torch(
219 | split: str,
220 | dataset_name: str,
221 | foundry_schema: FoundrySchema,
222 | as_hdf5: bool
223 | )
224 | ```
225 |
226 | Convert Foundry Dataset to a PyTorch Dataset
227 |
228 |
229 |
230 | **Arguments:**
231 |
232 | - `split` (string): Split to create the PyTorch Dataset from. **Default:** ``None``
233 | 
234 |
235 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
236 |
237 | ---
238 |
239 |
240 |
241 | ### method `validate_local_dataset_storage`
242 |
243 | ```python
244 | validate_local_dataset_storage(dataset_name: str, splits: List[Split] = None)
245 | ```
246 |
247 | Verifies that the local storage location exists and all expected files are present.
248 |
249 |
250 |
251 | **Args:**
252 |
253 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF).
254 | - `splits` (List[FoundrySplit], optional): Labels of splits to be loaded. Defaults to None.
255 |
256 |
257 |
258 | **Returns:**
259 |
260 | - `bool`: True if the dataset exists and contains all the desired files; False otherwise.
261 |
262 |
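For example, to avoid re-downloading (the dataset name is illustrative):

```python
if not cache.validate_local_dataset_storage("_test_example_iris_v1.1"):
    cache.download_to_cache("_test_example_iris_v1.1")
```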
263 |
264 |
265 | ---
266 |
267 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
268 |
--------------------------------------------------------------------------------
/docs/foundry.foundry_dataset.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.foundry_dataset`
6 |
7 |
8 |
9 |
10 |
11 |
12 | ---
13 |
14 |
15 |
16 | ## class `FoundryDataset`
17 | Representation of an individual dataset. Provides access to metadata as well as functions to instantiate data into memory in different formats.
18 |
19 |
20 |
21 | **Args:**
22 |
23 | - `dataset_name` (str): Name of the dataset (equivalent to source_id in MDF)
24 | - `datacite_entry` (FoundryDatacite): Datacite entry for the dataset
25 | - `foundry_schema` (FoundrySchema): Schema for the dataset
26 | - `foundry_cache` (FoundryCache): Cache for the dataset
27 |
28 | Desired functions:
29 | - Get as pandas
30 | - Get as tensorflow dataset
31 | - Get as pytorch dataset
32 | - Get file list
33 | - Set metadata
34 | - Attach datafiles
35 | - Validate against schema
36 | - Get citation
37 |
38 |
39 |
40 | ### method `__init__`
41 |
42 | ```python
43 | __init__(
44 | dataset_name: str,
45 | datacite_entry: FoundryDatacite,
46 | foundry_schema: FoundrySchema,
47 | foundry_cache: FoundryCache = None
48 | )
49 | ```
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | ---
59 |
60 |
61 |
62 | ### method `add_data`
63 |
64 | ```python
65 | add_data(local_data_path: str = None, globus_data_source: str = None)
66 | ```
67 |
68 | Add data to the dataset. User must provide the location of the data as either a `globus_data_source` or `local_data_path`.
69 |
70 |
71 |
72 | **Arguments:**
73 |
74 | - `local_data_path` (str): Local path to the dataset used to publish to Foundry via HTTPS. Creates an HTTPS PUT request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is transferred to MDF. If None, the user must specify a 'globus_data_source' URL to the location of the data on their own Globus endpoint. User must choose either `globus_data_source` or `local_data_path` to publish their data.
75 | - `globus_data_source` (str): URL path for a data folder on a Globus endpoint; the URL can be obtained through the Globus Web UI or SDK. If None, the user must specify a `local_data_path` pointing to the location of the data on their local machine. User must choose either `globus_data_source` or `local_data_path` to publish their data.
76 |
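For example, assuming `dataset` is a `FoundryDataset` being prepared for publication (paths and URLs are placeholders):

```python
# Option 1: upload a local folder via HTTPS
dataset.add_data(local_data_path="./data/my_dataset/")

# Option 2: point at data already on a Globus endpoint
dataset.add_data(globus_data_source="<globus-folder-url>")
```
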
77 | ---
78 |
79 |
80 |
81 | ### method `clean_dc_dict`
82 |
83 | ```python
84 | clean_dc_dict()
85 | ```
86 |
87 | Clean the Datacite dictionary of None values
88 |
89 | ---
90 |
91 |
92 |
93 | ### method `clear_dataset_cache`
94 |
95 | ```python
96 | clear_dataset_cache()
97 | ```
98 |
99 | Deletes the cached data for this specific dataset
100 |
101 | ---
102 |
103 |
104 |
105 | ### method `delete_none`
106 |
107 | ```python
108 | delete_none(_dict)
109 | ```
110 |
111 | Delete None values recursively from all of the dictionaries
112 |
113 | ---
114 |
115 |
116 |
117 | ### method `get_as_dict`
118 |
119 | ```python
120 | get_as_dict(split: str = None, as_hdf5: bool = False)
121 | ```
122 |
123 | Returns the data from the dataset as a dictionary
124 |
125 |
126 |
127 | **Arguments:**
128 |
129 | - `split` (string): Split to create the dataset from. **Default:** ``None``
130 | - `as_hdf5` (bool): If ``True`` and the dataset is in HDF5 format, keep the data in HDF5 format. **Default:** ``False``
131 |
132 | Returns: (dict) Dictionary of all the data from the specified split
133 |
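A usage sketch, assuming `dataset` is a `FoundryDataset` obtained from a Foundry search:

```python
data = dataset.get_as_dict(split="train")
inputs, targets = data["train"]  # labeled dictionary of tuples, per `load_as_dict`
```
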
134 | ---
135 |
136 |
137 |
138 | ### method `get_as_tensorflow`
139 |
140 | ```python
141 | get_as_tensorflow(split: str = None)
142 | ```
143 |
144 | Convert Foundry Dataset to a Tensorflow Sequence
145 |
146 |
147 |
148 | **Arguments:**
149 |
150 | - `split` (string): Split to create the Tensorflow Sequence from. **Default:** ``None``
151 | 
152 |
153 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
154 |
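For example, handing the sequence straight to Keras (a sketch; `model` is assumed to be a compiled `tf.keras` model):

```python
seq = dataset.get_as_tensorflow(split="train")
model.fit(seq, epochs=10)
```
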
155 | ---
156 |
157 |
158 |
159 | ### method `get_as_torch`
160 |
161 | ```python
162 | get_as_torch(split: str = None)
163 | ```
164 |
165 | Returns the data from the dataset as a TorchDataset
166 |
167 |
168 |
169 | **Arguments:**
170 |
171 | - `split` (string): Split to create the PyTorch Dataset from. **Default:** ``None``
172 | 
173 |
174 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
175 |
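For example, wrapping the result in a PyTorch `DataLoader` (a sketch):

```python
from torch.utils.data import DataLoader

ds = dataset.get_as_torch(split="train")
loader = DataLoader(ds, batch_size=32, shuffle=True)
```
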
176 | ---
177 |
178 |
179 |
180 | ### method `get_citation`
181 |
182 | ```python
183 | get_citation() → str
184 | ```
185 |
186 |
187 |
188 |
189 |
190 | ---
212 |
213 |
214 |
215 | ### method `validate_metadata`
216 |
217 | ```python
218 | validate_metadata(metadata)
219 | ```
220 |
221 | Validate the JSON message against the FoundryDataset model
222 |
223 |
224 |
225 | **Arguments:**
226 |
227 | - `metadata` (dict): Metadata information provided by the user.
228 |
229 |
230 |
231 | **Raises:**
232 |
233 | - `ValidationError`: if metadata supplied by user does not meet the specification of a FoundryDataset object.
234 |
235 |
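A sketch of guarding a publication step (the `metadata` dict is illustrative):

```python
from pydantic import ValidationError

try:
    dataset.validate_metadata(metadata)
except ValidationError as err:
    print(err)  # reports which fields failed validation
```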
236 |
237 |
238 | ---
239 |
240 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
241 |
--------------------------------------------------------------------------------
/docs/foundry.https_download.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.https_download`
6 | Methods to download files from a Globus endpoint
7 |
8 |
9 | ---
10 |
11 |
12 |
13 | ## function `recursive_ls`
14 |
15 | ```python
16 | recursive_ls(tc: TransferClient, ep: str, path: str, max_depth: int = 3)
17 | ```
18 |
19 | Find all files in a Globus directory recursively
20 |
21 |
22 |
23 | **Args:**
24 |
25 | - `tc`: TransferClient authorized to access the directory
26 | - `ep`: Endpoint on which the files reside
27 | - `path`: Path to the files being downloaded
28 | - `max_depth`: Maximum recurse depth
29 |
30 |
31 |
32 | **Yields:**
33 | Dictionaries describing the location of the files. Each includes at least
34 | - `"name"`: Name of the file
35 | - `"path"`: Absolute path to the file's location
36 |
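A usage sketch (the TransferClient `tc`, endpoint ID, and path are placeholders):

```python
for item in recursive_ls(tc, "<endpoint-id>", "/foundry/my_dataset"):
    print(item["name"], item["path"])
```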
37 |
38 | ---
39 |
40 |
41 |
42 | ## function `download_file`
43 |
44 | ```python
45 | download_file(item, base_directory, https_config, timeout=1800)
46 | ```
47 |
48 | Download a file to disk
49 |
50 |
51 |
52 | **Args:**
53 |
54 | - `item`: Dictionary defining the path to the file
55 | - `base_directory`: Base directory for storing downloaded files
56 | - `https_config`: Configuration defining the URL of the server and the name of the dataset
57 | - `timeout`: Timeout for the download request in seconds (default: 1800)
58 |
59 |
60 |
61 |
62 | ---
63 |
64 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
65 |
--------------------------------------------------------------------------------
/docs/foundry.https_upload.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.https_upload`
6 | Private utility methods to upload files and/or folders to Globus using HTTPS instead of Globus Transfer.
7 |
8 |
9 | ---
10 |
11 |
12 |
13 | ## function `upload_to_endpoint`
14 |
15 | ```python
16 | upload_to_endpoint(
17 | auths: PubAuths,
18 | local_data_path: str,
19 | endpoint_id: str = '82f1b5c6-6e9b-11e5-ba47-22000b92c6ec',
20 | dest_parent: str = None,
21 | dest_child: str = None
22 | ) → Tuple[str, str]
23 | ```
24 |
25 | Upload local data to a Globus endpoint using HTTPS PUT requests. Data can be a folder or an individual file.
26 |
27 | **Args:**
28 |
29 | - `auths` (PubAuths): Dataclass of authorizers needed for upload. Includes `transfer_client`, `auth_client_openid`,
30 |   and `endpoint_auth_clients`, which is a Dict of `endpoint_id`: AuthClient mappings.
31 | - `local_data_path` (str): Path to the local dataset to publish to Foundry via HTTPS. Creates an HTTPS PUT request to upload the data specified to a Globus endpoint (default is NCSA endpoint) before it is transferred to MDF.
32 | - `endpoint_id` (str): Globus endpoint ID to upload the data to. Default is NCSA endpoint. Must match the `endpoint_id` auth'd in `auths.auth_client_gcs`.
33 |
34 | **Returns:**
35 | - `(str)`: Globus data source URL pointing to the data on the Globus endpoint
36 |
37 |
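A sketch of a call, assuming `auths` was built as described in `foundry.auth` (the path is illustrative):

```python
result = upload_to_endpoint(auths, "./data/my_dataset/")
# Per the signature this is a (str, str) tuple; per the docstring it carries
# the Globus data source URL pointing at the uploaded data.
```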
38 |
39 |
40 | ---
41 |
42 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
43 |
--------------------------------------------------------------------------------
/docs/foundry.loaders.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.loaders`
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
17 |
--------------------------------------------------------------------------------
/docs/foundry.loaders.tf_wrapper.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.loaders.tf_wrapper`
6 |
7 |
8 |
9 |
10 |
11 |
12 | ---
13 |
14 |
15 |
16 | ## class `TensorflowSequence`
17 | Foundry Dataset Converted to Tensorflow Format
18 |
19 |
20 |
21 | ### method `__init__`
22 |
23 | ```python
24 | __init__(inputs, targets)
25 | ```
26 |
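A minimal sketch, assuming `X` and `y` are parallel array-likes (most users obtain a sequence via `FoundryDataset.get_as_tensorflow` rather than constructing one directly):

```python
from foundry.loaders.tf_wrapper import TensorflowSequence

seq = TensorflowSequence(inputs=X, targets=y)
```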
27 |
28 |
29 |
30 |
31 |
32 | ---
33 |
34 | #### property max_queue_size
35 |
36 |
37 |
38 |
39 |
40 | ---
41 |
42 | #### property num_batches
43 |
44 | Number of batches in the PyDataset.
45 |
46 |
47 |
48 | **Returns:**
49 | The number of batches in the PyDataset or `None` to indicate that the dataset is infinite.
50 |
51 | ---
52 |
53 | #### property use_multiprocessing
54 |
55 |
56 |
57 |
58 |
59 | ---
60 |
61 | #### property workers
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 | ---
73 |
74 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
75 |
--------------------------------------------------------------------------------
/docs/foundry.loaders.torch_wrapper.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.loaders.torch_wrapper`
6 |
7 |
8 |
9 |
10 |
11 |
12 | ---
13 |
14 |
15 |
16 | ## class `TorchDataset`
17 | Foundry Dataset Converted to Pytorch Format
18 |
19 |
20 |
21 | ### method `__init__`
22 |
23 | ```python
24 | __init__(inputs, targets)
25 | ```
26 |
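A minimal sketch, assuming `X` and `y` are parallel array-likes (most users obtain a dataset via `FoundryDataset.get_as_torch` rather than constructing one directly):

```python
from torch.utils.data import DataLoader

from foundry.loaders.torch_wrapper import TorchDataset

ds = TorchDataset(inputs=X, targets=y)
loader = DataLoader(ds, batch_size=32)
```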
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | ---
38 |
39 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
40 |
--------------------------------------------------------------------------------
/docs/foundry.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry`
6 |
7 |
8 |
9 |
10 | **Global Variables**
11 | ---------------
12 | - **auth**
13 | - **https_download**
14 | - **jsonschema_models**
15 | - **models**
16 | - **utils**
17 | - **foundry_cache**
18 | - **foundry_dataset**
19 | - **https_upload**
20 | - **foundry**
21 |
22 |
23 |
24 |
25 | ---
26 |
27 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
28 |
--------------------------------------------------------------------------------
/docs/foundry.models.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.models`
6 |
7 |
8 |
9 |
10 |
11 |
12 | ---
13 |
14 |
15 |
16 | ## class `FoundrySpecificationDataset`
17 | Pydantic base class for datasets within the Foundry data package specification
18 |
19 |
20 | ---
21 |
22 | #### property model_extra
23 |
24 | Get extra fields set during validation.
25 |
26 |
27 |
28 | **Returns:**
29 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`.
30 |
31 | ---
32 |
33 | #### property model_fields_set
34 |
35 | Returns the set of fields that have been explicitly set on this model instance.
36 |
37 |
38 |
39 | **Returns:**
40 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults.
41 |
42 |
43 |
44 |
45 | ---
46 |
47 |
48 |
49 | ## class `FoundrySpecification`
50 | Pydantic base class for interacting with the Foundry data package specification The specification provides a way to group datasets and manage versions
51 |
52 |
53 | ---
54 |
55 | #### property model_extra
56 |
57 | Get extra fields set during validation.
58 |
59 |
60 |
61 | **Returns:**
62 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`.
63 |
64 | ---
65 |
66 | #### property model_fields_set
67 |
68 | Returns the set of fields that have been explicitly set on this model instance.
69 |
70 |
71 |
72 | **Returns:**
73 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults.
74 |
75 |
76 |
77 | ---
78 |
79 |
80 |
81 | ### method `add_dependency`
82 |
83 | ```python
84 | add_dependency(name: str, version: str)
85 | ```
86 |
87 |
88 |
89 |
90 |
91 | ---
92 |
93 |
94 |
95 | ### method `clear_dependencies`
96 |
97 | ```python
98 | clear_dependencies()
99 | ```
100 |
101 |
102 |
103 |
104 |
105 | ---
106 |
107 |
108 |
109 | ### method `model_dump`
110 |
111 | ```python
112 | model_dump()
113 | ```
114 |
115 |
116 |
117 |
118 |
119 | ---
120 |
121 |
122 |
123 | ### method `remove_duplicate_dependencies`
124 |
125 | ```python
126 | remove_duplicate_dependencies()
127 | ```
128 |
129 |
130 |
131 |
132 |
133 |
134 | ---
135 |
136 |
137 |
138 | ## class `FoundryDatasetType`
139 | Foundry Dataset Types Enumeration of the possible Foundry dataset types
140 |
141 |
142 |
143 |
144 |
145 | ---
146 |
147 |
148 |
149 | ## class `FoundrySchema`
150 | A model for the Foundry schema based on the FoundryModel (project_model.py) class.
151 |
152 |
153 |
154 | ### method `__init__`
155 |
156 | ```python
157 | __init__(project_dict: Dict[str, Any])
158 | ```
159 |
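A construction sketch (the dictionary contents are illustrative, not a complete schema):

```python
from foundry.models import FoundrySchema

project_dict = {
    "short_name": "example_iris",
    "data_type": "tabular",
    "keys": [{"key": ["sepal_length"], "type": "input"}],
}
schema = FoundrySchema(project_dict)
```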
160 |
161 |
162 |
163 |
164 |
165 | ---
166 |
167 | #### property model_extra
168 |
169 | Get extra fields set during validation.
170 |
171 |
172 |
173 | **Returns:**
174 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`.
175 |
176 | ---
177 |
178 | #### property model_fields_set
179 |
180 | Returns the set of fields that have been explicitly set on this model instance.
181 |
182 |
183 |
184 | **Returns:**
185 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults.
186 |
187 |
188 |
189 |
190 | ---
191 |
192 |
193 |
194 | ## class `FoundryDatacite`
195 | A model for the Datacite schema based on the Datacite (dc_model.py) class.
196 |
197 |
198 |
199 | ### method `__init__`
200 |
201 | ```python
202 | __init__(datacite_dict: Dict[str, Any], **kwargs)
203 | ```
204 |
205 |
206 |
207 |
208 |
209 |
210 | ---
211 |
212 | #### property model_extra
213 |
214 | Get extra fields set during validation.
215 |
216 |
217 |
218 | **Returns:**
219 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`.
220 |
221 | ---
222 |
223 | #### property model_fields_set
224 |
225 | Returns the set of fields that have been explicitly set on this model instance.
226 |
227 |
228 |
229 | **Returns:**
230 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults.
231 |
232 |
233 |
234 |
235 | ---
236 |
237 |
238 |
239 | ## class `FoundryBase`
240 | Configuration information for Foundry instance
241 |
242 |
243 | ---
244 |
245 | #### property model_extra
246 |
247 | Get extra fields set during validation.
248 |
249 |
250 |
251 | **Returns:**
252 | A dictionary of extra fields, or `None` if `config.extra` is not set to `"allow"`.
253 |
254 | ---
255 |
256 | #### property model_fields_set
257 |
258 | Returns the set of fields that have been explicitly set on this model instance.
259 |
260 |
261 |
262 | **Returns:**
263 | A set of strings representing the fields that have been set, i.e. that were not filled from defaults.
264 |
265 |
266 |
267 | ---
268 |
269 |
270 |
271 | ### method `model_dump`
272 |
273 | ```python
274 | model_dump()
275 | ```
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 | ---
285 |
286 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
287 |
--------------------------------------------------------------------------------
/docs/foundry.utils.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # module `foundry.utils`
6 |
7 |
8 |
9 |
10 |
11 | ---
12 |
13 |
14 |
15 | ## function `is_pandas_pytable`
16 |
17 | ```python
18 | is_pandas_pytable(group)
19 | ```
20 |
21 |
22 |
23 |
24 |
25 |
26 | ---
27 |
28 |
29 |
30 | ## function `is_doi`
31 |
32 | ```python
33 | is_doi(string: str)
34 | ```
35 |
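A usage sketch (assuming a boolean-style result; the DOI string is illustrative):

```python
from foundry.utils import is_doi

print(is_doi("10.18126/e73h-3w6n"))  # DOI-like string
print(is_doi("not-a-doi"))
```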
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | ---
44 |
45 | _This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
46 |
--------------------------------------------------------------------------------
/docs/how-to-contribute/code_of_conduct.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Read our pledge and Code of Conduct for contributing
3 | ---
4 |
5 | # Contributor Covenant
6 |
7 | ## Our Pledge
8 |
9 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
10 |
11 | ## Our Standards
12 |
13 | Examples of behavior that contributes to creating a positive environment include:
14 |
15 | * Using welcoming and inclusive language
16 | * Being respectful of differing viewpoints and experiences
17 | * Gracefully accepting constructive criticism
18 | * Focusing on what is best for the community
19 | * Showing empathy towards other community members
20 |
21 | Examples of unacceptable behavior by participants include:
22 |
23 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
24 | * Trolling, insulting/derogatory comments, and personal or political attacks
25 | * Public or private harassment
26 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
27 | * Other conduct which could reasonably be considered inappropriate in a professional setting
36 |
37 | ## Our Responsibilities
38 |
39 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
40 |
41 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
42 |
43 | ## Scope
44 |
45 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
46 |
47 | ## Enforcement
48 |
49 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at foundry@uchicago.edu. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
50 |
51 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
52 |
53 | ## Attribution
54 |
55 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)
56 |
57 | For answers to common questions about this code of conduct, see [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq)
58 |
59 |
--------------------------------------------------------------------------------
/docs/how-to-contribute/contributing.md:
--------------------------------------------------------------------------------
1 | # Contribution Process
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via issue, email, or any other method with the owners of this repository before making a change.
4 |
5 | Please note we have a code of conduct, please follow it in all your interactions with the project.
6 |
7 | ## Contributing code
8 |
9 | If you have improvements to Foundry, send us your pull requests! For those just getting started, Github has a [how to](https://help.github.com/articles/using-pull-requests/).
10 |
11 | If you want to contribute, start working through the Foundry codebase, navigate to the [Github "issues" tab](https://github.com/MLMI2-CSSI/foundry/issues) and start looking through interesting issues. If you are not sure of where to start, then start by trying one of the smaller/easier issues here i.e. [issues with the "good first issue" label](https://github.com/MLMI2-CSSI/foundry/labels/good%20first%20issue). These are issues that we believe are particularly well suited for outside contributions. If you want to help out, but not alone, use the issue comment thread to coordinate.
12 |
13 | ### General guidelines and philosophy for contribution
14 |
15 | * Include unit tests when you contribute new features, as they help to a\) prove that your code works correctly, and b\) guard against future breaking changes to lower the maintenance cost.
16 | * Bug fixes also generally require unit tests, because the presence of bugs usually indicates insufficient test coverage.
17 | * Keep API compatibility in mind when you change code in Foundry.
18 | * When you contribute a new feature to Foundry, the maintenance burden is \(by default\) transferred to the Foundry team. This means that the benefit of the contribution must be compared against the cost of maintaining the feature.
19 | * Tests should follow the [testing best practices](https://www..org/community/contribute/tests) guide.
37 |
38 | ## Pull Request Process
39 |
40 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a build.
41 | 2. Update the README.md with details of changes to the interface; this includes new environment variables, exposed ports, useful file locations, and container parameters.
42 | 3. Increase the version numbers in any example files and the README.md to the new version that this Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/).
43 | 4. You may merge the Pull Request once you have the sign-off of two other developers, or, if you do not have permission to do that, you may request the second reviewer to merge it for you.
55 |
56 |
--------------------------------------------------------------------------------
/docs/publishing-datasets.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Information on how to publish datasets
3 | ---
4 |
5 | # Publishing Datasets
6 |
7 | In order to publish datasets, the datasets must 1\) adhere to specified Foundry dataset shapes \([see here](publishing-datasets.md#shaping-datasets)\), and 2\) be described with required information \([see here](publishing-datasets.md#describing-datasets)\). Together, the dataset shape and description enable researchers to reuse the datasets more easily.
8 |
9 | ## Examples
10 |
11 | [Skip to the publication example notebook.](https://github.com/MLMI2-CSSI/foundry/blob/master/examples/foundry_publication_example.ipynb)
12 |
13 | ## Shaping Datasets
14 |
15 | For a general dataset to be translated into a usable Foundry dataset, it should follow one of the prescribed shapes. It should also be described by a Key object, which provides a mapping that allows Foundry to read data from the underlying data structure into usable Python objects \([see Describing Datasets](publishing-datasets.md#describing-datasets) for more info\).
16 |
17 | ### **Tabular Data**
18 |
19 | Tabular data should be arranged in a form where columns represent the different keys of the data and rows represent individual entries.
20 |
21 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap |
22 | | :--- | :--- | :--- | :--- |
23 | | 0.10 | 0.52 | 1 | 1.40 |
24 | | 0.34 | 0.910 | 0 | 0.73 |
25 | | ... | ... | ... | |
26 |
27 | For this example dataset the `Key` object could be:
28 |
29 | ```text
30 | "keys":[{
31 | "key": "feature_1",
32 | "type": "input",
33 | "units": None,
34 | "description": "This is feature 1"
35 | },{
36 | "key": "feature_2",
37 | "type": "input",
38 | "units": None,
39 | "description": "This is feature 2"
40 | },{
41 | "key": "material_type",
42 | "type": "input",
43 | "units": None,
44 | "description": "This is the material type",
45 | "labels":["perovskite","not perovskite"]
46 | },{
47 | "key": "band_gap",
48 | "type": "target",
49 | "units": "eV",
50 | "description": "This is the simulated band gap in eV"
51 | }
52 | ]
53 | ```
54 |
55 | {% hint style="info" %}
56 | This tabular data file should be saved in the base directory as **`foundry_dataframe.json`**
57 | {% endhint %}
58 |
59 | ### Hierarchical Data
60 |
61 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be:
62 |
63 | ```text
64 | "keys":[{
65 | "key": "/data/arr1",
66 | "type": "input",
67 | "units": None,
68 | "description": "This is an array containing input data"
69 | },{
70 | "key": "/other_data/arr2",
71 | "type": "input",
72 | "units": None,
73 | "description": "This is an another array containing input data"
74 | },{
75 | "key": "/data/band_gaps",
76 | "type": "target",
77 | "units": "eV",
78 | "description": "This is the simulated band gap in eV"
79 | }
80 | ]
81 | ```
82 |
83 | ## Describing Datasets
84 |
85 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). This metadata captures standard bibliographic information such as titles, authors, and identifiers. Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema.
86 |
87 | **Keys \(object\):** Key objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Key objects have the following properties
88 |
89 | * **`key (str)`**: A name mapping to a column name \(e.g., for csv files\) or a key within a data structure \(e.g., for HDF5 files\)
90 | * **`type (str)`**: The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_
91 | * **`units (str) [optional]`**: The scientific units associated with a key. _Default: None_
92 | * **`description (str) [optional]`**: A free text description of the key. _Default: None_
93 | * **`labels (list[str]) [optional]`**: A list of strings mapped to integers in a key column
94 |
95 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading and referencing it easier.
96 |
97 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. _Options \["tabular","hdf5"\]_
98 |
99 | ```text
100 | "foundry": {
101 | "dc": {},
102 | "keys": [{
103 | "type": "input",
104 | "name": "feature_1",
105 | "units": "",
106 | "description": "This is an input"
107 | },
108 | {
109 | "type": "target",
110 | "name": "band_gap",
111 | "units": "eV",
112 | "description": "blah blah",
113 | "labels": []
114 | }
115 | ],
116 | "short_name": "my_short_name",
117 | "type": "tabular"
118 | }
119 | ```
120 |
121 | ## Publishing
122 |
123 | {% hint style="info" %}
124 | Before continuing, be sure that you have 1\) signed up for a [free Globus account](https://app.globus.org) and 2\) [joined this Globus group](https://app.globus.org/groups/cc192dca-3751-11e8-90c1-0a7c735d220a/about).
125 | {% endhint %}
126 |
127 | Once your dataset is in the proper shape, and you have created the associated metadata structure, you can publish to Foundry!
128 |
129 | Currently, you can publish any dataset you have stored on a Globus endpoint or Google Drive. In the following, assume your [previously defined metadata](publishing-datasets.md#describing-datasets) are stored in `metadata` :
130 |
131 | ```python
132 | from foundry import Foundry
133 |
134 | # Globus endpoint URL where your dataset is located
135 | data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry%2F_test_blaiszik_foundry_iris_v1.2%2F"
136 |
137 | # full title of dataset
138 | title = "Scourtas example iris dataset"
139 |
140 | # authors to list
141 | authors = ["A Scourtas", "B Blaiszik"]
142 |
143 | # shorthand title (optional)
144 | short_name = "example_AS_iris"
145 |
146 | # affiliations of authors (optional)
147 | affiliations = ["Globus Labs, UChicago"]
148 |
149 | # publisher of the data (optional)
150 | publisher = "Materials Data Facility"
151 |
152 | # publication year (optional)
153 | publication_year = 2021
154 |
155 |
156 | f = Foundry()
157 | res = f.publish(metadata, data_source, title, authors, short_name=short_name)
158 | ```
159 |
160 | The `publish()` method returns a result object that you can inspect for information about the state of the publication. For the above publication, `res` would have the format:
161 |
162 | ```python
163 | {'error': None,
164 | 'source_id': '_test_example_iris_v1.1',
165 | 'status_code': 202,
166 | 'success': True}
167 | ```
168 |
169 |
170 |
171 | ## Future Work
172 |
173 | * Add support for wildcard key type specifications
174 | * Add link to example publication
175 |
176 |
--------------------------------------------------------------------------------
/docs/publishing-models.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Information on how to publish models
3 | ---
4 |
5 | # Publishing Models
6 |
7 | In addition to datasets, you can publish models \(or even individual Python methods\) to Foundry and run them in the cloud!
8 |
9 | ## Examples
10 |
11 | Model publication example notebook coming soon
12 |
13 | ## Model Types
14 |
15 | You can publish any of the following types of models or functions to Foundry:
16 |
17 | * [Scikit-Learn models](publishing-models.md#scikit-learn-models)
18 | * [Tensorflow 1 & 2 models](publishing-models.md#tensorflow-1-and-2-models)
19 | * [Keras models](publishing-models.md#keras-models)
20 | * [PyTorch models](publishing-models.md#pytorch-models)
21 | * [Class methods \(advanced use\)](publishing-models.md#class-methods)
22 | * [Static methods \(advanced use\)](publishing-models.md#static-methods)
23 |
24 | ### Scikit-Learn models
25 |
26 | ### Tensorflow 1 & 2 models
27 |
28 | ### Keras models
29 |
30 | ### PyTorch models
31 |
32 | ### Class methods
33 |
34 | ### Static methods
35 |
36 | ## Data Types for Inputs and Targets
37 |
38 | ## Describing Models
39 |
40 | Before you can publish a model, you need to describe it using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org).
41 |
42 | ## Publishing
43 |
44 | ## Future Work
45 |
46 |
--------------------------------------------------------------------------------
/docs/publishing/publishing-datasets.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Information on how to publish datasets
3 | ---
4 |
5 | # Publishing Datasets
6 |
7 | In order to publish datasets, the datasets must 1\) adhere to specified Foundry dataset shapes \([see here](publishing-datasets.md#shaping-datasets)\), and 2\) be described with required information \([see here](publishing-datasets.md#describing-datasets)\). Together, the dataset shape and description enable researchers to reuse the datasets more easily.
8 |
9 | ## Examples
10 |
11 | [Skip to the publication example notebook.](https://github.com/MLMI2-CSSI/foundry/blob/master/examples/foundry_publication_example.ipynb)
12 |
13 | ## Shaping Datasets
14 |
15 | For a general dataset to be translated into a usable Foundry dataset, it should follow one of the prescribed shapes. It should also be described by a `Key` object, which provides a mapping that allows Foundry to read data from the underlying data structure into usable Python objects \([see Describing Datasets](publishing-datasets.md#describing-datasets) for more info\).
16 |
17 | ### **Tabular Data**
18 |
19 | Tabular data should be arranged in a form where columns represent the different keys of the data and rows represent individual entries.
20 |
21 | | **feature\_1** | **feature\_2** | **material\_type** | band\_gap |
22 | | :--- | :--- | :--- | :--- |
23 | | 0.10 | 0.52 | 1 | 1.40 |
24 | | 0.34 | 0.910 | 0 | 0.73 |
25 | | ... | ... | ... | |
26 |
27 | For this example dataset the `keys` list could be:
28 |
29 | ```text
30 | "keys":[{
31 | "key": "feature_1",
32 | "type": "input",
33 | "units": None,
34 | "description": "This is feature 1"
35 | },{
36 | "key": "feature_2",
37 | "type": "input",
38 | "units": None,
39 | "description": "This is feature 2"
40 | },{
41 | "key": "material_type",
42 | "type": "input",
43 | "units": None,
44 | "description": "This is the material type",
45 | "labels":["perovskite","not perovskite"]
46 | },{
47 | "key": "band_gap",
48 | "type": "target",
49 | "units": "eV",
50 | "description": "This is the simulated band gap in eV"
51 | }
52 | ]
53 | ```
54 |
55 | {% hint style="info" %}
56 | Don't forget to specify the tabular data file in the submitted metadata
57 | {% endhint %}
58 |
59 | ### Hierarchical Data
60 |
61 | Foundry also supports data from hierarchical data formats \(e.g., [HDF5](https://www.h5py.org)\). In this case features and outputs can be represented with `/` notation. For example, if the features of a dataset are located in an array stored in `/data/arr1` and `/other_data/arr2` while the outputs are in `/data/band_gaps`, the Key object would be:
62 |
63 | ```text
64 | "keys":[{
65 | "key": "/data/arr1",
66 | "type": "input",
67 | "units": None,
68 | "description": "This is an array containing input data"
69 | },{
70 | "key": "/other_data/arr2",
71 | "type": "input",
72 | "units": None,
73 | "description": "This is an another array containing input data"
74 | },{
75 | "key": "/data/band_gaps",
76 | "type": "target",
77 | "units": "eV",
78 | "description": "This is the simulated band gap in eV"
79 | }
80 | ]
81 | ```
82 |
83 | ## Describing Datasets
84 |
85 | **DataCite Metadata \(object\):** All datasets can be described using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org). This metadata captures standard bibliographic information such as titles, authors, and identifiers. Many of these capabilities have helper functions in the SDK to make it easier to match the DataCite schema.
86 |
87 | **Keys \(list\[Key\]\):** `Key` objects provide a mapping that allows Foundry to read data from the underlying data structure into usable Python objects. Individual `Key` objects have the following properties
88 |
89 | * **`key (str)`**: A name mapping to a column name \(e.g., for csv files\) or a key within a data structure \(e.g., for HDF5 files\)
90 | * **`type (str)`**: The type of key this entry represents. Currently supported types are _**\["input", "target"\]**_
91 | * **`units (str) [optional]`**: The scientific units associated with a key. _Default: None_
92 | * **`description (str) [optional]`**: A free text description of the key. _Default: None_
93 | * **`labels (list[str]) [optional]`**: A list of strings mapped to integers in a key column
94 |
95 | **Splits \(list\[Split\]\):** `Split` objects provide a way for users to specify which data should be included as test, train, or other user-defined splits. Individual `Split` objects have the following properties
96 | 
97 | * **`type (str)`**: The type of split this entry represents \(e.g., train, test\)
98 | * **`path (str)`**: The full filepath to the dataset file or directory that contains the split
99 | * **`label (str)`**: A label to assign to this split
100 |
101 | **short\_name \(str\):** Short name is a unique name associated with this dataset to make loading and referencing it easier.
102 |
103 | **type \(str\):** The type provides a hint to Foundry on how to map the keys into loading operations. _Options \["tabular","hdf5"\]_
104 |
105 | ```text
106 | "foundry": {
107 | "dc": {},
108 | "keys": [{
109 | "type": "input",
110 | "name": "feature_1",
111 | "units": "",
112 | "description": "This is an input"
113 | },
114 | {
115 | "type": "target",
116 | "name": "band_gap",
117 | "units": "eV",
118 | "description": "blah blah",
119 | "labels": []
120 | }
121 | ],
122 | "short_name": "my_short_name",
123 | "data_type": "tabular"
124 | }
125 | ```
126 |
127 | ## Publishing
128 |
129 | {% hint style="info" %}
130 | Before continuing, be sure that you have 1\) signed up for a [free Globus account](https://app.globus.org) and 2\) [joined this Globus group](https://app.globus.org/groups/cc192dca-3751-11e8-90c1-0a7c735d220a/about).
131 | {% endhint %}
132 |
133 | Once your dataset is in the proper shape, and you have created the associated metadata structure, you can publish to Foundry! An example is shown below.
134 |
135 | ```text
136 | "foundry": {
137 | "dc": {},
138 | "keys": [{
139 | "type": "input",
140 | "name": "feature_1",
141 | "units": "",
142 | "description": "This is an input"
143 | },
144 | {
145 | "type": "target",
146 | "name": "band_gap",
147 | "units": "eV",
148 | "description": "blah blah",
149 | "labels": []
150 | }
151 | ],
152 | "short_name": "my_short_name",
153 | "data_type": "tabular"
154 | }
155 | ```
156 |
157 | Currently, you can publish any dataset you have stored on a Globus endpoint or Google Drive. In the following, assume your [previously defined metadata](publishing-datasets.md#describing-datasets) are stored in `metadata` :
158 |
159 | ```python
160 | from foundry import Foundry
161 |
162 | # Globus endpoint URL where your dataset is located
163 | data_source = "https://app.globus.org/file-manager?origin_id=e38ee745-6d04-11e5-ba46-22000b92c6ec&origin_path=%2Ffoundry%2F_test_blaiszik_foundry_iris_v1.2%2F"
164 |
165 | # full title of dataset
166 | title = "Scourtas example iris dataset"
167 |
168 | # authors to list
169 | authors = ["A. Scourtas", "B. Blaiszik"]
170 |
171 | # shorthand title (optional)
172 | short_name = "example_AS_iris"
173 |
174 | # affiliations of authors (optional)
175 | affiliations = ["Globus Labs, UChicago"]
176 |
177 | # publisher of the data (optional)
178 | publisher = "Materials Data Facility"
179 |
180 | # publication year (optional)
181 | publication_year = 2021
182 |
183 |
184 | f = Foundry()
185 | res = f.publish(metadata, data_source, title, authors, short_name=short_name)
186 | ```
187 |
188 | The `publish()` method returns a result object that you can inspect for information about the state of the publication. For the above publication, `res` would have the format:
189 |
190 | ```python
191 | {'error': None,
192 | 'source_id': '_test_example_iris_v1.1',
193 | 'status_code': 202,
194 | 'success': True}
195 | ```
196 |
197 |
198 |
199 | ## Future Work
200 |
201 | * Add support for wildcard key type specifications
202 | * Add link to example publication
203 |
204 |
--------------------------------------------------------------------------------
/docs/publishing/publishing-models.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Information on how to publish models
3 | ---
4 |
5 | # Publishing Models
6 |
7 | In addition to datasets, you can publish models \(or even individual Python methods\) to Foundry and run them in the cloud!
8 |
9 | ## Examples
10 |
11 | Model publication example notebook coming soon
12 |
13 | ## Model Types
14 |
15 | You can publish any of the following types of models or functions to Foundry:
16 |
17 | * [Scikit-Learn models](publishing-models.md#scikit-learn-models)
18 | * [Tensorflow 1 & 2 models](publishing-models.md#tensorflow-1-and-2-models)
19 | * [Keras models](publishing-models.md#keras-models)
20 | * [PyTorch models](publishing-models.md#pytorch-models)
21 | * [Class methods \(advanced use\)](publishing-models.md#class-methods)
22 | * [Static methods \(advanced use\)](publishing-models.md#static-methods)
23 |
24 | ### Scikit-Learn models
25 |
26 | ### Tensorflow 1 & 2 models
27 |
28 | ### Keras models
29 |
30 | ### PyTorch models
31 |
32 | ### Class methods
33 |
34 | ### Static methods
35 |
36 | ## Data Types for Inputs and Targets
37 |
38 | ## Describing Models
39 |
40 | Before you can publish a model, you need to describe it using metadata in compliance with the [DataCite metadata format](https://schema.datacite.org).
41 |
42 | ## Publishing
43 |
44 | ## Future Work
45 |
46 |
--------------------------------------------------------------------------------
/docs/support/troubleshooting.md:
--------------------------------------------------------------------------------
1 | ---
2 | description: Common pitfalls and issues and how to solve them
3 | ---
4 |
5 | # Troubleshooting
6 |
7 | ### Issues with loading or publishing Keras or Tensorflow models
8 |
9 | 
10 |
11 | There is a difference between the older, plain Keras package installed via `import keras`, and the currently maintained and up-to-date Keras package installed via `from tensorflow import keras`. Currently, the DLHub SDK \(which Foundry uses under-the-hood to publish, pull, and run models and functions\) uses whichever version of Keras you have installed.
12 |
13 | Errors can arise when `tf.keras` is used in one part of the model pipeline, but plain `keras` is used in another.
14 |
15 | If you have both versions of Keras installed \(which can be the case in common container environments, such as Google Colab\), DLHub will default to the plain Keras version, in case the user wants to use that with the newest version of Tensorflow. To override this behavior and use the Tensorflow Keras instead when publishing your model, pass the `force_tf_keras = True` option to `publish_model()`.
16 |
17 | ```python
18 | # Assume our fitted model is '7-fi-1.hdf5'. The custom objects come from
19 | # the `megnet` package; imports are shown here for completeness.
20 | from foundry import Foundry
21 | from megnet.activations import softplus2
22 | from megnet.layers import MEGNetLayer, Set2Set
23 | 
24 | f = Foundry()
25 | 
26 | # Create the metadata for the model
27 | options_keras = {
28 |     "title": "Bandgap-7-fidelity-MP-JARVIS-1",
29 |     "short_name": "7-fi-1",
30 |     "authors": ["Scientist, Awesome"],
31 |     "servable": {
32 |         "type": "keras",
33 |         "model_path": "7-fi-1.hdf5",
34 |         "custom_objects": {"softplus2": softplus2,
35 |                            "MEGNetLayer": MEGNetLayer,
36 |                            "Set2Set": Set2Set},
37 |         "force_tf_keras": True
38 |     }
39 | }
40 | res = f.publish_model(options_keras)
41 | ```
37 |
38 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples using Foundry
2 | If you're wondering how to get started with Foundry or want to see it in action, you're in the right place!
3 |
4 | Each notebook walks through instantiating Foundry, loading data from Foundry, and working with the data in different ways. Some notebooks also use machine learning models with the data.
5 |
6 | Each folder contains a notebook and `requirements.txt` file. The notebooks can be run locally (using the `requirements.txt`) or in [Google Colab](https://colab.research.google.com/).
7 |
8 | If you have any trouble with the notebooks, please check our [documentation](https://ai-materials-and-chemistry.gitbook.io/foundry/v/docs/) or create an issue on the repo.
9 |
--------------------------------------------------------------------------------
/examples/atom-position-finding/requirements.txt:
--------------------------------------------------------------------------------
1 | foundry_ml
2 | matplotlib
3 |
--------------------------------------------------------------------------------
/examples/bandgap/foundry.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Band Gap Analysis",
3 | "version": "1.0.0",
4 | "description": "Datasets for band gap uber model generation",
5 | "private":true,
6 | "dependencies":{
7 | "_test_foundry_experimental_bandgap_v1.1": "1.1",
8 | "_test_foundry_mp_bandgap_v1.1":"1.1",
9 | "_test_foundry_oqmd_bandgap_v1.1":"1.1",
10 | "_test_foundry_assorted_computational_bandgap_v1.1":"1.1"
11 | }
12 | }
--------------------------------------------------------------------------------
/examples/bandgap/requirements.txt:
--------------------------------------------------------------------------------
1 | pymatgen
2 | matminer
3 | pandas
4 | matplotlib
5 | scikit-learn
6 | foundry_ml
7 |
--------------------------------------------------------------------------------
/examples/dendrite-segmentation/foundry.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Dendrite Segmentation",
3 | "version": "1.0.0",
4 | "description": "Semantic Segmentation of Dendrites via Machine Learning",
5 | "private":true,
6 | "dependencies":{
7 | "_test_foundry_stan_dendrite_segmentation_v1.1": "1.1"
8 | }
9 | }
--------------------------------------------------------------------------------
/examples/dendrite-segmentation/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn
2 | foundry_ml
3 | scikit-image
4 | tensorflow
5 | keras-unet
6 | opencv-python
7 |
--------------------------------------------------------------------------------
/examples/oqmd/foundry.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "OQMD Data Analysis",
3 | "version": "1.0.0",
4 | "description": "Creating dataframe and metadata for OQMD dataset",
5 | "private":true,
6 | "dependencies":{
7 | "_test_foundry_oqmd_v1.1": "1.1"
8 | }
9 | }
--------------------------------------------------------------------------------
/examples/oqmd/requirements.txt:
--------------------------------------------------------------------------------
1 | foundry_ml
2 | pandas
3 |
--------------------------------------------------------------------------------
/examples/publishing-guides/data/iris.csv:
--------------------------------------------------------------------------------
1 | # Data from: https://archive.ics.uci.edu/ml/datasets/Iris
2 | sepal_length,sepal_width,petal_length,petal_width,species
3 | 5.1,3.5,1.4,0.2,setosa
4 | 4.9,3.0,1.4,0.2,setosa
5 | 4.7,3.2,1.3,0.2,setosa
6 | 4.6,3.1,1.5,0.2,setosa
7 | 5.0,3.6,1.4,0.2,setosa
8 | 5.4,3.9,1.7,0.4,setosa
9 | 4.6,3.4,1.4,0.3,setosa
10 | 5.0,3.4,1.5,0.2,setosa
11 | 4.4,2.9,1.4,0.2,setosa
12 | 4.9,3.1,1.5,0.1,setosa
13 | 5.4,3.7,1.5,0.2,setosa
14 | 4.8,3.4,1.6,0.2,setosa
15 | 4.8,3.0,1.4,0.1,setosa
16 | 4.3,3.0,1.1,0.1,setosa
17 | 5.8,4.0,1.2,0.2,setosa
18 | 5.7,4.4,1.5,0.4,setosa
19 | 5.4,3.9,1.3,0.4,setosa
20 | 5.1,3.5,1.4,0.3,setosa
21 | 5.7,3.8,1.7,0.3,setosa
22 | 5.1,3.8,1.5,0.3,setosa
23 | 5.4,3.4,1.7,0.2,setosa
24 | 5.1,3.7,1.5,0.4,setosa
25 | 4.6,3.6,1.0,0.2,setosa
26 | 5.1,3.3,1.7,0.5,setosa
27 | 4.8,3.4,1.9,0.2,setosa
28 | 5.0,3.0,1.6,0.2,setosa
29 | 5.0,3.4,1.6,0.4,setosa
30 | 5.2,3.5,1.5,0.2,setosa
31 | 5.2,3.4,1.4,0.2,setosa
32 | 4.7,3.2,1.6,0.2,setosa
33 | 4.8,3.1,1.6,0.2,setosa
34 | 5.4,3.4,1.5,0.4,setosa
35 | 5.2,4.1,1.5,0.1,setosa
36 | 5.5,4.2,1.4,0.2,setosa
37 | 4.9,3.1,1.5,0.1,setosa
38 | 5.0,3.2,1.2,0.2,setosa
39 | 5.5,3.5,1.3,0.2,setosa
40 | 4.9,3.1,1.5,0.1,setosa
41 | 4.4,3.0,1.3,0.2,setosa
42 | 5.1,3.4,1.5,0.2,setosa
43 | 5.0,3.5,1.3,0.3,setosa
44 | 4.5,2.3,1.3,0.3,setosa
45 | 4.4,3.2,1.3,0.2,setosa
46 | 5.0,3.5,1.6,0.6,setosa
47 | 5.1,3.8,1.9,0.4,setosa
48 | 4.8,3.0,1.4,0.3,setosa
49 | 5.1,3.8,1.6,0.2,setosa
50 | 4.6,3.2,1.4,0.2,setosa
51 | 5.3,3.7,1.5,0.2,setosa
52 | 5.0,3.3,1.4,0.2,setosa
53 | 7.0,3.2,4.7,1.4,versicolor
54 | 6.4,3.2,4.5,1.5,versicolor
55 | 6.9,3.1,4.9,1.5,versicolor
56 | 5.5,2.3,4.0,1.3,versicolor
57 | 6.5,2.8,4.6,1.5,versicolor
58 | 5.7,2.8,4.5,1.3,versicolor
59 | 6.3,3.3,4.7,1.6,versicolor
60 | 4.9,2.4,3.3,1.0,versicolor
61 | 6.6,2.9,4.6,1.3,versicolor
62 | 5.2,2.7,3.9,1.4,versicolor
63 | 5.0,2.0,3.5,1.0,versicolor
64 | 5.9,3.0,4.2,1.5,versicolor
65 | 6.0,2.2,4.0,1.0,versicolor
66 | 6.1,2.9,4.7,1.4,versicolor
67 | 5.6,2.9,3.6,1.3,versicolor
68 | 6.7,3.1,4.4,1.4,versicolor
69 | 5.6,3.0,4.5,1.5,versicolor
70 | 5.8,2.7,4.1,1.0,versicolor
71 | 6.2,2.2,4.5,1.5,versicolor
72 | 5.6,2.5,3.9,1.1,versicolor
73 | 5.9,3.2,4.8,1.8,versicolor
74 | 6.1,2.8,4.0,1.3,versicolor
75 | 6.3,2.5,4.9,1.5,versicolor
76 | 6.1,2.8,4.7,1.2,versicolor
77 | 6.4,2.9,4.3,1.3,versicolor
78 | 6.6,3.0,4.4,1.4,versicolor
79 | 6.8,2.8,4.8,1.4,versicolor
80 | 6.7,3.0,5.0,1.7,versicolor
81 | 6.0,2.9,4.5,1.5,versicolor
82 | 5.7,2.6,3.5,1.0,versicolor
83 | 5.5,2.4,3.8,1.1,versicolor
84 | 5.5,2.4,3.7,1.0,versicolor
85 | 5.8,2.7,3.9,1.2,versicolor
86 | 6.0,2.7,5.1,1.6,versicolor
87 | 5.4,3.0,4.5,1.5,versicolor
88 | 6.0,3.4,4.5,1.6,versicolor
89 | 6.7,3.1,4.7,1.5,versicolor
90 | 6.3,2.3,4.4,1.3,versicolor
91 | 5.6,3.0,4.1,1.3,versicolor
92 | 5.5,2.5,4.0,1.3,versicolor
93 | 5.5,2.6,4.4,1.2,versicolor
94 | 6.1,3.0,4.6,1.4,versicolor
95 | 5.8,2.6,4.0,1.2,versicolor
96 | 5.0,2.3,3.3,1.0,versicolor
97 | 5.6,2.7,4.2,1.3,versicolor
98 | 5.7,3.0,4.2,1.2,versicolor
99 | 5.7,2.9,4.2,1.3,versicolor
100 | 6.2,2.9,4.3,1.3,versicolor
101 | 5.1,2.5,3.0,1.1,versicolor
102 | 5.7,2.8,4.1,1.3,versicolor
103 | 6.3,3.3,6.0,2.5,virginica
104 | 5.8,2.7,5.1,1.9,virginica
105 | 7.1,3.0,5.9,2.1,virginica
106 | 6.3,2.9,5.6,1.8,virginica
107 | 6.5,3.0,5.8,2.2,virginica
108 | 7.6,3.0,6.6,2.1,virginica
109 | 4.9,2.5,4.5,1.7,virginica
110 | 7.3,2.9,6.3,1.8,virginica
111 | 6.7,2.5,5.8,1.8,virginica
112 | 7.2,3.6,6.1,2.5,virginica
113 | 6.5,3.2,5.1,2.0,virginica
114 | 6.4,2.7,5.3,1.9,virginica
115 | 6.8,3.0,5.5,2.1,virginica
116 | 5.7,2.5,5.0,2.0,virginica
117 | 5.8,2.8,5.1,2.4,virginica
118 | 6.4,3.2,5.3,2.3,virginica
119 | 6.5,3.0,5.5,1.8,virginica
120 | 7.7,3.8,6.7,2.2,virginica
121 | 7.7,2.6,6.9,2.3,virginica
122 | 6.0,2.2,5.0,1.5,virginica
123 | 6.9,3.2,5.7,2.3,virginica
124 | 5.6,2.8,4.9,2.0,virginica
125 | 7.7,2.8,6.7,2.0,virginica
126 | 6.3,2.7,4.9,1.8,virginica
127 | 6.7,3.3,5.7,2.1,virginica
128 | 7.2,3.2,6.0,1.8,virginica
129 | 6.2,2.8,4.8,1.8,virginica
130 | 6.1,3.0,4.9,1.8,virginica
131 | 6.4,2.8,5.6,2.1,virginica
132 | 7.2,3.0,5.8,1.6,virginica
133 | 7.4,2.8,6.1,1.9,virginica
134 | 7.9,3.8,6.4,2.0,virginica
135 | 6.4,2.8,5.6,2.2,virginica
136 | 6.3,2.8,5.1,1.5,virginica
137 | 6.1,2.6,5.6,1.4,virginica
138 | 7.7,3.0,6.1,2.3,virginica
139 | 6.3,3.4,5.6,2.4,virginica
140 | 6.4,3.1,5.5,1.8,virginica
141 | 6.0,3.0,4.8,1.8,virginica
142 | 6.9,3.1,5.4,2.1,virginica
143 | 6.7,3.1,5.6,2.4,virginica
144 | 6.9,3.1,5.1,2.3,virginica
145 | 5.8,2.7,5.1,1.9,virginica
146 | 6.8,3.2,5.9,2.3,virginica
147 | 6.7,3.3,5.7,2.5,virginica
148 | 6.7,3.0,5.2,2.3,virginica
149 | 6.3,2.5,5.0,1.9,virginica
150 | 6.5,3.0,5.2,2.0,virginica
151 | 6.2,3.4,5.4,2.3,virginica
152 | 5.9,3.0,5.1,1.8,virginica
153 |
--------------------------------------------------------------------------------
/examples/zeolite/requirements.txt:
--------------------------------------------------------------------------------
1 | seaborn
2 | matplotlib
3 | foundry_ml
--------------------------------------------------------------------------------
/foundry/__init__.py:
--------------------------------------------------------------------------------
1 | from .foundry import Foundry  # noqa: F401 (import unused)
2 | from . import models  # noqa: F401 (import unused)
3 | from . import https_download  # noqa: F401 (import unused)
4 | from . import https_upload  # noqa: F401 (import unused)
5 | from .foundry_dataset import FoundryDataset  # noqa: F401 (import unused)
6 |
--------------------------------------------------------------------------------
/foundry/auth.py:
--------------------------------------------------------------------------------
1 | """Utilities related to storing authentication credentials"""
2 |
3 | from dataclasses import dataclass
4 | from typing import Dict
5 |
6 | from globus_sdk import TransferClient, AuthClient
7 |
8 |
9 | @dataclass
10 | class PubAuths:
11 | """Collection of the authorizers needed for publication
12 |
13 | Attributes:
14 | transfer_client: Client with credentials to perform transfers
15 | auth_client_openid: Client with permissions to get user IDs
16 | endpoint_auth_clients: Mapping between endpoint ID and client that can authorize access to it
17 | """
18 |
19 | transfer_client: TransferClient
20 | auth_client_openid: AuthClient
21 | endpoint_auth_clients: Dict[str, AuthClient]
22 |
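23 | # Usage sketch (illustrative, not part of this module): the three fields are
24 | # typically built from Globus authorizers obtained through a login flow; the
25 | # `*_authorizer` variables below are placeholders.
26 | #
27 | # pub_auths = PubAuths(
28 | #     transfer_client=TransferClient(authorizer=transfer_authorizer),
29 | #     auth_client_openid=AuthClient(authorizer=openid_authorizer),
30 | #     endpoint_auth_clients={"endpoint-uuid": AuthClient(authorizer=ep_authorizer)},
31 | # )
32 | 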
--------------------------------------------------------------------------------
/foundry/foundry_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import html
5 | from json2table import convert
6 |
7 | from pydantic import ValidationError
8 |
9 | from .foundry_cache import FoundryCache
10 | from .models import FoundrySchema, FoundryDatacite
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | class FoundryDataset():
17 | """Representation of an individual dataset.
18 | Provides access to metadata as well as functions to
19 | instantiate data into memory in different formats.
20 |
21 | Args:
22 | dataset_name (str): Name of the dataset (equivalent to source_id in MDF)
23 | datacite_entry (FoundryDatacite): Datacite entry for the dataset
24 | foundry_schema (FoundrySchema): Schema for the dataset
25 | foundry_cache (FoundryCache): Cache for the dataset
26 |
27 | Desired functions:
28 | - Get as pandas
29 | - Get as tensorflow dataset
30 | - Get as pytorch dataset
31 | - Get file list
32 | - Set metadata
33 | - Attach datafiles
34 | - Validate against schema
35 | - Get citation
36 | """
37 |
38 | def __init__(self,
39 | dataset_name: str,
40 | datacite_entry: FoundryDatacite,
41 | foundry_schema: FoundrySchema,
42 | foundry_cache: FoundryCache = None):
43 |
44 | self.dataset_name = dataset_name
45 | try:
46 | self.dc = FoundryDatacite(datacite_entry)
47 | self.foundry_schema = FoundrySchema(foundry_schema)
48 | except Exception as e:
49 | raise Exception(f'There was a problem creating the dataset: {e}') from e
50 | self._foundry_cache = foundry_cache
51 |
52 | def get_as_dict(self, split: str = None, as_hdf5: bool = False):
53 | """Returns the data from the dataset as a dictionary
54 |
55 | Arguments:
56 | split (string): Split to create dataset on. **Default:** ``None``
57 | as_hdf5 (bool): If True, return the data in HDF5 format. **Default:** ``False``
58 |
59 | Returns: (dict) Dictionary of all the data from the specified split
60 |
61 | """
62 | return self._foundry_cache.load_as_dict(split,
63 | self.dataset_name,
64 | self.foundry_schema,
65 | as_hdf5)
66 | load = get_as_dict  # alias so existing code can call FoundryDataset.load()
67 |
68 | def get_as_torch(self, split: str = None):
69 | """Returns the data from the dataset as a TorchDataset
70 |
71 | Arguments:
72 | split (string): Split to create PyTorch Dataset on.
73 | **Default:** ``None``
74 |
75 | Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
76 |
77 | """
78 |
79 | return self._foundry_cache.load_as_torch(split,
80 | self.dataset_name,
81 | self.foundry_schema)
82 |
83 | def get_as_tensorflow(self, split: str = None):
84 | """Convert Foundry Dataset to a Tensorflow Sequence
85 |
86 | Arguments:
87 | split (string): Split to create Tensorflow Sequence on.
88 | **Default:** ``None``
89 |
90 | Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
91 |
92 | """
93 | return self._foundry_cache.load_as_tensorflow(split,
94 | self.dataset_name,
95 | self.foundry_schema)
96 |
97 | def _repr_html_(self) -> str:
98 | """Format the Foundry object for notebook rendering as HTML output
99 |
100 | Args:
101 | self (Foundry)
102 |
103 | Returns:
104 | buf (str): buffer containing the HTML to render
105 | """
106 | if not self.dc:
107 | buf = str(self)
108 | else:
109 | title = self.dc.titles[0].title
110 | authors = [creator['creatorName']
111 | for creator in self.dc.creators]
112 | authors = '; '.join(authors)
113 | DOI = "DOI: " + self.dc.identifier.identifier.root
114 |
115 | buf = f'<h2>{title}</h2><p>{authors}</p><p>{DOI}</p>'
116 | 
117 | buf = f'{buf}