├── .github └── workflows │ ├── publish.yaml │ └── test_workflow.yaml ├── .gitignore ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── README.rst ├── docs ├── .readthedocs.yaml ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── api │ ├── fairly.client.rst │ ├── fairly.dataset.rst │ ├── fairly.file.rst │ └── fairly.rst ├── conf.py ├── img │ ├── add-filles.png │ ├── clone1.png │ ├── clone2.png │ ├── contex-menu.png │ ├── create-dataset1.png │ ├── create-dataset2.png │ ├── create-directory.png │ ├── labs-home.png │ ├── labs-start.png │ ├── my-dataset.png │ ├── open-metadata.png │ ├── osf-banner2023.png │ ├── push-confirm.png │ ├── push-menu.png │ ├── start-jupyterlab.png │ ├── zenodo-cli-upload.png │ ├── zenodo-token.png │ └── zenodo-upload.png ├── index.rst ├── installation.rst ├── make.bat ├── modules.rst ├── package │ ├── account-datasets.ipynb │ ├── account-token.rst │ ├── archiving-datasets.ipynb │ ├── demo-4tu.ipynb │ └── demo-zenodo.ipynb ├── requirements.txt └── tutorials │ ├── cli.rst │ ├── jupyterlab.rst │ ├── python-api.ipynb │ └── workshop.rst ├── pyproject.toml ├── src └── fairly │ ├── __init__.py │ ├── cli │ ├── __init__.py │ ├── config.py │ └── dataset.py │ ├── client │ ├── __init__.py │ ├── dataverse.py │ ├── djehuty.py │ ├── figshare.py │ ├── invenio.py │ └── zenodo.py │ ├── data │ ├── config.json │ ├── languages │ │ ├── ISO-639-2_8859-1.tab │ │ ├── ISO-639-2_UTF-8.tab │ │ ├── ISO-639-3_8859-1.tab │ │ └── ISO-639-3_UTF-8.tab │ └── templates │ │ ├── default.yaml │ │ ├── figshare.yaml │ │ └── zenodo.yaml │ ├── dataset │ ├── __init__.py │ ├── local.py │ └── remote.py │ ├── diff.py │ ├── file │ ├── __init__.py │ ├── local.py │ └── remote.py │ ├── metadata.py │ └── person.py └── tests ├── __init__.py ├── conftest.py ├── test_cli.py └── test_fairly.py /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # GitHub recommends pinning actions to a commit SHA. 7 | # To get a newer version, you will need to update the SHA. 8 | # You can also reference a tag or branch, but the action may change without warning. 
9 | 10 | name: Publish 11 | 12 | on: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | deploy: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.x' 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install build hatch 29 | - name: Build package 30 | run: python -m build 31 | - name: Publish package 32 | uses: pypa/gh-action-pypi-publish@v1.8.14 33 | with: 34 | user: __token__ 35 | password: ${{ secrets.PYPI_API_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/test_workflow.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 10 9 | env: 10 | FAIRLY_FIGSHARE_TOKEN: ${{ secrets.FIGSHARE_TOKEN }} 11 | FAIRLY_ZENODO_TOKEN: ${{ secrets.ZENODO_TOKEN }} 12 | FAIRLY_4TU_TOKEN: ${{ secrets.FOURTU_TOKEN }} 13 | 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -e .[dev] 28 | - name: Test with pytest 29 | run: | 30 | pytest --cov=fairly tests/ 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | *.dataset 36 | tests/fixtures/vcr_cassettes*/ 37 | 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | vcr_cassettes/ 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # experiments 136 | experiments 137 | 138 | # ignore bench-test in tests 139 | tests/notes.py 140 | _no_* 141 | 142 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Fairly 6 | message: Please cite this software using these metadata. 7 | type: software 8 | authors: 9 | - given-names: Serkan 10 | family-names: Girgin 11 | email: s.girgin@utwente.nl 12 | affiliation: University of Twente 13 | orcid: 'https://orcid.org/0000-0002-0156-185X' 14 | - given-names: Manuel 15 | family-names: Garcia Alvarez 16 | email: m.g.garciaalvarez@tudelft.nl 17 | affiliation: Delft University of Technology 18 | orcid: 'https://orcid.org/0000-0003-1579-9989' 19 | - given-names: Jose 20 | family-names: Urra Llanusa 21 | email: j.c.urrallanusa@tudelft.nl 22 | affiliation: Delft University of Technology 23 | orcid: 'https://orcid.org/0000-0002-9334-3998' 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | As an open source software project, we welcome contributions. Please read these guidelines before attempting to contribute to this project. 4 | 5 | ## Types of Contributions 6 | A contribution can be one of the following cases: 7 | 8 | 1. you have a question; 9 | 2. you think you may have found a bug (including unexpected behaviour); 10 | 3. you want to make some changes to the code base (e.g. to fix a bug, to add a new feature, to update documentation). 11 | 12 | ## Questions 13 | 14 | 1. use the search functionality [here](https://github.com/ITC-CRIB/fairly/issues) to see if someone already filed the same issue or question; 15 | 2. if your issue search did not yield any relevant results, make a new issue; 16 | 3. apply the "Question" label; apply other labels when relevant. 17 | 18 | ## Reporting Bugs 19 | 20 | If you think you may have found a bug: 21 | 22 | 1. use the search functionality [here](https://github.com/ITC-CRIB/fairly/issues) to see if someone already filed the same issue; 23 | 2. if your issue search did not yield any relevant results, make a new issue, making sure to provide enough information to the rest of the community to understand the cause and context of the problem.
Depending on the issue, you may want to include: 24 | - the [SHA hashcode](https://help.github.com/articles/autolinked-references-and-urls/#commit-shas) of the commit that is causing your problem; 25 | - some identifying information (name and version number) for dependencies you're using; 26 | - information about the operating system; 27 | - detailed steps to reproduce the bug. 28 | 3. apply relevant labels to the newly created issue. 29 | 30 | ## Changes to Source Code: fix bugs and add features 31 | 32 | 1. (important) announce your plan to the rest of the community before you start working. This announcement should be in the form of a (new) issue; 33 | 2. (important) wait until some consensus is reached about whether your idea is a good idea; 34 | 3. if needed, fork the repository to your own GitHub profile and create your feature branch out of the latest master commit. While working on your feature branch, make sure to stay up to date with the master branch by pulling in changes; 35 | 4. make sure the existing tests still work; 36 | 5. add your tests (if applicable); 37 | 6. update or expand the documentation; 38 | 7. push your feature branch to (your fork of) this repository on GitHub; 39 | 8. create the pull request, e.g. following the instructions [here](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request). 40 | 41 | > If you feel like you have a valuable contribution to make, but you don't know how to write or run tests for it or create the documentation, don't let this discourage you from making the pull request; we can help you! Just go ahead and submit the pull request, but keep in mind that you might be asked to append additional commits to your pull request. 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 JupyterFAIR Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. list-table:: 2 | :widths: 25 25 3 | :header-rows: 1 4 | 5 | * - `fair-software.nl <https://fair-software.nl>`_ recommendations 6 | - Badges 7 | * - \1. Code repository 8 | - |GitHub Badge| 9 | * - \2. License 10 | - |License Badge| 11 | * - \3. 
Community Registry 12 | - |PyPI Badge| 13 | * - \4. Enable Citation 14 | - |Zenodo Badge| 15 | * - **Other best practices** 16 | - 17 | * - Continuous integration 18 | - |Python Build| |Python Publish| 19 | * - Documentation 20 | - |Documentation Status| 21 | * - Anaconda package 22 | - |Anaconda| |Anaconda Downloads| 23 | 24 | .. |Anaconda| image:: https://anaconda.org/conda-forge/fairly/badges/version.svg 25 | :target: https://anaconda.org/conda-forge/fairly 26 | :alt: Anaconda Package 27 | 28 | .. |Anaconda Downloads| image:: https://anaconda.org/conda-forge/fairly/badges/downloads.svg 29 | :target: https://anaconda.org/conda-forge/fairly 30 | :alt: Anaconda Downloads 31 | 32 | .. |GitHub Badge| image:: https://img.shields.io/github/v/release/ITC-CRIB/fairly 33 | :target: https://github.com/ITC-CRIB/fairly 34 | :alt: GitHub Badge 35 | 36 | .. |License Badge| image:: https://img.shields.io/badge/License-MIT-yellow.svg 37 | :target: https://opensource.org/licenses/MIT 38 | :alt: License Badge 39 | 40 | .. |PyPI Badge| image:: https://img.shields.io/pypi/v/fairly?colorB=blue 41 | :target: https://pypi.org/project/fairly/ 42 | :alt: PyPI Badge 43 | 44 | .. |Zenodo Badge| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.7759648.svg 45 | :target: https://doi.org/10.5281/zenodo.7759648 46 | :alt: Zenodo Badge 47 | 48 | .. |Python Build| image:: https://github.com/ITC-CRIB/fairly/actions/workflows/test_workflow.yaml/badge.svg 49 | :target: https://github.com/ITC-CRIB/fairly/actions/workflows/test_workflow.yaml 50 | :alt: Python Build 51 | 52 | .. |Python Publish| image:: https://github.com/ITC-CRIB/fairly/actions/workflows/publish.yaml/badge.svg 53 | :target: https://github.com/ITC-CRIB/fairly/actions/workflows/publish.yaml 54 | :alt: Python Publish 55 | 56 | .. |Documentation Status| image:: https://readthedocs.org/projects/fairly/badge/?version=latest 57 | :target: https://fairly.readthedocs.io/en/latest/ 58 | :alt: Documentation Status 59 | 60 | 61 | fairly 62 | ====== 63 | 64 | A package to create, publish and clone research datasets. 65 | 66 | |License: MIT| 67 | 68 | Installation 69 | ------------ 70 | 71 | *fairly* requires Python 3.8 or later, and `ruamel.yaml` version *0.17.26* or later. It can be installed directly 72 | from **PyPI** or **conda-forge**. 73 | 74 | .. code:: shell 75 | 76 | # Using pip 77 | pip install fairly 78 | 79 | .. code:: shell 80 | 81 | # using anaconda or miniconda 82 | conda install conda-forge::fairly 83 | 84 | 85 | Installing from source 86 | ~~~~~~~~~~~~~~~~~~~~~~ 87 | 88 | 1. Clone or download the `source 89 | code <https://github.com/ITC-CRIB/fairly.git>`__: 90 | 91 | .. code:: shell 92 | 93 | git clone https://github.com/ITC-CRIB/fairly.git 94 | 95 | 2. Go to the root directory: 96 | 97 | .. code:: shell 98 | 99 | cd fairly/ 100 | 101 | 3. Install using pip: 102 | 103 | .. code:: shell 104 | 105 | pip install . 106 | 107 | Usage 108 | ----- 109 | 110 | Basic example to create a local research dataset and deposit it to a 111 | repository: 112 | 113 | .. code:: python 114 | 115 | import fairly 116 | 117 | # Initialize a local dataset 118 | dataset = fairly.init_dataset('/path/dataset') 119 | 120 | # Set metadata 121 | dataset.metadata['license'] = 'MIT' 122 | dataset.set_metadata( 123 | title='My dataset', 124 | keywords=['FAIR', 'research', 'data'], 125 | authors=[ 126 | '0000-0002-0156-185X', 127 | {'name': 'John', 'surname': 'Doe'} 128 | ] 129 | ) 130 | 131 | # Add data files 132 | dataset.includes.extend([ 133 | 'README.txt', 134 | '*.csv', 135 | 'train/*.jpg' 136 | ]) 137 | 138 | # Save dataset 139 | dataset.save() 140 | 141 | # Upload to a data repository 142 | remote_dataset = dataset.upload('zenodo') 143 | 
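A dataset saved this way can be reopened later from its directory to continue working on it. A minimal sketch using the same API (the path and title are the ones from the example above):

.. code:: python

    import fairly

    # Reopen the previously saved local dataset
    dataset = fairly.dataset('/path/dataset')

    # Metadata set before saving is available again
    dataset.metadata['title']
    >>> 'My dataset'
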
144 | Basic example to access a remote dataset and store it locally: 145 | 146 | .. code:: python 147 | 148 | import fairly 149 | 150 | # Open a remote dataset 151 | dataset = fairly.dataset('doi:10.4121/21588096.v1') 152 | 153 | # Get dataset information 154 | dataset.id 155 | >>> {'id': '21588096', 'version': '1'} 156 | 157 | dataset.url 158 | >>> 'https://data.4tu.nl/articles/dataset/.../21588096/1' 159 | 160 | dataset.size 161 | >>> 33339 162 | 163 | len(dataset.files) 164 | >>> 6 165 | 166 | dataset.metadata 167 | >>> Metadata({'keywords': ['Earthquakes', 'precursor', ...], ...}) 168 | 169 | # Update metadata 170 | dataset.metadata['keywords'] = ['Landslides', 'precursor'] 171 | dataset.save_metadata() 172 | 173 | # Store dataset to a local directory (i.e. clone dataset) 174 | local_dataset = dataset.store('/path/dataset') 175 | 176 | Currently, the package supports the following research data management 177 | platforms: 178 | 179 | - `Invenio <https://inveniosoftware.org>`__ 180 | - `Figshare <https://figshare.com>`__ 181 | - `Djehuty <https://github.com/4TUResearchData/djehuty>`__ 182 | (experimental) 183 | 184 | All research data repositories based on the listed platforms are 185 | supported. 186 | 187 | For more details and examples, consult the `package 188 | documentation <https://fairly.readthedocs.io/en/latest/>`__. 189 | 190 | Testing 191 | ------- 192 | 193 | Unit tests can be run with the ``pytest`` command in the root directory. 194 | 195 | Contributions 196 | ------------- 197 | 198 | Read the `guidelines <CONTRIBUTING.md>`__ to know how you can be part of 199 | this open source project. 200 | 201 | JupyterLab Extension 202 | -------------------- 203 | 204 | An extension for JupyterLab is being developed in a `different 205 | repository <https://github.com/ITC-CRIB/jupyter-fairly>`__. 206 | 207 | Citation 208 | -------- 209 | 210 | Please cite this software as follows: 211 | 212 | *Girgin, S., Garcia Alvarez, M., & Urra Llanusa, J., fairly: a package 213 | to create, publish and clone research datasets [Computer software]* 214 | 215 | Acknowledgements 216 | ---------------- 217 | 218 | This research is funded by the `Dutch Research Council (NWO) Open 219 | Science 220 | Fund `__, 221 | File No. 203.001.114. 222 | 223 | Project members: 224 | 225 | - `Center of Expertise in Big Geodata Science, University of Twente, 226 | Faculty ITC `__ 227 | - `Digital Competence Centre, TU Delft `__ 228 | - `4TU.ResearchData <https://data.4tu.nl>`__ 229 | 230 | .. 
|License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg 231 | :target: https://opensource.org/licenses/MIT 232 | -------------------------------------------------------------------------------- /docs/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | # You can also specify versions for other tools like so: 14 | # nodejs: "16" 15 | # rust: "1.55" 16 | # golang: "1.17" 17 | 18 | # Build documentation in the docs/ directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # If using Sphinx, optionally build your docs in additional formats such as PDF 23 | formats: 24 | - pdf 25 | 26 | # Optionally declare the Python requirements required to build your docs 27 | python: 28 | install: 29 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Prevent stacking properties */ 2 | /* https://github.com/readthedocs/sphinx_rtd_theme/issues/1301 */ 3 | dl.property { 4 | display: block !important; 5 | width: 100%; 6 | } 7 | /* Prevent double colon for roles */ 8 | /* https://github.com/sphinx-doc/sphinx/issues/10594 */ 9 | dl.field-list .colon { 10 | display: none; 11 | } -------------------------------------------------------------------------------- /docs/api/fairly.client.rst: -------------------------------------------------------------------------------- 1 | fairly.client package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | fairly.client.djehuty module 8 | ---------------------------- 9 | 10 | .. automodule:: fairly.client.djehuty 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | fairly.client.figshare module 16 | ----------------------------- 17 | 18 | .. automodule:: fairly.client.figshare 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | fairly.client.invenio module 24 | ---------------------------- 25 | 26 | .. automodule:: fairly.client.invenio 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. 
automodule:: fairly.client 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | :noindex: 39 | -------------------------------------------------------------------------------- /docs/api/fairly.dataset.rst: -------------------------------------------------------------------------------- 1 | fairly.dataset package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | fairly.dataset.local module 8 | --------------------------- 9 | 10 | .. automodule:: fairly.dataset.local 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | fairly.dataset.remote module 16 | ---------------------------- 17 | 18 | .. automodule:: fairly.dataset.remote 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: fairly.dataset 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | :noindex: 31 | -------------------------------------------------------------------------------- /docs/api/fairly.file.rst: -------------------------------------------------------------------------------- 1 | fairly.file package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | fairly.file.local module 8 | ------------------------ 9 | 10 | .. automodule:: fairly.file.local 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | fairly.file.remote module 16 | ------------------------- 17 | 18 | .. automodule:: fairly.file.remote 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: fairly.file 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/api/fairly.rst: -------------------------------------------------------------------------------- 1 | .. _`appi`: 2 | 3 | fairly package 4 | ============== 5 | 6 | Subpackages 7 | ----------- 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | 12 | fairly.client 13 | fairly.dataset 14 | fairly.file 15 | 16 | Submodules 17 | ---------- 18 | 19 | fairly.diff module 20 | ------------------ 21 | 22 | .. automodule:: fairly.diff 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | fairly.metadata module 28 | ---------------------- 29 | 30 | .. automodule:: fairly.metadata 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | fairly.person module 36 | -------------------- 37 | 38 | .. automodule:: fairly.person 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | 43 | Module contents 44 | --------------- 45 | 46 | .. automodule:: fairly 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'Fairly Toolset' 10 | copyright = '2023, Serkan Girgin, Manuel Garcia Alvarez, Jose Urra Llanusa' 11 | author = 'Serkan Girgin, Manuel Garcia Alvarez, Jose Urra Llanusa' 12 | release = '1.0.0' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = [ 18 | 'sphinx.ext.autodoc', 19 | 'sphinx.ext.viewcode', 20 | 'sphinx.ext.napoleon', 21 | 'sphinx_rtd_theme', 22 | 'nbsphinx', 23 | 'sphinx_copybutton', 24 | ] 25 | 26 | templates_path = ['_templates'] 27 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 28 | 29 | 30 | 31 | # -- Options for HTML output ------------------------------------------------- 32 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 33 | 34 | html_theme = 'sphinx_rtd_theme' 35 | html_static_path = ['_static'] 36 | html_css_files = [ 37 | 'css/custom.css', 38 | ] 39 | -------------------------------------------------------------------------------- /docs/img/add-filles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/add-filles.png -------------------------------------------------------------------------------- /docs/img/clone1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/clone1.png -------------------------------------------------------------------------------- /docs/img/clone2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/clone2.png -------------------------------------------------------------------------------- /docs/img/contex-menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/contex-menu.png -------------------------------------------------------------------------------- /docs/img/create-dataset1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/create-dataset1.png -------------------------------------------------------------------------------- /docs/img/create-dataset2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/create-dataset2.png -------------------------------------------------------------------------------- /docs/img/create-directory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/create-directory.png 
-------------------------------------------------------------------------------- /docs/img/labs-home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/labs-home.png -------------------------------------------------------------------------------- /docs/img/labs-start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/labs-start.png -------------------------------------------------------------------------------- /docs/img/my-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/my-dataset.png -------------------------------------------------------------------------------- /docs/img/open-metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/open-metadata.png -------------------------------------------------------------------------------- /docs/img/osf-banner2023.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/osf-banner2023.png -------------------------------------------------------------------------------- /docs/img/push-confirm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/push-confirm.png -------------------------------------------------------------------------------- /docs/img/push-menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/push-menu.png -------------------------------------------------------------------------------- /docs/img/start-jupyterlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/start-jupyterlab.png -------------------------------------------------------------------------------- /docs/img/zenodo-cli-upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/zenodo-cli-upload.png -------------------------------------------------------------------------------- /docs/img/zenodo-token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/zenodo-token.png -------------------------------------------------------------------------------- /docs/img/zenodo-upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ITC-CRIB/fairly/7fdb1327c35daffc491e22e72855814883880f5a/docs/img/zenodo-upload.png -------------------------------------------------------------------------------- /docs/index.rst: 
-------------------------------------------------------------------------------- 1 | .. fairly documentation master file, created by 2 | sphinx-quickstart on Mon Oct 3 21:00:21 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Fairly Toolset Documentation 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Fairly Package 12 | 13 | installation 14 | 15 | .. toctree:: 16 | :maxdepth: 1 17 | :caption: Tutorials 18 | 19 | tutorials/jupyterlab 20 | tutorials/cli 21 | tutorials/python-api 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | :caption: Fairly API 26 | 27 | modules 28 | api/fairly.client 29 | api/fairly.dataset 30 | api/fairly.file 31 | 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ================ 5 | 6 | The *Fairly Toolset* provides functionality for the core tasks of preparing, uploading and downloading datasets from research data repositories. The toolset currently provides integration with data repositories based on `Invenio <https://inveniosoftware.org>`_ and `Figshare <https://figshare.com>`_. 7 | 8 | **What's Included:** 9 | 10 | * fairly Python package 11 | * Command Line Interface (CLI) 12 | * JupyterLab extension 13 | 14 | **Requirements:** 15 | 16 | * Python 3.8 or higher 17 | * pip 20.0 or higher 18 | * ruamel.yaml 0.17.26 or higher 19 | * JupyterLab 3.x 20 | 21 | Installing the Toolset 22 | ------------------------ 23 | 24 | You can install the *full toolset* by installing the JupyterLab extension from PyPI. The fairly package and CLI will be installed automatically. 25 | 26 | Linux / MacOS 27 | ''''''''''''''''''' 28 | 29 | Install the toolset using `pip`: 30 | 31 | .. code-block:: shell 32 | 33 | pip install jupyter-fairly 34 | 35 | 36 | Windows 37 | ''''''''''''''''''' 38 | 39 | 1. Download the ZIP file with the `latest release `_ of the JupyterLab extension to a directory. 40 | 2. Unzip the content. 41 | 3. Using the **terminal**, go to the directory where the ZIP file is located and then to the `jupyter_fairly` sub-directory. 42 | 4. Type and run the following command. You need to add Python to the system PATH for this to work. 43 | 44 | .. code-block:: shell 45 | 46 | python -m pip install . 47 | 48 | .. warning:: 49 | For the above to work, you need Python in the PATH environment variable on Windows. If you are not sure that is the case, open the shell and type :code:`python --version`. You should see the version of Python on the screen; if you do not, follow these steps to `add Python to the PATH on Windows `_ 50 | 51 | Installing Python Package Only 52 | -------------------------------- 53 | 54 | If all you need is the *fairly* Python package and the CLI, you can install them as follows. 55 | 56 | Linux / MacOS 57 | ''''''''''''''''''' 58 | 59 | On the terminal, type the following for PyPI: 60 | 61 | .. code-block:: shell 62 | 63 | pip install fairly 64 | 65 | Or, if using Anaconda or Miniconda: 66 | 67 | .. code-block:: shell 68 | 69 | conda install conda-forge::fairly 70 | 71 | 
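A quick way to verify the installation is to import the package and create a client for one of the supported repositories (a minimal check; no access token is needed for reading public datasets):

.. code-block:: python

   import fairly

   # If this runs without errors, the package is installed correctly
   zenodo = fairly.client("zenodo")
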
72 | Installing from Source 73 | ''''''''''''''''''''''''' 74 | 75 | Installing *fairly* from source requires `setuptools` version 49.0 or later and `pip`. 76 | 77 | 1. Clone or download the `source code <https://github.com/ITC-CRIB/fairly.git>`_: 78 | 79 | .. code-block:: shell 80 | 81 | git clone https://github.com/ITC-CRIB/fairly.git 82 | 83 | 84 | 2. Unzip if necessary, and go to the `fairly` directory: 85 | 86 | .. code-block:: shell 87 | 88 | cd fairly/ 89 | 90 | 91 | 3. Install the package: 92 | 93 | .. code-block:: shell 94 | 95 | pip install . 96 | 97 | .. important:: 98 | Currently, the toolset only supports data repositories based on `Invenio <https://inveniosoftware.org>`_ and `Figshare <https://figshare.com>`_. For examples on how to use the toolset, read the `Tutorials `_. 99 | 100 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | fairly 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | api/fairly 8 | -------------------------------------------------------------------------------- /docs/package/account-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Access Account Datasets\n", 8 | "\n", 9 | "With *fairly*, you can access the datasets in a repository's user account. This tutorial shows you how to do it for the case of 4TU.ResearchData. The procedure is the same for Zenodo.\n", 10 | "\n", 11 | "**Requirements:**\n", 12 | "\n", 13 | "* A 4TU.ResearchData account\n", 14 | "* A personal access token. See [configuring access token](https://fairly.readthedocs.io/en/latest/package/account-token.html) if you don't have one." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## 1. Connect to an account\n", 22 | "\n", 23 | "To connect to a repository's account, we need to pass a personal token when creating a client, or we can store tokens in a configuration file at `~/.fairly/config.json`" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Passing a token directly\n", 33 | "import fairly\n", 34 | "\n", 35 | "fourtu = fairly.client(\"figshare\", token=\"<4tu-token>\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "> To store your tokens, create a JSON file like the one below and store it at `~/.fairly/config.json`. 
You can store tokens for other repositories by adding them to this file as `\"\": {\"token\": \"\"}`\n", 43 | "\n", 44 | "```json\n", 45 | "\n", 46 | "{\n", 47 | " \"4tu\": {\n", 48 | "\t\t\"token\": \"<4tu-token>\"\n", 49 | "\t}\n", 50 | "}\n", 51 | "```" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## 2. Retrieve account datasets\n", 59 | "\n", 60 | "You can see the datasets in an account by calling the `get_account_datasets()` method of a client. This retrieves the list of datasets in the account. Then, you can use the `id` and `metadata` properties of a *dataset* to find more details." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "There are 2 datasets in this account\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "# Retrieve the datasets in the account\n", 78 | "my_datasets = fourtu.get_account_datasets()\n", 79 | "\n", 80 | "# Display the number of datasets\n", 81 | "print(\"There are\", len(my_datasets), \"datasets in this account\")\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "Dataset Ids:\n", 94 | "[{'id': '20758348', 'version': None}, {'id': '20752675', 'version': None}]\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "# Display the dataset IDs \n", 100 | "print(\"Dataset Ids:\")\n", 101 | "print([dataset.id for dataset in my_datasets])" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "Metadata({'authors': [Person({'fullname': 'Manuel Garcia Alvarez', 'figshare_id': 10645703})], 'license': 'CC BY 4.0', 'title': 'test-dataset', 'type': 'software', 'access_type': 'open', 'custom_fields': {'Publisher': '4TU.ResearchData', 'Language': '', 'Time coverage': '', 'Geolocation': '', 'Geolocation Longitude': '', 'Geolocation Latitude': '', 'Format': '', 'Data Link': [], 'Derived From': [], 'Same As': [], 'Organizations': ''}, 'embargo_type': 'file', 'categories': ['Communications Technologies'], 'figshare_id': {'id': '20758348', 'version': None}})" 113 | ] 114 | }, 115 | "execution_count": 6, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "# Metadata of the first dataset\n", 122 | "my_datasets[0].metadata" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3.10.4 64-bit", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.10.4" 143 | }, 144 | "vscode": { 145 | "interpreter": { 146 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 147 | } 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } 153 | -------------------------------------------------------------------------------- /docs/package/account-token.rst: -------------------------------------------------------------------------------- 1 | .. 
_access token: 2 | 3 | Configuring Access Token 4 | ########################### 5 | 6 | 7 | *fairly* can be used to access datasets owned by a user of a data repository. For 4TU.ResearchData and Zenodo, we can do that by configuring access tokens. 8 | 9 | Creating a personal access token 10 | ===================================== 11 | 12 | A personal access token allows you to connect to a user account remotely without the need for a *username* and *password*. 13 | 14 | Zenodo 15 | ------------- 16 | 17 | 1. Register for a Zenodo account if you do not already have one. 18 | #. Go to your :guilabel:`Applications`, and click on :guilabel:`New token` under **Personal access tokens**. 19 | #. Enter a name for your token. 20 | #. Select the OAuth scopes you need (:guilabel:`deposit:write` and :guilabel:`deposit:actions`). 21 | #. Click :guilabel:`Create` 22 | #. An access token will be shown, copy it and store it. **The token will only be shown once.** 23 | #. Click on :guilabel:`Save` 24 | 25 | 26 | 4TU.ResearchData 27 | ------------------- 28 | 29 | 1. Register for a 4TU.ResearchData account if you do not already have one. 30 | #. Go to your :guilabel:`Applications`, and click on :guilabel:`Create Personal Token`. 31 | #. Enter a short description for your token, for example a name, and click on :guilabel:`Save` 32 | #. An access token will be shown, copy it and store it. **The token will only be shown once.** 33 | #. Click on :guilabel:`Done` 34 | 35 | Connecting to an Account 36 | ============================ 37 | 38 | Connecting to an account is as simple as passing a token when creating a 4TU.ResearchData or Zenodo client. 39 | 40 | .. code-block:: python 41 | 42 | import fairly 43 | 44 | # For 4TU.ResearchData 45 | fourtu = fairly.client("figshare", token="<4tu-token>") 46 | 47 | # For Zenodo 48 | zenodo = fairly.client("zenodo", token="<zenodo-token>") 49 | 50 | Storing Tokens 51 | ================ 52 | 53 | To store your tokens, create a JSON file like the one below and store it at `~/.fairly/config.json`. You can store tokens for other repositories by adding them to this file as `"<repository-id>": {"token": "<token>"}` 54 | 55 | .. code-block:: json 56 | 57 | { 58 | "4tu": { 59 | "token": "<4tu-token>" 60 | } 61 | } 62 | 63 | 
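Once the configuration file is in place, tokens are picked up automatically, and clients can be created without passing a token explicitly. A minimal sketch, assuming the configuration file above:

.. code-block:: python

    import fairly

    # The token is read from ~/.fairly/config.json
    fourtu = fairly.client("4tu")

    # Account-level calls now authenticate with the stored token
    my_datasets = fourtu.get_account_datasets()
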
-------------------------------------------------------------------------------- /docs/package/archiving-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Archiving Dataset\n", 8 | "\n", 9 | "With **fairly**, we can remotely archive and edit datasets in a user account. Users can prepare a dataset for archiving by editing metadata, defining which files are part of a dataset, and uploading them to a data repository. One of the purposes of **fairly** is to *remove the need to prepare metadata and data for every repository to which a dataset will be archived*, thereby saving time and effort and lowering the barriers to practicing Open Science.\n", 10 | "This tutorial shows what is possible by using the 4TU.ResearchData repository. The procedure is similar for Zenodo.\n", 11 | "\n", 12 | "**Requirements:**\n", 13 | "\n", 14 | "* A 4TU.ResearchData account\n", 15 | "* A personal access token. See [configuring access token](https://fairly.readthedocs.io/en/latest/package/account-token.html) if you don't have one.\n", 16 | "* Files to be archived. We will use a hypothetical case in this tutorial.\n", 17 | "\n", 18 | "> For this tutorial, we assume that our goal is to archive a dataset in 4TU.ResearchData that we previously archived in Zenodo. We will use the dataset [Quality and timing of crowd-based water level class observations](https://zenodo.org/records/3929547) as an example.\n", 19 | " " 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## 1. Download the Zenodo dataset\n", 27 | "\n", 28 | "First, we need to download the [Quality and timing of crowd-based water level class observations](https://zenodo.org/records/3929547) dataset, using its URL. If you did this already in the tutorial on *downloading datasets from Zenodo*, you can skip this step." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import fairly\n", 38 | "\n", 39 | "# Create a Zenodo client\n", 40 | "zenodo = fairly.client(\"zenodo\")\n", 41 | "\n", 42 | "# Connect and download a dataset\n", 43 | "source_dataset = zenodo.get_dataset(\"https://zenodo.org/records/3929547\") \n", 44 | "source_dataset.store(\"./quality/\") " 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 2. Editing Metadata\n", 52 | "\n", 53 | "Now we can load the downloaded dataset and edit its metadata. For example, we can add a few more *keywords* and edit the *license*." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "{'access_type': 'open', 'authors': [Person({'fullname': 'Etter, Simon', 'institution': 'University of Zurich, Department of Geography', 'name': 'Simon', 'orcid_id': '0000-0002-7553-9102', 'surname': 'Etter'}), Person({'fullname': 'Strobl, Barbara', 'institution': 'University of Zurich, Department of Geography', 'name': 'Barbara', 'orcid_id': '0000-0001-5530-4632', 'surname': 'Strobl'}), Person({'fullname': 'Seibert, Jan', 'institution': 'University of Zurich, Department of Geography', 'name': 'Jan', 'orcid_id': '0000-0002-6314-2124', 'surname': 'Seibert'}), Person({'fullname': 'van Meerveld, Ilja (H.J.)', 'institution': 'University of Zurich, Department of Geography', 'name': 'Ilja (H.J.)', 'orcid_id': '0000-0002-7547-3270', 'surname': 'van Meerveld'})], 'description': '
This are the data and the R-scripts used for the manuscript "Quality and timing of crowd-based water level class observations" accepted for publication in the journal Hydrological Processes in July 2020 as a Scientific Briefing. To run the code, just run the R-script with the name "RunThisForResults.R". Results will be written to the "Figures" and the "Results" folder.
', 'doi': '10.5281/zenodo.3929547', 'grants': ['10.13039/501100001711::200021_163008'], 'keywords': ['CrowdWater', 'Hydrology'], 'language': 'eng', 'license': 'CC-BY-4.0', 'prereserve_doi': {'doi': '10.5281/zenodo.3929547', 'recid': 3929547}, 'publication_date': '2020-02-20', 'related_identifiers': [{'identifier': '10.5281/zenodo.3676350', 'relation': 'isVersionOf', 'scheme': 'doi'}], 'title': 'Data and R-Scripts for \"Quality and timing of crowd-based water level class observations\"', 'type': 'dataset', 'version': '2', 'zenodo_id': {'id': '3929547'}}\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "import fairly\n", 71 | "\n", 72 | "# Load a previously downloaded dataset by passing its path\n", 73 | "local_dataset = fairly.dataset(\"./quality/\")\n", 74 | "\n", 75 | "# Display the metadata\n", 76 | "print(local_dataset.metadata)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# Edit keywords\n", 86 | "local_dataset.metadata[\"keywords\"] = [\"CrowdWater\", \"Hydrology\", \"made by fairly\"]\n", 87 | "\n", 88 | "# Edit the license name to match what is required by 4TU.ResearchData\n", 89 | "local_dataset.metadata[\"license\"] = \"CC BY 4.0\"" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## 3. Archive to 4TU.ResearchData\n", 97 | "Now we can create a new dataset in a 4TU.ResearchData account. We assume a **personal access token** has already been added to `~/.fairly/config.json` " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "DataForUploadToZenodo.zip, 26765942/10485760\n", 110 | "DataForUploadToZenodo.zip, 26765942/20971520\n", 111 | "DataForUploadToZenodo.zip, 26765942/26765942\n" 112 | ] 113 | }, 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "local_dataset.upload(\"figshare\", notify=fairly.notify)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "> We could continue uploading files or editing the metadata in a similar way. For now, **publishing** the dataset should be done via the web interface of 4TU.ResearchData." 
134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3 (ipykernel)", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.10.4" 154 | }, 155 | "vscode": { 156 | "interpreter": { 157 | "hash": "262683ceb590c1664a72ae4b5fb24aafe692d2539af9aafb5e1323673742110e" 158 | } 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 4 163 | } 164 | -------------------------------------------------------------------------------- /docs/package/demo-4tu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "441a22a6-7527-48c3-951e-7cab0937707c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Download Datasets from 4TU.ResearchData\n", 9 | "\n", 10 | "**fairly** can download public datasets from 4TU.ResearchData.\n", 11 | "The *4TU.ResearchData* repository uses Figshare as a platform for managing research datasets. For this example, we will use the dataset [EDoM measurement campaign](https://data.4tu.nl/articles/dataset/EDoM_measurement_campaign_full_data_from_the_lower_Ems_River/20308263). This dataset contains 28 files of different types (`.txt`, `.pdf`), and it is about `278 MB` in size. \n", 12 | "\n", 13 | "We can fetch a dataset using either its ID or its URL. For 4TU.ResearchData, the dataset ID is the last part of the URL that appears in the web browser. The dataset has ID: `20308263`\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "9b1a66e5", 19 | "metadata": {}, 20 | "source": [ 21 | "## 1. Connect to 4TU.ResearchData\n", 22 | "To connect to data repositories, we use clients. A client manages the connection to a specific data repository. We can create a client to connect to 4TU.ResearchData as follows:" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "id": "3ddbd026-62e2-4a2c-a62e-127f06a4b0f3", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import fairly \n", 33 | "\n", 34 | "fourtu = fairly.client(\"4tu\") " 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "f088481e", 40 | "metadata": {}, 41 | "source": [ 42 | "## 2. Connect to a dataset\n", 43 | "\n", 44 | "Now, we can connect to a *public* dataset by calling the `get_dataset()` method and using the dataset ID, its URL, or its DOI." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "id": "075a2d23-85ee-4415-bd53-888e11627f61", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Using ID\n", 55 | "# dataset = fourtu.get_dataset(\"20308263\") \n", 56 | "\n", 57 | "# Using URL address\n", 58 | "dataset = fourtu.get_dataset(\"https://data.4tu.nl/articles/dataset/EDoM_measurement_campaign_full_data_from_the_lower_Ems_River/20308263\") \n", 59 | "\n", 60 | "# Using DOI\n", 61 | "# fairly has a convenience method that infers the client from DOI\n", 62 | "# dataset = fairly.dataset(\"https://doi.org/10.4121/19519618.v1\")" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "59c971ed", 68 | "metadata": {}, 69 | "source": [ 70 | "## 3. 
Explore dataset's metadata\n", 71 | "\n", 72 | "Once we have made a connection to a dataset, we can access its metadata as stored in the data repository by using the `metadata` property." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "id": "30023980", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "Metadata({'authors': [Person({'fullname': 'Bas van Maren', 'orcid_id': '0000-0001-5820-3212', 'figshare_id': 11844539}), Person({'fullname': 'Andreas Engels', 'figshare_id': 12901508})], 'keywords': ['Hydrodynamics', 'Sediment dynamics', 'Collection: The Ems-Dollard Measurement (EDoM) campaign'], 'description': '
A large amount of long term monitoring data collected during the Edom measurement campaign has been published in Net CDF as part of the collection \\'Edom measurements campaign: data from long-term monitoring\\' ( https://doi.org/10.4121/19519618.v1). This dataset provides the full subset of the long term mooring data (including oxygen and flow velocities) in ASCII text format, and only for the lower Ems River
', 'license': 'CC BY-NC-SA 4.0', 'title': 'EDoM measurement campaign: full data from the lower Ems River', 'doi': '10.4121/20308263.v1', 'type': 'dataset', 'access_type': 'open', 'custom_fields': {'Publisher': '4TU.ResearchData', 'Language': '', 'Time coverage': '2017-2019', 'Geolocation': 'Ems estuary', 'Geolocation Longitude': '7.04', 'Geolocation Latitude': '53.30', 'Format': 'ASCII text', 'Data Link': [], 'Derived From': [], 'Same As': [], 'Organizations': 'Niedersächsischer Landesbetrieb für Wasserwirtschaft Küsten (NLWKN);'}, 'embargo_type': 'file', 'categories': ['Physical Geography and Environmental Geoscience'], 'online_date': '2022-07-14T10:56:04', '4tu_id': {'id': '20308263', 'version': None}})" 85 | ] 86 | }, 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "# Retrieve metadata from the data repository\n", 94 | "dataset.metadata" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "2523d219", 100 | "metadata": {}, 101 | "source": [ 102 | "## 4. List dataset's files\n", 103 | "\n", 104 | "We can list the files of a dataset using the `files` property. The result is a Python dictionary where names of files become keys of the dictionary." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "id": "f9f51002", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "There are 28 files in this dataset\n", 118 | "{'CsEmspier_01052017-01052019_from_NLWKN.txt': 'CsEmspier_01052017-01052019_from_NLWKN.txt', 'CsGandesum_01052017-01052019_from_NLWKN.txt': 'CsGandesum_01052017-01052019_from_NLWKN.txt', 'CsKnock_01052017-01052019_from_NLWKN.txt': 'CsKnock_01052017-01052019_from_NLWKN.txt', 'CsMP1_01052017-01052019_from_WSV.txt': 'CsMP1_01052017-01052019_from_WSV.txt', 'CsPogum_01052017-01052019_from_NLWKN.txt': 'CsPogum_01052017-01052019_from_NLWKN.txt', 'CsTerborg_01052017-01052019_from_NLWKN.txt': 'CsTerborg_01052017-01052019_from_NLWKN.txt', 'Messung_Gewaesserguete_EMS_NLWKN.pdf': 'Messung_Gewaesserguete_EMS_NLWKN.pdf', 'O2Emspier_01052017-01052019_from_NLWKN.txt': 'O2Emspier_01052017-01052019_from_NLWKN.txt', 'O2Gandersum_01052017-01052019_from_NLWKN.txt': 'O2Gandersum_01052017-01052019_from_NLWKN.txt', 'O2Knock_01052017-01052019_from_NLWKN.txt': 'O2Knock_01052017-01052019_from_NLWKN.txt', 'O2MP1_01052017-01052019_from_WSV.txt': 'O2MP1_01052017-01052019_from_WSV.txt', 'O2Pogum_01052017-01052019_from_NLWKN.txt': 'O2Pogum_01052017-01052019_from_NLWKN.txt', 'O2Terborg_01052017-01052019_from_NLWKN.txt': 'O2Terborg_01052017-01052019_from_NLWKN.txt', 'Q_Versen_052017-052019.txt': 'Q_Versen_052017-052019.txt', 'readme.txt': 'readme.txt', 'SpEmspier_01052017-01052019_from_NLWKN.txt': 'SpEmspier_01052017-01052019_from_NLWKN.txt', 'SpGandersum_01052017-01052019_from_NLWKN.txt': 'SpGandersum_01052017-01052019_from_NLWKN.txt', 'SpKnock_01052017-01052019_from_NLWKN.txt': 'SpKnock_01052017-01052019_from_NLWKN.txt', 'SpMP1_01052017-01052019_from_WSV.txt': 'SpMP1_01052017-01052019_from_WSV.txt', 'SpPogum_01052017-01052019_from_NLWKN.txt': 'SpPogum_01052017-01052019_from_NLWKN.txt', 'SpTerborg_01052017-01052019_from_NLWKN.txt': 'SpTerborg_01052017-01052019_from_NLWKN.txt', 'U_Emden_01052017-01052019_from_WSA_Emden.txt': 'U_Emden_01052017-01052019_from_WSA_Emden.txt', 'U_Knock_01052017-01052019_from_WSA_Emden.txt': 'U_Knock_01052017-01052019_from_WSA_Emden.txt', 'U_MP1_01052017-01052019_from_WSV.txt': 
'U_MP1_01052017-01052019_from_WSV.txt', 'U_Terborg_01052017-01052019_from_WSA_Emden.txt': 'U_Terborg_01052017-01052019_from_WSA_Emden.txt', 'WL_Emden_01052017-01052019_from_WSA_Emden.txt': 'WL_Emden_01052017-01052019_from_WSA_Emden.txt', 'WL_Knock_01052017-01052019_from_WSA_Emden.txt': 'WL_Knock_01052017-01052019_from_WSA_Emden.txt', 'WL_Terborg_01052017-01052019_from_WSA_Emden.txt': 'WL_Terborg_01052017-01052019_from_WSA_Emden.txt'}\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# List files associated with the dataset\n", 124 | "files = dataset.files\n", 125 | "\n", 126 | "print(\"There are\", len(files), \"files in this dataset\")\n", 127 | "\n", 128 | "print(files)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "39c30dce-9c82-4d7f-9151-8c65ef4fadfd", 134 | "metadata": {}, 135 | "source": [ 136 | "## 5. Download a file\n", 137 | "\n", 138 | "We can download a single file in a dataset by using its name. For example, this dataset contains a file with the name `'CsEmspier_01052017-01052019_from_NLWKN.txt'`. \n", 139 | "\n", 140 | "> The `path` parameter can be used to define where to store the file; otherwise, the file will be stored in the working directory.\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 5, 146 | "id": "71a25448", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "'CsEmspier_01052017-01052019_from_NLWKN.txt'" 153 | ] 154 | }, 155 | "execution_count": 5, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# Select a file from the dataset\n", 162 | "single_file = dataset.files['CsEmspier_01052017-01052019_from_NLWKN.txt']\n", 163 | "\n", 164 | "# Download the file\n", 165 | "fourtu.download_file(single_file)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "d7351589", 171 | "metadata": {}, 172 | "source": [ 173 | "## 6. Download a dataset\n", 174 | "\n", 175 | "We can download all files and metadata of a dataset using the `store()` function. We need to provide a `path` to a directory to store the dataset. If the directory does not exist, it will be created."
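,
    "\n",
    "> As a follow-up (a minimal sketch; `./demo` is the path used in the code cell below), `store()` returns a local dataset object, so the stored copy can later be reopened as a local dataset:\n",
    "\n",
    "```python\n",
    "# Sketch: reopen the stored copy as a local dataset\n",
    "local_dataset = fairly.dataset(\"./demo\")\n",
    "```\n"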
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "id": "506d536b-53f0-482a-95e3-1d37eb5c3676", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "# This will download about 278 MB\n", 197 | "dataset.store(\"./demo\")" 198 | ] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 3.10.4 ('venv': venv)", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.10.4" 218 | }, 219 | "vscode": { 220 | "interpreter": { 221 | "hash": "7d5fcea36288094484ea4026c704bd47a44b80f0a87a063450ae6a7b7e01ed32" 222 | } 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 5 227 | } 228 | -------------------------------------------------------------------------------- /docs/package/demo-zenodo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "441a22a6-7527-48c3-951e-7cab0937707c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Download Datasets from Zenodo\n", 9 | "\n", 10 | "*fairly* can also download public datasets from Zenodo.\n", 11 | "The *Zenodo* repository is a platform for managing research datasets. For this example, we will use the dataset [Quality and timing of crowd-based water level class observations](https://zenodo.org/records/3929547). This dataset is a single compressed file of type `.zip`, which contains several other files and directories, and it is about `27 MB` in size. \n", 12 | "\n", 13 | "In Zenodo, the ID of a dataset can be found by looking at its DOI; the ID is the last part of the DOI (a number). For example, the DOI for the second version of the dataset is `10.5281/zenodo.3929547`, therefore its ID is `3929547`. We can fetch a dataset using either its ID or its URL.\n", 14 | "\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "a88e8a7d", 20 | "metadata": {}, 21 | "source": [ 22 | "## 1. Connect to Zenodo\n", 23 | "To connect to data repositories we use clients. A client manages the connection to a specific data repository. We can create a client to connect to Zenodo as follows:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 7, 29 | "id": "3ddbd026-62e2-4a2c-a62e-127f06a4b0f3", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import fairly\n", 34 | "\n", 35 | "zenodo = fairly.client(id=\"zenodo\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "f088481e", 41 | "metadata": {}, 42 | "source": [ 43 | "## 2. Connect to a dataset\n", 44 | "Now, we can connect to a *public* dataset by calling the `get_dataset()` method and using either the dataset ID or its URL."
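,
    "\n",
    "> As an aside — a sketch based on the Python API tutorial elsewhere in these docs, so treat the DOI form for Zenodo as an assumption — a dataset can also be opened directly from its DOI with the top-level helper:\n",
    "\n",
    "```python\n",
    "import fairly\n",
    "\n",
    "# Sketch: open the dataset via its DOI instead of a client\n",
    "dataset = fairly.dataset(\"doi:10.5281/zenodo.3929547\")\n",
    "```\n"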
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 8, 50 | "id": "075a2d23-85ee-4415-bd53-888e11627f61", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# USING ID\n", 55 | "dataset = zenodo.get_dataset(\"3929547\") \n", 56 | "\n", 57 | "# USING URL\n", 58 | "dataset = zenodo.get_dataset(\"https://zenodo.org/records/3929547\") " 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "59c971ed", 64 | "metadata": {}, 65 | "source": [ 66 | "## 3. Explore dataset's metadata\n", 67 | "\n", 68 | "Once we have made a connection to a dataset, we can access its metadata (as stored in the data repository) by calling the `metadata` property of a dataset. " 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 9, 74 | "id": "30023980", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "Metadata({'type': 'dataset', 'publication_date': '2020-02-20', 'title': 'Data and R-Scripts for \"Quality and timing of crowd-based water level class observations\"', 'authors': [Person({'fullname': 'Etter, Simon', 'institution': 'University of Zurich, Department of Geography', 'orcid_id': '0000-0002-7553-9102', 'name': 'Simon', 'surname': 'Etter'}), Person({'fullname': 'Strobl, Barbara', 'institution': 'University of Zurich, Department of Geography', 'orcid_id': '0000-0001-5530-4632', 'name': 'Barbara', 'surname': 'Strobl'}), Person({'fullname': 'Seibert, Jan', 'institution': 'University of Zurich, Department of Geography', 'orcid_id': '0000-0002-6314-2124', 'name': 'Jan', 'surname': 'Seibert'}), Person({'fullname': 'van Meerveld, Ilja (H.J.)', 'institution': 'University of Zurich, Department of Geography', 'orcid_id': '0000-0002-7547-3270', 'name': 'Ilja (H.J.)', 'surname': 'van Meerveld'})], 'description': '

This are the data and the R-scripts used for the manuscript "Quality and timing of crowd-based water level class observations" accepted for publication in the journal Hydrological Processes in July 2020 as a Scientific Briefing. To run the code, just run the R-script with the name "RunThisForResults.R". Results will be written to the "Figures" and the "Results" folder.

', 'access_type': 'open', 'license': 'CC-BY-4.0', 'doi': '10.5281/zenodo.3929547', 'keywords': ['CrowdWater', 'Hydrology'], 'zenodo_id': {'id': '3929547'}, 'prereserve_doi': {'doi': '10.5281/zenodo.3929547', 'recid': 3929547}, 'related_identifiers': [{'identifier': '10.5281/zenodo.3676350', 'relation': 'isVersionOf', 'scheme': 'doi'}], 'grants': ['10.13039/501100001711::200021_163008'], 'version': '2', 'language': 'eng'})" 81 | ] 82 | }, 83 | "execution_count": 9, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "# Retrieve metadata from the data repository\n", 90 | "dataset.metadata" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "2523d219", 96 | "metadata": {}, 97 | "source": [ 98 | "## 4. List dataset's files\n", 99 | "\n", 100 | "We can list the files of a dataset using the `files` property. The result is a Python dictionary where the names of files are the keys. In this case, the dataset contains only one file." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 10, 106 | "id": "f9f51002", 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "There are 1 files in this dataset\n", 114 | "{'DataForUploadToZenodo.zip': 'DataForUploadToZenodo.zip'}\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "# List files (data) associated with the dataset\n", 120 | "files = dataset.files\n", 121 | "\n", 122 | "print(\"There are\", len(files), \"files in this dataset\")\n", 123 | "\n", 124 | "print(files)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "5fed2262", 130 | "metadata": {}, 131 | "source": [ 132 | "## 5. Download a file\n", 133 | "\n", 134 | "We can download a file in the dataset by using its name, for example `'DataForUploadToZenodo.zip'`. \n", 135 | "\n", 136 | "> The `path` parameter can be used to define where to store the file; otherwise, the file will be stored in the working directory.\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "id": "71a25448", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "'DataForUploadToZenodo.zip'" 149 | ] 150 | }, 151 | "execution_count": 5, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "# Select a file to download from the dataset\n", 158 | "single_file = dataset.files['DataForUploadToZenodo.zip']\n", 159 | "\n", 160 | "# Download the file\n", 161 | "zenodo.download_file(single_file, path=\"./from-zenodo\")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "d7351589", 167 | "metadata": {}, 168 | "source": [ 169 | "## 6. Download a dataset\n", 170 | "\n", 171 | "We can also download all files and metadata of a dataset using the `store()` function. We need to provide a path to a directory to store the dataset. If the directory does not exist, it will be created."
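,
    "\n",
    "> Since this dataset is a single `.zip` archive, the `extract` option mentioned in the code cell below can be used to unzip it after downloading — a minimal sketch:\n",
    "\n",
    "```python\n",
    "# Sketch: store the dataset and extract the downloaded archive\n",
    "local_dataset = dataset.store(\"./quality\", extract=True)\n",
    "```\n"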
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 11, 177 | "id": "506d536b-53f0-482a-95e3-1d37eb5c3676", 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "" 184 | ] 185 | }, 186 | "execution_count": 11, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# This will download about 27 MB\n", 193 | "dataset.store(\"./quality\") # use extract=True for unzipping\n" 194 | ] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3.10.4 ('venv': venv)", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.10.4" 214 | }, 215 | "vscode": { 216 | "interpreter": { 217 | "hash": "7d5fcea36288094484ea4026c704bd47a44b80f0a87a063450ae6a7b7e01ed32" 218 | } 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 5 223 | } 224 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-copybutton==0.5.0 2 | fairly 3 | nbsphinx>=0.9.3 4 | sphinx-rtd-theme==1.3.0 -------------------------------------------------------------------------------- /docs/tutorials/cli.rst: -------------------------------------------------------------------------------- 1 | Using the CLI 2 | ===================== 3 | 4 | This tutorial shows how to use the *fairly* Command Line Interface (CLI) to clone and create datasets, and to edit their metadata. 5 | 6 | .. important:: 7 | **Windows Users.** For the following to work, you need Python in the PATH environment variable on Windows. If you're not sure that is the case, open the shell and type :code:`python --version`. You should see the version of Python on the screen. If you see otherwise, follow these steps to `add Python to the PATH on Windows `_ 8 | 9 | 1. Open a *Terminal* or *Shell* 10 | 11 | 2. Test that the *fairly* CLI is accessible in your terminal by calling the help command: 13 | 14 | .. code:: shell 15 | 16 | fairly --help 17 | 18 | You should see the following: 19 | 20 | .. code:: shell 21 | 22 | Usage: fairly [OPTIONS] COMMAND [ARGS]... 23 | 24 | ╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ 25 | │ --install-completion [bash|zsh|fish|powershell|pwsh] Install completion for the specified shell. [default: None] │ 26 | │ --show-completion [bash|zsh|fish|powershell|pwsh] Show completion for the specified shell, to copy it or customize │ 27 | │ the installation. │ 28 | │ [default: None] │ 29 | │ --help Show this message and exit. │ 30 | ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ 31 | ╭─ Commands ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ 32 | │ config │ 33 | │ dataset │ 34 | │ list-repos List all repositories supported by fairly │ 35 | │ list-user-datasets List all datasets in the specified repository by doi, title, and publication_date │ 36 | ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ 37 | 38 | Cloning a Dataset 39 | -------------------- 40 | 41 | 1. Create a new directory and subdirectory :code:`workshop/clone` 42 | 43 | .. code:: shell 44 | 45 | # On Windows 46 | mkdir workshop 47 | mkdir workshop\clone 48 | 49 | # On Linux/MacOS 50 | mkdir -p workshop/clone 51 | 52 | 2. Go to the :code:`clone` directory 53 | 54 | .. code:: shell 55 | 56 | # On Windows 57 | cd workshop\clone 58 | 59 | # On Linux/MacOS 60 | cd workshop/clone 61 | 62 | 3. Clone this `Zenodo dataset `_, using its URL: 63 | 64 | .. code:: shell 65 | 66 | fairly dataset clone --url https://zenodo.org/records/7748718#.ZBo1SNLMJhF 67 | 68 | 4. Explore the content of the dataset; notice that the file(s) of the dataset have been downloaded and that its metadata is in the :code:`manifest.yaml` file. 69 | 70 | .. code:: shell 71 | 72 | manifest.yaml Trixi.jl-v0.5.14.zip 73 | 74 | 75 | Creating a Local fairly Dataset 76 | -------------------------------------- 77 | 78 | We can use the CLI to initialize a new dataset. 79 | 80 | 1. Create a new directory called :code:`mydataset-cli` inside the *workshop* directory. Then move into the directory 81 | 82 | .. code:: shell 83 | 84 | # On Windows/Linux/MacOS 85 | mkdir mydataset-cli 86 | cd mydataset-cli 87 | 88 | 2. Create a local dataset using the Zenodo metadata template, as follows 89 | 90 | .. code:: shell 91 | 92 | fairly dataset create zenodo 93 | 94 | 95 | Include Files in your Dataset 96 | '''''''''''''''''''''''''''''''' 97 | 98 | Add some folders and files to the :code:`mydataset-cli` directory. You can do this using the file explorer/browser. You can add files of your own, but be careful not to include anything that you want to keep confidential. Also consider the total size of the files you will add; the larger the size, the longer the upload will take. Also remember that for the current Zenodo API each file should be :code:`100MB` or smaller; this will change in the future. 99 | 100 | If you do not want to use files of your own, you can download and use the `dummy-data `_ 101 | 102 | Editing the Manifest 103 | '''''''''''''''''''''' 104 | 105 | The :code:`manifest.yaml` file contains several sections to describe the metadata of a dataset. Some of the sections and fields are compulsory (they are required by the data repository), others are optional. In this example, you started a *fairly* dataset using the template for the Zenodo repository, but you could also do so for 4TU.ResearchData. 106 | 107 | However, if you are not sure which repository you will use to publish a dataset, use the :guilabel:`default` option. This template contains the most common sections and fields for the repositories supported by *fairly*. 108 | 109 | .. tip:: 110 | Independently of which template you use to start a dataset, the :code:`manifest.yaml` file is interoperable between data repositories, with very few exceptions.
This means that you can use the same manifest file for various data repositories. Different templates are provided only as a guide to indicate what metadata is more relevant for each data repository. 111 | 112 | 113 | 1. Open the :code:`manifest.yaml` using a text editor. On Linux/MacOS you can use **nano** or **vim**. On Windows, use **Notepad** 114 | 115 | 2. Substitute the content of the :code:`manifest.yaml` with the text below. *Here, we use only a small set of fields that are possible for Zenodo.* 116 | 117 | .. code-block:: yaml 118 | 119 | metadata: 120 | type: dataset 121 | publication_date: '2023-03-22' 122 | title: My Title CLI 123 | authors: 124 | - fullname: Surname, FirstName 125 | affiliation: Your institution 126 | description: A dataset from the Fairly Toolset workshop 127 | access_type: open 128 | license: CC0-1.0 129 | doi: '' 130 | prereserve_doi: 131 | keywords: 132 | - workshop 133 | - dummy data 134 | notes: '' 135 | related_identifiers: [] 136 | communities: [] 137 | grants: [] 138 | subjects: [] 139 | version: 1.0.0 140 | language: eng 141 | template: zenodo 142 | files: 143 | includes: 144 | - ARP1_.info 145 | - ARP1_d01.zip 146 | - my_code.py 147 | - Survey_AI.csv 148 | - wind-mill.jpg 149 | excludes: [] 150 | 151 | 152 | 3. Edit the dataset metadata by typing the information you want to add. For example, you can change the title, authors, description, etc. Save the file when you are done. 153 | 154 | .. important:: 155 | * The :code:`includes` field must list the files and directories (folders) you want to include as part of the dataset. *Included files and directories will be uploaded to the data repository* 156 | * The :code:`excludes` field can be used for explicitly indicating what files or directories you **don't want to be part of the dataset**, for example, files that contain sensitive information. Excluded files and directories will never be uploaded to the data repository. 157 | * Files and directories that are not listed in either :code:`includes` or :code:`excludes` will be ignored by *fairly*. 158 | 159 | 160 | Upload Dataset to Data Repository 161 | ----------------------------------- 162 | 163 | Here, we explain how to upload a dataset to an existing account in Zenodo. If you do not have an account yet, you can `sign up in this webpage. `_ 164 | 165 | For this, you first need to :ref:`create-token` and register it manually or :ref:`via JupyterLab `. 166 | 167 | Upload Dataset 168 | '''''''''''''''' 169 | 170 | 1. On the terminal or command prompt, type: 171 | 172 | .. code:: shell 173 | 174 | fairly dataset upload zenodo 175 | 176 | 2. Go to your Zenodo account and click on :guilabel:`Upload`. The `My Title CLI` dataset should be there. 177 | 178 | .. image:: ../img/zenodo-cli-upload.png 179 | 180 | 181 | Explore the dataset and notice that all the files and metadata you added have been automatically added to the new dataset. You should also notice that the dataset is not **published**; this is on purpose. This gives you the opportunity to review the dataset before deciding to publish it, and to make changes if necessary. In this way, we also prevent users from publishing datasets by mistake. 182 | 183 | .. note:: 184 | If you try to upload the dataset again, you will get an error message. This is because the dataset already exists in Zenodo. You can see this reflected in the :code:`manifest.yaml` file; the section :code:`remotes:` is added to the file after successfully uploading a dataset. It lists the names and ids of the repositories where the dataset has been uploaded.
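    As a purely illustrative sketch (the exact layout and identifier are assumptions; the id is assigned by the repository and will differ for your dataset), the added section may look roughly like:

    .. code-block:: yaml

       remotes:
         zenodo: '1234567'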
185 | In the future, we will add a feature to allow users to update and sync datasets between repositories. 186 | -------------------------------------------------------------------------------- /docs/tutorials/jupyterlab.rst: -------------------------------------------------------------------------------- 1 | Using the JupyterLab Extension 2 | ============================== 3 | 4 | This tutorial shows how to use the JupyterLab extension to clone and create research datasets using the graphical interface of JupyterLab, and how to upload datasets to popular research data repositories. 5 | 6 | If you have not done so, :ref:`install the full toolset. ` 7 | 8 | Start JupyterLab 9 | ------------------ 10 | 11 | Start JupyterLab with the **jupyter-fairly** extension. This will start JupyterLab in your browser. 12 | 13 | Windows 14 | '''''''' 15 | 16 | You will use the Shell Terminal to start JupyterLab. 17 | 18 | .. important:: 19 | For the following to work, you need Python in the PATH environment variable on Windows. If you are not sure that is the case, open the shell and type :code:`python --version`. You should see the version of Python on the screen. If you see otherwise, follow these steps to `add Python to the PATH on Windows `_ 20 | 21 | On the shell, type the following and press `Enter`: 22 | 23 | .. code-block:: shell 24 | 25 | jupyter lab 26 | 27 | Linux / MacOS 28 | '''''''''''''''' 29 | 30 | From the terminal, run: 31 | 32 | .. code-block:: shell 33 | 34 | jupyter lab 35 | 36 | 37 | JupyterLab should automatically start in your browser. 38 | 39 | .. image:: ../img/start-jupyterlab.png 40 | 41 | 42 | Part 1: Cloning Datasets 43 | ---------------------------- 44 | 45 | Public research datasets can be cloned (copied and downloaded) directly to an empty directory, using the dataset's **URL** or **DOI**. We will use `this dataset `_ from 4TU.ResearchData as an example. 46 | 47 | These are other datasets that you can try: 48 | 49 | * https://zenodo.org/records/4302600 50 | * https://zenodo.org/records/8273524 51 | 52 | 53 | Using the JupyterLab interface, create a new directory called :code:`workshop`. *Notice that the content of your main directory may be different.* 54 | 55 | .. image:: ../img/create-directory.png 56 | 57 | 1. Inside the workshop directory, create a new directory called :code:`clone` 58 | 2. Right-click on the left panel to open the context menu 59 | 3. Click on :guilabel:`Clone Dataset` 60 | 4. Copy and paste the URL for the example dataset on the dialog window 61 | 5. Click :guilabel:`Clone` 62 | 63 | .. image:: ../img/clone1.png 64 | 65 | .. image:: ../img/clone2.png 66 | 67 | A notification on the bottom-right corner will let you know when the *cloning* is complete, and you should see a list of files on JupyterLab. All the files, except for :code:`manifest.yaml`, are files that belong to the dataset in the research repository. The file :code:`manifest.yaml` is automatically created by the Fairly Toolset, and it contains metadata from the research data repository, such as: 68 | 69 | - Authors 70 | - Keywords 71 | - License 72 | - DOI 73 | - Files in the dataset 74 | - etc. 75 | 76 | Part 2: Create a Fairly Dataset 77 | --------------------------------------------- 78 | 79 | Here, we show you how you can create and prepare your own dataset using the JupyterLab extension of *fairly*. 80 | 81 | 1. Create a new directory called :code:`mydataset` inside the *workshop directory*. 82 | 2. Inside :code:`workshop/mydataset/`, open the context menu and click on :guilabel:`Create Fairly Dataset` 83 | 3. Select :guilabel:`Zenodo` as the template from the drop-down list. *Notice that there are templates for other data repositories*. 84 | 4. Click :guilabel:`Create`. A :code:`manifest.yaml` file will be added to the :code:`mydataset` directory. This file contains a list of fields that you can edit to add metadata to your dataset. 85 | 86 | .. image:: ../img/create-dataset1.png 87 | .. image:: ../img/create-dataset2.png 88 | 89 | 90 | Include Files in your Dataset 91 | '''''''''''''''''''''''''''''''' 92 | 93 | Add some files to the :code:`mydataset` directory. You can add files of your own, but be careful not to include anything that you want to keep confidential. Also consider the size of the files you will add; the larger the size, the longer the upload will take. Also remember that for the current Zenodo API each file should be :code:`100MB` or smaller; this will change in the future. 94 | 95 | If you do not want to use your own files, you can download and use the `dummy-data `_ 96 | 97 | After you have added some files and/or folders to :code:`mydataset`, JupyterLab should look something like this: 98 | 99 | .. image:: ../img/my-dataset.png 100 | 101 | Editing the Manifest 102 | '''''''''''''''''''''' 103 | 104 | The :code:`manifest.yaml` file contains several sections to describe the metadata of a dataset. Some of the sections and fields are compulsory (they are required by the research data repository), others are optional. In this example, you started a *fairly* dataset using the template for the Zenodo repository, but you could also do so for 4TU.ResearchData. 105 | 106 | However, if you are not sure which repository you will use to publish a dataset, use the :guilabel:`Default` template. This template contains the most common sections and fields for the repositories supported by the Fairly Toolset. 107 | 108 | .. tip:: 109 | Independently of which template you use to start a dataset, the :code:`manifest.yaml` file is interoperable between data repositories, with very few exceptions. This means that you can use the same manifest file for various data repositories. Different templates are provided only as a guide to indicate what metadata is more relevant for each data repository. 110 | 111 | 1. Open the :code:`manifest.yaml` file using the context menu, or by double-clicking on the file 112 | 113 | .. image:: ../img/open-metadata.png 114 | 115 | 2. Substitute the content of the :code:`manifest.yaml` with the text below. *Here, we use only a small set of fields that are possible for Zenodo.* 116 | 117 | .. code-block:: yaml 118 | 119 | metadata: 120 | type: dataset 121 | publication_date: '2023-08-31' 122 | title: My Title 123 | authors: 124 | - fullname: Surname, FirstName 125 | affiliation: Your institution 126 | description: A dataset from the Fairly Toolset workshop 127 | access_type: open 128 | license: CC0-1.0 129 | doi: '' 130 | prereserve_doi: 131 | keywords: 132 | - fairly-toolset 133 | - tutorial 134 | - dummy data 135 | notes: '' 136 | related_identifiers: [] 137 | communities: [] 138 | grants: [] 139 | subjects: [] 140 | version: 1.0.0 141 | language: eng 142 | template: zenodo 143 | files: 144 | includes: 145 | - ARP1_.info 146 | - ARP1_d01.zip 147 | - my_code.py 148 | - Survey_AI.csv 149 | - wind-mill.jpg 150 | excludes: [] 151 | 152 | 3. Edit the dataset metadata by typing the information you want to add. For example, you can change the title, authors, description, etc.
Save the file when you are done. 153 | 154 | .. important:: 155 | * The :code:`includes` field must list the files and directories (folders) you want to include as part of the dataset. *Included files and directories will be uploaded to the data repository* 156 | * The :code:`excludes` field can be used for explicitly indicating what files or directories you **don't want to be part of the dataset**, for example, files that contain sensitive information. Excluded files and directories will never be uploaded to the data repository. 157 | * Files and directories that are not listed in either :code:`includes` or :code:`excludes` will be ignored by *fairly*. 158 | 159 | Part 3: Upload Dataset to Repository 160 | ------------------------------------- 161 | 162 | This part explains how to upload a dataset to an existing account in Zenodo. If you do not have an account yet, you can `sign up in this webpage. `_ 163 | 164 | .. _create-token: 165 | 166 | Create Personal Token 167 | '''''''''''''''''''''' 168 | 169 | A personal token is a way in which data repositories identify a user. We need to register a personal token for creating datasets in the repository and uploading files to a specific account. 170 | 171 | 1. Sign in to Zenodo. 172 | 2. On the top-right corner, click on the drop-down arrow, then :guilabel:`Applications`. 173 | 3. On the section :guilabel:`Personal access tokens`, click the :guilabel:`New token` button. 174 | 4. Enter a name for your token, for example: :code:`workshop` 175 | 5. For scopes, check all three boxes, and click :guilabel:`Create` 176 | 6. Copy the token (list of characters in red) to somewhere secure. **You will only see the token once.** 177 | 7. Under :guilabel:`Scopes`, check all three boxes once more. Then click :guilabel:`Save` 178 | 179 | .. image:: ../img/zenodo-token.png 180 | 181 | .. _configuring-fairly: 182 | 183 | Register Personal Token 184 | '''''''''''''''''''''''''''''''' 185 | 186 | To register a personal token with the Fairly Toolset, do the following in JupyterLab: 187 | 188 | 1. Open the :guilabel:`Fairly` menu on the top menu bar, and click on :guilabel:`Add Repository Token` 189 | 2. Select :guilabel:`Zenodo` from the drop-down list. 190 | 3. Paste the token you copied from Zenodo in the previous step. 191 | 4. Click :guilabel:`Add Token` 192 | 193 | .. important:: 194 | * You can register tokens for other repositories supported by *fairly* in the same way. Tokens added in this way are global, and will be used by the JupyterLab extension, the Python package and the CLI. 195 | * Tokens are stored in a file called :code:`config.json` in your user home directory. This file is created automatically by *fairly* when you register a token. For Windows the file is located in :code:`C:\\Users\\\\.fairly\\config.json`, and for Linux/MacOS in :code:`~/.fairly/config.json`. 196 | * To **update a token**, simply register a new token with the same name. The old token will be replaced by the new one. To **remove a token**, simply repeat the process, but type a random character in the token field. 197 |
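For reference, the token store is plain JSON. Judging from how the CLI reads it in :code:`src/fairly/cli/config.py` (it looks up :code:`config[id]["token"]`), a minimal sketch of the file might look like this — the token value below is a placeholder:

.. code-block:: json

   {
       "zenodo": {
           "token": "your-personal-token"
       }
   }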
198 | .. warning:: 199 | If you are using the Fairly Toolset on a shared computer, make sure that you **remove your tokens** from the JupyterLab extension. Otherwise, other users of the computer will be able to use your token to create datasets in your account. 200 | 201 | .. note:: 202 | Windows users might need to re-start JupyterLab for the tokens to work correctly when uploading datasets. 203 | 204 | Upload Dataset 205 | '''''''''''''''' 206 | 207 | 1. Right-click on the left panel, then click :guilabel:`Upload Dataset` 208 | 2. Select Zenodo from the drop-down list, and click :guilabel:`Continue` 209 | 3. Confirm that you want to upload the dataset to Zenodo by ticking the checkbox. 210 | 4. Click :guilabel:`OK`. A notification on the bottom-right corner will let you know that the upload is in progress and when it is complete. 211 | 5. Go to your Zenodo account and click on :guilabel:`Upload`. The `my dataset` dataset should be there. 212 | 213 | .. image:: ../img/zenodo-upload.png 214 | 215 | Explore the dataset and notice that all the files and metadata you added in JupyterLab have been automatically added to the new dataset. You should also notice that the dataset is not **published**; this is on purpose. This gives you the opportunity to review the dataset before deciding to publish it, and to make changes if necessary. In this way, we also prevent users from publishing datasets by mistake. 216 | 217 | .. note:: 218 | If you try to upload the dataset again, you will get an error message. This is because the dataset already exists in Zenodo. You can see this reflected in the :code:`manifest.yaml` file; the section :code:`remotes:` is added to the file after successfully uploading a dataset. It lists the names and ids of the repositories where the dataset has been uploaded. 219 | In the future, we will add a feature to allow users to update and sync datasets between repositories. 220 | 221 | 222 | Part 4: Pushing Changes to Data Repository 223 | -------------------------------------------- 224 | 225 | In the last part of this tutorial, we will show you how to push changes to a dataset that has already been uploaded to a data repository. For this, we will use the dataset we created in the previous part. 226 | 227 | 228 | .. attention:: 229 | 230 | To be able to push updates to an existing dataset in a repository, you need to have write access to the dataset. For most of the repositories this requires you to be the **owner** of the dataset. Most data repositories prevent updates if a dataset is "published" (i.e. editing is limited to datasets that are not yet published). 231 | 232 | You can make changes to the files in a local dataset as you would normally do. For example, you can add new files, edit existing files, or delete files. You can also edit the :code:`manifest.yaml` file to update the metadata of the dataset. 233 | If file inclusion or exclusion rules are defined using patterns (e.g. `'*.txt'`), then the extension automatically identifies added, removed, or modified files. 234 | Otherwise, you need to explicitly indicate what needs to be *included* or *excluded* by updating the :code:`includes` and :code:`excludes` fields in the :code:`manifest.yaml` file. 235 | 236 | 237 | .. image:: ../img/add-filles.png 238 | 239 | Once you have made and **saved** the changes, you can do the following to upload the changes to the data repository. 240 | 241 | 1. Right-click on the left panel, 242 | 2. click the :guilabel:`Push` option from the list, 243 | 3. confirm that you want to push the changes and click the :guilabel:`Push` button. A notification on the bottom-right corner will let you know that changes are in progress and when they are completed. 244 | 245 | .. image:: ../img/push-menu.png 246 | 247 | .. image:: ../img/push-confirm.png 248 | 249 |
250 | .. tip:: 251 | 252 | To push changes to a dataset that you own but did not create using the Fairly Toolset, all you have to do is clone it first using the :guilabel:`Clone Dataset` option from the context menu. 253 | -------------------------------------------------------------------------------- /docs/tutorials/python-api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using the Python API\n", 9 | "\n", 10 | "In this tutorial you will learn how to use *fairly* as a Python package to clone, create and upload datasets to research data repositories.\n", 11 | "\n", 12 | "If you have not done so, [install the fairly package.](../installation.rst)\n", 13 | "\n", 14 | "## Cloning a dataset\n", 15 | "\n", 16 | "The Python API provides the flexibility to explore the metadata of a `remote dataset` before downloading it. A `remote` dataset is any dataset which is not stored locally. \n", 17 | "\n", 18 | "1. In a Python script, import the `fairly` package and open a remote dataset:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import fairly\n", 28 | "\n", 29 | "# Open a remote dataset\n", 30 | "dataset = fairly.dataset(\"doi:10.4121/21588096.v1\")\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "2. You can now explore the metadata of the dataset as follows:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "{'id': '21588096', 'version': '1'}" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "dataset.id" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "'https://data.4tu.nl/datasets/a37120e2-96db-48e4-bd65-a54b970bc4fe/1'" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "dataset.url" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "33339\n" 90 | ] 91 | }, 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "6" 96 | ] 97 | }, 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "print(dataset.size)\n", 105 | "\n", 106 | "# number of files\n", 107 | "len(dataset.files)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "Metadata({'authors': [Person({'fullname': 'Stefan Nielsen', 'orcid_id': '0000-0002-9214-2932', 'figshare_id': 12882551})], 'keywords': ['Earthquakes', 'artificial neural network', 'precursor'], 'description': '

These are the accuracy results for the whole dataset A and B together. This is a second batch (2/2) of cycles where network was trained, tested and verified 50 times with different combinations of test, train and verification groups. There is a first batch of 50 in a separate file

', 'license': {'id': 2, 'name': 'CC0', 'url': 'https://creativecommons.org/publicdomain/zero/1.0/'}, 'title': 'Earthquake Precursors detected by convolutional neural network', 'doi': '10.4121/21588096.v1', 'type': 'dataset', 'access_type': 'open', 'custom_fields': {'Time coverage': '2012-2022', 'Publisher': '4TU.ResearchData', 'Organizations': 'University of Durham, Department of Earth Sciences.', 'Geolocation Longitude': '138.204', 'Geolocation Latitude': '36.546', 'Geolocation': 'Japan and surrounding area', 'Format': '*.py, *.csv, *.txt'}, 'categories': [13555], 'online_date': '2022-11-24T07:50:39'})" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# complete metadata\n", 128 | "dataset.metadata" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "3. You can store the dataset's files and metadata in a local directory (i.e. clone the dataset) as follows. The directory will be created if it does not exist." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# store dataset locally (i.e. clone dataset)\n", 145 | "local_dataset = dataset.store(\"./cloned-dataset\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## Creating a local dataset\n", 153 | "\n", 154 | "A `local dataset` is a dataset which is stored locally. When creating our own dataset, we work with a local dataset.\n", 155 | "\n", 156 | "1. Initialize a new dataset: " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 2, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "import fairly\n", 166 | "\n", 167 | "# Initialize a local dataset\n", 168 | "dataset = fairly.init_dataset(\"./local-dataset\") # path is created if it does not exist" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "2. Set the dataset's metadata attributes by passing a list of attribute names and values to a local dataset:" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 9, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "dataset.set_metadata(\n", 185 | "    title=\"My first dataset\",\n", 186 | "    keywords=[ \"fairly\", \"python\", \"api\" ],\n", 187 | "    authors=[ \"0000-0002-0516-185X\",\n", 188 | "        { \"name\": \"Jane\", \"surname\": \"Doe\" }\n", 189 | "    ],\n", 190 | ")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Metadata attributes can also be set one by one as follows\n", 200 | "dataset.metadata[\"license\"] = \"CC-BY-4.0\"" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "3. Add files and folders to the dataset:" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 11, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "dataset.includes.extend([ \n", 217 | "    \"README\", \n", 218 | "    \"*.csv\",\n", 219 | "    \"train/*.jpg\" \n", 220 | "])" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "4. To save the dataset's attributes to the `manifest.yaml` file, we must call the `save()` method:" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 12, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# Save changes and update manifest.yaml\n", 237 | "dataset.save()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Uploading a dataset\n", 245 | "\n", 246 | "To upload a dataset to a research data repository, we must first register an access token for an account in the data repository. Check the tutorial on the [JupyterLab extension](./jupyterlab.rst) to learn how to register an access token.\n", 247 | "\n", 248 | "Once you have registered an access token, you can upload a dataset with a single command:" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# Upload dataset to data repository\n", 258 | "remote_dataset = dataset.upload('zenodo')" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Pushing changes to a data repository\n", 266 | "\n", 267 | "After uploading a dataset to a data repository, you can use the `push` command to push changes to the dataset's metadata and files and update the data repository. The `push` method automatically finds the remote version of a dataset from the information available in the *manifest* file. It also updates the remote metadata, if any metadata fields are modified locally.\n", 268 | "\n", 269 | "> To be able to push updates to an existing dataset in a repository, you need to have write access to the dataset. For most of the repositories this requires you to be the owner of the dataset.\n", 270 | "> Most data repositories prevent updates if a dataset is \"published\" (i.e. editing is limited to datasets that are not yet published).\n", 271 | "\n", 272 | "### Changing metadata in a dataset\n", 273 | "\n", 274 | "For example, to update the *title* of a dataset for which you have a local copy, you can do the following:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 4, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "ds = fairly.dataset(\"./local-dataset\")\n", 284 | "ds.metadata[\"title\"] = \"New title\"\n", 285 | "ds.save_metadata() # save changes to manifest.yaml\n", 286 | "\n", 287 | "ds.push() # push changes to data repository to update an existing dataset" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Changing files in a dataset\n", 295 | "\n", 296 | "You can add, remove, or modify files in a local dataset as you wish. If file inclusion or exclusion rules are defined using patterns (e.g. `'*.txt'`), then fairly automatically identifies added, removed, or modified files. Otherwise, you need to explicitly indicate what needs to be *included* or *excluded*. Use the `includes.append` and `excludes.append` methods to do so."
297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 6, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "# include a new file or directory\n", 306 | "ds.includes.append(\"new file.txt\")\n", 307 | "\n", 308 | "# remove a file or directory\n", 309 | "ds.excludes.append(\"old file.txt\")\n", 310 | "\n", 311 | "ds.save() # save changes to manifest.yaml" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "Once the changes are saved to the *manifest file*, the remote version can be updated by calling the `push` method:" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "ds.push() # push changes to data repository " 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "To learn more about the Fairly Python API, check the [API reference](../api/fairly.rst)." 335 | ] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "venv", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.10.12" 355 | }, 356 | "orig_nbformat": 4 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 2 360 | } 361 | -------------------------------------------------------------------------------- /docs/tutorials/workshop.rst: -------------------------------------------------------------------------------- 1 | Fairly Toolset Workshop 2 | """""""""""""""""""""""" 3 | 4 | Go to `Resources for the workshop `_ 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "fairly" 7 | version = "1.0.1" 8 | description = "A package to create, publish, and download research datasets" 9 | readme = "README.rst" 10 | license = { file="LICENSE" } 11 | requires-python = ">=3.8" 12 | authors = [ 13 | { name="Serkan Girgin", email="s.girgin@utwente.nl" }, 14 | { name="Manuel Garcia Alvarez", email="m.g.garciaalvarez@tudelft.nl" }, 15 | { name="Jose Urra Llanusa", email="j.c.urrallanusa@tudelft.nl" }, 16 | ] 17 | classifiers = [ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", ] 21 | dependencies = [ 22 | "python-dateutil", 23 | "requests", 24 | "requests_toolbelt>=1.0.0", 25 | "ruamel.yaml>=0.17.26", 26 | "typer>=0.9.0", 27 | "rich" 28 | ] 29 | keywords = ["fairly", "open science", "research data", "data management"] 30 | 31 | [project.optional-dependencies] 32 | dev = [ 33 | "pytest", 34 | "pytest-cov", 35 | "python-dotenv", 36 | "build", 37 | "hatch" 38 | ] 39 | 40 | [project.urls] 41 | "Homepage" = "https://github.com/ITC-CRIB/fairly" 42 | "Bug Tracker" = "https://github.com/ITC-CRIB/fairly/issues" 43 | "Documentation" = "https://fairly.readthedocs.io" 44 | "Funding" = "https://nwo.nl/en/researchprogrammes/open-science/open-science-fund" 45 | 46 | [project.scripts] 47 | fairly = "fairly.cli:app" 48 | 
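# Note: the entry point above exposes the Typer application defined in
# src/fairly/cli/__init__.py as the `fairly` console command after installation.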
-------------------------------------------------------------------------------- /src/fairly/cli/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from ruamel.yaml import YAML 4 | import typer 5 | import fairly 6 | from fairly.cli import dataset 7 | from fairly.cli import config 8 | 9 | app = typer.Typer() 10 | app.add_typer(dataset.app, name="dataset") 11 | app.add_typer(config.app, name="config") 12 | 13 | @app.command() 14 | def list_repos(): 15 | '''List all repositories supported by fairly''' 16 | repositories = fairly.get_repositories() 17 | 18 | print("List of repositories to use with fairly:") 19 | 20 | for key in repositories: 21 | print("- " + key) 22 | 23 | @app.command() 25 | def list_user_datasets( 26 | repository: str = typer.Argument("", help="Repository name"), 27 | ) -> None: 28 | '''List all datasets in the specified repository by doi, title, and publication_date''' 29 | yaml = YAML() 30 | # Test the connection to the repository by listing account datasets 31 | try: 32 | client = fairly.client(repository) 33 | # Retrieve account datasets and print their publication date, title, and doi 34 | datasets = client.get_account_datasets() 35 | if len(datasets) == 0: 36 | print("There are no datasets under this account") 37 | else: 38 | print("\n") 39 | for ds in datasets: 40 | # get the dataset metadata 41 | metadata = ds.metadata 42 | item = {} 43 | for i in metadata: 44 | if i == "publication_date": item[i] = metadata[i] 45 | if i == 'title': item[i] = metadata[i] 46 | if i == 'doi': item[i] = metadata[i] 47 | 48 | # pretty print the list of datasets with yaml format 49 | yaml.dump(item, sys.stdout) 50 | print("------------------") 51 | 52 | except Exception as e: 53 | print(e) 54 | print("Please specify a repository name that is valid") 55 | return None 56 | 57 | if __name__ == "__main__": 58 | app() -------------------------------------------------------------------------------- /src/fairly/cli/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pprint 4 | import json 5 | 6 | from ruamel.yaml import YAML 7 | 8 | import typer 9 | import fairly 10 | 11 | pp = pprint.PrettyPrinter(indent=4) 12 | 13 | app = typer.Typer() 14 | 15 | CONFIG_FILE = os.path.expanduser("~/.fairly/config.json") 16 | 17 | 18 | # @app.command() 19 | def add( 20 | id: str = typer.Argument("", help="Repository ID"), 21 | ): 22 | '''Add a repository to the config file, 23 | 24 | fairly repository add --id --name --api-url --token 25 | 26 | Notice that this should only be allowed once there is a corresponding module 27 | for the repository. 28 | ''' 29 | raise NotImplementedError 30 | 31 | @app.command() 32 | def show( 33 | 34 | ): 35 | '''Show config details''' 36 | yaml = YAML() 37 | # expand user path 38 | print(f"You can edit the config file located at: {CONFIG_FILE}") 39 | 40 | print("FAIRLY CONFIG") 41 | print("--------------------") 42 | 43 | repos = fairly.get_repositories() 44 | yaml.dump(repos, sys.stdout) 45 | 46 | 47 | @app.command() 48 | def update_token( 49 | id: str = typer.Argument("", help="Repository ID"), 50 | token: str = typer.Argument("", help="Repository token") 51 | ): 52 | '''Update a repository token in the config file (~/.fairly/config.json)''' 53 | config = {} 54 | try: 55 | with open(CONFIG_FILE, 'r', encoding='utf-8') as f: 56 | config = json.loads(f.read()) 57 | 58 | # guard against repositories that are not present in the config file 59 | if id not in config: 60 | print(f"Repository {id} not found in config file") 61 | return 62 | 63 | # check if token is already set with the same value 64 | if config[id]["token"] == token: 65 | print(f"Token for repository {id} is already set to {token}") 66 | return 67 | 68 | else: config[id]["token"] = token 69 | 70 | with open(CONFIG_FILE, 'w', encoding='utf-8') as f: 71 | f.write(json.dumps(config, indent=4)) 72 | 73 | except FileNotFoundError: 74 | print(f"Config file not found at {CONFIG_FILE}") 75 | return 76 | 77 | 78 | # @app.command() 79 | def remove(): 80 | '''fairly repository remove ''' 81 | raise NotImplementedError 82 | 83 | if __name__ == "__main__": 84 | app() 85 | 86 | -------------------------------------------------------------------------------- /src/fairly/cli/dataset.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from rich.progress import Progress, SpinnerColumn, TextColumn 4 | 5 | import fairly 6 | 7 | 8 | app = typer.Typer(pretty_exceptions_show_locals=False) 9 | 10 | @app.command() 11 | def create( 12 | path: str = typer.Argument(help="Path where the dataset will be created"), 13 | template: str = typer.Option("default", help="Metadata template to be used for the dataset"), 14 | ) -> None: 15 | '''Create a local dataset under path with default template\n 16 | 17 | fairly dataset create \n 18 | 19 | Create a local dataset under path with the specified template\n 20 |