├── .cruft.json
├── .github
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── FUNDING.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── codecov.yml
│   └── workflows
│       ├── cruft.yml
│       └── tests.yml
├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   └── source
│       ├── cli.rst
│       ├── conf.py
│       ├── index.rst
│       ├── installation.rst
│       ├── usage.rst
│       └── utils.rst
├── pyproject.toml
├── src
│   └── pystow
│       ├── __init__.py
│       ├── __main__.py
│       ├── api.py
│       ├── cache.py
│       ├── cli.py
│       ├── config_api.py
│       ├── constants.py
│       ├── impl.py
│       ├── py.typed
│       ├── utils.py
│       └── version.py
├── tests
│   ├── resources
│   │   ├── test.txt
│   │   ├── test.txt.md5
│   │   ├── test_1.csv
│   │   ├── test_1.json
│   │   ├── test_1.pkl
│   │   ├── test_1.tsv
│   │   ├── test_verbose.txt.md5
│   │   └── test_wrong.txt.md5
│   ├── test_api.py
│   ├── test_caching.py
│   ├── test_config.py
│   ├── test_module.py
│   └── test_utils.py
└── tox.ini
/.cruft.json: -------------------------------------------------------------------------------- 1 | { 2 | "template": "https://github.com/cthoyt/cookiecutter-snekpack", 3 | "commit": "3257f18b9b5dc6922830dea64f9e0ab8b42a40e4", 4 | "checkout": null, 5 | "context": { 6 | "cookiecutter": { 7 | "package_name": "pystow", 8 | "package_name_stylized": "PyStow", 9 | "short_description": "Easily pick a place to store data for your Python code", 10 | "author_name": "Charles Tapley Hoyt", 11 | "author_github": "cthoyt", 12 | "author_email": "cthoyt@gmail.com", 13 | "github_organization_name": "cthoyt", 14 | "github_repository_name": "pystow", 15 | "command_line_interface": true, 16 | "gitlab": false, 17 | "runner": "tox", 18 | "__runner": "tox -e", 19 | "__runner_uv": "--with tox-uv tox -e", 20 | "__runner_pip": "tox tox-uv", 21 | "__runner_install_uv": "uv tool install tox --with tox-uv", 22 | "__runner_install_pip": "python3 -m pip install tox tox-uv", 23 | "__runner_tests": "py", 24 | "__gh_slug": "cthoyt/pystow", 25 | "_template": "https://github.com/cthoyt/cookiecutter-snekpack", 26 | "_commit": "3257f18b9b5dc6922830dea64f9e0ab8b42a40e4" 27 | } 28 | }, 29 | "directory": null 30 | } 31 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | - Demonstrating empathy and kindness toward other people 21 | - Being respectful of differing opinions, viewpoints, and experiences 22 | - Giving and gracefully accepting constructive feedback 23 | - Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | - Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | - The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | - Trolling, insulting or derogatory comments, and personal or political attacks 33 | - Public or private harassment 34 | - Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | - Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | cthoyt@gmail.com. All complaints will be reviewed and investigated promptly and 64 | fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions to this repository are welcomed and encouraged. 4 | 5 | ## Code Contribution 6 | 7 | This project uses the [GitHub Flow](https://guides.github.com/introduction/flow) 8 | model for code contributions. Follow these steps: 9 | 10 | 1. [Create a fork](https://help.github.com/articles/fork-a-repo) of the upstream 11 | repository at [`cthoyt/pystow`](https://github.com/cthoyt/pystow) on your 12 | GitHub account (or in one of your organizations) 13 | 2. [Clone your fork](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) 14 | with `git clone https://github.com//pystow.git` 15 | 3. Make and commit changes to your fork with `git commit` 16 | 4. Push changes to your fork with `git push` 17 | 5. Repeat steps 3 and 4 as needed 18 | 6. Submit a pull request back to the upstream repository 19 | 20 | ### Merge Model 21 | 22 | This repository uses 23 | [squash merges](https://docs.github.com/en/github/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-pull-request-commits) 24 | to group all related commits in a given pull request into a single commit upon 25 | acceptance and merge into the main branch. This has several benefits: 26 | 27 | 1. Keeps the commit history on the main branch focused on high-level narrative 28 | 2. 
Enables people to make lots of small commits without worrying about muddying 29 | up the commit history 30 | 3. Commits correspond 1-to-1 with pull requests 31 | 32 | ### Code Style 33 | 34 | This project uses `tox` for running code quality checks. Start by installing it 35 | with `pip install tox tox-uv`. 36 | 37 | This project encourages the use of optional static typing. It uses 38 | [`mypy`](http://mypy-lang.org/) as a type checker. You can check if your code 39 | passes `mypy` with `tox -e mypy`. 40 | 41 | This project uses [`ruff`](https://docs.astral.sh/ruff/) to automatically 42 | enforce a consistent code style. You can apply `ruff format` and other 43 | pre-configured formatters with `tox -e format`. 44 | 45 | This project uses [`ruff`](https://docs.astral.sh/ruff/) and several plugins for 46 | additional checks of documentation style, security issues, good variable 47 | nomenclature, and more (see `pyproject.toml` for a list of Ruff plugins). You 48 | can check if your code passes `ruff check` with `tox -e lint`. 49 | 50 | Each of these checks is run on each commit using GitHub Actions as a continuous 51 | integration service. Passing all of them is required for accepting a 52 | contribution. If you're unsure how to address the feedback from one of these 53 | tools, please say so either in the description of your pull request or in a 54 | comment, and we will help you. 55 | 56 | ### Logging 57 | 58 | Python's built-in `print()` should not be used (except when writing to files); 59 | this is checked by the 60 | [`flake8-print` (T20)](https://docs.astral.sh/ruff/rules/#flake8-print-t20) 61 | plugin to `ruff`. If you're in a command line setting or `main()` function for a 62 | module, you can use `click.echo()`. Otherwise, you can use the built-in `logging` 63 | module by adding `logger = logging.getLogger(__name__)` below the imports at the 64 | top of your file. 65 | 66 | ### Documentation 67 | 68 | All public functions (i.e., not starting with an underscore `_`) must be 69 | documented using the 70 | [sphinx documentation format](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html#the-sphinx-docstring-format). 71 | The [`darglint2`](https://github.com/akaihola/darglint2) tool reports on 72 | functions that are not fully documented. 73 | 74 | This project uses [`sphinx`](https://www.sphinx-doc.org) to automatically build 75 | documentation into a narrative structure. You can check that the documentation 76 | builds properly in an isolated environment with `tox -e docs-test` and actually 77 | build it locally with `tox -e docs`. 78 | 79 | ### Testing 80 | 81 | Functions in this repository should be unit tested. These can be written either 82 | using the `unittest` framework in the `tests/` directory or as embedded 83 | doctests. You can check that the unit tests pass with `tox -e py` and that the 84 | doctests pass with `tox -e doctests`. These tests are required to pass for 85 | accepting a contribution. 86 | 87 | ### Syncing your fork 88 | 89 | If other code is updated before your contribution gets merged, you might need to 90 | resolve conflicts against the main branch. After cloning, you should add the 91 | upstream repository with 92 | 93 | ```shell 94 | $ git remote add cthoyt https://github.com/cthoyt/pystow.git 95 | ``` 96 | 97 | Then, you can merge upstream code into your branch. 
You can also use the GitHub 98 | UI to do this by following 99 | [this tutorial](https://docs.github.com/en/github/collaborating-with-pull-requests/working-with-forks/syncing-a-fork). 100 | 101 | ### Python Version Compatibility 102 | 103 | This project aims to support all versions of Python that have not passed their 104 | end-of-life dates. After end-of-life, the version will be removed from the Trove 105 | classifiers in the `pyproject.toml` and from the GitHub Actions testing 106 | configuration. 107 | 108 | See https://endoflife.date/python for a timeline of Python release and 109 | end-of-life dates. 110 | 111 | ## Acknowledgements 112 | 113 | These code contribution guidelines are derived from the 114 | [cthoyt/cookiecutter-snekpack](https://github.com/cthoyt/cookiecutter-snekpack) 115 | Python package template. They're free to reuse and modify as long as they're 116 | properly acknowledged. 117 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/displaying-a-sponsor-button-in-your-repository 2 | github: 3 | - cthoyt 4 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | ## Summary 16 | 17 | 18 | -------------------------------------------------------------------------------- /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | # see https://docs.codecov.com/v4.6/docs/codecov-yaml 2 | ignore: 3 | - "src/pystow/__main__.py" 4 | - "src/pystow/cli.py" 5 | -------------------------------------------------------------------------------- /.github/workflows/cruft.yml: -------------------------------------------------------------------------------- 1 | # from https://cruft.github.io/cruft/#automating-updates-with-github-actions 2 | 3 | name: Update repository with Cruft 4 | 5 | permissions: { } 6 | 7 | on: 8 | workflow_dispatch: 9 | schedule: 10 | - cron: "0 2 * * 1" # Every Monday at 2am 11 | 12 | jobs: 13 | update: 14 | permissions: 15 | contents: write 16 | pull-requests: write 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: true 20 | matrix: 21 | include: 22 | - add-paths: . 23 | body: Use this to merge the changes to this repository. 24 | branch: cruft/update 25 | commit-message: "chore: accept new Cruft update" 26 | title: New updates detected with Cruft 27 | - add-paths: .cruft.json 28 | body: Use this to reject the changes in this repository. 29 | branch: cruft/reject 30 | commit-message: "chore: reject new Cruft update" 31 | title: Reject new updates detected with Cruft 32 | steps: 33 | - uses: actions/checkout@v3 34 | 35 | - uses: actions/setup-python@v4 36 | with: 37 | python-version: "3.10" 38 | 39 | - name: Install Cruft 40 | run: pip3 install cruft 41 | 42 | - name: Check if update is available 43 | continue-on-error: false 44 | id: check 45 | run: | 46 | CHANGES=0 47 | if [ -f .cruft.json ]; then 48 | if ! 
cruft check; then 49 | CHANGES=1 50 | fi 51 | else 52 | echo "No .cruft.json file" 53 | fi 54 | 55 | echo "has_changes=$CHANGES" >> "$GITHUB_OUTPUT" 56 | 57 | - name: Run update if available 58 | if: steps.check.outputs.has_changes == '1' 59 | run: | 60 | git config --global user.email "you@example.com" 61 | git config --global user.name "GitHub" 62 | 63 | cruft update --skip-apply-ask --refresh-private-variables 64 | git restore --staged . 65 | 66 | - name: Create pull request 67 | if: steps.check.outputs.has_changes == '1' 68 | uses: peter-evans/create-pull-request@v4 69 | with: 70 | token: ${{ secrets.GITHUB_TOKEN }} 71 | add-paths: ${{ matrix.add-paths }} 72 | commit-message: ${{ matrix.commit-message }} 73 | branch: ${{ matrix.branch }} 74 | delete-branch: true 75 | branch-suffix: timestamp 76 | title: ${{ matrix.title }} 77 | body: | 78 | This is an autogenerated PR. ${{ matrix.body }} 79 | 80 | [Cruft](https://cruft.github.io/cruft/) has detected updates from the Cookiecutter repository. 81 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This file configures the continuous integration (CI) system on GitHub. 2 | # Introductory materials can be found here: https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions. 3 | # Documentation for editing this file can be found here: https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions 4 | 5 | name: Tests 6 | 7 | # by default, give the GITHUB_TOKEN no permissions 8 | # See https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/controlling-permissions-for-github_token 9 | permissions: { } 10 | 11 | on: 12 | push: 13 | branches: [ main ] 14 | pull_request: 15 | branches: [ main ] 16 | 17 | jobs: 18 | lint: 19 | name: Code Quality 20 | permissions: 21 | # give only read-only access to the contents of the repository 22 | # this is the only permission this job requires, so keep it to the least privilege 23 | # i.e., not to issues, discussions, actions, etc. 24 | contents: read 25 | runs-on: ubuntu-latest 26 | strategy: 27 | matrix: 28 | python-version: [ "3.13", "3.9" ] 29 | tox-command: [ "lint", "pyroma", "mypy" ] 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: "Install uv" 33 | uses: "astral-sh/setup-uv@v3" 34 | with: 35 | enable-cache: true 36 | cache-dependency-glob: "pyproject.toml" 37 | - name: "Run command" 38 | run: | 39 | uvx -p ${{ matrix.python-version }} --with tox-uv tox -e ${{ matrix.tox-command }} 40 | 41 | docs: 42 | name: Documentation 43 | permissions: 44 | contents: read 45 | runs-on: ubuntu-latest 46 | strategy: 47 | matrix: 48 | # We only test documentation on the latest version 49 | # sphinx 8.0 / sphinx-rtd-theme 3.0 discontinued Python 3.9 support 50 | # a year early, which prompted re-thinking about this. 
51 | python-version: [ "3.13" ] 52 | steps: 53 | - uses: actions/checkout@v4 54 | - name: "Install uv" 55 | uses: "astral-sh/setup-uv@v3" 56 | with: 57 | enable-cache: true 58 | cache-dependency-glob: "pyproject.toml" 59 | - name: Install dependencies 60 | run: | 61 | sudo apt-get install graphviz 62 | - name: Check RST conformity with doc8 63 | run: uvx -p ${{ matrix.python-version }} --with tox-uv tox -e doc8 64 | - name: Check docstring coverage 65 | run: uvx -p ${{ matrix.python-version }} --with tox-uv tox -e docstr-coverage 66 | - name: Check documentation build with Sphinx 67 | run: uvx -p ${{ matrix.python-version }} --with tox-uv tox -e docs-test 68 | 69 | # Check all markdown files are properly formatted 70 | # inspired by https://github.com/astral-sh/uv/blob/98523e2014e9a5c69706623344026d76296e178f/.github/workflows/ci.yml#L67C1-L70C61 71 | - name: Check markdown formatting 72 | run: | 73 | npx --yes prettier --prose-wrap always --check "**/*.md" 74 | 75 | tests: 76 | name: Tests 77 | permissions: 78 | contents: read 79 | runs-on: ${{ matrix.os }} 80 | strategy: 81 | matrix: 82 | os: [ ubuntu-latest ] 83 | python-version: [ "3.13", "3.9" ] 84 | steps: 85 | - uses: actions/checkout@v4 86 | - name: "Install uv" 87 | uses: "astral-sh/setup-uv@v3" 88 | with: 89 | enable-cache: true 90 | cache-dependency-glob: "pyproject.toml" 91 | - name: Test with pytest and generate coverage file 92 | run: 93 | uvx -p ${{ matrix.python-version }} --with tox-uv tox -e py 94 | - name: Run doctests 95 | run: 96 | uvx -p ${{ matrix.python-version }} --with tox-uv tox -e doctests 97 | - name: Upload coverage report to codecov 98 | uses: codecov/codecov-action@v4 99 | if: success() 100 | with: 101 | file: coverage.xml 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio 3 | 4 | ### Emacs ### 5 | # -*- mode: gitignore; -*- 6 | *~ 7 | \#*\# 8 | /.emacs.desktop 9 | /.emacs.desktop.lock 10 | *.elc 11 | auto-save-list 12 | tramp 13 | .\#* 14 | 15 | # Org-mode 16 | .org-id-locations 17 | *_archive 18 | 19 | # flymake-mode 20 | *_flymake.* 21 | 22 | # eshell files 23 | /eshell/history 24 | /eshell/lastdir 25 | 26 | # elpa packages 27 | /elpa/ 28 | 29 | # reftex files 30 | *.rel 31 | 32 | # AUCTeX auto folder 33 | /auto/ 34 | 35 | # cask packages 36 | .cask/ 37 | dist/ 38 | 39 | # Flycheck 40 | flycheck_*.el 41 | 42 | # server auth directory 43 | /server/ 44 | 45 | # projectiles files 46 | .projectile 47 | 48 | # directory configuration 49 | .dir-locals.el 50 | 51 | # network security 52 | /network-security.data 53 | 54 | 55 | ### JetBrains ### 56 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 57 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 58 | 59 | # User-specific stuff 60 | .idea/**/workspace.xml 61 | .idea/**/tasks.xml 62 | .idea/**/usage.statistics.xml 63 | .idea/**/dictionaries 64 | .idea/**/shelf 65 | 66 | # AWS User-specific 67 | .idea/**/aws.xml 68 | 69 | # Generated files 70 | .idea/**/contentModel.xml 71 | 72 | # Sensitive or high-churn 
files 73 | .idea/**/dataSources/ 74 | .idea/**/dataSources.ids 75 | .idea/**/dataSources.local.xml 76 | .idea/**/sqlDataSources.xml 77 | .idea/**/dynamic.xml 78 | .idea/**/uiDesigner.xml 79 | .idea/**/dbnavigator.xml 80 | 81 | # Gradle 82 | .idea/**/gradle.xml 83 | .idea/**/libraries 84 | 85 | # Gradle and Maven with auto-import 86 | # When using Gradle or Maven with auto-import, you should exclude module files, 87 | # since they will be recreated, and may cause churn. Uncomment if using 88 | # auto-import. 89 | # .idea/artifacts 90 | # .idea/compiler.xml 91 | # .idea/jarRepositories.xml 92 | # .idea/modules.xml 93 | # .idea/*.iml 94 | # .idea/modules 95 | # *.iml 96 | # *.ipr 97 | 98 | # CMake 99 | cmake-build-*/ 100 | 101 | # Mongo Explorer plugin 102 | .idea/**/mongoSettings.xml 103 | 104 | # File-based project format 105 | *.iws 106 | 107 | # IntelliJ 108 | out/ 109 | 110 | # mpeltonen/sbt-idea plugin 111 | .idea_modules/ 112 | 113 | # JIRA plugin 114 | atlassian-ide-plugin.xml 115 | 116 | # Cursive Clojure plugin 117 | .idea/replstate.xml 118 | 119 | # SonarLint plugin 120 | .idea/sonarlint/ 121 | 122 | # Crashlytics plugin (for Android Studio and IntelliJ) 123 | com_crashlytics_export_strings.xml 124 | crashlytics.properties 125 | crashlytics-build.properties 126 | fabric.properties 127 | 128 | # Editor-based Rest Client 129 | .idea/httpRequests 130 | 131 | # Android studio 3.1+ serialized cache file 132 | .idea/caches/build_file_checksums.ser 133 | 134 | ### JetBrains Patch ### 135 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 136 | 137 | # *.iml 138 | # modules.xml 139 | # .idea/misc.xml 140 | # *.ipr 141 | 142 | # Sonarlint plugin 143 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 144 | .idea/**/sonarlint/ 145 | 146 | # SonarQube Plugin 147 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 148 | .idea/**/sonarIssues.xml 149 | 150 | # Markdown Navigator plugin 151 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 152 | .idea/**/markdown-navigator.xml 153 | .idea/**/markdown-navigator-enh.xml 154 | .idea/**/markdown-navigator/ 155 | 156 | # Cache file creation bug 157 | # See https://youtrack.jetbrains.com/issue/JBR-2257 158 | .idea/$CACHE_FILE$ 159 | 160 | # CodeStream plugin 161 | # https://plugins.jetbrains.com/plugin/12206-codestream 162 | .idea/codestream.xml 163 | 164 | ### JupyterNotebooks ### 165 | # gitignore template for Jupyter Notebooks 166 | # website: http://jupyter.org/ 167 | 168 | .ipynb_checkpoints 169 | */.ipynb_checkpoints/* 170 | 171 | # IPython 172 | profile_default/ 173 | ipython_config.py 174 | 175 | # Remove previous ipynb_checkpoints 176 | # git rm -r .ipynb_checkpoints/ 177 | 178 | ### Linux ### 179 | 180 | # temporary files which can be created if a process still has a handle open of a deleted file 181 | .fuse_hidden* 182 | 183 | # KDE directory preferences 184 | .directory 185 | 186 | # Linux trash folder which might appear on any partition or disk 187 | .Trash-* 188 | 189 | # .nfs files are created when an open file is removed but is still being accessed 190 | .nfs* 191 | 192 | ### macOS ### 193 | # General 194 | .DS_Store 195 | .AppleDouble 196 | .LSOverride 197 | 198 | # Icon must end with two \r 199 | Icon 200 | 201 | 202 | # Thumbnails 203 | ._* 204 | 205 | # Files that might appear in the root of a volume 206 | .DocumentRevisions-V100 207 | .fseventsd 208 | .Spotlight-V100 209 | .TemporaryItems 210 | .Trashes 211 | .VolumeIcon.icns 212 | 
.com.apple.timemachine.donotpresent 213 | 214 | # Directories potentially created on remote AFP share 215 | .AppleDB 216 | .AppleDesktop 217 | Network Trash Folder 218 | Temporary Items 219 | .apdisk 220 | 221 | ### PyCharm ### 222 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 223 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 224 | 225 | # User-specific stuff 226 | 227 | # AWS User-specific 228 | 229 | # Generated files 230 | 231 | # Sensitive or high-churn files 232 | 233 | # Gradle 234 | 235 | # Gradle and Maven with auto-import 236 | # When using Gradle or Maven with auto-import, you should exclude module files, 237 | # since they will be recreated, and may cause churn. Uncomment if using 238 | # auto-import. 239 | # .idea/artifacts 240 | # .idea/compiler.xml 241 | # .idea/jarRepositories.xml 242 | # .idea/modules.xml 243 | # .idea/*.iml 244 | # .idea/modules 245 | # *.iml 246 | # *.ipr 247 | 248 | # CMake 249 | 250 | # Mongo Explorer plugin 251 | 252 | # File-based project format 253 | 254 | # IntelliJ 255 | 256 | # mpeltonen/sbt-idea plugin 257 | 258 | # JIRA plugin 259 | 260 | # Cursive Clojure plugin 261 | 262 | # SonarLint plugin 263 | 264 | # Crashlytics plugin (for Android Studio and IntelliJ) 265 | 266 | # Editor-based Rest Client 267 | 268 | # Android studio 3.1+ serialized cache file 269 | 270 | ### PyCharm Patch ### 271 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 272 | 273 | # *.iml 274 | # modules.xml 275 | # .idea/misc.xml 276 | # *.ipr 277 | 278 | # Sonarlint plugin 279 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 280 | 281 | # SonarQube Plugin 282 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 283 | 284 | # Markdown Navigator plugin 285 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 286 | 287 | # Cache file creation bug 288 | # See https://youtrack.jetbrains.com/issue/JBR-2257 289 | 290 | # CodeStream plugin 291 | # https://plugins.jetbrains.com/plugin/12206-codestream 292 | 293 | ### Python ### 294 | # Byte-compiled / optimized / DLL files 295 | __pycache__/ 296 | *.py[cod] 297 | *$py.class 298 | 299 | # C extensions 300 | *.so 301 | 302 | # Distribution / packaging 303 | .Python 304 | build/ 305 | develop-eggs/ 306 | downloads/ 307 | eggs/ 308 | .eggs/ 309 | lib/ 310 | lib64/ 311 | parts/ 312 | sdist/ 313 | var/ 314 | wheels/ 315 | share/python-wheels/ 316 | *.egg-info/ 317 | .installed.cfg 318 | *.egg 319 | MANIFEST 320 | 321 | # PyInstaller 322 | # Usually these files are written by a python script from a template 323 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
324 | *.manifest 325 | *.spec 326 | 327 | # Installer logs 328 | pip-log.txt 329 | pip-delete-this-directory.txt 330 | 331 | # Unit test / coverage reports 332 | htmlcov/ 333 | .tox/ 334 | .nox/ 335 | .coverage 336 | .coverage.* 337 | .cache 338 | nosetests.xml 339 | coverage.xml 340 | *.cover 341 | *.py,cover 342 | .hypothesis/ 343 | .pytest_cache/ 344 | cover/ 345 | 346 | # Translations 347 | *.mo 348 | *.pot 349 | 350 | # Django stuff: 351 | *.log 352 | local_settings.py 353 | db.sqlite3 354 | db.sqlite3-journal 355 | 356 | # Flask stuff: 357 | instance/ 358 | .webassets-cache 359 | 360 | # Scrapy stuff: 361 | .scrapy 362 | 363 | # Sphinx documentation 364 | docs/_build/ 365 | docs/build 366 | docs/source/api 367 | 368 | # PyBuilder 369 | .pybuilder/ 370 | target/ 371 | 372 | # Jupyter Notebook 373 | 374 | # IPython 375 | 376 | # pyenv 377 | # For a library or package, you might want to ignore these files since the code is 378 | # intended to run in multiple environments; otherwise, check them in: 379 | # .python-version 380 | 381 | # pipenv 382 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 383 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 384 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 385 | # install all needed dependencies. 386 | #Pipfile.lock 387 | 388 | # poetry 389 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 390 | # This is especially recommended for binary packages to ensure reproducibility, and is more 391 | # commonly ignored for libraries. 392 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 393 | #poetry.lock 394 | 395 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 396 | __pypackages__/ 397 | 398 | # Celery stuff 399 | celerybeat-schedule 400 | celerybeat.pid 401 | 402 | # SageMath parsed files 403 | *.sage.py 404 | 405 | # Environments 406 | .env 407 | .venv 408 | env/ 409 | venv/ 410 | ENV/ 411 | env.bak/ 412 | venv.bak/ 413 | 414 | # Spyder project settings 415 | .spyderproject 416 | .spyproject 417 | 418 | # Rope project settings 419 | .ropeproject 420 | 421 | # mkdocs documentation 422 | /site 423 | 424 | # mypy 425 | .mypy_cache/ 426 | .dmypy.json 427 | dmypy.json 428 | 429 | # Pyre type checker 430 | .pyre/ 431 | 432 | # pytype static type analyzer 433 | .pytype/ 434 | 435 | # Cython debug symbols 436 | cython_debug/ 437 | 438 | # PyCharm 439 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 440 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 441 | # and can be added to the global gitignore or merged into this file. For a more nuclear 442 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
443 | #.idea/ 444 | 445 | ### Vim ### 446 | # Swap 447 | [._]*.s[a-v][a-z] 448 | !*.svg # comment out if you don't need vector files 449 | [._]*.sw[a-p] 450 | [._]s[a-rt-v][a-z] 451 | [._]ss[a-gi-z] 452 | [._]sw[a-p] 453 | 454 | # Session 455 | Session.vim 456 | Sessionx.vim 457 | 458 | # Temporary 459 | .netrwhist 460 | # Auto-generated tag files 461 | tags 462 | # Persistent undo 463 | [._]*.un~ 464 | 465 | ### VisualStudioCode ### 466 | .vscode/* 467 | !.vscode/settings.json 468 | !.vscode/tasks.json 469 | !.vscode/launch.json 470 | !.vscode/extensions.json 471 | !.vscode/*.code-snippets 472 | 473 | # Local History for Visual Studio Code 474 | .history/ 475 | 476 | # Built Visual Studio Code Extensions 477 | *.vsix 478 | 479 | ### VisualStudioCode Patch ### 480 | # Ignore all local history of files 481 | .history 482 | .ionide 483 | 484 | # Support for Project snippet scope 485 | 486 | ### Windows ### 487 | # Windows thumbnail cache files 488 | Thumbs.db 489 | Thumbs.db:encryptable 490 | ehthumbs.db 491 | ehthumbs_vista.db 492 | 493 | # Dump file 494 | *.stackdump 495 | 496 | # Folder config file 497 | [Dd]esktop.ini 498 | 499 | # Recycle Bin used on file shares 500 | $RECYCLE.BIN/ 501 | 502 | # Windows Installer files 503 | *.cab 504 | *.msi 505 | *.msix 506 | *.msm 507 | *.msp 508 | 509 | # Windows shortcuts 510 | *.lnk 511 | 512 | ### VisualStudio ### 513 | ## Ignore Visual Studio temporary files, build results, and 514 | ## files generated by popular Visual Studio add-ons. 515 | ## 516 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore 517 | 518 | # User-specific files 519 | *.rsuser 520 | *.suo 521 | *.user 522 | *.userosscache 523 | *.sln.docstates 524 | 525 | # User-specific files (MonoDevelop/Xamarin Studio) 526 | *.userprefs 527 | 528 | # Mono auto generated files 529 | mono_crash.* 530 | 531 | # Build results 532 | [Dd]ebug/ 533 | [Dd]ebugPublic/ 534 | [Rr]elease/ 535 | [Rr]eleases/ 536 | x64/ 537 | x86/ 538 | [Ww][Ii][Nn]32/ 539 | [Aa][Rr][Mm]/ 540 | [Aa][Rr][Mm]64/ 541 | bld/ 542 | [Bb]in/ 543 | [Oo]bj/ 544 | [Ll]og/ 545 | [Ll]ogs/ 546 | 547 | # Visual Studio 2015/2017 cache/options directory 548 | .vs/ 549 | # Uncomment if you have tasks that create the project's static files in wwwroot 550 | #wwwroot/ 551 | 552 | # Visual Studio 2017 auto generated files 553 | Generated\ Files/ 554 | 555 | # MSTest test Results 556 | [Tt]est[Rr]esult*/ 557 | [Bb]uild[Ll]og.* 558 | 559 | # NUnit 560 | *.VisualState.xml 561 | TestResult.xml 562 | nunit-*.xml 563 | 564 | # Build Results of an ATL Project 565 | [Dd]ebugPS/ 566 | [Rr]eleasePS/ 567 | dlldata.c 568 | 569 | # Benchmark Results 570 | BenchmarkDotNet.Artifacts/ 571 | 572 | # .NET Core 573 | project.lock.json 574 | project.fragment.lock.json 575 | artifacts/ 576 | 577 | # ASP.NET Scaffolding 578 | ScaffoldingReadMe.txt 579 | 580 | # StyleCop 581 | StyleCopReport.xml 582 | 583 | # Files built by Visual Studio 584 | *_i.c 585 | *_p.c 586 | *_h.h 587 | *.ilk 588 | *.meta 589 | *.obj 590 | *.iobj 591 | *.pch 592 | *.pdb 593 | *.ipdb 594 | *.pgc 595 | *.pgd 596 | *.rsp 597 | *.sbr 598 | *.tlb 599 | *.tli 600 | *.tlh 601 | *.tmp 602 | *.tmp_proj 603 | *_wpftmp.csproj 604 | *.tlog 605 | *.vspscc 606 | *.vssscc 607 | .builds 608 | *.pidb 609 | *.svclog 610 | *.scc 611 | 612 | # Chutzpah Test files 613 | _Chutzpah* 614 | 615 | # Visual C++ cache files 616 | ipch/ 617 | *.aps 618 | *.ncb 619 | *.opendb 620 | *.opensdf 621 | *.sdf 622 | *.cachefile 623 | *.VC.db 624 | *.VC.VC.opendb 625 | 626 | # Visual 
Studio profiler 627 | *.psess 628 | *.vsp 629 | *.vspx 630 | *.sap 631 | 632 | # Visual Studio Trace Files 633 | *.e2e 634 | 635 | # TFS 2012 Local Workspace 636 | $tf/ 637 | 638 | # Guidance Automation Toolkit 639 | *.gpState 640 | 641 | # ReSharper is a .NET coding add-in 642 | _ReSharper*/ 643 | *.[Rr]e[Ss]harper 644 | *.DotSettings.user 645 | 646 | # TeamCity is a build add-in 647 | _TeamCity* 648 | 649 | # DotCover is a Code Coverage Tool 650 | *.dotCover 651 | 652 | # AxoCover is a Code Coverage Tool 653 | .axoCover/* 654 | !.axoCover/settings.json 655 | 656 | # Coverlet is a free, cross platform Code Coverage Tool 657 | coverage*.json 658 | coverage*.xml 659 | coverage*.info 660 | 661 | # Visual Studio code coverage results 662 | *.coverage 663 | *.coveragexml 664 | 665 | # NCrunch 666 | _NCrunch_* 667 | .*crunch*.local.xml 668 | nCrunchTemp_* 669 | 670 | # MightyMoose 671 | *.mm.* 672 | AutoTest.Net/ 673 | 674 | # Web workbench (sass) 675 | .sass-cache/ 676 | 677 | # Installshield output folder 678 | [Ee]xpress/ 679 | 680 | # DocProject is a documentation generator add-in 681 | DocProject/buildhelp/ 682 | DocProject/Help/*.HxT 683 | DocProject/Help/*.HxC 684 | DocProject/Help/*.hhc 685 | DocProject/Help/*.hhk 686 | DocProject/Help/*.hhp 687 | DocProject/Help/Html2 688 | DocProject/Help/html 689 | 690 | # Click-Once directory 691 | publish/ 692 | 693 | # Publish Web Output 694 | *.[Pp]ublish.xml 695 | *.azurePubxml 696 | # Note: Comment the next line if you want to checkin your web deploy settings, 697 | # but database connection strings (with potential passwords) will be unencrypted 698 | *.pubxml 699 | *.publishproj 700 | 701 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 702 | # checkin your Azure Web App publish settings, but sensitive information contained 703 | # in these scripts will be unencrypted 704 | PublishScripts/ 705 | 706 | # NuGet Packages 707 | *.nupkg 708 | # NuGet Symbol Packages 709 | *.snupkg 710 | # The packages folder can be ignored because of Package Restore 711 | **/[Pp]ackages/* 712 | # except build/, which is used as an MSBuild target. 
713 | !**/[Pp]ackages/build/ 714 | # Uncomment if necessary however generally it will be regenerated when needed 715 | #!**/[Pp]ackages/repositories.config 716 | # NuGet v3's project.json files produces more ignorable files 717 | *.nuget.props 718 | *.nuget.targets 719 | 720 | # Microsoft Azure Build Output 721 | csx/ 722 | *.build.csdef 723 | 724 | # Microsoft Azure Emulator 725 | ecf/ 726 | rcf/ 727 | 728 | # Windows Store app package directories and files 729 | AppPackages/ 730 | BundleArtifacts/ 731 | Package.StoreAssociation.xml 732 | _pkginfo.txt 733 | *.appx 734 | *.appxbundle 735 | *.appxupload 736 | 737 | # Visual Studio cache files 738 | # files ending in .cache can be ignored 739 | *.[Cc]ache 740 | # but keep track of directories ending in .cache 741 | !?*.[Cc]ache/ 742 | 743 | # Others 744 | ClientBin/ 745 | ~$* 746 | *.dbmdl 747 | *.dbproj.schemaview 748 | *.jfm 749 | *.pfx 750 | *.publishsettings 751 | orleans.codegen.cs 752 | 753 | # Including strong name files can present a security risk 754 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 755 | #*.snk 756 | 757 | # Since there are multiple workflows, uncomment next line to ignore bower_components 758 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 759 | #bower_components/ 760 | 761 | # RIA/Silverlight projects 762 | Generated_Code/ 763 | 764 | # Backup & report files from converting an old project file 765 | # to a newer Visual Studio version. Backup files are not needed, 766 | # because we have git ;-) 767 | _UpgradeReport_Files/ 768 | Backup*/ 769 | UpgradeLog*.XML 770 | UpgradeLog*.htm 771 | ServiceFabricBackup/ 772 | *.rptproj.bak 773 | 774 | # SQL Server files 775 | *.mdf 776 | *.ldf 777 | *.ndf 778 | 779 | # Business Intelligence projects 780 | *.rdl.data 781 | *.bim.layout 782 | *.bim_*.settings 783 | *.rptproj.rsuser 784 | *- [Bb]ackup.rdl 785 | *- [Bb]ackup ([0-9]).rdl 786 | *- [Bb]ackup ([0-9][0-9]).rdl 787 | 788 | # Microsoft Fakes 789 | FakesAssemblies/ 790 | 791 | # GhostDoc plugin setting file 792 | *.GhostDoc.xml 793 | 794 | # Node.js Tools for Visual Studio 795 | .ntvs_analysis.dat 796 | node_modules/ 797 | 798 | # Visual Studio 6 build log 799 | *.plg 800 | 801 | # Visual Studio 6 workspace options file 802 | *.opt 803 | 804 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 805 | *.vbw 806 | 807 | # Visual Studio 6 auto-generated project file (contains which files were open etc.) 
808 | *.vbp 809 | 810 | # Visual Studio 6 workspace and project file (working project files containing files to include in project) 811 | *.dsw 812 | *.dsp 813 | 814 | # Visual Studio 6 technical files 815 | 816 | # Visual Studio LightSwitch build output 817 | **/*.HTMLClient/GeneratedArtifacts 818 | **/*.DesktopClient/GeneratedArtifacts 819 | **/*.DesktopClient/ModelManifest.xml 820 | **/*.Server/GeneratedArtifacts 821 | **/*.Server/ModelManifest.xml 822 | _Pvt_Extensions 823 | 824 | # Paket dependency manager 825 | .paket/paket.exe 826 | paket-files/ 827 | 828 | # FAKE - F# Make 829 | .fake/ 830 | 831 | # CodeRush personal settings 832 | .cr/personal 833 | 834 | # Python Tools for Visual Studio (PTVS) 835 | *.pyc 836 | 837 | # Cake - Uncomment if you are using it 838 | # tools/** 839 | # !tools/packages.config 840 | 841 | # Tabs Studio 842 | *.tss 843 | 844 | # Telerik's JustMock configuration file 845 | *.jmconfig 846 | 847 | # BizTalk build output 848 | *.btp.cs 849 | *.btm.cs 850 | *.odx.cs 851 | *.xsd.cs 852 | 853 | # OpenCover UI analysis results 854 | OpenCover/ 855 | 856 | # Azure Stream Analytics local run output 857 | ASALocalRun/ 858 | 859 | # MSBuild Binary and Structured Log 860 | *.binlog 861 | 862 | # NVidia Nsight GPU debugger configuration file 863 | *.nvuser 864 | 865 | # MFractors (Xamarin productivity tool) working folder 866 | .mfractor/ 867 | 868 | # Local History for Visual Studio 869 | .localhistory/ 870 | 871 | # Visual Studio History (VSHistory) files 872 | .vshistory/ 873 | 874 | # BeatPulse healthcheck temp database 875 | healthchecksdb 876 | 877 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 878 | MigrationBackup/ 879 | 880 | # Ionide (cross platform F# VS Code tools) working folder 881 | .ionide/ 882 | 883 | # Fody - auto-generated XML schema 884 | FodyWeavers.xsd 885 | 886 | # VS Code files for those working on multiple tools 887 | *.code-workspace 888 | 889 | # Local History for Visual Studio Code 890 | 891 | # Windows Installer files from build outputs 892 | 893 | # JetBrains Rider 894 | *.sln.iml 895 | 896 | ### VisualStudio Patch ### 897 | # Additional files built by Visual Studio 898 | 899 | # End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio 900 | 901 | scratch/ 902 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | version: 2 6 | 7 | sphinx: 8 | # Path to your Sphinx configuration file, required as of 9 | # https://about.readthedocs.com/blog/2024/12/deprecate-config-files-without-sphinx-or-mkdocs-config/ 10 | configuration: docs/source/conf.py 11 | 12 | # Set the version of Python and other tools you might need 13 | build: 14 | os: ubuntu-22.04 15 | apt_packages: 16 | - graphviz 17 | tools: 18 | python: "3.12" 19 | 20 | # adapted from uv recipe at https://docs.readthedocs.io/en/stable/build-customization.html#install-dependencies-with-uv 21 | # and comment at https://github.com/readthedocs/readthedocs.org/issues/11289#issuecomment-2103832834 22 | commands: 23 | - asdf plugin add uv 24 | - asdf install uv latest 25 | - asdf global uv latest 26 | - uv venv $READTHEDOCS_VIRTUALENV_PATH 27 | - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH 
UV_PREVIEW=1 uv pip install .[docs,rdf,pandas,xml,aws] 28 | - python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs/source $READTHEDOCS_OUTPUT/html 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Charles Tapley Hoyt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | # PyStow 3 | 4 | [logo and badges: Tests, PyPI, PyPI - Python Version, PyPI - License, Documentation Status, Codecov status, Cookiecutter template from @cthoyt, Ruff, Contributor Covenant, DOI] 26 | 

27 | 28 | 👜 Easily pick a place to store data for your Python code 29 | 30 | ## 💪 Getting Started 31 | 32 | Get a directory for your application. 33 | 34 | ```python 35 | import pystow 36 | 37 | # Get a directory (as a pathlib.Path) for ~/.data/pykeen 38 | pykeen_directory = pystow.join('pykeen') 39 | 40 | # Get a subdirectory (as a pathlib.Path) for ~/.data/pykeen/experiments 41 | pykeen_experiments_directory = pystow.join('pykeen', 'experiments') 42 | 43 | # You can go as deep as you want 44 | pykeen_deep_directory = pystow.join('pykeen', 'experiments', 'a', 'b', 'c') 45 | ``` 46 | 47 | If you reuse the same directory structure a lot, you can save it in a module: 48 | 49 | ```python 50 | import pystow 51 | 52 | pykeen_module = pystow.module("pykeen") 53 | 54 | # Access the module's directory with .base 55 | assert pystow.join("pykeen") == pystow.module("pykeen").base 56 | 57 | # Get a subdirectory (as a pathlib.Path) for ~/.data/pykeen/experiments 58 | pykeen_experiments_directory = pykeen_module.join('experiments') 59 | 60 | # You can go as deep as you want past the original "pykeen" module 61 | pykeen_deep_directory = pykeen_module.join('experiments', 'a', 'b', 'c') 62 | ``` 63 | 64 | Get a file path for your application by adding the `name` keyword argument. This 65 | is made explicit so PyStow knows which parent directories to automatically 66 | create. This works with `pystow` or any module you create with `pystow.module`. 67 | 68 | ```python 69 | import pystow 70 | 71 | # Get a file path (as a pathlib.Path) for ~/.data/indra/database/database.tsv 72 | indra_database_path = pystow.join('indra', 'database', name='database.tsv') 73 | ``` 74 | 75 | Ensure a file from the internet is available in your application's directory: 76 | 77 | ```python 78 | import pystow 79 | 80 | url = 'https://raw.githubusercontent.com/pykeen/pykeen/master/src/pykeen/datasets/nations/test.txt' 81 | path = pystow.ensure('pykeen', 'datasets', 'nations', url=url) 82 | ``` 83 | 84 | Ensure a tabular data file from the internet and load it for usage (requires 85 | `pip install pandas`): 86 | 87 | ```python 88 | import pystow 89 | import pandas as pd 90 | 91 | url = 'https://raw.githubusercontent.com/pykeen/pykeen/master/src/pykeen/datasets/nations/test.txt' 92 | df: pd.DataFrame = pystow.ensure_csv('pykeen', 'datasets', 'nations', url=url) 93 | ``` 94 | 95 | Ensure a comma-separated tabular data file from the internet and load it for 96 | usage (requires `pip install pandas`): 97 | 98 | ```python 99 | import pystow 100 | import pandas as pd 101 | 102 | url = 'https://raw.githubusercontent.com/cthoyt/pystow/main/tests/resources/test_1.csv' 103 | df: pd.DataFrame = pystow.ensure_csv('pykeen', 'datasets', 'nations', url=url, read_csv_kwargs=dict(sep=",")) 104 | ``` 105 | 106 | Ensure an RDF file from the internet and load it for usage (requires 107 | `pip install rdflib`): 108 | 109 | ```python 110 | import pystow 111 | import rdflib 112 | 113 | url = 'https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz' 114 | rdf_graph: rdflib.Graph = pystow.ensure_rdf('rhea', url=url) 115 | ``` 116 | 117 | Also see `pystow.ensure_excel()`, `pystow.ensure_rdf()`, 118 | `pystow.ensure_zip_df()`, and `pystow.ensure_tar_df()`. 
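For example, a minimal sketch of using `pystow.ensure_zip_df()` to download a
zip archive and load a tabular file from inside it in one step (the URL and
inner path below are hypothetical placeholders, and `pip install pandas` is
required):

```python
import pandas as pd
import pystow

# hypothetical URL of a zip archive that contains a TSV file
url = 'https://example.org/datasets/nations.zip'
# the path inside the archive to the tabular file you want
inner_path = 'nations/train.txt'
df: pd.DataFrame = pystow.ensure_zip_df('pykeen', 'datasets', url=url, inner_path=inner_path)
```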
119 | 120 | If your data comes with a lot of different files in an archive, you can ensure 121 | the archive is downloaded and get specific files from it: 122 | 123 | ```python 124 | import numpy as np 125 | import pystow 126 | 127 | url = "https://cloud.enterprise.informatik.uni-leipzig.de/index.php/s/LHPbMCre7SLqajB/download/MultiKE_D_Y_15K_V1.zip" 128 | # the path inside the archive to the file you want 129 | inner_path = "MultiKE/D_Y_15K_V1/721_5fold/1/20210219183115/ent_embeds.npy" 130 | with pystow.ensure_open_zip("kiez", url=url, inner_path=inner_path) as file: 131 | emb = np.load(file) 132 | ``` 133 | 134 | Also see `pystow.module.ensure_open_lzma()`, 135 | `pystow.module.ensure_open_tarfile()`, and `pystow.module.ensure_open_gz()`. 136 | 137 | ## ⚙️️ Configuration 138 | 139 | By default, data is stored in the `$HOME/.data` directory, and the 140 | `<app>` app will create the `$HOME/.data/<app>` folder. 141 | 142 | If you want to use a folder name other than `.data` inside the home 143 | directory, you can set the `PYSTOW_NAME` environment variable. For example, if 144 | you set `PYSTOW_NAME=mydata`, then the following code for the `pykeen` app will 145 | create the `$HOME/mydata/pykeen/` directory: 146 | 147 | ```python 148 | import os 149 | import pystow 150 | 151 | # Only for demonstration purposes. You should set environment 152 | # variables either with your .bashrc or in the command line REPL. 153 | os.environ['PYSTOW_NAME'] = 'mydata' 154 | 155 | # Get a directory (as a pathlib.Path) for ~/mydata/pykeen 156 | pykeen_directory = pystow.join('pykeen') 157 | ``` 158 | 159 | If you want to specify a completely custom directory that isn't relative to your 160 | home directory, you can set the `PYSTOW_HOME` environment variable. For example, 161 | if you set `PYSTOW_HOME=/usr/local/`, then the following code for the `pykeen` 162 | app will create the `/usr/local/pykeen/` directory: 163 | 164 | ```python 165 | import os 166 | import pystow 167 | 168 | # Only for demonstration purposes. You should set environment 169 | # variables either with your .bashrc or in the command line REPL. 170 | os.environ['PYSTOW_HOME'] = '/usr/local/' 171 | 172 | # Get a directory (as a pathlib.Path) for /usr/local/pykeen 173 | pykeen_directory = pystow.join('pykeen') 174 | ``` 175 | 176 | Note: if you set `PYSTOW_HOME`, then `PYSTOW_NAME` is disregarded. 177 | 178 | ### X Desktop Group (XDG) Compatibility 179 | 180 | While PyStow's main goal is to make application data less opaque and less 181 | hidden, some users might want to use the 182 | [XDG specifications](http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html) 183 | for storing their app data. 184 | 185 | If you set the environment variable `PYSTOW_USE_APPDIRS` to `true` or `True`, 186 | then the [`appdirs`](https://pypi.org/project/appdirs/) package will be used to 187 | choose the base directory based on the `user data dir` option. This can still be 188 | overridden by `PYSTOW_HOME`. 
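For example, a minimal sketch in the same style as the examples above; note
that the exact base directory chosen by `appdirs` is platform-dependent (the
`~/.local/share` path in the comment assumes Linux):

```python
import os
import pystow

# Only for demonstration purposes. You should set environment
# variables either with your .bashrc or in the command line REPL.
os.environ['PYSTOW_USE_APPDIRS'] = 'true'

# Get a directory (as a pathlib.Path) under the platform's user data
# directory, e.g., ~/.local/share/pykeen on Linux
pykeen_directory = pystow.join('pykeen')
```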
189 | 190 | ## 🚀 Installation 191 | 192 | The most recent release can be installed from 193 | [PyPI](https://pypi.org/project/pystow/) with uv: 194 | 195 | ```console 196 | $ uv pip install pystow 197 | ``` 198 | 199 | or with pip: 200 | 201 | ```console 202 | $ python3 -m pip install pystow 203 | ``` 204 | 205 | The most recent code and data can be installed directly from GitHub with uv: 206 | 207 | ```console 208 | $ uv --preview pip install git+https://github.com/cthoyt/pystow.git 209 | ``` 210 | 211 | or with pip: 212 | 213 | ```console 214 | $ UV_PREVIEW=1 python3 -m pip install git+https://github.com/cthoyt/pystow.git 215 | ``` 216 | 217 | Note that this requires `UV_PREVIEW` mode to be enabled until the uv build 218 | backend becomes a stable feature. 219 | 220 | ## 👐 Contributing 221 | 222 | Contributions, whether filing an issue, making a pull request, or forking, are 223 | appreciated. See 224 | [CONTRIBUTING.md](https://github.com/cthoyt/pystow/blob/master/.github/CONTRIBUTING.md) 225 | for more information on getting involved. 226 | 227 | ## 👋 Attribution 228 | 229 | ### ⚖️ License 230 | 231 | The code in this package is licensed under the MIT License. 232 | 233 | ### 🍪 Cookiecutter 234 | 235 | This package was created with 236 | [@audreyfeldroy](https://github.com/audreyfeldroy)'s 237 | [cookiecutter](https://github.com/cookiecutter/cookiecutter) package using 238 | [@cthoyt](https://github.com/cthoyt)'s 239 | [cookiecutter-snekpack](https://github.com/cthoyt/cookiecutter-snekpack) 240 | template. 241 | 242 | ## 🛠️ For Developers 243 | 244 | 
245 | 246 | 247 | The final section of the README is for those who want to get involved by making a 248 | code contribution. 249 | 250 | ### Development Installation 251 | 252 | To install in development mode, use the following: 253 | 254 | ```console 255 | $ git clone https://github.com/cthoyt/pystow.git 256 | $ cd pystow 257 | $ uv --preview pip install -e . 258 | ``` 259 | 260 | Alternatively, install using pip: 261 | 262 | ```console 263 | $ UV_PREVIEW=1 python3 -m pip install -e . 264 | ``` 265 | 266 | Note that this requires `UV_PREVIEW` mode to be enabled until the uv build 267 | backend becomes a stable feature. 268 | 269 | ### Updating Package Boilerplate 270 | 271 | This project uses `cruft` to keep boilerplate (i.e., configuration, contribution 272 | guidelines, documentation configuration) up-to-date with the upstream 273 | cookiecutter package. Install cruft with either `uv tool install cruft` or 274 | `python3 -m pip install cruft`, then run: 275 | 276 | ```console 277 | $ cruft update 278 | ``` 279 | 280 | More info on Cruft's update command is available 281 | [here](https://github.com/cruft/cruft?tab=readme-ov-file#updating-a-project). 282 | 283 | ### 🥼 Testing 284 | 285 | After cloning the repository and installing `tox` with 286 | `uv tool install tox --with tox-uv` or `python3 -m pip install tox tox-uv`, the 287 | unit tests in the `tests/` folder can be run reproducibly with: 288 | 289 | ```console 290 | $ tox -e py 291 | ``` 292 | 293 | Additionally, these tests are automatically re-run with each commit in a 294 | [GitHub Action](https://github.com/cthoyt/pystow/actions?query=workflow%3ATests). 295 | 296 | ### 📖 Building the Documentation 297 | 298 | The documentation can be built locally using the following: 299 | 300 | ```console 301 | $ git clone https://github.com/cthoyt/pystow.git 302 | $ cd pystow 303 | $ tox -e docs 304 | $ open docs/build/html/index.html 305 | ``` 306 | 307 | The documentation build automatically installs the package as well as the `docs` extra 308 | specified in the [`pyproject.toml`](pyproject.toml). `sphinx` plugins like 309 | `texext` can be added there. Additionally, they need to be added to the 310 | `extensions` list in [`docs/source/conf.py`](docs/source/conf.py). 311 | 312 | The documentation can be deployed to [ReadTheDocs](https://readthedocs.io) using 313 | [this guide](https://docs.readthedocs.io/en/stable/intro/import-guide.html). The 314 | [`.readthedocs.yml`](.readthedocs.yml) YAML file contains all the configuration 315 | you'll need. You can also set up continuous integration on GitHub to check not 316 | only that Sphinx can build the documentation in an isolated environment (i.e., 317 | with `tox -e docs-test`) but also that 318 | [ReadTheDocs can build it too](https://docs.readthedocs.io/en/stable/pull-requests.html). 319 | 320 | #### Configuring ReadTheDocs 321 | 322 | 1. Log in to ReadTheDocs with your GitHub account to install the integration at 323 | https://readthedocs.org/accounts/login/?next=/dashboard/ 324 | 2. Import your project by navigating to https://readthedocs.org/dashboard/import 325 | then clicking the plus icon next to your repository 326 | 3. You can rename the repository on the next screen using a more stylized name 327 | (e.g., with spaces and capital letters) 328 | 4. Click next, and you're good to go! 
329 | 
330 | ### 📦 Making a Release
331 | 
332 | #### Configuring Zenodo
333 | 
334 | [Zenodo](https://zenodo.org) is a long-term archival system that assigns a DOI
335 | to each release of your package.
336 | 
337 | 1. Log in to Zenodo via GitHub with this link:
338 |    https://zenodo.org/oauth/login/github/?next=%2F. This brings you to a page
339 |    that lists all of your organizations and asks you to approve installing the
340 |    Zenodo app on GitHub. Click "grant" next to any organizations you want to
341 |    enable the integration for, then click the big green "approve" button. This
342 |    step only needs to be done once.
343 | 2. Navigate to https://zenodo.org/account/settings/github/, which lists all of
344 |    your GitHub repositories (both in your username and any organizations you
345 |    enabled). Click the on/off toggle for any relevant repositories. When you
346 |    make a new repository, you'll have to come back to this page.
347 | 
348 | After these steps, you're ready to go! After you make a release on GitHub (steps
349 | for this are below), you can navigate to
350 | https://zenodo.org/account/settings/github/repository/cthoyt/pystow to see the
351 | DOI for the release and a link to the Zenodo record for it.
352 | 
353 | #### Registering with the Python Package Index (PyPI)
354 | 
355 | You only have to do the following steps once.
356 | 
357 | 1. Register for an account on the
358 |    [Python Package Index (PyPI)](https://pypi.org/account/register)
359 | 2. Navigate to https://pypi.org/manage/account and make sure you have verified
360 |    your email address. A verification email might not have been sent by default,
361 |    so you might have to click the "options" dropdown next to your address to get
362 |    to the "re-send verification email" button
363 | 3. 2-Factor authentication has been required for PyPI since the end of 2023 (see this
364 |    [blog post from PyPI](https://blog.pypi.org/posts/2023-05-25-securing-pypi-with-2fa/)).
365 |    This means you have to first issue account recovery codes, then set up
366 |    2-factor authentication
367 | 4. Issue an API token from https://pypi.org/manage/account/token
368 | 
369 | #### Configuring your machine's connection to PyPI
370 | 
371 | You have to do the following steps once per machine.
372 | 
373 | ```console
374 | $ uv tool install keyring
375 | $ keyring set https://upload.pypi.org/legacy/ __token__
376 | $ keyring set https://test.pypi.org/legacy/ __token__
377 | ```
378 | 
379 | Note that this replaces the older workflow that used a `.pypirc` file.
380 | 
381 | #### Uploading to PyPI
382 | 
383 | After installing the package in development mode and installing `tox` with
384 | `uv tool install tox --with tox-uv` or `python3 -m pip install tox tox-uv`, run
385 | the following from the console:
386 | 
387 | ```console
388 | $ tox -e finish
389 | ```
390 | 
391 | This script does the following:
392 | 
393 | 1. Uses [bump-my-version](https://github.com/callowayproject/bump-my-version) to
394 |    switch the version number in the `pyproject.toml`, `CITATION.cff`,
395 |    `src/pystow/version.py`, and [`docs/source/conf.py`](docs/source/conf.py) to
396 |    not have the `-dev` suffix
397 | 2. Packages the code in both a tar archive and a wheel using
398 |    [`uv build`](https://docs.astral.sh/uv/guides/publish/#building-your-package)
399 | 3. Uploads to PyPI using
400 |    [`uv publish`](https://docs.astral.sh/uv/guides/publish/#publishing-your-package)
401 | 4. Pushes to GitHub. You'll need to make a release associated with the commit where the
402 |    version was bumped.
403 | 5. 
Bumps the version to the next patch. If you made big changes and want to bump
404 |    the minor version instead, you can run `tox -e bumpversion -- minor` afterwards.
405 | 
406 | #### Releasing on GitHub
407 | 
408 | 1. Navigate to https://github.com/cthoyt/pystow/releases/new to draft a new
409 |    release
410 | 2. Click the "Choose a Tag" dropdown and select the tag corresponding to the
411 |    release you just made
412 | 3. Click the "Generate Release Notes" button to get a quick outline of recent
413 |    changes. Modify the title and description as you see fit
414 | 4. Click the big green "Publish Release" button
415 | 
416 | This will trigger Zenodo to assign a DOI to your release as well.
417 | 
418 | 
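For those who prefer the command line, the release can also be drafted with the
[GitHub CLI](https://cli.github.com). This is an optional alternative to the web
flow above (not part of the `tox -e finish` workflow), and `v0.7.1` is a
hypothetical placeholder for the tag that was created when the version was bumped:

```console
$ gh release create v0.7.1 --generate-notes
```

Publishing the release this way likewise triggers Zenodo to assign a DOI.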
419 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | 
3 | # You can set these variables from the command line.
4 | SPHINXOPTS    =
5 | SPHINXBUILD   = sphinx-build
6 | SPHINXPROJ    = PyStow
7 | SOURCEDIR     = source
8 | BUILDDIR      = build
9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/source/cli.rst:
--------------------------------------------------------------------------------
1 | Command Line Interface
2 | ======================
3 | pystow automatically installs the command :code:`pystow`. See
4 | :code:`pystow --help` for usage details.
5 | 
6 | .. click:: pystow.cli:main
7 |    :prog: pystow
8 |    :show-nested:
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration file for the Sphinx documentation builder.
3 | 
4 | This file only contains a selection of the most common options. For a
5 | full list see the documentation:
6 | http://www.sphinx-doc.org/en/master/config
7 | 
8 | -- Path setup --------------------------------------------------------------
9 | 
10 | If extensions (or modules to document with autodoc) are in another directory,
11 | add these directories to ``sys.path`` here. If the directory is relative to the
12 | documentation root, use ``os.path.abspath`` to make it absolute, like shown here.
13 | """
14 | 
15 | import os
16 | import re
17 | import sys
18 | from datetime import date
19 | 
20 | sys.path.insert(0, os.path.abspath("../../src"))
21 | 
22 | # -- Project information -----------------------------------------------------
23 | 
24 | project = "pystow"
25 | copyright = f"{date.today().year}, Charles Tapley Hoyt"
26 | author = "Charles Tapley Hoyt"
27 | 
28 | # The full version, including alpha/beta/rc tags.
29 | release = "0.7.1-dev"
30 | 
31 | # The short X.Y version.
32 | parsed_version = re.match(
33 |     r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<release>[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P<build>[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?",
34 |     release,
35 | )
36 | version = parsed_version.expand(r"\g<major>.\g<minor>.\g<patch>")
37 | 
38 | if parsed_version.group("release"):
39 |     tags.add("prerelease")  # noqa:F821
40 | 
41 | 
42 | # See https://about.readthedocs.com/blog/2024/07/addons-by-default/
43 | # Define the canonical URL if you are using a custom domain on Read the Docs
44 | html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "")
45 | 
46 | # See https://about.readthedocs.com/blog/2024/07/addons-by-default/
47 | # Tell Jinja2 templates the build is running on Read the Docs
48 | if os.environ.get("READTHEDOCS", "") == "True":
49 |     if "html_context" not in globals():
50 |         html_context = {}
51 |     html_context["READTHEDOCS"] = True
52 | 
53 | 
54 | # -- General configuration ---------------------------------------------------
55 | 
56 | # If your documentation needs a minimal Sphinx version, state it here.
57 | #
58 | # needs_sphinx = '1.0'
59 | 
60 | # If true, the current module name will be prepended to all description
61 | # unit titles (such as .. function::).
62 | add_module_names = False
63 | 
64 | # A list of prefixes that are ignored when creating the module index. (new in Sphinx 0.6)
65 | modindex_common_prefix = ["pystow."]
66 | 
67 | # Add any Sphinx extension module names here, as strings. They can be
68 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
69 | # ones.
70 | extensions = [
71 |     "sphinx.ext.autosummary",
72 |     "sphinx.ext.autodoc",
73 |     "sphinx.ext.coverage",
74 |     "sphinx.ext.intersphinx",
75 |     "sphinx.ext.todo",
76 |     "sphinx.ext.mathjax",
77 |     "sphinx.ext.viewcode",
78 |     "sphinx_automodapi.automodapi",
79 |     "sphinx_automodapi.smart_resolver",
80 | ]
81 | 
82 | 
83 | extensions.append("sphinx_click.ext")
84 | 
85 | 
86 | # generate autosummary pages
87 | autosummary_generate = True
88 | 
89 | # Add any paths that contain templates here, relative to this directory.
90 | templates_path = ["_templates"]
91 | 
92 | # The suffix(es) of source filenames.
93 | # You can specify multiple suffixes as a list of strings:
94 | #
95 | # source_suffix = ['.rst', '.md']
96 | source_suffix = {
97 |     ".rst": "restructuredtext",
98 | }
99 | 
100 | # The master toctree document.
101 | master_doc = "index"
102 | 
103 | # The language for content autogenerated by Sphinx. Refer to documentation
104 | # for a list of supported languages.
105 | #
106 | # This is also used if you do content translation via gettext catalogs.
107 | # Usually you set "language" from the command line for these cases.
108 | language = "en"
109 | 
110 | # List of patterns, relative to source directory, that match files and
111 | # directories to ignore when looking for source files.
112 | # This pattern also affects html_static_path and html_extra_path.
113 | exclude_patterns = []
114 | 
115 | # The name of the Pygments (syntax highlighting) style to use.
116 | pygments_style = "sphinx"
117 | 
118 | # -- Options for HTML output -------------------------------------------------
119 | 
120 | # The theme to use for HTML and HTML Help pages. See the documentation for
121 | # a list of builtin themes.
122 | #
123 | html_theme = "sphinx_rtd_theme"
124 | 
125 | # Theme options are theme-specific and customize the look and feel of a theme
126 | # further. For a list of options available for each theme, see the
127 | # documentation.
128 | #
129 | # html_theme_options = {}
130 | 
131 | # Add any paths that contain custom static files (such as style sheets) here,
132 | # relative to this directory. They are copied after the builtin static files,
133 | # so a file named "default.css" will overwrite the builtin "default.css".
134 | # html_static_path = ['_static']
135 | 
136 | # Custom sidebar templates, must be a dictionary that maps document names
137 | # to template names.
138 | #
139 | # The default sidebars (for documents that don't match any pattern) are
140 | # defined by the theme itself. Builtin themes are using these templates by
141 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
142 | # 'searchbox.html']``.
143 | #
144 | # html_sidebars = {}
145 | 
146 | # The name of an image file (relative to this directory) to place at the top
147 | # of the sidebar.
148 | #
149 | if os.path.exists("logo.png"):
150 |     html_logo = "logo.png"
151 | 
152 | # -- Options for HTMLHelp output ---------------------------------------------
153 | 
154 | # Output file base name for HTML help builder.
155 | htmlhelp_basename = "pystow_doc"
156 | 
157 | # -- Options for LaTeX output ------------------------------------------------
158 | 
159 | # latex_elements = {
160 | #     The paper size ('letterpaper' or 'a4paper').
161 | #
162 | #     'papersize': 'letterpaper',
163 | #
164 | #     The font size ('10pt', '11pt' or '12pt').
165 | #
166 | #     'pointsize': '10pt',
167 | #
168 | #     Additional stuff for the LaTeX preamble.
169 | #
170 | #     'preamble': '',
171 | #
172 | #     Latex figure (float) alignment
173 | #
174 | #     'figure_align': 'htbp',
175 | # }
176 | 
177 | # Grouping the document tree into LaTeX files. List of tuples
178 | # (source start file, target name, title,
179 | #  author, documentclass [howto, manual, or own class]).
180 | # latex_documents = [
181 | #     (
182 | #         master_doc,
183 | #         'pystow.tex',
184 | #         'PyStow Documentation',
185 | #         author,
186 | #         'manual',
187 | #     ),
188 | # ]
189 | 
190 | # -- Options for manual page output ------------------------------------------
191 | 
192 | # One entry per manual page. List of tuples
193 | # (source start file, name, description, authors, manual section).
194 | man_pages = [
195 |     (
196 |         master_doc,
197 |         "pystow",
198 |         "PyStow Documentation",
199 |         [author],
200 |         1,
201 |     ),
202 | ]
203 | 
204 | # -- Options for Texinfo output ----------------------------------------------
205 | 
206 | # Grouping the document tree into Texinfo files. List of tuples
207 | # (source start file, target name, title, author,
208 | #  dir menu entry, description, category)
209 | texinfo_documents = [
210 |     (
211 |         master_doc,
212 |         "pystow",
213 |         "PyStow Documentation",
214 |         author,
215 |         "Charles Tapley Hoyt",
216 |         "Easily pick a place to store data for your Python code",
217 |         "Miscellaneous",
218 |     ),
219 | ]
220 | 
221 | # -- Options for Epub output -------------------------------------------------
222 | 
223 | # Bibliographic Dublin Core info.
224 | # epub_title = project
225 | 
226 | # The unique identifier of the text. This can be an ISBN
227 | # or the project homepage.
228 | #
229 | # epub_identifier = ''
230 | 
231 | # A unique identification for the text.
232 | #
233 | # epub_uid = ''
234 | 
235 | # A list of files that should not be packed into the epub file.
236 | # epub_exclude_files = ['search.html']
237 | 
238 | # -- Extension configuration -------------------------------------------------
239 | 
240 | # -- Options for intersphinx extension ---------------------------------------
241 | 
242 | # Example configuration for intersphinx: refer to the Python standard library.
243 | # Note: don't add trailing slashes, since sphinx adds "/objects.inv" to the end
244 | intersphinx_mapping = {
245 |     "python": ("https://docs.python.org/3", None),
246 |     "rdflib": ("https://rdflib.readthedocs.io/en/latest", None),
247 |     "pandas": ("https://pandas.pydata.org/docs", None),
248 |     "sklearn": ("https://scikit-learn.org/stable", None),
249 |     "numpy": ("https://numpy.org/doc/stable", None),
250 |     "scipy": ("https://docs.scipy.org/doc/scipy", None),
251 | }
252 | 
253 | autoclass_content = "both"
254 | 
255 | # Don't sort alphabetically, explained at:
256 | # https://stackoverflow.com/questions/37209921/python-how-not-to-sort-sphinx-output-in-alphabetical-order
257 | autodoc_member_order = "bysource"
258 | 
259 | todo_include_todos = True
260 | todo_emit_warnings = True
261 | 
262 | # Output SVG inheritance diagrams
263 | graphviz_output_format = "svg"
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | PyStow |release| Documentation
2 | ==============================
3 | If you've ever written the following few lines of code, :mod:`pystow` is for you:
4 | 
5 | .. code-block:: python
6 | 
7 |     import os
8 |     home = os.path.expanduser('~')
9 |     project_name = 'adeft'
10 |     envvar_name = f'{project_name.upper()}_HOME'
11 |     if envvar_name in os.environ:
12 |         ADEFT_HOME = os.environ[envvar_name]
13 |     else:
14 |         ADEFT_HOME = os.path.join(home, f'.{project_name}')
15 |     os.makedirs(ADEFT_HOME, exist_ok=True)
16 | 
17 | Many projects (let's use `Adeft `_ as an example) create a folder in the home
18 | directory as a dot-file such as ``$HOME/.adeft``. I found that I had so many of these that I started grouping
19 | them inside a ``$HOME/.data`` folder. It's also the case that every time you create one of these folders,
20 | you need to ensure its existence.
21 | 
22 | :mod:`pystow` takes care of these things. You can replace the previous code with:
23 | 
24 | .. code-block:: python
25 | 
26 |     import pystow
27 |     ADEFT_HOME = pystow.join('adeft')
28 | 
29 | First, it takes the name of the module, uppercases it, and appends ``_HOME`` to it (e.g., ``ADEFT_HOME``),
30 | then looks in the environment. If this variable is available, it uses that as the directory. It ensures it
31 | exists, then returns a :class:`pathlib.Path` pointing to it.
32 | 
33 | If ``ADEFT_HOME`` (or more generally, ``<MODULE NAME>_HOME``) is not available in the environment, it picks the
34 | path as ``$HOME/.data/<module name>``. Normally, ``$HOME`` is specified by your OS. However, if you want to
35 | pick another location to stick the data, you can override the use of ``$HOME`` by setting ``$PYSTOW_HOME`` in
36 | the environment.
37 | 
38 | If you want to go more directories deep inside the adeft default directory, you can just keep using more
39 | positional arguments (the same semantics as :func:`os.path.join`). These directories automatically
40 | get created as well.
41 | 
42 | .. code-block:: python
43 | 
44 |     >>> import pystow
45 |     >>> from pathlib import Path
46 |     >>> # already set somewhere
47 |     >>> __version__ = ...
48 |     >>> ADEFT_VERSION_HOME: Path = pystow.join('adeft', __version__)
49 | 
50 | .. 
toctree::
51 |    :maxdepth: 2
52 |    :caption: Getting Started
53 |    :name: start
54 | 
55 |    installation
56 |    usage
57 |    utils
58 |    cli
59 | 
60 | Indices and Tables
61 | ------------------
62 | * :ref:`genindex`
63 | * :ref:`modindex`
64 | * :ref:`search`
--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 | The most recent release can be installed from
4 | `PyPI <https://pypi.org/project/pystow/>`_ with uv:
5 | 
6 | .. code-block:: console
7 | 
8 |     $ uv pip install pystow
9 | 
10 | or with pip:
11 | 
12 | .. code-block:: console
13 | 
14 |     $ python3 -m pip install pystow
15 | 
16 | Installing from git
17 | -------------------
18 | The most recent code and data can be installed directly from GitHub with uv:
19 | 
20 | .. code-block:: console
21 | 
22 |     $ uv --preview pip install git+https://github.com/cthoyt/pystow.git
23 | 
24 | or with pip:
25 | 
26 | .. code-block:: console
27 | 
28 |     $ UV_PREVIEW=1 python3 -m pip install git+https://github.com/cthoyt/pystow.git
29 | 
30 | .. note::
31 | 
32 |     The ``UV_PREVIEW`` environment variable is required to be
33 |     set until the uv build backend becomes a stable feature.
34 | 
35 | Installing for development
36 | --------------------------
37 | To install in development mode with uv:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ git clone https://github.com/cthoyt/pystow.git
42 |     $ cd pystow
43 |     $ uv --preview pip install -e .
44 | 
45 | or with pip:
46 | 
47 | .. code-block:: console
48 | 
49 |     $ UV_PREVIEW=1 python3 -m pip install -e .
50 | 
51 | Configuration
52 | =============
53 | By default, data is stored in the ``$HOME/.data`` directory. By default, the ``<module name>`` app will create the
54 | ``$HOME/.data/<module name>`` folder.
55 | 
56 | If you want to use an alternate folder name to ``.data`` inside the home directory, you can set the ``PYSTOW_NAME``
57 | environment variable. For example, if you set ``PYSTOW_NAME=mydata``, then the following code for the ``pykeen`` app
58 | will create the ``$HOME/mydata/pykeen/`` directory:
59 | 
60 | .. code-block:: python
61 | 
62 |     import os
63 |     import pystow
64 | 
65 |     # Only for demonstration purposes. You should set environment
66 |     # variables either with your .bashrc or in the command line REPL.
67 |     os.environ['PYSTOW_NAME'] = 'mydata'
68 | 
69 |     # Get a directory (as a pathlib.Path) for ~/mydata/pykeen
70 |     pykeen_directory = pystow.join('pykeen')
71 | 
72 | 
73 | If you want to specify a completely custom directory that isn't relative to your home directory, you can set
74 | the ``PYSTOW_HOME`` environment variable. For example, if you set ``PYSTOW_HOME=/usr/local/``, then the following code
75 | for the ``pykeen`` app will create the ``/usr/local/pykeen/`` directory:
76 | 
77 | .. code-block:: python
78 | 
79 |     import os
80 |     import pystow
81 | 
82 |     # Only for demonstration purposes. You should set environment
83 |     # variables either with your .bashrc or in the command line REPL.
84 |     os.environ['PYSTOW_HOME'] = '/usr/local/'
85 | 
86 |     # Get a directory (as a pathlib.Path) for /usr/local/pykeen
87 |     pykeen_directory = pystow.join('pykeen')
88 | 
89 | 
90 | .. warning:: If you set ``PYSTOW_HOME``, then ``PYSTOW_NAME`` is disregarded.
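Because ``PYSTOW_HOME`` takes precedence, setting both variables has the same effect as setting
``PYSTOW_HOME`` alone. The following minimal sketch (reusing the illustrative ``pykeen`` app from
above) demonstrates the precedence:

.. code-block:: python

    import os
    import pystow

    # Only for demonstration purposes. You should set environment
    # variables either with your .bashrc or in the command line REPL.
    os.environ['PYSTOW_NAME'] = 'mydata'  # disregarded, because...
    os.environ['PYSTOW_HOME'] = '/usr/local/'  # ...this takes precedence

    # Get a directory (as a pathlib.Path) for /usr/local/pykeen,
    # *not* /usr/local/mydata/pykeen and *not* $HOME/mydata/pykeen
    pykeen_directory = pystow.join('pykeen')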
91 | 92 | X Desktop Group (XDG) Compatibility 93 | ----------------------------------- 94 | While PyStow's main goal is to make application data less opaque and less 95 | hidden, some users might want to use the 96 | `XDG specifications `_ 97 | for storing their app data. 98 | 99 | If you set the environment variable ``PYSTOW_USE_APPDIRS`` to ``true`` or ``True``, then the 100 | `appdirs `_ package will be used to choose 101 | the base directory based on the ``user data dir`` option. 102 | 103 | .. warning:: If you use this setting, make sure you first do ``pip install appdirs`` 104 | 105 | .. note:: This can still be overridden by ``PYSTOW_HOME``. 106 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ===== 3 | .. automodapi:: pystow 4 | :no-inheritance-diagram: 5 | :no-heading: 6 | :headings: -- 7 | :skip: Module 8 | :no-main-docstr: 9 | 10 | .. automodapi:: pystow.impl 11 | :no-inheritance-diagram: 12 | :no-heading: 13 | :headings: -- 14 | :no-main-docstr: 15 | -------------------------------------------------------------------------------- /docs/source/utils.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | .. automodapi:: pystow.utils 4 | :no-inheritance-diagram: 5 | :no-heading: 6 | :headings: -- 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["uv_build>=0.6.6,<1.0"] 3 | build-backend = "uv_build" 4 | 5 | [project] 6 | name = "pystow" 7 | version = "0.7.1-dev" 8 | description = "Easily pick a place to store data for your Python code" 9 | readme = "README.md" 10 | authors = [ 11 | { name = "Charles Tapley Hoyt", email = "cthoyt@gmail.com" } 12 | ] 13 | maintainers = [ 14 | { name = "Charles Tapley Hoyt", email = "cthoyt@gmail.com" } 15 | ] 16 | 17 | # See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#classifiers 18 | # Search tags using the controlled vocabulary at https://pypi.org/classifiers 19 | classifiers = [ 20 | "Development Status :: 5 - Production/Stable", 21 | "Environment :: Console", 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: OS Independent", 25 | "Framework :: Pytest", 26 | "Framework :: tox", 27 | "Framework :: Sphinx", 28 | "Natural Language :: English", 29 | "Programming Language :: Python", 30 | "Programming Language :: Python :: 3.9", 31 | "Programming Language :: Python :: 3.10", 32 | "Programming Language :: Python :: 3.11", 33 | "Programming Language :: Python :: 3.12", 34 | "Programming Language :: Python :: 3.13", 35 | "Programming Language :: Python :: 3 :: Only", 36 | "Typing :: Typed", 37 | ] 38 | keywords = [ 39 | "snekpack", # please keep this keyword to credit the cookiecutter-snekpack template 40 | "cookiecutter", 41 | "caching", 42 | "file management" 43 | ] 44 | 45 | # License Information. 
46 | # See PEP-639 at https://peps.python.org/pep-0639/#add-license-files-key 47 | license-files = [ 48 | "LICENSE", 49 | ] 50 | 51 | requires-python = ">=3.9" 52 | dependencies = [ 53 | "click", 54 | "requests", 55 | "tqdm", 56 | "typing-extensions", 57 | ] 58 | 59 | [project.optional-dependencies] 60 | tests = [ 61 | "pytest", 62 | "coverage[toml]", 63 | "requests_file", 64 | ] 65 | docs = [ 66 | "sphinx>=8", 67 | "sphinx-rtd-theme>=3.0", 68 | "sphinx-click", 69 | "sphinx_automodapi", 70 | ] 71 | rdf = [ 72 | "rdflib", 73 | ] 74 | xml = [ 75 | "lxml", 76 | ] 77 | pandas = [ 78 | "pandas", 79 | ] 80 | aws = [ 81 | "boto3", 82 | ] 83 | 84 | # See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#urls 85 | # and also https://packaging.python.org/en/latest/specifications/well-known-project-urls/ 86 | [project.urls] 87 | "Bug Tracker" = "https://github.com/cthoyt/pystow/issues" 88 | Homepage = "https://github.com/cthoyt/pystow" 89 | Repository = "https://github.com/cthoyt/pystow.git" 90 | Documentation = "https://pystow.readthedocs.io" 91 | Funding = "https://github.com/sponsors/cthoyt" 92 | 93 | [project.scripts] 94 | pystow = "pystow.cli:main" 95 | 96 | [tool.cruft] 97 | skip = [ 98 | "**/__init__.py", 99 | "tests/*" 100 | ] 101 | 102 | # MyPy, see https://mypy.readthedocs.io/en/stable/config_file.html 103 | [tool.mypy] 104 | plugins = [ 105 | ] 106 | 107 | # Doc8, see https://doc8.readthedocs.io/en/stable/readme.html#ini-file-usage 108 | [tool.doc8] 109 | max-line-length = 120 110 | 111 | # Pytest, see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml 112 | [tool.pytest.ini_options] 113 | markers = [ 114 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 115 | ] 116 | 117 | # Coverage, see https://coverage.readthedocs.io/en/latest/config.html 118 | [tool.coverage.run] 119 | branch = true 120 | source = [ 121 | "pystow", 122 | ] 123 | omit = [ 124 | "tests/*", 125 | "docs/*", 126 | ] 127 | 128 | [tool.coverage.paths] 129 | source = [ 130 | "src/pystow", 131 | ".tox/*/lib/python*/site-packages/pystow", 132 | ] 133 | 134 | [tool.coverage.report] 135 | show_missing = true 136 | exclude_lines = [ 137 | "pragma: no cover", 138 | "raise NotImplementedError", 139 | "if __name__ == \"__main__\":", 140 | "if TYPE_CHECKING:", 141 | "def __str__", 142 | "def __repr__", 143 | ] 144 | 145 | [tool.ruff] 146 | line-length = 100 147 | extend-include = ["*.ipynb"] 148 | 149 | [tool.ruff.lint] 150 | # See https://docs.astral.sh/ruff/rules 151 | extend-select = [ 152 | "F", # pyflakes 153 | "E", # pycodestyle errors 154 | "W", # pycodestyle warnings 155 | "C90", # mccabe 156 | "I", # isort 157 | "UP", # pyupgrade 158 | "D", # pydocstyle 159 | "DOC", # pydoclint 160 | "B", # bugbear 161 | "S", # bandit 162 | "T20", # print 163 | "N", # pep8 naming 164 | "ERA", # eradicate commented out code 165 | "NPY", # numpy checks 166 | "RUF", # ruff rules 167 | "C4", # comprehensions 168 | ] 169 | ignore = [ 170 | "D105", # Missing docstring in magic method 171 | "E203", # Black conflicts with the following 172 | "S301", # yolo pickle 173 | "S320", # yolo lxml 174 | ] 175 | 176 | 177 | # See https://docs.astral.sh/ruff/settings/#per-file-ignores 178 | [tool.ruff.lint.per-file-ignores] 179 | # Ignore security issues in the version.py, which are inconsistent 180 | "src/pystow/version.py" = ["S603", "S607"] 181 | # Ignore commented out code in Sphinx configuration file 182 | "docs/source/conf.py" = ["ERA001"] 183 | # Prints are okay in notebooks 184 | 
"notebooks/**/*.ipynb" = ["T201"] 185 | 186 | [tool.ruff.lint.pydocstyle] 187 | convention = "pep257" 188 | 189 | [tool.ruff.lint.isort] 190 | relative-imports-order = "closest-to-furthest" 191 | known-third-party = [ 192 | "tqdm", 193 | ] 194 | known-first-party = [ 195 | "pystow", 196 | "tests", 197 | ] 198 | 199 | [tool.ruff.format] 200 | # see https://docs.astral.sh/ruff/settings/#format_docstring-code-format 201 | docstring-code-format = true 202 | 203 | [tool.bumpversion] 204 | current_version = "0.7.1-dev" 205 | parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(?:-(?P[0-9A-Za-z-]+(?:\\.[0-9A-Za-z-]+)*))?(?:\\+(?P[0-9A-Za-z-]+(?:\\.[0-9A-Za-z-]+)*))?" 206 | serialize = [ 207 | "{major}.{minor}.{patch}-{release}+{build}", 208 | "{major}.{minor}.{patch}+{build}", 209 | "{major}.{minor}.{patch}-{release}", 210 | "{major}.{minor}.{patch}", 211 | ] 212 | commit = true 213 | tag = false 214 | 215 | [tool.bumpversion.parts.release] 216 | optional_value = "production" 217 | first_value = "dev" 218 | values = [ 219 | "dev", 220 | "production", 221 | ] 222 | 223 | [[tool.bumpversion.files]] 224 | filename = "pyproject.toml" 225 | search = "version = \"{current_version}\"" 226 | replace = "version = \"{new_version}\"" 227 | 228 | [[tool.bumpversion.files]] 229 | filename = "docs/source/conf.py" 230 | search = "release = \"{current_version}\"" 231 | replace = "release = \"{new_version}\"" 232 | 233 | [[tool.bumpversion.files]] 234 | filename = "src/pystow/version.py" 235 | search = "VERSION = \"{current_version}\"" 236 | replace = "VERSION = \"{new_version}\"" 237 | -------------------------------------------------------------------------------- /src/pystow/__init__.py: -------------------------------------------------------------------------------- 1 | """PyStow: Easily pick a place to store data for your python package.""" 2 | 3 | from .api import ( 4 | dump_df, 5 | dump_json, 6 | dump_pickle, 7 | dump_rdf, 8 | dump_xml, 9 | ensure, 10 | ensure_csv, 11 | ensure_custom, 12 | ensure_excel, 13 | ensure_from_google, 14 | ensure_from_s3, 15 | ensure_gunzip, 16 | ensure_json, 17 | ensure_json_bz2, 18 | ensure_open, 19 | ensure_open_bz2, 20 | ensure_open_gz, 21 | ensure_open_lzma, 22 | ensure_open_sqlite, 23 | ensure_open_sqlite_gz, 24 | ensure_open_tarfile, 25 | ensure_open_zip, 26 | ensure_pickle, 27 | ensure_pickle_gz, 28 | ensure_rdf, 29 | ensure_tar_df, 30 | ensure_tar_xml, 31 | ensure_untar, 32 | ensure_xml, 33 | ensure_zip_df, 34 | ensure_zip_np, 35 | join, 36 | joinpath_sqlite, 37 | load_df, 38 | load_json, 39 | load_pickle, 40 | load_pickle_gz, 41 | load_rdf, 42 | load_xml, 43 | module, 44 | open, 45 | open_gz, 46 | ) 47 | from .config_api import ConfigError, get_config, write_config 48 | from .impl import Module, VersionHint 49 | from .utils import ensure_readme 50 | 51 | __all__ = [ 52 | "ConfigError", 53 | "Module", 54 | "VersionHint", 55 | "dump_df", 56 | "dump_json", 57 | "dump_pickle", 58 | "dump_rdf", 59 | "dump_xml", 60 | "ensure", 61 | "ensure_csv", 62 | "ensure_custom", 63 | "ensure_excel", 64 | "ensure_from_google", 65 | "ensure_from_s3", 66 | "ensure_gunzip", 67 | "ensure_json", 68 | "ensure_json_bz2", 69 | "ensure_open", 70 | "ensure_open_bz2", 71 | "ensure_open_gz", 72 | "ensure_open_lzma", 73 | "ensure_open_sqlite", 74 | "ensure_open_sqlite_gz", 75 | "ensure_open_tarfile", 76 | "ensure_open_zip", 77 | "ensure_pickle", 78 | "ensure_pickle_gz", 79 | "ensure_rdf", 80 | "ensure_tar_df", 81 | "ensure_tar_xml", 82 | "ensure_untar", 83 | "ensure_xml", 84 | "ensure_zip_df", 85 | 
"ensure_zip_np", 86 | "get_config", 87 | "join", 88 | "joinpath_sqlite", 89 | "load_df", 90 | "load_json", 91 | "load_pickle", 92 | "load_pickle_gz", 93 | "load_rdf", 94 | "load_xml", 95 | "module", 96 | "open", 97 | "open_gz", 98 | "write_config", 99 | ] 100 | 101 | ensure_readme() 102 | 103 | del ensure_readme 104 | -------------------------------------------------------------------------------- /src/pystow/__main__.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Command line interface for PyStow.""" 4 | 5 | from .cli import main 6 | 7 | if __name__ == "__main__": 8 | main() 9 | -------------------------------------------------------------------------------- /src/pystow/cache.py: -------------------------------------------------------------------------------- 1 | """Utilities for caching files.""" 2 | 3 | from __future__ import annotations 4 | 5 | import functools 6 | import json 7 | import logging 8 | import pickle 9 | from abc import ABC, abstractmethod 10 | from collections.abc import MutableMapping 11 | from pathlib import Path 12 | from typing import ( 13 | TYPE_CHECKING, 14 | Any, 15 | Callable, 16 | Generic, 17 | TypeVar, 18 | Union, 19 | cast, 20 | ) 21 | 22 | if TYPE_CHECKING: 23 | import pandas as pd 24 | 25 | __all__ = [ 26 | # Classes 27 | "Cached", 28 | "CachedCollection", 29 | "CachedDataFrame", 30 | "CachedJSON", 31 | "CachedPickle", 32 | # Types 33 | "Getter", 34 | ] 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | JSONType = Union[ 39 | dict[str, Any], 40 | list[Any], 41 | ] 42 | 43 | X = TypeVar("X") 44 | Getter = Callable[[], X] 45 | 46 | 47 | class Cached(Generic[X], ABC): 48 | """Caching decorator.""" 49 | 50 | def __init__( 51 | self, 52 | path: str | Path, 53 | *, 54 | force: bool = False, 55 | cache: bool = True, 56 | ) -> None: 57 | """Instantiate the decorator. 58 | 59 | :param path: The path to the cache for the file 60 | :param cache: Should caching be done? Defaults to true, turn off for debugging purposes 61 | :param force: Should a pre-existing file be disregared/overwritten? 62 | """ 63 | self.path = Path(path) 64 | self.force = force 65 | self.cache = cache 66 | 67 | def __call__(self, func: Getter[X]) -> Getter[X]: 68 | """Apply this instance as a decorator. 69 | 70 | :param func: The function to wrap 71 | :return: A wrapped function 72 | """ 73 | 74 | @functools.wraps(func) 75 | def _wrapped() -> X: 76 | if not self.cache: 77 | return func() 78 | 79 | if self.path.is_file() and not self.force: 80 | return self.load() 81 | logger.debug("no cache found at %s", self.path) 82 | rv = func() 83 | logger.debug("writing cache to %s", self.path) 84 | self.dump(rv) 85 | return rv 86 | 87 | return _wrapped 88 | 89 | @abstractmethod 90 | def load(self) -> X: 91 | """Load data from the cache (typically by opening a file at the given path).""" 92 | 93 | @abstractmethod 94 | def dump(self, rv: X) -> None: 95 | """Dump data to the cache (typically by opening a file at the given path). 96 | 97 | :param rv: The data to dump 98 | """ 99 | 100 | 101 | class CachedJSON(Cached[JSONType]): 102 | """Make a function lazily cache its return value as JSON.""" 103 | 104 | def load(self) -> JSONType: 105 | """Load data from the cache as JSON. 106 | 107 | :returns: A python object with JSON-like data from the cache 108 | """ 109 | with open(self.path) as file: 110 | return cast(JSONType, json.load(file)) 111 | 112 | def dump(self, rv: JSONType) -> None: 113 | """Dump data to the cache as JSON. 
114 | 
115 |         :param rv: The JSON data to dump
116 |         """
117 |         with open(self.path, "w") as file:
118 |             json.dump(rv, file, indent=2)
119 | 
120 | 
121 | class CachedPickle(Cached[Any]):
122 |     """Make a function lazily cache its return value as a pickle."""
123 | 
124 |     def load(self) -> Any:
125 |         """Load data from the cache as a pickle.
126 | 
127 |         :returns: A python object loaded from the cache
128 |         """
129 |         with open(self.path, "rb") as file:
130 |             return pickle.load(file)
131 | 
132 |     def dump(self, rv: Any) -> None:
133 |         """Dump data to the cache as a pickle.
134 | 
135 |         :param rv: The arbitrary python object to dump
136 |         """
137 |         with open(self.path, "wb") as file:
138 |             pickle.dump(rv, file, protocol=pickle.HIGHEST_PROTOCOL)
139 | 
140 | 
141 | class CachedCollection(Cached[list[str]]):
142 |     """Make a function lazily cache its return value as a file."""
143 | 
144 |     def load(self) -> list[str]:
145 |         """Load data from the cache as a list of strings.
146 | 
147 |         :returns: A list of strings loaded from the cache
148 |         """
149 |         with open(self.path) as file:
150 |             return [line.strip() for line in file]
151 | 
152 |     def dump(self, rv: list[str]) -> None:
153 |         """Dump data to the cache as a list of strings.
154 | 
155 |         :param rv: The list of strings to dump
156 |         """
157 |         with open(self.path, "w") as file:
158 |             for line in rv:
159 |                 print(line, file=file)
160 | 
161 | 
162 | class CachedDataFrame(Cached["pd.DataFrame"]):
163 |     """Make a function lazily cache its return value as a dataframe."""
164 | 
165 |     def __init__(
166 |         self,
167 |         path: str | Path,
168 |         cache: bool = True,
169 |         force: bool = False,
170 |         sep: str | None = None,
171 |         dtype: Any | None = None,
172 |         read_csv_kwargs: MutableMapping[str, Any] | None = None,
173 |     ) -> None:
174 |         """Instantiate the decorator.
175 | 
176 |         :param path: The path to the cache for the file
177 |         :param force: Should a pre-existing file be disregarded/overwritten?
178 |         :param sep: The separator. Defaults to TSV, since this is the only reasonable default.
179 |         :param dtype: A shortcut for setting the dtype
180 |         :param read_csv_kwargs: Additional kwargs to pass to :func:`pd.read_csv`.
181 |         :raises ValueError: if sep is given as a kwarg and also in ``read_csv_kwargs``.
182 |         """
183 |         super().__init__(path=path, cache=cache, force=force)
184 |         self.read_csv_kwargs = read_csv_kwargs or {}
185 |         if "sep" not in self.read_csv_kwargs:
186 |             self.sep = sep or "\t"
187 |         elif sep is not None:
188 |             raise ValueError("sep given both as a keyword argument and in read_csv_kwargs")
189 |         else:
190 |             self.sep = self.read_csv_kwargs.pop("sep")
191 |         if dtype is not None:
192 |             if "dtype" in self.read_csv_kwargs:
193 |                 raise ValueError("dtype given both as a keyword argument and in read_csv_kwargs")
194 |             self.read_csv_kwargs["dtype"] = dtype
195 |         self.read_csv_kwargs.setdefault("keep_default_na", False)
196 | 
197 |     def load(self) -> pd.DataFrame:
198 |         """Load data from the cache as a dataframe.
199 | 
200 |         :returns: A dataframe loaded from the cache.
201 |         """
202 |         import pandas as pd
203 | 
204 |         return pd.read_csv(
205 |             self.path,
206 |             sep=self.sep,
207 |             **self.read_csv_kwargs,
208 |         )
209 | 
210 |     def dump(self, rv: pd.DataFrame) -> None:
211 |         """Dump data to the cache as a dataframe.
212 | 
213 |         :param rv: The dataframe to dump
214 |         """
215 |         rv.to_csv(self.path, sep=self.sep, index=False)
216 | 
--------------------------------------------------------------------------------
/src/pystow/cli.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # flake8: noqa
3 | # type: ignore
4 | 
5 | """Command line interface for PyStow."""
6 | 
7 | from __future__ import annotations
8 | 
9 | import os
10 | from typing import Optional, Sequence
11 | 
12 | import click
13 | 
14 | 
15 | @click.group()
16 | def main() -> None:
17 |     """Run the PyStow CLI."""
18 | 
19 | 
20 | @main.command()
21 | @click.argument("keys", nargs=-1)
22 | @click.option("--name")
23 | def join(keys: Sequence[str], name: Optional[str]):
24 |     """Print a joined directory path."""
25 |     from . import api
26 | 
27 |     click.echo(api.join(*keys, name=name))
28 | 
29 | 
30 | @main.command()
31 | @click.argument("keys", nargs=-1)
32 | def ls(keys: Sequence[str]):
33 |     """List a directory."""
34 |     from . import api
35 | 
36 |     directory = api.join(*keys)
37 |     _ls(directory)
38 | 
39 | 
40 | @main.command()
41 | @click.argument("keys", nargs=-1)
42 | @click.option("--url", required=True)
43 | @click.option("--name")
44 | @click.option("--force", is_flag=True)
45 | def ensure(keys: Sequence[str], url: str, name: Optional[str], force: bool):
46 |     """Ensure a file is downloaded."""
47 |     from . import api
48 | 
49 |     path = api.ensure(*keys, url=url, name=name, force=force)
50 |     _ls(path.parent)
51 | 
52 | 
53 | def _ls(directory):
54 |     command = f"ls -al {directory}"
55 |     click.secho(f"[pystow] {command}", fg="cyan", bold=True)
56 |     os.system(command)  # noqa:S605
57 | 
58 | 
59 | @main.command(name="set")
60 | @click.argument("module")
61 | @click.argument("key")
62 | @click.argument("value")
63 | def set_config(module: str, key: str, value: str):
64 |     """Set a configuration value."""
65 |     from .config_api import write_config
66 | 
67 |     write_config(module, key, value)
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     main()
72 | 
--------------------------------------------------------------------------------
/src/pystow/config_api.py:
--------------------------------------------------------------------------------
1 | """Configuration handling."""
2 | 
3 | from __future__ import annotations
4 | 
5 | import os
6 | from configparser import ConfigParser
7 | from functools import lru_cache
8 | from pathlib import Path
9 | from textwrap import dedent
10 | from typing import Any, Callable, TypeVar
11 | 
12 | from .utils import getenv_path
13 | 
14 | __all__ = [
15 |     "get_config",
16 |     "write_config",
17 | ]
18 | 
19 | X = TypeVar("X")
20 | 
21 | CONFIG_NAME_ENVVAR = "PYSTOW_CONFIG_NAME"
22 | CONFIG_HOME_ENVVAR = "PYSTOW_CONFIG_HOME"
23 | CONFIG_NAME_DEFAULT = ".config"
24 | 
25 | 
26 | class ConfigError(ValueError):
27 |     """Raised when configuration cannot be looked up."""
28 | 
29 |     def __init__(self, module: str, key: str):
30 |         """Initialize the configuration error.
31 | 
32 |         :param module: Name of the module, e.g., ``bioportal``
33 |         :param key: Name of the key inside the module, e.g., ``api_key``
34 |         """
35 |         self.module = module
36 |         self.key = key
37 | 
38 |     def __str__(self) -> str:
39 |         path = get_home().joinpath(self.module).with_suffix(".ini")
40 |         return dedent(
41 |             f"""\
42 |             Could not look up {self.module}/{self.key} and no default given.
43 | 
44 |             This can be solved with one of the following:
45 | 
46 |             1. 
Set the {self.module.upper()}_{self.key.upper()} environment variable
47 | 
48 |                - Windows, via GUI: https://www.computerhope.com/issues/ch000549.htm
49 |                - Windows, via CLI: https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/set_1
50 |                - Mac OS: https://apple.stackexchange.com/questions/106778/how-do-i-set-environment-variables-on-os-x
51 |                - Linux: https://www.freecodecamp.org/news/how-to-set-an-environment-variable-in-linux/
52 | 
53 |             2. Use the PyStow CLI from the command line to
54 |                set the configuration like so:
55 | 
56 |                    $ pystow set {self.module} {self.key} <value>
57 | 
58 |                This creates an INI file in {path}
59 |                with the configuration in the right place.
60 | 
61 |             3. Create/edit an INI file in {path} and manually
62 |                fill it in by 1) creating a section inside it called [{self.module}]
63 |                and 2) setting a value for {self.key} = <value> that looks like:
64 | 
65 |                    # {path}
66 |                    [{self.module}]
67 |                    {self.key} = <value>
68 | 
69 |             See https://github.com/cthoyt/pystow#%EF%B8%8F%EF%B8%8F-configuration for more information.
70 |             """
71 |         )
72 | 
73 | 
74 | def get_name() -> str:
75 |     """Get the config home directory name.
76 | 
77 |     :returns: The name of the pystow home directory, either loaded from
78 |         the :data:`CONFIG_NAME_ENVVAR` environment variable or given by the default
79 |         value :data:`CONFIG_NAME_DEFAULT`.
80 |     """
81 |     return os.getenv(CONFIG_NAME_ENVVAR, default=CONFIG_NAME_DEFAULT)
82 | 
83 | 
84 | def get_home(ensure_exists: bool = True) -> Path:
85 |     """Get the config home directory.
86 | 
87 |     :param ensure_exists: If true, ensures the directory is created
88 |     :returns: A path object representing the pystow home directory, as one of:
89 | 
90 |         1. :data:`CONFIG_HOME_ENVVAR` environment variable or
91 |         2. The default directory constructed in the user's home directory plus what's
92 |            returned by :func:`get_name`.
93 |     """
94 |     default = Path.home().joinpath(get_name()).expanduser()
95 |     return getenv_path(CONFIG_HOME_ENVVAR, default, ensure_exists=ensure_exists)
96 | 
97 | 
98 | @lru_cache(maxsize=1)
99 | def _get_cfp(module: str) -> ConfigParser:
100 |     cfp = ConfigParser()
101 |     directory = get_home()
102 | 
103 |     # If a multi-part module was given like "zenodo:sandbox",
104 |     # then only look for the first part "zenodo" as the file name
105 |     if ":" in module:
106 |         module = module.split(":", 1)[0]
107 | 
108 |     filenames = [
109 |         os.path.join(directory, "config.cfg"),
110 |         os.path.join(directory, "config.ini"),
111 |         os.path.join(directory, "pystow.cfg"),
112 |         os.path.join(directory, "pystow.ini"),
113 |         os.path.join(directory, f"{module}.cfg"),
114 |         os.path.join(directory, f"{module}.ini"),
115 |         os.path.join(directory, module, f"{module}.cfg"),
116 |         os.path.join(directory, module, f"{module}.ini"),
117 |         os.path.join(directory, module, "conf.ini"),
118 |         os.path.join(directory, module, "config.ini"),
119 |         os.path.join(directory, module, "conf.cfg"),
120 |         os.path.join(directory, module, "config.cfg"),
121 |     ]
122 |     cfp.read(filenames)
123 |     return cfp
124 | 
125 | 
126 | def get_config(
127 |     module: str,
128 |     key: str,
129 |     *,
130 |     passthrough: X | None = None,
131 |     default: X | None = None,
132 |     dtype: type[X] | None = None,
133 |     raise_on_missing: bool = False,
134 | ) -> Any:
135 |     """Get a configuration value.
136 | 
137 |     :param module: Name of the module (e.g., ``pybel``) to get configuration for
138 |     :param key: Name of the key (e.g., ``connection``)
139 |     :param passthrough: If this is not none, it is returned directly (after optional type casting)
140 |     :param default: If the environment and configuration files don't contain anything,
141 |         this is returned.
142 |     :param dtype: The datatype to parse out. Can either be :func:`int`, :func:`float`,
143 |         :func:`bool`, or :func:`str`. If none, defaults to :func:`str`.
144 |     :param raise_on_missing: If true, will raise a value error if no data is found and no default
145 |         is given
146 |     :returns: The config value or the default.
147 |     :raises ConfigError: If ``raise_on_missing`` conditions are met
148 |     """
149 |     if passthrough is not None:
150 |         return _cast(passthrough, dtype)
151 |     rv = os.getenv(f"{module.upper()}_{key.upper()}")
152 |     if rv is not None:
153 |         return _cast(rv, dtype)
154 |     rv = _get_cfp(module).get(module, key, fallback=None)
155 |     if rv is None:
156 |         if default is None and raise_on_missing:
157 |             raise ConfigError(module=module, key=key)
158 |         return default
159 |     return _cast(rv, dtype)
160 | 
161 | 
162 | def _cast(rv: Any, dtype: None | Callable[..., Any]) -> Any:
163 |     if not isinstance(rv, str):  # if it's not a string, it doesn't need munging
164 |         return rv
165 |     if dtype in (None, str):  # no munging necessary
166 |         return rv
167 |     if dtype in (int, float):
168 |         return dtype(rv)
169 |     if dtype is bool:
170 |         if rv.lower() in ("t", "true", "yes", "1", 1, True):
171 |             return True
172 |         elif rv.lower() in ("f", "false", "no", "0", 0, False):
173 |             return False
174 |         else:
175 |             raise ValueError(f"value cannot be coerced into bool: {rv}")
176 |     raise TypeError(f"dtype is invalid: {dtype}")
177 | 
178 | 
179 | def write_config(module: str, key: str, value: str) -> None:
180 |     """Write a configuration value.
181 | 
182 |     :param module: The name of the app (e.g., ``indra``)
183 |     :param key: The key of the configuration in the app
184 |     :param value: The value of the configuration in the app
185 |     """
186 |     _get_cfp.cache_clear()
187 |     cfp = ConfigParser()
188 | 
189 |     # If there's a multi-part module such as "zenodo:sandbox",
190 |     # then write to zenodo.ini with section [zenodo:sandbox]
191 |     fname = module.split(":", 1)[0] if ":" in module else module
192 | 
193 |     path = get_home().joinpath(fname).with_suffix(".ini")
194 |     cfp.read(path)
195 | 
196 |     # If the file did not exist, then this section will be empty
197 |     # and running set() would raise a configparser.NoSectionError.
198 |     if not cfp.has_section(module):
199 |         cfp.add_section(module)
200 | 
201 |     # Note that the section duplicates the file name
202 |     cfp.set(section=module, option=key, value=value)
203 | 
204 |     with path.open("w") as file:
205 |         cfp.write(file)
206 | 
--------------------------------------------------------------------------------
/src/pystow/constants.py:
--------------------------------------------------------------------------------
1 | """PyStow constants."""
2 | 
3 | from __future__ import annotations
4 | 
5 | from collections.abc import Generator
6 | from io import StringIO
7 | from textwrap import dedent
8 | from typing import IO, Any, Callable
9 | 
10 | __all__ = [
11 |     "JSON",
12 |     "PYSTOW_HOME_ENVVAR",
13 |     "PYSTOW_NAME_DEFAULT",
14 |     "PYSTOW_NAME_ENVVAR",
15 |     "PYSTOW_USE_APPDIRS",
16 |     "README_TEXT",
17 |     "Opener",
18 |     "Provider",
19 | ]
20 | 
21 | PYSTOW_NAME_ENVVAR = "PYSTOW_NAME"
22 | PYSTOW_HOME_ENVVAR = "PYSTOW_HOME"
23 | PYSTOW_USE_APPDIRS = "PYSTOW_USE_APPDIRS"
24 | PYSTOW_NAME_DEFAULT = ".data"
25 | README_TEXT = dedent(
26 |     """\
27 |     # PyStow Data Directory
28 | 
29 |     This directory is used by [`pystow`](https://github.com/cthoyt/pystow) as a
30 |     reproducible location to store and access data.
31 | 
32 |     ### ⚙️️ Configuration
33 | 
34 |     By default, data is stored in the `$HOME/.data` directory. By default, the `<module name>`
35 |     app will create the `$HOME/.data/<module name>` folder.
36 | 
37 |     If you want to use an alternate folder name to `.data` inside the home directory,
38 |     you can set the `PYSTOW_NAME` environment variable. For example, if you set
39 |     `PYSTOW_NAME=mydata`, then the following code for the `pykeen` app will
40 |     create the `$HOME/mydata/pykeen/` directory:
41 | 
42 |     ```python
43 |     import os
44 |     import pystow
45 | 
46 |     # Only for demonstration purposes. You should set environment
47 |     # variables either with your .bashrc or in the command line REPL.
48 |     os.environ['PYSTOW_NAME'] = 'mydata'
49 | 
50 |     # Get a directory (as a pathlib.Path) for ~/mydata/pykeen
51 |     pykeen_directory = pystow.join('pykeen')
52 |     ```
53 | 
54 |     If you want to specify a completely custom directory that isn't relative to
55 |     your home directory, you can set the `PYSTOW_HOME` environment variable. For
56 |     example, if you set `PYSTOW_HOME=/usr/local/`, then the following code for
57 |     the `pykeen` app will create the `/usr/local/pykeen/` directory:
58 | 
59 |     ```python
60 |     import os
61 |     import pystow
62 | 
63 |     # Only for demonstration purposes. You should set environment
64 |     # variables either with your .bashrc or in the command line REPL.
65 |     os.environ['PYSTOW_HOME'] = '/usr/local/'
66 | 
67 |     # Get a directory (as a pathlib.Path) for /usr/local/pykeen
68 |     pykeen_directory = pystow.join('pykeen')
69 |     ```
70 | 
71 |     Note: if you set `PYSTOW_HOME`, then `PYSTOW_NAME` is disregarded.
72 | """ 73 | ) 74 | 75 | Opener = Generator[StringIO, None, None] 76 | BytesOpener = Generator[IO[bytes], None, None] 77 | JSON = Any 78 | Provider = Callable[..., None] 79 | -------------------------------------------------------------------------------- /src/pystow/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cthoyt/pystow/80249d83c684cb15ce05b0c83e10d45c22b966d7/src/pystow/py.typed -------------------------------------------------------------------------------- /src/pystow/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities.""" 2 | 3 | from __future__ import annotations 4 | 5 | import contextlib 6 | import csv 7 | import gzip 8 | import hashlib 9 | import logging 10 | import lzma 11 | import os 12 | import pickle 13 | import shutil 14 | import tarfile 15 | import tempfile 16 | import typing 17 | import urllib.error 18 | import zipfile 19 | from collections.abc import Collection, Generator, Iterable, Iterator, Mapping 20 | from functools import partial 21 | from io import BytesIO, StringIO 22 | from pathlib import Path, PurePosixPath 23 | from subprocess import check_output 24 | from typing import ( 25 | TYPE_CHECKING, 26 | Any, 27 | Literal, 28 | NamedTuple, 29 | TextIO, 30 | cast, 31 | ) 32 | from urllib.parse import urlparse 33 | from urllib.request import urlretrieve 34 | from uuid import uuid4 35 | 36 | import requests 37 | from tqdm.auto import tqdm 38 | from typing_extensions import TypeAlias 39 | 40 | from .constants import ( 41 | PYSTOW_HOME_ENVVAR, 42 | PYSTOW_NAME_DEFAULT, 43 | PYSTOW_NAME_ENVVAR, 44 | PYSTOW_USE_APPDIRS, 45 | README_TEXT, 46 | ) 47 | 48 | if TYPE_CHECKING: 49 | import _csv 50 | 51 | import botocore.client 52 | import lxml.etree 53 | import numpy.typing 54 | import pandas 55 | import rdflib 56 | 57 | __all__ = [ 58 | "DownloadBackend", 59 | "Hash", 60 | "HexDigestError", 61 | "HexDigestMismatch", 62 | "UnexpectedDirectory", 63 | "UnexpectedDirectoryError", 64 | "download", 65 | "download_from_google", 66 | "download_from_s3", 67 | "get_base", 68 | "get_commit", 69 | "get_df_io", 70 | "get_hashes", 71 | "get_hexdigests_remote", 72 | "get_home", 73 | "get_name", 74 | "get_np_io", 75 | "get_offending_hexdigests", 76 | "getenv_path", 77 | "gunzip", 78 | "mkdir", 79 | "mock_envvar", 80 | "mock_home", 81 | "n", 82 | "name_from_s3_key", 83 | "name_from_url", 84 | "path_to_sqlite", 85 | "raise_on_digest_mismatch", 86 | "read_rdf", 87 | "read_tarfile_csv", 88 | "read_tarfile_xml", 89 | "read_zip_np", 90 | "read_zipfile_csv", 91 | "read_zipfile_rdf", 92 | "read_zipfile_xml", 93 | "safe_open", 94 | "safe_open_writer", 95 | "write_lzma_csv", 96 | "write_pickle_gz", 97 | "write_tarfile_csv", 98 | "write_zipfile_csv", 99 | "write_zipfile_np", 100 | "write_zipfile_xml", 101 | ] 102 | 103 | logger = logging.getLogger(__name__) 104 | 105 | #: Represents an available backend for downloading 106 | DownloadBackend: TypeAlias = Literal["urllib", "requests"] 107 | 108 | #: This type alias uses a stub-only constructor, meaning that 109 | #: hashlib._Hash isn't actually part of the code, but MyPy injects it 110 | #: so we can do type checking 111 | Hash: TypeAlias = "hashlib._Hash" 112 | 113 | 114 | class HexDigestMismatch(NamedTuple): 115 | """Contains information about a hexdigest mismatch.""" 116 | 117 | #: the name of the algorithm 118 | name: str 119 | #: the observed/actual hexdigest, encoded as a string 120 | actual: str 121 | #: the expected 
hexdigest, encoded as a string 122 | expected: str 123 | 124 | 125 | class HexDigestError(ValueError): 126 | """Thrown if the hashsums do not match expected hashsums.""" 127 | 128 | def __init__(self, offending_hexdigests: Collection[HexDigestMismatch]): 129 | """Instantiate the exception. 130 | 131 | :param offending_hexdigests: The result from :func:`get_offending_hexdigests` 132 | """ 133 | self.offending_hexdigests = offending_hexdigests 134 | 135 | def __str__(self) -> str: 136 | return "\n".join( 137 | ( 138 | "Hexdigest of downloaded file does not match the expected ones!", 139 | *( 140 | f"\t{name} actual: {actual} vs. expected: {expected}" 141 | for name, actual, expected in self.offending_hexdigests 142 | ), 143 | ) 144 | ) 145 | 146 | 147 | class UnexpectedDirectoryError(FileExistsError): 148 | """Thrown if a directory path is given where file path should have been.""" 149 | 150 | def __init__(self, path: Path): 151 | """Instantiate the exception. 152 | 153 | :param path: The path to a directory that should have been a file. 154 | """ 155 | self.path = path 156 | 157 | def __str__(self) -> str: 158 | return f"got directory instead of file: {self.path}" 159 | 160 | 161 | #: Backwards compatible name 162 | UnexpectedDirectory = UnexpectedDirectoryError 163 | 164 | 165 | def get_hexdigests_remote( 166 | hexdigests_remote: Mapping[str, str] | None, hexdigests_strict: bool = False 167 | ) -> Mapping[str, str]: 168 | """Process hexdigests via URLs. 169 | 170 | :param hexdigests_remote: 171 | The expected hexdigests as (algorithm_name, url to file with expected hex digest) pairs. 172 | :param hexdigests_strict: 173 | Set this to `False` to stop automatically checking for the `algorithm(filename)=hash` format 174 | :returns: 175 | A mapping of algorithms to hexdigests 176 | """ 177 | rv = {} 178 | for key, url in (hexdigests_remote or {}).items(): 179 | text = requests.get(url, timeout=15).text 180 | if not hexdigests_strict and "=" in text: 181 | text = text.rsplit("=", 1)[-1].strip() 182 | rv[key] = text 183 | return rv 184 | 185 | 186 | def get_offending_hexdigests( 187 | path: str | Path, 188 | chunk_size: int | None = None, 189 | hexdigests: Mapping[str, str] | None = None, 190 | hexdigests_remote: Mapping[str, str] | None = None, 191 | hexdigests_strict: bool = False, 192 | ) -> Collection[HexDigestMismatch]: 193 | """ 194 | Check a file for hash sums. 195 | 196 | :param path: 197 | The file path. 198 | :param chunk_size: 199 | The chunk size for reading the file. 200 | :param hexdigests: 201 | The expected hexdigests as (algorithm_name, expected_hex_digest) pairs. 202 | :param hexdigests_remote: 203 | The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs. 204 | :param hexdigests_strict: 205 | Set this to false to stop automatically checking for the `algorithm(filename)=hash` format 206 | 207 | :return: 208 | A collection of observed / expected hexdigests where the digests do not match. 
209 |     """
210 |     hexdigests = dict(
211 |         **(hexdigests or {}),
212 |         **get_hexdigests_remote(hexdigests_remote, hexdigests_strict=hexdigests_strict),
213 |     )
214 | 
215 |     # If there aren't any keys in the combined dictionaries,
216 |     # then there won't be any mismatches
217 |     if not hexdigests:
218 |         return []
219 | 
220 |     logger.info(f"Checking hash sums for file: {path}")
221 | 
222 |     # instantiate algorithms
223 |     algorithms = get_hashes(path=path, names=set(hexdigests), chunk_size=chunk_size)
224 | 
225 |     # Compare digests
226 |     mismatches = []
227 |     for alg, expected_digest in hexdigests.items():
228 |         observed_digest = algorithms[alg].hexdigest()
229 |         if observed_digest != expected_digest:
230 |             logger.error(f"{alg} expected {expected_digest} but got {observed_digest}.")
231 |             mismatches.append(HexDigestMismatch(alg, observed_digest, expected_digest))
232 |         else:
233 |             logger.debug(f"Successfully checked with {alg}.")
234 | 
235 |     return mismatches
236 | 
237 | 
238 | def get_hashes(
239 |     path: str | Path,
240 |     names: Iterable[str],
241 |     *,
242 |     chunk_size: int | None = None,
243 | ) -> Mapping[str, Hash]:
244 |     """Calculate digests for several hash algorithms over a file in a single pass.
245 | 
246 |     :param path: The file path.
247 |     :param names: Names of the hash algorithms in :mod:`hashlib`
248 |     :param chunk_size: The chunk size for reading the file.
249 | 
250 |     :return:
251 |         A mapping from algorithm name to its :mod:`hashlib` hash object
252 |     """
253 |     path = Path(path).resolve()
254 |     if chunk_size is None:
255 |         chunk_size = 64 * 2**10
256 | 
257 |     # instantiate hash algorithms
258 |     algorithms: Mapping[str, Hash] = {name: hashlib.new(name) for name in names}
259 | 
260 |     # calculate hash sums of file incrementally
261 |     buffer = memoryview(bytearray(chunk_size))
262 |     with path.open("rb", buffering=0) as file:
263 |         for this_chunk_size in iter(lambda: file.readinto(buffer), 0):
264 |             for alg in algorithms.values():
265 |                 alg.update(buffer[:this_chunk_size])
266 | 
267 |     return algorithms
268 | 
269 | 
270 | def raise_on_digest_mismatch(
271 |     *,
272 |     path: Path,
273 |     hexdigests: Mapping[str, str] | None = None,
274 |     hexdigests_remote: Mapping[str, str] | None = None,
275 |     hexdigests_strict: bool = False,
276 | ) -> None:
277 |     """Raise a HexDigestError if the digests do not match.
278 | 
279 |     :param path:
280 |         The file path.
281 |     :param hexdigests:
282 |         The expected hexdigests as (algorithm_name, expected_hex_digest) pairs.
283 |     :param hexdigests_remote:
284 |         The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs.
285 |     :param hexdigests_strict:
286 |         Set this to false to stop automatically checking for the `algorithm(filename)=hash` format
287 | 
288 |     :raises HexDigestError: if there are any offending hex digests
289 | 
290 |     """
291 |     offending_hexdigests = get_offending_hexdigests(
292 |         path=path,
293 |         hexdigests=hexdigests,
294 |         hexdigests_remote=hexdigests_remote,
295 |         hexdigests_strict=hexdigests_strict,
296 |     )
297 |     if offending_hexdigests:
298 |         raise HexDigestError(offending_hexdigests)
299 | 
300 | 
301 | class TqdmReportHook(tqdm):  # type:ignore
302 |     """A custom progress bar that can be used with urllib.
303 | 
304 |     Based on https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5
305 |     """
306 | 
307 |     def update_to(
308 |         self,
309 |         blocks: int = 1,
310 |         block_size: int = 1,
311 |         total_size: int | None = None,
312 |     ) -> None:
313 |         """Update the internal state based on a urllib report hook.
314 | 
315 |         :param blocks: Number of blocks transferred so far
316 |         :param block_size: Size of each block (in tqdm units)
317 |         :param total_size: Total size (in tqdm units). If None (the default), the total remains unchanged.
318 |         """
319 |         if total_size is not None:
320 |             self.total = total_size
321 |         self.update(blocks * block_size - self.n)  # will also set self.n = blocks * block_size
322 | 
323 | 
324 | def download(
325 |     url: str,
326 |     path: str | Path,
327 |     force: bool = True,
328 |     clean_on_failure: bool = True,
329 |     backend: DownloadBackend = "urllib",
330 |     hexdigests: Mapping[str, str] | None = None,
331 |     hexdigests_remote: Mapping[str, str] | None = None,
332 |     hexdigests_strict: bool = False,
333 |     progress_bar: bool = True,
334 |     tqdm_kwargs: Mapping[str, Any] | None = None,
335 |     **kwargs: Any,
336 | ) -> None:
337 |     """Download a file from a given URL.
338 | 
339 |     :param url: URL to download
340 |     :param path: Path to download the file to
341 |     :param force: If false and the file already exists, will not re-download.
342 |     :param clean_on_failure: If true, will delete the file on any exception raised during download
343 |     :param backend: The downloader to use. Choose 'urllib' or 'requests'
344 |     :param hexdigests:
345 |         The expected hexdigests as (algorithm_name, expected_hex_digest) pairs.
346 |     :param hexdigests_remote:
347 |         The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs.
348 |     :param hexdigests_strict:
349 |         Set this to `True` to disable automatic detection of the `algorithm(filename)=hash` format
350 |     :param progress_bar:
351 |         Set to true to show a progress bar while downloading
352 |     :param tqdm_kwargs:
353 |         Override the default arguments passed to :class:`tqdm.tqdm` when progress_bar is True.
354 |     :param kwargs:
355 |         The keyword arguments to pass to :func:`urllib.request.urlretrieve`
356 |         or to `requests.get` depending on the backend chosen. If using the 'requests' backend,
357 |         `stream` is set to True by default.
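
    Example (a sketch; the URL, destination, and digest below are hypothetical):

    >>> download(  # doctest: +SKIP
    ...     url="https://example.com/test.tsv",
    ...     path="test.tsv",
    ...     backend="requests",
    ...     hexdigests={"md5": "0" * 32},
    ... )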
358 | 359 | :raises Exception: Thrown if an error besides a keyboard interrupt is thrown during download 360 | :raises KeyboardInterrupt: If a keyboard interrupt is thrown during download 361 | :raises UnexpectedDirectory: If a directory is given for the ``path`` argument 362 | :raises ValueError: If an invalid backend is chosen 363 | :raises DownloadError: If an error occurs during download 364 | """ 365 | path = Path(path).resolve() 366 | 367 | if path.is_dir(): 368 | raise UnexpectedDirectoryError(path) 369 | if path.is_file() and not force: 370 | raise_on_digest_mismatch( 371 | path=path, 372 | hexdigests=hexdigests, 373 | hexdigests_remote=hexdigests_remote, 374 | hexdigests_strict=hexdigests_strict, 375 | ) 376 | logger.debug("did not re-download %s from %s", path, url) 377 | return 378 | 379 | _tqdm_kwargs = { 380 | "unit": "B", 381 | "unit_scale": True, 382 | "unit_divisor": 1024, 383 | "miniters": 1, 384 | "disable": not progress_bar, 385 | "desc": f"Downloading {path.name}", 386 | "leave": False, 387 | } 388 | if tqdm_kwargs: 389 | _tqdm_kwargs.update(tqdm_kwargs) 390 | 391 | try: 392 | if backend == "urllib": 393 | logger.info("downloading with urllib from %s to %s", url, path) 394 | with TqdmReportHook(**_tqdm_kwargs) as t: 395 | try: 396 | urlretrieve(url, path, reporthook=t.update_to, **kwargs) # noqa:S310 397 | except urllib.error.URLError as e: 398 | raise DownloadError(backend, url, path, e) from e 399 | elif backend == "requests": 400 | kwargs.setdefault("stream", True) 401 | try: 402 | # see https://requests.readthedocs.io/en/master/user/quickstart/#raw-response-content 403 | # pattern from https://stackoverflow.com/a/39217788/5775947 404 | with requests.get(url, **kwargs) as response, path.open("wb") as file: # noqa:S113 405 | logger.info( 406 | "downloading (stream=%s) with requests from %s to %s", 407 | kwargs["stream"], 408 | url, 409 | path, 410 | ) 411 | # Solution for progress bar from https://stackoverflow.com/a/63831344/5775947 412 | total_size = int(response.headers.get("Content-Length", 0)) 413 | # Decompress if needed 414 | response.raw.read = partial( # type:ignore[method-assign] 415 | response.raw.read, decode_content=True 416 | ) 417 | with tqdm.wrapattr( 418 | response.raw, "read", total=total_size, **_tqdm_kwargs 419 | ) as fsrc: 420 | shutil.copyfileobj(fsrc, file) 421 | except requests.exceptions.ConnectionError as e: 422 | raise DownloadError(backend, url, path, e) from e 423 | else: 424 | raise ValueError(f'Invalid backend: {backend}. Use "requests" or "urllib".') 425 | except (Exception, KeyboardInterrupt): 426 | if clean_on_failure: 427 | _unlink(path) 428 | raise 429 | 430 | raise_on_digest_mismatch( 431 | path=path, 432 | hexdigests=hexdigests, 433 | hexdigests_remote=hexdigests_remote, 434 | hexdigests_strict=hexdigests_strict, 435 | ) 436 | 437 | 438 | class DownloadError(OSError): 439 | """An error that wraps information from a requests or urllib download failure.""" 440 | 441 | def __init__( 442 | self, 443 | backend: DownloadBackend, 444 | url: str, 445 | path: Path, 446 | exc: urllib.error.URLError | requests.exceptions.ConnectionError, 447 | ) -> None: 448 | """Initialize the error. 
449 | 450 | :param backend: The backend used 451 | :param url: The url that failed to download 452 | :param path: The path that was supposed to be downloaded to 453 | :param exc: The exception raised 454 | """ 455 | self.backend = backend 456 | self.url = url 457 | self.path = path 458 | self.exc = exc 459 | # TODO parse out HTTP error code, if possible 460 | 461 | def __str__(self) -> str: 462 | return f"Failed with {self.backend} to download {self.url} to {self.path}" 463 | 464 | 465 | def name_from_url(url: str) -> str: 466 | """Get the filename from the end of the URL. 467 | 468 | :param url: A URL 469 | :return: The name of the file at the end of the URL 470 | """ 471 | parse_result = urlparse(url) 472 | path = PurePosixPath(parse_result.path) 473 | name = path.name 474 | return name 475 | 476 | 477 | def base_from_gzip_name(name: str) -> str: 478 | """Get the base name for a file after stripping the gz ending. 479 | 480 | :param name: The name of the gz file 481 | :returns: The cleaned name of the file, with no gz ending 482 | :raises ValueError: if the file does not end with ".gz" 483 | """ 484 | if not name.endswith(".gz"): 485 | raise ValueError(f"Name does not end with .gz: {name}") 486 | return name[: -len(".gz")] 487 | 488 | 489 | def name_from_s3_key(key: str) -> str: 490 | """Get the filename from the S3 key. 491 | 492 | :param key: A S3 path 493 | :returns: The name of the file 494 | """ 495 | return key.split("/")[-1] 496 | 497 | 498 | def mkdir(path: Path, ensure_exists: bool = True) -> None: 499 | """Make a directory (or parent directory if a file is given) if flagged with ``ensure_exists``. 500 | 501 | :param path: The path to a directory 502 | :param ensure_exists: 503 | Should the directories leading to the path be created if they don't already exist? 504 | """ 505 | if ensure_exists: 506 | path.mkdir(exist_ok=True, parents=True) 507 | 508 | 509 | @contextlib.contextmanager 510 | def mock_envvar(envvar: str, value: str) -> Iterator[None]: 511 | """Mock the environment variable then delete it after the test is over. 512 | 513 | :param envvar: The environment variable to mock 514 | :param value: The value to temporarily put in the environment variable 515 | during this mock. 516 | :yield: None, since this just mocks the environment variable for the 517 | time being. 518 | """ 519 | original_value = os.environ.get(envvar) 520 | os.environ[envvar] = value 521 | yield 522 | if original_value is None: 523 | del os.environ[envvar] 524 | else: 525 | os.environ[envvar] = original_value 526 | 527 | 528 | @contextlib.contextmanager 529 | def mock_home() -> Iterator[Path]: 530 | """Mock the PyStow home environment variable, yields the directory name. 531 | 532 | :yield: The path to the temporary directory. 533 | """ 534 | with tempfile.TemporaryDirectory() as directory: 535 | with mock_envvar(PYSTOW_HOME_ENVVAR, directory): 536 | yield Path(directory) 537 | 538 | 539 | def getenv_path(envvar: str, default: Path, ensure_exists: bool = True) -> Path: 540 | """Get an environment variable representing a path, or use the default. 541 | 542 | :param envvar: The environmental variable name to check 543 | :param default: 544 | The default path to return if the environmental variable is not set 545 | :param ensure_exists: 546 | Should the directories leading to the path be created if they don't already exist? 547 | :return: A path either specified by the environmental variable or by the default. 
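
    Example (a minimal sketch; the variable name and paths below are
    hypothetical):

    >>> import os
    >>> from pathlib import Path
    >>> os.environ["MY_APP_HOME"] = "/tmp/my_app"  # doctest: +SKIP
    >>> getenv_path("MY_APP_HOME", Path.home() / "my_app")  # doctest: +SKIP
    PosixPath('/tmp/my_app')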
548 | """ 549 | rv = Path(os.getenv(envvar, default=default)).expanduser() 550 | mkdir(rv, ensure_exists=ensure_exists) 551 | return rv 552 | 553 | 554 | def n() -> str: 555 | """Get a random string for testing. 556 | 557 | :returns: A random string for testing purposes. 558 | """ 559 | return str(uuid4()) 560 | 561 | 562 | def get_df_io(df: pandas.DataFrame, sep: str = "\t", index: bool = False, **kwargs: Any) -> BytesIO: 563 | """Get the dataframe as bytes. 564 | 565 | :param df: A dataframe 566 | :param sep: The separator in the dataframe. Overrides Pandas default to use a tab. 567 | :param index: Should the index be output? Overrides the Pandas default to be false. 568 | :param kwargs: Additional kwargs to pass to :func:`pandas.DataFrame.to_csv`. 569 | :return: A bytes object that can be used as a file. 570 | """ 571 | sio = StringIO() 572 | df.to_csv(sio, sep=sep, index=index, **kwargs) 573 | sio.seek(0) 574 | bio = BytesIO(sio.read().encode("utf-8")) 575 | return bio 576 | 577 | 578 | def get_np_io(arr: numpy.typing.ArrayLike, **kwargs: Any) -> BytesIO: 579 | """Get the numpy object as bytes. 580 | 581 | :param arr: Array-like 582 | :param kwargs: Additional kwargs to pass to :func:`numpy.save`. 583 | :return: A bytes object that can be used as a file. 584 | """ 585 | import numpy as np 586 | 587 | bio = BytesIO() 588 | np.save(bio, arr, **kwargs) 589 | bio.seek(0) 590 | return bio 591 | 592 | 593 | def write_pickle_gz( 594 | obj: Any, 595 | path: str | Path, 596 | **kwargs: Any, 597 | ) -> None: 598 | """Write an object to a gzipped pickle. 599 | 600 | :param obj: The object to write 601 | :param path: The path of the file to write to 602 | :param kwargs: 603 | Additional kwargs to pass to :func:`pickle.dump` 604 | """ 605 | with gzip.open(path, mode="wb") as file: 606 | pickle.dump(obj, file, **kwargs) 607 | 608 | 609 | def write_lzma_csv( 610 | df: pandas.DataFrame, 611 | path: str | Path, 612 | sep: str = "\t", 613 | index: bool = False, 614 | **kwargs: Any, 615 | ) -> None: 616 | """Write a dataframe as an lzma-compressed file. 617 | 618 | :param df: A dataframe 619 | :param path: The path to the resulting LZMA compressed dataframe file 620 | :param sep: The separator in the dataframe. Overrides Pandas default to use a tab. 621 | :param index: Should the index be output? Overrides the Pandas default to be false. 622 | :param kwargs: 623 | Additional kwargs to pass to :func:`get_df_io` and transitively 624 | to :func:`pandas.DataFrame.to_csv`. 625 | """ 626 | bytes_io = get_df_io(df, sep=sep, index=index, **kwargs) 627 | with lzma.open(path, "wb") as file: 628 | file.write(bytes_io.read()) 629 | 630 | 631 | def write_zipfile_csv( 632 | df: pandas.DataFrame, 633 | path: str | Path, 634 | inner_path: str, 635 | sep: str = "\t", 636 | index: bool = False, 637 | **kwargs: Any, 638 | ) -> None: 639 | """Write a dataframe to an inner CSV file to a zip archive. 640 | 641 | :param df: A dataframe 642 | :param path: The path to the resulting zip archive 643 | :param inner_path: The path inside the zip archive to write the dataframe 644 | :param sep: The separator in the dataframe. Overrides Pandas default to use a tab. 645 | :param index: Should the index be output? Overrides the Pandas default to be false. 646 | :param kwargs: 647 | Additional kwargs to pass to :func:`get_df_io` and transitively 648 | to :func:`pandas.DataFrame.to_csv`. 
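
    Example round trip with :func:`read_zipfile_csv` (a sketch; the paths
    below are hypothetical):

    >>> import pandas as pd
    >>> df = pd.DataFrame([("a", 1), ("b", 2)], columns=["key", "value"])
    >>> write_zipfile_csv(df, "archive.zip", "inner.tsv")  # doctest: +SKIP
    >>> read_zipfile_csv("archive.zip", "inner.tsv")  # doctest: +SKIP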
649 |     """
650 |     bytes_io = get_df_io(df, sep=sep, index=index, **kwargs)
651 |     with zipfile.ZipFile(file=path, mode="w") as zip_file:
652 |         with zip_file.open(inner_path, mode="w") as file:
653 |             file.write(bytes_io.read())
654 | 
655 | 
656 | def read_zipfile_csv(
657 |     path: str | Path, inner_path: str, sep: str = "\t", **kwargs: Any
658 | ) -> pandas.DataFrame:
659 |     """Read an inner CSV file from a zip archive.
660 | 
661 |     :param path: The path to the zip archive
662 |     :param inner_path: The path inside the zip archive to the dataframe
663 |     :param sep: The separator in the dataframe. Overrides Pandas default to use a tab.
664 |     :param kwargs: Additional kwargs to pass to :func:`pandas.read_csv`.
665 |     :return: A dataframe
666 |     """
667 |     import pandas as pd
668 | 
669 |     with zipfile.ZipFile(file=path) as zip_file:
670 |         with zip_file.open(inner_path) as file:
671 |             return pd.read_csv(file, sep=sep, **kwargs)
672 | 
673 | 
674 | def write_zipfile_xml(
675 |     element_tree: lxml.etree.ElementTree,
676 |     path: str | Path,
677 |     inner_path: str,
678 |     **kwargs: Any,
679 | ) -> None:
680 |     """Write an XML element tree to an inner XML file in a zip archive.
681 | 
682 |     :param element_tree: An XML element tree
683 |     :param path: The path to the resulting zip archive
684 |     :param inner_path: The path inside the zip archive to write the XML file
685 |     :param kwargs: Additional kwargs to pass to :func:`tostring`
686 |     """
687 |     from lxml import etree
688 | 
689 |     kwargs.setdefault("pretty_print", True)
690 |     with zipfile.ZipFile(file=path, mode="w") as zip_file:
691 |         with zip_file.open(inner_path, mode="w") as file:
692 |             file.write(etree.tostring(element_tree, **kwargs))
693 | 
694 | 
695 | def read_zipfile_xml(path: str | Path, inner_path: str, **kwargs: Any) -> lxml.etree.ElementTree:
696 |     """Read an inner XML file from a zip archive.
697 | 
698 |     :param path: The path to the zip archive
699 |     :param inner_path: The path inside the zip archive to the xml file
700 |     :param kwargs: Additional kwargs to pass to :func:`lxml.etree.parse`
701 |     :return: An XML element tree
702 |     """
703 |     from lxml import etree
704 | 
705 |     with zipfile.ZipFile(file=path) as zip_file:
706 |         with zip_file.open(inner_path) as file:
707 |             return etree.parse(file, **kwargs)
708 | 
709 | 
710 | def write_zipfile_np(
711 |     arr: numpy.typing.ArrayLike,
712 |     path: str | Path,
713 |     inner_path: str,
714 |     **kwargs: Any,
715 | ) -> None:
716 |     """Write a numpy array to an inner file in a zip archive.
717 | 
718 |     :param arr: Array-like
719 |     :param path: The path to the resulting zip archive
720 |     :param inner_path: The path inside the zip archive to write the array
721 |     :param kwargs:
722 |         Additional kwargs to pass to :func:`get_np_io` and transitively
723 |         to :func:`numpy.save`.
724 |     """
725 |     bytes_io = get_np_io(arr, **kwargs)
726 |     with zipfile.ZipFile(file=path, mode="w") as zip_file:
727 |         with zip_file.open(inner_path, mode="w") as file:
728 |             file.write(bytes_io.read())
729 | 
730 | 
731 | def read_zip_np(path: str | Path, inner_path: str, **kwargs: Any) -> numpy.typing.ArrayLike:
732 |     """Read an inner numpy array-like from a zip archive.
733 | 
734 |     :param path: The path to the zip archive
735 |     :param inner_path: The path inside the zip archive to the array
736 |     :param kwargs: Additional kwargs to pass to :func:`numpy.load`.
737 |     :return: A numpy array or other object
738 |     """
739 |     import numpy as np
740 | 
741 |     with zipfile.ZipFile(file=path) as zip_file:
742 |         with zip_file.open(inner_path) as file:
743 |             return cast(np.typing.ArrayLike, np.load(file, **kwargs))
744 | 
745 | 
746 | def read_zipfile_rdf(path: str | Path, inner_path: str, **kwargs: Any) -> rdflib.Graph:
747 |     """Read an inner RDF file from a zip archive.
748 | 
749 |     :param path: The path to the zip archive
750 |     :param inner_path: The path inside the zip archive to the RDF file
751 |     :param kwargs: Additional kwargs to pass to :meth:`rdflib.Graph.parse`.
752 |     :return: A graph
753 |     """
754 |     import rdflib
755 | 
756 |     graph = rdflib.Graph()
757 |     with zipfile.ZipFile(file=path) as zip_file:
758 |         with zip_file.open(inner_path) as file:
759 |             graph.parse(file, **kwargs)
760 |     return graph
761 | 
762 | 
763 | def write_tarfile_csv(
764 |     df: pandas.DataFrame,
765 |     path: str | Path,
766 |     inner_path: str,
767 |     sep: str = "\t",
768 |     index: bool = False,
769 |     **kwargs: Any,
770 | ) -> None:
771 |     """Write a dataframe to an inner CSV file in a tar archive.
772 | 
773 |     :param df: A dataframe
774 |     :param path: The path to the resulting tar archive
775 |     :param inner_path: The path inside the tar archive to write the dataframe
776 |     :param sep: The separator in the dataframe. Overrides Pandas default to use a tab.
777 |     :param index: Should the index be output? Overrides the Pandas default to be false.
778 |     :param kwargs:
779 |         Additional kwargs to pass to :func:`get_df_io` and transitively
780 |         to :func:`pandas.DataFrame.to_csv`.
781 |     """
782 |     s = df.to_csv(sep=sep, index=index, **kwargs).encode("utf-8")  # encode first so the size is in bytes
783 |     tarinfo = tarfile.TarInfo(name=inner_path)
784 |     tarinfo.size = len(s)
785 |     with tarfile.TarFile(path, mode="w") as tar_file:
786 |         tar_file.addfile(tarinfo, BytesIO(s))
787 | 
788 | 
789 | def read_tarfile_csv(
790 |     path: str | Path, inner_path: str, sep: str = "\t", **kwargs: Any
791 | ) -> pandas.DataFrame:
792 |     """Read an inner CSV file from a tar archive.
793 | 
794 |     :param path: The path to the tar archive
795 |     :param inner_path: The path inside the tar archive to the dataframe
796 |     :param sep: The separator in the dataframe. Overrides Pandas default to use a tab.
797 |     :param kwargs: Additional kwargs to pass to :func:`pandas.read_csv`.
798 |     :return: A dataframe
799 |     """
800 |     import pandas as pd
801 | 
802 |     with tarfile.open(path) as tar_file:
803 |         with tar_file.extractfile(inner_path) as file:  # type: ignore
804 |             return pd.read_csv(file, sep=sep, **kwargs)
805 | 
806 | 
807 | def read_tarfile_xml(path: str | Path, inner_path: str, **kwargs: Any) -> lxml.etree.ElementTree:
808 |     """Read an inner XML file from a tar archive.
809 | 
810 |     :param path: The path to the tar archive
811 |     :param inner_path: The path inside the tar archive to the xml file
812 |     :param kwargs: Additional kwargs to pass to :func:`lxml.etree.parse`
813 |     :return: An XML element tree
814 |     """
815 |     from lxml import etree
816 | 
817 |     with tarfile.open(path) as tar_file:
818 |         with tar_file.extractfile(inner_path) as file:  # type: ignore
819 |             return etree.parse(file, **kwargs)
820 | 
821 | 
822 | def read_rdf(path: str | Path, **kwargs: Any) -> rdflib.Graph:
823 |     """Read an RDF file with :mod:`rdflib`.
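
    Paths ending in ``.gz`` are transparently opened with :mod:`gzip` before
    parsing. A sketch of usage (the file name and format below are hypothetical):

    >>> graph = read_rdf("ontology.ttl.gz", format="turtle")  # doctest: +SKIP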
824 | 
825 |     :param path: The path to the RDF file
826 |     :param kwargs: Additional kwargs to pass to :func:`rdflib.Graph.parse`
827 |     :return: A parsed RDF graph
828 |     """
829 |     import rdflib
830 | 
831 |     if isinstance(path, str):
832 |         path = Path(path)
833 |     graph = rdflib.Graph()
834 |     with (
835 |         gzip.open(path, "rb") if isinstance(path, Path) and path.suffix == ".gz" else open(path)
836 |     ) as file:
837 |         graph.parse(file, **kwargs)  # type:ignore
838 |     return graph
839 | 
840 | 
841 | def write_sql(df: pandas.DataFrame, name: str, path: str | Path, **kwargs: Any) -> None:
842 |     """Write a dataframe as a SQL table.
843 | 
844 |     :param df: A dataframe
845 |     :param name: The name of the table in the database to write to
846 |     :param path: The path to the SQLite database file
847 |     :param kwargs: Additional keyword arguments to pass to :meth:`pandas.DataFrame.to_sql`
848 |     """
849 |     import sqlite3
850 | 
851 |     with contextlib.closing(sqlite3.connect(path)) as conn:
852 |         df.to_sql(name, conn, **kwargs)
853 | 
854 | 
855 | def get_commit(org: str, repo: str, provider: str = "git") -> str:
856 |     """Get the last commit hash for the given repo.
857 | 
858 |     :param org: The GitHub organization or owner
859 |     :param repo: The GitHub repository name
860 |     :param provider: The method for getting the most recent commit
861 |     :raises ValueError: if an invalid provider is given
862 |     :returns: A commit hash's hex digest as a string
863 |     """
864 |     if provider == "git":
865 |         output = check_output(["git", "ls-remote", f"https://github.com/{org}/{repo}"])  # noqa
866 |         lines = (line.strip().split("\t") for line in output.decode("utf8").splitlines())
867 |         rv = next(line[0] for line in lines if line[1] == "HEAD")
868 |     elif provider == "github":
869 |         res = requests.get(f"https://api.github.com/repos/{org}/{repo}/branches/master", timeout=15)
870 |         res_json = res.json()
871 |         rv = res_json["commit"]["sha"]
872 |     else:
873 |         raise ValueError(f"invalid provider: {provider}")
874 |     return rv
875 | 
876 | 
877 | CHUNK_SIZE = 32768
878 | DOWNLOAD_URL = "https://docs.google.com/uc?export=download"
879 | TOKEN_KEY = "download_warning"  # noqa:S105
880 | 
881 | 
882 | def download_from_google(
883 |     file_id: str,
884 |     path: str | Path,
885 |     force: bool = True,
886 |     clean_on_failure: bool = True,
887 |     hexdigests: Mapping[str, str] | None = None,
888 | ) -> None:
889 |     """Download a file from Google Drive.
890 | 
891 |     Implementation inspired by https://github.com/ndrplz/google-drive-downloader.
892 | 
893 |     :param file_id: The google file identifier
894 |     :param path: The place to write the file
895 |     :param force: If false and the file already exists, will not re-download.
896 |     :param clean_on_failure: If true, will delete the file on any exception raised during download
897 |     :param hexdigests:
898 |         The expected hexdigests as (algorithm_name, expected_hex_digest) pairs.
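
    Example (a sketch; the file identifier and digest below are hypothetical):

    >>> download_from_google(  # doctest: +SKIP
    ...     "1A2B3C4D5E", "data.tsv", hexdigests={"md5": "0" * 32}
    ... )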
899 | 
900 |     :raises Exception: Thrown if an error besides a keyboard interrupt is thrown during download
901 |     :raises KeyboardInterrupt: If a keyboard interrupt is thrown during download
902 |     :raises UnexpectedDirectory: If a directory is given for the ``path`` argument
903 |     """
904 |     path = Path(path).resolve()
905 | 
906 |     if path.is_dir():
907 |         raise UnexpectedDirectoryError(path)
908 |     if path.is_file() and not force:
909 |         raise_on_digest_mismatch(path=path, hexdigests=hexdigests)
910 |         logger.debug("did not re-download %s from Google ID %s", path, file_id)
911 |         return
912 | 
913 |     try:
914 |         with requests.Session() as sess:
915 |             res = sess.get(DOWNLOAD_URL, params={"id": file_id}, stream=True)
916 |             token = _get_confirm_token(res)
917 |             res = sess.get(DOWNLOAD_URL, params={"id": file_id, "confirm": token}, stream=True)
918 |             with path.open("wb") as file:
919 |                 for chunk in tqdm(res.iter_content(CHUNK_SIZE), desc="writing", unit="chunk"):
920 |                     if chunk:  # filter out keep-alive new chunks
921 |                         file.write(chunk)
922 |     except (Exception, KeyboardInterrupt):
923 |         if clean_on_failure:
924 |             _unlink(path)
925 |         raise
926 | 
927 |     raise_on_digest_mismatch(path=path, hexdigests=hexdigests)
928 | 
929 | 
930 | def _get_confirm_token(res: requests.Response) -> str:
931 |     for key, value in res.cookies.items():
932 |         if key.startswith(TOKEN_KEY):
933 |             return value
934 |     raise ValueError(f"no token found with key {TOKEN_KEY} in cookies: {res.cookies}")
935 | 
936 | 
937 | def download_from_s3(
938 |     s3_bucket: str,
939 |     s3_key: str,
940 |     path: str | Path,
941 |     client: None | botocore.client.BaseClient = None,
942 |     client_kwargs: Mapping[str, Any] | None = None,
943 |     download_file_kwargs: Mapping[str, Any] | None = None,
944 |     force: bool = True,
945 |     clean_on_failure: bool = True,
946 | ) -> None:
947 |     """Download a file from S3.
948 | 
949 |     :param s3_bucket: The name of the S3 bucket
950 |     :param s3_key: The key inside the S3 bucket
951 |     :param path: The place to write the file
952 |     :param client:
953 |         A botocore client. If none is given, one will be created automatically
954 |     :param client_kwargs:
955 |         Keyword arguments to be passed to the client on instantiation.
956 |     :param download_file_kwargs:
957 |         Keyword arguments to be passed to :func:`boto3.s3.transfer.S3Transfer.download_file`
958 |     :param force: If false and the file already exists, will not re-download.
959 |     :param clean_on_failure: If true, will delete the file on any exception raised during download
960 | 
961 |     :raises Exception: Thrown if an error besides a keyboard interrupt is thrown during download
962 |     :raises KeyboardInterrupt: If a keyboard interrupt is thrown during download
963 |     :raises UnexpectedDirectory: If a directory is given for the ``path`` argument
964 |     """
965 |     path = Path(path).resolve()
966 | 
967 |     if path.is_dir():
968 |         raise UnexpectedDirectoryError(path)
969 |     if path.is_file() and not force:
970 |         logger.debug("did not re-download %s from %s %s", path, s3_bucket, s3_key)
971 |         return
972 | 
973 |     try:
974 |         import boto3.s3.transfer
975 | 
976 |         if client is None:
977 |             import boto3
978 |             import botocore.client
979 | 
980 |             client_kwargs = {} if client_kwargs is None else dict(client_kwargs)
981 |             client_kwargs.setdefault(
982 |                 "config", botocore.client.Config(signature_version=botocore.UNSIGNED)
983 |             )
984 |             client = boto3.client("s3", **client_kwargs)
985 | 
986 |         download_file_kwargs = {} if download_file_kwargs is None else dict(download_file_kwargs)
987 |         download_file_kwargs.setdefault(
988 |             "Config", boto3.s3.transfer.TransferConfig(use_threads=False)
989 |         )
990 |         client.download_file(s3_bucket, s3_key, path.as_posix(), **download_file_kwargs)
991 |     except (Exception, KeyboardInterrupt):
992 |         if clean_on_failure:
993 |             _unlink(path)
994 |         raise
995 | 
996 | 
997 | def _unlink(path: str | Path) -> None:
998 |     # pathlib.Path.unlink() only gained ``missing_ok`` in Python 3.8, so remove defensively
999 |     try:
1000 |         os.remove(path)
1001 |     except OSError:
1002 |         pass  # if the file can't be deleted then no problem
1003 | 
1004 | 
1005 | def get_name() -> str:
1006 |     """Get the PyStow home directory name.
1007 | 
1008 |     :returns: The name of the pystow home directory, either loaded from
1009 |         the :data:`PYSTOW_NAME_ENVVAR` environment variable or given by the default
1010 |         value :data:`PYSTOW_NAME_DEFAULT`.
1011 |     """
1012 |     return os.getenv(PYSTOW_NAME_ENVVAR, default=PYSTOW_NAME_DEFAULT)
1013 | 
1014 | 
1015 | def use_appdirs() -> bool:
1016 |     """Check if X Desktop Group (XDG) compatibility is requested.
1017 | 
1018 |     :returns: If the :data:`PYSTOW_USE_APPDIRS` environment variable is set to ``true``.
1019 |     """
1020 |     return os.getenv(PYSTOW_USE_APPDIRS) in {"true", "True"}
1021 | 
1022 | 
1023 | def get_home(ensure_exists: bool = True) -> Path:
1024 |     """Get the PyStow home directory.
1025 | 
1026 |     :param ensure_exists: If true, ensures the directory is created
1027 |     :returns: A path object representing the pystow home directory, as one of:
1028 | 
1029 |         1. The :data:`PYSTOW_HOME_ENVVAR` environment variable or
1030 |         2. The user data directory defined by :mod:`appdirs` if the :data:`PYSTOW_USE_APPDIRS`
1031 |            environment variable is set to ``true`` or
1032 |         3. The default directory constructed in the user's home directory plus what's
1033 |            returned by :func:`get_name`.
1034 |     """
1035 |     if use_appdirs():
1036 |         from appdirs import user_data_dir
1037 | 
1038 |         default = Path(user_data_dir())
1039 |     else:
1040 |         default = Path.home() / get_name()
1041 |     return getenv_path(PYSTOW_HOME_ENVVAR, default, ensure_exists=ensure_exists)
1042 | 
1043 | 
1044 | def get_base(key: str, ensure_exists: bool = True) -> Path:
1045 |     """Get the base directory for a module.
1046 | 
1047 |     :param key:
1048 |         The name of the module. No funny characters. The environment variable
1049 |         ``<KEY>_HOME``, where the key is uppercased, is checked first before using
1050 |         the default home directory.
1051 |     :param ensure_exists:
1052 |         Should all directories be created automatically? Defaults to true.
1053 |     :returns:
1054 |         The path to the base directory for the given module
1055 | 
1056 |     :raises ValueError: if the key is invalid (e.g., has a dot in it)
1057 |     """
1058 |     if "." in key:
1059 |         raise ValueError(f"The module should not have a dot in it: {key}")
1060 |     envvar = f"{key.upper()}_HOME"
1061 |     if use_appdirs():
1062 |         from appdirs import user_data_dir
1063 | 
1064 |         default = Path(user_data_dir(appname=key))
1065 |     else:
1066 |         default = get_home(ensure_exists=False) / key
1067 |     return getenv_path(envvar, default, ensure_exists=ensure_exists)
1068 | 
1069 | 
1070 | def ensure_readme() -> None:
1071 |     """Ensure there's a README in the PyStow data directory.
1072 | 
1073 |     :raises PermissionError: If the script calling this function does not have
1074 |         adequate permissions to write a file into the PyStow home directory.
1075 |     """
1076 |     try:
1077 |         readme_path = get_home(ensure_exists=True).joinpath("README.md")
1078 |     except PermissionError as e:
1079 |         raise PermissionError(
1080 |             "PyStow was not able to create its home directory due to a lack of "
1081 |             "permissions. This can happen, e.g., if you're working on a server where you don't "
1082 |             "have full rights. See https://pystow.readthedocs.io/en/latest/installation.html#"
1083 |             "configuration for instructions on choosing a different home folder location for "
1084 |             "PyStow to somewhere where you have write permissions."
1085 |         ) from e
1086 |     if readme_path.is_file():
1087 |         return
1088 |     with readme_path.open("w", encoding="utf8") as file:
1089 |         print(README_TEXT, file=file)
1090 | 
1091 | 
1092 | def path_to_sqlite(path: str | Path) -> str:
1093 |     """Convert a path to a SQLite connection string.
1094 | 
1095 |     :param path: A path to a SQLite database file
1096 |     :returns: A standard connection string to the database
1097 |     """
1098 |     path = Path(path).expanduser().resolve()
1099 |     return f"sqlite:///{path.as_posix()}"
1100 | 
1101 | 
1102 | def gunzip(source: str | Path, target: str | Path) -> None:
1103 |     """Unzip a gzipped source file into the given target.
1104 | 
1105 |     :param source: The path to an input file
1106 |     :param target: The path to an output file
1107 |     """
1108 |     with gzip.open(source, "rb") as in_file, open(target, "wb") as out_file:
1109 |         shutil.copyfileobj(in_file, out_file)
1110 | 
1111 | 
1112 | #: A human-readable flag for whether a file should be read or written.
1113 | Operation: TypeAlias = Literal["read", "write"]
1114 | OPERATION_VALUES: set[str] = set(typing.get_args(Operation))
1115 | 
1116 | #: A human-readable flag for whether a file should be opened in text or binary mode.
1117 | Representation: TypeAlias = Literal["text", "binary"]
1118 | REPRESENTATION_VALUES: set[str] = set(typing.get_args(Representation))
1119 | 
1120 | MODE_MAP: dict[tuple[Operation, Representation], Literal["rt", "wt", "rb", "wb"]] = {
1121 |     ("read", "text"): "rt",
1122 |     ("read", "binary"): "rb",
1123 |     ("write", "text"): "wt",
1124 |     ("write", "binary"): "wb",
1125 | }
1126 | 
1127 | 
1128 | # docstr-coverage:excused `overload`
1129 | @typing.overload
1130 | @contextlib.contextmanager
1131 | def safe_open(
1132 |     path: str | Path, *, operation: Operation = ..., representation: Literal["text"] = "text"
1133 | ) -> Generator[typing.TextIO, None, None]: ...
1134 | 1135 | 1136 | # docstr-coverage:excused `overload` 1137 | @typing.overload 1138 | @contextlib.contextmanager 1139 | def safe_open( 1140 | path: str | Path, *, operation: Operation = ..., representation: Literal["binary"] = "binary" 1141 | ) -> Generator[typing.BinaryIO, None, None]: ... 1142 | 1143 | 1144 | @contextlib.contextmanager 1145 | def safe_open( 1146 | path: str | Path, *, operation: Operation = "read", representation: Representation = "text" 1147 | ) -> Generator[typing.TextIO, None, None] | Generator[typing.BinaryIO, None, None]: 1148 | """Safely open a file for reading or writing text.""" 1149 | if operation not in OPERATION_VALUES: 1150 | raise ValueError( 1151 | f"Invalid operation given: {operation}. Should be one of {OPERATION_VALUES}." 1152 | ) 1153 | if representation not in REPRESENTATION_VALUES: 1154 | raise ValueError( 1155 | f"Invalid representation given: {representation}. " 1156 | f"Should be one of {REPRESENTATION_VALUES}." 1157 | ) 1158 | 1159 | mode = MODE_MAP[operation, representation] 1160 | path = Path(path).expanduser().resolve() 1161 | if path.suffix.endswith(".gz"): 1162 | with gzip.open(path, mode=mode) as file: 1163 | yield file # type:ignore 1164 | else: 1165 | with open(path, mode=mode) as file: 1166 | yield file # type:ignore 1167 | 1168 | 1169 | @contextlib.contextmanager 1170 | def safe_open_writer( 1171 | f: str | Path | TextIO, *, delimiter: str = "\t", **kwargs: Any 1172 | ) -> Generator[_csv._writer, None, None]: 1173 | """Open a CSV writer, wrapping :func:`csv.writer`. 1174 | 1175 | :param f: A path to a file, or an already open text-based IO object 1176 | :param delimiter: The delimiter for writing to CSV 1177 | :param kwargs: Keyword arguments to pass to :func:`csv.writer` 1178 | :yields: A CSV writer object, constructed from :func:`csv.writer` 1179 | """ 1180 | if isinstance(f, (str, Path)): 1181 | with safe_open(f, operation="write", representation="text") as file: 1182 | yield csv.writer(file, delimiter=delimiter, **kwargs) 1183 | else: 1184 | yield csv.writer(f, delimiter=delimiter, **kwargs) 1185 | -------------------------------------------------------------------------------- /src/pystow/version.py: -------------------------------------------------------------------------------- 1 | """Version information for PyStow.""" 2 | 3 | __all__ = [ 4 | "VERSION", 5 | ] 6 | 7 | VERSION = "0.7.1-dev" 8 | -------------------------------------------------------------------------------- /tests/resources/test.txt: -------------------------------------------------------------------------------- 1 | this is a test file 2 | -------------------------------------------------------------------------------- /tests/resources/test.txt.md5: -------------------------------------------------------------------------------- 1 | 4221d002ceb5d3c9e9137e495ceaa647 -------------------------------------------------------------------------------- /tests/resources/test_1.csv: -------------------------------------------------------------------------------- 1 | h1,h2,h3 2 | v1_1,v1_2,v1_3 3 | v2_1,v2_2,v2_3 4 | -------------------------------------------------------------------------------- /tests/resources/test_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value" 3 | } -------------------------------------------------------------------------------- /tests/resources/test_1.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cthoyt/pystow/80249d83c684cb15ce05b0c83e10d45c22b966d7/tests/resources/test_1.pkl -------------------------------------------------------------------------------- /tests/resources/test_1.tsv: -------------------------------------------------------------------------------- 1 | h1 h2 h3 2 | v1_1 v1_2 v1_3 3 | v2_1 v2_2 v2_3 4 | -------------------------------------------------------------------------------- /tests/resources/test_verbose.txt.md5: -------------------------------------------------------------------------------- 1 | MD5(text.txt)=4221d002ceb5d3c9e9137e495ceaa647 -------------------------------------------------------------------------------- /tests/resources/test_wrong.txt.md5: -------------------------------------------------------------------------------- 1 | yolo -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | """Test for API completeness.""" 2 | 3 | from __future__ import annotations 4 | 5 | import inspect 6 | import unittest 7 | from typing import Callable, TypeVar, cast 8 | 9 | import pandas as pd 10 | import rdflib 11 | from lxml import etree 12 | 13 | import pystow 14 | from pystow import Module 15 | 16 | SKIP = {"__init__"} 17 | 18 | X = TypeVar("X") 19 | 20 | 21 | def _df_equal(a: pd.DataFrame, b: pd.DataFrame, msg: str | None = None) -> bool: 22 | return bool(a.values.tolist() == b.values.tolist()) 23 | 24 | 25 | def _rdf_equal(a: rdflib.Graph, b: rdflib.Graph, msg: str | None = None) -> bool: 26 | return {tuple(t) for t in a} == {tuple(t) for t in b} 27 | 28 | 29 | def _etree_equal(a: etree.ElementTree, b: etree.ElementTree, msg: str | None = None) -> bool: 30 | return cast(str, etree.tostring(a)) == cast(str, etree.tostring(b)) 31 | 32 | 33 | class TestExposed(unittest.TestCase): 34 | """Test API exposure.""" 35 | 36 | def setUp(self) -> None: 37 | """Set up the test case.""" 38 | self.addTypeEqualityFunc(pd.DataFrame, _df_equal) # type:ignore[arg-type] 39 | self.addTypeEqualityFunc(rdflib.Graph, _rdf_equal) # type:ignore[arg-type] 40 | self.addTypeEqualityFunc(type(etree.ElementTree()), _etree_equal) # type:ignore[arg-type] 41 | 42 | def assert_io( 43 | self, obj: X, extension: str, dump: Callable[..., None], load: Callable[..., X] 44 | ) -> None: 45 | """Test an object can be dumped and loaded. 
46 | 47 | :param obj: The object to dump 48 | :param extension: The extension to use 49 | :param dump: The dump function 50 | :param load: The load function 51 | """ 52 | name = f"test.{extension}" 53 | path = pystow.join("test", name=name) 54 | if path.is_file(): 55 | path.unlink() 56 | self.assertFalse(path.is_file()) 57 | 58 | dump("test", name=name, obj=obj) 59 | self.assertTrue(path.is_file()) 60 | self.assertEqual(obj, load("test", name=name)) 61 | 62 | def test_exposed(self) -> None: 63 | """Test that all module-level functions also have a counterpart in the top-level API.""" 64 | for name, func in Module.__dict__.items(): 65 | if not inspect.isfunction(func) or name in SKIP: 66 | continue 67 | with self.subTest(name=name): 68 | self.assertIn( 69 | name, 70 | pystow.api.__all__, 71 | msg=f"Module.{name} should be included in from `pystow.api.__all__`.", 72 | ) 73 | self.assertTrue( 74 | hasattr(pystow.api, name), 75 | msg=f"`Module.{name} should be exposed as a top-level " 76 | f"function in `pystow.api`.", 77 | ) 78 | self.assertTrue( 79 | hasattr(pystow, name), 80 | msg=f"`pystow.api.{name}` should be imported in `pystow.__init__`.", 81 | ) 82 | 83 | def test_io(self) -> None: 84 | """Test IO functions.""" 85 | obj = ["a", "b", "c"] 86 | for ext, dump, load in [ 87 | ("json", pystow.dump_json, pystow.load_json), 88 | ("pkl", pystow.dump_pickle, pystow.load_pickle), 89 | ]: 90 | with self.subTest(ext=ext): 91 | self.assert_io(obj, extension=ext, dump=dump, load=load) # type:ignore 92 | 93 | def test_pd_io(self) -> None: 94 | """Test pandas IO.""" 95 | columns = list("abc") 96 | data = [(1, 2, 3), (4, 5, 6)] 97 | df = pd.DataFrame(data, columns=columns) 98 | self.assert_io(df, extension="tsv", load=pystow.load_df, dump=pystow.dump_df) 99 | 100 | def test_rdf_io(self) -> None: 101 | """Test RDFlib IO.""" 102 | graph = rdflib.Graph() 103 | graph.add( 104 | ( 105 | rdflib.URIRef("http://example.com/subject"), 106 | rdflib.URIRef("http://example.com/predicate"), 107 | rdflib.URIRef("http://example.com/object"), 108 | ) 109 | ) 110 | self.assertEqual(1, len(graph)) 111 | self.assert_io(graph, extension="ttl", dump=pystow.dump_rdf, load=pystow.load_rdf) 112 | 113 | def test_xml_io(self) -> None: 114 | """Test XML I/O.""" 115 | root = etree.Element("root") 116 | root.set("interesting", "somewhat") 117 | etree.SubElement(root, "test") 118 | my_tree = etree.ElementTree(root) 119 | self.assert_io(my_tree, extension="xml", dump=pystow.dump_xml, load=pystow.load_xml) 120 | -------------------------------------------------------------------------------- /tests/test_caching.py: -------------------------------------------------------------------------------- 1 | """Tests for caching.""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | import tempfile 7 | import unittest 8 | from pathlib import Path 9 | 10 | from pystow.cache import CachedPickle 11 | 12 | EXPECTED = 5 13 | EXPECTED_2 = 6 14 | 15 | 16 | class TestCache(unittest.TestCase): 17 | """Tests for caches.""" 18 | 19 | def setUp(self) -> None: 20 | """Set up the test case with a temporary directory.""" 21 | self.tmpdir = tempfile.TemporaryDirectory() 22 | self.directory = Path(self.tmpdir.name) 23 | 24 | def tearDown(self) -> None: 25 | """Tear down the test case's temporary directory.""" 26 | self.tmpdir.cleanup() 27 | 28 | def test_cache_exception(self) -> None: 29 | """Test that exceptions aren't swallowed.""" 30 | path = self.directory.joinpath("test.pkl") 31 | 32 | self.assertFalse(path.is_file()) 33 | 34 | 
@CachedPickle(path=path) 35 | def _f1() -> None: 36 | raise NotImplementedError 37 | 38 | self.assertFalse(path.is_file(), msg="function has not been called") 39 | 40 | with self.assertRaises(NotImplementedError): 41 | _f1() 42 | 43 | self.assertFalse( 44 | path.is_file(), 45 | msg="file should not have been created if an exception was thrown by the function", 46 | ) 47 | 48 | def test_cache_pickle(self) -> None: 49 | """Test caching a pickle.""" 50 | path = self.directory.joinpath("test.pkl") 51 | self.assertFalse( 52 | path.is_file(), 53 | msg="the file should not exist at the beginning of the test", 54 | ) 55 | 56 | raise_flag = True 57 | 58 | @CachedPickle(path=path) 59 | def _f1() -> int: 60 | if raise_flag: 61 | raise ValueError 62 | return EXPECTED 63 | 64 | self.assertFalse(path.is_file(), msg="the file should not exist until function is called") 65 | 66 | with self.assertRaises(ValueError): 67 | _f1() 68 | self.assertFalse( 69 | path.is_file(), 70 | msg="the function should throw an exception " 71 | "because of the flag, and no file should be created", 72 | ) 73 | 74 | raise_flag = False 75 | actual = _f1() 76 | self.assertEqual(EXPECTED, actual) 77 | self.assertTrue(path.is_file(), msg="a file should have been created") 78 | 79 | raise_flag = True 80 | actual_2 = _f1() # if raises, the caching mechanism didn't work 81 | self.assertEqual(EXPECTED, actual_2) 82 | self.assertTrue(path.is_file()) 83 | 84 | os.unlink(path) 85 | self.assertFalse(path.is_file()) 86 | with self.assertRaises(ValueError): 87 | _f1() 88 | 89 | @CachedPickle(path=path, force=True) 90 | def _f2() -> int: 91 | return EXPECTED_2 92 | 93 | self.assertEqual(EXPECTED_2, _f2()) # overwrites the file 94 | self.assertEqual(EXPECTED_2, _f1()) 95 | 96 | def test_no_cache(self) -> None: 97 | """Test that no caching happens.""" 98 | path = self.directory.joinpath("test.pkl") 99 | sentinel_value = 5 100 | 101 | self.assertFalse(path.is_file()) 102 | 103 | @CachedPickle(path=path, cache=False) 104 | def _f1() -> int: 105 | return sentinel_value 106 | 107 | self.assertFalse(path.is_file(), msg="function has not been called") 108 | 109 | # check the following twice, just for good measure! 
110 | for _ in range(2): 111 | self.assertEqual(sentinel_value, _f1()) 112 | self.assertFalse( 113 | path.is_file(), 114 | msg="file should not have been created since caching was turned off", 115 | ) 116 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Test configuration loading.""" 2 | 3 | from __future__ import annotations 4 | 5 | import tempfile 6 | import unittest 7 | from configparser import ConfigParser 8 | from pathlib import Path 9 | from typing import ClassVar 10 | 11 | import pystow 12 | from pystow.config_api import CONFIG_HOME_ENVVAR, _get_cfp 13 | from pystow.utils import mock_envvar 14 | 15 | 16 | class TestConfig(unittest.TestCase): 17 | """Test configuration.""" 18 | 19 | test_section: ClassVar[str] 20 | test_option: ClassVar[str] 21 | test_value: ClassVar[str] 22 | cfp: ClassVar[ConfigParser] 23 | 24 | @classmethod 25 | def setUpClass(cls) -> None: 26 | """Set up the class for testing.""" 27 | cls.test_section = "test" 28 | cls.test_option = "option" 29 | cls.test_value = "value" 30 | cls.cfp = _get_cfp(cls.test_section) 31 | cls.cfp.add_section(cls.test_section) 32 | cls.cfp.set( 33 | section=cls.test_section, 34 | option=cls.test_option, 35 | value=cls.test_value, 36 | ) 37 | 38 | def test_env_cast(self) -> None: 39 | """Test casting works properly when getting from the environment.""" 40 | with mock_envvar("TEST_VAR", "1234"): 41 | self.assertEqual("1234", pystow.get_config("test", "var")) 42 | self.assertEqual("1234", pystow.get_config("test", "var", dtype=str)) 43 | self.assertEqual(1234, pystow.get_config("test", "var", dtype=int)) 44 | with self.assertRaises(ValueError): 45 | pystow.get_config("test", "var", dtype=bool) 46 | with self.assertRaises(TypeError): 47 | pystow.get_config("test", "var", dtype=object) 48 | 49 | def test_get_config(self) -> None: 50 | """Test lookup not existing.""" 51 | self.assertIsNone(pystow.get_config(self.test_section, "key")) 52 | self.assertEqual("1234", pystow.get_config(self.test_section, "key", default="1234")) 53 | 54 | value = "not_value" 55 | self.assertEqual( 56 | value, pystow.get_config(self.test_section, self.test_option, passthrough=value) 57 | ) 58 | 59 | self.assertEqual(1, pystow.get_config(self.test_section, self.test_option, passthrough=1)) 60 | self.assertEqual( 61 | 1, pystow.get_config(self.test_section, self.test_option, passthrough="1", dtype=int) 62 | ) 63 | 64 | self.assertEqual( 65 | True, 66 | pystow.get_config(self.test_section, self.test_option, passthrough="1", dtype=bool), 67 | ) 68 | self.assertEqual( 69 | True, 70 | pystow.get_config(self.test_section, self.test_option, passthrough="yes", dtype=bool), 71 | ) 72 | self.assertEqual( 73 | True, 74 | pystow.get_config(self.test_section, self.test_option, passthrough="Yes", dtype=bool), 75 | ) 76 | self.assertEqual( 77 | True, 78 | pystow.get_config(self.test_section, self.test_option, passthrough="YES", dtype=bool), 79 | ) 80 | self.assertEqual( 81 | True, 82 | pystow.get_config(self.test_section, self.test_option, passthrough="True", dtype=bool), 83 | ) 84 | self.assertEqual( 85 | True, 86 | pystow.get_config(self.test_section, self.test_option, passthrough="TRUE", dtype=bool), 87 | ) 88 | self.assertEqual( 89 | True, 90 | pystow.get_config(self.test_section, self.test_option, passthrough="T", dtype=bool), 91 | ) 92 | self.assertEqual( 93 | True, 94 | pystow.get_config(self.test_section, self.test_option, 
passthrough="t", dtype=bool), 95 | ) 96 | self.assertEqual( 97 | True, 98 | pystow.get_config(self.test_section, self.test_option, passthrough=True, dtype=bool), 99 | ) 100 | self.assertEqual( 101 | True, pystow.get_config(self.test_section, self.test_option, passthrough=1, dtype=bool) 102 | ) 103 | 104 | def test_subsection(self) -> None: 105 | """Test subsections.""" 106 | with tempfile.TemporaryDirectory() as directory, mock_envvar(CONFIG_HOME_ENVVAR, directory): 107 | directory_ = Path(directory) 108 | path = directory_.joinpath("test.ini") 109 | self.assertFalse(path.is_file(), msg="file should not already exist") 110 | 111 | self.assertIsNone(pystow.get_config("test:subtest", "key")) 112 | self.assertFalse(path.is_file(), msg="getting config should not create a file") 113 | 114 | pystow.write_config("test:subtest", "key", "value") 115 | self.assertTrue(path.is_file(), msg=f"{list(directory_.iterdir())}") 116 | 117 | self.assertEqual("value", pystow.get_config("test:subtest", "key")) 118 | -------------------------------------------------------------------------------- /tests/test_module.py: -------------------------------------------------------------------------------- 1 | """Tests for PyStow.""" 2 | 3 | from __future__ import annotations 4 | 5 | import bz2 6 | import contextlib 7 | import itertools as itt 8 | import json 9 | import lzma 10 | import os 11 | import pickle 12 | import shutil 13 | import tempfile 14 | import unittest 15 | from collections.abc import Generator, Mapping 16 | from pathlib import Path 17 | from typing import Any 18 | from unittest import mock 19 | 20 | import pandas as pd 21 | 22 | import pystow 23 | from pystow import join 24 | from pystow.constants import PYSTOW_HOME_ENVVAR, PYSTOW_NAME_ENVVAR 25 | from pystow.impl import Module 26 | from pystow.utils import ( 27 | get_home, 28 | get_name, 29 | mock_envvar, 30 | n, 31 | write_pickle_gz, 32 | write_sql, 33 | write_tarfile_csv, 34 | write_zipfile_csv, 35 | ) 36 | 37 | HERE = Path(__file__).parent.resolve() 38 | RESOURCES = HERE.joinpath("resources") 39 | 40 | TSV_NAME = "test_1.tsv" 41 | TSV_URL = f"{n()}/{TSV_NAME}" 42 | 43 | SQLITE_NAME = "test_1.db" 44 | SQLITE_URL = f"{n()}/{SQLITE_NAME}" 45 | SQLITE_PATH = RESOURCES / SQLITE_NAME 46 | SQLITE_TABLE = "testtable" 47 | 48 | JSON_NAME = "test_1.json" 49 | JSON_URL = f"{n()}/{JSON_NAME}" 50 | JSON_PATH = RESOURCES / JSON_NAME 51 | 52 | PICKLE_NAME = "test_1.pkl" 53 | PICKLE_URL = f"{n()}/{PICKLE_NAME}" 54 | PICKLE_PATH = RESOURCES / PICKLE_NAME 55 | 56 | PICKLE_GZ_NAME = "test_1.pkl.gz" 57 | PICKLE_GZ_URL = f"{n()}/{PICKLE_GZ_NAME}" 58 | PICKLE_GZ_PATH = RESOURCES / PICKLE_GZ_NAME 59 | 60 | JSON_BZ2_NAME = "test_1.json.bz2" 61 | JSON_BZ2_URL = f"{n()}/{JSON_BZ2_NAME}" 62 | JSON_BZ2_PATH = RESOURCES / JSON_BZ2_NAME 63 | 64 | MOCK_FILES: Mapping[str, Path] = { 65 | TSV_URL: RESOURCES / TSV_NAME, 66 | JSON_URL: JSON_PATH, 67 | JSON_BZ2_URL: JSON_BZ2_PATH, 68 | PICKLE_URL: PICKLE_PATH, 69 | PICKLE_GZ_URL: PICKLE_GZ_PATH, 70 | SQLITE_URL: SQLITE_PATH, 71 | } 72 | 73 | TEST_TSV_ROWS = [ 74 | ("h1", "h2", "h3"), 75 | ("v1_1", "v1_2", "v1_3"), 76 | ("v2_1", "v2_2", "v2_3"), 77 | ] 78 | TEST_DF = pd.DataFrame(TEST_TSV_ROWS) 79 | TEST_JSON = {"key": "value"} 80 | 81 | # Make the pickle file 82 | if not PICKLE_PATH.is_file(): 83 | PICKLE_PATH.write_bytes(pickle.dumps(TEST_TSV_ROWS)) 84 | 85 | if not SQLITE_PATH.is_file(): 86 | write_sql(TEST_DF, name=SQLITE_TABLE, path=SQLITE_PATH, index=False) 87 | 88 | if not JSON_PATH.is_file(): 89 | 
JSON_PATH.write_text(json.dumps(TEST_JSON)) 90 | 91 | if not JSON_BZ2_PATH.is_file(): 92 | with bz2.open(JSON_BZ2_PATH, mode="wt") as file: 93 | json.dump(TEST_JSON, file, indent=2) 94 | 95 | 96 | class TestMocks(unittest.TestCase): 97 | """Tests for :mod:`pystow` mocks and context managers.""" 98 | 99 | def test_mock_home(self) -> None: 100 | """Test that home can be properly mocked.""" 101 | name = n() 102 | 103 | with tempfile.TemporaryDirectory() as d: 104 | expected_path = Path(d) / name 105 | self.assertFalse(expected_path.exists()) 106 | 107 | with mock_envvar(PYSTOW_HOME_ENVVAR, expected_path.as_posix()): 108 | self.assertFalse(expected_path.exists()) 109 | self.assertEqual(expected_path, get_home(ensure_exists=False)) 110 | self.assertFalse(expected_path.exists()) 111 | 112 | def test_mock_name(self) -> None: 113 | """Test that the name can be properly mocked.""" 114 | name = n() 115 | 116 | expected_path = Path.home() / name 117 | self.assertFalse(expected_path.exists()) 118 | 119 | with mock_envvar(PYSTOW_NAME_ENVVAR, name): 120 | self.assertEqual(name, get_name()) 121 | 122 | self.assertFalse(expected_path.exists()) 123 | self.assertEqual(expected_path, get_home(ensure_exists=False)) 124 | self.assertFalse(expected_path.exists()) 125 | 126 | 127 | class TestJoin(unittest.TestCase): 128 | """Tests for :mod:`pystow`.""" 129 | 130 | def setUp(self) -> None: 131 | """Set up the test case.""" 132 | self.directory = tempfile.TemporaryDirectory() 133 | 134 | def tearDown(self) -> None: 135 | """Tear down the test case.""" 136 | self.directory.cleanup() 137 | 138 | @contextlib.contextmanager 139 | def mock_directory(self) -> Generator[Path, None, None]: 140 | """Use this test case's temporary directory as a mock environment variable. 141 | 142 | :yield: The mock directory's path 143 | """ 144 | with mock_envvar(PYSTOW_HOME_ENVVAR, self.directory.name): 145 | yield Path(self.directory.name) 146 | 147 | @staticmethod 148 | def mock_download() -> mock._patch_default_new: 149 | """Mock connection to the internet using local resource files. 150 | 151 | :return: A patch object that can be applied to the pystow download function 152 | """ 153 | 154 | def _mock_get_data(url: str, path: str | Path, **_kwargs: Any) -> Path: 155 | return Path(shutil.copy(MOCK_FILES[url], path)) 156 | 157 | return mock.patch("pystow.utils.download", side_effect=_mock_get_data) 158 | 159 | @staticmethod 160 | def mock_download_once(local_path: str | Path) -> mock._patch_default_new: 161 | """Mock connection to the internet using local resource files. 162 | 163 | :param local_path: the path to the file to mock 164 | :return: A patch object that can be applied to the pystow download function 165 | """ 166 | 167 | def _mock_get_data(path: str | Path, **_kwargs: Any) -> Path: 168 | return Path(shutil.copy(local_path, path)) 169 | 170 | return mock.patch("pystow.utils.download", side_effect=_mock_get_data) 171 | 172 | def join(self, *parts: str) -> Path: 173 | """Help join the parts to this test case's temporary directory. 
174 | 175 | :param parts: The file path parts that are joined with this test case's directory 176 | :return: A path to the file 177 | """ 178 | return Path(self.directory.name).joinpath(*parts) 179 | 180 | def test_mock(self) -> None: 181 | """Test that mocking the directory works properly for this test case.""" 182 | with self.mock_directory(): 183 | self.assertEqual(os.getenv(PYSTOW_HOME_ENVVAR), self.directory.name) 184 | 185 | def test_join(self) -> None: 186 | """Test the :func:`pystow.join` function.""" 187 | parts_examples = [ 188 | [n()], 189 | [n(), n()], 190 | [n(), n(), n()], 191 | ] 192 | with self.mock_directory(): 193 | for parts in parts_examples: 194 | with self.subTest(parts=parts): 195 | self.assertEqual(self.join(*parts), join(*parts)) 196 | 197 | def test_join_with_version(self) -> None: 198 | """Test the join function when a version is present.""" 199 | with self.mock_directory(): 200 | key = "key" 201 | version = "v1" 202 | self.assertEqual( 203 | self.join(key, version), 204 | pystow.join(key, version=version), 205 | ) 206 | 207 | parts = [n()] 208 | self.assertEqual( 209 | self.join(key, version, *parts), pystow.join(key, *parts, version=version) 210 | ) 211 | 212 | parts = [n()] 213 | name = "yup.tsv" 214 | self.assertEqual( 215 | self.join(key, version, *parts, name), 216 | pystow.join(key, *parts, version=version, name=name), 217 | ) 218 | 219 | def _version_getter() -> str: 220 | return "v2" 221 | 222 | parts = [n()] 223 | name = "yup.tsv" 224 | self.assertEqual( 225 | self.join(key, _version_getter(), *parts, name), 226 | pystow.join(key, *parts, version=_version_getter, name=name), 227 | ) 228 | 229 | with self.assertRaises(ValueError): 230 | pystow.join(key, version="/") 231 | 232 | def test_ensure(self) -> None: 233 | """Test ensuring various files.""" 234 | write_pickle_gz(TEST_TSV_ROWS, path=PICKLE_GZ_PATH) 235 | 236 | with self.mock_directory(), self.mock_download(): 237 | with self.subTest(type="tsv"): 238 | df = pystow.ensure_csv("test", url=TSV_URL) 239 | self.assertEqual(3, len(df.columns)) 240 | 241 | df2 = pystow.load_df("test", name=TSV_NAME) 242 | self.assertEqual(df.values.tolist(), df2.values.tolist()) 243 | 244 | with self.subTest(type="json"): 245 | j = pystow.ensure_json("test", url=JSON_URL) 246 | self.assertEqual(TEST_JSON, j) 247 | 248 | j2 = pystow.load_json("test", name=JSON_NAME) 249 | self.assertEqual(j, j2) 250 | 251 | with self.subTest(type="pickle"): 252 | p = pystow.ensure_pickle("test", url=PICKLE_URL) 253 | self.assertEqual(3, len(p)) 254 | 255 | p2 = pystow.load_pickle("test", name=PICKLE_NAME) 256 | self.assertEqual(p, p2) 257 | 258 | with self.subTest(type="pickle_gz"): 259 | p = pystow.ensure_pickle_gz("test", url=PICKLE_GZ_URL) 260 | self.assertEqual(3, len(p)) 261 | 262 | p2 = pystow.load_pickle_gz("test", name=PICKLE_GZ_NAME) 263 | self.assertEqual(p, p2) 264 | 265 | with self.subTest(type="json_bz2"): 266 | p = pystow.ensure_json_bz2("test", url=JSON_BZ2_URL) 267 | self.assertEqual(TEST_JSON, p) 268 | 269 | def test_open_fail(self) -> None: 270 | """Test opening a missing file.""" 271 | with self.assertRaises(FileNotFoundError): 272 | with pystow.open("nope", name="nope"): 273 | pass 274 | 275 | with self.assertRaises(FileNotFoundError): 276 | pystow.load_json("nope", name="nope") 277 | 278 | def test_ensure_open_lzma(self) -> None: 279 | """Test opening lzma-encoded files.""" 280 | with tempfile.TemporaryDirectory() as directory, self.mock_directory(): 281 | path = Path(directory) / n() 282 | with 
self.mock_download_once(path): 283 | with lzma.open(path, "wt") as file_1: 284 | for row in TEST_TSV_ROWS: 285 | print(*row, sep="\t", file=file_1) 286 | # FIXME this ignore needs to be removed and addressed 287 | with pystow.ensure_open_lzma("test", url=n()) as file_2: # type: ignore 288 | df = pd.read_csv(file_2, sep="\t") 289 | self.assertEqual(3, len(df.columns)) 290 | 291 | def test_ensure_open_zip(self) -> None: 292 | """Test opening tar-encoded files.""" 293 | with tempfile.TemporaryDirectory() as directory, self.mock_directory(): 294 | path = Path(directory) / n() 295 | inner_path = n() 296 | with self.mock_download_once(path): 297 | write_zipfile_csv(TEST_DF, path, inner_path) 298 | with pystow.ensure_open_zip("test", url=n(), inner_path=inner_path) as file: 299 | df = pd.read_csv(file, sep="\t") 300 | self.assertEqual(3, len(df.columns)) 301 | 302 | def test_ensure_open_tarfile(self) -> None: 303 | """Test opening tarfile-encoded files.""" 304 | with tempfile.TemporaryDirectory() as directory, self.mock_directory(): 305 | path = Path(directory) / n() 306 | inner_path = n() 307 | with self.mock_download_once(path): 308 | write_tarfile_csv(TEST_DF, path, inner_path) 309 | with pystow.ensure_open_tarfile("test", url=n(), inner_path=inner_path) as file: 310 | df = pd.read_csv(file, sep="\t") 311 | self.assertEqual(3, len(df.columns)) 312 | 313 | def test_ensure_module(self) -> None: 314 | """Test that the ``ensure_exist`` argument in :meth:`Module.from_key` works properly.""" 315 | parts_examples = [ 316 | [n()], 317 | [n(), n()], 318 | [n(), n(), n()], 319 | ] 320 | ensure_examples = [False, True] 321 | 322 | for ensure_exists, parts in itt.product(ensure_examples, parts_examples): 323 | with self.subTest(ensure_exists=ensure_exists, parts=parts), self.mock_directory(): 324 | expected_directory = self.join(*parts) 325 | 326 | module = Module.from_key(*parts, ensure_exists=ensure_exists) 327 | 328 | self.assertEqual(expected_directory, module.base) 329 | self.assertIs( 330 | expected_directory.exists(), 331 | ensure_exists, 332 | msg=f"{expected_directory} should{'' if ensure_exists else ' not'} exist.", 333 | ) 334 | 335 | def test_ensure_custom(self) -> None: 336 | """Test ensure with custom provider.""" 337 | with self.mock_directory(): 338 | # create a minimal provider 339 | def touch_file(path: Path, **_kwargs: Any) -> None: 340 | """ 341 | Create a file. 
342 | 343 | :param path: 344 | the file path 345 | :param _kwargs: 346 | ignored keywords 347 | """ 348 | path.touch() 349 | 350 | # wrap to record calls 351 | provider = mock.Mock(wraps=touch_file) 352 | 353 | # the keyword-based parameters for the provider 354 | kwargs: dict[str, Any] = {"a": 4, "c": {0: 1, 5: 7}} 355 | 356 | # call first time 357 | name = n() 358 | path = pystow.ensure_custom("test", name=name, provider=provider, **kwargs) 359 | self.assertTrue(path.is_file()) 360 | # call a second time 361 | path = pystow.ensure_custom("test", name=name, provider=provider, **kwargs) 362 | # ensure that the provider was only called once with the given parameters 363 | provider.assert_called_once_with(path, **kwargs) 364 | 365 | def test_ensure_open_sqlite(self) -> None: 366 | """Test caching SQLite.""" 367 | with self.mock_directory(), self.mock_download(): 368 | with pystow.ensure_open_sqlite("test", url=SQLITE_URL) as conn: 369 | df = pd.read_sql(f"SELECT * from {SQLITE_TABLE}", conn) # noqa:S608 370 | self.assertEqual(3, len(df.columns)) 371 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for utilities.""" 2 | 3 | from __future__ import annotations 4 | 5 | import hashlib 6 | import os 7 | import tempfile 8 | import unittest 9 | from pathlib import Path 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import requests 14 | from lxml import etree 15 | from requests_file import FileAdapter 16 | 17 | from pystow.utils import ( 18 | DownloadError, 19 | HexDigestError, 20 | download, 21 | get_hexdigests_remote, 22 | getenv_path, 23 | mkdir, 24 | mock_envvar, 25 | n, 26 | name_from_url, 27 | read_tarfile_csv, 28 | read_zip_np, 29 | read_zipfile_csv, 30 | read_zipfile_xml, 31 | write_tarfile_csv, 32 | write_zipfile_csv, 33 | write_zipfile_np, 34 | write_zipfile_xml, 35 | ) 36 | 37 | HERE = Path(__file__).resolve().parent 38 | TEST_TXT = HERE.joinpath("resources", "test.txt") 39 | TEST_TXT_MD5 = HERE.joinpath("resources", "test.txt.md5") 40 | TEST_TXT_VERBOSE_MD5 = HERE.joinpath("resources", "test_verbose.txt.md5") 41 | TEST_TXT_WRONG_MD5 = HERE.joinpath("resources", "test_wrong.txt.md5") 42 | 43 | skip_on_windows = unittest.skipIf( 44 | os.name == "nt", 45 | reason="Funny stuff happens in requests with a file adapter on windows that adds line breaks", 46 | ) 47 | 48 | 49 | class _Session(requests.sessions.Session): 50 | """A mock session.""" 51 | 52 | def __init__(self) -> None: 53 | """Instantiate the patched session with an additional file adapter.""" 54 | super().__init__() 55 | self.mount("file://", FileAdapter()) 56 | 57 | 58 | requests.sessions.Session = _Session # type: ignore 59 | 60 | 61 | class TestUtils(unittest.TestCase): 62 | """Test utility functions.""" 63 | 64 | def test_name_from_url(self) -> None: 65 | """Test :func:`name_from_url`.""" 66 | data = [ 67 | ("test.tsv", "https://example.com/test.tsv"), 68 | ("test.tsv", "https://example.com/deeper/test.tsv"), 69 | ("test.tsv.gz", "https://example.com/deeper/test.tsv.gz"), 70 | ] 71 | for name, url in data: 72 | with self.subTest(name=name, url=url): 73 | self.assertEqual(name, name_from_url(url)) 74 | 75 | @skip_on_windows 76 | def test_file_values(self) -> None: 77 | """Test encodings.""" 78 | for url, value in [ 79 | (TEST_TXT, "this is a test file\n"), 80 | (TEST_TXT_MD5, "4221d002ceb5d3c9e9137e495ceaa647"), 81 | (TEST_TXT_VERBOSE_MD5, 
"MD5(text.txt)=4221d002ceb5d3c9e9137e495ceaa647"), 82 | (TEST_TXT_WRONG_MD5, "yolo"), 83 | ]: 84 | with self.subTest(name=url.name): 85 | self.assertEqual(value, requests.get(url.as_uri(), timeout=15).text) 86 | 87 | def test_mkdir(self) -> None: 88 | """Test for ensuring a directory.""" 89 | with tempfile.TemporaryDirectory() as directory: 90 | directory_ = Path(directory) 91 | subdirectory = directory_ / "sd1" 92 | self.assertFalse(subdirectory.exists()) 93 | 94 | mkdir(subdirectory, ensure_exists=False) 95 | self.assertFalse(subdirectory.exists()) 96 | 97 | mkdir(subdirectory, ensure_exists=True) 98 | self.assertTrue(subdirectory.exists()) 99 | 100 | def test_mock_envvar(self) -> None: 101 | """Test that environment variables can be mocked properly.""" 102 | name, value = n(), n() 103 | 104 | self.assertNotIn(name, os.environ) 105 | with mock_envvar(name, value): 106 | self.assertIn(name, os.environ) 107 | self.assertEqual(value, os.getenv(name)) 108 | self.assertNotIn(name, os.environ) 109 | 110 | def test_getenv_path(self) -> None: 111 | """Test that :func:`getenv_path` works properly.""" 112 | envvar = n() 113 | 114 | with tempfile.TemporaryDirectory() as directory: 115 | directory_ = Path(directory) 116 | value = directory_ / n() 117 | default = directory_ / n() 118 | 119 | self.assertEqual(default, getenv_path(envvar, default)) 120 | with mock_envvar(envvar, value.as_posix()): 121 | self.assertEqual(value, getenv_path(envvar, default)) 122 | # Check that it goes back 123 | self.assertEqual(default, getenv_path(envvar, default)) 124 | 125 | def test_compressed_io(self) -> None: 126 | """Test that the read/write to compressed folder functions work.""" 127 | rows = [[1, 2], [3, 4], [5, 6]] 128 | columns = ["A", "B"] 129 | df = pd.DataFrame(rows, columns=columns) 130 | inner_path = "okay.tsv" 131 | 132 | data = [ 133 | ("test.zip", write_zipfile_csv, read_zipfile_csv), 134 | ("test.tar.gz", write_tarfile_csv, read_tarfile_csv), 135 | ] 136 | for name, writer, reader in data: 137 | with self.subTest(name=name), tempfile.TemporaryDirectory() as directory: 138 | path = Path(directory) / name 139 | self.assertFalse(path.exists()) 140 | writer(df, path=path, inner_path=inner_path) 141 | self.assertTrue(path.exists()) 142 | new_df = reader(path=path, inner_path=inner_path) 143 | self.assertEqual(list(df.columns), list(new_df.columns)) 144 | self.assertEqual(df.values.tolist(), new_df.values.tolist()) 145 | 146 | def test_xml_io(self) -> None: 147 | """Test that read/write for XML element tree works.""" 148 | root = etree.Element("Doc") 149 | level1 = etree.SubElement(root, "S") 150 | main = etree.SubElement(level1, "Text") 151 | main.text = "Thanks for contributing an answer to Stack Overflow!" 
152 | second = etree.SubElement(level1, "Tokens") 153 | level2 = etree.SubElement(second, "Token", word="low") 154 | 155 | level3 = etree.SubElement(level2, "Morph") 156 | second1 = etree.SubElement(level3, "Lemma") 157 | second1.text = "sdfs" 158 | second1 = etree.SubElement(level3, "info") 159 | second1.text = "qw" 160 | 161 | level4 = etree.SubElement(level3, "Aff") 162 | second1 = etree.SubElement(level4, "Type") 163 | second1.text = "sdfs" 164 | second1 = etree.SubElement(level4, "Suf") 165 | second1.text = "qw" 166 | 167 | tree = etree.ElementTree(root) 168 | inner_path = "okay.tsv" 169 | data = [ 170 | ("test.zip", write_zipfile_xml, read_zipfile_xml), 171 | ] 172 | for name, writer, reader in data: 173 | with self.subTest(name=name), tempfile.TemporaryDirectory() as directory: 174 | path = Path(directory) / name 175 | self.assertFalse(path.exists()) 176 | writer(tree, path=path, inner_path=inner_path) 177 | self.assertTrue(path.exists()) 178 | new_tree = reader(path=path, inner_path=inner_path) 179 | self.assertEqual( 180 | etree.tostring(tree, pretty_print=True), 181 | etree.tostring(new_tree, pretty_print=True), 182 | ) 183 | 184 | def test_numpy_io(self) -> None: 185 | """Test IO with numpy.""" 186 | arr = np.array([[0, 1], [2, 3]]) 187 | inner_path = "okay.npz" 188 | with tempfile.TemporaryDirectory() as directory: 189 | path = Path(directory) / "test.zip" 190 | write_zipfile_np(arr, inner_path=inner_path, path=path) 191 | reloaded_arr = read_zip_np(path=path, inner_path=inner_path) 192 | self.assertTrue(np.array_equal(arr, reloaded_arr)) 193 | 194 | 195 | class TestDownload(unittest.TestCase): 196 | """Tests for downloading.""" 197 | 198 | def setUp(self) -> None: 199 | """Set up a test.""" 200 | self.directory_obj = tempfile.TemporaryDirectory() 201 | self.directory = Path(self.directory_obj.name) 202 | self.bad_url = "https://nope.nope/nope.tsv" 203 | self.path_for_bad_url = self.directory.joinpath("nope.tsv") 204 | 205 | def tearDown(self) -> None: 206 | """Tear down a test.""" 207 | self.directory_obj.cleanup() 208 | 209 | def test_bad_file_error(self) -> None: 210 | """Test that urllib errors are handled properly.""" 211 | with self.assertRaises(DownloadError): 212 | download( 213 | url=self.bad_url, 214 | path=self.path_for_bad_url, 215 | backend="urllib", 216 | ) 217 | self.assertFalse(self.path_for_bad_url.is_file()) 218 | 219 | def test_requests_error_stream(self) -> None: 220 | """Test that requests errors are handled properly.""" 221 | with self.assertRaises(DownloadError): 222 | download( 223 | url=self.bad_url, 224 | path=self.path_for_bad_url, 225 | backend="requests", 226 | stream=True, 227 | ) 228 | self.assertFalse(self.path_for_bad_url.is_file()) 229 | 230 | def test_requests_error_sync(self) -> None: 231 | """Test that requests errors are handled properly.""" 232 | with self.assertRaises(DownloadError): 233 | download( 234 | url=self.bad_url, 235 | path=self.path_for_bad_url, 236 | backend="requests", 237 | stream=False, 238 | ) 239 | self.assertFalse(self.path_for_bad_url.is_file()) 240 | 241 | 242 | class TestHashing(unittest.TestCase): 243 | """Tests for hexdigest checking.""" 244 | 245 | def setUp(self) -> None: 246 | """Set up a test.""" 247 | self.directory = tempfile.TemporaryDirectory() 248 | self.path = Path(self.directory.name).joinpath("test.tsv") 249 | 250 | md5 = hashlib.md5() # noqa: S324 251 | with TEST_TXT.open("rb") as file: 252 | md5.update(file.read()) 253 | self.expected_md5 = md5.hexdigest() 254 | self.mismatching_md5_hexdigest = "yolo" 
255 |         self.assertNotEqual(self.mismatching_md5_hexdigest, self.expected_md5)
256 | 
257 |     def tearDown(self) -> None:
258 |         """Tear down a test."""
259 |         self.directory.cleanup()
260 | 
261 |     def test_hash_success(self) -> None:
262 |         """Test that a download succeeds when the expected hash matches."""
263 |         self.assertFalse(self.path.exists())
264 |         download(
265 |             url=TEST_TXT.as_uri(),
266 |             path=self.path,
267 |             hexdigests={
268 |                 "md5": self.expected_md5,
269 |             },
270 |         )
271 | 
272 |     @skip_on_windows
273 |     def test_hash_remote_success(self) -> None:
274 |         """Test that a download succeeds when the remotely retrieved hash matches."""
275 |         self.assertFalse(self.path.exists())
276 |         download(
277 |             url=TEST_TXT.as_uri(),
278 |             path=self.path,
279 |             hexdigests_remote={
280 |                 "md5": TEST_TXT_MD5.as_uri(),
281 |             },
282 |             hexdigests_strict=True,
283 |         )
284 |         self.assertTrue(self.path.exists())
285 | 
286 |     @skip_on_windows
287 |     def test_hash_remote_verbose_success(self) -> None:
288 |         """Test that a verbose remote hash file passes in non-strict mode."""
289 |         self.assertFalse(self.path.exists())
290 |         download(
291 |             url=TEST_TXT.as_uri(),
292 |             path=self.path,
293 |             hexdigests_remote={
294 |                 "md5": TEST_TXT_VERBOSE_MD5.as_uri(),
295 |             },
296 |             hexdigests_strict=False,
297 |         )
298 |         self.assertTrue(self.path.exists())
299 | 
300 |     def test_hash_remote_verbose_failure(self) -> None:
301 |         """Test that a verbose remote hash file fails in strict mode."""
302 |         self.assertFalse(self.path.exists())
303 |         with self.assertRaises(HexDigestError):
304 |             download(
305 |                 url=TEST_TXT.as_uri(),
306 |                 path=self.path,
307 |                 hexdigests_remote={
308 |                     "md5": TEST_TXT_VERBOSE_MD5.as_uri(),
309 |                 },
310 |                 hexdigests_strict=True,
311 |             )
312 | 
313 |     def test_hash_error(self) -> None:
314 |         """Test that a mismatched hash raises an error on download."""
315 |         self.assertFalse(self.path.exists())
316 |         with self.assertRaises(HexDigestError):
317 |             download(
318 |                 url=TEST_TXT.as_uri(),
319 |                 path=self.path,
320 |                 hexdigests={
321 |                     "md5": self.mismatching_md5_hexdigest,
322 |                 },
323 |             )
324 | 
325 |     def test_hash_remote_error(self) -> None:
326 |         """Test that a mismatched remote hash raises an error on download."""
327 |         self.assertFalse(self.path.exists())
328 |         with self.assertRaises(HexDigestError):
329 |             download(
330 |                 url=TEST_TXT.as_uri(),
331 |                 path=self.path,
332 |                 hexdigests_remote={
333 |                     "md5": TEST_TXT_WRONG_MD5.as_uri(),
334 |                 },
335 |                 hexdigests_strict=True,
336 |             )
337 | 
338 |     def test_override_hash_error(self) -> None:
339 |         """Test that a pre-existing file that fails the hash check raises an error when force=False."""
340 |         self.path.write_text("test file content")
341 | 
342 |         self.assertTrue(self.path.exists())
343 |         with self.assertRaises(HexDigestError):
344 |             download(
345 |                 url=TEST_TXT.as_uri(),
346 |                 path=self.path,
347 |                 hexdigests={
348 |                     "md5": self.expected_md5,
349 |                 },
350 |                 force=False,
351 |             )
352 | 
353 |     def test_override_hash_remote_error(self) -> None:
354 |         """Test that a pre-existing file that fails the remote hash check raises an error when force=False."""
355 |         self.path.write_text("test file content")
356 | 
357 |         self.assertTrue(self.path.exists())
358 |         with self.assertRaises(HexDigestError):
359 |             download(
360 |                 url=TEST_TXT.as_uri(),
361 |                 path=self.path,
362 |                 hexdigests_remote={
363 |                     "md5": TEST_TXT_MD5.as_uri(),
364 |                 },
365 |                 hexdigests_strict=True,
366 |                 force=False,
367 |             )
368 | 
369 |     def test_force(self) -> None:
370 |         """Test that force=True overwrites a file with mismatched contents."""
371 |         # now if force=True it should not bother with the hash check
372 |         self.path.write_text("test file content")
373 | 
374 |         self.assertTrue(self.path.exists())
375 |         download(
376 |             url=TEST_TXT.as_uri(),
377 |             path=self.path,
378 |             hexdigests={
379 |                 "md5": self.expected_md5,
380 |             },
381 |             force=True,
382 |         )
383 | 
384 |     
@skip_on_windows
385 |     def test_remote_force(self) -> None:
386 |         """Test that force=True overwrites a file with mismatched contents, checked against remote hashes."""
387 |         # now if force=True it should not bother with the hash check
388 |         self.path.write_text("test file content")
389 | 
390 |         self.assertTrue(self.path.exists())
391 |         download(
392 |             url=TEST_TXT.as_uri(),
393 |             path=self.path,
394 |             hexdigests_remote={
395 |                 "md5": TEST_TXT_MD5.as_uri(),
396 |             },
397 |             hexdigests_strict=True,
398 |             force=True,
399 |         )
400 | 
401 |     def test_hexdigest_urls(self) -> None:
402 |         """Test getting hex digests from URLs."""
403 |         for url, strict in [
404 |             (TEST_TXT_MD5, True),
405 |             (TEST_TXT_MD5, False),
406 |             (TEST_TXT_VERBOSE_MD5, False),
407 |         ]:
408 |             hexdigests = get_hexdigests_remote(
409 |                 {"md5": url.as_uri()},
410 |                 hexdigests_strict=strict,
411 |             )
412 |             self.assertEqual(
413 |                 "4221d002ceb5d3c9e9137e495ceaa647",
414 |                 hexdigests["md5"],
415 |             )
416 | 
417 |         hexdigests = get_hexdigests_remote(
418 |             {"md5": TEST_TXT_VERBOSE_MD5.as_uri()}, hexdigests_strict=True
419 |         )
420 |         self.assertNotEqual(
421 |             "4221d002ceb5d3c9e9137e495ceaa647",
422 |             hexdigests["md5"],
423 |         )
424 | 
425 |     @unittest.skip(reason="this test hits a live endpoint")
426 |     def test_live(self) -> None:
427 |         """Test live."""
428 |         hexdigests = get_hexdigests_remote(
429 |             {"md5": "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed22n0001.xml.gz.md5"},
430 |             hexdigests_strict=False,
431 |         )
432 |         self.assertEqual(
433 |             {
434 |                 "md5": "0f08d8f3947dde1f3bced5e1f450c0da",
435 |             },
436 |             hexdigests,
437 |         )
438 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (http://tox.testrun.org/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 | 
6 | [tox]
7 | # To use a PEP 517 build-backend you are required to configure tox to use an isolated_build:
8 | # https://tox.readthedocs.io/en/latest/example/package.html
9 | isolated_build = True
10 | 
11 | # These environments are run in order if you just use `tox`:
12 | envlist =
13 |     # always keep coverage-clean first
14 |     coverage-clean
15 |     # code formatters
16 |     format
17 |     # format-docs
18 |     # Code quality assessment
19 |     pyroma
20 |     lint
21 |     mypy
22 |     # Documentation quality assurance
23 |     doc8
24 |     docstr-coverage
25 |     docs-test
26 |     # the actual tests
27 |     py
28 |     doctests
29 |     # always keep coverage-report last
30 |     coverage-report
31 | 
32 | [testenv:.pkg]
33 | # this special environment configures the build that tox does itself
34 | set_env =
35 |     UV_PREVIEW=1
36 | 
37 | [testenv]
38 | description = Run unit and integration tests.
39 | # Runs on the "tests" directory by default, or passes the positional
40 | # arguments from `tox -e py ...`
41 | commands =
42 |     coverage run -p -m pytest --durations=20 {posargs:tests}
43 |     coverage combine
44 |     coverage xml
45 | extras =
46 |     # See the [project.optional-dependencies] entry in pyproject.toml for "tests"
47 |     tests
48 |     pandas
49 |     rdf
50 |     xml
51 | set_env =
52 |     # this setting gets inherited into all environments, meaning
53 |     # that things that call uv commands don't require a --preview
54 |     UV_PREVIEW=1
55 | 
56 | [testenv:coverage-clean]
57 | description = Remove testing coverage artifacts.
58 | deps = coverage 59 | skip_install = true 60 | commands = coverage erase 61 | 62 | [testenv:doctests] 63 | description = Test that documentation examples run properly. 64 | commands = 65 | # note that the package name is required for discovery 66 | xdoctest -m src/pystow 67 | deps = 68 | xdoctest 69 | pygments 70 | extras = 71 | pandas 72 | 73 | [testenv:treon] 74 | description = Test that notebooks can run to completion 75 | commands = 76 | treon notebooks/ 77 | deps = 78 | treon 79 | 80 | [testenv:format] 81 | description = Format the code in a deterministic way using ruff. Note that ruff check should come before ruff format when using --fix (ref: https://github.com/astral-sh/ruff-pre-commit/blob/main/README.md) 82 | deps = 83 | ruff 84 | skip_install = true 85 | commands = 86 | ruff check --fix 87 | ruff format 88 | 89 | [testenv:format-docs] 90 | description = Run documentation linters. 91 | # note that this doesn't work with sphinx-click 92 | # or any other extension that adds extra directives 93 | deps = 94 | rstfmt 95 | extras = 96 | # See the [project.optional-dependencies] entry in pyproject.toml for "docs" 97 | docs 98 | skip_install = true 99 | commands = 100 | rstfmt docs/source/ 101 | 102 | [testenv:format-markdown] 103 | description = Run markdown formatter. 104 | skip_install = true 105 | allowlist_externals = 106 | npx 107 | commands = 108 | npx --yes prettier --write --prose-wrap always "**/*.md" 109 | 110 | [testenv:lint] 111 | description = Check code quality using ruff and other tools. 112 | skip_install = true 113 | deps = 114 | ruff 115 | commands = 116 | ruff check 117 | ruff format --check 118 | 119 | [testenv:pyroma] 120 | deps = 121 | pygments 122 | pyroma 123 | skip_install = true 124 | commands = pyroma --min=10 . 125 | description = Run the pyroma tool to check the package friendliness of the project. 126 | 127 | [testenv:mypy] 128 | description = Run the mypy tool to check static typing on the project. Installs the package to make sure all type stubs get recognized. 129 | deps = 130 | mypy 131 | types-requests 132 | extras = 133 | pandas 134 | rdf 135 | xml 136 | commands = mypy --install-types --non-interactive --ignore-missing-imports --strict src/ tests/ 137 | 138 | [testenv:doc8] 139 | skip_install = true 140 | deps = 141 | doc8 142 | extras = 143 | docs 144 | commands = 145 | doc8 docs/source/ 146 | description = Run the doc8 tool to check the style of the RST files in the project docs. 147 | 148 | [testenv:docstr-coverage] 149 | description = Run the docstr-coverage tool to check documentation coverage. 150 | skip_install = true 151 | deps = 152 | docstr-coverage 153 | commands = 154 | docstr-coverage src/ tests/ --skip-private --skip-magic 155 | 156 | [testenv:docs] 157 | description = Build the documentation locally, allowing warnings. 158 | extras = 159 | # See the [project.optional-dependencies] entry in pyproject.toml for "docs" 160 | docs 161 | # You might need to add additional extras if your documentation covers it 162 | commands = 163 | python -m sphinx -b html -d docs/build/doctrees docs/source docs/build/html 164 | 165 | [testenv:docs-test] 166 | description = Test building the documentation in an isolated environment. Warnings are considered as errors via -W. 
167 | changedir = docs
168 | extras =
169 |     {[testenv:docs]extras}
170 | commands =
171 |     mkdir -p {envtmpdir}
172 |     cp -r source {envtmpdir}/source
173 |     python -m sphinx -W -b html -d {envtmpdir}/build/doctrees {envtmpdir}/source {envtmpdir}/build/html
174 |     # python -m sphinx -W -b coverage -d {envtmpdir}/build/doctrees {envtmpdir}/source {envtmpdir}/build/coverage
175 |     # cat {envtmpdir}/build/coverage/c.txt
176 |     # cat {envtmpdir}/build/coverage/python.txt
177 | allowlist_externals =
178 |     cp
179 |     cat
180 |     mkdir
181 | 
182 | [testenv:coverage-xml]
183 | deps = coverage[toml]
184 | skip_install = true
185 | commands = coverage xml
186 | 
187 | [testenv:coverage-report]
188 | # TODO this is broken
189 | deps = coverage[toml]
190 | skip_install = true
191 | commands =
192 |     coverage report
193 | 
194 | ####################
195 | # Deployment tools #
196 | ####################
197 | 
198 | [testenv:bumpversion]
199 | description = Bump the version number
200 | commands = bump-my-version bump {posargs}
201 | skip_install = true
202 | passenv = HOME
203 | deps =
204 |     bump-my-version
205 | 
206 | [testenv:bumpversion-release]
207 | description = Remove the -dev tag from the version
208 | commands = bump-my-version bump release --tag
209 | skip_install = true
210 | passenv = HOME
211 | deps =
212 |     bump-my-version
213 | 
214 | [testenv:build]
215 | skip_install = true
216 | deps =
217 |     uv
218 | commands =
219 |     uv build --sdist --wheel --no-build-isolation
220 | 
221 | ############
222 | # Releases #
223 | ############
224 | 
225 | # In order to make a release to PyPI, you'll need to take the following steps:
226 | #
227 | # 1. Navigate to https://pypi.org/account/register/ to register for PyPI
228 | # 2. Navigate to https://pypi.org/manage/account/ and request to re-send a verification email.
229 | #    This is not sent by default, and is required to set up 2-Factor Authentication.
230 | # 3. Get account recovery codes
231 | # 4. Set up 2-Factor Authentication
232 | # 5. Get an API token from https://pypi.org/manage/account/token/
233 | # 6. Install keyring with `uv tool install keyring`
234 | # 7. Add your token to keyring with `keyring set https://upload.pypi.org/legacy/ __token__`
235 | 
236 | [testenv:release]
237 | description = Release the code to PyPI so users can pip install it, using credentials from keyring
238 | skip_install = true
239 | deps =
240 |     {[testenv:build]deps}
241 |     uv
242 |     keyring
243 | commands =
244 |     {[testenv:build]commands}
245 |     uv publish --username __token__ --keyring-provider subprocess --publish-url https://upload.pypi.org/legacy/
246 | 
247 | [testenv:release-via-env]
248 | description = Release the code to PyPI so users can pip install it, using credentials from the environment.
249 | skip_install = true
250 | deps =
251 |     {[testenv:build]deps}
252 |     uv
253 | commands =
254 |     {[testenv:build]commands}
255 |     uv publish --publish-url https://upload.pypi.org/legacy/
256 | passenv =
257 |     UV_PUBLISH_USERNAME
258 |     UV_PUBLISH_PASSWORD
259 | 
260 | [testenv:finish]
261 | description =
262 |     Run a workflow that removes -dev from the version, creates a tagged release on GitHub,
263 |     creates a release on PyPI, and bumps the version again.
264 | skip_install = true 265 | passenv = 266 | HOME 267 | deps = 268 | {[testenv:release]deps} 269 | bump-my-version 270 | commands = 271 | {[testenv:bumpversion-release]commands} 272 | {[testenv:release]commands} 273 | git push --tags 274 | bump-my-version bump patch 275 | git push 276 | allowlist_externals = 277 | git 278 | 279 | ################# 280 | # Test Releases # 281 | ################# 282 | 283 | # In order to test making a release to Test PyPI, you'll need to take the following steps: 284 | # 285 | # 1. Navigate to https://test.pypi.org/account/register/ to register for Test PyPI 286 | # 2. Navigate to https://test.pypi.org/manage/account/ and request to re-send a verification email. 287 | # This is not sent by default, and is required to set up 2-Factor Authentication. 288 | # 3. Get account recovery codes 289 | # 4. Set up 2-Factor Authentication 290 | # 5. Get an API token from https://test.pypi.org/manage/account/token/ 291 | # 6. Install keyring with `uv tool install keyring` 292 | # 7. Add your token to keyring with `keyring set https://test.pypi.org/legacy/ __token__` 293 | 294 | [testenv:testrelease] 295 | description = Release the code to the test PyPI site 296 | skip_install = true 297 | deps = 298 | {[testenv:build]deps} 299 | uv 300 | keyring 301 | commands = 302 | {[testenv:build]commands} 303 | uv publish --username __token__ --keyring-provider subprocess --publish-url https://test.pypi.org/legacy/ 304 | 305 | [testenv:testfinish] 306 | description = 307 | Run a workflow that removes -dev from the version, creates a tagged release on GitHub, 308 | creates a release on Test PyPI, and bumps the version again. 309 | skip_install = true 310 | passenv = 311 | HOME 312 | deps = 313 | {[testenv:testrelease]deps} 314 | bump-my-version 315 | commands = 316 | {[testenv:bumpversion-release]commands} 317 | {[testenv:testrelease]commands} 318 | git push --tags 319 | bump-my-version bump patch 320 | git push 321 | allowlist_externals = 322 | git 323 | --------------------------------------------------------------------------------