├── .git_archival.txt
├── .gitattributes
├── .github
│   ├── CONTRIBUTING.md
│   ├── dependabot.yml
│   ├── matchers
│   │   └── pylint.json
│   └── workflows
│       ├── cd.yml
│       ├── ci.yml
│       └── keep-alive.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── docs
│   ├── api
│   │   └── idc_index.rst
│   ├── cli_tools.rst
│   ├── column_descriptions.md
│   ├── conf.py
│   └── index.md
├── idc_index
│   ├── __init__.py
│   ├── _version.pyi
│   ├── cli.py
│   ├── index.py
│   └── py.typed
├── noxfile.py
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── idcindex.py
    ├── prior_version_manifest.s5cmd
    ├── study_manifest_aws.s5cmd
    ├── study_manifest_bogus.s5cmd
    ├── study_manifest_gcs.s5cmd
    └── test_package.py

/.git_archival.txt:
--------------------------------------------------------------------------------
1 | node: 9d905f9b7e4ab719bfe54b1d747505283870ed8b
2 | node-date: 2025-05-19T17:26:21-04:00
3 | describe-name: 0.9.0
4 | ref-names: HEAD -> main, tag: 0.9.0
5 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | .git_archival.txt export-subst
2 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | See the [Scientific Python Developer Guide][spc-dev-intro] for a detailed
2 | description of best practices for developing scientific packages.
3 |
4 | [spc-dev-intro]: https://learn.scientific-python.org/development/
5 |
6 | # Quick development
7 |
8 | The fastest way to start with development is to use nox. If you don't have nox,
9 | you can use `pipx run nox` to run it without installing, or `pipx install nox`.
10 | If you don't have pipx (pip for applications), then you can install it with
11 | `pip install pipx` (the only case where installing an application with regular
12 | pip is reasonable). If you use macOS, pipx and nox are both available in brew:
13 | `brew install pipx nox`.
14 |
15 | To use, run `nox`. This will lint and test using every supported version of
16 | Python, skipping versions that are not installed on your system. You can also
17 | run specific jobs:
18 |
19 | ```console
20 | $ nox -s lint  # Lint only
21 | $ nox -s tests  # Python tests
22 | $ nox -s docs -- --serve  # Build and serve the docs
23 | $ nox -s build  # Make an SDist and wheel
24 | ```
25 |
26 | Nox handles everything for you, including setting up a temporary virtual
27 | environment for each run.
28 |
29 | # Setting up a development environment manually
30 |
31 | You can set up a development environment by running:
32 |
33 | ```bash
34 | python3 -m venv .venv
35 | source ./.venv/bin/activate
36 | pip install -v -e .[dev]
37 | ```
38 |
39 | If you have the
40 | [Python Launcher for Unix](https://github.com/brettcannon/python-launcher), you
41 | can instead do:
42 |
43 | ```bash
44 | py -m venv .venv
45 | py -m pip install -v -e .[dev]
46 | ```
47 |
48 | # Post setup
49 |
50 | You should prepare pre-commit, which will help you by checking that commits
51 | pass required checks:
52 |
53 | ```bash
54 | pip install pre-commit  # or brew install pre-commit on macOS
55 | pre-commit install  # Will install a pre-commit hook into the git repo
56 | ```
57 |
58 | You can also/alternatively run `pre-commit run` (changes only) or
59 | `pre-commit run --all-files` to check even without installing the hook.
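You can also run a single hook by its id, or restrict the checks to specific
files (the hook ids used here come from this repository's
`.pre-commit-config.yaml`):

```console
$ pre-commit run codespell --all-files        # run only the codespell hook
$ pre-commit run --files idc_index/index.py   # run every hook on one file
```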
60 |
61 | # Testing
62 |
63 | Use pytest to run the unit tests:
64 |
65 | ```bash
66 | pytest
67 | ```
68 |
69 | # Coverage
70 |
71 | Use pytest-cov to generate coverage reports:
72 |
73 | ```bash
74 | pytest --cov=idc_index
75 | ```
76 |
77 | # Building docs
78 |
79 | You can build the docs using:
80 |
81 | ```bash
82 | nox -s docs
83 | ```
84 |
85 | You can see a preview with:
86 |
87 | ```bash
88 | nox -s docs -- --serve
89 | ```
90 |
91 | # Pre-commit
92 |
93 | This project uses pre-commit for all style checking. While you can run it with
94 | nox, this is such an important tool that it deserves to be installed on its own.
95 | Install pre-commit and run:
96 |
97 | ```bash
98 | pre-commit run -a
99 | ```
100 |
101 | to check all files.
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   # Maintain dependencies for GitHub Actions
4 |   - package-ecosystem: "github-actions"
5 |     directory: "/"
6 |     schedule:
7 |       interval: "weekly"
8 |     groups:
9 |       actions:
10 |         patterns:
11 |           - "*"
--------------------------------------------------------------------------------
/.github/matchers/pylint.json:
--------------------------------------------------------------------------------
1 | {
2 |   "problemMatcher": [
3 |     {
4 |       "severity": "warning",
5 |       "pattern": [
6 |         {
7 |           "regexp": "^([^:]+):(\\d+):(\\d+): ([A-DF-Z]\\d+): \\033\\[[\\d;]+m([^\\033]+).*$",
8 |           "file": 1,
9 |           "line": 2,
10 |           "column": 3,
11 |           "code": 4,
12 |           "message": 5
13 |         }
14 |       ],
15 |       "owner": "pylint-warning"
16 |     },
17 |     {
18 |       "severity": "error",
19 |       "pattern": [
20 |         {
21 |           "regexp": "^([^:]+):(\\d+):(\\d+): (E\\d+): \\033\\[[\\d;]+m([^\\033]+).*$",
22 |           "file": 1,
23 |           "line": 2,
24 |           "column": 3,
25 |           "code": 4,
26 |           "message": 5
27 |         }
28 |       ],
29 |       "owner": "pylint-error"
30 |     }
31 |   ]
32 | }
--------------------------------------------------------------------------------
/.github/workflows/cd.yml:
--------------------------------------------------------------------------------
1 | name: wheels
2 |
3 | on:
4 |   workflow_dispatch:
5 |   pull_request:
6 |   push:
7 |     branches:
8 |       - main
9 |   release:
10 |     types:
11 |       - published
12 |
13 | concurrency:
14 |   group: ${{ github.workflow }}-${{ github.ref }}
15 |   cancel-in-progress: true
16 |
17 | env:
18 |   FORCE_COLOR: 3
19 |
20 | jobs:
21 |   dist:
22 |     name: Distribution build
23 |     runs-on: ubuntu-latest
24 |
25 |     steps:
26 |       - uses: actions/checkout@v4
27 |         with:
28 |           fetch-depth: 0
29 |
30 |       - uses: hynek/build-and-inspect-python-package@v2
31 |
32 |   publish:
33 |     needs: [dist]
34 |     name: Publish to PyPI
35 |     environment: pypi
36 |     permissions:
37 |       id-token: write
38 |     runs-on: ubuntu-latest
39 |     if: github.event_name == 'release' && github.event.action == 'published'
40 |
41 |     steps:
42 |       - uses: actions/download-artifact@v4
43 |         with:
44 |           name: Packages
45 |           path: dist
46 |
47 |       - uses: pypa/gh-action-pypi-publish@release/v1
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 |   workflow_dispatch:
5 |   pull_request:
6 |   push:
7 |     branches:
8 |       - main
9 |
10 | concurrency:
11 |   group: ${{ github.workflow }}-${{ github.ref }}
12 |   cancel-in-progress: true
13 |
14 | env:
15 |   FORCE_COLOR: 3
16 |
17 | jobs:
18 |   pre-commit:
19 |     name: Format
20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | - uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | - uses: pre-commit/action@v3.0.1 29 | with: 30 | extra_args: --hook-stage manual --all-files 31 | - name: Run PyLint 32 | run: | 33 | echo "::add-matcher::$GITHUB_WORKSPACE/.github/matchers/pylint.json" 34 | pipx run nox -s pylint 35 | 36 | checks: 37 | name: Check Python ${{ matrix.python-version }} on ${{ matrix.runs-on }} 38 | runs-on: ${{ matrix.runs-on }} 39 | needs: [pre-commit] 40 | strategy: 41 | fail-fast: false 42 | matrix: 43 | python-version: ["3.8", "3.12"] 44 | runs-on: [ubuntu-latest, macos-latest, windows-latest] 45 | 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | fetch-depth: 0 50 | 51 | - uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | allow-prereleases: true 55 | 56 | - name: Install package 57 | run: python -m pip install .[test] 58 | 59 | - name: Test package 60 | run: >- 61 | python -m pytest -ra --cov --cov-report=xml --cov-report=term 62 | --durations=20 -vv 63 | 64 | # - name: Upload coverage report 65 | # uses: codecov/codecov-action@v4.1.0 66 | # with: 67 | # token: ${{ secrets.CODECOV_TOKEN }} 68 | -------------------------------------------------------------------------------- /.github/workflows/keep-alive.yaml: -------------------------------------------------------------------------------- 1 | name: keep-github-actions-alive 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | workflow_dispatch: 7 | 8 | permissions: 9 | actions: write 10 | 11 | jobs: 12 | keep-alive: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | contents: write 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | ref: "keep-alive" 20 | - uses: gautamkrishnar/keepalive-workflow@v2 21 | with: 22 | time_elapsed: 50 23 | use_api: false 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macos 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # pytype static type analyzer 138 | .pytype/ 139 | 140 | # Cython debug symbols 141 | cython_debug/ 142 | 143 | # setuptools_scm 144 | */_version.py 145 | 146 | # ruff 147 | .ruff_cache/ 148 | 149 | # OS specific stuff 150 | .DS_Store 151 | .DS_Store? 
152 | ._* 153 | .Spotlight-V100 154 | .Trashes 155 | ehthumbs.db 156 | Thumbs.db 157 | 158 | # Common editor files 159 | *~ 160 | *.swp 161 | 162 | # idc-index 163 | */idc_index.csv.zip 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_commit_msg: "chore: update pre-commit hooks" 3 | autofix_commit_msg: "style: pre-commit fixes" 4 | 5 | repos: 6 | - repo: https://github.com/adamchainz/blacken-docs 7 | rev: "1.16.0" 8 | hooks: 9 | - id: blacken-docs 10 | additional_dependencies: [black==24.*] 11 | 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: "v4.5.0" 14 | hooks: 15 | - id: check-added-large-files 16 | - id: check-case-conflict 17 | - id: check-merge-conflict 18 | - id: check-symlinks 19 | - id: check-yaml 20 | - id: debug-statements 21 | - id: end-of-file-fixer 22 | - id: mixed-line-ending 23 | # - id: name-tests-test 24 | # args: ["--pytest-test-first"] 25 | - id: requirements-txt-fixer 26 | - id: trailing-whitespace 27 | 28 | - repo: https://github.com/pre-commit/pygrep-hooks 29 | rev: "v1.10.0" 30 | hooks: 31 | - id: rst-backticks 32 | - id: rst-directive-colons 33 | - id: rst-inline-touching-normal 34 | 35 | - repo: https://github.com/pre-commit/mirrors-prettier 36 | rev: "v3.1.0" 37 | hooks: 38 | - id: prettier 39 | types_or: [yaml, markdown, html, css, scss, javascript, json] 40 | args: [--prose-wrap=always] 41 | 42 | - repo: https://github.com/astral-sh/ruff-pre-commit 43 | rev: "v0.2.2" 44 | hooks: 45 | - id: ruff 46 | args: ["--fix", "--show-fixes"] 47 | - id: ruff-format 48 | 49 | #- repo: https://github.com/pre-commit/mirrors-mypy 50 | # rev: "v1.8.0" 51 | # hooks: 52 | # - id: mypy 53 | # files: idc_index|tests 54 | # args: [] 55 | # additional_dependencies: 56 | # - pandas-stubs 57 | # - pytest 58 | 59 | - repo: https://github.com/codespell-project/codespell 60 | rev: "v2.2.6" 61 | hooks: 62 | - id: codespell 63 | args: ["--quiet-level 3"] 64 | 65 | - repo: https://github.com/shellcheck-py/shellcheck-py 66 | rev: "v0.9.0.6" 67 | hooks: 68 | - id: shellcheck 69 | 70 | - repo: local 71 | hooks: 72 | - id: disallow-caps 73 | name: Disallow improper capitalization 74 | language: pygrep 75 | entry: PyBind|Numpy|Cmake|CCache|Github|PyTest 76 | exclude: .pre-commit-config.yaml 77 | 78 | - repo: https://github.com/abravalheri/validate-pyproject 79 | rev: "v0.16" 80 | hooks: 81 | - id: validate-pyproject 82 | additional_dependencies: ["validate-pyproject-schema-store[all]"] 83 | 84 | - repo: https://github.com/python-jsonschema/check-jsonschema 85 | rev: "0.28.0" 86 | hooks: 87 | - id: check-dependabot 88 | - id: check-github-workflows 89 | - id: check-readthedocs 90 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.11" 10 | sphinx: 11 | configuration: docs/conf.py 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 
17 |       extra_requirements:
18 |         - docs
19 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to idc-index
2 |
3 | There are many ways to contribute to idc-index, with varying levels of effort.
4 | Do try to look through the [documentation][idc-index-docs] first if something is
5 | unclear, and let us know how we can do better.
6 |
7 | - Ask a question on the [IDC forum][idc-forum]
8 | - Use [idc-index issues][idc-index-issues] to submit a feature request or bug,
9 |   or add to the discussion on an existing issue
10 | - Submit a [Pull Request](https://github.com/ImagingDataCommons/idc-index/pulls)
11 |   to improve idc-index or its documentation
12 |
13 | We encourage a range of Pull Requests, from patches that include passing tests
14 | and documentation, all the way down to half-baked ideas that launch discussions.
15 |
16 | ## The PR Process, CI, and Related Gotchas
17 |
18 | ### How to submit a PR?
19 |
20 | If you are new to idc-index development and you don't have push access to the
21 | repository, here are the steps:
22 |
23 | 1. [Fork and clone](https://docs.github.com/get-started/quickstart/fork-a-repo)
24 |    the repository.
25 | 2. Create a branch dedicated to the feature/bugfix you plan to implement (do not
26 |    use the `main` branch, as this will complicate further development and
27 |    collaboration).
28 | 3. [Push](https://docs.github.com/get-started/using-git/pushing-commits-to-a-remote-repository)
29 |    the branch to your GitHub fork.
30 | 4. Create a
31 |    [Pull Request](https://github.com/ImagingDataCommons/idc-index/pulls).
32 |
33 | This corresponds to the `Fork & Pull Model` described in the
34 | [GitHub collaborative development](https://docs.github.com/pull-requests/collaborating-with-pull-requests/getting-started/about-collaborative-development-models)
35 | documentation.
36 |
37 | When submitting a PR, the developers following the project will be notified.
38 | That said, to engage specific developers, you can add a `Cc: @` comment
39 | to notify them of your awesome contributions. Based on the comments posted by
40 | the reviewers, you may have to revisit your patches.
41 |
42 | ### How to contribute efficiently?
43 |
44 | We encourage all developers to:
45 |
46 | - set up pre-commit hooks so that you can remedy various formatting and other
47 |   issues early, without waiting for the continuous integration (CI) checks to
48 |   complete: `pre-commit install`
49 |
50 | - add or update tests. You can see current tests
51 |   [here](https://github.com/ImagingDataCommons/idc-index/tree/main/tests). If
52 |   you contribute new functionality, adding test(s) covering it is mandatory!
53 |
54 | - you can run individual tests from the root repository using the following
55 |   command: `python -m unittest -vv tests.idcindex.TestIDCClient.`
56 |
57 | ### How to write commit messages?
58 |
59 | Write your commit messages using the standard prefixes for commit messages:
60 |
61 | - `BUG:` Fix for runtime crash or incorrect result
62 | - `COMP:` Compiler error or warning fix
63 | - `DOC:` Documentation change
64 | - `ENH:` New functionality
65 | - `PERF:` Performance improvement
66 | - `STYLE:` No logic impact (indentation, comments)
67 | - `WIP:` Work In Progress not ready for merge
68 |
69 | The body of the message should clearly describe the motivation of the commit
70 | (**what**, **why**, and **how**). In order to ease the task of reviewing
71 | commits, the message body should follow these guidelines:
72 |
73 | 1. Leave a blank line between the subject and the body. This helps `git log`
74 |    and `git rebase` work nicely, and allows smooth generation of release notes.
75 | 2. Try to keep the subject line below 72 characters, ideally 50.
76 | 3. Capitalize the subject line.
77 | 4. Do not end the subject line with a period.
78 | 5. Use the imperative mood in the subject line (e.g.
79 |    `BUG: Fix spacing not being considered.`).
80 | 6. Wrap the body at 80 characters.
81 | 7. Use semantic line feeds to separate different ideas, which improves
82 |    readability.
83 | 8. Be concise, but honor the change: if significant alternative solutions were
84 |    available, explain why they were discarded.
85 | 9. If the commit refers to a topic discussed on the [IDC forum][idc-forum], or
86 |    fixes a regression test, provide the link. If it fixes a compiler error,
87 |    provide a minimal verbatim message of the compiler error. If the commit
88 |    closes an issue, use the
89 |    [GitHub issue closing keywords](https://docs.github.com/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue).
90 |
91 | Keep in mind that significant time is invested in reviewing commits and
92 | _pull requests_, so following these guidelines greatly helps the people
93 | doing reviews.
94 |
95 | These guidelines are largely inspired by Chris Beam's
96 | [How to Write a Commit Message](https://chris.beams.io/posts/git-commit/) post.
97 |
98 | ### How to integrate a PR?
99 |
100 | Getting your contributions integrated is relatively straightforward; here is
101 | the checklist:
102 |
103 | - All tests pass
104 | - Consensus is reached. This usually means that at least two reviewers approved
105 |   the changes (or added a `LGTM` comment) and at least one business day passed
106 |   without anyone objecting. `LGTM` is an acronym for _Looks Good to Me_.
107 | - To accommodate developers explicitly asking for more time to test the
108 |   proposed changes, integration time can be delayed by a few more days.
109 | - If you do NOT have push access, a core developer will integrate your PR. If
110 |   you would like to speed up the integration, do not hesitate to add a reminder
111 |   comment to the PR.
112 |
113 | ### Automatic testing of pull requests
114 |
115 | Every pull request is tested automatically using GitHub Actions each time you
116 | push a commit to it. The GitHub UI will restrict users from merging pull
117 | requests until the CI build has returned with a successful result indicating
118 | that all tests have passed.
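Putting the commit message guidelines above into practice, a well-formed commit
message might look like the following (a purely hypothetical example; the
subject, body, and issue number are all illustrative):

```text
ENH: Add progress bar to manifest downloads

Large manifest downloads previously gave no feedback about their
progress. Report the transfer status with a progress bar so that
users can estimate the remaining download time.

Closes #123.
```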
119 | 120 | [idc-forum]: https://discourse.canceridc.dev 121 | [idc-index-issues]: https://github.com/ImagingDataCommons/idc-index/issues 122 | [idc-index-docs]: https://idc-index.readthedocs.io/ 123 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Imaging Data Commons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # idc-index 2 | 3 | [![Actions Status][actions-badge]][actions-link] 4 | [![Documentation Status][rtd-badge]][rtd-link] 5 | 6 | [![PyPI version][pypi-version]][pypi-link] 7 | [![PyPI platforms][pypi-platforms]][pypi-link] 8 | 9 | [![Discourse Forum][discourse-forum-badge]][discourse-forum-link] 10 | 11 | > [!WARNING] 12 | > 13 | > This package is in its early development stages. Its functionality and API 14 | > will change. 15 | > 16 | > Stay tuned for the updates and documentation, and please share your feedback 17 | > about it by opening issues in this repository, or by starting a discussion in 18 | > [IDC User forum](https://discourse.canceridc.dev/). 19 | 20 | 21 | 22 | ## About 23 | 24 | `idc-index` is a Python package that enables basic operations for working with 25 | [NCI Imaging Data Commons (IDC)](https://imaging.datacommons.cancer.gov): 26 | 27 | - subsetting of the IDC data using selected metadata attributes 28 | - download of the files corresponding to selection 29 | - generation of the viewer URLs for the selected data 30 | 31 | ## Getting started 32 | 33 | Install the latest version of the package. 34 | 35 | ```bash 36 | $ pip install --upgrade idc-index 37 | ``` 38 | 39 | Instantiate `IDCClient`, which provides the interface for main operations. 40 | 41 | ```python 42 | from idc_index import IDCClient 43 | 44 | client = IDCClient.client() 45 | ``` 46 | 47 | You can use [IDC Portal](https://imaging.datacommons.cancer.gov/explore/) to 48 | browse collections, cases, studies and series, copy their identifiers and 49 | download the corresponding files using `idc-index` helper functions. 
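For example, after copying a SeriesInstanceUID from the portal, you can generate
a viewer link for it. This is a minimal sketch that assumes the `get_viewer_URL`
helper of `IDCClient`; the UID below is a placeholder:

```python
from idc_index import IDCClient

client = IDCClient.client()

# Placeholder UID - copy a real SeriesInstanceUID from the IDC Portal
series_uid = "1.2.840.113654.2.55.1234567890"

# Assumed helper: returns the URL to open this series in the IDC viewer
print(client.get_viewer_URL(seriesInstanceUID=series_uid))
```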
50 |
51 | You can try this out with the `rider_pilot` collection, which is just 10.5 GB in
52 | size:
53 |
54 | ```python
55 | client.download_from_selection(collection_id="rider_pilot", downloadDir=".")
56 | ```
57 |
58 | ... or run queries against the "mini" index of Imaging Data Commons data, and
59 | download images that match your selection criteria! The following will select
60 | all Magnetic Resonance (MR) series, and will download the first 10.
61 |
62 | ```python
63 | from idc_index import index
64 |
65 | client = index.IDCClient()
66 |
67 | query = """
68 | SELECT
69 |   SeriesInstanceUID
70 | FROM
71 |   index
72 | WHERE
73 |   Modality = 'MR'
74 | """
75 |
76 | selection_df = client.sql_query(query)
77 |
78 | client.download_from_selection(
79 |     seriesInstanceUID=list(selection_df["SeriesInstanceUID"].values[:10]),
80 |     downloadDir=".",
81 | )
82 | ```
83 |
84 | ## Tutorial
85 |
86 | Please check out
87 | [this tutorial notebook](https://github.com/ImagingDataCommons/IDC-Tutorials/blob/master/notebooks/labs/idc_rsna2023.ipynb)
88 | for an introduction to using `idc-index`.
89 |
90 | ## Resources
91 |
92 | - [Imaging Data Commons Portal](https://imaging.datacommons.cancer.gov/) can be
93 |   used to explore the content of IDC from the web browser
94 | - [s5cmd](https://github.com/peak/s5cmd) is a highly efficient, open source,
95 |   multi-platform S3 client that we use for downloading IDC data, which is hosted
96 |   in public AWS and GCS buckets. Distributed on PyPI as
97 |   [s5cmd](https://pypi.org/project/s5cmd/).
98 | - [SlicerIDCBrowser](https://github.com/ImagingDataCommons/SlicerIDCBrowser) 3D
99 |   Slicer extension that relies on `idc-index` for search and download of IDC
100 |   data
101 |
102 | ## Acknowledgment
103 |
104 | This software is maintained by the IDC team, which has been funded in whole or
105 | in part with Federal funds from the NCI, NIH, under task order no. HHSN26110071
106 | under contract no. HHSN261201500003l.
107 |
108 | If this package helped your research, we would appreciate it if you could cite
109 | the IDC paper below.
110 |
111 | > Fedorov, A., Longabaugh, W. J. R., Pot, D., Clunie, D. A., Pieper, S. D.,
112 | > Gibbs, D. L., Bridge, C., Herrmann, M. D., Homeyer, A., Lewis, R., Aerts, H.
113 | > J. W., Krishnaswamy, D., Thiriveedhi, V. K., Ciausu, C., Schacherer, D. P.,
114 | > Bontempi, D., Pihl, T., Wagner, U., Farahani, K., Kim, E. & Kikinis, R.
115 | > _National Cancer Institute Imaging Data Commons: Toward Transparency,
116 | > Reproducibility, and Scalability in Imaging Artificial Intelligence_.
117 | > RadioGraphics (2023). https://doi.org/10.1148/rg.230180
118 |
119 |
120 | [actions-badge]: https://github.com/ImagingDataCommons/idc-index/workflows/CI/badge.svg
121 | [actions-link]: https://github.com/ImagingDataCommons/idc-index/actions
122 | [discourse-forum-badge]: https://img.shields.io/discourse/https/discourse.canceridc.dev/status.svg
123 | [discourse-forum-link]: https://discourse.canceridc.dev/
124 | [pypi-link]: https://pypi.org/project/idc-index/
125 | [pypi-platforms]: https://img.shields.io/pypi/pyversions/idc-index
126 | [pypi-version]: https://img.shields.io/pypi/v/idc-index
127 | [rtd-badge]: https://readthedocs.org/projects/idc-index/badge/?version=latest
128 | [rtd-link]: https://idc-index.readthedocs.io/en/latest/?badge=latest
129 |
130 |
131 |
--------------------------------------------------------------------------------
/docs/api/idc_index.rst:
--------------------------------------------------------------------------------
1 | idc\_index package API
2 | =======================
3 |
4 | .. automodule:: idc_index
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |
9 | Submodules
10 | ----------
11 |
12 | idc\_index.index module
13 | -----------------------
14 |
15 | .. automodule:: idc_index.index
16 |    :members:
17 |    :undoc-members:
18 |    :show-inheritance:
19 |
20 | idc\_index.cli module
21 | -----------------------
22 |
23 | .. automodule:: idc_index.cli
24 |    :members:
25 |    :undoc-members:
26 |    :show-inheritance:
--------------------------------------------------------------------------------
/docs/cli_tools.rst:
--------------------------------------------------------------------------------
1 | Command Line Interface tools
2 | ============================
3 |
4 | *idc-index* provides a command line interface (CLI) tool to simplify access to the functionality
5 | implemented via the Python API. The CLI tool is a wrapper around the Python API and provides a
6 | simple way to interact with the package.
7 |
8 | Once *idc-index* is installed, the CLI tool can be accessed by running the following command in the
9 | terminal.
10 |
11 | .. click:: idc_index.cli:idc
12 |    :prog: idc
13 |    :nested: full
--------------------------------------------------------------------------------
/docs/column_descriptions.md:
--------------------------------------------------------------------------------
1 | # Metadata attributes in `idc-index`'s index tables
2 |
3 | `idc-index` is named this way because it wraps indices of IDC data: tables
4 | containing the most important metadata attributes describing the files available
5 | in IDC. The main metadata index is available in the `index` variable (which is a
6 | pandas `DataFrame`) of `IDCClient`. Additional index tables, such as the
7 | `clinical_index`, contain non-DICOM clinical data; the slide microscopy specific
8 | tables (indicated by the prefix `sm`) include metadata attributes specific to
9 | slide microscopy images.
10 |
11 | ## `index`
12 |
13 | The following is the list of the columns included in `index`. You can use those
14 | to select cohorts and subset the data. `index` is series-based, i.e., it has one
15 | row per DICOM series.
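Since `index` is a regular pandas `DataFrame`, you can subset it directly. The
following minimal sketch (the collection id and modality are example values)
selects a cohort and inspects a few of the columns described below:

```python
from idc_index import IDCClient

client = IDCClient.client()

# Example: all MR series from one collection
selection = client.index[
    (client.index["Modality"] == "MR")
    & (client.index["collection_id"] == "rider_pilot")
]

print(selection[["PatientID", "SeriesInstanceUID", "series_size_MB"]].head())
```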
16 |
17 | - non-DICOM attributes assigned/curated by IDC:
18 |
19 |   - `collection_id`: short string with the identifier of the collection the
20 |     series belongs to
21 |   - `analysis_result_id`: this string is not empty if the specific series is
22 |     part of an analysis results collection; analysis results can be added to a
23 |     given collection over time
24 |   - `source_DOI`: Digital Object Identifier of the dataset that contains the
25 |     given series; note that a given collection can include one or more DOIs,
26 |     since analysis results added to the collection would typically have
27 |     independent DOI values!
28 |   - `instanceCount`: number of files in the series (typically, this matches the
29 |     number of slices in cross-sectional modalities)
30 |   - `license_short_name`: short name of the license that governs the use of the
31 |     files corresponding to the series
32 |   - `series_aws_url`: location of the series files in a public AWS bucket
33 |   - `series_size_MB`: total disk size (in MB) needed to store the series
34 |
35 | - DICOM attributes extracted from the files:
36 |   - `PatientID`: identifier of the patient
37 |   - `PatientAge` and `PatientSex`: attributes containing patient age and sex
38 |   - `StudyInstanceUID`: unique identifier of the DICOM study
39 |   - `StudyDescription`: textual description of the study content
40 |   - `StudyDate`: date of the study (note that those dates are shifted, and are
41 |     not real dates when images were acquired, to protect patient privacy)
42 |   - `SeriesInstanceUID`: unique identifier of the DICOM series
43 |   - `SeriesDate`: date when the series was acquired
44 |   - `SeriesDescription`: textual description of the series content
45 |   - `SeriesNumber`: series number
46 |   - `BodyPartExamined`: body part imaged
47 |   - `Modality`: acquisition modality
48 |   - `Manufacturer`: manufacturer of the equipment that generated the series
49 |   - `ManufacturerModelName`: model name of the equipment
50 |
51 | ## `sm_index`
52 |
53 | The following is the list of the columns included in `sm_index`. `sm_index` is
54 | series-based, i.e., it has one row per DICOM series, but only includes series
55 | with slide microscopy data.
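Note that, unlike `index`, the `sm_index` table is not installed by default (see
`indices_overview` in `idc_index/index.py`). A minimal sketch, assuming the
client's `fetch_index` helper is used to install it before exploring the columns
listed below:

```python
from idc_index import IDCClient

client = IDCClient.client()

# Assumed helper: fetches the named index from the IDC release assets
# and attaches it to the client as a DataFrame attribute
client.fetch_index("sm_index")

print(client.sm_index["SeriesInstanceUID"].nunique(), "slides indexed")
```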
56 |
57 | - DICOM attributes extracted from the files:
58 |   - `SeriesInstanceUID`: unique identifier of the DICOM series: one DICOM series
59 |     = one slide
60 |   - `embeddingMedium`: describes in what medium the slide was embedded before
61 |     the image was obtained
62 |   - `tissueFixative`: describes tissue fixatives used before the image was
63 |     obtained
64 |   - `staining_usingSubstance`: describes staining steps the specimen underwent
65 |     before the image was obtained
66 |   - `max_TotalPixelMatrixColumns`: width of the image at the maximum resolution
67 |   - `max_TotalPixelMatrixRows`: height of the image at the maximum resolution
68 |   - `min_PixelSpacing_2sf`: pixel spacing in mm at the maximum resolution layer,
69 |     rounded to 2 significant figures
70 |   - `ObjectiveLensPower`: power of the objective lens of the equipment used to
71 |     digitize the slide
72 |   - `primaryAnatomicStructure`: anatomic location from where the imaged specimen
73 |     was collected
74 |   - `primaryAnatomicStructureModifier`: additional characteristics of the
75 |     specimen, such as whether it is a tumor or normal tissue
76 |   - `admittingDiagnosis`: if available, diagnosis of the patient; populated
77 |     using the first item of the `AdmittingDiagnosesSequence` in DICOM SM series
78 |   - `illuminationType`: specifies the type of illumination used when obtaining
79 |     the image
80 |
81 | In case of `embeddingMedium`, `tissueFixative`, `staining_usingSubstance`,
82 | `primaryAnatomicStructure`, `primaryAnatomicStructureModifier`,
83 | `admittingDiagnosis` and `illuminationType`, the attributes exist with the
84 | suffixes `_code_designator_value_str` and `_CodeMeaning`, which indicate whether
85 | the column contains CodeSchemeDesignator and CodeValue, or CodeMeaning. If this
86 | is new to you, a brief explanation of the three-value based coding scheme in
87 | DICOM can be found at https://learn.canceridc.dev/dicom/coding-schemes.
88 |
89 | ## `sm_instance_index`
90 |
91 | The following is the list of the columns included in `sm_instance_index`.
92 | `sm_instance_index` is instance-based, i.e., it has one row per DICOM instance
93 | (pyramid level of a slide, plus potentially thumbnail or label images), but only
94 | includes DICOM instances of the slide microscopy modality.
95 |
96 | - DICOM attributes extracted from the files:
97 |
98 |   - `SOPInstanceUID`: unique identifier of the DICOM instance: one DICOM
99 |     instance = one level/label/thumbnail image of the slide
100 |   - `SeriesInstanceUID`: unique identifier of the DICOM series: one DICOM series
101 |     = one slide
102 |   - `embeddingMedium`: describes in what medium the slide was embedded before
103 |     the image was obtained
104 |   - `tissueFixative`: describes tissue fixatives used before the image was
105 |     obtained
106 |   - `staining_usingSubstance`: describes staining steps the specimen underwent
107 |     before the image was obtained
108 |   - `TotalPixelMatrixColumns`: width of the image
109 |   - `TotalPixelMatrixRows`: height of the image
110 |   - `PixelSpacing_0`: pixel spacing in mm
111 |   - `ImageType`: specifies further characteristics of the image in a list,
112 |     including, as the third value, whether it is a VOLUME, LABEL, OVERVIEW or
113 |     THUMBNAIL image.
114 |   - `TransferSyntaxUID`: specifies the encoding scheme used for the image data
115 |   - `instance_size`: specifies the DICOM instance's size in bytes
116 |
117 | - non-DICOM attributes assigned/curated by IDC:
118 |   - `crdc_instance_uuid`: globally unique, versioned identifier of the DICOM
119 |     instance
120 |
121 | In case of `embeddingMedium`, `tissueFixative`, and `staining_usingSubstance`,
122 | the attributes exist with the suffixes `_code_designator_value_str` and
123 | `_CodeMeaning`, which indicate whether the column contains CodeSchemeDesignator
124 | and CodeValue, or CodeMeaning. If this is new to you, a brief explanation of the
125 | three-value based coding scheme in DICOM can be found at
126 | https://learn.canceridc.dev/dicom/coding-schemes.
127 |
128 | ## `clinical_index`
129 |
130 | Many of the image collections available in IDC are accompanied by clinical data.
131 | Such clinical data is organized in one or more tables that are shared alongside
132 | the images.
133 |
134 | Each row in `clinical_index` corresponds to a column in a clinical table
135 | available in IDC. You can use this index to find collections that have a
136 | specific clinical attribute, compare the availability of clinical data across
137 | collections, or identify patients that have specific clinical characteristics.
138 |
139 | Note that IDC does not perform any harmonization of the clinical data across
140 | collections, or any validation of the content of the tables. We share clinical
141 | data as it was provided by the submitter.
142 |
143 | `clinical_index` provides the list of all of the columns across all of the
144 | clinical tables available in IDC. It contains the following items:
145 |
146 | - `collection_id`: identifier of the collection where the given clinical data
147 |   attribute is available
148 | - `short_table_name`: name of the clinical data table where the attribute is
149 |   encountered; the referenced table can be loaded into a Pandas DataFrame using
150 |   the `IDCClient.get_clinical_data()` call
151 | - `table_name`: fully resolved name of the table in IDC Google BigQuery public
152 |   dataset (only relevant if you would like to search using BigQuery)
153 | - `column`: name of the column that is available in the given clinical table
154 | - `colum_label`: label of the column (this field may contain more extensive
155 |   information describing a given column)
156 | - `values`: set of values defining the content of the column (relevant if the
157 |   column contains a fixed list of values and not free text)
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import importlib.metadata
4 |
5 | project = "idc-index"
6 | copyright = "2024, Imaging Data Commons"
7 | author = "Imaging Data Commons"
8 | version = release = importlib.metadata.version("idc_index")
9 |
10 | extensions = [
11 |     "myst_parser",
12 |     "sphinx.ext.autodoc",
13 |     "sphinx.ext.intersphinx",
14 |     "sphinx.ext.mathjax",
15 |     "sphinx.ext.napoleon",
16 |     "sphinx_autodoc_typehints",
17 |     "sphinx_copybutton",
18 |     "sphinx_click",
19 | ]
20 |
21 | source_suffix = [".rst", ".md"]
22 | exclude_patterns = [
23 |     "_build",
24 |     "**.ipynb_checkpoints",
25 |     "Thumbs.db",
26 |     ".DS_Store",
27 |     ".env",
28 |     ".venv",
29 | ]
30 |
31 | html_theme = "furo"
32 |
33 | myst_enable_extensions = [
34 |     "colon_fence",
35 | ]
36 |
37 | intersphinx_mapping = {
38 |     "python":
("https://docs.python.org/3", None), 39 | "pandas": ("http://pandas.pydata.org/pandas-docs/stable", None), 40 | } 41 | 42 | nitpick_ignore = [ 43 | ("py:class", "_io.StringIO"), 44 | ("py:class", "_io.BytesIO"), 45 | ] 46 | 47 | always_document_param_types = True 48 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # idc-index 2 | 3 | ```{toctree} 4 | :maxdepth: 2 5 | :hidden: 6 | ``` 7 | 8 | :::{warning} 9 | 10 | This package is in its early development stages. Its functionality and API will 11 | change. 12 | 13 | Stay tuned for the updates and documentation, and please share your feedback 14 | about it by opening issues at the 15 | [idc-index](https://github.com/ImagingDataCommons/idc-index) repository, or by 16 | starting a discussion in [IDC User forum](https://discourse.canceridc.dev/). 17 | 18 | ::: 19 | 20 | ```{include} ../README.md 21 | :start-after: 22 | ``` 23 | 24 | ## Contents 25 | 26 | ```{toctree} 27 | :maxdepth: 2 28 | :titlesonly: 29 | :caption: API docs 30 | 31 | column_descriptions 32 | cli_tools.rst 33 | api/idc_index 34 | ``` 35 | 36 | ## Indices and tables 37 | 38 | - {ref}`genindex` 39 | - {ref}`modindex` 40 | - {ref}`search` 41 | -------------------------------------------------------------------------------- /idc_index/__init__.py: -------------------------------------------------------------------------------- 1 | """Copyright (c) 2024 Imaging Data Commons. All rights reserved. 2 | 3 | idc-index: Package to query and download data from an index of ImagingDataCommons 4 | """ 5 | 6 | 7 | from __future__ import annotations 8 | 9 | from ._version import version as __version__ 10 | 11 | __all__ = ["__version__"] 12 | 13 | from .index import IDCClient 14 | 15 | _ = IDCClient 16 | -------------------------------------------------------------------------------- /idc_index/_version.pyi: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | version: str 4 | version_tuple: tuple[int, int, int] | tuple[int, int, int, str, str] 5 | -------------------------------------------------------------------------------- /idc_index/cli.py: -------------------------------------------------------------------------------- 1 | """CLI module for the IDC client. 2 | 3 | This module provides command-line interface (CLI) commands to interact with the Imaging Data Commons (IDC) data. 4 | """ 5 | from __future__ import annotations 6 | 7 | import logging 8 | from pathlib import Path 9 | 10 | import click 11 | 12 | from . import index 13 | from .index import IDCClient 14 | 15 | # Set up logging for the CLI module 16 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG) 17 | logger_cli = logging.getLogger("cli") 18 | logger_cli.setLevel(logging.INFO) 19 | 20 | 21 | def _get_max_path_length(): 22 | # can make this more robust 23 | return 260 24 | 25 | 26 | @click.group() 27 | def idc(): 28 | """`idc` is a command line interface to the API functionality available in idc-index.""" 29 | 30 | 31 | def set_log_level(log_level): 32 | """Set the logging level for the CLI module. 33 | 34 | Args: 35 | log_level (str): The logging level to set. 
36 |     """
37 |     log_levels = {
38 |         "debug": logging.DEBUG,
39 |         "info": logging.INFO,
40 |         "warning": logging.WARNING,
41 |         "error": logging.ERROR,
42 |         "critical": logging.CRITICAL,
43 |     }
44 |     logging_level = log_levels.get(log_level.lower(), logging.WARNING)
45 |     logger_cli.debug(f"Setting the log level of index.py to {logging_level}")
46 |     index.logger.setLevel(logging_level)
47 |     logger_cli.setLevel(logging_level)
48 |
49 |
50 | @idc.command()
51 | @click.option(
52 |     "--download-dir",
53 |     required=True,
54 |     type=click.Path(),
55 |     help="Path to the directory to download the files to.",
56 | )
57 | @click.option(
58 |     "--dry-run",
59 |     type=bool,
60 |     default=False,
61 |     help="If set, calculates the size of the cohort but download does not start.",
62 | )
63 | @click.option(
64 |     "--collection-id",
65 |     type=str,
66 |     multiple=True,
67 |     default=None,
68 |     help="Collection ID(s) to filter by.",
69 | )
70 | @click.option(
71 |     "--patient-id",
72 |     type=str,
73 |     multiple=True,
74 |     default=None,
75 |     help="Patient ID(s) to filter by.",
76 | )
77 | @click.option(
78 |     "--study-instance-uid",
79 |     type=str,
80 |     multiple=True,
81 |     default=None,
82 |     help="DICOM StudyInstanceUID(s) to filter by.",
83 | )
84 | @click.option(
85 |     "--series-instance-uid",
86 |     type=str,
87 |     multiple=True,
88 |     default=None,
89 |     help="DICOM SeriesInstanceUID(s) to filter by.",
90 | )
91 | @click.option(
92 |     "--crdc-series-uuid",
93 |     type=str,
94 |     multiple=True,
95 |     default=None,
96 |     help="crdc_series_uuid(s) to filter by.",
97 | )
98 | @click.option(
99 |     "--quiet",
100 |     type=bool,
101 |     default=True,
102 |     help="If set, suppresses the output of the subprocess.",
103 | )
104 | @click.option(
105 |     "--show-progress-bar",
106 |     type=bool,
107 |     default=True,
108 |     help="If set, tracks the progress of download.",
109 | )
110 | @click.option(
111 |     "--use-s5cmd-sync",
112 |     type=bool,
113 |     default=False,
114 |     help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
115 | )
116 | @click.option(
117 |     "--log-level",
118 |     type=click.Choice(
119 |         ["debug", "info", "warning", "error", "critical"], case_sensitive=False
120 |     ),
121 |     default="info",
122 |     help="Set the logging level for the CLI module.",
123 | )
124 | @click.option(
125 |     "--dir-template",
126 |     type=str,
127 |     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
128 |     help="Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore).
When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.", 129 | ) 130 | def download_from_selection( 131 | download_dir, 132 | dry_run, 133 | collection_id, 134 | patient_id, 135 | study_instance_uid, 136 | series_instance_uid, 137 | crdc_series_uuid, 138 | quiet, 139 | show_progress_bar, 140 | use_s5cmd_sync, 141 | log_level, 142 | dir_template, 143 | ): 144 | """Download from a selection of collection(s), patient(s), study(studies) and series. 145 | 146 | The filtering will be applied in sequence by first selecting the collection(s), followed by 147 | patient(s), study(studies) and series. If no filtering is applied, all the files will be downloaded. 148 | """ 149 | # Set the logging level for the CLI module 150 | set_log_level(log_level) 151 | # Create an instance of the IDCClient 152 | client = IDCClient() 153 | logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index") 154 | # Parse the input parameters and pass them to IDCClient's download_from_selection method 155 | collection_id = ( 156 | [cid.strip() for cid in (",".join(collection_id)).split(",")] 157 | if collection_id 158 | else None 159 | ) 160 | patient_id = ( 161 | [pid.strip() for pid in (",".join(patient_id)).split(",")] 162 | if patient_id 163 | else None 164 | ) 165 | study_instance_uid = ( 166 | [uid.strip() for uid in (",".join(study_instance_uid)).split(",")] 167 | if study_instance_uid 168 | else None 169 | ) 170 | series_instance_uid = ( 171 | [uid.strip() for uid in (",".join(series_instance_uid)).split(",")] 172 | if series_instance_uid 173 | else None 174 | ) 175 | crdc_series_uuid = ( 176 | [uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")] 177 | if crdc_series_uuid 178 | else None 179 | ) 180 | logger_cli.debug("Inputs received from cli download:") 181 | logger_cli.debug(f"collection_id: {collection_id}") 182 | logger_cli.debug(f"patient_id: {patient_id}") 183 | logger_cli.debug(f"study_instance_uid: {study_instance_uid}") 184 | logger_cli.debug(f"series_instance_uid: {series_instance_uid}") 185 | logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}") 186 | logger_cli.debug(f"dry_run: {dry_run}") 187 | logger_cli.debug(f"quiet: {quiet}") 188 | logger_cli.debug(f"show_progress_bar: {show_progress_bar}") 189 | logger_cli.debug(f"use_s5cmd_sync: {use_s5cmd_sync}") 190 | logger_cli.debug(f"dirTemplate: {dir_template}") 191 | 192 | client.download_from_selection( 193 | download_dir, 194 | dry_run=dry_run, 195 | collection_id=collection_id, 196 | patientId=patient_id, 197 | studyInstanceUID=study_instance_uid, 198 | seriesInstanceUID=series_instance_uid, 199 | crdc_series_uuid=crdc_series_uuid, 200 | quiet=quiet, 201 | show_progress_bar=show_progress_bar, 202 | use_s5cmd_sync=use_s5cmd_sync, 203 | dirTemplate=dir_template, 204 | ) 205 | 206 | 207 | @idc.command() 208 | @click.option( 209 | "--manifest-file", 210 | required=True, 211 | type=click.Path(), 212 | help="The path to the manifest file.", 213 | ) 214 | @click.option( 215 | "--download-dir", 216 | required=True, 217 | type=click.Path(), 218 | help="Path to the directory to download the files to.", 219 | ) 220 | @click.option( 221 | "--quiet", 222 | type=bool, 223 | default=True, 224 | help="If set, suppresses the output of the subprocess.", 225 | ) 226 | @click.option( 227 | "--validate-manifest", 228 | type=bool, 229 | default=True, 230 | help="If True, validates the manifest for any errors. 
Defaults to True.",
231 | )
232 | @click.option(
233 |     "--show-progress-bar",
234 |     type=bool,
235 |     default=True,
236 |     help="If set, tracks the progress of download.",
237 | )
238 | @click.option(
239 |     "--use-s5cmd-sync",
240 |     type=bool,
241 |     default=False,
242 |     help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
243 | )
244 | @click.option(
245 |     "--log-level",
246 |     type=click.Choice(
247 |         ["debug", "info", "warning", "error", "critical"], case_sensitive=False
248 |     ),
249 |     default="info",
250 |     help="Set the logging level for the CLI module.",
251 | )
252 | @click.option(
253 |     "--dir-template",
254 |     type=str,
255 |     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
256 |     help="Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
257 | )
258 | def download_from_manifest(
259 |     manifest_file,
260 |     download_dir,
261 |     quiet,
262 |     validate_manifest,
263 |     show_progress_bar,
264 |     use_s5cmd_sync,
265 |     log_level,
266 |     dir_template,
267 | ):
268 |     """Download the manifest file.
269 |
270 |     In a series of steps, the manifest file is first validated to ensure every line contains a valid URL.
271 |     It then gets the total size to be downloaded, and runs the download in one
272 |     process while tracking the download progress in another.
273 |     """
274 |     # Set the logging level for the CLI module
275 |     set_log_level(log_level)
276 |     # Create an instance of the IDCClient
277 |     client = IDCClient()
278 |     logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
279 |     logger_cli.debug("Inputs received from cli manifest download:")
280 |     logger_cli.debug(f"manifest_file_path: {manifest_file}")
281 |     logger_cli.debug(f"download_dir: {download_dir}")
282 |     logger_cli.debug(f"validate_manifest: {validate_manifest}")
283 |     logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
284 |     logger_cli.debug(f"use_s5cmd_sync: {use_s5cmd_sync}")
285 |     logger_cli.debug(f"dirTemplate: {dir_template}")
286 |
287 |     # Call IDCClient's download_from_manifest method with the provided parameters
288 |     client.download_from_manifest(
289 |         manifestFile=manifest_file,
290 |         downloadDir=download_dir,
291 |         quiet=quiet,
292 |         validate_manifest=validate_manifest,
293 |         show_progress_bar=show_progress_bar,
294 |         use_s5cmd_sync=use_s5cmd_sync,
295 |         dirTemplate=dir_template,
296 |     )
297 |
298 |
299 | @idc.command()
300 | @click.argument(
301 |     "generic_argument",
302 |     type=str,
303 | )
304 | @click.option(
305 |     "--download-dir",
306 |     required=False,
307 |     type=click.Path(),
308 |     help="Path to the directory to download the files to.",
309 | )
310 | @click.option(
311 |     "--dir-template",
312 |     type=str,
313 |     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
314 |     help="Download directory hierarchy template.
This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
315 | )
316 | @click.option(
317 |     "--log-level",
318 |     type=click.Choice(
319 |         ["debug", "info", "warning", "error", "critical"], case_sensitive=False
320 |     ),
321 |     default="info",
322 |     help="Set the logging level for the CLI module.",
323 | )
324 | def download(generic_argument, download_dir, dir_template, log_level):
325 |     """Download content given the input parameter.
326 |
327 |     Determine whether the input parameter corresponds to a file manifest or a list of collection_id, PatientID, StudyInstanceUID, or SeriesInstanceUID values, and download the corresponding files into the current directory. Default parameters will be used for organizing the downloaded files into a folder hierarchy. Use `download_from_selection()` and `download_from_manifest()` functions if granular control over the download process is needed.
328 |     """
329 |     # Set the logging level for the CLI module
330 |     set_log_level(log_level)
331 |     # Create an instance of the IDCClient
332 |     client = IDCClient()
333 |
334 |     logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
335 |
336 |     if download_dir:
337 |         download_dir = Path(download_dir)
338 |     else:
339 |         download_dir = Path.cwd()
340 |
341 |     if (
342 |         len(generic_argument) < _get_max_path_length()
343 |         and Path(generic_argument).is_file()
344 |     ):
345 |         # Parse the input parameters and pass them to IDC
346 |         logger_cli.info("Detected manifest file, downloading from manifest.")
347 |         client.download_from_manifest(
348 |             generic_argument, downloadDir=download_dir, dirTemplate=dir_template
349 |         )
350 |     # this is not a file manifest
351 |     else:
352 |         # Split the input string and filter out any empty values
353 |         item_ids = [item for item in generic_argument.split(",") if item]
354 |
355 |         if not item_ids:
356 |             logger_cli.error("No valid IDs provided.")
357 |
358 |         index_df = client.index
359 |
360 |         def check_and_download(column_name, item_ids, download_dir, kwarg_name):
361 |             matches = index_df[column_name].isin(item_ids)
362 |             matched_ids = index_df[column_name][matches].unique().tolist()
363 |             if not matched_ids:
364 |                 return False
365 |             unmatched_ids = list(set(item_ids) - set(matched_ids))
366 |             if unmatched_ids:
367 |                 logger_cli.debug(
368 |                     f"Partial match for {column_name}: matched {matched_ids}, unmatched {unmatched_ids}"
369 |                 )
370 |             logger_cli.info(f"Identified matching {column_name}: {matched_ids}")
371 |             client.download_from_selection(
372 |                 **{
373 |                     kwarg_name: matched_ids,
374 |                     "downloadDir": download_dir,
375 |                     "dirTemplate": dir_template,
376 |                 }
377 |             )
378 |             return True
379 |
380 |         matches_found = 0
381 |         matches_found += check_and_download(
382 |             "collection_id", item_ids, download_dir, "collection_id"
383 |         )
384 |         matches_found += check_and_download(
385 |             "PatientID", item_ids, download_dir, "patientId"
386 |         )
387 |         matches_found += check_and_download(
"StudyInstanceUID", item_ids, download_dir, "studyInstanceUID" 389 | ) 390 | matches_found += check_and_download( 391 | "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID" 392 | ) 393 | matches_found += check_and_download( 394 | "crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid" 395 | ) 396 | if not matches_found: 397 | logger_cli.error( 398 | "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid." 399 | ) 400 | 401 | 402 | if __name__ == "__main__": 403 | idc() 404 | -------------------------------------------------------------------------------- /idc_index/index.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=too-many-lines 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import os 7 | import re 8 | import shutil 9 | import subprocess 10 | import tempfile 11 | import time 12 | from importlib.metadata import distribution, version 13 | from pathlib import Path 14 | 15 | import duckdb 16 | import idc_index_data 17 | import pandas as pd 18 | import platformdirs 19 | import psutil 20 | import requests 21 | from packaging.version import Version 22 | from tqdm import tqdm 23 | 24 | aws_endpoint_url = "https://s3.amazonaws.com" 25 | gcp_endpoint_url = "https://storage.googleapis.com" 26 | asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" 27 | 28 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | class IDCClient: 33 | # Default download hierarchy template 34 | DOWNLOAD_HIERARCHY_DEFAULT = ( 35 | "%collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID" 36 | ) 37 | 38 | # Defined citation formats that can be passed to the citations request methods 39 | # see acceptable values at https://citation.crosscite.org/docs.html#sec-4 40 | CITATION_FORMAT_APA = "text/x-bibliography; style=apa; locale=en-US" 41 | CITATION_FORMAT_TURTLE = "text/turtle" 42 | CITATION_FORMAT_JSON = "application/vnd.citationstyles.csl+json" 43 | CITATION_FORMAT_BIBTEX = "application/x-bibtex" 44 | 45 | # Singleton pattern 46 | # NOTE: In the future, one may want to use multiple clients e.g. for sub-datasets so a attribute-singleton as shown below seems a better option. 
47 | # _instance: IDCClient 48 | # def __new__(cls): 49 | # if not hasattr(cls, "_instance") or getattr(cls, "_instance") is None: 50 | # instance = super(IDCClient, cls).__new__(cls) 51 | # setattr(cls, "_instance", instance) 52 | # return cls._instance 53 | 54 | _client: IDCClient 55 | 56 | @classmethod 57 | def client(cls) -> IDCClient: 58 | if not hasattr(cls, "_client") or getattr(cls, "_client") is None: 59 | setattr(cls, "_client", IDCClient()) 60 | 61 | return cls._client 62 | 63 | def __init__(self): 64 | # Read main index file 65 | file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH 66 | logger.debug(f"Reading index file v{idc_index_data.__version__}") 67 | self.index = pd.read_parquet(file_path) 68 | 69 | # initialize crdc_series_uuid for the index 70 | # TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113 71 | # is merged (to minimize disruption), it will make more sense to change 72 | # idc-index-data to separate bucket from crdc_series_uuid, add support for GCP, 73 | # and consequently simplify the code here 74 | self.index["crdc_series_uuid"] = ( 75 | self.index["series_aws_url"].str.split("/").str[3] 76 | ) 77 | 78 | self.prior_versions_index_path = ( 79 | idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH 80 | ) 81 | file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH 82 | 83 | self.prior_versions_index = pd.read_parquet(file_path) 84 | 85 | # self.index = self.index.astype(str).replace("nan", "") 86 | self.index["series_size_MB"] = self.index["series_size_MB"].astype(float) 87 | self.collection_summary = self.index.groupby("collection_id").agg( 88 | {"Modality": pd.Series.unique, "series_size_MB": "sum"} 89 | ) 90 | 91 | self.idc_version = f"v{Version(idc_index_data.__version__).major}" 92 | 93 | # since indices can change between versions, we need to store them in a versioned directory 94 | self.indices_data_dir = platformdirs.user_data_dir( 95 | "idc_index_data", "IDC", version=version("idc-index-data") 96 | ) 97 | # these are the items that are fetched from IDC release assets (e.g., clinical data files) 98 | self.idc_data_dir = platformdirs.user_data_dir( 99 | "IDC", "IDC", version=self.idc_version 100 | ) 101 | self.clinical_data_dir = None 102 | 103 | self.indices_overview = { 104 | "index": { 105 | "description": "Main index containing one row per DICOM series.", 106 | "installed": True, 107 | "url": None, 108 | "file_path": idc_index_data.IDC_INDEX_PARQUET_FILEPATH, 109 | }, 110 | "prior_versions_index": { 111 | "description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.", 112 | "installed": True, 113 | "url": None, 114 | "file_path": idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH, 115 | }, 116 | "sm_index": { 117 | "description": "DICOM Slide Microscopy series-level index.", 118 | "installed": False, 119 | "url": f"{asset_endpoint_url}/sm_index.parquet", 120 | "file_path": None, 121 | }, 122 | "sm_instance_index": { 123 | "description": "DICOM Slide Microscopy instance-level index.", 124 | "installed": False, 125 | "url": f"{asset_endpoint_url}/sm_instance_index.parquet", 126 | "file_path": None, 127 | }, 128 | "clinical_index": { 129 | "description": "Index of clinical data accompanying the available images.", 130 | "installed": False, 131 | "url": f"{asset_endpoint_url}/clinical_index.parquet", 132 | "file_path": None, 133 | }, 134 | } 135 | 136 | # these will point to the dataframes containing the respective indices, once installed 137 | 
        self.sm_index = None
138 |         self.sm_instance_index = None
139 |         self.clinical_index = None
140 | 
141 |         # Lookup s5cmd
142 |         self.s5cmdPath = shutil.which("s5cmd")
143 |         if self.s5cmdPath is None:
144 |             # Workaround to support environments without a properly set up PATH
145 |             # See https://github.com/Slicer/Slicer/pull/7587
146 |             logger.debug("Falling back to looking up s5cmd alongside the package")
147 |             for script in distribution("s5cmd").files:
148 |                 if str(script).startswith("s5cmd/bin/s5cmd"):
149 |                     self.s5cmdPath = script.locate().resolve(strict=True)
150 |                     break
151 |         if self.s5cmdPath is None:
152 |             raise FileNotFoundError(
153 |                 "s5cmd executable not found. Please install s5cmd from https://github.com/peak/s5cmd#installation"
154 |             )
155 |         self.s5cmdPath = str(self.s5cmdPath)
156 |         logger.debug(f"Found s5cmd executable: {self.s5cmdPath}")
157 |         # ... and check it can be executed
158 |         subprocess.check_call([self.s5cmdPath, "--help"], stdout=subprocess.DEVNULL)
159 | 
160 |     @staticmethod
161 |     def _replace_aws_with_gcp_buckets(dataframe, column_name):
162 |         # mapping from AWS to GCS buckets is fixed
163 |         replacements = {
164 |             r"s3://idc-open-data-two/": r"s3://idc-open-idc1/",
165 |             r"s3://idc-open-data-cr/": r"s3://idc-open-cr/",
166 |             # as of IDC v20, we use a new bucket that has the same name as AWS
167 |             # for `idc-open-data` - no need to replace
168 |             # r"s3://idc-open-data/": r"s3://public-datasets-idc/",
169 |         }
170 | 
171 |         # Function to apply replacements
172 |         def replace_url_parts(url):
173 |             for old, new in replacements.items():
174 |                 url = re.sub(old, new, url)
175 |             return url
176 | 
177 |         # Apply the replacements to the requested column
178 |         dataframe[column_name] = dataframe[column_name].apply(replace_url_parts)
179 | 
180 |     @staticmethod
181 |     def _filter_dataframe_by_id(key, dataframe, _id):
182 |         values = _id
183 |         if isinstance(_id, str):
184 |             values = [_id]
185 |         filtered_df = dataframe[dataframe[key].isin(values)].copy()
186 |         if filtered_df.empty:
187 |             error_message = f"No data found for the {key} with the values {values}."
188 |             raise ValueError(error_message)
189 |         return filtered_df
190 | 
191 |     @staticmethod
192 |     def _safe_filter_by_selection(
193 |         df_index,
194 |         collection_id,
195 |         patientId,
196 |         studyInstanceUID,
197 |         seriesInstanceUID,
198 |         sopInstanceUID,
199 |         crdc_series_uuid,
200 |     ):
201 |         if collection_id is not None:
202 |             if not isinstance(collection_id, str) and not isinstance(
203 |                 collection_id, list
204 |             ):
205 |                 raise TypeError("collection_id must be a string or list of strings")
206 |         if patientId is not None:
207 |             if not isinstance(patientId, str) and not isinstance(patientId, list):
208 |                 raise TypeError("patientId must be a string or list of strings")
209 |         if studyInstanceUID is not None:
210 |             if not isinstance(studyInstanceUID, str) and not isinstance(
211 |                 studyInstanceUID, list
212 |             ):
213 |                 raise TypeError("studyInstanceUID must be a string or list of strings")
214 |         if seriesInstanceUID is not None:
215 |             if not isinstance(seriesInstanceUID, str) and not isinstance(
216 |                 seriesInstanceUID, list
217 |             ):
218 |                 raise TypeError("seriesInstanceUID must be a string or list of strings")
219 |         if sopInstanceUID is not None:
220 |             if not isinstance(sopInstanceUID, str) and not isinstance(
221 |                 sopInstanceUID, list
222 |             ):
223 |                 raise TypeError("sopInstanceUID must be a string or list of strings")
224 | 
225 |         if crdc_series_uuid is not None:
226 |             if not isinstance(crdc_series_uuid, str) and not isinstance(
227 |                 crdc_series_uuid, list
228 |             ):
229 |                 raise TypeError("crdc_series_uuid must be a string or list of strings")
230 | 
231 |         # Here we go from the bottom of the filtering hierarchy up, taking into
232 |         # account the direction of the one-to-many relationships:
233 |         # one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
234 |         # one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
235 |         # one StudyInstanceUID can be associated with one and only one PatientID
236 |         # one PatientID can be associated with one and only one collection_id
237 |         # Because of this, we do not need to apply any attributes that sit above the
238 |         # most specific attribute given in the hierarchy.
239 |         # The earlier implemented behavior was, unfortunately, a relic of the API of a
240 |         # different system that influenced the API of SlicerIDCIndex and propagated into idc-index.
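        # For example (an illustrative consequence of the above): a
        # seriesInstanceUID filter alone already pins down the StudyInstanceUID,
        # PatientID and collection_id of every matching row, so a collection_id
        # value passed alongside it would be redundant.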
241 | 
242 |         if crdc_series_uuid is not None:
243 |             result_df = IDCClient._filter_dataframe_by_id(
244 |                 "crdc_series_uuid", df_index, crdc_series_uuid
245 |             )
246 |             return result_df
247 | 
248 |         if sopInstanceUID is not None:
249 |             result_df = IDCClient._filter_by_dicom_instance_uid(
250 |                 df_index, sopInstanceUID
251 |             )
252 |             return result_df
253 | 
254 |         if seriesInstanceUID is not None:
255 |             result_df = IDCClient._filter_by_dicom_series_uid(
256 |                 df_index, seriesInstanceUID
257 |             )
258 |             return result_df
259 | 
260 |         if studyInstanceUID is not None:
261 |             result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
262 |             return result_df
263 | 
264 |         if patientId is not None:
265 |             result_df = IDCClient._filter_by_patient_id(df_index, patientId)
266 |             return result_df
267 | 
268 |         if collection_id is not None:
269 |             result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
270 |             return result_df
271 | 
272 |         return None
273 | 
274 |     @staticmethod
275 |     def _filter_by_collection_id(df_index, collection_id):
276 |         return IDCClient._filter_dataframe_by_id(
277 |             "collection_id", df_index, collection_id
278 |         )
279 | 
280 |     @staticmethod
281 |     def _filter_by_patient_id(df_index, patient_id):
282 |         return IDCClient._filter_dataframe_by_id("PatientID", df_index, patient_id)
283 | 
284 |     @staticmethod
285 |     def _filter_by_dicom_study_uid(df_index, dicom_study_uid):
286 |         return IDCClient._filter_dataframe_by_id(
287 |             "StudyInstanceUID", df_index, dicom_study_uid
288 |         )
289 | 
290 |     @staticmethod
291 |     def _filter_by_dicom_series_uid(df_index, dicom_series_uid):
292 |         return IDCClient._filter_dataframe_by_id(
293 |             "SeriesInstanceUID", df_index, dicom_series_uid
294 |         )
295 | 
296 |     @staticmethod
297 |     def _filter_by_dicom_instance_uid(df_index, dicom_instance_uid):
298 |         return IDCClient._filter_dataframe_by_id(
299 |             "SOPInstanceUID", df_index, dicom_instance_uid
300 |         )
301 | 
302 |     @staticmethod
303 |     def get_idc_version():
304 |         """
305 |         Returns the version of IDC data used in idc-index
306 |         """
307 |         idc_version = Version(idc_index_data.__version__).major
308 |         return f"v{idc_version}"
309 | 
310 |     @staticmethod
311 |     def _check_create_directory(download_dir):
312 |         """
313 |         Mimic the behavior of s5cmd and create the download directory if it does not exist
314 |         """
315 |         download_dir = Path(download_dir)
316 |         download_dir.mkdir(parents=True, exist_ok=True)
317 | 
318 |         return str(download_dir.resolve())
319 | 
320 |     def _check_disk_size_and_warn(self, download_dir, disk_size_needed):
321 |         disk_free_space_MB = psutil.disk_usage(download_dir).free / (1000 * 1000)
322 |         logger.info("Disk size needed: " + self._format_size(disk_size_needed))
323 |         logger.info("Disk size available: " + self._format_size(disk_free_space_MB))
324 |         if disk_free_space_MB < disk_size_needed:
325 |             logger.error("Not enough free space on disk to download the files.")
326 |             return False
327 |         return True
328 | 
329 |     def fetch_index(self, index_name) -> None:
330 |         """
331 |         Downloads the requested index and stores it as the corresponding attribute of the client.
332 | 
333 |         Args:
334 |             index_name (str): Name of the index to be downloaded.
335 |         """
336 | 
337 |         if index_name not in self.indices_overview:
338 |             logger.error(f"Index {index_name} is not available and cannot be fetched.")
339 |         elif self.indices_overview[index_name]["installed"]:
340 |             logger.warning(
341 |                 f"Index {index_name} already installed and will not be fetched again."
342 | ) 343 | else: 344 | logger.info("Fetching index %s", index_name) 345 | response = requests.get( 346 | self.indices_overview[index_name]["url"], timeout=30 347 | ) 348 | if response.status_code == 200: 349 | filepath = os.path.join( 350 | self.indices_data_dir, 351 | f"{index_name}.parquet", 352 | ) 353 | 354 | os.makedirs(os.path.dirname(filepath), exist_ok=True) 355 | with open(filepath, mode="wb") as file: 356 | file.write(response.content) 357 | 358 | index_table = pd.read_parquet(filepath) 359 | # index_table = index_table.merge( 360 | # self.index[["series_aws_url", "SeriesInstanceUID"]], 361 | # on="SeriesInstanceUID", how="left" 362 | # ) 363 | # TODO: consider switching to class variable! 364 | # setattr(self.__class__, index_name, index_table) 365 | setattr(self, index_name, index_table) 366 | self.indices_overview[index_name]["installed"] = True 367 | self.indices_overview[index_name]["file_path"] = filepath 368 | 369 | else: 370 | logger.error( 371 | f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}" 372 | ) 373 | # if clinical_index is requested, likely the user will need clinical data 374 | # download it here, given that the size is small (<2MB as of IDC v19) 375 | if index_name == "clinical_index": 376 | logger.info( 377 | "Since clinical_index was fetched, also installing corresponding tables." 378 | ) 379 | # create clinical_data folder under self.idc_data_dir, if it does not exist 380 | self.clinical_data_dir = os.path.join(self.idc_data_dir, "clinical_data") 381 | idc_clinical_data_release_url = f"s3://idc-open-metadata/bigquery_export/idc_{self.idc_version}_clinical/*" 382 | result = subprocess.run( 383 | [ 384 | self.s5cmdPath, 385 | "--no-sign-request", 386 | "cp", 387 | idc_clinical_data_release_url, 388 | self.clinical_data_dir, 389 | ], 390 | capture_output=True, 391 | text=True, 392 | check=True, 393 | ) 394 | if result.stderr and result.stdout.startswith("ERROR"): 395 | logger.error("Failed to download IDC clinical data.") 396 | else: 397 | logger.info( 398 | "IDC clinical data downloaded successfully to %s", 399 | self.clinical_data_dir, 400 | ) 401 | 402 | def get_clinical_table(self, table_name): 403 | """ 404 | Returns the requested clinical table as a pandas DataFrame. 405 | 406 | Args: 407 | table_name (str): Name of the clinical table to be loaded. 408 | 409 | Returns: 410 | pandas.DataFrame: The requested clinical table. 411 | """ 412 | if self.clinical_data_dir is None: 413 | logger.error( 414 | "Clinical data directory is not available. Please fetch clinical_index first." 415 | ) 416 | return None 417 | 418 | table_path = os.path.join(self.clinical_data_dir, table_name) 419 | if not os.path.exists(table_path): 420 | logger.error(f"Table {table_name} is not found in {table_path}.") 421 | return None 422 | 423 | return pd.read_parquet(table_path) 424 | 425 | def get_collections(self): 426 | """ 427 | Returns the collections present in IDC 428 | """ 429 | unique_collections = self.index["collection_id"].unique() 430 | return unique_collections.tolist() 431 | 432 | def get_series_size(self, seriesInstanceUID): 433 | """ 434 | Gets cumulative size (MB) of the DICOM instances in a given SeriesInstanceUID. 435 | 436 | Args: 437 | seriesInstanceUID (str): The DICOM SeriesInstanceUID. 438 | 439 | Returns: 440 | float: The cumulative size of the DICOM instances in the given SeriesInstanceUID rounded to two digits, in MB. 441 | 442 | Raises: 443 | ValueError: If the `seriesInstanceUID` does not exist. 
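
        Example:
            A minimal usage sketch (the UID below is a placeholder):

                client = IDCClient()
                size_mb = client.get_series_size("1.2.840.113654.2.55.123456789")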
444 |         """
445 | 
446 |         resp = self.index[self.index["SeriesInstanceUID"] == seriesInstanceUID][
447 |             "series_size_MB"
448 |         ].iloc[0]
449 |         return resp
450 | 
451 |     def get_patients(self, collection_id, outputFormat="dict"):
452 |         """
453 |         Gets the patients in a collection.
454 | 
455 |         Args:
456 |             collection_id (str or list[str]): The collection id or list of collection ids. This should be in lower case separated by underscores.
457 |                 For example, 'pdmr_texture_analysis' or ['pdmr_texture_analysis', 'nlst'].
458 | 
459 |             outputFormat (str): The format in which to return the patient IDs. Available options are 'dict',
460 |                 'df', and 'list'. Default is 'dict'.
461 | 
462 |         Returns:
463 |             dict or pandas.DataFrame or list: Patient IDs in the requested output format. By default, it returns a dictionary.
464 | 
465 |         Raises:
466 |             ValueError: If `outputFormat` is not one of 'dict', 'df', 'list'.
467 |         """
468 | 
469 |         if not isinstance(collection_id, str) and not isinstance(collection_id, list):
470 |             raise TypeError("collection_id must be a string or list of strings")
471 | 
472 |         if outputFormat not in ["dict", "df", "list"]:
473 |             raise ValueError("outputFormat must be either 'dict', 'df', or 'list'")
474 | 
475 |         patient_df = self._filter_by_collection_id(self.index, collection_id)
476 | 
477 |         if outputFormat == "list":
478 |             response = patient_df["PatientID"].unique().tolist()
479 |         else:
480 |             sql = """
481 |                 SELECT
482 |                     PatientID,
483 |                     STRING_AGG(DISTINCT PatientSex) as PatientSex,
484 |                     STRING_AGG(DISTINCT PatientAge) as PatientAge
485 |                 FROM
486 |                     patient_df
487 |                 GROUP BY
488 |                     PatientID
489 |                 ORDER BY
490 |                     PatientID
491 |                 """
492 |             patient_df = duckdb.sql(sql).df()
493 |             # Convert DataFrame to a list of dictionaries for the API-like response
494 |             if outputFormat == "dict":
495 |                 response = patient_df.to_dict(orient="records")
496 |             else:
497 |                 response = patient_df
498 | 
499 |         logger.debug("Get patient response: %s", str(response))
500 | 
501 |         return response
502 | 
503 |     def get_dicom_studies(self, patientId, outputFormat="dict"):
504 |         """
505 |         Returns Studies for a given patient or list of patients.
506 | 
507 |         Args:
508 |             patientId (str or list of str): The patient Id or a list of patient Ids.
509 | 
510 |             outputFormat (str): The format in which to return the studies. Available options are 'dict',
511 |                 'df', and 'list'. Default is 'dict'.
512 | 
513 |         Returns:
514 |             dict or pandas.DataFrame or list: Studies in the requested output format. By default, it returns a dictionary.
515 | 
516 |         Raises:
517 |             ValueError: If `outputFormat` is not one of 'dict', 'df', 'list'.
518 |             ValueError: If any of the `patientId` values do not exist.
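
        Example:
            A usage sketch (the PatientID below is illustrative):

                client = IDCClient()
                study_uids = client.get_dicom_studies(
                    patientId="LIDC-IDRI-0001", outputFormat="list"
                )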
519 | """ 520 | 521 | if not isinstance(patientId, str) and not isinstance(patientId, list): 522 | raise TypeError("patientId must be a string or list of strings") 523 | 524 | if outputFormat not in ["dict", "df", "list"]: 525 | raise ValueError("outputFormat must be either 'dict' or 'df' or 'list'") 526 | 527 | studies_df = self._filter_by_patient_id(self.index, patientId) 528 | 529 | if outputFormat == "list": 530 | response = studies_df["StudyInstanceUID"].unique().tolist() 531 | else: 532 | sql = """ 533 | SELECT 534 | StudyInstanceUID, 535 | STRING_AGG(DISTINCT StudyDate) as StudyDate, 536 | STRING_AGG(DISTINCT StudyDescription) as StudyDescription, 537 | COUNT(SeriesInstanceUID) as SeriesCount 538 | FROM 539 | studies_df 540 | GROUP BY 541 | StudyInstanceUID 542 | ORDER BY 543 | 2,3,4 544 | """ 545 | studies_df = duckdb.query(sql).df() 546 | 547 | if outputFormat == "dict": 548 | response = studies_df.to_dict(orient="records") 549 | else: 550 | response = studies_df 551 | 552 | logger.debug("Get patient study response: %s", str(response)) 553 | 554 | return response 555 | 556 | def get_dicom_series(self, studyInstanceUID, outputFormat="dict"): 557 | """ 558 | Returns Series for a given study or list of studies. 559 | 560 | Args: 561 | studyInstanceUID (str or list of str): The DICOM StudyInstanceUID or a list of StudyInstanceUIDs. 562 | 563 | outputFormat (str): The format in which to return the series. Available options are 'dict', 564 | 'df', and 'list'. Default is 'dict'. 565 | 566 | Returns: 567 | dict or pandas.DataFrame or list: Series in the requested output format. By default, it returns a dictionary. 568 | 569 | Raises: 570 | ValueError: If `outputFormat` is not one of 'dict', 'df', 'list'. 571 | ValueError: If any of the `studyInstanceUID` does not exist. 
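
        Example:
            A usage sketch (the UID below is a placeholder):

                client = IDCClient()
                series_df = client.get_dicom_series(
                    studyInstanceUID="1.2.840.113654.2.55.987654321", outputFormat="df"
                )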
572 | """ 573 | 574 | if not isinstance(studyInstanceUID, str) and not isinstance( 575 | studyInstanceUID, list 576 | ): 577 | raise TypeError("studyInstanceUID must be a string or list of strings") 578 | 579 | if outputFormat not in ["dict", "df", "list"]: 580 | raise ValueError("outputFormat must be either 'dict' or 'df' or 'list'") 581 | 582 | series_df = self._filter_by_dicom_study_uid(self.index, studyInstanceUID) 583 | 584 | if outputFormat == "list": 585 | response = series_df["SeriesInstanceUID"].unique().tolist() 586 | else: 587 | series_df = series_df.rename( 588 | columns={ 589 | "collection_id": "Collection", 590 | "instanceCount": "instance_count", 591 | } 592 | ) 593 | series_df["ImageCount"] = 1 594 | series_df = series_df[ 595 | [ 596 | "StudyInstanceUID", 597 | "SeriesInstanceUID", 598 | "Modality", 599 | "SeriesDate", 600 | "Collection", 601 | "BodyPartExamined", 602 | "SeriesDescription", 603 | "Manufacturer", 604 | "ManufacturerModelName", 605 | "series_size_MB", 606 | "SeriesNumber", 607 | "instance_count", 608 | "ImageCount", 609 | ] 610 | ] 611 | 612 | series_df = series_df.drop_duplicates().sort_values( 613 | by=[ 614 | "Modality", 615 | "SeriesDate", 616 | "SeriesDescription", 617 | "BodyPartExamined", 618 | "SeriesNumber", 619 | ] 620 | ) 621 | # Convert DataFrame to a list of dictionaries for the API-like response 622 | if outputFormat == "dict": 623 | response = series_df.to_dict(orient="records") 624 | else: 625 | response = series_df 626 | logger.debug("Get series response: %s", str(response)) 627 | 628 | return response 629 | 630 | def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"): 631 | """ 632 | Get the URLs of the files corresponding to the DICOM instances in a given SeriesInstanceUID. 633 | 634 | Args: 635 | SeriesInstanceUID: string containing the value of DICOM SeriesInstanceUID to filter by 636 | 637 | Returns: 638 | list of strings containing the AWS S3 URLs of the files corresponding to the SeriesInstanceUID 639 | """ 640 | if seriesInstanceUID not in self.index["SeriesInstanceUID"].values: 641 | raise ValueError("SeriesInstanceUID not found in IDC index.") 642 | 643 | selected_series_df = self.index[ 644 | self.index["SeriesInstanceUID"] == seriesInstanceUID 645 | ].copy() 646 | selected_series_df["series_aws_url"] = ( 647 | "s3://" 648 | + selected_series_df["aws_bucket"] 649 | + "/" 650 | + selected_series_df["crdc_series_uuid"] 651 | + "/" 652 | ) 653 | 654 | endpoint = aws_endpoint_url 655 | if source_bucket_location == "gcp": 656 | self._replace_aws_with_gcp_buckets(selected_series_df, "series_aws_url") 657 | endpoint = gcp_endpoint_url 658 | 659 | s3_url = selected_series_df["series_aws_url"].values[0] 660 | 661 | # Run the s5cmd ls command and capture its output 662 | result = subprocess.run( 663 | [ 664 | self.s5cmdPath, 665 | "--endpoint-url", 666 | endpoint, 667 | "--no-sign-request", 668 | "ls", 669 | s3_url, 670 | ], 671 | stdout=subprocess.PIPE, 672 | check=False, 673 | ) 674 | output = result.stdout.decode("utf-8") 675 | 676 | # Parse the output to get the file names 677 | lines = output.split("\n") 678 | file_names = [ 679 | s3_url + line.split()[-1] 680 | for line in lines 681 | if line and line.split()[-1].endswith(".dcm") 682 | ] 683 | 684 | return file_names 685 | 686 | def get_instance_file_URL(self, sopInstanceUID, source_bucket_location="aws"): 687 | """ 688 | Get the bucket URL of the file corresponding to a given SOPInstanceUID. 
689 | 
690 |         This function will only return the URL for Slide Microscopy (SM) instances,
691 |         which are maintained in the `sm_instance_index` table.
692 | 
693 |         Args:
694 |             sopInstanceUID: string containing the value of DICOM SOPInstanceUID
695 |             source_bucket_location: string containing the source bucket location, either "aws" or "gcp"
696 | 
697 |         Returns:
698 |             string containing the bucket URL of the file corresponding to the SOPInstanceUID,
699 |             or None if the SOPInstanceUID is not recognized
700 |         """
701 | 
702 |         # sm_instance_index is required to complete this operation - install it!
703 |         self.fetch_index("sm_instance_index")
704 | 
705 |         if self.sm_instance_index is None:
706 |             logger.error(
707 |                 "sm_instance_index could not be installed. Please install it first using fetch_index."
708 |             )
709 |             return None
710 | 
711 |         if sopInstanceUID not in self.sm_instance_index["SOPInstanceUID"].values:  # pylint: disable=unsubscriptable-object
712 |             raise ValueError("SOPInstanceUID not found in IDC sm_instance_index.")
713 | 
714 |         # merge with the main index to get series_aws_url
715 |         selected_instance_df = self.sm_instance_index[  # pylint: disable=unsubscriptable-object
716 |             self.sm_instance_index["SOPInstanceUID"] == sopInstanceUID  # pylint: disable=unsubscriptable-object
717 |         ].copy()[["SeriesInstanceUID", "SOPInstanceUID", "crdc_instance_uuid"]]
718 |         selected_instance_df = pd.merge(
719 |             selected_instance_df,
720 |             self.index,
721 |             on="SeriesInstanceUID",
722 |             how="left",
723 |         )
724 | 
725 |         if source_bucket_location == "gcp":
726 |             # replace AWS with the GCP bucket
727 |             self._replace_aws_with_gcp_buckets(selected_instance_df, "series_aws_url")
728 | 
729 |         # instance files are named using crdc_instance_uuid
730 |         series_url = selected_instance_df.iloc[0]["series_aws_url"][:-1]
731 |         instance_uuid = selected_instance_df.iloc[0]["crdc_instance_uuid"]
732 |         return series_url + instance_uuid + ".dcm"
733 | 
734 |     def get_viewer_URL(
735 |         self, seriesInstanceUID=None, studyInstanceUID=None, viewer_selector=None
736 |     ):
737 |         """
738 |         Get the URL of the IDC viewer for the given series or study in IDC based on
739 |         the provided SeriesInstanceUID or StudyInstanceUID. If StudyInstanceUID is not provided,
740 |         it will be automatically deduced. If viewer_selector is not provided, default viewers
741 |         will be used (OHIF v3 for radiology modalities, and Slim for SM).
742 | 
743 |         This function will validate the provided SeriesInstanceUID or StudyInstanceUID against the IDC
744 |         index to ensure that the series or study is available in IDC.
745 | 
746 |         Args:
747 |             SeriesInstanceUID: string containing the value of DICOM SeriesInstanceUID for a series
748 |                 available in IDC
749 | 
750 |             StudyInstanceUID: string containing the value of DICOM StudyInstanceUID for a study
751 |                 available in IDC
752 | 
753 |             viewer_selector: string containing the name of the viewer to use. Must be one of the following:
754 |                 ohif_v2, ohif_v3, or slim. If not provided, default viewers will be used: slim for studies that contain SM modality and ohif_v3 for radiology.
755 | 
756 |         Returns:
757 |             string containing the IDC viewer URL for the requested selection
758 |         """
759 | 
760 |         if seriesInstanceUID is None and studyInstanceUID is None:
761 |             raise ValueError(
762 |                 "Either SeriesInstanceUID or StudyInstanceUID, or both, must be provided."
763 | ) 764 | 765 | if ( 766 | seriesInstanceUID is not None 767 | and seriesInstanceUID not in self.index["SeriesInstanceUID"].values 768 | ): 769 | raise ValueError("SeriesInstanceUID not found in IDC index.") 770 | 771 | if ( 772 | studyInstanceUID is not None 773 | and studyInstanceUID not in self.index["StudyInstanceUID"].values 774 | ): 775 | raise ValueError("StudyInstanceUID not found in IDC index.") 776 | 777 | if viewer_selector is not None and viewer_selector not in [ 778 | "ohif_v2", 779 | "ohif_v3", 780 | "slim", 781 | ]: 782 | raise ValueError( 783 | "viewer_selector must be one of 'ohif_v2', 'ohif_v3', or 'slim'." 784 | ) 785 | 786 | modality = None 787 | 788 | if studyInstanceUID is None: 789 | query = f""" 790 | SELECT 791 | DISTINCT(StudyInstanceUID), 792 | Modality 793 | FROM 794 | index 795 | WHERE 796 | SeriesInstanceUID='{seriesInstanceUID}' 797 | """ 798 | query_result = self.sql_query(query) 799 | studyInstanceUID = query_result.StudyInstanceUID[0] 800 | modality = query_result.Modality[0] 801 | 802 | else: 803 | query = f""" 804 | SELECT 805 | DISTINCT(Modality) 806 | FROM 807 | index 808 | WHERE 809 | StudyInstanceUID='{studyInstanceUID}' 810 | """ 811 | query_result = self.sql_query(query) 812 | modality = query_result.Modality[0] 813 | 814 | viewer_url = None 815 | if viewer_selector is None: 816 | if "SM" in modality: 817 | viewer_selector = "slim" 818 | else: 819 | viewer_selector = "ohif_v3" 820 | 821 | if viewer_selector == "ohif_v2": 822 | if seriesInstanceUID is None: 823 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/viewer/{studyInstanceUID}" 824 | else: 825 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/viewer/{studyInstanceUID}?SeriesInstanceUID={seriesInstanceUID}" 826 | elif viewer_selector == "ohif_v3": 827 | if seriesInstanceUID is None: 828 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}" 829 | else: 830 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}&SeriesInstanceUIDs={seriesInstanceUID}" 831 | elif viewer_selector == "volview": 832 | # TODO! Not implemented yet 833 | viewer_url = None 834 | elif viewer_selector == "slim": 835 | if seriesInstanceUID is None: 836 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/slim/studies/{studyInstanceUID}" 837 | else: 838 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/slim/studies/{studyInstanceUID}/series/{seriesInstanceUID}" 839 | 840 | return viewer_url 841 | 842 | def _validate_update_manifest_and_get_download_size( 843 | self, 844 | manifestFile, 845 | downloadDir, 846 | validate_manifest, 847 | use_s5cmd_sync, 848 | dirTemplate, 849 | ) -> tuple[float, str, Path]: 850 | """ 851 | Validates the manifest file by checking the URLs in the manifest 852 | 853 | Args: 854 | manifestFile (str): The path to the manifest file. 855 | downloadDir (str): The path to the download directory. 856 | validate_manifest (bool): If True, validates the manifest for any errors. Defaults to True. 857 | show_progress_bar (bool): If True, tracks the progress of download 858 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded 859 | dirTemplate (str): A template string for the directory path. Must start with %. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT. 
It can contain attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) wrapped in '%'. Special characters can be used as connectors: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). Can be disabled by None. 860 | 861 | Returns: 862 | total_size (float): The total size of all series in the manifest file. 863 | endpoint_to_use (str): The endpoint URL to use (either AWS or GCP). 864 | temp_manifest_file(Path): Path to the temporary manifest file for downstream steps 865 | Raises: 866 | ValueError: If the manifest file does not exist, if any URL in the manifest file is invalid, or if any URL is inaccessible in both AWS and GCP. 867 | Exception: If the manifest contains URLs from both AWS and GCP. 868 | """ 869 | logger.debug("manifest validation is requested: " + str(validate_manifest)) 870 | 871 | logger.debug("Parsing the manifest. Please wait..") 872 | # Read the manifest as a csv file 873 | manifest_df = pd.read_csv( 874 | manifestFile, comment="#", skip_blank_lines=True, header=None 875 | ) 876 | 877 | # Rename the column 878 | manifest_df.columns = ["manifest_cp_cmd"] 879 | 880 | # remove all rows that do not contain an S3 URL 881 | manifest_df = manifest_df[ 882 | manifest_df["manifest_cp_cmd"].str.contains(r"s3://", na=False) 883 | ] 884 | 885 | # create a copy of the index 886 | index_df_copy = self.index[ 887 | [ 888 | "SeriesInstanceUID", 889 | "aws_bucket", 890 | "crdc_series_uuid", 891 | "series_size_MB", 892 | "PatientID", 893 | "collection_id", 894 | "Modality", 895 | "StudyInstanceUID", 896 | ] 897 | ] 898 | prior_versions_index_df_copy = self.prior_versions_index[ 899 | [ 900 | "SeriesInstanceUID", 901 | "aws_bucket", 902 | "crdc_series_uuid", 903 | "series_size_MB", 904 | "PatientID", 905 | "collection_id", 906 | "Modality", 907 | "StudyInstanceUID", 908 | ] 909 | ] 910 | 911 | # use default hierarchy 912 | if dirTemplate is not None: 913 | hierarchy = self._generate_sql_concat_for_building_directory( 914 | dirTemplate=dirTemplate, downloadDir=downloadDir 915 | ) 916 | else: 917 | hierarchy = f"CONCAT('{downloadDir}')" 918 | 919 | # Extract s3 url and crdc_series_uuid from the manifest copy commands 920 | # Next, construct aws_series_url in the index and 921 | # try to verify if every series in the manifest is present in the index 922 | 923 | # ruff: noqa 924 | sql = f""" 925 | PRAGMA disable_progress_bar; 926 | WITH 927 | index_temp AS ( 928 | SELECT 929 | seriesInstanceUID, 930 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 931 | series_size_MB, 932 | {hierarchy} AS path, 933 | crdc_series_uuid AS index_crdc_series_uuid 934 | FROM 935 | index_df_copy), 936 | manifest_temp AS ( 937 | SELECT 938 | manifest_cp_cmd, 939 | REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, 940 | REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') AS s3_url, 941 | FROM 942 | manifest_df 943 | WHERE 944 | REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') IS NOT NULL) 945 | SELECT 946 | seriesInstanceuid, 947 | index_crdc_series_uuid, 948 | s3_url, 949 | path, 950 | series_size_MB, 951 | index_crdc_series_uuid is not NULL as crdc_series_uuid_match, 952 | s3_url==series_aws_url AS s3_url_match, 953 | manifest_temp.manifest_cp_cmd, 954 | CASE 955 | WHEN s3_url==series_aws_url THEN 'aws' 956 | ELSE 957 | 'unknown' 958 | END 959 | AS endpoint 960 | FROM 961 | manifest_temp 962 | LEFT JOIN 963 | index_temp 964 | ON 965 | index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid 966 | 
""" 967 | # ruff: noqa: end 968 | merged_df = duckdb.query(sql).df() 969 | 970 | endpoint_to_use = None 971 | 972 | if not all(merged_df["crdc_series_uuid_match"]): 973 | missing_manifest_cp_cmds = merged_df.loc[ 974 | ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" 975 | ] 976 | missing_in_main_cnt = len(missing_manifest_cp_cmds.tolist()) 977 | logger.warning( 978 | f"The total of {missing_in_main_cnt} copy commands are not recognized as referencing any associated series in the main index.\n" 979 | "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" 980 | f"different from {self.get_idc_version()} used in this version of idc-index. Prior data releases will be checked next." 981 | ) 982 | 983 | logger.debug( 984 | "Checking if the requested data is available in other idc versions " 985 | ) 986 | 987 | missing_series_sql = f""" 988 | PRAGMA disable_progress_bar; 989 | WITH 990 | index_temp AS 991 | (SELECT 992 | seriesInstanceUID, 993 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 994 | series_size_MB, 995 | {hierarchy} AS path, 996 | crdc_series_uuid AS index_crdc_series_uuid 997 | FROM 998 | index_df_copy 999 | union by name 1000 | SELECT 1001 | seriesInstanceUID, 1002 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 1003 | series_size_MB, 1004 | {hierarchy} AS path, 1005 | crdc_series_uuid AS index_crdc_series_uuid 1006 | FROM 1007 | prior_versions_index_df_copy pvip 1008 | 1009 | ), 1010 | manifest_temp AS ( 1011 | SELECT 1012 | manifest_cp_cmd, 1013 | REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, 1014 | REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, 1015 | FROM 1016 | manifest_df 1017 | WHERE 1018 | REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') IS NOT NULL) 1019 | SELECT 1020 | seriesInstanceuid, 1021 | index_crdc_series_uuid, 1022 | s3_url, 1023 | path, 1024 | series_size_MB, 1025 | index_crdc_series_uuid is not NULL as crdc_series_uuid_match, 1026 | TRIM(s3_url) = TRIM(series_aws_url) AS s3_url_match, 1027 | manifest_temp.manifest_cp_cmd, 1028 | CASE 1029 | WHEN TRIM(s3_url) = TRIM(series_aws_url) THEN 'aws' 1030 | ELSE 1031 | 'unknown' 1032 | END 1033 | AS endpoint 1034 | FROM 1035 | manifest_temp 1036 | LEFT JOIN 1037 | index_temp 1038 | ON 1039 | index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid 1040 | """ 1041 | merged_df = duckdb.sql(missing_series_sql).df() 1042 | if not all(merged_df["crdc_series_uuid_match"]): 1043 | missing_manifest_cp_cmds = merged_df.loc[ 1044 | ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" 1045 | ] 1046 | logger.error( 1047 | "The following manifest copy commands are not recognized as referencing any associated series in any release of IDC.\n" 1048 | "These commands may be invalid. 
Please submit an issue on https://github.com/ImagingDataCommons/idc-index/issues \n" 1049 | "The corresponding files could not be downloaded.\n" 1050 | ) 1051 | logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) 1052 | else: 1053 | logger.info("All of the identifiers from manifest have been resolved!") 1054 | 1055 | # `idc-open-data` bucket is present in both AWS and GCP, this is why we skip checking endpoint 1056 | # for the URLs that contain `idc-open-data` 1057 | provider_specific_urls = merged_df[ 1058 | ~merged_df["s3_url"].str.contains("/idc-open-data/") 1059 | ] 1060 | 1061 | if validate_manifest: 1062 | # Check if there is more than one endpoint 1063 | if len(provider_specific_urls["endpoint"].unique()) > 1: 1064 | logger.error("A mix of endpoint s3_urls encountered!") 1065 | for endpoint in merged_df["endpoint"].unique(): 1066 | sample_s3_url = merged_df[ 1067 | merged_df["endpoint"] == endpoint 1068 | ].s3_url.values[0] 1069 | logger.error(f" Endpoint {endpoint} s3_url {sample_s3_url}") 1070 | raise ValueError( 1071 | "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. " 1072 | ) 1073 | elif provider_specific_urls.empty: 1074 | # if all URLs are from idc-open-data, default to AWS 1075 | endpoint_to_use = aws_endpoint_url 1076 | else: # provider_specific_urls["endpoint"].unique()) == 1 1077 | if provider_specific_urls["endpoint"].values[0] == "aws": 1078 | logging.debug("Detected AWS as the endpoint to use") 1079 | endpoint_to_use = aws_endpoint_url 1080 | else: # unknown / gcp 1081 | logging.debug("Will use GCS endpoint") 1082 | cmd = [ 1083 | self.s5cmdPath, 1084 | "--no-sign-request", 1085 | "--endpoint-url", 1086 | gcp_endpoint_url, 1087 | "ls", 1088 | merged_df.s3_url.values[0], 1089 | ] 1090 | process = subprocess.run( 1091 | cmd, capture_output=True, text=True, check=False 1092 | ) 1093 | if process.stderr and process.stdout.startswith("ERROR"): 1094 | logger.debug( 1095 | "Folder not available in GCP. Manifest appears to be invalid." 
1096 | ) 1097 | if validate_manifest: 1098 | raise ValueError 1099 | else: 1100 | endpoint_to_use = gcp_endpoint_url 1101 | 1102 | elif ( 1103 | provider_specific_urls.empty 1104 | or provider_specific_urls["endpoint"].values[0] == "aws" 1105 | ): 1106 | endpoint_to_use = aws_endpoint_url 1107 | else: 1108 | # TODO: here we assume that the endpoint is GCP; we could check at least the first URL to be sure, 1109 | # but we can take care of this in a more principled way by including GCP bucket directly 1110 | # in the future, see https://github.com/ImagingDataCommons/idc-index/pull/56#discussion_r1582157048 1111 | endpoint_to_use = gcp_endpoint_url 1112 | 1113 | # Calculate total size 1114 | total_size = merged_df["series_size_MB"].sum() 1115 | total_size = round(total_size, 2) 1116 | 1117 | # Write a temporary manifest file 1118 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_manifest_file: 1119 | if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: 1120 | if dirTemplate is not None: 1121 | merged_df["s5cmd_cmd"] = ( 1122 | "sync " 1123 | + merged_df["s3_url"] 1124 | + " " 1125 | + '"' 1126 | + merged_df["path"] 1127 | + '"' 1128 | ) 1129 | else: 1130 | merged_df["s5cmd_cmd"] = ( 1131 | "sync " + merged_df["s3_url"] + " " + '"' + downloadDir + '"' 1132 | ) 1133 | elif dirTemplate is not None: 1134 | merged_df["s5cmd_cmd"] = ( 1135 | "cp " + merged_df["s3_url"] + " " + '"' + merged_df["path"] + '"' 1136 | ) 1137 | else: 1138 | merged_df["s5cmd_cmd"] = ( 1139 | "cp " + merged_df["s3_url"] + " " + '"' + downloadDir + '"' 1140 | ) 1141 | 1142 | # Combine all commands into a single string with newline separators 1143 | commands = "\n".join(merged_df["s5cmd_cmd"]) 1144 | 1145 | temp_manifest_file.write(commands) 1146 | 1147 | logger.info("Parsing the manifest is finished. 
Download will begin soon") 1148 | 1149 | if dirTemplate is not None: 1150 | list_of_directories = merged_df.path.to_list() 1151 | else: 1152 | list_of_directories = [downloadDir] 1153 | 1154 | logger.debug(f"list of directories:{list_of_directories}") 1155 | return ( 1156 | total_size, 1157 | endpoint_to_use, 1158 | Path(temp_manifest_file.name), 1159 | list_of_directories, 1160 | merged_df[ 1161 | ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] 1162 | ], 1163 | ) 1164 | 1165 | @staticmethod 1166 | def _generate_sql_concat_for_building_directory(dirTemplate, downloadDir): 1167 | # for now, we limit the allowed columns to this list to make sure that all 1168 | # values are guaranteed to be non-empty and to not contain any special characters 1169 | # in the future, we should consider including more attributes 1170 | # also, if we allow any column, we should decide what we would do if the value is NULL 1171 | valid_attributes = [ 1172 | "PatientID", 1173 | "collection_id", 1174 | "Modality", 1175 | "StudyInstanceUID", 1176 | "SeriesInstanceUID", 1177 | ] 1178 | valid_separators = ["_", "-", "/"] 1179 | 1180 | updated_template = dirTemplate 1181 | 1182 | # validate input template by removing all valid attributes and separators 1183 | for attr in valid_attributes: 1184 | updated_template = updated_template.replace("%" + attr, "") 1185 | for sep in valid_separators: 1186 | updated_template = updated_template.replace(sep, "") 1187 | 1188 | if updated_template != "": 1189 | logger.error("Invalid download hierarchy template:" + updated_template) 1190 | logger.error( 1191 | "Make sure your template uses only valid attributes and separators" 1192 | ) 1193 | logger.error("Valid attributes: " + str(valid_attributes)) 1194 | logger.error("Valid separators: " + str(valid_separators)) 1195 | raise ValueError 1196 | 1197 | concat_command = dirTemplate 1198 | for attr in valid_attributes: 1199 | concat_command = concat_command.replace("%" + attr, f"', {attr},'") 1200 | 1201 | # CONCAT command may contain empty strings, and they are not harmless - 1202 | # duckdb does not like them! 1203 | # NB: double-quotes are not allowed by duckdb! 
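        # For example (a sketch): dirTemplate '%collection_id/%PatientID' with
        # downloadDir '/data' is first expanded to
        #   CONCAT('/data/','', collection_id,'/', PatientID,'')
        # and the replace() calls below strip the empty-string fragments,
        # leaving CONCAT('/data/', collection_id,'/', PatientID)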
1204 |         concat_command = f"CONCAT('{downloadDir}/','" + concat_command + "')"
1205 |         concat_command = concat_command.replace(",''", "")
1206 |         concat_command = concat_command.replace("'',", "")
1207 |         concat_command = concat_command.replace(",'',", "")
1208 |         return concat_command
1209 | 
1210 |     @staticmethod
1211 |     def _track_download_progress(
1212 |         size_MB: int,
1213 |         downloadDir: str,
1214 |         process: subprocess.Popen,
1215 |         show_progress_bar: bool = True,
1216 |         list_of_directories=None,
1217 |     ):
1218 |         logger.debug("Inputs received for tracking download:")
1219 |         logger.debug(f"size_MB: {size_MB}")
1220 |         logger.debug(f"downloadDir: {downloadDir}")
1221 |         logger.debug(f"show_progress_bar: {show_progress_bar}")
1222 | 
1223 |         runtime_errors = []
1224 | 
1225 |         if show_progress_bar:
1226 |             total_size_to_be_downloaded_bytes = size_MB * (10**6)
1227 |             initial_size_bytes = 0
1228 |             # Calculate the initial combined size of the tracked directories
1229 |             for directory in list_of_directories:
1230 |                 initial_size_bytes += IDCClient._get_dir_sum_file_size(directory)
1231 | 
1232 |             logger.info(
1233 |                 "Initial size of the destination directories: %s",
1234 |                 IDCClient._format_size(initial_size_bytes, size_in_bytes=True),
1235 |             )
1236 |             logger.info(
1237 |                 "Approximate size of the files that need to be downloaded: %s",
1238 |                 IDCClient._format_size(size_MB),
1239 |             )
1240 | 
1241 |             pbar = tqdm(
1242 |                 total=total_size_to_be_downloaded_bytes,
1243 |                 unit="B",
1244 |                 unit_scale=True,
1245 |                 desc="Downloading data",
1246 |             )
1247 | 
1248 |             while True:
1249 |                 time.sleep(0.5)
1250 |                 downloaded_bytes = 0
1251 |                 for directory in list_of_directories:
1252 |                     downloaded_bytes += IDCClient._get_dir_sum_file_size(directory)
1253 |                 downloaded_bytes -= initial_size_bytes
1254 |                 pbar.n = min(
1255 |                     downloaded_bytes, total_size_to_be_downloaded_bytes
1256 |                 )  # Prevent the progress bar from exceeding 100%
1257 |                 pbar.refresh()
1258 |                 if process.poll() is not None:
1259 |                     break
1260 |             # Wait for the process to finish
1261 |             _, stderr = process.communicate()
1262 |             pbar.close()
1263 | 
1264 |         else:
1265 |             while process.poll() is None:
1266 |                 time.sleep(0.5)
1267 | 
1268 |     @staticmethod
1269 |     def _get_dir_sum_file_size(directory) -> int:
1270 |         path = Path(directory)
1271 |         sum_file_size = 0
1272 |         if path.exists() and path.is_dir():
1273 |             for f in path.iterdir():
1274 |                 if f.is_file():
1275 |                     try:
1276 |                         sum_file_size += f.stat().st_size
1277 |                     except FileNotFoundError:
1278 |                         # file must have been removed before we
1279 |                         # could get its size
1280 |                         pass
1281 |         return sum_file_size
1282 | 
1283 |     def _parse_s5cmd_sync_output_and_generate_synced_manifest(
1284 |         self, stdout, s5cmd_sync_helper_df
1285 |     ) -> Path:
1286 |         """
1287 |         Parse the output of s5cmd sync --dry-run to extract distinct folders and generate a synced manifest.
1288 | 
1289 |         Args:
1290 |             stdout (str): The output of the s5cmd sync --dry-run command.
1291 |             s5cmd_sync_helper_df: helper df obtained after validation of manifest or filtering of selection, containing a minimum of "index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path" columns
1292 | 
1293 |         Returns:
1294 |             Path: The path to the generated synced manifest file.
1295 | float: Download size in MB 1296 | list_of_directories: list of directories need to tracked for progress bar 1297 | """ 1298 | logger.info("Parsing the s5cmd sync dry run output...") 1299 | 1300 | stdout_df = pd.DataFrame(stdout.splitlines(), columns=["s5cmd_output"]) 1301 | 1302 | # create a copy of the index 1303 | index_df_copy = self.index 1304 | 1305 | result_df = s5cmd_sync_helper_df 1306 | 1307 | # TODO: need to remove the assumption that manifest commands will have 'cp' 1308 | # ruff: noqa 1309 | sql = """ 1310 | PRAGMA disable_progress_bar; 1311 | WITH 1312 | index_temp AS ( 1313 | SELECT 1314 | index_crdc_series_uuid, 1315 | s5cmd_cmd, 1316 | path, 1317 | series_size_MB 1318 | FROM 1319 | result_df), 1320 | sync_temp AS ( 1321 | SELECT 1322 | DISTINCT CONCAT(REGEXP_EXTRACT(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*') AS s3_url, 1323 | REGEXP_EXTRACT(CONCAT(REGEXP_EXTRACT(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*'),'(?:.*?\\/){3}([^\\/?#]+)',1) AS sync_crdc_instance_uuid 1324 | FROM 1325 | stdout_df ) 1326 | SELECT 1327 | DISTINCT s5cmd_cmd, 1328 | series_size_MB, 1329 | path 1330 | FROM 1331 | sync_temp 1332 | JOIN 1333 | index_temp 1334 | ON 1335 | index_temp.index_crdc_series_uuid = sync_temp.sync_crdc_instance_uuid 1336 | """ 1337 | # ruff: noqa: end 1338 | synced_df = duckdb.query(sql).df() 1339 | sync_size = synced_df["series_size_MB"].sum() 1340 | sync_size_rounded = round(sync_size, 2) 1341 | 1342 | logger.debug(f"sync_size_rounded: {sync_size_rounded}") 1343 | 1344 | # Write a temporary manifest file 1345 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as synced_manifest: 1346 | list_of_directories = synced_df.path.to_list() 1347 | commands = "\n".join(synced_df["s5cmd_cmd"]) 1348 | synced_manifest.write(commands) 1349 | 1350 | logger.info("Parsing the s5cmd sync dry run output finished") 1351 | return Path(synced_manifest.name), sync_size_rounded, list_of_directories 1352 | 1353 | def _s5cmd_run( 1354 | self, 1355 | endpoint_to_use, 1356 | manifest_file, 1357 | total_size, 1358 | downloadDir, 1359 | quiet, 1360 | show_progress_bar, 1361 | use_s5cmd_sync, 1362 | dirTemplate, 1363 | list_of_directories, 1364 | s5cmd_sync_helper_df, 1365 | ): 1366 | """ 1367 | Executes the s5cmd command to sync files from a given endpoint to a local directory. 1368 | 1369 | This function first performs a dry run of the s5cmd command to check which files need to be downloaded. 1370 | If there are files to be downloaded, it generates a new manifest file with the files to be synced and 1371 | runs the s5cmd command again to download the files. The progress of the download is tracked and printed 1372 | to the console. 1373 | 1374 | Args: 1375 | endpoint_to_use (str): The endpoint URL to download the files from. 1376 | manifest_file (str): The path to the manifest file listing the files to be downloaded. 1377 | total_size (float): The total size of the files to be downloaded in MB. 1378 | downloadDir (str): The local directory where the files will be downloaded. 1379 | quiet (bool): If True, suppresses the stdout and stderr of the s5cmd command. 1380 | show_progress_bar (bool): If True, tracks the progress of download. 1381 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded. 1382 | dirTemplate (str): Download directory hierarchy template. 
1383 |             list_of_directories (list): List of directories that need to be tracked for the progress bar.
1384 |             s5cmd_sync_helper_df (df): helper df obtained after validation of manifest or filtering of selection, containing a minimum of "index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path" columns.
1385 | 
1386 |         Raises:
1387 |             subprocess.CalledProcessError: If the s5cmd command fails.
1388 | 
1389 |         Returns:
1390 |             None
1391 |         """
1392 |         logger.debug("running self._s5cmd_run. Inputs received:")
1393 |         logger.debug(f"endpoint_to_use: {endpoint_to_use}")
1394 |         logger.debug(f"manifest_file: {manifest_file}")
1395 |         logger.debug(f"total_size: {total_size}")
1396 |         logger.debug(f"downloadDir: {downloadDir}")
1397 |         logger.debug(f"quiet: {quiet}")
1398 |         logger.debug(f"show_progress_bar: {show_progress_bar}")
1399 |         logger.debug(f"use_s5cmd_sync: {use_s5cmd_sync}")
1400 |         logger.debug(f"dirTemplate: {dirTemplate}")
1401 | 
1402 |         if quiet:
1403 |             stdout = subprocess.DEVNULL
1404 |             stderr = subprocess.DEVNULL
1405 |         else:
1406 |             stdout = None
1407 |             stderr = None
1408 | 
1409 |         if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0:
1410 |             logger.debug(
1411 |                 "Requested progress bar along with s5cmd sync dry run.\
1412 |                 Using s5cmd sync dry run as the destination folder is not empty"
1413 |             )
1414 |             dry_run_cmd = [
1415 |                 self.s5cmdPath,
1416 |                 "--no-sign-request",
1417 |                 "--dry-run",
1418 |                 "--endpoint-url",
1419 |                 endpoint_to_use,
1420 |                 "run",
1421 |                 manifest_file,
1422 |             ]
1423 | 
1424 |             process = subprocess.run(
1425 |                 dry_run_cmd, stdout=subprocess.PIPE, text=True, check=False
1426 |             )
1427 | 
1428 |             if process.stdout:
1429 |                 # Some files need to be downloaded
1430 |                 logger.info(
1431 |                     """
1432 | stdout from s5cmd sync dry run is not empty. Parsing the output to
1433 | evaluate what to download and the corresponding size with series-level precision
1434 | """
1435 |                 )
1436 |                 (
1437 |                     synced_manifest,
1438 |                     sync_size,
1439 |                     list_of_directories,
1440 |                 ) = self._parse_s5cmd_sync_output_and_generate_synced_manifest(
1441 |                     stdout=process.stdout,
1442 |                     s5cmd_sync_helper_df=s5cmd_sync_helper_df,
1443 |                 )
1444 |                 logger.info(f"sync_size (MB): {sync_size}")
1445 | 
1446 |                 cmd = [
1447 |                     self.s5cmdPath,
1448 |                     "--no-sign-request",
1449 |                     "--endpoint-url",
1450 |                     endpoint_to_use,
1451 |                     "run",
1452 |                     synced_manifest,
1453 |                 ]
1454 |                 with subprocess.Popen(
1455 |                     cmd, stdout=stdout, stderr=stderr, universal_newlines=True
1456 |                 ) as process:
1457 |                     if sync_size < total_size:
1458 |                         logger.info(
1459 |                             """
1460 | Destination folder is not empty and sync size is less than total size.
1461 | """
1462 |                         )
1463 |                         existing_data_size = round(total_size - sync_size, 2)
1464 |                         logger.info(
1465 |                             f"Requested total download size is {total_size} MB, \
1466 |                             however at least {existing_data_size} MB is already present, \
1467 |                             so downloading only the remaining up to {sync_size} MB\n\
1468 |                             Please note that disk sizes are calculated at series level, \
1469 |                             so if individual files are missing, the displayed progress bar may \
1470 |                             not be accurate."
1471 |                         )
1472 |                         self._track_download_progress(
1473 |                             sync_size,
1474 |                             downloadDir,
1475 |                             process,
1476 |                             show_progress_bar,
1477 |                             list_of_directories,
1478 |                         )
1479 |                     else:
1480 |                         self._track_download_progress(
1481 |                             total_size,
1482 |                             downloadDir,
1483 |                             process,
1484 |                             show_progress_bar,
1485 |                             list_of_directories,
1486 |                         )
1487 |             else:
1488 |                 logger.info(
1489 |                     "It appears that all requested DICOM files are already present in the destination folder"
1490 |                 )
1491 |         else:
1492 |             logger.info(
1493 |                 "Not using s5cmd sync, as the destination folder is empty or sync was not requested"
1494 |             )
1495 |             cmd = [
1496 |                 self.s5cmdPath,
1497 |                 "--no-sign-request",
1498 |                 "--endpoint-url",
1499 |                 endpoint_to_use,
1500 |                 "run",
1501 |                 manifest_file,
1502 |             ]
1503 | 
1504 |             # fedorov: did consider-using-with, and decided against it to keep the code more readable
1505 |             stderr_log_file = tempfile.NamedTemporaryFile(delete=False)  # pylint: disable=consider-using-with
1506 |             logging.debug("Running download command: " + str(cmd))
1507 |             with subprocess.Popen(
1508 |                 cmd,
1509 |                 stdout=stdout,
1510 |                 stderr=stderr_log_file,
1511 |                 universal_newlines=True,
1512 |             ) as process:
1513 |                 self._track_download_progress(
1514 |                     total_size,
1515 |                     downloadDir,
1516 |                     process,
1517 |                     show_progress_bar,
1518 |                     list_of_directories,
1519 |                 )
1520 | 
1521 |             stderr_log_file.close()
1522 | 
1523 |             runtime_errors = []
1524 |             with open(stderr_log_file.name) as stderr_log_file:
1525 |                 for line in stderr_log_file.readlines():
1526 |                     if not quiet:
1527 |                         logger.info(line)
1528 |                     if line.startswith("ERROR"):
1529 |                         runtime_errors.append(line)
1530 | 
1531 |             Path(stderr_log_file.name).unlink()
1532 | 
1533 |             if len(runtime_errors) > 0:
1534 |                 logger.error(
1535 |                     "Download process failed with the following errors:\n"
1536 |                     + "\n".join(runtime_errors)
1537 |                 )
1538 | 
1539 |             # Check if download process completed successfully
1540 |             if process.returncode != 0:
1541 |                 logger.error(
1542 |                     f"Download process returned non-zero exit code: {process.returncode}"
1543 |                 )
1544 |             else:
1545 |                 logger.info("Successfully downloaded files to %s", str(downloadDir))
1546 | 
1547 |     @staticmethod
1548 |     def _format_size(size, size_in_bytes: bool = False):
1549 |         if size_in_bytes:
1550 |             size_MB = size / (10**6)
1551 |         else:
1552 |             size_MB = size
1553 |         size_GB = size_MB / 1000
1554 |         size_TB = size_GB / 1000
1555 | 
1556 |         if size_TB >= 1:
1557 |             return f"{round(size_TB, 2)} TB"
1558 |         if size_GB >= 1:
1559 |             return f"{round(size_GB, 2)} GB"
1560 |         if size_MB >= 1:
1561 |             return f"{round(size_MB, 2)} MB"
1562 |         return f"{round(size, 2)} bytes"
1563 | 
1564 |     def download_from_manifest(
1565 |         self,
1566 |         manifestFile: str,
1567 |         downloadDir: str,
1568 |         quiet: bool = True,
1569 |         validate_manifest: bool = True,
1570 |         show_progress_bar: bool = True,
1571 |         use_s5cmd_sync: bool = False,
1572 |         dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT,
1573 |     ) -> None:
1574 |         """
1575 |         Download the files listed in a manifest file. The manifest is first
1576 |         validated to ensure every line contains a valid URL. The total download
1577 |         size is then computed, and the download runs in one process while
1578 |         download progress is tracked in another.
1579 | 
1580 |         Args:
1581 |             manifestFile (str): The path to the manifest file.
1582 |             downloadDir (str): The directory to download the files to.
1583 |             quiet (bool): If True, suppresses the output of the subprocess. Defaults to True.
1584 |             validate_manifest (bool): If True, validates the manifest for any errors. Defaults to True.
1585 |             show_progress_bar (bool): If True, tracks the progress of download
1586 |             use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
1587 |             dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories.
1588 | 
1589 |         Raises:
1590 |             ValueError: If the download directory does not exist.
1591 |         """
1592 | 
1593 |         downloadDir = self._check_create_directory(downloadDir)
1594 | 
1595 |         # validate the manifest
1596 |         (
1597 |             total_size,
1598 |             endpoint_to_use,
1599 |             temp_manifest_file,
1600 |             list_of_directories,
1601 |             validation_result_df,
1602 |         ) = self._validate_update_manifest_and_get_download_size(
1603 |             manifestFile=manifestFile,
1604 |             downloadDir=downloadDir,
1605 |             validate_manifest=validate_manifest,
1606 |             use_s5cmd_sync=use_s5cmd_sync,
1607 |             dirTemplate=dirTemplate,
1608 |         )
1609 | 
1610 |         total_size_rounded = round(total_size, 2)
1611 |         if not self._check_disk_size_and_warn(downloadDir, total_size):
1612 |             return
1613 | 
1614 |         self._s5cmd_run(
1615 |             endpoint_to_use=endpoint_to_use,
1616 |             manifest_file=temp_manifest_file,
1617 |             total_size=total_size_rounded,
1618 |             downloadDir=downloadDir,
1619 |             quiet=quiet,
1620 |             show_progress_bar=show_progress_bar,
1621 |             use_s5cmd_sync=use_s5cmd_sync,
1622 |             dirTemplate=dirTemplate,
1623 |             list_of_directories=list_of_directories,
1624 |             s5cmd_sync_helper_df=validation_result_df,
1625 |         )
1626 | 
1627 |     def citations_from_manifest(
1628 |         self,
1629 |         manifestFile: str,
1630 |         citation_format: str = CITATION_FORMAT_APA,
1631 |     ):
1632 |         """Get the list of publications that should be cited/attributed for a cohort defined by a manifest.
1633 | 
1634 |         Args:
1635 |             manifestFile (str): string containing the path to the manifest file.
1636 |             citation_format (str): string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats allowed by the DOI API, see https://citation.crosscite.org/docs.html#sec-4.
1637 | 
1638 |         Returns:
1639 |             List of citations in the requested format.
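
        Example:
            A usage sketch (the manifest path is illustrative):

                client = IDCClient()
                citations = client.citations_from_manifest("manifest.s5cmd")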
1640 | """ 1641 | 1642 | manifest_df = pd.read_csv( 1643 | manifestFile, 1644 | comment="#", 1645 | skip_blank_lines=True, 1646 | header=None, 1647 | names=["manifest_line"], 1648 | ) 1649 | uuid_pattern = r"s3://.*/([^/]+)/\*" 1650 | manifest_df["crdc_series_uuid"] = manifest_df["manifest_line"].str.extract( 1651 | uuid_pattern, expand=False 1652 | ) 1653 | index_copy = self.index[["series_aws_url", "SeriesInstanceUID"]].copy() 1654 | index_copy["crdc_series_uuid"] = index_copy["series_aws_url"].str.extract( 1655 | uuid_pattern, expand=False 1656 | ) 1657 | 1658 | result_df = pd.merge(manifest_df, index_copy, on="crdc_series_uuid", how="left") 1659 | 1660 | return self.citations_from_selection( 1661 | seriesInstanceUID=result_df["SeriesInstanceUID"].tolist(), 1662 | citation_format=citation_format, 1663 | ) 1664 | 1665 | def citations_from_selection( 1666 | self, 1667 | collection_id=None, 1668 | patientId=None, 1669 | studyInstanceUID=None, 1670 | seriesInstanceUID=None, 1671 | citation_format=CITATION_FORMAT_APA, 1672 | ): 1673 | """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID. 1674 | 1675 | Args: 1676 | collection_id: string or list of strings containing the values of collection_id to filter by 1677 | patientId: string or list of strings containing the values of PatientID to filter by 1678 | studyInstanceUID (str): string or list of strings containing the values of DICOM StudyInstanceUID to filter by 1679 | seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by 1680 | format: string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4. 1681 | 1682 | Returns: 1683 | List of citations in the requested format. 
1684 | """ 1685 | 1686 | result_df = self._safe_filter_by_selection( 1687 | self.index, 1688 | collection_id=collection_id, 1689 | patientId=patientId, 1690 | studyInstanceUID=studyInstanceUID, 1691 | seriesInstanceUID=seriesInstanceUID, 1692 | sopInstanceUID=None, 1693 | crdc_series_uuid=None, 1694 | ) 1695 | 1696 | citations = [] 1697 | 1698 | if not result_df.empty: 1699 | distinct_dois = result_df["source_DOI"].unique().tolist() 1700 | 1701 | if len(distinct_dois) == 0: 1702 | logger.error("No DOIs found for the selection.") 1703 | return citations 1704 | 1705 | # include citation for the currently main IDC publication 1706 | # https://doi.org/10.1148/rg.230180 1707 | distinct_dois.append("10.1148/rg.230180") 1708 | 1709 | headers = {"accept": citation_format} 1710 | timeout = 30 1711 | 1712 | for doi in distinct_dois: 1713 | url = "https://dx.doi.org/" + doi 1714 | 1715 | logger.debug(f"Requesting citation for DOI: {doi}") 1716 | 1717 | response = requests.get(url, headers=headers, timeout=timeout) 1718 | 1719 | logger.debug("Received response: " + str(response.status_code)) 1720 | 1721 | if response.status_code == 200: 1722 | if citation_format == self.CITATION_FORMAT_JSON: 1723 | citations.append(response.json()) 1724 | else: 1725 | citations.append(response.text) 1726 | logger.debug("Received citation: " + citations[-1]) 1727 | 1728 | else: 1729 | logger.error(f"Failed to get citation for DOI: {url}") 1730 | logger.error( 1731 | f"DOI server response status code: {response.status_code}" 1732 | ) 1733 | 1734 | return citations 1735 | 1736 | def download_from_selection( 1737 | self, 1738 | downloadDir, 1739 | dry_run=False, 1740 | collection_id=None, 1741 | patientId=None, 1742 | studyInstanceUID=None, 1743 | seriesInstanceUID=None, 1744 | sopInstanceUID=None, 1745 | crdc_series_uuid=None, 1746 | quiet=True, 1747 | show_progress_bar=True, 1748 | use_s5cmd_sync=False, 1749 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 1750 | source_bucket_location="aws", 1751 | ): 1752 | """Download the files corresponding to the selection. The filtering will be applied in sequence (but does it matter?) by first selecting the collection(s), followed by 1753 | patient(s), study(studies) and series. If no filtering is applied, all the files will be downloaded. 1754 | 1755 | Args: 1756 | downloadDir: string containing the path to the directory to download the files to 1757 | dry_run: calculates the size of the cohort but download does not start 1758 | collection_id: string or list of strings containing the values of collection_id to filter by 1759 | patientId: string or list of strings containing the values of PatientID to filter by 1760 | studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by 1761 | seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by 1762 | sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by 1763 | crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by 1764 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True 1765 | show_progress_bar (bool): If True, tracks the progress of download 1766 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded 1767 | dirTemplate (str): Download directory hierarchy template. 
This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 1768 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 1769 | """ 1770 | 1771 | if source_bucket_location not in ["aws", "gcs"]: 1772 | raise ValueError("source_bucket_location must be either 'aws' or 'gcs'") 1773 | 1774 | downloadDir = self._check_create_directory(downloadDir) 1775 | 1776 | # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index 1777 | sm_instance_index = None 1778 | if sopInstanceUID: 1779 | if ( 1780 | self.sm_instance_index is not None 1781 | ): # check if instance-level index is installed 1782 | download_df = self.sm_instance_index 1783 | sm_instance_index = self.sm_instance_index 1784 | else: 1785 | logger.error( 1786 | "Instance-level access not possible because the instance-level index is not installed." 1787 | ) 1788 | raise ValueError( 1789 | "Instance-level access not possible because the instance-level index is not installed." 1790 | ) 1791 | if use_s5cmd_sync: 1792 | logger.warning( 1793 | "s5cmd sync is not supported for downloading individual files. Disabling sync." 1794 | ) 1795 | use_s5cmd_sync = False 1796 | elif crdc_series_uuid is not None: 1797 | download_df = pd.concat( 1798 | [ 1799 | self.index[ 1800 | [ 1801 | "PatientID", 1802 | "collection_id", 1803 | "Modality", 1804 | "StudyInstanceUID", 1805 | "SeriesInstanceUID", 1806 | "crdc_series_uuid", 1807 | "aws_bucket", 1808 | "series_size_MB", 1809 | ] 1810 | ], 1811 | self.prior_versions_index[ 1812 | [ 1813 | "PatientID", 1814 | "collection_id", 1815 | "Modality", 1816 | "StudyInstanceUID", 1817 | "SeriesInstanceUID", 1818 | "crdc_series_uuid", 1819 | "aws_bucket", 1820 | "series_size_MB", 1821 | ] 1822 | ], 1823 | ], 1824 | ) 1825 | else: 1826 | download_df = self.index 1827 | 1828 | result_df = self._safe_filter_by_selection( 1829 | download_df, 1830 | collection_id=collection_id, 1831 | patientId=patientId, 1832 | studyInstanceUID=studyInstanceUID, 1833 | seriesInstanceUID=seriesInstanceUID, 1834 | sopInstanceUID=sopInstanceUID, 1835 | crdc_series_uuid=crdc_series_uuid, 1836 | ) 1837 | 1838 | if not sopInstanceUID: 1839 | total_size = round(result_df["series_size_MB"].sum(), 2) 1840 | else: 1841 | total_size_bytes = round(result_df["instance_size"].sum(), 2) 1842 | total_size = total_size_bytes / (10**6) 1843 | 1844 | if not self._check_disk_size_and_warn(downloadDir, total_size): 1845 | return 1846 | 1847 | if dry_run: 1848 | logger.info( 1849 | "Dry run. Not downloading files. Rerun with dry_run=False to download the files."
1850 | ) 1851 | return 1852 | 1853 | if dirTemplate is not None: 1854 | hierarchy = self._generate_sql_concat_for_building_directory( 1855 | downloadDir=downloadDir, 1856 | dirTemplate=dirTemplate, 1857 | ) 1858 | else: 1859 | hierarchy = f"CONCAT('{downloadDir}')" 1860 | 1861 | if sopInstanceUID: 1862 | sql = f""" 1863 | WITH temp as 1864 | ( 1865 | SELECT 1866 | sopInstanceUID 1867 | FROM 1868 | result_df 1869 | ) 1870 | SELECT 1871 | CONCAT('s3://', aws_bucket, '/', crdc_series_uuid,'/*') AS series_aws_url, 1872 | CONCAT('s3://', aws_bucket, '/', crdc_series_uuid,'/', crdc_instance_uuid, '.dcm') AS instance_aws_url, 1873 | crdc_series_uuid AS index_crdc_series_uuid, 1874 | {hierarchy} AS path 1875 | FROM 1876 | temp 1877 | JOIN 1878 | sm_instance_index using (sopInstanceUID) 1879 | LEFT JOIN 1880 | index using (seriesInstanceUID) 1881 | """ 1882 | else: 1883 | sql = f""" 1884 | WITH temp as 1885 | ( 1886 | SELECT 1887 | seriesInstanceUID 1888 | FROM 1889 | result_df 1890 | ) 1891 | SELECT 1892 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 1893 | crdc_series_uuid AS index_crdc_series_uuid, 1894 | series_size_MB, 1895 | {hierarchy} AS path 1896 | FROM 1897 | temp 1898 | JOIN 1899 | index using (seriesInstanceUID) 1900 | """ 1901 | index = self.index # expose the main index to duckdb so the SQL above can reference it 1902 | result_df = duckdb.query(sql).df() 1903 | # Create a temporary manifest file that stores the list of files to download 1904 | 1905 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file: 1906 | # Determine column containing the URL for instance / series-level access 1907 | if sopInstanceUID: 1908 | if "instance_aws_url" not in result_df: 1909 | result_df["instance_aws_url"] = ( 1910 | result_df["series_aws_url"].str.replace("/*", "/", regex=False) # substring replacement, not whole-value replacement 1911 | + result_df["crdc_instance_uuid"] 1912 | + ".dcm" 1913 | ) 1914 | url_column = "instance_aws_url" 1915 | else: 1916 | url_column = "series_aws_url" 1917 | 1918 | if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: 1919 | if dirTemplate is not None: 1920 | result_df["s5cmd_cmd"] = ( 1921 | "sync " + result_df[url_column] + ' "' + result_df["path"] + '"' 1922 | ) 1923 | else: 1924 | result_df["s5cmd_cmd"] = ( 1925 | "sync " + result_df[url_column] + ' "' + downloadDir + '"' 1926 | ) 1927 | elif dirTemplate is not None: 1928 | result_df["s5cmd_cmd"] = ( 1929 | "cp " + result_df[url_column] + ' "' + result_df["path"] + '"' 1930 | ) 1931 | else: 1932 | result_df["s5cmd_cmd"] = ( 1933 | "cp " + result_df[url_column] + ' "' + downloadDir + '"' 1934 | ) 1935 | 1936 | if source_bucket_location == "gcs": 1937 | self._replace_aws_with_gcp_buckets(result_df, "s5cmd_cmd") 1938 | 1939 | # Combine all commands into a single string with newline separators 1940 | commands = "\n".join(result_df["s5cmd_cmd"]) 1941 | manifest_file.write(commands) 1942 | 1943 | if dirTemplate is not None: 1944 | list_of_directories = result_df.path.to_list() 1945 | else: 1946 | list_of_directories = [downloadDir] 1947 | logger.debug( 1948 | """ 1949 | Temporary download manifest is generated and is passed to self._s5cmd_run 1950 | """ 1951 | ) 1952 | if sopInstanceUID: 1953 | s5cmd_sync_helper_df = None 1954 | else: 1955 | s5cmd_sync_helper_df = result_df[ 1956 | ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] 1957 | ] 1958 | endpoint_url = None 1959 | if source_bucket_location == "aws": 1960 | endpoint_url = aws_endpoint_url 1961 | elif source_bucket_location == "gcs": 1962 | endpoint_url = gcp_endpoint_url 1963 | else: 1964 | raise
ValueError("source_bucket_location must be either 'aws' or 'gcs'") 1965 | self._s5cmd_run( 1966 | endpoint_to_use=endpoint_url, 1967 | manifest_file=Path(manifest_file.name), 1968 | total_size=total_size, 1969 | downloadDir=downloadDir, 1970 | quiet=quiet, 1971 | show_progress_bar=show_progress_bar, 1972 | use_s5cmd_sync=use_s5cmd_sync, 1973 | dirTemplate=dirTemplate, 1974 | list_of_directories=list_of_directories, 1975 | s5cmd_sync_helper_df=s5cmd_sync_helper_df, 1976 | ) 1977 | 1978 | def download_dicom_instance( 1979 | self, 1980 | sopInstanceUID, 1981 | downloadDir, 1982 | dry_run=False, 1983 | quiet=True, 1984 | show_progress_bar=True, 1985 | use_s5cmd_sync=False, 1986 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 1987 | source_bucket_location="aws", 1988 | ) -> None: 1989 | """ 1990 | Download the files corresponding to the seriesInstanceUID to the specified directory. 1991 | 1992 | Args: 1993 | sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by 1994 | downloadDir: string containing the path to the directory to download the files to 1995 | dry_run: calculates the size of the cohort but download does not start 1996 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 1997 | show_progress_bar (bool): If True, tracks the progress of download 1998 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded 1999 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for the organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories. 2000 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing to select between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2001 | Returns: None 2002 | 2003 | Raises: 2004 | TypeError: If sopInstanceUID(s) passed is(are) not a string or list 2005 | 2006 | """ 2007 | self.download_from_selection( 2008 | downloadDir, 2009 | sopInstanceUID=sopInstanceUID, 2010 | dry_run=dry_run, 2011 | quiet=quiet, 2012 | show_progress_bar=show_progress_bar, 2013 | use_s5cmd_sync=use_s5cmd_sync, 2014 | dirTemplate=dirTemplate, 2015 | source_bucket_location=source_bucket_location, 2016 | ) 2017 | 2018 | def download_dicom_series( 2019 | self, 2020 | seriesInstanceUID, 2021 | downloadDir, 2022 | dry_run=False, 2023 | quiet=True, 2024 | show_progress_bar=True, 2025 | use_s5cmd_sync=False, 2026 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2027 | source_bucket_location="aws", 2028 | ) -> None: 2029 | """ 2030 | Download the files corresponding to the seriesInstanceUID to the specified directory. 
2031 | 2032 | Args: 2033 | seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by 2034 | downloadDir: string containing the path to the directory to download the files to 2035 | dry_run: calculates the size of the cohort but download does not start 2036 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2037 | show_progress_bar (bool): If True, tracks the progress of the download 2038 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2039 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2040 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2041 | Returns: None 2042 | 2043 | Raises: 2044 | TypeError: If seriesInstanceUID(s) passed is(are) not a string or list 2045 | 2046 | """ 2047 | self.download_from_selection( 2048 | downloadDir, 2049 | seriesInstanceUID=seriesInstanceUID, 2050 | dry_run=dry_run, 2051 | quiet=quiet, 2052 | show_progress_bar=show_progress_bar, 2053 | use_s5cmd_sync=use_s5cmd_sync, 2054 | dirTemplate=dirTemplate, 2055 | source_bucket_location=source_bucket_location, 2056 | ) 2057 | 2058 | def download_dicom_studies( 2059 | self, 2060 | studyInstanceUID, 2061 | downloadDir, 2062 | dry_run=False, 2063 | quiet=True, 2064 | show_progress_bar=True, 2065 | use_s5cmd_sync=False, 2066 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2067 | source_bucket_location="aws", 2068 | ) -> None: 2069 | """ 2070 | Download the files corresponding to the studyInstanceUID to the specified directory. 2071 | 2072 | Args: 2073 | studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by 2074 | downloadDir: string containing the path to the directory to download the files to 2075 | dry_run: calculates the size of the cohort but download does not start 2076 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2077 | show_progress_bar (bool): If True, tracks the progress of the download 2078 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2079 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID.
The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2080 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2081 | Returns: None 2082 | 2083 | Raises: 2084 | TypeError: If studyInstanceUID(s) passed is(are) not a string or list 2085 | 2086 | """ 2087 | self.download_from_selection( 2088 | downloadDir, 2089 | studyInstanceUID=studyInstanceUID, 2090 | dry_run=dry_run, 2091 | quiet=quiet, 2092 | show_progress_bar=show_progress_bar, 2093 | use_s5cmd_sync=use_s5cmd_sync, 2094 | dirTemplate=dirTemplate, 2095 | source_bucket_location=source_bucket_location, 2096 | ) 2097 | 2098 | def download_dicom_patients( 2099 | self, 2100 | patientId, 2101 | downloadDir, 2102 | dry_run=False, 2103 | quiet=True, 2104 | show_progress_bar=True, 2105 | use_s5cmd_sync=False, 2106 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2107 | source_bucket_location="aws", 2108 | ) -> None: 2109 | """ 2110 | Download the files corresponding to the patientId to the specified directory. 2111 | 2112 | Args: 2113 | patientId: string or list of strings containing the values of DICOM PatientID to filter by 2114 | downloadDir: string containing the path to the directory to download the files to 2115 | dry_run: calculates the size of the cohort but download does not start 2116 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2117 | show_progress_bar (bool): If True, tracks the progress of the download 2118 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2119 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2120 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'.
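Example (a minimal sketch; the patient ID is just a sample value): client.download_dicom_patients(patientId="PCAMPMRI-00001", downloadDir="/tmp/idc_data")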
2121 | 2122 | Returns: None 2123 | 2124 | Raises: 2125 | TypeError: If patientId(s) passed is(are) not a string or list 2126 | 2127 | """ 2128 | self.download_from_selection( 2129 | downloadDir, 2130 | patientId=patientId, 2131 | dry_run=dry_run, 2132 | quiet=quiet, 2133 | show_progress_bar=show_progress_bar, 2134 | use_s5cmd_sync=use_s5cmd_sync, 2135 | dirTemplate=dirTemplate, 2136 | source_bucket_location=source_bucket_location, 2137 | ) 2138 | 2139 | def download_collection( 2140 | self, 2141 | collection_id, 2142 | downloadDir, 2143 | dry_run=False, 2144 | quiet=True, 2145 | show_progress_bar=True, 2146 | use_s5cmd_sync=False, 2147 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2148 | source_bucket_location="aws", 2149 | ) -> None: 2150 | """ 2151 | Download the files corresponding to the collection_id to the specified directory. 2152 | 2153 | Args: 2154 | collection_id: string or list of strings containing the values of collection_id to filter by 2155 | downloadDir: string containing the path to the directory to download the files to 2156 | dry_run: calculates the size of the cohort but download does not start 2157 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2158 | show_progress_bar (bool): If True, tracks the progress of the download 2159 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2160 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2161 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2162 | 2163 | Returns: None 2164 | 2165 | Raises: 2166 | TypeError: If collection_id(s) passed is(are) not a string or list 2167 | 2168 | """ 2169 | self.download_from_selection( 2170 | downloadDir, 2171 | collection_id=collection_id, 2172 | dry_run=dry_run, 2173 | quiet=quiet, 2174 | show_progress_bar=show_progress_bar, 2175 | use_s5cmd_sync=use_s5cmd_sync, 2176 | dirTemplate=dirTemplate, 2177 | source_bucket_location=source_bucket_location, 2178 | ) 2179 | 2180 | def sql_query(self, sql_query): 2181 | """Execute SQL query against the table in the index using duckdb. 2182 | 2183 | Args: 2184 | sql_query: string containing the SQL query to execute. The table name to use in the FROM clause is 'index' (without quotes).
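Example (mirroring the package's own tests): client = IDCClient(); df = client.sql_query("SELECT DISTINCT(collection_id) FROM index")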
2185 | 2186 | Returns: 2187 | pandas dataframe containing the results of the query 2188 | 2189 | Raises: 2190 | duckdb.Error: any exception that duckdb.query() raises 2191 | """ 2192 | 2193 | logger.debug("Executing SQL query: " + sql_query) 2194 | # TODO: find a more elegant way to automate the following: https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ 2195 | index = self.index 2196 | if self.sm_index is not None: 2197 | sm_index = self.sm_index 2198 | if self.sm_instance_index is not None: 2199 | sm_instance_index = self.sm_instance_index 2200 | if self.clinical_index is not None: 2201 | clinical_index = self.clinical_index 2202 | if self.prior_versions_index is not None: 2203 | prior_versions_index = self.prior_versions_index 2204 | return duckdb.query(sql_query).to_df() 2205 | -------------------------------------------------------------------------------- /idc_index/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImagingDataCommons/idc-index/9d905f9b7e4ab719bfe54b1d747505283870ed8b/idc_index/py.typed -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import shutil 5 | from pathlib import Path 6 | 7 | import nox 8 | 9 | DIR = Path(__file__).parent.resolve() 10 | 11 | nox.options.sessions = ["lint", "pylint", "tests"] 12 | 13 | 14 | @nox.session 15 | def lint(session: nox.Session) -> None: 16 | """ 17 | Run the linter. 18 | """ 19 | session.install("pre-commit") 20 | session.run( 21 | "pre-commit", "run", "--all-files", "--show-diff-on-failure", *session.posargs 22 | ) 23 | 24 | 25 | @nox.session 26 | def pylint(session: nox.Session) -> None: 27 | """ 28 | Run PyLint. 29 | """ 30 | # This needs to be installed into the package environment, and is slower 31 | # than a pre-commit check 32 | session.install(".", "pylint") 33 | session.run("pylint", "idc_index", *session.posargs) 34 | 35 | 36 | @nox.session 37 | def tests(session: nox.Session) -> None: 38 | """ 39 | Run the unit and regular tests. 40 | """ 41 | session.install(".[test]") 42 | session.run("pytest", *session.posargs) 43 | 44 | 45 | @nox.session(reuse_venv=True) 46 | def docs(session: nox.Session) -> None: 47 | """ 48 | Build the docs. Pass "--serve" to serve. Pass "-b linkcheck" to check links. 
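For example, "nox -s docs -- --serve" builds and serves the docs locally, while "nox -s docs -- -b linkcheck" runs the link checker.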
49 | """ 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--serve", action="store_true", help="Serve after building") 53 | parser.add_argument( 54 | "-b", dest="builder", default="html", help="Build target (default: html)" 55 | ) 56 | args, posargs = parser.parse_known_args(session.posargs) 57 | 58 | if args.builder != "html" and args.serve: 59 | session.error("Must not specify non-HTML builder with --serve") 60 | 61 | extra_installs = ["sphinx-autobuild"] if args.serve else [] 62 | 63 | session.install("-e.[docs]", *extra_installs) 64 | session.chdir("docs") 65 | 66 | if args.builder == "linkcheck": 67 | session.run( 68 | "sphinx-build", "-b", "linkcheck", ".", "_build/linkcheck", *posargs 69 | ) 70 | return 71 | 72 | shared_args = ( 73 | "-n", # nitpicky mode 74 | "-T", # full tracebacks 75 | f"-b={args.builder}", 76 | ".", 77 | f"_build/{args.builder}", 78 | *posargs, 79 | ) 80 | 81 | if args.serve: 82 | session.run("sphinx-autobuild", *shared_args) 83 | else: 84 | session.run("sphinx-build", "--keep-going", *shared_args) 85 | 86 | 87 | @nox.session 88 | def build_api_docs(session: nox.Session) -> None: 89 | """ 90 | Build (regenerate) API docs. 91 | """ 92 | 93 | session.install("sphinx") 94 | session.chdir("docs") 95 | session.run( 96 | "sphinx-apidoc", 97 | "-o", 98 | "api/", 99 | "--module-first", 100 | "--no-toc", 101 | "--force", 102 | "../idc_index", 103 | ) 104 | 105 | 106 | @nox.session 107 | def build(session: nox.Session) -> None: 108 | """ 109 | Build an SDist and wheel. 110 | """ 111 | 112 | build_path = DIR.joinpath("build") 113 | if build_path.exists(): 114 | shutil.rmtree(build_path) 115 | 116 | session.install("build") 117 | session.run("python", "-m", "build") 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling","hatch-vcs"] 3 | 4 | build-backend = "hatchling.build" 5 | 6 | [project] 7 | name = "idc-index" 8 | authors = [ 9 | { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" }, 10 | { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" }, 11 | ] 12 | description = "Package to query and download data from an index of ImagingDataCommons" 13 | readme = "README.md" 14 | license.file = "LICENSE" 15 | requires-python = ">=3.8" 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Science/Research", 19 | "Intended Audience :: Developers", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3 :: Only", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Programming Language :: Python :: 3.10", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Topic :: Scientific/Engineering", 31 | "Typing :: Typed", 32 | ] 33 | dynamic = ["version"] 34 | dependencies = [ 35 | "click", 36 | 'duckdb>=0.10.0,<=1.2.1', 37 | "idc-index-data==21.0.0", 38 | "packaging", 39 | "pandas<=2.2.4", 40 | "platformdirs", 41 | "psutil", 42 | "pyarrow", 43 | "requests", 44 | "s5cmd", 45 | "sphinx-click", 46 | "tqdm" 47 | ] 48 | 49 | [project.optional-dependencies] 50 | test = [ 51 | "pytest >=6", 52 | "pytest-cov >=3", 53 | ] 54 | dev = [ 55 | "pytest >=6", 56 | "pytest-cov >=3", 57 | ] 58 | docs = [ 59 
| "sphinx>=7.0", 60 | "myst_parser>=0.13", 61 | "sphinx_copybutton", 62 | "sphinx_autodoc_typehints", 63 | "furo>=2023.08.17", 64 | ] 65 | 66 | [project.scripts] 67 | idc = 'idc_index.cli:idc' 68 | 69 | [project.urls] 70 | Homepage = "https://github.com/ImagingDataCommons/idc-index" 71 | "Bug Tracker" = "https://github.com/ImagingDataCommons/idc-index/issues" 72 | Discussions = "https://discourse.canceridc.dev/" 73 | Changelog = "https://github.com/ImagingDataCommons/idc-index/releases" 74 | 75 | 76 | [tool.hatch] 77 | version.source = "vcs" 78 | build.hooks.vcs.version-file = "idc_index/_version.py" 79 | 80 | [tool.hatch.envs.default] 81 | features = ["test"] 82 | scripts.test = "pytest {args}" 83 | 84 | 85 | [tool.pytest.ini_options] 86 | minversion = "6.0" 87 | addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] 88 | xfail_strict = true 89 | filterwarnings = [ 90 | "error", 91 | # https://github.com/dateutil/dateutil/issues/1314 92 | "ignore:datetime.datetime.utcfromtimestamp.. is deprecated.*:DeprecationWarning:dateutil", 93 | ] 94 | log_cli_level = "INFO" 95 | testpaths = [ 96 | "tests", 97 | ] 98 | python_files = [ 99 | "idcindex.py", 100 | "*_test.py", 101 | "test_*.py", 102 | ] 103 | 104 | 105 | [tool.coverage] 106 | run.source = ["idc_index"] 107 | report.exclude_also = [ 108 | '\.\.\.', 109 | 'if typing.TYPE_CHECKING:', 110 | ] 111 | 112 | [tool.mypy] 113 | files = ["idc_index", "tests"] 114 | python_version = "3.8" 115 | warn_unused_configs = true 116 | strict = true 117 | enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] 118 | warn_unreachable = true 119 | disallow_untyped_defs = false 120 | disallow_incomplete_defs = false 121 | 122 | [[tool.mypy.overrides]] 123 | module = "idc_index.*" 124 | disallow_untyped_defs = true 125 | disallow_incomplete_defs = true 126 | 127 | 128 | [tool.ruff] 129 | src = ["idc_index"] 130 | extend-exclude = ["./CONTRIBUTING.md"] 131 | 132 | [tool.ruff.lint] 133 | extend-select = [ 134 | "B", # flake8-bugbear 135 | "I", # isort 136 | "ARG", # flake8-unused-arguments 137 | "C4", # flake8-comprehensions 138 | "D", # pydocstyle 139 | "EM", # flake8-errmsg 140 | "ICN", # flake8-import-conventions 141 | "G", # flake8-logging-format 142 | "PGH", # pygrep-hooks 143 | "PIE", # flake8-pie 144 | "PL", # pylint 145 | "PT", # flake8-pytest-style 146 | "PTH", # flake8-use-pathlib 147 | "RET", # flake8-return 148 | "RUF", # Ruff-specific 149 | "SIM", # flake8-simplify 150 | "T20", # flake8-print 151 | "UP", # pyupgrade 152 | "YTT", # flake8-2020 153 | "EXE", # flake8-executable 154 | "NPY", # NumPy specific rules 155 | "PD", # pandas-vet 156 | ] 157 | ignore = [ 158 | "PLR09", # Too many <...> 159 | "PLR2004", # Magic value used in comparison 160 | "ISC001", # Conflicts with formatter 161 | # Exceptions below are specific to idc-index 162 | "B007", # Loop control variable {name} not used within loop body 163 | "B904", # Checks for raise statements in exception handlers that lack a from clause. 
164 | "E722", # Do not use bare except 165 | "EM101", # Exception must not use a string literal, assign to variable first 166 | "F841", # Local variable {name} is assigned to but never used 167 | "G003", # Logging statement uses + 168 | "G004", # Logging statement uses f-string 169 | "PD011", # Use .to_numpy() instead of .values 170 | "PD901", # Avoid using the generic variable name df for DataFrames 171 | "PT009", # Use a regular assert instead of unittest-style {assertion} 172 | "PTH100", # os.path.abspath() should be replaced by Path.resolve() 173 | "PTH103", # os.makedirs() should be replaced by Path.mkdir(parents=True) 174 | "PTH107", # os.remove() should be replaced by Path.unlink() 175 | "PTH110", # os.path.exists() should be replaced by Path.exists() 176 | "PTH118", # Checks for uses of os.path.join 177 | "PTH119", # os.path.basename() should be replaced by Path.name 178 | "PTH120", # os.path.dirname() should be replaced by Path.parent 179 | "PTH123", # open() should be replaced by Path.open() 180 | "RET504", # Unnecessary assignment to {name} before return statement 181 | "RET506", # Unnecessary {branch} after raise statement 182 | "SIM102", # Use a single if statement instead of nested if statements 183 | "SIM108", # Use ternary operator {contents} instead of if-else-block 184 | "SIM117", # Use a single with statement with multiple contexts instead of nested with statements 185 | "T201", # print found 186 | ] 187 | isort.required-imports = ["from __future__ import annotations"] 188 | # Uncomment if using a _compat.typing backport 189 | # typing-modules = ["idc_index._compat.typing"] 190 | 191 | [tool.ruff.lint.per-file-ignores] 192 | "docs/conf.py" = ["D"] 193 | "tests/**" = [ 194 | "D", 195 | "T20", 196 | ] 197 | "noxfile.py" = [ 198 | "D", 199 | "T20", 200 | ] 201 | 202 | [tool.ruff.lint.pydocstyle] 203 | convention = "google" 204 | 205 | 206 | [tool.pylint] 207 | py-version = "3.8" 208 | ignore-paths = [".*/_version.py"] 209 | reports.output-format = "colorized" 210 | similarities.ignore-imports = "yes" 211 | messages_control.disable = [ 212 | "design", 213 | "fixme", 214 | "line-too-long", 215 | "missing-module-docstring", 216 | "wrong-import-position", 217 | # Exceptions below are specific to idc-index 218 | "invalid-name", 219 | "missing-class-docstring", 220 | "missing-function-docstring", 221 | "logging-fstring-interpolation", 222 | "logging-not-lazy", 223 | "no-else-raise", 224 | "raise-missing-from", 225 | "undefined-loop-variable", 226 | "unspecified-encoding", 227 | "unused-variable", 228 | ] 229 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImagingDataCommons/idc-index/9d905f9b7e4ab719bfe54b1d747505283870ed8b/tests/__init__.py -------------------------------------------------------------------------------- /tests/idcindex.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | import tempfile 6 | import unittest 7 | from itertools import product 8 | from pathlib import Path 9 | 10 | import pandas as pd 11 | import pytest 12 | import requests 13 | from click.testing import CliRunner 14 | from idc_index import IDCClient, cli 15 | 16 | # Run tests using the following command from the root of the repository: 17 | # python -m unittest -vv tests/idcindex.py 18 | # 19 | # run specific tests with 
this: 20 | # pytest ./tests/idcindex.py::TestIDCClient.test_download_dicom_instance 21 | 22 | logging.basicConfig(level=logging.DEBUG) 23 | 24 | 25 | def remote_file_exists(url): 26 | try: 27 | response = requests.head(url, allow_redirects=True) 28 | # Check if the status code indicates success 29 | return response.status_code == 200 30 | except requests.RequestException as e: 31 | # Handle any exceptions (e.g., network issues) 32 | print(f"An error occurred: {e}") 33 | return False 34 | 35 | 36 | @pytest.fixture(autouse=True) 37 | def _change_test_dir(request, monkeypatch): 38 | monkeypatch.chdir(request.fspath.dirname) 39 | 40 | 41 | class TestIDCClient(unittest.TestCase): 42 | def setUp(self): 43 | self.client = IDCClient() 44 | self.download_from_manifest = cli.download_from_manifest 45 | self.download_from_selection = cli.download_from_selection 46 | self.download = cli.download 47 | 48 | logger = logging.getLogger("idc_index") 49 | logger.setLevel(logging.DEBUG) 50 | 51 | def test_get_collections(self): 52 | collections = self.client.get_collections() 53 | self.assertIsNotNone(collections) 54 | 55 | def test_get_idc_version(self): 56 | idc_version = self.client.get_idc_version() 57 | self.assertIsNotNone(idc_version) 58 | self.assertTrue(idc_version.startswith("v")) 59 | 60 | def test_get_patients(self): 61 | # Define the values for each optional parameter 62 | output_format_values = ["list", "dict", "df"] 63 | collection_id_values = [ 64 | "htan_ohsu", 65 | ["ct_phantom4radiomics", "cmb_gec"], 66 | ] 67 | 68 | # Test each combination 69 | for collection_id in collection_id_values: 70 | for output_format in output_format_values: 71 | patients = self.client.get_patients( 72 | collection_id=collection_id, outputFormat=output_format 73 | ) 74 | 75 | # Check if the output format matches the expected type 76 | if output_format == "list": 77 | self.assertIsInstance(patients, list) 78 | self.assertTrue(bool(patients)) # Check that the list is not empty 79 | elif output_format == "dict": 80 | self.assertTrue( 81 | isinstance(patients, dict) 82 | or ( 83 | isinstance(patients, list) 84 | and all(isinstance(i, dict) for i in patients) 85 | ) 86 | ) # Check that the output is either a dictionary or a list of dictionaries 87 | self.assertTrue( 88 | bool(patients) 89 | ) # Check that the output is not empty 90 | elif output_format == "df": 91 | self.assertIsInstance(patients, pd.DataFrame) 92 | self.assertFalse( 93 | patients.empty 94 | ) # Check that the DataFrame is not empty 95 | 96 | def test_get_studies(self): 97 | # Define the values for each optional parameter 98 | output_format_values = ["list", "dict", "df"] 99 | patient_id_values = ["PCAMPMRI-00001", ["PCAMPMRI-00001", "NoduleLayout_1"]] 100 | 101 | # Test each combination 102 | for patient_id in patient_id_values: 103 | for output_format in output_format_values: 104 | studies = self.client.get_dicom_studies( 105 | patientId=patient_id, outputFormat=output_format 106 | ) 107 | 108 | # Check if the output format matches the expected type 109 | if output_format == "list": 110 | self.assertIsInstance(studies, list) 111 | self.assertTrue(bool(studies)) # Check that the list is not empty 112 | elif output_format == "dict": 113 | self.assertTrue( 114 | isinstance(studies, dict) 115 | or ( 116 | isinstance(studies, list) 117 | and all(isinstance(i, dict) for i in studies) 118 | ) 119 | ) # Check that the output is either a dictionary or a list of dictionaries 120 | self.assertTrue(bool(studies)) # Check that the output is not empty 121 | elif 
output_format == "df": 122 | self.assertIsInstance(studies, pd.DataFrame) 123 | self.assertFalse( 124 | studies.empty 125 | ) # Check that the DataFrame is not empty 126 | 127 | def test_get_series(self): 128 | """ 129 | Query used for selecting the smallest series/studies: 130 | 131 | SELECT 132 | StudyInstanceUID, 133 | ARRAY_AGG(DISTINCT(collection_id)) AS collection, 134 | ARRAY_AGG(DISTINCT(series_aws_url)) AS aws_url, 135 | ARRAY_AGG(DISTINCT(series_gcs_url)) AS gcs_url, 136 | COUNT(DISTINCT(SOPInstanceUID)) AS num_instances, 137 | SUM(instance_size) AS series_size 138 | FROM 139 | `bigquery-public-data.idc_current.dicom_all` 140 | GROUP BY 141 | StudyInstanceUID 142 | HAVING 143 | num_instances > 2 144 | ORDER BY 145 | series_size asc 146 | LIMIT 147 | 10 148 | """ 149 | # Define the values for each optional parameter 150 | output_format_values = ["list", "dict", "df"] 151 | study_instance_uid_values = [ 152 | "1.3.6.1.4.1.14519.5.2.1.6279.6001.175012972118199124641098335511", 153 | [ 154 | "1.3.6.1.4.1.14519.5.2.1.1239.1759.691327824408089993476361149761", 155 | "1.3.6.1.4.1.14519.5.2.1.1239.1759.272272273744698671736205545239", 156 | ], 157 | ] 158 | 159 | # Test each combination 160 | for study_instance_uid in study_instance_uid_values: 161 | for output_format in output_format_values: 162 | series = self.client.get_dicom_series( 163 | studyInstanceUID=study_instance_uid, outputFormat=output_format 164 | ) 165 | 166 | # Check if the output format matches the expected type 167 | if output_format == "list": 168 | self.assertIsInstance(series, list) 169 | self.assertTrue(bool(series)) # Check that the list is not empty 170 | elif output_format == "dict": 171 | self.assertTrue( 172 | isinstance(series, dict) 173 | or ( 174 | isinstance(series, list) 175 | and all(isinstance(i, dict) for i in series) 176 | ) 177 | ) # Check that the output is either a dictionary or a list of dictionaries 178 | elif output_format == "df": 179 | self.assertIsInstance(series, pd.DataFrame) 180 | self.assertFalse( 181 | series.empty 182 | ) # Check that the DataFrame is not empty 183 | 184 | def test_download_dicom_series(self): 185 | with tempfile.TemporaryDirectory() as temp_dir: 186 | self.client.download_dicom_series( 187 | seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.153974929648969296590126728101", 188 | downloadDir=temp_dir, 189 | ) 190 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3) 191 | 192 | def test_download_dicom_instance(self): 193 | self.client.fetch_index("sm_instance_index") 194 | with tempfile.TemporaryDirectory() as temp_dir: 195 | self.client.download_dicom_instance( 196 | sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0", 197 | downloadDir=temp_dir, 198 | ) 199 | 200 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 1) 201 | 202 | def test_download_dicom_series_gcs(self): 203 | with tempfile.TemporaryDirectory() as temp_dir: 204 | self.client.download_dicom_series( 205 | seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.153974929648969296590126728101", 206 | downloadDir=temp_dir, 207 | source_bucket_location="gcs", 208 | ) 209 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3) 210 | 211 | def test_download_dicom_instance_gcs(self): 212 | self.client.fetch_index("sm_instance_index") 213 | with tempfile.TemporaryDirectory() as temp_dir: 214 | self.client.download_dicom_instance( 215 | sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0", 216 
| downloadDir=temp_dir, 217 | source_bucket_location="gcs", 218 | ) 219 | 220 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 1) 221 | 222 | def test_download_with_template(self): 223 | dirTemplateValues = [ 224 | None, 225 | "%collection_id_%PatientID/%Modality-%StudyInstanceUID%SeriesInstanceUID", 226 | "%collection_id%PatientID-%Modality_%StudyInstanceUID/%SeriesInstanceUID", 227 | "%collection_id-%PatientID_%Modality/%StudyInstanceUID-%SeriesInstanceUID", 228 | "%collection_id_%PatientID/%Modality/%StudyInstanceUID_%SeriesInstanceUID", 229 | ] 230 | for template in dirTemplateValues: 231 | with tempfile.TemporaryDirectory() as temp_dir: 232 | self.client.download_from_selection( 233 | downloadDir=temp_dir, 234 | studyInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462", 235 | dirTemplate=template, 236 | ) 237 | self.assertEqual( 238 | sum([len(files) for r, d, files in os.walk(temp_dir)]), 3 239 | ) 240 | 241 | def test_download_from_selection(self): 242 | # Define the values for each optional parameter 243 | dry_run_values = [True, False] 244 | quiet_values = [True, False] 245 | show_progress_bar_values = [True, False] 246 | use_s5cmd_sync_values = [True, False] 247 | 248 | # Generate all combinations of optional parameters 249 | combinations = product( 250 | dry_run_values, 251 | quiet_values, 252 | show_progress_bar_values, 253 | use_s5cmd_sync_values, 254 | ) 255 | 256 | # Test each combination 257 | for ( 258 | dry_run, 259 | quiet, 260 | show_progress_bar, 261 | use_s5cmd_sync, 262 | ) in combinations: 263 | with tempfile.TemporaryDirectory() as temp_dir: 264 | self.client.download_from_selection( 265 | downloadDir=temp_dir, 266 | dry_run=dry_run, 267 | patientId=None, 268 | studyInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462", 269 | seriesInstanceUID=None, 270 | quiet=quiet, 271 | show_progress_bar=show_progress_bar, 272 | use_s5cmd_sync=use_s5cmd_sync, 273 | ) 274 | 275 | if not dry_run: 276 | self.assertNotEqual(len(os.listdir(temp_dir)), 0) 277 | 278 | def test_sql_queries(self): 279 | df = self.client.sql_query("SELECT DISTINCT(collection_id) FROM index") 280 | 281 | self.assertIsNotNone(df) 282 | 283 | def test_download_from_aws_manifest(self): 284 | # Define the values for each optional parameter 285 | quiet_values = [True, False] 286 | validate_manifest_values = [True, False] 287 | show_progress_bar_values = [True, False] 288 | use_s5cmd_sync_values = [True, False] 289 | dirTemplateValues = [ 290 | None, 291 | "%collection_id/%PatientID/%Modality/%StudyInstanceUID/%SeriesInstanceUID", 292 | "%collection_id%PatientID%Modality%StudyInstanceUID%SeriesInstanceUID", 293 | ] 294 | # Generate all combinations of optional parameters 295 | combinations = product( 296 | quiet_values, 297 | validate_manifest_values, 298 | show_progress_bar_values, 299 | use_s5cmd_sync_values, 300 | dirTemplateValues, 301 | ) 302 | # Test each combination 303 | for ( 304 | quiet, 305 | validate_manifest, 306 | show_progress_bar, 307 | use_s5cmd_sync, 308 | dirTemplate, 309 | ) in combinations: 310 | with tempfile.TemporaryDirectory() as temp_dir: 311 | self.client.download_from_manifest( 312 | manifestFile="./study_manifest_aws.s5cmd", 313 | downloadDir=temp_dir, 314 | quiet=quiet, 315 | validate_manifest=validate_manifest, 316 | show_progress_bar=show_progress_bar, 317 | use_s5cmd_sync=use_s5cmd_sync, 318 | dirTemplate=dirTemplate, 319 | ) 320 | 321 | if sum([len(files) for _, _, files in os.walk(temp_dir)]) != 9: 
322 | print( 323 | f"Failed for {quiet} {validate_manifest} {show_progress_bar} {use_s5cmd_sync} {dirTemplate}" 324 | ) 325 | self.assertFalse(True) 326 | 327 | def test_download_from_gcp_manifest(self): 328 | # Define the values for each optional parameter 329 | quiet_values = [True, False] 330 | validate_manifest_values = [True, False] 331 | show_progress_bar_values = [True, False] 332 | use_s5cmd_sync_values = [True, False] 333 | dirTemplateValues = [ 334 | None, 335 | "%collection_id/%PatientID/%Modality/%StudyInstanceUID/%SeriesInstanceUID", 336 | "%collection_id_%PatientID_%Modality_%StudyInstanceUID_%SeriesInstanceUID", 337 | ] 338 | # Generate all combinations of optional parameters 339 | combinations = product( 340 | quiet_values, 341 | validate_manifest_values, 342 | show_progress_bar_values, 343 | use_s5cmd_sync_values, 344 | dirTemplateValues, 345 | ) 346 | 347 | # Test each combination 348 | for ( 349 | quiet, 350 | validate_manifest, 351 | show_progress_bar, 352 | use_s5cmd_sync, 353 | dirTemplate, 354 | ) in combinations: 355 | with tempfile.TemporaryDirectory() as temp_dir: 356 | self.client.download_from_manifest( 357 | manifestFile="./study_manifest_gcs.s5cmd", 358 | downloadDir=temp_dir, 359 | quiet=quiet, 360 | validate_manifest=validate_manifest, 361 | show_progress_bar=show_progress_bar, 362 | use_s5cmd_sync=use_s5cmd_sync, 363 | dirTemplate=dirTemplate, 364 | ) 365 | 366 | self.assertEqual( 367 | sum([len(files) for r, d, files in os.walk(temp_dir)]), 9 368 | ) 369 | 370 | def test_download_from_bogus_manifest(self): 371 | # Define the values for each optional parameter 372 | quiet_values = [True, False] 373 | validate_manifest_values = [True, False] 374 | show_progress_bar_values = [True, False] 375 | use_s5cmd_sync_values = [True, False] 376 | 377 | # Generate all combinations of optional parameters 378 | combinations = product( 379 | quiet_values, 380 | validate_manifest_values, 381 | show_progress_bar_values, 382 | use_s5cmd_sync_values, 383 | ) 384 | 385 | # Test each combination 386 | for ( 387 | quiet, 388 | validate_manifest, 389 | show_progress_bar, 390 | use_s5cmd_sync, 391 | ) in combinations: 392 | with tempfile.TemporaryDirectory() as temp_dir: 393 | self.client.download_from_manifest( 394 | manifestFile="./study_manifest_bogus.s5cmd", 395 | downloadDir=temp_dir, 396 | quiet=quiet, 397 | validate_manifest=validate_manifest, 398 | show_progress_bar=show_progress_bar, 399 | use_s5cmd_sync=use_s5cmd_sync, 400 | ) 401 | 402 | self.assertEqual(len(os.listdir(temp_dir)), 0) 403 | 404 | """ 405 | disabling these tests due to a consistent server timeout issue 406 | def test_citations(self): 407 | citations = self.client.citations_from_selection( 408 | collection_id="tcga_gbm", 409 | citation_format=index.IDCClient.CITATION_FORMAT_APA, 410 | ) 411 | self.assertIsNotNone(citations) 412 | 413 | citations = self.client.citations_from_selection( 414 | seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.4164.588007658875211151397302775781", 415 | citation_format=index.IDCClient.CITATION_FORMAT_BIBTEX, 416 | ) 417 | self.assertIsNotNone(citations) 418 | 419 | citations = self.client.citations_from_selection( 420 | studyInstanceUID="1.2.840.113654.2.55.174144834924218414213677353968537663991", 421 | citation_format=index.IDCClient.CITATION_FORMAT_BIBTEX, 422 | ) 423 | self.assertIsNotNone(citations) 424 | 425 | citations = self.client.citations_from_manifest("./study_manifest_aws.s5cmd") 426 | self.assertIsNotNone(citations) 427 | """ 428 | 429 | def 
test_cli_download_from_selection(self): 430 | runner = CliRunner() 431 | with tempfile.TemporaryDirectory() as temp_dir: 432 | result = runner.invoke( 433 | self.download_from_selection, 434 | [ 435 | "--download-dir", 436 | temp_dir, 437 | "--dry-run", 438 | False, 439 | "--quiet", 440 | True, 441 | "--show-progress-bar", 442 | True, 443 | "--use-s5cmd-sync", 444 | False, 445 | "--study-instance-uid", 446 | "1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462", 447 | ], 448 | ) 449 | assert len(os.listdir(temp_dir)) != 0 450 | 451 | def test_cli_download_from_manifest(self): 452 | runner = CliRunner() 453 | with tempfile.TemporaryDirectory() as temp_dir: 454 | result = runner.invoke( 455 | self.download_from_manifest, 456 | [ 457 | "--manifest-file", 458 | "./study_manifest_aws.s5cmd", 459 | "--download-dir", 460 | temp_dir, 461 | "--quiet", 462 | True, 463 | "--show-progress-bar", 464 | True, 465 | "--use-s5cmd-sync", 466 | False, 467 | ], 468 | ) 469 | assert len(os.listdir(temp_dir)) != 0 470 | 471 | def test_singleton_attribute(self): 472 | # singleton, initialized on first use 473 | i1 = IDCClient.client() 474 | i2 = IDCClient.client() 475 | 476 | # new instances created via constructor (through init) 477 | i3 = IDCClient() 478 | i4 = self.client 479 | 480 | # all must be not none 481 | assert i1 is not None 482 | assert i2 is not None 483 | assert i3 is not None 484 | assert i4 is not None 485 | 486 | # singletons must return the same instance 487 | assert i1 == i2 488 | 489 | # new instances must be different 490 | assert i1 != i3 491 | assert i1 != i4 492 | assert i3 != i4 493 | 494 | # all must be instances of IDCClient 495 | assert isinstance(i1, IDCClient) 496 | assert isinstance(i2, IDCClient) 497 | assert isinstance(i3, IDCClient) 498 | assert isinstance(i4, IDCClient) 499 | 500 | def test_cli_download(self): 501 | runner = CliRunner() 502 | with runner.isolated_filesystem(): 503 | result = runner.invoke( 504 | self.download, 505 | # StudyInstanceUID: 506 | ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"], 507 | ) 508 | assert len(os.listdir(Path.cwd())) != 0 509 | 510 | with runner.isolated_filesystem(): 511 | result = runner.invoke( 512 | self.download, 513 | # crdc_series_uuid: 514 | ["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"], 515 | ) 516 | assert len(os.listdir(Path.cwd())) != 0 517 | 518 | def test_prior_version_manifest(self): 519 | # Define the values for each optional parameter 520 | quiet_values = [True, False] 521 | validate_manifest_values = [True, False] 522 | show_progress_bar_values = [True, False] 523 | use_s5cmd_sync_values = [True, False] 524 | dirTemplateValues = [ 525 | None, 526 | "%collection_id/%PatientID/%Modality/%StudyInstanceUID/%SeriesInstanceUID", 527 | "%collection_id_%PatientID_%Modality_%StudyInstanceUID_%SeriesInstanceUID", 528 | ] 529 | # Generate all combinations of optional parameters 530 | combinations = product( 531 | quiet_values, 532 | validate_manifest_values, 533 | show_progress_bar_values, 534 | use_s5cmd_sync_values, 535 | dirTemplateValues, 536 | ) 537 | 538 | # Test each combination 539 | for ( 540 | quiet, 541 | validate_manifest, 542 | show_progress_bar, 543 | use_s5cmd_sync, 544 | dirTemplate, 545 | ) in combinations: 546 | with tempfile.TemporaryDirectory() as temp_dir: 547 | self.client.download_from_manifest( 548 | manifestFile="./prior_version_manifest.s5cmd", 549 | downloadDir=temp_dir, 550 | quiet=quiet, 551 | validate_manifest=validate_manifest, 552 | show_progress_bar=show_progress_bar, 553 
| use_s5cmd_sync=use_s5cmd_sync, 554 | dirTemplate=dirTemplate, 555 | ) 556 | 557 | self.assertEqual( 558 | sum([len(files) for r, d, files in os.walk(temp_dir)]), 5 559 | ) 560 | 561 | def test_list_indices(self): 562 | i = IDCClient() 563 | assert i.indices_overview # assert that dict was created 564 | 565 | def test_fetch_index(self): 566 | i = IDCClient() 567 | assert i.indices_overview["sm_index"]["installed"] is False 568 | i.fetch_index("sm_index") 569 | assert i.indices_overview["sm_index"]["installed"] is True 570 | assert hasattr(i, "sm_index") 571 | 572 | def test_indices_urls(self): 573 | i = IDCClient() 574 | for index in i.indices_overview: 575 | if i.indices_overview[index]["url"] is not None: 576 | assert remote_file_exists(i.indices_overview[index]["url"]) 577 | 578 | def test_clinical_index_install(self): 579 | i = IDCClient() 580 | assert i.indices_overview["clinical_index"]["installed"] is False 581 | i.fetch_index("clinical_index") 582 | assert i.indices_overview["clinical_index"]["installed"] is True 583 | assert len(os.listdir(i.clinical_data_dir)) > 0 584 | 585 | nlst_canc = i.get_clinical_table("nlst_canc") 586 | assert nlst_canc is not None 587 | 588 | def test_series_files_URLs(self): 589 | c = IDCClient() 590 | seriesInstanceUID = ( 591 | "1.3.6.1.4.1.14519.5.2.1.3671.4754.228015946741563785297552112143" 592 | ) 593 | files_aws = c.get_series_file_URLs(seriesInstanceUID, "aws") 594 | files_gcp = c.get_series_file_URLs(seriesInstanceUID, "gcp") 595 | assert len(files_aws) > 0 596 | assert len(files_gcp) == len(files_aws) 597 | 598 | def test_instance_file_URLs(self): 599 | c = IDCClient() 600 | sopInstanceUID = "1.3.6.1.4.1.5962.99.1.1900325859.924065538.1719887277027.10.0" 601 | file_url = "s3://idc-open-data/763fe058-7d25-4ba7-9b29-fd3d6c41dc4b/210f0529-c767-4795-9acf-bad2f4877427.dcm" 602 | files_aws = c.get_instance_file_URL(sopInstanceUID, "aws") 603 | files_gcp = c.get_instance_file_URL(sopInstanceUID, "gcp") 604 | assert files_aws == files_gcp == file_url 605 | 606 | 607 | if __name__ == "__main__": 608 | unittest.main() 609 | -------------------------------------------------------------------------------- /tests/prior_version_manifest.s5cmd: -------------------------------------------------------------------------------- 1 | cp s3://idc-open-data/2f77262c-3a4a-4e5a-bdc0-056dc2837f15/* . 2 | cp s3://idc-open-data/35459457-bd4c-4eef-9579-47f12fc6928e/* . 3 | cp s3://idc-open-data/9c9ab2bf-c784-4658-b0f9-d2e4b33f2dbf/* . 4 | cp s3://idc-open-data/312788ec-8739-4e56-a857-efcab92b20ed/* . 5 | cp s3://idc-open-data/c27b80c9-0e90-416b-8eca-0b20bc0cf8e2/* . 6 | -------------------------------------------------------------------------------- /tests/study_manifest_aws.s5cmd: -------------------------------------------------------------------------------- 1 | # To download the files in this manifest, first install s5cmd (https://github.com/peak/s5cmd), 2 | # then run the following command: 3 | # s5cmd --no-sign-request --endpoint-url https://s3.amazonaws.com run study_manifest_aws.s5cmd 4 | study_manifest_cp_command 5 | cp s3://idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . 6 | cp s3://idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . 7 | cp s3://idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* . 
8 | -------------------------------------------------------------------------------- /tests/study_manifest_bogus.s5cmd: -------------------------------------------------------------------------------- 1 | # the URLs below are invalid and are used for test purposes only! 2 | cp s3://invalid-idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . 3 | cp s3://invalid-idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . 4 | cp s3://invalid-idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* . 5 | -------------------------------------------------------------------------------- /tests/study_manifest_gcs.s5cmd: -------------------------------------------------------------------------------- 1 | # To download the files in this manifest, first install s5cmd (https://github.com/peak/s5cmd), 2 | # then run the following command: 3 | # s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run study_manifest_gcs.s5cmd 4 | cp s3://idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . 5 | cp s3://idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . 6 | cp s3://idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* . 7 | -------------------------------------------------------------------------------- /tests/test_package.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.metadata 4 | 5 | import idc_index as m 6 | 7 | 8 | def test_version(): 9 | assert importlib.metadata.version("idc_index") == m.__version__ 10 | --------------------------------------------------------------------------------