├── .git_archival.txt
├── .gitattributes
├── .github
│   ├── CONTRIBUTING.md
│   ├── dependabot.yml
│   ├── matchers
│   │   └── pylint.json
│   └── workflows
│       ├── cd.yml
│       ├── ci.yml
│       └── keep-alive.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── docs
│   ├── api
│   │   └── idc_index.rst
│   ├── cli_tools.rst
│   ├── column_descriptions.md
│   ├── conf.py
│   └── index.md
├── idc_index
│   ├── __init__.py
│   ├── _version.pyi
│   ├── cli.py
│   ├── index.py
│   └── py.typed
├── noxfile.py
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── idcindex.py
    ├── prior_version_manifest.s5cmd
    ├── study_manifest_aws.s5cmd
    ├── study_manifest_bogus.s5cmd
    ├── study_manifest_gcs.s5cmd
    └── test_package.py

/.git_archival.txt:
--------------------------------------------------------------------------------
1 | node: 9d905f9b7e4ab719bfe54b1d747505283870ed8b
2 | node-date: 2025-05-19T17:26:21-04:00
3 | describe-name: 0.9.0
4 | ref-names: HEAD -> main, tag: 0.9.0
5 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | .git_archival.txt export-subst
2 |
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | See the [Scientific Python Developer Guide][spc-dev-intro] for a detailed
2 | description of best practices for developing scientific packages.
3 |
4 | [spc-dev-intro]: https://learn.scientific-python.org/development/
5 |
6 | # Quick development
7 |
8 | The fastest way to start with development is to use nox. If you don't have nox,
9 | you can use `pipx run nox` to run it without installing, or `pipx install nox`.
10 | If you don't have pipx (pip for applications), then you can install it with
11 | `pip install pipx` (the only case where installing an application with regular
12 | pip is reasonable). If you use macOS, pipx and nox are both available in brew:
13 | `brew install pipx nox`.
14 |
15 | To use, run `nox`. This will lint and test using every supported version of
16 | Python, skipping versions that are not installed on your system. You can also
17 | run specific jobs:
18 |
19 | ```console
20 | $ nox -s lint  # Lint only
21 | $ nox -s tests  # Python tests
22 | $ nox -s docs -- --serve  # Build and serve the docs
23 | $ nox -s build  # Make an SDist and wheel
24 | ```
25 |
26 | Nox handles everything for you, including setting up a temporary virtual
27 | environment for each run.
28 |
29 | # Setting up a development environment manually
30 |
31 | You can set up a development environment by running:
32 |
33 | ```bash
34 | python3 -m venv .venv
35 | source ./.venv/bin/activate
36 | pip install -v -e .[dev]
37 | ```
38 |
39 | If you have the
40 | [Python Launcher for Unix](https://github.com/brettcannon/python-launcher), you
41 | can instead do:
42 |
43 | ```bash
44 | py -m venv .venv
45 | py -m pip install -v -e .[dev]
46 | ```
47 |
48 | # Post setup
49 |
50 | You should prepare pre-commit, which will help you by checking that commits
51 | pass required checks:
52 |
53 | ```bash
54 | pip install pre-commit  # or brew install pre-commit on macOS
55 | pre-commit install  # Will install a pre-commit hook into the git repo
56 | ```
57 |
58 | You can also/alternatively run `pre-commit run` (changes only) or
59 | `pre-commit run --all-files` to check even without installing the hook.
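You can also run a single hook by its id, or restrict the checks to specific
files (the hook ids used here come from this repository's
`.pre-commit-config.yaml`):

```console
$ pre-commit run codespell --all-files        # run only the codespell hook
$ pre-commit run --files idc_index/index.py   # run every hook on one file
```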
60 |
61 | # Testing
62 |
63 | Use pytest to run the unit tests:
64 |
65 | ```bash
66 | pytest
67 | ```
68 |
69 | # Coverage
70 |
71 | Use pytest-cov to generate coverage reports:
72 |
73 | ```bash
74 | pytest --cov=idc_index
75 | ```
76 |
77 | # Building docs
78 |
79 | You can build the docs using:
80 |
81 | ```bash
82 | nox -s docs
83 | ```
84 |
85 | You can see a preview with:
86 |
87 | ```bash
88 | nox -s docs -- --serve
89 | ```
90 |
91 | # Pre-commit
92 |
93 | This project uses pre-commit for all style checking. While you can run it with
94 | nox, this is such an important tool that it deserves to be installed on its own.
95 | Install pre-commit and run:
96 |
97 | ```bash
98 | pre-commit run -a
99 | ```
100 |
101 | to check all files.
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   # Maintain dependencies for GitHub Actions
4 |   - package-ecosystem: "github-actions"
5 |     directory: "/"
6 |     schedule:
7 |       interval: "weekly"
8 |     groups:
9 |       actions:
10 |         patterns:
11 |           - "*"
--------------------------------------------------------------------------------
/.github/matchers/pylint.json:
--------------------------------------------------------------------------------
1 | {
2 |   "problemMatcher": [
3 |     {
4 |       "severity": "warning",
5 |       "pattern": [
6 |         {
7 |           "regexp": "^([^:]+):(\\d+):(\\d+): ([A-DF-Z]\\d+): \\033\\[[\\d;]+m([^\\033]+).*$",
8 |           "file": 1,
9 |           "line": 2,
10 |           "column": 3,
11 |           "code": 4,
12 |           "message": 5
13 |         }
14 |       ],
15 |       "owner": "pylint-warning"
16 |     },
17 |     {
18 |       "severity": "error",
19 |       "pattern": [
20 |         {
21 |           "regexp": "^([^:]+):(\\d+):(\\d+): (E\\d+): \\033\\[[\\d;]+m([^\\033]+).*$",
22 |           "file": 1,
23 |           "line": 2,
24 |           "column": 3,
25 |           "code": 4,
26 |           "message": 5
27 |         }
28 |       ],
29 |       "owner": "pylint-error"
30 |     }
31 |   ]
32 | }
--------------------------------------------------------------------------------
/.github/workflows/cd.yml:
--------------------------------------------------------------------------------
1 | name: wheels
2 |
3 | on:
4 |   workflow_dispatch:
5 |   pull_request:
6 |   push:
7 |     branches:
8 |       - main
9 |   release:
10 |     types:
11 |       - published
12 |
13 | concurrency:
14 |   group: ${{ github.workflow }}-${{ github.ref }}
15 |   cancel-in-progress: true
16 |
17 | env:
18 |   FORCE_COLOR: 3
19 |
20 | jobs:
21 |   dist:
22 |     name: Distribution build
23 |     runs-on: ubuntu-latest
24 |
25 |     steps:
26 |       - uses: actions/checkout@v4
27 |         with:
28 |           fetch-depth: 0
29 |
30 |       - uses: hynek/build-and-inspect-python-package@v2
31 |
32 |   publish:
33 |     needs: [dist]
34 |     name: Publish to PyPI
35 |     environment: pypi
36 |     permissions:
37 |       id-token: write
38 |     runs-on: ubuntu-latest
39 |     if: github.event_name == 'release' && github.event.action == 'published'
40 |
41 |     steps:
42 |       - uses: actions/download-artifact@v4
43 |         with:
44 |           name: Packages
45 |           path: dist
46 |
47 |       - uses: pypa/gh-action-pypi-publish@release/v1
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 |   workflow_dispatch:
5 |   pull_request:
6 |   push:
7 |     branches:
8 |       - main
9 |
10 | concurrency:
11 |   group: ${{ github.workflow }}-${{ github.ref }}
12 |   cancel-in-progress: true
13 |
14 | env:
15 |   FORCE_COLOR: 3
16 |
17 | jobs:
18 |   pre-commit:
19 |     name: Format
20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | - uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | - uses: pre-commit/action@v3.0.1 29 | with: 30 | extra_args: --hook-stage manual --all-files 31 | - name: Run PyLint 32 | run: | 33 | echo "::add-matcher::$GITHUB_WORKSPACE/.github/matchers/pylint.json" 34 | pipx run nox -s pylint 35 | 36 | checks: 37 | name: Check Python ${{ matrix.python-version }} on ${{ matrix.runs-on }} 38 | runs-on: ${{ matrix.runs-on }} 39 | needs: [pre-commit] 40 | strategy: 41 | fail-fast: false 42 | matrix: 43 | python-version: ["3.8", "3.12"] 44 | runs-on: [ubuntu-latest, macos-latest, windows-latest] 45 | 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | fetch-depth: 0 50 | 51 | - uses: actions/setup-python@v5 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | allow-prereleases: true 55 | 56 | - name: Install package 57 | run: python -m pip install .[test] 58 | 59 | - name: Test package 60 | run: >- 61 | python -m pytest -ra --cov --cov-report=xml --cov-report=term 62 | --durations=20 -vv 63 | 64 | # - name: Upload coverage report 65 | # uses: codecov/codecov-action@v4.1.0 66 | # with: 67 | # token: ${{ secrets.CODECOV_TOKEN }} 68 | -------------------------------------------------------------------------------- /.github/workflows/keep-alive.yaml: -------------------------------------------------------------------------------- 1 | name: keep-github-actions-alive 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | workflow_dispatch: 7 | 8 | permissions: 9 | actions: write 10 | 11 | jobs: 12 | keep-alive: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | contents: write 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | ref: "keep-alive" 20 | - uses: gautamkrishnar/keepalive-workflow@v2 21 | with: 22 | time_elapsed: 50 23 | use_api: false 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macos 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # pytype static type analyzer 138 | .pytype/ 139 | 140 | # Cython debug symbols 141 | cython_debug/ 142 | 143 | # setuptools_scm 144 | */_version.py 145 | 146 | # ruff 147 | .ruff_cache/ 148 | 149 | # OS specific stuff 150 | .DS_Store 151 | .DS_Store? 
152 | ._* 153 | .Spotlight-V100 154 | .Trashes 155 | ehthumbs.db 156 | Thumbs.db 157 | 158 | # Common editor files 159 | *~ 160 | *.swp 161 | 162 | # idc-index 163 | */idc_index.csv.zip 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_commit_msg: "chore: update pre-commit hooks" 3 | autofix_commit_msg: "style: pre-commit fixes" 4 | 5 | repos: 6 | - repo: https://github.com/adamchainz/blacken-docs 7 | rev: "1.16.0" 8 | hooks: 9 | - id: blacken-docs 10 | additional_dependencies: [black==24.*] 11 | 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: "v4.5.0" 14 | hooks: 15 | - id: check-added-large-files 16 | - id: check-case-conflict 17 | - id: check-merge-conflict 18 | - id: check-symlinks 19 | - id: check-yaml 20 | - id: debug-statements 21 | - id: end-of-file-fixer 22 | - id: mixed-line-ending 23 | # - id: name-tests-test 24 | # args: ["--pytest-test-first"] 25 | - id: requirements-txt-fixer 26 | - id: trailing-whitespace 27 | 28 | - repo: https://github.com/pre-commit/pygrep-hooks 29 | rev: "v1.10.0" 30 | hooks: 31 | - id: rst-backticks 32 | - id: rst-directive-colons 33 | - id: rst-inline-touching-normal 34 | 35 | - repo: https://github.com/pre-commit/mirrors-prettier 36 | rev: "v3.1.0" 37 | hooks: 38 | - id: prettier 39 | types_or: [yaml, markdown, html, css, scss, javascript, json] 40 | args: [--prose-wrap=always] 41 | 42 | - repo: https://github.com/astral-sh/ruff-pre-commit 43 | rev: "v0.2.2" 44 | hooks: 45 | - id: ruff 46 | args: ["--fix", "--show-fixes"] 47 | - id: ruff-format 48 | 49 | #- repo: https://github.com/pre-commit/mirrors-mypy 50 | # rev: "v1.8.0" 51 | # hooks: 52 | # - id: mypy 53 | # files: idc_index|tests 54 | # args: [] 55 | # additional_dependencies: 56 | # - pandas-stubs 57 | # - pytest 58 | 59 | - repo: https://github.com/codespell-project/codespell 60 | rev: "v2.2.6" 61 | hooks: 62 | - id: codespell 63 | args: ["--quiet-level 3"] 64 | 65 | - repo: https://github.com/shellcheck-py/shellcheck-py 66 | rev: "v0.9.0.6" 67 | hooks: 68 | - id: shellcheck 69 | 70 | - repo: local 71 | hooks: 72 | - id: disallow-caps 73 | name: Disallow improper capitalization 74 | language: pygrep 75 | entry: PyBind|Numpy|Cmake|CCache|Github|PyTest 76 | exclude: .pre-commit-config.yaml 77 | 78 | - repo: https://github.com/abravalheri/validate-pyproject 79 | rev: "v0.16" 80 | hooks: 81 | - id: validate-pyproject 82 | additional_dependencies: ["validate-pyproject-schema-store[all]"] 83 | 84 | - repo: https://github.com/python-jsonschema/check-jsonschema 85 | rev: "0.28.0" 86 | hooks: 87 | - id: check-dependabot 88 | - id: check-github-workflows 89 | - id: check-readthedocs 90 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.11" 10 | sphinx: 11 | configuration: docs/conf.py 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 
17 |       extra_requirements:
18 |         - docs
19 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to idc-index
2 |
3 | There are many ways to contribute to idc-index, with varying levels of effort.
4 | Do try to look through the [documentation][idc-index-docs] first if something is
5 | unclear, and let us know how we can do better.
6 |
7 | - Ask a question on the [IDC forum][idc-forum]
8 | - Use [idc-index issues][idc-index-issues] to submit a feature request or bug,
9 |   or add to the discussion on an existing issue
10 | - Submit a [Pull Request](https://github.com/ImagingDataCommons/idc-index/pulls)
11 |   to improve idc-index or its documentation
12 |
13 | We encourage a range of Pull Requests, from patches that include passing tests
14 | and documentation, all the way down to half-baked ideas that launch discussions.
15 |
16 | ## The PR Process, CI, and Related Gotchas
17 |
18 | ### How to submit a PR?
19 |
20 | If you are new to idc-index development and you don't have push access to the
21 | repository, here are the steps:
22 |
23 | 1. [Fork and clone](https://docs.github.com/get-started/quickstart/fork-a-repo)
24 |    the repository.
25 | 2. Create a branch dedicated to the feature/bugfix you plan to implement (do not
26 |    use the `main` branch, as this will complicate further development and
27 |    collaboration).
28 | 3. [Push](https://docs.github.com/get-started/using-git/pushing-commits-to-a-remote-repository)
29 |    the branch to your GitHub fork.
30 | 4. Create a
31 |    [Pull Request](https://github.com/ImagingDataCommons/idc-index/pulls).
32 |
33 | This corresponds to the `Fork & Pull Model` described in the
34 | [GitHub collaborative development](https://docs.github.com/pull-requests/collaborating-with-pull-requests/getting-started/about-collaborative-development-models)
35 | documentation.
36 |
37 | When submitting a PR, the developers following the project will be notified.
38 | That said, to engage specific developers, you can add a `Cc: @` comment
39 | to notify them of your awesome contributions. Based on the comments posted by
40 | the reviewers, you may have to revisit your patches.
41 |
42 | ### How to contribute efficiently?
43 |
44 | We encourage all developers to:
45 |
46 | - set up pre-commit hooks so that you can remedy various formatting and other
47 |   issues early, without waiting for the continuous integration (CI) checks to
48 |   complete: `pre-commit install`
49 |
50 | - add or update tests. You can see current tests
51 |   [here](https://github.com/ImagingDataCommons/idc-index/tree/main/tests). If
52 |   you contribute new functionality, adding test(s) covering it is mandatory!
53 |
54 | - you can run individual tests from the root repository using the following
55 |   command: `python -m unittest -vv tests.idcindex.TestIDCClient.`
56 |
57 | ### How to write commit messages?
58 |
59 | Write your commit messages using the standard prefixes for commit messages:
60 |
61 | - `BUG:` Fix for runtime crash or incorrect result
62 | - `COMP:` Compiler error or warning fix
63 | - `DOC:` Documentation change
64 | - `ENH:` New functionality
65 | - `PERF:` Performance improvement
66 | - `STYLE:` No logic impact (indentation, comments)
67 | - `WIP:` Work In Progress not ready for merge
68 |
69 | The body of the message should clearly describe the motivation of the commit
70 | (**what**, **why**, and **how**). In order to ease the task of reviewing
71 | commits, the message body should follow these guidelines:
72 |
73 | 1. Leave a blank line between the subject and the body. This helps `git log`
74 |    and `git rebase` work nicely, and allows smooth generation of release notes.
75 | 2. Try to keep the subject line below 72 characters, ideally 50.
76 | 3. Capitalize the subject line.
77 | 4. Do not end the subject line with a period.
78 | 5. Use the imperative mood in the subject line (e.g.
79 |    `BUG: Fix spacing not being considered.`).
80 | 6. Wrap the body at 80 characters.
81 | 7. Use semantic line feeds to separate different ideas, which improves
82 |    readability.
83 | 8. Be concise, but honor the change: if significant alternative solutions were
84 |    available, explain why they were discarded.
85 | 9. If the commit refers to a topic discussed on the [IDC forum][idc-forum], or
86 |    fixes a regression test, provide the link. If it fixes a compiler error,
87 |    provide a minimal verbatim message of the compiler error. If the commit
88 |    closes an issue, use the
89 |    [GitHub issue closing keywords](https://docs.github.com/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue).
90 |
91 | Keep in mind that significant time is invested in reviewing commits and
92 | _pull requests_, so following these guidelines greatly helps the people
93 | doing reviews.
94 |
95 | These guidelines are largely inspired by Chris Beam's
96 | [How to Write a Commit Message](https://chris.beams.io/posts/git-commit/) post.
97 |
98 | ### How to integrate a PR?
99 |
100 | Getting your contributions integrated is relatively straightforward; here is
101 | the checklist:
102 |
103 | - All tests pass
104 | - Consensus is reached. This usually means that at least two reviewers approved
105 |   the changes (or added a `LGTM` comment) and at least one business day passed
106 |   without anyone objecting. `LGTM` is an acronym for _Looks Good to Me_.
107 | - To accommodate developers explicitly asking for more time to test the
108 |   proposed changes, integration time can be delayed by a few more days.
109 | - If you do NOT have push access, a core developer will integrate your PR. If
110 |   you would like to speed up the integration, do not hesitate to add a reminder
111 |   comment to the PR.
112 |
113 | ### Automatic testing of pull requests
114 |
115 | Every pull request is tested automatically using GitHub Actions each time you
116 | push a commit to it. The GitHub UI will restrict users from merging pull
117 | requests until the CI build has returned with a successful result indicating
118 | that all tests have passed.
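Putting the commit message guidelines above into practice, a well-formed commit
message might look like the following (a purely hypothetical example; the
subject, body, and issue number are all illustrative):

```text
ENH: Add progress bar to manifest downloads

Large manifest downloads previously gave no feedback about their
progress. Report the transfer status with a progress bar so that
users can estimate the remaining download time.

Closes #123.
```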
119 | 120 | [idc-forum]: https://discourse.canceridc.dev 121 | [idc-index-issues]: https://github.com/ImagingDataCommons/idc-index/issues 122 | [idc-index-docs]: https://idc-index.readthedocs.io/ 123 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Imaging Data Commons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # idc-index 2 | 3 | [![Actions Status][actions-badge]][actions-link] 4 | [![Documentation Status][rtd-badge]][rtd-link] 5 | 6 | [![PyPI version][pypi-version]][pypi-link] 7 | [![PyPI platforms][pypi-platforms]][pypi-link] 8 | 9 | [![Discourse Forum][discourse-forum-badge]][discourse-forum-link] 10 | 11 | > [!WARNING] 12 | > 13 | > This package is in its early development stages. Its functionality and API 14 | > will change. 15 | > 16 | > Stay tuned for the updates and documentation, and please share your feedback 17 | > about it by opening issues in this repository, or by starting a discussion in 18 | > [IDC User forum](https://discourse.canceridc.dev/). 19 | 20 | 21 | 22 | ## About 23 | 24 | `idc-index` is a Python package that enables basic operations for working with 25 | [NCI Imaging Data Commons (IDC)](https://imaging.datacommons.cancer.gov): 26 | 27 | - subsetting of the IDC data using selected metadata attributes 28 | - download of the files corresponding to selection 29 | - generation of the viewer URLs for the selected data 30 | 31 | ## Getting started 32 | 33 | Install the latest version of the package. 34 | 35 | ```bash 36 | $ pip install --upgrade idc-index 37 | ``` 38 | 39 | Instantiate `IDCClient`, which provides the interface for main operations. 40 | 41 | ```python 42 | from idc_index import IDCClient 43 | 44 | client = IDCClient.client() 45 | ``` 46 | 47 | You can use [IDC Portal](https://imaging.datacommons.cancer.gov/explore/) to 48 | browse collections, cases, studies and series, copy their identifiers and 49 | download the corresponding files using `idc-index` helper functions. 
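For example, after copying a SeriesInstanceUID from the portal, you can generate
a viewer link for it. This is a minimal sketch that assumes the `get_viewer_URL`
helper of `IDCClient`; the UID below is a placeholder:

```python
from idc_index import IDCClient

client = IDCClient.client()

# Placeholder UID - copy a real SeriesInstanceUID from the IDC Portal
series_uid = "1.2.840.113654.2.55.1234567890"

# Assumed helper: returns the URL to open this series in the IDC viewer
print(client.get_viewer_URL(seriesInstanceUID=series_uid))
```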
50 |
51 | You can try this out with the `rider_pilot` collection, which is just 10.5 GB in
52 | size:
53 |
54 | ```python
55 | client.download_from_selection(collection_id="rider_pilot", downloadDir=".")
56 | ```
57 |
58 | ... or run queries against the "mini" index of Imaging Data Commons data, and
59 | download images that match your selection criteria! The following will select
60 | all Magnetic Resonance (MR) series, and will download the first 10.
61 |
62 | ```python
63 | from idc_index import index
64 |
65 | client = index.IDCClient()
66 |
67 | query = """
68 | SELECT
69 |   SeriesInstanceUID
70 | FROM
71 |   index
72 | WHERE
73 |   Modality = 'MR'
74 | """
75 |
76 | selection_df = client.sql_query(query)
77 |
78 | client.download_from_selection(
79 |     seriesInstanceUID=list(selection_df["SeriesInstanceUID"].values[:10]),
80 |     downloadDir=".",
81 | )
82 | ```
83 |
84 | ## Tutorial
85 |
86 | Please check out
87 | [this tutorial notebook](https://github.com/ImagingDataCommons/IDC-Tutorials/blob/master/notebooks/labs/idc_rsna2023.ipynb)
88 | for an introduction to using `idc-index`.
89 |
90 | ## Resources
91 |
92 | - [Imaging Data Commons Portal](https://imaging.datacommons.cancer.gov/) can be
93 |   used to explore the content of IDC from the web browser
94 | - [s5cmd](https://github.com/peak/s5cmd) is a highly efficient, open source,
95 |   multi-platform S3 client that we use for downloading IDC data, which is hosted
96 |   in public AWS and GCS buckets. Distributed on PyPI as
97 |   [s5cmd](https://pypi.org/project/s5cmd/).
98 | - [SlicerIDCBrowser](https://github.com/ImagingDataCommons/SlicerIDCBrowser) 3D
99 |   Slicer extension that relies on `idc-index` for search and download of IDC
100 |   data
101 |
102 | ## Acknowledgment
103 |
104 | This software is maintained by the IDC team, which has been funded in whole or
105 | in part with Federal funds from the NCI, NIH, under task order no. HHSN26110071
106 | under contract no. HHSN261201500003l.
107 |
108 | If this package helped your research, we would appreciate it if you could cite
109 | the IDC paper below.
110 |
111 | > Fedorov, A., Longabaugh, W. J. R., Pot, D., Clunie, D. A., Pieper, S. D.,
112 | > Gibbs, D. L., Bridge, C., Herrmann, M. D., Homeyer, A., Lewis, R., Aerts, H.
113 | > J. W., Krishnaswamy, D., Thiriveedhi, V. K., Ciausu, C., Schacherer, D. P.,
114 | > Bontempi, D., Pihl, T., Wagner, U., Farahani, K., Kim, E. & Kikinis, R.
115 | > _National Cancer Institute Imaging Data Commons: Toward Transparency,
116 | > Reproducibility, and Scalability in Imaging Artificial Intelligence_.
117 | > RadioGraphics (2023). https://doi.org/10.1148/rg.230180
118 |
119 |
120 | [actions-badge]: https://github.com/ImagingDataCommons/idc-index/workflows/CI/badge.svg
121 | [actions-link]: https://github.com/ImagingDataCommons/idc-index/actions
122 | [discourse-forum-badge]: https://img.shields.io/discourse/https/discourse.canceridc.dev/status.svg
123 | [discourse-forum-link]: https://discourse.canceridc.dev/
124 | [pypi-link]: https://pypi.org/project/idc-index/
125 | [pypi-platforms]: https://img.shields.io/pypi/pyversions/idc-index
126 | [pypi-version]: https://img.shields.io/pypi/v/idc-index
127 | [rtd-badge]: https://readthedocs.org/projects/idc-index/badge/?version=latest
128 | [rtd-link]: https://idc-index.readthedocs.io/en/latest/?badge=latest
129 |
130 |
131 |
--------------------------------------------------------------------------------
/docs/api/idc_index.rst:
--------------------------------------------------------------------------------
1 | idc\_index package API
2 | =======================
3 |
4 | .. automodule:: idc_index
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 |
9 | Submodules
10 | ----------
11 |
12 | idc\_index.index module
13 | -----------------------
14 |
15 | .. automodule:: idc_index.index
16 |    :members:
17 |    :undoc-members:
18 |    :show-inheritance:
19 |
20 | idc\_index.cli module
21 | -----------------------
22 |
23 | .. automodule:: idc_index.cli
24 |    :members:
25 |    :undoc-members:
26 |    :show-inheritance:
--------------------------------------------------------------------------------
/docs/cli_tools.rst:
--------------------------------------------------------------------------------
1 | Command Line Interface tools
2 | ============================
3 |
4 | *idc-index* provides a command line interface (CLI) tool to simplify access to the functionality
5 | implemented via the Python API. The CLI tool is a wrapper around the Python API and provides a
6 | simple way to interact with the package.
7 |
8 | Once *idc-index* is installed, the CLI tool can be accessed by running the following command in the
9 | terminal.
10 |
11 | .. click:: idc_index.cli:idc
12 |    :prog: idc
13 |    :nested: full
--------------------------------------------------------------------------------
/docs/column_descriptions.md:
--------------------------------------------------------------------------------
1 | # Metadata attributes in `idc-index`'s index tables
2 |
3 | `idc-index` is named this way because it wraps indices of IDC data: tables
4 | containing the most important metadata attributes describing the files available
5 | in IDC. The main metadata index is available in the `index` variable (which is a
6 | pandas `DataFrame`) of `IDCClient`. Additional index tables, such as the
7 | `clinical_index`, contain non-DICOM clinical data; the slide microscopy specific
8 | tables (indicated by the prefix `sm`) include metadata attributes specific to
9 | slide microscopy images.
10 |
11 | ## `index`
12 |
13 | The following is the list of the columns included in `index`. You can use those
14 | to select cohorts and subset the data. `index` is series-based, i.e., it has one
15 | row per DICOM series.
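Since `index` is a regular pandas `DataFrame`, you can subset it directly. The
following minimal sketch (the collection id and modality are example values)
selects a cohort and inspects a few of the columns described below:

```python
from idc_index import IDCClient

client = IDCClient.client()

# Example: all MR series from one collection
selection = client.index[
    (client.index["Modality"] == "MR")
    & (client.index["collection_id"] == "rider_pilot")
]

print(selection[["PatientID", "SeriesInstanceUID", "series_size_MB"]].head())
```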
16 |
17 | - non-DICOM attributes assigned/curated by IDC:
18 |
19 |   - `collection_id`: short string with the identifier of the collection the
20 |     series belongs to
21 |   - `analysis_result_id`: this string is not empty if the specific series is
22 |     part of an analysis results collection; analysis results can be added to a
23 |     given collection over time
24 |   - `source_DOI`: Digital Object Identifier of the dataset that contains the
25 |     given series; note that a given collection can include one or more DOIs,
26 |     since analysis results added to the collection would typically have
27 |     independent DOI values!
28 |   - `instanceCount`: number of files in the series (typically, this matches the
29 |     number of slices in cross-sectional modalities)
30 |   - `license_short_name`: short name of the license that governs the use of the
31 |     files corresponding to the series
32 |   - `series_aws_url`: location of the series files in a public AWS bucket
33 |   - `series_size_MB`: total disk size (in MB) needed to store the series
34 |
35 | - DICOM attributes extracted from the files:
36 |   - `PatientID`: identifier of the patient
37 |   - `PatientAge` and `PatientSex`: attributes containing patient age and sex
38 |   - `StudyInstanceUID`: unique identifier of the DICOM study
39 |   - `StudyDescription`: textual description of the study content
40 |   - `StudyDate`: date of the study (note that those dates are shifted, and are
41 |     not real dates when images were acquired, to protect patient privacy)
42 |   - `SeriesInstanceUID`: unique identifier of the DICOM series
43 |   - `SeriesDate`: date when the series was acquired
44 |   - `SeriesDescription`: textual description of the series content
45 |   - `SeriesNumber`: series number
46 |   - `BodyPartExamined`: body part imaged
47 |   - `Modality`: acquisition modality
48 |   - `Manufacturer`: manufacturer of the equipment that generated the series
49 |   - `ManufacturerModelName`: model name of the equipment
50 |
51 | ## `sm_index`
52 |
53 | The following is the list of the columns included in `sm_index`. `sm_index` is
54 | series-based, i.e., it has one row per DICOM series, but only includes series
55 | with slide microscopy data.
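Note that, unlike `index`, the `sm_index` table is not installed by default (see
`indices_overview` in `idc_index/index.py`). A minimal sketch, assuming the
client's `fetch_index` helper is used to install it before exploring the columns
listed below:

```python
from idc_index import IDCClient

client = IDCClient.client()

# Assumed helper: fetches the named index from the IDC release assets
# and attaches it to the client as a DataFrame attribute
client.fetch_index("sm_index")

print(client.sm_index["SeriesInstanceUID"].nunique(), "slides indexed")
```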
56 |
57 | - DICOM attributes extracted from the files:
58 |   - `SeriesInstanceUID`: unique identifier of the DICOM series: one DICOM series
59 |     = one slide
60 |   - `embeddingMedium`: describes in what medium the slide was embedded before
61 |     the image was obtained
62 |   - `tissueFixative`: describes tissue fixatives used before the image was
63 |     obtained
64 |   - `staining_usingSubstance`: describes staining steps the specimen underwent
65 |     before the image was obtained
66 |   - `max_TotalPixelMatrixColumns`: width of the image at the maximum resolution
67 |   - `max_TotalPixelMatrixRows`: height of the image at the maximum resolution
68 |   - `min_PixelSpacing_2sf`: pixel spacing in mm at the maximum resolution layer,
69 |     rounded to 2 significant figures
70 |   - `ObjectiveLensPower`: power of the objective lens of the equipment used to
71 |     digitize the slide
72 |   - `primaryAnatomicStructure`: anatomic location from where the imaged specimen
73 |     was collected
74 |   - `primaryAnatomicStructureModifier`: additional characteristics of the
75 |     specimen, such as whether it is a tumor or normal tissue
76 |   - `admittingDiagnosis`: if available, diagnosis of the patient; populated
77 |     using the first item of the `AdmittingDiagnosesSequence` in DICOM SM series
78 |   - `illuminationType`: specifies the type of illumination used when obtaining
79 |     the image
80 |
81 | In case of `embeddingMedium`, `tissueFixative`, `staining_usingSubstance`,
82 | `primaryAnatomicStructure`, `primaryAnatomicStructureModifier`,
83 | `admittingDiagnosis` and `illuminationType`, the attributes exist with the
84 | suffixes `_code_designator_value_str` and `_CodeMeaning`, which indicate whether
85 | the column contains CodeSchemeDesignator and CodeValue, or CodeMeaning. If this
86 | is new to you, a brief explanation of the three-value based coding scheme in
87 | DICOM can be found at https://learn.canceridc.dev/dicom/coding-schemes.
88 |
89 | ## `sm_instance_index`
90 |
91 | The following is the list of the columns included in `sm_instance_index`.
92 | `sm_instance_index` is instance-based, i.e., it has one row per DICOM instance
93 | (pyramid level of a slide, plus potentially thumbnail or label images), but only
94 | includes DICOM instances of the slide microscopy modality.
95 |
96 | - DICOM attributes extracted from the files:
97 |
98 |   - `SOPInstanceUID`: unique identifier of the DICOM instance: one DICOM
99 |     instance = one level/label/thumbnail image of the slide
100 |   - `SeriesInstanceUID`: unique identifier of the DICOM series: one DICOM series
101 |     = one slide
102 |   - `embeddingMedium`: describes in what medium the slide was embedded before
103 |     the image was obtained
104 |   - `tissueFixative`: describes tissue fixatives used before the image was
105 |     obtained
106 |   - `staining_usingSubstance`: describes staining steps the specimen underwent
107 |     before the image was obtained
108 |   - `TotalPixelMatrixColumns`: width of the image
109 |   - `TotalPixelMatrixRows`: height of the image
110 |   - `PixelSpacing_0`: pixel spacing in mm
111 |   - `ImageType`: specifies further characteristics of the image in a list,
112 |     including, as the third value, whether it is a VOLUME, LABEL, OVERVIEW or
113 |     THUMBNAIL image.
114 |   - `TransferSyntaxUID`: specifies the encoding scheme used for the image data
115 |   - `instance_size`: specifies the DICOM instance's size in bytes
116 |
117 | - non-DICOM attributes assigned/curated by IDC:
118 |   - `crdc_instance_uuid`: globally unique, versioned identifier of the DICOM
119 |     instance
120 |
121 | In case of `embeddingMedium`, `tissueFixative`, and `staining_usingSubstance`,
122 | the attributes exist with the suffixes `_code_designator_value_str` and
123 | `_CodeMeaning`, which indicate whether the column contains CodeSchemeDesignator
124 | and CodeValue, or CodeMeaning. If this is new to you, a brief explanation of the
125 | three-value based coding scheme in DICOM can be found at
126 | https://learn.canceridc.dev/dicom/coding-schemes.
127 |
128 | ## `clinical_index`
129 |
130 | Many of the image collections available in IDC are accompanied by clinical data.
131 | Such clinical data is organized in one or more tables that are shared alongside
132 | the images.
133 |
134 | Each row in `clinical_index` corresponds to a column in a clinical table
135 | available in IDC. You can use this index to find collections that have a
136 | specific clinical attribute, compare the availability of clinical data across
137 | collections, or identify patients that have specific clinical characteristics.
138 |
139 | Note that IDC does not perform any harmonization of the clinical data across
140 | collections, or any validation of the content of the tables. We share clinical
141 | data as it was provided by the submitter.
142 |
143 | `clinical_index` provides the list of all of the columns across all of the
144 | clinical tables available in IDC. It contains the following items:
145 |
146 | - `collection_id`: identifier of the collection where the given clinical data
147 |   attribute is available
148 | - `short_table_name`: name of the clinical data table where the attribute is
149 |   encountered; the referenced table can be loaded into a Pandas DataFrame using
150 |   the `IDCClient.get_clinical_data()` call
151 | - `table_name`: fully resolved name of the table in IDC Google BigQuery public
152 |   dataset (only relevant if you would like to search using BigQuery)
153 | - `column`: name of the column that is available in the given clinical table
154 | - `colum_label`: label of the column (this field may contain more extensive
155 |   information describing a given column)
156 | - `values`: set of values defining the content of the column (relevant if the
157 |   column contains a fixed list of values and not free text)
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import importlib.metadata
4 |
5 | project = "idc-index"
6 | copyright = "2024, Imaging Data Commons"
7 | author = "Imaging Data Commons"
8 | version = release = importlib.metadata.version("idc_index")
9 |
10 | extensions = [
11 |     "myst_parser",
12 |     "sphinx.ext.autodoc",
13 |     "sphinx.ext.intersphinx",
14 |     "sphinx.ext.mathjax",
15 |     "sphinx.ext.napoleon",
16 |     "sphinx_autodoc_typehints",
17 |     "sphinx_copybutton",
18 |     "sphinx_click",
19 | ]
20 |
21 | source_suffix = [".rst", ".md"]
22 | exclude_patterns = [
23 |     "_build",
24 |     "**.ipynb_checkpoints",
25 |     "Thumbs.db",
26 |     ".DS_Store",
27 |     ".env",
28 |     ".venv",
29 | ]
30 |
31 | html_theme = "furo"
32 |
33 | myst_enable_extensions = [
34 |     "colon_fence",
35 | ]
36 |
37 | intersphinx_mapping = {
38 |     "python":
("https://docs.python.org/3", None), 39 | "pandas": ("http://pandas.pydata.org/pandas-docs/stable", None), 40 | } 41 | 42 | nitpick_ignore = [ 43 | ("py:class", "_io.StringIO"), 44 | ("py:class", "_io.BytesIO"), 45 | ] 46 | 47 | always_document_param_types = True 48 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # idc-index 2 | 3 | ```{toctree} 4 | :maxdepth: 2 5 | :hidden: 6 | ``` 7 | 8 | :::{warning} 9 | 10 | This package is in its early development stages. Its functionality and API will 11 | change. 12 | 13 | Stay tuned for the updates and documentation, and please share your feedback 14 | about it by opening issues at the 15 | [idc-index](https://github.com/ImagingDataCommons/idc-index) repository, or by 16 | starting a discussion in [IDC User forum](https://discourse.canceridc.dev/). 17 | 18 | ::: 19 | 20 | ```{include} ../README.md 21 | :start-after: 22 | ``` 23 | 24 | ## Contents 25 | 26 | ```{toctree} 27 | :maxdepth: 2 28 | :titlesonly: 29 | :caption: API docs 30 | 31 | column_descriptions 32 | cli_tools.rst 33 | api/idc_index 34 | ``` 35 | 36 | ## Indices and tables 37 | 38 | - {ref}`genindex` 39 | - {ref}`modindex` 40 | - {ref}`search` 41 | -------------------------------------------------------------------------------- /idc_index/__init__.py: -------------------------------------------------------------------------------- 1 | """Copyright (c) 2024 Imaging Data Commons. All rights reserved. 2 | 3 | idc-index: Package to query and download data from an index of ImagingDataCommons 4 | """ 5 | 6 | 7 | from __future__ import annotations 8 | 9 | from ._version import version as __version__ 10 | 11 | __all__ = ["__version__"] 12 | 13 | from .index import IDCClient 14 | 15 | _ = IDCClient 16 | -------------------------------------------------------------------------------- /idc_index/_version.pyi: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | version: str 4 | version_tuple: tuple[int, int, int] | tuple[int, int, int, str, str] 5 | -------------------------------------------------------------------------------- /idc_index/cli.py: -------------------------------------------------------------------------------- 1 | """CLI module for the IDC client. 2 | 3 | This module provides command-line interface (CLI) commands to interact with the Imaging Data Commons (IDC) data. 4 | """ 5 | from __future__ import annotations 6 | 7 | import logging 8 | from pathlib import Path 9 | 10 | import click 11 | 12 | from . import index 13 | from .index import IDCClient 14 | 15 | # Set up logging for the CLI module 16 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG) 17 | logger_cli = logging.getLogger("cli") 18 | logger_cli.setLevel(logging.INFO) 19 | 20 | 21 | def _get_max_path_length(): 22 | # can make this more robust 23 | return 260 24 | 25 | 26 | @click.group() 27 | def idc(): 28 | """`idc` is a command line interface to the API functionality available in idc-index.""" 29 | 30 | 31 | def set_log_level(log_level): 32 | """Set the logging level for the CLI module. 33 | 34 | Args: 35 | log_level (str): The logging level to set. 
36 |     """
37 |     log_levels = {
38 |         "debug": logging.DEBUG,
39 |         "info": logging.INFO,
40 |         "warning": logging.WARNING,
41 |         "error": logging.ERROR,
42 |         "critical": logging.CRITICAL,
43 |     }
44 |     logging_level = log_levels.get(log_level.lower(), logging.WARNING)
45 |     logger_cli.debug(f"Setting the log level of index.py to {logging_level}")
46 |     index.logger.setLevel(logging_level)
47 |     logger_cli.setLevel(logging_level)
48 |
49 |
50 | @idc.command()
51 | @click.option(
52 |     "--download-dir",
53 |     required=True,
54 |     type=click.Path(),
55 |     help="Path to the directory to download the files to.",
56 | )
57 | @click.option(
58 |     "--dry-run",
59 |     type=bool,
60 |     default=False,
61 |     help="If set, calculates the size of the cohort but download does not start.",
62 | )
63 | @click.option(
64 |     "--collection-id",
65 |     type=str,
66 |     multiple=True,
67 |     default=None,
68 |     help="Collection ID(s) to filter by.",
69 | )
70 | @click.option(
71 |     "--patient-id",
72 |     type=str,
73 |     multiple=True,
74 |     default=None,
75 |     help="Patient ID(s) to filter by.",
76 | )
77 | @click.option(
78 |     "--study-instance-uid",
79 |     type=str,
80 |     multiple=True,
81 |     default=None,
82 |     help="DICOM StudyInstanceUID(s) to filter by.",
83 | )
84 | @click.option(
85 |     "--series-instance-uid",
86 |     type=str,
87 |     multiple=True,
88 |     default=None,
89 |     help="DICOM SeriesInstanceUID(s) to filter by.",
90 | )
91 | @click.option(
92 |     "--crdc-series-uuid",
93 |     type=str,
94 |     multiple=True,
95 |     default=None,
96 |     help="crdc_series_uuid(s) to filter by.",
97 | )
98 | @click.option(
99 |     "--quiet",
100 |     type=bool,
101 |     default=True,
102 |     help="If set, suppresses the output of the subprocess.",
103 | )
104 | @click.option(
105 |     "--show-progress-bar",
106 |     type=bool,
107 |     default=True,
108 |     help="If set, tracks the progress of download.",
109 | )
110 | @click.option(
111 |     "--use-s5cmd-sync",
112 |     type=bool,
113 |     default=False,
114 |     help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
115 | )
116 | @click.option(
117 |     "--log-level",
118 |     type=click.Choice(
119 |         ["debug", "info", "warning", "error", "critical"], case_sensitive=False
120 |     ),
121 |     default="info",
122 |     help="Set the logging level for the CLI module.",
123 | )
124 | @click.option(
125 |     "--dir-template",
126 |     type=str,
127 |     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
128 |     help="Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore).
When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.", 129 | ) 130 | def download_from_selection( 131 | download_dir, 132 | dry_run, 133 | collection_id, 134 | patient_id, 135 | study_instance_uid, 136 | series_instance_uid, 137 | crdc_series_uuid, 138 | quiet, 139 | show_progress_bar, 140 | use_s5cmd_sync, 141 | log_level, 142 | dir_template, 143 | ): 144 | """Download from a selection of collection(s), patient(s), study(studies) and series. 145 | 146 | The filtering will be applied in sequence by first selecting the collection(s), followed by 147 | patient(s), study(studies) and series. If no filtering is applied, all the files will be downloaded. 148 | """ 149 | # Set the logging level for the CLI module 150 | set_log_level(log_level) 151 | # Create an instance of the IDCClient 152 | client = IDCClient() 153 | logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index") 154 | # Parse the input parameters and pass them to IDCClient's download_from_selection method 155 | collection_id = ( 156 | [cid.strip() for cid in (",".join(collection_id)).split(",")] 157 | if collection_id 158 | else None 159 | ) 160 | patient_id = ( 161 | [pid.strip() for pid in (",".join(patient_id)).split(",")] 162 | if patient_id 163 | else None 164 | ) 165 | study_instance_uid = ( 166 | [uid.strip() for uid in (",".join(study_instance_uid)).split(",")] 167 | if study_instance_uid 168 | else None 169 | ) 170 | series_instance_uid = ( 171 | [uid.strip() for uid in (",".join(series_instance_uid)).split(",")] 172 | if series_instance_uid 173 | else None 174 | ) 175 | crdc_series_uuid = ( 176 | [uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")] 177 | if crdc_series_uuid 178 | else None 179 | ) 180 | logger_cli.debug("Inputs received from cli download:") 181 | logger_cli.debug(f"collection_id: {collection_id}") 182 | logger_cli.debug(f"patient_id: {patient_id}") 183 | logger_cli.debug(f"study_instance_uid: {study_instance_uid}") 184 | logger_cli.debug(f"series_instance_uid: {series_instance_uid}") 185 | logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}") 186 | logger_cli.debug(f"dry_run: {dry_run}") 187 | logger_cli.debug(f"quiet: {quiet}") 188 | logger_cli.debug(f"show_progress_bar: {show_progress_bar}") 189 | logger_cli.debug(f"use_s5cmd_sync: {use_s5cmd_sync}") 190 | logger_cli.debug(f"dirTemplate: {dir_template}") 191 | 192 | client.download_from_selection( 193 | download_dir, 194 | dry_run=dry_run, 195 | collection_id=collection_id, 196 | patientId=patient_id, 197 | studyInstanceUID=study_instance_uid, 198 | seriesInstanceUID=series_instance_uid, 199 | crdc_series_uuid=crdc_series_uuid, 200 | quiet=quiet, 201 | show_progress_bar=show_progress_bar, 202 | use_s5cmd_sync=use_s5cmd_sync, 203 | dirTemplate=dir_template, 204 | ) 205 | 206 | 207 | @idc.command() 208 | @click.option( 209 | "--manifest-file", 210 | required=True, 211 | type=click.Path(), 212 | help="The path to the manifest file.", 213 | ) 214 | @click.option( 215 | "--download-dir", 216 | required=True, 217 | type=click.Path(), 218 | help="Path to the directory to download the files to.", 219 | ) 220 | @click.option( 221 | "--quiet", 222 | type=bool, 223 | default=True, 224 | help="If set, suppresses the output of the subprocess.", 225 | ) 226 | @click.option( 227 | "--validate-manifest", 228 | type=bool, 229 | default=True, 230 | help="If True, validates the manifest for any errors. 
Defaults to True.",
231 | )
232 | @click.option(
233 |     "--show-progress-bar",
234 |     type=bool,
235 |     default=True,
236 |     help="If set, tracks the progress of download.",
237 | )
238 | @click.option(
239 |     "--use-s5cmd-sync",
240 |     type=bool,
241 |     default=False,
242 |     help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
243 | )
244 | @click.option(
245 |     "--log-level",
246 |     type=click.Choice(
247 |         ["debug", "info", "warning", "error", "critical"], case_sensitive=False
248 |     ),
249 |     default="info",
250 |     help="Set the logging level for the CLI module.",
251 | )
252 | @click.option(
253 |     "--dir-template",
254 |     type=str,
255 |     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
256 |     help="Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
257 | )
258 | def download_from_manifest(
259 |     manifest_file,
260 |     download_dir,
261 |     quiet,
262 |     validate_manifest,
263 |     show_progress_bar,
264 |     use_s5cmd_sync,
265 |     log_level,
266 |     dir_template,
267 | ):
268 |     """Download the manifest file.
269 |
270 |     In a series of steps, the manifest file is first validated to ensure every line contains a valid URL.
271 |     It then gets the total size to be downloaded, and runs the download in one
272 |     process while tracking the download progress in another.
273 |     """
274 |     # Set the logging level for the CLI module
275 |     set_log_level(log_level)
276 |     # Create an instance of the IDCClient
277 |     client = IDCClient()
278 |     logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
279 |     logger_cli.debug("Inputs received from cli manifest download:")
280 |     logger_cli.debug(f"manifest_file_path: {manifest_file}")
281 |     logger_cli.debug(f"download_dir: {download_dir}")
282 |     logger_cli.debug(f"validate_manifest: {validate_manifest}")
283 |     logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
284 |     logger_cli.debug(f"use_s5cmd_sync: {use_s5cmd_sync}")
285 |     logger_cli.debug(f"dirTemplate: {dir_template}")
286 |
287 |     # Call IDCClient's download_from_manifest method with the provided parameters
288 |     client.download_from_manifest(
289 |         manifestFile=manifest_file,
290 |         downloadDir=download_dir,
291 |         quiet=quiet,
292 |         validate_manifest=validate_manifest,
293 |         show_progress_bar=show_progress_bar,
294 |         use_s5cmd_sync=use_s5cmd_sync,
295 |         dirTemplate=dir_template,
296 |     )
297 |
298 |
299 | @idc.command()
300 | @click.argument(
301 |     "generic_argument",
302 |     type=str,
303 | )
304 | @click.option(
305 |     "--download-dir",
306 |     required=False,
307 |     type=click.Path(),
308 |     help="Path to the directory to download the files to.",
309 | )
310 | @click.option(
311 |     "--dir-template",
312 |     type=str,
313 |     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
314 |     help="Download directory hierarchy template.
This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
315 | )
316 | @click.option(
317 |     "--log-level",
318 |     type=click.Choice(
319 |         ["debug", "info", "warning", "error", "critical"], case_sensitive=False
320 |     ),
321 |     default="info",
322 |     help="Set the logging level for the CLI module.",
323 | )
324 | def download(generic_argument, download_dir, dir_template, log_level):
325 |     """Download content given the input parameter.
326 |
327 |     Determine whether the input parameter corresponds to a file manifest or a list of collection_id, PatientID, StudyInstanceUID, or SeriesInstanceUID values, and download the corresponding files into the current directory. Default parameters will be used for organizing the downloaded files into a folder hierarchy. Use `download_from_selection()` and `download_from_manifest()` functions if granular control over the download process is needed.
328 |     """
329 |     # Set the logging level for the CLI module
330 |     set_log_level(log_level)
331 |     # Create an instance of the IDCClient
332 |     client = IDCClient()
333 |
334 |     logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
335 |
336 |     if download_dir:
337 |         download_dir = Path(download_dir)
338 |     else:
339 |         download_dir = Path.cwd()
340 |
341 |     if (
342 |         len(generic_argument) < _get_max_path_length()
343 |         and Path(generic_argument).is_file()
344 |     ):
345 |         # Parse the input parameters and pass them to IDC
346 |         logger_cli.info("Detected manifest file, downloading from manifest.")
347 |         client.download_from_manifest(
348 |             generic_argument, downloadDir=download_dir, dirTemplate=dir_template
349 |         )
350 |     # this is not a file manifest
351 |     else:
352 |         # Split the input string and filter out any empty values
353 |         item_ids = [item for item in generic_argument.split(",") if item]
354 |
355 |         if not item_ids:
356 |             logger_cli.error("No valid IDs provided.")
357 |
358 |         index_df = client.index
359 |
360 |         def check_and_download(column_name, item_ids, download_dir, kwarg_name):
361 |             matches = index_df[column_name].isin(item_ids)
362 |             matched_ids = index_df[column_name][matches].unique().tolist()
363 |             if not matched_ids:
364 |                 return False
365 |             unmatched_ids = list(set(item_ids) - set(matched_ids))
366 |             if unmatched_ids:
367 |                 logger_cli.debug(
368 |                     f"Partial match for {column_name}: matched {matched_ids}, unmatched {unmatched_ids}"
369 |                 )
370 |             logger_cli.info(f"Identified matching {column_name}: {matched_ids}")
371 |             client.download_from_selection(
372 |                 **{
373 |                     kwarg_name: matched_ids,
374 |                     "downloadDir": download_dir,
375 |                     "dirTemplate": dir_template,
376 |                 }
377 |             )
378 |             return True
379 |
380 |         matches_found = 0
381 |         matches_found += check_and_download(
382 |             "collection_id", item_ids, download_dir, "collection_id"
383 |         )
384 |         matches_found += check_and_download(
385 |             "PatientID", item_ids, download_dir, "patientId"
386 |         )
387 |         matches_found += check_and_download(
"StudyInstanceUID", item_ids, download_dir, "studyInstanceUID" 389 | ) 390 | matches_found += check_and_download( 391 | "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID" 392 | ) 393 | matches_found += check_and_download( 394 | "crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid" 395 | ) 396 | if not matches_found: 397 | logger_cli.error( 398 | "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid." 399 | ) 400 | 401 | 402 | if __name__ == "__main__": 403 | idc() 404 | -------------------------------------------------------------------------------- /idc_index/index.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=too-many-lines 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import os 7 | import re 8 | import shutil 9 | import subprocess 10 | import tempfile 11 | import time 12 | from importlib.metadata import distribution, version 13 | from pathlib import Path 14 | 15 | import duckdb 16 | import idc_index_data 17 | import pandas as pd 18 | import platformdirs 19 | import psutil 20 | import requests 21 | from packaging.version import Version 22 | from tqdm import tqdm 23 | 24 | aws_endpoint_url = "https://s3.amazonaws.com" 25 | gcp_endpoint_url = "https://storage.googleapis.com" 26 | asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" 27 | 28 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | class IDCClient: 33 | # Default download hierarchy template 34 | DOWNLOAD_HIERARCHY_DEFAULT = ( 35 | "%collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID" 36 | ) 37 | 38 | # Defined citation formats that can be passed to the citations request methods 39 | # see acceptable values at https://citation.crosscite.org/docs.html#sec-4 40 | CITATION_FORMAT_APA = "text/x-bibliography; style=apa; locale=en-US" 41 | CITATION_FORMAT_TURTLE = "text/turtle" 42 | CITATION_FORMAT_JSON = "application/vnd.citationstyles.csl+json" 43 | CITATION_FORMAT_BIBTEX = "application/x-bibtex" 44 | 45 | # Singleton pattern 46 | # NOTE: In the future, one may want to use multiple clients e.g. for sub-datasets so a attribute-singleton as shown below seems a better option. 
47 | # _instance: IDCClient 48 | # def __new__(cls): 49 | # if not hasattr(cls, "_instance") or getattr(cls, "_instance") is None: 50 | # instance = super(IDCClient, cls).__new__(cls) 51 | # setattr(cls, "_instance", instance) 52 | # return cls._instance 53 | 54 | _client: IDCClient 55 | 56 | @classmethod 57 | def client(cls) -> IDCClient: 58 | if not hasattr(cls, "_client") or getattr(cls, "_client") is None: 59 | setattr(cls, "_client", IDCClient()) 60 | 61 | return cls._client 62 | 63 | def __init__(self): 64 | # Read main index file 65 | file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH 66 | logger.debug(f"Reading index file v{idc_index_data.__version__}") 67 | self.index = pd.read_parquet(file_path) 68 | 69 | # initialize crdc_series_uuid for the index 70 | # TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113 71 | # is merged (to minimize disruption), it will make more sense to change 72 | # idc-index-data to separate bucket from crdc_series_uuid, add support for GCP, 73 | # and consequently simplify the code here 74 | self.index["crdc_series_uuid"] = ( 75 | self.index["series_aws_url"].str.split("/").str[3] 76 | ) 77 | 78 | self.prior_versions_index_path = ( 79 | idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH 80 | ) 81 | file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH 82 | 83 | self.prior_versions_index = pd.read_parquet(file_path) 84 | 85 | # self.index = self.index.astype(str).replace("nan", "") 86 | self.index["series_size_MB"] = self.index["series_size_MB"].astype(float) 87 | self.collection_summary = self.index.groupby("collection_id").agg( 88 | {"Modality": pd.Series.unique, "series_size_MB": "sum"} 89 | ) 90 | 91 | self.idc_version = f"v{Version(idc_index_data.__version__).major}" 92 | 93 | # since indices can change between versions, we need to store them in a versioned directory 94 | self.indices_data_dir = platformdirs.user_data_dir( 95 | "idc_index_data", "IDC", version=version("idc-index-data") 96 | ) 97 | # these are the items that are fetched from IDC release assets (e.g., clinical data files) 98 | self.idc_data_dir = platformdirs.user_data_dir( 99 | "IDC", "IDC", version=self.idc_version 100 | ) 101 | self.clinical_data_dir = None 102 | 103 | self.indices_overview = { 104 | "index": { 105 | "description": "Main index containing one row per DICOM series.", 106 | "installed": True, 107 | "url": None, 108 | "file_path": idc_index_data.IDC_INDEX_PARQUET_FILEPATH, 109 | }, 110 | "prior_versions_index": { 111 | "description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.", 112 | "installed": True, 113 | "url": None, 114 | "file_path": idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH, 115 | }, 116 | "sm_index": { 117 | "description": "DICOM Slide Microscopy series-level index.", 118 | "installed": False, 119 | "url": f"{asset_endpoint_url}/sm_index.parquet", 120 | "file_path": None, 121 | }, 122 | "sm_instance_index": { 123 | "description": "DICOM Slide Microscopy instance-level index.", 124 | "installed": False, 125 | "url": f"{asset_endpoint_url}/sm_instance_index.parquet", 126 | "file_path": None, 127 | }, 128 | "clinical_index": { 129 | "description": "Index of clinical data accompanying the available images.", 130 | "installed": False, 131 | "url": f"{asset_endpoint_url}/clinical_index.parquet", 132 | "file_path": None, 133 | }, 134 | } 135 | 136 | # these will point to the dataframes containing the respective indices, once installed 137 | 
        self.sm_index = None
138 |         self.sm_instance_index = None
139 |         self.clinical_index = None
140 | 
141 |         # Lookup s5cmd
142 |         self.s5cmdPath = shutil.which("s5cmd")
143 |         if self.s5cmdPath is None:
144 |             # Workaround to support environments without a properly set up PATH
145 |             # See https://github.com/Slicer/Slicer/pull/7587
146 |             logger.debug("Falling back to looking up s5cmd alongside the package")
147 |             for script in distribution("s5cmd").files:
148 |                 if str(script).startswith("s5cmd/bin/s5cmd"):
149 |                     self.s5cmdPath = script.locate().resolve(strict=True)
150 |                     break
151 |         if self.s5cmdPath is None:
152 |             raise FileNotFoundError(
153 |                 "s5cmd executable not found. Please install s5cmd from https://github.com/peak/s5cmd#installation"
154 |             )
155 |         self.s5cmdPath = str(self.s5cmdPath)
156 |         logger.debug(f"Found s5cmd executable: {self.s5cmdPath}")
157 |         # ... and check it can be executed
158 |         subprocess.check_call([self.s5cmdPath, "--help"], stdout=subprocess.DEVNULL)
159 | 
160 |     @staticmethod
161 |     def _replace_aws_with_gcp_buckets(dataframe, column_name):
162 |         # mapping from AWS to GCS buckets is fixed
163 |         replacements = {
164 |             r"s3://idc-open-data-two/": r"s3://idc-open-idc1/",
165 |             r"s3://idc-open-data-cr/": r"s3://idc-open-cr/",
166 |             # as of IDC v20, we use a new bucket that has the same name as AWS
167 |             # for `idc-open-data` - no need to replace
168 |             # r"s3://idc-open-data/": r"s3://public-datasets-idc/",
169 |         }
170 | 
171 |         # Function to apply replacements
172 |         def replace_url_parts(url):
173 |             for old, new in replacements.items():
174 |                 url = re.sub(old, new, url)
175 |             return url
176 | 
177 |         # Apply the replacements to the requested column
178 |         dataframe[column_name] = dataframe[column_name].apply(replace_url_parts)
179 | 
180 |     @staticmethod
181 |     def _filter_dataframe_by_id(key, dataframe, _id):
182 |         values = _id
183 |         if isinstance(_id, str):
184 |             values = [_id]
185 |         filtered_df = dataframe[dataframe[key].isin(values)].copy()
186 |         if filtered_df.empty:
187 |             error_message = f"No data found for the {key} with the values {values}."
188 |             raise ValueError(error_message)
189 |         return filtered_df
190 | 
191 |     @staticmethod
192 |     def _safe_filter_by_selection(
193 |         df_index,
194 |         collection_id,
195 |         patientId,
196 |         studyInstanceUID,
197 |         seriesInstanceUID,
198 |         sopInstanceUID,
199 |         crdc_series_uuid,
200 |     ):
201 |         if collection_id is not None:
202 |             if not isinstance(collection_id, str) and not isinstance(
203 |                 collection_id, list
204 |             ):
205 |                 raise TypeError("collection_id must be a string or list of strings")
206 |         if patientId is not None:
207 |             if not isinstance(patientId, str) and not isinstance(patientId, list):
208 |                 raise TypeError("patientId must be a string or list of strings")
209 |         if studyInstanceUID is not None:
210 |             if not isinstance(studyInstanceUID, str) and not isinstance(
211 |                 studyInstanceUID, list
212 |             ):
213 |                 raise TypeError("studyInstanceUID must be a string or list of strings")
214 |         if seriesInstanceUID is not None:
215 |             if not isinstance(seriesInstanceUID, str) and not isinstance(
216 |                 seriesInstanceUID, list
217 |             ):
218 |                 raise TypeError("seriesInstanceUID must be a string or list of strings")
219 |         if sopInstanceUID is not None:
220 |             if not isinstance(sopInstanceUID, str) and not isinstance(
221 |                 sopInstanceUID, list
222 |             ):
223 |                 raise TypeError("sopInstanceUID must be a string or list of strings")
224 | 
225 |         if crdc_series_uuid is not None:
226 |             if not isinstance(crdc_series_uuid, str) and not isinstance(
227 |                 crdc_series_uuid, list
228 |             ):
229 |                 raise TypeError("crdc_series_uuid must be a string or list of strings")
230 | 
231 |         # Here we go from the bottom of the filtering hierarchy up, taking into
232 |         # account the direction of the one-to-many relationships:
233 |         # one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
234 |         # one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
235 |         # one StudyInstanceUID can be associated with one and only one PatientID
236 |         # one PatientID can be associated with one and only one collection_id
237 |         # Because of this, we do not need to apply any attributes that sit above the
238 |         # most specific attribute given in the hierarchy.
239 |         # The earlier implemented behavior was, unfortunately, a relic of the API of a
240 |         # different system that influenced the API of SlicerIDCIndex and propagated into idc-index.
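        # For example (an illustrative consequence of the above): a
        # seriesInstanceUID filter alone already pins down the StudyInstanceUID,
        # PatientID and collection_id of every matching row, so a collection_id
        # value passed alongside it would be redundant.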
241 | 
242 |         if crdc_series_uuid is not None:
243 |             result_df = IDCClient._filter_dataframe_by_id(
244 |                 "crdc_series_uuid", df_index, crdc_series_uuid
245 |             )
246 |             return result_df
247 | 
248 |         if sopInstanceUID is not None:
249 |             result_df = IDCClient._filter_by_dicom_instance_uid(
250 |                 df_index, sopInstanceUID
251 |             )
252 |             return result_df
253 | 
254 |         if seriesInstanceUID is not None:
255 |             result_df = IDCClient._filter_by_dicom_series_uid(
256 |                 df_index, seriesInstanceUID
257 |             )
258 |             return result_df
259 | 
260 |         if studyInstanceUID is not None:
261 |             result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
262 |             return result_df
263 | 
264 |         if patientId is not None:
265 |             result_df = IDCClient._filter_by_patient_id(df_index, patientId)
266 |             return result_df
267 | 
268 |         if collection_id is not None:
269 |             result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
270 |             return result_df
271 | 
272 |         return None
273 | 
274 |     @staticmethod
275 |     def _filter_by_collection_id(df_index, collection_id):
276 |         return IDCClient._filter_dataframe_by_id(
277 |             "collection_id", df_index, collection_id
278 |         )
279 | 
280 |     @staticmethod
281 |     def _filter_by_patient_id(df_index, patient_id):
282 |         return IDCClient._filter_dataframe_by_id("PatientID", df_index, patient_id)
283 | 
284 |     @staticmethod
285 |     def _filter_by_dicom_study_uid(df_index, dicom_study_uid):
286 |         return IDCClient._filter_dataframe_by_id(
287 |             "StudyInstanceUID", df_index, dicom_study_uid
288 |         )
289 | 
290 |     @staticmethod
291 |     def _filter_by_dicom_series_uid(df_index, dicom_series_uid):
292 |         return IDCClient._filter_dataframe_by_id(
293 |             "SeriesInstanceUID", df_index, dicom_series_uid
294 |         )
295 | 
296 |     @staticmethod
297 |     def _filter_by_dicom_instance_uid(df_index, dicom_instance_uid):
298 |         return IDCClient._filter_dataframe_by_id(
299 |             "SOPInstanceUID", df_index, dicom_instance_uid
300 |         )
301 | 
302 |     @staticmethod
303 |     def get_idc_version():
304 |         """
305 |         Returns the version of IDC data used in idc-index
306 |         """
307 |         idc_version = Version(idc_index_data.__version__).major
308 |         return f"v{idc_version}"
309 | 
310 |     @staticmethod
311 |     def _check_create_directory(download_dir):
312 |         """
313 |         Mimic the behavior of s5cmd and create the download directory if it does not exist
314 |         """
315 |         download_dir = Path(download_dir)
316 |         download_dir.mkdir(parents=True, exist_ok=True)
317 | 
318 |         return str(download_dir.resolve())
319 | 
320 |     def _check_disk_size_and_warn(self, download_dir, disk_size_needed):
321 |         disk_free_space_MB = psutil.disk_usage(download_dir).free / (1000 * 1000)
322 |         logger.info("Disk size needed: " + self._format_size(disk_size_needed))
323 |         logger.info("Disk size available: " + self._format_size(disk_free_space_MB))
324 |         if disk_free_space_MB < disk_size_needed:
325 |             logger.error("Not enough free space on disk to download the files.")
326 |             return False
327 |         return True
328 | 
329 |     def fetch_index(self, index_name) -> None:
330 |         """
331 |         Downloads the requested index and stores it as the corresponding attribute of the client.
332 | 
333 |         Args:
334 |             index_name (str): Name of the index to be downloaded.
335 |         """
336 | 
337 |         if index_name not in self.indices_overview:
338 |             logger.error(f"Index {index_name} is not available and cannot be fetched.")
339 |         elif self.indices_overview[index_name]["installed"]:
340 |             logger.warning(
341 |                 f"Index {index_name} already installed and will not be fetched again."
342 | ) 343 | else: 344 | logger.info("Fetching index %s", index_name) 345 | response = requests.get( 346 | self.indices_overview[index_name]["url"], timeout=30 347 | ) 348 | if response.status_code == 200: 349 | filepath = os.path.join( 350 | self.indices_data_dir, 351 | f"{index_name}.parquet", 352 | ) 353 | 354 | os.makedirs(os.path.dirname(filepath), exist_ok=True) 355 | with open(filepath, mode="wb") as file: 356 | file.write(response.content) 357 | 358 | index_table = pd.read_parquet(filepath) 359 | # index_table = index_table.merge( 360 | # self.index[["series_aws_url", "SeriesInstanceUID"]], 361 | # on="SeriesInstanceUID", how="left" 362 | # ) 363 | # TODO: consider switching to class variable! 364 | # setattr(self.__class__, index_name, index_table) 365 | setattr(self, index_name, index_table) 366 | self.indices_overview[index_name]["installed"] = True 367 | self.indices_overview[index_name]["file_path"] = filepath 368 | 369 | else: 370 | logger.error( 371 | f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}" 372 | ) 373 | # if clinical_index is requested, likely the user will need clinical data 374 | # download it here, given that the size is small (<2MB as of IDC v19) 375 | if index_name == "clinical_index": 376 | logger.info( 377 | "Since clinical_index was fetched, also installing corresponding tables." 378 | ) 379 | # create clinical_data folder under self.idc_data_dir, if it does not exist 380 | self.clinical_data_dir = os.path.join(self.idc_data_dir, "clinical_data") 381 | idc_clinical_data_release_url = f"s3://idc-open-metadata/bigquery_export/idc_{self.idc_version}_clinical/*" 382 | result = subprocess.run( 383 | [ 384 | self.s5cmdPath, 385 | "--no-sign-request", 386 | "cp", 387 | idc_clinical_data_release_url, 388 | self.clinical_data_dir, 389 | ], 390 | capture_output=True, 391 | text=True, 392 | check=True, 393 | ) 394 | if result.stderr and result.stdout.startswith("ERROR"): 395 | logger.error("Failed to download IDC clinical data.") 396 | else: 397 | logger.info( 398 | "IDC clinical data downloaded successfully to %s", 399 | self.clinical_data_dir, 400 | ) 401 | 402 | def get_clinical_table(self, table_name): 403 | """ 404 | Returns the requested clinical table as a pandas DataFrame. 405 | 406 | Args: 407 | table_name (str): Name of the clinical table to be loaded. 408 | 409 | Returns: 410 | pandas.DataFrame: The requested clinical table. 411 | """ 412 | if self.clinical_data_dir is None: 413 | logger.error( 414 | "Clinical data directory is not available. Please fetch clinical_index first." 415 | ) 416 | return None 417 | 418 | table_path = os.path.join(self.clinical_data_dir, table_name) 419 | if not os.path.exists(table_path): 420 | logger.error(f"Table {table_name} is not found in {table_path}.") 421 | return None 422 | 423 | return pd.read_parquet(table_path) 424 | 425 | def get_collections(self): 426 | """ 427 | Returns the collections present in IDC 428 | """ 429 | unique_collections = self.index["collection_id"].unique() 430 | return unique_collections.tolist() 431 | 432 | def get_series_size(self, seriesInstanceUID): 433 | """ 434 | Gets cumulative size (MB) of the DICOM instances in a given SeriesInstanceUID. 435 | 436 | Args: 437 | seriesInstanceUID (str): The DICOM SeriesInstanceUID. 438 | 439 | Returns: 440 | float: The cumulative size of the DICOM instances in the given SeriesInstanceUID rounded to two digits, in MB. 441 | 442 | Raises: 443 | ValueError: If the `seriesInstanceUID` does not exist. 
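
        Example:
            A minimal usage sketch (the UID below is a placeholder):

                client = IDCClient()
                size_mb = client.get_series_size("1.2.840.113654.2.55.123456789")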
444 |         """
445 | 
446 |         resp = self.index[self.index["SeriesInstanceUID"] == seriesInstanceUID][
447 |             "series_size_MB"
448 |         ].iloc[0]
449 |         return resp
450 | 
451 |     def get_patients(self, collection_id, outputFormat="dict"):
452 |         """
453 |         Gets the patients in a collection.
454 | 
455 |         Args:
456 |             collection_id (str or list[str]): The collection id or list of collection ids. This should be in lower case separated by underscores.
457 |                 For example, 'pdmr_texture_analysis' or ['pdmr_texture_analysis', 'nlst'].
458 | 
459 |             outputFormat (str): The format in which to return the patient IDs. Available options are 'dict',
460 |                 'df', and 'list'. Default is 'dict'.
461 | 
462 |         Returns:
463 |             dict or pandas.DataFrame or list: Patient IDs in the requested output format. By default, it returns a dictionary.
464 | 
465 |         Raises:
466 |             ValueError: If `outputFormat` is not one of 'dict', 'df', 'list'.
467 |         """
468 | 
469 |         if not isinstance(collection_id, str) and not isinstance(collection_id, list):
470 |             raise TypeError("collection_id must be a string or list of strings")
471 | 
472 |         if outputFormat not in ["dict", "df", "list"]:
473 |             raise ValueError("outputFormat must be either 'dict', 'df', or 'list'")
474 | 
475 |         patient_df = self._filter_by_collection_id(self.index, collection_id)
476 | 
477 |         if outputFormat == "list":
478 |             response = patient_df["PatientID"].unique().tolist()
479 |         else:
480 |             sql = """
481 |                 SELECT
482 |                     PatientID,
483 |                     STRING_AGG(DISTINCT PatientSex) as PatientSex,
484 |                     STRING_AGG(DISTINCT PatientAge) as PatientAge
485 |                 FROM
486 |                     patient_df
487 |                 GROUP BY
488 |                     PatientID
489 |                 ORDER BY
490 |                     PatientID
491 |                 """
492 |             patient_df = duckdb.sql(sql).df()
493 |             # Convert DataFrame to a list of dictionaries for the API-like response
494 |             if outputFormat == "dict":
495 |                 response = patient_df.to_dict(orient="records")
496 |             else:
497 |                 response = patient_df
498 | 
499 |         logger.debug("Get patient response: %s", str(response))
500 | 
501 |         return response
502 | 
503 |     def get_dicom_studies(self, patientId, outputFormat="dict"):
504 |         """
505 |         Returns Studies for a given patient or list of patients.
506 | 
507 |         Args:
508 |             patientId (str or list of str): The patient Id or a list of patient Ids.
509 | 
510 |             outputFormat (str): The format in which to return the studies. Available options are 'dict',
511 |                 'df', and 'list'. Default is 'dict'.
512 | 
513 |         Returns:
514 |             dict or pandas.DataFrame or list: Studies in the requested output format. By default, it returns a dictionary.
515 | 
516 |         Raises:
517 |             ValueError: If `outputFormat` is not one of 'dict', 'df', 'list'.
518 |             ValueError: If any of the `patientId` values do not exist.
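
        Example:
            A usage sketch (the PatientID below is illustrative):

                client = IDCClient()
                study_uids = client.get_dicom_studies(
                    patientId="LIDC-IDRI-0001", outputFormat="list"
                )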
519 | """ 520 | 521 | if not isinstance(patientId, str) and not isinstance(patientId, list): 522 | raise TypeError("patientId must be a string or list of strings") 523 | 524 | if outputFormat not in ["dict", "df", "list"]: 525 | raise ValueError("outputFormat must be either 'dict' or 'df' or 'list'") 526 | 527 | studies_df = self._filter_by_patient_id(self.index, patientId) 528 | 529 | if outputFormat == "list": 530 | response = studies_df["StudyInstanceUID"].unique().tolist() 531 | else: 532 | sql = """ 533 | SELECT 534 | StudyInstanceUID, 535 | STRING_AGG(DISTINCT StudyDate) as StudyDate, 536 | STRING_AGG(DISTINCT StudyDescription) as StudyDescription, 537 | COUNT(SeriesInstanceUID) as SeriesCount 538 | FROM 539 | studies_df 540 | GROUP BY 541 | StudyInstanceUID 542 | ORDER BY 543 | 2,3,4 544 | """ 545 | studies_df = duckdb.query(sql).df() 546 | 547 | if outputFormat == "dict": 548 | response = studies_df.to_dict(orient="records") 549 | else: 550 | response = studies_df 551 | 552 | logger.debug("Get patient study response: %s", str(response)) 553 | 554 | return response 555 | 556 | def get_dicom_series(self, studyInstanceUID, outputFormat="dict"): 557 | """ 558 | Returns Series for a given study or list of studies. 559 | 560 | Args: 561 | studyInstanceUID (str or list of str): The DICOM StudyInstanceUID or a list of StudyInstanceUIDs. 562 | 563 | outputFormat (str): The format in which to return the series. Available options are 'dict', 564 | 'df', and 'list'. Default is 'dict'. 565 | 566 | Returns: 567 | dict or pandas.DataFrame or list: Series in the requested output format. By default, it returns a dictionary. 568 | 569 | Raises: 570 | ValueError: If `outputFormat` is not one of 'dict', 'df', 'list'. 571 | ValueError: If any of the `studyInstanceUID` does not exist. 
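
        Example:
            A usage sketch (the UID below is a placeholder):

                client = IDCClient()
                series_df = client.get_dicom_series(
                    studyInstanceUID="1.2.840.113654.2.55.987654321", outputFormat="df"
                )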
572 | """ 573 | 574 | if not isinstance(studyInstanceUID, str) and not isinstance( 575 | studyInstanceUID, list 576 | ): 577 | raise TypeError("studyInstanceUID must be a string or list of strings") 578 | 579 | if outputFormat not in ["dict", "df", "list"]: 580 | raise ValueError("outputFormat must be either 'dict' or 'df' or 'list'") 581 | 582 | series_df = self._filter_by_dicom_study_uid(self.index, studyInstanceUID) 583 | 584 | if outputFormat == "list": 585 | response = series_df["SeriesInstanceUID"].unique().tolist() 586 | else: 587 | series_df = series_df.rename( 588 | columns={ 589 | "collection_id": "Collection", 590 | "instanceCount": "instance_count", 591 | } 592 | ) 593 | series_df["ImageCount"] = 1 594 | series_df = series_df[ 595 | [ 596 | "StudyInstanceUID", 597 | "SeriesInstanceUID", 598 | "Modality", 599 | "SeriesDate", 600 | "Collection", 601 | "BodyPartExamined", 602 | "SeriesDescription", 603 | "Manufacturer", 604 | "ManufacturerModelName", 605 | "series_size_MB", 606 | "SeriesNumber", 607 | "instance_count", 608 | "ImageCount", 609 | ] 610 | ] 611 | 612 | series_df = series_df.drop_duplicates().sort_values( 613 | by=[ 614 | "Modality", 615 | "SeriesDate", 616 | "SeriesDescription", 617 | "BodyPartExamined", 618 | "SeriesNumber", 619 | ] 620 | ) 621 | # Convert DataFrame to a list of dictionaries for the API-like response 622 | if outputFormat == "dict": 623 | response = series_df.to_dict(orient="records") 624 | else: 625 | response = series_df 626 | logger.debug("Get series response: %s", str(response)) 627 | 628 | return response 629 | 630 | def get_series_file_URLs(self, seriesInstanceUID, source_bucket_location="aws"): 631 | """ 632 | Get the URLs of the files corresponding to the DICOM instances in a given SeriesInstanceUID. 633 | 634 | Args: 635 | SeriesInstanceUID: string containing the value of DICOM SeriesInstanceUID to filter by 636 | 637 | Returns: 638 | list of strings containing the AWS S3 URLs of the files corresponding to the SeriesInstanceUID 639 | """ 640 | if seriesInstanceUID not in self.index["SeriesInstanceUID"].values: 641 | raise ValueError("SeriesInstanceUID not found in IDC index.") 642 | 643 | selected_series_df = self.index[ 644 | self.index["SeriesInstanceUID"] == seriesInstanceUID 645 | ].copy() 646 | selected_series_df["series_aws_url"] = ( 647 | "s3://" 648 | + selected_series_df["aws_bucket"] 649 | + "/" 650 | + selected_series_df["crdc_series_uuid"] 651 | + "/" 652 | ) 653 | 654 | endpoint = aws_endpoint_url 655 | if source_bucket_location == "gcp": 656 | self._replace_aws_with_gcp_buckets(selected_series_df, "series_aws_url") 657 | endpoint = gcp_endpoint_url 658 | 659 | s3_url = selected_series_df["series_aws_url"].values[0] 660 | 661 | # Run the s5cmd ls command and capture its output 662 | result = subprocess.run( 663 | [ 664 | self.s5cmdPath, 665 | "--endpoint-url", 666 | endpoint, 667 | "--no-sign-request", 668 | "ls", 669 | s3_url, 670 | ], 671 | stdout=subprocess.PIPE, 672 | check=False, 673 | ) 674 | output = result.stdout.decode("utf-8") 675 | 676 | # Parse the output to get the file names 677 | lines = output.split("\n") 678 | file_names = [ 679 | s3_url + line.split()[-1] 680 | for line in lines 681 | if line and line.split()[-1].endswith(".dcm") 682 | ] 683 | 684 | return file_names 685 | 686 | def get_instance_file_URL(self, sopInstanceUID, source_bucket_location="aws"): 687 | """ 688 | Get the bucket URL of the file corresponding to a given SOPInstanceUID. 
689 | 
690 |         This function will only return the URL for Slide Microscopy (SM) instances,
691 |         which are maintained in the `sm_instance_index` table.
692 | 
693 |         Args:
694 |             sopInstanceUID: string containing the value of DICOM SOPInstanceUID
695 |             source_bucket_location: string containing the source bucket location, either "aws" or "gcp"
696 | 
697 |         Returns:
698 |             string containing the bucket URL of the file corresponding to the SOPInstanceUID,
699 |             or None if the SOPInstanceUID is not recognized
700 |         """
701 | 
702 |         # sm_instance_index is required to complete this operation - install it!
703 |         self.fetch_index("sm_instance_index")
704 | 
705 |         if self.sm_instance_index is None:
706 |             logger.error(
707 |                 "sm_instance_index could not be installed. Please install it first using fetch_index."
708 |             )
709 |             return None
710 | 
711 |         if sopInstanceUID not in self.sm_instance_index["SOPInstanceUID"].values:  # pylint: disable=unsubscriptable-object
712 |             raise ValueError("SOPInstanceUID not found in IDC sm_instance_index.")
713 | 
714 |         # merge with the main index to get series_aws_url
715 |         selected_instance_df = self.sm_instance_index[  # pylint: disable=unsubscriptable-object
716 |             self.sm_instance_index["SOPInstanceUID"] == sopInstanceUID  # pylint: disable=unsubscriptable-object
717 |         ].copy()[["SeriesInstanceUID", "SOPInstanceUID", "crdc_instance_uuid"]]
718 |         selected_instance_df = pd.merge(
719 |             selected_instance_df,
720 |             self.index,
721 |             on="SeriesInstanceUID",
722 |             how="left",
723 |         )
724 | 
725 |         if source_bucket_location == "gcp":
726 |             # replace AWS with the GCP bucket
727 |             self._replace_aws_with_gcp_buckets(selected_instance_df, "series_aws_url")
728 | 
729 |         # instance files are named using crdc_instance_uuid
730 |         series_url = selected_instance_df.iloc[0]["series_aws_url"][:-1]
731 |         instance_uuid = selected_instance_df.iloc[0]["crdc_instance_uuid"]
732 |         return series_url + instance_uuid + ".dcm"
733 | 
734 |     def get_viewer_URL(
735 |         self, seriesInstanceUID=None, studyInstanceUID=None, viewer_selector=None
736 |     ):
737 |         """
738 |         Get the URL of the IDC viewer for the given series or study in IDC based on
739 |         the provided SeriesInstanceUID or StudyInstanceUID. If StudyInstanceUID is not provided,
740 |         it will be automatically deduced. If viewer_selector is not provided, default viewers
741 |         will be used (OHIF v3 for radiology modalities, and Slim for SM).
742 | 
743 |         This function will validate the provided SeriesInstanceUID or StudyInstanceUID against the IDC
744 |         index to ensure that the series or study is available in IDC.
745 | 
746 |         Args:
747 |             SeriesInstanceUID: string containing the value of DICOM SeriesInstanceUID for a series
748 |                 available in IDC
749 | 
750 |             StudyInstanceUID: string containing the value of DICOM StudyInstanceUID for a study
751 |                 available in IDC
752 | 
753 |             viewer_selector: string containing the name of the viewer to use. Must be one of the following:
754 |                 ohif_v2, ohif_v3, or slim. If not provided, default viewers will be used: slim for studies that contain SM modality and ohif_v3 for radiology.
755 | 
756 |         Returns:
757 |             string containing the IDC viewer URL for the requested selection
758 |         """
759 | 
760 |         if seriesInstanceUID is None and studyInstanceUID is None:
761 |             raise ValueError(
762 |                 "Either SeriesInstanceUID or StudyInstanceUID, or both, must be provided."
763 | ) 764 | 765 | if ( 766 | seriesInstanceUID is not None 767 | and seriesInstanceUID not in self.index["SeriesInstanceUID"].values 768 | ): 769 | raise ValueError("SeriesInstanceUID not found in IDC index.") 770 | 771 | if ( 772 | studyInstanceUID is not None 773 | and studyInstanceUID not in self.index["StudyInstanceUID"].values 774 | ): 775 | raise ValueError("StudyInstanceUID not found in IDC index.") 776 | 777 | if viewer_selector is not None and viewer_selector not in [ 778 | "ohif_v2", 779 | "ohif_v3", 780 | "slim", 781 | ]: 782 | raise ValueError( 783 | "viewer_selector must be one of 'ohif_v2', 'ohif_v3', or 'slim'." 784 | ) 785 | 786 | modality = None 787 | 788 | if studyInstanceUID is None: 789 | query = f""" 790 | SELECT 791 | DISTINCT(StudyInstanceUID), 792 | Modality 793 | FROM 794 | index 795 | WHERE 796 | SeriesInstanceUID='{seriesInstanceUID}' 797 | """ 798 | query_result = self.sql_query(query) 799 | studyInstanceUID = query_result.StudyInstanceUID[0] 800 | modality = query_result.Modality[0] 801 | 802 | else: 803 | query = f""" 804 | SELECT 805 | DISTINCT(Modality) 806 | FROM 807 | index 808 | WHERE 809 | StudyInstanceUID='{studyInstanceUID}' 810 | """ 811 | query_result = self.sql_query(query) 812 | modality = query_result.Modality[0] 813 | 814 | viewer_url = None 815 | if viewer_selector is None: 816 | if "SM" in modality: 817 | viewer_selector = "slim" 818 | else: 819 | viewer_selector = "ohif_v3" 820 | 821 | if viewer_selector == "ohif_v2": 822 | if seriesInstanceUID is None: 823 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/viewer/{studyInstanceUID}" 824 | else: 825 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/viewer/{studyInstanceUID}?SeriesInstanceUID={seriesInstanceUID}" 826 | elif viewer_selector == "ohif_v3": 827 | if seriesInstanceUID is None: 828 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}" 829 | else: 830 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}&SeriesInstanceUIDs={seriesInstanceUID}" 831 | elif viewer_selector == "volview": 832 | # TODO! Not implemented yet 833 | viewer_url = None 834 | elif viewer_selector == "slim": 835 | if seriesInstanceUID is None: 836 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/slim/studies/{studyInstanceUID}" 837 | else: 838 | viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/slim/studies/{studyInstanceUID}/series/{seriesInstanceUID}" 839 | 840 | return viewer_url 841 | 842 | def _validate_update_manifest_and_get_download_size( 843 | self, 844 | manifestFile, 845 | downloadDir, 846 | validate_manifest, 847 | use_s5cmd_sync, 848 | dirTemplate, 849 | ) -> tuple[float, str, Path]: 850 | """ 851 | Validates the manifest file by checking the URLs in the manifest 852 | 853 | Args: 854 | manifestFile (str): The path to the manifest file. 855 | downloadDir (str): The path to the download directory. 856 | validate_manifest (bool): If True, validates the manifest for any errors. Defaults to True. 857 | show_progress_bar (bool): If True, tracks the progress of download 858 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded 859 | dirTemplate (str): A template string for the directory path. Must start with %. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT. 
It can contain attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) wrapped in '%'. Special characters can be used as connectors: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). Can be disabled by None. 860 | 861 | Returns: 862 | total_size (float): The total size of all series in the manifest file. 863 | endpoint_to_use (str): The endpoint URL to use (either AWS or GCP). 864 | temp_manifest_file(Path): Path to the temporary manifest file for downstream steps 865 | Raises: 866 | ValueError: If the manifest file does not exist, if any URL in the manifest file is invalid, or if any URL is inaccessible in both AWS and GCP. 867 | Exception: If the manifest contains URLs from both AWS and GCP. 868 | """ 869 | logger.debug("manifest validation is requested: " + str(validate_manifest)) 870 | 871 | logger.debug("Parsing the manifest. Please wait..") 872 | # Read the manifest as a csv file 873 | manifest_df = pd.read_csv( 874 | manifestFile, comment="#", skip_blank_lines=True, header=None 875 | ) 876 | 877 | # Rename the column 878 | manifest_df.columns = ["manifest_cp_cmd"] 879 | 880 | # remove all rows that do not contain an S3 URL 881 | manifest_df = manifest_df[ 882 | manifest_df["manifest_cp_cmd"].str.contains(r"s3://", na=False) 883 | ] 884 | 885 | # create a copy of the index 886 | index_df_copy = self.index[ 887 | [ 888 | "SeriesInstanceUID", 889 | "aws_bucket", 890 | "crdc_series_uuid", 891 | "series_size_MB", 892 | "PatientID", 893 | "collection_id", 894 | "Modality", 895 | "StudyInstanceUID", 896 | ] 897 | ] 898 | prior_versions_index_df_copy = self.prior_versions_index[ 899 | [ 900 | "SeriesInstanceUID", 901 | "aws_bucket", 902 | "crdc_series_uuid", 903 | "series_size_MB", 904 | "PatientID", 905 | "collection_id", 906 | "Modality", 907 | "StudyInstanceUID", 908 | ] 909 | ] 910 | 911 | # use default hierarchy 912 | if dirTemplate is not None: 913 | hierarchy = self._generate_sql_concat_for_building_directory( 914 | dirTemplate=dirTemplate, downloadDir=downloadDir 915 | ) 916 | else: 917 | hierarchy = f"CONCAT('{downloadDir}')" 918 | 919 | # Extract s3 url and crdc_series_uuid from the manifest copy commands 920 | # Next, construct aws_series_url in the index and 921 | # try to verify if every series in the manifest is present in the index 922 | 923 | # ruff: noqa 924 | sql = f""" 925 | PRAGMA disable_progress_bar; 926 | WITH 927 | index_temp AS ( 928 | SELECT 929 | seriesInstanceUID, 930 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 931 | series_size_MB, 932 | {hierarchy} AS path, 933 | crdc_series_uuid AS index_crdc_series_uuid 934 | FROM 935 | index_df_copy), 936 | manifest_temp AS ( 937 | SELECT 938 | manifest_cp_cmd, 939 | REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, 940 | REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') AS s3_url, 941 | FROM 942 | manifest_df 943 | WHERE 944 | REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') IS NOT NULL) 945 | SELECT 946 | seriesInstanceuid, 947 | index_crdc_series_uuid, 948 | s3_url, 949 | path, 950 | series_size_MB, 951 | index_crdc_series_uuid is not NULL as crdc_series_uuid_match, 952 | s3_url==series_aws_url AS s3_url_match, 953 | manifest_temp.manifest_cp_cmd, 954 | CASE 955 | WHEN s3_url==series_aws_url THEN 'aws' 956 | ELSE 957 | 'unknown' 958 | END 959 | AS endpoint 960 | FROM 961 | manifest_temp 962 | LEFT JOIN 963 | index_temp 964 | ON 965 | index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid 966 | 
""" 967 | # ruff: noqa: end 968 | merged_df = duckdb.query(sql).df() 969 | 970 | endpoint_to_use = None 971 | 972 | if not all(merged_df["crdc_series_uuid_match"]): 973 | missing_manifest_cp_cmds = merged_df.loc[ 974 | ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" 975 | ] 976 | missing_in_main_cnt = len(missing_manifest_cp_cmds.tolist()) 977 | logger.warning( 978 | f"The total of {missing_in_main_cnt} copy commands are not recognized as referencing any associated series in the main index.\n" 979 | "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" 980 | f"different from {self.get_idc_version()} used in this version of idc-index. Prior data releases will be checked next." 981 | ) 982 | 983 | logger.debug( 984 | "Checking if the requested data is available in other idc versions " 985 | ) 986 | 987 | missing_series_sql = f""" 988 | PRAGMA disable_progress_bar; 989 | WITH 990 | index_temp AS 991 | (SELECT 992 | seriesInstanceUID, 993 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 994 | series_size_MB, 995 | {hierarchy} AS path, 996 | crdc_series_uuid AS index_crdc_series_uuid 997 | FROM 998 | index_df_copy 999 | union by name 1000 | SELECT 1001 | seriesInstanceUID, 1002 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 1003 | series_size_MB, 1004 | {hierarchy} AS path, 1005 | crdc_series_uuid AS index_crdc_series_uuid 1006 | FROM 1007 | prior_versions_index_df_copy pvip 1008 | 1009 | ), 1010 | manifest_temp AS ( 1011 | SELECT 1012 | manifest_cp_cmd, 1013 | REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, 1014 | REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, 1015 | FROM 1016 | manifest_df 1017 | WHERE 1018 | REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') IS NOT NULL) 1019 | SELECT 1020 | seriesInstanceuid, 1021 | index_crdc_series_uuid, 1022 | s3_url, 1023 | path, 1024 | series_size_MB, 1025 | index_crdc_series_uuid is not NULL as crdc_series_uuid_match, 1026 | TRIM(s3_url) = TRIM(series_aws_url) AS s3_url_match, 1027 | manifest_temp.manifest_cp_cmd, 1028 | CASE 1029 | WHEN TRIM(s3_url) = TRIM(series_aws_url) THEN 'aws' 1030 | ELSE 1031 | 'unknown' 1032 | END 1033 | AS endpoint 1034 | FROM 1035 | manifest_temp 1036 | LEFT JOIN 1037 | index_temp 1038 | ON 1039 | index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid 1040 | """ 1041 | merged_df = duckdb.sql(missing_series_sql).df() 1042 | if not all(merged_df["crdc_series_uuid_match"]): 1043 | missing_manifest_cp_cmds = merged_df.loc[ 1044 | ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" 1045 | ] 1046 | logger.error( 1047 | "The following manifest copy commands are not recognized as referencing any associated series in any release of IDC.\n" 1048 | "These commands may be invalid. 
Please submit an issue on https://github.com/ImagingDataCommons/idc-index/issues \n" 1049 | "The corresponding files could not be downloaded.\n" 1050 | ) 1051 | logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) 1052 | else: 1053 | logger.info("All of the identifiers from manifest have been resolved!") 1054 | 1055 | # `idc-open-data` bucket is present in both AWS and GCP, this is why we skip checking endpoint 1056 | # for the URLs that contain `idc-open-data` 1057 | provider_specific_urls = merged_df[ 1058 | ~merged_df["s3_url"].str.contains("/idc-open-data/") 1059 | ] 1060 | 1061 | if validate_manifest: 1062 | # Check if there is more than one endpoint 1063 | if len(provider_specific_urls["endpoint"].unique()) > 1: 1064 | logger.error("A mix of endpoint s3_urls encountered!") 1065 | for endpoint in merged_df["endpoint"].unique(): 1066 | sample_s3_url = merged_df[ 1067 | merged_df["endpoint"] == endpoint 1068 | ].s3_url.values[0] 1069 | logger.error(f" Endpoint {endpoint} s3_url {sample_s3_url}") 1070 | raise ValueError( 1071 | "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. " 1072 | ) 1073 | elif provider_specific_urls.empty: 1074 | # if all URLs are from idc-open-data, default to AWS 1075 | endpoint_to_use = aws_endpoint_url 1076 | else: # provider_specific_urls["endpoint"].unique()) == 1 1077 | if provider_specific_urls["endpoint"].values[0] == "aws": 1078 | logging.debug("Detected AWS as the endpoint to use") 1079 | endpoint_to_use = aws_endpoint_url 1080 | else: # unknown / gcp 1081 | logging.debug("Will use GCS endpoint") 1082 | cmd = [ 1083 | self.s5cmdPath, 1084 | "--no-sign-request", 1085 | "--endpoint-url", 1086 | gcp_endpoint_url, 1087 | "ls", 1088 | merged_df.s3_url.values[0], 1089 | ] 1090 | process = subprocess.run( 1091 | cmd, capture_output=True, text=True, check=False 1092 | ) 1093 | if process.stderr and process.stdout.startswith("ERROR"): 1094 | logger.debug( 1095 | "Folder not available in GCP. Manifest appears to be invalid." 
1096 | ) 1097 | if validate_manifest: 1098 | raise ValueError 1099 | else: 1100 | endpoint_to_use = gcp_endpoint_url 1101 | 1102 | elif ( 1103 | provider_specific_urls.empty 1104 | or provider_specific_urls["endpoint"].values[0] == "aws" 1105 | ): 1106 | endpoint_to_use = aws_endpoint_url 1107 | else: 1108 | # TODO: here we assume that the endpoint is GCP; we could check at least the first URL to be sure, 1109 | # but we can take care of this in a more principled way by including GCP bucket directly 1110 | # in the future, see https://github.com/ImagingDataCommons/idc-index/pull/56#discussion_r1582157048 1111 | endpoint_to_use = gcp_endpoint_url 1112 | 1113 | # Calculate total size 1114 | total_size = merged_df["series_size_MB"].sum() 1115 | total_size = round(total_size, 2) 1116 | 1117 | # Write a temporary manifest file 1118 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_manifest_file: 1119 | if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: 1120 | if dirTemplate is not None: 1121 | merged_df["s5cmd_cmd"] = ( 1122 | "sync " 1123 | + merged_df["s3_url"] 1124 | + " " 1125 | + '"' 1126 | + merged_df["path"] 1127 | + '"' 1128 | ) 1129 | else: 1130 | merged_df["s5cmd_cmd"] = ( 1131 | "sync " + merged_df["s3_url"] + " " + '"' + downloadDir + '"' 1132 | ) 1133 | elif dirTemplate is not None: 1134 | merged_df["s5cmd_cmd"] = ( 1135 | "cp " + merged_df["s3_url"] + " " + '"' + merged_df["path"] + '"' 1136 | ) 1137 | else: 1138 | merged_df["s5cmd_cmd"] = ( 1139 | "cp " + merged_df["s3_url"] + " " + '"' + downloadDir + '"' 1140 | ) 1141 | 1142 | # Combine all commands into a single string with newline separators 1143 | commands = "\n".join(merged_df["s5cmd_cmd"]) 1144 | 1145 | temp_manifest_file.write(commands) 1146 | 1147 | logger.info("Parsing the manifest is finished. 
Download will begin soon") 1148 | 1149 | if dirTemplate is not None: 1150 | list_of_directories = merged_df.path.to_list() 1151 | else: 1152 | list_of_directories = [downloadDir] 1153 | 1154 | logger.debug(f"list of directories:{list_of_directories}") 1155 | return ( 1156 | total_size, 1157 | endpoint_to_use, 1158 | Path(temp_manifest_file.name), 1159 | list_of_directories, 1160 | merged_df[ 1161 | ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] 1162 | ], 1163 | ) 1164 | 1165 | @staticmethod 1166 | def _generate_sql_concat_for_building_directory(dirTemplate, downloadDir): 1167 | # for now, we limit the allowed columns to this list to make sure that all 1168 | # values are guaranteed to be non-empty and to not contain any special characters 1169 | # in the future, we should consider including more attributes 1170 | # also, if we allow any column, we should decide what we would do if the value is NULL 1171 | valid_attributes = [ 1172 | "PatientID", 1173 | "collection_id", 1174 | "Modality", 1175 | "StudyInstanceUID", 1176 | "SeriesInstanceUID", 1177 | ] 1178 | valid_separators = ["_", "-", "/"] 1179 | 1180 | updated_template = dirTemplate 1181 | 1182 | # validate input template by removing all valid attributes and separators 1183 | for attr in valid_attributes: 1184 | updated_template = updated_template.replace("%" + attr, "") 1185 | for sep in valid_separators: 1186 | updated_template = updated_template.replace(sep, "") 1187 | 1188 | if updated_template != "": 1189 | logger.error("Invalid download hierarchy template:" + updated_template) 1190 | logger.error( 1191 | "Make sure your template uses only valid attributes and separators" 1192 | ) 1193 | logger.error("Valid attributes: " + str(valid_attributes)) 1194 | logger.error("Valid separators: " + str(valid_separators)) 1195 | raise ValueError 1196 | 1197 | concat_command = dirTemplate 1198 | for attr in valid_attributes: 1199 | concat_command = concat_command.replace("%" + attr, f"', {attr},'") 1200 | 1201 | # CONCAT command may contain empty strings, and they are not harmless - 1202 | # duckdb does not like them! 1203 | # NB: double-quotes are not allowed by duckdb! 
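        # For example (a sketch): dirTemplate '%collection_id/%PatientID' with
        # downloadDir '/data' is first expanded to
        #   CONCAT('/data/','', collection_id,'/', PatientID,'')
        # and the replace() calls below strip the empty-string fragments,
        # leaving CONCAT('/data/', collection_id,'/', PatientID)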
1204 |         concat_command = f"CONCAT('{downloadDir}/','" + concat_command + "')"
1205 |         concat_command = concat_command.replace(",''", "")
1206 |         concat_command = concat_command.replace("'',", "")
1207 |         concat_command = concat_command.replace(",'',", "")
1208 |         return concat_command
1209 | 
1210 |     @staticmethod
1211 |     def _track_download_progress(
1212 |         size_MB: int,
1213 |         downloadDir: str,
1214 |         process: subprocess.Popen,
1215 |         show_progress_bar: bool = True,
1216 |         list_of_directories=None,
1217 |     ):
1218 |         logger.debug("Inputs received for tracking download:")
1219 |         logger.debug(f"size_MB: {size_MB}")
1220 |         logger.debug(f"downloadDir: {downloadDir}")
1221 |         logger.debug(f"show_progress_bar: {show_progress_bar}")
1222 | 
1223 |         runtime_errors = []
1224 | 
1225 |         if show_progress_bar:
1226 |             total_size_to_be_downloaded_bytes = size_MB * (10**6)
1227 |             initial_size_bytes = 0
1228 |             # Calculate the initial combined size of the tracked directories
1229 |             for directory in list_of_directories:
1230 |                 initial_size_bytes += IDCClient._get_dir_sum_file_size(directory)
1231 | 
1232 |             logger.info(
1233 |                 "Initial size of the destination directories: %s",
1234 |                 IDCClient._format_size(initial_size_bytes, size_in_bytes=True),
1235 |             )
1236 |             logger.info(
1237 |                 "Approximate size of the files that need to be downloaded: %s",
1238 |                 IDCClient._format_size(size_MB),
1239 |             )
1240 | 
1241 |             pbar = tqdm(
1242 |                 total=total_size_to_be_downloaded_bytes,
1243 |                 unit="B",
1244 |                 unit_scale=True,
1245 |                 desc="Downloading data",
1246 |             )
1247 | 
1248 |             while True:
1249 |                 time.sleep(0.5)
1250 |                 downloaded_bytes = 0
1251 |                 for directory in list_of_directories:
1252 |                     downloaded_bytes += IDCClient._get_dir_sum_file_size(directory)
1253 |                 downloaded_bytes -= initial_size_bytes
1254 |                 pbar.n = min(
1255 |                     downloaded_bytes, total_size_to_be_downloaded_bytes
1256 |                 )  # Prevent the progress bar from exceeding 100%
1257 |                 pbar.refresh()
1258 |                 if process.poll() is not None:
1259 |                     break
1260 |             # Wait for the process to finish
1261 |             _, stderr = process.communicate()
1262 |             pbar.close()
1263 | 
1264 |         else:
1265 |             while process.poll() is None:
1266 |                 time.sleep(0.5)
1267 | 
1268 |     @staticmethod
1269 |     def _get_dir_sum_file_size(directory) -> int:
1270 |         path = Path(directory)
1271 |         sum_file_size = 0
1272 |         if path.exists() and path.is_dir():
1273 |             for f in path.iterdir():
1274 |                 if f.is_file():
1275 |                     try:
1276 |                         sum_file_size += f.stat().st_size
1277 |                     except FileNotFoundError:
1278 |                         # file must have been removed before we
1279 |                         # could get its size
1280 |                         pass
1281 |         return sum_file_size
1282 | 
1283 |     def _parse_s5cmd_sync_output_and_generate_synced_manifest(
1284 |         self, stdout, s5cmd_sync_helper_df
1285 |     ) -> Path:
1286 |         """
1287 |         Parse the output of s5cmd sync --dry-run to extract distinct folders and generate a synced manifest.
1288 | 
1289 |         Args:
1290 |             stdout (str): The output of the s5cmd sync --dry-run command.
1291 |             s5cmd_sync_helper_df: helper df obtained after validation of manifest or filtering of selection, containing a minimum of "index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path" columns
1292 | 
1293 |         Returns:
1294 |             Path: The path to the generated synced manifest file.
1295 | float: Download size in MB 1296 | list_of_directories: list of directories need to tracked for progress bar 1297 | """ 1298 | logger.info("Parsing the s5cmd sync dry run output...") 1299 | 1300 | stdout_df = pd.DataFrame(stdout.splitlines(), columns=["s5cmd_output"]) 1301 | 1302 | # create a copy of the index 1303 | index_df_copy = self.index 1304 | 1305 | result_df = s5cmd_sync_helper_df 1306 | 1307 | # TODO: need to remove the assumption that manifest commands will have 'cp' 1308 | # ruff: noqa 1309 | sql = """ 1310 | PRAGMA disable_progress_bar; 1311 | WITH 1312 | index_temp AS ( 1313 | SELECT 1314 | index_crdc_series_uuid, 1315 | s5cmd_cmd, 1316 | path, 1317 | series_size_MB 1318 | FROM 1319 | result_df), 1320 | sync_temp AS ( 1321 | SELECT 1322 | DISTINCT CONCAT(REGEXP_EXTRACT(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*') AS s3_url, 1323 | REGEXP_EXTRACT(CONCAT(REGEXP_EXTRACT(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*'),'(?:.*?\\/){3}([^\\/?#]+)',1) AS sync_crdc_instance_uuid 1324 | FROM 1325 | stdout_df ) 1326 | SELECT 1327 | DISTINCT s5cmd_cmd, 1328 | series_size_MB, 1329 | path 1330 | FROM 1331 | sync_temp 1332 | JOIN 1333 | index_temp 1334 | ON 1335 | index_temp.index_crdc_series_uuid = sync_temp.sync_crdc_instance_uuid 1336 | """ 1337 | # ruff: noqa: end 1338 | synced_df = duckdb.query(sql).df() 1339 | sync_size = synced_df["series_size_MB"].sum() 1340 | sync_size_rounded = round(sync_size, 2) 1341 | 1342 | logger.debug(f"sync_size_rounded: {sync_size_rounded}") 1343 | 1344 | # Write a temporary manifest file 1345 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as synced_manifest: 1346 | list_of_directories = synced_df.path.to_list() 1347 | commands = "\n".join(synced_df["s5cmd_cmd"]) 1348 | synced_manifest.write(commands) 1349 | 1350 | logger.info("Parsing the s5cmd sync dry run output finished") 1351 | return Path(synced_manifest.name), sync_size_rounded, list_of_directories 1352 | 1353 | def _s5cmd_run( 1354 | self, 1355 | endpoint_to_use, 1356 | manifest_file, 1357 | total_size, 1358 | downloadDir, 1359 | quiet, 1360 | show_progress_bar, 1361 | use_s5cmd_sync, 1362 | dirTemplate, 1363 | list_of_directories, 1364 | s5cmd_sync_helper_df, 1365 | ): 1366 | """ 1367 | Executes the s5cmd command to sync files from a given endpoint to a local directory. 1368 | 1369 | This function first performs a dry run of the s5cmd command to check which files need to be downloaded. 1370 | If there are files to be downloaded, it generates a new manifest file with the files to be synced and 1371 | runs the s5cmd command again to download the files. The progress of the download is tracked and printed 1372 | to the console. 1373 | 1374 | Args: 1375 | endpoint_to_use (str): The endpoint URL to download the files from. 1376 | manifest_file (str): The path to the manifest file listing the files to be downloaded. 1377 | total_size (float): The total size of the files to be downloaded in MB. 1378 | downloadDir (str): The local directory where the files will be downloaded. 1379 | quiet (bool): If True, suppresses the stdout and stderr of the s5cmd command. 1380 | show_progress_bar (bool): If True, tracks the progress of download. 1381 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded. 1382 | dirTemplate (str): Download directory hierarchy template. 
1383 |             list_of_directories (list): List of directories that need to be tracked for the progress bar.
1384 |             s5cmd_sync_helper_df (df): helper df obtained after validation of manifest or filtering of selection, containing a minimum of "index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path" columns.
1385 | 
1386 |         Raises:
1387 |             subprocess.CalledProcessError: If the s5cmd command fails.
1388 | 
1389 |         Returns:
1390 |             None
1391 |         """
1392 |         logger.debug("running self._s5cmd_run. Inputs received:")
1393 |         logger.debug(f"endpoint_to_use: {endpoint_to_use}")
1394 |         logger.debug(f"manifest_file: {manifest_file}")
1395 |         logger.debug(f"total_size: {total_size}")
1396 |         logger.debug(f"downloadDir: {downloadDir}")
1397 |         logger.debug(f"quiet: {quiet}")
1398 |         logger.debug(f"show_progress_bar: {show_progress_bar}")
1399 |         logger.debug(f"use_s5cmd_sync: {use_s5cmd_sync}")
1400 |         logger.debug(f"dirTemplate: {dirTemplate}")
1401 | 
1402 |         if quiet:
1403 |             stdout = subprocess.DEVNULL
1404 |             stderr = subprocess.DEVNULL
1405 |         else:
1406 |             stdout = None
1407 |             stderr = None
1408 | 
1409 |         if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0:
1410 |             logger.debug(
1411 |                 "Requested progress bar along with s5cmd sync dry run.\
1412 |                 Using s5cmd sync dry run as the destination folder is not empty"
1413 |             )
1414 |             dry_run_cmd = [
1415 |                 self.s5cmdPath,
1416 |                 "--no-sign-request",
1417 |                 "--dry-run",
1418 |                 "--endpoint-url",
1419 |                 endpoint_to_use,
1420 |                 "run",
1421 |                 manifest_file,
1422 |             ]
1423 | 
1424 |             process = subprocess.run(
1425 |                 dry_run_cmd, stdout=subprocess.PIPE, text=True, check=False
1426 |             )
1427 | 
1428 |             if process.stdout:
1429 |                 # Some files need to be downloaded
1430 |                 logger.info(
1431 |                     """
1432 | stdout from s5cmd sync dry run is not empty. Parsing the output to
1433 | evaluate what to download and the corresponding size with series-level precision
1434 | """
1435 |                 )
1436 |                 (
1437 |                     synced_manifest,
1438 |                     sync_size,
1439 |                     list_of_directories,
1440 |                 ) = self._parse_s5cmd_sync_output_and_generate_synced_manifest(
1441 |                     stdout=process.stdout,
1442 |                     s5cmd_sync_helper_df=s5cmd_sync_helper_df,
1443 |                 )
1444 |                 logger.info(f"sync_size (MB): {sync_size}")
1445 | 
1446 |                 cmd = [
1447 |                     self.s5cmdPath,
1448 |                     "--no-sign-request",
1449 |                     "--endpoint-url",
1450 |                     endpoint_to_use,
1451 |                     "run",
1452 |                     synced_manifest,
1453 |                 ]
1454 |                 with subprocess.Popen(
1455 |                     cmd, stdout=stdout, stderr=stderr, universal_newlines=True
1456 |                 ) as process:
1457 |                     if sync_size < total_size:
1458 |                         logger.info(
1459 |                             """
1460 | Destination folder is not empty and sync size is less than total size.
1461 | """
1462 |                         )
1463 |                         existing_data_size = round(total_size - sync_size, 2)
1464 |                         logger.info(
1465 |                             f"Requested total download size is {total_size} MB, \
1466 |                             however at least {existing_data_size} MB is already present, \
1467 |                             so downloading only the remaining up to {sync_size} MB\n\
1468 |                             Please note that disk sizes are calculated at series level, \
1469 |                             so if individual files are missing, the displayed progress bar may \
1470 |                             not be accurate."
1471 |                         )
1472 |                         self._track_download_progress(
1473 |                             sync_size,
1474 |                             downloadDir,
1475 |                             process,
1476 |                             show_progress_bar,
1477 |                             list_of_directories,
1478 |                         )
1479 |                     else:
1480 |                         self._track_download_progress(
1481 |                             total_size,
1482 |                             downloadDir,
1483 |                             process,
1484 |                             show_progress_bar,
1485 |                             list_of_directories,
1486 |                         )
1487 |             else:
1488 |                 logger.info(
1489 |                     "It appears that all requested DICOM files are already present in the destination folder"
1490 |                 )
1491 |         else:
1492 |             logger.info(
1493 |                 "Not using s5cmd sync, as the destination folder is empty or sync was not requested"
1494 |             )
1495 |             cmd = [
1496 |                 self.s5cmdPath,
1497 |                 "--no-sign-request",
1498 |                 "--endpoint-url",
1499 |                 endpoint_to_use,
1500 |                 "run",
1501 |                 manifest_file,
1502 |             ]
1503 | 
1504 |             # fedorov: did consider-using-with, and decided against it to keep the code more readable
1505 |             stderr_log_file = tempfile.NamedTemporaryFile(delete=False)  # pylint: disable=consider-using-with
1506 |             logging.debug("Running download command: " + str(cmd))
1507 |             with subprocess.Popen(
1508 |                 cmd,
1509 |                 stdout=stdout,
1510 |                 stderr=stderr_log_file,
1511 |                 universal_newlines=True,
1512 |             ) as process:
1513 |                 self._track_download_progress(
1514 |                     total_size,
1515 |                     downloadDir,
1516 |                     process,
1517 |                     show_progress_bar,
1518 |                     list_of_directories,
1519 |                 )
1520 | 
1521 |             stderr_log_file.close()
1522 | 
1523 |             runtime_errors = []
1524 |             with open(stderr_log_file.name) as stderr_log_file:
1525 |                 for line in stderr_log_file.readlines():
1526 |                     if not quiet:
1527 |                         logger.info(line)
1528 |                     if line.startswith("ERROR"):
1529 |                         runtime_errors.append(line)
1530 | 
1531 |             Path(stderr_log_file.name).unlink()
1532 | 
1533 |             if len(runtime_errors) > 0:
1534 |                 logger.error(
1535 |                     "Download process failed with the following errors:\n"
1536 |                     + "\n".join(runtime_errors)
1537 |                 )
1538 | 
1539 |             # Check if download process completed successfully
1540 |             if process.returncode != 0:
1541 |                 logger.error(
1542 |                     f"Download process returned non-zero exit code: {process.returncode}"
1543 |                 )
1544 |             else:
1545 |                 logger.info("Successfully downloaded files to %s", str(downloadDir))
1546 | 
1547 |     @staticmethod
1548 |     def _format_size(size, size_in_bytes: bool = False):
1549 |         if size_in_bytes:
1550 |             size_MB = size / (10**6)
1551 |         else:
1552 |             size_MB = size
1553 |         size_GB = size_MB / 1000
1554 |         size_TB = size_GB / 1000
1555 | 
1556 |         if size_TB >= 1:
1557 |             return f"{round(size_TB, 2)} TB"
1558 |         if size_GB >= 1:
1559 |             return f"{round(size_GB, 2)} GB"
1560 |         if size_MB >= 1:
1561 |             return f"{round(size_MB, 2)} MB"
1562 |         return f"{round(size, 2)} bytes"
1563 | 
1564 |     def download_from_manifest(
1565 |         self,
1566 |         manifestFile: str,
1567 |         downloadDir: str,
1568 |         quiet: bool = True,
1569 |         validate_manifest: bool = True,
1570 |         show_progress_bar: bool = True,
1571 |         use_s5cmd_sync: bool = False,
1572 |         dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT,
1573 |     ) -> None:
1574 |         """
1575 |         Download the files listed in a manifest file. The manifest is first
1576 |         validated to ensure every line contains a valid URL. The total download
1577 |         size is then computed, and the download runs in one process while
1578 |         download progress is tracked in another.
1579 | 
1580 |         Args:
1581 |             manifestFile (str): The path to the manifest file.
1582 |             downloadDir (str): The directory to download the files to.
1583 |             quiet (bool): If True, suppresses the output of the subprocess. Defaults to True.
1584 |             validate_manifest (bool): If True, validates the manifest for any errors. Defaults to True.
1585 |             show_progress_bar (bool): If True, tracks the progress of download
1586 |             use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
1587 |             dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories.
1588 | 
1589 |         Raises:
1590 |             ValueError: If the download directory does not exist.
1591 |         """
1592 | 
1593 |         downloadDir = self._check_create_directory(downloadDir)
1594 | 
1595 |         # validate the manifest
1596 |         (
1597 |             total_size,
1598 |             endpoint_to_use,
1599 |             temp_manifest_file,
1600 |             list_of_directories,
1601 |             validation_result_df,
1602 |         ) = self._validate_update_manifest_and_get_download_size(
1603 |             manifestFile=manifestFile,
1604 |             downloadDir=downloadDir,
1605 |             validate_manifest=validate_manifest,
1606 |             use_s5cmd_sync=use_s5cmd_sync,
1607 |             dirTemplate=dirTemplate,
1608 |         )
1609 | 
1610 |         total_size_rounded = round(total_size, 2)
1611 |         if not self._check_disk_size_and_warn(downloadDir, total_size):
1612 |             return
1613 | 
1614 |         self._s5cmd_run(
1615 |             endpoint_to_use=endpoint_to_use,
1616 |             manifest_file=temp_manifest_file,
1617 |             total_size=total_size_rounded,
1618 |             downloadDir=downloadDir,
1619 |             quiet=quiet,
1620 |             show_progress_bar=show_progress_bar,
1621 |             use_s5cmd_sync=use_s5cmd_sync,
1622 |             dirTemplate=dirTemplate,
1623 |             list_of_directories=list_of_directories,
1624 |             s5cmd_sync_helper_df=validation_result_df,
1625 |         )
1626 | 
1627 |     def citations_from_manifest(
1628 |         self,
1629 |         manifestFile: str,
1630 |         citation_format: str = CITATION_FORMAT_APA,
1631 |     ):
1632 |         """Get the list of publications that should be cited/attributed for a cohort defined by a manifest.
1633 | 
1634 |         Args:
1635 |             manifestFile (str): string containing the path to the manifest file.
1636 |             citation_format (str): string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats allowed by the DOI API, see https://citation.crosscite.org/docs.html#sec-4.
1637 | 
1638 |         Returns:
1639 |             List of citations in the requested format.
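
        Example:
            A usage sketch (the manifest path is illustrative):

                client = IDCClient()
                citations = client.citations_from_manifest("manifest.s5cmd")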
1640 | """ 1641 | 1642 | manifest_df = pd.read_csv( 1643 | manifestFile, 1644 | comment="#", 1645 | skip_blank_lines=True, 1646 | header=None, 1647 | names=["manifest_line"], 1648 | ) 1649 | uuid_pattern = r"s3://.*/([^/]+)/\*" 1650 | manifest_df["crdc_series_uuid"] = manifest_df["manifest_line"].str.extract( 1651 | uuid_pattern, expand=False 1652 | ) 1653 | index_copy = self.index[["series_aws_url", "SeriesInstanceUID"]].copy() 1654 | index_copy["crdc_series_uuid"] = index_copy["series_aws_url"].str.extract( 1655 | uuid_pattern, expand=False 1656 | ) 1657 | 1658 | result_df = pd.merge(manifest_df, index_copy, on="crdc_series_uuid", how="left") 1659 | 1660 | return self.citations_from_selection( 1661 | seriesInstanceUID=result_df["SeriesInstanceUID"].tolist(), 1662 | citation_format=citation_format, 1663 | ) 1664 | 1665 | def citations_from_selection( 1666 | self, 1667 | collection_id=None, 1668 | patientId=None, 1669 | studyInstanceUID=None, 1670 | seriesInstanceUID=None, 1671 | citation_format=CITATION_FORMAT_APA, 1672 | ): 1673 | """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID. 1674 | 1675 | Args: 1676 | collection_id: string or list of strings containing the values of collection_id to filter by 1677 | patientId: string or list of strings containing the values of PatientID to filter by 1678 | studyInstanceUID (str): string or list of strings containing the values of DICOM StudyInstanceUID to filter by 1679 | seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by 1680 | format: string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4. 1681 | 1682 | Returns: 1683 | List of citations in the requested format. 
1684 | """ 1685 | 1686 | result_df = self._safe_filter_by_selection( 1687 | self.index, 1688 | collection_id=collection_id, 1689 | patientId=patientId, 1690 | studyInstanceUID=studyInstanceUID, 1691 | seriesInstanceUID=seriesInstanceUID, 1692 | sopInstanceUID=None, 1693 | crdc_series_uuid=None, 1694 | ) 1695 | 1696 | citations = [] 1697 | 1698 | if not result_df.empty: 1699 | distinct_dois = result_df["source_DOI"].unique().tolist() 1700 | 1701 | if len(distinct_dois) == 0: 1702 | logger.error("No DOIs found for the selection.") 1703 | return citations 1704 | 1705 | # include citation for the currently main IDC publication 1706 | # https://doi.org/10.1148/rg.230180 1707 | distinct_dois.append("10.1148/rg.230180") 1708 | 1709 | headers = {"accept": citation_format} 1710 | timeout = 30 1711 | 1712 | for doi in distinct_dois: 1713 | url = "https://dx.doi.org/" + doi 1714 | 1715 | logger.debug(f"Requesting citation for DOI: {doi}") 1716 | 1717 | response = requests.get(url, headers=headers, timeout=timeout) 1718 | 1719 | logger.debug("Received response: " + str(response.status_code)) 1720 | 1721 | if response.status_code == 200: 1722 | if citation_format == self.CITATION_FORMAT_JSON: 1723 | citations.append(response.json()) 1724 | else: 1725 | citations.append(response.text) 1726 | logger.debug("Received citation: " + citations[-1]) 1727 | 1728 | else: 1729 | logger.error(f"Failed to get citation for DOI: {url}") 1730 | logger.error( 1731 | f"DOI server response status code: {response.status_code}" 1732 | ) 1733 | 1734 | return citations 1735 | 1736 | def download_from_selection( 1737 | self, 1738 | downloadDir, 1739 | dry_run=False, 1740 | collection_id=None, 1741 | patientId=None, 1742 | studyInstanceUID=None, 1743 | seriesInstanceUID=None, 1744 | sopInstanceUID=None, 1745 | crdc_series_uuid=None, 1746 | quiet=True, 1747 | show_progress_bar=True, 1748 | use_s5cmd_sync=False, 1749 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 1750 | source_bucket_location="aws", 1751 | ): 1752 | """Download the files corresponding to the selection. The filtering will be applied in sequence (but does it matter?) by first selecting the collection(s), followed by 1753 | patient(s), study(studies) and series. If no filtering is applied, all the files will be downloaded. 1754 | 1755 | Args: 1756 | downloadDir: string containing the path to the directory to download the files to 1757 | dry_run: calculates the size of the cohort but download does not start 1758 | collection_id: string or list of strings containing the values of collection_id to filter by 1759 | patientId: string or list of strings containing the values of PatientID to filter by 1760 | studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by 1761 | seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by 1762 | sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by 1763 | crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by 1764 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True 1765 | show_progress_bar (bool): If True, tracks the progress of download 1766 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded 1767 | dirTemplate (str): Download directory hierarchy template. 
This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 1768 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 1769 | """ 1770 | 1771 | if source_bucket_location not in ["aws", "gcs"]: 1772 | raise ValueError("source_bucket_location must be either 'aws' or 'gcs'") 1773 | 1774 | downloadDir = self._check_create_directory(downloadDir) 1775 | 1776 | # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index 1777 | sm_instance_index = None 1778 | if sopInstanceUID: 1779 | if ( 1780 | self.sm_instance_index is not None 1781 | ): # check if instance-level index is installed 1782 | download_df = self.sm_instance_index 1783 | sm_instance_index = self.sm_instance_index 1784 | else: 1785 | logger.error( 1786 | "Instance-level access not possible because the instance-level index is not installed." 1787 | ) 1788 | raise ValueError( 1789 | "Instance-level access not possible because the instance-level index is not installed." 1790 | ) 1791 | if use_s5cmd_sync: 1792 | logger.warning( 1793 | "s5cmd sync is not supported for downloading individual files. Disabling sync." 1794 | ) 1795 | use_s5cmd_sync = False 1796 | elif crdc_series_uuid is not None: 1797 | download_df = pd.concat( 1798 | [ 1799 | self.index[ 1800 | [ 1801 | "PatientID", 1802 | "collection_id", 1803 | "Modality", 1804 | "StudyInstanceUID", 1805 | "SeriesInstanceUID", 1806 | "crdc_series_uuid", 1807 | "aws_bucket", 1808 | "series_size_MB", 1809 | ] 1810 | ], 1811 | self.prior_versions_index[ 1812 | [ 1813 | "PatientID", 1814 | "collection_id", 1815 | "Modality", 1816 | "StudyInstanceUID", 1817 | "SeriesInstanceUID", 1818 | "crdc_series_uuid", 1819 | "aws_bucket", 1820 | "series_size_MB", 1821 | ] 1822 | ], 1823 | ], 1824 | ) 1825 | else: 1826 | download_df = self.index 1827 | 1828 | result_df = self._safe_filter_by_selection( 1829 | download_df, 1830 | collection_id=collection_id, 1831 | patientId=patientId, 1832 | studyInstanceUID=studyInstanceUID, 1833 | seriesInstanceUID=seriesInstanceUID, 1834 | sopInstanceUID=sopInstanceUID, 1835 | crdc_series_uuid=crdc_series_uuid, 1836 | ) 1837 | 1838 | if not sopInstanceUID: 1839 | total_size = round(result_df["series_size_MB"].sum(), 2) 1840 | else: 1841 | total_size_bytes = round(result_df["instance_size"].sum(), 2) 1842 | total_size = total_size_bytes / (10**6) 1843 | 1844 | if not self._check_disk_size_and_warn(downloadDir, total_size): 1845 | return 1846 | 1847 | if dry_run: 1848 | logger.info( 1849 | "Dry run. Not downloading files. Rerun with dry_run=False to download the files."
1850 | ) 1851 | return 1852 | 1853 | if dirTemplate is not None: 1854 | hierarchy = self._generate_sql_concat_for_building_directory( 1855 | downloadDir=downloadDir, 1856 | dirTemplate=dirTemplate, 1857 | ) 1858 | else: 1859 | hierarchy = f"CONCAT('{downloadDir}')" 1860 | 1861 | if sopInstanceUID: 1862 | sql = f""" 1863 | WITH temp as 1864 | ( 1865 | SELECT 1866 | sopInstanceUID 1867 | FROM 1868 | result_df 1869 | ) 1870 | SELECT 1871 | CONCAT('s3://', aws_bucket, '/', crdc_series_uuid,'/*') AS series_aws_url, 1872 | CONCAT('s3://', aws_bucket, '/', crdc_series_uuid,'/', crdc_instance_uuid, '.dcm') AS instance_aws_url, 1873 | crdc_series_uuid AS index_crdc_series_uuid, 1874 | {hierarchy} AS path 1875 | FROM 1876 | temp 1877 | JOIN 1878 | sm_instance_index using (sopInstanceUID) 1879 | LEFT JOIN 1880 | index using (seriesInstanceUID) 1881 | """ 1882 | else: 1883 | sql = f""" 1884 | WITH temp as 1885 | ( 1886 | SELECT 1887 | seriesInstanceUID 1888 | FROM 1889 | result_df 1890 | ) 1891 | SELECT 1892 | CONCAT('s3://',aws_bucket,'/',crdc_series_uuid,'/*') AS series_aws_url, 1893 | crdc_series_uuid AS index_crdc_series_uuid, 1894 | series_size_MB, 1895 | {hierarchy} AS path 1896 | FROM 1897 | temp 1898 | JOIN 1899 | index using (seriesInstanceUID) 1900 | """ 1901 | index = self.index # expose the main index to duckdb so the SQL above can reference it 1902 | result_df = duckdb.query(sql).df() 1903 | # Create a temporary manifest file that stores the list of files to download 1904 | 1905 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file: 1906 | # Determine column containing the URL for instance / series-level access 1907 | if sopInstanceUID: 1908 | if "instance_aws_url" not in result_df: 1909 | result_df["instance_aws_url"] = ( 1910 | result_df["series_aws_url"].str.replace("/*", "/", regex=False) # substring replacement, not whole-value replacement 1911 | + result_df["crdc_instance_uuid"] 1912 | + ".dcm" 1913 | ) 1914 | url_column = "instance_aws_url" 1915 | else: 1916 | url_column = "series_aws_url" 1917 | 1918 | if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: 1919 | if dirTemplate is not None: 1920 | result_df["s5cmd_cmd"] = ( 1921 | "sync " + result_df[url_column] + ' "' + result_df["path"] + '"' 1922 | ) 1923 | else: 1924 | result_df["s5cmd_cmd"] = ( 1925 | "sync " + result_df[url_column] + ' "' + downloadDir + '"' 1926 | ) 1927 | elif dirTemplate is not None: 1928 | result_df["s5cmd_cmd"] = ( 1929 | "cp " + result_df[url_column] + ' "' + result_df["path"] + '"' 1930 | ) 1931 | else: 1932 | result_df["s5cmd_cmd"] = ( 1933 | "cp " + result_df[url_column] + ' "' + downloadDir + '"' 1934 | ) 1935 | 1936 | if source_bucket_location == "gcs": 1937 | self._replace_aws_with_gcp_buckets(result_df, "s5cmd_cmd") 1938 | 1939 | # Combine all commands into a single string with newline separators 1940 | commands = "\n".join(result_df["s5cmd_cmd"]) 1941 | manifest_file.write(commands) 1942 | 1943 | if dirTemplate is not None: 1944 | list_of_directories = result_df.path.to_list() 1945 | else: 1946 | list_of_directories = [downloadDir] 1947 | logger.debug( 1948 | """ 1949 | Temporary download manifest is generated and is passed to self._s5cmd_run 1950 | """ 1951 | ) 1952 | if sopInstanceUID: 1953 | s5cmd_sync_helper_df = None 1954 | else: 1955 | s5cmd_sync_helper_df = result_df[ 1956 | ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] 1957 | ] 1958 | endpoint_url = None 1959 | if source_bucket_location == "aws": 1960 | endpoint_url = aws_endpoint_url 1961 | elif source_bucket_location == "gcs": 1962 | endpoint_url = gcp_endpoint_url 1963 | else: 1964 | raise
ValueError("source_bucket_location must be either 'aws' or 'gcs'") 1965 | self._s5cmd_run( 1966 | endpoint_to_use=endpoint_url, 1967 | manifest_file=Path(manifest_file.name), 1968 | total_size=total_size, 1969 | downloadDir=downloadDir, 1970 | quiet=quiet, 1971 | show_progress_bar=show_progress_bar, 1972 | use_s5cmd_sync=use_s5cmd_sync, 1973 | dirTemplate=dirTemplate, 1974 | list_of_directories=list_of_directories, 1975 | s5cmd_sync_helper_df=s5cmd_sync_helper_df, 1976 | ) 1977 | 1978 | def download_dicom_instance( 1979 | self, 1980 | sopInstanceUID, 1981 | downloadDir, 1982 | dry_run=False, 1983 | quiet=True, 1984 | show_progress_bar=True, 1985 | use_s5cmd_sync=False, 1986 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 1987 | source_bucket_location="aws", 1988 | ) -> None: 1989 | """ 1990 | Download the files corresponding to the seriesInstanceUID to the specified directory. 1991 | 1992 | Args: 1993 | sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by 1994 | downloadDir: string containing the path to the directory to download the files to 1995 | dry_run: calculates the size of the cohort but download does not start 1996 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 1997 | show_progress_bar (bool): If True, tracks the progress of download 1998 | use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded 1999 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for the organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories. 2000 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing to select between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2001 | Returns: None 2002 | 2003 | Raises: 2004 | TypeError: If sopInstanceUID(s) passed is(are) not a string or list 2005 | 2006 | """ 2007 | self.download_from_selection( 2008 | downloadDir, 2009 | sopInstanceUID=sopInstanceUID, 2010 | dry_run=dry_run, 2011 | quiet=quiet, 2012 | show_progress_bar=show_progress_bar, 2013 | use_s5cmd_sync=use_s5cmd_sync, 2014 | dirTemplate=dirTemplate, 2015 | source_bucket_location=source_bucket_location, 2016 | ) 2017 | 2018 | def download_dicom_series( 2019 | self, 2020 | seriesInstanceUID, 2021 | downloadDir, 2022 | dry_run=False, 2023 | quiet=True, 2024 | show_progress_bar=True, 2025 | use_s5cmd_sync=False, 2026 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2027 | source_bucket_location="aws", 2028 | ) -> None: 2029 | """ 2030 | Download the files corresponding to the seriesInstanceUID to the specified directory. 
2031 | 2032 | Args: 2033 | seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by 2034 | downloadDir: string containing the path to the directory to download the files to 2035 | dry_run: calculates the size of the cohort but download does not start 2036 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2037 | show_progress_bar (bool): If True, tracks the progress of the download 2038 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2039 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2040 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2041 | Returns: None 2042 | 2043 | Raises: 2044 | TypeError: If seriesInstanceUID(s) passed is(are) not a string or list 2045 | 2046 | """ 2047 | self.download_from_selection( 2048 | downloadDir, 2049 | seriesInstanceUID=seriesInstanceUID, 2050 | dry_run=dry_run, 2051 | quiet=quiet, 2052 | show_progress_bar=show_progress_bar, 2053 | use_s5cmd_sync=use_s5cmd_sync, 2054 | dirTemplate=dirTemplate, 2055 | source_bucket_location=source_bucket_location, 2056 | ) 2057 | 2058 | def download_dicom_studies( 2059 | self, 2060 | studyInstanceUID, 2061 | downloadDir, 2062 | dry_run=False, 2063 | quiet=True, 2064 | show_progress_bar=True, 2065 | use_s5cmd_sync=False, 2066 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2067 | source_bucket_location="aws", 2068 | ) -> None: 2069 | """ 2070 | Download the files corresponding to the studyInstanceUID to the specified directory. 2071 | 2072 | Args: 2073 | studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by 2074 | downloadDir: string containing the path to the directory to download the files to 2075 | dry_run: calculates the size of the cohort but download does not start 2076 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2077 | show_progress_bar (bool): If True, tracks the progress of the download 2078 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2079 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID.
The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2080 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2081 | Returns: None 2082 | 2083 | Raises: 2084 | TypeError: If studyInstanceUID(s) passed is(are) not a string or list 2085 | 2086 | """ 2087 | self.download_from_selection( 2088 | downloadDir, 2089 | studyInstanceUID=studyInstanceUID, 2090 | dry_run=dry_run, 2091 | quiet=quiet, 2092 | show_progress_bar=show_progress_bar, 2093 | use_s5cmd_sync=use_s5cmd_sync, 2094 | dirTemplate=dirTemplate, 2095 | source_bucket_location=source_bucket_location, 2096 | ) 2097 | 2098 | def download_dicom_patients( 2099 | self, 2100 | patientId, 2101 | downloadDir, 2102 | dry_run=False, 2103 | quiet=True, 2104 | show_progress_bar=True, 2105 | use_s5cmd_sync=False, 2106 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2107 | source_bucket_location="aws", 2108 | ) -> None: 2109 | """ 2110 | Download the files corresponding to the patientId to the specified directory. 2111 | 2112 | Args: 2113 | patientId: string or list of strings containing the values of DICOM PatientID to filter by 2114 | downloadDir: string containing the path to the directory to download the files to 2115 | dry_run: calculates the size of the cohort but download does not start 2116 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2117 | show_progress_bar (bool): If True, tracks the progress of the download 2118 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2119 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2120 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'.
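Example (a minimal sketch; the patient ID is just a sample value): client.download_dicom_patients(patientId="PCAMPMRI-00001", downloadDir="/tmp/idc_data")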
2121 | 2122 | Returns: None 2123 | 2124 | Raises: 2125 | TypeError: If patientId(s) passed is(are) not a string or list 2126 | 2127 | """ 2128 | self.download_from_selection( 2129 | downloadDir, 2130 | patientId=patientId, 2131 | dry_run=dry_run, 2132 | quiet=quiet, 2133 | show_progress_bar=show_progress_bar, 2134 | use_s5cmd_sync=use_s5cmd_sync, 2135 | dirTemplate=dirTemplate, 2136 | source_bucket_location=source_bucket_location, 2137 | ) 2138 | 2139 | def download_collection( 2140 | self, 2141 | collection_id, 2142 | downloadDir, 2143 | dry_run=False, 2144 | quiet=True, 2145 | show_progress_bar=True, 2146 | use_s5cmd_sync=False, 2147 | dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT, 2148 | source_bucket_location="aws", 2149 | ) -> None: 2150 | """ 2151 | Download the files corresponding to the collection_id to the specified directory. 2152 | 2153 | Args: 2154 | collection_id: string or list of strings containing the values of collection_id to filter by 2155 | downloadDir: string containing the path to the directory to download the files to 2156 | dry_run: calculates the size of the cohort but download does not start 2157 | quiet (bool): If True, suppresses the output of the subprocess. Defaults to True. 2158 | show_progress_bar (bool): If True, tracks the progress of the download 2159 | use_s5cmd_sync (bool): If True, will use the s5cmd sync operation instead of cp when downloadDir is not empty; this can significantly improve the download speed if the content is partially downloaded 2160 | dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDir. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT, set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None, all files will be downloaded to the download directory with no subdirectories. 2161 | source_bucket_location: string selecting the provider of the bucket from which the files will be downloaded, allowing selection between Google ('gcs') and AWS ('aws') storage. Defaults to 'aws'. 2162 | 2163 | Returns: None 2164 | 2165 | Raises: 2166 | TypeError: If collection_id(s) passed is(are) not a string or list 2167 | 2168 | """ 2169 | self.download_from_selection( 2170 | downloadDir, 2171 | collection_id=collection_id, 2172 | dry_run=dry_run, 2173 | quiet=quiet, 2174 | show_progress_bar=show_progress_bar, 2175 | use_s5cmd_sync=use_s5cmd_sync, 2176 | dirTemplate=dirTemplate, 2177 | source_bucket_location=source_bucket_location, 2178 | ) 2179 | 2180 | def sql_query(self, sql_query): 2181 | """Execute SQL query against the table in the index using duckdb. 2182 | 2183 | Args: 2184 | sql_query: string containing the SQL query to execute. The table name to use in the FROM clause is 'index' (without quotes).
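Example (mirroring the package's own tests): client = IDCClient(); df = client.sql_query("SELECT DISTINCT(collection_id) FROM index")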
2185 | 2186 | Returns: 2187 | pandas dataframe containing the results of the query 2188 | 2189 | Raises: 2190 | duckdb.Error: any exception that duckdb.query() raises 2191 | """ 2192 | 2193 | logger.debug("Executing SQL query: " + sql_query) 2194 | # TODO: find a more elegant way to automate the following: https://www.perplexity.ai/search/write-python-code-that-iterate-XY9ppywbQFSRnOpgbwx_uQ 2195 | index = self.index 2196 | if self.sm_index is not None: 2197 | sm_index = self.sm_index 2198 | if self.sm_instance_index is not None: 2199 | sm_instance_index = self.sm_instance_index 2200 | if self.clinical_index is not None: 2201 | clinical_index = self.clinical_index 2202 | if self.prior_versions_index is not None: 2203 | prior_versions_index = self.prior_versions_index 2204 | return duckdb.query(sql_query).to_df() 2205 | -------------------------------------------------------------------------------- /idc_index/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImagingDataCommons/idc-index/9d905f9b7e4ab719bfe54b1d747505283870ed8b/idc_index/py.typed -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import shutil 5 | from pathlib import Path 6 | 7 | import nox 8 | 9 | DIR = Path(__file__).parent.resolve() 10 | 11 | nox.options.sessions = ["lint", "pylint", "tests"] 12 | 13 | 14 | @nox.session 15 | def lint(session: nox.Session) -> None: 16 | """ 17 | Run the linter. 18 | """ 19 | session.install("pre-commit") 20 | session.run( 21 | "pre-commit", "run", "--all-files", "--show-diff-on-failure", *session.posargs 22 | ) 23 | 24 | 25 | @nox.session 26 | def pylint(session: nox.Session) -> None: 27 | """ 28 | Run PyLint. 29 | """ 30 | # This needs to be installed into the package environment, and is slower 31 | # than a pre-commit check 32 | session.install(".", "pylint") 33 | session.run("pylint", "idc_index", *session.posargs) 34 | 35 | 36 | @nox.session 37 | def tests(session: nox.Session) -> None: 38 | """ 39 | Run the unit and regular tests. 40 | """ 41 | session.install(".[test]") 42 | session.run("pytest", *session.posargs) 43 | 44 | 45 | @nox.session(reuse_venv=True) 46 | def docs(session: nox.Session) -> None: 47 | """ 48 | Build the docs. Pass "--serve" to serve. Pass "-b linkcheck" to check links. 
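For example, "nox -s docs -- --serve" builds and serves the docs locally, while "nox -s docs -- -b linkcheck" runs the link checker.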
49 | """ 50 | 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--serve", action="store_true", help="Serve after building") 53 | parser.add_argument( 54 | "-b", dest="builder", default="html", help="Build target (default: html)" 55 | ) 56 | args, posargs = parser.parse_known_args(session.posargs) 57 | 58 | if args.builder != "html" and args.serve: 59 | session.error("Must not specify non-HTML builder with --serve") 60 | 61 | extra_installs = ["sphinx-autobuild"] if args.serve else [] 62 | 63 | session.install("-e.[docs]", *extra_installs) 64 | session.chdir("docs") 65 | 66 | if args.builder == "linkcheck": 67 | session.run( 68 | "sphinx-build", "-b", "linkcheck", ".", "_build/linkcheck", *posargs 69 | ) 70 | return 71 | 72 | shared_args = ( 73 | "-n", # nitpicky mode 74 | "-T", # full tracebacks 75 | f"-b={args.builder}", 76 | ".", 77 | f"_build/{args.builder}", 78 | *posargs, 79 | ) 80 | 81 | if args.serve: 82 | session.run("sphinx-autobuild", *shared_args) 83 | else: 84 | session.run("sphinx-build", "--keep-going", *shared_args) 85 | 86 | 87 | @nox.session 88 | def build_api_docs(session: nox.Session) -> None: 89 | """ 90 | Build (regenerate) API docs. 91 | """ 92 | 93 | session.install("sphinx") 94 | session.chdir("docs") 95 | session.run( 96 | "sphinx-apidoc", 97 | "-o", 98 | "api/", 99 | "--module-first", 100 | "--no-toc", 101 | "--force", 102 | "../idc_index", 103 | ) 104 | 105 | 106 | @nox.session 107 | def build(session: nox.Session) -> None: 108 | """ 109 | Build an SDist and wheel. 110 | """ 111 | 112 | build_path = DIR.joinpath("build") 113 | if build_path.exists(): 114 | shutil.rmtree(build_path) 115 | 116 | session.install("build") 117 | session.run("python", "-m", "build") 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling","hatch-vcs"] 3 | 4 | build-backend = "hatchling.build" 5 | 6 | [project] 7 | name = "idc-index" 8 | authors = [ 9 | { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" }, 10 | { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" }, 11 | ] 12 | description = "Package to query and download data from an index of ImagingDataCommons" 13 | readme = "README.md" 14 | license.file = "LICENSE" 15 | requires-python = ">=3.8" 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Science/Research", 19 | "Intended Audience :: Developers", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3 :: Only", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Programming Language :: Python :: 3.10", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Topic :: Scientific/Engineering", 31 | "Typing :: Typed", 32 | ] 33 | dynamic = ["version"] 34 | dependencies = [ 35 | "click", 36 | 'duckdb>=0.10.0,<=1.2.1', 37 | "idc-index-data==21.0.0", 38 | "packaging", 39 | "pandas<=2.2.4", 40 | "platformdirs", 41 | "psutil", 42 | "pyarrow", 43 | "requests", 44 | "s5cmd", 45 | "sphinx-click", 46 | "tqdm" 47 | ] 48 | 49 | [project.optional-dependencies] 50 | test = [ 51 | "pytest >=6", 52 | "pytest-cov >=3", 53 | ] 54 | dev = [ 55 | "pytest >=6", 56 | "pytest-cov >=3", 57 | ] 58 | docs = [ 59 
| "sphinx>=7.0", 60 | "myst_parser>=0.13", 61 | "sphinx_copybutton", 62 | "sphinx_autodoc_typehints", 63 | "furo>=2023.08.17", 64 | ] 65 | 66 | [project.scripts] 67 | idc = 'idc_index.cli:idc' 68 | 69 | [project.urls] 70 | Homepage = "https://github.com/ImagingDataCommons/idc-index" 71 | "Bug Tracker" = "https://github.com/ImagingDataCommons/idc-index/issues" 72 | Discussions = "https://discourse.canceridc.dev/" 73 | Changelog = "https://github.com/ImagingDataCommons/idc-index/releases" 74 | 75 | 76 | [tool.hatch] 77 | version.source = "vcs" 78 | build.hooks.vcs.version-file = "idc_index/_version.py" 79 | 80 | [tool.hatch.envs.default] 81 | features = ["test"] 82 | scripts.test = "pytest {args}" 83 | 84 | 85 | [tool.pytest.ini_options] 86 | minversion = "6.0" 87 | addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] 88 | xfail_strict = true 89 | filterwarnings = [ 90 | "error", 91 | # https://github.com/dateutil/dateutil/issues/1314 92 | "ignore:datetime.datetime.utcfromtimestamp.. is deprecated.*:DeprecationWarning:dateutil", 93 | ] 94 | log_cli_level = "INFO" 95 | testpaths = [ 96 | "tests", 97 | ] 98 | python_files = [ 99 | "idcindex.py", 100 | "*_test.py", 101 | "test_*.py", 102 | ] 103 | 104 | 105 | [tool.coverage] 106 | run.source = ["idc_index"] 107 | report.exclude_also = [ 108 | '\.\.\.', 109 | 'if typing.TYPE_CHECKING:', 110 | ] 111 | 112 | [tool.mypy] 113 | files = ["idc_index", "tests"] 114 | python_version = "3.8" 115 | warn_unused_configs = true 116 | strict = true 117 | enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] 118 | warn_unreachable = true 119 | disallow_untyped_defs = false 120 | disallow_incomplete_defs = false 121 | 122 | [[tool.mypy.overrides]] 123 | module = "idc_index.*" 124 | disallow_untyped_defs = true 125 | disallow_incomplete_defs = true 126 | 127 | 128 | [tool.ruff] 129 | src = ["idc_index"] 130 | extend-exclude = ["./CONTRIBUTING.md"] 131 | 132 | [tool.ruff.lint] 133 | extend-select = [ 134 | "B", # flake8-bugbear 135 | "I", # isort 136 | "ARG", # flake8-unused-arguments 137 | "C4", # flake8-comprehensions 138 | "D", # pydocstyle 139 | "EM", # flake8-errmsg 140 | "ICN", # flake8-import-conventions 141 | "G", # flake8-logging-format 142 | "PGH", # pygrep-hooks 143 | "PIE", # flake8-pie 144 | "PL", # pylint 145 | "PT", # flake8-pytest-style 146 | "PTH", # flake8-use-pathlib 147 | "RET", # flake8-return 148 | "RUF", # Ruff-specific 149 | "SIM", # flake8-simplify 150 | "T20", # flake8-print 151 | "UP", # pyupgrade 152 | "YTT", # flake8-2020 153 | "EXE", # flake8-executable 154 | "NPY", # NumPy specific rules 155 | "PD", # pandas-vet 156 | ] 157 | ignore = [ 158 | "PLR09", # Too many <...> 159 | "PLR2004", # Magic value used in comparison 160 | "ISC001", # Conflicts with formatter 161 | # Exceptions below are specific to idc-index 162 | "B007", # Loop control variable {name} not used within loop body 163 | "B904", # Checks for raise statements in exception handlers that lack a from clause. 
164 | "E722", # Do not use bare except 165 | "EM101", # Exception must not use a string literal, assign to variable first 166 | "F841", # Local variable {name} is assigned to but never used 167 | "G003", # Logging statement uses + 168 | "G004", # Logging statement uses f-string 169 | "PD011", # Use .to_numpy() instead of .values 170 | "PD901", # Avoid using the generic variable name df for DataFrames 171 | "PT009", # Use a regular assert instead of unittest-style {assertion} 172 | "PTH100", # os.path.abspath() should be replaced by Path.resolve() 173 | "PTH103", # os.makedirs() should be replaced by Path.mkdir(parents=True) 174 | "PTH107", # os.remove() should be replaced by Path.unlink() 175 | "PTH110", # os.path.exists() should be replaced by Path.exists() 176 | "PTH118", # Checks for uses of os.path.join 177 | "PTH119", # os.path.basename() should be replaced by Path.name 178 | "PTH120", # os.path.dirname() should be replaced by Path.parent 179 | "PTH123", # open() should be replaced by Path.open() 180 | "RET504", # Unnecessary assignment to {name} before return statement 181 | "RET506", # Unnecessary {branch} after raise statement 182 | "SIM102", # Use a single if statement instead of nested if statements 183 | "SIM108", # Use ternary operator {contents} instead of if-else-block 184 | "SIM117", # Use a single with statement with multiple contexts instead of nested with statements 185 | "T201", # print found 186 | ] 187 | isort.required-imports = ["from __future__ import annotations"] 188 | # Uncomment if using a _compat.typing backport 189 | # typing-modules = ["idc_index._compat.typing"] 190 | 191 | [tool.ruff.lint.per-file-ignores] 192 | "docs/conf.py" = ["D"] 193 | "tests/**" = [ 194 | "D", 195 | "T20", 196 | ] 197 | "noxfile.py" = [ 198 | "D", 199 | "T20", 200 | ] 201 | 202 | [tool.ruff.lint.pydocstyle] 203 | convention = "google" 204 | 205 | 206 | [tool.pylint] 207 | py-version = "3.8" 208 | ignore-paths = [".*/_version.py"] 209 | reports.output-format = "colorized" 210 | similarities.ignore-imports = "yes" 211 | messages_control.disable = [ 212 | "design", 213 | "fixme", 214 | "line-too-long", 215 | "missing-module-docstring", 216 | "wrong-import-position", 217 | # Exceptions below are specific to idc-index 218 | "invalid-name", 219 | "missing-class-docstring", 220 | "missing-function-docstring", 221 | "logging-fstring-interpolation", 222 | "logging-not-lazy", 223 | "no-else-raise", 224 | "raise-missing-from", 225 | "undefined-loop-variable", 226 | "unspecified-encoding", 227 | "unused-variable", 228 | ] 229 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImagingDataCommons/idc-index/9d905f9b7e4ab719bfe54b1d747505283870ed8b/tests/__init__.py -------------------------------------------------------------------------------- /tests/idcindex.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import os 5 | import tempfile 6 | import unittest 7 | from itertools import product 8 | from pathlib import Path 9 | 10 | import pandas as pd 11 | import pytest 12 | import requests 13 | from click.testing import CliRunner 14 | from idc_index import IDCClient, cli 15 | 16 | # Run tests using the following command from the root of the repository: 17 | # python -m unittest -vv tests/idcindex.py 18 | # 19 | # run specific tests with 
this: 20 | # pytest ./tests/idcindex.py::TestIDCClient.test_download_dicom_instance 21 | 22 | logging.basicConfig(level=logging.DEBUG) 23 | 24 | 25 | def remote_file_exists(url): 26 | try: 27 | response = requests.head(url, allow_redirects=True) 28 | # Check if the status code indicates success 29 | return response.status_code == 200 30 | except requests.RequestException as e: 31 | # Handle any exceptions (e.g., network issues) 32 | print(f"An error occurred: {e}") 33 | return False 34 | 35 | 36 | @pytest.fixture(autouse=True) 37 | def _change_test_dir(request, monkeypatch): 38 | monkeypatch.chdir(request.fspath.dirname) 39 | 40 | 41 | class TestIDCClient(unittest.TestCase): 42 | def setUp(self): 43 | self.client = IDCClient() 44 | self.download_from_manifest = cli.download_from_manifest 45 | self.download_from_selection = cli.download_from_selection 46 | self.download = cli.download 47 | 48 | logger = logging.getLogger("idc_index") 49 | logger.setLevel(logging.DEBUG) 50 | 51 | def test_get_collections(self): 52 | collections = self.client.get_collections() 53 | self.assertIsNotNone(collections) 54 | 55 | def test_get_idc_version(self): 56 | idc_version = self.client.get_idc_version() 57 | self.assertIsNotNone(idc_version) 58 | self.assertTrue(idc_version.startswith("v")) 59 | 60 | def test_get_patients(self): 61 | # Define the values for each optional parameter 62 | output_format_values = ["list", "dict", "df"] 63 | collection_id_values = [ 64 | "htan_ohsu", 65 | ["ct_phantom4radiomics", "cmb_gec"], 66 | ] 67 | 68 | # Test each combination 69 | for collection_id in collection_id_values: 70 | for output_format in output_format_values: 71 | patients = self.client.get_patients( 72 | collection_id=collection_id, outputFormat=output_format 73 | ) 74 | 75 | # Check if the output format matches the expected type 76 | if output_format == "list": 77 | self.assertIsInstance(patients, list) 78 | self.assertTrue(bool(patients)) # Check that the list is not empty 79 | elif output_format == "dict": 80 | self.assertTrue( 81 | isinstance(patients, dict) 82 | or ( 83 | isinstance(patients, list) 84 | and all(isinstance(i, dict) for i in patients) 85 | ) 86 | ) # Check that the output is either a dictionary or a list of dictionaries 87 | self.assertTrue( 88 | bool(patients) 89 | ) # Check that the output is not empty 90 | elif output_format == "df": 91 | self.assertIsInstance(patients, pd.DataFrame) 92 | self.assertFalse( 93 | patients.empty 94 | ) # Check that the DataFrame is not empty 95 | 96 | def test_get_studies(self): 97 | # Define the values for each optional parameter 98 | output_format_values = ["list", "dict", "df"] 99 | patient_id_values = ["PCAMPMRI-00001", ["PCAMPMRI-00001", "NoduleLayout_1"]] 100 | 101 | # Test each combination 102 | for patient_id in patient_id_values: 103 | for output_format in output_format_values: 104 | studies = self.client.get_dicom_studies( 105 | patientId=patient_id, outputFormat=output_format 106 | ) 107 | 108 | # Check if the output format matches the expected type 109 | if output_format == "list": 110 | self.assertIsInstance(studies, list) 111 | self.assertTrue(bool(studies)) # Check that the list is not empty 112 | elif output_format == "dict": 113 | self.assertTrue( 114 | isinstance(studies, dict) 115 | or ( 116 | isinstance(studies, list) 117 | and all(isinstance(i, dict) for i in studies) 118 | ) 119 | ) # Check that the output is either a dictionary or a list of dictionaries 120 | self.assertTrue(bool(studies)) # Check that the output is not empty 121 | elif 
output_format == "df": 122 | self.assertIsInstance(studies, pd.DataFrame) 123 | self.assertFalse( 124 | studies.empty 125 | ) # Check that the DataFrame is not empty 126 | 127 | def test_get_series(self): 128 | """ 129 | Query used for selecting the smallest series/studies: 130 | 131 | SELECT 132 | StudyInstanceUID, 133 | ARRAY_AGG(DISTINCT(collection_id)) AS collection, 134 | ARRAY_AGG(DISTINCT(series_aws_url)) AS aws_url, 135 | ARRAY_AGG(DISTINCT(series_gcs_url)) AS gcs_url, 136 | COUNT(DISTINCT(SOPInstanceUID)) AS num_instances, 137 | SUM(instance_size) AS series_size 138 | FROM 139 | `bigquery-public-data.idc_current.dicom_all` 140 | GROUP BY 141 | StudyInstanceUID 142 | HAVING 143 | num_instances > 2 144 | ORDER BY 145 | series_size asc 146 | LIMIT 147 | 10 148 | """ 149 | # Define the values for each optional parameter 150 | output_format_values = ["list", "dict", "df"] 151 | study_instance_uid_values = [ 152 | "1.3.6.1.4.1.14519.5.2.1.6279.6001.175012972118199124641098335511", 153 | [ 154 | "1.3.6.1.4.1.14519.5.2.1.1239.1759.691327824408089993476361149761", 155 | "1.3.6.1.4.1.14519.5.2.1.1239.1759.272272273744698671736205545239", 156 | ], 157 | ] 158 | 159 | # Test each combination 160 | for study_instance_uid in study_instance_uid_values: 161 | for output_format in output_format_values: 162 | series = self.client.get_dicom_series( 163 | studyInstanceUID=study_instance_uid, outputFormat=output_format 164 | ) 165 | 166 | # Check if the output format matches the expected type 167 | if output_format == "list": 168 | self.assertIsInstance(series, list) 169 | self.assertTrue(bool(series)) # Check that the list is not empty 170 | elif output_format == "dict": 171 | self.assertTrue( 172 | isinstance(series, dict) 173 | or ( 174 | isinstance(series, list) 175 | and all(isinstance(i, dict) for i in series) 176 | ) 177 | ) # Check that the output is either a dictionary or a list of dictionaries 178 | elif output_format == "df": 179 | self.assertIsInstance(series, pd.DataFrame) 180 | self.assertFalse( 181 | series.empty 182 | ) # Check that the DataFrame is not empty 183 | 184 | def test_download_dicom_series(self): 185 | with tempfile.TemporaryDirectory() as temp_dir: 186 | self.client.download_dicom_series( 187 | seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.153974929648969296590126728101", 188 | downloadDir=temp_dir, 189 | ) 190 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3) 191 | 192 | def test_download_dicom_instance(self): 193 | self.client.fetch_index("sm_instance_index") 194 | with tempfile.TemporaryDirectory() as temp_dir: 195 | self.client.download_dicom_instance( 196 | sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0", 197 | downloadDir=temp_dir, 198 | ) 199 | 200 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 1) 201 | 202 | def test_download_dicom_series_gcs(self): 203 | with tempfile.TemporaryDirectory() as temp_dir: 204 | self.client.download_dicom_series( 205 | seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.153974929648969296590126728101", 206 | downloadDir=temp_dir, 207 | source_bucket_location="gcs", 208 | ) 209 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3) 210 | 211 | def test_download_dicom_instance_gcs(self): 212 | self.client.fetch_index("sm_instance_index") 213 | with tempfile.TemporaryDirectory() as temp_dir: 214 | self.client.download_dicom_instance( 215 | sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0", 216 
| downloadDir=temp_dir, 217 | source_bucket_location="gcs", 218 | ) 219 | 220 | self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 1) 221 | 222 | def test_download_with_template(self): 223 | dirTemplateValues = [ 224 | None, 225 | "%collection_id_%PatientID/%Modality-%StudyInstanceUID%SeriesInstanceUID", 226 | "%collection_id%PatientID-%Modality_%StudyInstanceUID/%SeriesInstanceUID", 227 | "%collection_id-%PatientID_%Modality/%StudyInstanceUID-%SeriesInstanceUID", 228 | "%collection_id_%PatientID/%Modality/%StudyInstanceUID_%SeriesInstanceUID", 229 | ] 230 | for template in dirTemplateValues: 231 | with tempfile.TemporaryDirectory() as temp_dir: 232 | self.client.download_from_selection( 233 | downloadDir=temp_dir, 234 | studyInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462", 235 | dirTemplate=template, 236 | ) 237 | self.assertEqual( 238 | sum([len(files) for r, d, files in os.walk(temp_dir)]), 3 239 | ) 240 | 241 | def test_download_from_selection(self): 242 | # Define the values for each optional parameter 243 | dry_run_values = [True, False] 244 | quiet_values = [True, False] 245 | show_progress_bar_values = [True, False] 246 | use_s5cmd_sync_values = [True, False] 247 | 248 | # Generate all combinations of optional parameters 249 | combinations = product( 250 | dry_run_values, 251 | quiet_values, 252 | show_progress_bar_values, 253 | use_s5cmd_sync_values, 254 | ) 255 | 256 | # Test each combination 257 | for ( 258 | dry_run, 259 | quiet, 260 | show_progress_bar, 261 | use_s5cmd_sync, 262 | ) in combinations: 263 | with tempfile.TemporaryDirectory() as temp_dir: 264 | self.client.download_from_selection( 265 | downloadDir=temp_dir, 266 | dry_run=dry_run, 267 | patientId=None, 268 | studyInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462", 269 | seriesInstanceUID=None, 270 | quiet=quiet, 271 | show_progress_bar=show_progress_bar, 272 | use_s5cmd_sync=use_s5cmd_sync, 273 | ) 274 | 275 | if not dry_run: 276 | self.assertNotEqual(len(os.listdir(temp_dir)), 0) 277 | 278 | def test_sql_queries(self): 279 | df = self.client.sql_query("SELECT DISTINCT(collection_id) FROM index") 280 | 281 | self.assertIsNotNone(df) 282 | 283 | def test_download_from_aws_manifest(self): 284 | # Define the values for each optional parameter 285 | quiet_values = [True, False] 286 | validate_manifest_values = [True, False] 287 | show_progress_bar_values = [True, False] 288 | use_s5cmd_sync_values = [True, False] 289 | dirTemplateValues = [ 290 | None, 291 | "%collection_id/%PatientID/%Modality/%StudyInstanceUID/%SeriesInstanceUID", 292 | "%collection_id%PatientID%Modality%StudyInstanceUID%SeriesInstanceUID", 293 | ] 294 | # Generate all combinations of optional parameters 295 | combinations = product( 296 | quiet_values, 297 | validate_manifest_values, 298 | show_progress_bar_values, 299 | use_s5cmd_sync_values, 300 | dirTemplateValues, 301 | ) 302 | # Test each combination 303 | for ( 304 | quiet, 305 | validate_manifest, 306 | show_progress_bar, 307 | use_s5cmd_sync, 308 | dirTemplate, 309 | ) in combinations: 310 | with tempfile.TemporaryDirectory() as temp_dir: 311 | self.client.download_from_manifest( 312 | manifestFile="./study_manifest_aws.s5cmd", 313 | downloadDir=temp_dir, 314 | quiet=quiet, 315 | validate_manifest=validate_manifest, 316 | show_progress_bar=show_progress_bar, 317 | use_s5cmd_sync=use_s5cmd_sync, 318 | dirTemplate=dirTemplate, 319 | ) 320 | 321 | if sum([len(files) for _, _, files in os.walk(temp_dir)]) != 9: 
322 | print( 323 | f"Failed for {quiet} {validate_manifest} {show_progress_bar} {use_s5cmd_sync} {dirTemplate}" 324 | ) 325 | self.assertFalse(True) 326 | 327 | def test_download_from_gcp_manifest(self): 328 | # Define the values for each optional parameter 329 | quiet_values = [True, False] 330 | validate_manifest_values = [True, False] 331 | show_progress_bar_values = [True, False] 332 | use_s5cmd_sync_values = [True, False] 333 | dirTemplateValues = [ 334 | None, 335 | "%collection_id/%PatientID/%Modality/%StudyInstanceUID/%SeriesInstanceUID", 336 | "%collection_id_%PatientID_%Modality_%StudyInstanceUID_%SeriesInstanceUID", 337 | ] 338 | # Generate all combinations of optional parameters 339 | combinations = product( 340 | quiet_values, 341 | validate_manifest_values, 342 | show_progress_bar_values, 343 | use_s5cmd_sync_values, 344 | dirTemplateValues, 345 | ) 346 | 347 | # Test each combination 348 | for ( 349 | quiet, 350 | validate_manifest, 351 | show_progress_bar, 352 | use_s5cmd_sync, 353 | dirTemplate, 354 | ) in combinations: 355 | with tempfile.TemporaryDirectory() as temp_dir: 356 | self.client.download_from_manifest( 357 | manifestFile="./study_manifest_gcs.s5cmd", 358 | downloadDir=temp_dir, 359 | quiet=quiet, 360 | validate_manifest=validate_manifest, 361 | show_progress_bar=show_progress_bar, 362 | use_s5cmd_sync=use_s5cmd_sync, 363 | dirTemplate=dirTemplate, 364 | ) 365 | 366 | self.assertEqual( 367 | sum([len(files) for r, d, files in os.walk(temp_dir)]), 9 368 | ) 369 | 370 | def test_download_from_bogus_manifest(self): 371 | # Define the values for each optional parameter 372 | quiet_values = [True, False] 373 | validate_manifest_values = [True, False] 374 | show_progress_bar_values = [True, False] 375 | use_s5cmd_sync_values = [True, False] 376 | 377 | # Generate all combinations of optional parameters 378 | combinations = product( 379 | quiet_values, 380 | validate_manifest_values, 381 | show_progress_bar_values, 382 | use_s5cmd_sync_values, 383 | ) 384 | 385 | # Test each combination 386 | for ( 387 | quiet, 388 | validate_manifest, 389 | show_progress_bar, 390 | use_s5cmd_sync, 391 | ) in combinations: 392 | with tempfile.TemporaryDirectory() as temp_dir: 393 | self.client.download_from_manifest( 394 | manifestFile="./study_manifest_bogus.s5cmd", 395 | downloadDir=temp_dir, 396 | quiet=quiet, 397 | validate_manifest=validate_manifest, 398 | show_progress_bar=show_progress_bar, 399 | use_s5cmd_sync=use_s5cmd_sync, 400 | ) 401 | 402 | self.assertEqual(len(os.listdir(temp_dir)), 0) 403 | 404 | """ 405 | disabling these tests due to a consistent server timeout issue 406 | def test_citations(self): 407 | citations = self.client.citations_from_selection( 408 | collection_id="tcga_gbm", 409 | citation_format=index.IDCClient.CITATION_FORMAT_APA, 410 | ) 411 | self.assertIsNotNone(citations) 412 | 413 | citations = self.client.citations_from_selection( 414 | seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.4164.588007658875211151397302775781", 415 | citation_format=index.IDCClient.CITATION_FORMAT_BIBTEX, 416 | ) 417 | self.assertIsNotNone(citations) 418 | 419 | citations = self.client.citations_from_selection( 420 | studyInstanceUID="1.2.840.113654.2.55.174144834924218414213677353968537663991", 421 | citation_format=index.IDCClient.CITATION_FORMAT_BIBTEX, 422 | ) 423 | self.assertIsNotNone(citations) 424 | 425 | citations = self.client.citations_from_manifest("./study_manifest_aws.s5cmd") 426 | self.assertIsNotNone(citations) 427 | """ 428 | 429 | def 
test_cli_download_from_selection(self): 430 | runner = CliRunner() 431 | with tempfile.TemporaryDirectory() as temp_dir: 432 | result = runner.invoke( 433 | self.download_from_selection, 434 | [ 435 | "--download-dir", 436 | temp_dir, 437 | "--dry-run", 438 | False, 439 | "--quiet", 440 | True, 441 | "--show-progress-bar", 442 | True, 443 | "--use-s5cmd-sync", 444 | False, 445 | "--study-instance-uid", 446 | "1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462", 447 | ], 448 | ) 449 | assert len(os.listdir(temp_dir)) != 0 450 | 451 | def test_cli_download_from_manifest(self): 452 | runner = CliRunner() 453 | with tempfile.TemporaryDirectory() as temp_dir: 454 | result = runner.invoke( 455 | self.download_from_manifest, 456 | [ 457 | "--manifest-file", 458 | "./study_manifest_aws.s5cmd", 459 | "--download-dir", 460 | temp_dir, 461 | "--quiet", 462 | True, 463 | "--show-progress-bar", 464 | True, 465 | "--use-s5cmd-sync", 466 | False, 467 | ], 468 | ) 469 | assert len(os.listdir(temp_dir)) != 0 470 | 471 | def test_singleton_attribute(self): 472 | # singleton, initialized on first use 473 | i1 = IDCClient.client() 474 | i2 = IDCClient.client() 475 | 476 | # new instances created via constructor (through init) 477 | i3 = IDCClient() 478 | i4 = self.client 479 | 480 | # all must be not none 481 | assert i1 is not None 482 | assert i2 is not None 483 | assert i3 is not None 484 | assert i4 is not None 485 | 486 | # singletons must return the same instance 487 | assert i1 == i2 488 | 489 | # new instances must be different 490 | assert i1 != i3 491 | assert i1 != i4 492 | assert i3 != i4 493 | 494 | # all must be instances of IDCClient 495 | assert isinstance(i1, IDCClient) 496 | assert isinstance(i2, IDCClient) 497 | assert isinstance(i3, IDCClient) 498 | assert isinstance(i4, IDCClient) 499 | 500 | def test_cli_download(self): 501 | runner = CliRunner() 502 | with runner.isolated_filesystem(): 503 | result = runner.invoke( 504 | self.download, 505 | # StudyInstanceUID: 506 | ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"], 507 | ) 508 | assert len(os.listdir(Path.cwd())) != 0 509 | 510 | with runner.isolated_filesystem(): 511 | result = runner.invoke( 512 | self.download, 513 | # crdc_series_uuid: 514 | ["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"], 515 | ) 516 | assert len(os.listdir(Path.cwd())) != 0 517 | 518 | def test_prior_version_manifest(self): 519 | # Define the values for each optional parameter 520 | quiet_values = [True, False] 521 | validate_manifest_values = [True, False] 522 | show_progress_bar_values = [True, False] 523 | use_s5cmd_sync_values = [True, False] 524 | dirTemplateValues = [ 525 | None, 526 | "%collection_id/%PatientID/%Modality/%StudyInstanceUID/%SeriesInstanceUID", 527 | "%collection_id_%PatientID_%Modality_%StudyInstanceUID_%SeriesInstanceUID", 528 | ] 529 | # Generate all combinations of optional parameters 530 | combinations = product( 531 | quiet_values, 532 | validate_manifest_values, 533 | show_progress_bar_values, 534 | use_s5cmd_sync_values, 535 | dirTemplateValues, 536 | ) 537 | 538 | # Test each combination 539 | for ( 540 | quiet, 541 | validate_manifest, 542 | show_progress_bar, 543 | use_s5cmd_sync, 544 | dirTemplate, 545 | ) in combinations: 546 | with tempfile.TemporaryDirectory() as temp_dir: 547 | self.client.download_from_manifest( 548 | manifestFile="./prior_version_manifest.s5cmd", 549 | downloadDir=temp_dir, 550 | quiet=quiet, 551 | validate_manifest=validate_manifest, 552 | show_progress_bar=show_progress_bar, 553 
| use_s5cmd_sync=use_s5cmd_sync, 554 | dirTemplate=dirTemplate, 555 | ) 556 | 557 | self.assertEqual( 558 | sum([len(files) for r, d, files in os.walk(temp_dir)]), 5 559 | ) 560 | 561 | def test_list_indices(self): 562 | i = IDCClient() 563 | assert i.indices_overview # assert that dict was created 564 | 565 | def test_fetch_index(self): 566 | i = IDCClient() 567 | assert i.indices_overview["sm_index"]["installed"] is False 568 | i.fetch_index("sm_index") 569 | assert i.indices_overview["sm_index"]["installed"] is True 570 | assert hasattr(i, "sm_index") 571 | 572 | def test_indices_urls(self): 573 | i = IDCClient() 574 | for index in i.indices_overview: 575 | if i.indices_overview[index]["url"] is not None: 576 | assert remote_file_exists(i.indices_overview[index]["url"]) 577 | 578 | def test_clinical_index_install(self): 579 | i = IDCClient() 580 | assert i.indices_overview["clinical_index"]["installed"] is False 581 | i.fetch_index("clinical_index") 582 | assert i.indices_overview["clinical_index"]["installed"] is True 583 | assert len(os.listdir(i.clinical_data_dir)) > 0 584 | 585 | nlst_canc = i.get_clinical_table("nlst_canc") 586 | assert nlst_canc is not None 587 | 588 | def test_series_files_URLs(self): 589 | c = IDCClient() 590 | seriesInstanceUID = ( 591 | "1.3.6.1.4.1.14519.5.2.1.3671.4754.228015946741563785297552112143" 592 | ) 593 | files_aws = c.get_series_file_URLs(seriesInstanceUID, "aws") 594 | files_gcp = c.get_series_file_URLs(seriesInstanceUID, "gcp") 595 | assert len(files_aws) > 0 596 | assert len(files_gcp) == len(files_aws) 597 | 598 | def test_instance_file_URLs(self): 599 | c = IDCClient() 600 | sopInstanceUID = "1.3.6.1.4.1.5962.99.1.1900325859.924065538.1719887277027.10.0" 601 | file_url = "s3://idc-open-data/763fe058-7d25-4ba7-9b29-fd3d6c41dc4b/210f0529-c767-4795-9acf-bad2f4877427.dcm" 602 | files_aws = c.get_instance_file_URL(sopInstanceUID, "aws") 603 | files_gcp = c.get_instance_file_URL(sopInstanceUID, "gcp") 604 | assert files_aws == files_gcp == file_url 605 | 606 | 607 | if __name__ == "__main__": 608 | unittest.main() 609 | -------------------------------------------------------------------------------- /tests/prior_version_manifest.s5cmd: -------------------------------------------------------------------------------- 1 | cp s3://idc-open-data/2f77262c-3a4a-4e5a-bdc0-056dc2837f15/* . 2 | cp s3://idc-open-data/35459457-bd4c-4eef-9579-47f12fc6928e/* . 3 | cp s3://idc-open-data/9c9ab2bf-c784-4658-b0f9-d2e4b33f2dbf/* . 4 | cp s3://idc-open-data/312788ec-8739-4e56-a857-efcab92b20ed/* . 5 | cp s3://idc-open-data/c27b80c9-0e90-416b-8eca-0b20bc0cf8e2/* . 6 | -------------------------------------------------------------------------------- /tests/study_manifest_aws.s5cmd: -------------------------------------------------------------------------------- 1 | # To download the files in this manifest, first install s5cmd (https://github.com/peak/s5cmd), 2 | # then run the following command: 3 | # s5cmd --no-sign-request --endpoint-url https://s3.amazonaws.com run study_manifest_aws.s5cmd 4 | study_manifest_cp_command 5 | cp s3://idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . 6 | cp s3://idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . 7 | cp s3://idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* . 
8 | -------------------------------------------------------------------------------- /tests/study_manifest_bogus.s5cmd: -------------------------------------------------------------------------------- 1 | # the URLs below are invalid and are used for test purposes only! 2 | cp s3://invalid-idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . 3 | cp s3://invalid-idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . 4 | cp s3://invalid-idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* . 5 | -------------------------------------------------------------------------------- /tests/study_manifest_gcs.s5cmd: -------------------------------------------------------------------------------- 1 | # To download the files in this manifest, first install s5cmd (https://github.com/peak/s5cmd), 2 | # then run the following command: 3 | # s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run study_manifest_gcs.s5cmd 4 | cp s3://idc-open-data/28621ba9-1aca-4aab-a2a1-f6d2c3e2ab19/* . 5 | cp s3://idc-open-data/f0b76401-c6d1-4b61-a5fd-3fa596e6cc41/* . 6 | cp s3://idc-open-data/4ea3bbe6-98da-4b92-abe6-2ee18927e3c9/* . 7 | -------------------------------------------------------------------------------- /tests/test_package.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.metadata 4 | 5 | import idc_index as m 6 | 7 | 8 | def test_version(): 9 | assert importlib.metadata.version("idc_index") == m.__version__ 10 | --------------------------------------------------------------------------------