├── q2_fondue
├── tests
│ ├── data
│ │ ├── testaccA.sra
│ │ ├── SRR123456.sra
│ │ ├── SRR123457.sra
│ │ ├── SRP123456_md.tsv
│ │ ├── SRS123456_md.tsv
│ │ ├── SRX123456_md.tsv
│ │ ├── PRJNA734376_md.tsv
│ │ ├── SRR123456_md.tsv
│ │ ├── SRR123457_md.tsv
│ │ ├── study_ids.tsv
│ │ ├── testaccB_md.tsv
│ │ ├── testaccC_md.tsv
│ │ ├── bioproject_ids.tsv
│ │ ├── run_ids.tsv
│ │ ├── sample_ids.tsv
│ │ ├── experiment_ids.tsv
│ │ ├── sample_ids_w_doi.tsv
│ │ ├── study_ids_w_doi.tsv
│ │ ├── SRR1234567_md.tsv
│ │ ├── bioproject_ids_w_doi.tsv
│ │ ├── experiment_ids_w_doi.tsv
│ │ ├── failed_ids_no_doi.tsv
│ │ ├── testaccBC_md.tsv
│ │ ├── run_ids_w_doi.tsv
│ │ ├── metadata_response_error.xml
│ │ ├── run_ids_w_doi_2.tsv
│ │ ├── empty
│ │ │ ├── xxx_00_L001_R1_001.fastq.gz
│ │ │ └── xxx_00_L001_R2_001.fastq.gz
│ │ ├── paired1
│ │ │ ├── SEQID1_00_L001_R1_001.fastq.gz
│ │ │ ├── SEQID1_00_L001_R2_001.fastq.gz
│ │ │ ├── SEQID2_00_L001_R1_001.fastq.gz
│ │ │ └── SEQID2_00_L001_R2_001.fastq.gz
│ │ ├── paired2
│ │ │ ├── SEQID3_00_L001_R1_001.fastq.gz
│ │ │ ├── SEQID3_00_L001_R2_001.fastq.gz
│ │ │ ├── SEQID4_00_L001_R1_001.fastq.gz
│ │ │ └── SEQID4_00_L001_R2_001.fastq.gz
│ │ ├── single1
│ │ │ ├── SEQID1_00_L001_R1_001.fastq.gz
│ │ │ └── SEQID2_00_L001_R1_001.fastq.gz
│ │ ├── single2
│ │ │ ├── SEQID3_00_L001_R1_001.fastq.gz
│ │ │ └── SEQID4_00_L001_R1_001.fastq.gz
│ │ ├── SRR123457_2.fastq
│ │ ├── testacc_2.fastq
│ │ ├── testacc_00_L001_R2_001.fastq
│ │ ├── elink_response_single.json
│ │ ├── esearch_response_single_ambiguous.json
│ │ ├── esearch_response_single_correct.json
│ │ ├── sra-metadata-1.tsv
│ │ ├── sra-metadata-2.tsv
│ │ ├── sra-metadata-3.tsv
│ │ ├── esearch_response_multi_invalid.json
│ │ ├── sra-metadata-4.tsv
│ │ ├── sra-metadata-failed-ids.tsv
│ │ ├── sra-metadata-5.tsv
│ │ ├── sra-metadata-6.tsv
│ │ ├── sra-metadata-7.tsv
│ │ ├── sra-metadata-8.tsv
│ │ ├── sra-metadata-mock.tsv
│ │ ├── sra-metadata-exp-4.tsv
│ │ ├── esearch_response_multi_correct.json
│ │ ├── esearch_response_multi_mixed.json
│ │ ├── sra-metadata-exp-2.tsv
│ │ ├── sra-metadata-exp-3.tsv
│ │ ├── sra-metadata-exp-5.tsv
│ │ ├── sra-metadata-exp-1.tsv
│ │ ├── testaccHYB.fastq
│ │ ├── fasterq-dump-response.txt
│ │ ├── SRR123456.fastq
│ │ ├── testaccA.fastq
│ │ ├── testacc_1.fastq
│ │ ├── SRR123457_1.fastq
│ │ ├── testaccA_01_L001_R1_001.fastq
│ │ ├── testacc_00_L001_R1_001.fastq
│ │ ├── testaccHYB_2.fastq
│ │ ├── testaccHYB_1.fastq
│ │ ├── scraper_items_no_doi.json
│ │ ├── efetch_b2_response_runs.xml
│ │ ├── efetch_b1_response_runs.xml
│ │ ├── efetch_response_runs_single_item.xml
│ │ ├── metadata_processed_multi.json
│ │ ├── efetch_response_runs.xml
│ │ ├── scraper_items_no_attach.json
│ │ └── metadata_response_small.json
│ ├── __init__.py
│ ├── test_query.py
│ ├── test_get_all.py
│ ├── test_esearch.py
│ ├── _utils.py
│ └── test_utils.py
├── types
│ ├── tests
│ │ ├── data
│ │ │ ├── sra-failed-ids-empty.tsv
│ │ │ ├── ncbi-ids-wrong.tsv
│ │ │ ├── ncbi-ids-bioprojects.tsv
│ │ │ ├── ncbi-ids-other.tsv
│ │ │ ├── ncbi-ids-studies.tsv
│ │ │ ├── ncbi-ids-runs.tsv
│ │ │ ├── ncbi-ids-runs-wrong-id-header.tsv
│ │ │ ├── ncbi-ids-runs-doi.tsv
│ │ │ ├── ncbi-ids-runs-no-doi.tsv
│ │ │ ├── sra-failed-ids.tsv
│ │ │ ├── sra-metadata-missing-columns.tsv
│ │ │ ├── sra-metadata-missing-ids.tsv
│ │ │ └── sra-metadata.tsv
│ │ └── __init__.py
│ ├── _type.py
│ ├── __init__.py
│ ├── _transformer.py
│ └── _format.py
├── entrezpy_clients
│ ├── __init__.py
│ ├── _utils.py
│ ├── _esearch.py
│ ├── _pipelines.py
│ └── _sra_meta.py
├── __init__.py
├── query.py
├── get_all.py
├── citations.bib
├── utils.py
└── metadata.py
├── .gitattributes
├── tutorial
├── metadata_file.tsv
└── metadata_file_runs.tsv
├── setup.cfg
├── logo.png
├── .github
└── workflows
│ ├── join-release.yaml
│ ├── tag-release.yaml
│ ├── dependecies.yaml
│ ├── ci.yaml
│ ├── q2-ci.yaml
│ ├── dependent-issues.yaml
│ └── docker-push.yaml
├── parallel.config
├── .coveragerc
├── .copier-answers.yml
├── Makefile
├── conda-recipe
└── meta.yaml
├── .gitignore
├── pyproject.toml
├── LICENSE
├── Dockerfile
└── install-sra-tools.sh
/q2_fondue/tests/data/testaccA.sra:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR123456.sra:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR123457.sra:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | pyproject.toml export-subst
2 |
--------------------------------------------------------------------------------
/tutorial/metadata_file.tsv:
--------------------------------------------------------------------------------
1 | id
2 | PRJEB14186
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRP123456_md.tsv:
--------------------------------------------------------------------------------
1 | id
2 | SRP123456
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRS123456_md.tsv:
--------------------------------------------------------------------------------
1 | id
2 | SRS123456
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRX123456_md.tsv:
--------------------------------------------------------------------------------
1 | id
2 | SRX123456
3 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/sra-failed-ids-empty.tsv:
--------------------------------------------------------------------------------
1 | ID
2 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/PRJNA734376_md.tsv:
--------------------------------------------------------------------------------
1 | id
2 | PRJNA734376
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR123456_md.tsv:
--------------------------------------------------------------------------------
1 | sample-id
2 | SRR123456
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR123457_md.tsv:
--------------------------------------------------------------------------------
1 | sample-id
2 | SRR123457
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/study_ids.tsv:
--------------------------------------------------------------------------------
1 | id
2 | ERP12345
3 | SRP23456
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/testaccB_md.tsv:
--------------------------------------------------------------------------------
1 | sample-id
2 | SRR123456
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/testaccC_md.tsv:
--------------------------------------------------------------------------------
1 | sample-id
2 | SRR123457
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/bioproject_ids.tsv:
--------------------------------------------------------------------------------
1 | id
2 | PRJNA123
3 | PRJNA234
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/run_ids.tsv:
--------------------------------------------------------------------------------
1 | id
2 | SRR123
3 | SRR234
4 | SRR345
5 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/sample_ids.tsv:
--------------------------------------------------------------------------------
1 | id
2 | ERS147978
3 | ERS3588233
4 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | extend-ignore = E203
4 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/logo.png
--------------------------------------------------------------------------------
/q2_fondue/tests/data/experiment_ids.tsv:
--------------------------------------------------------------------------------
1 | id
2 | ERX115020
3 | SRX10331465
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/sample_ids_w_doi.tsv:
--------------------------------------------------------------------------------
1 | id DOI
2 | SRS000100 some_doi1
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/study_ids_w_doi.tsv:
--------------------------------------------------------------------------------
1 | id DOI
2 | SRP000001 some_doi1
3 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-wrong.tsv:
--------------------------------------------------------------------------------
1 | ID
2 | ABC123
3 | SRX098
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR1234567_md.tsv:
--------------------------------------------------------------------------------
1 | sample-id
2 | SRR123456
3 | SRR123457
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/bioproject_ids_w_doi.tsv:
--------------------------------------------------------------------------------
1 | ID DOI
2 | PRJNA33627 some_doi1
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/experiment_ids_w_doi.tsv:
--------------------------------------------------------------------------------
1 | id DOI
2 | SRX000007 some_doi1
3 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/failed_ids_no_doi.tsv:
--------------------------------------------------------------------------------
1 | id
2 | SRR000001
3 | SRR000002
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/testaccBC_md.tsv:
--------------------------------------------------------------------------------
1 | sample-id
2 | SRR123456
3 | SRR123457
4 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-bioprojects.tsv:
--------------------------------------------------------------------------------
1 | ID
2 | PRJ1234
3 | PRJ56789
4 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-other.tsv:
--------------------------------------------------------------------------------
1 | ID
2 | ERX115020
3 | ERS115020
4 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-studies.tsv:
--------------------------------------------------------------------------------
1 | ID
2 | ERP104978
3 | SRP123456
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/run_ids_w_doi.tsv:
--------------------------------------------------------------------------------
1 | ID DOI
2 | SRR000001 some_doi1
3 | SRR000002 some_doi2
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/metadata_response_error.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/run_ids_w_doi_2.tsv:
--------------------------------------------------------------------------------
1 | ID DOI
2 | SRR123456 some_doi1
3 | SRR123457 some_doi2
4 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-runs.tsv:
--------------------------------------------------------------------------------
1 | ID
2 | SRR000013
3 | SRR000001
4 | ERR3978173
5 | ERR3978174
6 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-runs-wrong-id-header.tsv:
--------------------------------------------------------------------------------
1 | wrongID
2 | SRR000013
3 | SRR000001
4 | ERR3978173
5 | ERR3978174
6 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-runs-doi.tsv:
--------------------------------------------------------------------------------
1 | ID DOI
2 | SRR000013 some_doi1
3 | SRR000001 some_doi2
4 | ERR3978173 some_doi3
5 | ERR3978174 some_doi4
6 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/empty/xxx_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/empty/xxx_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/empty/xxx_00_L001_R2_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/empty/xxx_00_L001_R2_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/ncbi-ids-runs-no-doi.tsv:
--------------------------------------------------------------------------------
1 | ID FUNFACT
2 | SRR000013 some_doi1
3 | SRR000001 some_doi2
4 | ERR3978173 some_doi3
5 | ERR3978174 some_doi4
6 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired1/SEQID1_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID1_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired1/SEQID1_00_L001_R2_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID1_00_L001_R2_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired1/SEQID2_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID2_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired1/SEQID2_00_L001_R2_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID2_00_L001_R2_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired2/SEQID3_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID3_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired2/SEQID3_00_L001_R2_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID3_00_L001_R2_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired2/SEQID4_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID4_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/paired2/SEQID4_00_L001_R2_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID4_00_L001_R2_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/single1/SEQID1_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single1/SEQID1_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/single1/SEQID2_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single1/SEQID2_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/single2/SEQID3_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single2/SEQID3_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/tests/data/single2/SEQID4_00_L001_R1_001.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single2/SEQID4_00_L001_R1_001.fastq.gz
--------------------------------------------------------------------------------
/q2_fondue/types/tests/data/sra-failed-ids.tsv:
--------------------------------------------------------------------------------
1 | ID Error message
2 | SRR000020 ID is ambiguous.
3 | SRR000021 ID is invalid.
4 | ERR0000020 ID is ambiguous.
5 | ERR0000021 ID is invalid.
6 |
--------------------------------------------------------------------------------
/.github/workflows/join-release.yaml:
--------------------------------------------------------------------------------
1 | name: join-release
2 | on:
3 | workflow_dispatch: {}
4 | jobs:
5 | release:
6 | uses: qiime2/distributions/.github/workflows/lib-join-release.yaml@dev
7 |
--------------------------------------------------------------------------------
/.github/workflows/tag-release.yaml:
--------------------------------------------------------------------------------
1 | name: tag-release
2 | on:
3 | push:
4 | branches: ["Release-*"]
5 | jobs:
6 | tag:
7 | uses: qiime2/distributions/.github/workflows/lib-tag-release.yaml@dev
8 |
--------------------------------------------------------------------------------
/parallel.config:
--------------------------------------------------------------------------------
1 | [parsl]
2 |
3 | [[parsl.executors]]
4 | class = "HighThroughputExecutor"
5 | label = "default"
6 | max_workers = 1
7 |
8 | [parsl.executors.provider]
9 | class = "LocalProvider"
10 | max_blocks = 4
--------------------------------------------------------------------------------
/.github/workflows/dependecies.yaml:
--------------------------------------------------------------------------------
1 | name: Dependency check
2 | on:
3 | pull_request:
4 | branches: ["main"]
5 | types: [opened, reopened, synchronize, labeled, unlabeled]
6 |
7 | jobs:
8 | ci:
9 | uses: bokulich-lab/utilities/.github/workflows/dependencies.yaml@main
10 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = q2_fondue
3 | branch = True
4 | omit =
5 | */tests*
6 | */__init__.py
7 | q2_fondue/_version.py
8 | versioneer.py
9 |
10 | [report]
11 | fail_under = 90
12 | omit =
13 | */tests*
14 | */__init__.py
15 | q2_fondue/_version.py
16 | versioneer.py
17 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 | pull_request:
4 | branches: ["main"]
5 | push:
6 | branches: ["main"]
7 | tags: ["*"]
8 |
9 | jobs:
10 | ci:
11 | uses: bokulich-lab/utilities/.github/workflows/ci.yaml@main
12 | with:
13 | distro: moshpit
14 | build_docker: true
15 |
--------------------------------------------------------------------------------
/.github/workflows/q2-ci.yaml:
--------------------------------------------------------------------------------
1 | name: QIIME 2 CI
2 | on:
3 | pull_request:
4 | branches: ["main"]
5 | push:
6 | branches: ["main"]
7 |
8 | jobs:
9 | qiime-ci:
10 | uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev
11 | with:
12 | distro: moshpit
13 | recipe-path: 'conda-recipe'
14 |
--------------------------------------------------------------------------------
/q2_fondue/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
--------------------------------------------------------------------------------
/q2_fondue/types/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
--------------------------------------------------------------------------------
/q2_fondue/entrezpy_clients/__init__.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR123457_2.fastq:
--------------------------------------------------------------------------------
1 | @test_acc_single.1 test_1_seq length=59
2 | TTGGGGGGCACCATCTAATCAGCTGCCAGTGCTGCCAGAACATAAAGCAGGCAGAAATT
3 | +test_acc_single.1 test_1_seq length=59
4 | ?60-*'$"<=D===;8C=<<<<<==C<=<<==<=C=:= [quality line truncated in dump]
--------------------------------------------------------------------------------
/conda-recipe/meta.yaml:
--------------------------------------------------------------------------------
[lines 1-23 truncated in dump]
24 |   - q2-types {{ q2_types }}
25 | - qiime2 >={{ qiime2 }}
26 | - tqdm {{ tqdm }}
27 | build:
28 | - python {{ python }}
29 | - setuptools
30 | - versioningit
31 | test:
32 | imports:
33 | - q2_fondue
34 | - qiime2.plugins.fondue
35 | requires:
36 | - parameterized
37 | - coverage
38 | - pytest-cov
39 | commands:
40 | - pytest --cov q2_fondue --cov-report xml:coverage.xml --pyargs q2_fondue
41 | about:
42 | home: https://github.com/bokulich-lab/q2-fondue
43 | license: BSD-3-Clause
44 | license_family: BSD
45 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/esearch_response_multi_mixed.json:
--------------------------------------------------------------------------------
1 | {
2 | "header": {
3 | "type": "esearch",
4 | "version": "0.3"
5 | },
6 | "esearchresult": {
7 | "count": "8",
8 | "retmax": "0",
9 | "retstart": "0",
10 | "idlist": [],
11 | "translationset": [],
12 | "translationstack": [
13 | {
14 | "term": "SRR000001[All Fields]",
15 | "field": "All Fields",
16 | "count": "1",
17 | "explode": "N"
18 | },
19 | {
20 | "term": "SRR000013[All Fields]",
21 | "field": "All Fields",
22 | "count": "1",
23 | "explode": "N"
24 | },
25 | "OR",
26 | {
27 | "term": "SR012[All Fields]",
28 | "field": "All Fields",
29 | "count": "7",
30 | "explode": "N"
31 | },
32 | "OR"
33 | ],
34 | "querytranslation": "SRR000001[All Fields] OR SRR000013[All Fields] OR SR012[All Fields]",
35 | "errorlist": {
36 | "phrasesnotfound": [
37 | "ABCD123", "SRR001"
38 | ], "fieldsnotfound": []}}}
39 |
--------------------------------------------------------------------------------
/.github/workflows/dependent-issues.yaml:
--------------------------------------------------------------------------------
1 | name: Dependent issues
2 |
3 | on:
4 | issues:
5 | types:
6 | - opened
7 | - edited
8 | - closed
9 | - reopened
10 | pull_request_target:
11 | types:
12 | - opened
13 | - edited
14 | - closed
15 | - reopened
16 | - synchronize
17 |
18 | schedule:
19 | - cron: '0 0 * * *'
20 |
21 | jobs:
22 | check:
23 | runs-on: ubuntu-latest
24 | steps:
25 | - uses: z0al/dependent-issues@v1
26 | env:
27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
28 | GITHUB_READ_TOKEN: ${{ secrets.GITHUB_READ_TOKEN }}
29 |
30 | with:
31 | label: dependent
32 |
33 | # (Optional) Enable checking for dependencies in issues.
34 | # Enable by setting the value to "on". Default "off"
35 | check_issues: off
36 |
37 | ignore_dependabot: off
38 |
39 | keywords: depends on, blocked by, merge after
40 |
41 | comment: >
42 | This PR/issue depends on:
43 | {{ dependencies }}
44 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/sra-metadata-exp-2.tsv:
--------------------------------------------------------------------------------
1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2
2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC
3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF
4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC AB12
5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DE34
6 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/sra-metadata-exp-3.tsv:
--------------------------------------------------------------------------------
1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2
2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC
3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF AB12
4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GHI DE12
5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GH34
6 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/sra-metadata-exp-5.tsv:
--------------------------------------------------------------------------------
1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2
2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC
3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF AB12
4 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF XXX
5 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DE34
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 |
64 | # vi
65 | .*.swp
66 |
67 | # other
68 | *~
69 | .env
70 |
71 | .DS_store
72 | .idea
73 | .vscode
74 |
75 |
76 | fasterq.tmp.*/**
77 | sratoolkit**/**
78 |
79 | # ignore dbGAP permission keys
80 | **.krt
81 | **.ngc
82 |
83 | # Version file from versioningit
84 | _version.py
85 |
86 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "q2-fondue"
3 | authors = [
4 | { name = "Michal Ziemski", email = "ziemski.michal@gmail.com" }
5 | ]
6 | description = "Fetching data and metadata from the NCBI Sequence Read Archive (SRA) within QIIME 2."
7 | readme = {file = "README.md", content-type = "text/markdown"}
8 | license = {file = "LICENSE"}
9 | dynamic = ["version"]
10 |
11 | [project.urls]
12 | Homepage = "https://github.com/bokulich-lab/q2-fondue"
13 | Repository = "https://github.com/bokulich-lab/q2-fondue"
14 |
15 | [project.entry-points.'qiime2.plugins']
16 | "q2-fondue" = "q2_fondue.plugin_setup:plugin"
17 |
18 | [build-system]
19 | requires = [
20 | "setuptools",
21 | "versioningit",
22 | "wheel"
23 | ]
24 | build-backend = "setuptools.build_meta"
25 |
26 | [tool.versioningit.vcs]
27 | method = "git-archive"
28 | describe-subst = "2026.1.0.dev0-1-g71797cbd"
29 | default-tag = "0.0.1"
30 |
31 | [tool.versioningit.next-version]
32 | method = "minor"
33 |
34 | [tool.versioningit.format]
35 | distance = "{base_version}+{distance}.{vcs}{rev}"
36 | dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
37 | distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty"
38 |
39 | [tool.versioningit.write]
40 | file = "q2_fondue/_version.py"
41 |
42 | [tool.setuptools]
43 | include-package-data = true
44 |
45 | [tool.setuptools.packages.find]
46 | where = ["."]
47 | include = ["q2_fondue*"]
48 |
49 | [tool.setuptools.package-data]
50 | q2_fondue = ["**/*"]
51 |
--------------------------------------------------------------------------------
/q2_fondue/query.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import threading
10 | import pandas as pd
11 |
12 | from q2_fondue.utils import handle_threaded_exception
13 | from q2_fondue.entrezpy_clients._pipelines import _get_run_ids
14 |
15 | threading.excepthook = handle_threaded_exception
16 |
17 |
def get_ids_from_query(
    query: str, email: str, threads: int = 1, log_level: str = "INFO"
) -> pd.Series:
    """Find SRA run IDs matching a BioSample search query.

    The query is executed against the NCBI BioSample database and all
    run accessions linked to the matching samples are collected.

    Args:
        query (str): Search query to be executed on
            the BioSample database.
        email (str): A valid e-mail address (required by NCBI).
        threads (int, default=1): Number of threads to be used in parallel.
        log_level (str, default='INFO'): Logging level.

    Returns:
        pd.Series: The retrieved SRA run IDs, named 'ID'.
    """
    found_ids = _get_run_ids(
        email, threads, None, query, "biosample", log_level
    )
    return pd.Series(found_ids, name="ID")
37 |
--------------------------------------------------------------------------------
/q2_fondue/tests/test_query.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 | import pandas as pd
9 | import unittest
10 |
11 | from pandas.testing import assert_frame_equal
12 | from qiime2.plugins import fondue
13 | from unittest.mock import patch
14 |
15 | from q2_fondue.tests.test_sequences import SequenceTests
16 |
17 |
class TestQuery(SequenceTests):
    """Tests for the get-ids-from-query plugin action."""

    package = "q2_fondue.tests"

    @patch("q2_fondue.query._get_run_ids", return_value=["SRR123", "SRR234"])
    def test_query(self, mock_ids):
        search_text = "some magical query text"

        # the expected artifact view: run IDs as index, no columns
        expected = pd.DataFrame(
            index=pd.Index(["SRR123", "SRR234"], name="ID"),
            columns=[],
        )

        (observed,) = fondue.actions.get_ids_from_query(
            search_text, "fake@email.com", 1, "DEBUG"
        )

        assert_frame_equal(observed.view(pd.DataFrame), expected)
        mock_ids.assert_called_once_with(
            "fake@email.com", 1, None, search_text, "biosample", "DEBUG"
        )
37 |
38 |
# Allow running this test module directly (outside a pytest runner).
if __name__ == "__main__":
    unittest.main()
41 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/sra-metadata-exp-1.tsv:
--------------------------------------------------------------------------------
1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1
2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC
3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF
4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GHI
5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC JKL
6 | SRR123460 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC MNO
7 | SRR123461 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC PQR
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2025, Bokulich Laboratories.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/testaccHYB.fastq:
--------------------------------------------------------------------------------
1 | @ERR3018303.92 Bgsng7131.m10_5758889 length=224
2 | CCTGTTCGCTCCCCACGCTTTCGAGCCTCAGCGTCAGTTACAGACCAGAGAGCCGCTTTCGCCACCGGTGTTCCTCCATATATCTACGCATTTCACCGCTACACATGGAATTCCACTCTCCCCTTCTGCACTCAAGTTTGACAGTTTCCAAAGCGAACTATGGTTGAGCCACAGCCTTTAACTTCAGACTTATCAAACCGCCTGCGCTCGCTTTACGCCCAATA
3 | +ERR3018303.92 Bgsng7131.m10_5758889 length=224
4 | HHHHHHHGGGHGGGFGGGGGGHGGGGGHHHHHGGGGGHHHHHHHHHHHHGFGFHGGGGGHGGGGGHGGGGGGHHHHHGFHHGHHHHHGGGGGHHHHHGGGGGHHHHHHF1FGFHHHHHHHHHGHHHHHGHHHHHHHHHHHFHGHHHHHHGHGHHFGGGHHGHHHHGHFGGGGGGGGGGGGGGGGGFGGGGGFGGGGFGGGFFFFFFFFFFFFFFFFFFFFFFFF
5 | @ERR3018303.93 Bgsng7131.m10_1129555 length=229
6 | TACGTAGGTGGCAAGCGCTGTCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGAAGTAGTGCATTGGAAACTGGAAGACTTGAGTGCAGAAGAGGAGAGTGGAACTCAATGTGTAGCGGTGAAATGGGTAGATATATGGAAGAACACCAGTGGCGAAAGCGGCGCGTTGGCCTGTAACTGACGCTGAGGT
7 | +ERR3018303.93 Bgsng7131.m10_1129555 length=229
8 | D2FEGFA0GGE0GFCHG/A/B11AEEE/G1FBG1GA>>EA/GFGGHG///E@EEE>/?1BGB1B>F2B1G1212FGFFHHB00>1GGDGHFBBDGHEHGHAACC/;.CGC:FB/C0000CGG?GGGGG00C0.BB.09CBFGGGGGB.9/F9A-9/;/-;99@?/99@###########################
9 | @ERR3018303.94 Bgsng7131.m10_1839802 length=250
10 | TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGACTTTTAAGTGAGATGTGAAATACCCGGGCTCAACTTGGGTGCTGCATTTCAAACTGGAAGTCTAGAGTGCAGGAGAGGAGAATGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGATTCTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAAC
11 | +ERR3018303.94 Bgsng7131.m10_1839802 length=250
12 | CBBCCFFCFFCFGGGGGGGGGGHGGGGGHHHHHHHHGGFGHHGGGGGGGGGGHGGGGGHHHFGHHGHHGHHHHHHEGHHHHGGGFGHGHGGHGHHGEFFHGHHHHHGHGGHGHHHHHHHGHFHHHHHGGFFHGDFGHFHGHHGHHGHHHHHHHFHHGDGGGFGHGHFGD?EFCHGHHFCGHFHHGGGGGGGFFGFADAED?CFFFFFFFB;BEBFFFFFF/ADFAAFEFEFF@BADFFFFFFFAABEFF:
13 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/fasterq-dump-response.txt:
--------------------------------------------------------------------------------
1 | cursor-cache : 5,242,880 bytes
2 | buf-size : 1,048,576 bytes
3 | mem-limit : 52,428,800 bytes
4 | threads : 6
5 | scratch-path : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/fasterq.tmp.MacBook-Pro.35899/'
6 | total ram : 17,179,869,184 bytes
7 | output-format: FASTQ split 3
8 | check-mode : only
9 | output-file : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/ERR2750829.fastq'
10 | output-dir : '.'
11 | output : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/ERR2750829.fastq'
12 | append-mode : 'NO'
13 | stdout-mode : 'NO'
14 | seq-defline : '@$ac.$si $sn length=$rl'
15 | qual-defline : '+$ac.$si $sn length=$rl'
16 | only-unaligned : 'NO'
17 | only-aligned : 'NO'
18 | accession : 'ERR2750829'
19 | accession-path: 'ERR2750829'
20 | est. output : 44,926,989,570 bytes
21 | disk-limit (OS) : 9,149,612,032 bytes
22 | disk-limit-tmp (OS) : 9,149,612,032 bytes
23 | out/tmp on same fs : 'NO'
24 |
25 | ERR2750829 is remote
26 | ... has a size of 12,459,034,417 bytes
27 | ... is cSRA without alignments
28 | ... SEQ has NAME column = YES
29 | ... SEQ has SPOT_GROUP column = YES
30 | ... uses 'SEQUENCE' as sequence-table
31 | SEQ.first_row = 1
32 | SEQ.row_count = 84,543,740
33 | SEQ.spot_count = 84,543,740
34 | SEQ.total_base_count = 16,545,432,985
35 | SEQ.bio_base_count = 16,545,432,985
36 | SEQ.avg_name_len = 1
37 | SEQ.avg_spot_group_len = 0
38 | SEQ.avg_bio_reads_per_spot = 2
39 | SEQ.avg_tech_reads_per_spot = 0
40 | ALIGN.first_row = 0
41 | ALIGN.row_count = 0
42 | ALIGN.spot_count = 0
43 | ALIGN.total_base_count = 0
44 | ALIGN.bio_base_count = 0
45 |
46 | disk-limit exeeded!
47 | fasterq-dump quit with error code 3
48 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Base image: miniconda3 provides the conda/mamba toolchain used below.
FROM continuumio/miniconda3:latest AS base

ARG ENVIRONMENT
ARG PLUGIN_NAME

# Put the plugin's conda env first on PATH and pin locale/backend settings.
ENV PLUGIN_NAME=$PLUGIN_NAME
ENV PATH=/opt/conda/envs/${PLUGIN_NAME}/bin:$PATH \
    LC_ALL=C.UTF-8 LANG=C.UTF-8 \
    MPLBACKEND=agg \
    UNIFRAC_USE_GPU=N \
    HOME=/home/qiime2 \
    XDG_CONFIG_HOME=/home/qiime2

WORKDIR /home/qiime2
COPY environment.yml .
COPY install-sra-tools.sh .

# System packages needed by the SRA-tools installer and the test Makefile.
RUN apt-get update \
 && apt-get install -y --no-install-recommends wget curl procps make \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Create the plugin environment and install SRA Tools into it.
RUN conda update -qy conda \
 && conda install -c conda-forge -qy mamba \
 && mamba env create -n ${PLUGIN_NAME} --file environment.yml \
 && mamba run -n ${PLUGIN_NAME} bash install-sra-tools.sh \
 && mamba clean --all --yes \
 && chmod -R a+rwx /opt/conda

# SRA Toolkit requires a configured GUID before first use.
RUN mkdir -p .ncbi
RUN printf '/LIBS/GUID = "%s"\n' `uuidgen` > .ncbi/user-settings.mkfg

COPY . ./plugin
RUN mamba run -n ${PLUGIN_NAME} pip install ./plugin

# NOTE(review): `source activate` in its own RUN does not persist into later
# layers — activation appears to be handled via PATH/CONDA_PREFIX instead.
RUN /bin/bash -c "source activate ${PLUGIN_NAME}"
ENV CONDA_PREFIX=/opt/conda/envs/${PLUGIN_NAME}/
RUN mamba run -n ${PLUGIN_NAME} qiime dev refresh-cache
RUN echo "source activate ${PLUGIN_NAME}" >> $HOME/.bashrc
RUN echo "source tab-qiime" >> $HOME/.bashrc


# Test stage: adds pytest tooling and runs the coverage target.
FROM base AS test

RUN mamba run -n ${PLUGIN_NAME} pip install pytest pytest-cov coverage parameterized pytest-xdist
CMD mamba run -n ${PLUGIN_NAME} make -f ./plugin/Makefile test-cov

# Production stage: strip the build context copy.
FROM base AS prod

# Important: let any UID modify these directories so that
# `docker run -u UID:GID` works
RUN rm -rf ./plugin
RUN chmod -R a+rwx /home/qiime2
--------------------------------------------------------------------------------
/q2_fondue/get_all.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import qiime2 as q2
10 |
11 | import pandas as pd
12 | import threading
13 |
14 | from q2_fondue.utils import handle_threaded_exception
15 | from qiime2 import Artifact
16 |
17 |
18 | threading.excepthook = handle_threaded_exception
19 |
20 |
def get_all(
    ctx, accession_ids, email, threads=1, retries=2, log_level="INFO", linked_doi=None
):
    """Pipeline: fetch metadata and sequences for the given accession IDs.

    Metadata is retrieved first; its index (run IDs) then drives the
    sequence download, so project/study/sample IDs are resolved to runs
    automatically. Failed IDs from both steps are merged into one artifact.
    """
    fetch_metadata = ctx.get_action("fondue", "get_metadata")
    fetch_sequences = ctx.get_action("fondue", "get_sequences")

    # step 1: metadata
    metadata, failed_ids = fetch_metadata(
        accession_ids, email, threads, log_level, linked_doi
    )
    all_failed = failed_ids.view(pd.DataFrame)

    # step 2: sequences — the metadata index holds the run IDs regardless
    # of which ID type (runs, projects, ...) was originally requested
    run_ids = q2.Artifact.import_data(
        "NCBIAccessionIDs", pd.Series(metadata.view(pd.DataFrame).index)
    )
    seq_single, seq_paired, failed_ids = fetch_sequences(
        run_ids, email, retries, threads, log_level
    )

    # combine failures from both steps into a single SRAFailedIDs artifact
    all_failed = pd.concat([all_failed, failed_ids.view(pd.DataFrame)])
    if not all_failed.empty:
        failed_ids = Artifact.import_data("SRAFailedIDs", all_failed)

    return metadata, seq_single, seq_paired, failed_ids
50 |
--------------------------------------------------------------------------------
/q2_fondue/citations.bib:
--------------------------------------------------------------------------------
1 | @article {Ziemski2022,
2 | author = {Ziemski, Michal and Adamov, Anja and Kim, Lina and Flörl, Lena and Bokulich, Nicholas A},
3 | title = {Reproducible acquisition, management, and meta-analysis of nucleotide sequence (meta)data using q2-fondue},
4 | year = {2022},
5 | month = {09},
6 | doi = {10.1093/bioinformatics/btac639},
7 | URL = {https://doi.org/10.1093/bioinformatics/btac639},
8 | journal = {Bioinformatics},
9 | issn = {1367-4803},
10 | }
11 |
12 | @article{Buchmann2019,
13 | author = {Buchmann, Jan P and Holmes, Edward C},
14 | doi = {10.1093/bioinformatics/btz385},
15 | editor = {Wren, Jonathan},
16 | journal = {Bioinformatics},
17 | month = {nov},
18 | number = {21},
19 | pages = {4511--4514},
20 | publisher = {Oxford University Press},
21 | title = {Entrezpy: a Python library to dynamically interact with the NCBI Entrez databases},
22 | url = {https://academic.oup.com/bioinformatics/article/35/21/4511/5488119},
23 | volume = {35},
24 | year = {2019}
25 | }
26 |
27 | @misc{SraToolkit,
28 | title = {SRA Toolkit},
29 | author = {SRA Toolkit Development Team},
30 | version = {2.9.6},
31 | url = {https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software}
32 | }
33 |
34 | @misc{stephan_hugel_2019_2917290,
35 | author = {Stephan Hügel and Peter Gerdes and Patrick Fournier and
36 | emuzie and Patrick Golden and jghauser and Stefan Frühwirth and
37 | Sean Takats and Pablo Orduña and Merlin and Erik Hetzner and
38 | Christian Brodbeck and Avram Lyon and A Lee},
39 | title = {urschrei/pyzotero: Zenodo Release},
40 | month = {may},
41 | year = 2019,
42 | publisher = {Zenodo},
43 | version = {v1.3.15},
44 | doi = {10.5281/zenodo.2917290},
45 | url = {https://doi.org/10.5281/zenodo.2917290}
46 | }
47 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/SRR123456.fastq:
--------------------------------------------------------------------------------
1 | @test_acc_single.1 test_1_seq length=278
2 | AAACTCCTAGCCTACATCCGTACGAGTTAGCGTGGGATTACGAGGTGCACACCATTTCATTCCGTACGGGTAAATTTTTGTATTTTTAGCAGACGGCAGGGTTTCACCATGGTTGACCAACGTACTAATCTTGAACTCCTGACCTCAAGTGATTTGCCTGCCTTCAGCCTCCCAAAGTGACTGGGTATTACAGATGTGAGCGAGTTTGTGCCCAAGCCTTATAAGTAAATTTATAAATTTACATAATTTAAATGACTTATGCTTAGCGAAATAGTTTA
3 | +test_acc_single.1 test_1_seq length=278
4 | 85)9=9/3-8?68<7=8<3657747==49==+;FB2;A;5:'*>69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1
5 | @test_acc_single.4 test_2_seq length=274
6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC
7 | +test_acc_single.4 test_2_seq length=274
8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=.
9 | @test_acc_single.5 test_3_seq length=267
10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC
11 | +test_acc_single.5 test_3_seq length=267
12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1
5 | @test_acc_single.4 test_2_seq length=274
6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC
7 | +test_acc_single.4 test_2_seq length=274
8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=.
9 | @test_acc_single.5 test_3_seq length=267
10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC
11 | +test_acc_single.5 test_3_seq length=267
12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1
5 | @test_acc_single.4 test_2_seq length=274
6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC
7 | +test_acc_single.4 test_2_seq length=274
8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=.
9 | @test_acc_single.5 test_3_seq length=267
10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC
11 | +test_acc_single.5 test_3_seq length=267
12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1
5 | @test_acc_single.4 test_2_seq length=274
6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC
7 | +test_acc_single.4 test_2_seq length=274
8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=.
9 | @test_acc_single.5 test_3_seq length=267
10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC
11 | +test_acc_single.5 test_3_seq length=267
12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1
5 | @test_acc_single.4 test_2_seq length=274
6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC
7 | +test_acc_single.4 test_2_seq length=274
8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=.
9 | @test_acc_single.5 test_3_seq length=267
10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC
11 | +test_acc_single.5 test_3_seq length=267
12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1
5 | @test_acc_single.4 test_2_seq length=274
6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC
7 | +test_acc_single.4 test_2_seq length=274
8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=.
9 | @test_acc_single.5 test_3_seq length=267
10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC
11 | +test_acc_single.5 test_3_seq length=267
12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A sratoolkit.tar.gz
26 |
27 | echo "Extracting..."
28 | tar -xzf sratoolkit.tar.gz
29 | rm sratoolkit.tar.gz
30 | mv "sratoolkit.${TOOLKIT_VER}-${OS_VER}/" "sratoolkit/"
31 |
32 | if [[ "$PREFIX" == "" ]]; then
33 | echo "Setting PREFIX=$CONDA_PREFIX"
34 | PREFIX="$CONDA_PREFIX"
35 | fi
36 |
37 | echo "Installing SRA Tools in $PREFIX..."
38 | if [[ ! -d "$PREFIX/bin/" ]]; then
39 | mkdir $PREFIX/bin/
40 | fi
41 | find sratoolkit/bin/ -maxdepth 1 -type f -exec mv -f {} $PREFIX/bin/ \;
42 | find sratoolkit/bin/ -maxdepth 1 -type l -exec mv -f {} $PREFIX/bin/ \;
43 | rm -r sratoolkit
44 |
45 | echo "Testing installation..."
46 | if [[ $(which prefetch) == "$PREFIX/bin"* ]]; then
47 | echo "Success!"
48 | else
49 | echo "Installation failed."
50 | exit 1
51 | fi
52 |
53 | echo "Configuring SRA Toolkit:"
54 | SRA_CACHE_LOC="$HOME/.prefetch_cache"
55 | echo "Creating prefetch cache directory under $SRA_CACHE_LOC..."
56 | mkdir "$SRA_CACHE_LOC"
57 | echo "Running vdb-config..."
58 | vdb-config -s "/repository/user/main/public/root=$SRA_CACHE_LOC"
59 | vdb-config --prefetch-to-user-repo
60 | echo "Configuration completed."
61 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/testaccHYB_2.fastq:
--------------------------------------------------------------------------------
1 | @ERR3018303.88 Bgsng7131.m10_3542277 length=228
2 | TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCCATCGCTTAACGGTGGATCCGCGCCGGGTACGGGCGGGCTTGAGTGCGGTAGGGGAGACTGGAATTCCCGGTGTAACGGTGGAATGTGTAGATATCAGGAAGAACACCAATGGCGAAGGCAGGTCTCTGGGCCGTTACTGACGCTGAGG
3 | +ERR3018303.88 Bgsng7131.m10_3542277 length=228
4 | HHHHHHHGGHGGHHHHGGGGHHHGGGGGHHHHHHHHGGGGHGHGGHHGGGGGGGGGGGGGGGGGGGGGGGGGGHHHHGHHHHHGGGHGHHGGGEGGHHHHGGGGGCGAGHGGGGGFFFFEF.99BFDCAFFFFFFC.EFFFFFFFFFFDCDF.BFFFFDEFF/BFBFFFFFFFFB/;B;BF.9FBBDFE;/;BDFAFFFEA;;B/BFFFFEED=.A99BFFFF?DFFF
5 | @ERR3018303.89 Bgsng7131.m10_3605463 length=230
6 | TACGTAGGTGGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTCAGTCTGATGTGAAAGCCTTCGGCTTAACCGAAGTAGTGCATTGGAAACTGGAAGACTTGAGTGCAGAAGAGGAGAGTGGCACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGGGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGTGGTT
7 | +ERR3018303.89 Bgsng7131.m10_3605463 length=230
8 | GGHHHGGEDBFHGFFEFGEFFHHGGGEGHH5BFFHGEFGGHBGGAGFGGCCGGEGEGGHHG4FDFDFDDGHHDHHF2CFFHHHD>?/2G1?AFF1>D0DFBGGHFFHBGF0CGBCHDGFDF00G0GGFFCGECDHGFHG.C0;/C/9B0;CFBB?AEGGBBBFFEDDAGEFFFBFFFFBFB.BFAE...-@-@BF?/AA;-.9BBFFBFFFFFFFFEBF?DA######
9 | @ERR3018303.90 Bgsng7131.m10_3641968 length=231
10 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCG
11 | +ERR3018303.90 Bgsng7131.m10_3641968 length=231
12 | GGGHHHHGHHHHGGGGGGGGGHHGGGGGHHHHHHHHGGGGHHHGGGGGGGGGGGGGGGGFFHHHDGHHHHHHGGHHHHHHHHGHHHHHGHHGGHHHHHGGGHGHHFHHBHHHGHHHHFHGFFFFFFFFFFFFFFFFFFFFFFFFFFEFA
13 | @ERR3018303.91 Bgsng7131.m10_3716454 length=231
14 | TACGTAGGTGGCAAGCGTTATCCGGATTTATTGGGCGTAAAGAGAGTGCAGGCGGTTTTCTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGGAGAAGTGCATCGGAAACTGGATAACTTGAGTGCAGAAGAGGGTAGTGGAACTCCATGTGTAGCGGTGGAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTACCTGGTCTGCAACTGACGCTGAGACTC
15 | +ERR3018303.91 Bgsng7131.m10_3716454 length=231
16 | GGGHHHHGHGGHHHGHGGGGHHHGGGGGGHEGHHHHGFGGHHHHHGHHHHHHGGGEFGGGHGHGGFHGEDG4GEGH3BECGEHG@CCBHBGFG/BF1=FDG>D-DEF.BFF?9BFFFCFBFFE.DFAF/BFFFF;.99AAEBFFFFFF?;FE/;9@@-;9BADFAFF-9=-;@F
5 | @ERR3018303.89 Bgsng7131.m10_3605463 length=250
6 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGGGCGAACA
7 | +ERR3018303.89 Bgsng7131.m10_3605463 length=250
8 | BBBBBFFBFFDFAEFEGGGCGGHGG?FGHGHHGHHHGGGGGHGGGGGGCCECGGGEFEGBGFHHHHGHB4FGFFHG4FG3EG3GHHEHHHHFHEFDGFFCACFGHEB2GDGF2F222@F>2C@1FHHB0CGFG/A??<>F1>FG0GHHHHHH0DG@F<@A0BC0CFD.@ACFGE;CF0B?.BFFGF-;9-99--B@D/.--BD/99BF//////;/9BBFA.9...A.EA@-9;.::9ABFF########
9 | @ERR3018303.90 Bgsng7131.m10_3641968 length=250
10 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGCCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCCAACTTGGGTGCAGAAGGGGGGGGTGGAATTTCATGTGTAGCGGGGAAATGCGTAGATATATGGGGGAACACCCGTGGCGAAAGCGGCTCTCTTGGCTGTAACTGACGCTGAGGCTCGAAAACGTGGGGAGCCAAAC
11 | +ERR3018303.90 Bgsng7131.m10_3641968 length=250
12 | AAAAAFF@@F1C1EEGGGG?AAFEGGCGHG2GGHHHEGGGHHHG//EGGGGGGGGGGGF1BFDGHF21E2FHBGGG11BB01>FGHHHFHG/F1B22BF@1///? SRAMetadataFormat:
30 | ff = SRAMetadataFormat()
31 | with ff.open() as fh:
32 | data.to_csv(fh, sep="\t", header=True)
33 | return ff
34 |
35 |
@plugin.register_transformer
def _2(ff: SRAMetadataFormat) -> pd.DataFrame:
    """Read an SRA metadata file into a DataFrame (all values as str)."""
    with ff.open() as fh:
        return pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str")
41 |
42 |
@plugin.register_transformer
def _3(ff: SRAMetadataFormat) -> qiime2.Metadata:
    """Convert an SRA metadata file into a QIIME 2 Metadata object."""
    metadata = _meta_fmt_to_metadata(ff)
    return metadata
46 |
47 |
@plugin.register_transformer
def _4(data: pd.DataFrame) -> SRAFailedIDsFormat:
    """Serialize a DataFrame of failed IDs to the SRAFailedIDs format."""
    result = SRAFailedIDsFormat()
    with result.open() as out:
        data.to_csv(out, sep="\t", header=True, index=True)
    return result
54 |
55 |
@plugin.register_transformer
def _5(ff: SRAFailedIDsFormat) -> pd.DataFrame:
    """Read a failed-IDs file into a DataFrame (all values as str)."""
    with ff.open() as fh:
        return pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str")
61 |
62 |
@plugin.register_transformer
def _6(ff: SRAFailedIDsFormat) -> qiime2.Metadata:
    """Convert a failed-IDs file into a QIIME 2 Metadata object."""
    metadata = _meta_fmt_to_metadata(ff)
    return metadata
66 |
67 |
@plugin.register_transformer
def _7(data: pd.DataFrame) -> NCBIAccessionIDsFormat:
    """Serialize a DataFrame of accession IDs to the NCBIAccessionIDs format."""
    result = NCBIAccessionIDsFormat()
    with result.open() as out:
        data.to_csv(out, sep="\t", header=True, index=True)
    return result
74 |
75 |
@plugin.register_transformer
def _77(data: pd.Series) -> NCBIAccessionIDsFormat:
    """Serialize a Series of accession IDs to the NCBIAccessionIDs format."""
    result = NCBIAccessionIDsFormat()
    return _series_to_meta_fmt(data, result)
80 |
81 |
@plugin.register_transformer
def _8(ff: NCBIAccessionIDsFormat) -> pd.DataFrame:
    """Read an accession-IDs file into a DataFrame (all values as str)."""
    with ff.open() as fh:
        return pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str")
87 |
88 |
@plugin.register_transformer
def _9(ff: NCBIAccessionIDsFormat) -> qiime2.Metadata:
    """Convert an accession-IDs file into a QIIME 2 Metadata object."""
    metadata = _meta_fmt_to_metadata(ff)
    return metadata
92 |
93 |
@plugin.register_transformer
def _10(ff: SRAMetadataFormat) -> NCBIAccessionIDsFormat:
    """Extract run IDs (the metadata index) into the NCBIAccessionIDs format."""
    out_fmt = NCBIAccessionIDsFormat()
    with ff.open() as src, out_fmt.open() as dst:
        meta = pd.read_csv(src, sep="\t", header=0, index_col=0, dtype="str")
        # only the index (run IDs) is carried over, without the index header
        meta.index.to_frame().to_csv(dst, sep="\t", header=True, index=False)
    return out_fmt
101 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/scraper_items_no_doi.json:
--------------------------------------------------------------------------------
1 | [{
2 | "key": "WZV4HG8X",
3 | "version": 1259,
4 | "library": {
5 | "type": "user",
6 | "id": 12345,
7 | "name": "username",
8 | "links": {
9 | "alternate": {
10 | "href": "https://www.zotero.org/username",
11 | "type": "text/html"
12 | }
13 | }
14 | },
15 | "links": {
16 | "self": {
17 | "href": "https://api.zotero.org/users/12345/items/WZV4HG8X",
18 | "type": "application/json"
19 | },
20 | "alternate": {
21 | "href": "https://www.zotero.org/username/items/WZV4HG8X",
22 | "type": "text/html"
23 | },
24 | "up": {
25 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY",
26 | "type": "application/json"
27 | },
28 | "enclosure": {
29 | "type": "text/html",
30 | "href": "https://api.zotero.org/users/12345/items/WZV4HG8X/file/view"
31 | }
32 | },
33 | "meta": {
34 | "numChildren": false
35 | },
36 | "data": {
37 | "key": "WZV4HG8X",
38 | "version": 1259,
39 | "parentItem": "CP4ED2CY",
40 | "itemType": "attachment",
41 | "linkMode": "imported_url",
42 | "title": "Snapshot",
43 | "accessDate": "2021-11-10T07:04:53Z",
44 | "url": "https://www.nature.com/articles/s41467-021-26215-w",
45 | "note": "",
46 | "contentType": "text/html",
47 | "charset": "utf-8",
48 | "filename": "s41467-021-26215-w.html",
49 | "md5": "9ba88a9f08c42a02d11a00b3498198f4",
50 | "mtime": 1636527893000,
51 | "tags": [],
52 | "relations": {},
53 | "dateAdded": "2021-11-10T07:04:53Z",
54 | "dateModified": "2021-11-10T07:04:53Z"
55 | }
56 | },
57 | {
58 | "key": "DMJ4AQ48",
59 | "version": 1261,
60 | "library": {
61 | "type": "user",
62 | "id": 12345,
63 | "name": "username",
64 | "links": {
65 | "alternate": {
66 | "href": "https://www.zotero.org/username",
67 | "type": "text/html"
68 | }
69 | }
70 | },
71 | "links": {
72 | "self": {
73 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48",
74 | "type": "application/json"
75 | },
76 | "alternate": {
77 | "href": "https://www.zotero.org/username/items/DMJ4AQ48",
78 | "type": "text/html"
79 | },
80 | "up": {
81 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY",
82 | "type": "application/json"
83 | },
84 | "enclosure": {
85 | "type": "application/pdf",
86 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48/file/view",
87 | "title": "Pruski et al. - 2021 - Direct on-swab metabolic profiling of vaginal micr.pdf",
88 | "length": 3648434
89 | }
90 | },
91 | "meta": {
92 | "numChildren": false
93 | },
94 | "data": {
95 | "key": "DMJ4AQ48",
96 | "version": 1261,
97 | "parentItem": "CP4ED2CY",
98 | "itemType": "attachment",
99 | "linkMode": "imported_url",
100 | "title": "Full Text PDF",
101 | "accessDate": "2021-11-10T07:04:46Z",
102 | "url": "https://www.nature.com/articles/s41467-021-26215-w.pdf",
103 | "note": "",
104 | "contentType": "application/pdf",
105 | "charset": "",
106 | "filename": "Pruski et al. - 2021 - Direct on-swab metabolic profiling of vaginal micr.pdf",
107 | "md5": "28edb400729d11e14b2b1829ceb16b3a",
108 | "mtime": 1636528753000,
109 | "tags": [],
110 | "relations": {},
111 | "dateAdded": "2021-11-10T07:04:46Z",
112 | "dateModified": "2021-11-10T07:04:46Z"
113 | }
114 | }]
--------------------------------------------------------------------------------
/q2_fondue/tests/data/efetch_b2_response_runs.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 | 13481774
8 | - <Summary><Title>18</Title><Platform
9 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics
10 | total_runs="1" total_spots="63703" total_bases="38349206"
11 | total_size="22735317" load_done="true" cluster_name="public"/></Summary><Submitter
12 | acc="SRA1206349" center_name="Jiangxi Agricultural University"
13 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory
14 | of Animal Nutritio"/><Experiment acc="SRX10339760" ver="1"
15 | status="public" name="18"/><Study acc="SRP310597" name="PRJNA
16 | Chuanzhong black lamb Raw sequence reads"/><Organism
17 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample
18 | acc="SRS8459117" name=""/><Instrument ILLUMINA="Illumina
19 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>18</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT>
20 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309312</Biosample>
21 |
22 | - <Run acc="SRR13961771"
23 | total_spots="63703" total_bases="38349206" load_done="true"
24 | is_public="true" cluster_name="public"
25 | static_data_available="true"/>
26 |
27 |
28 | - 2021/03/17
29 | - 2021/03/15
30 |
31 |
32 |
33 | 13481786
34 | - <Summary><Title>12</Title><Platform
35 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics
36 | total_runs="1" total_spots="59130" total_bases="35596260"
37 | total_size="21079845" load_done="true" cluster_name="public"/></Summary><Submitter
38 | acc="SRA1206349" center_name="Jiangxi Agricultural University"
39 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory
40 | of Animal Nutritio"/><Experiment acc="SRX10339772" ver="1"
41 | status="public" name="12"/><Study acc="SRP310597" name="PRJNA
42 | Chuanzhong black lamb Raw sequence reads"/><Organism
43 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample
44 | acc="SRS8459130" name=""/><Instrument ILLUMINA="Illumina
45 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>12</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT>
46 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309306</Biosample>
47 |
48 | - <Run acc="SRR13961759"
49 | total_spots="59130" total_bases="35596260" load_done="true"
50 | is_public="true" cluster_name="public"
51 | static_data_available="true"/>
52 |
53 |
54 | - 2021/03/17
55 | - 2021/03/15
56 |
57 |
58 |
--------------------------------------------------------------------------------
/q2_fondue/entrezpy_clients/_utils.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import logging
10 | import sys
11 |
12 | import pandas as pd
13 |
# Accession-number prefixes recognized for each SRA record type.
# Each tuple covers the three INSDC archives: S* = NCBI/SRA,
# E* = EMBL-EBI/ENA, D* = DDBJ (bioprojects share the 'PRJ' prefix).
PREFIX = {
    "run": ("SRR", "ERR", "DRR"),
    "experiment": ("SRX", "ERX", "DRX"),
    "sample": ("SRS", "ERS", "DRS"),
    "study": ("SRP", "ERP", "DRP"),
    "bioproject": ("PRJ",),
}
21 |
22 |
class InvalidIDs(Exception):
    """Raised when provided accession IDs are invalid or ambiguous."""

    pass
25 |
26 |
def get_attrs(obj, excluded=()):
    """Return the names of obj's instance attributes.

    Args:
        obj: Any object with a ``__dict__``.
        excluded (tuple): Attribute names to omit from the result.

    Returns:
        list: Attribute names (dunder names excluded), in definition order.
    """
    # Iterating vars(obj) directly yields the keys; the previous
    # .items() iteration fetched values only to discard them.
    return [k for k in vars(obj) if k not in excluded and not k.startswith("__")]
31 |
32 |
def rename_columns(df: pd.DataFrame):
    """Prettify metadata column names in place and return the frame.

    Columns ending in ``_id`` become ``Word ID``; other underscore-separated
    columns are capitalized word by word; remaining columns are simply
    capitalized. Finally, ``Sample ID`` is renamed to ``Sample Accession``
    since the former clashes with QIIME 2's reserved ID header.
    """
    new_names = {}
    for col in df.columns:
        if col.endswith("_id"):
            parts = col.split("_")
            new_names[col] = f"{parts[0].capitalize()} {parts[1].upper()}"
        elif "_" in col:
            new_names[col] = " ".join(word.capitalize() for word in col.split("_"))
        else:
            new_names[col] = col.capitalize()

    df.rename(columns=new_names, inplace=True)

    # rename Sample ID to Sample Accession (incompatible with qiime naming)
    df.rename(columns={"Sample ID": "Sample Accession"}, inplace=True)

    return df
59 |
60 |
def set_up_entrezpy_logging(entrezpy_obj, log_level, log_id=False):
    """Attaches a stdout logging handler to the given Entrezpy object.

    Args:
        entrezpy_obj (object): An Entrezpy object that has a logger attribute.
        log_level (str): The log level to set.
        log_id (bool): If True, accession ID will be added to the log.
    """
    stream_handler = set_up_logging_handler(log_id=log_id)

    # the request pool (when present) gets the same handler and level
    targets = [entrezpy_obj.logger]
    if hasattr(entrezpy_obj, "request_pool"):
        targets.append(entrezpy_obj.request_pool.logger)

    for target in targets:
        target.addHandler(stream_handler)
        target.setLevel(log_level)
77 |
78 |
def set_up_logger(
    log_level, cls_obj=None, logger_name=None, log_id=False
) -> logging.Logger:
    """Creates and configures a module/class logger.

    Args:
        log_level (str): The log level to set.
        cls_obj: Class instance whose module name should name the logger.
        logger_name (str): Explicit logger name (used when cls_obj is None).
        log_id (bool): If True, accession ID will be added to the log.

    Returns:
        logging.Logger: The configured logger.
    """
    # prefer the instance's module name when an object was provided
    name = f"{cls_obj.__module__}" if cls_obj else logger_name
    logger = logging.getLogger(name)
    logger.setLevel(log_level)
    logger.addHandler(set_up_logging_handler(log_id=log_id))
    return logger
101 |
102 |
def set_up_logging_handler(log_id: bool = False) -> logging.StreamHandler:
    """Builds a stdout handler with the project's standard log format.

    When log_id is True the format additionally carries the accession ID
    (supplied by callers via the `accession_id` log-record attribute).
    """
    prefix = "%(asctime)s [%(threadName)s] [%(levelname)s] "
    if log_id:
        fmt = prefix + "[%(name)s] [%(accession_id)s]: %(message)s"
    else:
        fmt = prefix + "[%(name)s]: %(message)s"

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(fmt))
    return handler
117 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/efetch_b1_response_runs.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 | 4
8 | - <Summary><Title>454
9 | sequencing of Human HapMap individual NA18505 genomic paired-end
10 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics
11 | total_runs="10" total_spots="4703662" total_bases="1306798474"
12 | total_size="3205056622" load_done="true"
13 | static_data_available="true" cluster_name="public"/></Summary><Submitter
14 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan"
15 | lab_name=""/><Experiment acc="SRX000003" ver="10"
16 | status="public" name="454 sequencing of Human HapMap individual
17 | NA18505 genomic paired-end library"/><Study acc="SRP000001"
18 | name="Paired-end mapping reveals extensive structural variation in
19 | the human genome"/><Organism taxid="9606"
20 | ScientificName="Homo sapiens"/><Sample acc="SRS000100"
21 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME
22 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY
23 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION
24 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT
25 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample>
26 |
27 | - <Run acc="SRR000007"
28 | total_spots="633196" total_bases="175275395" load_done="true"
29 | is_public="true" cluster_name="public"
30 | static_data_available="true"/><Run acc="SRR000018"
31 | total_spots="626624" total_bases="174403220" load_done="true"
32 | is_public="true" cluster_name="public"
33 | static_data_available="true"/><Run acc="SRR000020"
34 | total_spots="374556" total_bases="103411232" load_done="true"
35 | is_public="true" cluster_name="public"
36 | static_data_available="true"/><Run acc="SRR000038"
37 | total_spots="529820" total_bases="148389031" load_done="true"
38 | is_public="true" cluster_name="public"
39 | static_data_available="true"/><Run acc="SRR000043"
40 | total_spots="608946" total_bases="168985392" load_done="true"
41 | is_public="true" cluster_name="public"
42 | static_data_available="true"/><Run acc="SRR000046"
43 | total_spots="79047" total_bases="21258857" load_done="true"
44 | is_public="true" cluster_name="public"
45 | static_data_available="true"/><Run acc="SRR000048"
46 | total_spots="640737" total_bases="177619279" load_done="true"
47 | is_public="true" cluster_name="public"
48 | static_data_available="true"/><Run acc="SRR000050"
49 | total_spots="547349" total_bases="153260655" load_done="true"
50 | is_public="true" cluster_name="public"
51 | static_data_available="true"/><Run acc="SRR000057"
52 | total_spots="76744" total_bases="21203932" load_done="true"
53 | is_public="true" cluster_name="public"
54 | static_data_available="true"/><Run acc="SRR000058"
55 | total_spots="586643" total_bases="162991481" load_done="true"
56 | is_public="true" cluster_name="public"
57 | static_data_available="true"/>
58 |
59 |
60 | - 2008/04/04
61 | - 2015/04/09
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/efetch_response_runs_single_item.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 | 4
8 | - <Summary><Title>454
9 | sequencing of Human HapMap individual NA18505 genomic paired-end
10 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics
11 | total_runs="10" total_spots="4703662" total_bases="1306798474"
12 | total_size="3205056622" load_done="true"
13 | static_data_available="true" cluster_name="public"/></Summary><Submitter
14 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan"
15 | lab_name=""/><Experiment acc="SRX000003" ver="10"
16 | status="public" name="454 sequencing of Human HapMap individual
17 | NA18505 genomic paired-end library"/><Study acc="SRP000001"
18 | name="Paired-end mapping reveals extensive structural variation in
19 | the human genome"/><Organism taxid="9606"
20 | ScientificName="Homo sapiens"/><Sample acc="SRS000100"
21 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME
22 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY
23 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION
24 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT
25 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample>
26 |
27 | - <Run acc="SRR000007"
28 | total_spots="633196" total_bases="175275395" load_done="true"
29 | is_public="true" cluster_name="public"
30 | static_data_available="true"/><Run acc="SRR000018"
31 | total_spots="626624" total_bases="174403220" load_done="true"
32 | is_public="true" cluster_name="public"
33 | static_data_available="true"/><Run acc="SRR000020"
34 | total_spots="374556" total_bases="103411232" load_done="true"
35 | is_public="true" cluster_name="public"
36 | static_data_available="true"/><Run acc="SRR000038"
37 | total_spots="529820" total_bases="148389031" load_done="true"
38 | is_public="true" cluster_name="public"
39 | static_data_available="true"/><Run acc="SRR000043"
40 | total_spots="608946" total_bases="168985392" load_done="true"
41 | is_public="true" cluster_name="public"
42 | static_data_available="true"/><Run acc="SRR000046"
43 | total_spots="79047" total_bases="21258857" load_done="true"
44 | is_public="true" cluster_name="public"
45 | static_data_available="true"/><Run acc="SRR000048"
46 | total_spots="640737" total_bases="177619279" load_done="true"
47 | is_public="true" cluster_name="public"
48 | static_data_available="true"/><Run acc="SRR000050"
49 | total_spots="547349" total_bases="153260655" load_done="true"
50 | is_public="true" cluster_name="public"
51 | static_data_available="true"/><Run acc="SRR000057"
52 | total_spots="76744" total_bases="21203932" load_done="true"
53 | is_public="true" cluster_name="public"
54 | static_data_available="true"/><Run acc="SRR000058"
55 | total_spots="586643" total_bases="162991481" load_done="true"
56 | is_public="true" cluster_name="public"
57 | static_data_available="true"/>
58 |
59 |
60 | - 2008/04/04
61 | - 2015/04/09
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/q2_fondue/tests/test_get_all.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 | import unittest
9 | from unittest.mock import ANY, Mock
10 |
11 | import pandas as pd
12 | from qiime2 import Artifact
13 |
14 | from q2_fondue.get_all import get_all
15 | from q2_fondue.tests.test_sequences import SequenceTests
16 |
17 |
class FakeCtx(Mock):
    """Stand-in for the QIIME 2 pipeline `ctx` object used by `get_all`.

    Pre-imports the ID/metadata artifacts from the given paths and wires
    mocked `get_metadata`/`get_sequences` actions returning them, so the
    pipeline can run without touching NCBI.
    """

    def __init__(self, ids_path, meta_path, failed_ids=None):
        super().__init__()
        self.ids = Artifact.import_data("NCBIAccessionIDs", ids_path)
        self.meta = Artifact.import_data("SRAMetadata", meta_path)
        # an empty failed-IDs artifact, returned by the metadata action
        self.failed_empty = Artifact.import_data("SRAFailedIDs", pd.DataFrame())
        if failed_ids:
            # simulate downloads that failed for the given run IDs
            self.failed = Artifact.import_data(
                "SRAFailedIDs",
                pd.DataFrame(
                    data={"Error message": ["Some error message" for _ in failed_ids]},
                    index=pd.Index(failed_ids, name="ID"),
                ),
            )
        else:
            self.failed = self.failed_empty

        # mocked plugin actions handed out by get_action below
        self.get_metadata = Mock(return_value=(self.meta, self.failed_empty))
        self.get_sequences = Mock(return_value=(Mock(), Mock(), self.failed))

    def get_action(self, plugin, action):
        # mimic ctx.get_action: resolve an action name to its (mocked) callable
        if action == "get_metadata":
            return self.get_metadata
        elif action == "get_sequences":
            return self.get_sequences
43 |
44 |
class TestGetAll(SequenceTests):
    """Tests for the `get_all` pipeline using a mocked QIIME 2 context."""

    package = "q2_fondue.tests"

    def test_get_all_single(self):
        """
        Test verifying that pipeline get_all calls all expected actions,
        individual actions are tested in details in respective test classes
        """
        mock_ctx = FakeCtx(
            ids_path=self.get_data_path("SRR123456_md.tsv"),
            meta_path=self.get_data_path("sra-metadata-mock.tsv"),
        )
        obs_meta, _, _, obs_failed = get_all(
            mock_ctx, mock_ctx.ids, "fake@email.com", retries=1
        )

        # both actions should be called exactly once with the given params
        mock_ctx.get_metadata.assert_called_once_with(
            mock_ctx.ids, "fake@email.com", 1, "INFO", None
        )
        mock_ctx.get_sequences.assert_called_once_with(
            ANY, "fake@email.com", 1, 1, "INFO"
        )

        # the run IDs passed to get_sequences come from the metadata's index
        run_ids = mock_ctx.get_sequences.call_args_list[0][0][0]
        run_ids = run_ids.view(pd.DataFrame).index.to_list()
        self.assertListEqual(run_ids, ["SRR123456"])

        self.assertEqual(obs_meta, mock_ctx.meta)
        self.assertEqual(obs_failed, mock_ctx.failed)

    def test_get_all_multi_with_missing_ids(self):
        """
        Test verifying that pipeline get_all calls all expected actions,
        individual actions are tested in details in respective test classes
        """
        mock_ctx = FakeCtx(
            ids_path=self.get_data_path("SRR1234567_md.tsv"),
            meta_path=self.get_data_path("sra-metadata-mock.tsv"),
            failed_ids=["SRR123457"],
        )
        obs_meta, _, _, obs_failed = get_all(
            mock_ctx, mock_ctx.ids, "fake@email.com", retries=1
        )

        mock_ctx.get_metadata.assert_called_once_with(
            mock_ctx.ids, "fake@email.com", 1, "INFO", None
        )
        mock_ctx.get_sequences.assert_called_once_with(
            ANY, "fake@email.com", 1, 1, "INFO"
        )

        run_ids = mock_ctx.get_sequences.call_args_list[0][0][0]
        run_ids = run_ids.view(pd.DataFrame).index.to_list()
        self.assertListEqual(run_ids, ["SRR123456"])

        self.assertEqual(obs_meta, mock_ctx.meta)
        # the failed download ID should be propagated to the pipeline output
        self.assertListEqual(
            obs_failed.view(pd.DataFrame).index.to_list(), ["SRR123457"]
        )
104 |
105 |
106 | if __name__ == "__main__":
107 | unittest.main()
108 |
--------------------------------------------------------------------------------
/q2_fondue/entrezpy_clients/_esearch.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | from typing import List, Union
10 |
11 | import pandas as pd
12 | from entrezpy.esearch.esearch_analyzer import EsearchAnalyzer
13 | from entrezpy.esearch.esearch_result import EsearchResult
14 |
15 |
class ESearchResult(EsearchResult):
    """Entrezpy client for ESearch utility used to search for or validate
    provided accession IDs.
    """

    def __init__(self, response, request):
        super().__init__(response, request)
        # pd.Series of per-ID hit counts, filled by parse_search_results
        self.result = None

    def validate_result(self) -> dict:
        """Validates hit counts obtained for all the provided UIDs.

        As the expected hit count for a valid SRA accession ID is 1, all the
        IDs with that value will be considered valid. UIDs with count higher
        than 1 will be considered 'ambiguous' as they could not be resolved
        to a single result. Likewise, UIDs with a count of 0 will be considered
        'invalid' as no result could be found for those.

        Returns:
            dict: Mapping of each problematic ID to an error message
                ('ID is ambiguous.' or 'ID is invalid.'); empty when every
                ID had exactly one hit. Note that no exception is raised
                here - problems are logged as a warning and returned.

        """
        # correct id should have count == 1
        leftover_ids = self.result[self.result != 1]
        if leftover_ids.shape[0] == 0:
            return {}
        ambiguous_ids = leftover_ids[leftover_ids > 0]
        invalid_ids = leftover_ids[leftover_ids == 0]

        error_msg = "Some of the IDs are invalid or ambiguous:"
        if ambiguous_ids.shape[0] > 0:
            error_msg += f'\n Ambiguous IDs: {", ".join(ambiguous_ids.index)}'
        if invalid_ids.shape[0] > 0:
            error_msg += f'\n Invalid IDs: {", ".join(invalid_ids.index)}'
        self.logger.warning(error_msg)
        return {
            **{_id: "ID is ambiguous." for _id in ambiguous_ids.index},
            **{_id: "ID is invalid." for _id in invalid_ids.index},
        }

    def parse_search_results(self, response, uids: Union[List[str], None]):
        """Parses response received from Esearch as a pandas Series object.

        Hit counts obtained in the response will be extracted and assigned to
        their respective query IDs. IDs not found in the results but present
        in the UIDs list will get a count of 0. The Series is stored in
        ``self.result`` (nothing is returned).

        Args:
            response (): Response received from Esearch.
            uids (List[str]): List of original UIDs that were submitted
                as a query.

        """
        # no translation stack at all means none of the IDs produced a hit
        translation_stack = response["esearchresult"].get("translationstack")
        if not translation_stack:
            self.result = pd.Series({x: 0 for x in uids}, name="count")
            return

        # filter out only positive hits; the stack mixes dicts (terms)
        # with operator strings like 'OR', which we skip
        found_terms = [x for x in translation_stack if isinstance(x, dict)]
        found_terms = {
            x["term"].replace("[All Fields]", ""): int(x["count"]) for x in found_terms
        }

        # find ids that are missing and give them a count of 0
        if uids:
            missing_ids = [x for x in uids if x not in found_terms.keys()]
            missing_ids = {x: 0 for x in missing_ids}
            found_terms.update(missing_ids)

        self.result = pd.Series(found_terms, name="count")
88 |
89 |
class ESearchAnalyzer(EsearchAnalyzer):
    """Analyzer producing our custom ESearchResult from ESearch responses."""

    def __init__(self, uids):
        super().__init__()
        # original query UIDs; needed so missing IDs can be given a 0 count
        self.uids = uids

    # override the base method to use our own ESResult
    def init_result(self, response, request):
        if not self.result:
            self.result = ESearchResult(response, request)
            return True
        return False

    # override the base method to additionally parse the result
    def analyze_result(self, response, request):
        super().analyze_result(response, request)
        self.result.parse_search_results(response, self.uids)
106 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/metadata_processed_multi.json:
--------------------------------------------------------------------------------
1 | {
2 | "FAKEID1": {
3 | "Experiment ID": "ERX3980916",
4 | "Biosample ID": "SAMEA6608408",
5 | "Bioproject ID": "PRJEB37054",
6 | "Study ID": "ERP120343",
7 | "Sample Accession": "ERS4372624",
8 | "Organism": "Vitis vinifera",
9 | "Library Source": "METAGENOMIC",
10 | "Library Selection": "PCR",
11 | "Library Layout": "SINGLE",
12 | "Instrument": "Illumina MiSeq",
13 | "Platform": "ILLUMINA",
14 | "Bases": "11552099",
15 | "Spots": "39323",
16 | "Avg Spot Len": "293",
17 | "Bytes": "3914295",
18 | "Public": "True",
19 | "Ena-first-public [run]": "2020-05-31",
20 | "Ena-first-public [sample]": "2020-05-31",
21 | "Ena-first-public [study]": "2020-05-31",
22 | "Ena-last-update [run]": "2020-03-06",
23 | "Ena-last-update [sample]": "2020-03-06",
24 | "Ena-last-update [study]": "2020-03-04",
25 | "Amount or size of sample collected [sample]": "50",
26 | "Collection date [sample]": "2015-09-28",
27 | "Collection day [sample]": "1",
28 | "Collection hours [sample]": "0",
29 | "Environment (biome) [sample]": "berry plant",
30 | "Environment (feature) [sample]": "grape plant",
31 | "Environment (material) [sample]": "wine must",
32 | "Geographic location (country and/or sea) [sample]": "Germany",
33 | "Geographic location (latitude) [sample]": "48.71 N",
34 | "Geographic location (longitude) [sample]": "9.12 E",
35 | "Investigation type [sample]": "metagenome",
36 | "Multiplex identifiers [sample]": "TAGATCGCTCGCCTTA",
37 | "Pcr primers [sample]": "GTGCCAGCMGCCGCGGTAAGGACTACHVGGGTWTCTAAT",
38 | "Plant-associated environmental package [sample]": "plant-associated",
39 | "Project name [sample]": "wine must microbiota analysis during fermentation",
40 | "Sample storage temperature [sample]": "-80",
41 | "Sample volume or weight for dna extraction [sample]": "0.5",
42 | "Sequencing method [sample]": "Illumina MiSeq",
43 | "Subspecific genetic lineage [sample]": "Bacchus1",
44 | "Target subfragment [sample]": "16S rRNA gene",
45 | "Library Name": "unspecified",
46 | "Name": "BAC1.D1.0.32A",
47 | "Center Name": "University of Hohenheim",
48 | "Title": "Vitis vinifera",
49 | "Tax ID": "29760"
50 | },
51 | "FAKEID2": {
52 | "Experiment ID": "ERX3980917",
53 | "Biosample ID": "SAMEA6608409",
54 | "Bioproject ID": "PRJEB37054",
55 | "Study ID": "ERP120343",
56 | "Sample Accession": "ERS4372625",
57 | "Organism": "Vitis vinifera",
58 | "Library Source": "METAGENOMIC",
59 | "Library Selection": "PCR",
60 | "Library Layout": "SINGLE",
61 | "Instrument": "Illumina MiSeq",
62 | "Platform": "ILLUMINA",
63 | "Bases": "17523267",
64 | "Spots": "59799",
65 | "Avg Spot Len": "293",
66 | "Bytes": "5879896",
67 | "Public": "True",
68 | "Ena-first-public [run]": "2020-05-31",
69 | "Ena-first-public [sample]": "2020-05-31",
70 | "Ena-first-public [study]": "2020-05-31",
71 | "Ena-last-update [run]": "2020-03-06",
72 | "Ena-last-update [sample]": "2020-03-06",
73 | "Ena-last-update [study]": "2020-03-04",
74 | "Amount or size of sample collected [sample]": "50",
75 | "Collection date [sample]": "2015-09-28",
76 | "Collection day [sample]": "1",
77 | "Collection hours [sample]": "2",
78 | "Environment (biome) [sample]": "berry plant",
79 | "Environment (feature) [sample]": "grape plant",
80 | "Environment (material) [sample]": "wine must",
81 | "Geographic location (country and/or sea) [sample]": "Germany",
82 | "Geographic location (latitude) [sample]": "48.71 N",
83 | "Geographic location (longitude) [sample]": "9.12 E",
84 | "Investigation type [sample]": "metagenome",
85 | "Multiplex identifiers [sample]": "CTCTCTATTCGCCTTA",
86 | "Pcr primers [sample]": "GTGCCAGCMGCCGCGGTAAGGACTACHVGGGTWTCTAAT",
87 | "Plant-associated environmental package [sample]": "plant-associated",
88 | "Project name [sample]": "wine must microbiota analysis during fermentation",
89 | "Sample storage temperature [sample]": "-80",
90 | "Sample volume or weight for dna extraction [sample]": "0.5",
91 | "Sequencing method [sample]": "Illumina MiSeq",
92 | "Subspecific genetic lineage [sample]": "Bacchus1",
93 | "Target subfragment [sample]": "16S rRNA gene",
94 | "Library Name": "unspecified",
95 | "Name": "BAC1.D1.1.33A",
96 | "Center Name": "University of Hohenheim",
97 | "Title": "Vitis vinifera",
98 | "Tax ID": "29760"
99 | }
100 | }
--------------------------------------------------------------------------------
/q2_fondue/entrezpy_clients/_pipelines.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 | from typing import Union
9 |
10 | from entrezpy import conduit as ec
11 |
12 | from entrezpy.elink.elink_analyzer import ElinkAnalyzer
13 |
14 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer
15 | from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer
16 | from q2_fondue.entrezpy_clients._utils import set_up_entrezpy_logging
17 |
18 | import entrezpy.esearch.esearcher as searcher
19 |
20 | from q2_fondue.utils import _chunker
21 |
# Number of IDs submitted per ELink/EFetch request; IDs are chunked into
# batches of this size to keep each request within NCBI-recommended limits.
BATCH_SIZE = 500
23 |
24 |
def _get_run_ids(
    email: str,
    n_jobs: int,
    ids: Union[list, None],
    query: Union[str, None],
    source: str,
    log_level: str,
) -> list:
    """Pipeline to retrieve run IDs associated with BioSample query
    (provided in `query`) or other aggregate IDs like studies
    (`source`='study'), bioprojects (`source`='bioproject'), samples
    (`source`='sample') or experiments (`source`='experiment')
    provided in `ids`.

    Args:
        email (str): User email.
        n_jobs (int): Number of jobs.
        ids (list): List of study, bioproject, sample or experiment IDs.
        query (str): Search query to find IDs by.
        source (str): Type of IDs provided ('study', 'bioproject',
            'sample' or 'experiment').
        log_level (str): The log level to set.

    Returns:
        list: Run IDs associated with provided ids.
    """
    # search either for the explicit IDs (joined with OR) or the free-text query
    term = " OR ".join(ids) if ids else query

    # create pipeline to fetch all run IDs;
    # only non-SRA databases need an ELink step to hop over to SRA
    # NOTE(review): the docstring lists 'study'/'sample'/'experiment' sources,
    # but this branch checks 'biosample' - confirm the supported source values.
    elink = True
    if source == "bioproject":
        db = "bioproject"
    elif source == "biosample":
        db = "biosample"
    else:
        db = "sra"
        elink = False

    # find UIDS based on a query;
    # instead of saving the result on the history server
    # we will store all the UIDs recovered based on the
    # search query and use those in the mini-pipeline below;
    # this way we are not limited by ELink only accepting up to
    # who knows how many IDs and erroring out if we provide too
    # many (which could be the case e.g.: when we ask for more
    # than 10000 BioProject IDs or the text query returns more
    # than 10000 IDs presumably)
    esearcher = searcher.Esearcher(
        "esearcher", email, apikey=None, apikey_var=None, threads=n_jobs, qid=None
    )
    esearch_response = esearcher.inquire(
        {"db": db, "term": term, "usehistory": False, "rettype": "json"},
        analyzer=ESearchAnalyzer(ids),
    )

    # use the UIDs to link to other DBs and fetch related records;
    # we won't be using multi-threading here as this shouldn't take
    # long (we're only fetching IDs) and we don't want those dead
    # threads afterwards
    econduit = ec.Conduit(email=email, threads=0)
    set_up_entrezpy_logging(econduit, log_level)
    run_ids_pipeline = econduit.new_pipeline()

    # create a pipeline to link and fetch the run IDs;
    # we process the IDs obtained from the previous step in batches
    # as ELink cannot handle more than a certain amount of IDs
    # at the same time (recommended by NCBI)
    for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE):
        if elink:
            el = run_ids_pipeline.add_link(
                {"db": "sra", "dbfrom": db, "id": _ids, "link": False},
                analyzer=ElinkAnalyzer(),
            )
        else:
            el = None

        # given SRA run IDs, fetch all metadata
        efetch_params = {
            "rettype": "docsum",
            "retmode": "xml",
            "reqsize": BATCH_SIZE,
            "retmax": len(_ids),
        }
        if not elink:
            # we need to specify these manually as in this scenario
            # EFetch is not linked to anything
            efetch_params.update({"id": _ids, "db": db})

        # when elink is set, EFetch consumes the ELink step's output (el)
        run_ids_pipeline.add_fetch(
            efetch_params, analyzer=EFetchAnalyzer(log_level), dependency=el
        )

    econduit.run(run_ids_pipeline)

    # recover run IDs from all instances of EFetchAnalyzer
    # (one analyzer per batch submitted above)
    all_run_ids = []
    for x in econduit.analyzers.values():
        if isinstance(x, EFetchAnalyzer):
            all_run_ids.extend(x.result.metadata)

    return sorted(all_run_ids)
126 |
--------------------------------------------------------------------------------
/q2_fondue/types/_format.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import itertools
10 |
11 | import pandas as pd
12 | from qiime2.plugin import ValidationError
13 | from qiime2.plugin import model
14 | from qiime2.metadata.base import is_id_header, FORMATTED_ID_HEADERS
15 | from q2_fondue.entrezpy_clients._utils import PREFIX
16 |
17 |
class SRAMetadataFormat(model.TextFileFormat):
    """TSV file format for SRA run metadata fetched by q2-fondue."""

    # ID columns that must be present and non-empty for every sample.
    REQUIRED_IDS = [
        "ID",
        "Biosample ID",
        "Bioproject ID",
        "Experiment ID",
        "Study ID",
        "Sample Accession",
    ]
    # All columns required in the header (the ID columns included).
    REQUIRED_HEADER_FIELDS = [
        "Organism",
        "Instrument",
        "Platform",
        "Bases",
        "Bytes",
        "Public",
        "Library Selection",
        "Library Source",
        "Library Layout",
    ] + REQUIRED_IDS

    def _validate(self):
        meta = pd.read_csv(str(self), sep="\t")

        absent = []
        for field in self.REQUIRED_HEADER_FIELDS:
            if field not in meta.columns:
                absent.append(field)
        if absent:
            raise ValidationError(
                "Some required columns are missing from the metadata file: "
                f'{", ".join(absent)}.'
            )

        # every sample must carry a value in each of the required ID columns
        nan_counts = meta[self.REQUIRED_IDS].isnull().sum(axis=0)
        incomplete = nan_counts[nan_counts > 0].index.tolist()
        if incomplete:
            raise ValidationError(
                "Some samples are missing IDs in the following fields: "
                f'{", ".join(incomplete)}.'
            )

    def _validate_(self, level):
        self._validate()
62 |
63 |
# Directory format wrapping a single `sra-metadata.tsv` file.
SRAMetadataDirFmt = model.SingleFileDirectoryFormat(
    "SRAMetadataDirFmt", "sra-metadata.tsv", SRAMetadataFormat
)
67 |
68 |
class SRAFailedIDsFormat(model.TextFileFormat):
    """
    This is a "fake" format only used to store a list of failed SRA IDs,
    which can be converted to QIIME's metadata and input into any fondue
    action.
    """

    def _validate_(self, level):
        # the file is indexed by run ID and may hold at most one column
        # (the error message for each failed run)
        failed = pd.read_csv(str(self), sep="\t", index_col=0)
        n_cols = failed.shape[1]

        if n_cols > 1:
            raise ValidationError(
                "Failed IDs artifact should only contain a single column "
                "with error message for the runs that could not be fetched "
                "(indexed by run ID)."
            )
85 |
86 |
# Directory format wrapping a single `sra-failed-ids.tsv` file.
SRAFailedIDsDirFmt = model.SingleFileDirectoryFormat(
    "SRAFailedIDsDirFmt", "sra-failed-ids.tsv", SRAFailedIDsFormat
)
90 |
91 |
class NCBIAccessionIDsFormat(model.TextFileFormat):
    """
    This is a format used to store a list of SRA accession IDs (run,
    study, BioProject, sample and experiment IDs), which can be converted
    to QIIME's metadata. Artifacts containing of run, study and BioProject
    IDs can be input into any fondue action.
    """

    # All accepted accession prefixes flattened into one tuple so it can
    # be handed directly to str.startswith.
    ALLOWED_PREFIXES = tuple(
        itertools.chain(
            *[
                v
                for k, v in PREFIX.items()
                if k in ("bioproject", "run", "study", "sample", "experiment")
            ]
        )
    )

    def _validate_id(self, _id: str):
        """Raise a ValidationError for any unrecognized accession ID.

        Non-string cells (e.g. NaN produced by read_csv for empty rows)
        are reported as invalid IDs instead of crashing with an
        AttributeError on `startswith`.
        """
        if not isinstance(_id, str) or not _id.startswith(self.ALLOWED_PREFIXES):
            raise ValidationError(
                "Some of the provided IDs are invalid - only SRA run, study, "
                "BioProject, sample and experiment IDs are allowed. Please "
                "check your input and try again."
            )

    def _validate_(self, level):
        """Validate column layout, ID header and every ID in column one."""
        df = pd.read_csv(str(self), sep="\t")
        cols = df.columns.tolist()

        # at most two columns allowed: the IDs plus an optional DOI column
        if df.shape[1] > 2 or (
            df.shape[1] == 2 and not any(x in cols for x in ["doi", "DOI"])
        ):
            raise ValidationError(
                "NCBI Accession IDs artifact should only contain a single "
                "column with IDs of the SRA runs, studies or NCBI's "
                "BioProjects and an optional column `doi` with "
                "associated DOIs."
            )

        # check that there is a valid ID header:
        if not any([is_id_header(x) for x in cols]):
            raise ValidationError(
                f"NCBI Accession IDs artifact must contain a valid "
                f"ID header from {FORMATTED_ID_HEADERS}."
            )

        # IDs are expected in the first column regardless of DOI position
        df.iloc[:, 0].apply(self._validate_id)
140 |
141 |
# Directory format wrapping a single `ncbi-accession-ids.tsv` file.
NCBIAccessionIDsDirFmt = model.SingleFileDirectoryFormat(
    "NCBIAccessionIDsDirFmt", "ncbi-accession-ids.tsv", NCBIAccessionIDsFormat
)
145 |
--------------------------------------------------------------------------------
/q2_fondue/tests/test_esearch.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import unittest
10 | from unittest.mock import MagicMock
11 |
12 | import pandas as pd
13 | from q2_fondue.entrezpy_clients._esearch import ESearchResult, ESearchAnalyzer
14 | from q2_fondue.tests._utils import _TestPluginWithEntrezFakeComponents
15 |
16 |
class FakeESAnalyzer:
    """Minimal ESearch-analyzer stand-in exposing only the attributes
    that the code under test inspects."""

    def __init__(self, uids):
        self.log_level = "INFO"
        self.uids = uids
        # fake result object carrying a fixed per-ID count series
        fake_result = MagicMock()
        fake_result.result = pd.Series([6, 6], index=["ABC", "123"])
        self.result = fake_result
23 |
24 |
class TestEsearchClients(_TestPluginWithEntrezFakeComponents):
    """Tests for ESearch result parsing and validation."""

    package = "q2_fondue.tests"

    def _parse_and_compare(self, kind, suffix, ids, counts):
        # Feed a stored ESearch JSON response into a fresh result object
        # and verify the per-ID count series it produces.
        es_result = self.generate_es_result(kind, suffix)
        es_result.parse_search_results(self.json_to_response(kind, suffix), ids)
        expected = pd.Series(data=counts, index=ids, name="count")
        pd.testing.assert_series_equal(expected, es_result.result)

    def _validate_and_compare(self, kind, suffix, ids, counts, expected_errors):
        # Inject a known count series and verify the validation summary.
        es_result = self.generate_es_result(kind, suffix)
        es_result.result = pd.Series(data=counts, index=ids, name="count")
        self.assertDictEqual(es_result.validate_result(), expected_errors)

    def test_esresult_parse_search_results(self):
        self._parse_and_compare("single", "_correct", ["SRR000001"], [1])

    def test_esresult_parse_search_results_ambiguous(self):
        self._parse_and_compare("single", "_ambiguous", ["SR012"], [7])

    def test_esresult_parse_search_results_multi(self):
        self._parse_and_compare(
            "multi", "_correct", ["SRR000001", "SRR000013", "ERR3978173"], [1, 1, 1]
        )

    def test_esresult_parse_search_results_multi_invalid(self):
        self._parse_and_compare("multi", "_invalid", ["ABCD123", "SRR001"], [0, 0])

    def test_esresult_parse_search_results_multi_mixed(self):
        self._parse_and_compare(
            "multi",
            "_mixed",
            ["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"],
            [1, 1, 7, 0, 0],
        )

    def test_esresult_validate_result_single(self):
        self._validate_and_compare("single", "_correct", ["SRR000001"], [1], {})

    def test_esresult_validate_result_single_ambiguous(self):
        self._validate_and_compare(
            "single", "_ambiguous", ["SR012"], [7], {"SR012": "ID is ambiguous."}
        )

    def test_esresult_validate_result_multi(self):
        self._validate_and_compare(
            "multi",
            "_correct",
            ["SRR000001", "SRR000013", "ERR3978173"],
            [1, 1, 1],
            {},
        )

    def test_esresult_validate_result_multi_invalid(self):
        self._validate_and_compare(
            "multi",
            "_invalid",
            ["ABCD123", "SRR001"],
            [0, 0],
            {"ABCD123": "ID is invalid.", "SRR001": "ID is invalid."},
        )

    def test_esresult_validate_result_multi_mixed(self):
        self._validate_and_compare(
            "multi",
            "_mixed",
            ["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"],
            [1, 1, 7, 0, 0],
            {
                "SR012": "ID is ambiguous.",
                "ABCD123": "ID is invalid.",
                "SRR001": "ID is invalid.",
            },
        )

    def test_esanalyzer_analyze_result(self):
        analyzer = ESearchAnalyzer(["SRR000001"])
        analyzer.analyze_result(
            response=self.json_to_response("single", "_correct"),
            request=self.generate_es_request("SRR000001"),
        )

        self.assertTrue(isinstance(analyzer.result, ESearchResult))
144 |
145 |
# allow running this test module directly (outside of pytest)
if __name__ == "__main__":
    unittest.main()
148 |
--------------------------------------------------------------------------------
/.github/workflows/docker-push.yaml:
--------------------------------------------------------------------------------
# Push Docker images built by the "CI" workflow to quay.io. Triggered via
# `workflow_run` so it runs with access to repository secrets; all inputs
# are artifacts uploaded by the triggering CI run.
name: Docker push

on:
  workflow_run:
    workflows: ["CI"]
    types:
      - completed

jobs:
  push-docker-images:
    runs-on: ubuntu-latest
    # only run when the triggering CI workflow finished successfully
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    steps:
      # Fetch the 'build-metadata' artifact from the triggering CI run;
      # without it there is nothing to push and later steps are skipped.
      - name: Download build metadata
        uses: actions/github-script@v7
        with:
          script: |
            // Get artifacts from the triggering workflow run
            const artifacts = await github.rest.actions.listWorkflowRunArtifacts({
              owner: context.repo.owner,
              repo: context.repo.repo,
              run_id: context.payload.workflow_run.id,
            });

            // Find build metadata artifact
            const metadataArtifact = artifacts.data.artifacts.find(
              artifact => artifact.name === 'build-metadata'
            );

            if (!metadataArtifact) {
              console.log('No build metadata found, skipping Docker push');
              return;
            }

            // Download the metadata
            const download = await github.rest.actions.downloadArtifact({
              owner: context.repo.owner,
              repo: context.repo.repo,
              artifact_id: metadataArtifact.id,
              archive_format: 'zip',
            });

            const fs = require('fs');
            fs.writeFileSync('metadata.zip', Buffer.from(download.data));

      # Unpack build-metadata.json and expose its fields as step outputs;
      # sets has-metadata=false when the artifact is absent so that all
      # later steps are skipped.
      - name: Extract and parse metadata
        id: metadata
        run: |
          if [ -f "metadata.zip" ]; then
            unzip metadata.zip
            if [ -f "build-metadata.json" ]; then
              # Parse JSON and set outputs
              echo "repository=$(jq -r '.repository' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "sha=$(jq -r '.sha' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "short-sha=$(jq -r '.short_sha' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "epoch=$(jq -r '.epoch' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "ref=$(jq -r '.ref' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "event-name=$(jq -r '.event_name' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "pr-number=$(jq -r '.pr_number' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "tag-name=$(jq -r '.tag_name' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "is-tag-push=$(jq -r '.is_tag_push' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "build-pr-image=$(jq -r '.build_pr_image' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "is-main-push=$(jq -r '.is_main_push' build-metadata.json)" >> $GITHUB_OUTPUT
              echo "has-metadata=true" >> $GITHUB_OUTPUT

              # Display metadata for debugging
              echo "Build metadata:"
              cat build-metadata.json | jq .
            else
              echo "has-metadata=false" >> $GITHUB_OUTPUT
            fi
          else
            echo "has-metadata=false" >> $GITHUB_OUTPUT
          fi

      - name: Set up Docker Buildx
        if: steps.metadata.outputs.has-metadata == 'true'
        uses: docker/setup-buildx-action@v3

      - name: Login to the remote registry
        if: steps.metadata.outputs.has-metadata == 'true'
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Test images are built for PRs (and opt-in branches); fetched from
      # the CI run's artifacts rather than rebuilt here.
      - name: Download test image artifact
        if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.build-pr-image == 'true'
        uses: actions/download-artifact@v4
        with:
          name: test-docker-image
          path: .
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Load and push test image
        if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.build-pr-image == 'true'
        run: |
          if [ -f "test-image.tar.gz" ]; then
            # Load the image
            docker load < test-image.tar.gz

            # Determine the tag based on event type
            if [ "${{ steps.metadata.outputs.event-name }}" = "pull_request" ]; then
              TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:pr-${{ steps.metadata.outputs.pr-number }}-${{ steps.metadata.outputs.short-sha }}"
            else
              TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:test-${{ steps.metadata.outputs.short-sha }}"
            fi

            # Re-tag and push
            docker tag ${{ steps.metadata.outputs.sha }} "$TAG"
            docker push "$TAG"
            echo "Pushed test image: $TAG"
          else
            echo "No test image artifact found"
          fi

      # Production images are only pushed for main-branch (or tag) pushes.
      - name: Download production image artifact
        if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.is-main-push == 'true'
        uses: actions/download-artifact@v4
        with:
          name: prod-docker-image
          path: .
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Load and push production image
        if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.is-main-push == 'true'
        run: |
          if [ -f "prod-image.tar" ]; then
            # Load the image
            docker load < prod-image.tar

            # Determine the tag based on whether this is a tag push or main branch push
            if [ "${{ steps.metadata.outputs.is-tag-push }}" = "true" ]; then
              # For tag pushes, use just the tag name (no hash suffix)
              TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:${{ steps.metadata.outputs.tag-name }}"
            else
              # For main branch pushes, use epoch + hash
              TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:${{ steps.metadata.outputs.epoch }}-${{ steps.metadata.outputs.short-sha }}"
            fi

            docker tag temp-prod-image "$TAG"
            docker push "$TAG"
            echo "Pushed production image: $TAG"
          else
            echo "No production image artifact found"
          fi
--------------------------------------------------------------------------------
/q2_fondue/tests/_utils.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import io
10 | import json
11 | import logging
12 |
13 | import pandas as pd
14 | from entrezpy.efetch.efetch_request import EfetchRequest
15 | from entrezpy.esearch.esearch_request import EsearchRequest
16 | from qiime2.plugin.testing import TestPluginBase
17 |
18 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer, EFetchResult
19 | from q2_fondue.entrezpy_clients._esearch import ESearchResult
20 | from q2_fondue.entrezpy_clients._sra_meta import (
21 | SRAStudy,
22 | SRASample,
23 | SRAExperiment,
24 | LibraryMetadata,
25 | SRARun,
26 | )
27 |
28 |
class FakeParams:
    """Parameter bag mimicking the entrezpy request-parameter objects
    expected by EfetchRequest/EsearchRequest in tests."""

    def __init__(
        self,
        temp_dir,
        uids=None,
        term=None,
        eutil="efetch.cgi",
        rettype="xml",
        retmode="xml",
    ):
        # values that individual tests may override via the constructor
        self.temp_dir = temp_dir
        self.uids = uids
        self.term = term
        self.eutil = eutil
        self.rettype = rettype
        self.retmode = retmode

        # fixed values shared by every fake request
        self.query_id = "some-id-123"
        self.usehistory = False
        self.holding = False
        self.db = "sra"
        self.dbfrom = "sra"
        self.querykey = 0
        self.retstart = 0
        self.retmax = 0

        # remaining entrezpy parameters default to "unset"
        for attr in (
            "cmd",
            "linkname",
            "doseq",
            "webenv",
            "idtype",
            "datetype",
            "reldate",
            "mindate",
            "maxdate",
            "strand",
            "sort",
            "field",
            "seqstart",
            "seqstop",
            "complexity",
        ):
            setattr(self, attr, None)
68 |
69 |
class _TestPluginWithEntrezFakeComponents(TestPluginBase):
    """Shared test base providing fake entrezpy components and fixtures.

    Loads canned ESearch/EFetch responses from the test data directory and
    offers helpers to build fake requests, results and SRA metadata objects.
    """

    def setUp(self):
        """Prepare fake EFetch results/analyzer and shared fixtures."""
        super().setUp()
        self.efetch_result_single = self.generate_ef_result("single")
        self.efetch_result_multi = self.generate_ef_result("multi")
        self.efetch_analyzer = EFetchAnalyzer(log_level="INFO")
        # property names expected on the generated fake requests
        self.efetch_request_properties = {
            "db",
            "eutil",
            "uids",
            "webenv",
            "querykey",
            "rettype",
            "retmode",
            "strand",
            "seqstart",
            "seqstop",
            "complexity",
        }
        self.esearch_request_properties = {"db", "eutil", "webenv", "retmode", "term"}
        self.library_meta = LibraryMetadata(
            name="unspecified", layout="SINGLE", selection="PCR", source="METAGENOMIC"
        )
        with open(self.get_data_path("metadata_response_small.json"), "r") as ff:
            self.metadata_dict = json.load(ff)
        self.maxDiff = None
        self.fake_logger = logging.getLogger("test_log")

    def xml_to_response(self, kind, suffix="", prefix="metadata"):
        """Open a stored XML response fixture as an unbuffered binary stream."""
        path = self.get_data_path(f"{prefix}_response_{kind}{suffix}.xml")
        response = io.open(path, "rb", buffering=0)
        return response

    def json_to_response(self, kind, suffix="", raw=False, utility="esearch"):
        """Return a stored JSON response fixture.

        With raw=True the open binary stream is returned; otherwise the
        file content is parsed into a dict.
        """
        path = self.get_data_path(f"{utility}_response_{kind}{suffix}.json")
        response = io.open(path, "rb", buffering=0)
        if raw:
            return response
        else:
            return json.loads(io.open(path, "rb", buffering=0).read())

    def generate_ef_request(self, uids, start=0, size=1):
        """Build a fake EfetchRequest for the given UIDs."""
        request_params = FakeParams(self.temp_dir.name, uids=uids)
        return EfetchRequest(
            eutil="efetch.fcgi", parameter=request_params, start=start, size=size
        )

    def generate_ef_result(self, kind, prefix="metadata"):
        """Build an EFetchResult from a stored XML response fixture."""
        return EFetchResult(
            response=self.xml_to_response(kind, prefix=prefix),
            request=self.generate_ef_request(["FAKEID1", "FAKEID2"]),
            log_level="INFO",
        )

    def generate_sra_metadata(self):
        """Construct a linked study/sample/experiment/runs fixture.

        Returns a tuple (study, sample, experiment, runs) with fixed IDs
        and custom metadata matching the stored response fixtures.
        """
        study_id, sample_id = "ERP120343", "ERS4372624"
        experiment_id, run_ids = "ERX3980916", ["FAKEID1", "FAKEID2"]
        study = SRAStudy(
            id=study_id,
            bioproject_id="PRJEB37054",
            center_name="University of Hohenheim",
            custom_meta={
                "ENA-FIRST-PUBLIC [STUDY]": "2020-05-31",
                "ENA-LAST-UPDATE [STUDY]": "2020-03-04",
            },
        )
        sample = SRASample(
            id=sample_id,
            biosample_id="SAMEA6608408",
            name="BAC1.D1.0.32A",
            title="Vitis vinifera",
            organism="Vitis vinifera",
            tax_id="29760",
            study_id=study_id,
            custom_meta={
                "environment (biome) [SAMPLE]": "berry plant",
                "geographic location (country and/or sea) [SAMPLE]": "Germany",
                "sample storage temperature [SAMPLE]": "-80",
            },
        )
        experiment = SRAExperiment(
            id=experiment_id,
            instrument="Illumina MiSeq",
            platform="ILLUMINA",
            library=self.library_meta,
            sample_id=sample_id,
            custom_meta={"Temperature [EXPERIMENT]": "12", "Depth [EXPERIMENT]": "500"},
        )
        runs = [
            SRARun(
                id=_id,
                bases=11552099,
                spots=39323,
                public=True,
                bytes=3914295,
                experiment_id=experiment_id,
                custom_meta={
                    "ENA-FIRST-PUBLIC [RUN]": "2020-05-31",
                    "ENA-LAST-UPDATE [RUN]": "2020-03-06",
                },
            )
            for _id in run_ids
        ]
        return study, sample, experiment, runs

    def generate_expected_df(self):
        """Load the expected processed-metadata DataFrame fixture.

        Casts 'Public' to bool and the numeric custom-metadata columns to
        str to match the types produced by the metadata pipeline.
        """
        exp_df = pd.read_json(
            path_or_buf=self.get_data_path("metadata_processed_multi.json"),
            orient="index",
        )
        exp_df.index.name = "ID"
        numeric_cols = {
            "Amount or size of sample collected [sample]",
            "Collection day [sample]",
            "Collection hours [sample]",
            "Sample storage temperature [sample]",
            "Tax ID",
            "Sample volume or weight for dna extraction [sample]",
        }
        exp_df["Public"] = exp_df["Public"].astype(bool)
        for col in numeric_cols:
            exp_df[col] = exp_df[col].astype(str)
        return exp_df

    def generate_es_request(self, term, start=0, size=1):
        """Build a fake EsearchRequest for the given search term."""
        request_params = FakeParams(
            self.temp_dir.name, retmode="json", term=term, eutil="esearch.fcgi"
        )
        return EsearchRequest(
            eutil="esearch.fcgi", parameter=request_params, start=start, size=size
        )

    def generate_es_result(self, kind, suffix):
        """Build an ESearchResult from a stored JSON response fixture."""
        return ESearchResult(
            response=self.json_to_response(kind, suffix, utility="esearch")[
                "esearchresult"
            ],
            request=self.generate_es_request(term="abc OR 123"),
        )
209 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/efetch_response_runs.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 | 13481774
8 | - <Summary><Title>18</Title><Platform
9 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics
10 | total_runs="1" total_spots="63703" total_bases="38349206"
11 | total_size="22735317" load_done="true" cluster_name="public"/></Summary><Submitter
12 | acc="SRA1206349" center_name="Jiangxi Agricultural University"
13 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory
14 | of Animal Nutritio"/><Experiment acc="SRX10339760" ver="1"
15 | status="public" name="18"/><Study acc="SRP310597" name="PRJNA
16 | Chuanzhong black lamb Raw sequence reads"/><Organism
17 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample
18 | acc="SRS8459117" name=""/><Instrument ILLUMINA="Illumina
19 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>18</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT>
20 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309312</Biosample>
21 |
22 | - <Run acc="SRR13961771"
23 | total_spots="63703" total_bases="38349206" load_done="true"
24 | is_public="true" cluster_name="public"
25 | static_data_available="true"/>
26 |
27 |
28 | - 2021/03/17
29 | - 2021/03/15
30 |
31 |
32 |
33 | 4
34 | - <Summary><Title>454
35 | sequencing of Human HapMap individual NA18505 genomic paired-end
36 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics
37 | total_runs="10" total_spots="4703662" total_bases="1306798474"
38 | total_size="3205056622" load_done="true"
39 | static_data_available="true" cluster_name="public"/></Summary><Submitter
40 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan"
41 | lab_name=""/><Experiment acc="SRX000003" ver="10"
42 | status="public" name="454 sequencing of Human HapMap individual
43 | NA18505 genomic paired-end library"/><Study acc="SRP000001"
44 | name="Paired-end mapping reveals extensive structural variation in
45 | the human genome"/><Organism taxid="9606"
46 | ScientificName="Homo sapiens"/><Sample acc="SRS000100"
47 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME
48 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY
49 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION
50 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT
51 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample>
52 |
53 | - <Run acc="SRR000007"
54 | total_spots="633196" total_bases="175275395" load_done="true"
55 | is_public="true" cluster_name="public"
56 | static_data_available="true"/><Run acc="SRR000018"
57 | total_spots="626624" total_bases="174403220" load_done="true"
58 | is_public="true" cluster_name="public"
59 | static_data_available="true"/><Run acc="SRR000020"
60 | total_spots="374556" total_bases="103411232" load_done="true"
61 | is_public="true" cluster_name="public"
62 | static_data_available="true"/><Run acc="SRR000038"
63 | total_spots="529820" total_bases="148389031" load_done="true"
64 | is_public="true" cluster_name="public"
65 | static_data_available="true"/><Run acc="SRR000043"
66 | total_spots="608946" total_bases="168985392" load_done="true"
67 | is_public="true" cluster_name="public"
68 | static_data_available="true"/><Run acc="SRR000046"
69 | total_spots="79047" total_bases="21258857" load_done="true"
70 | is_public="true" cluster_name="public"
71 | static_data_available="true"/><Run acc="SRR000048"
72 | total_spots="640737" total_bases="177619279" load_done="true"
73 | is_public="true" cluster_name="public"
74 | static_data_available="true"/><Run acc="SRR000050"
75 | total_spots="547349" total_bases="153260655" load_done="true"
76 | is_public="true" cluster_name="public"
77 | static_data_available="true"/><Run acc="SRR000057"
78 | total_spots="76744" total_bases="21203932" load_done="true"
79 | is_public="true" cluster_name="public"
80 | static_data_available="true"/><Run acc="SRR000058"
81 | total_spots="586643" total_bases="162991481" load_done="true"
82 | is_public="true" cluster_name="public"
83 | static_data_available="true"/>
84 |
85 |
86 | - 2008/04/04
87 | - 2015/04/09
88 |
89 |
90 |
91 | 13481786
92 | - <Summary><Title>12</Title><Platform
93 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics
94 | total_runs="1" total_spots="59130" total_bases="35596260"
95 | total_size="21079845" load_done="true" cluster_name="public"/></Summary><Submitter
96 | acc="SRA1206349" center_name="Jiangxi Agricultural University"
97 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory
98 | of Animal Nutritio"/><Experiment acc="SRX10339772" ver="1"
99 | status="public" name="12"/><Study acc="SRP310597" name="PRJNA
100 | Chuanzhong black lamb Raw sequence reads"/><Organism
101 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample
102 | acc="SRS8459130" name=""/><Instrument ILLUMINA="Illumina
103 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>12</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT>
104 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309306</Biosample>
105 |
106 | - <Run acc="SRR13961759"
107 | total_spots="59130" total_bases="35596260" load_done="true"
108 | is_public="true" cluster_name="public"
109 | static_data_available="true"/>
110 |
111 |
112 | - 2021/03/17
113 | - 2021/03/15
114 |
115 |
116 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/scraper_items_no_attach.json:
--------------------------------------------------------------------------------
1 | [{
2 | "key": "CP4ED2CY",
3 | "version": 1257,
4 | "library": {
5 | "type": "user",
6 | "id": 12345,
7 | "name": "username",
8 | "links": {
9 | "alternate": {
10 | "href": "https://www.zotero.org/username",
11 | "type": "text/html"
12 | }
13 | }
14 | },
15 | "links": {
16 | "self": {
17 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY",
18 | "type": "application/json"
19 | },
20 | "alternate": {
21 | "href": "https://www.zotero.org/username/items/CP4ED2CY",
22 | "type": "text/html"
23 | },
24 | "attachment": {
25 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48",
26 | "type": "application/json",
27 | "attachmentType": "application/pdf",
28 | "attachmentSize": 3648434
29 | }
30 | },
31 | "meta": {
32 | "creatorSummary": "Pruski et al.",
33 | "parsedDate": "2021-10-13",
34 | "numChildren": 2
35 | },
36 | "data": {
37 | "key": "CP4ED2CY",
38 | "version": 1257,
39 | "itemType": "journalArticle",
40 | "title": "Direct on-swab metabolic profiling of vaginal microbiome host interactions during pregnancy and preterm birth",
41 | "creators": [
42 | {
43 | "creatorType": "author",
44 | "firstName": "Pamela",
45 | "lastName": "Pruski"
46 | },
47 | {
48 | "creatorType": "author",
49 | "firstName": "Gonçalo D. S.",
50 | "lastName": "Correia"
51 | },
52 | {
53 | "creatorType": "author",
54 | "firstName": "Holly V.",
55 | "lastName": "Lewis"
56 | },
57 | {
58 | "creatorType": "author",
59 | "firstName": "Katia",
60 | "lastName": "Capuccini"
61 | },
62 | {
63 | "creatorType": "author",
64 | "firstName": "Paolo",
65 | "lastName": "Inglese"
66 | },
67 | {
68 | "creatorType": "author",
69 | "firstName": "Denise",
70 | "lastName": "Chan"
71 | },
72 | {
73 | "creatorType": "author",
74 | "firstName": "Richard G.",
75 | "lastName": "Brown"
76 | },
77 | {
78 | "creatorType": "author",
79 | "firstName": "Lindsay",
80 | "lastName": "Kindinger"
81 | },
82 | {
83 | "creatorType": "author",
84 | "firstName": "Yun S.",
85 | "lastName": "Lee"
86 | },
87 | {
88 | "creatorType": "author",
89 | "firstName": "Ann",
90 | "lastName": "Smith"
91 | },
92 | {
93 | "creatorType": "author",
94 | "firstName": "Julian",
95 | "lastName": "Marchesi"
96 | },
97 | {
98 | "creatorType": "author",
99 | "firstName": "Julie A. K.",
100 | "lastName": "McDonald"
101 | },
102 | {
103 | "creatorType": "author",
104 | "firstName": "Simon",
105 | "lastName": "Cameron"
106 | },
107 | {
108 | "creatorType": "author",
109 | "firstName": "Kate",
110 | "lastName": "Alexander-Hardiman"
111 | },
112 | {
113 | "creatorType": "author",
114 | "firstName": "Anna L.",
115 | "lastName": "David"
116 | },
117 | {
118 | "creatorType": "author",
119 | "firstName": "Sarah J.",
120 | "lastName": "Stock"
121 | },
122 | {
123 | "creatorType": "author",
124 | "firstName": "Jane E.",
125 | "lastName": "Norman"
126 | },
127 | {
128 | "creatorType": "author",
129 | "firstName": "Vasso",
130 | "lastName": "Terzidou"
131 | },
132 | {
133 | "creatorType": "author",
134 | "firstName": "T. G.",
135 | "lastName": "Teoh"
136 | },
137 | {
138 | "creatorType": "author",
139 | "firstName": "Lynne",
140 | "lastName": "Sykes"
141 | },
142 | {
143 | "creatorType": "author",
144 | "firstName": "Phillip R.",
145 | "lastName": "Bennett"
146 | },
147 | {
148 | "creatorType": "author",
149 | "firstName": "Zoltan",
150 | "lastName": "Takats"
151 | },
152 | {
153 | "creatorType": "author",
154 | "firstName": "David A.",
155 | "lastName": "MacIntyre"
156 | }
157 | ],
158 | "abstractNote": "The pregnancy vaginal microbiome contributes to risk of preterm birth, the primary cause of death in children under 5 years of age. Here we describe direct on-swab metabolic profiling by Desorption Electrospray Ionization Mass Spectrometry (DESI-MS) for sample preparation-free characterisation of the cervicovaginal metabolome in two independent pregnancy cohorts (VMET, n\u2009=\u2009160; 455 swabs; VMET II, n\u2009=\u2009205; 573 swabs). By integrating metataxonomics and immune profiling data from matched samples, we show that specific metabolome signatures can be used to robustly predict simultaneously both the composition of the vaginal microbiome and host inflammatory status. In these patients, vaginal microbiota instability and innate immune activation, as predicted using DESI-MS, associated with preterm birth, including in women receiving cervical cerclage for preterm birth prevention. These findings highlight direct on-swab metabolic profiling by DESI-MS as an innovative approach for preterm birth risk stratification through rapid assessment of vaginal microbiota-host dynamics.",
159 | "publicationTitle": "Nature Communications",
160 | "volume": "12",
161 | "issue": "1",
162 | "pages": "5967",
163 | "date": "2021-10-13",
164 | "series": "",
165 | "seriesTitle": "",
166 | "seriesText": "",
167 | "journalAbbreviation": "Nat Commun",
168 | "language": "en",
169 | "DOI": "10.1038/s41467-021-26215-w",
170 | "ISSN": "2041-1723",
171 | "shortTitle": "",
172 | "url": "https://www.nature.com/articles/s41467-021-26215-w",
173 | "accessDate": "2021-11-10T07:04:46Z",
174 | "archive": "",
175 | "archiveLocation": "",
176 | "libraryCatalog": "www.nature.com",
177 | "callNumber": "",
178 | "rights": "2021 The Author(s)",
179 | "extra": "Bandiera_abtest: a\nCc_license_type: cc_by\nCg_type: Nature Research Journals\nNumber: 1\nPrimary_atype: Research\nPublisher: Nature Publishing Group\nSubject_term: Infectious-disease diagnostics;Predictive markers;Risk factors;Translational research\nSubject_term_id: infectious-disease-diagnostics;predictive-markers;risk-factors;translational-research",
180 | "tags": [
181 | {
182 | "tag": "Infectious-disease diagnostics",
183 | "type": 1
184 | },
185 | {
186 | "tag": "Predictive markers",
187 | "type": 1
188 | },
189 | {
190 | "tag": "Risk factors",
191 | "type": 1
192 | },
193 | {
194 | "tag": "Translational research",
195 | "type": 1
196 | }
197 | ],
198 | "collections": [
199 | "DCHC4FUN"
200 | ],
201 | "relations": {},
202 | "dateAdded": "2021-11-10T07:04:46Z",
203 | "dateModified": "2021-11-10T07:04:46Z"
204 | }
205 | }]
--------------------------------------------------------------------------------
/q2_fondue/utils.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 | import gzip
9 | import os
10 | import shutil
11 | import signal
12 | import subprocess
13 | from typing import List
14 |
15 | from entrezpy.esearch import esearcher as es
16 | from q2_types.per_sample_sequences import CasavaOneEightSingleLanePerSampleDirFmt
17 | from qiime2 import Artifact
18 |
19 | from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer
20 | from q2_fondue.entrezpy_clients._utils import (
21 | PREFIX,
22 | InvalidIDs,
23 | set_up_logger,
24 | set_up_entrezpy_logging,
25 | )
26 |
27 | LOGGER = set_up_logger("INFO", logger_name=__name__)
28 |
29 |
class DownloadError(Exception):
    """Raised when fetching sequence data from SRA fails irrecoverably."""

    pass
32 |
33 |
34 | def _chunker(seq, size):
35 | # source: https://stackoverflow.com/a/434328/579416
36 | return (seq[pos : pos + size] for pos in range(0, len(seq), size))
37 |
38 |
def _validate_run_ids(
    email: str, n_jobs: int, run_ids: List[str], log_level: str
) -> dict:
    """Validates provided accession IDs using ESearch.

    Args:
        email (str): A valid e-mail address.
        n_jobs (int): Number of threads to be used in parallel.
        run_ids (List[str]): List of all the run IDs to be validated.
        log_level (str): Logging level.

    Returns:
        dict: Dictionary of invalid IDs (as keys) with a description.
    """
    invalid_ids = {}
    # esearch requests with more than 10'000 run IDs fail,
    # so the IDs are validated in batches of that size
    for id_batch in _chunker(run_ids, 10000):
        searcher = es.Esearcher(
            "esearcher",
            email,
            apikey=None,
            apikey_var=None,
            threads=0,
            qid=None,
        )
        set_up_entrezpy_logging(searcher, log_level)

        response = searcher.inquire(
            {
                "db": "sra",
                "term": " OR ".join(id_batch),
                "usehistory": False,
            },
            analyzer=ESearchAnalyzer(id_batch),
        )
        invalid_ids.update(response.result.validate_result())

    return invalid_ids
69 |
70 |
def _determine_id_type(ids: list):
    """Determines the accession ID type from the IDs' 3-letter prefixes.

    All provided IDs must share prefixes belonging to a single type
    declared in PREFIX; the first matching type wins.

    Args:
        ids (list): List of accession IDs.

    Returns:
        The matching ID type key from PREFIX.

    Raises:
        InvalidIDs: If the IDs are of mixed types or not supported.
    """
    observed_prefixes = {accession[:3] for accession in ids}
    for kind, allowed in PREFIX.items():
        if all(prefix in allowed for prefix in observed_prefixes):
            return kind
    raise InvalidIDs(
        "The type of provided IDs is either not supported or "
        "IDs of mixed types were provided. Please provide IDs "
        "corresponding to either SRA run (#S|E|DRR), study "
        "(#S|E|DRP) or NCBI BioProject IDs (#PRJ)."
    )
82 |
83 |
def handle_threaded_exception(args):
    """Logs exceptions raised in worker threads and aborts the action.

    Installed as ``threading.excepthook``. Clean thread exits
    (SystemExit with code 0) are silently ignored; anything else is
    logged and the whole process is interrupted.

    Args:
        args: The ``threading.excepthook`` argument object with
            ``exc_type``, ``exc_value`` and ``thread`` attributes.
    """
    logger = set_up_logger("DEBUG", logger_name="ThreadedErrorsManager")
    exc_text = str(args.exc_value)

    # threads exiting correctly raise SystemExit("0") - nothing to report
    # (this case can never satisfy the gaierror substring check below)
    if issubclass(args.exc_type, SystemExit) and exc_text == "0":
        return

    details = (
        "EntrezPy failed to connect to NCBI. Please check your "
        "internet connection and try again. It may help to wait "
        "a few minutes before retrying."
        if "gaierror is not JSON serializable" in exc_text
        else f'Caught {args.exc_type} with value "{args.exc_value}" '
        f"in thread {args.thread}"
    )
    logger.exception(
        "Data fetching was interrupted by the following error: \n" + details
    )

    # This will send a SIGINT to the main thread, which will gracefully
    # kill the running Q2 action. No artifacts will be saved.
    os.kill(os.getpid(), signal.SIGINT)
108 |
109 |
def _has_enough_space(acc_id: str, output_dir: str) -> bool:
    """Checks whether there is enough storage available for fasterq-dump
    to process sequences for a given ID.

    fasterq-dump's ``--size-check only`` mode is used to estimate the
    space required for the final data (NCBI estimates ~10x the final
    data size). A ``None`` ID is trivially accepted.

    Args:
        acc_id (str): The accession ID to be processed.
        output_dir (str): Location where the output would be saved.

    Returns:
        bool: Whether there is enough space available for fasterq-dump.
    """
    if acc_id is None:
        return True

    result = subprocess.run(
        ["fasterq-dump", "--size-check", "only", "-x", acc_id],
        text=True,
        capture_output=True,
        cwd=output_dir,
    )

    if result.returncode == 0:
        return True

    # NOTE: the misspelling below matches fasterq-dump's actual output
    if result.returncode == 3 and "disk-limit exeeded" in result.stderr:
        LOGGER.warning("Not enough space to fetch run %s.", acc_id)
        return False

    # any other failure: log it and optimistically let the caller proceed
    LOGGER.error(
        'fasterq-dump exited with a "%s" error code (the message '
        'was: "%s"). We will try to fetch the next accession ID.',
        result.returncode,
        result.stderr,
    )
    return True
144 |
145 |
146 | def _rewrite_fastq(file_in: str, file_out: str) -> None:
147 | """Rewrites a FASTQ file with gzip compression.
148 |
149 | Takes an uncompressed FASTQ file and writes it to a new location with
150 | gzip compression.
151 |
152 | Args:
153 | file_in (str): Path to input uncompressed FASTQ file
154 | file_out (str): Path where compressed FASTQ file should be written
155 | """
156 | with open(file_in, "rb") as f_in, gzip.open(file_out, "wb") as f_out:
157 | shutil.copyfileobj(f_in, f_out)
158 |
159 |
def _is_empty(artifact: Artifact) -> bool:
    """Checks if a sequence artifact is empty.

    An artifact counts as empty when every sample ID in its manifest is
    the placeholder "xxx" (the marker used for empty placeholder
    artifacts).

    Args:
        artifact: A QIIME 2 sequence artifact.

    Returns:
        bool: True if the artifact is empty, False otherwise.
    """
    manifest = artifact.view(CasavaOneEightSingleLanePerSampleDirFmt).manifest
    return not any(sample_id != "xxx" for sample_id in manifest.index)
174 |
175 |
def _remove_empty(*artifact_lists) -> tuple:
    """Removes empty artifacts from lists of sequence artifacts.

    Each provided list is filtered to drop artifacts that contain only
    the placeholder 'xxx' sample (see ``_is_empty``); the relative order
    of the remaining artifacts and of the lists themselves is preserved.

    Args:
        *artifact_lists: Variable number of lists containing sequence
            artifacts to filter.

    Returns:
        tuple: Tuple of filtered lists with empty artifacts removed, in
            same order as input lists.
    """
    return tuple(
        [candidate for candidate in artifacts if not _is_empty(candidate)]
        for artifacts in artifact_lists
    )
197 |
198 |
def _make_empty_artifact(ctx, paired: bool) -> Artifact:
    """Creates an empty sequence artifact.

    Builds a Casava directory containing empty gzipped placeholder
    FASTQ files (R1 and R2 for paired-end, R1 only for single-end) and
    imports it as an artifact of the matching semantic type.

    Args:
        ctx: QIIME 2 plugin context.
        paired (bool): Whether to create paired-end (True) or
            single-end (False) artifact.

    Returns:
        QIIME 2 artifact: Empty sequence artifact of appropriate type
            (paired or single-end).
    """
    if paired:
        _type = "SampleData[PairedEndSequencesWithQuality]"
        filenames = ["xxx_00_L001_R1_001.fastq.gz", "xxx_00_L001_R2_001.fastq.gz"]
    else:
        _type = "SampleData[SequencesWithQuality]"
        filenames = ["xxx_01_L001_R1_001.fastq.gz"]

    casava_out = CasavaOneEightSingleLanePerSampleDirFmt()
    for name in filenames:
        # touch an empty gzip file so the manifest sees the sample "xxx"
        with gzip.open(str(casava_out.path / name), mode="w"):
            pass

    return ctx.make_artifact(_type, casava_out)
228 |
--------------------------------------------------------------------------------
/q2_fondue/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 | import gzip
9 | import os
10 | import signal
11 | import tempfile
12 | import threading
13 | import unittest
14 | from threading import Thread
15 | from unittest.mock import patch, MagicMock
16 |
17 | from q2_types.per_sample_sequences import CasavaOneEightSingleLanePerSampleDirFmt
18 | from qiime2 import Artifact
19 | from qiime2.plugin.testing import TestPluginBase
20 |
21 | from q2_fondue.utils import (
22 | handle_threaded_exception,
23 | _has_enough_space,
24 | _chunker,
25 | _rewrite_fastq,
26 | _is_empty,
27 | _remove_empty,
28 | _make_empty_artifact,
29 | )
30 |
31 |
class TestExceptHooks(unittest.TestCase):
    """Tests for the threaded-exception handler installed by q2-fondue."""

    package = "q2_fondue.tests"

    def do_something_with_error(self, msg):
        # helper: raise an arbitrary exception inside a worker thread
        raise Exception(msg)

    @patch("os.kill")
    def test_handle_threaded_exception_gaierror(self, patch_kill):
        with self.assertLogs(level="DEBUG", logger="ThreadedErrorsManager") as cm:
            threading.excepthook = handle_threaded_exception
            error_msg = "Something went wrong: gaierror is not JSON serializable."
            t = Thread(target=self.do_something_with_error, args=(error_msg,))
            t.start()
            t.join()

        self.assertIn("EntrezPy failed to connect to NCBI", cm.output[0])

        # the handler aborts the action by signalling the main thread
        pid = os.getpid()
        patch_kill.assert_called_once_with(pid, signal.SIGINT)

    @patch("os.kill")
    def test_handle_threaded_exception_other_errors(self, patch_kill):
        with self.assertLogs(level="DEBUG", logger="ThreadedErrorsManager") as cm:
            threading.excepthook = handle_threaded_exception
            error_msg = "Some unknown exception."
            t = Thread(target=self.do_something_with_error, args=(error_msg,))
            t.start()
            t.join()

        # the handler logs 'Caught {exc_type} with value "{exc_value}" ...';
        # the expected substring must include the exception type, otherwise
        # this assertion can never match the actual log output
        self.assertIn(
            "Caught <class 'Exception'> with value "
            '"Some unknown exception."',
            cm.output[0],
        )

        pid = os.getpid()
        patch_kill.assert_called_once_with(pid, signal.SIGINT)
68 |
69 |
class TestSRAUtils(TestPluginBase):
    """Tests for the fasterq-dump space check and small FASTQ helpers."""

    package = "q2_fondue.tests"

    @staticmethod
    def _size_check_cmd(accession):
        # the exact command _has_enough_space is expected to invoke
        return ["fasterq-dump", "--size-check", "only", "-x", accession]

    @patch("subprocess.run")
    def test_has_enough_space(self, patched_run):
        patched_run.return_value = MagicMock(returncode=0)

        accession, out_dir = "ABC123", "some/where"
        self.assertTrue(_has_enough_space(accession, out_dir))
        patched_run.assert_called_once_with(
            self._size_check_cmd(accession),
            text=True,
            capture_output=True,
            cwd=out_dir,
        )

    @patch("subprocess.run")
    def test_has_enough_space_not(self, patched_run):
        # replay a real fasterq-dump "disk limit" stderr response
        with open(self.get_data_path("fasterq-dump-response.txt")) as f:
            response = f.read()
        patched_run.return_value = MagicMock(stderr=response, returncode=3)

        accession, out_dir = "ABC123", "some/where"
        self.assertFalse(_has_enough_space(accession, out_dir))
        patched_run.assert_called_once_with(
            self._size_check_cmd(accession),
            text=True,
            capture_output=True,
            cwd=out_dir,
        )

    @patch("subprocess.run")
    def test_has_enough_space_error(self, patched_run):
        # unknown error codes are logged but treated as "enough space"
        patched_run.return_value = MagicMock(stderr="errorX", returncode=8)

        accession, out_dir = "ABC123", "some/where"
        with self.assertLogs("q2_fondue.utils", level="ERROR") as cm:
            obs = _has_enough_space(accession, out_dir)
        self.assertEqual(
            cm.output,
            [
                'ERROR:q2_fondue.utils:fasterq-dump exited with a "8" error code '
                '(the message was: "errorX"). We will try to fetch the next '
                "accession ID."
            ],
        )
        self.assertTrue(obs)
        patched_run.assert_called_once_with(
            self._size_check_cmd(accession),
            text=True,
            capture_output=True,
            cwd=out_dir,
        )

    def test_chunker(self):
        chunks = _chunker(["A", "B", "C"], 2)
        self.assertEqual(next(chunks), ["A", "B"])
        self.assertEqual(next(chunks), ["C"])

    def test_chunker_no_chunks(self):
        chunks = _chunker(["A", "B", "C"], 4)
        self.assertEqual(next(chunks), ["A", "B", "C"])

    def test_rewrite_fastq(self):
        file_in = self.get_data_path("SRR123456.fastq")
        file_out = tempfile.NamedTemporaryFile()

        _rewrite_fastq(file_in, file_out.name)

        # the gzipped copy must decompress to the original, line by line
        with open(file_in, "rb") as fin:
            with gzip.open(file_out.name, "r") as fout:
                for line_in, line_out in zip(fin.readlines(), fout.readlines()):
                    self.assertEqual(line_in, line_out)

        # clean up
        file_out.close()
151 |
152 |
class TestSequenceUtils(TestPluginBase):
    """Tests for empty-artifact detection, filtering and creation."""

    package = "q2_fondue.tests"

    def test_is_empty_with_empty_artifact(self):
        # an artifact whose only sample ID is the placeholder "xxx"
        # must be recognized as empty
        casava_out = CasavaOneEightSingleLanePerSampleDirFmt()
        filenames = ["xxx_01_L001_R1_001.fastq.gz"]
        for filename in filenames:
            with gzip.open(str(casava_out.path / filename), mode="w"):
                pass

        artifact = Artifact.import_data("SampleData[SequencesWithQuality]", casava_out)

        self.assertTrue(_is_empty(artifact))

    def test_is_empty_with_nonempty_artifact(self):
        # an artifact with real sample IDs must not be considered empty
        artifact = Artifact.import_data(
            "SampleData[SequencesWithQuality]",
            self.get_data_path("single1"),
            CasavaOneEightSingleLanePerSampleDirFmt,
        )

        self.assertFalse(_is_empty(artifact))

    def test_remove_empty(self):
        # NOTE: the same Casava directory is deliberately reused for both
        # empty artifacts - the single-end one is imported first, then the
        # R2 placeholder is added before importing the paired-end one;
        # this ordering matters
        empty_casava = CasavaOneEightSingleLanePerSampleDirFmt()
        with gzip.open(
            str(empty_casava.path / "xxx_01_L001_R1_001.fastq.gz"), mode="w"
        ):
            pass
        empty_artifact_single = Artifact.import_data(
            "SampleData[SequencesWithQuality]", empty_casava
        )
        with gzip.open(
            str(empty_casava.path / "xxx_01_L001_R2_001.fastq.gz"), mode="w"
        ):
            pass
        empty_artifact_paired = Artifact.import_data(
            "SampleData[PairedEndSequencesWithQuality]", empty_casava
        )

        non_empty_artifact_single = Artifact.import_data(
            "SampleData[SequencesWithQuality]",
            self.get_data_path("single1"),
            CasavaOneEightSingleLanePerSampleDirFmt,
        )
        non_empty_artifact_paired = Artifact.import_data(
            "SampleData[PairedEndSequencesWithQuality]",
            self.get_data_path("paired1"),
            CasavaOneEightSingleLanePerSampleDirFmt,
        )

        singles = [empty_artifact_single, non_empty_artifact_single]
        paired = [empty_artifact_paired, non_empty_artifact_paired]

        filtered_singles, filtered_paired = _remove_empty(singles, paired)

        # only the non-empty artifacts should survive, identity preserved
        self.assertEqual(len(filtered_singles), 1)
        self.assertEqual(len(filtered_paired), 1)
        self.assertIs(filtered_singles[0], non_empty_artifact_single)
        self.assertIs(filtered_paired[0], non_empty_artifact_paired)

    def test_make_empty_artifact_single(self):
        # ctx is mocked, so only the type and the placeholder file
        # layout passed to make_artifact are verified
        ctx = MagicMock()
        ctx.make_artifact.return_value = "single_artifact"

        result = _make_empty_artifact(ctx, False)

        self.assertEqual(result, "single_artifact")
        ctx.make_artifact.assert_called_once()

        args, kwargs = ctx.make_artifact.call_args

        self.assertEqual(args[0], "SampleData[SequencesWithQuality]")

        # single-end placeholder uses the _01_ barcode id
        casava_output = args[1]
        self.assertTrue(
            os.path.exists(casava_output.path / "xxx_01_L001_R1_001.fastq.gz")
        )

    def test_make_empty_artifact_paired(self):
        ctx = MagicMock()
        ctx.make_artifact.return_value = "paired_artifact"

        result = _make_empty_artifact(ctx, True)

        self.assertEqual(result, "paired_artifact")
        ctx.make_artifact.assert_called_once()

        args, kwargs = ctx.make_artifact.call_args

        self.assertEqual(args[0], "SampleData[PairedEndSequencesWithQuality]")

        # paired-end placeholders use the _00_ barcode id, R1 and R2
        casava_output = args[1]
        self.assertTrue(
            os.path.exists(casava_output.path / "xxx_00_L001_R1_001.fastq.gz")
        )
        self.assertTrue(
            os.path.exists(casava_output.path / "xxx_00_L001_R2_001.fastq.gz")
        )
252 |
253 |
# allow executing this test module directly
if __name__ == "__main__":
    unittest.main()
256 |
--------------------------------------------------------------------------------
/q2_fondue/metadata.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | import threading
10 | from typing import List, Tuple
11 |
12 | import entrezpy.efetch.efetcher as ef
13 | import pandas as pd
14 | from qiime2 import Metadata
15 |
16 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer
17 | from q2_fondue.utils import (
18 | _validate_run_ids,
19 | _determine_id_type,
20 | handle_threaded_exception,
21 | )
22 | from q2_fondue.entrezpy_clients._utils import (
23 | set_up_entrezpy_logging,
24 | set_up_logger,
25 | InvalidIDs,
26 | )
27 | from q2_fondue.entrezpy_clients._pipelines import _get_run_ids
28 |
29 |
# route exceptions raised in worker threads through our custom handler
threading.excepthook = handle_threaded_exception
# number of records requested per single EFetch request (see 'reqsize')
BATCH_SIZE = 150
32 |
33 |
def _efetcher_inquire(
    efetcher: ef.Efetcher, run_ids: List[str], log_level: str
) -> Tuple[pd.DataFrame, dict]:
    """Makes an EFetch request using the provided IDs.

    Args:
        efetcher (ef.Efetcher): A valid instance of an Entrezpy Efetcher.
        run_ids (List[str]): List of all the run IDs to be fetched.
        log_level (str): Logging level.

    Returns:
        pd.DataFrame: DataFrame with metadata obtained for the provided IDs.
        dict: Dictionary of the run IDs that were not found with
            respective error messages.
    """
    request_params = {
        "db": "sra",
        "id": run_ids,
        "rettype": "xml",
        "retmode": "xml",
        "retmax": len(run_ids),
        "reqsize": BATCH_SIZE,
    }
    response = efetcher.inquire(
        request_params, analyzer=EFetchAnalyzer(log_level)
    )

    # a missing result means the whole request failed: report every ID
    # with the response's error message
    if response.result is None:
        errors = {run_id: response.error_msg for run_id in run_ids}
        return pd.DataFrame(), errors

    return response.result.metadata_to_df(), {}
65 |
66 |
def _execute_efetcher(email, n_jobs, run_ids, log_level):
    """Creates a logging-enabled Efetcher and fetches run metadata.

    Args:
        email: A valid e-mail address (required by NCBI).
        n_jobs: Number of threads to be used in parallel.
        run_ids: List of run IDs to fetch metadata for.
        log_level: Logging level.

    Returns:
        The (DataFrame, missing-IDs dict) pair from ``_efetcher_inquire``.
    """
    fetcher = ef.Efetcher(
        "efetcher",
        email,
        apikey=None,
        apikey_var=None,
        threads=n_jobs,
        qid=None,
    )
    set_up_entrezpy_logging(fetcher, log_level)
    return _efetcher_inquire(fetcher, run_ids, log_level)
74 |
75 |
def _get_run_meta(
    email, n_jobs, run_ids, validated, log_level, logger
) -> Tuple[pd.DataFrame, dict]:
    """Fetches metadata for the provided run IDs.

    Unless the IDs were already validated (e.g. obtained by linking from
    aggregate IDs), they are first checked with an ESearch query and the
    metadata is fetched only for the valid ones.

    Args:
        email (str): A valid e-mail address.
        n_jobs (int): Number of threads to be used in parallel.
        run_ids (List[str]): Run IDs to fetch metadata for.
        validated (bool): Whether the IDs were already validated.
        log_level (str): Logging level.
        logger (logging.Logger): Logger to report progress and problems to.

    Returns:
        Tuple[pd.DataFrame, dict]: DataFrame with the fetched metadata and
            a dictionary of run IDs (with error messages) for which the
            metadata could not be fetched.

    Raises:
        InvalidIDs: If all provided IDs were invalid.
    """
    if not validated:
        invalid_ids = _validate_run_ids(email, n_jobs, run_ids, log_level)
        valid_ids = sorted(set(run_ids) - set(invalid_ids.keys()))

        if not valid_ids:
            raise InvalidIDs("All provided IDs were invalid. Please check your input.")
        if invalid_ids:
            logger.warning(
                f"The following provided IDs are invalid: "
                f'{",".join(invalid_ids.keys())}. Please correct them and '
                f"try fetching those independently."
            )
    else:
        # we assume that IDs retrieved by linking from aggregate IDs
        # (e.g., BioProject or study) should only return valid IDs,
        # since we asked NCBI to get those for us
        valid_ids = run_ids

    # fetch metadata
    logger.info("Fetching metadata for %i run IDs.", len(valid_ids))
    meta_df, missing_ids = _execute_efetcher(email, n_jobs, valid_ids, log_level)

    if missing_ids:
        logger.warning(
            "Metadata for the following run IDs could not be fetched: "
            f'{",".join(missing_ids.keys())}. '
            f"Please try fetching those independently."
        )

    return meta_df, missing_ids
109 |
110 |
def _get_other_meta(
    email, n_jobs, project_ids, id_type, log_level, logger
) -> Tuple[pd.DataFrame, dict]:
    """Fetches run metadata for aggregate (non-run) accession IDs.

    Resolves the aggregate IDs (e.g. BioProject, study, sample or
    experiment) to run IDs first and then fetches the metadata for those.

    Args:
        email (str): A valid e-mail address.
        n_jobs (int): Number of threads to be used in parallel.
        project_ids (List[str]): Aggregate accession IDs.
        id_type (str): Type of the provided accession IDs.
        log_level (str): Logging level.
        logger (logging.Logger): Logger to report progress and problems to.

    Returns:
        Tuple[pd.DataFrame, dict]: DataFrame with the fetched metadata and
            a dictionary of run IDs for which fetching failed.
    """
    run_ids = _get_run_ids(email, n_jobs, project_ids, None, id_type, log_level)

    # run IDs linked by NCBI are considered pre-validated
    return _get_run_meta(email, n_jobs, run_ids, True, log_level, logger)
117 |
118 |
def _find_doi_mapping_and_type(
    mapping_doi_ids: Metadata,
) -> Tuple[pd.Series, str]:
    """If present, save DOI name to ID mappings together with type
    of IDs the DOI names are matching to.

    Args:
        mapping_doi_ids (Metadata): Table of accession IDs with
            associated DOI names.

    Returns:
        Tuple[pd.Series, str]: Series of DOI names indexed by accession
            ID, and the type of those accession IDs.
    """
    # the first metadata column holds the DOI names
    id2doi = mapping_doi_ids.to_dataframe().iloc[:, 0]
    doi_ids = sorted(mapping_doi_ids.get_ids())
    id2doi_type = _determine_id_type(doi_ids)

    return (id2doi, id2doi_type)
135 |
136 |
def get_metadata(
    accession_ids: Metadata,
    email: str,
    threads: int = 1,
    log_level: str = "INFO",
    linked_doi: Metadata = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Fetches metadata using the provided run/bioproject/study/sample or
    experiment accession IDs.

    If aggregate IDs (such as bioproject, study, sample, experiment IDs) were
    provided, first run IDs will be fetched using a Conduit Pipeline.
    The run IDs will be validated using an ESearch query. The metadata will
    be fetched only for the valid run IDs. Invalid run IDs will be raised
    with a warning. Run IDs for which the metadata could not be fetched will
    be returned with the corresponding error message as missing_ids.

    Args:
        accession_ids (Metadata): Table of all the accession IDs
            to be fetched (either run, bioproject, study, sample or
            experiment IDs). If table does not contain DOI names, names
            from `linked_doi` will be matched.
        linked_doi (Metadata): Optional table of accession IDs with
            associated DOI names. Preferably used when refetching failed
            run IDs that can be matched after metadata was fetched
            successfully. Ignored if `accession_ids` already contains DOI
            names.
        email (str): A valid e-mail address (required by NCBI).
        threads (int, default=1): Number of threads to be used in parallel.
        log_level (str, default='INFO'): Logging level.

    Returns:
        pd.DataFrame: DataFrame with metadata obtained for the provided IDs.
        pd.DataFrame: DataFrame with runs IDs for which no metadata was
            fetched and the associated error messages.
    """
    logger = set_up_logger(log_level, logger_name=__name__)

    # extract DOI names to IDs mapping for later; a mapping provided
    # directly in accession_ids takes precedence over linked_doi
    if any(x in accession_ids.columns for x in ["doi", "DOI"]):
        id2doi, id2doi_type = _find_doi_mapping_and_type(accession_ids)
    elif linked_doi and any(x in linked_doi.columns for x in ["doi", "DOI"]):
        id2doi, id2doi_type = _find_doi_mapping_and_type(linked_doi)
    else:
        id2doi, id2doi_type = None, None

    # Retrieve input IDs
    accession_ids = sorted(accession_ids.get_ids())

    # figure out which id type we're dealing with
    id_type = _determine_id_type(accession_ids)

    # get actual metadata
    if id_type == "run":
        meta, missing_ids = _get_run_meta(
            email, threads, accession_ids, False, log_level, logger
        )
    else:
        meta, missing_ids = _get_other_meta(
            email, threads, accession_ids, id_type, log_level, logger
        )

    # match DOI names to metadata if present: run IDs form the metadata
    # index, other ID types live in the corresponding metadata column
    match_study_meta = {
        "bioproject": "Bioproject ID",
        "study": "Study ID",
        "experiment": "Experiment ID",
        "sample": "Sample Accession",
    }
    if id2doi is not None and id2doi_type == "run":
        meta = meta.join(id2doi, how="left")
    elif id2doi is not None and id2doi_type != "run":
        meta = meta.merge(
            id2doi, how="left", left_on=match_study_meta[id2doi_type], right_index=True
        )

    missing_ids = pd.DataFrame(
        data={"Error message": missing_ids.values()},
        index=pd.Index(missing_ids.keys(), name="ID"),
    )
    return meta, missing_ids
218 |
219 |
def merge_metadata(metadata: pd.DataFrame) -> pd.DataFrame:
    """Merges provided multiple metadata into a single metadata object.

    Fully identical records are dropped; records sharing an ID but
    differing in values are kept and only reported with a warning.

    Args:
        metadata (pd.DataFrame): List of metadata DataFrames to be merged.

    Returns:
        metadata_merged (pd.DataFrame): Final metadata DataFrame.
    """
    logger = set_up_logger("INFO", logger_name=__name__)
    logger.info("Merging %s metadata DataFrames.", len(metadata))

    merged = pd.concat(metadata, axis=0, join="outer")

    count_before = merged.shape[0]
    merged.drop_duplicates(inplace=True)
    dropped = count_before - merged.shape[0]
    if dropped:
        logger.info(
            "%s duplicate record(s) found in the metadata were dropped.",
            dropped,
        )

    # same-ID records with differing values survive drop_duplicates
    if merged.index.has_duplicates:
        logger.warning(
            "Records with same IDs but differing values were found in "
            "the metadata and will not be removed."
        )

    logger.info(
        "Merged metadata DataFrame has %s rows and %s columns.",
        merged.shape[0],
        merged.shape[1],
    )

    return merged
255 |
--------------------------------------------------------------------------------
/q2_fondue/tests/data/metadata_response_small.json:
--------------------------------------------------------------------------------
1 | {
2 | "EXPERIMENT": {
3 | "@accession": "ERX3980916",
4 | "@alias": "ena-EXPERIMENT-UNIVERSITY OF HOHENHEIM-06-03-2020-13:37:12:076-1",
5 | "@center_name": "UNIVERSITY OF HOHENHEIM",
6 | "IDENTIFIERS": {
7 | "PRIMARY_ID": "ERX3980916"
8 | },
9 | "TITLE": "Illumina MiSeq sequencing",
10 | "STUDY_REF": {
11 | "@accession": "ERP120343",
12 | "IDENTIFIERS": {
13 | "PRIMARY_ID": "ERP120343",
14 | "EXTERNAL_ID": {
15 | "@namespace": "BioProject",
16 | "#text": "PRJEB37054"
17 | }
18 | }
19 | },
20 | "DESIGN": {
21 | "DESIGN_DESCRIPTION": null,
22 | "SAMPLE_DESCRIPTOR": {
23 | "@accession": "ERS4372624",
24 | "IDENTIFIERS": {
25 | "PRIMARY_ID": "ERS4372624",
26 | "EXTERNAL_ID": {
27 | "@namespace": "BioSample",
28 | "#text": "SAMEA6608408"
29 | }
30 | }
31 | },
32 | "LIBRARY_DESCRIPTOR": {
33 | "LIBRARY_NAME": "unspecified",
34 | "LIBRARY_STRATEGY": "AMPLICON",
35 | "LIBRARY_SOURCE": "METAGENOMIC",
36 | "LIBRARY_SELECTION": "PCR",
37 | "LIBRARY_LAYOUT": {
38 | "SINGLE": null
39 | }
40 | }
41 | },
42 | "PLATFORM": {
43 | "ILLUMINA": {
44 | "INSTRUMENT_MODEL": "Illumina MiSeq"
45 | }
46 | },
47 | "EXPERIMENT_ATTRIBUTES": {
48 | "EXPERIMENT_ATTRIBUTE": [
49 | {
50 | "TAG": "Temperature",
51 | "VALUE": "12"
52 | },
53 | {
54 | "TAG": "Depth",
55 | "VALUE": "500"
56 | }
57 | ]
58 | }
59 | },
60 | "SUBMISSION": {
61 | "@accession": "ERA2402167",
62 | "@alias": "ena-SUBMISSION-UNIVERSITY OF HOHENHEIM-06-03-2020-13:27:09:756-1",
63 | "@center_name": "UNIVERSITY OF HOHENHEIM",
64 | "@lab_name": "European Nucleotide Archive",
65 | "IDENTIFIERS": {
66 | "PRIMARY_ID": "ERA2402167"
67 | },
68 | "TITLE": "Submitted by UNIVERSITY OF HOHENHEIM on 06-MAR-2020"
69 | },
70 | "Organization": {
71 | "@type": "center",
72 | "Name": {
73 | "@abbr": "University of Hohenheim",
74 | "#text": "University of Hohenheim"
75 | }
76 | },
77 | "STUDY": {
78 | "@accession": "ERP120343",
79 | "@alias": "ena-STUDY-UNIVERSITY OF HOHENHEIM-04-03-2020-12:54:47:240-944",
80 | "@center_name": "UNIVERSITY OF HOHENHEIM",
81 | "IDENTIFIERS": {
82 | "PRIMARY_ID": "ERP120343",
83 | "EXTERNAL_ID": {
84 | "@namespace": "BioProject",
85 | "#text": "PRJEB37054"
86 | }
87 | },
88 | "DESCRIPTOR": {
89 | "STUDY_TITLE": "The microbial load, diversity and composition of\n the wine microbiota is affected by wine type and\n environmental-stress factors",
90 | "STUDY_TYPE": {
91 | "@existing_study_type": "Other"
92 | },
93 | "STUDY_ABSTRACT": "In order to improve the understanding of the\n composition, organization and temporal dynamics of the wine\n microbiota, the relative and absolute bacterial wine\n microbiota composition during the first week of\n fermentation was determined, including distinct red and\n white wine cultivars, by 16S rRNA gene amplicon sequencing.",
94 | "CENTER_PROJECT_NAME": "Wine microbiota analysis during\n fermentation",
95 | "STUDY_DESCRIPTION": "In order to improve the understanding of the\n composition, organization and temporal dynamics of the wine\n microbiota, the relative and absolute bacterial wine\n microbiota composition during the first week of\n fermentation was determined, including distinct red and\n white wine cultivars, by 16S rRNA gene amplicon sequencing."
96 | },
97 | "STUDY_ATTRIBUTES": {
98 | "STUDY_ATTRIBUTE": [
99 | {
100 | "TAG": "ENA-FIRST-PUBLIC",
101 | "VALUE": "2020-05-31"
102 | },
103 | {
104 | "TAG": "ENA-LAST-UPDATE",
105 | "VALUE": "2020-03-04"
106 | }
107 | ]
108 | }
109 | },
110 | "SAMPLE": {
111 | "@accession": "ERS4372624",
112 | "@alias": "BAC1.D1.0.32A",
113 | "@center_name": "UNIVERSITY OF HOHENHEIM",
114 | "IDENTIFIERS": {
115 | "PRIMARY_ID": "ERS4372624",
116 | "EXTERNAL_ID": {
117 | "@namespace": "BioSample",
118 | "#text": "SAMEA6608408"
119 | }
120 | },
121 | "TITLE": "Vitis vinifera",
122 | "SAMPLE_NAME": {
123 | "TAXON_ID": "29760",
124 | "SCIENTIFIC_NAME": "Vitis vinifera",
125 | "COMMON_NAME": "wine grape"
126 | },
127 | "SAMPLE_ATTRIBUTES": {
128 | "SAMPLE_ATTRIBUTE": [
129 | {
130 | "TAG": "environment (biome)",
131 | "VALUE": "berry plant"
132 | },
133 | {
134 | "TAG": "geographic location (country and/or sea)",
135 | "VALUE": "Germany"
136 | },
137 | {
138 | "TAG": "sample storage temperature",
139 | "VALUE": "-80",
140 | "UNITS": "°C"
141 | }
142 | ]
143 | }
144 | },
145 | "Pool": {
146 | "Member": {
147 | "@member_name": "",
148 | "@accession": "ERS4372624",
149 | "@sample_name": "BAC1.D1.0.32A",
150 | "@sample_title": "Vitis vinifera",
151 | "@spots": "39323",
152 | "@bases": "11552099",
153 | "@tax_id": "29760",
154 | "@organism": "Vitis vinifera",
155 | "IDENTIFIERS": {
156 | "PRIMARY_ID": "ERS4372624",
157 | "EXTERNAL_ID": {
158 | "@namespace": "BioSample",
159 | "#text": "SAMEA6608408"
160 | }
161 | }
162 | }
163 | },
164 | "RUN_SET": {
165 | "RUN": {
166 | "@accession": "FAKEID1",
167 | "@alias": "ena-RUN-UNIVERSITY OF HOHENHEIM-06-03-2020-13:37:12:076-1",
168 | "@center_name": "UNIVERSITY OF HOHENHEIM",
169 | "@total_spots": "39323",
170 | "@total_bases": "11552099",
171 | "@size": "3914295",
172 | "@load_done": "true",
173 | "@published": "2020-06-01 17:54:43",
174 | "@is_public": "true",
175 | "@cluster_name": "public",
176 | "@static_data_available": "1",
177 | "IDENTIFIERS": {
178 | "PRIMARY_ID": "FAKEID1"
179 | },
180 | "TITLE": "Illumina MiSeq sequencing",
181 | "EXPERIMENT_REF": {
182 | "@accession": "ERX3980916",
183 | "IDENTIFIERS": {
184 | "PRIMARY_ID": "ERX3980916"
185 | }
186 | },
187 | "RUN_ATTRIBUTES": {
188 | "RUN_ATTRIBUTE": [
189 | {
190 | "TAG": "ENA-FIRST-PUBLIC",
191 | "VALUE": "2020-05-31"
192 | },
193 | {
194 | "TAG": "ENA-LAST-UPDATE",
195 | "VALUE": "2020-03-06"
196 | }
197 | ]
198 | },
199 | "Pool": {
200 | "Member": {
201 | "@member_name": "",
202 | "@accession": "ERS4372624",
203 | "@sample_name": "BAC1.D1.0.32A",
204 | "@sample_title": "Vitis vinifera",
205 | "@spots": "39323",
206 | "@bases": "11552099",
207 | "@tax_id": "29760",
208 | "@organism": "Vitis vinifera",
209 | "IDENTIFIERS": {
210 | "PRIMARY_ID": "ERS4372624",
211 | "EXTERNAL_ID": {
212 | "@namespace": "BioSample",
213 | "#text": "SAMEA6608408"
214 | }
215 | }
216 | }
217 | },
218 | "SRAFiles": {
219 | "SRAFile": {
220 | "@cluster": "public",
221 | "@filename": "FAKEID1",
222 | "@url": "https://sra-download.ncbi.nlm.nih.gov/traces/era16/ERR/ERR3978/FAKEID1",
223 | "@size": "3915680",
224 | "@date": "2020-06-01 19:51:45",
225 | "@md5": "d92e4c21e26e5f2bd2cdaf56cfcfeaa0",
226 | "@semantic_name": "run",
227 | "@supertype": "Primary ETL",
228 | "@sratoolkit": "1",
229 | "Alternatives": [
230 | {
231 | "@url": "https://sra-download.ncbi.nlm.nih.gov/traces/era16/ERR/ERR3978/FAKEID1",
232 | "@free_egress": "worldwide",
233 | "@access_type": "anonymous",
234 | "@org": "NCBI"
235 | },
236 | {
237 | "@url": "https://sra-pub-run-odp.s3.amazonaws.com/sra/FAKEID1/FAKEID1",
238 | "@free_egress": "worldwide",
239 | "@access_type": "anonymous",
240 | "@org": "AWS"
241 | },
242 | {
243 | "@url": "gs://sra-pub-run-8/FAKEID1/FAKEID1.1",
244 | "@free_egress": "gs.US",
245 | "@access_type": "gcp identity",
246 | "@org": "GCP"
247 | }
248 | ]
249 | }
250 | },
251 | "CloudFiles": {
252 | "CloudFile": [
253 | {
254 | "@filetype": "run",
255 | "@provider": "gs",
256 | "@location": "gs.US"
257 | },
258 | {
259 | "@filetype": "run",
260 | "@provider": "s3",
261 | "@location": "s3.us-east-1"
262 | }
263 | ]
264 | },
265 | "Statistics": {
266 | "@nreads": "1",
267 | "@nspots": "39323",
268 | "Read": {
269 | "@index": "0",
270 | "@count": "39323",
271 | "@average": "293.77",
272 | "@stdev": "20.23"
273 | }
274 | },
275 | "Bases": {
276 | "@cs_native": "false",
277 | "@count": "11552099",
278 | "Base": [
279 | {
280 | "@value": "A",
281 | "@count": "3143257"
282 | },
283 | {
284 | "@value": "C",
285 | "@count": "2405184"
286 | },
287 | {
288 | "@value": "G",
289 | "@count": "3867631"
290 | },
291 | {
292 | "@value": "T",
293 | "@count": "2136027"
294 | },
295 | {
296 | "@value": "N",
297 | "@count": "0"
298 | }
299 | ]
300 | }
301 | }
302 | }
303 | }
--------------------------------------------------------------------------------
/q2_fondue/entrezpy_clients/_sra_meta.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------------------------------------
2 | # Copyright (c) 2025, Bokulich Laboratories.
3 | #
4 | # Distributed under the terms of the Modified BSD License.
5 | #
6 | # The full license is in the file LICENSE, distributed with this software.
7 | # ----------------------------------------------------------------------------
8 |
9 | from abc import abstractmethod, ABCMeta
10 | from dataclasses import dataclass, field
11 | from typing import Union, List
12 |
13 | import pandas as pd
14 |
15 | from q2_fondue.entrezpy_clients._utils import get_attrs
16 |
17 |
# Column names an SRA metadata table is expected to contain.
# NOTE(review): presumably enforced/consumed by the metadata-processing code
# elsewhere in the package — confirm against callers before changing.
META_REQUIRED_COLUMNS = [
    "Experiment ID",
    "Biosample ID",
    "Bioproject ID",
    "Study ID",
    "Sample Accession",
    "Organism",
    "Library Source",
    "Library Layout",
    "Library Selection",
    "Instrument",
    "Platform",
    "Bases",
    "Spots",
    "Avg Spot Len",
    "Bytes",
    "Public",
]
36 |
37 |
@dataclass
class LibraryMetadata:
    """Container for the metadata of a single sequencing library.

    Attributes:
        name (str): Library name.
        layout (str): Library layout.
        selection (str): Library selection method.
        source (str): Library source.
    """

    name: str
    layout: str
    selection: str
    source: str

    def generate_meta(self):
        """Generates a one-row DataFrame of the library metadata.

        Returns:
            pd.DataFrame: Single-row frame whose columns are the library
                attributes prefixed with 'library_'.
        """
        attrs = get_attrs(self)
        values = [getattr(self, attr) for attr in attrs]
        labels = ["library_" + attr for attr in attrs]
        return pd.DataFrame(data=values, index=labels).T
53 |
54 |
@dataclass
class SRABaseMeta(metaclass=ABCMeta):
    """An abstract base class for SRA metadata objects.

    Attributes:
        id (str): Unique ID of the metadata object.
        custom_meta (Union[dict, None]): Custom metadata belonging
            to the object, if any.
        child (str): a one-word description of the child type for
            the given object (e.g., a 'sample' is a child of a 'study').
    """

    id: str
    custom_meta: Union[dict, None]
    child: str = None

    def __post_init__(self):
        """Builds the custom metadata DataFrame (None when no custom
        metadata was provided)."""
        self.custom_meta_df = (
            pd.DataFrame(self.custom_meta, index=[self.id])
            if self.custom_meta
            else None
        )

    def __eq__(self, other):
        """Compares all instance attributes.

        DataFrame-valued attributes are compared with DataFrame.equals,
        since a plain `==` on frames does not yield a single boolean.
        Intended for subclasses that carry DataFrames as attributes.
        """
        checks = [
            getattr(self, name).equals(getattr(other, name))
            if isinstance(value, pd.DataFrame)
            else getattr(self, name) == getattr(other, name)
            for name, value in vars(self).items()
        ]
        return all(checks)

    def get_base_metadata(self, excluded: tuple) -> pd.DataFrame:
        """Generates a DataFrame containing basic metadata of the SRA object.

        The resulting frame covers only this object's own attributes —
        nothing belonging to its children — plus any custom metadata.

        Args:
            excluded (tuple): attributes to be excluded during metadata
                DataFrame generation
        Returns:
            base_meta (pd.DataFrame): Requested base metadata.
        """
        skip = ("child", "custom_meta", "custom_meta_df") + excluded
        attrs = get_attrs(self, excluded=skip)
        base_meta = pd.DataFrame(
            data={attr: getattr(self, attr) for attr in attrs},
            index=[self.id],
        )
        if self.custom_meta:
            base_meta = pd.concat([base_meta, self.custom_meta_df], axis=1)
        return base_meta

    def get_child_metadata(self) -> pd.DataFrame:
        """Generates a DataFrame containing metadata of all the
        children SRA objects.

        Returns:
            child_meta (pd.DataFrame): Requested children objects' metadata.
        """
        children = getattr(self, f"{self.child}s")
        frames = [child.generate_meta() for child in children]
        child_meta = pd.concat(frames) if frames else pd.DataFrame()
        child_meta.index.name = f"{self.child}_id"
        return child_meta

    @abstractmethod
    def generate_meta(self) -> pd.DataFrame:
        """Generates a DataFrame with all metadata.

        Metadata from the current object is collected and merged together
        with metadata gathered from all of its children.

        Returns:
            pd.DataFrame: DataFrame containing all metadata.
        """
        pass
144 |
145 |
@dataclass(eq=False)
class SRARun(SRABaseMeta):
    """A class containing all the SRA run metadata.

    Attributes:
        public (bool): True if the dataset was public.
        bytes (int): Size of the run dataset.
        bases (int): Nucleotide count of the run dataset.
        spots (int): Spot count of the run dataset.
        avg_spot_len (int): Average spot length (derived in __post_init__).
        experiment_id (str): ID of the experiment which the run belongs to.
        child (str): Run's child type (None, as runs have no children objects).
    """

    public: bool = True
    bytes: int = None
    bases: int = None
    spots: int = None
    avg_spot_len: int = None
    experiment_id: str = None
    child: str = None

    def __post_init__(self):
        """Initializes custom metadata and calculates an average spot length."""
        super().__post_init__()
        # Guard against missing counts: `spots` and `bases` default to None,
        # and the bare comparison `None > 0` (or `None / n`) would raise a
        # TypeError when a run is constructed with the defaults.
        if self.spots and self.bases is not None:
            self.avg_spot_len = int(self.bases / self.spots)
        else:
            self.avg_spot_len = 0

    def generate_meta(self) -> pd.DataFrame:
        """Generates run's metadata.

        Returns:
            pd.DataFrame: Run's metadata.
        """
        return self.get_base_metadata(excluded=("id",))
183 |
184 |
# eq=False so the DataFrame-aware __eq__ defined on SRABaseMeta is inherited
# instead of being shadowed by a dataclass-generated one; this matches the
# sibling classes that use @dataclass(eq=False) for the same reason.
@dataclass(eq=False)
class SRAExperiment(SRABaseMeta):
    """A class containing all the SRA experiment metadata.

    Attributes:
        instrument (str): Sequencing instrument name.
        platform (str): Sequencing platform name.
        library (LibraryMetadata): Metadata of the sequencing library.
        runs (List[SRARun]): All SRA runs belonging to this experiment.
        sample_id (str): ID of the sample which the experiment belongs to.
        child (str): Runs are children of experiment objects.
    """

    instrument: str = None
    platform: str = None
    library: LibraryMetadata = None
    runs: List[SRARun] = field(default_factory=list)
    sample_id: str = None
    child: str = "run"

    def generate_meta(self) -> pd.DataFrame:
        """Generates experiment's metadata.

        Generated metadata will include all metadata of the linked runs.

        Returns:
            pd.DataFrame: Experiment's metadata with all of its children.
        """
        exp_meta = self.get_base_metadata(excluded=("id", "runs", "library"))
        # NOTE(review): assumes `library` was set — this raises
        # AttributeError if it is left at its default of None.
        lib_meta = self.library.generate_meta()
        lib_meta.index = exp_meta.index

        exp_meta = pd.concat([exp_meta, lib_meta], axis=1)
        runs_meta = self.get_child_metadata()
        if len(runs_meta) > 0:
            runs_merged = runs_meta.merge(
                exp_meta, left_on="experiment_id", right_index=True
            )
            runs_merged.index.name = "run_id"
            return runs_merged
        else:
            return exp_meta
227 |
228 |
@dataclass(eq=False)
class SRASample(SRABaseMeta):
    """A class containing all the SRA sample metadata.

    Attributes:
        name (str): Name of the sample.
        title (str): Title of the sample.
        biosample_id (str): BioSample ID linked to the sample.
        organism (str): Organism name.
        tax_id (str): Organism taxonomic ID.
        study_id (str): ID of the study which the sample belongs to.
        experiments (List[SRAExperiment]): All SRA experiments
            belonging to the sample.
        child (str): Experiments are children of sample objects.
    """

    name: str = None
    title: str = None
    biosample_id: str = None
    organism: str = None
    tax_id: str = None
    study_id: str = None
    experiments: List[SRAExperiment] = field(default_factory=list)
    child: str = "experiment"

    def generate_meta(self) -> pd.DataFrame:
        """Generates SRA sample's metadata.

        Generated metadata will include all metadata of the linked experiments.

        Returns:
            pd.DataFrame: Sample's metadata with all of its children.
        """
        own_meta = self.get_base_metadata(excluded=("id", "experiments"))
        children_meta = self.get_child_metadata()
        if len(children_meta) == 0:
            return own_meta
        merged = children_meta.merge(
            own_meta, left_on="sample_id", right_index=True
        )
        merged.index.name = "run_id"
        return merged
272 |
273 |
# eq=False so the DataFrame-aware __eq__ defined on SRABaseMeta is inherited
# instead of being shadowed by a dataclass-generated one; this matches the
# sibling classes that use @dataclass(eq=False) for the same reason.
@dataclass(eq=False)
class SRAStudy(SRABaseMeta):
    """A class containing all the SRA study metadata.

    Attributes:
        bioproject_id (str): ID of the linked BioProject.
        center_name (str): Name of the center where the study was performed.
        samples (List[SRASample]): All SRA samples belonging to the study.
        child (str): Samples are children of study objects.
    """

    bioproject_id: str = None
    center_name: str = None
    samples: List[SRASample] = field(default_factory=list)
    child: str = "sample"

    def generate_meta(self) -> pd.DataFrame:
        """Generates SRA study's metadata.

        Generated metadata will include all metadata of the linked samples.

        Returns:
            pd.DataFrame: Study's metadata with all of its children.
        """
        study_meta = self.get_base_metadata(excluded=("id", "samples"))
        samples_meta = self.get_child_metadata()
        if len(samples_meta) > 0:
            samples_merged = samples_meta.merge(
                study_meta, left_on="study_id", right_index=True
            )
            samples_merged.index.name = "run_id"
            return samples_merged
        else:
            return study_meta
310 |
--------------------------------------------------------------------------------