├── q2_fondue ├── tests │ ├── data │ │ ├── testaccA.sra │ │ ├── SRR123456.sra │ │ ├── SRR123457.sra │ │ ├── SRP123456_md.tsv │ │ ├── SRS123456_md.tsv │ │ ├── SRX123456_md.tsv │ │ ├── PRJNA734376_md.tsv │ │ ├── SRR123456_md.tsv │ │ ├── SRR123457_md.tsv │ │ ├── study_ids.tsv │ │ ├── testaccB_md.tsv │ │ ├── testaccC_md.tsv │ │ ├── bioproject_ids.tsv │ │ ├── run_ids.tsv │ │ ├── sample_ids.tsv │ │ ├── experiment_ids.tsv │ │ ├── sample_ids_w_doi.tsv │ │ ├── study_ids_w_doi.tsv │ │ ├── SRR1234567_md.tsv │ │ ├── bioproject_ids_w_doi.tsv │ │ ├── experiment_ids_w_doi.tsv │ │ ├── failed_ids_no_doi.tsv │ │ ├── testaccBC_md.tsv │ │ ├── run_ids_w_doi.tsv │ │ ├── metadata_response_error.xml │ │ ├── run_ids_w_doi_2.tsv │ │ ├── empty │ │ │ ├── xxx_00_L001_R1_001.fastq.gz │ │ │ └── xxx_00_L001_R2_001.fastq.gz │ │ ├── paired1 │ │ │ ├── SEQID1_00_L001_R1_001.fastq.gz │ │ │ ├── SEQID1_00_L001_R2_001.fastq.gz │ │ │ ├── SEQID2_00_L001_R1_001.fastq.gz │ │ │ └── SEQID2_00_L001_R2_001.fastq.gz │ │ ├── paired2 │ │ │ ├── SEQID3_00_L001_R1_001.fastq.gz │ │ │ ├── SEQID3_00_L001_R2_001.fastq.gz │ │ │ ├── SEQID4_00_L001_R1_001.fastq.gz │ │ │ └── SEQID4_00_L001_R2_001.fastq.gz │ │ ├── single1 │ │ │ ├── SEQID1_00_L001_R1_001.fastq.gz │ │ │ └── SEQID2_00_L001_R1_001.fastq.gz │ │ ├── single2 │ │ │ ├── SEQID3_00_L001_R1_001.fastq.gz │ │ │ └── SEQID4_00_L001_R1_001.fastq.gz │ │ ├── SRR123457_2.fastq │ │ ├── testacc_2.fastq │ │ ├── testacc_00_L001_R2_001.fastq │ │ ├── elink_response_single.json │ │ ├── esearch_response_single_ambiguous.json │ │ ├── esearch_response_single_correct.json │ │ ├── sra-metadata-1.tsv │ │ ├── sra-metadata-2.tsv │ │ ├── sra-metadata-3.tsv │ │ ├── esearch_response_multi_invalid.json │ │ ├── sra-metadata-4.tsv │ │ ├── sra-metadata-failed-ids.tsv │ │ ├── sra-metadata-5.tsv │ │ ├── sra-metadata-6.tsv │ │ ├── sra-metadata-7.tsv │ │ ├── sra-metadata-8.tsv │ │ ├── sra-metadata-mock.tsv │ │ ├── sra-metadata-exp-4.tsv │ │ ├── esearch_response_multi_correct.json │ │ ├── 
esearch_response_multi_mixed.json │ │ ├── sra-metadata-exp-2.tsv │ │ ├── sra-metadata-exp-3.tsv │ │ ├── sra-metadata-exp-5.tsv │ │ ├── sra-metadata-exp-1.tsv │ │ ├── testaccHYB.fastq │ │ ├── fasterq-dump-response.txt │ │ ├── SRR123456.fastq │ │ ├── testaccA.fastq │ │ ├── testacc_1.fastq │ │ ├── SRR123457_1.fastq │ │ ├── testaccA_01_L001_R1_001.fastq │ │ ├── testacc_00_L001_R1_001.fastq │ │ ├── testaccHYB_2.fastq │ │ ├── testaccHYB_1.fastq │ │ ├── scraper_items_no_doi.json │ │ ├── efetch_b2_response_runs.xml │ │ ├── efetch_b1_response_runs.xml │ │ ├── efetch_response_runs_single_item.xml │ │ ├── metadata_processed_multi.json │ │ ├── efetch_response_runs.xml │ │ ├── scraper_items_no_attach.json │ │ └── metadata_response_small.json │ ├── __init__.py │ ├── test_query.py │ ├── test_get_all.py │ ├── test_esearch.py │ ├── _utils.py │ └── test_utils.py ├── types │ ├── tests │ │ ├── data │ │ │ ├── sra-failed-ids-empty.tsv │ │ │ ├── ncbi-ids-wrong.tsv │ │ │ ├── ncbi-ids-bioprojects.tsv │ │ │ ├── ncbi-ids-other.tsv │ │ │ ├── ncbi-ids-studies.tsv │ │ │ ├── ncbi-ids-runs.tsv │ │ │ ├── ncbi-ids-runs-wrong-id-header.tsv │ │ │ ├── ncbi-ids-runs-doi.tsv │ │ │ ├── ncbi-ids-runs-no-doi.tsv │ │ │ ├── sra-failed-ids.tsv │ │ │ ├── sra-metadata-missing-columns.tsv │ │ │ ├── sra-metadata-missing-ids.tsv │ │ │ └── sra-metadata.tsv │ │ └── __init__.py │ ├── _type.py │ ├── __init__.py │ ├── _transformer.py │ └── _format.py ├── entrezpy_clients │ ├── __init__.py │ ├── _utils.py │ ├── _esearch.py │ ├── _pipelines.py │ └── _sra_meta.py ├── __init__.py ├── query.py ├── get_all.py ├── citations.bib ├── utils.py └── metadata.py ├── .gitattributes ├── tutorial ├── metadata_file.tsv └── metadata_file_runs.tsv ├── setup.cfg ├── logo.png ├── .github └── workflows │ ├── join-release.yaml │ ├── tag-release.yaml │ ├── dependecies.yaml │ ├── ci.yaml │ ├── q2-ci.yaml │ ├── dependent-issues.yaml │ └── docker-push.yaml ├── parallel.config ├── .coveragerc ├── .copier-answers.yml ├── Makefile ├── conda-recipe 
└── meta.yaml ├── .gitignore ├── pyproject.toml ├── LICENSE ├── Dockerfile └── install-sra-tools.sh /q2_fondue/tests/data/testaccA.sra: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123456.sra: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123457.sra: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | pyproject.toml export-subst 2 | -------------------------------------------------------------------------------- /tutorial/metadata_file.tsv: -------------------------------------------------------------------------------- 1 | id 2 | PRJEB14186 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRP123456_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRP123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRS123456_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRS123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRX123456_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRX123456 3 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/sra-failed-ids-empty.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | 
-------------------------------------------------------------------------------- /q2_fondue/tests/data/PRJNA734376_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | PRJNA734376 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123456_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123457_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123457 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/study_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | ERP12345 3 | SRP23456 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccB_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccC_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123457 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/bioproject_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | PRJNA123 3 | PRJNA234 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/run_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRR123 3 | SRR234 4 | SRR345 5 | -------------------------------------------------------------------------------- 
/q2_fondue/tests/data/sample_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | ERS147978 3 | ERS3588233 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203 4 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/logo.png -------------------------------------------------------------------------------- /q2_fondue/tests/data/experiment_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | ERX115020 3 | SRX10331465 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sample_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | id DOI 2 | SRS000100 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/study_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | id DOI 2 | SRP000001 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-wrong.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | ABC123 3 | SRX098 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR1234567_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | SRR123457 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/bioproject_ids_w_doi.tsv: 
-------------------------------------------------------------------------------- 1 | ID DOI 2 | PRJNA33627 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/experiment_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | id DOI 2 | SRX000007 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/failed_ids_no_doi.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRR000001 3 | SRR000002 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccBC_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | SRR123457 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-bioprojects.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | PRJ1234 3 | PRJ56789 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-other.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | ERX115020 3 | ERS115020 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-studies.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | ERP104978 3 | SRP123456 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/run_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | ID DOI 2 | SRR000001 some_doi1 3 | SRR000002 some_doi2 4 | -------------------------------------------------------------------------------- 
/q2_fondue/tests/data/metadata_response_error.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/run_ids_w_doi_2.tsv: -------------------------------------------------------------------------------- 1 | ID DOI 2 | SRR123456 some_doi1 3 | SRR123457 some_doi2 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | SRR000013 3 | SRR000001 4 | ERR3978173 5 | ERR3978174 6 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs-wrong-id-header.tsv: -------------------------------------------------------------------------------- 1 | wrongID 2 | SRR000013 3 | SRR000001 4 | ERR3978173 5 | ERR3978174 6 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs-doi.tsv: -------------------------------------------------------------------------------- 1 | ID DOI 2 | SRR000013 some_doi1 3 | SRR000001 some_doi2 4 | ERR3978173 some_doi3 5 | ERR3978174 some_doi4 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/empty/xxx_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/empty/xxx_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/empty/xxx_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/empty/xxx_00_L001_R2_001.fastq.gz 
-------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs-no-doi.tsv: -------------------------------------------------------------------------------- 1 | ID FUNFACT 2 | SRR000013 some_doi1 3 | SRR000001 some_doi2 4 | ERR3978173 some_doi3 5 | ERR3978174 some_doi4 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID1_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID1_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID1_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID1_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID2_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID2_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID2_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID2_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID3_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID3_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID3_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID3_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID4_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID4_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID4_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID4_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single1/SEQID1_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single1/SEQID1_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single1/SEQID2_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single1/SEQID2_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single2/SEQID3_00_L001_R1_001.fastq.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single2/SEQID3_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single2/SEQID4_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single2/SEQID4_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/sra-failed-ids.tsv: -------------------------------------------------------------------------------- 1 | ID Error message 2 | SRR000020 ID is ambiguous. 3 | SRR000021 ID is invalid. 4 | ERR0000020 ID is ambiguous. 5 | ERR0000021 ID is invalid. 6 | -------------------------------------------------------------------------------- /.github/workflows/join-release.yaml: -------------------------------------------------------------------------------- 1 | name: join-release 2 | on: 3 | workflow_dispatch: {} 4 | jobs: 5 | release: 6 | uses: qiime2/distributions/.github/workflows/lib-join-release.yaml@dev 7 | -------------------------------------------------------------------------------- /.github/workflows/tag-release.yaml: -------------------------------------------------------------------------------- 1 | name: tag-release 2 | on: 3 | push: 4 | branches: ["Release-*"] 5 | jobs: 6 | tag: 7 | uses: qiime2/distributions/.github/workflows/lib-tag-release.yaml@dev 8 | -------------------------------------------------------------------------------- /parallel.config: -------------------------------------------------------------------------------- 1 | [parsl] 2 | 3 | [[parsl.executors]] 4 | class = "HighThroughputExecutor" 5 | label = "default" 6 | max_workers = 1 7 | 8 | [parsl.executors.provider] 9 | class = "LocalProvider" 10 | 
max_blocks = 4 -------------------------------------------------------------------------------- /.github/workflows/dependecies.yaml: -------------------------------------------------------------------------------- 1 | name: Dependency check 2 | on: 3 | pull_request: 4 | branches: ["main"] 5 | types: [opened, reopened, synchronize, labeled, unlabeled] 6 | 7 | jobs: 8 | ci: 9 | uses: bokulich-lab/utilities/.github/workflows/dependencies.yaml@main 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = q2_fondue 3 | branch = True 4 | omit = 5 | */tests* 6 | */__init__.py 7 | q2_fondue/_version.py 8 | versioneer.py 9 | 10 | [report] 11 | fail_under = 90 12 | omit = 13 | */tests* 14 | */__init__.py 15 | q2_fondue/_version.py 16 | versioneer.py 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | branches: ["main"] 5 | push: 6 | branches: ["main"] 7 | tags: ["*"] 8 | 9 | jobs: 10 | ci: 11 | uses: bokulich-lab/utilities/.github/workflows/ci.yaml@main 12 | with: 13 | distro: moshpit 14 | build_docker: true 15 | -------------------------------------------------------------------------------- /.github/workflows/q2-ci.yaml: -------------------------------------------------------------------------------- 1 | name: QIIME 2 CI 2 | on: 3 | pull_request: 4 | branches: ["main"] 5 | push: 6 | branches: ["main"] 7 | 8 | jobs: 9 | qiime-ci: 10 | uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev 11 | with: 12 | distro: moshpit 13 | recipe-path: 'conda-recipe' 14 | -------------------------------------------------------------------------------- /q2_fondue/tests/__init__.py: 
-------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/__init__.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123457_2.fastq: -------------------------------------------------------------------------------- 1 | @test_acc_single.1 test_1_seq length=59 2 | TTGGGGGGCACCATCTAATCAGCTGCCAGTGCTGCCAGAACATAAAGCAGGCAGAAATT 3 | +test_acc_single.1 test_1_seq length=59 4 | ?60-*'$"<=D===;8C=<<<<<==C<=<<==<=C=:={{ q2_types }} 25 | - qiime2 >={{ qiime2 }} 26 | - tqdm {{ tqdm }} 27 | build: 28 | - python {{ python }} 29 | - setuptools 30 | - versioningit 31 | test: 32 | imports: 33 | - q2_fondue 34 | - qiime2.plugins.fondue 35 | requires: 36 | - parameterized 37 | - coverage 38 | - pytest-cov 39 | commands: 40 | - pytest --cov q2_fondue --cov-report xml:coverage.xml --pyargs q2_fondue 41 | about: 42 | home: https://github.com/bokulich-lab/q2-fondue 43 | license: BSD-3-Clause 44 | license_family: BSD 45 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/esearch_response_multi_mixed.json: -------------------------------------------------------------------------------- 1 | { 2 | "header": { 3 | "type": "esearch", 4 | "version": "0.3" 5 | }, 6 | "esearchresult": { 7 | "count": "8", 8 | "retmax": "0", 9 | "retstart": "0", 10 | "idlist": [], 11 | "translationset": [], 12 | "translationstack": [ 13 | { 14 | "term": "SRR000001[All Fields]", 15 | "field": "All Fields", 16 | "count": "1", 17 | "explode": "N" 18 | }, 19 | { 20 | "term": "SRR000013[All Fields]", 21 | "field": "All Fields", 22 | "count": "1", 23 | "explode": "N" 24 | }, 25 | "OR", 26 | { 27 | "term": "SR012[All Fields]", 28 | "field": "All Fields", 29 | "count": "7", 30 | "explode": "N" 31 | }, 32 | "OR" 33 | ], 34 | "querytranslation": "SRR000001[All Fields] OR SRR000013[All Fields] OR SR012[All Fields]", 35 | "errorlist": { 36 | "phrasesnotfound": [ 37 | "ABCD123", "SRR001" 38 | ], 
"fieldsnotfound": []}}} 39 | -------------------------------------------------------------------------------- /.github/workflows/dependent-issues.yaml: -------------------------------------------------------------------------------- 1 | name: Dependent issues 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | - edited 8 | - closed 9 | - reopened 10 | pull_request_target: 11 | types: 12 | - opened 13 | - edited 14 | - closed 15 | - reopened 16 | - synchronize 17 | 18 | schedule: 19 | - cron: '0 0 * * *' 20 | 21 | jobs: 22 | check: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: z0al/dependent-issues@v1 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | GITHUB_READ_TOKEN: ${{ secrets.GITHUB_READ_TOKEN }} 29 | 30 | with: 31 | label: dependent 32 | 33 | # (Optional) Enable checking for dependencies in issues. 34 | # Enable by setting the value to "on". Default "off" 35 | check_issues: off 36 | 37 | ignore_dependabot: off 38 | 39 | keywords: depends on, blocked by, merge after 40 | 41 | comment: > 42 | This PR/issue depends on: 43 | {{ dependencies }} 44 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-2.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF 4 | SRR123458 RANDOM GENOMIC 
SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC AB12 5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DE34 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-3.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF AB12 4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GHI DE12 5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GH34 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-5.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen 
Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF AB12 4 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF XXX 5 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DE34 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | # vi 65 | .*.swp 66 | 67 | # other 68 | *~ 69 | .env 70 | 71 | .DS_store 72 | .idea 73 | .vscode 74 | 75 | 76 | fasterq.tmp.*/** 77 | sratoolkit**/** 78 | 79 | # ignore dbGAP permission keys 80 | **.krt 81 | **.ngc 82 | 83 | # Version file from versioningit 84 | _version.py 85 | 86 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "q2-fondue" 3 | authors = [ 4 | { name = "Michal Ziemski", email = "ziemski.michal@gmail.com" } 5 | ] 6 | description = "None" 7 | readme = {file = "README.md", content-type = "text/markdown"} 8 | license = {file = "LICENSE"} 9 | dynamic = ["version"] 10 | 11 | [project.urls] 12 | Homepage = "https://github.com/bokulich-lab/q2-fondue" 13 | Repository = "https://github.com/bokulich-lab/q2-fondue" 14 | 15 | [project.entry-points.'qiime2.plugins'] 16 | "q2-fondue" = "q2_fondue.plugin_setup:plugin" 17 | 18 | [build-system] 19 | requires = [ 20 | "setuptools", 21 | "versioningit", 22 | "wheel" 23 | ] 24 | build-backend = "setuptools.build_meta" 25 | 26 | [tool.versioningit.vcs] 27 | method = "git-archive" 28 | describe-subst = "2026.1.0.dev0-1-g71797cbd" 29 | default-tag = "0.0.1" 30 | 31 | [tool.versioningit.next-version] 32 | method = "minor" 33 | 34 | [tool.versioningit.format] 35 | distance = "{base_version}+{distance}.{vcs}{rev}" 36 | dirty = 
"{base_version}+{distance}.{vcs}{rev}.dirty" 37 | distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty" 38 | 39 | [tool.versioningit.write] 40 | file = "q2_fondue/_version.py" 41 | 42 | [tool.setuptools] 43 | include-package-data = true 44 | 45 | [tool.setuptools.packages.find] 46 | where = ["."] 47 | include = ["q2_fondue*"] 48 | 49 | [tool.setuptools.package-data] 50 | q2_fondue = ["**/*"] 51 | -------------------------------------------------------------------------------- /q2_fondue/query.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | 9 | import threading 10 | import pandas as pd 11 | 12 | from q2_fondue.utils import handle_threaded_exception 13 | from q2_fondue.entrezpy_clients._pipelines import _get_run_ids 14 | 15 | threading.excepthook = handle_threaded_exception 16 | 17 | 18 | def get_ids_from_query( 19 | query: str, email: str, threads: int = 1, log_level: str = "INFO" 20 | ) -> pd.Series: 21 | """Retrieves SRA run IDs based on a search query performed 22 | on the BioSample database. 23 | 24 | Args: 25 | query (str): Search query to be executed on 26 | the BioSample database. 27 | email (str): A valid e-mail address (required by NCBI). 28 | threads (int, default=1): Number of threads to be used in parallel. 29 | log_level (str, default='INFO'): Logging level. 30 | 31 | Returns: 32 | ids (pd.Series): Retrieved SRA run IDs. 
33 | """ 34 | run_ids = _get_run_ids(email, threads, None, query, "biosample", log_level) 35 | 36 | return pd.Series(run_ids, name="ID") 37 | -------------------------------------------------------------------------------- /q2_fondue/tests/test_query.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import pandas as pd 9 | import unittest 10 | 11 | from pandas.testing import assert_frame_equal 12 | from qiime2.plugins import fondue 13 | from unittest.mock import patch 14 | 15 | from q2_fondue.tests.test_sequences import SequenceTests 16 | 17 | 18 | class TestQuery(SequenceTests): 19 | package = "q2_fondue.tests" 20 | 21 | @patch("q2_fondue.query._get_run_ids", return_value=["SRR123", "SRR234"]) 22 | def test_query(self, mock_ids): 23 | query = "some magical query text" 24 | 25 | (obs_ids,) = fondue.actions.get_ids_from_query( 26 | query, "fake@email.com", 1, "DEBUG" 27 | ) 28 | exp_ids = pd.DataFrame( 29 | index=pd.Index(["SRR123", "SRR234"], name="ID"), 30 | columns=[], 31 | ) 32 | 33 | mock_ids.assert_called_once_with( 34 | "fake@email.com", 1, None, query, "biosample", "DEBUG" 35 | ) 36 | assert_frame_equal(obs_ids.view(pd.DataFrame), exp_ids) 37 | 38 | 39 | if __name__ == "__main__": 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-1.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title 
Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF 4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GHI 5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC JKL 6 | SRR123460 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC MNO 7 | SRR123461 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC PQR 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2025, Bokulich Laboratories. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccHYB.fastq: -------------------------------------------------------------------------------- 1 | @ERR3018303.92 Bgsng7131.m10_5758889 length=224 2 | CCTGTTCGCTCCCCACGCTTTCGAGCCTCAGCGTCAGTTACAGACCAGAGAGCCGCTTTCGCCACCGGTGTTCCTCCATATATCTACGCATTTCACCGCTACACATGGAATTCCACTCTCCCCTTCTGCACTCAAGTTTGACAGTTTCCAAAGCGAACTATGGTTGAGCCACAGCCTTTAACTTCAGACTTATCAAACCGCCTGCGCTCGCTTTACGCCCAATA 3 | +ERR3018303.92 Bgsng7131.m10_5758889 length=224 4 | HHHHHHHGGGHGGGFGGGGGGHGGGGGHHHHHGGGGGHHHHHHHHHHHHGFGFHGGGGGHGGGGGHGGGGGGHHHHHGFHHGHHHHHGGGGGHHHHHGGGGGHHHHHHF1FGFHHHHHHHHHGHHHHHGHHHHHHHHHHHFHGHHHHHHGHGHHFGGGHHGHHHHGHFGGGGGGGGGGGGGGGGGFGGGGGFGGGGFGGGFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ERR3018303.93 Bgsng7131.m10_1129555 length=229 6 | TACGTAGGTGGCAAGCGCTGTCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGAAGTAGTGCATTGGAAACTGGAAGACTTGAGTGCAGAAGAGGAGAGTGGAACTCAATGTGTAGCGGTGAAATGGGTAGATATATGGAAGAACACCAGTGGCGAAAGCGGCGCGTTGGCCTGTAACTGACGCTGAGGT 7 | +ERR3018303.93 Bgsng7131.m10_1129555 length=229 8 | D2FEGFA0GGE0GFCHG/A/B11AEEE/G1FBG1GA>>EA/GFGGHG///E@EEE>/?1BGB1B>F2B1G1212FGFFHHB00>1GGDGHFBBDGHEHGHAACC/;.CGC:FB/C0000CGG?GGGGG00C0.BB.09CBFGGGGGB.9/F9A-9/;/-;99@?/99@########################### 9 | @ERR3018303.94 Bgsng7131.m10_1839802 length=250 10 | TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGACTTTTAAGTGAGATGTGAAATACCCGGGCTCAACTTGGGTGCTGCATTTCAAACTGGAAGTCTAGAGTGCAGGAGAGGAGAATGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGATTCTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAAC 11 | +ERR3018303.94 Bgsng7131.m10_1839802 length=250 12 | CBBCCFFCFFCFGGGGGGGGGGHGGGGGHHHHHHHHGGFGHHGGGGGGGGGGHGGGGGHHHFGHHGHHGHHHHHHEGHHHHGGGFGHGHGGHGHHGEFFHGHHHHHGHGGHGHHHHHHHGHFHHHHHGGFFHGDFGHFHGHHGHHGHHHHHHHFHHGDGGGFGHGHFGD?EFCHGHHFCGHFHHGGGGGGGFFGFADAED?CFFFFFFFB;BEBFFFFFF/ADFAAFEFEFF@BADFFFFFFFAABEFF: 13 | 
-------------------------------------------------------------------------------- /q2_fondue/tests/data/fasterq-dump-response.txt: -------------------------------------------------------------------------------- 1 | cursor-cache : 5,242,880 bytes 2 | buf-size : 1,048,576 bytes 3 | mem-limit : 52,428,800 bytes 4 | threads : 6 5 | scratch-path : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/fasterq.tmp.MacBook-Pro.35899/' 6 | total ram : 17,179,869,184 bytes 7 | output-format: FASTQ split 3 8 | check-mode : only 9 | output-file : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/ERR2750829.fastq' 10 | output-dir : '.' 11 | output : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/ERR2750829.fastq' 12 | append-mode : 'NO' 13 | stdout-mode : 'NO' 14 | seq-defline : '@$ac.$si $sn length=$rl' 15 | qual-defline : '+$ac.$si $sn length=$rl' 16 | only-unaligned : 'NO' 17 | only-aligned : 'NO' 18 | accession : 'ERR2750829' 19 | accession-path: 'ERR2750829' 20 | est. output : 44,926,989,570 bytes 21 | disk-limit (OS) : 9,149,612,032 bytes 22 | disk-limit-tmp (OS) : 9,149,612,032 bytes 23 | out/tmp on same fs : 'NO' 24 | 25 | ERR2750829 is remote 26 | ... has a size of 12,459,034,417 bytes 27 | ... is cSRA without alignments 28 | ... SEQ has NAME column = YES 29 | ... SEQ has SPOT_GROUP column = YES 30 | ... uses 'SEQUENCE' as sequence-table 31 | SEQ.first_row = 1 32 | SEQ.row_count = 84,543,740 33 | SEQ.spot_count = 84,543,740 34 | SEQ.total_base_count = 16,545,432,985 35 | SEQ.bio_base_count = 16,545,432,985 36 | SEQ.avg_name_len = 1 37 | SEQ.avg_spot_group_len = 0 38 | SEQ.avg_bio_reads_per_spot = 2 39 | SEQ.avg_tech_reads_per_spot = 0 40 | ALIGN.first_row = 0 41 | ALIGN.row_count = 0 42 | ALIGN.spot_count = 0 43 | ALIGN.total_base_count = 0 44 | ALIGN.bio_base_count = 0 45 | 46 | disk-limit exeeded! 
47 | fasterq-dump quit with error code 3 48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:latest AS base 2 | 3 | ARG ENVIRONMENT 4 | ARG PLUGIN_NAME 5 | 6 | ENV PLUGIN_NAME=$PLUGIN_NAME 7 | ENV PATH=/opt/conda/envs/${PLUGIN_NAME}/bin:$PATH \ 8 | LC_ALL=C.UTF-8 LANG=C.UTF-8 \ 9 | MPLBACKEND=agg \ 10 | UNIFRAC_USE_GPU=N \ 11 | HOME=/home/qiime2 \ 12 | XDG_CONFIG_HOME=/home/qiime2 13 | 14 | WORKDIR /home/qiime2 15 | COPY environment.yml . 16 | COPY install-sra-tools.sh . 17 | 18 | RUN apt-get update \ 19 | && apt-get install -y --no-install-recommends wget curl procps make \ 20 | && apt-get clean \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | RUN conda update -qy conda \ 24 | && conda install -c conda-forge -qy mamba \ 25 | && mamba env create -n ${PLUGIN_NAME} --file environment.yml \ 26 | && mamba run -n ${PLUGIN_NAME} bash install-sra-tools.sh \ 27 | && mamba clean --all --yes \ 28 | && chmod -R a+rwx /opt/conda 29 | 30 | RUN mkdir -p .ncbi 31 | RUN printf '/LIBS/GUID = "%s"\n' `uuidgen` > .ncbi/user-settings.mkfg 32 | 33 | COPY . 
./plugin 34 | RUN mamba run -n ${PLUGIN_NAME} pip install ./plugin 35 | 36 | RUN /bin/bash -c "source activate ${PLUGIN_NAME}" 37 | ENV CONDA_PREFIX=/opt/conda/envs/${PLUGIN_NAME}/ 38 | RUN mamba run -n ${PLUGIN_NAME} qiime dev refresh-cache 39 | RUN echo "source activate ${PLUGIN_NAME}" >> $HOME/.bashrc 40 | RUN echo "source tab-qiime" >> $HOME/.bashrc 41 | 42 | 43 | FROM base AS test 44 | 45 | RUN mamba run -n ${PLUGIN_NAME} pip install pytest pytest-cov coverage parameterized pytest-xdist 46 | CMD mamba run -n ${PLUGIN_NAME} make -f ./plugin/Makefile test-cov 47 | 48 | FROM base AS prod 49 | 50 | # Important: let any UID modify these directories so that 51 | # `docker run -u UID:GID` works 52 | RUN rm -rf ./plugin 53 | RUN chmod -R a+rwx /home/qiime2 -------------------------------------------------------------------------------- /q2_fondue/get_all.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | 9 | import qiime2 as q2 10 | 11 | import pandas as pd 12 | import threading 13 | 14 | from q2_fondue.utils import handle_threaded_exception 15 | from qiime2 import Artifact 16 | 17 | 18 | threading.excepthook = handle_threaded_exception 19 | 20 | 21 | def get_all( 22 | ctx, accession_ids, email, threads=1, retries=2, log_level="INFO", linked_doi=None 23 | ): 24 | 25 | # get required methods 26 | get_metadata = ctx.get_action("fondue", "get_metadata") 27 | get_sequences = ctx.get_action("fondue", "get_sequences") 28 | 29 | # fetch metadata 30 | metadata, failed_ids = get_metadata( 31 | accession_ids, email, threads, log_level, linked_doi 32 | ) 33 | failed_ids_df = failed_ids.view(pd.DataFrame) 34 | 35 | # fetch sequences - use metadata to get run ids, regardless if 36 | # runs or projects were requested 37 | run_ids = q2.Artifact.import_data( 38 | "NCBIAccessionIDs", pd.Series(metadata.view(pd.DataFrame).index) 39 | ) 40 | ( 41 | seq_single, 42 | seq_paired, 43 | failed_ids, 44 | ) = get_sequences(run_ids, email, retries, threads, log_level) 45 | failed_ids_df = pd.concat([failed_ids_df, failed_ids.view(pd.DataFrame)]) 46 | if failed_ids_df.shape[0] > 0: 47 | failed_ids = Artifact.import_data("SRAFailedIDs", failed_ids_df) 48 | 49 | return metadata, seq_single, seq_paired, failed_ids 50 | -------------------------------------------------------------------------------- /q2_fondue/citations.bib: -------------------------------------------------------------------------------- 1 | @article {Ziemski2022, 2 | author = {Ziemski, Michal and Adamov, Anja and Kim, Lina and Flörl, Lena and Bokulich, Nicholas A}, 3 | title = {Reproducible acquisition, management, and meta-analysis of nucleotide sequence (meta)data using q2-fondue}, 4 | year = {2022}, 5 | month = {09}, 6 | doi = {10.1093/bioinformatics/btac639}, 7 | URL = {https://doi.org/10.1093/bioinformatics/btac639}, 8 | journal = 
{Bioinformatics}, 9 | issn = {1367-4803}, 10 | } 11 | 12 | @article{Buchmann2019, 13 | author = {Buchmann, Jan P and Holmes, Edward C}, 14 | doi = {10.1093/bioinformatics/btz385}, 15 | editor = {Wren, Jonathan}, 16 | journal = {Bioinformatics}, 17 | month = {nov}, 18 | number = {21}, 19 | pages = {4511--4514}, 20 | publisher = {Oxford University Press}, 21 | title = {Entrezpy: a Python library to dynamically interact with the NCBI Entrez databases}, 22 | url = {https://academic.oup.com/bioinformatics/article/35/21/4511/5488119}, 23 | volume = {35}, 24 | year = {2019} 25 | } 26 | 27 | @misc{SraToolkit, 28 | name = {SRA Toolkit}, 29 | author = {SRA Toolkit Development Team}, 30 | version = {2.9.6}, 31 | url = {https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software} 32 | } 33 | 34 | @misc{stephan_hugel_2019_2917290, 35 | author = {Stephan Hügel and Peter Gerdes and Patrick Fournier and 36 | emuzie and Patrick Golden and jghauser and Stefan Frühwirth and 37 | Sean Takats and Pablo Orduña and Merlin and Erik Hetzner and 38 | Christian Brodbeck and Avram Lyon and A Lee}, 39 | title = {urschrei/pyzotero: Zenodo Release}, 40 | month = {may}, 41 | year = 2019, 42 | publisher = {Zenodo}, 43 | version = {v1.3.15}, 44 | doi = {10.5281/zenodo.2917290}, 45 | url = {https://doi.org/10.5281/zenodo.2917290} 46 | } 47 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123456.fastq: -------------------------------------------------------------------------------- 1 | @test_acc_single.1 test_1_seq length=278 2 | AAACTCCTAGCCTACATCCGTACGAGTTAGCGTGGGATTACGAGGTGCACACCATTTCATTCCGTACGGGTAAATTTTTGTATTTTTAGCAGACGGCAGGGTTTCACCATGGTTGACCAACGTACTAATCTTGAACTCCTGACCTCAAGTGATTTGCCTGCCTTCAGCCTCCCAAAGTGACTGGGTATTACAGATGTGAGCGAGTTTGTGCCCAAGCCTTATAAGTAAATTTATAAATTTACATAATTTAAATGACTTATGCTTAGCGAAATAGTTTA 3 | +test_acc_single.1 test_1_seq length=278 4 | 
85)9=9/3-8?68<7=8<3657747==49==+;FB2;A;5:'*>69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A sratoolkit.tar.gz 26 | 27 | echo "Extracting..." 28 | tar -xzf sratoolkit.tar.gz 29 | rm sratoolkit.tar.gz 30 | mv "sratoolkit.${TOOLKIT_VER}-${OS_VER}/" "sratoolkit/" 31 | 32 | if [[ "$PREFIX" == "" ]]; then 33 | echo "Setting PREFIX=$CONDA_PREFIX" 34 | PREFIX="$CONDA_PREFIX" 35 | fi 36 | 37 | echo "Installing SRA Tools in $PREFIX..." 38 | if [[ ! 
-d "$PREFIX/bin/" ]]; then 39 | mkdir $PREFIX/bin/ 40 | fi 41 | find sratoolkit/bin/ -maxdepth 1 -type f -exec mv -f {} $PREFIX/bin/ \; 42 | find sratoolkit/bin/ -maxdepth 1 -type l -exec mv -f {} $PREFIX/bin/ \; 43 | rm -r sratoolkit 44 | 45 | echo "Testing installation..." 46 | if [[ $(which prefetch) == "$PREFIX/bin"* ]]; then 47 | echo "Success!" 48 | else 49 | echo "Installation failed." 50 | exit 1 51 | fi 52 | 53 | echo "Configuring SRA Toolkit:" 54 | SRA_CACHE_LOC="$HOME/.prefetch_cache" 55 | echo "Creating prefetch cache directory under $SRA_CACHE_LOC..." 56 | mkdir "$SRA_CACHE_LOC" 57 | echo "Running vdb-config..." 58 | vdb-config -s "/repository/user/main/public/root=$SRA_CACHE_LOC" 59 | vdb-config --prefetch-to-user-repo 60 | echo "Configuration completed." 61 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccHYB_2.fastq: -------------------------------------------------------------------------------- 1 | @ERR3018303.88 Bgsng7131.m10_3542277 length=228 2 | TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCCATCGCTTAACGGTGGATCCGCGCCGGGTACGGGCGGGCTTGAGTGCGGTAGGGGAGACTGGAATTCCCGGTGTAACGGTGGAATGTGTAGATATCAGGAAGAACACCAATGGCGAAGGCAGGTCTCTGGGCCGTTACTGACGCTGAGG 3 | +ERR3018303.88 Bgsng7131.m10_3542277 length=228 4 | HHHHHHHGGHGGHHHHGGGGHHHGGGGGHHHHHHHHGGGGHGHGGHHGGGGGGGGGGGGGGGGGGGGGGGGGGHHHHGHHHHHGGGHGHHGGGEGGHHHHGGGGGCGAGHGGGGGFFFFEF.99BFDCAFFFFFFC.EFFFFFFFFFFDCDF.BFFFFDEFF/BFBFFFFFFFFB/;B;BF.9FBBDFE;/;BDFAFFFEA;;B/BFFFFEED=.A99BFFFF?DFFF 5 | @ERR3018303.89 Bgsng7131.m10_3605463 length=230 6 | TACGTAGGTGGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTCAGTCTGATGTGAAAGCCTTCGGCTTAACCGAAGTAGTGCATTGGAAACTGGAAGACTTGAGTGCAGAAGAGGAGAGTGGCACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGGGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGTGGTT 7 | +ERR3018303.89 Bgsng7131.m10_3605463 length=230 8 | 
GGHHHGGEDBFHGFFEFGEFFHHGGGEGHH5BFFHGEFGGHBGGAGFGGCCGGEGEGGHHG4FDFDFDDGHHDHHF2CFFHHHD>?/2G1?AFFD0DFBGGHFFHBGF0CGBCHDGFDF00G0GGFFCGECDHGFHG.C0;/C/9B0;CFBB?AEGGBBBFFEDDAGEFFFBFFFFBFB.BFAE...-@-@BF?/AA;-.9BBFFBFFFFFFFFEBF?DA###### 9 | @ERR3018303.90 Bgsng7131.m10_3641968 length=231 10 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCG 11 | +ERR3018303.90 Bgsng7131.m10_3641968 length=231 12 | GGGHHHHGHHHHGGGGGGGGGHHGGGGGHHHHHHHHGGGGHHHGGGGGGGGGGGGGGGGFFHHHDGHHHHHHGGHHHHHHHHGHHHHHGHHGGHHHHHGGGHGHHFHHBHHHGHHHHFHGFFFFFFFFFFFFFFFFFFFFFFFFFFEFA 13 | @ERR3018303.91 Bgsng7131.m10_3716454 length=231 14 | TACGTAGGTGGCAAGCGTTATCCGGATTTATTGGGCGTAAAGAGAGTGCAGGCGGTTTTCTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGGAGAAGTGCATCGGAAACTGGATAACTTGAGTGCAGAAGAGGGTAGTGGAACTCCATGTGTAGCGGTGGAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTACCTGGTCTGCAACTGACGCTGAGACTC 15 | +ERR3018303.91 Bgsng7131.m10_3716454 length=231 16 | GGGHHHHGHGGHHHGHGGGGHHHGGGGGGHEGHHHHGFGGHHHHHGHHHHHHGGGEFGGGHGHGGFHGEDG4GEGH3BECGEHG@CCBHBGFG/BF1=FDG>D-DEF.BFF?9BFFFCFBFFE.DFAF/BFFFF;.99AAEBFFFFFF?;FE/;9@@-;9BADFAFF-9=-;@F 5 | @ERR3018303.89 Bgsng7131.m10_3605463 length=250 6 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGGGCGAACA 7 | +ERR3018303.89 Bgsng7131.m10_3605463 length=250 8 | BBBBBFFBFFDFAEFEGGGCGGHGG?FGHGHHGHHHGGGGGHGGGGGGCCECGGGEFEGBGFHHHHGHB4FGFFHG4FG3EG3GHHEHHHHFHEFDGFFCACFGHEB2GDGF2F222@F>2C@1FHHB0CGFG/A??<>F1>FG0GHHHHHH0DG@F<@A0BC0CFD.@ACFGE;CF0B?.BFFGF-;9-99--B@D/.--BD/99BF//////;/9BBFA.9...A.EA@-9;.::9ABFF######## 9 | @ERR3018303.90 Bgsng7131.m10_3641968 length=250 10 | 
TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGCCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCCAACTTGGGTGCAGAAGGGGGGGGTGGAATTTCATGTGTAGCGGGGAAATGCGTAGATATATGGGGGAACACCCGTGGCGAAAGCGGCTCTCTTGGCTGTAACTGACGCTGAGGCTCGAAAACGTGGGGAGCCAAAC 11 | +ERR3018303.90 Bgsng7131.m10_3641968 length=250 12 | AAAAAFF@@F1C1EEGGGG?AAFEGGCGHG2GGHHHEGGGHHHG//EGGGGGGGGGGGF1BFDGHF21E2FHBGGG11BB01>FGHHHFHG/F1B22BF@1///? SRAMetadataFormat: 30 | ff = SRAMetadataFormat() 31 | with ff.open() as fh: 32 | data.to_csv(fh, sep="\t", header=True) 33 | return ff 34 | 35 | 36 | @plugin.register_transformer 37 | def _2(ff: SRAMetadataFormat) -> pd.DataFrame: 38 | with ff.open() as fh: 39 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 40 | return df 41 | 42 | 43 | @plugin.register_transformer 44 | def _3(ff: SRAMetadataFormat) -> qiime2.Metadata: 45 | return _meta_fmt_to_metadata(ff) 46 | 47 | 48 | @plugin.register_transformer 49 | def _4(data: pd.DataFrame) -> SRAFailedIDsFormat: 50 | ff = SRAFailedIDsFormat() 51 | with ff.open() as fh: 52 | data.to_csv(fh, sep="\t", header=True, index=True) 53 | return ff 54 | 55 | 56 | @plugin.register_transformer 57 | def _5(ff: SRAFailedIDsFormat) -> pd.DataFrame: 58 | with ff.open() as fh: 59 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 60 | return df 61 | 62 | 63 | @plugin.register_transformer 64 | def _6(ff: SRAFailedIDsFormat) -> qiime2.Metadata: 65 | return _meta_fmt_to_metadata(ff) 66 | 67 | 68 | @plugin.register_transformer 69 | def _7(data: pd.DataFrame) -> NCBIAccessionIDsFormat: 70 | ff = NCBIAccessionIDsFormat() 71 | with ff.open() as fh: 72 | data.to_csv(fh, sep="\t", header=True, index=True) 73 | return ff 74 | 75 | 76 | @plugin.register_transformer 77 | def _77(data: pd.Series) -> NCBIAccessionIDsFormat: 78 | ff = NCBIAccessionIDsFormat() 79 | return _series_to_meta_fmt(data, ff) 80 | 81 | 82 | @plugin.register_transformer 83 | def _8(ff: NCBIAccessionIDsFormat) -> pd.DataFrame: 84 | with 
ff.open() as fh: 85 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 86 | return df 87 | 88 | 89 | @plugin.register_transformer 90 | def _9(ff: NCBIAccessionIDsFormat) -> qiime2.Metadata: 91 | return _meta_fmt_to_metadata(ff) 92 | 93 | 94 | @plugin.register_transformer 95 | def _10(ff: SRAMetadataFormat) -> NCBIAccessionIDsFormat: 96 | fout = NCBIAccessionIDsFormat() 97 | with ff.open() as fh, fout.open() as fo: 98 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 99 | df.index.to_frame().to_csv(fo, sep="\t", header=True, index=False) 100 | return fout 101 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/scraper_items_no_doi.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "key": "WZV4HG8X", 3 | "version": 1259, 4 | "library": { 5 | "type": "user", 6 | "id": 12345, 7 | "name": "username", 8 | "links": { 9 | "alternate": { 10 | "href": "https://www.zotero.org/username", 11 | "type": "text/html" 12 | } 13 | } 14 | }, 15 | "links": { 16 | "self": { 17 | "href": "https://api.zotero.org/users/12345/items/WZV4HG8X", 18 | "type": "application/json" 19 | }, 20 | "alternate": { 21 | "href": "https://www.zotero.org/username/items/WZV4HG8X", 22 | "type": "text/html" 23 | }, 24 | "up": { 25 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY", 26 | "type": "application/json" 27 | }, 28 | "enclosure": { 29 | "type": "text/html", 30 | "href": "https://api.zotero.org/users/12345/items/WZV4HG8X/file/view" 31 | } 32 | }, 33 | "meta": { 34 | "numChildren": false 35 | }, 36 | "data": { 37 | "key": "WZV4HG8X", 38 | "version": 1259, 39 | "parentItem": "CP4ED2CY", 40 | "itemType": "attachment", 41 | "linkMode": "imported_url", 42 | "title": "Snapshot", 43 | "accessDate": "2021-11-10T07:04:53Z", 44 | "url": "https://www.nature.com/articles/s41467-021-26215-w", 45 | "note": "", 46 | "contentType": "text/html", 47 | "charset": "utf-8", 48 
| "filename": "s41467-021-26215-w.html", 49 | "md5": "9ba88a9f08c42a02d11a00b3498198f4", 50 | "mtime": 1636527893000, 51 | "tags": [], 52 | "relations": {}, 53 | "dateAdded": "2021-11-10T07:04:53Z", 54 | "dateModified": "2021-11-10T07:04:53Z" 55 | } 56 | }, 57 | { 58 | "key": "DMJ4AQ48", 59 | "version": 1261, 60 | "library": { 61 | "type": "user", 62 | "id": 12345, 63 | "name": "username", 64 | "links": { 65 | "alternate": { 66 | "href": "https://www.zotero.org/username", 67 | "type": "text/html" 68 | } 69 | } 70 | }, 71 | "links": { 72 | "self": { 73 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48", 74 | "type": "application/json" 75 | }, 76 | "alternate": { 77 | "href": "https://www.zotero.org/username/items/DMJ4AQ48", 78 | "type": "text/html" 79 | }, 80 | "up": { 81 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY", 82 | "type": "application/json" 83 | }, 84 | "enclosure": { 85 | "type": "application/pdf", 86 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48/file/view", 87 | "title": "Pruski et al. - 2021 - Direct on-swab metabolic profiling of vaginal micr.pdf", 88 | "length": 3648434 89 | } 90 | }, 91 | "meta": { 92 | "numChildren": false 93 | }, 94 | "data": { 95 | "key": "DMJ4AQ48", 96 | "version": 1261, 97 | "parentItem": "CP4ED2CY", 98 | "itemType": "attachment", 99 | "linkMode": "imported_url", 100 | "title": "Full Text PDF", 101 | "accessDate": "2021-11-10T07:04:46Z", 102 | "url": "https://www.nature.com/articles/s41467-021-26215-w.pdf", 103 | "note": "", 104 | "contentType": "application/pdf", 105 | "charset": "", 106 | "filename": "Pruski et al. 
- 2021 - Direct on-swab metabolic profiling of vaginal micr.pdf", 107 | "md5": "28edb400729d11e14b2b1829ceb16b3a", 108 | "mtime": 1636528753000, 109 | "tags": [], 110 | "relations": {}, 111 | "dateAdded": "2021-11-10T07:04:46Z", 112 | "dateModified": "2021-11-10T07:04:46Z" 113 | } 114 | }] -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_b2_response_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 13481774 8 | <Summary><Title>18</Title><Platform 9 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 10 | total_runs="1" total_spots="63703" total_bases="38349206" 11 | total_size="22735317" load_done="true" cluster_name="public"/></Summary><Submitter 12 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 13 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 14 | of Animal Nutritio"/><Experiment acc="SRX10339760" ver="1" 15 | status="public" name="18"/><Study acc="SRP310597" name="PRJNA 16 | Chuanzhong black lamb Raw sequence reads"/><Organism 17 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 18 | acc="SRS8459117" name=""/><Instrument ILLUMINA="Illumina 19 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>18</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 20 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309312</Biosample> 21 | 22 | <Run acc="SRR13961771" 23 | total_spots="63703" total_bases="38349206" load_done="true" 24 | is_public="true" cluster_name="public" 25 | static_data_available="true"/> 26 | 27 | 28 | 2021/03/17 29 | 2021/03/15 30 | 31 | 32 | 33 | 13481786 34 | <Summary><Title>12</Title><Platform 35 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 36 | total_runs="1" 
total_spots="59130" total_bases="35596260" 37 | total_size="21079845" load_done="true" cluster_name="public"/></Summary><Submitter 38 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 39 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 40 | of Animal Nutritio"/><Experiment acc="SRX10339772" ver="1" 41 | status="public" name="12"/><Study acc="SRP310597" name="PRJNA 42 | Chuanzhong black lamb Raw sequence reads"/><Organism 43 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 44 | acc="SRS8459130" name=""/><Instrument ILLUMINA="Illumina 45 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>12</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 46 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309306</Biosample> 47 | 48 | <Run acc="SRR13961759" 49 | total_spots="59130" total_bases="35596260" load_done="true" 50 | is_public="true" cluster_name="public" 51 | static_data_available="true"/> 52 | 53 | 54 | 2021/03/17 55 | 2021/03/15 56 | 57 | 58 | -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | 9 | import logging 10 | import sys 11 | 12 | import pandas as pd 13 | 14 | PREFIX = { 15 | "run": ("SRR", "ERR", "DRR"), 16 | "experiment": ("SRX", "ERX", "DRX"), 17 | "sample": ("SRS", "ERS", "DRS"), 18 | "study": ("SRP", "ERP", "DRP"), 19 | "bioproject": ("PRJ",), 20 | } 21 | 22 | 23 | class InvalidIDs(Exception): 24 | pass 25 | 26 | 27 | def get_attrs(obj, excluded=()): 28 | return [ 29 | k for k, v in vars(obj).items() if k not in excluded and not k.startswith("__") 30 | ] 31 | 32 | 33 | def rename_columns(df: pd.DataFrame): 34 | # clean up ID columns 35 | col_map = {} 36 | id_cols = [col for col in df.columns if col.endswith("_id")] 37 | for col in id_cols: 38 | col_split = col.split("_") 39 | col_map[col] = f"{col_split[0].capitalize()} {col_split[1].upper()}" 40 | 41 | # clean up other multi-word columns 42 | wordy_cols = [col for col in df.columns if "_" in col and col not in id_cols] 43 | for col in wordy_cols: 44 | col_map[col] = " ".join([x.capitalize() for x in col.split("_")]) 45 | 46 | # capitalize the rest 47 | remainder_cols = [ 48 | col for col in df.columns if col not in id_cols and col not in wordy_cols 49 | ] 50 | for col in remainder_cols: 51 | col_map[col] = col.capitalize() 52 | 53 | df.rename(columns=col_map, inplace=True) 54 | 55 | # rename Sample ID to Sample Accession (incompatible with qiime naming) 56 | df.rename(columns={"Sample ID": "Sample Accession"}, inplace=True) 57 | 58 | return df 59 | 60 | 61 | def set_up_entrezpy_logging(entrezpy_obj, log_level, log_id=False): 62 | """Sets up logging for the given Entrezpy object. 63 | 64 | Args: 65 | entrezpy_obj (object): An Entrezpy object that has a logger attribute. 66 | log_level (str): The log level to set. 67 | log_id (bool): If True, accession ID will be added to the log. 
68 | """ 69 | handler = set_up_logging_handler(log_id=log_id) 70 | 71 | entrezpy_obj.logger.addHandler(handler) 72 | entrezpy_obj.logger.setLevel(log_level) 73 | 74 | if hasattr(entrezpy_obj, "request_pool"): 75 | entrezpy_obj.request_pool.logger.addHandler(handler) 76 | entrezpy_obj.request_pool.logger.setLevel(log_level) 77 | 78 | 79 | def set_up_logger( 80 | log_level, cls_obj=None, logger_name=None, log_id=False 81 | ) -> logging.Logger: 82 | """Sets up the module/class logger. 83 | 84 | Args: 85 | log_level (str): The log level to set. 86 | cls_obj: Class instance for which the logger should be created. 87 | logger_name (str): The name of the logger. 88 | log_id (bool): If True, accession ID will be added to the log. 89 | 90 | Returns: 91 | logging.Logger: The module logger. 92 | """ 93 | if cls_obj: 94 | logger = logging.getLogger(f"{cls_obj.__module__}") 95 | else: 96 | logger = logging.getLogger(logger_name) 97 | logger.setLevel(log_level) 98 | handler = set_up_logging_handler(log_id=log_id) 99 | logger.addHandler(handler) 100 | return logger 101 | 102 | 103 | def set_up_logging_handler(log_id: bool = False) -> logging.StreamHandler: 104 | """Sets up logging handler.""" 105 | handler = logging.StreamHandler(sys.stdout) 106 | if log_id: 107 | formatter = logging.Formatter( 108 | "%(asctime)s [%(threadName)s] [%(levelname)s] " 109 | "[%(name)s] [%(accession_id)s]: %(message)s" 110 | ) 111 | else: 112 | formatter = logging.Formatter( 113 | "%(asctime)s [%(threadName)s] [%(levelname)s] " "[%(name)s]: %(message)s" 114 | ) 115 | handler.setFormatter(formatter) 116 | return handler 117 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_b1_response_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 4 8 | <Summary><Title>454 9 | sequencing of Human HapMap individual NA18505 genomic paired-end 10 | library</Title><Platform instrument_model="454 
GS FLX">LS454</Platform><Statistics 11 | total_runs="10" total_spots="4703662" total_bases="1306798474" 12 | total_size="3205056622" load_done="true" 13 | static_data_available="true" cluster_name="public"/></Summary><Submitter 14 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan" 15 | lab_name=""/><Experiment acc="SRX000003" ver="10" 16 | status="public" name="454 sequencing of Human HapMap individual 17 | NA18505 genomic paired-end library"/><Study acc="SRP000001" 18 | name="Paired-end mapping reveals extensive structural variation in 19 | the human genome"/><Organism taxid="9606" 20 | ScientificName="Homo sapiens"/><Sample acc="SRS000100" 21 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME 22 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY 23 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION 24 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT 25 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample> 26 | 27 | <Run acc="SRR000007" 28 | total_spots="633196" total_bases="175275395" load_done="true" 29 | is_public="true" cluster_name="public" 30 | static_data_available="true"/><Run acc="SRR000018" 31 | total_spots="626624" total_bases="174403220" load_done="true" 32 | is_public="true" cluster_name="public" 33 | static_data_available="true"/><Run acc="SRR000020" 34 | total_spots="374556" total_bases="103411232" load_done="true" 35 | is_public="true" cluster_name="public" 36 | static_data_available="true"/><Run acc="SRR000038" 37 | total_spots="529820" total_bases="148389031" load_done="true" 38 | is_public="true" cluster_name="public" 39 | static_data_available="true"/><Run acc="SRR000043" 40 | total_spots="608946" total_bases="168985392" load_done="true" 41 | is_public="true" cluster_name="public" 42 | static_data_available="true"/><Run acc="SRR000046" 43 | 
total_spots="79047" total_bases="21258857" load_done="true" 44 | is_public="true" cluster_name="public" 45 | static_data_available="true"/><Run acc="SRR000048" 46 | total_spots="640737" total_bases="177619279" load_done="true" 47 | is_public="true" cluster_name="public" 48 | static_data_available="true"/><Run acc="SRR000050" 49 | total_spots="547349" total_bases="153260655" load_done="true" 50 | is_public="true" cluster_name="public" 51 | static_data_available="true"/><Run acc="SRR000057" 52 | total_spots="76744" total_bases="21203932" load_done="true" 53 | is_public="true" cluster_name="public" 54 | static_data_available="true"/><Run acc="SRR000058" 55 | total_spots="586643" total_bases="162991481" load_done="true" 56 | is_public="true" cluster_name="public" 57 | static_data_available="true"/> 58 | 59 | 60 | 2008/04/04 61 | 2015/04/09 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_response_runs_single_item.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 4 8 | <Summary><Title>454 9 | sequencing of Human HapMap individual NA18505 genomic paired-end 10 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics 11 | total_runs="10" total_spots="4703662" total_bases="1306798474" 12 | total_size="3205056622" load_done="true" 13 | static_data_available="true" cluster_name="public"/></Summary><Submitter 14 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan" 15 | lab_name=""/><Experiment acc="SRX000003" ver="10" 16 | status="public" name="454 sequencing of Human HapMap individual 17 | NA18505 genomic paired-end library"/><Study acc="SRP000001" 18 | name="Paired-end mapping reveals extensive structural variation in 19 | the human genome"/><Organism taxid="9606" 20 | ScientificName="Homo sapiens"/><Sample acc="SRS000100" 21 | name=""/><Instrument LS454="454 GS 
FLX"/><Library_descriptor><LIBRARY_NAME 22 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY 23 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION 24 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT 25 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample> 26 | 27 | <Run acc="SRR000007" 28 | total_spots="633196" total_bases="175275395" load_done="true" 29 | is_public="true" cluster_name="public" 30 | static_data_available="true"/><Run acc="SRR000018" 31 | total_spots="626624" total_bases="174403220" load_done="true" 32 | is_public="true" cluster_name="public" 33 | static_data_available="true"/><Run acc="SRR000020" 34 | total_spots="374556" total_bases="103411232" load_done="true" 35 | is_public="true" cluster_name="public" 36 | static_data_available="true"/><Run acc="SRR000038" 37 | total_spots="529820" total_bases="148389031" load_done="true" 38 | is_public="true" cluster_name="public" 39 | static_data_available="true"/><Run acc="SRR000043" 40 | total_spots="608946" total_bases="168985392" load_done="true" 41 | is_public="true" cluster_name="public" 42 | static_data_available="true"/><Run acc="SRR000046" 43 | total_spots="79047" total_bases="21258857" load_done="true" 44 | is_public="true" cluster_name="public" 45 | static_data_available="true"/><Run acc="SRR000048" 46 | total_spots="640737" total_bases="177619279" load_done="true" 47 | is_public="true" cluster_name="public" 48 | static_data_available="true"/><Run acc="SRR000050" 49 | total_spots="547349" total_bases="153260655" load_done="true" 50 | is_public="true" cluster_name="public" 51 | static_data_available="true"/><Run acc="SRR000057" 52 | total_spots="76744" total_bases="21203932" load_done="true" 53 | is_public="true" cluster_name="public" 54 | static_data_available="true"/><Run acc="SRR000058" 55 | total_spots="586643" total_bases="162991481" 
load_done="true" 56 | is_public="true" cluster_name="public" 57 | static_data_available="true"/> 58 | 59 | 60 | 2008/04/04 61 | 2015/04/09 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /q2_fondue/tests/test_get_all.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import unittest 9 | from unittest.mock import ANY, Mock 10 | 11 | import pandas as pd 12 | from qiime2 import Artifact 13 | 14 | from q2_fondue.get_all import get_all 15 | from q2_fondue.tests.test_sequences import SequenceTests 16 | 17 | 18 | class FakeCtx(Mock): 19 | def __init__(self, ids_path, meta_path, failed_ids=None): 20 | super().__init__() 21 | self.ids = Artifact.import_data("NCBIAccessionIDs", ids_path) 22 | self.meta = Artifact.import_data("SRAMetadata", meta_path) 23 | self.failed_empty = Artifact.import_data("SRAFailedIDs", pd.DataFrame()) 24 | if failed_ids: 25 | self.failed = Artifact.import_data( 26 | "SRAFailedIDs", 27 | pd.DataFrame( 28 | data={"Error message": ["Some error message" for _ in failed_ids]}, 29 | index=pd.Index(failed_ids, name="ID"), 30 | ), 31 | ) 32 | else: 33 | self.failed = self.failed_empty 34 | 35 | self.get_metadata = Mock(return_value=(self.meta, self.failed_empty)) 36 | self.get_sequences = Mock(return_value=(Mock(), Mock(), self.failed)) 37 | 38 | def get_action(self, plugin, action): 39 | if action == "get_metadata": 40 | return self.get_metadata 41 | elif action == "get_sequences": 42 | return self.get_sequences 43 | 44 | 45 | class TestGetAll(SequenceTests): 46 | package = "q2_fondue.tests" 47 | 48 | def 
test_get_all_single(self): 49 | """ 50 | Test verifying that pipeline get_all calls all expected actions, 51 | individual actions are tested in details in respective test classes 52 | """ 53 | mock_ctx = FakeCtx( 54 | ids_path=self.get_data_path("SRR123456_md.tsv"), 55 | meta_path=self.get_data_path("sra-metadata-mock.tsv"), 56 | ) 57 | obs_meta, _, _, obs_failed = get_all( 58 | mock_ctx, mock_ctx.ids, "fake@email.com", retries=1 59 | ) 60 | 61 | mock_ctx.get_metadata.assert_called_once_with( 62 | mock_ctx.ids, "fake@email.com", 1, "INFO", None 63 | ) 64 | mock_ctx.get_sequences.assert_called_once_with( 65 | ANY, "fake@email.com", 1, 1, "INFO" 66 | ) 67 | 68 | run_ids = mock_ctx.get_sequences.call_args_list[0][0][0] 69 | run_ids = run_ids.view(pd.DataFrame).index.to_list() 70 | self.assertListEqual(run_ids, ["SRR123456"]) 71 | 72 | self.assertEqual(obs_meta, mock_ctx.meta) 73 | self.assertEqual(obs_failed, mock_ctx.failed) 74 | 75 | def test_get_all_multi_with_missing_ids(self): 76 | """ 77 | Test verifying that pipeline get_all calls all expected actions, 78 | individual actions are tested in details in respective test classes 79 | """ 80 | mock_ctx = FakeCtx( 81 | ids_path=self.get_data_path("SRR1234567_md.tsv"), 82 | meta_path=self.get_data_path("sra-metadata-mock.tsv"), 83 | failed_ids=["SRR123457"], 84 | ) 85 | obs_meta, _, _, obs_failed = get_all( 86 | mock_ctx, mock_ctx.ids, "fake@email.com", retries=1 87 | ) 88 | 89 | mock_ctx.get_metadata.assert_called_once_with( 90 | mock_ctx.ids, "fake@email.com", 1, "INFO", None 91 | ) 92 | mock_ctx.get_sequences.assert_called_once_with( 93 | ANY, "fake@email.com", 1, 1, "INFO" 94 | ) 95 | 96 | run_ids = mock_ctx.get_sequences.call_args_list[0][0][0] 97 | run_ids = run_ids.view(pd.DataFrame).index.to_list() 98 | self.assertListEqual(run_ids, ["SRR123456"]) 99 | 100 | self.assertEqual(obs_meta, mock_ctx.meta) 101 | self.assertListEqual( 102 | obs_failed.view(pd.DataFrame).index.to_list(), ["SRR123457"] 103 | ) 104 | 105 
| 106 | if __name__ == "__main__": 107 | unittest.main() 108 | -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_esearch.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | 9 | from typing import List, Union 10 | 11 | import pandas as pd 12 | from entrezpy.esearch.esearch_analyzer import EsearchAnalyzer 13 | from entrezpy.esearch.esearch_result import EsearchResult 14 | 15 | 16 | class ESearchResult(EsearchResult): 17 | """Entrezpy client for ESearch utility used to search for or validate 18 | provided accession IDs. 19 | """ 20 | 21 | def __init__(self, response, request): 22 | super().__init__(response, request) 23 | self.result = None 24 | 25 | def validate_result(self) -> dict: 26 | """Validates hit counts obtained for all the provided UIDs. 27 | 28 | As the expected hit count for a valid SRA accession ID is 1, all the 29 | IDs with that value will be considered valid. UIDs with count higher 30 | than 1 will be considered 'ambiguous' as they could not be resolved 31 | to a single result. Likewise, UIDs with a count of 0 will be considered 32 | 'invalid' as no result could be found for those. 33 | 34 | Raises: 35 | InvalidIDs: An exception is raised when either ambiguous or invalid 36 | IDs were encountered. 
37 | 38 | """ 39 | # correct id should have count == 1 40 | leftover_ids = self.result[self.result != 1] 41 | if leftover_ids.shape[0] == 0: 42 | return {} 43 | ambiguous_ids = leftover_ids[leftover_ids > 0] 44 | invalid_ids = leftover_ids[leftover_ids == 0] 45 | 46 | error_msg = "Some of the IDs are invalid or ambiguous:" 47 | if ambiguous_ids.shape[0] > 0: 48 | error_msg += f'\n Ambiguous IDs: {", ".join(ambiguous_ids.index)}' 49 | if invalid_ids.shape[0] > 0: 50 | error_msg += f'\n Invalid IDs: {", ".join(invalid_ids.index)}' 51 | self.logger.warning(error_msg) 52 | return { 53 | **{_id: "ID is ambiguous." for _id in ambiguous_ids.index}, 54 | **{_id: "ID is invalid." for _id in invalid_ids.index}, 55 | } 56 | 57 | def parse_search_results(self, response, uids: Union[List[str], None]): 58 | """Parses response received from Esearch as a pandas Series object. 59 | 60 | Hit counts obtained in the response will be extracted and assigned to 61 | their respective query IDs. IDs not found in the results but present 62 | in the UIDs list will get a count of 0. 63 | 64 | Args: 65 | response (): Response received from Esearch. 66 | uids (List[str]): List of original UIDs that were submitted 67 | as a query. 
68 | 69 | """ 70 | translation_stack = response["esearchresult"].get("translationstack") 71 | if not translation_stack: 72 | self.result = pd.Series({x: 0 for x in uids}, name="count") 73 | return 74 | 75 | # filter out only positive hits 76 | found_terms = [x for x in translation_stack if isinstance(x, dict)] 77 | found_terms = { 78 | x["term"].replace("[All Fields]", ""): int(x["count"]) for x in found_terms 79 | } 80 | 81 | # find ids that are missing 82 | if uids: 83 | missing_ids = [x for x in uids if x not in found_terms.keys()] 84 | missing_ids = {x: 0 for x in missing_ids} 85 | found_terms.update(missing_ids) 86 | 87 | self.result = pd.Series(found_terms, name="count") 88 | 89 | 90 | class ESearchAnalyzer(EsearchAnalyzer): 91 | def __init__(self, uids): 92 | super().__init__() 93 | self.uids = uids 94 | 95 | # override the base method to use our own ESResult 96 | def init_result(self, response, request): 97 | if not self.result: 98 | self.result = ESearchResult(response, request) 99 | return True 100 | return False 101 | 102 | # override the base method to additionally parse the result 103 | def analyze_result(self, response, request): 104 | super().analyze_result(response, request) 105 | self.result.parse_search_results(response, self.uids) 106 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/metadata_processed_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "FAKEID1": { 3 | "Experiment ID": "ERX3980916", 4 | "Biosample ID": "SAMEA6608408", 5 | "Bioproject ID": "PRJEB37054", 6 | "Study ID": "ERP120343", 7 | "Sample Accession": "ERS4372624", 8 | "Organism": "Vitis vinifera", 9 | "Library Source": "METAGENOMIC", 10 | "Library Selection": "PCR", 11 | "Library Layout": "SINGLE", 12 | "Instrument": "Illumina MiSeq", 13 | "Platform": "ILLUMINA", 14 | "Bases": "11552099", 15 | "Spots": "39323", 16 | "Avg Spot Len": "293", 17 | "Bytes": "3914295", 18 | 
"Public": "True", 19 | "Ena-first-public [run]": "2020-05-31", 20 | "Ena-first-public [sample]": "2020-05-31", 21 | "Ena-first-public [study]": "2020-05-31", 22 | "Ena-last-update [run]": "2020-03-06", 23 | "Ena-last-update [sample]": "2020-03-06", 24 | "Ena-last-update [study]": "2020-03-04", 25 | "Amount or size of sample collected [sample]": "50", 26 | "Collection date [sample]": "2015-09-28", 27 | "Collection day [sample]": "1", 28 | "Collection hours [sample]": "0", 29 | "Environment (biome) [sample]": "berry plant", 30 | "Environment (feature) [sample]": "grape plant", 31 | "Environment (material) [sample]": "wine must", 32 | "Geographic location (country and/or sea) [sample]": "Germany", 33 | "Geographic location (latitude) [sample]": "48.71 N", 34 | "Geographic location (longitude) [sample]": "9.12 E", 35 | "Investigation type [sample]": "metagenome", 36 | "Multiplex identifiers [sample]": "TAGATCGCTCGCCTTA", 37 | "Pcr primers [sample]": "GTGCCAGCMGCCGCGGTAAGGACTACHVGGGTWTCTAAT", 38 | "Plant-associated environmental package [sample]": "plant-associated", 39 | "Project name [sample]": "wine must microbiota analysis during fermentation", 40 | "Sample storage temperature [sample]": "-80", 41 | "Sample volume or weight for dna extraction [sample]": "0.5", 42 | "Sequencing method [sample]": "Illumina MiSeq", 43 | "Subspecific genetic lineage [sample]": "Bacchus1", 44 | "Target subfragment [sample]": "16S rRNA gene", 45 | "Library Name": "unspecified", 46 | "Name": "BAC1.D1.0.32A", 47 | "Center Name": "University of Hohenheim", 48 | "Title": "Vitis vinifera", 49 | "Tax ID": "29760" 50 | }, 51 | "FAKEID2": { 52 | "Experiment ID": "ERX3980917", 53 | "Biosample ID": "SAMEA6608409", 54 | "Bioproject ID": "PRJEB37054", 55 | "Study ID": "ERP120343", 56 | "Sample Accession": "ERS4372625", 57 | "Organism": "Vitis vinifera", 58 | "Library Source": "METAGENOMIC", 59 | "Library Selection": "PCR", 60 | "Library Layout": "SINGLE", 61 | "Instrument": "Illumina MiSeq", 62 | 
"Platform": "ILLUMINA", 63 | "Bases": "17523267", 64 | "Spots": "59799", 65 | "Avg Spot Len": "293", 66 | "Bytes": "5879896", 67 | "Public": "True", 68 | "Ena-first-public [run]": "2020-05-31", 69 | "Ena-first-public [sample]": "2020-05-31", 70 | "Ena-first-public [study]": "2020-05-31", 71 | "Ena-last-update [run]": "2020-03-06", 72 | "Ena-last-update [sample]": "2020-03-06", 73 | "Ena-last-update [study]": "2020-03-04", 74 | "Amount or size of sample collected [sample]": "50", 75 | "Collection date [sample]": "2015-09-28", 76 | "Collection day [sample]": "1", 77 | "Collection hours [sample]": "2", 78 | "Environment (biome) [sample]": "berry plant", 79 | "Environment (feature) [sample]": "grape plant", 80 | "Environment (material) [sample]": "wine must", 81 | "Geographic location (country and/or sea) [sample]": "Germany", 82 | "Geographic location (latitude) [sample]": "48.71 N", 83 | "Geographic location (longitude) [sample]": "9.12 E", 84 | "Investigation type [sample]": "metagenome", 85 | "Multiplex identifiers [sample]": "CTCTCTATTCGCCTTA", 86 | "Pcr primers [sample]": "GTGCCAGCMGCCGCGGTAAGGACTACHVGGGTWTCTAAT", 87 | "Plant-associated environmental package [sample]": "plant-associated", 88 | "Project name [sample]": "wine must microbiota analysis during fermentation", 89 | "Sample storage temperature [sample]": "-80", 90 | "Sample volume or weight for dna extraction [sample]": "0.5", 91 | "Sequencing method [sample]": "Illumina MiSeq", 92 | "Subspecific genetic lineage [sample]": "Bacchus1", 93 | "Target subfragment [sample]": "16S rRNA gene", 94 | "Library Name": "unspecified", 95 | "Name": "BAC1.D1.1.33A", 96 | "Center Name": "University of Hohenheim", 97 | "Title": "Vitis vinifera", 98 | "Tax ID": "29760" 99 | } 100 | } -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_pipelines.py: -------------------------------------------------------------------------------- 1 | # 
---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | from typing import Union 9 | 10 | from entrezpy import conduit as ec 11 | 12 | from entrezpy.elink.elink_analyzer import ElinkAnalyzer 13 | 14 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer 15 | from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer 16 | from q2_fondue.entrezpy_clients._utils import set_up_entrezpy_logging 17 | 18 | import entrezpy.esearch.esearcher as searcher 19 | 20 | from q2_fondue.utils import _chunker 21 | 22 | BATCH_SIZE = 500 23 | 24 | 25 | def _get_run_ids( 26 | email: str, 27 | n_jobs: int, 28 | ids: Union[list, None], 29 | query: Union[str, None], 30 | source: str, 31 | log_level: str, 32 | ) -> list: 33 | """Pipeline to retrieve run IDs associated with BioSample query 34 | (provided in `query`) or other aggregate IDs like studies 35 | (`source`='study'), bioprojects (`source`='bioproject'), samples 36 | (`source`='sample') or experiments (`source`='experiment') 37 | provided in `ids`. 38 | 39 | Args: 40 | email (str): User email. 41 | n_jobs (int): Number of jobs. 42 | ids (list): List of study, bioproject, sample or experiment IDs. 43 | query (str): Search query to find IDs by. 44 | source (str): Type of IDs provided ('study', 'bioproject', 45 | 'sample' or 'experiment'). 46 | log_level (str): The log level to set. 47 | 48 | Returns: 49 | list: Run IDs associated with provided ids. 
50 | """ 51 | term = " OR ".join(ids) if ids else query 52 | 53 | # create pipeline to fetch all run IDs 54 | elink = True 55 | if source == "bioproject": 56 | db = "bioproject" 57 | elif source == "biosample": 58 | db = "biosample" 59 | else: 60 | db = "sra" 61 | elink = False 62 | 63 | # find UIDS based on a query; 64 | # instead of saving the result on the history server 65 | # we will store all the UIDs recovered based on the 66 | # search query and use those in the mini-pipeline below; 67 | # this way we are not limited by ELink only accepting up to 68 | # who knows how many IDs and erroring out if we provide too 69 | # many (which could be the case e.g.: when we ask for more 70 | # than 10000 BioProject IDs or the text query returns more 71 | # than 10000 IDs presumably) 72 | esearcher = searcher.Esearcher( 73 | "esearcher", email, apikey=None, apikey_var=None, threads=n_jobs, qid=None 74 | ) 75 | esearch_response = esearcher.inquire( 76 | {"db": db, "term": term, "usehistory": False, "rettype": "json"}, 77 | analyzer=ESearchAnalyzer(ids), 78 | ) 79 | 80 | # use the UIDs to link to other DBs and fetch related records; 81 | # we won't be using multi-threading here as this shouldn't take 82 | # long (we're only fetching IDs) and we don't want those dead 83 | # threads afterwards 84 | econduit = ec.Conduit(email=email, threads=0) 85 | set_up_entrezpy_logging(econduit, log_level) 86 | run_ids_pipeline = econduit.new_pipeline() 87 | 88 | # create a pipeline to link and fetch the run IDs; 89 | # we process the IDs obtained from the previous step in batches 90 | # as ELink cannot handle more than a certain amount of IDs 91 | # at the same time (recommended by NCBI) 92 | for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE): 93 | if elink: 94 | el = run_ids_pipeline.add_link( 95 | {"db": "sra", "dbfrom": db, "id": _ids, "link": False}, 96 | analyzer=ElinkAnalyzer(), 97 | ) 98 | else: 99 | el = None 100 | 101 | # given SRA run IDs, fetch all metadata 102 | 
efetch_params = { 103 | "rettype": "docsum", 104 | "retmode": "xml", 105 | "reqsize": BATCH_SIZE, 106 | "retmax": len(_ids), 107 | } 108 | if not elink: 109 | # we need to specify these manually as in this scenario 110 | # EFetch is not linked to anything 111 | efetch_params.update({"id": _ids, "db": db}) 112 | 113 | run_ids_pipeline.add_fetch( 114 | efetch_params, analyzer=EFetchAnalyzer(log_level), dependency=el 115 | ) 116 | 117 | econduit.run(run_ids_pipeline) 118 | 119 | # recover run IDs from all instances of EFetchAnalyzer 120 | all_run_ids = [] 121 | for x in econduit.analyzers.values(): 122 | if isinstance(x, EFetchAnalyzer): 123 | all_run_ids.extend(x.result.metadata) 124 | 125 | return sorted(all_run_ids) 126 | -------------------------------------------------------------------------------- /q2_fondue/types/_format.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
class SRAMetadataFormat(model.TextFileFormat):
    """Tab-separated SRA run metadata table.

    Validation requires a fixed set of header fields to be present and
    requires that every row carries a value in each of the ID columns.
    """

    REQUIRED_IDS = [
        "ID",
        "Biosample ID",
        "Bioproject ID",
        "Experiment ID",
        "Study ID",
        "Sample Accession",
    ]
    REQUIRED_HEADER_FIELDS = [
        "Organism",
        "Instrument",
        "Platform",
        "Bases",
        "Bytes",
        "Public",
        "Library Selection",
        "Library Source",
        "Library Layout",
    ]
    REQUIRED_HEADER_FIELDS.extend(REQUIRED_IDS)

    def _validate(self):
        meta = pd.read_csv(str(self), sep="\t")

        # every required header field must appear as a column
        missing_cols = [
            field for field in self.REQUIRED_HEADER_FIELDS
            if field not in meta.columns
        ]
        if missing_cols:
            raise ValidationError(
                "Some required columns are missing from the metadata file: "
                f'{", ".join(missing_cols)}.'
            )

        # some IDs must be present in all samples
        na_counts = meta.isnull().sum(axis=0)[self.REQUIRED_IDS]
        missing_ids = na_counts[na_counts > 0].index.tolist()
        if missing_ids:
            raise ValidationError(
                "Some samples are missing IDs in the following fields: "
                f'{", ".join(missing_ids)}.'
            )

    def _validate_(self, level):
        # QIIME 2 entry point; the same checks run at every level
        self._validate()
74 | """ 75 | 76 | def _validate_(self, level): 77 | df = pd.read_csv(str(self), sep="\t", index_col=0) 78 | 79 | if df.shape[1] > 1: 80 | raise ValidationError( 81 | "Failed IDs artifact should only contain a single column " 82 | "with error message for the runs that could not be fetched " 83 | "(indexed by run ID)." 84 | ) 85 | 86 | 87 | SRAFailedIDsDirFmt = model.SingleFileDirectoryFormat( 88 | "SRAFailedIDsDirFmt", "sra-failed-ids.tsv", SRAFailedIDsFormat 89 | ) 90 | 91 | 92 | class NCBIAccessionIDsFormat(model.TextFileFormat): 93 | """ 94 | This is a format used to store a list of SRA accession IDs (run, 95 | study, BioProject, sample and experiment IDs), which can be converted 96 | to QIIME's metadata. Artifacts containing of run, study and BioProject 97 | IDs can be input into any fondue action. 98 | """ 99 | 100 | ALLOWED_PREFIXES = tuple( 101 | itertools.chain( 102 | *[ 103 | v 104 | for k, v in PREFIX.items() 105 | if k in ("bioproject", "run", "study", "sample", "experiment") 106 | ] 107 | ) 108 | ) 109 | 110 | def _validate_id(self, _id: str): 111 | if not _id.startswith(self.ALLOWED_PREFIXES): 112 | raise ValidationError( 113 | "Some of the provided IDs are invalid - only SRA run, study, " 114 | "BioProject, sample and experiment IDs are allowed. Please " 115 | "check your input and try again." 116 | ) 117 | 118 | def _validate_(self, level): 119 | df = pd.read_csv(str(self), sep="\t") 120 | cols = df.columns.tolist() 121 | 122 | if df.shape[1] > 2 or ( 123 | df.shape[1] == 2 and not any(x in cols for x in ["doi", "DOI"]) 124 | ): 125 | raise ValidationError( 126 | "NCBI Accession IDs artifact should only contain a single " 127 | "column with IDs of the SRA runs, studies or NCBI's " 128 | "BioProjects and an optional column `doi` with " 129 | "associated DOIs." 
class FakeESAnalyzer:
    """Minimal stand-in for an ESearch analyzer used by the tests.

    Exposes the attributes the code under test reads from a real
    analyzer (`uids`, `log_level` and a nested `result.result` series).
    """

    def __init__(self, uids):
        # keep the queried UIDs and mimic a finished analyzer
        self.uids = uids
        self.log_level = "INFO"
        fake_result = MagicMock()
        fake_result.result = pd.Series(data=[6, 6], index=["ABC", "123"])
        self.result = fake_result
pd.testing.assert_series_equal(exp, obs) 37 | 38 | def test_esresult_parse_search_results_ambiguous(self): 39 | esearch_result = self.generate_es_result("single", "_ambiguous") 40 | esearch_result.parse_search_results( 41 | self.json_to_response("single", "_ambiguous"), ["SR012"] 42 | ) 43 | 44 | obs = esearch_result.result 45 | exp = pd.Series(data=[7], index=["SR012"], name="count") 46 | pd.testing.assert_series_equal(exp, obs) 47 | 48 | def test_esresult_parse_search_results_multi(self): 49 | esearch_result = self.generate_es_result("multi", "_correct") 50 | esearch_result.parse_search_results( 51 | self.json_to_response("multi", "_correct"), 52 | ["SRR000001", "SRR000013", "ERR3978173"], 53 | ) 54 | 55 | obs = esearch_result.result 56 | exp = pd.Series( 57 | data=[1, 1, 1], index=["SRR000001", "SRR000013", "ERR3978173"], name="count" 58 | ) 59 | pd.testing.assert_series_equal(exp, obs) 60 | 61 | def test_esresult_parse_search_results_multi_invalid(self): 62 | esearch_result = self.generate_es_result("multi", "_invalid") 63 | esearch_result.parse_search_results( 64 | self.json_to_response("multi", "_invalid"), ["ABCD123", "SRR001"] 65 | ) 66 | 67 | obs = esearch_result.result 68 | exp = pd.Series(data=[0, 0], index=["ABCD123", "SRR001"], name="count") 69 | pd.testing.assert_series_equal(exp, obs) 70 | 71 | def test_esresult_parse_search_results_multi_mixed(self): 72 | esearch_result = self.generate_es_result("multi", "_mixed") 73 | esearch_result.parse_search_results( 74 | self.json_to_response("multi", "_mixed"), 75 | ["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"], 76 | ) 77 | 78 | obs = esearch_result.result 79 | exp = pd.Series( 80 | data=[1, 1, 7, 0, 0], 81 | index=["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"], 82 | name="count", 83 | ) 84 | pd.testing.assert_series_equal(exp, obs) 85 | 86 | def test_esresult_validate_result_single(self): 87 | esearch_result = self.generate_es_result("single", "_correct") 88 | esearch_result.result = 
pd.Series(data=[1], index=["SRR000001"], name="count") 89 | 90 | obs = esearch_result.validate_result() 91 | self.assertDictEqual(obs, {}) 92 | 93 | def test_esresult_validate_result_single_ambiguous(self): 94 | esearch_result = self.generate_es_result("single", "_ambiguous") 95 | esearch_result.result = pd.Series(data=[7], index=["SR012"], name="count") 96 | 97 | obs = esearch_result.validate_result() 98 | exp = {"SR012": "ID is ambiguous."} 99 | self.assertDictEqual(obs, exp) 100 | 101 | def test_esresult_validate_result_multi(self): 102 | esearch_result = self.generate_es_result("multi", "_correct") 103 | esearch_result.result = pd.Series( 104 | data=[1, 1, 1], index=["SRR000001", "SRR000013", "ERR3978173"], name="count" 105 | ) 106 | 107 | obs = esearch_result.validate_result() 108 | self.assertDictEqual(obs, {}) 109 | 110 | def test_esresult_validate_result_multi_invalid(self): 111 | esearch_result = self.generate_es_result("multi", "_invalid") 112 | esearch_result.result = pd.Series( 113 | data=[0, 0], index=["ABCD123", "SRR001"], name="count" 114 | ) 115 | 116 | obs = esearch_result.validate_result() 117 | exp = {"ABCD123": "ID is invalid.", "SRR001": "ID is invalid."} 118 | self.assertDictEqual(obs, exp) 119 | 120 | def test_esresult_validate_result_multi_mixed(self): 121 | esearch_result = self.generate_es_result("multi", "_mixed") 122 | esearch_result.result = pd.Series( 123 | data=[1, 1, 7, 0, 0], 124 | index=["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"], 125 | name="count", 126 | ) 127 | 128 | obs = esearch_result.validate_result() 129 | exp = { 130 | "SR012": "ID is ambiguous.", 131 | "ABCD123": "ID is invalid.", 132 | "SRR001": "ID is invalid.", 133 | } 134 | self.assertDictEqual(obs, exp) 135 | 136 | def test_esanalyzer_analyze_result(self): 137 | es_analyzer = ESearchAnalyzer(["SRR000001"]) 138 | es_analyzer.analyze_result( 139 | response=self.json_to_response("single", "_correct"), 140 | request=self.generate_es_request("SRR000001"), 141 
| ) 142 | 143 | self.assertTrue(isinstance(es_analyzer.result, ESearchResult)) 144 | 145 | 146 | if __name__ == "__main__": 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /.github/workflows/docker-push.yaml: -------------------------------------------------------------------------------- 1 | name: Docker push 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["CI"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | push-docker-images: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 13 | steps: 14 | - name: Download build metadata 15 | uses: actions/github-script@v7 16 | with: 17 | script: | 18 | // Get artifacts from the triggering workflow run 19 | const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ 20 | owner: context.repo.owner, 21 | repo: context.repo.repo, 22 | run_id: context.payload.workflow_run.id, 23 | }); 24 | 25 | // Find build metadata artifact 26 | const metadataArtifact = artifacts.data.artifacts.find( 27 | artifact => artifact.name === 'build-metadata' 28 | ); 29 | 30 | if (!metadataArtifact) { 31 | console.log('No build metadata found, skipping Docker push'); 32 | return; 33 | } 34 | 35 | // Download the metadata 36 | const download = await github.rest.actions.downloadArtifact({ 37 | owner: context.repo.owner, 38 | repo: context.repo.repo, 39 | artifact_id: metadataArtifact.id, 40 | archive_format: 'zip', 41 | }); 42 | 43 | const fs = require('fs'); 44 | fs.writeFileSync('metadata.zip', Buffer.from(download.data)); 45 | 46 | - name: Extract and parse metadata 47 | id: metadata 48 | run: | 49 | if [ -f "metadata.zip" ]; then 50 | unzip metadata.zip 51 | if [ -f "build-metadata.json" ]; then 52 | # Parse JSON and set outputs 53 | echo "repository=$(jq -r '.repository' build-metadata.json)" >> $GITHUB_OUTPUT 54 | echo "sha=$(jq -r '.sha' build-metadata.json)" >> $GITHUB_OUTPUT 55 | echo "short-sha=$(jq -r '.short_sha' 
build-metadata.json)" >> $GITHUB_OUTPUT 56 | echo "epoch=$(jq -r '.epoch' build-metadata.json)" >> $GITHUB_OUTPUT 57 | echo "ref=$(jq -r '.ref' build-metadata.json)" >> $GITHUB_OUTPUT 58 | echo "event-name=$(jq -r '.event_name' build-metadata.json)" >> $GITHUB_OUTPUT 59 | echo "pr-number=$(jq -r '.pr_number' build-metadata.json)" >> $GITHUB_OUTPUT 60 | echo "tag-name=$(jq -r '.tag_name' build-metadata.json)" >> $GITHUB_OUTPUT 61 | echo "is-tag-push=$(jq -r '.is_tag_push' build-metadata.json)" >> $GITHUB_OUTPUT 62 | echo "build-pr-image=$(jq -r '.build_pr_image' build-metadata.json)" >> $GITHUB_OUTPUT 63 | echo "is-main-push=$(jq -r '.is_main_push' build-metadata.json)" >> $GITHUB_OUTPUT 64 | echo "has-metadata=true" >> $GITHUB_OUTPUT 65 | 66 | # Display metadata for debugging 67 | echo "Build metadata:" 68 | cat build-metadata.json | jq . 69 | else 70 | echo "has-metadata=false" >> $GITHUB_OUTPUT 71 | fi 72 | else 73 | echo "has-metadata=false" >> $GITHUB_OUTPUT 74 | fi 75 | 76 | - name: Set up Docker Buildx 77 | if: steps.metadata.outputs.has-metadata == 'true' 78 | uses: docker/setup-buildx-action@v3 79 | 80 | - name: Login to the remote registry 81 | if: steps.metadata.outputs.has-metadata == 'true' 82 | uses: docker/login-action@v3 83 | with: 84 | registry: quay.io 85 | username: ${{ secrets.DOCKER_USERNAME }} 86 | password: ${{ secrets.DOCKER_PASSWORD }} 87 | 88 | - name: Download test image artifact 89 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.build-pr-image == 'true' 90 | uses: actions/download-artifact@v4 91 | with: 92 | name: test-docker-image 93 | path: . 
94 | run-id: ${{ github.event.workflow_run.id }} 95 | github-token: ${{ secrets.GITHUB_TOKEN }} 96 | 97 | - name: Load and push test image 98 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.build-pr-image == 'true' 99 | run: | 100 | if [ -f "test-image.tar.gz" ]; then 101 | # Load the image 102 | docker load < test-image.tar.gz 103 | 104 | # Determine the tag based on event type 105 | if [ "${{ steps.metadata.outputs.event-name }}" = "pull_request" ]; then 106 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:pr-${{ steps.metadata.outputs.pr-number }}-${{ steps.metadata.outputs.short-sha }}" 107 | else 108 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:test-${{ steps.metadata.outputs.short-sha }}" 109 | fi 110 | 111 | # Re-tag and push 112 | docker tag ${{ steps.metadata.outputs.sha }} "$TAG" 113 | docker push "$TAG" 114 | echo "Pushed test image: $TAG" 115 | else 116 | echo "No test image artifact found" 117 | fi 118 | 119 | - name: Download production image artifact 120 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.is-main-push == 'true' 121 | uses: actions/download-artifact@v4 122 | with: 123 | name: prod-docker-image 124 | path: . 
125 | run-id: ${{ github.event.workflow_run.id }} 126 | github-token: ${{ secrets.GITHUB_TOKEN }} 127 | 128 | - name: Load and push production image 129 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.is-main-push == 'true' 130 | run: | 131 | if [ -f "prod-image.tar" ]; then 132 | # Load the image 133 | docker load < prod-image.tar 134 | 135 | # Determine the tag based on whether this is a tag push or main branch push 136 | if [ "${{ steps.metadata.outputs.is-tag-push }}" = "true" ]; then 137 | # For tag pushes, use just the tag name (no hash suffix) 138 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:${{ steps.metadata.outputs.tag-name }}" 139 | else 140 | # For main branch pushes, use epoch + hash 141 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:${{ steps.metadata.outputs.epoch }}-${{ steps.metadata.outputs.short-sha }}" 142 | fi 143 | 144 | docker tag temp-prod-image "$TAG" 145 | docker push "$TAG" 146 | echo "Pushed production image: $TAG" 147 | else 148 | echo "No production image artifact found" 149 | fi -------------------------------------------------------------------------------- /q2_fondue/tests/_utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
class FakeParams:
    """Bare-bones stand-in for entrezpy request parameter objects.

    Carries every attribute that the EFetch/ESearch request constructors
    read, without any of the real parameter validation.
    """

    def __init__(
        self,
        temp_dir,
        uids=None,
        term=None,
        eutil="efetch.cgi",
        rettype="xml",
        retmode="xml",
    ):
        # identification / query
        self.query_id = "some-id-123"
        self.term = term
        self.uids = uids
        self.db = "sra"
        self.dbfrom = "sra"
        self.eutil = eutil
        # history server (unused by the fakes)
        self.usehistory = False
        self.webenv = None
        self.querykey = 0
        # linking options
        self.cmd = None
        self.linkname = None
        self.holding = False
        self.doseq = None
        # id/date filtering (unused)
        self.idtype = None
        self.datetype = None
        self.reldate = None
        self.mindate = None
        self.maxdate = None
        # retrieval options
        self.rettype = rettype
        self.retmode = retmode
        self.retstart = 0
        self.retmax = 0
        self.strand = None
        self.sort = None
        self.field = None
        self.seqstart = None
        self.seqstop = None
        self.complexity = None
        self.temp_dir = temp_dir
"retmode", 84 | "strand", 85 | "seqstart", 86 | "seqstop", 87 | "complexity", 88 | } 89 | self.esearch_request_properties = {"db", "eutil", "webenv", "retmode", "term"} 90 | self.library_meta = LibraryMetadata( 91 | name="unspecified", layout="SINGLE", selection="PCR", source="METAGENOMIC" 92 | ) 93 | with open(self.get_data_path("metadata_response_small.json"), "r") as ff: 94 | self.metadata_dict = json.load(ff) 95 | self.maxDiff = None 96 | self.fake_logger = logging.getLogger("test_log") 97 | 98 | def xml_to_response(self, kind, suffix="", prefix="metadata"): 99 | path = self.get_data_path(f"{prefix}_response_{kind}{suffix}.xml") 100 | response = io.open(path, "rb", buffering=0) 101 | return response 102 | 103 | def json_to_response(self, kind, suffix="", raw=False, utility="esearch"): 104 | path = self.get_data_path(f"{utility}_response_{kind}{suffix}.json") 105 | response = io.open(path, "rb", buffering=0) 106 | if raw: 107 | return response 108 | else: 109 | return json.loads(io.open(path, "rb", buffering=0).read()) 110 | 111 | def generate_ef_request(self, uids, start=0, size=1): 112 | request_params = FakeParams(self.temp_dir.name, uids=uids) 113 | return EfetchRequest( 114 | eutil="efetch.fcgi", parameter=request_params, start=start, size=size 115 | ) 116 | 117 | def generate_ef_result(self, kind, prefix="metadata"): 118 | return EFetchResult( 119 | response=self.xml_to_response(kind, prefix=prefix), 120 | request=self.generate_ef_request(["FAKEID1", "FAKEID2"]), 121 | log_level="INFO", 122 | ) 123 | 124 | def generate_sra_metadata(self): 125 | study_id, sample_id = "ERP120343", "ERS4372624" 126 | experiment_id, run_ids = "ERX3980916", ["FAKEID1", "FAKEID2"] 127 | study = SRAStudy( 128 | id=study_id, 129 | bioproject_id="PRJEB37054", 130 | center_name="University of Hohenheim", 131 | custom_meta={ 132 | "ENA-FIRST-PUBLIC [STUDY]": "2020-05-31", 133 | "ENA-LAST-UPDATE [STUDY]": "2020-03-04", 134 | }, 135 | ) 136 | sample = SRASample( 137 | id=sample_id, 138 
| biosample_id="SAMEA6608408", 139 | name="BAC1.D1.0.32A", 140 | title="Vitis vinifera", 141 | organism="Vitis vinifera", 142 | tax_id="29760", 143 | study_id=study_id, 144 | custom_meta={ 145 | "environment (biome) [SAMPLE]": "berry plant", 146 | "geographic location (country and/or sea) [SAMPLE]": "Germany", 147 | "sample storage temperature [SAMPLE]": "-80", 148 | }, 149 | ) 150 | experiment = SRAExperiment( 151 | id=experiment_id, 152 | instrument="Illumina MiSeq", 153 | platform="ILLUMINA", 154 | library=self.library_meta, 155 | sample_id=sample_id, 156 | custom_meta={"Temperature [EXPERIMENT]": "12", "Depth [EXPERIMENT]": "500"}, 157 | ) 158 | runs = [ 159 | SRARun( 160 | id=_id, 161 | bases=11552099, 162 | spots=39323, 163 | public=True, 164 | bytes=3914295, 165 | experiment_id=experiment_id, 166 | custom_meta={ 167 | "ENA-FIRST-PUBLIC [RUN]": "2020-05-31", 168 | "ENA-LAST-UPDATE [RUN]": "2020-03-06", 169 | }, 170 | ) 171 | for _id in run_ids 172 | ] 173 | return study, sample, experiment, runs 174 | 175 | def generate_expected_df(self): 176 | exp_df = pd.read_json( 177 | path_or_buf=self.get_data_path("metadata_processed_multi.json"), 178 | orient="index", 179 | ) 180 | exp_df.index.name = "ID" 181 | numeric_cols = { 182 | "Amount or size of sample collected [sample]", 183 | "Collection day [sample]", 184 | "Collection hours [sample]", 185 | "Sample storage temperature [sample]", 186 | "Tax ID", 187 | "Sample volume or weight for dna extraction [sample]", 188 | } 189 | exp_df["Public"] = exp_df["Public"].astype(bool) 190 | for col in numeric_cols: 191 | exp_df[col] = exp_df[col].astype(str) 192 | return exp_df 193 | 194 | def generate_es_request(self, term, start=0, size=1): 195 | request_params = FakeParams( 196 | self.temp_dir.name, retmode="json", term=term, eutil="esearch.fcgi" 197 | ) 198 | return EsearchRequest( 199 | eutil="esearch.fcgi", parameter=request_params, start=start, size=size 200 | ) 201 | 202 | def generate_es_result(self, kind, suffix): 
203 | return ESearchResult( 204 | response=self.json_to_response(kind, suffix, utility="esearch")[ 205 | "esearchresult" 206 | ], 207 | request=self.generate_es_request(term="abc OR 123"), 208 | ) 209 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_response_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 13481774 8 | <Summary><Title>18</Title><Platform 9 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 10 | total_runs="1" total_spots="63703" total_bases="38349206" 11 | total_size="22735317" load_done="true" cluster_name="public"/></Summary><Submitter 12 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 13 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 14 | of Animal Nutritio"/><Experiment acc="SRX10339760" ver="1" 15 | status="public" name="18"/><Study acc="SRP310597" name="PRJNA 16 | Chuanzhong black lamb Raw sequence reads"/><Organism 17 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 18 | acc="SRS8459117" name=""/><Instrument ILLUMINA="Illumina 19 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>18</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 20 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309312</Biosample> 21 | 22 | <Run acc="SRR13961771" 23 | total_spots="63703" total_bases="38349206" load_done="true" 24 | is_public="true" cluster_name="public" 25 | static_data_available="true"/> 26 | 27 | 28 | 2021/03/17 29 | 2021/03/15 30 | 31 | 32 | 33 | 4 34 | <Summary><Title>454 35 | sequencing of Human HapMap individual NA18505 genomic paired-end 36 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics 37 | total_runs="10" total_spots="4703662" 
total_bases="1306798474" 38 | total_size="3205056622" load_done="true" 39 | static_data_available="true" cluster_name="public"/></Summary><Submitter 40 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan" 41 | lab_name=""/><Experiment acc="SRX000003" ver="10" 42 | status="public" name="454 sequencing of Human HapMap individual 43 | NA18505 genomic paired-end library"/><Study acc="SRP000001" 44 | name="Paired-end mapping reveals extensive structural variation in 45 | the human genome"/><Organism taxid="9606" 46 | ScientificName="Homo sapiens"/><Sample acc="SRS000100" 47 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME 48 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY 49 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION 50 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT 51 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample> 52 | 53 | <Run acc="SRR000007" 54 | total_spots="633196" total_bases="175275395" load_done="true" 55 | is_public="true" cluster_name="public" 56 | static_data_available="true"/><Run acc="SRR000018" 57 | total_spots="626624" total_bases="174403220" load_done="true" 58 | is_public="true" cluster_name="public" 59 | static_data_available="true"/><Run acc="SRR000020" 60 | total_spots="374556" total_bases="103411232" load_done="true" 61 | is_public="true" cluster_name="public" 62 | static_data_available="true"/><Run acc="SRR000038" 63 | total_spots="529820" total_bases="148389031" load_done="true" 64 | is_public="true" cluster_name="public" 65 | static_data_available="true"/><Run acc="SRR000043" 66 | total_spots="608946" total_bases="168985392" load_done="true" 67 | is_public="true" cluster_name="public" 68 | static_data_available="true"/><Run acc="SRR000046" 69 | total_spots="79047" total_bases="21258857" load_done="true" 70 | is_public="true" 
cluster_name="public" 71 | static_data_available="true"/><Run acc="SRR000048" 72 | total_spots="640737" total_bases="177619279" load_done="true" 73 | is_public="true" cluster_name="public" 74 | static_data_available="true"/><Run acc="SRR000050" 75 | total_spots="547349" total_bases="153260655" load_done="true" 76 | is_public="true" cluster_name="public" 77 | static_data_available="true"/><Run acc="SRR000057" 78 | total_spots="76744" total_bases="21203932" load_done="true" 79 | is_public="true" cluster_name="public" 80 | static_data_available="true"/><Run acc="SRR000058" 81 | total_spots="586643" total_bases="162991481" load_done="true" 82 | is_public="true" cluster_name="public" 83 | static_data_available="true"/> 84 | 85 | 86 | 2008/04/04 87 | 2015/04/09 88 | 89 | 90 | 91 | 13481786 92 | <Summary><Title>12</Title><Platform 93 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 94 | total_runs="1" total_spots="59130" total_bases="35596260" 95 | total_size="21079845" load_done="true" cluster_name="public"/></Summary><Submitter 96 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 97 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 98 | of Animal Nutritio"/><Experiment acc="SRX10339772" ver="1" 99 | status="public" name="12"/><Study acc="SRP310597" name="PRJNA 100 | Chuanzhong black lamb Raw sequence reads"/><Organism 101 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 102 | acc="SRS8459130" name=""/><Instrument ILLUMINA="Illumina 103 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>12</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 104 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309306</Biosample> 105 | 106 | <Run acc="SRR13961759" 107 | total_spots="59130" total_bases="35596260" load_done="true" 108 | is_public="true" 
cluster_name="public" 109 | static_data_available="true"/> 110 | 111 | 112 | 2021/03/17 113 | 2021/03/15 114 | 115 | 116 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/scraper_items_no_attach.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "key": "CP4ED2CY", 3 | "version": 1257, 4 | "library": { 5 | "type": "user", 6 | "id": 12345, 7 | "name": "username", 8 | "links": { 9 | "alternate": { 10 | "href": "https://www.zotero.org/username", 11 | "type": "text/html" 12 | } 13 | } 14 | }, 15 | "links": { 16 | "self": { 17 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY", 18 | "type": "application/json" 19 | }, 20 | "alternate": { 21 | "href": "https://www.zotero.org/username/items/CP4ED2CY", 22 | "type": "text/html" 23 | }, 24 | "attachment": { 25 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48", 26 | "type": "application/json", 27 | "attachmentType": "application/pdf", 28 | "attachmentSize": 3648434 29 | } 30 | }, 31 | "meta": { 32 | "creatorSummary": "Pruski et al.", 33 | "parsedDate": "2021-10-13", 34 | "numChildren": 2 35 | }, 36 | "data": { 37 | "key": "CP4ED2CY", 38 | "version": 1257, 39 | "itemType": "journalArticle", 40 | "title": "Direct on-swab metabolic profiling of vaginal microbiome host interactions during pregnancy and preterm birth", 41 | "creators": [ 42 | { 43 | "creatorType": "author", 44 | "firstName": "Pamela", 45 | "lastName": "Pruski" 46 | }, 47 | { 48 | "creatorType": "author", 49 | "firstName": "Gonçalo D. 
S.", 50 | "lastName": "Correia" 51 | }, 52 | { 53 | "creatorType": "author", 54 | "firstName": "Holly V.", 55 | "lastName": "Lewis" 56 | }, 57 | { 58 | "creatorType": "author", 59 | "firstName": "Katia", 60 | "lastName": "Capuccini" 61 | }, 62 | { 63 | "creatorType": "author", 64 | "firstName": "Paolo", 65 | "lastName": "Inglese" 66 | }, 67 | { 68 | "creatorType": "author", 69 | "firstName": "Denise", 70 | "lastName": "Chan" 71 | }, 72 | { 73 | "creatorType": "author", 74 | "firstName": "Richard G.", 75 | "lastName": "Brown" 76 | }, 77 | { 78 | "creatorType": "author", 79 | "firstName": "Lindsay", 80 | "lastName": "Kindinger" 81 | }, 82 | { 83 | "creatorType": "author", 84 | "firstName": "Yun S.", 85 | "lastName": "Lee" 86 | }, 87 | { 88 | "creatorType": "author", 89 | "firstName": "Ann", 90 | "lastName": "Smith" 91 | }, 92 | { 93 | "creatorType": "author", 94 | "firstName": "Julian", 95 | "lastName": "Marchesi" 96 | }, 97 | { 98 | "creatorType": "author", 99 | "firstName": "Julie A. K.", 100 | "lastName": "McDonald" 101 | }, 102 | { 103 | "creatorType": "author", 104 | "firstName": "Simon", 105 | "lastName": "Cameron" 106 | }, 107 | { 108 | "creatorType": "author", 109 | "firstName": "Kate", 110 | "lastName": "Alexander-Hardiman" 111 | }, 112 | { 113 | "creatorType": "author", 114 | "firstName": "Anna L.", 115 | "lastName": "David" 116 | }, 117 | { 118 | "creatorType": "author", 119 | "firstName": "Sarah J.", 120 | "lastName": "Stock" 121 | }, 122 | { 123 | "creatorType": "author", 124 | "firstName": "Jane E.", 125 | "lastName": "Norman" 126 | }, 127 | { 128 | "creatorType": "author", 129 | "firstName": "Vasso", 130 | "lastName": "Terzidou" 131 | }, 132 | { 133 | "creatorType": "author", 134 | "firstName": "T. 
G.", 135 | "lastName": "Teoh" 136 | }, 137 | { 138 | "creatorType": "author", 139 | "firstName": "Lynne", 140 | "lastName": "Sykes" 141 | }, 142 | { 143 | "creatorType": "author", 144 | "firstName": "Phillip R.", 145 | "lastName": "Bennett" 146 | }, 147 | { 148 | "creatorType": "author", 149 | "firstName": "Zoltan", 150 | "lastName": "Takats" 151 | }, 152 | { 153 | "creatorType": "author", 154 | "firstName": "David A.", 155 | "lastName": "MacIntyre" 156 | } 157 | ], 158 | "abstractNote": "The pregnancy vaginal microbiome contributes to risk of preterm birth, the primary cause of death in children under 5 years of age. Here we describe direct on-swab metabolic profiling by Desorption Electrospray Ionization Mass Spectrometry (DESI-MS) for sample preparation-free characterisation of the cervicovaginal metabolome in two independent pregnancy cohorts (VMET, n\u2009=\u2009160; 455 swabs; VMET II, n\u2009=\u2009205; 573 swabs). By integrating metataxonomics and immune profiling data from matched samples, we show that specific metabolome signatures can be used to robustly predict simultaneously both the composition of the vaginal microbiome and host inflammatory status. In these patients, vaginal microbiota instability and innate immune activation, as predicted using DESI-MS, associated with preterm birth, including in women receiving cervical cerclage for preterm birth prevention. 
These findings highlight direct on-swab metabolic profiling by DESI-MS as an innovative approach for preterm birth risk stratification through rapid assessment of vaginal microbiota-host dynamics.", 159 | "publicationTitle": "Nature Communications", 160 | "volume": "12", 161 | "issue": "1", 162 | "pages": "5967", 163 | "date": "2021-10-13", 164 | "series": "", 165 | "seriesTitle": "", 166 | "seriesText": "", 167 | "journalAbbreviation": "Nat Commun", 168 | "language": "en", 169 | "DOI": "10.1038/s41467-021-26215-w", 170 | "ISSN": "2041-1723", 171 | "shortTitle": "", 172 | "url": "https://www.nature.com/articles/s41467-021-26215-w", 173 | "accessDate": "2021-11-10T07:04:46Z", 174 | "archive": "", 175 | "archiveLocation": "", 176 | "libraryCatalog": "www.nature.com", 177 | "callNumber": "", 178 | "rights": "2021 The Author(s)", 179 | "extra": "Bandiera_abtest: a\nCc_license_type: cc_by\nCg_type: Nature Research Journals\nNumber: 1\nPrimary_atype: Research\nPublisher: Nature Publishing Group\nSubject_term: Infectious-disease diagnostics;Predictive markers;Risk factors;Translational research\nSubject_term_id: infectious-disease-diagnostics;predictive-markers;risk-factors;translational-research", 180 | "tags": [ 181 | { 182 | "tag": "Infectious-disease diagnostics", 183 | "type": 1 184 | }, 185 | { 186 | "tag": "Predictive markers", 187 | "type": 1 188 | }, 189 | { 190 | "tag": "Risk factors", 191 | "type": 1 192 | }, 193 | { 194 | "tag": "Translational research", 195 | "type": 1 196 | } 197 | ], 198 | "collections": [ 199 | "DCHC4FUN" 200 | ], 201 | "relations": {}, 202 | "dateAdded": "2021-11-10T07:04:46Z", 203 | "dateModified": "2021-11-10T07:04:46Z" 204 | } 205 | }] -------------------------------------------------------------------------------- /q2_fondue/utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich 
Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import gzip 9 | import os 10 | import shutil 11 | import signal 12 | import subprocess 13 | from typing import List 14 | 15 | from entrezpy.esearch import esearcher as es 16 | from q2_types.per_sample_sequences import CasavaOneEightSingleLanePerSampleDirFmt 17 | from qiime2 import Artifact 18 | 19 | from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer 20 | from q2_fondue.entrezpy_clients._utils import ( 21 | PREFIX, 22 | InvalidIDs, 23 | set_up_logger, 24 | set_up_entrezpy_logging, 25 | ) 26 | 27 | LOGGER = set_up_logger("INFO", logger_name=__name__) 28 | 29 | 30 | class DownloadError(Exception): 31 | pass 32 | 33 | 34 | def _chunker(seq, size): 35 | # source: https://stackoverflow.com/a/434328/579416 36 | return (seq[pos : pos + size] for pos in range(0, len(seq), size)) 37 | 38 | 39 | def _validate_run_ids( 40 | email: str, n_jobs: int, run_ids: List[str], log_level: str 41 | ) -> dict: 42 | """Validates provided accession IDs using ESearch. 43 | 44 | Args: 45 | email (str): A valid e-mail address. 46 | n_jobs (int): Number of threads to be used in parallel. 47 | run_ids (List[str]): List of all the run IDs to be validated. 48 | log_level (str): Logging level. 49 | 50 | Returns: 51 | dict: Dictionary of invalid IDs (as keys) with a description. 
52 | """ 53 | # must process in batches because esearch requests with 54 | # runID count > 10'000 fail 55 | invalid_ids = {} 56 | for batch in _chunker(run_ids, 10000): 57 | esearcher = es.Esearcher( 58 | "esearcher", email, apikey=None, apikey_var=None, threads=0, qid=None 59 | ) 60 | set_up_entrezpy_logging(esearcher, log_level) 61 | 62 | esearch_response = esearcher.inquire( 63 | {"db": "sra", "term": " OR ".join(batch), "usehistory": False}, 64 | analyzer=ESearchAnalyzer(batch), 65 | ) 66 | invalid_ids.update(esearch_response.result.validate_result()) 67 | 68 | return invalid_ids 69 | 70 | 71 | def _determine_id_type(ids: list): 72 | ids = [x[:3] for x in ids] 73 | for kind in PREFIX.keys(): 74 | if all([x in PREFIX[kind] for x in ids]): 75 | return kind 76 | raise InvalidIDs( 77 | "The type of provided IDs is either not supported or " 78 | "IDs of mixed types were provided. Please provide IDs " 79 | "corresponding to either SRA run (#S|E|DRR), study " 80 | "(#S|E|DRP) or NCBI BioProject IDs (#PRJ)." 81 | ) 82 | 83 | 84 | def handle_threaded_exception(args): 85 | logger = set_up_logger("DEBUG", logger_name="ThreadedErrorsManager") 86 | msg = "Data fetching was interrupted by the following error: \n" 87 | 88 | if "gaierror is not JSON serializable" in str(args.exc_value): 89 | msg += ( 90 | "EntrezPy failed to connect to NCBI. Please check your " 91 | "internet connection and try again. It may help to wait " 92 | "a few minutes before retrying." 93 | ) 94 | # silence threads exiting correctly 95 | elif issubclass(args.exc_type, SystemExit) and str(args.exc_value) == "0": 96 | return 97 | else: 98 | msg += ( 99 | f'Caught {args.exc_type} with value "{args.exc_value}" ' 100 | f"in thread {args.thread}" 101 | ) 102 | 103 | logger.exception(msg) 104 | 105 | # This will send a SIGINT to the main thread, which will gracefully 106 | # kill the running Q2 action. No artifacts will be saved. 
107 | os.kill(os.getpid(), signal.SIGINT) 108 | 109 | 110 | def _has_enough_space(acc_id: str, output_dir: str) -> bool: 111 | """Checks whether there is enough storage available for fasterq-dump 112 | to process sequences for a given ID. 113 | 114 | fasterq-dump will be used to check the amount of space required for the 115 | final data. Required space is estimated as 10x that of the final data 116 | (as per NCBI's documentation). 117 | 118 | Args: 119 | acc_id (str): The accession ID to be processed. 120 | output_dir (str): Location where the output would be saved. 121 | 122 | Return 123 | bool: Whether there is enough space available for fasterq-dump tool. 124 | """ 125 | if acc_id is None: 126 | return True 127 | 128 | cmd_fasterq = ["fasterq-dump", "--size-check", "only", "-x", acc_id] 129 | result = subprocess.run(cmd_fasterq, text=True, capture_output=True, cwd=output_dir) 130 | 131 | if result.returncode == 0: 132 | return True 133 | elif result.returncode == 3 and "disk-limit exeeded" in result.stderr: 134 | LOGGER.warning("Not enough space to fetch run %s.", acc_id) 135 | return False 136 | else: 137 | LOGGER.error( 138 | 'fasterq-dump exited with a "%s" error code (the message ' 139 | 'was: "%s"). We will try to fetch the next accession ID.', 140 | result.returncode, 141 | result.stderr, 142 | ) 143 | return True 144 | 145 | 146 | def _rewrite_fastq(file_in: str, file_out: str) -> None: 147 | """Rewrites a FASTQ file with gzip compression. 148 | 149 | Takes an uncompressed FASTQ file and writes it to a new location with 150 | gzip compression. 151 | 152 | Args: 153 | file_in (str): Path to input uncompressed FASTQ file 154 | file_out (str): Path where compressed FASTQ file should be written 155 | """ 156 | with open(file_in, "rb") as f_in, gzip.open(file_out, "wb") as f_out: 157 | shutil.copyfileobj(f_in, f_out) 158 | 159 | 160 | def _is_empty(artifact: Artifact) -> bool: 161 | """Checks if a sequence artifact is empty. 
162 | 163 | Determines if a sequence artifact is empty by checking if all sample IDs 164 | are "xxx", which indicates an empty placeholder artifact. 165 | 166 | Args: 167 | artifact: A QIIME 2 sequence artifact 168 | 169 | Returns: 170 | bool: True if the artifact is empty, False otherwise 171 | """ 172 | samples = artifact.view(CasavaOneEightSingleLanePerSampleDirFmt).manifest.index 173 | return all(sample == "xxx" for sample in samples) 174 | 175 | 176 | def _remove_empty(*artifact_lists) -> tuple: 177 | """Removes empty artifacts from lists of sequence artifacts. 178 | 179 | Takes one or more lists of sequence artifacts and filters out any empty 180 | artifacts (those containing only placeholder 'xxx' samples). Returns 181 | tuple of filtered lists maintaining the same order as input. 182 | 183 | Args: 184 | *artifact_lists: Variable number of lists containing sequence artifacts 185 | to filter 186 | 187 | Returns: 188 | tuple: Tuple of filtered lists with empty artifacts removed, in same 189 | order as input lists 190 | """ 191 | processed_artifacts = [] 192 | for artifacts in artifact_lists: 193 | processed_artifacts.append( 194 | [artifact for artifact in artifacts if not _is_empty(artifact)] 195 | ) 196 | return tuple(processed_artifacts) 197 | 198 | 199 | def _make_empty_artifact(ctx, paired: bool) -> Artifact: 200 | """Creates an empty sequence artifact. 201 | 202 | Creates an empty QIIME 2 sequence artifact containing placeholder files. 203 | For paired-end sequences, creates two empty fastq files (R1 and R2). 204 | For single-end sequences, creates one empty fastq file (R1). 
205 | 206 | Args: 207 | ctx: QIIME 2 plugin context 208 | paired (bool): Whether to create paired-end (True) or 209 | single-end (False) artifact 210 | 211 | Returns: 212 | QIIME 2 artifact: Empty sequence artifact of appropriate type 213 | (paired or single-end) 214 | """ 215 | if paired: 216 | filenames = ["xxx_00_L001_R1_001.fastq.gz", "xxx_00_L001_R2_001.fastq.gz"] 217 | _type = "SampleData[PairedEndSequencesWithQuality]" 218 | else: 219 | filenames = ["xxx_01_L001_R1_001.fastq.gz"] 220 | _type = "SampleData[SequencesWithQuality]" 221 | 222 | casava_out = CasavaOneEightSingleLanePerSampleDirFmt() 223 | for filename in filenames: 224 | with gzip.open(str(casava_out.path / filename), mode="w"): 225 | pass 226 | 227 | return ctx.make_artifact(_type, casava_out) 228 | -------------------------------------------------------------------------------- /q2_fondue/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import gzip 9 | import os 10 | import signal 11 | import tempfile 12 | import threading 13 | import unittest 14 | from threading import Thread 15 | from unittest.mock import patch, MagicMock 16 | 17 | from q2_types.per_sample_sequences import CasavaOneEightSingleLanePerSampleDirFmt 18 | from qiime2 import Artifact 19 | from qiime2.plugin.testing import TestPluginBase 20 | 21 | from q2_fondue.utils import ( 22 | handle_threaded_exception, 23 | _has_enough_space, 24 | _chunker, 25 | _rewrite_fastq, 26 | _is_empty, 27 | _remove_empty, 28 | _make_empty_artifact, 29 | ) 30 | 31 | 32 | class TestExceptHooks(unittest.TestCase): 33 | package = "q2_fondue.tests" 34 | 35 | def do_something_with_error(self, msg): 36 | raise Exception(msg) 37 | 38 | @patch("os.kill") 39 | def test_handle_threaded_exception_gaierror(self, patch_kill): 40 | with self.assertLogs(level="DEBUG", logger="ThreadedErrorsManager") as cm: 41 | threading.excepthook = handle_threaded_exception 42 | error_msg = "Something went wrong: gaierror is " "not JSON serializable." 43 | t = Thread(target=self.do_something_with_error, args=(error_msg,)) 44 | t.start() 45 | t.join() 46 | 47 | self.assertIn("EntrezPy failed to connect to NCBI", cm.output[0]) 48 | 49 | pid = os.getpid() 50 | patch_kill.assert_called_once_with(pid, signal.SIGINT) 51 | 52 | @patch("os.kill") 53 | def test_handle_threaded_exception_other_errors(self, patch_kill): 54 | with self.assertLogs(level="DEBUG", logger="ThreadedErrorsManager") as cm: 55 | threading.excepthook = handle_threaded_exception 56 | error_msg = "Some unknown exception." 
57 | t = Thread(target=self.do_something_with_error, args=(error_msg,)) 58 | t.start() 59 | t.join() 60 | 61 | self.assertIn( 62 | "Caught with value " '"Some unknown exception."', 63 | cm.output[0], 64 | ) 65 | 66 | pid = os.getpid() 67 | patch_kill.assert_called_once_with(pid, signal.SIGINT) 68 | 69 | 70 | class TestSRAUtils(TestPluginBase): 71 | package = "q2_fondue.tests" 72 | 73 | @patch("subprocess.run") 74 | def test_has_enough_space(self, patched_run): 75 | patched_run.return_value = MagicMock(returncode=0) 76 | 77 | acc, test_dir = "ABC123", "some/where" 78 | obs = _has_enough_space(acc, test_dir) 79 | self.assertTrue(obs) 80 | patched_run.assert_called_once_with( 81 | ["fasterq-dump", "--size-check", "only", "-x", acc], 82 | text=True, 83 | capture_output=True, 84 | cwd=test_dir, 85 | ) 86 | 87 | @patch("subprocess.run") 88 | def test_has_enough_space_not(self, patched_run): 89 | with open(self.get_data_path("fasterq-dump-response.txt")) as f: 90 | response = "".join(f.readlines()) 91 | patched_run.return_value = MagicMock(stderr=response, returncode=3) 92 | 93 | acc, test_dir = "ABC123", "some/where" 94 | obs = _has_enough_space(acc, test_dir) 95 | self.assertFalse(obs) 96 | patched_run.assert_called_once_with( 97 | ["fasterq-dump", "--size-check", "only", "-x", acc], 98 | text=True, 99 | capture_output=True, 100 | cwd=test_dir, 101 | ) 102 | 103 | @patch("subprocess.run") 104 | def test_has_enough_space_error(self, patched_run): 105 | patched_run.return_value = MagicMock(stderr="errorX", returncode=8) 106 | 107 | acc, test_dir = "ABC123", "some/where" 108 | with self.assertLogs("q2_fondue.utils", level="ERROR") as cm: 109 | obs = _has_enough_space(acc, test_dir) 110 | self.assertEqual( 111 | cm.output, 112 | [ 113 | 'ERROR:q2_fondue.utils:fasterq-dump exited with a "8" error code ' 114 | '(the message was: "errorX"). We will try to fetch the next ' 115 | "accession ID." 
116 | ], 117 | ) 118 | self.assertTrue(obs) 119 | patched_run.assert_called_once_with( 120 | ["fasterq-dump", "--size-check", "only", "-x", acc], 121 | text=True, 122 | capture_output=True, 123 | cwd=test_dir, 124 | ) 125 | 126 | def test_chunker(self): 127 | obs_out = _chunker(["A", "B", "C"], 2) 128 | exp_out_1 = ["A", "B"] 129 | exp_out_2 = ["C"] 130 | self.assertEqual(next(obs_out), exp_out_1) 131 | self.assertEqual(next(obs_out), exp_out_2) 132 | 133 | def test_chunker_no_chunks(self): 134 | obs_out = _chunker(["A", "B", "C"], 4) 135 | exp_out = ["A", "B", "C"] 136 | self.assertEqual(next(obs_out), exp_out) 137 | 138 | def test_rewrite_fastq(self): 139 | file_in = self.get_data_path("SRR123456.fastq") 140 | file_out = tempfile.NamedTemporaryFile() 141 | 142 | _rewrite_fastq(file_in, file_out.name) 143 | 144 | with open(file_in, "rb") as fin: 145 | with gzip.open(file_out.name, "r") as fout: 146 | for lin, lout in zip(fin.readlines(), fout.readlines()): 147 | self.assertEqual(lin, lout) 148 | 149 | # clean up 150 | file_out.close() 151 | 152 | 153 | class TestSequenceUtils(TestPluginBase): 154 | package = "q2_fondue.tests" 155 | 156 | def test_is_empty_with_empty_artifact(self): 157 | casava_out = CasavaOneEightSingleLanePerSampleDirFmt() 158 | filenames = ["xxx_01_L001_R1_001.fastq.gz"] 159 | for filename in filenames: 160 | with gzip.open(str(casava_out.path / filename), mode="w"): 161 | pass 162 | 163 | artifact = Artifact.import_data("SampleData[SequencesWithQuality]", casava_out) 164 | 165 | self.assertTrue(_is_empty(artifact)) 166 | 167 | def test_is_empty_with_nonempty_artifact(self): 168 | artifact = Artifact.import_data( 169 | "SampleData[SequencesWithQuality]", 170 | self.get_data_path("single1"), 171 | CasavaOneEightSingleLanePerSampleDirFmt, 172 | ) 173 | 174 | self.assertFalse(_is_empty(artifact)) 175 | 176 | def test_remove_empty(self): 177 | empty_casava = CasavaOneEightSingleLanePerSampleDirFmt() 178 | with gzip.open( 179 | str(empty_casava.path 
/ "xxx_01_L001_R1_001.fastq.gz"), mode="w" 180 | ): 181 | pass 182 | empty_artifact_single = Artifact.import_data( 183 | "SampleData[SequencesWithQuality]", empty_casava 184 | ) 185 | with gzip.open( 186 | str(empty_casava.path / "xxx_01_L001_R2_001.fastq.gz"), mode="w" 187 | ): 188 | pass 189 | empty_artifact_paired = Artifact.import_data( 190 | "SampleData[PairedEndSequencesWithQuality]", empty_casava 191 | ) 192 | 193 | non_empty_artifact_single = Artifact.import_data( 194 | "SampleData[SequencesWithQuality]", 195 | self.get_data_path("single1"), 196 | CasavaOneEightSingleLanePerSampleDirFmt, 197 | ) 198 | non_empty_artifact_paired = Artifact.import_data( 199 | "SampleData[PairedEndSequencesWithQuality]", 200 | self.get_data_path("paired1"), 201 | CasavaOneEightSingleLanePerSampleDirFmt, 202 | ) 203 | 204 | singles = [empty_artifact_single, non_empty_artifact_single] 205 | paired = [empty_artifact_paired, non_empty_artifact_paired] 206 | 207 | filtered_singles, filtered_paired = _remove_empty(singles, paired) 208 | 209 | self.assertEqual(len(filtered_singles), 1) 210 | self.assertEqual(len(filtered_paired), 1) 211 | self.assertIs(filtered_singles[0], non_empty_artifact_single) 212 | self.assertIs(filtered_paired[0], non_empty_artifact_paired) 213 | 214 | def test_make_empty_artifact_single(self): 215 | ctx = MagicMock() 216 | ctx.make_artifact.return_value = "single_artifact" 217 | 218 | result = _make_empty_artifact(ctx, False) 219 | 220 | self.assertEqual(result, "single_artifact") 221 | ctx.make_artifact.assert_called_once() 222 | 223 | args, kwargs = ctx.make_artifact.call_args 224 | 225 | self.assertEqual(args[0], "SampleData[SequencesWithQuality]") 226 | 227 | casava_output = args[1] 228 | self.assertTrue( 229 | os.path.exists(casava_output.path / "xxx_01_L001_R1_001.fastq.gz") 230 | ) 231 | 232 | def test_make_empty_artifact_paired(self): 233 | ctx = MagicMock() 234 | ctx.make_artifact.return_value = "paired_artifact" 235 | 236 | result = 
_make_empty_artifact(ctx, True) 237 | 238 | self.assertEqual(result, "paired_artifact") 239 | ctx.make_artifact.assert_called_once() 240 | 241 | args, kwargs = ctx.make_artifact.call_args 242 | 243 | self.assertEqual(args[0], "SampleData[PairedEndSequencesWithQuality]") 244 | 245 | casava_output = args[1] 246 | self.assertTrue( 247 | os.path.exists(casava_output.path / "xxx_00_L001_R1_001.fastq.gz") 248 | ) 249 | self.assertTrue( 250 | os.path.exists(casava_output.path / "xxx_00_L001_R2_001.fastq.gz") 251 | ) 252 | 253 | 254 | if __name__ == "__main__": 255 | unittest.main() 256 | -------------------------------------------------------------------------------- /q2_fondue/metadata.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | 9 | import threading 10 | from typing import List, Tuple 11 | 12 | import entrezpy.efetch.efetcher as ef 13 | import pandas as pd 14 | from qiime2 import Metadata 15 | 16 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer 17 | from q2_fondue.utils import ( 18 | _validate_run_ids, 19 | _determine_id_type, 20 | handle_threaded_exception, 21 | ) 22 | from q2_fondue.entrezpy_clients._utils import ( 23 | set_up_entrezpy_logging, 24 | set_up_logger, 25 | InvalidIDs, 26 | ) 27 | from q2_fondue.entrezpy_clients._pipelines import _get_run_ids 28 | 29 | 30 | threading.excepthook = handle_threaded_exception 31 | BATCH_SIZE = 150 32 | 33 | 34 | def _efetcher_inquire( 35 | efetcher: ef.Efetcher, run_ids: List[str], log_level: str 36 | ) -> Tuple[pd.DataFrame, dict]: 37 | """Makes an EFetch request using the provided IDs. 
38 | 39 | Args: 40 | efetcher (ef.Efetcher): A valid instance of an Entrezpy Efetcher. 41 | run_ids (List[str]): List of all the run IDs to be fetched. 42 | log_level (str): Logging level. 43 | 44 | Returns: 45 | pd.DataFrame: DataFrame with metadata obtained for the provided IDs. 46 | dict: Dictionary of the run IDs that were not found with 47 | respective error messages. 48 | """ 49 | metadata_response = efetcher.inquire( 50 | { 51 | "db": "sra", 52 | "id": run_ids, 53 | "rettype": "xml", 54 | "retmode": "xml", 55 | "retmax": len(run_ids), 56 | "reqsize": BATCH_SIZE, 57 | }, 58 | analyzer=EFetchAnalyzer(log_level), 59 | ) 60 | 61 | if metadata_response.result is None: 62 | return (pd.DataFrame(), {m_id: metadata_response.error_msg for m_id in run_ids}) 63 | else: 64 | return metadata_response.result.metadata_to_df(), {} 65 | 66 | 67 | def _execute_efetcher(email, n_jobs, run_ids, log_level): 68 | efetcher = ef.Efetcher( 69 | "efetcher", email, apikey=None, apikey_var=None, threads=n_jobs, qid=None 70 | ) 71 | set_up_entrezpy_logging(efetcher, log_level) 72 | 73 | return _efetcher_inquire(efetcher, run_ids, log_level) 74 | 75 | 76 | def _get_run_meta( 77 | email, n_jobs, run_ids, validated, log_level, logger 78 | ) -> (pd.DataFrame, dict): 79 | if not validated: 80 | invalid_ids = _validate_run_ids(email, n_jobs, run_ids, log_level) 81 | valid_ids = sorted(list(set(run_ids) - set(invalid_ids.keys()))) 82 | 83 | if not valid_ids: 84 | raise InvalidIDs("All provided IDs were invalid. Please check your input.") 85 | if invalid_ids: 86 | logger.warning( 87 | f"The following provided IDs are invalid: " 88 | f'{",".join(invalid_ids.keys())}. Please correct them and ' 89 | f"try fetching those independently." 
90 | ) 91 | else: 92 | # we assume that IDs retrieved by linking from aggregate IDs 93 | # (e.g., BioProject or study) should only return valid IDs, 94 | # since we asked NCBI to get those for us 95 | valid_ids = run_ids 96 | 97 | # fetch metadata 98 | logger.info("Fetching metadata for %i run IDs.", len(valid_ids)) 99 | meta_df, missing_ids = _execute_efetcher(email, n_jobs, valid_ids, log_level) 100 | 101 | if missing_ids: 102 | logger.warning( 103 | "Metadata for the following run IDs could not be fetched: " 104 | f'{",".join(missing_ids.keys())}. ' 105 | f"Please try fetching those independently." 106 | ) 107 | 108 | return meta_df, missing_ids 109 | 110 | 111 | def _get_other_meta( 112 | email, n_jobs, project_ids, id_type, log_level, logger 113 | ) -> (pd.DataFrame, dict): 114 | run_ids = _get_run_ids(email, n_jobs, project_ids, None, id_type, log_level) 115 | 116 | return _get_run_meta(email, n_jobs, run_ids, True, log_level, logger) 117 | 118 | 119 | def _find_doi_mapping_and_type(mapping_doi_ids: Metadata) -> (pd.Series, str): 120 | """If present, save DOI name to ID mappings together with type 121 | of IDs the DOI names are matching to. 122 | 123 | Args: 124 | mapping_doi_ids (Metadata): Table of accession IDs with 125 | associated DOI names. 126 | Returns: 127 | pd.Series: Series of DOI names with matched accession IDs. 128 | str: Type of accession IDs in matching. 129 | """ 130 | id2doi = mapping_doi_ids.to_dataframe().iloc[:, 0] 131 | doi_ids = sorted(list(mapping_doi_ids.get_ids())) 132 | id2doi_type = _determine_id_type(doi_ids) 133 | 134 | return (id2doi, id2doi_type) 135 | 136 | 137 | def get_metadata( 138 | accession_ids: Metadata, 139 | email: str, 140 | threads: int = 1, 141 | log_level: str = "INFO", 142 | linked_doi: Metadata = None, 143 | ) -> (pd.DataFrame, pd.DataFrame): 144 | """Fetches metadata using the provided run/bioproject/study/sample or 145 | experiment accession IDs. 
146 | 147 | If aggregate IDs (such as bioproject, study, sample, experiment IDs) were 148 | provided, first run IDs will be fetched using a Conduit Pipeline. 149 | The run IDs will be validated using an ESearch query. The metadata will 150 | be fetched only for the valid run IDs. Invalid run IDs will be raised 151 | with a warning. Run IDs for which the metadata could not be fetched will 152 | be returned with the corresponding error message as missing_ids. 153 | 154 | Args: 155 | accession_ids (Metadata): Table of all the accession IDs 156 | to be fetched (either run, bioproject, study, sample or 157 | experiment IDs). If table does not contain DOI names, names 158 | from `linked_doi` will be matched. 159 | linked_doi (Metadata): Optional table of accession IDs with 160 | associated DOI names. Preferably used when refetching failed 161 | run IDs that can be matched after metadata was fetched 162 | successfully. Ignored if `accession_ids` already contains DOI 163 | names. 164 | email (str): A valid e-mail address (required by NCBI). 165 | threads (int, default=1): Number of threads to be used in parallel. 166 | log_level (str, default='INFO'): Logging level. 167 | 168 | Returns: 169 | pd.DataFrame: DataFrame with metadata obtained for the provided IDs. 170 | pd.DataFrame: DataFrame with runs IDs for which no metadata was 171 | fetched and the associated error messages. 
172 | """ 173 | logger = set_up_logger(log_level, logger_name=__name__) 174 | 175 | # extract DOI names to IDs mapping for later 176 | if any(x in accession_ids.columns for x in ["doi", "DOI"]): 177 | id2doi, id2doi_type = _find_doi_mapping_and_type(accession_ids) 178 | elif linked_doi and any(x in linked_doi.columns for x in ["doi", "DOI"]): 179 | id2doi, id2doi_type = _find_doi_mapping_and_type(linked_doi) 180 | else: 181 | id2doi, id2doi_type = None, None 182 | 183 | # Retrieve input IDs 184 | accession_ids = sorted(list(accession_ids.get_ids())) 185 | 186 | # figure out which id type we're dealing with 187 | id_type = _determine_id_type(accession_ids) 188 | 189 | # get actual metadata 190 | if id_type == "run": 191 | meta, missing_ids = _get_run_meta( 192 | email, threads, accession_ids, False, log_level, logger 193 | ) 194 | else: 195 | meta, missing_ids = _get_other_meta( 196 | email, threads, accession_ids, id_type, log_level, logger 197 | ) 198 | 199 | # match DOI names to metadata if present 200 | match_study_meta = { 201 | "bioproject": "Bioproject ID", 202 | "study": "Study ID", 203 | "experiment": "Experiment ID", 204 | "sample": "Sample Accession", 205 | } 206 | if id2doi is not None and id2doi_type == "run": 207 | meta = meta.join(id2doi, how="left") 208 | elif id2doi is not None and id2doi_type != "run": 209 | meta = meta.merge( 210 | id2doi, how="left", left_on=match_study_meta[id2doi_type], right_index=True 211 | ) 212 | 213 | missing_ids = pd.DataFrame( 214 | data={"Error message": missing_ids.values()}, 215 | index=pd.Index(missing_ids.keys(), name="ID"), 216 | ) 217 | return meta, missing_ids 218 | 219 | 220 | def merge_metadata(metadata: pd.DataFrame) -> pd.DataFrame: 221 | """Merges provided multiple metadata into a single metadata object. 222 | 223 | Args: 224 | metadata (pd.DataFrame): List of metadata DataFrames to be merged. 225 | 226 | Returns: 227 | metadata_merged (pd.DataFrame): Final metadata DataFrame. 
228 | """ 229 | logger = set_up_logger("INFO", logger_name=__name__) 230 | logger.info("Merging %s metadata DataFrames.", len(metadata)) 231 | 232 | metadata_merged = pd.concat(metadata, axis=0, join="outer") 233 | 234 | records_count = metadata_merged.shape[0] 235 | metadata_merged.drop_duplicates(inplace=True) 236 | if records_count != metadata_merged.shape[0]: 237 | logger.info( 238 | "%s duplicate record(s) found in the metadata " "were dropped.", 239 | records_count - metadata_merged.shape[0], 240 | ) 241 | 242 | if len(metadata_merged.index) != len(set(metadata_merged.index)): 243 | logger.warning( 244 | "Records with same IDs but differing values were found in " 245 | "the metadata and will not be removed." 246 | ) 247 | 248 | logger.info( 249 | "Merged metadata DataFrame has %s rows and %s columns.", 250 | metadata_merged.shape[0], 251 | metadata_merged.shape[1], 252 | ) 253 | 254 | return metadata_merged 255 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/metadata_response_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": { 3 | "@accession": "ERX3980916", 4 | "@alias": "ena-EXPERIMENT-UNIVERSITY OF HOHENHEIM-06-03-2020-13:37:12:076-1", 5 | "@center_name": "UNIVERSITY OF HOHENHEIM", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "ERX3980916" 8 | }, 9 | "TITLE": "Illumina MiSeq sequencing", 10 | "STUDY_REF": { 11 | "@accession": "ERP120343", 12 | "IDENTIFIERS": { 13 | "PRIMARY_ID": "ERP120343", 14 | "EXTERNAL_ID": { 15 | "@namespace": "BioProject", 16 | "#text": "PRJEB37054" 17 | } 18 | } 19 | }, 20 | "DESIGN": { 21 | "DESIGN_DESCRIPTION": null, 22 | "SAMPLE_DESCRIPTOR": { 23 | "@accession": "ERS4372624", 24 | "IDENTIFIERS": { 25 | "PRIMARY_ID": "ERS4372624", 26 | "EXTERNAL_ID": { 27 | "@namespace": "BioSample", 28 | "#text": "SAMEA6608408" 29 | } 30 | } 31 | }, 32 | "LIBRARY_DESCRIPTOR": { 33 | "LIBRARY_NAME": "unspecified", 34 | 
"LIBRARY_STRATEGY": "AMPLICON", 35 | "LIBRARY_SOURCE": "METAGENOMIC", 36 | "LIBRARY_SELECTION": "PCR", 37 | "LIBRARY_LAYOUT": { 38 | "SINGLE": null 39 | } 40 | } 41 | }, 42 | "PLATFORM": { 43 | "ILLUMINA": { 44 | "INSTRUMENT_MODEL": "Illumina MiSeq" 45 | } 46 | }, 47 | "EXPERIMENT_ATTRIBUTES": { 48 | "EXPERIMENT_ATTRIBUTE": [ 49 | { 50 | "TAG": "Temperature", 51 | "VALUE": "12" 52 | }, 53 | { 54 | "TAG": "Depth", 55 | "VALUE": "500" 56 | } 57 | ] 58 | } 59 | }, 60 | "SUBMISSION": { 61 | "@accession": "ERA2402167", 62 | "@alias": "ena-SUBMISSION-UNIVERSITY OF HOHENHEIM-06-03-2020-13:27:09:756-1", 63 | "@center_name": "UNIVERSITY OF HOHENHEIM", 64 | "@lab_name": "European Nucleotide Archive", 65 | "IDENTIFIERS": { 66 | "PRIMARY_ID": "ERA2402167" 67 | }, 68 | "TITLE": "Submitted by UNIVERSITY OF HOHENHEIM on 06-MAR-2020" 69 | }, 70 | "Organization": { 71 | "@type": "center", 72 | "Name": { 73 | "@abbr": "University of Hohenheim", 74 | "#text": "University of Hohenheim" 75 | } 76 | }, 77 | "STUDY": { 78 | "@accession": "ERP120343", 79 | "@alias": "ena-STUDY-UNIVERSITY OF HOHENHEIM-04-03-2020-12:54:47:240-944", 80 | "@center_name": "UNIVERSITY OF HOHENHEIM", 81 | "IDENTIFIERS": { 82 | "PRIMARY_ID": "ERP120343", 83 | "EXTERNAL_ID": { 84 | "@namespace": "BioProject", 85 | "#text": "PRJEB37054" 86 | } 87 | }, 88 | "DESCRIPTOR": { 89 | "STUDY_TITLE": "The microbial load, diversity and composition of\n the wine microbiota is affected by wine type and\n environmental-stress factors", 90 | "STUDY_TYPE": { 91 | "@existing_study_type": "Other" 92 | }, 93 | "STUDY_ABSTRACT": "In order to improve the understanding of the\n composition, organization and temporal dynamics of the wine\n microbiota, the relative and absolute bacterial wine\n microbiota composition during the first week of\n fermentation was determined, including distinct red and\n white wine cultivars, by 16S rRNA gene amplicon sequencing.", 94 | "CENTER_PROJECT_NAME": "Wine microbiota analysis during\n fermentation", 
95 | "STUDY_DESCRIPTION": "In order to improve the understanding of the\n composition, organization and temporal dynamics of the wine\n microbiota, the relative and absolute bacterial wine\n microbiota composition during the first week of\n fermentation was determined, including distinct red and\n white wine cultivars, by 16S rRNA gene amplicon sequencing." 96 | }, 97 | "STUDY_ATTRIBUTES": { 98 | "STUDY_ATTRIBUTE": [ 99 | { 100 | "TAG": "ENA-FIRST-PUBLIC", 101 | "VALUE": "2020-05-31" 102 | }, 103 | { 104 | "TAG": "ENA-LAST-UPDATE", 105 | "VALUE": "2020-03-04" 106 | } 107 | ] 108 | } 109 | }, 110 | "SAMPLE": { 111 | "@accession": "ERS4372624", 112 | "@alias": "BAC1.D1.0.32A", 113 | "@center_name": "UNIVERSITY OF HOHENHEIM", 114 | "IDENTIFIERS": { 115 | "PRIMARY_ID": "ERS4372624", 116 | "EXTERNAL_ID": { 117 | "@namespace": "BioSample", 118 | "#text": "SAMEA6608408" 119 | } 120 | }, 121 | "TITLE": "Vitis vinifera", 122 | "SAMPLE_NAME": { 123 | "TAXON_ID": "29760", 124 | "SCIENTIFIC_NAME": "Vitis vinifera", 125 | "COMMON_NAME": "wine grape" 126 | }, 127 | "SAMPLE_ATTRIBUTES": { 128 | "SAMPLE_ATTRIBUTE": [ 129 | { 130 | "TAG": "environment (biome)", 131 | "VALUE": "berry plant" 132 | }, 133 | { 134 | "TAG": "geographic location (country and/or sea)", 135 | "VALUE": "Germany" 136 | }, 137 | { 138 | "TAG": "sample storage temperature", 139 | "VALUE": "-80", 140 | "UNITS": "°C" 141 | } 142 | ] 143 | } 144 | }, 145 | "Pool": { 146 | "Member": { 147 | "@member_name": "", 148 | "@accession": "ERS4372624", 149 | "@sample_name": "BAC1.D1.0.32A", 150 | "@sample_title": "Vitis vinifera", 151 | "@spots": "39323", 152 | "@bases": "11552099", 153 | "@tax_id": "29760", 154 | "@organism": "Vitis vinifera", 155 | "IDENTIFIERS": { 156 | "PRIMARY_ID": "ERS4372624", 157 | "EXTERNAL_ID": { 158 | "@namespace": "BioSample", 159 | "#text": "SAMEA6608408" 160 | } 161 | } 162 | } 163 | }, 164 | "RUN_SET": { 165 | "RUN": { 166 | "@accession": "FAKEID1", 167 | "@alias": "ena-RUN-UNIVERSITY OF 
HOHENHEIM-06-03-2020-13:37:12:076-1", 168 | "@center_name": "UNIVERSITY OF HOHENHEIM", 169 | "@total_spots": "39323", 170 | "@total_bases": "11552099", 171 | "@size": "3914295", 172 | "@load_done": "true", 173 | "@published": "2020-06-01 17:54:43", 174 | "@is_public": "true", 175 | "@cluster_name": "public", 176 | "@static_data_available": "1", 177 | "IDENTIFIERS": { 178 | "PRIMARY_ID": "FAKEID1" 179 | }, 180 | "TITLE": "Illumina MiSeq sequencing", 181 | "EXPERIMENT_REF": { 182 | "@accession": "ERX3980916", 183 | "IDENTIFIERS": { 184 | "PRIMARY_ID": "ERX3980916" 185 | } 186 | }, 187 | "RUN_ATTRIBUTES": { 188 | "RUN_ATTRIBUTE": [ 189 | { 190 | "TAG": "ENA-FIRST-PUBLIC", 191 | "VALUE": "2020-05-31" 192 | }, 193 | { 194 | "TAG": "ENA-LAST-UPDATE", 195 | "VALUE": "2020-03-06" 196 | } 197 | ] 198 | }, 199 | "Pool": { 200 | "Member": { 201 | "@member_name": "", 202 | "@accession": "ERS4372624", 203 | "@sample_name": "BAC1.D1.0.32A", 204 | "@sample_title": "Vitis vinifera", 205 | "@spots": "39323", 206 | "@bases": "11552099", 207 | "@tax_id": "29760", 208 | "@organism": "Vitis vinifera", 209 | "IDENTIFIERS": { 210 | "PRIMARY_ID": "ERS4372624", 211 | "EXTERNAL_ID": { 212 | "@namespace": "BioSample", 213 | "#text": "SAMEA6608408" 214 | } 215 | } 216 | } 217 | }, 218 | "SRAFiles": { 219 | "SRAFile": { 220 | "@cluster": "public", 221 | "@filename": "FAKEID1", 222 | "@url": "https://sra-download.ncbi.nlm.nih.gov/traces/era16/ERR/ERR3978/FAKEID1", 223 | "@size": "3915680", 224 | "@date": "2020-06-01 19:51:45", 225 | "@md5": "d92e4c21e26e5f2bd2cdaf56cfcfeaa0", 226 | "@semantic_name": "run", 227 | "@supertype": "Primary ETL", 228 | "@sratoolkit": "1", 229 | "Alternatives": [ 230 | { 231 | "@url": "https://sra-download.ncbi.nlm.nih.gov/traces/era16/ERR/ERR3978/FAKEID1", 232 | "@free_egress": "worldwide", 233 | "@access_type": "anonymous", 234 | "@org": "NCBI" 235 | }, 236 | { 237 | "@url": "https://sra-pub-run-odp.s3.amazonaws.com/sra/FAKEID1/FAKEID1", 238 | "@free_egress": 
"worldwide", 239 | "@access_type": "anonymous", 240 | "@org": "AWS" 241 | }, 242 | { 243 | "@url": "gs://sra-pub-run-8/FAKEID1/FAKEID1.1", 244 | "@free_egress": "gs.US", 245 | "@access_type": "gcp identity", 246 | "@org": "GCP" 247 | } 248 | ] 249 | } 250 | }, 251 | "CloudFiles": { 252 | "CloudFile": [ 253 | { 254 | "@filetype": "run", 255 | "@provider": "gs", 256 | "@location": "gs.US" 257 | }, 258 | { 259 | "@filetype": "run", 260 | "@provider": "s3", 261 | "@location": "s3.us-east-1" 262 | } 263 | ] 264 | }, 265 | "Statistics": { 266 | "@nreads": "1", 267 | "@nspots": "39323", 268 | "Read": { 269 | "@index": "0", 270 | "@count": "39323", 271 | "@average": "293.77", 272 | "@stdev": "20.23" 273 | } 274 | }, 275 | "Bases": { 276 | "@cs_native": "false", 277 | "@count": "11552099", 278 | "Base": [ 279 | { 280 | "@value": "A", 281 | "@count": "3143257" 282 | }, 283 | { 284 | "@value": "C", 285 | "@count": "2405184" 286 | }, 287 | { 288 | "@value": "G", 289 | "@count": "3867631" 290 | }, 291 | { 292 | "@value": "T", 293 | "@count": "2136027" 294 | }, 295 | { 296 | "@value": "N", 297 | "@count": "0" 298 | } 299 | ] 300 | } 301 | } 302 | } 303 | } -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_sra_meta.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
# ----------------------------------------------------------------------------

from abc import abstractmethod, ABCMeta
from dataclasses import dataclass, field
from typing import Union, List

import pandas as pd

from q2_fondue.entrezpy_clients._utils import get_attrs


# Columns that every generated SRA metadata table must contain.
META_REQUIRED_COLUMNS = [
    "Experiment ID",
    "Biosample ID",
    "Bioproject ID",
    "Study ID",
    "Sample Accession",
    "Organism",
    "Library Source",
    "Library Layout",
    "Library Selection",
    "Instrument",
    "Platform",
    "Bases",
    "Spots",
    "Avg Spot Len",
    "Bytes",
    "Public",
]


@dataclass
class LibraryMetadata:
    """A class for storing sequencing library metadata.

    Attributes:
        name (str): Name of the library.
        layout (str): Library layout (e.g. SINGLE/PAIRED).
        selection (str): Library selection method.
        source (str): Library source.
    """

    name: str
    layout: str
    selection: str
    source: str

    def generate_meta(self) -> pd.DataFrame:
        """Generates a one-row DataFrame of all library attributes.

        Column names are the attribute names prefixed with 'library_'.

        Returns:
            pd.DataFrame: Library metadata.
        """
        index = get_attrs(self)
        return pd.DataFrame(
            data=[getattr(self, k) for k in index],
            index=[f"library_{x}" for x in index],
        ).T


@dataclass
class SRABaseMeta(metaclass=ABCMeta):
    """A base class for generation of SRA metadata objects.

    Attributes:
        id (str): Unique ID of the metadata object.
        custom_meta (Union[dict, None]): Custom metadata belonging
            to the object, if any.
        child (str): a one-word description of the child type for
            the given object (e.g., a 'sample' is a child of a 'study').
    """

    id: str
    custom_meta: Union[dict, None]
    child: str = None

    def __post_init__(self):
        """Initializes custom metadata DataFrame."""
        if self.custom_meta:
            self.custom_meta_df = pd.DataFrame(self.custom_meta, index=[self.id])
        else:
            self.custom_meta_df = None

    def __eq__(self, other):
        """Compares all attributes. To be used on subclasses that contain
        DataFrames as attributes."""
        # comparing against an unrelated type should defer to the other
        # operand rather than raise an AttributeError below
        if other.__class__ is not self.__class__:
            return NotImplemented
        same = {}
        for k, v in vars(self).items():
            other_v = other.__getattribute__(k)
            if isinstance(v, pd.DataFrame):
                # DataFrame == DataFrame is elementwise/ambiguous;
                # use .equals (also returns False for non-DataFrames)
                same[k] = v.equals(other_v)
            elif isinstance(other_v, pd.DataFrame):
                # symmetric guard: self holds None here but other
                # holds a DataFrame - the attributes differ
                same[k] = False
            else:
                same[k] = v == other_v
        return all(same.values())

    def get_base_metadata(self, excluded: tuple) -> pd.DataFrame:
        """Generates a DataFrame containing basic metadata of the SRA object.

        The metadata generated by this method do not contain any of the
        metadata belonging to any of the object's children.

        Args:
            excluded (tuple): attributes to be excluded during metadata
                DataFrame generation
        Returns:
            base_meta (pd.DataFrame): Requested base metadata.
        """
        index = get_attrs(
            self, excluded=("child", "custom_meta", "custom_meta_df") + excluded
        )
        base_meta = pd.DataFrame(
            data={k: getattr(self, k) for k in index}, index=[self.id]
        )

        if self.custom_meta:
            base_meta = pd.concat(
                [base_meta, self.custom_meta_df],
                axis=1,
            )

        return base_meta

    def get_child_metadata(self) -> pd.DataFrame:
        """Generates a DataFrame containing metadata of all the
        children SRA objects.

        Returns:
            child_meta (pd.DataFrame): Requested children objects' metadata.
        """
        # e.g. child == 'run' -> collect from the 'runs' attribute
        child_meta_dfs = [
            x.generate_meta() for x in self.__getattribute__(f"{self.child}s")
        ]
        if child_meta_dfs:
            child_meta = pd.concat(child_meta_dfs)
        else:
            child_meta = pd.DataFrame()
        child_meta.index.name = f"{self.child}_id"
        return child_meta

    @abstractmethod
    def generate_meta(self) -> pd.DataFrame:
        """Generates a DataFrame with all metadata.

        Metadata from current object will be collected and merged together
        with metadata gathered from all of its children.

        Returns:
            pd.DataFrame: DataFrame containing all metadata.
        """
        pass


@dataclass(eq=False)
class SRARun(SRABaseMeta):
    """A class containing all the SRA run metadata.

    Attributes:
        public (bool): True if the dataset was public.
        bytes (int): Size of the run dataset.
        bases (int): Nucleotide count of the run dataset.
        spots (int): Spot count of the run dataset.
        avg_spot_len (int): Average spot length.
        experiment_id (str): ID of the experiment which the run belongs to.
        child (str): Run's child type (None, as runs have no children objects).
    """

    public: bool = True
    bytes: int = None
    bases: int = None
    spots: int = None
    avg_spot_len: int = None
    experiment_id: str = None
    child: str = None

    def __post_init__(self):
        """Calculates an average spot length."""
        super().__post_init__()
        # guard against the None defaults: 'None > 0' and 'None / x'
        # would otherwise raise a TypeError
        if self.spots and self.bases:
            self.avg_spot_len = int(self.bases / self.spots)
        else:
            self.avg_spot_len = 0

    def generate_meta(self) -> pd.DataFrame:
        """Generates run's metadata.

        Returns:
            pd.DataFrame: Run's metadata.
        """
        return self.get_base_metadata(excluded=("id",))


@dataclass(eq=False)
class SRAExperiment(SRABaseMeta):
    """A class containing all the SRA experiment metadata.

    Attributes:
        instrument (str): Sequencing instrument name.
        platform (str): Sequencing platform name.
        library (LibraryMetadata): Metadata of the sequencing library.
        runs (List[SRARun]): All SRA runs belonging to this experiment.
        sample_id (str): ID of the sample which the experiment belongs to.
        child (str): Runs are children of experiment objects.
    """

    instrument: str = None
    platform: str = None
    library: LibraryMetadata = None
    runs: List[SRARun] = field(default_factory=list)
    sample_id: str = None
    child: str = "run"

    def generate_meta(self) -> pd.DataFrame:
        """Generates experiment's metadata.

        Generated metadata will include all metadata of the linked runs.

        Returns:
            pd.DataFrame: Experiment's metadata with all of its children.
        """
        exp_meta = self.get_base_metadata(excluded=("id", "runs", "library"))
        # library defaults to None - only append its columns when present
        if self.library is not None:
            lib_meta = self.library.generate_meta()
            lib_meta.index = exp_meta.index
            exp_meta = pd.concat([exp_meta, lib_meta], axis=1)

        runs_meta = self.get_child_metadata()
        if len(runs_meta) > 0:
            runs_merged = runs_meta.merge(
                exp_meta, left_on="experiment_id", right_index=True
            )
            runs_merged.index.name = "run_id"
            return runs_merged
        else:
            return exp_meta


@dataclass(eq=False)
class SRASample(SRABaseMeta):
    """A class containing all the SRA sample metadata.

    Attributes:
        name (str): Name of the sample.
        title (str): Title of the sample.
        biosample_id (str): BioSample ID linked to the sample.
        organism (str): Organism name.
        tax_id (str): Organism taxonomic ID.
        study_id (str): ID of the study which the sample belongs to.
        experiments (List[SRAExperiment]): All SRA experiments
            belonging to the sample.
        child (str): Experiments are children of sample objects.
    """

    name: str = None
    title: str = None
    biosample_id: str = None
    organism: str = None
    tax_id: str = None
    study_id: str = None
    experiments: List[SRAExperiment] = field(default_factory=list)
    child: str = "experiment"

    def generate_meta(self) -> pd.DataFrame:
        """Generates SRA sample's metadata.

        Generated metadata will include all metadata of the linked experiments.

        Returns:
            pd.DataFrame: Sample's metadata with all of its children.
        """
        sample_meta = self.get_base_metadata(excluded=("id", "experiments"))
        exps_meta = self.get_child_metadata()
        if len(exps_meta) > 0:
            exps_merged = exps_meta.merge(
                sample_meta, left_on="sample_id", right_index=True
            )
            exps_merged.index.name = "run_id"
            return exps_merged
        else:
            return sample_meta


@dataclass(eq=False)
class SRAStudy(SRABaseMeta):
    """A class containing all the SRA study metadata.

    Attributes:
        bioproject_id (str): ID of the linked BioProject.
        center_name (str): Name of the center where the study was performed.
        samples (List[SRASample]): All SRA samples belonging to the study.
        child (str): Samples are children of study objects.
    """

    bioproject_id: str = None
    center_name: str = None
    samples: List[SRASample] = field(default_factory=list)
    child: str = "sample"

    def generate_meta(self) -> pd.DataFrame:
        """Generates SRA study's metadata.

        Generated metadata will include all metadata of the linked samples.

        Returns:
            pd.DataFrame: Study's metadata with all of its children.
        """
        study_meta = self.get_base_metadata(excluded=("id", "samples"))
        samples_meta = self.get_child_metadata()
        if len(samples_meta) > 0:
            samples_merged = samples_meta.merge(
                study_meta, left_on="study_id", right_index=True
            )
            samples_merged.index.name = "run_id"
            return samples_merged
        else:
            return study_meta