├── q2_fondue ├── tests │ ├── data │ │ ├── testaccA.sra │ │ ├── SRR123456.sra │ │ ├── SRR123457.sra │ │ ├── SRP123456_md.tsv │ │ ├── SRS123456_md.tsv │ │ ├── SRX123456_md.tsv │ │ ├── PRJNA734376_md.tsv │ │ ├── SRR123456_md.tsv │ │ ├── SRR123457_md.tsv │ │ ├── study_ids.tsv │ │ ├── testaccB_md.tsv │ │ ├── testaccC_md.tsv │ │ ├── bioproject_ids.tsv │ │ ├── run_ids.tsv │ │ ├── sample_ids.tsv │ │ ├── experiment_ids.tsv │ │ ├── sample_ids_w_doi.tsv │ │ ├── study_ids_w_doi.tsv │ │ ├── SRR1234567_md.tsv │ │ ├── bioproject_ids_w_doi.tsv │ │ ├── experiment_ids_w_doi.tsv │ │ ├── failed_ids_no_doi.tsv │ │ ├── testaccBC_md.tsv │ │ ├── run_ids_w_doi.tsv │ │ ├── metadata_response_error.xml │ │ ├── run_ids_w_doi_2.tsv │ │ ├── empty │ │ │ ├── xxx_00_L001_R1_001.fastq.gz │ │ │ └── xxx_00_L001_R2_001.fastq.gz │ │ ├── paired1 │ │ │ ├── SEQID1_00_L001_R1_001.fastq.gz │ │ │ ├── SEQID1_00_L001_R2_001.fastq.gz │ │ │ ├── SEQID2_00_L001_R1_001.fastq.gz │ │ │ └── SEQID2_00_L001_R2_001.fastq.gz │ │ ├── paired2 │ │ │ ├── SEQID3_00_L001_R1_001.fastq.gz │ │ │ ├── SEQID3_00_L001_R2_001.fastq.gz │ │ │ ├── SEQID4_00_L001_R1_001.fastq.gz │ │ │ └── SEQID4_00_L001_R2_001.fastq.gz │ │ ├── single1 │ │ │ ├── SEQID1_00_L001_R1_001.fastq.gz │ │ │ └── SEQID2_00_L001_R1_001.fastq.gz │ │ ├── single2 │ │ │ ├── SEQID3_00_L001_R1_001.fastq.gz │ │ │ └── SEQID4_00_L001_R1_001.fastq.gz │ │ ├── SRR123457_2.fastq │ │ ├── testacc_2.fastq │ │ ├── testacc_00_L001_R2_001.fastq │ │ ├── elink_response_single.json │ │ ├── esearch_response_single_ambiguous.json │ │ ├── esearch_response_single_correct.json │ │ ├── sra-metadata-1.tsv │ │ ├── sra-metadata-2.tsv │ │ ├── sra-metadata-3.tsv │ │ ├── esearch_response_multi_invalid.json │ │ ├── sra-metadata-4.tsv │ │ ├── sra-metadata-failed-ids.tsv │ │ ├── sra-metadata-5.tsv │ │ ├── sra-metadata-6.tsv │ │ ├── sra-metadata-7.tsv │ │ ├── sra-metadata-8.tsv │ │ ├── sra-metadata-mock.tsv │ │ ├── sra-metadata-exp-4.tsv │ │ ├── esearch_response_multi_correct.json │ │ ├── 
esearch_response_multi_mixed.json │ │ ├── sra-metadata-exp-2.tsv │ │ ├── sra-metadata-exp-3.tsv │ │ ├── sra-metadata-exp-5.tsv │ │ ├── sra-metadata-exp-1.tsv │ │ ├── testaccHYB.fastq │ │ ├── fasterq-dump-response.txt │ │ ├── SRR123456.fastq │ │ ├── testaccA.fastq │ │ ├── testacc_1.fastq │ │ ├── SRR123457_1.fastq │ │ ├── testaccA_01_L001_R1_001.fastq │ │ ├── testacc_00_L001_R1_001.fastq │ │ ├── testaccHYB_2.fastq │ │ ├── testaccHYB_1.fastq │ │ ├── scraper_items_no_doi.json │ │ ├── efetch_b2_response_runs.xml │ │ ├── efetch_b1_response_runs.xml │ │ ├── efetch_response_runs_single_item.xml │ │ ├── metadata_processed_multi.json │ │ ├── efetch_response_runs.xml │ │ ├── scraper_items_no_attach.json │ │ └── metadata_response_small.json │ ├── __init__.py │ ├── test_query.py │ ├── test_get_all.py │ ├── test_esearch.py │ ├── _utils.py │ └── test_utils.py ├── types │ ├── tests │ │ ├── data │ │ │ ├── sra-failed-ids-empty.tsv │ │ │ ├── ncbi-ids-wrong.tsv │ │ │ ├── ncbi-ids-bioprojects.tsv │ │ │ ├── ncbi-ids-other.tsv │ │ │ ├── ncbi-ids-studies.tsv │ │ │ ├── ncbi-ids-runs.tsv │ │ │ ├── ncbi-ids-runs-wrong-id-header.tsv │ │ │ ├── ncbi-ids-runs-doi.tsv │ │ │ ├── ncbi-ids-runs-no-doi.tsv │ │ │ ├── sra-failed-ids.tsv │ │ │ ├── sra-metadata-missing-columns.tsv │ │ │ ├── sra-metadata-missing-ids.tsv │ │ │ └── sra-metadata.tsv │ │ └── __init__.py │ ├── _type.py │ ├── __init__.py │ ├── _transformer.py │ └── _format.py ├── entrezpy_clients │ ├── __init__.py │ ├── _utils.py │ ├── _esearch.py │ ├── _pipelines.py │ └── _sra_meta.py ├── __init__.py ├── query.py ├── get_all.py ├── citations.bib ├── utils.py └── metadata.py ├── .gitattributes ├── tutorial ├── metadata_file.tsv └── metadata_file_runs.tsv ├── setup.cfg ├── logo.png ├── .github └── workflows │ ├── join-release.yaml │ ├── tag-release.yaml │ ├── dependecies.yaml │ ├── ci.yaml │ ├── q2-ci.yaml │ ├── dependent-issues.yaml │ └── docker-push.yaml ├── parallel.config ├── .coveragerc ├── .copier-answers.yml ├── Makefile ├── conda-recipe 
└── meta.yaml ├── .gitignore ├── pyproject.toml ├── LICENSE ├── Dockerfile └── install-sra-tools.sh /q2_fondue/tests/data/testaccA.sra: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123456.sra: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123457.sra: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | pyproject.toml export-subst 2 | -------------------------------------------------------------------------------- /tutorial/metadata_file.tsv: -------------------------------------------------------------------------------- 1 | id 2 | PRJEB14186 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRP123456_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRP123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRS123456_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRS123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRX123456_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRX123456 3 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/sra-failed-ids-empty.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | 
-------------------------------------------------------------------------------- /q2_fondue/tests/data/PRJNA734376_md.tsv: -------------------------------------------------------------------------------- 1 | id 2 | PRJNA734376 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123456_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123457_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123457 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/study_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | ERP12345 3 | SRP23456 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccB_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccC_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123457 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/bioproject_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | PRJNA123 3 | PRJNA234 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/run_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRR123 3 | SRR234 4 | SRR345 5 | -------------------------------------------------------------------------------- 
/q2_fondue/tests/data/sample_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | ERS147978 3 | ERS3588233 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203 4 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/logo.png -------------------------------------------------------------------------------- /q2_fondue/tests/data/experiment_ids.tsv: -------------------------------------------------------------------------------- 1 | id 2 | ERX115020 3 | SRX10331465 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sample_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | id DOI 2 | SRS000100 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/study_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | id DOI 2 | SRP000001 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-wrong.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | ABC123 3 | SRX098 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR1234567_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | SRR123457 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/bioproject_ids_w_doi.tsv: 
-------------------------------------------------------------------------------- 1 | ID DOI 2 | PRJNA33627 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/experiment_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | id DOI 2 | SRX000007 some_doi1 3 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/failed_ids_no_doi.tsv: -------------------------------------------------------------------------------- 1 | id 2 | SRR000001 3 | SRR000002 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccBC_md.tsv: -------------------------------------------------------------------------------- 1 | sample-id 2 | SRR123456 3 | SRR123457 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-bioprojects.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | PRJ1234 3 | PRJ56789 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-other.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | ERX115020 3 | ERS115020 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-studies.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | ERP104978 3 | SRP123456 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/run_ids_w_doi.tsv: -------------------------------------------------------------------------------- 1 | ID DOI 2 | SRR000001 some_doi1 3 | SRR000002 some_doi2 4 | -------------------------------------------------------------------------------- 
/q2_fondue/tests/data/metadata_response_error.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/run_ids_w_doi_2.tsv: -------------------------------------------------------------------------------- 1 | ID DOI 2 | SRR123456 some_doi1 3 | SRR123457 some_doi2 4 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs.tsv: -------------------------------------------------------------------------------- 1 | ID 2 | SRR000013 3 | SRR000001 4 | ERR3978173 5 | ERR3978174 6 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs-wrong-id-header.tsv: -------------------------------------------------------------------------------- 1 | wrongID 2 | SRR000013 3 | SRR000001 4 | ERR3978173 5 | ERR3978174 6 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs-doi.tsv: -------------------------------------------------------------------------------- 1 | ID DOI 2 | SRR000013 some_doi1 3 | SRR000001 some_doi2 4 | ERR3978173 some_doi3 5 | ERR3978174 some_doi4 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/empty/xxx_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/empty/xxx_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/empty/xxx_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/empty/xxx_00_L001_R2_001.fastq.gz 
-------------------------------------------------------------------------------- /q2_fondue/types/tests/data/ncbi-ids-runs-no-doi.tsv: -------------------------------------------------------------------------------- 1 | ID FUNFACT 2 | SRR000013 some_doi1 3 | SRR000001 some_doi2 4 | ERR3978173 some_doi3 5 | ERR3978174 some_doi4 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID1_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID1_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID1_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID1_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID2_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID2_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired1/SEQID2_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired1/SEQID2_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID3_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID3_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID3_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID3_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID4_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID4_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/paired2/SEQID4_00_L001_R2_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/paired2/SEQID4_00_L001_R2_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single1/SEQID1_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single1/SEQID1_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single1/SEQID2_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single1/SEQID2_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single2/SEQID3_00_L001_R1_001.fastq.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single2/SEQID3_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/tests/data/single2/SEQID4_00_L001_R1_001.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bokulich-lab/q2-fondue/HEAD/q2_fondue/tests/data/single2/SEQID4_00_L001_R1_001.fastq.gz -------------------------------------------------------------------------------- /q2_fondue/types/tests/data/sra-failed-ids.tsv: -------------------------------------------------------------------------------- 1 | ID Error message 2 | SRR000020 ID is ambiguous. 3 | SRR000021 ID is invalid. 4 | ERR0000020 ID is ambiguous. 5 | ERR0000021 ID is invalid. 6 | -------------------------------------------------------------------------------- /.github/workflows/join-release.yaml: -------------------------------------------------------------------------------- 1 | name: join-release 2 | on: 3 | workflow_dispatch: {} 4 | jobs: 5 | release: 6 | uses: qiime2/distributions/.github/workflows/lib-join-release.yaml@dev 7 | -------------------------------------------------------------------------------- /.github/workflows/tag-release.yaml: -------------------------------------------------------------------------------- 1 | name: tag-release 2 | on: 3 | push: 4 | branches: ["Release-*"] 5 | jobs: 6 | tag: 7 | uses: qiime2/distributions/.github/workflows/lib-tag-release.yaml@dev 8 | -------------------------------------------------------------------------------- /parallel.config: -------------------------------------------------------------------------------- 1 | [parsl] 2 | 3 | [[parsl.executors]] 4 | class = "HighThroughputExecutor" 5 | label = "default" 6 | max_workers = 1 7 | 8 | [parsl.executors.provider] 9 | class = "LocalProvider" 10 | 
max_blocks = 4 -------------------------------------------------------------------------------- /.github/workflows/dependecies.yaml: -------------------------------------------------------------------------------- 1 | name: Dependency check 2 | on: 3 | pull_request: 4 | branches: ["main"] 5 | types: [opened, reopened, synchronize, labeled, unlabeled] 6 | 7 | jobs: 8 | ci: 9 | uses: bokulich-lab/utilities/.github/workflows/dependencies.yaml@main 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = q2_fondue 3 | branch = True 4 | omit = 5 | */tests* 6 | */__init__.py 7 | q2_fondue/_version.py 8 | versioneer.py 9 | 10 | [report] 11 | fail_under = 90 12 | omit = 13 | */tests* 14 | */__init__.py 15 | q2_fondue/_version.py 16 | versioneer.py 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | branches: ["main"] 5 | push: 6 | branches: ["main"] 7 | tags: ["*"] 8 | 9 | jobs: 10 | ci: 11 | uses: bokulich-lab/utilities/.github/workflows/ci.yaml@main 12 | with: 13 | distro: moshpit 14 | build_docker: true 15 | -------------------------------------------------------------------------------- /.github/workflows/q2-ci.yaml: -------------------------------------------------------------------------------- 1 | name: QIIME 2 CI 2 | on: 3 | pull_request: 4 | branches: ["main"] 5 | push: 6 | branches: ["main"] 7 | 8 | jobs: 9 | qiime-ci: 10 | uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev 11 | with: 12 | distro: moshpit 13 | recipe-path: 'conda-recipe' 14 | -------------------------------------------------------------------------------- /q2_fondue/tests/__init__.py: 
-------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | -------------------------------------------------------------------------------- /q2_fondue/types/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/__init__.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123457_2.fastq: -------------------------------------------------------------------------------- 1 | @test_acc_single.1 test_1_seq length=59 2 | TTGGGGGGCACCATCTAATCAGCTGCCAGTGCTGCCAGAACATAAAGCAGGCAGAAATT 3 | +test_acc_single.1 test_1_seq length=59 4 | ?60-*'$"<=D===;8C=<<<<<==C<=<<==<=C=:={{ q2_types }} 25 | - qiime2 >={{ qiime2 }} 26 | - tqdm {{ tqdm }} 27 | build: 28 | - python {{ python }} 29 | - setuptools 30 | - versioningit 31 | test: 32 | imports: 33 | - q2_fondue 34 | - qiime2.plugins.fondue 35 | requires: 36 | - parameterized 37 | - coverage 38 | - pytest-cov 39 | commands: 40 | - pytest --cov q2_fondue --cov-report xml:coverage.xml --pyargs q2_fondue 41 | about: 42 | home: https://github.com/bokulich-lab/q2-fondue 43 | license: BSD-3-Clause 44 | license_family: BSD 45 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/esearch_response_multi_mixed.json: -------------------------------------------------------------------------------- 1 | { 2 | "header": { 3 | "type": "esearch", 4 | "version": "0.3" 5 | }, 6 | "esearchresult": { 7 | "count": "8", 8 | "retmax": "0", 9 | "retstart": "0", 10 | "idlist": [], 11 | "translationset": [], 12 | "translationstack": [ 13 | { 14 | "term": "SRR000001[All Fields]", 15 | "field": "All Fields", 16 | "count": "1", 17 | "explode": "N" 18 | }, 19 | { 20 | "term": "SRR000013[All Fields]", 21 | "field": "All Fields", 22 | "count": "1", 23 | "explode": "N" 24 | }, 25 | "OR", 26 | { 27 | "term": "SR012[All Fields]", 28 | "field": "All Fields", 29 | "count": "7", 30 | "explode": "N" 31 | }, 32 | "OR" 33 | ], 34 | "querytranslation": "SRR000001[All Fields] OR SRR000013[All Fields] OR SR012[All Fields]", 35 | "errorlist": { 36 | "phrasesnotfound": [ 37 | "ABCD123", "SRR001" 38 | ], 
"fieldsnotfound": []}}} 39 | -------------------------------------------------------------------------------- /.github/workflows/dependent-issues.yaml: -------------------------------------------------------------------------------- 1 | name: Dependent issues 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | - edited 8 | - closed 9 | - reopened 10 | pull_request_target: 11 | types: 12 | - opened 13 | - edited 14 | - closed 15 | - reopened 16 | - synchronize 17 | 18 | schedule: 19 | - cron: '0 0 * * *' 20 | 21 | jobs: 22 | check: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: z0al/dependent-issues@v1 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | GITHUB_READ_TOKEN: ${{ secrets.GITHUB_READ_TOKEN }} 29 | 30 | with: 31 | label: dependent 32 | 33 | # (Optional) Enable checking for dependencies in issues. 34 | # Enable by setting the value to "on". Default "off" 35 | check_issues: off 36 | 37 | ignore_dependabot: off 38 | 39 | keywords: depends on, blocked by, merge after 40 | 41 | comment: > 42 | This PR/issue depends on: 43 | {{ dependencies }} 44 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-2.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF 4 | SRR123458 RANDOM GENOMIC 
SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC AB12 5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DE34 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-3.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF AB12 4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GHI DE12 5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GH34 6 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-5.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen 
Organism Tax ID Sample Name Sample Accession Sample Title Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 Some Meta 2 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF AB12 4 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF XXX 5 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DE34 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | # vi 65 | .*.swp 66 | 67 | # other 68 | *~ 69 | .env 70 | 71 | .DS_store 72 | .idea 73 | .vscode 74 | 75 | 76 | fasterq.tmp.*/** 77 | sratoolkit**/** 78 | 79 | # ignore dbGAP permission keys 80 | **.krt 81 | **.ngc 82 | 83 | # Version file from versioningit 84 | _version.py 85 | 86 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "q2-fondue" 3 | authors = [ 4 | { name = "Michal Ziemski", email = "ziemski.michal@gmail.com" } 5 | ] 6 | description = "None" 7 | readme = {file = "README.md", content-type = "text/markdown"} 8 | license = {file = "LICENSE"} 9 | dynamic = ["version"] 10 | 11 | [project.urls] 12 | Homepage = "https://github.com/bokulich-lab/q2-fondue" 13 | Repository = "https://github.com/bokulich-lab/q2-fondue" 14 | 15 | [project.entry-points.'qiime2.plugins'] 16 | "q2-fondue" = "q2_fondue.plugin_setup:plugin" 17 | 18 | [build-system] 19 | requires = [ 20 | "setuptools", 21 | "versioningit", 22 | "wheel" 23 | ] 24 | build-backend = "setuptools.build_meta" 25 | 26 | [tool.versioningit.vcs] 27 | method = "git-archive" 28 | describe-subst = "2026.1.0.dev0-1-g71797cbd" 29 | default-tag = "0.0.1" 30 | 31 | [tool.versioningit.next-version] 32 | method = "minor" 33 | 34 | [tool.versioningit.format] 35 | distance = "{base_version}+{distance}.{vcs}{rev}" 36 | dirty = 
"{base_version}+{distance}.{vcs}{rev}.dirty" 37 | distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty" 38 | 39 | [tool.versioningit.write] 40 | file = "q2_fondue/_version.py" 41 | 42 | [tool.setuptools] 43 | include-package-data = true 44 | 45 | [tool.setuptools.packages.find] 46 | where = ["."] 47 | include = ["q2_fondue*"] 48 | 49 | [tool.setuptools.package-data] 50 | q2_fondue = ["**/*"] 51 | -------------------------------------------------------------------------------- /q2_fondue/query.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | 9 | import threading 10 | import pandas as pd 11 | 12 | from q2_fondue.utils import handle_threaded_exception 13 | from q2_fondue.entrezpy_clients._pipelines import _get_run_ids 14 | 15 | threading.excepthook = handle_threaded_exception 16 | 17 | 18 | def get_ids_from_query( 19 | query: str, email: str, threads: int = 1, log_level: str = "INFO" 20 | ) -> pd.Series: 21 | """Retrieves SRA run IDs based on a search query performed 22 | on the BioSample database. 23 | 24 | Args: 25 | query (str): Search query to be executed on 26 | the BioSample database. 27 | email (str): A valid e-mail address (required by NCBI). 28 | threads (int, default=1): Number of threads to be used in parallel. 29 | log_level (str, default='INFO'): Logging level. 30 | 31 | Returns: 32 | ids (pd.Series): Retrieved SRA run IDs. 
33 | """ 34 | run_ids = _get_run_ids(email, threads, None, query, "biosample", log_level) 35 | 36 | return pd.Series(run_ids, name="ID") 37 | -------------------------------------------------------------------------------- /q2_fondue/tests/test_query.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import pandas as pd 9 | import unittest 10 | 11 | from pandas.testing import assert_frame_equal 12 | from qiime2.plugins import fondue 13 | from unittest.mock import patch 14 | 15 | from q2_fondue.tests.test_sequences import SequenceTests 16 | 17 | 18 | class TestQuery(SequenceTests): 19 | package = "q2_fondue.tests" 20 | 21 | @patch("q2_fondue.query._get_run_ids", return_value=["SRR123", "SRR234"]) 22 | def test_query(self, mock_ids): 23 | query = "some magical query text" 24 | 25 | (obs_ids,) = fondue.actions.get_ids_from_query( 26 | query, "fake@email.com", 1, "DEBUG" 27 | ) 28 | exp_ids = pd.DataFrame( 29 | index=pd.Index(["SRR123", "SRR234"], name="ID"), 30 | columns=[], 31 | ) 32 | 33 | mock_ids.assert_called_once_with( 34 | "fake@email.com", 1, None, query, "biosample", "DEBUG" 35 | ) 36 | assert_frame_equal(obs_ids.view(pd.DataFrame), exp_ids) 37 | 38 | 39 | if __name__ == "__main__": 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/sra-metadata-exp-1.tsv: -------------------------------------------------------------------------------- 1 | ID Library Selection Library Source Library Name Library Layout Bases Spots AvgSpotLen Organism Tax ID Sample Name Sample Accession Sample Title 
Biosample ID Bioproject ID Experiment ID Instrument Platform Study ID Bytes Public Center Name Some Meta 1 2 | SRR123456 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC ABC 3 | SRR123457 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC DEF 4 | SRR123458 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC GHI 5 | SRR123459 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC JKL 6 | SRR123460 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC MNO 7 | SRR123461 RANDOM GENOMIC SID2748 PAIRED 913746807 3325405 274 Homo sapiens 9606 NA18505 SRS000100 Coriell GM18505 SAMN00001583 PRJNA33627 SRX000007 454 GS FLX LS454 SRP000001 322532842 TRUE 454MSC PQR 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2025, Bokulich Laboratories. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccHYB.fastq: -------------------------------------------------------------------------------- 1 | @ERR3018303.92 Bgsng7131.m10_5758889 length=224 2 | CCTGTTCGCTCCCCACGCTTTCGAGCCTCAGCGTCAGTTACAGACCAGAGAGCCGCTTTCGCCACCGGTGTTCCTCCATATATCTACGCATTTCACCGCTACACATGGAATTCCACTCTCCCCTTCTGCACTCAAGTTTGACAGTTTCCAAAGCGAACTATGGTTGAGCCACAGCCTTTAACTTCAGACTTATCAAACCGCCTGCGCTCGCTTTACGCCCAATA 3 | +ERR3018303.92 Bgsng7131.m10_5758889 length=224 4 | HHHHHHHGGGHGGGFGGGGGGHGGGGGHHHHHGGGGGHHHHHHHHHHHHGFGFHGGGGGHGGGGGHGGGGGGHHHHHGFHHGHHHHHGGGGGHHHHHGGGGGHHHHHHF1FGFHHHHHHHHHGHHHHHGHHHHHHHHHHHFHGHHHHHHGHGHHFGGGHHGHHHHGHFGGGGGGGGGGGGGGGGGFGGGGGFGGGGFGGGFFFFFFFFFFFFFFFFFFFFFFFF 5 | @ERR3018303.93 Bgsng7131.m10_1129555 length=229 6 | TACGTAGGTGGCAAGCGCTGTCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGAAGTAGTGCATTGGAAACTGGAAGACTTGAGTGCAGAAGAGGAGAGTGGAACTCAATGTGTAGCGGTGAAATGGGTAGATATATGGAAGAACACCAGTGGCGAAAGCGGCGCGTTGGCCTGTAACTGACGCTGAGGT 7 | +ERR3018303.93 Bgsng7131.m10_1129555 length=229 8 | D2FEGFA0GGE0GFCHG/A/B11AEEE/G1FBG1GA>>EA/GFGGHG///E@EEE>/?1BGB1B>F2B1G1212FGFFHHB00>1GGDGHFBBDGHEHGHAACC/;.CGC:FB/C0000CGG?GGGGG00C0.BB.09CBFGGGGGB.9/F9A-9/;/-;99@?/99@########################### 9 | @ERR3018303.94 Bgsng7131.m10_1839802 length=250 10 | TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGACTTTTAAGTGAGATGTGAAATACCCGGGCTCAACTTGGGTGCTGCATTTCAAACTGGAAGTCTAGAGTGCAGGAGAGGAGAATGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGATTCTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAAC 11 | +ERR3018303.94 Bgsng7131.m10_1839802 length=250 12 | CBBCCFFCFFCFGGGGGGGGGGHGGGGGHHHHHHHHGGFGHHGGGGGGGGGGHGGGGGHHHFGHHGHHGHHHHHHEGHHHHGGGFGHGHGGHGHHGEFFHGHHHHHGHGGHGHHHHHHHGHFHHHHHGGFFHGDFGHFHGHHGHHGHHHHHHHFHHGDGGGFGHGHFGD?EFCHGHHFCGHFHHGGGGGGGFFGFADAED?CFFFFFFFB;BEBFFFFFF/ADFAAFEFEFF@BADFFFFFFFAABEFF: 13 | 
-------------------------------------------------------------------------------- /q2_fondue/tests/data/fasterq-dump-response.txt: -------------------------------------------------------------------------------- 1 | cursor-cache : 5,242,880 bytes 2 | buf-size : 1,048,576 bytes 3 | mem-limit : 52,428,800 bytes 4 | threads : 6 5 | scratch-path : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/fasterq.tmp.MacBook-Pro.35899/' 6 | total ram : 17,179,869,184 bytes 7 | output-format: FASTQ split 3 8 | check-mode : only 9 | output-file : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/ERR2750829.fastq' 10 | output-dir : '.' 11 | output : '/private/var/folders/7f/7nw_x13n5q965rss_qz6061m0000gq/T/tmpf0d7oy6a/ERR2750829.fastq' 12 | append-mode : 'NO' 13 | stdout-mode : 'NO' 14 | seq-defline : '@$ac.$si $sn length=$rl' 15 | qual-defline : '+$ac.$si $sn length=$rl' 16 | only-unaligned : 'NO' 17 | only-aligned : 'NO' 18 | accession : 'ERR2750829' 19 | accession-path: 'ERR2750829' 20 | est. output : 44,926,989,570 bytes 21 | disk-limit (OS) : 9,149,612,032 bytes 22 | disk-limit-tmp (OS) : 9,149,612,032 bytes 23 | out/tmp on same fs : 'NO' 24 | 25 | ERR2750829 is remote 26 | ... has a size of 12,459,034,417 bytes 27 | ... is cSRA without alignments 28 | ... SEQ has NAME column = YES 29 | ... SEQ has SPOT_GROUP column = YES 30 | ... uses 'SEQUENCE' as sequence-table 31 | SEQ.first_row = 1 32 | SEQ.row_count = 84,543,740 33 | SEQ.spot_count = 84,543,740 34 | SEQ.total_base_count = 16,545,432,985 35 | SEQ.bio_base_count = 16,545,432,985 36 | SEQ.avg_name_len = 1 37 | SEQ.avg_spot_group_len = 0 38 | SEQ.avg_bio_reads_per_spot = 2 39 | SEQ.avg_tech_reads_per_spot = 0 40 | ALIGN.first_row = 0 41 | ALIGN.row_count = 0 42 | ALIGN.spot_count = 0 43 | ALIGN.total_base_count = 0 44 | ALIGN.bio_base_count = 0 45 | 46 | disk-limit exeeded! 
47 | fasterq-dump quit with error code 3 48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:latest AS base 2 | 3 | ARG ENVIRONMENT 4 | ARG PLUGIN_NAME 5 | 6 | ENV PLUGIN_NAME=$PLUGIN_NAME 7 | ENV PATH=/opt/conda/envs/${PLUGIN_NAME}/bin:$PATH \ 8 | LC_ALL=C.UTF-8 LANG=C.UTF-8 \ 9 | MPLBACKEND=agg \ 10 | UNIFRAC_USE_GPU=N \ 11 | HOME=/home/qiime2 \ 12 | XDG_CONFIG_HOME=/home/qiime2 13 | 14 | WORKDIR /home/qiime2 15 | COPY environment.yml . 16 | COPY install-sra-tools.sh . 17 | 18 | RUN apt-get update \ 19 | && apt-get install -y --no-install-recommends wget curl procps make \ 20 | && apt-get clean \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | RUN conda update -qy conda \ 24 | && conda install -c conda-forge -qy mamba \ 25 | && mamba env create -n ${PLUGIN_NAME} --file environment.yml \ 26 | && mamba run -n ${PLUGIN_NAME} bash install-sra-tools.sh \ 27 | && mamba clean --all --yes \ 28 | && chmod -R a+rwx /opt/conda 29 | 30 | RUN mkdir -p .ncbi 31 | RUN printf '/LIBS/GUID = "%s"\n' `uuidgen` > .ncbi/user-settings.mkfg 32 | 33 | COPY . 
./plugin 34 | RUN mamba run -n ${PLUGIN_NAME} pip install ./plugin 35 | 36 | RUN /bin/bash -c "source activate ${PLUGIN_NAME}" 37 | ENV CONDA_PREFIX=/opt/conda/envs/${PLUGIN_NAME}/ 38 | RUN mamba run -n ${PLUGIN_NAME} qiime dev refresh-cache 39 | RUN echo "source activate ${PLUGIN_NAME}" >> $HOME/.bashrc 40 | RUN echo "source tab-qiime" >> $HOME/.bashrc 41 | 42 | 43 | FROM base AS test 44 | 45 | RUN mamba run -n ${PLUGIN_NAME} pip install pytest pytest-cov coverage parameterized pytest-xdist 46 | CMD mamba run -n ${PLUGIN_NAME} make -f ./plugin/Makefile test-cov 47 | 48 | FROM base AS prod 49 | 50 | # Important: let any UID modify these directories so that 51 | # `docker run -u UID:GID` works 52 | RUN rm -rf ./plugin 53 | RUN chmod -R a+rwx /home/qiime2 -------------------------------------------------------------------------------- /q2_fondue/get_all.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | 9 | import qiime2 as q2 10 | 11 | import pandas as pd 12 | import threading 13 | 14 | from q2_fondue.utils import handle_threaded_exception 15 | from qiime2 import Artifact 16 | 17 | 18 | threading.excepthook = handle_threaded_exception 19 | 20 | 21 | def get_all( 22 | ctx, accession_ids, email, threads=1, retries=2, log_level="INFO", linked_doi=None 23 | ): 24 | 25 | # get required methods 26 | get_metadata = ctx.get_action("fondue", "get_metadata") 27 | get_sequences = ctx.get_action("fondue", "get_sequences") 28 | 29 | # fetch metadata 30 | metadata, failed_ids = get_metadata( 31 | accession_ids, email, threads, log_level, linked_doi 32 | ) 33 | failed_ids_df = failed_ids.view(pd.DataFrame) 34 | 35 | # fetch sequences - use metadata to get run ids, regardless if 36 | # runs or projects were requested 37 | run_ids = q2.Artifact.import_data( 38 | "NCBIAccessionIDs", pd.Series(metadata.view(pd.DataFrame).index) 39 | ) 40 | ( 41 | seq_single, 42 | seq_paired, 43 | failed_ids, 44 | ) = get_sequences(run_ids, email, retries, threads, log_level) 45 | failed_ids_df = pd.concat([failed_ids_df, failed_ids.view(pd.DataFrame)]) 46 | if failed_ids_df.shape[0] > 0: 47 | failed_ids = Artifact.import_data("SRAFailedIDs", failed_ids_df) 48 | 49 | return metadata, seq_single, seq_paired, failed_ids 50 | -------------------------------------------------------------------------------- /q2_fondue/citations.bib: -------------------------------------------------------------------------------- 1 | @article {Ziemski2022, 2 | author = {Ziemski, Michal and Adamov, Anja and Kim, Lina and Flörl, Lena and Bokulich, Nicholas A}, 3 | title = {Reproducible acquisition, management, and meta-analysis of nucleotide sequence (meta)data using q2-fondue}, 4 | year = {2022}, 5 | month = {09}, 6 | doi = {10.1093/bioinformatics/btac639}, 7 | URL = {https://doi.org/10.1093/bioinformatics/btac639}, 8 | journal = 
{Bioinformatics}, 9 | issn = {1367-4803}, 10 | } 11 | 12 | @article{Buchmann2019, 13 | author = {Buchmann, Jan P and Holmes, Edward C}, 14 | doi = {10.1093/bioinformatics/btz385}, 15 | editor = {Wren, Jonathan}, 16 | journal = {Bioinformatics}, 17 | month = {nov}, 18 | number = {21}, 19 | pages = {4511--4514}, 20 | publisher = {Oxford University Press}, 21 | title = {Entrezpy: a Python library to dynamically interact with the NCBI Entrez databases}, 22 | url = {https://academic.oup.com/bioinformatics/article/35/21/4511/5488119}, 23 | volume = {35}, 24 | year = {2019} 25 | } 26 | 27 | @misc{SraToolkit, 28 | name = {SRA Toolkit}, 29 | author = {SRA Toolkit Development Team}, 30 | version = {2.9.6}, 31 | url = {https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software} 32 | } 33 | 34 | @misc{stephan_hugel_2019_2917290, 35 | author = {Stephan Hügel and Peter Gerdes and Patrick Fournier and 36 | emuzie and Patrick Golden and jghauser and Stefan Frühwirth and 37 | Sean Takats and Pablo Orduña and Merlin and Erik Hetzner and 38 | Christian Brodbeck and Avram Lyon and A Lee}, 39 | title = {urschrei/pyzotero: Zenodo Release}, 40 | month = {may}, 41 | year = 2019, 42 | publisher = {Zenodo}, 43 | version = {v1.3.15}, 44 | doi = {10.5281/zenodo.2917290}, 45 | url = {https://doi.org/10.5281/zenodo.2917290} 46 | } 47 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/SRR123456.fastq: -------------------------------------------------------------------------------- 1 | @test_acc_single.1 test_1_seq length=278 2 | AAACTCCTAGCCTACATCCGTACGAGTTAGCGTGGGATTACGAGGTGCACACCATTTCATTCCGTACGGGTAAATTTTTGTATTTTTAGCAGACGGCAGGGTTTCACCATGGTTGACCAACGTACTAATCTTGAACTCCTGACCTCAAGTGATTTGCCTGCCTTCAGCCTCCCAAAGTGACTGGGTATTACAGATGTGAGCGAGTTTGTGCCCAAGCCTTATAAGTAAATTTATAAATTTACATAATTTAAATGACTTATGCTTAGCGAAATAGTTTA 3 | +test_acc_single.1 test_1_seq length=278 4 | 
85)9=9/3-8?68<7=8<3657747==49==+;FB2;A;5:'*>69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 
9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A69<:74)9.;C?+;*GC8/%9<=GC8.#=2:5:16D==*6?7<:77>:1+CA138?<)C@2166:A:%<<9<;33<;6?9;<;4=:%<$CA1+1 5 | @test_acc_single.4 test_2_seq length=274 6 | ATTGGAGTGTGACATTCTGTGTTCCACATGCATCGACTAGGGCGTTTGGTAACCCGAAAAGGGTTTTGAAATTCAAACCCCTTTTCGGTTCCAACATTTCAAACCATAGCAAAATAATATTTTTAATAAAAATTCATTTACCATTTGTTGCTAGTGTTTAAAATAATACTGATTATTTATTTGACTTTAACTCTCCAATCTTGTTAATTCACTAATTATCTAGATTGTGTAGGTTCTTGGATTTCTCAATCATGTCATCTGCATTCTGANCGGC 7 | +test_acc_single.4 test_2_seq length=274 8 | A='=4<;<<<;3==B:<5<<9@9A988497;867=<;<-EA/:9A<&C=1A)9EA3#EA2GC7*6;0EA3)D=7@;%9@7<:7<8=:B=(FB5&;=5<4;;=87-88A=90&CA0'<64*61''-7+,3)30)B=*5:*/!,A=. 9 | @test_acc_single.5 test_3_seq length=267 10 | TTTTTTCCCTCTTCCTCCTCTTCCTCCTTCTTATTCCTCTTTCTCACCTGCTCCCTTCTCTCTTTCTTTCTTAAGCCAGTATTACTTTGTTTTCTGTTATTTGCATTCCAAAGAATCCTAACAGATTCATCTCATTTAAAGATCTCATTTAATTCATCTCATTTAAAGATCTCAACAACTCTATAAGACAGTATAGGTAGGGCAGGATGGAAATTAGAGGCATTTAATTTATATCAAAAATTTGCAAAACTCAAATTTTCTTATAAC 11 | +test_acc_single.5 test_3_seq length=267 12 | :=6@;$@;%<7<3C<;?6;<@8@8:=4>65:08?7C<==@;$5<<9?7<=<;A=(A91<=<=FB/=@;$=8-C==B;<===B;=8@<&:GC6)<3:B;<@;$=<5:D=;D=7:==D=;8<<<<+;:A<<9A sratoolkit.tar.gz 26 | 27 | echo "Extracting..." 28 | tar -xzf sratoolkit.tar.gz 29 | rm sratoolkit.tar.gz 30 | mv "sratoolkit.${TOOLKIT_VER}-${OS_VER}/" "sratoolkit/" 31 | 32 | if [[ "$PREFIX" == "" ]]; then 33 | echo "Setting PREFIX=$CONDA_PREFIX" 34 | PREFIX="$CONDA_PREFIX" 35 | fi 36 | 37 | echo "Installing SRA Tools in $PREFIX..." 38 | if [[ ! 
-d "$PREFIX/bin/" ]]; then 39 | mkdir $PREFIX/bin/ 40 | fi 41 | find sratoolkit/bin/ -maxdepth 1 -type f -exec mv -f {} $PREFIX/bin/ \; 42 | find sratoolkit/bin/ -maxdepth 1 -type l -exec mv -f {} $PREFIX/bin/ \; 43 | rm -r sratoolkit 44 | 45 | echo "Testing installation..." 46 | if [[ $(which prefetch) == "$PREFIX/bin"* ]]; then 47 | echo "Success!" 48 | else 49 | echo "Installation failed." 50 | exit 1 51 | fi 52 | 53 | echo "Configuring SRA Toolkit:" 54 | SRA_CACHE_LOC="$HOME/.prefetch_cache" 55 | echo "Creating prefetch cache directory under $SRA_CACHE_LOC..." 56 | mkdir "$SRA_CACHE_LOC" 57 | echo "Running vdb-config..." 58 | vdb-config -s "/repository/user/main/public/root=$SRA_CACHE_LOC" 59 | vdb-config --prefetch-to-user-repo 60 | echo "Configuration completed." 61 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/testaccHYB_2.fastq: -------------------------------------------------------------------------------- 1 | @ERR3018303.88 Bgsng7131.m10_3542277 length=228 2 | TACGTAGGGTGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGCTCGTAGGCGGTTCGTCGCGTCCGGTGTGAAAGTCCATCGCTTAACGGTGGATCCGCGCCGGGTACGGGCGGGCTTGAGTGCGGTAGGGGAGACTGGAATTCCCGGTGTAACGGTGGAATGTGTAGATATCAGGAAGAACACCAATGGCGAAGGCAGGTCTCTGGGCCGTTACTGACGCTGAGG 3 | +ERR3018303.88 Bgsng7131.m10_3542277 length=228 4 | HHHHHHHGGHGGHHHHGGGGHHHGGGGGHHHHHHHHGGGGHGHGGHHGGGGGGGGGGGGGGGGGGGGGGGGGGHHHHGHHHHHGGGHGHHGGGEGGHHHHGGGGGCGAGHGGGGGFFFFEF.99BFDCAFFFFFFC.EFFFFFFFFFFDCDF.BFFFFDEFF/BFBFFFFFFFFB/;B;BF.9FBBDFE;/;BDFAFFFEA;;B/BFFFFEED=.A99BFFFF?DFFF 5 | @ERR3018303.89 Bgsng7131.m10_3605463 length=230 6 | TACGTAGGTGGCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTCAGTCTGATGTGAAAGCCTTCGGCTTAACCGAAGTAGTGCATTGGAAACTGGAAGACTTGAGTGCAGAAGAGGAGAGTGGCACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGGGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGTGGTT 7 | +ERR3018303.89 Bgsng7131.m10_3605463 length=230 8 | 
GGHHHGGEDBFHGFFEFGEFFHHGGGEGHH5BFFHGEFGGHBGGAGFGGCCGGEGEGGHHG4FDFDFDDGHHDHHF2CFFHHHD>?/2G1?AFFD0DFBGGHFFHBGF0CGBCHDGFDF00G0GGFFCGECDHGFHG.C0;/C/9B0;CFBB?AEGGBBBFFEDDAGEFFFBFFFFBFB.BFAE...-@-@BF?/AA;-.9BBFFBFFFFFFFFEBF?DA###### 9 | @ERR3018303.90 Bgsng7131.m10_3641968 length=231 10 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCG 11 | +ERR3018303.90 Bgsng7131.m10_3641968 length=231 12 | GGGHHHHGHHHHGGGGGGGGGHHGGGGGHHHHHHHHGGGGHHHGGGGGGGGGGGGGGGGFFHHHDGHHHHHHGGHHHHHHHHGHHHHHGHHGGHHHHHGGGHGHHFHHBHHHGHHHHFHGFFFFFFFFFFFFFFFFFFFFFFFFFFEFA 13 | @ERR3018303.91 Bgsng7131.m10_3716454 length=231 14 | TACGTAGGTGGCAAGCGTTATCCGGATTTATTGGGCGTAAAGAGAGTGCAGGCGGTTTTCTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGGAGAAGTGCATCGGAAACTGGATAACTTGAGTGCAGAAGAGGGTAGTGGAACTCCATGTGTAGCGGTGGAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTACCTGGTCTGCAACTGACGCTGAGACTC 15 | +ERR3018303.91 Bgsng7131.m10_3716454 length=231 16 | GGGHHHHGHGGHHHGHGGGGHHHGGGGGGHEGHHHHGFGGHHHHHGHHHHHHGGGEFGGGHGHGGFHGEDG4GEGH3BECGEHG@CCBHBGFG/BF1=FDG>D-DEF.BFF?9BFFFCFBFFE.DFAF/BFFFF;.99AAEBFFFFFF?;FE/;9@@-;9BADFAFF-9=-;@F 5 | @ERR3018303.89 Bgsng7131.m10_3605463 length=250 6 | TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGGGGCGAACA 7 | +ERR3018303.89 Bgsng7131.m10_3605463 length=250 8 | BBBBBFFBFFDFAEFEGGGCGGHGG?FGHGHHGHHHGGGGGHGGGGGGCCECGGGEFEGBGFHHHHGHB4FGFFHG4FG3EG3GHHEHHHHFHEFDGFFCACFGHEB2GDGF2F222@F>2C@1FHHB0CGFG/A??<>F1>FG0GHHHHHH0DG@F<@A0BC0CFD.@ACFGE;CF0B?.BFFGF-;9-99--B@D/.--BD/99BF//////;/9BBFA.9...A.EA@-9;.::9ABFF######## 9 | @ERR3018303.90 Bgsng7131.m10_3641968 length=250 10 | 
TACGTAGGTCCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGCCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCCAACTTGGGTGCAGAAGGGGGGGGTGGAATTTCATGTGTAGCGGGGAAATGCGTAGATATATGGGGGAACACCCGTGGCGAAAGCGGCTCTCTTGGCTGTAACTGACGCTGAGGCTCGAAAACGTGGGGAGCCAAAC 11 | +ERR3018303.90 Bgsng7131.m10_3641968 length=250 12 | AAAAAFF@@F1C1EEGGGG?AAFEGGCGHG2GGHHHEGGGHHHG//EGGGGGGGGGGGF1BFDGHF21E2FHBGGG11BB01>FGHHHFHG/F1B22BF@1///? SRAMetadataFormat: 30 | ff = SRAMetadataFormat() 31 | with ff.open() as fh: 32 | data.to_csv(fh, sep="\t", header=True) 33 | return ff 34 | 35 | 36 | @plugin.register_transformer 37 | def _2(ff: SRAMetadataFormat) -> pd.DataFrame: 38 | with ff.open() as fh: 39 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 40 | return df 41 | 42 | 43 | @plugin.register_transformer 44 | def _3(ff: SRAMetadataFormat) -> qiime2.Metadata: 45 | return _meta_fmt_to_metadata(ff) 46 | 47 | 48 | @plugin.register_transformer 49 | def _4(data: pd.DataFrame) -> SRAFailedIDsFormat: 50 | ff = SRAFailedIDsFormat() 51 | with ff.open() as fh: 52 | data.to_csv(fh, sep="\t", header=True, index=True) 53 | return ff 54 | 55 | 56 | @plugin.register_transformer 57 | def _5(ff: SRAFailedIDsFormat) -> pd.DataFrame: 58 | with ff.open() as fh: 59 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 60 | return df 61 | 62 | 63 | @plugin.register_transformer 64 | def _6(ff: SRAFailedIDsFormat) -> qiime2.Metadata: 65 | return _meta_fmt_to_metadata(ff) 66 | 67 | 68 | @plugin.register_transformer 69 | def _7(data: pd.DataFrame) -> NCBIAccessionIDsFormat: 70 | ff = NCBIAccessionIDsFormat() 71 | with ff.open() as fh: 72 | data.to_csv(fh, sep="\t", header=True, index=True) 73 | return ff 74 | 75 | 76 | @plugin.register_transformer 77 | def _77(data: pd.Series) -> NCBIAccessionIDsFormat: 78 | ff = NCBIAccessionIDsFormat() 79 | return _series_to_meta_fmt(data, ff) 80 | 81 | 82 | @plugin.register_transformer 83 | def _8(ff: NCBIAccessionIDsFormat) -> pd.DataFrame: 84 | with 
ff.open() as fh: 85 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 86 | return df 87 | 88 | 89 | @plugin.register_transformer 90 | def _9(ff: NCBIAccessionIDsFormat) -> qiime2.Metadata: 91 | return _meta_fmt_to_metadata(ff) 92 | 93 | 94 | @plugin.register_transformer 95 | def _10(ff: SRAMetadataFormat) -> NCBIAccessionIDsFormat: 96 | fout = NCBIAccessionIDsFormat() 97 | with ff.open() as fh, fout.open() as fo: 98 | df = pd.read_csv(fh, sep="\t", header=0, index_col=0, dtype="str") 99 | df.index.to_frame().to_csv(fo, sep="\t", header=True, index=False) 100 | return fout 101 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/scraper_items_no_doi.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "key": "WZV4HG8X", 3 | "version": 1259, 4 | "library": { 5 | "type": "user", 6 | "id": 12345, 7 | "name": "username", 8 | "links": { 9 | "alternate": { 10 | "href": "https://www.zotero.org/username", 11 | "type": "text/html" 12 | } 13 | } 14 | }, 15 | "links": { 16 | "self": { 17 | "href": "https://api.zotero.org/users/12345/items/WZV4HG8X", 18 | "type": "application/json" 19 | }, 20 | "alternate": { 21 | "href": "https://www.zotero.org/username/items/WZV4HG8X", 22 | "type": "text/html" 23 | }, 24 | "up": { 25 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY", 26 | "type": "application/json" 27 | }, 28 | "enclosure": { 29 | "type": "text/html", 30 | "href": "https://api.zotero.org/users/12345/items/WZV4HG8X/file/view" 31 | } 32 | }, 33 | "meta": { 34 | "numChildren": false 35 | }, 36 | "data": { 37 | "key": "WZV4HG8X", 38 | "version": 1259, 39 | "parentItem": "CP4ED2CY", 40 | "itemType": "attachment", 41 | "linkMode": "imported_url", 42 | "title": "Snapshot", 43 | "accessDate": "2021-11-10T07:04:53Z", 44 | "url": "https://www.nature.com/articles/s41467-021-26215-w", 45 | "note": "", 46 | "contentType": "text/html", 47 | "charset": "utf-8", 48 
| "filename": "s41467-021-26215-w.html", 49 | "md5": "9ba88a9f08c42a02d11a00b3498198f4", 50 | "mtime": 1636527893000, 51 | "tags": [], 52 | "relations": {}, 53 | "dateAdded": "2021-11-10T07:04:53Z", 54 | "dateModified": "2021-11-10T07:04:53Z" 55 | } 56 | }, 57 | { 58 | "key": "DMJ4AQ48", 59 | "version": 1261, 60 | "library": { 61 | "type": "user", 62 | "id": 12345, 63 | "name": "username", 64 | "links": { 65 | "alternate": { 66 | "href": "https://www.zotero.org/username", 67 | "type": "text/html" 68 | } 69 | } 70 | }, 71 | "links": { 72 | "self": { 73 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48", 74 | "type": "application/json" 75 | }, 76 | "alternate": { 77 | "href": "https://www.zotero.org/username/items/DMJ4AQ48", 78 | "type": "text/html" 79 | }, 80 | "up": { 81 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY", 82 | "type": "application/json" 83 | }, 84 | "enclosure": { 85 | "type": "application/pdf", 86 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48/file/view", 87 | "title": "Pruski et al. - 2021 - Direct on-swab metabolic profiling of vaginal micr.pdf", 88 | "length": 3648434 89 | } 90 | }, 91 | "meta": { 92 | "numChildren": false 93 | }, 94 | "data": { 95 | "key": "DMJ4AQ48", 96 | "version": 1261, 97 | "parentItem": "CP4ED2CY", 98 | "itemType": "attachment", 99 | "linkMode": "imported_url", 100 | "title": "Full Text PDF", 101 | "accessDate": "2021-11-10T07:04:46Z", 102 | "url": "https://www.nature.com/articles/s41467-021-26215-w.pdf", 103 | "note": "", 104 | "contentType": "application/pdf", 105 | "charset": "", 106 | "filename": "Pruski et al. 
- 2021 - Direct on-swab metabolic profiling of vaginal micr.pdf", 107 | "md5": "28edb400729d11e14b2b1829ceb16b3a", 108 | "mtime": 1636528753000, 109 | "tags": [], 110 | "relations": {}, 111 | "dateAdded": "2021-11-10T07:04:46Z", 112 | "dateModified": "2021-11-10T07:04:46Z" 113 | } 114 | }] -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_b2_response_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 13481774 8 | <Summary><Title>18</Title><Platform 9 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 10 | total_runs="1" total_spots="63703" total_bases="38349206" 11 | total_size="22735317" load_done="true" cluster_name="public"/></Summary><Submitter 12 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 13 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 14 | of Animal Nutritio"/><Experiment acc="SRX10339760" ver="1" 15 | status="public" name="18"/><Study acc="SRP310597" name="PRJNA 16 | Chuanzhong black lamb Raw sequence reads"/><Organism 17 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 18 | acc="SRS8459117" name=""/><Instrument ILLUMINA="Illumina 19 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>18</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 20 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309312</Biosample> 21 | 22 | <Run acc="SRR13961771" 23 | total_spots="63703" total_bases="38349206" load_done="true" 24 | is_public="true" cluster_name="public" 25 | static_data_available="true"/> 26 | 27 | 28 | 2021/03/17 29 | 2021/03/15 30 | 31 | 32 | 33 | 13481786 34 | <Summary><Title>12</Title><Platform 35 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 36 | total_runs="1" 
total_spots="59130" total_bases="35596260" 37 | total_size="21079845" load_done="true" cluster_name="public"/></Summary><Submitter 38 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 39 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 40 | of Animal Nutritio"/><Experiment acc="SRX10339772" ver="1" 41 | status="public" name="12"/><Study acc="SRP310597" name="PRJNA 42 | Chuanzhong black lamb Raw sequence reads"/><Organism 43 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 44 | acc="SRS8459130" name=""/><Instrument ILLUMINA="Illumina 45 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>12</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 46 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309306</Biosample> 47 | 48 | <Run acc="SRR13961759" 49 | total_spots="59130" total_bases="35596260" load_done="true" 50 | is_public="true" cluster_name="public" 51 | static_data_available="true"/> 52 | 53 | 54 | 2021/03/17 55 | 2021/03/15 56 | 57 | 58 | -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | 9 | import logging 10 | import sys 11 | 12 | import pandas as pd 13 | 14 | PREFIX = { 15 | "run": ("SRR", "ERR", "DRR"), 16 | "experiment": ("SRX", "ERX", "DRX"), 17 | "sample": ("SRS", "ERS", "DRS"), 18 | "study": ("SRP", "ERP", "DRP"), 19 | "bioproject": ("PRJ",), 20 | } 21 | 22 | 23 | class InvalidIDs(Exception): 24 | pass 25 | 26 | 27 | def get_attrs(obj, excluded=()): 28 | return [ 29 | k for k, v in vars(obj).items() if k not in excluded and not k.startswith("__") 30 | ] 31 | 32 | 33 | def rename_columns(df: pd.DataFrame): 34 | # clean up ID columns 35 | col_map = {} 36 | id_cols = [col for col in df.columns if col.endswith("_id")] 37 | for col in id_cols: 38 | col_split = col.split("_") 39 | col_map[col] = f"{col_split[0].capitalize()} {col_split[1].upper()}" 40 | 41 | # clean up other multi-word columns 42 | wordy_cols = [col for col in df.columns if "_" in col and col not in id_cols] 43 | for col in wordy_cols: 44 | col_map[col] = " ".join([x.capitalize() for x in col.split("_")]) 45 | 46 | # capitalize the rest 47 | remainder_cols = [ 48 | col for col in df.columns if col not in id_cols and col not in wordy_cols 49 | ] 50 | for col in remainder_cols: 51 | col_map[col] = col.capitalize() 52 | 53 | df.rename(columns=col_map, inplace=True) 54 | 55 | # rename Sample ID to Sample Accession (incompatible with qiime naming) 56 | df.rename(columns={"Sample ID": "Sample Accession"}, inplace=True) 57 | 58 | return df 59 | 60 | 61 | def set_up_entrezpy_logging(entrezpy_obj, log_level, log_id=False): 62 | """Sets up logging for the given Entrezpy object. 63 | 64 | Args: 65 | entrezpy_obj (object): An Entrezpy object that has a logger attribute. 66 | log_level (str): The log level to set. 67 | log_id (bool): If True, accession ID will be added to the log. 
68 | """ 69 | handler = set_up_logging_handler(log_id=log_id) 70 | 71 | entrezpy_obj.logger.addHandler(handler) 72 | entrezpy_obj.logger.setLevel(log_level) 73 | 74 | if hasattr(entrezpy_obj, "request_pool"): 75 | entrezpy_obj.request_pool.logger.addHandler(handler) 76 | entrezpy_obj.request_pool.logger.setLevel(log_level) 77 | 78 | 79 | def set_up_logger( 80 | log_level, cls_obj=None, logger_name=None, log_id=False 81 | ) -> logging.Logger: 82 | """Sets up the module/class logger. 83 | 84 | Args: 85 | log_level (str): The log level to set. 86 | cls_obj: Class instance for which the logger should be created. 87 | logger_name (str): The name of the logger. 88 | log_id (bool): If True, accession ID will be added to the log. 89 | 90 | Returns: 91 | logging.Logger: The module logger. 92 | """ 93 | if cls_obj: 94 | logger = logging.getLogger(f"{cls_obj.__module__}") 95 | else: 96 | logger = logging.getLogger(logger_name) 97 | logger.setLevel(log_level) 98 | handler = set_up_logging_handler(log_id=log_id) 99 | logger.addHandler(handler) 100 | return logger 101 | 102 | 103 | def set_up_logging_handler(log_id: bool = False) -> logging.StreamHandler: 104 | """Sets up logging handler.""" 105 | handler = logging.StreamHandler(sys.stdout) 106 | if log_id: 107 | formatter = logging.Formatter( 108 | "%(asctime)s [%(threadName)s] [%(levelname)s] " 109 | "[%(name)s] [%(accession_id)s]: %(message)s" 110 | ) 111 | else: 112 | formatter = logging.Formatter( 113 | "%(asctime)s [%(threadName)s] [%(levelname)s] " "[%(name)s]: %(message)s" 114 | ) 115 | handler.setFormatter(formatter) 116 | return handler 117 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_b1_response_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 4 8 | <Summary><Title>454 9 | sequencing of Human HapMap individual NA18505 genomic paired-end 10 | library</Title><Platform instrument_model="454 
GS FLX">LS454</Platform><Statistics 11 | total_runs="10" total_spots="4703662" total_bases="1306798474" 12 | total_size="3205056622" load_done="true" 13 | static_data_available="true" cluster_name="public"/></Summary><Submitter 14 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan" 15 | lab_name=""/><Experiment acc="SRX000003" ver="10" 16 | status="public" name="454 sequencing of Human HapMap individual 17 | NA18505 genomic paired-end library"/><Study acc="SRP000001" 18 | name="Paired-end mapping reveals extensive structural variation in 19 | the human genome"/><Organism taxid="9606" 20 | ScientificName="Homo sapiens"/><Sample acc="SRS000100" 21 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME 22 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY 23 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION 24 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT 25 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample> 26 | 27 | <Run acc="SRR000007" 28 | total_spots="633196" total_bases="175275395" load_done="true" 29 | is_public="true" cluster_name="public" 30 | static_data_available="true"/><Run acc="SRR000018" 31 | total_spots="626624" total_bases="174403220" load_done="true" 32 | is_public="true" cluster_name="public" 33 | static_data_available="true"/><Run acc="SRR000020" 34 | total_spots="374556" total_bases="103411232" load_done="true" 35 | is_public="true" cluster_name="public" 36 | static_data_available="true"/><Run acc="SRR000038" 37 | total_spots="529820" total_bases="148389031" load_done="true" 38 | is_public="true" cluster_name="public" 39 | static_data_available="true"/><Run acc="SRR000043" 40 | total_spots="608946" total_bases="168985392" load_done="true" 41 | is_public="true" cluster_name="public" 42 | static_data_available="true"/><Run acc="SRR000046" 43 | 
total_spots="79047" total_bases="21258857" load_done="true" 44 | is_public="true" cluster_name="public" 45 | static_data_available="true"/><Run acc="SRR000048" 46 | total_spots="640737" total_bases="177619279" load_done="true" 47 | is_public="true" cluster_name="public" 48 | static_data_available="true"/><Run acc="SRR000050" 49 | total_spots="547349" total_bases="153260655" load_done="true" 50 | is_public="true" cluster_name="public" 51 | static_data_available="true"/><Run acc="SRR000057" 52 | total_spots="76744" total_bases="21203932" load_done="true" 53 | is_public="true" cluster_name="public" 54 | static_data_available="true"/><Run acc="SRR000058" 55 | total_spots="586643" total_bases="162991481" load_done="true" 56 | is_public="true" cluster_name="public" 57 | static_data_available="true"/> 58 | 59 | 60 | 2008/04/04 61 | 2015/04/09 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_response_runs_single_item.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 4 8 | <Summary><Title>454 9 | sequencing of Human HapMap individual NA18505 genomic paired-end 10 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics 11 | total_runs="10" total_spots="4703662" total_bases="1306798474" 12 | total_size="3205056622" load_done="true" 13 | static_data_available="true" cluster_name="public"/></Summary><Submitter 14 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan" 15 | lab_name=""/><Experiment acc="SRX000003" ver="10" 16 | status="public" name="454 sequencing of Human HapMap individual 17 | NA18505 genomic paired-end library"/><Study acc="SRP000001" 18 | name="Paired-end mapping reveals extensive structural variation in 19 | the human genome"/><Organism taxid="9606" 20 | ScientificName="Homo sapiens"/><Sample acc="SRS000100" 21 | name=""/><Instrument LS454="454 GS 
FLX"/><Library_descriptor><LIBRARY_NAME 22 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY 23 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION 24 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT 25 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample> 26 | 27 | <Run acc="SRR000007" 28 | total_spots="633196" total_bases="175275395" load_done="true" 29 | is_public="true" cluster_name="public" 30 | static_data_available="true"/><Run acc="SRR000018" 31 | total_spots="626624" total_bases="174403220" load_done="true" 32 | is_public="true" cluster_name="public" 33 | static_data_available="true"/><Run acc="SRR000020" 34 | total_spots="374556" total_bases="103411232" load_done="true" 35 | is_public="true" cluster_name="public" 36 | static_data_available="true"/><Run acc="SRR000038" 37 | total_spots="529820" total_bases="148389031" load_done="true" 38 | is_public="true" cluster_name="public" 39 | static_data_available="true"/><Run acc="SRR000043" 40 | total_spots="608946" total_bases="168985392" load_done="true" 41 | is_public="true" cluster_name="public" 42 | static_data_available="true"/><Run acc="SRR000046" 43 | total_spots="79047" total_bases="21258857" load_done="true" 44 | is_public="true" cluster_name="public" 45 | static_data_available="true"/><Run acc="SRR000048" 46 | total_spots="640737" total_bases="177619279" load_done="true" 47 | is_public="true" cluster_name="public" 48 | static_data_available="true"/><Run acc="SRR000050" 49 | total_spots="547349" total_bases="153260655" load_done="true" 50 | is_public="true" cluster_name="public" 51 | static_data_available="true"/><Run acc="SRR000057" 52 | total_spots="76744" total_bases="21203932" load_done="true" 53 | is_public="true" cluster_name="public" 54 | static_data_available="true"/><Run acc="SRR000058" 55 | total_spots="586643" total_bases="162991481" 
load_done="true" 56 | is_public="true" cluster_name="public" 57 | static_data_available="true"/> 58 | 59 | 60 | 2008/04/04 61 | 2015/04/09 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /q2_fondue/tests/test_get_all.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import unittest 9 | from unittest.mock import ANY, Mock 10 | 11 | import pandas as pd 12 | from qiime2 import Artifact 13 | 14 | from q2_fondue.get_all import get_all 15 | from q2_fondue.tests.test_sequences import SequenceTests 16 | 17 | 18 | class FakeCtx(Mock): 19 | def __init__(self, ids_path, meta_path, failed_ids=None): 20 | super().__init__() 21 | self.ids = Artifact.import_data("NCBIAccessionIDs", ids_path) 22 | self.meta = Artifact.import_data("SRAMetadata", meta_path) 23 | self.failed_empty = Artifact.import_data("SRAFailedIDs", pd.DataFrame()) 24 | if failed_ids: 25 | self.failed = Artifact.import_data( 26 | "SRAFailedIDs", 27 | pd.DataFrame( 28 | data={"Error message": ["Some error message" for _ in failed_ids]}, 29 | index=pd.Index(failed_ids, name="ID"), 30 | ), 31 | ) 32 | else: 33 | self.failed = self.failed_empty 34 | 35 | self.get_metadata = Mock(return_value=(self.meta, self.failed_empty)) 36 | self.get_sequences = Mock(return_value=(Mock(), Mock(), self.failed)) 37 | 38 | def get_action(self, plugin, action): 39 | if action == "get_metadata": 40 | return self.get_metadata 41 | elif action == "get_sequences": 42 | return self.get_sequences 43 | 44 | 45 | class TestGetAll(SequenceTests): 46 | package = "q2_fondue.tests" 47 | 48 | def 
test_get_all_single(self): 49 | """ 50 | Test verifying that pipeline get_all calls all expected actions, 51 | individual actions are tested in details in respective test classes 52 | """ 53 | mock_ctx = FakeCtx( 54 | ids_path=self.get_data_path("SRR123456_md.tsv"), 55 | meta_path=self.get_data_path("sra-metadata-mock.tsv"), 56 | ) 57 | obs_meta, _, _, obs_failed = get_all( 58 | mock_ctx, mock_ctx.ids, "fake@email.com", retries=1 59 | ) 60 | 61 | mock_ctx.get_metadata.assert_called_once_with( 62 | mock_ctx.ids, "fake@email.com", 1, "INFO", None 63 | ) 64 | mock_ctx.get_sequences.assert_called_once_with( 65 | ANY, "fake@email.com", 1, 1, "INFO" 66 | ) 67 | 68 | run_ids = mock_ctx.get_sequences.call_args_list[0][0][0] 69 | run_ids = run_ids.view(pd.DataFrame).index.to_list() 70 | self.assertListEqual(run_ids, ["SRR123456"]) 71 | 72 | self.assertEqual(obs_meta, mock_ctx.meta) 73 | self.assertEqual(obs_failed, mock_ctx.failed) 74 | 75 | def test_get_all_multi_with_missing_ids(self): 76 | """ 77 | Test verifying that pipeline get_all calls all expected actions, 78 | individual actions are tested in details in respective test classes 79 | """ 80 | mock_ctx = FakeCtx( 81 | ids_path=self.get_data_path("SRR1234567_md.tsv"), 82 | meta_path=self.get_data_path("sra-metadata-mock.tsv"), 83 | failed_ids=["SRR123457"], 84 | ) 85 | obs_meta, _, _, obs_failed = get_all( 86 | mock_ctx, mock_ctx.ids, "fake@email.com", retries=1 87 | ) 88 | 89 | mock_ctx.get_metadata.assert_called_once_with( 90 | mock_ctx.ids, "fake@email.com", 1, "INFO", None 91 | ) 92 | mock_ctx.get_sequences.assert_called_once_with( 93 | ANY, "fake@email.com", 1, 1, "INFO" 94 | ) 95 | 96 | run_ids = mock_ctx.get_sequences.call_args_list[0][0][0] 97 | run_ids = run_ids.view(pd.DataFrame).index.to_list() 98 | self.assertListEqual(run_ids, ["SRR123456"]) 99 | 100 | self.assertEqual(obs_meta, mock_ctx.meta) 101 | self.assertListEqual( 102 | obs_failed.view(pd.DataFrame).index.to_list(), ["SRR123457"] 103 | ) 104 | 105 
| 106 | if __name__ == "__main__": 107 | unittest.main() 108 | -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_esearch.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | 9 | from typing import List, Union 10 | 11 | import pandas as pd 12 | from entrezpy.esearch.esearch_analyzer import EsearchAnalyzer 13 | from entrezpy.esearch.esearch_result import EsearchResult 14 | 15 | 16 | class ESearchResult(EsearchResult): 17 | """Entrezpy client for ESearch utility used to search for or validate 18 | provided accession IDs. 19 | """ 20 | 21 | def __init__(self, response, request): 22 | super().__init__(response, request) 23 | self.result = None 24 | 25 | def validate_result(self) -> dict: 26 | """Validates hit counts obtained for all the provided UIDs. 27 | 28 | As the expected hit count for a valid SRA accession ID is 1, all the 29 | IDs with that value will be considered valid. UIDs with count higher 30 | than 1 will be considered 'ambiguous' as they could not be resolved 31 | to a single result. Likewise, UIDs with a count of 0 will be considered 32 | 'invalid' as no result could be found for those. 33 | 34 | Raises: 35 | InvalidIDs: An exception is raised when either ambiguous or invalid 36 | IDs were encountered. 
37 | 38 | """ 39 | # correct id should have count == 1 40 | leftover_ids = self.result[self.result != 1] 41 | if leftover_ids.shape[0] == 0: 42 | return {} 43 | ambiguous_ids = leftover_ids[leftover_ids > 0] 44 | invalid_ids = leftover_ids[leftover_ids == 0] 45 | 46 | error_msg = "Some of the IDs are invalid or ambiguous:" 47 | if ambiguous_ids.shape[0] > 0: 48 | error_msg += f'\n Ambiguous IDs: {", ".join(ambiguous_ids.index)}' 49 | if invalid_ids.shape[0] > 0: 50 | error_msg += f'\n Invalid IDs: {", ".join(invalid_ids.index)}' 51 | self.logger.warning(error_msg) 52 | return { 53 | **{_id: "ID is ambiguous." for _id in ambiguous_ids.index}, 54 | **{_id: "ID is invalid." for _id in invalid_ids.index}, 55 | } 56 | 57 | def parse_search_results(self, response, uids: Union[List[str], None]): 58 | """Parses response received from Esearch as a pandas Series object. 59 | 60 | Hit counts obtained in the response will be extracted and assigned to 61 | their respective query IDs. IDs not found in the results but present 62 | in the UIDs list will get a count of 0. 63 | 64 | Args: 65 | response (): Response received from Esearch. 66 | uids (List[str]): List of original UIDs that were submitted 67 | as a query. 
68 | 69 | """ 70 | translation_stack = response["esearchresult"].get("translationstack") 71 | if not translation_stack: 72 | self.result = pd.Series({x: 0 for x in uids}, name="count") 73 | return 74 | 75 | # filter out only positive hits 76 | found_terms = [x for x in translation_stack if isinstance(x, dict)] 77 | found_terms = { 78 | x["term"].replace("[All Fields]", ""): int(x["count"]) for x in found_terms 79 | } 80 | 81 | # find ids that are missing 82 | if uids: 83 | missing_ids = [x for x in uids if x not in found_terms.keys()] 84 | missing_ids = {x: 0 for x in missing_ids} 85 | found_terms.update(missing_ids) 86 | 87 | self.result = pd.Series(found_terms, name="count") 88 | 89 | 90 | class ESearchAnalyzer(EsearchAnalyzer): 91 | def __init__(self, uids): 92 | super().__init__() 93 | self.uids = uids 94 | 95 | # override the base method to use our own ESResult 96 | def init_result(self, response, request): 97 | if not self.result: 98 | self.result = ESearchResult(response, request) 99 | return True 100 | return False 101 | 102 | # override the base method to additionally parse the result 103 | def analyze_result(self, response, request): 104 | super().analyze_result(response, request) 105 | self.result.parse_search_results(response, self.uids) 106 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/metadata_processed_multi.json: -------------------------------------------------------------------------------- 1 | { 2 | "FAKEID1": { 3 | "Experiment ID": "ERX3980916", 4 | "Biosample ID": "SAMEA6608408", 5 | "Bioproject ID": "PRJEB37054", 6 | "Study ID": "ERP120343", 7 | "Sample Accession": "ERS4372624", 8 | "Organism": "Vitis vinifera", 9 | "Library Source": "METAGENOMIC", 10 | "Library Selection": "PCR", 11 | "Library Layout": "SINGLE", 12 | "Instrument": "Illumina MiSeq", 13 | "Platform": "ILLUMINA", 14 | "Bases": "11552099", 15 | "Spots": "39323", 16 | "Avg Spot Len": "293", 17 | "Bytes": "3914295", 18 | 
"Public": "True", 19 | "Ena-first-public [run]": "2020-05-31", 20 | "Ena-first-public [sample]": "2020-05-31", 21 | "Ena-first-public [study]": "2020-05-31", 22 | "Ena-last-update [run]": "2020-03-06", 23 | "Ena-last-update [sample]": "2020-03-06", 24 | "Ena-last-update [study]": "2020-03-04", 25 | "Amount or size of sample collected [sample]": "50", 26 | "Collection date [sample]": "2015-09-28", 27 | "Collection day [sample]": "1", 28 | "Collection hours [sample]": "0", 29 | "Environment (biome) [sample]": "berry plant", 30 | "Environment (feature) [sample]": "grape plant", 31 | "Environment (material) [sample]": "wine must", 32 | "Geographic location (country and/or sea) [sample]": "Germany", 33 | "Geographic location (latitude) [sample]": "48.71 N", 34 | "Geographic location (longitude) [sample]": "9.12 E", 35 | "Investigation type [sample]": "metagenome", 36 | "Multiplex identifiers [sample]": "TAGATCGCTCGCCTTA", 37 | "Pcr primers [sample]": "GTGCCAGCMGCCGCGGTAAGGACTACHVGGGTWTCTAAT", 38 | "Plant-associated environmental package [sample]": "plant-associated", 39 | "Project name [sample]": "wine must microbiota analysis during fermentation", 40 | "Sample storage temperature [sample]": "-80", 41 | "Sample volume or weight for dna extraction [sample]": "0.5", 42 | "Sequencing method [sample]": "Illumina MiSeq", 43 | "Subspecific genetic lineage [sample]": "Bacchus1", 44 | "Target subfragment [sample]": "16S rRNA gene", 45 | "Library Name": "unspecified", 46 | "Name": "BAC1.D1.0.32A", 47 | "Center Name": "University of Hohenheim", 48 | "Title": "Vitis vinifera", 49 | "Tax ID": "29760" 50 | }, 51 | "FAKEID2": { 52 | "Experiment ID": "ERX3980917", 53 | "Biosample ID": "SAMEA6608409", 54 | "Bioproject ID": "PRJEB37054", 55 | "Study ID": "ERP120343", 56 | "Sample Accession": "ERS4372625", 57 | "Organism": "Vitis vinifera", 58 | "Library Source": "METAGENOMIC", 59 | "Library Selection": "PCR", 60 | "Library Layout": "SINGLE", 61 | "Instrument": "Illumina MiSeq", 62 | 
"Platform": "ILLUMINA", 63 | "Bases": "17523267", 64 | "Spots": "59799", 65 | "Avg Spot Len": "293", 66 | "Bytes": "5879896", 67 | "Public": "True", 68 | "Ena-first-public [run]": "2020-05-31", 69 | "Ena-first-public [sample]": "2020-05-31", 70 | "Ena-first-public [study]": "2020-05-31", 71 | "Ena-last-update [run]": "2020-03-06", 72 | "Ena-last-update [sample]": "2020-03-06", 73 | "Ena-last-update [study]": "2020-03-04", 74 | "Amount or size of sample collected [sample]": "50", 75 | "Collection date [sample]": "2015-09-28", 76 | "Collection day [sample]": "1", 77 | "Collection hours [sample]": "2", 78 | "Environment (biome) [sample]": "berry plant", 79 | "Environment (feature) [sample]": "grape plant", 80 | "Environment (material) [sample]": "wine must", 81 | "Geographic location (country and/or sea) [sample]": "Germany", 82 | "Geographic location (latitude) [sample]": "48.71 N", 83 | "Geographic location (longitude) [sample]": "9.12 E", 84 | "Investigation type [sample]": "metagenome", 85 | "Multiplex identifiers [sample]": "CTCTCTATTCGCCTTA", 86 | "Pcr primers [sample]": "GTGCCAGCMGCCGCGGTAAGGACTACHVGGGTWTCTAAT", 87 | "Plant-associated environmental package [sample]": "plant-associated", 88 | "Project name [sample]": "wine must microbiota analysis during fermentation", 89 | "Sample storage temperature [sample]": "-80", 90 | "Sample volume or weight for dna extraction [sample]": "0.5", 91 | "Sequencing method [sample]": "Illumina MiSeq", 92 | "Subspecific genetic lineage [sample]": "Bacchus1", 93 | "Target subfragment [sample]": "16S rRNA gene", 94 | "Library Name": "unspecified", 95 | "Name": "BAC1.D1.1.33A", 96 | "Center Name": "University of Hohenheim", 97 | "Title": "Vitis vinifera", 98 | "Tax ID": "29760" 99 | } 100 | } -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_pipelines.py: -------------------------------------------------------------------------------- 1 | # 
---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | from typing import Union 9 | 10 | from entrezpy import conduit as ec 11 | 12 | from entrezpy.elink.elink_analyzer import ElinkAnalyzer 13 | 14 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer 15 | from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer 16 | from q2_fondue.entrezpy_clients._utils import set_up_entrezpy_logging 17 | 18 | import entrezpy.esearch.esearcher as searcher 19 | 20 | from q2_fondue.utils import _chunker 21 | 22 | BATCH_SIZE = 500 23 | 24 | 25 | def _get_run_ids( 26 | email: str, 27 | n_jobs: int, 28 | ids: Union[list, None], 29 | query: Union[str, None], 30 | source: str, 31 | log_level: str, 32 | ) -> list: 33 | """Pipeline to retrieve run IDs associated with BioSample query 34 | (provided in `query`) or other aggregate IDs like studies 35 | (`source`='study'), bioprojects (`source`='bioproject'), samples 36 | (`source`='sample') or experiments (`source`='experiment') 37 | provided in `ids`. 38 | 39 | Args: 40 | email (str): User email. 41 | n_jobs (int): Number of jobs. 42 | ids (list): List of study, bioproject, sample or experiment IDs. 43 | query (str): Search query to find IDs by. 44 | source (str): Type of IDs provided ('study', 'bioproject', 45 | 'sample' or 'experiment'). 46 | log_level (str): The log level to set. 47 | 48 | Returns: 49 | list: Run IDs associated with provided ids. 
50 | """ 51 | term = " OR ".join(ids) if ids else query 52 | 53 | # create pipeline to fetch all run IDs 54 | elink = True 55 | if source == "bioproject": 56 | db = "bioproject" 57 | elif source == "biosample": 58 | db = "biosample" 59 | else: 60 | db = "sra" 61 | elink = False 62 | 63 | # find UIDS based on a query; 64 | # instead of saving the result on the history server 65 | # we will store all the UIDs recovered based on the 66 | # search query and use those in the mini-pipeline below; 67 | # this way we are not limited by ELink only accepting up to 68 | # who knows how many IDs and erroring out if we provide too 69 | # many (which could be the case e.g.: when we ask for more 70 | # than 10000 BioProject IDs or the text query returns more 71 | # than 10000 IDs presumably) 72 | esearcher = searcher.Esearcher( 73 | "esearcher", email, apikey=None, apikey_var=None, threads=n_jobs, qid=None 74 | ) 75 | esearch_response = esearcher.inquire( 76 | {"db": db, "term": term, "usehistory": False, "rettype": "json"}, 77 | analyzer=ESearchAnalyzer(ids), 78 | ) 79 | 80 | # use the UIDs to link to other DBs and fetch related records; 81 | # we won't be using multi-threading here as this shouldn't take 82 | # long (we're only fetching IDs) and we don't want those dead 83 | # threads afterwards 84 | econduit = ec.Conduit(email=email, threads=0) 85 | set_up_entrezpy_logging(econduit, log_level) 86 | run_ids_pipeline = econduit.new_pipeline() 87 | 88 | # create a pipeline to link and fetch the run IDs; 89 | # we process the IDs obtained from the previous step in batches 90 | # as ELink cannot handle more than a certain amount of IDs 91 | # at the same time (recommended by NCBI) 92 | for _ids in _chunker(esearch_response.result.uids, BATCH_SIZE): 93 | if elink: 94 | el = run_ids_pipeline.add_link( 95 | {"db": "sra", "dbfrom": db, "id": _ids, "link": False}, 96 | analyzer=ElinkAnalyzer(), 97 | ) 98 | else: 99 | el = None 100 | 101 | # given SRA run IDs, fetch all metadata 102 | 
efetch_params = { 103 | "rettype": "docsum", 104 | "retmode": "xml", 105 | "reqsize": BATCH_SIZE, 106 | "retmax": len(_ids), 107 | } 108 | if not elink: 109 | # we need to specify these manually as in this scenario 110 | # EFetch is not linked to anything 111 | efetch_params.update({"id": _ids, "db": db}) 112 | 113 | run_ids_pipeline.add_fetch( 114 | efetch_params, analyzer=EFetchAnalyzer(log_level), dependency=el 115 | ) 116 | 117 | econduit.run(run_ids_pipeline) 118 | 119 | # recover run IDs from all instances of EFetchAnalyzer 120 | all_run_ids = [] 121 | for x in econduit.analyzers.values(): 122 | if isinstance(x, EFetchAnalyzer): 123 | all_run_ids.extend(x.result.metadata) 124 | 125 | return sorted(all_run_ids) 126 | -------------------------------------------------------------------------------- /q2_fondue/types/_format.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
class SRAMetadataFormat(model.TextFileFormat):
    """Tab-separated SRA run metadata table.

    Validation requires a fixed set of header fields to be present and
    requires that every row carries a value in each of the ID columns.
    """

    REQUIRED_IDS = [
        "ID",
        "Biosample ID",
        "Bioproject ID",
        "Experiment ID",
        "Study ID",
        "Sample Accession",
    ]
    REQUIRED_HEADER_FIELDS = [
        "Organism",
        "Instrument",
        "Platform",
        "Bases",
        "Bytes",
        "Public",
        "Library Selection",
        "Library Source",
        "Library Layout",
    ]
    REQUIRED_HEADER_FIELDS.extend(REQUIRED_IDS)

    def _validate(self):
        meta = pd.read_csv(str(self), sep="\t")

        # every required header field must appear as a column
        missing_cols = [
            field for field in self.REQUIRED_HEADER_FIELDS
            if field not in meta.columns
        ]
        if missing_cols:
            raise ValidationError(
                "Some required columns are missing from the metadata file: "
                f'{", ".join(missing_cols)}.'
            )

        # some IDs must be present in all samples
        na_counts = meta.isnull().sum(axis=0)[self.REQUIRED_IDS]
        missing_ids = na_counts[na_counts > 0].index.tolist()
        if missing_ids:
            raise ValidationError(
                "Some samples are missing IDs in the following fields: "
                f'{", ".join(missing_ids)}.'
            )

    def _validate_(self, level):
        # QIIME 2 entry point; the same checks run at every level
        self._validate()
74 | """ 75 | 76 | def _validate_(self, level): 77 | df = pd.read_csv(str(self), sep="\t", index_col=0) 78 | 79 | if df.shape[1] > 1: 80 | raise ValidationError( 81 | "Failed IDs artifact should only contain a single column " 82 | "with error message for the runs that could not be fetched " 83 | "(indexed by run ID)." 84 | ) 85 | 86 | 87 | SRAFailedIDsDirFmt = model.SingleFileDirectoryFormat( 88 | "SRAFailedIDsDirFmt", "sra-failed-ids.tsv", SRAFailedIDsFormat 89 | ) 90 | 91 | 92 | class NCBIAccessionIDsFormat(model.TextFileFormat): 93 | """ 94 | This is a format used to store a list of SRA accession IDs (run, 95 | study, BioProject, sample and experiment IDs), which can be converted 96 | to QIIME's metadata. Artifacts containing of run, study and BioProject 97 | IDs can be input into any fondue action. 98 | """ 99 | 100 | ALLOWED_PREFIXES = tuple( 101 | itertools.chain( 102 | *[ 103 | v 104 | for k, v in PREFIX.items() 105 | if k in ("bioproject", "run", "study", "sample", "experiment") 106 | ] 107 | ) 108 | ) 109 | 110 | def _validate_id(self, _id: str): 111 | if not _id.startswith(self.ALLOWED_PREFIXES): 112 | raise ValidationError( 113 | "Some of the provided IDs are invalid - only SRA run, study, " 114 | "BioProject, sample and experiment IDs are allowed. Please " 115 | "check your input and try again." 116 | ) 117 | 118 | def _validate_(self, level): 119 | df = pd.read_csv(str(self), sep="\t") 120 | cols = df.columns.tolist() 121 | 122 | if df.shape[1] > 2 or ( 123 | df.shape[1] == 2 and not any(x in cols for x in ["doi", "DOI"]) 124 | ): 125 | raise ValidationError( 126 | "NCBI Accession IDs artifact should only contain a single " 127 | "column with IDs of the SRA runs, studies or NCBI's " 128 | "BioProjects and an optional column `doi` with " 129 | "associated DOIs." 
class FakeESAnalyzer:
    """Minimal stand-in for an ESearch analyzer used by the tests.

    Exposes the attributes the code under test reads from a real
    analyzer (`uids`, `log_level` and a nested `result.result` series).
    """

    def __init__(self, uids):
        # keep the queried UIDs and mimic a finished analyzer
        self.uids = uids
        self.log_level = "INFO"
        fake_result = MagicMock()
        fake_result.result = pd.Series(data=[6, 6], index=["ABC", "123"])
        self.result = fake_result
pd.testing.assert_series_equal(exp, obs) 37 | 38 | def test_esresult_parse_search_results_ambiguous(self): 39 | esearch_result = self.generate_es_result("single", "_ambiguous") 40 | esearch_result.parse_search_results( 41 | self.json_to_response("single", "_ambiguous"), ["SR012"] 42 | ) 43 | 44 | obs = esearch_result.result 45 | exp = pd.Series(data=[7], index=["SR012"], name="count") 46 | pd.testing.assert_series_equal(exp, obs) 47 | 48 | def test_esresult_parse_search_results_multi(self): 49 | esearch_result = self.generate_es_result("multi", "_correct") 50 | esearch_result.parse_search_results( 51 | self.json_to_response("multi", "_correct"), 52 | ["SRR000001", "SRR000013", "ERR3978173"], 53 | ) 54 | 55 | obs = esearch_result.result 56 | exp = pd.Series( 57 | data=[1, 1, 1], index=["SRR000001", "SRR000013", "ERR3978173"], name="count" 58 | ) 59 | pd.testing.assert_series_equal(exp, obs) 60 | 61 | def test_esresult_parse_search_results_multi_invalid(self): 62 | esearch_result = self.generate_es_result("multi", "_invalid") 63 | esearch_result.parse_search_results( 64 | self.json_to_response("multi", "_invalid"), ["ABCD123", "SRR001"] 65 | ) 66 | 67 | obs = esearch_result.result 68 | exp = pd.Series(data=[0, 0], index=["ABCD123", "SRR001"], name="count") 69 | pd.testing.assert_series_equal(exp, obs) 70 | 71 | def test_esresult_parse_search_results_multi_mixed(self): 72 | esearch_result = self.generate_es_result("multi", "_mixed") 73 | esearch_result.parse_search_results( 74 | self.json_to_response("multi", "_mixed"), 75 | ["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"], 76 | ) 77 | 78 | obs = esearch_result.result 79 | exp = pd.Series( 80 | data=[1, 1, 7, 0, 0], 81 | index=["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"], 82 | name="count", 83 | ) 84 | pd.testing.assert_series_equal(exp, obs) 85 | 86 | def test_esresult_validate_result_single(self): 87 | esearch_result = self.generate_es_result("single", "_correct") 88 | esearch_result.result = 
pd.Series(data=[1], index=["SRR000001"], name="count") 89 | 90 | obs = esearch_result.validate_result() 91 | self.assertDictEqual(obs, {}) 92 | 93 | def test_esresult_validate_result_single_ambiguous(self): 94 | esearch_result = self.generate_es_result("single", "_ambiguous") 95 | esearch_result.result = pd.Series(data=[7], index=["SR012"], name="count") 96 | 97 | obs = esearch_result.validate_result() 98 | exp = {"SR012": "ID is ambiguous."} 99 | self.assertDictEqual(obs, exp) 100 | 101 | def test_esresult_validate_result_multi(self): 102 | esearch_result = self.generate_es_result("multi", "_correct") 103 | esearch_result.result = pd.Series( 104 | data=[1, 1, 1], index=["SRR000001", "SRR000013", "ERR3978173"], name="count" 105 | ) 106 | 107 | obs = esearch_result.validate_result() 108 | self.assertDictEqual(obs, {}) 109 | 110 | def test_esresult_validate_result_multi_invalid(self): 111 | esearch_result = self.generate_es_result("multi", "_invalid") 112 | esearch_result.result = pd.Series( 113 | data=[0, 0], index=["ABCD123", "SRR001"], name="count" 114 | ) 115 | 116 | obs = esearch_result.validate_result() 117 | exp = {"ABCD123": "ID is invalid.", "SRR001": "ID is invalid."} 118 | self.assertDictEqual(obs, exp) 119 | 120 | def test_esresult_validate_result_multi_mixed(self): 121 | esearch_result = self.generate_es_result("multi", "_mixed") 122 | esearch_result.result = pd.Series( 123 | data=[1, 1, 7, 0, 0], 124 | index=["SRR000001", "SRR000013", "SR012", "ABCD123", "SRR001"], 125 | name="count", 126 | ) 127 | 128 | obs = esearch_result.validate_result() 129 | exp = { 130 | "SR012": "ID is ambiguous.", 131 | "ABCD123": "ID is invalid.", 132 | "SRR001": "ID is invalid.", 133 | } 134 | self.assertDictEqual(obs, exp) 135 | 136 | def test_esanalyzer_analyze_result(self): 137 | es_analyzer = ESearchAnalyzer(["SRR000001"]) 138 | es_analyzer.analyze_result( 139 | response=self.json_to_response("single", "_correct"), 140 | request=self.generate_es_request("SRR000001"), 141 
| ) 142 | 143 | self.assertTrue(isinstance(es_analyzer.result, ESearchResult)) 144 | 145 | 146 | if __name__ == "__main__": 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /.github/workflows/docker-push.yaml: -------------------------------------------------------------------------------- 1 | name: Docker push 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["CI"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | push-docker-images: 11 | runs-on: ubuntu-latest 12 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 13 | steps: 14 | - name: Download build metadata 15 | uses: actions/github-script@v7 16 | with: 17 | script: | 18 | // Get artifacts from the triggering workflow run 19 | const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ 20 | owner: context.repo.owner, 21 | repo: context.repo.repo, 22 | run_id: context.payload.workflow_run.id, 23 | }); 24 | 25 | // Find build metadata artifact 26 | const metadataArtifact = artifacts.data.artifacts.find( 27 | artifact => artifact.name === 'build-metadata' 28 | ); 29 | 30 | if (!metadataArtifact) { 31 | console.log('No build metadata found, skipping Docker push'); 32 | return; 33 | } 34 | 35 | // Download the metadata 36 | const download = await github.rest.actions.downloadArtifact({ 37 | owner: context.repo.owner, 38 | repo: context.repo.repo, 39 | artifact_id: metadataArtifact.id, 40 | archive_format: 'zip', 41 | }); 42 | 43 | const fs = require('fs'); 44 | fs.writeFileSync('metadata.zip', Buffer.from(download.data)); 45 | 46 | - name: Extract and parse metadata 47 | id: metadata 48 | run: | 49 | if [ -f "metadata.zip" ]; then 50 | unzip metadata.zip 51 | if [ -f "build-metadata.json" ]; then 52 | # Parse JSON and set outputs 53 | echo "repository=$(jq -r '.repository' build-metadata.json)" >> $GITHUB_OUTPUT 54 | echo "sha=$(jq -r '.sha' build-metadata.json)" >> $GITHUB_OUTPUT 55 | echo "short-sha=$(jq -r '.short_sha' 
build-metadata.json)" >> $GITHUB_OUTPUT 56 | echo "epoch=$(jq -r '.epoch' build-metadata.json)" >> $GITHUB_OUTPUT 57 | echo "ref=$(jq -r '.ref' build-metadata.json)" >> $GITHUB_OUTPUT 58 | echo "event-name=$(jq -r '.event_name' build-metadata.json)" >> $GITHUB_OUTPUT 59 | echo "pr-number=$(jq -r '.pr_number' build-metadata.json)" >> $GITHUB_OUTPUT 60 | echo "tag-name=$(jq -r '.tag_name' build-metadata.json)" >> $GITHUB_OUTPUT 61 | echo "is-tag-push=$(jq -r '.is_tag_push' build-metadata.json)" >> $GITHUB_OUTPUT 62 | echo "build-pr-image=$(jq -r '.build_pr_image' build-metadata.json)" >> $GITHUB_OUTPUT 63 | echo "is-main-push=$(jq -r '.is_main_push' build-metadata.json)" >> $GITHUB_OUTPUT 64 | echo "has-metadata=true" >> $GITHUB_OUTPUT 65 | 66 | # Display metadata for debugging 67 | echo "Build metadata:" 68 | cat build-metadata.json | jq . 69 | else 70 | echo "has-metadata=false" >> $GITHUB_OUTPUT 71 | fi 72 | else 73 | echo "has-metadata=false" >> $GITHUB_OUTPUT 74 | fi 75 | 76 | - name: Set up Docker Buildx 77 | if: steps.metadata.outputs.has-metadata == 'true' 78 | uses: docker/setup-buildx-action@v3 79 | 80 | - name: Login to the remote registry 81 | if: steps.metadata.outputs.has-metadata == 'true' 82 | uses: docker/login-action@v3 83 | with: 84 | registry: quay.io 85 | username: ${{ secrets.DOCKER_USERNAME }} 86 | password: ${{ secrets.DOCKER_PASSWORD }} 87 | 88 | - name: Download test image artifact 89 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.build-pr-image == 'true' 90 | uses: actions/download-artifact@v4 91 | with: 92 | name: test-docker-image 93 | path: . 
94 | run-id: ${{ github.event.workflow_run.id }} 95 | github-token: ${{ secrets.GITHUB_TOKEN }} 96 | 97 | - name: Load and push test image 98 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.build-pr-image == 'true' 99 | run: | 100 | if [ -f "test-image.tar.gz" ]; then 101 | # Load the image 102 | docker load < test-image.tar.gz 103 | 104 | # Determine the tag based on event type 105 | if [ "${{ steps.metadata.outputs.event-name }}" = "pull_request" ]; then 106 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:pr-${{ steps.metadata.outputs.pr-number }}-${{ steps.metadata.outputs.short-sha }}" 107 | else 108 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:test-${{ steps.metadata.outputs.short-sha }}" 109 | fi 110 | 111 | # Re-tag and push 112 | docker tag ${{ steps.metadata.outputs.sha }} "$TAG" 113 | docker push "$TAG" 114 | echo "Pushed test image: $TAG" 115 | else 116 | echo "No test image artifact found" 117 | fi 118 | 119 | - name: Download production image artifact 120 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.is-main-push == 'true' 121 | uses: actions/download-artifact@v4 122 | with: 123 | name: prod-docker-image 124 | path: . 
125 | run-id: ${{ github.event.workflow_run.id }} 126 | github-token: ${{ secrets.GITHUB_TOKEN }} 127 | 128 | - name: Load and push production image 129 | if: steps.metadata.outputs.has-metadata == 'true' && steps.metadata.outputs.is-main-push == 'true' 130 | run: | 131 | if [ -f "prod-image.tar" ]; then 132 | # Load the image 133 | docker load < prod-image.tar 134 | 135 | # Determine the tag based on whether this is a tag push or main branch push 136 | if [ "${{ steps.metadata.outputs.is-tag-push }}" = "true" ]; then 137 | # For tag pushes, use just the tag name (no hash suffix) 138 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:${{ steps.metadata.outputs.tag-name }}" 139 | else 140 | # For main branch pushes, use epoch + hash 141 | TAG="quay.io/bokulichlab/${{ steps.metadata.outputs.repository }}:${{ steps.metadata.outputs.epoch }}-${{ steps.metadata.outputs.short-sha }}" 142 | fi 143 | 144 | docker tag temp-prod-image "$TAG" 145 | docker push "$TAG" 146 | echo "Pushed production image: $TAG" 147 | else 148 | echo "No production image artifact found" 149 | fi -------------------------------------------------------------------------------- /q2_fondue/tests/_utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
class FakeParams:
    """Bare-bones stand-in for entrezpy request parameter objects.

    Carries every attribute that the EFetch/ESearch request constructors
    read, without any of the real parameter validation.
    """

    def __init__(
        self,
        temp_dir,
        uids=None,
        term=None,
        eutil="efetch.cgi",
        rettype="xml",
        retmode="xml",
    ):
        # identification / query
        self.query_id = "some-id-123"
        self.term = term
        self.uids = uids
        self.db = "sra"
        self.dbfrom = "sra"
        self.eutil = eutil
        # history server (unused by the fakes)
        self.usehistory = False
        self.webenv = None
        self.querykey = 0
        # linking options
        self.cmd = None
        self.linkname = None
        self.holding = False
        self.doseq = None
        # id/date filtering (unused)
        self.idtype = None
        self.datetype = None
        self.reldate = None
        self.mindate = None
        self.maxdate = None
        # retrieval options
        self.rettype = rettype
        self.retmode = retmode
        self.retstart = 0
        self.retmax = 0
        self.strand = None
        self.sort = None
        self.field = None
        self.seqstart = None
        self.seqstop = None
        self.complexity = None
        self.temp_dir = temp_dir
"retmode", 84 | "strand", 85 | "seqstart", 86 | "seqstop", 87 | "complexity", 88 | } 89 | self.esearch_request_properties = {"db", "eutil", "webenv", "retmode", "term"} 90 | self.library_meta = LibraryMetadata( 91 | name="unspecified", layout="SINGLE", selection="PCR", source="METAGENOMIC" 92 | ) 93 | with open(self.get_data_path("metadata_response_small.json"), "r") as ff: 94 | self.metadata_dict = json.load(ff) 95 | self.maxDiff = None 96 | self.fake_logger = logging.getLogger("test_log") 97 | 98 | def xml_to_response(self, kind, suffix="", prefix="metadata"): 99 | path = self.get_data_path(f"{prefix}_response_{kind}{suffix}.xml") 100 | response = io.open(path, "rb", buffering=0) 101 | return response 102 | 103 | def json_to_response(self, kind, suffix="", raw=False, utility="esearch"): 104 | path = self.get_data_path(f"{utility}_response_{kind}{suffix}.json") 105 | response = io.open(path, "rb", buffering=0) 106 | if raw: 107 | return response 108 | else: 109 | return json.loads(io.open(path, "rb", buffering=0).read()) 110 | 111 | def generate_ef_request(self, uids, start=0, size=1): 112 | request_params = FakeParams(self.temp_dir.name, uids=uids) 113 | return EfetchRequest( 114 | eutil="efetch.fcgi", parameter=request_params, start=start, size=size 115 | ) 116 | 117 | def generate_ef_result(self, kind, prefix="metadata"): 118 | return EFetchResult( 119 | response=self.xml_to_response(kind, prefix=prefix), 120 | request=self.generate_ef_request(["FAKEID1", "FAKEID2"]), 121 | log_level="INFO", 122 | ) 123 | 124 | def generate_sra_metadata(self): 125 | study_id, sample_id = "ERP120343", "ERS4372624" 126 | experiment_id, run_ids = "ERX3980916", ["FAKEID1", "FAKEID2"] 127 | study = SRAStudy( 128 | id=study_id, 129 | bioproject_id="PRJEB37054", 130 | center_name="University of Hohenheim", 131 | custom_meta={ 132 | "ENA-FIRST-PUBLIC [STUDY]": "2020-05-31", 133 | "ENA-LAST-UPDATE [STUDY]": "2020-03-04", 134 | }, 135 | ) 136 | sample = SRASample( 137 | id=sample_id, 138 
| biosample_id="SAMEA6608408", 139 | name="BAC1.D1.0.32A", 140 | title="Vitis vinifera", 141 | organism="Vitis vinifera", 142 | tax_id="29760", 143 | study_id=study_id, 144 | custom_meta={ 145 | "environment (biome) [SAMPLE]": "berry plant", 146 | "geographic location (country and/or sea) [SAMPLE]": "Germany", 147 | "sample storage temperature [SAMPLE]": "-80", 148 | }, 149 | ) 150 | experiment = SRAExperiment( 151 | id=experiment_id, 152 | instrument="Illumina MiSeq", 153 | platform="ILLUMINA", 154 | library=self.library_meta, 155 | sample_id=sample_id, 156 | custom_meta={"Temperature [EXPERIMENT]": "12", "Depth [EXPERIMENT]": "500"}, 157 | ) 158 | runs = [ 159 | SRARun( 160 | id=_id, 161 | bases=11552099, 162 | spots=39323, 163 | public=True, 164 | bytes=3914295, 165 | experiment_id=experiment_id, 166 | custom_meta={ 167 | "ENA-FIRST-PUBLIC [RUN]": "2020-05-31", 168 | "ENA-LAST-UPDATE [RUN]": "2020-03-06", 169 | }, 170 | ) 171 | for _id in run_ids 172 | ] 173 | return study, sample, experiment, runs 174 | 175 | def generate_expected_df(self): 176 | exp_df = pd.read_json( 177 | path_or_buf=self.get_data_path("metadata_processed_multi.json"), 178 | orient="index", 179 | ) 180 | exp_df.index.name = "ID" 181 | numeric_cols = { 182 | "Amount or size of sample collected [sample]", 183 | "Collection day [sample]", 184 | "Collection hours [sample]", 185 | "Sample storage temperature [sample]", 186 | "Tax ID", 187 | "Sample volume or weight for dna extraction [sample]", 188 | } 189 | exp_df["Public"] = exp_df["Public"].astype(bool) 190 | for col in numeric_cols: 191 | exp_df[col] = exp_df[col].astype(str) 192 | return exp_df 193 | 194 | def generate_es_request(self, term, start=0, size=1): 195 | request_params = FakeParams( 196 | self.temp_dir.name, retmode="json", term=term, eutil="esearch.fcgi" 197 | ) 198 | return EsearchRequest( 199 | eutil="esearch.fcgi", parameter=request_params, start=start, size=size 200 | ) 201 | 202 | def generate_es_result(self, kind, suffix): 
203 | return ESearchResult( 204 | response=self.json_to_response(kind, suffix, utility="esearch")[ 205 | "esearchresult" 206 | ], 207 | request=self.generate_es_request(term="abc OR 123"), 208 | ) 209 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/efetch_response_runs.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 13481774 8 | <Summary><Title>18</Title><Platform 9 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 10 | total_runs="1" total_spots="63703" total_bases="38349206" 11 | total_size="22735317" load_done="true" cluster_name="public"/></Summary><Submitter 12 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 13 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 14 | of Animal Nutritio"/><Experiment acc="SRX10339760" ver="1" 15 | status="public" name="18"/><Study acc="SRP310597" name="PRJNA 16 | Chuanzhong black lamb Raw sequence reads"/><Organism 17 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 18 | acc="SRS8459117" name=""/><Instrument ILLUMINA="Illumina 19 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>18</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 20 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309312</Biosample> 21 | 22 | <Run acc="SRR13961771" 23 | total_spots="63703" total_bases="38349206" load_done="true" 24 | is_public="true" cluster_name="public" 25 | static_data_available="true"/> 26 | 27 | 28 | 2021/03/17 29 | 2021/03/15 30 | 31 | 32 | 33 | 4 34 | <Summary><Title>454 35 | sequencing of Human HapMap individual NA18505 genomic paired-end 36 | library</Title><Platform instrument_model="454 GS FLX">LS454</Platform><Statistics 37 | total_runs="10" total_spots="4703662" 
total_bases="1306798474" 38 | total_size="3205056622" load_done="true" 39 | static_data_available="true" cluster_name="public"/></Summary><Submitter 40 | acc="SRA000197" center_name="454MSC" contact_name="Chris OSullivan" 41 | lab_name=""/><Experiment acc="SRX000003" ver="10" 42 | status="public" name="454 sequencing of Human HapMap individual 43 | NA18505 genomic paired-end library"/><Study acc="SRP000001" 44 | name="Paired-end mapping reveals extensive structural variation in 45 | the human genome"/><Organism taxid="9606" 46 | ScientificName="Homo sapiens"/><Sample acc="SRS000100" 47 | name=""/><Instrument LS454="454 GS FLX"/><Library_descriptor><LIBRARY_NAME 48 | xmlns="">SID2699</LIBRARY_NAME><LIBRARY_STRATEGY 49 | xmlns="">WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE xmlns="">GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION 50 | xmlns="">RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT 51 | xmlns=""> <PAIRED NOMINAL_LENGTH="3000"/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA33627</Bioproject><Biosample>SAMN00001583</Biosample> 52 | 53 | <Run acc="SRR000007" 54 | total_spots="633196" total_bases="175275395" load_done="true" 55 | is_public="true" cluster_name="public" 56 | static_data_available="true"/><Run acc="SRR000018" 57 | total_spots="626624" total_bases="174403220" load_done="true" 58 | is_public="true" cluster_name="public" 59 | static_data_available="true"/><Run acc="SRR000020" 60 | total_spots="374556" total_bases="103411232" load_done="true" 61 | is_public="true" cluster_name="public" 62 | static_data_available="true"/><Run acc="SRR000038" 63 | total_spots="529820" total_bases="148389031" load_done="true" 64 | is_public="true" cluster_name="public" 65 | static_data_available="true"/><Run acc="SRR000043" 66 | total_spots="608946" total_bases="168985392" load_done="true" 67 | is_public="true" cluster_name="public" 68 | static_data_available="true"/><Run acc="SRR000046" 69 | total_spots="79047" total_bases="21258857" load_done="true" 70 | is_public="true" 
cluster_name="public" 71 | static_data_available="true"/><Run acc="SRR000048" 72 | total_spots="640737" total_bases="177619279" load_done="true" 73 | is_public="true" cluster_name="public" 74 | static_data_available="true"/><Run acc="SRR000050" 75 | total_spots="547349" total_bases="153260655" load_done="true" 76 | is_public="true" cluster_name="public" 77 | static_data_available="true"/><Run acc="SRR000057" 78 | total_spots="76744" total_bases="21203932" load_done="true" 79 | is_public="true" cluster_name="public" 80 | static_data_available="true"/><Run acc="SRR000058" 81 | total_spots="586643" total_bases="162991481" load_done="true" 82 | is_public="true" cluster_name="public" 83 | static_data_available="true"/> 84 | 85 | 86 | 2008/04/04 87 | 2015/04/09 88 | 89 | 90 | 91 | 13481786 92 | <Summary><Title>12</Title><Platform 93 | instrument_model="Illumina HiSeq 3000">ILLUMINA</Platform><Statistics 94 | total_runs="1" total_spots="59130" total_bases="35596260" 95 | total_size="21079845" load_done="true" cluster_name="public"/></Summary><Submitter 96 | acc="SRA1206349" center_name="Jiangxi Agricultural University" 97 | contact_name="huan chen" lab_name="Jiangxi Province Key Laboratory 98 | of Animal Nutritio"/><Experiment acc="SRX10339772" ver="1" 99 | status="public" name="12"/><Study acc="SRP310597" name="PRJNA 100 | Chuanzhong black lamb Raw sequence reads"/><Organism 101 | taxid="1904483" ScientificName="sheep gut metagenome"/><Sample 102 | acc="SRS8459130" name=""/><Instrument ILLUMINA="Illumina 103 | HiSeq 3000"/><Library_descriptor><LIBRARY_NAME>12</LIBRARY_NAME><LIBRARY_STRATEGY>AMPLICON</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>PCR</LIBRARY_SELECTION><LIBRARY_LAYOUT> 104 | <PAIRED/> </LIBRARY_LAYOUT></Library_descriptor><Bioproject>PRJNA707607</Bioproject><Biosample>SAMN18309306</Biosample> 105 | 106 | <Run acc="SRR13961759" 107 | total_spots="59130" total_bases="35596260" load_done="true" 108 | is_public="true" 
cluster_name="public" 109 | static_data_available="true"/> 110 | 111 | 112 | 2021/03/17 113 | 2021/03/15 114 | 115 | 116 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/scraper_items_no_attach.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "key": "CP4ED2CY", 3 | "version": 1257, 4 | "library": { 5 | "type": "user", 6 | "id": 12345, 7 | "name": "username", 8 | "links": { 9 | "alternate": { 10 | "href": "https://www.zotero.org/username", 11 | "type": "text/html" 12 | } 13 | } 14 | }, 15 | "links": { 16 | "self": { 17 | "href": "https://api.zotero.org/users/12345/items/CP4ED2CY", 18 | "type": "application/json" 19 | }, 20 | "alternate": { 21 | "href": "https://www.zotero.org/username/items/CP4ED2CY", 22 | "type": "text/html" 23 | }, 24 | "attachment": { 25 | "href": "https://api.zotero.org/users/12345/items/DMJ4AQ48", 26 | "type": "application/json", 27 | "attachmentType": "application/pdf", 28 | "attachmentSize": 3648434 29 | } 30 | }, 31 | "meta": { 32 | "creatorSummary": "Pruski et al.", 33 | "parsedDate": "2021-10-13", 34 | "numChildren": 2 35 | }, 36 | "data": { 37 | "key": "CP4ED2CY", 38 | "version": 1257, 39 | "itemType": "journalArticle", 40 | "title": "Direct on-swab metabolic profiling of vaginal microbiome host interactions during pregnancy and preterm birth", 41 | "creators": [ 42 | { 43 | "creatorType": "author", 44 | "firstName": "Pamela", 45 | "lastName": "Pruski" 46 | }, 47 | { 48 | "creatorType": "author", 49 | "firstName": "Gonçalo D. 
S.", 50 | "lastName": "Correia" 51 | }, 52 | { 53 | "creatorType": "author", 54 | "firstName": "Holly V.", 55 | "lastName": "Lewis" 56 | }, 57 | { 58 | "creatorType": "author", 59 | "firstName": "Katia", 60 | "lastName": "Capuccini" 61 | }, 62 | { 63 | "creatorType": "author", 64 | "firstName": "Paolo", 65 | "lastName": "Inglese" 66 | }, 67 | { 68 | "creatorType": "author", 69 | "firstName": "Denise", 70 | "lastName": "Chan" 71 | }, 72 | { 73 | "creatorType": "author", 74 | "firstName": "Richard G.", 75 | "lastName": "Brown" 76 | }, 77 | { 78 | "creatorType": "author", 79 | "firstName": "Lindsay", 80 | "lastName": "Kindinger" 81 | }, 82 | { 83 | "creatorType": "author", 84 | "firstName": "Yun S.", 85 | "lastName": "Lee" 86 | }, 87 | { 88 | "creatorType": "author", 89 | "firstName": "Ann", 90 | "lastName": "Smith" 91 | }, 92 | { 93 | "creatorType": "author", 94 | "firstName": "Julian", 95 | "lastName": "Marchesi" 96 | }, 97 | { 98 | "creatorType": "author", 99 | "firstName": "Julie A. K.", 100 | "lastName": "McDonald" 101 | }, 102 | { 103 | "creatorType": "author", 104 | "firstName": "Simon", 105 | "lastName": "Cameron" 106 | }, 107 | { 108 | "creatorType": "author", 109 | "firstName": "Kate", 110 | "lastName": "Alexander-Hardiman" 111 | }, 112 | { 113 | "creatorType": "author", 114 | "firstName": "Anna L.", 115 | "lastName": "David" 116 | }, 117 | { 118 | "creatorType": "author", 119 | "firstName": "Sarah J.", 120 | "lastName": "Stock" 121 | }, 122 | { 123 | "creatorType": "author", 124 | "firstName": "Jane E.", 125 | "lastName": "Norman" 126 | }, 127 | { 128 | "creatorType": "author", 129 | "firstName": "Vasso", 130 | "lastName": "Terzidou" 131 | }, 132 | { 133 | "creatorType": "author", 134 | "firstName": "T. 
G.", 135 | "lastName": "Teoh" 136 | }, 137 | { 138 | "creatorType": "author", 139 | "firstName": "Lynne", 140 | "lastName": "Sykes" 141 | }, 142 | { 143 | "creatorType": "author", 144 | "firstName": "Phillip R.", 145 | "lastName": "Bennett" 146 | }, 147 | { 148 | "creatorType": "author", 149 | "firstName": "Zoltan", 150 | "lastName": "Takats" 151 | }, 152 | { 153 | "creatorType": "author", 154 | "firstName": "David A.", 155 | "lastName": "MacIntyre" 156 | } 157 | ], 158 | "abstractNote": "The pregnancy vaginal microbiome contributes to risk of preterm birth, the primary cause of death in children under 5 years of age. Here we describe direct on-swab metabolic profiling by Desorption Electrospray Ionization Mass Spectrometry (DESI-MS) for sample preparation-free characterisation of the cervicovaginal metabolome in two independent pregnancy cohorts (VMET, n\u2009=\u2009160; 455 swabs; VMET II, n\u2009=\u2009205; 573 swabs). By integrating metataxonomics and immune profiling data from matched samples, we show that specific metabolome signatures can be used to robustly predict simultaneously both the composition of the vaginal microbiome and host inflammatory status. In these patients, vaginal microbiota instability and innate immune activation, as predicted using DESI-MS, associated with preterm birth, including in women receiving cervical cerclage for preterm birth prevention. 
These findings highlight direct on-swab metabolic profiling by DESI-MS as an innovative approach for preterm birth risk stratification through rapid assessment of vaginal microbiota-host dynamics.", 159 | "publicationTitle": "Nature Communications", 160 | "volume": "12", 161 | "issue": "1", 162 | "pages": "5967", 163 | "date": "2021-10-13", 164 | "series": "", 165 | "seriesTitle": "", 166 | "seriesText": "", 167 | "journalAbbreviation": "Nat Commun", 168 | "language": "en", 169 | "DOI": "10.1038/s41467-021-26215-w", 170 | "ISSN": "2041-1723", 171 | "shortTitle": "", 172 | "url": "https://www.nature.com/articles/s41467-021-26215-w", 173 | "accessDate": "2021-11-10T07:04:46Z", 174 | "archive": "", 175 | "archiveLocation": "", 176 | "libraryCatalog": "www.nature.com", 177 | "callNumber": "", 178 | "rights": "2021 The Author(s)", 179 | "extra": "Bandiera_abtest: a\nCc_license_type: cc_by\nCg_type: Nature Research Journals\nNumber: 1\nPrimary_atype: Research\nPublisher: Nature Publishing Group\nSubject_term: Infectious-disease diagnostics;Predictive markers;Risk factors;Translational research\nSubject_term_id: infectious-disease-diagnostics;predictive-markers;risk-factors;translational-research", 180 | "tags": [ 181 | { 182 | "tag": "Infectious-disease diagnostics", 183 | "type": 1 184 | }, 185 | { 186 | "tag": "Predictive markers", 187 | "type": 1 188 | }, 189 | { 190 | "tag": "Risk factors", 191 | "type": 1 192 | }, 193 | { 194 | "tag": "Translational research", 195 | "type": 1 196 | } 197 | ], 198 | "collections": [ 199 | "DCHC4FUN" 200 | ], 201 | "relations": {}, 202 | "dateAdded": "2021-11-10T07:04:46Z", 203 | "dateModified": "2021-11-10T07:04:46Z" 204 | } 205 | }] -------------------------------------------------------------------------------- /q2_fondue/utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich 
Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | import gzip 9 | import os 10 | import shutil 11 | import signal 12 | import subprocess 13 | from typing import List 14 | 15 | from entrezpy.esearch import esearcher as es 16 | from q2_types.per_sample_sequences import CasavaOneEightSingleLanePerSampleDirFmt 17 | from qiime2 import Artifact 18 | 19 | from q2_fondue.entrezpy_clients._esearch import ESearchAnalyzer 20 | from q2_fondue.entrezpy_clients._utils import ( 21 | PREFIX, 22 | InvalidIDs, 23 | set_up_logger, 24 | set_up_entrezpy_logging, 25 | ) 26 | 27 | LOGGER = set_up_logger("INFO", logger_name=__name__) 28 | 29 | 30 | class DownloadError(Exception): 31 | pass 32 | 33 | 34 | def _chunker(seq, size): 35 | # source: https://stackoverflow.com/a/434328/579416 36 | return (seq[pos : pos + size] for pos in range(0, len(seq), size)) 37 | 38 | 39 | def _validate_run_ids( 40 | email: str, n_jobs: int, run_ids: List[str], log_level: str 41 | ) -> dict: 42 | """Validates provided accession IDs using ESearch. 43 | 44 | Args: 45 | email (str): A valid e-mail address. 46 | n_jobs (int): Number of threads to be used in parallel. 47 | run_ids (List[str]): List of all the run IDs to be validated. 48 | log_level (str): Logging level. 49 | 50 | Returns: 51 | dict: Dictionary of invalid IDs (as keys) with a description. 
52 | """ 53 | # must process in batches because esearch requests with 54 | # runID count > 10'000 fail 55 | invalid_ids = {} 56 | for batch in _chunker(run_ids, 10000): 57 | esearcher = es.Esearcher( 58 | "esearcher", email, apikey=None, apikey_var=None, threads=0, qid=None 59 | ) 60 | set_up_entrezpy_logging(esearcher, log_level) 61 | 62 | esearch_response = esearcher.inquire( 63 | {"db": "sra", "term": " OR ".join(batch), "usehistory": False}, 64 | analyzer=ESearchAnalyzer(batch), 65 | ) 66 | invalid_ids.update(esearch_response.result.validate_result()) 67 | 68 | return invalid_ids 69 | 70 | 71 | def _determine_id_type(ids: list): 72 | ids = [x[:3] for x in ids] 73 | for kind in PREFIX.keys(): 74 | if all([x in PREFIX[kind] for x in ids]): 75 | return kind 76 | raise InvalidIDs( 77 | "The type of provided IDs is either not supported or " 78 | "IDs of mixed types were provided. Please provide IDs " 79 | "corresponding to either SRA run (#S|E|DRR), study " 80 | "(#S|E|DRP) or NCBI BioProject IDs (#PRJ)." 81 | ) 82 | 83 | 84 | def handle_threaded_exception(args): 85 | logger = set_up_logger("DEBUG", logger_name="ThreadedErrorsManager") 86 | msg = "Data fetching was interrupted by the following error: \n" 87 | 88 | if "gaierror is not JSON serializable" in str(args.exc_value): 89 | msg += ( 90 | "EntrezPy failed to connect to NCBI. Please check your " 91 | "internet connection and try again. It may help to wait " 92 | "a few minutes before retrying." 93 | ) 94 | # silence threads exiting correctly 95 | elif issubclass(args.exc_type, SystemExit) and str(args.exc_value) == "0": 96 | return 97 | else: 98 | msg += ( 99 | f'Caught {args.exc_type} with value "{args.exc_value}" ' 100 | f"in thread {args.thread}" 101 | ) 102 | 103 | logger.exception(msg) 104 | 105 | # This will send a SIGINT to the main thread, which will gracefully 106 | # kill the running Q2 action. No artifacts will be saved. 
107 | os.kill(os.getpid(), signal.SIGINT) 108 | 109 | 110 | def _has_enough_space(acc_id: str, output_dir: str) -> bool: 111 | """Checks whether there is enough storage available for fasterq-dump 112 | to process sequences for a given ID. 113 | 114 | fasterq-dump will be used to check the amount of space required for the 115 | final data. Required space is estimated as 10x that of the final data 116 | (as per NCBI's documentation). 117 | 118 | Args: 119 | acc_id (str): The accession ID to be processed. 120 | output_dir (str): Location where the output would be saved. 121 | 122 | Return 123 | bool: Whether there is enough space available for fasterq-dump tool. 124 | """ 125 | if acc_id is None: 126 | return True 127 | 128 | cmd_fasterq = ["fasterq-dump", "--size-check", "only", "-x", acc_id] 129 | result = subprocess.run(cmd_fasterq, text=True, capture_output=True, cwd=output_dir) 130 | 131 | if result.returncode == 0: 132 | return True 133 | elif result.returncode == 3 and "disk-limit exeeded" in result.stderr: 134 | LOGGER.warning("Not enough space to fetch run %s.", acc_id) 135 | return False 136 | else: 137 | LOGGER.error( 138 | 'fasterq-dump exited with a "%s" error code (the message ' 139 | 'was: "%s"). We will try to fetch the next accession ID.', 140 | result.returncode, 141 | result.stderr, 142 | ) 143 | return True 144 | 145 | 146 | def _rewrite_fastq(file_in: str, file_out: str) -> None: 147 | """Rewrites a FASTQ file with gzip compression. 148 | 149 | Takes an uncompressed FASTQ file and writes it to a new location with 150 | gzip compression. 151 | 152 | Args: 153 | file_in (str): Path to input uncompressed FASTQ file 154 | file_out (str): Path where compressed FASTQ file should be written 155 | """ 156 | with open(file_in, "rb") as f_in, gzip.open(file_out, "wb") as f_out: 157 | shutil.copyfileobj(f_in, f_out) 158 | 159 | 160 | def _is_empty(artifact: Artifact) -> bool: 161 | """Checks if a sequence artifact is empty. 
162 | 163 | Determines if a sequence artifact is empty by checking if all sample IDs 164 | are "xxx", which indicates an empty placeholder artifact. 165 | 166 | Args: 167 | artifact: A QIIME 2 sequence artifact 168 | 169 | Returns: 170 | bool: True if the artifact is empty, False otherwise 171 | """ 172 | samples = artifact.view(CasavaOneEightSingleLanePerSampleDirFmt).manifest.index 173 | return all(sample == "xxx" for sample in samples) 174 | 175 | 176 | def _remove_empty(*artifact_lists) -> tuple: 177 | """Removes empty artifacts from lists of sequence artifacts. 178 | 179 | Takes one or more lists of sequence artifacts and filters out any empty 180 | artifacts (those containing only placeholder 'xxx' samples). Returns 181 | tuple of filtered lists maintaining the same order as input. 182 | 183 | Args: 184 | *artifact_lists: Variable number of lists containing sequence artifacts 185 | to filter 186 | 187 | Returns: 188 | tuple: Tuple of filtered lists with empty artifacts removed, in same 189 | order as input lists 190 | """ 191 | processed_artifacts = [] 192 | for artifacts in artifact_lists: 193 | processed_artifacts.append( 194 | [artifact for artifact in artifacts if not _is_empty(artifact)] 195 | ) 196 | return tuple(processed_artifacts) 197 | 198 | 199 | def _make_empty_artifact(ctx, paired: bool) -> Artifact: 200 | """Creates an empty sequence artifact. 201 | 202 | Creates an empty QIIME 2 sequence artifact containing placeholder files. 203 | For paired-end sequences, creates two empty fastq files (R1 and R2). 204 | For single-end sequences, creates one empty fastq file (R1). 
205 | 206 | Args: 207 | ctx: QIIME 2 plugin context 208 | paired (bool): Whether to create paired-end (True) or 209 | single-end (False) artifact 210 | 211 | Returns: 212 | QIIME 2 artifact: Empty sequence artifact of appropriate type 213 | (paired or single-end) 214 | """ 215 | if paired: 216 | filenames = ["xxx_00_L001_R1_001.fastq.gz", "xxx_00_L001_R2_001.fastq.gz"] 217 | _type = "SampleData[PairedEndSequencesWithQuality]" 218 | else: 219 | filenames = ["xxx_01_L001_R1_001.fastq.gz"] 220 | _type = "SampleData[SequencesWithQuality]" 221 | 222 | casava_out = CasavaOneEightSingleLanePerSampleDirFmt() 223 | for filename in filenames: 224 | with gzip.open(str(casava_out.path / filename), mode="w"): 225 | pass 226 | 227 | return ctx.make_artifact(_type, casava_out) 228 | -------------------------------------------------------------------------------- /q2_fondue/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
7 | # ---------------------------------------------------------------------------- 8 | import gzip 9 | import os 10 | import signal 11 | import tempfile 12 | import threading 13 | import unittest 14 | from threading import Thread 15 | from unittest.mock import patch, MagicMock 16 | 17 | from q2_types.per_sample_sequences import CasavaOneEightSingleLanePerSampleDirFmt 18 | from qiime2 import Artifact 19 | from qiime2.plugin.testing import TestPluginBase 20 | 21 | from q2_fondue.utils import ( 22 | handle_threaded_exception, 23 | _has_enough_space, 24 | _chunker, 25 | _rewrite_fastq, 26 | _is_empty, 27 | _remove_empty, 28 | _make_empty_artifact, 29 | ) 30 | 31 | 32 | class TestExceptHooks(unittest.TestCase): 33 | package = "q2_fondue.tests" 34 | 35 | def do_something_with_error(self, msg): 36 | raise Exception(msg) 37 | 38 | @patch("os.kill") 39 | def test_handle_threaded_exception_gaierror(self, patch_kill): 40 | with self.assertLogs(level="DEBUG", logger="ThreadedErrorsManager") as cm: 41 | threading.excepthook = handle_threaded_exception 42 | error_msg = "Something went wrong: gaierror is " "not JSON serializable." 43 | t = Thread(target=self.do_something_with_error, args=(error_msg,)) 44 | t.start() 45 | t.join() 46 | 47 | self.assertIn("EntrezPy failed to connect to NCBI", cm.output[0]) 48 | 49 | pid = os.getpid() 50 | patch_kill.assert_called_once_with(pid, signal.SIGINT) 51 | 52 | @patch("os.kill") 53 | def test_handle_threaded_exception_other_errors(self, patch_kill): 54 | with self.assertLogs(level="DEBUG", logger="ThreadedErrorsManager") as cm: 55 | threading.excepthook = handle_threaded_exception 56 | error_msg = "Some unknown exception." 
57 | t = Thread(target=self.do_something_with_error, args=(error_msg,)) 58 | t.start() 59 | t.join() 60 | 61 | self.assertIn( 62 | "Caught with value " '"Some unknown exception."', 63 | cm.output[0], 64 | ) 65 | 66 | pid = os.getpid() 67 | patch_kill.assert_called_once_with(pid, signal.SIGINT) 68 | 69 | 70 | class TestSRAUtils(TestPluginBase): 71 | package = "q2_fondue.tests" 72 | 73 | @patch("subprocess.run") 74 | def test_has_enough_space(self, patched_run): 75 | patched_run.return_value = MagicMock(returncode=0) 76 | 77 | acc, test_dir = "ABC123", "some/where" 78 | obs = _has_enough_space(acc, test_dir) 79 | self.assertTrue(obs) 80 | patched_run.assert_called_once_with( 81 | ["fasterq-dump", "--size-check", "only", "-x", acc], 82 | text=True, 83 | capture_output=True, 84 | cwd=test_dir, 85 | ) 86 | 87 | @patch("subprocess.run") 88 | def test_has_enough_space_not(self, patched_run): 89 | with open(self.get_data_path("fasterq-dump-response.txt")) as f: 90 | response = "".join(f.readlines()) 91 | patched_run.return_value = MagicMock(stderr=response, returncode=3) 92 | 93 | acc, test_dir = "ABC123", "some/where" 94 | obs = _has_enough_space(acc, test_dir) 95 | self.assertFalse(obs) 96 | patched_run.assert_called_once_with( 97 | ["fasterq-dump", "--size-check", "only", "-x", acc], 98 | text=True, 99 | capture_output=True, 100 | cwd=test_dir, 101 | ) 102 | 103 | @patch("subprocess.run") 104 | def test_has_enough_space_error(self, patched_run): 105 | patched_run.return_value = MagicMock(stderr="errorX", returncode=8) 106 | 107 | acc, test_dir = "ABC123", "some/where" 108 | with self.assertLogs("q2_fondue.utils", level="ERROR") as cm: 109 | obs = _has_enough_space(acc, test_dir) 110 | self.assertEqual( 111 | cm.output, 112 | [ 113 | 'ERROR:q2_fondue.utils:fasterq-dump exited with a "8" error code ' 114 | '(the message was: "errorX"). We will try to fetch the next ' 115 | "accession ID." 
116 | ], 117 | ) 118 | self.assertTrue(obs) 119 | patched_run.assert_called_once_with( 120 | ["fasterq-dump", "--size-check", "only", "-x", acc], 121 | text=True, 122 | capture_output=True, 123 | cwd=test_dir, 124 | ) 125 | 126 | def test_chunker(self): 127 | obs_out = _chunker(["A", "B", "C"], 2) 128 | exp_out_1 = ["A", "B"] 129 | exp_out_2 = ["C"] 130 | self.assertEqual(next(obs_out), exp_out_1) 131 | self.assertEqual(next(obs_out), exp_out_2) 132 | 133 | def test_chunker_no_chunks(self): 134 | obs_out = _chunker(["A", "B", "C"], 4) 135 | exp_out = ["A", "B", "C"] 136 | self.assertEqual(next(obs_out), exp_out) 137 | 138 | def test_rewrite_fastq(self): 139 | file_in = self.get_data_path("SRR123456.fastq") 140 | file_out = tempfile.NamedTemporaryFile() 141 | 142 | _rewrite_fastq(file_in, file_out.name) 143 | 144 | with open(file_in, "rb") as fin: 145 | with gzip.open(file_out.name, "r") as fout: 146 | for lin, lout in zip(fin.readlines(), fout.readlines()): 147 | self.assertEqual(lin, lout) 148 | 149 | # clean up 150 | file_out.close() 151 | 152 | 153 | class TestSequenceUtils(TestPluginBase): 154 | package = "q2_fondue.tests" 155 | 156 | def test_is_empty_with_empty_artifact(self): 157 | casava_out = CasavaOneEightSingleLanePerSampleDirFmt() 158 | filenames = ["xxx_01_L001_R1_001.fastq.gz"] 159 | for filename in filenames: 160 | with gzip.open(str(casava_out.path / filename), mode="w"): 161 | pass 162 | 163 | artifact = Artifact.import_data("SampleData[SequencesWithQuality]", casava_out) 164 | 165 | self.assertTrue(_is_empty(artifact)) 166 | 167 | def test_is_empty_with_nonempty_artifact(self): 168 | artifact = Artifact.import_data( 169 | "SampleData[SequencesWithQuality]", 170 | self.get_data_path("single1"), 171 | CasavaOneEightSingleLanePerSampleDirFmt, 172 | ) 173 | 174 | self.assertFalse(_is_empty(artifact)) 175 | 176 | def test_remove_empty(self): 177 | empty_casava = CasavaOneEightSingleLanePerSampleDirFmt() 178 | with gzip.open( 179 | str(empty_casava.path 
/ "xxx_01_L001_R1_001.fastq.gz"), mode="w" 180 | ): 181 | pass 182 | empty_artifact_single = Artifact.import_data( 183 | "SampleData[SequencesWithQuality]", empty_casava 184 | ) 185 | with gzip.open( 186 | str(empty_casava.path / "xxx_01_L001_R2_001.fastq.gz"), mode="w" 187 | ): 188 | pass 189 | empty_artifact_paired = Artifact.import_data( 190 | "SampleData[PairedEndSequencesWithQuality]", empty_casava 191 | ) 192 | 193 | non_empty_artifact_single = Artifact.import_data( 194 | "SampleData[SequencesWithQuality]", 195 | self.get_data_path("single1"), 196 | CasavaOneEightSingleLanePerSampleDirFmt, 197 | ) 198 | non_empty_artifact_paired = Artifact.import_data( 199 | "SampleData[PairedEndSequencesWithQuality]", 200 | self.get_data_path("paired1"), 201 | CasavaOneEightSingleLanePerSampleDirFmt, 202 | ) 203 | 204 | singles = [empty_artifact_single, non_empty_artifact_single] 205 | paired = [empty_artifact_paired, non_empty_artifact_paired] 206 | 207 | filtered_singles, filtered_paired = _remove_empty(singles, paired) 208 | 209 | self.assertEqual(len(filtered_singles), 1) 210 | self.assertEqual(len(filtered_paired), 1) 211 | self.assertIs(filtered_singles[0], non_empty_artifact_single) 212 | self.assertIs(filtered_paired[0], non_empty_artifact_paired) 213 | 214 | def test_make_empty_artifact_single(self): 215 | ctx = MagicMock() 216 | ctx.make_artifact.return_value = "single_artifact" 217 | 218 | result = _make_empty_artifact(ctx, False) 219 | 220 | self.assertEqual(result, "single_artifact") 221 | ctx.make_artifact.assert_called_once() 222 | 223 | args, kwargs = ctx.make_artifact.call_args 224 | 225 | self.assertEqual(args[0], "SampleData[SequencesWithQuality]") 226 | 227 | casava_output = args[1] 228 | self.assertTrue( 229 | os.path.exists(casava_output.path / "xxx_01_L001_R1_001.fastq.gz") 230 | ) 231 | 232 | def test_make_empty_artifact_paired(self): 233 | ctx = MagicMock() 234 | ctx.make_artifact.return_value = "paired_artifact" 235 | 236 | result = 
_make_empty_artifact(ctx, True) 237 | 238 | self.assertEqual(result, "paired_artifact") 239 | ctx.make_artifact.assert_called_once() 240 | 241 | args, kwargs = ctx.make_artifact.call_args 242 | 243 | self.assertEqual(args[0], "SampleData[PairedEndSequencesWithQuality]") 244 | 245 | casava_output = args[1] 246 | self.assertTrue( 247 | os.path.exists(casava_output.path / "xxx_00_L001_R1_001.fastq.gz") 248 | ) 249 | self.assertTrue( 250 | os.path.exists(casava_output.path / "xxx_00_L001_R2_001.fastq.gz") 251 | ) 252 | 253 | 254 | if __name__ == "__main__": 255 | unittest.main() 256 | -------------------------------------------------------------------------------- /q2_fondue/metadata.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 7 | # ---------------------------------------------------------------------------- 8 | 9 | import threading 10 | from typing import List, Tuple 11 | 12 | import entrezpy.efetch.efetcher as ef 13 | import pandas as pd 14 | from qiime2 import Metadata 15 | 16 | from q2_fondue.entrezpy_clients._efetch import EFetchAnalyzer 17 | from q2_fondue.utils import ( 18 | _validate_run_ids, 19 | _determine_id_type, 20 | handle_threaded_exception, 21 | ) 22 | from q2_fondue.entrezpy_clients._utils import ( 23 | set_up_entrezpy_logging, 24 | set_up_logger, 25 | InvalidIDs, 26 | ) 27 | from q2_fondue.entrezpy_clients._pipelines import _get_run_ids 28 | 29 | 30 | threading.excepthook = handle_threaded_exception 31 | BATCH_SIZE = 150 32 | 33 | 34 | def _efetcher_inquire( 35 | efetcher: ef.Efetcher, run_ids: List[str], log_level: str 36 | ) -> Tuple[pd.DataFrame, dict]: 37 | """Makes an EFetch request using the provided IDs. 
38 | 39 | Args: 40 | efetcher (ef.Efetcher): A valid instance of an Entrezpy Efetcher. 41 | run_ids (List[str]): List of all the run IDs to be fetched. 42 | log_level (str): Logging level. 43 | 44 | Returns: 45 | pd.DataFrame: DataFrame with metadata obtained for the provided IDs. 46 | dict: Dictionary of the run IDs that were not found with 47 | respective error messages. 48 | """ 49 | metadata_response = efetcher.inquire( 50 | { 51 | "db": "sra", 52 | "id": run_ids, 53 | "rettype": "xml", 54 | "retmode": "xml", 55 | "retmax": len(run_ids), 56 | "reqsize": BATCH_SIZE, 57 | }, 58 | analyzer=EFetchAnalyzer(log_level), 59 | ) 60 | 61 | if metadata_response.result is None: 62 | return (pd.DataFrame(), {m_id: metadata_response.error_msg for m_id in run_ids}) 63 | else: 64 | return metadata_response.result.metadata_to_df(), {} 65 | 66 | 67 | def _execute_efetcher(email, n_jobs, run_ids, log_level): 68 | efetcher = ef.Efetcher( 69 | "efetcher", email, apikey=None, apikey_var=None, threads=n_jobs, qid=None 70 | ) 71 | set_up_entrezpy_logging(efetcher, log_level) 72 | 73 | return _efetcher_inquire(efetcher, run_ids, log_level) 74 | 75 | 76 | def _get_run_meta( 77 | email, n_jobs, run_ids, validated, log_level, logger 78 | ) -> (pd.DataFrame, dict): 79 | if not validated: 80 | invalid_ids = _validate_run_ids(email, n_jobs, run_ids, log_level) 81 | valid_ids = sorted(list(set(run_ids) - set(invalid_ids.keys()))) 82 | 83 | if not valid_ids: 84 | raise InvalidIDs("All provided IDs were invalid. Please check your input.") 85 | if invalid_ids: 86 | logger.warning( 87 | f"The following provided IDs are invalid: " 88 | f'{",".join(invalid_ids.keys())}. Please correct them and ' 89 | f"try fetching those independently." 
90 | ) 91 | else: 92 | # we assume that IDs retrieved by linking from aggregate IDs 93 | # (e.g., BioProject or study) should only return valid IDs, 94 | # since we asked NCBI to get those for us 95 | valid_ids = run_ids 96 | 97 | # fetch metadata 98 | logger.info("Fetching metadata for %i run IDs.", len(valid_ids)) 99 | meta_df, missing_ids = _execute_efetcher(email, n_jobs, valid_ids, log_level) 100 | 101 | if missing_ids: 102 | logger.warning( 103 | "Metadata for the following run IDs could not be fetched: " 104 | f'{",".join(missing_ids.keys())}. ' 105 | f"Please try fetching those independently." 106 | ) 107 | 108 | return meta_df, missing_ids 109 | 110 | 111 | def _get_other_meta( 112 | email, n_jobs, project_ids, id_type, log_level, logger 113 | ) -> (pd.DataFrame, dict): 114 | run_ids = _get_run_ids(email, n_jobs, project_ids, None, id_type, log_level) 115 | 116 | return _get_run_meta(email, n_jobs, run_ids, True, log_level, logger) 117 | 118 | 119 | def _find_doi_mapping_and_type(mapping_doi_ids: Metadata) -> (pd.Series, str): 120 | """If present, save DOI name to ID mappings together with type 121 | of IDs the DOI names are matching to. 122 | 123 | Args: 124 | mapping_doi_ids (Metadata): Table of accession IDs with 125 | associated DOI names. 126 | Returns: 127 | pd.Series: Series of DOI names with matched accession IDs. 128 | str: Type of accession IDs in matching. 129 | """ 130 | id2doi = mapping_doi_ids.to_dataframe().iloc[:, 0] 131 | doi_ids = sorted(list(mapping_doi_ids.get_ids())) 132 | id2doi_type = _determine_id_type(doi_ids) 133 | 134 | return (id2doi, id2doi_type) 135 | 136 | 137 | def get_metadata( 138 | accession_ids: Metadata, 139 | email: str, 140 | threads: int = 1, 141 | log_level: str = "INFO", 142 | linked_doi: Metadata = None, 143 | ) -> (pd.DataFrame, pd.DataFrame): 144 | """Fetches metadata using the provided run/bioproject/study/sample or 145 | experiment accession IDs. 
146 | 147 | If aggregate IDs (such as bioproject, study, sample, experiment IDs) were 148 | provided, first run IDs will be fetched using a Conduit Pipeline. 149 | The run IDs will be validated using an ESearch query. The metadata will 150 | be fetched only for the valid run IDs. Invalid run IDs will be raised 151 | with a warning. Run IDs for which the metadata could not be fetched will 152 | be returned with the corresponding error message as missing_ids. 153 | 154 | Args: 155 | accession_ids (Metadata): Table of all the accession IDs 156 | to be fetched (either run, bioproject, study, sample or 157 | experiment IDs). If table does not contain DOI names, names 158 | from `linked_doi` will be matched. 159 | linked_doi (Metadata): Optional table of accession IDs with 160 | associated DOI names. Preferably used when refetching failed 161 | run IDs that can be matched after metadata was fetched 162 | successfully. Ignored if `accession_ids` already contains DOI 163 | names. 164 | email (str): A valid e-mail address (required by NCBI). 165 | threads (int, default=1): Number of threads to be used in parallel. 166 | log_level (str, default='INFO'): Logging level. 167 | 168 | Returns: 169 | pd.DataFrame: DataFrame with metadata obtained for the provided IDs. 170 | pd.DataFrame: DataFrame with runs IDs for which no metadata was 171 | fetched and the associated error messages. 
172 | """ 173 | logger = set_up_logger(log_level, logger_name=__name__) 174 | 175 | # extract DOI names to IDs mapping for later 176 | if any(x in accession_ids.columns for x in ["doi", "DOI"]): 177 | id2doi, id2doi_type = _find_doi_mapping_and_type(accession_ids) 178 | elif linked_doi and any(x in linked_doi.columns for x in ["doi", "DOI"]): 179 | id2doi, id2doi_type = _find_doi_mapping_and_type(linked_doi) 180 | else: 181 | id2doi, id2doi_type = None, None 182 | 183 | # Retrieve input IDs 184 | accession_ids = sorted(list(accession_ids.get_ids())) 185 | 186 | # figure out which id type we're dealing with 187 | id_type = _determine_id_type(accession_ids) 188 | 189 | # get actual metadata 190 | if id_type == "run": 191 | meta, missing_ids = _get_run_meta( 192 | email, threads, accession_ids, False, log_level, logger 193 | ) 194 | else: 195 | meta, missing_ids = _get_other_meta( 196 | email, threads, accession_ids, id_type, log_level, logger 197 | ) 198 | 199 | # match DOI names to metadata if present 200 | match_study_meta = { 201 | "bioproject": "Bioproject ID", 202 | "study": "Study ID", 203 | "experiment": "Experiment ID", 204 | "sample": "Sample Accession", 205 | } 206 | if id2doi is not None and id2doi_type == "run": 207 | meta = meta.join(id2doi, how="left") 208 | elif id2doi is not None and id2doi_type != "run": 209 | meta = meta.merge( 210 | id2doi, how="left", left_on=match_study_meta[id2doi_type], right_index=True 211 | ) 212 | 213 | missing_ids = pd.DataFrame( 214 | data={"Error message": missing_ids.values()}, 215 | index=pd.Index(missing_ids.keys(), name="ID"), 216 | ) 217 | return meta, missing_ids 218 | 219 | 220 | def merge_metadata(metadata: pd.DataFrame) -> pd.DataFrame: 221 | """Merges provided multiple metadata into a single metadata object. 222 | 223 | Args: 224 | metadata (pd.DataFrame): List of metadata DataFrames to be merged. 225 | 226 | Returns: 227 | metadata_merged (pd.DataFrame): Final metadata DataFrame. 
228 | """ 229 | logger = set_up_logger("INFO", logger_name=__name__) 230 | logger.info("Merging %s metadata DataFrames.", len(metadata)) 231 | 232 | metadata_merged = pd.concat(metadata, axis=0, join="outer") 233 | 234 | records_count = metadata_merged.shape[0] 235 | metadata_merged.drop_duplicates(inplace=True) 236 | if records_count != metadata_merged.shape[0]: 237 | logger.info( 238 | "%s duplicate record(s) found in the metadata " "were dropped.", 239 | records_count - metadata_merged.shape[0], 240 | ) 241 | 242 | if len(metadata_merged.index) != len(set(metadata_merged.index)): 243 | logger.warning( 244 | "Records with same IDs but differing values were found in " 245 | "the metadata and will not be removed." 246 | ) 247 | 248 | logger.info( 249 | "Merged metadata DataFrame has %s rows and %s columns.", 250 | metadata_merged.shape[0], 251 | metadata_merged.shape[1], 252 | ) 253 | 254 | return metadata_merged 255 | -------------------------------------------------------------------------------- /q2_fondue/tests/data/metadata_response_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": { 3 | "@accession": "ERX3980916", 4 | "@alias": "ena-EXPERIMENT-UNIVERSITY OF HOHENHEIM-06-03-2020-13:37:12:076-1", 5 | "@center_name": "UNIVERSITY OF HOHENHEIM", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "ERX3980916" 8 | }, 9 | "TITLE": "Illumina MiSeq sequencing", 10 | "STUDY_REF": { 11 | "@accession": "ERP120343", 12 | "IDENTIFIERS": { 13 | "PRIMARY_ID": "ERP120343", 14 | "EXTERNAL_ID": { 15 | "@namespace": "BioProject", 16 | "#text": "PRJEB37054" 17 | } 18 | } 19 | }, 20 | "DESIGN": { 21 | "DESIGN_DESCRIPTION": null, 22 | "SAMPLE_DESCRIPTOR": { 23 | "@accession": "ERS4372624", 24 | "IDENTIFIERS": { 25 | "PRIMARY_ID": "ERS4372624", 26 | "EXTERNAL_ID": { 27 | "@namespace": "BioSample", 28 | "#text": "SAMEA6608408" 29 | } 30 | } 31 | }, 32 | "LIBRARY_DESCRIPTOR": { 33 | "LIBRARY_NAME": "unspecified", 34 | 
"LIBRARY_STRATEGY": "AMPLICON", 35 | "LIBRARY_SOURCE": "METAGENOMIC", 36 | "LIBRARY_SELECTION": "PCR", 37 | "LIBRARY_LAYOUT": { 38 | "SINGLE": null 39 | } 40 | } 41 | }, 42 | "PLATFORM": { 43 | "ILLUMINA": { 44 | "INSTRUMENT_MODEL": "Illumina MiSeq" 45 | } 46 | }, 47 | "EXPERIMENT_ATTRIBUTES": { 48 | "EXPERIMENT_ATTRIBUTE": [ 49 | { 50 | "TAG": "Temperature", 51 | "VALUE": "12" 52 | }, 53 | { 54 | "TAG": "Depth", 55 | "VALUE": "500" 56 | } 57 | ] 58 | } 59 | }, 60 | "SUBMISSION": { 61 | "@accession": "ERA2402167", 62 | "@alias": "ena-SUBMISSION-UNIVERSITY OF HOHENHEIM-06-03-2020-13:27:09:756-1", 63 | "@center_name": "UNIVERSITY OF HOHENHEIM", 64 | "@lab_name": "European Nucleotide Archive", 65 | "IDENTIFIERS": { 66 | "PRIMARY_ID": "ERA2402167" 67 | }, 68 | "TITLE": "Submitted by UNIVERSITY OF HOHENHEIM on 06-MAR-2020" 69 | }, 70 | "Organization": { 71 | "@type": "center", 72 | "Name": { 73 | "@abbr": "University of Hohenheim", 74 | "#text": "University of Hohenheim" 75 | } 76 | }, 77 | "STUDY": { 78 | "@accession": "ERP120343", 79 | "@alias": "ena-STUDY-UNIVERSITY OF HOHENHEIM-04-03-2020-12:54:47:240-944", 80 | "@center_name": "UNIVERSITY OF HOHENHEIM", 81 | "IDENTIFIERS": { 82 | "PRIMARY_ID": "ERP120343", 83 | "EXTERNAL_ID": { 84 | "@namespace": "BioProject", 85 | "#text": "PRJEB37054" 86 | } 87 | }, 88 | "DESCRIPTOR": { 89 | "STUDY_TITLE": "The microbial load, diversity and composition of\n the wine microbiota is affected by wine type and\n environmental-stress factors", 90 | "STUDY_TYPE": { 91 | "@existing_study_type": "Other" 92 | }, 93 | "STUDY_ABSTRACT": "In order to improve the understanding of the\n composition, organization and temporal dynamics of the wine\n microbiota, the relative and absolute bacterial wine\n microbiota composition during the first week of\n fermentation was determined, including distinct red and\n white wine cultivars, by 16S rRNA gene amplicon sequencing.", 94 | "CENTER_PROJECT_NAME": "Wine microbiota analysis during\n fermentation", 
95 | "STUDY_DESCRIPTION": "In order to improve the understanding of the\n composition, organization and temporal dynamics of the wine\n microbiota, the relative and absolute bacterial wine\n microbiota composition during the first week of\n fermentation was determined, including distinct red and\n white wine cultivars, by 16S rRNA gene amplicon sequencing." 96 | }, 97 | "STUDY_ATTRIBUTES": { 98 | "STUDY_ATTRIBUTE": [ 99 | { 100 | "TAG": "ENA-FIRST-PUBLIC", 101 | "VALUE": "2020-05-31" 102 | }, 103 | { 104 | "TAG": "ENA-LAST-UPDATE", 105 | "VALUE": "2020-03-04" 106 | } 107 | ] 108 | } 109 | }, 110 | "SAMPLE": { 111 | "@accession": "ERS4372624", 112 | "@alias": "BAC1.D1.0.32A", 113 | "@center_name": "UNIVERSITY OF HOHENHEIM", 114 | "IDENTIFIERS": { 115 | "PRIMARY_ID": "ERS4372624", 116 | "EXTERNAL_ID": { 117 | "@namespace": "BioSample", 118 | "#text": "SAMEA6608408" 119 | } 120 | }, 121 | "TITLE": "Vitis vinifera", 122 | "SAMPLE_NAME": { 123 | "TAXON_ID": "29760", 124 | "SCIENTIFIC_NAME": "Vitis vinifera", 125 | "COMMON_NAME": "wine grape" 126 | }, 127 | "SAMPLE_ATTRIBUTES": { 128 | "SAMPLE_ATTRIBUTE": [ 129 | { 130 | "TAG": "environment (biome)", 131 | "VALUE": "berry plant" 132 | }, 133 | { 134 | "TAG": "geographic location (country and/or sea)", 135 | "VALUE": "Germany" 136 | }, 137 | { 138 | "TAG": "sample storage temperature", 139 | "VALUE": "-80", 140 | "UNITS": "°C" 141 | } 142 | ] 143 | } 144 | }, 145 | "Pool": { 146 | "Member": { 147 | "@member_name": "", 148 | "@accession": "ERS4372624", 149 | "@sample_name": "BAC1.D1.0.32A", 150 | "@sample_title": "Vitis vinifera", 151 | "@spots": "39323", 152 | "@bases": "11552099", 153 | "@tax_id": "29760", 154 | "@organism": "Vitis vinifera", 155 | "IDENTIFIERS": { 156 | "PRIMARY_ID": "ERS4372624", 157 | "EXTERNAL_ID": { 158 | "@namespace": "BioSample", 159 | "#text": "SAMEA6608408" 160 | } 161 | } 162 | } 163 | }, 164 | "RUN_SET": { 165 | "RUN": { 166 | "@accession": "FAKEID1", 167 | "@alias": "ena-RUN-UNIVERSITY OF 
HOHENHEIM-06-03-2020-13:37:12:076-1", 168 | "@center_name": "UNIVERSITY OF HOHENHEIM", 169 | "@total_spots": "39323", 170 | "@total_bases": "11552099", 171 | "@size": "3914295", 172 | "@load_done": "true", 173 | "@published": "2020-06-01 17:54:43", 174 | "@is_public": "true", 175 | "@cluster_name": "public", 176 | "@static_data_available": "1", 177 | "IDENTIFIERS": { 178 | "PRIMARY_ID": "FAKEID1" 179 | }, 180 | "TITLE": "Illumina MiSeq sequencing", 181 | "EXPERIMENT_REF": { 182 | "@accession": "ERX3980916", 183 | "IDENTIFIERS": { 184 | "PRIMARY_ID": "ERX3980916" 185 | } 186 | }, 187 | "RUN_ATTRIBUTES": { 188 | "RUN_ATTRIBUTE": [ 189 | { 190 | "TAG": "ENA-FIRST-PUBLIC", 191 | "VALUE": "2020-05-31" 192 | }, 193 | { 194 | "TAG": "ENA-LAST-UPDATE", 195 | "VALUE": "2020-03-06" 196 | } 197 | ] 198 | }, 199 | "Pool": { 200 | "Member": { 201 | "@member_name": "", 202 | "@accession": "ERS4372624", 203 | "@sample_name": "BAC1.D1.0.32A", 204 | "@sample_title": "Vitis vinifera", 205 | "@spots": "39323", 206 | "@bases": "11552099", 207 | "@tax_id": "29760", 208 | "@organism": "Vitis vinifera", 209 | "IDENTIFIERS": { 210 | "PRIMARY_ID": "ERS4372624", 211 | "EXTERNAL_ID": { 212 | "@namespace": "BioSample", 213 | "#text": "SAMEA6608408" 214 | } 215 | } 216 | } 217 | }, 218 | "SRAFiles": { 219 | "SRAFile": { 220 | "@cluster": "public", 221 | "@filename": "FAKEID1", 222 | "@url": "https://sra-download.ncbi.nlm.nih.gov/traces/era16/ERR/ERR3978/FAKEID1", 223 | "@size": "3915680", 224 | "@date": "2020-06-01 19:51:45", 225 | "@md5": "d92e4c21e26e5f2bd2cdaf56cfcfeaa0", 226 | "@semantic_name": "run", 227 | "@supertype": "Primary ETL", 228 | "@sratoolkit": "1", 229 | "Alternatives": [ 230 | { 231 | "@url": "https://sra-download.ncbi.nlm.nih.gov/traces/era16/ERR/ERR3978/FAKEID1", 232 | "@free_egress": "worldwide", 233 | "@access_type": "anonymous", 234 | "@org": "NCBI" 235 | }, 236 | { 237 | "@url": "https://sra-pub-run-odp.s3.amazonaws.com/sra/FAKEID1/FAKEID1", 238 | "@free_egress": 
"worldwide", 239 | "@access_type": "anonymous", 240 | "@org": "AWS" 241 | }, 242 | { 243 | "@url": "gs://sra-pub-run-8/FAKEID1/FAKEID1.1", 244 | "@free_egress": "gs.US", 245 | "@access_type": "gcp identity", 246 | "@org": "GCP" 247 | } 248 | ] 249 | } 250 | }, 251 | "CloudFiles": { 252 | "CloudFile": [ 253 | { 254 | "@filetype": "run", 255 | "@provider": "gs", 256 | "@location": "gs.US" 257 | }, 258 | { 259 | "@filetype": "run", 260 | "@provider": "s3", 261 | "@location": "s3.us-east-1" 262 | } 263 | ] 264 | }, 265 | "Statistics": { 266 | "@nreads": "1", 267 | "@nspots": "39323", 268 | "Read": { 269 | "@index": "0", 270 | "@count": "39323", 271 | "@average": "293.77", 272 | "@stdev": "20.23" 273 | } 274 | }, 275 | "Bases": { 276 | "@cs_native": "false", 277 | "@count": "11552099", 278 | "Base": [ 279 | { 280 | "@value": "A", 281 | "@count": "3143257" 282 | }, 283 | { 284 | "@value": "C", 285 | "@count": "2405184" 286 | }, 287 | { 288 | "@value": "G", 289 | "@count": "3867631" 290 | }, 291 | { 292 | "@value": "T", 293 | "@count": "2136027" 294 | }, 295 | { 296 | "@value": "N", 297 | "@count": "0" 298 | } 299 | ] 300 | } 301 | } 302 | } 303 | } -------------------------------------------------------------------------------- /q2_fondue/entrezpy_clients/_sra_meta.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- 2 | # Copyright (c) 2025, Bokulich Laboratories. 3 | # 4 | # Distributed under the terms of the Modified BSD License. 5 | # 6 | # The full license is in the file LICENSE, distributed with this software. 
# ----------------------------------------------------------------------------

from abc import abstractmethod, ABCMeta
from dataclasses import dataclass, field
from typing import Union, List

import pandas as pd

from q2_fondue.entrezpy_clients._utils import get_attrs


# Columns that every generated SRA metadata table must contain.
META_REQUIRED_COLUMNS = [
    "Experiment ID",
    "Biosample ID",
    "Bioproject ID",
    "Study ID",
    "Sample Accession",
    "Organism",
    "Library Source",
    "Library Layout",
    "Library Selection",
    "Instrument",
    "Platform",
    "Bases",
    "Spots",
    "Avg Spot Len",
    "Bytes",
    "Public",
]


@dataclass
class LibraryMetadata:
    """A class for storing sequencing library metadata.

    Attributes:
        name (str): Name of the library.
        layout (str): Library layout (e.g. SINGLE/PAIRED).
        selection (str): Library selection method.
        source (str): Library source.
    """

    name: str
    layout: str
    selection: str
    source: str

    def generate_meta(self) -> pd.DataFrame:
        """Generates a one-row DataFrame of all library attributes.

        Column names are the attribute names prefixed with 'library_'.

        Returns:
            pd.DataFrame: Library metadata.
        """
        index = get_attrs(self)
        return pd.DataFrame(
            data=[getattr(self, k) for k in index],
            index=[f"library_{x}" for x in index],
        ).T


@dataclass
class SRABaseMeta(metaclass=ABCMeta):
    """A base class for generation of SRA metadata objects.

    Attributes:
        id (str): Unique ID of the metadata object.
        custom_meta (Union[dict, None]): Custom metadata belonging
            to the object, if any.
        child (str): a one-word description of the child type for
            the given object (e.g., a 'sample' is a child of a 'study').
    """

    id: str
    custom_meta: Union[dict, None]
    child: str = None

    def __post_init__(self):
        """Initializes custom metadata DataFrame."""
        if self.custom_meta:
            self.custom_meta_df = pd.DataFrame(self.custom_meta, index=[self.id])
        else:
            self.custom_meta_df = None

    def __eq__(self, other):
        """Compares all attributes. To be used on subclasses that contain
        DataFrames as attributes."""
        # comparing against an unrelated type should defer to the other
        # operand rather than raise an AttributeError below
        if other.__class__ is not self.__class__:
            return NotImplemented
        same = {}
        for k, v in vars(self).items():
            other_v = other.__getattribute__(k)
            if isinstance(v, pd.DataFrame):
                # DataFrame == DataFrame is elementwise/ambiguous;
                # use .equals (also returns False for non-DataFrames)
                same[k] = v.equals(other_v)
            elif isinstance(other_v, pd.DataFrame):
                # symmetric guard: self holds None here but other
                # holds a DataFrame - the attributes differ
                same[k] = False
            else:
                same[k] = v == other_v
        return all(same.values())

    def get_base_metadata(self, excluded: tuple) -> pd.DataFrame:
        """Generates a DataFrame containing basic metadata of the SRA object.

        The metadata generated by this method do not contain any of the
        metadata belonging to any of the object's children.

        Args:
            excluded (tuple): attributes to be excluded during metadata
                DataFrame generation
        Returns:
            base_meta (pd.DataFrame): Requested base metadata.
        """
        index = get_attrs(
            self, excluded=("child", "custom_meta", "custom_meta_df") + excluded
        )
        base_meta = pd.DataFrame(
            data={k: getattr(self, k) for k in index}, index=[self.id]
        )

        if self.custom_meta:
            base_meta = pd.concat(
                [base_meta, self.custom_meta_df],
                axis=1,
            )

        return base_meta

    def get_child_metadata(self) -> pd.DataFrame:
        """Generates a DataFrame containing metadata of all the
        children SRA objects.

        Returns:
            child_meta (pd.DataFrame): Requested children objects' metadata.
        """
        # e.g. child == 'run' -> collect from the 'runs' attribute
        child_meta_dfs = [
            x.generate_meta() for x in self.__getattribute__(f"{self.child}s")
        ]
        if child_meta_dfs:
            child_meta = pd.concat(child_meta_dfs)
        else:
            child_meta = pd.DataFrame()
        child_meta.index.name = f"{self.child}_id"
        return child_meta

    @abstractmethod
    def generate_meta(self) -> pd.DataFrame:
        """Generates a DataFrame with all metadata.

        Metadata from current object will be collected and merged together
        with metadata gathered from all of its children.

        Returns:
            pd.DataFrame: DataFrame containing all metadata.
        """
        pass


@dataclass(eq=False)
class SRARun(SRABaseMeta):
    """A class containing all the SRA run metadata.

    Attributes:
        public (bool): True if the dataset was public.
        bytes (int): Size of the run dataset.
        bases (int): Nucleotide count of the run dataset.
        spots (int): Spot count of the run dataset.
        avg_spot_len (int): Average spot length.
        experiment_id (str): ID of the experiment which the run belongs to.
        child (str): Run's child type (None, as runs have no children objects).
    """

    public: bool = True
    bytes: int = None
    bases: int = None
    spots: int = None
    avg_spot_len: int = None
    experiment_id: str = None
    child: str = None

    def __post_init__(self):
        """Calculates an average spot length."""
        super().__post_init__()
        # guard against the None defaults: 'None > 0' and 'None / x'
        # would otherwise raise a TypeError
        if self.spots and self.bases:
            self.avg_spot_len = int(self.bases / self.spots)
        else:
            self.avg_spot_len = 0

    def generate_meta(self) -> pd.DataFrame:
        """Generates run's metadata.

        Returns:
            pd.DataFrame: Run's metadata.
        """
        return self.get_base_metadata(excluded=("id",))


@dataclass(eq=False)
class SRAExperiment(SRABaseMeta):
    """A class containing all the SRA experiment metadata.

    Attributes:
        instrument (str): Sequencing instrument name.
        platform (str): Sequencing platform name.
        library (LibraryMetadata): Metadata of the sequencing library.
        runs (List[SRARun]): All SRA runs belonging to this experiment.
        sample_id (str): ID of the sample which the experiment belongs to.
        child (str): Runs are children of experiment objects.
    """

    instrument: str = None
    platform: str = None
    library: LibraryMetadata = None
    runs: List[SRARun] = field(default_factory=list)
    sample_id: str = None
    child: str = "run"

    def generate_meta(self) -> pd.DataFrame:
        """Generates experiment's metadata.

        Generated metadata will include all metadata of the linked runs.

        Returns:
            pd.DataFrame: Experiment's metadata with all of its children.
        """
        exp_meta = self.get_base_metadata(excluded=("id", "runs", "library"))
        # library defaults to None - only append its columns when present
        if self.library is not None:
            lib_meta = self.library.generate_meta()
            lib_meta.index = exp_meta.index
            exp_meta = pd.concat([exp_meta, lib_meta], axis=1)

        runs_meta = self.get_child_metadata()
        if len(runs_meta) > 0:
            runs_merged = runs_meta.merge(
                exp_meta, left_on="experiment_id", right_index=True
            )
            runs_merged.index.name = "run_id"
            return runs_merged
        else:
            return exp_meta


@dataclass(eq=False)
class SRASample(SRABaseMeta):
    """A class containing all the SRA sample metadata.

    Attributes:
        name (str): Name of the sample.
        title (str): Title of the sample.
        biosample_id (str): BioSample ID linked to the sample.
        organism (str): Organism name.
        tax_id (str): Organism taxonomic ID.
        study_id (str): ID of the study which the sample belongs to.
        experiments (List[SRAExperiment]): All SRA experiments
            belonging to the sample.
        child (str): Experiments are children of sample objects.
    """

    name: str = None
    title: str = None
    biosample_id: str = None
    organism: str = None
    tax_id: str = None
    study_id: str = None
    experiments: List[SRAExperiment] = field(default_factory=list)
    child: str = "experiment"

    def generate_meta(self) -> pd.DataFrame:
        """Generates SRA sample's metadata.

        Generated metadata will include all metadata of the linked experiments.

        Returns:
            pd.DataFrame: Sample's metadata with all of its children.
        """
        sample_meta = self.get_base_metadata(excluded=("id", "experiments"))
        exps_meta = self.get_child_metadata()
        if len(exps_meta) > 0:
            exps_merged = exps_meta.merge(
                sample_meta, left_on="sample_id", right_index=True
            )
            exps_merged.index.name = "run_id"
            return exps_merged
        else:
            return sample_meta


@dataclass(eq=False)
class SRAStudy(SRABaseMeta):
    """A class containing all the SRA study metadata.

    Attributes:
        bioproject_id (str): ID of the linked BioProject.
        center_name (str): Name of the center where the study was performed.
        samples (List[SRASample]): All SRA samples belonging to the study.
        child (str): Samples are children of study objects.
    """

    bioproject_id: str = None
    center_name: str = None
    samples: List[SRASample] = field(default_factory=list)
    child: str = "sample"

    def generate_meta(self) -> pd.DataFrame:
        """Generates SRA study's metadata.

        Generated metadata will include all metadata of the linked samples.

        Returns:
            pd.DataFrame: Study's metadata with all of its children.
        """
        study_meta = self.get_base_metadata(excluded=("id", "samples"))
        samples_meta = self.get_child_metadata()
        if len(samples_meta) > 0:
            samples_merged = samples_meta.merge(
                study_meta, left_on="study_id", right_index=True
            )
            samples_merged.index.name = "run_id"
            return samples_merged
        else:
            return study_meta