├── conda-recipe ├── grabseqs │ ├── conda_build_config.yaml │ └── meta.yaml ├── build_all.sh └── conda-build.sh ├── environment.yml ├── .circleci ├── setup.sh └── config.yml ├── setup.py ├── LICENSE ├── tests ├── test_mgrast.bash ├── test_general.bash ├── test_imicrobe.bash ├── test_sra.bash └── run_tests.bash ├── .gitignore ├── grabseqslib ├── __init__.py ├── utils.py ├── mgrast.py └── sra.py ├── template.py ├── faq ├── faq.md └── template_usage.md └── README.md /conda-recipe/grabseqs/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.6 3 | - 3.7 4 | - 3.9 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | - louiejtaylor 5 | dependencies: 6 | - sra-tools>3.2 7 | - python 8 | - requests 9 | - pigz 10 | - wget 11 | - pandas 12 | -------------------------------------------------------------------------------- /conda-recipe/build_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd .. 4 | 5 | python3 setup.py sdist bdist_wheel 6 | 7 | twine upload dist/* 8 | 9 | rm -r grabseqs.egg-info/ 10 | rm -r build/ 11 | rm -r dist/ 12 | 13 | cd conda-recipe 14 | 15 | bash conda-build.sh 16 | -------------------------------------------------------------------------------- /conda-recipe/conda-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Assumes `conda config --set anaconda_upload yes` has 4 | # been run--otherwise just upload manually per the 5 | # instructions from conda-build. 
# Packaging configuration for grabseqs.
#
# NOTE(review): the original mixed `import setuptools` with
# `distutils.core.setup`; using setuptools' own setup() is required for
# entry_points / install_requires to work reliably.
from setuptools import setup

setup(
    name='grabseqs',
    version='1.0.0',
    description='Easily download reads from next-gen sequencing repositories like NCBI SRA',
    author='Louis J Taylor',
    # Assembled from fragments to deter address-harvesting bots.
    author_email='l'+'ouist'+'@'+'u'+'penn.edu',
    url='https://github.com/louiejtaylor/grabseqs',
    packages=['grabseqslib'],
    license='MIT License',
    entry_points={'console_scripts': [
        'grabseqs = grabseqslib:main'
    ]},
    install_requires=[
        'requests',
        # 'argparse' removed: it has been in the standard library since
        # Python 3.2 and listing the PyPI backport can break installs.
        'pandas>=2'
    ],
    classifiers=[
        'Intended Audience :: Science/Research',
        'Environment :: Console',
        'Environment :: Web Environment',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
    ],
    # `py_modules=['utils','sra','mgrast']` removed: those modules live
    # inside the `grabseqslib` package (already covered by `packages`);
    # no top-level modules with those names exist in this repository.
)
-vv" 16 | 17 | requirements: 18 | host: 19 | - pandas 20 | - python {{ python }} 21 | - requests 22 | run: 23 | - pandas 24 | - python 25 | - requests 26 | - sra-tools 27 | - pigz 28 | - wget 29 | 30 | test: 31 | imports: 32 | - grabseqslib 33 | commands: 34 | - grabseqs --help 35 | 36 | about: 37 | home: "https://github.com/louiejtaylor/grabseqs" 38 | license: MIT 39 | license_family: MIT 40 | summary: "Easily download reads from next-gen sequencing repositories like NCBI SRA" 41 | 42 | extra: 43 | recipe-maintainers: 44 | - louiejtaylor 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Louis J Taylor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/test_mgrast.bash: -------------------------------------------------------------------------------- 1 | # test sample listing, metadata download 2 | function test_mgrast_listing { 3 | if [ `grabseqs mgrast -l mgp85479 | wc -l` -ne 4 ]; then 4 | exit 1 5 | fi 6 | } 7 | 8 | # test metadata 9 | function test_mgrast_metadata { 10 | grabseqs mgrast -o $TMPDIR/test_md_mg -m META.csv -l mgp85479 11 | if [ `cat $TMPDIR/test_md_mg/META.csv | wc -l` -ne 5 ] ; then 12 | exit 1 13 | fi 14 | } 15 | 16 | # download a tiny sample, .fastq-formatted 17 | function test_mgrast_fastq { 18 | grabseqs mgrast -o $TMPDIR/test_tiny_mg mgm4793571.3 19 | ls $TMPDIR/test_tiny_mg/mgm4793571.3.fastq.gz 20 | } 21 | 22 | ## download a tiny sample, .fasta-formatted 23 | function test_mgrast_fasta { 24 | grabseqs mgrast -o $TMPDIR/test_tiny_mg_fasta mgm4440055.3 25 | ls $TMPDIR/test_tiny_mg_fasta/mgm4440055.3.fastq.gz 26 | } 27 | 28 | ## test no clobber 29 | function test_mgrast_fastq_noclobber { 30 | u=`grabseqs mgrast -o $TMPDIR/test_tiny_mg mgm4793571.3` 31 | echo $u 32 | if [[ $u != *"Pass -f to force download"* ]] ; then 33 | exit 1 34 | fi 35 | } 36 | 37 | ## test force 38 | function test_mgrast_fastq_force_download { 39 | u=`grabseqs mgrast -o $TMPDIR/test_tiny_mg -f mgm4793571.3` 40 | echo $u 41 | if [[ $u == *"Pass -f to force download"* ]] ; then 42 | exit 1 43 | fi 44 | ls $TMPDIR/test_tiny_mg/mgm4793571.3.fastq.gz 45 | } 46 | 47 | # test case for invalid/empty accessions--should raise error (#51) 48 | function test_mgrast_invalid_acc { 49 | if grabseqs mgrast -l mgp0fake; then 50 | exit 1 51 | fi 52 | } 53 | -------------------------------------------------------------------------------- /tests/test_general.bash: -------------------------------------------------------------------------------- 1 | # test to see whether install succeeded 2 | function test_grabseqs_installed { 3 | grabseqs -v 4 | 
grabseqs -h 5 | } 6 | 7 | # test missing sra-tools 8 | #function test_grabseqs_no_sratools { 9 | # conda remove sra-tools -qy 10 | # if grabseqs sra -o $TMPDIR/test_no_sra-tools ERR2279063; then 11 | # exit 1 12 | # fi 13 | # conda install "sra-tools>3.2" -c bioconda -qy 14 | #} 15 | 16 | # test missing pigz 17 | function test_grabseqs_no_pigz { 18 | if which pigz; then 19 | echo "pigz installed outside of conda, cannot test whether it is missing" 20 | else 21 | conda remove pigz -qy 22 | u=`grabseqs mgrast -o $TMPDIR/test_nopigz mgm4633450.3` 23 | echo $u 24 | if [[ $u != *"pigz not found, using gzip"* ]] ; then 25 | exit 1 26 | fi 27 | conda install -c anaconda pigz -qy 28 | fi 29 | } 30 | 31 | # test conda install 32 | function test_grabseqs_conda_install { 33 | conda deactivate 34 | conda create -n grabseqs-unittest-conda -qy 35 | conda activate grabseqs-unittest-conda 36 | conda install -c louiejtaylor -c bioconda -c conda-forge -qy grabseqs 37 | conda deactivate 38 | conda env remove -yqn grabseqs-unittest-conda 39 | conda activate grabseqs-unittest 40 | } 41 | 42 | # test install with python3.7 (issue #38) 43 | function test_grabseqs_conda_newer_python { 44 | conda deactivate 45 | conda create -n grabseqs-unittest-py37 -qy 46 | conda activate grabseqs-unittest-py37 47 | conda install python=3.7 -qy 48 | conda install -c louiejtaylor -c bioconda -c conda-forge grabseqs -qy 49 | conda deactivate 50 | conda env remove -yqn grabseqs-unittest-py37 51 | conda activate grabseqs-unittest 52 | } 53 | -------------------------------------------------------------------------------- /tests/test_imicrobe.bash: -------------------------------------------------------------------------------- 1 | # test sample listing and metadata download 2 | function test_imicrobe_listing_project { 3 | if [ `grabseqs imicrobe -l p1 | wc -l` -ne 2 ]; then 4 | exit 1 5 | fi 6 | } 7 | 8 | # test metadata download 9 | function test_imicrobe_metadata_download { 10 | grabseqs imicrobe -o 
$TMPDIR/test_md_im -m META.csv -l p1 11 | if [ `cat $TMPDIR/test_md_im/META.csv | wc -l` -ne 3 ] ; then 12 | exit 1 13 | fi 14 | } 15 | 16 | # paired sample listing 17 | function test_imicrobe_listing_paired_sample { 18 | ps=`grabseqs imicrobe -l s6398` 19 | echo $ps 20 | if [ "$ps" != "s6398_1.fastq.gz,s6398_2.fastq.gz" ]; then 21 | exit 1 22 | fi 23 | } 24 | 25 | # download a tiny sample, .fasta-formatted 26 | function test_imicrobe_fasta { 27 | grabseqs imicrobe -o $TMPDIR/test_tiny_im s710 28 | ls $TMPDIR/test_tiny_im/s710.fastq.gz 29 | } 30 | 31 | # download a tiny sample, .fastq-formatted paired 32 | function test_imicrobe_fastq_paired { 33 | grabseqs imicrobe -o $TMPDIR/test_tiny_im s6399 34 | ls $TMPDIR/test_tiny_im/s6399_1.fastq.gz 35 | ls $TMPDIR/test_tiny_im/s6399_2.fastq.gz 36 | echo -e "$PASS iMicrobe fastq-formatted sample download test passed" 37 | } 38 | 39 | ## test no clobber 40 | function test_imicrobe_no_clobber { 41 | t=`grabseqs imicrobe -t 2 -o $TMPDIR/test_tiny_im s710` 42 | echo $t 43 | if [[ $t != *"Pass -f to force download"* ]] ; then 44 | exit 1 45 | fi 46 | } 47 | 48 | ## test force 49 | function test_imicrobe_fasta_force { 50 | tf=`grabseqs imicrobe -t 2 -o $TMPDIR/test_tiny_im -f s710` 51 | echo $tf 52 | if [[ $tf == *"Pass -f to force download"* ]] ; then 53 | exit 1 54 | fi 55 | ls $TMPDIR/test_tiny_im/s710.fastq.gz 56 | } 57 | 58 | # test case for invalid/empty accessions--should raise error (#51) 59 | function test_imicrobe_invalid_acc { 60 | if grabseqs mgrast -l p4fake; then 61 | exit 1 62 | fi 63 | } 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # sequence and cluster 2 | *.fastq 3 | *.fastq.gz 4 | *.fq 5 | *.fq.gz 6 | *.fasta 7 | *.fa 8 | *.out 9 | *.err 10 | test/ 11 | bin/test/ 12 | clean_build.sh 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | 
*$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | -------------------------------------------------------------------------------- /grabseqslib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["utils","sra","mgrast"] 2 | 3 | import os, sys, argparse, warnings, shutil 4 | import pandas as pd 5 | 6 | from pathlib import Path 7 | from 
def main():
    """
    Command-line entry point for grabseqs.

    Parses arguments for the selected repository subcommand (SRA or
    MG-RAST), creates the output directory if needed, picks a
    compression tool (pigz if available, else gzip), dispatches the
    download to the matching `process_*` controller, and writes or
    appends the aggregated metadata CSV if `-m` was given.
    """
    # Set up parsers
    parser = argparse.ArgumentParser(prog="grabseqs",
                 description='Download metagenomic sequences from public datasets.')
    parser.add_argument('--version', '-v', action='version', version='%(prog)s 1.0.0')
    subpa = parser.add_subparsers(help='repositories available')

    add_sra_subparser(subpa)
    add_mgrast_subparser(subpa)

    args = parser.parse_args()

    # Make output directories if they don't exist
    try:
        if args.outdir != "":
            if not os.path.exists(args.outdir):
                os.makedirs(args.outdir)
    except AttributeError:
        # No subcommand provided (all subcommands have `-o`)
        print("Subcommand not specified, run `grabseqs -h` or `grabseqs {repository} -h` for help")
        sys.exit(0)

    # Figure out which subparser was called: only the MG-RAST subparser
    # defines `rastid`, so an AttributeError means the SRA subcommand.
    repo = "SRA"  # default guards against `repo` being unbound below
    try:
        if args.rastid:
            repo = "MG-RAST"
    except AttributeError:
        repo = "SRA"

    # Check deps: prefer pigz (parallel) over gzip for compression
    zip_func = "gzip"
    if shutil.which("pigz"):
        zip_func = "pigz"
    else:
        print("pigz not found, using gzip")

    metadata_agg = None

    # Download samples
    if repo == "SRA":
        metadata_agg = process_sra(args, zip_func)
    elif repo == "MG-RAST":
        metadata_agg = process_mgrast(args, zip_func)

    # Handle metadata
    if args.metadata != "":
        if metadata_agg is None:
            # Nothing was aggregated (e.g. nothing downloaded); previously
            # this path crashed with `AttributeError: 'NoneType' ... to_csv`.
            print("No metadata collected; nothing written to " + args.metadata)
        else:
            md_path = Path(args.outdir) / Path(args.metadata)
            if not os.path.isfile(md_path):
                metadata_agg.to_csv(md_path, index = False)
                print("Metadata saved to new file: " + str(md_path))
            else:
                metadata_i = pd.read_csv(md_path)
                metadata_f = pd.concat([metadata_i, metadata_agg], sort=True)
                metadata_f.to_csv(md_path, index = False)
                print("Metadata appended to existing file: " + str(md_path))
-------------------------------------------------------------------------------- /tests/test_sra.bash: -------------------------------------------------------------------------------- 1 | # test sample listing, metadata download 2 | function test_sra_listing { 3 | if [ `grabseqs sra -l SRP057027 | wc -l` -ne 369 ]; then 4 | exit 1 5 | fi 6 | } 7 | 8 | # test metadata download 9 | function test_sra_metadata_downloaded { 10 | grabseqs sra -m SRP057027.tsv -l -o $TMPDIR/test_metadata/ SRP057027 11 | if [ `cat $TMPDIR/test_metadata/SRP057027.tsv | wc -l` -ne 370 ] ; then 12 | exit 1 13 | fi 14 | } 15 | 16 | # test behavior with -l and --no_parsing 17 | function test_sra_no_parsing_flag { 18 | if [ `grabseqs sra -l --no_parsing SRR1804203 | wc -l` -ne 1 ]; then 19 | exit 1 20 | fi 21 | } 22 | 23 | # unpaired fasterq-dump 24 | function test_sra_unpaired { 25 | grabseqs sra -t 2 -o $TMPDIR/test_tiny_sra ERR2279063 26 | ls $TMPDIR/test_tiny_sra/ERR2279063.fastq.gz 27 | } 28 | 29 | # paired fasterq-dump 30 | function test_sra_paired { 31 | grabseqs sra -t 2 -o $TMPDIR/test_tiny_sra_paired SRR1913936 32 | ls $TMPDIR/test_tiny_sra_paired/SRR1913936_1.fastq.gz 33 | ls $TMPDIR/test_tiny_sra_paired/SRR1913936_2.fastq.gz 34 | } 35 | 36 | # unpaired fastq-dump 37 | function test_sra_unpaired_fastqdump { 38 | grabseqs sra -t 2 -o $TMPDIR/test_fastqdump_sra --use_fastq_dump ERR2279063 39 | ls $TMPDIR/test_fastqdump_sra/ERR2279063.fastq.gz 40 | } 41 | 42 | # paired fasterq-dump 43 | function test_sra_paired_fastqdump { 44 | grabseqs sra -t 2 -o $TMPDIR/test_fastqdump_sra_paired --use_fastq_dump SRR1913936 45 | ls $TMPDIR/test_fastqdump_sra_paired/SRR1913936_1.fastq.gz 46 | ls $TMPDIR/test_fastqdump_sra_paired/SRR1913936_2.fastq.gz 47 | } 48 | 49 | # test no clobber 50 | function test_sra_no_clobber { 51 | t=`grabseqs sra -t 2 -o $TMPDIR/test_fastqdump_sra ERR2279063` 52 | echo $t 53 | if [[ $t != *"Pass -f to force download"* ]] ; then 54 | exit 1 55 | fi 56 | } 57 | 58 | # test 
force 59 | function test_sra_forced { 60 | tf=`grabseqs sra -r 0 -t 2 -o $TMPDIR/test_fastqdump_sra -f ERR2279063` 61 | echo $tf 62 | if [[ $tf == *"Pass -f to force download"* ]] ; then 63 | exit 1 64 | fi 65 | } 66 | 67 | # test custom args to fasterq-dump (#44) 68 | function test_sra_custom_fasterqdump_args { 69 | grabseqs sra SRR1913936 -r 0 -o $TMPDIR/test_fasterqdump_custom --custom_fqdump_args='--split-spot' 70 | # this is a paired run, but with `--split-spot` instead of `--split-3` it should come down as a single interleaved fastq.gz 71 | ls $TMPDIR/test_fasterqdump_custom/SRR1913936.fastq.gz 72 | } 73 | 74 | # test custom args to fastq-dump (#44) 75 | function test_sra_custom_fastqdump_args { 76 | grabseqs sra SRR1913936 -r 0 --use_fastq_dump -o $TMPDIR/test_fastqdump_custom --custom_fqdump_args='--gzip --skip-technical' 77 | # this is a paired run, but without the `--split-3` arg it should come down as a single interleaved fastq.gz 78 | ls $TMPDIR/test_fastqdump_custom/SRR1913936.fastq.gz 79 | } 80 | 81 | # test case for invalid/empty accessions--should raise error (#51) 82 | function test_sra_invalid_acc { 83 | if grabseqs sra -l PRJNAXXXXXXXX; then 84 | exit 1 85 | fi 86 | } 87 | -------------------------------------------------------------------------------- /grabseqslib/utils.py: -------------------------------------------------------------------------------- 1 | import os, glob, gzip, sys 2 | from subprocess import call 3 | 4 | def check_existing(save_loc, acc): 5 | """ 6 | Function to check for single- or paired-end reads 7 | in a given `save_loc` for a particular `acc`ession. 8 | Returns "paired" if paired reads found, "single" if 9 | unpaired reads found, "both" if single- and paired- 10 | end reads found, and False if nothing matching that 11 | accession was found. 
def fetch_file(url, outfile, retries = 0):
    """
    Download a remote file from `url` to `outfile` using wget,
    retrying on failure.

    :param url: source URL to download.
    :param outfile: local path to write the downloaded file to.
    :param retries: number of *additional* attempts after the first
        (0 means try exactly once).
    :returns: return code of the final wget invocation (0 on success).
    """
    wget_cmd = ["wget", "-O", outfile, url]
    # Bug fix: `retries` was previously accepted but never used, so a
    # failed download was never retried. Retry until success or until
    # the allowed attempts are exhausted.
    retcode = call(wget_cmd)
    attempts_left = retries
    while retcode != 0 and attempts_left > 0:
        print("Download failed (wget exit " + str(retcode) + "), retrying: " + url)
        retcode = call(wget_cmd)
        attempts_left -= 1
    return retcode
def fasta_to_fastq(fp_fa, fp_fq, zipped, dummy_char = "I"):
    """
    Convert a FASTA file (at `fp_fa`, gzip-compressed if `zipped`) to a
    FASTQ file written to `fp_fq`, using `dummy_char` as the quality
    score for every base.

    :raises Exception: if `dummy_char` is not exactly one character.
    """
    if len(dummy_char) != 1:
        raise Exception("FASTQ dummy quality char must be only one char.")

    # Bug fix: the original called `gzip(fp_fa)` -- calling the module
    # object itself, a TypeError. Open gzipped input in *text* mode so
    # lines compare equal to str (not bytes) below.
    if zipped:
        f = gzip.open(fp_fa, 'rt')
    else:
        f = open(fp_fa)

    # `with` guarantees both handles are closed even if parsing fails.
    with f, open(fp_fq, 'w') as fq:
        seq = None  # None until the first '>' header is seen
        for line in f:
            if line[0] == '>':
                if seq is not None:
                    # Flush the previous record before starting a new one.
                    fq.write(seq + '\n')
                    fq.write('+\n')
                    fq.write(dummy_char * len(seq) + '\n')
                # line[1:] keeps the trailing newline from the header line
                fq.write('@' + line[1:])
                seq = ''
            else:
                seq += line.strip()

        # Flush the final record. The sentinel is None (not -1) so an
        # empty input file no longer raises TypeError on len(-1).
        if seq:
            fq.write(seq + '\n')
            fq.write('+\n')
            fq.write(dummy_char * len(seq) + '\n')
51 | echo "Unknown option - '$OPTARG'" 52 | exit 1 53 | ;; 54 | esac 55 | done 56 | 57 | TMPDIR=$TMPLOC/grabseqs_unittest 58 | mkdir -p $TMPDIR 59 | fs=`ls $TMPDIR | wc -l` 60 | 61 | if [ $fs -ne 0 ] ; then 62 | echo "Directory $TMPDIR not empty. Clean it or specify a testing location with -d [loc]" 63 | exit 1 64 | fi 65 | 66 | GREEN="\x1B[32m" 67 | RESET="\x1B[0m" 68 | PASS="${GREEN}\u2714${RESET}" 69 | FAIL="${RED}X${RESET}" 70 | 71 | # environment and package install 72 | 73 | function setup { 74 | 75 | CONDA_BASE=$(conda info --base) # see https://github.com/conda/conda/issues/7980 76 | source $CONDA_BASE/etc/profile.d/conda.sh # allows conda [de]activate in scripts 77 | verbose "Setting up conda environment..." 78 | conda env update --name=grabseqs-unittest --file environment.yml 79 | conda activate grabseqs-unittest 80 | # required for installing libs 81 | pip install setuptools 82 | verbose "Installing grabseqs library" 83 | python setup.py install 84 | 85 | # Fix CircleCI testing issue for iMicrobe 86 | if [ `echo $HOME | grep "/home/circleci" | wc -l` -eq 1 ]; then 87 | echo "Tests running on CircleCI, adding add'l dependency" 88 | pip uninstall -y "urllib3" 89 | pip install "urllib3" 90 | fi 91 | } 92 | 93 | # functions copied and adapted from sunbeam-labs/sunbeam 94 | 95 | function msg { 96 | echo -ne "${1}" 97 | } 98 | 99 | function verbose { 100 | if [ "$VERBOSE" = true ]; then 101 | echo -ne "${1}" 102 | fi 103 | } 104 | 105 | function broke { 106 | local RETCODE=$? 107 | msg "\nFailed command error output:\n`cat ${2}.err`\n" 108 | msg "${FAIL} (log: ${LOGFILE}.[out/err])\n" 109 | cleanup 1 110 | } 111 | 112 | function capture_output { 113 | msg "Running ${1}... " 114 | 115 | LOGFILE="${TMPDIR}/${1}" 116 | 117 | set -o pipefail 118 | if [ "$VERBOSE" = true ]; then 119 | OUTPUT_STRING="> >(tee ${LOGFILE}.out) 2> >(tee ${LOGFILE}.err >&2)" 120 | else 121 | OUTPUT_STRING="> ${LOGFILE}.out 2> ${LOGFILE}.err" 122 | fi 123 | trap "broke ${1} ${LOGFILE} $?" 
exit 124 | eval "${1} ${OUTPUT_STRING}" 125 | set +o pipefail 126 | trap "cleanup $?" exit 127 | msg "${PASS}\n" 128 | } 129 | 130 | function cleanup { 131 | local TMPRC=$? 132 | local RETCODE=$TMPRC 133 | if [ ${1} -gt ${TMPRC} ]; then 134 | RETCODE=${1} 135 | else 136 | RETCODE=${TMPRC} 137 | fi 138 | cd $STARTING_DIR 139 | if [ $RETCODE -ne 0 ]; then 140 | msg "${RED}-- TESTS FAILED --${RESET}\n" 141 | else 142 | msg "${GREEN}-- TESTS SUCCEEDED --${RESET}\n" 143 | fi 144 | conda deactivate 145 | 146 | verbose "Deleting temporary conda environment \n" 147 | conda env remove -yqn grabseqs-unittest 148 | 149 | # Remove temp directory if created by us 150 | if [ "$USER_TMPDIR" = false ]; then 151 | verbose "Deleting temporary directory ${TMPDIR}\n" 152 | rm -rf $TMPDIR 153 | fi 154 | 155 | # Exit, maintaining previous return code 156 | exit $RETCODE 157 | 158 | echo -e "$PASS all tests passed!" 159 | } 160 | 161 | function run_test_suite { 162 | for testcase in $(declare -f | grep -o "^test[a-zA-Z_]*") ; do 163 | capture_output ${testcase} 164 | done 165 | } 166 | 167 | trap cleanup exit 168 | 169 | capture_output setup 170 | 171 | # read in tests 172 | source tests/test_general.bash 173 | 174 | if [ "$SKIP_IMICROBE" = false ]; then 175 | echo "iMicrobe temporarily removed" 176 | # source tests/test_imicrobe.bash 177 | # temp until imicrobe issues resolved 178 | fi 179 | 180 | if [ "$SKIP_SRA" = false ]; then 181 | source tests/test_sra.bash 182 | fi 183 | 184 | if [ "$SKIP_MGRAST" = false ]; then 185 | source tests/test_mgrast.bash 186 | fi 187 | 188 | 189 | # Run single test, if specified, or all found tests otherwise 190 | if [ ! 
def process_newrepo(args, zip_func = "gzip"):
    """
    Controller function for newrepo data/metadata downloading.

    :param args: parsed argparse namespace from the newrepo subparser.
    :param zip_func: compression tool to use ("gzip" or "pigz"); added
        with a default so the signature matches how __init__.py calls
        the other `process_*` controllers (backward compatible).
    :returns: aggregated pandas metadata DataFrame, or None.
    """

    # check for any necessary dependencies here

    metadata_agg = None

    # loop through passed identifiers
    for newrepo_identifier in args.newrepoid:
        sample_list, metadata_agg = map_newrepo_project_acc(newrepo_identifier, metadata_agg)
        for sample in sample_list:
            # Bug fix: the loop variable is `sample`; the original passed
            # an undefined name `acc` (NameError on the first iteration).
            download_newrepo_sample(sample,
                                    args.retries,
                                    args.threads,
                                    args.outdir,
                                    args.force,
                                    args.list,
                                    zip_func)

    # Metadata may be available from different locations for different repos.
    # In this example, the metadata and project -> accession mapping are done
    # in the same step (as this is the most common configuration we've encountered).
    return metadata_agg
For an example of this, see the sra.py module 77 | 78 | 79 | # This is example code from sra.py showing how one might append all metadata from 80 | # one run into the same dataframe 81 | if type(metadata_agg) == type(None): 82 | metadata_agg = pd.read_csv(StringIO(metadata.text)) 83 | else: 84 | metadata_agg = metadata_agg.append(pd.read_csv(StringIO(metadata.text)),sort=True) 85 | 86 | return sample_list, metadata 87 | 88 | def download_newrepo_sample(acc, retries = 0, threads = 1, loc='', force=False, list_only=False, zip_func = "gzip"): 89 | """ 90 | Helper function to download sequences given an newrepo `acc`ession, 91 | with support for a particular number of `retries`. Can use multiple 92 | `threads` with pigz (if data are not already compressed on arrival). 93 | """ 94 | 95 | # LISTING OPTION 2: If the information about whether samples are paired or 96 | # unpaired is only available from a sample-specific page, it usually makes more 97 | # sense to look that up here, and then just skip the downloading part. For an 98 | # example of this, see the mg-rast.py module 99 | 100 | # Make sure to check that the sample isn't already downloaded 101 | if not force: 102 | found = check_existing(loc, acc) 103 | if found != False: 104 | print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download") 105 | return False 106 | 107 | # Need to know this 108 | paired = True 109 | 110 | # Generally, unless there's a tool like fasterq-dump that downloads both reads, 111 | # it's just easier to iterate through file paths (i.e. either one unpaired, or 112 | # two paired). 
113 | seq_urls = [] 114 | file_paths = build_paths(acc, loc, paired) #see utils.py for details 115 | 116 | for i in range(len(seq_urls)): 117 | print("Downloading accession "+acc+" from newrepo repository") 118 | # fetch_file should work for most things where a URL is available 119 | retcode = fetch_file(seq_urls[i],file_paths[i],retries) 120 | 121 | # There are a number of things you may want to do here: check and handle 122 | # downloaded file integrity, convert to .fastq (see mgrast.py for an example 123 | # of a scenario dealing with .fastx in general), retries, etc. 124 | 125 | print("Compressing .fastq") 126 | gzip_files(file_paths, zip_func, threads) 127 | 128 | return True 129 | -------------------------------------------------------------------------------- /faq/faq.md: -------------------------------------------------------------------------------- 1 | # grabseqs FAQ 2 | 3 | This page provides a few, hopefully helpful, hints that go above and beyond the [README](https://github.com/louiejtaylor/grabseqs/blob/master/README.md). 4 | 5 | Sections: [General](#general-faqs) | [SRA](#sra-faqs) | [MG-RAST](#mg-rast-faqs) | [iMicrobe](#imicrobe-faqs) 6 | 7 | ## General FAQs 8 | 9 | - **What if I have a file containing a list of many accessions I want to download, and don't want to type them all on the command-line?** 10 | 11 | Let's say you have a newline-separated list of SRA accession numbers in a file called `acc.txt`. You can pass those through to grabseqs like so: 12 | 13 | grabseqs sra $(cat acc.txt) 14 | 15 | - **I can't install on Python version 3.X through conda** 16 | 17 | Installation and release through conda explicitly uses Python versions 3.6 and 3.7 (as of Jan 2020). 
If you're using a different version of Python 3, try installing the grabseqs [requirements](https://github.com/louiejtaylor/grabseqs/blob/master/environment.yml) only via conda, then installing the grabseqs package through pip (`pip install grabseqs`) since the PyPI package is not built for a specific Python 3 minor version. 18 | 19 | - **The reads aren't downloading, why?** 20 | 21 | Many possibilites here, and often they are respository-dependent. General tips: 22 | 23 | 1) Make sure your internet connection is good (and that you have permission to download files on whatever system you're using). 24 | 25 | 2) Try again later. Sporadic connection problems are common. 26 | 27 | 3) Make sure the reads are available/have been released to the public. Grabseqs only accesses publically available reads, so if you can't download from the SRA/iMicrobe/MG-RAST website without logging in, neither can grabseqs. 28 | 29 | - **What format are reads?** 30 | 31 | Reads are saved as gzipped FASTQ files (extension `.fastq.gz`). If the repository data is in .fasta format, dummy quality scores are added. 32 | 33 | - **How is the metadata formatted?** 34 | 35 | Metadata is downloaded and stored in .csv format (to the filename specified by the `-m` flag, e.g. `-m metadata.csv`). Metadata is appended if the filename already exists (assuming the file specified is in proper .csv format). Column names in the repository metadata are maintained. We do not recommend combining metadata from different repositories--while this will work without error, we do not parse/rename columns from each repository. 36 | 37 | - **Can you/can I add X repository?** 38 | 39 | Since downloading from repositories is modular, adding new repositories is hopefully simple. 
We provide a [template](https://github.com/louiejtaylor/grabseqs/blob/master/template.py) for adding new repositories--each new repo is essentially handled as a separate argparse subparser 40 | 41 | - **Who/what do I cite?** 42 | 43 | We would appreciate you referencing our GitHub page if you find this tool useful. More importantly, be fair to and appreciative of the researchers who generate the data, and the organizations who make the data available to the public. At minimum, citation of the paper/dataset(/repository) is appropriate. It is your responsibility to abide by the guidelines of the groups/repositories who make data available. Open data faciliates collaboration--if you're not sure, ask! 44 | 45 | [Top](#grabseqs-faq) 46 | 47 | ## SRA FAQs 48 | 49 | - **What default arguments do you use for calling fasterq/fastq-dump?** 50 | 51 | We use arguments that remove technical reads, return gzipped fastq files (when available), and split paired reads into separate files (with a third file for reads without a mate). Specifically, the commands look like: 52 | 53 | fasterq-dump -e {thread_num} -f -3 SRR######### 54 | # or 55 | fastq-dump --gzip --split-3 --skip-technical SRR######### 56 | # both of these can have "-O /path/to/outdir/" optionally 57 | # appended before the accession if specified by the user 58 | 59 | For the current version of the code, see the `run_fasterq_dump` function within the [sra.py module](https://github.com/louiejtaylor/grabseqs/blob/master/grabseqslib/sra.py). 
60 | 61 | If you'd like to pass your own arguments to either of these functions, use the `--custom_fqdump_args` parameter like so: 62 | 63 | grabseqs sra SRP####### -r 0 --custom_fqdump_args="--split-spot --include-technical --progress" 64 | 65 | - **Why am I running out of space?** 66 | 67 | If you're going to be using SRA data, after you've installed sra-tools, run `vdb-config -i` and turn off local file caching unless you want extra copies of the downloaded sequences taking up space ([read more here](https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration)). 68 | 69 | - **My reads are not paired properly.** 70 | 71 | Sometimes fasterq-dump filters out reads but won't also filter the mate, and I haven't figured out why, or how to circumvent it. Try adding the `--use_fastq_dump` flag--it seems that fastq-dump handles this situation better. 72 | 73 | [Top](#grabseqs-faq) 74 | 75 | ## MG-RAST FAQs 76 | 77 | - **My accession can't be found by grabseqs.** 78 | 79 | Many of the projects in MG-RAST are not publically accessible. If you're having trouble with a particular accession number or project, go to the [MG-RAST website](http://www.mg-rast.org/) and make sure you can download it by hand first. **Please do this first, as a number of samples which were previously available for download are now unavailable.** If this works fine, please [open an issue](https://github.com/louiejtaylor/grabseqs/issues) and we'll check it out! 80 | 81 | [Top](#grabseqs-faq) 82 | 83 | ## iMicrobe FAQs 84 | 85 | 2025-08-14: currently non-functional 86 | 87 | - **My iMicrobe download isn't working.** 88 | 89 | As with MG-RAST, many of the iMicrobe projects are not available to the public. If you're having download troubles, see whether you can download a test sample manually from [the iMicrobe website](https://www.imicrobe.us/). iMicrobe seems to have reads stored a lot of different ways. 
For iMicrobe, we handle reads that are either in .fastq or .fasta (adding dummy scores in .fastq conversion), either .gzipped or not, and either paired or unpaired. If you come across reads that are in another format, or you can download from the website but not through grabseqs, please [open an issue](https://github.com/louiejtaylor/grabseqs/issues) and we'll take a look. 90 | 91 | [Top](#grabseqs-faq) 92 | -------------------------------------------------------------------------------- /grabseqslib/mgrast.py: -------------------------------------------------------------------------------- 1 | import requests, os, json 2 | import pandas as pd 3 | from io import StringIO 4 | from subprocess import call 5 | from grabseqslib.utils import check_existing, fetch_file, check_filetype, fasta_to_fastq, gzip_files 6 | 7 | def add_mgrast_subparser(subparser): 8 | """ 9 | Function to add the MG-RAST subparser. 10 | """ 11 | 12 | parser_rast = subparser.add_parser('mgrast', help="download from MG-RAST") 13 | parser_rast.add_argument('rastid', type=str, nargs='+', 14 | help="One or more MG-RAST project or sample identifiers (mgp####/mgm######)") 15 | 16 | parser_rast.add_argument('-m', dest="metadata", type=str, default="", 17 | help="filename in which to save metadata (.csv format, relative to OUTDIR)") 18 | parser_rast.add_argument('-o', dest="outdir", type=str, default="", 19 | help="directory in which to save output. 
created if it doesn't exist") 20 | parser_rast.add_argument('-r',dest="retries", type=int, default=0, 21 | help="number of times to retry download") 22 | parser_rast.add_argument('-t',dest="threads", type=int, default=1, 23 | help="threads to use (for pigz)") 24 | 25 | parser_rast.add_argument('-f', dest="force", action="store_true", 26 | help = "force re-download of files") 27 | parser_rast.add_argument('-l', dest="list", action="store_true", 28 | help="list (but do not download) samples to be grabbed") 29 | 30 | def process_mgrast(args, zip_func): 31 | """ 32 | Top-level function to process MG-RAST download. Returns aggregated metadata. 33 | """ 34 | metadata_agg = None 35 | for rast_proj in args.rastid: 36 | # get targets 37 | target_list = get_mgrast_acc_metadata(rast_proj) 38 | for target in target_list: 39 | # get samples and/or metadata 40 | metadata_agg = download_mgrast_sample(target, 41 | args.retries, 42 | args.threads, 43 | args.outdir, 44 | args.force, 45 | args.list, 46 | not (args.metadata == ""), 47 | metadata_agg, zip_func) 48 | return metadata_agg 49 | 50 | def get_mgrast_acc_metadata(pacc): 51 | """ 52 | Function to get list of MG-RAST sample accession numbers from a particular 53 | project. Takes project accession number `pacc` and returns a list of mgm 54 | accession numbers. 55 | """ 56 | if pacc[:3] == "mgm": 57 | return [pacc] 58 | elif pacc[:3] != "mgp": 59 | raise NameError("Unknown prefix: " + pacc[:3] + ". 
Should be 'mgm' or 'mgp'.") 60 | metadata_json = json.loads(requests.get("http://api.metagenomics.anl.gov/metadata/export/"+pacc).text) 61 | sample_list = [] 62 | for sample in metadata_json["samples"]: 63 | sample_list.append(sample["libraries"][0]["data"]["metagenome_id"]["value"]) #metadata: ["data"] 64 | return sample_list 65 | 66 | def download_mgrast_sample(acc, retries = 0, threads = 1, loc='', force=False, list_only=False, download_metadata=False, metadata_agg = None, zip_func = "gzip"): 67 | """ 68 | Helper function to download original (uploaded) MG-RAST `acc`ession, 69 | with support for a particular number of `retries`. Can use multiple 70 | `threads` with pigz (if data are not already compressed on arrival). 71 | Also will optionally `download_metadata`. 72 | """ 73 | read_stages = ["050.1", "050.2"] # R1 and R2 (if paired) 74 | 75 | stage_json = json.loads(requests.get("http://api.metagenomics.anl.gov/download/"+acc).text) 76 | stages_to_grab = [] 77 | for stage in stage_json["data"]: 78 | if stage["file_id"] in read_stages: 79 | stages_to_grab.append(stage["file_id"]) 80 | stages_to_grab = sorted(stages_to_grab) # sort because json 81 | 82 | if len(stages_to_grab) == 0: 83 | raise Exception("No reads found for accession: "+acc) 84 | else: 85 | if len(stages_to_grab) == 1: 86 | fext = [""] # unpaired, no ext 87 | else: 88 | fext = ["_"+str(i+1) for i in range(len(stages_to_grab))] # paired 89 | if download_metadata: 90 | metadata_json = json.loads(requests.get("http://api.metagenomics.anl.gov/metadata/export/"+acc).text) 91 | sample_info = metadata_json["mixs"] 92 | colnames = ["mgm_id"]+list(sorted(list(sample_info.keys()))) 93 | colvals = [acc]+[str(sample_info[x]) for x in colnames[1:]] 94 | formatted_table = ','.join(colnames)+'\n'+','.join(colvals) 95 | if type(metadata_agg) == type(None): 96 | metadata_agg = pd.read_csv(StringIO(formatted_table)) 97 | else: 98 | metadata_agg = 
pd.concat([metadata_agg,pd.read_csv(StringIO(formatted_table))],sort=True) 99 | if list_only: 100 | print(','.join([acc+ext+".fastq.gz" for ext in fext])) 101 | else: 102 | if not force: 103 | found = check_existing(loc, acc) 104 | if found != False: 105 | print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download") 106 | return metadata_agg 107 | 108 | fa_paths = [os.path.join(loc,acc+ext+".fasta") for ext in fext] 109 | fq_paths = [os.path.join(loc,acc+ext+".fastq") for ext in fext] 110 | 111 | for i in range(len(fa_paths)): 112 | fa_path = fa_paths[i] 113 | fq_path = fq_paths[i] 114 | file_url = "http://api.metagenomics.anl.gov/download/"+acc+"?file="+stages_to_grab[i] 115 | retcode = fetch_file(file_url,fa_path,retries) 116 | ftype = check_filetype(fa_path) 117 | gzipped = ftype.endswith('.gz') 118 | if ftype.startswith("fasta"): 119 | print("Converting .fasta to .fastq (adding dummy quality scores), compressing") 120 | fasta_to_fastq(fa_path, fq_path, gzipped) 121 | retcode = call(["rm "+fa_path], shell=True) # get rid of old fasta 122 | rzip = gzip_files(fq_path, zip_func, threads) 123 | #rzip = call(["pigz -f -p "+ str(threads) + ' ' + fq_path], shell=True) 124 | elif ftype.startswith("fastq"): 125 | if gzipped: 126 | print("downloaded file in .fastq.gz format already!") 127 | call(["mv", fa_path, fq_path+".gz"]) 128 | else: 129 | print("downloaded file in .fastq format already, compressing .fastq") 130 | call(["mv", fa_path, fq_path]) 131 | rzip = gzip_files(fq_path, zip_func, threads) 132 | else: 133 | print("requested sample "+acc+" does not appear to be in .fasta or .fastq format. 
This may be because it is not publically accessible from MG-RAST.") 134 | return metadata_agg 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # grabseqs 2 | 3 | Utility for simplifying bulk downloading data from next-generation sequencing repositories, like [NCBI SRA](https://www.ncbi.nlm.nih.gov/sra/), [MG-RAST](http://www.mg-rast.org/). 4 | 5 | [![CircleCI](https://circleci.com/gh/louiejtaylor/grabseqs.svg?style=shield)](https://circleci.com/gh/louiejtaylor/grabseqs) [![Conda version](https://anaconda.org/louiejtaylor/grabseqs/badges/version.svg)](https://anaconda.org/louiejtaylor/grabseqs) [![Conda downloads](https://anaconda.org/louiejtaylor/grabseqs/badges/downloads.svg)](https://anaconda.org/louiejtaylor/grabseqs/files) [![Paper link](https://img.shields.io/badge/Published%20in-Bioinformatics-126888.svg)](https://doi.org/10.1093/bioinformatics/btaa167) 6 | 7 | [iMicrobe](https://www.imicrobe.us/) is currently not supported--working to remedy this (2025/08/14) 8 | 9 | ## Install 10 | 11 | Install grabseqs and all dependencies [via conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html): 12 | 13 | conda install grabseqs -c louiejtaylor -c bioconda -c conda-forge 14 | 15 | Or with pip (and install the non-Python [dependencies](https://github.com/louiejtaylor/grabseqs#dependencies) yourself): 16 | 17 | pip install grabseqs 18 | 19 | **Note:** If you're using SRA data, after you've installed sra-tools, run `vdb-config -i` and turn off local file caching unless you want extra copies of the downloaded sequences taking up space ([read more here](https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration)). 
20 | 21 | ## Quick start 22 | 23 | Download all samples from a single SRA Project: 24 | 25 | grabseqs sra SRP####### 26 | 27 | Or any combination of projects (S/ERP), runs (S/ERR), BioProjects (PRJNA): 28 | 29 | grabseqs sra SRR######## ERP####### PRJNA######## ERR######## 30 | 31 | If you'd like to do a dry run and just get a list of samples that will be downloaded, pass `-l`: 32 | 33 | grabseqs sra -l SRP######## 34 | 35 | Similar syntax works for MG-RAST: 36 | 37 | grabseqs mgrast mgp##### mgm####### 38 | 39 | ## Detailed usage 40 | 41 | See the [grabseqs FAQ](https://github.com/louiejtaylor/grabseqs/blob/master/faq/faq.md) for detailed troubleshooting tips. 42 | 43 | Fun options: 44 | 45 | grabseqs sra -t 10 -m metadata.csv -o proj/ -r 3 SRP####### 46 | 47 | (translation: use 10 threads, save metadata to `proj/metadata.csv`, download to the dir `proj/`, retry failed downloads 3x, get all samples from SRP#######) 48 | 49 | If you'd like to do a dry run and only get a list of samples that will be downloaded, pass `-l`: 50 | 51 | grabseqs sra -l SRP######## 52 | 53 | If you'd like to pass your own arguments to `fasterq-dump` to get data in a slightly different format, you can do so like this: 54 | 55 | grabseqs sra SRP####### -r 0 --custom_fqdump_args="--split-spot --progress" 56 | 57 | Full usage: 58 | 59 | grabseqs sra [-h] [-m METADATA] [-o OUTDIR] [-r RETRIES] [-t THREADS] 60 | [-f] [-l] [--no_parsing] [--parse_run_ids] 61 | [--use_fastq_dump] 62 | id [id ...] 63 | 64 | positional arguments: 65 | id One or more BioProject, ERR/SRR or ERP/SRP number(s) 66 | 67 | optional arguments: 68 | -h, --help show this help message and exit 69 | -m METADATA filename in which to save SRA metadata (.csv format, 70 | relative to OUTDIR) 71 | -o OUTDIR directory in which to save output. 
created if it doesn't 72 | exist 73 | -r RETRIES number of times to retry download 74 | -t THREADS threads to use (for fasterq-dump/pigz) 75 | -f force re-download of files 76 | -l list (but do not download) samples to be grabbed 77 | --parse_run_ids parse SRR/ERR identifers (do not pass straight to fasterq- 78 | dump) 79 | --custom_fqdump_args CUSTOM_FQD_ARGS 80 | "string" containing args to pass to fastq-dump 81 | --use_fastq_dump use legacy fastq-dump instead of fasterq-dump (no 82 | multithreaded downloading) 83 | 84 | Downloads .fastq.gz files to `OUTDIR` (or the working directory if not specified). If the `-m` flag is passed, saves metadata to `OUTDIR` with filename `METADATA` in csv format. 85 | 86 | Similar options are available for downloading from MG-RAST: 87 | 88 | grabseqs mgrast [-h] [-m METADATA] [-o OUTDIR] [-r RETRIES] 89 | [-t THREADS] [-f] [-l] 90 | rastid [rastid ...] 91 | 92 | ## Troubleshooting 93 | 94 | See the [grabseqs FAQ](https://github.com/louiejtaylor/grabseqs/blob/master/faq/faq.md) for detailed troubleshooting tips. If the FAQs don't fix your problem, feel free to [open an issue](https://github.com/louiejtaylor/grabseqs/issues)! 95 | 96 | ## Dependencies 97 | 98 | - Python 3 (external packages req'd: requests, requests-html, pandas, fake-useragent) 99 | - sra-tools>3.2 100 | - pigz 101 | - wget 102 | 103 | If you use conda (on Linux), these will be installed for you! 104 | 105 | Grabseqs runs on Mac or Linux. 
We've tested on these specific OSes: 106 | 107 | Linux (conda or pip): 108 | - CentOS 6, 7, and 8 109 | - Debian 9 and 10 110 | - Ubuntu 16.04, 18.04, and 19.10 111 | - Red Hat Enterprise 6, 7, and 8 112 | - SUSE Enterprise 12 and 15 113 | 114 | Mac (pip): 115 | - MacOS 10.14 116 | 117 | Grabseqs has been tested and works with the following version of the Python dependencies (though these are neither minimal nor pinned version numbers): 118 | 119 | - requests 2.22.0 120 | - pandas>2 121 | 122 | ## Citation 123 | 124 | If you use grabseqs in your work, please cite: 125 | 126 | Louis J Taylor, Arwa Abbas, Frederic D Bushman. "grabseqs: Simple downloading of reads and metadata from multiple next-generation sequencing data repositories." *Bioinformatics*, (2020), btaa167, https://doi.org/10.1093/bioinformatics/btaa167 127 | 128 | Please also cite the researchers who generated the data (and the repository, if appropriate)! 129 | 130 | ------------ 131 | 132 | ## Changelog 133 | 134 | **1.0.0** (2025-08-14) 135 | - Added a walk-through for adding a new repo using `template.py` 136 | - Better handling for invalid SRA accession numbers 137 | - Update endpoint for NCBI for SRA downloads 138 | - Temporarily remove iMicrobe--needs rewrite to use a different tool 139 | 140 | **0.7.0** (2020-01-29) 141 | - Allow users to pass custom args to fast(er)q-dump 142 | - Minor re-writes of download handling code for easier readability 143 | 144 | **0.6.1** (2019-12-20) 145 | - Validate compressed files (fix #8 and #34) 146 | 147 | **0.6.0** (2019-12-12) 148 | - Gracefully handle incomplete or missing dependencies 149 | - Major rewrite of test suite 150 | 151 | **0.5.2** (2019-12-05) 152 | - Improvements to work with multiple versions of Python 3 153 | 154 | **0.5.1** (2019-11-23) 155 | - Hotfix handling outdated versions of sra-tools 156 | 157 | **0.5.0** (2019-04-11) 158 | - Metadata available for all sources in .csv format 159 | 160 | ## History 161 | 162 | This project spawned out 
of/incorporates code from [hisss](https://github.com/louiejtaylor/hisss); many thanks to [ArwaAbbas](https://github.com/ArwaAbbas) for helping make this work! 163 | -------------------------------------------------------------------------------- /grabseqslib/sra.py: -------------------------------------------------------------------------------- 1 | import requests, time, shutil, sys 2 | import pandas as pd 3 | from io import StringIO 4 | from subprocess import call 5 | from grabseqslib.utils import check_existing, build_paths, gzip_files 6 | 7 | def process_sra(args, zip_func): 8 | """ 9 | High-level logic for SRA download processing. Takes 10 | `args` from grabseqslib argument parser and `zip_func` 11 | """ 12 | # check deps 13 | dep_list = ["fastq-dump", "fasterq-dump"] 14 | deps_have = [shutil.which(dep) for dep in dep_list] 15 | if (not deps_have[0]) and (not deps_have[1]): # no sra-tools 16 | print("Neither fastq-dump nor fasterq-dump found; one is required. Please install sra-tools") 17 | sys.exit(1) 18 | elif not deps_have[1]: 19 | use_fastq_dump = True 20 | else: 21 | use_fastq_dump = args.fastqdump 22 | 23 | metadata_agg = None 24 | 25 | for sra_identifier in args.id: 26 | # get targets and metadata 27 | acclist, metadata_agg = get_sra_acc_metadata(sra_identifier, 28 | args.outdir, 29 | args.list, 30 | not args.SRR_parsing, 31 | metadata_agg) 32 | for acc in acclist: 33 | # get samples 34 | run_fasterq_dump(acc, 35 | args.retries, 36 | args.threads, 37 | args.outdir, 38 | args.force, 39 | use_fastq_dump, 40 | args.custom_fqd_args, 41 | zip_func) 42 | 43 | return metadata_agg 44 | 45 | 46 | def add_sra_subparser(subparser): 47 | """ 48 | Function to add subparser for SRA data. 
49 | """ 50 | # Parser 51 | parser_sra = subparser.add_parser('sra', help="download from SRA") 52 | parser_sra.add_argument('id', type=str, nargs='+', 53 | help="One or more BioProject, ERR/SRR or ERP/SRP number(s)") 54 | 55 | # Options 56 | parser_sra.add_argument('-m', dest="metadata", type=str, default="", 57 | help="filename in which to save SRA metadata (.csv format, relative to OUTDIR)") 58 | parser_sra.add_argument('-o', dest="outdir", type=str, default="", 59 | help="directory in which to save output. created if it doesn't exist") 60 | parser_sra.add_argument('-r',dest="retries", type=int, default=2, 61 | help="number of times to retry download") 62 | parser_sra.add_argument('-t',dest="threads", type=int, default=1, 63 | help="threads to use (for fasterq-dump/pigz)") 64 | 65 | # General flags 66 | parser_sra.add_argument('-f', dest="force", action="store_true", 67 | help = "force re-download of files") 68 | parser_sra.add_argument('-l', dest="list", action="store_true", 69 | help="list (but do not download) samples to be grabbed") 70 | 71 | # SRA-specific flags 72 | parser_sra.add_argument('--parse_run_ids', dest="SRR_parsing", action="store_true", 73 | help="parse SRR/ERR identifers (do not pass straight to fasterq-dump)") 74 | parser_sra.add_argument("--use_fastq_dump", dest="fastqdump", action="store_true", 75 | help="use legacy fastq-dump instead of fasterq-dump (no multithreaded downloading)") 76 | parser_sra.add_argument("--custom_fqdump_args", dest="custom_fqd_args", type=str, default="", 77 | help="'string' containing args to pass to fast(er)q-dump") 78 | 79 | # LEGACY: this will be removed in the next major version as this is now default. 
def get_sra_acc_metadata(pacc, loc = '', list_only = False, no_SRR_parsing = True, metadata_agg = None):
    """
    Function to get list of SRA accession numbers from a particular project.
    Takes project accession number `pacc` and returns a list of SRA
    accession numbers. Optional arguments to `save` metadata .csv in a specified
    `loc`ation.
    Originally featured in: https://github.com/louiejtaylor/hisss

    Returns a tuple (run_list, metadata_agg): the run accessions to download
    (empty when `list_only`) and the aggregated runinfo DataFrame.
    Raises ValueError when the runinfo lookup returns no usable table.
    """
    # Grab metadata (runinfo CSV) for the given accession number
    pacc = pacc.strip()
    metadata = requests.get("https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&term="+pacc)
    lines = [l.split(',') for l in metadata.text.split("\n")]
    try:
        run_col = lines[0].index("Run")
    except ValueError: # "Run" column always present unless search failed
        raise ValueError("Could not find samples for accession: "+pacc+". If this accession number is valid, try re-running.")

    # Generate list of runs to download. ROBUSTNESS: also require the row to
    # be long enough (guards against blank/ragged lines raising IndexError).
    run_list = [l[run_col] for l in lines[1:] if len(l) > run_col and len(l[run_col]) > 0]

    # Aggregate metadata if multiple samples/projects are being asked for.
    # BUGFIX: DataFrame.append was removed in pandas 2.x; use pd.concat
    # (consistent with mgrast.py).
    if metadata_agg is None:
        metadata_agg = pd.read_csv(StringIO(metadata.text))
    else:
        metadata_agg = pd.concat([metadata_agg, pd.read_csv(StringIO(metadata.text))], sort=True)

    if list_only:
        # Do not download but read metadata and say what will be downloaded
        layout_col = lines[0].index("LibraryLayout")
        min_len = max(run_col, layout_col) + 1  # row must hold both columns
        if no_SRR_parsing and (pacc.startswith('SRR') or pacc.startswith('ERR')):
            layout_list = [l[layout_col] for l in lines[1:] if len(l) >= min_len and len(l[run_col]) > 0 and l[run_col] == pacc]
            run_list = [pacc]
        else:
            layout_list = [l[layout_col] for l in lines[1:] if len(l) >= min_len and len(l[run_col]) > 0]

        # Print filenames that should come down (assuming the repo metadata is correct)
        for i in range(len(layout_list)):
            if layout_list[i] == "SINGLE":
                print(run_list[i]+".fastq.gz")
            elif layout_list[i] == "PAIRED":
                print(run_list[i]+"_1.fastq.gz,"+run_list[i]+"_2.fastq.gz")
            else:
                raise Exception("Unknown library layout: "+layout_list[i])

        # If we're here, we're listing and not downloading. So we don't return any accessions
        # to download (empty list), but the user may still want the aggregated metadata
        return [], metadata_agg
    else:
        if no_SRR_parsing: # default, if given a Run return a Run
            if pacc.startswith('SRR') or pacc.startswith('ERR'):
                return [pacc], metadata_agg
        # otherwise, return all the Run accessions associated with whatever identifier was passed.
        return run_list, metadata_agg
def run_fasterq_dump(acc, retries = 2, threads = 1, loc='', force=False, fastqdump=False, custom_args="", zip_func="gzip"):
    """
    Helper function to run fast(er)q-dump to grab a particular `acc`ession,
    with support for a particular number of `retries`. Can use multiple
    `threads`.

    :param acc: run accession passed straight to fast(er)q-dump
    :param retries: remaining retry count (total attempts = retries + 1)
    :param threads: thread count for fasterq-dump (-e) and the compressor
    :param loc: output directory ('' means the current directory)
    :param force: re-download even if matching output already exists
    :param fastqdump: use legacy fastq-dump instead of fasterq-dump
    :param custom_args: if non-empty, replaces the default dump arguments
    :param zip_func: compression program used on fasterq-dump output
    Raises Exception once all retries are exhausted.
    """
    skip = False
    retcode = 1
    # Retry loop: each pass makes one download attempt; `retries` counts down.
    while retries >= 0:
        if not force:
            found = check_existing(loc, acc)
            if found != False:
                print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download")
                skip = True
                break
        if not skip:
            # Build the dump command: defaults unless the user supplied args.
            if len(custom_args) == 0:
                if fastqdump: # use legacy fastq-dump
                    # gzipped output, pairs split, technical reads dropped
                    cmd = ["fastq-dump", "--gzip", "--split-3", "--skip-technical"]
                else:
                    cmd = ["fasterq-dump", "-e", str(threads), "-f", "-3"]
            else:
                # pick the binary name, then append the user's raw arguments
                suffix = "er"
                if fastqdump:
                    suffix = ""
                prog_to_run = "fast" + suffix + "q-dump"
                cmd = [prog_to_run] + custom_args.split(' ')
            if loc != "":
                cmd = cmd + ['-O', loc]
            cmd = cmd + [acc]
            print("running: "+" ".join(cmd))
            retcode = call(cmd)
            rgzip = 0
            if retcode == 0:
                if not fastqdump:
                    # fasterq-dump emits uncompressed .fastq, so
                    # zip all possible output files for that acc
                    fnames = build_paths(acc, loc, False) + build_paths(acc, loc, True)
                    rgzip = gzip_files(fnames, zip_func, threads)
                if rgzip == 0:
                    if check_existing(loc, acc) != False:
                        break # success: compressed output is in place

            # only here if downloading and zipping failed
            print("SRA download for acc "+acc+" failed, retrying "+str(retries)+" more times.")
            if retries > 0:
                time.sleep(100) #TODO?: user-modifiable
                retries -= 1
            else:
                raise Exception("download for "+acc+" failed. fast(er)q-dump returned "+str(retcode)+", pigz returned "+str(rgzip)+".")
        else:
            break
27 | This relationship is often reflected in the accession numbers for samples/projects, and can be 28 | exploited in automating sample access--for example, MG-RAST project IDs are prefixed by "mgp", 29 | whereas sample (metagenome) names are prefixed by "mgm". 30 | 31 | Repository metadata is also an important point to consider. Is there programmatically-accessible 32 | metadata? If so, is it available on the project-level, sample-level, or both? For example, when 33 | accessing data from NCBI's SRA, you can request detailed, sample-level metadata from the API 34 | given only a project accession number, which greatly simplifies metadata processing. For MG-RAST 35 | and iMicrobe, on the other hand, APIs map project information to sample numbers, which then must 36 | be queried individually for sample-level metadata. 37 | 38 | ## Adding support for `newrepo` 39 | 40 | Now, we'll walk step-by-step through adding support for a new repository, `newrepo`, to grabseqs. 41 | The end goal is to be able to run the following command: 42 | 43 | grabseqs newrepo samp1234 proj123 44 | 45 | Where "samp1234" is an example sample accession to be downloaded, and "proj123" is an example 46 | project accession number. 47 | 48 | If you'd like to eventually make a pull request back into this repository, we recommend making a fork 49 | of this repository to work with. Either way, you'll likely want a local copy to work with, so clone away! 50 | 51 | ### Step 1: Make a new module 52 | 53 | We recommend starting with a copy of [`template.py`](https://github.com/louiejtaylor/grabseqs/blob/master/template.py), 54 | as it lays out the structure used in all the other grabseqs modules. Make a copy of this file in the 55 | `grabseqslib` directory, with a name that makes sense--for our example we'll call it `newrepo.py`. 56 | 57 | This template module contains four functions. 
We'll name them after `newrepo`, so they are: 58 | 59 | - `add_newrepo_subparser`: adds an argparse subparser for handling arguments to the grabseqs 60 | command-line tool for your repo 61 | - `process_newrepo`: controller function for handling input project/sample accession numbers 62 | - `map_newrepo_project_acc`: function to map project accession numbers to sample accession numbers 63 | for downloading 64 | - `download_newrepo_sample`: function to handle downloading of reads given a sample accession number 65 | 66 | These functions handle 99% of the work for newrepo downloads, and we'll walk through filling them out 67 | one-by-one. 68 | 69 | ### Step 2: Add the `newrepo` subparser 70 | 71 | The `add_newrepo_subparser` function is likely mostly done for you in the template. There are a number of 72 | options that you'll likely use for any repository (e.g. specifying an output directory, number of retries, 73 | forcing re-download of already-existing files, multithreading, and listing/dry-running \[rather than 74 | downloading\] samples matching a particular query). These options are pre-specified for you and shouldn't 75 | be tweaked much for consistency with other repository subparsers. If metadata is not programmatically 76 | available from `newrepo`, remove the `-m` option. 77 | 78 | If you'd like to include additional functions above and beyond these default options, you may add them to 79 | the argparser object within this function. See the `sra.py` file for examples of other options--there are 80 | a variety of options for the `grabseqs sra` subparser above and beyond the default options. 81 | 82 | You may have noticed that this function isn't quite hooked up to the main argparse instance yet. That's okay; 83 | we'll do this at the end when we edit `__init__.py`. 84 | 85 | ### Step 3: Add the controller logic 86 | 87 | From here on out, it's a good idea to have a good handle on the questions addressed in the "Repository 88 | structure" section above. 
Important questions include: 89 | 90 | - How can I programmatically access data, metadata, and map project accession numbers to sample accession 91 | numbers? E.g. is there an API endpoint (used in MG-RAST, iMicrobe metadata/downloads/mapping and 92 | SRA metadata/mapping) or a tool to aid in downloading (used in SRA downloads)? 93 | - Is metadata available? If so, is sample-level metadata accessible from a project accession number 94 | (easiest)? 95 | 96 | The answers to these questions determine how the `process_newrepo` controller function will be structured. 97 | The example in `template.py` assumes that metadata is available, and that sample-level metadata is present 98 | in the project-sample mapping step. Thus, the controller generally functions like so: 99 | 100 | # begin looping through accession numbers passed by user 101 | for newrepo_identifier in args.newrepoid: 102 | 103 | # for each project identifier, map it to sample identifiers and grab metadata (a pandas dataframe) 104 | sample_list, metadata_agg = map_newrepo_project_acc(newrepo_identifier, metadata_agg) 105 | 106 | # for each sample mapped to by the passed project identifier 107 | for sample in sample_list: 108 | 109 | # download that sample 110 | download_newrepo_sample(acc, 111 | args.retries, 112 | args.threads, 113 | args.outdir, 114 | args.force, 115 | args.list) 116 | 117 | Actual metadata saving is handled in a repository-agnostic fashion--the `process_newrepo` function will 118 | return the pandas dataframe containing metadata, which will then be saved in a safe/non-clobber-y way 119 | (with no additional effort necessary on your part). 120 | 121 | Now, let's go write the actual mapping logic. 122 | 123 | ### Step 4: Map project accessions to sample accessions 124 | 125 | The `map_newrepo_project_acc` maps project to sample accession numbers, returning a list of sample accession 126 | numbers. 
Depending on metadata availability, you may also access sample metadata in this mapping step, and 127 | it seems prudent to only make one API call when necessary, so we've written the example using this slightly 128 | more complicated workflow--this is also used in the SRA and MG-RAST modules. 129 | 130 | Generally, you can pass one or more project or sample accessions to grabseqs. Depending on from where metadata is 131 | obtained, you'll either want to avoid `map_newrepo_project_acc` altogether if a sample accession number is 132 | passed; or grab metadata and return a singleton list (containing the sample accession number) and metadata to your 133 | controller function. An example of using the pandas.DataFrame.append() method to concatenate multiple metadata 134 | tables is included in this function in the template file. 135 | 136 | The code here is dependent on the format of the project-sample map. SRA provides mapping information in csv format; 137 | the MG-RAST API returns JSON maps--feel free to use that code for inspiration. Your workflow might look something like 138 | this (based on the MG-RAST JSON workflow and using the `json` and `requests` libraries), where `pacc` is the 139 | accession number: 140 | 141 | # initialize vars 142 | sample_list = [] 143 | metadata_df = pd.DataFrame() 144 | 145 | # hit api 146 | metadata_json = json.loads(requests.get("http://api.newrepo.gov/metadata/export/"+str(pacc)).text) 147 | 148 | for sample in metadata_json["samples"]: 149 | sample_list.append(sample["value"]) 150 | # additional logic to add metadata lines to metadata_df 151 | 152 | If the user would like to list \[but not download\] the available samples (-l) and information on read paired-ness 153 | is available here, i.e. from a metadata table, this can be tested for here (and then return an empty 154 | `sample_list` to prevent downstream downloading). For an example of this workflow, see the `sra.py` module. 155 | 156 | ### Step 5: Download samples! 
157 | 158 | A bit of of boilerplate is included already, handling the `-f` (force) option: 159 | 160 | # Make sure to check that the sample isn't already downloaded 161 | if not force: 162 | # using check_existing from utils.py 163 | found = check_existing(loc, acc) 164 | if found != False: 165 | print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download") 166 | return False 167 | 168 | You can build the expected paths for the eventual downloaded reads like so: 169 | 170 | paired = True 171 | # using build_paths from utils.py 172 | file_paths = build_paths(acc, loc, paired) 173 | 174 | Generally, unless there's a tool like NCBI's fasterq-dump that downloads both reads in one command, it's 175 | just easier to iterate through file paths (i.e. either one unpaired, or two paired). 176 | 177 | If the file is directly available from an API URL, the `fetch_file` function from `grabseqs.utils` should serve 178 | you well (it uses `wget`, a grabseqs dependency): 179 | 180 | seq_urls = ["http://api.newrepo.gov/data/"+str(acc)+"_R1.fastq", 181 | "http://api.newrepo.gov/data/"+str(acc)+"_R2.fastq"] 182 | 183 | for i in range(len(seq_urls)): 184 | 185 | print("Downloading accession "+acc+" from newrepo") 186 | 187 | # fetch_file should work for most things where a URL is available 188 | retcode = fetch_file(seq_urls[i],file_paths[i],retries) 189 | 190 | # There are a number of things you may want to do here: check and handle 191 | # downloaded file integrity, convert to .fastq (see mgrast.py for an example 192 | # of a scenario dealing with .fastx in general), etc. 193 | 194 | print("Compressing .fastq") 195 | gzip_files(file_paths, zip_func, threads) 196 | 197 | If metadata is only available on a sample-wise basis, you may want to do metadata handling in this function 198 | as well, or in a separate function if two API calls are necessary. See `mgrast.py` for an example of metadata 199 | handling at the sample level. 
If 200 | 201 | Regarding sample listing/dry-running (-l)--if the information about whether samples are paired or unpaired is 202 | only available from a sample-specific source, it usually makes more sense to look that up here, and then just 203 | skip the downloading part. For an example of this workflow, see the `mgrast.py` module. 204 | 205 | Now we've written all the logic for argument parsing, metadata wrangling, project-sample accession mapping, and 206 | raw data downloading! We just have to hook it all together to the main grabseqs program. 207 | 208 | ### Step 6: Hooking up subparser and controller functions 209 | 210 | Here, you need to edit `__init__.py`. This should be fairly self explanatory based on what's already 211 | present for the other submodules, but you'll need to add the following: 212 | 213 | - Import your new functions: 214 | ```{python} 215 | from grabseqslib.sra import process_sra, add_sra_subparser 216 | from grabseqslib.imicrobe import process_imicrobe, add_imicrobe_subparser 217 | from grabseqslib.mgrast import process_mgrast, add_mgrast_subparser 218 | from grabseqslib.newrepo import process_newrepo, add_newrepo_subparser 219 | ``` 220 | - Add your new subparser: 221 | ```{python} 222 | add_sra_subparser(subpa) 223 | add_imicrobe_subparser(subpa) 224 | add_mgrast_subparser(subpa) 225 | add_mgrast_subparser(subpa) 226 | ``` 227 | - Check to see if the user called your subparser: 228 | ```{python} 229 | try: 230 | if args.newrepoid: 231 | repo = "newrepo" 232 | ``` 233 | - Finally, add a case for your controller function: 234 | ```{python} 235 | if repo == "SRA": 236 | metadata_agg = process_sra(args, zip_func) 237 | elif repo == "MG-RAST": 238 | metadata_agg = process_mgrast(args, zip_func) 239 | elif repo == "iMicrobe": 240 | metadata_agg = process_imicrobe(args, zip_func) 241 | elif repo == "newrepo": 242 | metadata_agg = process_newrepo(args, zip_func) 243 | ``` 244 | 245 | ## What next? 
246 | 247 | So, you've added a new repository--that's awesome (and thank you!!)! Feel free to open a pull request. We 248 | run grabseqs through a rigorous set of automated tests on every commit/weekly, so please write a new test 249 | or three testing data/metadata downloading, or any edge cases that you encounter that other users/developers 250 | might not know about. 251 | --------------------------------------------------------------------------------