├── conda-recipe ├── grabseqs │ ├── conda_build_config.yaml │ └── meta.yaml ├── build_all.sh └── conda-build.sh ├── environment.yml ├── .circleci ├── setup.sh └── config.yml ├── setup.py ├── LICENSE ├── tests ├── test_mgrast.bash ├── test_general.bash ├── test_imicrobe.bash ├── test_sra.bash └── run_tests.bash ├── .gitignore ├── grabseqslib ├── __init__.py ├── utils.py ├── mgrast.py └── sra.py ├── template.py ├── faq ├── faq.md └── template_usage.md └── README.md /conda-recipe/grabseqs/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | python: 2 | - 3.6 3 | - 3.7 4 | - 3.9 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | - louiejtaylor 5 | dependencies: 6 | - sra-tools>3.2 7 | - python 8 | - requests 9 | - pigz 10 | - wget 11 | - pandas 12 | -------------------------------------------------------------------------------- /conda-recipe/build_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd .. 4 | 5 | python3 setup.py sdist bdist_wheel 6 | 7 | twine upload dist/* 8 | 9 | rm -r grabseqs.egg-info/ 10 | rm -r build/ 11 | rm -r dist/ 12 | 13 | cd conda-recipe 14 | 15 | bash conda-build.sh 16 | -------------------------------------------------------------------------------- /conda-recipe/conda-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Assumes `conda config --set anaconda_upload yes` has 4 | # been run--otherwise just upload manually per the 5 | # instructions from conda-build. 
# Packaging configuration for grabseqs.
#
# NOTE(review): the original mixed `import setuptools` with
# `distutils.core.setup`; using setuptools' own setup() is required for
# entry_points / install_requires to work reliably.
from setuptools import setup

setup(
    name='grabseqs',
    version='1.0.0',
    description='Easily download reads from next-gen sequencing repositories like NCBI SRA',
    author='Louis J Taylor',
    # Assembled from fragments to deter address-harvesting bots.
    author_email='l'+'ouist'+'@'+'u'+'penn.edu',
    url='https://github.com/louiejtaylor/grabseqs',
    packages=['grabseqslib'],
    license='MIT License',
    entry_points={'console_scripts': [
        'grabseqs = grabseqslib:main'
    ]},
    install_requires=[
        'requests',
        # 'argparse' removed: it has been in the standard library since
        # Python 3.2 and listing the PyPI backport can break installs.
        'pandas>=2'
    ],
    classifiers=[
        'Intended Audience :: Science/Research',
        'Environment :: Console',
        'Environment :: Web Environment',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
    ],
    # `py_modules=['utils','sra','mgrast']` removed: those modules live
    # inside the `grabseqslib` package (already covered by `packages`);
    # no top-level modules with those names exist in this repository.
)
-vv" 16 | 17 | requirements: 18 | host: 19 | - pandas 20 | - python {{ python }} 21 | - requests 22 | run: 23 | - pandas 24 | - python 25 | - requests 26 | - sra-tools 27 | - pigz 28 | - wget 29 | 30 | test: 31 | imports: 32 | - grabseqslib 33 | commands: 34 | - grabseqs --help 35 | 36 | about: 37 | home: "https://github.com/louiejtaylor/grabseqs" 38 | license: MIT 39 | license_family: MIT 40 | summary: "Easily download reads from next-gen sequencing repositories like NCBI SRA" 41 | 42 | extra: 43 | recipe-maintainers: 44 | - louiejtaylor 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Louis J Taylor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/test_mgrast.bash: -------------------------------------------------------------------------------- 1 | # test sample listing, metadata download 2 | function test_mgrast_listing { 3 | if [ `grabseqs mgrast -l mgp85479 | wc -l` -ne 4 ]; then 4 | exit 1 5 | fi 6 | } 7 | 8 | # test metadata 9 | function test_mgrast_metadata { 10 | grabseqs mgrast -o $TMPDIR/test_md_mg -m META.csv -l mgp85479 11 | if [ `cat $TMPDIR/test_md_mg/META.csv | wc -l` -ne 5 ] ; then 12 | exit 1 13 | fi 14 | } 15 | 16 | # download a tiny sample, .fastq-formatted 17 | function test_mgrast_fastq { 18 | grabseqs mgrast -o $TMPDIR/test_tiny_mg mgm4793571.3 19 | ls $TMPDIR/test_tiny_mg/mgm4793571.3.fastq.gz 20 | } 21 | 22 | ## download a tiny sample, .fasta-formatted 23 | function test_mgrast_fasta { 24 | grabseqs mgrast -o $TMPDIR/test_tiny_mg_fasta mgm4440055.3 25 | ls $TMPDIR/test_tiny_mg_fasta/mgm4440055.3.fastq.gz 26 | } 27 | 28 | ## test no clobber 29 | function test_mgrast_fastq_noclobber { 30 | u=`grabseqs mgrast -o $TMPDIR/test_tiny_mg mgm4793571.3` 31 | echo $u 32 | if [[ $u != *"Pass -f to force download"* ]] ; then 33 | exit 1 34 | fi 35 | } 36 | 37 | ## test force 38 | function test_mgrast_fastq_force_download { 39 | u=`grabseqs mgrast -o $TMPDIR/test_tiny_mg -f mgm4793571.3` 40 | echo $u 41 | if [[ $u == *"Pass -f to force download"* ]] ; then 42 | exit 1 43 | fi 44 | ls $TMPDIR/test_tiny_mg/mgm4793571.3.fastq.gz 45 | } 46 | 47 | # test case for invalid/empty accessions--should raise error (#51) 48 | function test_mgrast_invalid_acc { 49 | if grabseqs mgrast -l mgp0fake; then 50 | exit 1 51 | fi 52 | } 53 | -------------------------------------------------------------------------------- /tests/test_general.bash: -------------------------------------------------------------------------------- 1 | # test to see whether install succeeded 2 | function test_grabseqs_installed { 3 | grabseqs -v 4 | 
grabseqs -h 5 | } 6 | 7 | # test missing sra-tools 8 | #function test_grabseqs_no_sratools { 9 | # conda remove sra-tools -qy 10 | # if grabseqs sra -o $TMPDIR/test_no_sra-tools ERR2279063; then 11 | # exit 1 12 | # fi 13 | # conda install "sra-tools>3.2" -c bioconda -qy 14 | #} 15 | 16 | # test missing pigz 17 | function test_grabseqs_no_pigz { 18 | if which pigz; then 19 | echo "pigz installed outside of conda, cannot test whether it is missing" 20 | else 21 | conda remove pigz -qy 22 | u=`grabseqs mgrast -o $TMPDIR/test_nopigz mgm4633450.3` 23 | echo $u 24 | if [[ $u != *"pigz not found, using gzip"* ]] ; then 25 | exit 1 26 | fi 27 | conda install -c anaconda pigz -qy 28 | fi 29 | } 30 | 31 | # test conda install 32 | function test_grabseqs_conda_install { 33 | conda deactivate 34 | conda create -n grabseqs-unittest-conda -qy 35 | conda activate grabseqs-unittest-conda 36 | conda install -c louiejtaylor -c bioconda -c conda-forge -qy grabseqs 37 | conda deactivate 38 | conda env remove -yqn grabseqs-unittest-conda 39 | conda activate grabseqs-unittest 40 | } 41 | 42 | # test install with python3.7 (issue #38) 43 | function test_grabseqs_conda_newer_python { 44 | conda deactivate 45 | conda create -n grabseqs-unittest-py37 -qy 46 | conda activate grabseqs-unittest-py37 47 | conda install python=3.7 -qy 48 | conda install -c louiejtaylor -c bioconda -c conda-forge grabseqs -qy 49 | conda deactivate 50 | conda env remove -yqn grabseqs-unittest-py37 51 | conda activate grabseqs-unittest 52 | } 53 | -------------------------------------------------------------------------------- /tests/test_imicrobe.bash: -------------------------------------------------------------------------------- 1 | # test sample listing and metadata download 2 | function test_imicrobe_listing_project { 3 | if [ `grabseqs imicrobe -l p1 | wc -l` -ne 2 ]; then 4 | exit 1 5 | fi 6 | } 7 | 8 | # test metadata download 9 | function test_imicrobe_metadata_download { 10 | grabseqs imicrobe -o 
$TMPDIR/test_md_im -m META.csv -l p1 11 | if [ `cat $TMPDIR/test_md_im/META.csv | wc -l` -ne 3 ] ; then 12 | exit 1 13 | fi 14 | } 15 | 16 | # paired sample listing 17 | function test_imicrobe_listing_paired_sample { 18 | ps=`grabseqs imicrobe -l s6398` 19 | echo $ps 20 | if [ "$ps" != "s6398_1.fastq.gz,s6398_2.fastq.gz" ]; then 21 | exit 1 22 | fi 23 | } 24 | 25 | # download a tiny sample, .fasta-formatted 26 | function test_imicrobe_fasta { 27 | grabseqs imicrobe -o $TMPDIR/test_tiny_im s710 28 | ls $TMPDIR/test_tiny_im/s710.fastq.gz 29 | } 30 | 31 | # download a tiny sample, .fastq-formatted paired 32 | function test_imicrobe_fastq_paired { 33 | grabseqs imicrobe -o $TMPDIR/test_tiny_im s6399 34 | ls $TMPDIR/test_tiny_im/s6399_1.fastq.gz 35 | ls $TMPDIR/test_tiny_im/s6399_2.fastq.gz 36 | echo -e "$PASS iMicrobe fastq-formatted sample download test passed" 37 | } 38 | 39 | ## test no clobber 40 | function test_imicrobe_no_clobber { 41 | t=`grabseqs imicrobe -t 2 -o $TMPDIR/test_tiny_im s710` 42 | echo $t 43 | if [[ $t != *"Pass -f to force download"* ]] ; then 44 | exit 1 45 | fi 46 | } 47 | 48 | ## test force 49 | function test_imicrobe_fasta_force { 50 | tf=`grabseqs imicrobe -t 2 -o $TMPDIR/test_tiny_im -f s710` 51 | echo $tf 52 | if [[ $tf == *"Pass -f to force download"* ]] ; then 53 | exit 1 54 | fi 55 | ls $TMPDIR/test_tiny_im/s710.fastq.gz 56 | } 57 | 58 | # test case for invalid/empty accessions--should raise error (#51) 59 | function test_imicrobe_invalid_acc { 60 | if grabseqs mgrast -l p4fake; then 61 | exit 1 62 | fi 63 | } 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # sequence and cluster 2 | *.fastq 3 | *.fastq.gz 4 | *.fq 5 | *.fq.gz 6 | *.fasta 7 | *.fa 8 | *.out 9 | *.err 10 | test/ 11 | bin/test/ 12 | clean_build.sh 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | 
*$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | -------------------------------------------------------------------------------- /grabseqslib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["utils","sra","mgrast"] 2 | 3 | import os, sys, argparse, warnings, shutil 4 | import pandas as pd 5 | 6 | from pathlib import Path 7 | from 
def main():
    """
    Command-line entry point for grabseqs.

    Parses arguments for the selected repository subcommand (SRA or
    MG-RAST), creates the output directory if needed, picks a
    compression tool (pigz if available, else gzip), dispatches the
    download to the matching `process_*` controller, and writes or
    appends the aggregated metadata CSV if `-m` was given.
    """
    # Set up parsers
    parser = argparse.ArgumentParser(prog="grabseqs",
                 description='Download metagenomic sequences from public datasets.')
    parser.add_argument('--version', '-v', action='version', version='%(prog)s 1.0.0')
    subpa = parser.add_subparsers(help='repositories available')

    add_sra_subparser(subpa)
    add_mgrast_subparser(subpa)

    args = parser.parse_args()

    # Make output directories if they don't exist
    try:
        if args.outdir != "":
            if not os.path.exists(args.outdir):
                os.makedirs(args.outdir)
    except AttributeError:
        # No subcommand provided (all subcommands have `-o`)
        print("Subcommand not specified, run `grabseqs -h` or `grabseqs {repository} -h` for help")
        sys.exit(0)

    # Figure out which subparser was called: only the MG-RAST subparser
    # defines `rastid`, so an AttributeError means the SRA subcommand.
    repo = "SRA"  # default guards against `repo` being unbound below
    try:
        if args.rastid:
            repo = "MG-RAST"
    except AttributeError:
        repo = "SRA"

    # Check deps: prefer pigz (parallel) over gzip for compression
    zip_func = "gzip"
    if shutil.which("pigz"):
        zip_func = "pigz"
    else:
        print("pigz not found, using gzip")

    metadata_agg = None

    # Download samples
    if repo == "SRA":
        metadata_agg = process_sra(args, zip_func)
    elif repo == "MG-RAST":
        metadata_agg = process_mgrast(args, zip_func)

    # Handle metadata
    if args.metadata != "":
        if metadata_agg is None:
            # Nothing was aggregated (e.g. nothing downloaded); previously
            # this path crashed with `AttributeError: 'NoneType' ... to_csv`.
            print("No metadata collected; nothing written to " + args.metadata)
        else:
            md_path = Path(args.outdir) / Path(args.metadata)
            if not os.path.isfile(md_path):
                metadata_agg.to_csv(md_path, index = False)
                print("Metadata saved to new file: " + str(md_path))
            else:
                metadata_i = pd.read_csv(md_path)
                metadata_f = pd.concat([metadata_i, metadata_agg], sort=True)
                metadata_f.to_csv(md_path, index = False)
                print("Metadata appended to existing file: " + str(md_path))
-------------------------------------------------------------------------------- /tests/test_sra.bash: -------------------------------------------------------------------------------- 1 | # test sample listing, metadata download 2 | function test_sra_listing { 3 | if [ `grabseqs sra -l SRP057027 | wc -l` -ne 369 ]; then 4 | exit 1 5 | fi 6 | } 7 | 8 | # test metadata download 9 | function test_sra_metadata_downloaded { 10 | grabseqs sra -m SRP057027.tsv -l -o $TMPDIR/test_metadata/ SRP057027 11 | if [ `cat $TMPDIR/test_metadata/SRP057027.tsv | wc -l` -ne 370 ] ; then 12 | exit 1 13 | fi 14 | } 15 | 16 | # test behavior with -l and --no_parsing 17 | function test_sra_no_parsing_flag { 18 | if [ `grabseqs sra -l --no_parsing SRR1804203 | wc -l` -ne 1 ]; then 19 | exit 1 20 | fi 21 | } 22 | 23 | # unpaired fasterq-dump 24 | function test_sra_unpaired { 25 | grabseqs sra -t 2 -o $TMPDIR/test_tiny_sra ERR2279063 26 | ls $TMPDIR/test_tiny_sra/ERR2279063.fastq.gz 27 | } 28 | 29 | # paired fasterq-dump 30 | function test_sra_paired { 31 | grabseqs sra -t 2 -o $TMPDIR/test_tiny_sra_paired SRR1913936 32 | ls $TMPDIR/test_tiny_sra_paired/SRR1913936_1.fastq.gz 33 | ls $TMPDIR/test_tiny_sra_paired/SRR1913936_2.fastq.gz 34 | } 35 | 36 | # unpaired fastq-dump 37 | function test_sra_unpaired_fastqdump { 38 | grabseqs sra -t 2 -o $TMPDIR/test_fastqdump_sra --use_fastq_dump ERR2279063 39 | ls $TMPDIR/test_fastqdump_sra/ERR2279063.fastq.gz 40 | } 41 | 42 | # paired fasterq-dump 43 | function test_sra_paired_fastqdump { 44 | grabseqs sra -t 2 -o $TMPDIR/test_fastqdump_sra_paired --use_fastq_dump SRR1913936 45 | ls $TMPDIR/test_fastqdump_sra_paired/SRR1913936_1.fastq.gz 46 | ls $TMPDIR/test_fastqdump_sra_paired/SRR1913936_2.fastq.gz 47 | } 48 | 49 | # test no clobber 50 | function test_sra_no_clobber { 51 | t=`grabseqs sra -t 2 -o $TMPDIR/test_fastqdump_sra ERR2279063` 52 | echo $t 53 | if [[ $t != *"Pass -f to force download"* ]] ; then 54 | exit 1 55 | fi 56 | } 57 | 58 | # test 
force 59 | function test_sra_forced { 60 | tf=`grabseqs sra -r 0 -t 2 -o $TMPDIR/test_fastqdump_sra -f ERR2279063` 61 | echo $tf 62 | if [[ $tf == *"Pass -f to force download"* ]] ; then 63 | exit 1 64 | fi 65 | } 66 | 67 | # test custom args to fasterq-dump (#44) 68 | function test_sra_custom_fasterqdump_args { 69 | grabseqs sra SRR1913936 -r 0 -o $TMPDIR/test_fasterqdump_custom --custom_fqdump_args='--split-spot' 70 | # this is a paired run, but with `--split-spot` instead of `--split-3` it should come down as a single interleaved fastq.gz 71 | ls $TMPDIR/test_fasterqdump_custom/SRR1913936.fastq.gz 72 | } 73 | 74 | # test custom args to fastq-dump (#44) 75 | function test_sra_custom_fastqdump_args { 76 | grabseqs sra SRR1913936 -r 0 --use_fastq_dump -o $TMPDIR/test_fastqdump_custom --custom_fqdump_args='--gzip --skip-technical' 77 | # this is a paired run, but without the `--split-3` arg it should come down as a single interleaved fastq.gz 78 | ls $TMPDIR/test_fastqdump_custom/SRR1913936.fastq.gz 79 | } 80 | 81 | # test case for invalid/empty accessions--should raise error (#51) 82 | function test_sra_invalid_acc { 83 | if grabseqs sra -l PRJNAXXXXXXXX; then 84 | exit 1 85 | fi 86 | } 87 | -------------------------------------------------------------------------------- /grabseqslib/utils.py: -------------------------------------------------------------------------------- 1 | import os, glob, gzip, sys 2 | from subprocess import call 3 | 4 | def check_existing(save_loc, acc): 5 | """ 6 | Function to check for single- or paired-end reads 7 | in a given `save_loc` for a particular `acc`ession. 8 | Returns "paired" if paired reads found, "single" if 9 | unpaired reads found, "both" if single- and paired- 10 | end reads found, and False if nothing matching that 11 | accession was found. 
def fetch_file(url, outfile, retries = 0):
    """
    Download a remote file from `url` to `outfile` using wget,
    retrying on failure.

    :param url: source URL to download.
    :param outfile: local path to write the downloaded file to.
    :param retries: number of *additional* attempts after the first
        (0 means try exactly once).
    :returns: return code of the final wget invocation (0 on success).
    """
    wget_cmd = ["wget", "-O", outfile, url]
    # Bug fix: `retries` was previously accepted but never used, so a
    # failed download was never retried. Retry until success or until
    # the allowed attempts are exhausted.
    retcode = call(wget_cmd)
    attempts_left = retries
    while retcode != 0 and attempts_left > 0:
        print("Download failed (wget exit " + str(retcode) + "), retrying: " + url)
        retcode = call(wget_cmd)
        attempts_left -= 1
    return retcode
def fasta_to_fastq(fp_fa, fp_fq, zipped, dummy_char = "I"):
    """
    Convert a FASTA file (at `fp_fa`, gzip-compressed if `zipped`) to a
    FASTQ file written to `fp_fq`, using `dummy_char` as the quality
    score for every base.

    :raises Exception: if `dummy_char` is not exactly one character.
    """
    if len(dummy_char) != 1:
        raise Exception("FASTQ dummy quality char must be only one char.")

    # Bug fix: the original called `gzip(fp_fa)` -- calling the module
    # object itself, a TypeError. Open gzipped input in *text* mode so
    # lines compare equal to str (not bytes) below.
    if zipped:
        f = gzip.open(fp_fa, 'rt')
    else:
        f = open(fp_fa)

    # `with` guarantees both handles are closed even if parsing fails.
    with f, open(fp_fq, 'w') as fq:
        seq = None  # None until the first '>' header is seen
        for line in f:
            if line[0] == '>':
                if seq is not None:
                    # Flush the previous record before starting a new one.
                    fq.write(seq + '\n')
                    fq.write('+\n')
                    fq.write(dummy_char * len(seq) + '\n')
                # line[1:] keeps the trailing newline from the header line
                fq.write('@' + line[1:])
                seq = ''
            else:
                seq += line.strip()

        # Flush the final record. The sentinel is None (not -1) so an
        # empty input file no longer raises TypeError on len(-1).
        if seq:
            fq.write(seq + '\n')
            fq.write('+\n')
            fq.write(dummy_char * len(seq) + '\n')
51 | echo "Unknown option - '$OPTARG'" 52 | exit 1 53 | ;; 54 | esac 55 | done 56 | 57 | TMPDIR=$TMPLOC/grabseqs_unittest 58 | mkdir -p $TMPDIR 59 | fs=`ls $TMPDIR | wc -l` 60 | 61 | if [ $fs -ne 0 ] ; then 62 | echo "Directory $TMPDIR not empty. Clean it or specify a testing location with -d [loc]" 63 | exit 1 64 | fi 65 | 66 | GREEN="\x1B[32m" 67 | RESET="\x1B[0m" 68 | PASS="${GREEN}\u2714${RESET}" 69 | FAIL="${RED}X${RESET}" 70 | 71 | # environment and package install 72 | 73 | function setup { 74 | 75 | CONDA_BASE=$(conda info --base) # see https://github.com/conda/conda/issues/7980 76 | source $CONDA_BASE/etc/profile.d/conda.sh # allows conda [de]activate in scripts 77 | verbose "Setting up conda environment..." 78 | conda env update --name=grabseqs-unittest --file environment.yml 79 | conda activate grabseqs-unittest 80 | # required for installing libs 81 | pip install setuptools 82 | verbose "Installing grabseqs library" 83 | python setup.py install 84 | 85 | # Fix CircleCI testing issue for iMicrobe 86 | if [ `echo $HOME | grep "/home/circleci" | wc -l` -eq 1 ]; then 87 | echo "Tests running on CircleCI, adding add'l dependency" 88 | pip uninstall -y "urllib3" 89 | pip install "urllib3" 90 | fi 91 | } 92 | 93 | # functions copied and adapted from sunbeam-labs/sunbeam 94 | 95 | function msg { 96 | echo -ne "${1}" 97 | } 98 | 99 | function verbose { 100 | if [ "$VERBOSE" = true ]; then 101 | echo -ne "${1}" 102 | fi 103 | } 104 | 105 | function broke { 106 | local RETCODE=$? 107 | msg "\nFailed command error output:\n`cat ${2}.err`\n" 108 | msg "${FAIL} (log: ${LOGFILE}.[out/err])\n" 109 | cleanup 1 110 | } 111 | 112 | function capture_output { 113 | msg "Running ${1}... " 114 | 115 | LOGFILE="${TMPDIR}/${1}" 116 | 117 | set -o pipefail 118 | if [ "$VERBOSE" = true ]; then 119 | OUTPUT_STRING="> >(tee ${LOGFILE}.out) 2> >(tee ${LOGFILE}.err >&2)" 120 | else 121 | OUTPUT_STRING="> ${LOGFILE}.out 2> ${LOGFILE}.err" 122 | fi 123 | trap "broke ${1} ${LOGFILE} $?" 
exit 124 | eval "${1} ${OUTPUT_STRING}" 125 | set +o pipefail 126 | trap "cleanup $?" exit 127 | msg "${PASS}\n" 128 | } 129 | 130 | function cleanup { 131 | local TMPRC=$? 132 | local RETCODE=$TMPRC 133 | if [ ${1} -gt ${TMPRC} ]; then 134 | RETCODE=${1} 135 | else 136 | RETCODE=${TMPRC} 137 | fi 138 | cd $STARTING_DIR 139 | if [ $RETCODE -ne 0 ]; then 140 | msg "${RED}-- TESTS FAILED --${RESET}\n" 141 | else 142 | msg "${GREEN}-- TESTS SUCCEEDED --${RESET}\n" 143 | fi 144 | conda deactivate 145 | 146 | verbose "Deleting temporary conda environment \n" 147 | conda env remove -yqn grabseqs-unittest 148 | 149 | # Remove temp directory if created by us 150 | if [ "$USER_TMPDIR" = false ]; then 151 | verbose "Deleting temporary directory ${TMPDIR}\n" 152 | rm -rf $TMPDIR 153 | fi 154 | 155 | # Exit, maintaining previous return code 156 | exit $RETCODE 157 | 158 | echo -e "$PASS all tests passed!" 159 | } 160 | 161 | function run_test_suite { 162 | for testcase in $(declare -f | grep -o "^test[a-zA-Z_]*") ; do 163 | capture_output ${testcase} 164 | done 165 | } 166 | 167 | trap cleanup exit 168 | 169 | capture_output setup 170 | 171 | # read in tests 172 | source tests/test_general.bash 173 | 174 | if [ "$SKIP_IMICROBE" = false ]; then 175 | echo "iMicrobe temporarily removed" 176 | # source tests/test_imicrobe.bash 177 | # temp until imicrobe issues resolved 178 | fi 179 | 180 | if [ "$SKIP_SRA" = false ]; then 181 | source tests/test_sra.bash 182 | fi 183 | 184 | if [ "$SKIP_MGRAST" = false ]; then 185 | source tests/test_mgrast.bash 186 | fi 187 | 188 | 189 | # Run single test, if specified, or all found tests otherwise 190 | if [ ! 
def process_newrepo(args, zip_func = "gzip"):
    """
    Controller function for newrepo data/metadata downloading.

    :param args: parsed argparse namespace from the newrepo subparser.
    :param zip_func: compression tool to use ("gzip" or "pigz"); added
        with a default so the signature matches how __init__.py calls
        the other `process_*` controllers (backward compatible).
    :returns: aggregated pandas metadata DataFrame, or None.
    """

    # check for any necessary dependencies here

    metadata_agg = None

    # loop through passed identifiers
    for newrepo_identifier in args.newrepoid:
        sample_list, metadata_agg = map_newrepo_project_acc(newrepo_identifier, metadata_agg)
        for sample in sample_list:
            # Bug fix: the loop variable is `sample`; the original passed
            # an undefined name `acc` (NameError on the first iteration).
            download_newrepo_sample(sample,
                                    args.retries,
                                    args.threads,
                                    args.outdir,
                                    args.force,
                                    args.list,
                                    zip_func)

    # Metadata may be available from different locations for different repos.
    # In this example, the metadata and project -> accession mapping are done
    # in the same step (as this is the most common configuration we've encountered).
    return metadata_agg
For an example of this, see the sra.py module 77 | 78 | 79 | # This is example code from sra.py showing how one might append all metadata from 80 | # one run into the same dataframe 81 | if type(metadata_agg) == type(None): 82 | metadata_agg = pd.read_csv(StringIO(metadata.text)) 83 | else: 84 | metadata_agg = metadata_agg.append(pd.read_csv(StringIO(metadata.text)),sort=True) 85 | 86 | return sample_list, metadata 87 | 88 | def download_newrepo_sample(acc, retries = 0, threads = 1, loc='', force=False, list_only=False, zip_func = "gzip"): 89 | """ 90 | Helper function to download sequences given an newrepo `acc`ession, 91 | with support for a particular number of `retries`. Can use multiple 92 | `threads` with pigz (if data are not already compressed on arrival). 93 | """ 94 | 95 | # LISTING OPTION 2: If the information about whether samples are paired or 96 | # unpaired is only available from a sample-specific page, it usually makes more 97 | # sense to look that up here, and then just skip the downloading part. For an 98 | # example of this, see the mg-rast.py module 99 | 100 | # Make sure to check that the sample isn't already downloaded 101 | if not force: 102 | found = check_existing(loc, acc) 103 | if found != False: 104 | print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download") 105 | return False 106 | 107 | # Need to know this 108 | paired = True 109 | 110 | # Generally, unless there's a tool like fasterq-dump that downloads both reads, 111 | # it's just easier to iterate through file paths (i.e. either one unpaired, or 112 | # two paired). 
113 | seq_urls = [] 114 | file_paths = build_paths(acc, loc, paired) #see utils.py for details 115 | 116 | for i in range(len(seq_urls)): 117 | print("Downloading accession "+acc+" from newrepo repository") 118 | # fetch_file should work for most things where a URL is available 119 | retcode = fetch_file(seq_urls[i],file_paths[i],retries) 120 | 121 | # There are a number of things you may want to do here: check and handle 122 | # downloaded file integrity, convert to .fastq (see mgrast.py for an example 123 | # of a scenario dealing with .fastx in general), retries, etc. 124 | 125 | print("Compressing .fastq") 126 | gzip_files(file_paths, zip_func, threads) 127 | 128 | return True 129 | -------------------------------------------------------------------------------- /faq/faq.md: -------------------------------------------------------------------------------- 1 | # grabseqs FAQ 2 | 3 | This page provides a few, hopefully helpful, hints that go above and beyond the [README](https://github.com/louiejtaylor/grabseqs/blob/master/README.md). 4 | 5 | Sections: [General](#general-faqs) | [SRA](#sra-faqs) | [MG-RAST](#mg-rast-faqs) | [iMicrobe](#imicrobe-faqs) 6 | 7 | ## General FAQs 8 | 9 | - **What if I have a file containing a list of many accessions I want to download, and don't want to type them all on the command-line?** 10 | 11 | Let's say you have a newline-separated list of SRA accession numbers in a file called `acc.txt`. You can pass those through to grabseqs like so: 12 | 13 | grabseqs sra $(cat acc.txt) 14 | 15 | - **I can't install on Python version 3.X through conda** 16 | 17 | Installation and release through conda explicitly uses Python versions 3.6 and 3.7 (as of Jan 2020). 
If you're using a different version of Python 3, try installing the grabseqs [requirements](https://github.com/louiejtaylor/grabseqs/blob/master/environment.yml) only via conda, then installing the grabseqs package through pip (`pip install grabseqs`) since the PyPI package is not built for a specific Python 3 minor version. 18 | 19 | - **The reads aren't downloading, why?** 20 | 21 | Many possibilites here, and often they are respository-dependent. General tips: 22 | 23 | 1) Make sure your internet connection is good (and that you have permission to download files on whatever system you're using). 24 | 25 | 2) Try again later. Sporadic connection problems are common. 26 | 27 | 3) Make sure the reads are available/have been released to the public. Grabseqs only accesses publically available reads, so if you can't download from the SRA/iMicrobe/MG-RAST website without logging in, neither can grabseqs. 28 | 29 | - **What format are reads?** 30 | 31 | Reads are saved as gzipped FASTQ files (extension `.fastq.gz`). If the repository data is in .fasta format, dummy quality scores are added. 32 | 33 | - **How is the metadata formatted?** 34 | 35 | Metadata is downloaded and stored in .csv format (to the filename specified by the `-m` flag, e.g. `-m metadata.csv`). Metadata is appended if the filename already exists (assuming the file specified is in proper .csv format). Column names in the repository metadata are maintained. We do not recommend combining metadata from different repositories--while this will work without error, we do not parse/rename columns from each repository. 36 | 37 | - **Can you/can I add X repository?** 38 | 39 | Since downloading from repositories is modular, adding new repositories is hopefully simple. 
We provide a [template](https://github.com/louiejtaylor/grabseqs/blob/master/template.py) for adding new repositories--each new repo is essentially handled as a separate argparse subparser 40 | 41 | - **Who/what do I cite?** 42 | 43 | We would appreciate you referencing our GitHub page if you find this tool useful. More importantly, be fair to and appreciative of the researchers who generate the data, and the organizations who make the data available to the public. At minimum, citation of the paper/dataset(/repository) is appropriate. It is your responsibility to abide by the guidelines of the groups/repositories who make data available. Open data faciliates collaboration--if you're not sure, ask! 44 | 45 | [Top](#grabseqs-faq) 46 | 47 | ## SRA FAQs 48 | 49 | - **What default arguments do you use for calling fasterq/fastq-dump?** 50 | 51 | We use arguments that remove technical reads, return gzipped fastq files (when available), and split paired reads into separate files (with a third file for reads without a mate). Specifically, the commands look like: 52 | 53 | fasterq-dump -e {thread_num} -f -3 SRR######### 54 | # or 55 | fastq-dump --gzip --split-3 --skip-technical SRR######### 56 | # both of these can have "-O /path/to/outdir/" optionally 57 | # appended before the accession if specified by the user 58 | 59 | For the current version of the code, see the `run_fasterq_dump` function within the [sra.py module](https://github.com/louiejtaylor/grabseqs/blob/master/grabseqslib/sra.py). 
60 | 61 | If you'd like to pass your own arguments to either of these functions, use the `--custom_fqdump_args` parameter like so: 62 | 63 | grabseqs sra SRP####### -r 0 --custom_fqdump_args="--split-spot --include-technical --progress" 64 | 65 | - **Why am I running out of space?** 66 | 67 | If you're going to be using SRA data, after you've installed sra-tools, run `vdb-config -i` and turn off local file caching unless you want extra copies of the downloaded sequences taking up space ([read more here](https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration)). 68 | 69 | - **My reads are not paired properly.** 70 | 71 | Sometimes fasterq-dump filters out reads but won't also filter the mate, and I haven't figured out why, or how to circumvent it. Try adding the `--use_fastq_dump` flag--it seems that fastq-dump handles this situation better. 72 | 73 | [Top](#grabseqs-faq) 74 | 75 | ## MG-RAST FAQs 76 | 77 | - **My accession can't be found by grabseqs.** 78 | 79 | Many of the projects in MG-RAST are not publically accessible. If you're having trouble with a particular accession number or project, go to the [MG-RAST website](http://www.mg-rast.org/) and make sure you can download it by hand first. **Please do this first, as a number of samples which were previously available for download are now unavailable.** If this works fine, please [open an issue](https://github.com/louiejtaylor/grabseqs/issues) and we'll check it out! 80 | 81 | [Top](#grabseqs-faq) 82 | 83 | ## iMicrobe FAQs 84 | 85 | 2025-08-14: currently non-functional 86 | 87 | - **My iMicrobe download isn't working.** 88 | 89 | As with MG-RAST, many of the iMicrobe projects are not available to the public. If you're having download troubles, see whether you can download a test sample manually from [the iMicrobe website](https://www.imicrobe.us/). iMicrobe seems to have reads stored a lot of different ways. 
For iMicrobe, we handle reads that are either in .fastq or .fasta (adding dummy scores in .fastq conversion), either .gzipped or not, and either paired or unpaired. If you come across reads that are in another format, or you can download from the website but not through grabseqs, please [open an issue](https://github.com/louiejtaylor/grabseqs/issues) and we'll take a look. 90 | 91 | [Top](#grabseqs-faq) 92 | -------------------------------------------------------------------------------- /grabseqslib/mgrast.py: -------------------------------------------------------------------------------- 1 | import requests, os, json 2 | import pandas as pd 3 | from io import StringIO 4 | from subprocess import call 5 | from grabseqslib.utils import check_existing, fetch_file, check_filetype, fasta_to_fastq, gzip_files 6 | 7 | def add_mgrast_subparser(subparser): 8 | """ 9 | Function to add the MG-RAST subparser. 10 | """ 11 | 12 | parser_rast = subparser.add_parser('mgrast', help="download from MG-RAST") 13 | parser_rast.add_argument('rastid', type=str, nargs='+', 14 | help="One or more MG-RAST project or sample identifiers (mgp####/mgm######)") 15 | 16 | parser_rast.add_argument('-m', dest="metadata", type=str, default="", 17 | help="filename in which to save metadata (.csv format, relative to OUTDIR)") 18 | parser_rast.add_argument('-o', dest="outdir", type=str, default="", 19 | help="directory in which to save output. 
created if it doesn't exist") 20 | parser_rast.add_argument('-r',dest="retries", type=int, default=0, 21 | help="number of times to retry download") 22 | parser_rast.add_argument('-t',dest="threads", type=int, default=1, 23 | help="threads to use (for pigz)") 24 | 25 | parser_rast.add_argument('-f', dest="force", action="store_true", 26 | help = "force re-download of files") 27 | parser_rast.add_argument('-l', dest="list", action="store_true", 28 | help="list (but do not download) samples to be grabbed") 29 | 30 | def process_mgrast(args, zip_func): 31 | """ 32 | Top-level function to process MG-RAST download. Returns aggregated metadata. 33 | """ 34 | metadata_agg = None 35 | for rast_proj in args.rastid: 36 | # get targets 37 | target_list = get_mgrast_acc_metadata(rast_proj) 38 | for target in target_list: 39 | # get samples and/or metadata 40 | metadata_agg = download_mgrast_sample(target, 41 | args.retries, 42 | args.threads, 43 | args.outdir, 44 | args.force, 45 | args.list, 46 | not (args.metadata == ""), 47 | metadata_agg, zip_func) 48 | return metadata_agg 49 | 50 | def get_mgrast_acc_metadata(pacc): 51 | """ 52 | Function to get list of MG-RAST sample accession numbers from a particular 53 | project. Takes project accession number `pacc` and returns a list of mgm 54 | accession numbers. 55 | """ 56 | if pacc[:3] == "mgm": 57 | return [pacc] 58 | elif pacc[:3] != "mgp": 59 | raise NameError("Unknown prefix: " + pacc[:3] + ". 
Should be 'mgm' or 'mgp'.") 60 | metadata_json = json.loads(requests.get("http://api.metagenomics.anl.gov/metadata/export/"+pacc).text) 61 | sample_list = [] 62 | for sample in metadata_json["samples"]: 63 | sample_list.append(sample["libraries"][0]["data"]["metagenome_id"]["value"]) #metadata: ["data"] 64 | return sample_list 65 | 66 | def download_mgrast_sample(acc, retries = 0, threads = 1, loc='', force=False, list_only=False, download_metadata=False, metadata_agg = None, zip_func = "gzip"): 67 | """ 68 | Helper function to download original (uploaded) MG-RAST `acc`ession, 69 | with support for a particular number of `retries`. Can use multiple 70 | `threads` with pigz (if data are not already compressed on arrival). 71 | Also will optionally `download_metadata`. 72 | """ 73 | read_stages = ["050.1", "050.2"] # R1 and R2 (if paired) 74 | 75 | stage_json = json.loads(requests.get("http://api.metagenomics.anl.gov/download/"+acc).text) 76 | stages_to_grab = [] 77 | for stage in stage_json["data"]: 78 | if stage["file_id"] in read_stages: 79 | stages_to_grab.append(stage["file_id"]) 80 | stages_to_grab = sorted(stages_to_grab) # sort because json 81 | 82 | if len(stages_to_grab) == 0: 83 | raise Exception("No reads found for accession: "+acc) 84 | else: 85 | if len(stages_to_grab) == 1: 86 | fext = [""] # unpaired, no ext 87 | else: 88 | fext = ["_"+str(i+1) for i in range(len(stages_to_grab))] # paired 89 | if download_metadata: 90 | metadata_json = json.loads(requests.get("http://api.metagenomics.anl.gov/metadata/export/"+acc).text) 91 | sample_info = metadata_json["mixs"] 92 | colnames = ["mgm_id"]+list(sorted(list(sample_info.keys()))) 93 | colvals = [acc]+[str(sample_info[x]) for x in colnames[1:]] 94 | formatted_table = ','.join(colnames)+'\n'+','.join(colvals) 95 | if type(metadata_agg) == type(None): 96 | metadata_agg = pd.read_csv(StringIO(formatted_table)) 97 | else: 98 | metadata_agg = 
pd.concat([metadata_agg,pd.read_csv(StringIO(formatted_table))],sort=True) 99 | if list_only: 100 | print(','.join([acc+ext+".fastq.gz" for ext in fext])) 101 | else: 102 | if not force: 103 | found = check_existing(loc, acc) 104 | if found != False: 105 | print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download") 106 | return metadata_agg 107 | 108 | fa_paths = [os.path.join(loc,acc+ext+".fasta") for ext in fext] 109 | fq_paths = [os.path.join(loc,acc+ext+".fastq") for ext in fext] 110 | 111 | for i in range(len(fa_paths)): 112 | fa_path = fa_paths[i] 113 | fq_path = fq_paths[i] 114 | file_url = "http://api.metagenomics.anl.gov/download/"+acc+"?file="+stages_to_grab[i] 115 | retcode = fetch_file(file_url,fa_path,retries) 116 | ftype = check_filetype(fa_path) 117 | gzipped = ftype.endswith('.gz') 118 | if ftype.startswith("fasta"): 119 | print("Converting .fasta to .fastq (adding dummy quality scores), compressing") 120 | fasta_to_fastq(fa_path, fq_path, gzipped) 121 | retcode = call(["rm "+fa_path], shell=True) # get rid of old fasta 122 | rzip = gzip_files(fq_path, zip_func, threads) 123 | #rzip = call(["pigz -f -p "+ str(threads) + ' ' + fq_path], shell=True) 124 | elif ftype.startswith("fastq"): 125 | if gzipped: 126 | print("downloaded file in .fastq.gz format already!") 127 | call(["mv", fa_path, fq_path+".gz"]) 128 | else: 129 | print("downloaded file in .fastq format already, compressing .fastq") 130 | call(["mv", fa_path, fq_path]) 131 | rzip = gzip_files(fq_path, zip_func, threads) 132 | else: 133 | print("requested sample "+acc+" does not appear to be in .fasta or .fastq format. 
This may be because it is not publically accessible from MG-RAST.") 134 | return metadata_agg 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # grabseqs 2 | 3 | Utility for simplifying bulk downloading data from next-generation sequencing repositories, like [NCBI SRA](https://www.ncbi.nlm.nih.gov/sra/), [MG-RAST](http://www.mg-rast.org/). 4 | 5 | [![CircleCI](https://circleci.com/gh/louiejtaylor/grabseqs.svg?style=shield)](https://circleci.com/gh/louiejtaylor/grabseqs) [![Conda version](https://anaconda.org/louiejtaylor/grabseqs/badges/version.svg)](https://anaconda.org/louiejtaylor/grabseqs) [![Conda downloads](https://anaconda.org/louiejtaylor/grabseqs/badges/downloads.svg)](https://anaconda.org/louiejtaylor/grabseqs/files) [![Paper link](https://img.shields.io/badge/Published%20in-Bioinformatics-126888.svg)](https://doi.org/10.1093/bioinformatics/btaa167) 6 | 7 | [iMicrobe](https://www.imicrobe.us/) is currently not supported--working to remedy this (2025/08/14) 8 | 9 | ## Install 10 | 11 | Install grabseqs and all dependencies [via conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html): 12 | 13 | conda install grabseqs -c louiejtaylor -c bioconda -c conda-forge 14 | 15 | Or with pip (and install the non-Python [dependencies](https://github.com/louiejtaylor/grabseqs#dependencies) yourself): 16 | 17 | pip install grabseqs 18 | 19 | **Note:** If you're using SRA data, after you've installed sra-tools, run `vdb-config -i` and turn off local file caching unless you want extra copies of the downloaded sequences taking up space ([read more here](https://github.com/ncbi/sra-tools/wiki/Toolkit-Configuration)). 
20 | 21 | ## Quick start 22 | 23 | Download all samples from a single SRA Project: 24 | 25 | grabseqs sra SRP####### 26 | 27 | Or any combination of projects (S/ERP), runs (S/ERR), BioProjects (PRJNA): 28 | 29 | grabseqs sra SRR######## ERP####### PRJNA######## ERR######## 30 | 31 | If you'd like to do a dry run and just get a list of samples that will be downloaded, pass `-l`: 32 | 33 | grabseqs sra -l SRP######## 34 | 35 | Similar syntax works for MG-RAST: 36 | 37 | grabseqs mgrast mgp##### mgm####### 38 | 39 | ## Detailed usage 40 | 41 | See the [grabseqs FAQ](https://github.com/louiejtaylor/grabseqs/blob/master/faq/faq.md) for detailed troubleshooting tips. 42 | 43 | Fun options: 44 | 45 | grabseqs sra -t 10 -m metadata.csv -o proj/ -r 3 SRP####### 46 | 47 | (translation: use 10 threads, save metadata to `proj/metadata.csv`, download to the dir `proj/`, retry failed downloads 3x, get all samples from SRP#######) 48 | 49 | If you'd like to do a dry run and only get a list of samples that will be downloaded, pass `-l`: 50 | 51 | grabseqs sra -l SRP######## 52 | 53 | If you'd like to pass your own arguments to `fasterq-dump` to get data in a slightly different format, you can do so like this: 54 | 55 | grabseqs sra SRP####### -r 0 --custom_fqdump_args="--split-spot --progress" 56 | 57 | Full usage: 58 | 59 | grabseqs sra [-h] [-m METADATA] [-o OUTDIR] [-r RETRIES] [-t THREADS] 60 | [-f] [-l] [--no_parsing] [--parse_run_ids] 61 | [--use_fastq_dump] 62 | id [id ...] 63 | 64 | positional arguments: 65 | id One or more BioProject, ERR/SRR or ERP/SRP number(s) 66 | 67 | optional arguments: 68 | -h, --help show this help message and exit 69 | -m METADATA filename in which to save SRA metadata (.csv format, 70 | relative to OUTDIR) 71 | -o OUTDIR directory in which to save output. 
created if it doesn't 72 | exist 73 | -r RETRIES number of times to retry download 74 | -t THREADS threads to use (for fasterq-dump/pigz) 75 | -f force re-download of files 76 | -l list (but do not download) samples to be grabbed 77 | --parse_run_ids parse SRR/ERR identifers (do not pass straight to fasterq- 78 | dump) 79 | --custom_fqdump_args CUSTOM_FQD_ARGS 80 | "string" containing args to pass to fastq-dump 81 | --use_fastq_dump use legacy fastq-dump instead of fasterq-dump (no 82 | multithreaded downloading) 83 | 84 | Downloads .fastq.gz files to `OUTDIR` (or the working directory if not specified). If the `-m` flag is passed, saves metadata to `OUTDIR` with filename `METADATA` in csv format. 85 | 86 | Similar options are available for downloading from MG-RAST: 87 | 88 | grabseqs mgrast [-h] [-m METADATA] [-o OUTDIR] [-r RETRIES] 89 | [-t THREADS] [-f] [-l] 90 | rastid [rastid ...] 91 | 92 | ## Troubleshooting 93 | 94 | See the [grabseqs FAQ](https://github.com/louiejtaylor/grabseqs/blob/master/faq/faq.md) for detailed troubleshooting tips. If the FAQs don't fix your problem, feel free to [open an issue](https://github.com/louiejtaylor/grabseqs/issues)! 95 | 96 | ## Dependencies 97 | 98 | - Python 3 (external packages req'd: requests, requests-html, pandas, fake-useragent) 99 | - sra-tools>3.2 100 | - pigz 101 | - wget 102 | 103 | If you use conda (on Linux), these will be installed for you! 104 | 105 | Grabseqs runs on Mac or Linux. 
We've tested on these specific OSes: 106 | 107 | Linux (conda or pip): 108 | - CentOS 6, 7, and 8 109 | - Debian 9 and 10 110 | - Ubuntu 16.04, 18.04, and 19.10 111 | - Red Hat Enterprise 6, 7, and 8 112 | - SUSE Enterprise 12 and 15 113 | 114 | Mac (pip): 115 | - MacOS 10.14 116 | 117 | Grabseqs has been tested and works with the following version of the Python dependencies (though these are neither minimal nor pinned version numbers): 118 | 119 | - requests 2.22.0 120 | - pandas>2 121 | 122 | ## Citation 123 | 124 | If you use grabseqs in your work, please cite: 125 | 126 | Louis J Taylor, Arwa Abbas, Frederic D Bushman. "grabseqs: Simple downloading of reads and metadata from multiple next-generation sequencing data repositories." *Bioinformatics*, (2020), btaa167, https://doi.org/10.1093/bioinformatics/btaa167 127 | 128 | Please also cite the researchers who generated the data (and the repository, if appropriate)! 129 | 130 | ------------ 131 | 132 | ## Changelog 133 | 134 | **1.0.0** (2025-08-14) 135 | - Added a walk-through for adding a new repo using `template.py` 136 | - Better handling for invalid SRA accession numbers 137 | - Update endpoint for NCBI for SRA downloads 138 | - Temporarily remove iMicrobe--needs rewrite to use a different tool 139 | 140 | **0.7.0** (2020-01-29) 141 | - Allow users to pass custom args to fast(er)q-dump 142 | - Minor re-writes of download handling code for easier readability 143 | 144 | **0.6.1** (2019-12-20) 145 | - Validate compressed files (fix #8 and #34) 146 | 147 | **0.6.0** (2019-12-12) 148 | - Gracefully handle incomplete or missing dependencies 149 | - Major rewrite of test suite 150 | 151 | **0.5.2** (2019-12-05) 152 | - Improvements to work with multiple versions of Python 3 153 | 154 | **0.5.1** (2019-11-23) 155 | - Hotfix handling outdated versions of sra-tools 156 | 157 | **0.5.0** (2019-04-11) 158 | - Metadata available for all sources in .csv format 159 | 160 | ## History 161 | 162 | This project spawned out 
of/incorporates code from [hisss](https://github.com/louiejtaylor/hisss); many thanks to [ArwaAbbas](https://github.com/ArwaAbbas) for helping make this work! 163 | -------------------------------------------------------------------------------- /grabseqslib/sra.py: -------------------------------------------------------------------------------- 1 | import requests, time, shutil, sys 2 | import pandas as pd 3 | from io import StringIO 4 | from subprocess import call 5 | from grabseqslib.utils import check_existing, build_paths, gzip_files 6 | 7 | def process_sra(args, zip_func): 8 | """ 9 | High-level logic for SRA download processing. Takes 10 | `args` from grabseqslib argument parser and `zip_func` 11 | """ 12 | # check deps 13 | dep_list = ["fastq-dump", "fasterq-dump"] 14 | deps_have = [shutil.which(dep) for dep in dep_list] 15 | if (not deps_have[0]) and (not deps_have[1]): # no sra-tools 16 | print("Neither fastq-dump nor fasterq-dump found; one is required. Please install sra-tools") 17 | sys.exit(1) 18 | elif not deps_have[1]: 19 | use_fastq_dump = True 20 | else: 21 | use_fastq_dump = args.fastqdump 22 | 23 | metadata_agg = None 24 | 25 | for sra_identifier in args.id: 26 | # get targets and metadata 27 | acclist, metadata_agg = get_sra_acc_metadata(sra_identifier, 28 | args.outdir, 29 | args.list, 30 | not args.SRR_parsing, 31 | metadata_agg) 32 | for acc in acclist: 33 | # get samples 34 | run_fasterq_dump(acc, 35 | args.retries, 36 | args.threads, 37 | args.outdir, 38 | args.force, 39 | use_fastq_dump, 40 | args.custom_fqd_args, 41 | zip_func) 42 | 43 | return metadata_agg 44 | 45 | 46 | def add_sra_subparser(subparser): 47 | """ 48 | Function to add subparser for SRA data. 
49 | """ 50 | # Parser 51 | parser_sra = subparser.add_parser('sra', help="download from SRA") 52 | parser_sra.add_argument('id', type=str, nargs='+', 53 | help="One or more BioProject, ERR/SRR or ERP/SRP number(s)") 54 | 55 | # Options 56 | parser_sra.add_argument('-m', dest="metadata", type=str, default="", 57 | help="filename in which to save SRA metadata (.csv format, relative to OUTDIR)") 58 | parser_sra.add_argument('-o', dest="outdir", type=str, default="", 59 | help="directory in which to save output. created if it doesn't exist") 60 | parser_sra.add_argument('-r',dest="retries", type=int, default=2, 61 | help="number of times to retry download") 62 | parser_sra.add_argument('-t',dest="threads", type=int, default=1, 63 | help="threads to use (for fasterq-dump/pigz)") 64 | 65 | # General flags 66 | parser_sra.add_argument('-f', dest="force", action="store_true", 67 | help = "force re-download of files") 68 | parser_sra.add_argument('-l', dest="list", action="store_true", 69 | help="list (but do not download) samples to be grabbed") 70 | 71 | # SRA-specific flags 72 | parser_sra.add_argument('--parse_run_ids', dest="SRR_parsing", action="store_true", 73 | help="parse SRR/ERR identifers (do not pass straight to fasterq-dump)") 74 | parser_sra.add_argument("--use_fastq_dump", dest="fastqdump", action="store_true", 75 | help="use legacy fastq-dump instead of fasterq-dump (no multithreaded downloading)") 76 | parser_sra.add_argument("--custom_fqdump_args", dest="custom_fqd_args", type=str, default="", 77 | help="'string' containing args to pass to fast(er)q-dump") 78 | 79 | # LEGACY: this will be removed in the next major version as this is now default. 
def get_sra_acc_metadata(pacc, loc = '', list_only = False, no_SRR_parsing = True, metadata_agg = None):
    """
    Function to get list of SRA accession numbers from a particular project.
    Takes project accession number `pacc` and returns a list of SRA
    accession numbers. Optional arguments to `save` metadata .csv in a specified
    `loc`ation.
    Originally featured in: https://github.com/louiejtaylor/hisss

    Returns a tuple (run_list, metadata_agg): the run accessions to download
    (empty when `list_only`) and the aggregated runinfo DataFrame.
    Raises ValueError when the runinfo lookup returns no usable table.
    """
    # Grab metadata (runinfo CSV) for the given accession number
    pacc = pacc.strip()
    metadata = requests.get("https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&term="+pacc)
    lines = [l.split(',') for l in metadata.text.split("\n")]
    try:
        run_col = lines[0].index("Run")
    except ValueError: # "Run" column always present unless search failed
        raise ValueError("Could not find samples for accession: "+pacc+". If this accession number is valid, try re-running.")

    # Generate list of runs to download. ROBUSTNESS: also require the row to
    # be long enough (guards against blank/ragged lines raising IndexError).
    run_list = [l[run_col] for l in lines[1:] if len(l) > run_col and len(l[run_col]) > 0]

    # Aggregate metadata if multiple samples/projects are being asked for.
    # BUGFIX: DataFrame.append was removed in pandas 2.x; use pd.concat
    # (consistent with mgrast.py).
    if metadata_agg is None:
        metadata_agg = pd.read_csv(StringIO(metadata.text))
    else:
        metadata_agg = pd.concat([metadata_agg, pd.read_csv(StringIO(metadata.text))], sort=True)

    if list_only:
        # Do not download but read metadata and say what will be downloaded
        layout_col = lines[0].index("LibraryLayout")
        min_len = max(run_col, layout_col) + 1  # row must hold both columns
        if no_SRR_parsing and (pacc.startswith('SRR') or pacc.startswith('ERR')):
            layout_list = [l[layout_col] for l in lines[1:] if len(l) >= min_len and len(l[run_col]) > 0 and l[run_col] == pacc]
            run_list = [pacc]
        else:
            layout_list = [l[layout_col] for l in lines[1:] if len(l) >= min_len and len(l[run_col]) > 0]

        # Print filenames that should come down (assuming the repo metadata is correct)
        for i in range(len(layout_list)):
            if layout_list[i] == "SINGLE":
                print(run_list[i]+".fastq.gz")
            elif layout_list[i] == "PAIRED":
                print(run_list[i]+"_1.fastq.gz,"+run_list[i]+"_2.fastq.gz")
            else:
                raise Exception("Unknown library layout: "+layout_list[i])

        # If we're here, we're listing and not downloading. So we don't return any accessions
        # to download (empty list), but the user may still want the aggregated metadata
        return [], metadata_agg
    else:
        if no_SRR_parsing: # default, if given a Run return a Run
            if pacc.startswith('SRR') or pacc.startswith('ERR'):
                return [pacc], metadata_agg
        # otherwise, return all the Run accessions associated with whatever identifier was passed.
        return run_list, metadata_agg
def run_fasterq_dump(acc, retries = 2, threads = 1, loc='', force=False, fastqdump=False, custom_args="", zip_func="gzip"):
    """
    Helper function to run fast(er)q-dump to grab a particular `acc`ession,
    with support for a particular number of `retries`. Can use multiple
    `threads`.

    :param acc: run accession passed straight to fast(er)q-dump
    :param retries: remaining retry count (total attempts = retries + 1)
    :param threads: thread count for fasterq-dump (-e) and the compressor
    :param loc: output directory ('' means the current directory)
    :param force: re-download even if matching output already exists
    :param fastqdump: use legacy fastq-dump instead of fasterq-dump
    :param custom_args: if non-empty, replaces the default dump arguments
    :param zip_func: compression program used on fasterq-dump output
    Raises Exception once all retries are exhausted.
    """
    skip = False
    retcode = 1
    # Retry loop: each pass makes one download attempt; `retries` counts down.
    while retries >= 0:
        if not force:
            found = check_existing(loc, acc)
            if found != False:
                print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download")
                skip = True
                break
        if not skip:
            # Build the dump command: defaults unless the user supplied args.
            if len(custom_args) == 0:
                if fastqdump: # use legacy fastq-dump
                    # gzipped output, pairs split, technical reads dropped
                    cmd = ["fastq-dump", "--gzip", "--split-3", "--skip-technical"]
                else:
                    cmd = ["fasterq-dump", "-e", str(threads), "-f", "-3"]
            else:
                # pick the binary name, then append the user's raw arguments
                suffix = "er"
                if fastqdump:
                    suffix = ""
                prog_to_run = "fast" + suffix + "q-dump"
                cmd = [prog_to_run] + custom_args.split(' ')
            if loc != "":
                cmd = cmd + ['-O', loc]
            cmd = cmd + [acc]
            print("running: "+" ".join(cmd))
            retcode = call(cmd)
            rgzip = 0
            if retcode == 0:
                if not fastqdump:
                    # fasterq-dump emits uncompressed .fastq, so
                    # zip all possible output files for that acc
                    fnames = build_paths(acc, loc, False) + build_paths(acc, loc, True)
                    rgzip = gzip_files(fnames, zip_func, threads)
                if rgzip == 0:
                    if check_existing(loc, acc) != False:
                        break # success: compressed output is in place

            # only here if downloading and zipping failed
            print("SRA download for acc "+acc+" failed, retrying "+str(retries)+" more times.")
            if retries > 0:
                time.sleep(100) #TODO?: user-modifiable
                retries -= 1
            else:
                raise Exception("download for "+acc+" failed. fast(er)q-dump returned "+str(retcode)+", pigz returned "+str(rgzip)+".")
        else:
            break
27 | This relationship is often reflected in the accession numbers for samples/projects, and can be 28 | exploited in automating sample access--for example, MG-RAST project IDs are prefixed by "mgp", 29 | whereas sample (metagenome) names are prefixed by "mgm". 30 | 31 | Repository metadata is also an important point to consider. Is there programmatically-accessible 32 | metadata? If so, is it available on the project-level, sample-level, or both? For example, when 33 | accessing data from NCBI's SRA, you can request detailed, sample-level metadata from the API 34 | given only a project accession number, which greatly simplifies metadata processing. For MG-RAST 35 | and iMicrobe, on the other hand, APIs map project information to sample numbers, which then must 36 | be queried individually for sample-level metadata. 37 | 38 | ## Adding support for `newrepo` 39 | 40 | Now, we'll walk step-by-step through adding support for a new repository, `newrepo`, to grabseqs. 41 | The end goal is to be able to run the following command: 42 | 43 | grabseqs newrepo samp1234 proj123 44 | 45 | Where "samp1234" is an example sample accession to be downloaded, and "proj123" is an example 46 | project accession number. 47 | 48 | If you'd like to eventually make a pull request back into this repository, we recommend making a fork 49 | of this repository to work with. Either way, you'll likely want a local copy to work with, so clone away! 50 | 51 | ### Step 1: Make a new module 52 | 53 | We recommend starting with a copy of [`template.py`](https://github.com/louiejtaylor/grabseqs/blob/master/template.py), 54 | as it lays out the structure used in all the other grabseqs modules. Make a copy of this file in the 55 | `grabseqslib` directory, with a name that makes sense--for our example we'll call it `newrepo.py`. 56 | 57 | This template module contains four functions. 
We'll name them after `newrepo`, so they are: 58 | 59 | - `add_newrepo_subparser`: adds an argparse subparser for handling arguments to the grabseqs 60 | command-line tool for your repo 61 | - `process_newrepo`: controller function for handling input project/sample accession numbers 62 | - `map_newrepo_project_acc`: function to map project accession numbers to sample accession numbers 63 | for downloading 64 | - `download_newrepo_sample`: function to handle downloading of reads given a sample accession number 65 | 66 | These functions handle 99% of the work for newrepo downloads, and we'll walk through filling them out 67 | one-by-one. 68 | 69 | ### Step 2: Add the `newrepo` subparser 70 | 71 | The `add_newrepo_subparser` function is likely mostly done for you in the template. There are a number of 72 | options that you'll likely use for any repository (e.g. specifying an output directory, number of retries, 73 | forcing re-download of already-existing files, multithreading, and listing/dry-running \[rather than 74 | downloading\] samples matching a particular query). These options are pre-specified for you and shouldn't 75 | be tweaked much for consistency with other repository subparsers. If metadata is not programmatically 76 | available from `newrepo`, remove the `-m` option. 77 | 78 | If you'd like to include additional functions above and beyond these default options, you may add them to 79 | the argparser object within this function. See the `sra.py` file for examples of other options--there are 80 | a variety of options for the `grabseqs sra` subparser above and beyond the default options. 81 | 82 | You may have noticed that this function isn't quite hooked up to the main argparse instance yet. That's okay; 83 | we'll do this at the end when we edit `__init__.py`. 84 | 85 | ### Step 3: Add the controller logic 86 | 87 | From here on out, it's a good idea to have a good handle on the questions addressed in the "Repository 88 | structure" section above. 
Important questions include: 89 | 90 | - How can I programmatically access data, metadata, and map project accession numbers to sample accession 91 | numbers? E.g. is there an API endpoint (used in MG-RAST, iMicrobe metadata/downloads/mapping and 92 | SRA metadata/mapping) or a tool to aid in downloading (used in SRA downloads)? 93 | - Is metadata available? If so, is sample-level metadata accessible from a project accession number 94 | (easiest)? 95 | 96 | The answers to these questions determine how the `process_newrepo` controller function will be structured. 97 | The example in `template.py` assumes that metadata is available, and that sample-level metadata is present 98 | in the project-sample mapping step. Thus, the controller generally functions like so: 99 | 100 | # begin looping through accession numbers passed by user 101 | for newrepo_identifier in args.newrepoid: 102 | 103 | # for each project identifier, map it to sample identifiers and grab metadata (a pandas dataframe) 104 | sample_list, metadata_agg = map_newrepo_project_acc(newrepo_identifier, metadata_agg) 105 | 106 | # for each sample mapped to by the passed project identifier 107 | for sample in sample_list: 108 | 109 | # download that sample 110 | download_newrepo_sample(acc, 111 | args.retries, 112 | args.threads, 113 | args.outdir, 114 | args.force, 115 | args.list) 116 | 117 | Actual metadata saving is handled in a repository-agnostic fashion--the `process_newrepo` function will 118 | return the pandas dataframe containing metadata, which will then be saved in a safe/non-clobber-y way 119 | (with no additional effort necessary on your part). 120 | 121 | Now, let's go write the actual mapping logic. 122 | 123 | ### Step 4: Map project accessions to sample accessions 124 | 125 | The `map_newrepo_project_acc` maps project to sample accession numbers, returning a list of sample accession 126 | numbers. 
Depending on metadata availability, you may also access sample metadata in this mapping step, and 127 | it seems prudent to only make one API call when necessary, so we've written the example using this slightly 128 | more complicated workflow--this is also used in the SRA and MG-RAST modules. 129 | 130 | Generally, you can pass one or more project or sample accessions to grabseqs. Depending on from where metadata is 131 | obtained, you'll either want to avoid `map_newrepo_project_acc` altogether if a sample accession number is 132 | passed; or grab metadata and return a singleton list (containing the sample accession number) and metadata to your 133 | controller function. An example of using the pandas.DataFrame.append() method to concatenate multiple metadata 134 | tables is included in this function in the template file. 135 | 136 | The code here is dependent on the format of the project-sample map. SRA provides mapping information in csv format; 137 | the MG-RAST API returns JSON maps--feel free to use that code for inspiration. Your workflow might look something like 138 | this (based on the MG-RAST JSON workflow and using the `json` and `requests` libraries), where `pacc` is the 139 | accession number: 140 | 141 | # initialize vars 142 | sample_list = [] 143 | metadata_df = pd.DataFrame() 144 | 145 | # hit api 146 | metadata_json = json.loads(requests.get("http://api.newrepo.gov/metadata/export/"+str(pacc)).text) 147 | 148 | for sample in metadata_json["samples"]: 149 | sample_list.append(sample["value"]) 150 | # additional logic to add metadata lines to metadata_df 151 | 152 | If the user would like to list \[but not download\] the available samples (-l) and information on read paired-ness 153 | is available here, i.e. from a metadata table, this can be tested for here (and then return an empty 154 | `sample_list` to prevent downstream downloading). For an example of this workflow, see the `sra.py` module. 155 | 156 | ### Step 5: Download samples! 
157 | 158 | A bit of of boilerplate is included already, handling the `-f` (force) option: 159 | 160 | # Make sure to check that the sample isn't already downloaded 161 | if not force: 162 | # using check_existing from utils.py 163 | found = check_existing(loc, acc) 164 | if found != False: 165 | print("found existing file matching acc:" + acc + ", skipping download. Pass -f to force download") 166 | return False 167 | 168 | You can build the expected paths for the eventual downloaded reads like so: 169 | 170 | paired = True 171 | # using build_paths from utils.py 172 | file_paths = build_paths(acc, loc, paired) 173 | 174 | Generally, unless there's a tool like NCBI's fasterq-dump that downloads both reads in one command, it's 175 | just easier to iterate through file paths (i.e. either one unpaired, or two paired). 176 | 177 | If the file is directly available from an API URL, the `fetch_file` function from `grabseqs.utils` should serve 178 | you well (it uses `wget`, a grabseqs dependency): 179 | 180 | seq_urls = ["http://api.newrepo.gov/data/"+str(acc)+"_R1.fastq", 181 | "http://api.newrepo.gov/data/"+str(acc)+"_R2.fastq"] 182 | 183 | for i in range(len(seq_urls)): 184 | 185 | print("Downloading accession "+acc+" from newrepo") 186 | 187 | # fetch_file should work for most things where a URL is available 188 | retcode = fetch_file(seq_urls[i],file_paths[i],retries) 189 | 190 | # There are a number of things you may want to do here: check and handle 191 | # downloaded file integrity, convert to .fastq (see mgrast.py for an example 192 | # of a scenario dealing with .fastx in general), etc. 193 | 194 | print("Compressing .fastq") 195 | gzip_files(file_paths, zip_func, threads) 196 | 197 | If metadata is only available on a sample-wise basis, you may want to do metadata handling in this function 198 | as well, or in a separate function if two API calls are necessary. See `mgrast.py` for an example of metadata 199 | handling at the sample level. 
If 200 | 201 | Regarding sample listing/dry-running (-l)--if the information about whether samples are paired or unpaired is 202 | only available from a sample-specific source, it usually makes more sense to look that up here, and then just 203 | skip the downloading part. For an example of this workflow, see the `mgrast.py` module. 204 | 205 | Now we've written all the logic for argument parsing, metadata wrangling, project-sample accession mapping, and 206 | raw data downloading! We just have to hook it all together to the main grabseqs program. 207 | 208 | ### Step 6: Hooking up subparser and controller functions 209 | 210 | Here, you need to edit `__init__.py`. This should be fairly self explanatory based on what's already 211 | present for the other submodules, but you'll need to add the following: 212 | 213 | - Import your new functions: 214 | ```{python} 215 | from grabseqslib.sra import process_sra, add_sra_subparser 216 | from grabseqslib.imicrobe import process_imicrobe, add_imicrobe_subparser 217 | from grabseqslib.mgrast import process_mgrast, add_mgrast_subparser 218 | from grabseqslib.newrepo import process_newrepo, add_newrepo_subparser 219 | ``` 220 | - Add your new subparser: 221 | ```{python} 222 | add_sra_subparser(subpa) 223 | add_imicrobe_subparser(subpa) 224 | add_mgrast_subparser(subpa) 225 | add_mgrast_subparser(subpa) 226 | ``` 227 | - Check to see if the user called your subparser: 228 | ```{python} 229 | try: 230 | if args.newrepoid: 231 | repo = "newrepo" 232 | ``` 233 | - Finally, add a case for your controller function: 234 | ```{python} 235 | if repo == "SRA": 236 | metadata_agg = process_sra(args, zip_func) 237 | elif repo == "MG-RAST": 238 | metadata_agg = process_mgrast(args, zip_func) 239 | elif repo == "iMicrobe": 240 | metadata_agg = process_imicrobe(args, zip_func) 241 | elif repo == "newrepo": 242 | metadata_agg = process_newrepo(args, zip_func) 243 | ``` 244 | 245 | ## What next? 
246 | 247 | So, you've added a new repository--that's awesome (and thank you!!)! Feel free to open a pull request. We 248 | run grabseqs through a rigorous set of automated tests on every commit/weekly, so please write a new test 249 | or three testing data/metadata downloading, or any edge cases that you encounter that other users/developers 250 | might not know about. 251 | --------------------------------------------------------------------------------