├── .gitignore ├── LICENSE ├── README.rst ├── parallel-fastq-dump └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | dist 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Renan Valieris 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | parallel-fastq-dump 3 | =================== 4 | parallel ``fastq-dump`` wrapper 5 | 6 | Why & How 7 | --------- 8 | NCBI ``fastq-dump`` can be very slow sometimes, even if you have the resources (network, IO, CPU) to go faster, even if you already downloaded the sra file (see the protip below). This tool speeds up the process by dividing the work into multiple threads. 9 | 10 | This is possible because ``fastq-dump`` has options (``-N`` and ``-X``) to query specific ranges of the sra file, this tool works by dividing the work into the requested number of threads, running multiple ``fastq-dump`` in parallel and concatenating the results back together, as if you had just executed a plain ``fastq-dump`` call. 11 | 12 | Protips 13 | ------- 14 | * Downloading with ``fastq-dump`` is slow, even with multiple threads, it is recommended to use ``prefetch`` to download the target sra file before using ``fastq-dump``, that way ``fastq-dump`` will only need to do the dumping. 15 | * All extra arguments will be passed directly to ``fastq-dump``, ``--gzip``, ``--split-files`` and filters work as expected. 16 | * This tool is **not** a replacement, you still need ``fastq-dump`` and ``sra-stat`` on your ``PATH`` for it to work properly. 17 | * Speed improvements are better with bigger files, think at least 200k reads/pairs for each thread used. 18 | 19 | Install 20 | ------- 21 | The preferred way to install is using `Bioconda <https://bioconda.github.io>`_: 22 | 23 | ``conda install parallel-fastq-dump`` 24 | 25 | this will get you the sra-tools dependency as well. 
26 | 27 | **Important**: Make sure the sra-tools package being installed is a recent version (>=2.10.0) to guarantee compatibility with NCBI servers, 28 | conda might try to install an older version to be compatible with existing packages installed in your env, to be sure use this command: 29 | 30 | ``conda install parallel-fastq-dump 'sra-tools>=3.0.0'`` 31 | 32 | If that doesn't work you could also install it on a separate new env: 33 | 34 | ``conda create -n testenv parallel-fastq-dump 'sra-tools>=3.0.0'`` 35 | 36 | Examples 37 | -------- 38 | ``$ parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip`` 39 | 40 | Micro Benchmark 41 | --------------- 42 | 43 | .. figure:: https://cloud.githubusercontent.com/assets/6310472/23962085/bdefef44-098b-11e7-825f-1da53d6568d6.png 44 | -------------------------------------------------------------------------------- /parallel-fastq-dump: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import os 4 | import shutil 5 | import tempfile 6 | import subprocess 7 | import argparse 8 | import logging 9 | 10 | __version__ = '0.6.7' 11 | 12 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 13 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 14 | argparse.RawDescriptionHelpFormatter): 15 | pass 16 | 17 | desc = 'parallel fastq-dump wrapper, extra args will be passed through' 18 | epi = """DESCRIPTION: 19 | Example: parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip 20 | """ 21 | 22 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 23 | formatter_class=CustomFormatter) 24 | argparse.ArgumentDefaultsHelpFormatter 25 | parser.add_argument('-s','--sra-id', help='SRA id', action='append') 26 | parser.add_argument('-t','--threads', help='number of threads', default=1, type=int) 27 | parser.add_argument('-O','--outdir', help='output 
directory', default='.') 28 | parser.add_argument('-T', '--tmpdir', help='temporary directory', default=None) 29 | parser.add_argument('-N','--minSpotId', help='Minimum spot id', default=1, type=int) 30 | parser.add_argument('-X','--maxSpotId', help='Maximum spot id', default=None, type=int) 31 | parser.add_argument('-V', '--version', help='shows version', action='store_true', default=False) 32 | 33 | 34 | def pfd(args, srr_id, extra_args): 35 | """ 36 | Parallel fastq dump 37 | Parameters 38 | ---------- 39 | args : dict 40 | User-provided args 41 | srr_id : str 42 | SRR ID 43 | extra_args : dict 44 | Extra args 45 | """ 46 | tmp_dir = tempfile.TemporaryDirectory(prefix='pfd_',dir=args.tmpdir) 47 | logging.info('tempdir: {}'.format(tmp_dir.name)) 48 | 49 | n_spots = get_spot_count(srr_id) 50 | logging.info('{} spots: {}'.format(srr_id,n_spots)) 51 | 52 | # minSpotId cant be lower than 1 53 | start = max(args.minSpotId, 1) 54 | # maxSpotId cant be higher than n_spots 55 | end = min(args.maxSpotId, n_spots) if args.maxSpotId is not None else n_spots 56 | 57 | blocks = split_blocks(start, end, args.threads) 58 | logging.info('blocks: {}'.format(blocks)) 59 | 60 | ps = [] 61 | for i in range(0,args.threads): 62 | d = os.path.join(tmp_dir.name, str(i)) 63 | os.mkdir(d) 64 | cmd = ['fastq-dump', '-N', str(blocks[i][0]), '-X', str(blocks[i][1]), 65 | '-O', d] + extra_args + [srr_id] 66 | logging.info('CMD: {}'.format(' '.join(cmd))) 67 | p = subprocess.Popen(cmd) 68 | ps.append(p) 69 | 70 | wfd = {} 71 | for i in range(0,args.threads): 72 | exit_code = ps[i].wait() 73 | if exit_code != 0: 74 | logging.warning('fastq-dump error! 
exit code: {}'.format(exit_code)) 75 | sys.exit(1) 76 | 77 | tmp_path = os.path.join(tmp_dir.name, str(i)) 78 | for fo in os.listdir(tmp_path): 79 | if fo not in wfd: 80 | wfd[fo] = open(os.path.join(args.outdir,fo), 'wb') 81 | with open(os.path.join(tmp_path,fo), 'rb') as fd: 82 | shutil.copyfileobj(fd, wfd[fo]) 83 | os.remove(os.path.join(tmp_path,fo)) 84 | 85 | # close the file descriptors for good measure 86 | for fd in wfd.values(): 87 | fd.close() 88 | 89 | def split_blocks(start, end, n_pieces): 90 | total = (end-start+1) 91 | avg = int(total / n_pieces) 92 | out = [] 93 | last = start 94 | for i in range(0,n_pieces): 95 | out.append([last,last + avg-1]) 96 | last += avg 97 | if i == n_pieces-1: out[i][1] += total % n_pieces 98 | return out 99 | 100 | def get_spot_count(sra_id): 101 | """ 102 | Get spot count via sra-stat 103 | Parameters 104 | ---------- 105 | sra_id : str 106 | SRA ID 107 | """ 108 | cmd = ['sra-stat', '--meta', '--quick', sra_id] 109 | logging.info('CMD: {}'.format(' '.join(cmd))) 110 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 111 | stdout, stderr = p.communicate() 112 | txt = stdout.decode().rstrip().split('\n') 113 | total = 0 114 | try: 115 | for l in txt: 116 | total += int(l.split('|')[2].split(':')[0]) 117 | except IndexError: 118 | msg = 'sra-stat output parsing error!' 
119 | msg += '\n--sra-stat STDOUT--\n{}' 120 | msg += '\n--sra-stat STDERR--\n{}' 121 | etxt = stderr.decode().rstrip().split('\n') 122 | raise IndexError(msg.format('\n'.join(txt), '\n'.join(etxt))) 123 | return total 124 | 125 | def partition(f, l): 126 | r = ([],[]) 127 | for i in l: 128 | if f(i): 129 | r[0].append(i) 130 | else: 131 | r[1].append(i) 132 | return r 133 | 134 | def is_sra_file(path): 135 | """ 136 | Determine whether path is SRA file 137 | parameters 138 | ---------- 139 | path : str 140 | file path 141 | """ 142 | f = os.path.basename(path) 143 | if f.lower().endswith('.sra'): return True 144 | if 'SRR' in f.upper(): return True 145 | if 'ERR' in f.upper(): return True 146 | if 'DRR' in f.upper(): return True 147 | return False 148 | 149 | def main(): 150 | """ 151 | Main interface 152 | """ 153 | args, extra = parser.parse_known_args() 154 | if args.version: 155 | print('parallel-fastq-dump : {}'.format(__version__)) 156 | subprocess.Popen(['fastq-dump', '-V']).wait() 157 | sys.exit(0) 158 | 159 | elif args.sra_id: 160 | extra_srrs, extra_args = partition(is_sra_file,extra) 161 | args.sra_id.extend(extra_srrs) 162 | logging.info('SRR ids: {}'.format(args.sra_id)) 163 | logging.info('extra args: {}'.format(extra_args)) 164 | 165 | # output directory 166 | if not os.path.isdir(args.outdir) and args.outdir != '.': 167 | os.makedirs(args.outdir) 168 | # temp directory 169 | if (args.tmpdir is not None and 170 | not os.path.isdir(args.tmpdir) 171 | and args.tmpdir != '.'): 172 | os.makedirs(args.tmpdir) 173 | # fastq dump 174 | for si in args.sra_id: 175 | pfd(args, si, extra_args) 176 | else: 177 | parser.print_help() 178 | sys.exit(1) 179 | 180 | if __name__ == '__main__': 181 | main() 182 | 183 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup 3 | 4 | pname = "parallel-fastq-dump" 5 | 6 
| exec(list(filter( 7 | lambda l: l.startswith("__version__"), 8 | open(pname).read().split("\n") 9 | ))[0]) 10 | 11 | setup( 12 | name=pname, 13 | version=__version__, 14 | author="Renan Valieris", 15 | author_email="renan.valieris@cipe.accamargo.org.br", 16 | description="parallel fastq-dump wrapper", 17 | license="MIT", 18 | url="https://github.com/rvalieris/parallel-fastq-dump", 19 | scripts=[pname], 20 | classifiers=[ 21 | "Development Status :: 4 - Beta", 22 | "Environment :: Console", 23 | "Intended Audience :: Science/Research", 24 | "License :: OSI Approved :: MIT License", 25 | "Programming Language :: Python :: 3", 26 | "Topic :: Scientific/Engineering :: Bio-Informatics" 27 | ] 28 | ) 29 | --------------------------------------------------------------------------------