├── .gitignore ├── LICENSE ├── README.rst ├── parallel-fastq-dump └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | dist 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Renan Valieris 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | parallel-fastq-dump 3 | =================== 4 | parallel ``fastq-dump`` wrapper 5 | 6 | Why & How 7 | --------- 8 | NCBI ``fastq-dump`` can be very slow sometimes, even if you have the resources (network, IO, CPU) to go faster, even if you already downloaded the sra file (see the protip below). This tool speeds up the process by dividing the work into multiple threads. 9 | 10 | This is possible because ``fastq-dump`` has options (``-N`` and ``-X``) to query specific ranges of the sra file, this tool works by dividing the work into the requested number of threads, running multiple ``fastq-dump`` in parallel and concatenating the results back together, as if you had just executed a plain ``fastq-dump`` call. 11 | 12 | Protips 13 | ------- 14 | * Downloading with ``fastq-dump`` is slow, even with multiple threads, it is recommended to use ``prefetch`` to download the target sra file before using ``fastq-dump``, that way ``fastq-dump`` will only need to do the dumping. 15 | * All extra arguments will be passed directly to ``fastq-dump``, ``--gzip``, ``--split-files`` and filters work as expected. 16 | * This tool is **not** a replacement, you still need ``fastq-dump`` and ``sra-stat`` on your ``PATH`` for it to work properly. 17 | * Speed improvements are better with bigger files, think at least 200k reads/pairs for each thread used. 18 | 19 | Install 20 | ------- 21 | The preferred way to install is using `Bioconda <https://bioconda.github.io>`_: 22 | 23 | ``conda install parallel-fastq-dump`` 24 | 25 | this will get you the sra-tools dependency as well. 
26 | 27 | **Important**: Make sure the sra-tools package being installed is a recent version (>=2.10.0) to guarantee compatibility with NCBI servers, 28 | conda might try to install an older version to be compatible with existing packages installed in your env, to be sure use this command: 29 | 30 | ``conda install parallel-fastq-dump 'sra-tools>=3.0.0'`` 31 | 32 | If that doesn't work you could also install it on a separate new env: 33 | 34 | ``conda create -n testenv parallel-fastq-dump 'sra-tools>=3.0.0'`` 35 | 36 | Examples 37 | -------- 38 | ``$ parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip`` 39 | 40 | Micro Benchmark 41 | --------------- 42 | 43 | .. figure:: https://cloud.githubusercontent.com/assets/6310472/23962085/bdefef44-098b-11e7-825f-1da53d6568d6.png 44 | -------------------------------------------------------------------------------- /parallel-fastq-dump: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import os 4 | import shutil 5 | import tempfile 6 | import subprocess 7 | import argparse 8 | import logging 9 | 10 | __version__ = '0.6.7' 11 | 12 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 13 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 14 | argparse.RawDescriptionHelpFormatter): 15 | pass 16 | 17 | desc = 'parallel fastq-dump wrapper, extra args will be passed through' 18 | epi = """DESCRIPTION: 19 | Example: parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip 20 | """ 21 | 22 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 23 | formatter_class=CustomFormatter) 24 | argparse.ArgumentDefaultsHelpFormatter 25 | parser.add_argument('-s','--sra-id', help='SRA id', action='append') 26 | parser.add_argument('-t','--threads', help='number of threads', default=1, type=int) 27 | parser.add_argument('-O','--outdir', help='output 
directory', default='.') 28 | parser.add_argument('-T', '--tmpdir', help='temporary directory', default=None) 29 | parser.add_argument('-N','--minSpotId', help='Minimum spot id', default=1, type=int) 30 | parser.add_argument('-X','--maxSpotId', help='Maximum spot id', default=None, type=int) 31 | parser.add_argument('-V', '--version', help='shows version', action='store_true', default=False) 32 | 33 | 34 | def pfd(args, srr_id, extra_args): 35 | """ 36 | Parallel fastq dump 37 | Parameters 38 | ---------- 39 | args : dict 40 | User-provided args 41 | srr_id : str 42 | SRR ID 43 | extra_args : dict 44 | Extra args 45 | """ 46 | tmp_dir = tempfile.TemporaryDirectory(prefix='pfd_',dir=args.tmpdir) 47 | logging.info('tempdir: {}'.format(tmp_dir.name)) 48 | 49 | n_spots = get_spot_count(srr_id) 50 | logging.info('{} spots: {}'.format(srr_id,n_spots)) 51 | 52 | # minSpotId cant be lower than 1 53 | start = max(args.minSpotId, 1) 54 | # maxSpotId cant be higher than n_spots 55 | end = min(args.maxSpotId, n_spots) if args.maxSpotId is not None else n_spots 56 | 57 | blocks = split_blocks(start, end, args.threads) 58 | logging.info('blocks: {}'.format(blocks)) 59 | 60 | ps = [] 61 | for i in range(0,args.threads): 62 | d = os.path.join(tmp_dir.name, str(i)) 63 | os.mkdir(d) 64 | cmd = ['fastq-dump', '-N', str(blocks[i][0]), '-X', str(blocks[i][1]), 65 | '-O', d] + extra_args + [srr_id] 66 | logging.info('CMD: {}'.format(' '.join(cmd))) 67 | p = subprocess.Popen(cmd) 68 | ps.append(p) 69 | 70 | wfd = {} 71 | for i in range(0,args.threads): 72 | exit_code = ps[i].wait() 73 | if exit_code != 0: 74 | logging.warning('fastq-dump error! 
exit code: {}'.format(exit_code)) 75 | sys.exit(1) 76 | 77 | tmp_path = os.path.join(tmp_dir.name, str(i)) 78 | for fo in os.listdir(tmp_path): 79 | if fo not in wfd: 80 | wfd[fo] = open(os.path.join(args.outdir,fo), 'wb') 81 | with open(os.path.join(tmp_path,fo), 'rb') as fd: 82 | shutil.copyfileobj(fd, wfd[fo]) 83 | os.remove(os.path.join(tmp_path,fo)) 84 | 85 | # close the file descriptors for good measure 86 | for fd in wfd.values(): 87 | fd.close() 88 | 89 | def split_blocks(start, end, n_pieces): 90 | total = (end-start+1) 91 | avg = int(total / n_pieces) 92 | out = [] 93 | last = start 94 | for i in range(0,n_pieces): 95 | out.append([last,last + avg-1]) 96 | last += avg 97 | if i == n_pieces-1: out[i][1] += total % n_pieces 98 | return out 99 | 100 | def get_spot_count(sra_id): 101 | """ 102 | Get spot count via sra-stat 103 | Parameters 104 | ---------- 105 | sra_id : str 106 | SRA ID 107 | """ 108 | cmd = ['sra-stat', '--meta', '--quick', sra_id] 109 | logging.info('CMD: {}'.format(' '.join(cmd))) 110 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 111 | stdout, stderr = p.communicate() 112 | txt = stdout.decode().rstrip().split('\n') 113 | total = 0 114 | try: 115 | for l in txt: 116 | total += int(l.split('|')[2].split(':')[0]) 117 | except IndexError: 118 | msg = 'sra-stat output parsing error!' 
119 | msg += '\n--sra-stat STDOUT--\n{}' 120 | msg += '\n--sra-stat STDERR--\n{}' 121 | etxt = stderr.decode().rstrip().split('\n') 122 | raise IndexError(msg.format('\n'.join(txt), '\n'.join(etxt))) 123 | return total 124 | 125 | def partition(f, l): 126 | r = ([],[]) 127 | for i in l: 128 | if f(i): 129 | r[0].append(i) 130 | else: 131 | r[1].append(i) 132 | return r 133 | 134 | def is_sra_file(path): 135 | """ 136 | Determine whether path is SRA file 137 | parameters 138 | ---------- 139 | path : str 140 | file path 141 | """ 142 | f = os.path.basename(path) 143 | if f.lower().endswith('.sra'): return True 144 | if 'SRR' in f.upper(): return True 145 | if 'ERR' in f.upper(): return True 146 | if 'DRR' in f.upper(): return True 147 | return False 148 | 149 | def main(): 150 | """ 151 | Main interface 152 | """ 153 | args, extra = parser.parse_known_args() 154 | if args.version: 155 | print('parallel-fastq-dump : {}'.format(__version__)) 156 | subprocess.Popen(['fastq-dump', '-V']).wait() 157 | sys.exit(0) 158 | 159 | elif args.sra_id: 160 | extra_srrs, extra_args = partition(is_sra_file,extra) 161 | args.sra_id.extend(extra_srrs) 162 | logging.info('SRR ids: {}'.format(args.sra_id)) 163 | logging.info('extra args: {}'.format(extra_args)) 164 | 165 | # output directory 166 | if not os.path.isdir(args.outdir) and args.outdir != '.': 167 | os.makedirs(args.outdir) 168 | # temp directory 169 | if (args.tmpdir is not None and 170 | not os.path.isdir(args.tmpdir) 171 | and args.tmpdir != '.'): 172 | os.makedirs(args.tmpdir) 173 | # fastq dump 174 | for si in args.sra_id: 175 | pfd(args, si, extra_args) 176 | else: 177 | parser.print_help() 178 | sys.exit(1) 179 | 180 | if __name__ == '__main__': 181 | main() 182 | 183 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup 3 | 4 | pname = "parallel-fastq-dump" 5 | 6 
| exec(list(filter( 7 | lambda l: l.startswith("__version__"), 8 | open(pname).read().split("\n") 9 | ))[0]) 10 | 11 | setup( 12 | name=pname, 13 | version=__version__, 14 | author="Renan Valieris", 15 | author_email="renan.valieris@cipe.accamargo.org.br", 16 | description="parallel fastq-dump wrapper", 17 | license="MIT", 18 | url="https://github.com/rvalieris/parallel-fastq-dump", 19 | scripts=[pname], 20 | classifiers=[ 21 | "Development Status :: 4 - Beta", 22 | "Environment :: Console", 23 | "Intended Audience :: Science/Research", 24 | "License :: OSI Approved :: MIT License", 25 | "Programming Language :: Python :: 3", 26 | "Topic :: Scientific/Engineering :: Bio-Informatics" 27 | ] 28 | ) 29 | --------------------------------------------------------------------------------