├── setup.py ├── data ├── SampleSheet_no_index2.csv ├── SampleSheet.csv └── SampleSheet_bad_header.csv ├── .gitignore ├── LICENSE ├── README.md └── bcl2fastq.py /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup( 5 | name='bcl2fastq', 6 | version='1.3.0', 7 | url='http://github.com/brwnj/bcl2fastq', 8 | license='MIT', 9 | author='Joe Brown', 10 | author_email='brwnjm@gmail.com', 11 | description='NextSeq 1k/2k specific bcl2fastq wrapper.', 12 | long_description=__doc__, 13 | py_modules=['bcl2fastq'], 14 | install_requires=[ 15 | 'click>=4.0', 16 | 'pandas', 17 | 'seaborn', 18 | 'matplotlib', 19 | ], 20 | entry_points=''' 21 | [console_scripts] 22 | bcl_to_fastq=bcl2fastq:bcl2fastq 23 | ''' 24 | ) 25 | -------------------------------------------------------------------------------- /data/SampleSheet_no_index2.csv: -------------------------------------------------------------------------------- 1 | [Header],,,,,,,,, 2 | IEMFileVersion,4,,,,,,,, 3 | Investigator Name,test,,,,,,,, 4 | Experiment Name,ttttest,,,,,,,, 5 | Date,8/5/14,,,,,,,, 6 | Workflow,GenerateFASTQ,,,,,,,, 7 | Application,FASTQ Only,,,,,,,, 8 | Assay,Nextera XT v2_AllSets,,,,,,,, 9 | Description,more testing,,,,,,,, 10 | Chemistry,Amplicon,,,,,,,, 11 | ,,,,,,,,, 12 | [Reads],,,,,,,,, 13 | 151,,,,,,,,, 14 | 151,,,,,,,,, 15 | ,,,,,,,,, 16 | [Settings],,,,,,,,, 17 | ReverseComplement,0,,,,,,,, 18 | Adapter,CTGTCTCTTATACACATCT,,,,,,,, 19 | ,,,,,,,,, 20 | ,,,,,,,,, 21 | [Data],,,,,,,,, 22 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description 23 | s1,s1,p1,A01,N701,TAAGGCGA,,, 24 | s2,s2,p1,B01,N701,TAAGGCGA,,, 25 | s3,s3,p1,C01,N701,TAAGGCGA,,, 26 | s4,s4,p1,D01,N701,TAAGGCGA,,, 27 | s5,s5,p1,E01,N701,TAAGGCGA,,, 28 | -------------------------------------------------------------------------------- /data/SampleSheet.csv: 
-------------------------------------------------------------------------------- 1 | [Header],,,,,,,,, 2 | IEMFileVersion,4,,,,,,,, 3 | Investigator Name,test,,,,,,,, 4 | Experiment Name,ttttest,,,,,,,, 5 | Date,8/5/14,,,,,,,, 6 | Workflow,GenerateFASTQ,,,,,,,, 7 | Application,FASTQ Only,,,,,,,, 8 | Assay,Nextera XT v2_AllSets,,,,,,,, 9 | Description,more testing,,,,,,,, 10 | Chemistry,Amplicon,,,,,,,, 11 | ,,,,,,,,, 12 | [Reads],,,,,,,,, 13 | 151,,,,,,,,, 14 | 151,,,,,,,,, 15 | ,,,,,,,,, 16 | [Settings],,,,,,,,, 17 | ReverseComplement,0,,,,,,,, 18 | Adapter,CTGTCTCTTATACACATCT,,,,,,,, 19 | ,,,,,,,,, 20 | ,,,,,,,,, 21 | [Data],,,,,,,,, 22 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description 23 | s1,s1,p1,A01,N701,TAAGGCGA,S502,CTCTCTAT,, 24 | s2,s2,p1,B01,N701,TAAGGCGA,S503,TATCCTCT,, 25 | s3,s3,p1,C01,N701,TAAGGCGA,S505,GTAAGGAG,, 26 | s4,s4,p1,D01,N701,TAAGGCGA,S506,ACTGCATA,, 27 | s5,s5,p1,E01,N701,TAAGGCGA,S507,AAGGAGTA,, 28 | -------------------------------------------------------------------------------- /data/SampleSheet_bad_header.csv: -------------------------------------------------------------------------------- 1 | [Header],,,,,,,,, 2 | IEMFileVersion,4,,,,,,,, 3 | Investigator Name,test,,,,,,,, 4 | Experiment Name,ttttest,,,,,,,, 5 | Date,8/5/14,,,,,,,, 6 | Workflow,GenerateFASTQ,,,,,,,, 7 | Application,FASTQ Only,,,,,,,, 8 | Assay,Nextera XT v2_AllSets,,,,,,,, 9 | Description,more testing,,,,,,,, 10 | Chemistry,Amplicon,,,,,,,, 11 | ,,,,,,,,, 12 | [Reads],,,,,,,,, 13 | 151,,,,,,,,, 14 | 151,,,,,,,,, 15 | ,,,,,,,,, 16 | [Settings],,,,,,,,, 17 | ReverseComplement,0,,,,,,,, 18 | Adapter,CTGTCTCTTATACACATCT,,,,,,,, 19 | ,,,,,,,,, 20 | ,,,,,,,,, 21 | [Data],,,,,,,,, 22 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,sample_project,Description 23 | s1,s1,p1,A01,N701,TAAGGCGA,S502,CTCTCTAT,, 24 | s2,s2,p1,B01,N701,TAAGGCGA,S503,TATCCTCT,, 25 | 
s3,s3,p1,C01,N701,TAAGGCGA,S505,GTAAGGAG,, 26 | s4,s4,p1,D01,N701,TAAGGCGA,S506,ACTGCATA,, 27 | s5,s5,p1,E01,N701,TAAGGCGA,S507,AAGGAGTA,, 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | .idea/ 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Joe Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NextSeq .bcl Conversion 2 | `bcl_to_fastq` runs bcl2fastq with optional modifications to the Sample Sheet and 3 | concatenates reads across lanes into R1 and R2 by sample. By default, 4 | Undetermined and reads across individual lanes are removed on success and 5 | all reads are placed in BaseCalls directory. 6 | 7 | Tested on `bcl2fastq2` Conversion Software v2.17.1.14 and Python 2.7 and 3.5. 8 | 9 | # Running 10 | ``` 11 | $ cd /data/nextseq/170111_NS500409_0130_AHHGTMAFXX 12 | $ bcl_to_fastq --reverse-complement --processing 80 13 | [2017-01-14 19:07:57 - INFO] Using /data/nextseq/170111_NS500409_0130_AHHGTMAFXX/SampleSheet.csv 14 | [2017-01-14 19:07:57 - INFO] Processing /data/nextseq/170111_NS500409_0130_AHHGTMAFXX/SampleSheet.csv 15 | [2017-01-14 19:07:58 - INFO] Found 384 samples for run 170111_NS500409_0130_AHHGTMAFXX 16 | [2017-01-14 19:07:58 - INFO] Run complete. 17 | [2017-01-14 19:07:58 - INFO] Converting .bcl to .fastq using: $>bcl2fastq -r 12 -d 12 -p 80 -w 12 --barcode-mismatches 0 --no-lane-splitting -R . 
18 | [2017-01-14 20:54:02 - INFO] .bcl Conversion successful 19 | [2017-01-14 20:54:02 - INFO] Generating demultiplexing stats file 20 | ``` 21 | 22 | # Results 23 | In the run folder, SampleSheet.csv.bak is a backup copy of the original 24 | SampleSheet.csv and is accompanied by: 25 | 26 | ## bcl2fastq.log 27 | 28 | ``` 29 | $ head bcl2fastq.log 30 | 2017-01-14 19:07:58 [f82880] INFO: Create FASTQs for index reads: NO 31 | BCL to FASTQ file converter 32 | bcl2fastq v2.17.1.14 33 | Copyright (c) 2007-2015 Illumina, Inc. 34 | 35 | 2017-01-14 19:07:58 [16f2880] Command-line invocation: bcl2fastq -r 12 -d 12 -p 80 -w 12 --barcode-mismatches 0 --no-lane-splitting -R . 36 | 2017-01-14 19:07:58 [16f2880] INFO: Minimum log level: INFO 37 | 2017-01-14 19:07:58 [16f2880] INFO: Sample sheet: './SampleSheet.csv' 38 | 2017-01-14 19:07:59 [16f2880] INFO: Runfolder path: '.' 39 | 2017-01-14 19:07:59 [16f2880] INFO: Input path: './Data/Intensities/BaseCalls/' 40 | etc... 41 | ``` 42 | 43 | ## demultiplexing_stats.csv 44 | ``` 45 | $ head demultiplexing_stats.csv 46 | AAA003-K10,102570 47 | AAA007-J07,72566 48 | AAA240-I17,146605 49 | AAA240-J05,197833 50 | etc... 51 | ``` 52 | 53 | Fastq files (`_R?.fastq.gz`) are available in 54 | RunFolder/Data/Intensities/BaseCalls along with a file named SAMPLES which 55 | simply lists the sample IDs that were processed. 56 | 57 | # Help 58 | ``` 59 | $ bcl_to_fastq -h 60 | Usage: bcl_to_fastq [OPTIONS] 61 | 62 | Runs bcl2fastq2, creating fastqs and concatenating fastqs across lanes. 63 | Original fastq files and Undetermined files are deleted. 64 | 65 | Options: 66 | --runfolder TEXT path to directory containing run data 67 | [default: .] 
68 | --loading INTEGER number of threads used for loading BCL data 69 | [default: 12] 70 | --demultiplexing INTEGER number of threads used for demultiplexing 71 | [default: 12] 72 | --processing INTEGER number of threads used for processing 73 | demultiplexed data [default: 24] 74 | --writing INTEGER number of threads used for writing FASTQ data 75 | [default: 12] 76 | --barcode-mismatches INTEGER number of allowed mismatches per index 77 | [default: 0] 78 | --keep-tmp save fastqs across lanes as well as 79 | Undetermined [default: False] 80 | --reverse-complement reverse complement index 2 of the sample sheet 81 | [default: False] 82 | --no-wait process the run without checking its 83 | completion status [default: False] 84 | -h, --help Show this message and exit. 85 | ``` 86 | 87 | # Requires 88 | + [click](http://click.pocoo.org/4/) 89 | + [pandas](http://pandas.pydata.org/) 90 | + [bcl2fastq2](http://support.illumina.com/downloads/bcl2fastq_conversion_software.html) 91 | + matplotlib 92 | + numpy 93 | + seaborn 94 | 95 | 96 | # Install 97 | ``` 98 | git clone git@github.com:brwnj/bcl2fastq.git 99 | cd bcl2fastq 100 | python setup.py install 101 | ``` 102 | -------------------------------------------------------------------------------- /bcl2fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from __future__ import print_function 5 | 6 | import matplotlib 7 | 8 | matplotlib.use("Agg") 9 | 10 | import logging 11 | import os 12 | import shutil 13 | import string 14 | import subprocess as sp 15 | import sys 16 | import tempfile 17 | import time 18 | import warnings 19 | from datetime import datetime 20 | from glob import glob 21 | from xml.etree import cElementTree as ET 22 | 23 | import click 24 | import matplotlib.pyplot as plt 25 | import numpy as np 26 | import pandas as pd 27 | import seaborn as sns 28 | import six 29 | from matplotlib.cbook import 
MatplotlibDeprecationWarning 30 | 31 | warnings.simplefilter("ignore", MatplotlibDeprecationWarning) 32 | sns.set_context("paper") 33 | sns.set_style("whitegrid", {"axes.linewidth": 1}) 34 | if six.PY2: 35 | _complement = string.maketrans("ATCG", "TAGC") 36 | else: 37 | _complement = str.maketrans("ATCG", "TAGC") 38 | complement = lambda seq: seq.translate(_complement) 39 | logging.basicConfig( 40 | level=logging.INFO, 41 | format="[%(asctime)s - %(levelname)s] %(message)s", 42 | datefmt="%Y-%m-%d %H:%M:%S", 43 | ) 44 | 45 | 46 | def get_samplesheet(path): 47 | ss = path 48 | if os.path.isdir(ss): 49 | ss = os.path.join(os.path.abspath(path), "SampleSheet.csv") 50 | logging.info("Using %s", ss) 51 | if not os.path.exists(ss): 52 | raise OSError(2, "No such file", ss) 53 | return ss 54 | 55 | 56 | def get_file_sizes(output_dir): 57 | total_size = 0 58 | for f in glob(os.path.join(output_dir, "*.fastq.gz")): 59 | if os.path.basename(f).startswith("Undetermined_"): 60 | continue 61 | total_size += os.path.getsize(f) 62 | return total_size 63 | 64 | 65 | def process_samplesheet( 66 | samplesheet, new_samplesheet, reverse_complement=False, determine=False 67 | ): 68 | """Fix hidden characters in sample names and optional reverse complement 69 | the second index. 
70 | 71 | Args: 72 | samplesheet (str): file path to SampleSheet.csv 73 | new_samplesheet (str): file path to new_sample_sheet.csv 74 | reverse_complement (bool): to reverse complement 'Index2' 75 | determine (bool): using several tiles, determine if it's better to reverse complement index2 76 | 77 | Returns: 78 | list 79 | """ 80 | if not determine: 81 | logging.info("Processing %s", samplesheet) 82 | 83 | samples = [] 84 | start = False 85 | index2_idx = None 86 | 87 | with open(samplesheet, "rU" if six.PY2 else "r") as ifh, open( 88 | new_samplesheet, "w" 89 | ) as ofh: 90 | for line in ifh: 91 | toks = line.strip().split(",") 92 | if not start: 93 | # table header processing 94 | if toks[0] == "Sample_ID": 95 | start = True 96 | if reverse_complement: 97 | if "index2" in toks: 98 | index2_idx = toks.index("index2") 99 | elif "Index2" in toks: 100 | index2_idx = toks.index("Index2") 101 | else: 102 | logging.warn("There is no Index2 to reverse complement") 103 | 104 | elif toks[0]: 105 | # convert underscores to dashes 106 | toks[0] = toks[0].replace("_", "-").replace(".", "-") 107 | toks[1] = toks[0] 108 | samples.append(toks[0]) 109 | 110 | # only adjust on known index 111 | if reverse_complement and index2_idx: 112 | toks[index2_idx] = complement(toks[index2_idx])[::-1] 113 | 114 | # remove blank lines at end of table 115 | else: 116 | break 117 | print(*[t.strip() for t in toks], sep=",", file=ofh) 118 | 119 | run = os.path.basename(os.path.dirname(samplesheet)) 120 | if not determine: 121 | logging.info("Found %d samples for run %s", len(samples), run) 122 | return samples 123 | 124 | 125 | def wait_for_completion(path, no_wait=True, delay=14400): 126 | if no_wait: 127 | return True 128 | rta_complete = os.path.join(path, "RTAComplete.txt") 129 | 130 | # waiting for RTAComplete.txt to show up in local filesystem 131 | notify = True 132 | while not os.path.exists(rta_complete): 133 | if notify: 134 | logging.info( 135 | "Waiting on run completion. 
[%s]" % time.strftime("%Y-%m-%d %H:%M:%S") 136 | ) 137 | notify = False 138 | # wait 60 seconds each loop waiting for RTA file 139 | time.sleep(60) 140 | logging.info("Run complete.") 141 | 142 | # wait [delay] seconds for remaining files to transfer locally 143 | time.sleep(delay) 144 | return True 145 | 146 | 147 | def run_bcl2fastq(runfolder, args, determine=False): 148 | runlog = os.path.join(runfolder, "bcl2fastq.log") 149 | cmd = " ".join(map(str, args)) 150 | if determine: 151 | logging.info("Converting a subset to determine barcodes using: $>%s", cmd) 152 | else: 153 | logging.info("Converting .bcl to .fastq using: $>%s", cmd) 154 | with open(runlog, "w") as fh: 155 | # bcl2fastq version info... 156 | sp.check_call( 157 | "bcl2fastq --version 2>&1 | tail -2 | head -1", 158 | stdout=fh, 159 | stderr=fh, 160 | shell=True, 161 | ) 162 | sp.check_call(cmd, stdout=fh, stderr=fh, shell=True) 163 | logging.info(".bcl Conversion successful") 164 | 165 | 166 | def run_determination_step( 167 | input_dir, 168 | runfolder, 169 | output_dir, 170 | samplesheet, 171 | loading, 172 | processing, 173 | writing, 174 | barcode_mismatches, 175 | ): 176 | # set up a temporary working directory 177 | tmpd = tempfile.mkdtemp(dir=output_dir if output_dir else runfolder) 178 | 179 | date = datetime.now().strftime("%Y-%m-%d-%H%M-%S") 180 | rc_samplesheet = "%s.%s.rc.csv" % (samplesheet, date) 181 | process_samplesheet( 182 | samplesheet, rc_samplesheet, reverse_complement=True, determine=True 183 | ) 184 | 185 | # run the first set 186 | # 1105 is small subset of tiles I found to generally work 187 | cmd_args = [ 188 | "bcl2fastq", 189 | "--tiles", 190 | "1105", 191 | "--no-lane-splitting", 192 | "--runfolder-dir", 193 | runfolder, 194 | "--output-dir", 195 | tmpd, 196 | "--barcode-mismatches", 197 | barcode_mismatches, 198 | "--loading-threads", 199 | loading, 200 | "--processing-threads", 201 | processing, 202 | "--writing-threads", 203 | writing, 204 | "--sample-sheet", 205 | 
rc_samplesheet, 206 | ] 207 | if input_dir: 208 | cmd_args.extend(["--input-dir", input_dir]) 209 | run_bcl2fastq(tmpd, cmd_args, determine=True) 210 | rc_file_size = get_file_sizes(tmpd) 211 | shutil.rmtree(tmpd) 212 | 213 | # try the original 214 | tmpd = tempfile.mkdtemp(dir=output_dir if output_dir else runfolder) 215 | orig_samplesheet = "%s.%s.orig.csv" % (samplesheet, date) 216 | samples = process_samplesheet( 217 | samplesheet, orig_samplesheet, reverse_complement=False, determine=True 218 | ) 219 | 220 | # run the first set 221 | cmd_args = [ 222 | "bcl2fastq", 223 | "--tiles", 224 | "1105", 225 | "--no-lane-splitting", 226 | "--runfolder-dir", 227 | runfolder, 228 | "--output-dir", 229 | tmpd, 230 | "--barcode-mismatches", 231 | barcode_mismatches, 232 | "--loading-threads", 233 | loading, 234 | "--processing-threads", 235 | processing, 236 | "--writing-threads", 237 | writing, 238 | "--sample-sheet", 239 | orig_samplesheet, 240 | ] 241 | if input_dir: 242 | cmd_args.extend(["--input-dir", input_dir]) 243 | run_bcl2fastq(tmpd, cmd_args, determine=True) 244 | orig_file_size = get_file_sizes(tmpd) 245 | shutil.rmtree(tmpd) 246 | 247 | if rc_file_size > orig_file_size: 248 | logging.info("Using reverse complement of Index2 to demultiplex") 249 | os.remove(orig_samplesheet) 250 | return samples, rc_samplesheet 251 | elif rc_file_size == orig_file_size: 252 | logging.critical( 253 | ( 254 | "The original and reverse complemented barcodes " 255 | "yielded the same number of demultiplexed reads." 
256 | ) 257 | ) 258 | sys.exit(1) 259 | else: 260 | logging.info("Using the original barcodes to demultiplex") 261 | os.remove(rc_samplesheet) 262 | return samples, orig_samplesheet 263 | 264 | 265 | def xml_to_df(stats_xml): 266 | if os.path.exists(stats_xml): 267 | logging.info("Generating demultiplexing stats file") 268 | doc = ET.parse(stats_xml) 269 | root = doc.getroot() 270 | counts = {} 271 | for sample in root.iter("Sample"): 272 | name = sample.get("name") 273 | if name == "all" or name == "unknown": 274 | continue 275 | counts[name] = {} 276 | for barcode in sample.iter("Barcode"): 277 | if barcode.get("name") == "all": 278 | continue 279 | for lane in barcode.iter("Lane"): 280 | lane_name = lane.get("number") 281 | count = int(lane.findtext("BarcodeCount")) 282 | counts[name][lane_name] = count 283 | return pd.DataFrame(counts) 284 | else: 285 | logging.warning("Could not find file %s", stats_xml) 286 | return None 287 | 288 | 289 | def barplot_distribution(df, out_file): 290 | width = max([len(df) / 10, 12]) 291 | f, ax = plt.subplots(figsize=(width, 6)) 292 | df.plot(kind="bar", stacked=True, ax=ax) 293 | f.savefig(out_file, bbox_inches="tight") 294 | plt.close() 295 | 296 | 297 | def Lc(x): 298 | """Computes the ordinary and generalized Lorenz curve of a list. 299 | 300 | >>> import numpy as np 301 | >>> t = [1,2,np.nan,7,8] 302 | >>> p, L, Lg = Lc(t) 303 | >>> len(p) == len(L) == len(Lg) 304 | True 305 | >>> p[1:4] 306 | array([ 0.25, 0.5 , 0.75]) 307 | >>> L[1:4] # doctest: +ELLIPSIS 308 | array([ 0.055..., 0.166..., 0.555...]) 309 | >>> Lg[1:4] 310 | array([ 0.25, 0.75, 2.5 ]) 311 | >>> t = [1,2,np.nan,7,-8] 312 | >>> Lc(t) # doctest: +ELLIPSIS 313 | Traceback (most recent call last): 314 | ... 
315 | ValueError: x contained negative number 316 | """ 317 | assert len(x) > 0, "x is empty" 318 | a = np.array(x, dtype=float) 319 | a = a[np.isfinite(a)] 320 | if a.min() < 0: 321 | raise ValueError("x contained negative number") 322 | a.sort(kind="mergesort") 323 | a_len = float(len(a)) 324 | p = np.arange(1, a_len + 1) / a_len 325 | p = np.append([0], p) 326 | L = a.cumsum() / a.sum() 327 | L = np.append([0], L) 328 | Lg = L * np.mean(a) 329 | return p, L, Lg 330 | 331 | 332 | def lorenz_curve(df, out_file): 333 | p, L, Lg = Lc(df.sum(axis=1).values) 334 | f, ax = plt.subplots(figsize=(8, 6)) 335 | plt.plot(p, L, axes=ax) 336 | plt.plot([0, 1], axes=ax, color="black", linestyle="--") 337 | ax.set(title="Distribution of Barcode Mapped Reads") 338 | f.savefig(out_file, bbox_inches="tight") 339 | plt.close() 340 | 341 | 342 | def compile_demultiplex_stats(runfolder, out_dir): 343 | stats_xml = os.path.join( 344 | os.path.abspath(out_dir), "Stats", "DemultiplexingStats.xml" 345 | ) 346 | df = xml_to_df(stats_xml) 347 | if df is not None: 348 | df.sum().to_csv(os.path.join(runfolder, "demultiplexing_stats.csv")) 349 | try: 350 | dft = df.transpose().drop("Undetermined", axis=0) 351 | except ValueError: 352 | dft = df.transpose() 353 | barplot_distribution( 354 | dft, os.path.join(runfolder, "demultiplexing_distribution.pdf") 355 | ) 356 | lorenz_curve( 357 | dft, os.path.join(runfolder, "demultiplexing_distribution_curve.pdf") 358 | ) 359 | 360 | 361 | @click.command( 362 | context_settings=dict( 363 | help_option_names=["-h", "--help"], 364 | ignore_unknown_options=True, 365 | ) 366 | ) 367 | @click.option( 368 | "-i", 369 | "--input-dir", 370 | default=None, 371 | show_default=True, 372 | help="path to input directory; default is RUNFOLDER-DIR/Data/Intensities/BaseCalls", 373 | ) 374 | @click.option( 375 | "-R", 376 | "--runfolder-dir", 377 | default=os.path.realpath("."), 378 | show_default=True, 379 | help="path to directory containing run data", 380 | ) 381 | 
@click.option( 382 | "-o", 383 | "--output-dir", 384 | default=None, 385 | help="path to demultiplexed output; default is same as INPUT-DIR", 386 | ) 387 | @click.option( 388 | "--sample-sheet", 389 | default=None, 390 | help="file path to sample sheet; default is RUNFOLDER-DIR/SampleSheet.csv", 391 | ) 392 | @click.option( 393 | "--loading", 394 | default=12, 395 | type=int, 396 | show_default=True, 397 | help="number of threads used for loading BCL data", 398 | ) 399 | @click.option( 400 | "--processing", 401 | default=24, 402 | type=int, 403 | show_default=True, 404 | help="number of threads used for processing demultiplexed data", 405 | ) 406 | @click.option( 407 | "--writing", 408 | default=12, 409 | type=int, 410 | show_default=True, 411 | help="number of threads used for writing FASTQ data", 412 | ) 413 | @click.option( 414 | "--barcode-mismatches", 415 | default=0, 416 | type=int, 417 | show_default=True, 418 | help="number of allowed mismatches per index", 419 | ) 420 | @click.option( 421 | "--keep-tmp", 422 | is_flag=True, 423 | default=False, 424 | show_default=True, 425 | help="save Undetermined reads", 426 | ) 427 | @click.option( 428 | "--reverse-complement", 429 | is_flag=True, 430 | default=False, 431 | show_default=True, 432 | help="reverse complement index 2 of the sample sheet", 433 | ) 434 | @click.option( 435 | "--no-wait", 436 | is_flag=True, 437 | default=False, 438 | show_default=True, 439 | help="process the run without checking its completion status", 440 | ) 441 | @click.option( 442 | "--overwrite", 443 | is_flag=True, 444 | default=False, 445 | show_default=True, 446 | help="overwrite existing fastq files in the output directory", 447 | ) 448 | @click.option( 449 | "--determine", 450 | is_flag=True, 451 | default=False, 452 | show_default=True, 453 | help="use barcodes in samplesheet as well as the reverse complement of index 2, then demultiplex with best", 454 | ) 455 | @click.option( 456 | "--no-cleanup", 457 | is_flag=True, 458 | 
default=False, 459 | show_default=True, 460 | help="skip all cleaning up -- do not rename fastq output and do not delete undetermined files", 461 | ) 462 | @click.option( 463 | "--delay", 464 | default=14400, 465 | type=int, 466 | show_default=True, 467 | help="number of seconds to sleep after finding RTAComplete.txt -- applies only when waiting for a run to complete", 468 | ) 469 | @click.argument("bcl2fastq_args", nargs=-1, type=click.UNPROCESSED) 470 | def bcl2fastq( 471 | input_dir, 472 | runfolder_dir, 473 | output_dir, 474 | sample_sheet, 475 | loading, 476 | processing, 477 | writing, 478 | barcode_mismatches, 479 | keep_tmp, 480 | reverse_complement, 481 | no_wait, 482 | overwrite, 483 | determine, 484 | no_cleanup, 485 | delay, 486 | bcl2fastq_args, 487 | ): 488 | """Runs bcl2fastq2, creating fastqs and concatenating fastqs across lanes. 489 | Undetermined files are deleted by default. 490 | 491 | Any arguments not matching those outlined below will be sent to the 492 | `bcl2fastq` call. 
493 | """ 494 | try: 495 | if not sample_sheet: 496 | samplesheet = get_samplesheet(runfolder_dir) 497 | else: 498 | samplesheet = get_samplesheet(sample_sheet) 499 | except OSError: 500 | logging.critical("Could not find sample sheet CSV.") 501 | sys.exit(1) 502 | 503 | # will wait on the run to complete without ever checking for samples in 504 | # the samplesheet 505 | if not determine: 506 | # new samplesheet written each time leaving the original unchanged 507 | date = datetime.now().strftime("%Y-%m-%d-%H%M-%S") 508 | new_samplesheet = "%s.%s.csv" % (samplesheet, date) 509 | if os.path.exists(os.path.join(runfolder_dir, "bcl2fastq.log")): 510 | # this run has already been converted, so don't reverse complement 511 | # just get the sample names 512 | if reverse_complement: 513 | logging.warning( 514 | ( 515 | "reverse complementing has been skipped as " 516 | "a log file (bcl2fastq.log) was found" 517 | ) 518 | ) 519 | samples = process_samplesheet(samplesheet, new_samplesheet, False) 520 | else: 521 | samples = process_samplesheet( 522 | samplesheet, new_samplesheet, reverse_complement 523 | ) 524 | if len(samples) == 0: 525 | logging.critical( 526 | ( 527 | "No samples were found in the SampleSheet. " 528 | "Please check its formatting." 529 | ) 530 | ) 531 | sys.exit(1) 532 | 533 | # check for RTAComplete.txt and sleep for delay 534 | completion_success = wait_for_completion(runfolder_dir, no_wait, delay) 535 | if not completion_success: 536 | logging.critical("Run did not complete as planned. 
Exiting.") 537 | sys.exit(1) 538 | 539 | # set where we're going to dump the result files 540 | if output_dir: 541 | fastq_dir = os.path.realpath(output_dir) 542 | else: 543 | fastq_dir = os.path.abspath( 544 | os.path.join(runfolder_dir, "Data", "Intensities", "BaseCalls") 545 | ) 546 | 547 | # check original and reverse complement using a few tiles 548 | if determine: 549 | # need to add input_dir, output_dir 550 | samples, new_samplesheet = run_determination_step( 551 | input_dir, 552 | runfolder_dir, 553 | output_dir, 554 | samplesheet, 555 | loading, 556 | processing, 557 | writing, 558 | barcode_mismatches, 559 | ) 560 | 561 | # run bcl2fastq on the run folder 562 | cmd_args = [ 563 | "bcl2fastq", 564 | "--sample-sheet", 565 | new_samplesheet, 566 | "--loading-threads", 567 | loading, 568 | "--processing-threads", 569 | processing, 570 | "--writing-threads", 571 | writing, 572 | "--barcode-mismatches", 573 | barcode_mismatches, 574 | "--no-lane-splitting", 575 | "--runfolder-dir", 576 | runfolder_dir, 577 | ] + list(bcl2fastq_args) 578 | if output_dir: 579 | cmd_args.extend(["--output-dir", output_dir]) 580 | if input_dir: 581 | cmd_args.extend(["--input-dir", input_dir]) 582 | run_bcl2fastq(runfolder_dir, cmd_args) 583 | 584 | # parse DemultiplexingStats.xml into a csv with summary plots 585 | compile_demultiplex_stats(runfolder_dir, fastq_dir) 586 | 587 | # TODO: deprecate! 
588 | # write file with sample names for downstream parallelization 589 | with open(os.path.join(fastq_dir, "SAMPLES"), "w") as ofh: 590 | print(*samples, sep="\n", file=ofh) 591 | 592 | # cleanup the output directory 593 | if not no_cleanup: 594 | logging.info("Cleaning up output directory [%s]" % fastq_dir) 595 | for f in glob(os.path.join(fastq_dir, "*.fastq*")): 596 | if not f.endswith(".gz") and not f.endswith(".fastq"): 597 | continue 598 | filename = os.path.basename(f) 599 | if filename.startswith("Undetermined_") and not keep_tmp: 600 | logging.info("Deleting %s" % filename) 601 | os.remove(f) 602 | else: 603 | try: 604 | # AD-332-A10_S1_R1_001.fastq.gz --> AD-332-A10_R1.fastq.gz 605 | sample_name, _, read_index, ext = filename.split("_") 606 | # munge the file name 607 | new_file_name = "%s_%s.%s" % ( 608 | sample_name, 609 | read_index, 610 | ext.partition(".")[-1], 611 | ) 612 | # prepend the path 613 | new_file_name = os.path.join(os.path.dirname(f), new_file_name) 614 | if overwrite and os.path.exists(new_file_name): 615 | os.remove(new_file_name) 616 | os.rename(f, new_file_name) 617 | except ValueError: 618 | logging.warn( 619 | "Renaming skipped: the output dir contains conflicting FASTQ file for %s" 620 | % f 621 | ) 622 | 623 | 624 | if __name__ == "__main__": 625 | bcl2fastq() 626 | --------------------------------------------------------------------------------