├── setup.py
├── data
    ├── SampleSheet_no_index2.csv
    ├── SampleSheet.csv
    └── SampleSheet_bad_header.csv
├── .gitignore
├── LICENSE
├── README.md
└── bcl2fastq.py


/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | 
 4 | setup(
 5 |     name='bcl2fastq',
 6 |     version='1.3.0',
 7 |     url='http://github.com/brwnj/bcl2fastq',
 8 |     license='MIT',
 9 |     author='Joe Brown',
10 |     author_email='brwnjm@gmail.com',
11 |     description='NextSeq 1k/2k specific bcl2fastq wrapper.',
12 |     long_description=__doc__,
13 |     py_modules=['bcl2fastq'],
14 |     install_requires=[
15 |         'click>=4.0',
16 |         'pandas',
17 |         'seaborn',
18 |         'matplotlib',
19 |     ],
20 |     entry_points='''
21 |         [console_scripts]
22 |         bcl_to_fastq=bcl2fastq:bcl2fastq
23 |     '''
24 | )
25 | 


--------------------------------------------------------------------------------
/data/SampleSheet_no_index2.csv:
--------------------------------------------------------------------------------
 1 | [Header],,,,,,,,,
 2 | IEMFileVersion,4,,,,,,,,
 3 | Investigator Name,test,,,,,,,,
 4 | Experiment Name,ttttest,,,,,,,,
 5 | Date,8/5/14,,,,,,,,
 6 | Workflow,GenerateFASTQ,,,,,,,,
 7 | Application,FASTQ Only,,,,,,,,
 8 | Assay,Nextera XT v2_AllSets,,,,,,,,
 9 | Description,more testing,,,,,,,,
10 | Chemistry,Amplicon,,,,,,,,
11 | ,,,,,,,,,
12 | [Reads],,,,,,,,,
13 | 151,,,,,,,,,
14 | 151,,,,,,,,,
15 | ,,,,,,,,,
16 | [Settings],,,,,,,,,
17 | ReverseComplement,0,,,,,,,,
18 | Adapter,CTGTCTCTTATACACATCT,,,,,,,,
19 | ,,,,,,,,,
20 | ,,,,,,,,,
21 | [Data],,,,,,,,,
22 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
23 | s1,s1,p1,A01,N701,TAAGGCGA,,,
24 | s2,s2,p1,B01,N701,TAAGGCGA,,,
25 | s3,s3,p1,C01,N701,TAAGGCGA,,,
26 | s4,s4,p1,D01,N701,TAAGGCGA,,,
27 | s5,s5,p1,E01,N701,TAAGGCGA,,,
28 | 


--------------------------------------------------------------------------------
/data/SampleSheet.csv:
--------------------------------------------------------------------------------
 1 | [Header],,,,,,,,,
 2 | IEMFileVersion,4,,,,,,,,
 3 | Investigator Name,test,,,,,,,,
 4 | Experiment Name,ttttest,,,,,,,,
 5 | Date,8/5/14,,,,,,,,
 6 | Workflow,GenerateFASTQ,,,,,,,,
 7 | Application,FASTQ Only,,,,,,,,
 8 | Assay,Nextera XT v2_AllSets,,,,,,,,
 9 | Description,more testing,,,,,,,,
10 | Chemistry,Amplicon,,,,,,,,
11 | ,,,,,,,,,
12 | [Reads],,,,,,,,,
13 | 151,,,,,,,,,
14 | 151,,,,,,,,,
15 | ,,,,,,,,,
16 | [Settings],,,,,,,,,
17 | ReverseComplement,0,,,,,,,,
18 | Adapter,CTGTCTCTTATACACATCT,,,,,,,,
19 | ,,,,,,,,,
20 | ,,,,,,,,,
21 | [Data],,,,,,,,,
22 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
23 | s1,s1,p1,A01,N701,TAAGGCGA,S502,CTCTCTAT,,
24 | s2,s2,p1,B01,N701,TAAGGCGA,S503,TATCCTCT,,
25 | s3,s3,p1,C01,N701,TAAGGCGA,S505,GTAAGGAG,,
26 | s4,s4,p1,D01,N701,TAAGGCGA,S506,ACTGCATA,,
27 | s5,s5,p1,E01,N701,TAAGGCGA,S507,AAGGAGTA,,
28 | 


--------------------------------------------------------------------------------
/data/SampleSheet_bad_header.csv:
--------------------------------------------------------------------------------
 1 | [Header],,,,,,,,,
 2 | IEMFileVersion,4,,,,,,,,
 3 | Investigator Name,test,,,,,,,,
 4 | Experiment Name,ttttest,,,,,,,,
 5 | Date,8/5/14,,,,,,,,
 6 | Workflow,GenerateFASTQ,,,,,,,,
 7 | Application,FASTQ Only,,,,,,,,
 8 | Assay,Nextera XT v2_AllSets,,,,,,,,
 9 | Description,more testing,,,,,,,,
10 | Chemistry,Amplicon,,,,,,,,
11 | ,,,,,,,,,
12 | [Reads],,,,,,,,,
13 | 151,,,,,,,,,
14 | 151,,,,,,,,,
15 | ,,,,,,,,,
16 | [Settings],,,,,,,,,
17 | ReverseComplement,0,,,,,,,,
18 | Adapter,CTGTCTCTTATACACATCT,,,,,,,,
19 | ,,,,,,,,,
20 | ,,,,,,,,,
21 | [Data],,,,,,,,,
22 | Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,sample_project,Description
23 | s1,s1,p1,A01,N701,TAAGGCGA,S502,CTCTCTAT,,
24 | s2,s2,p1,B01,N701,TAAGGCGA,S503,TATCCTCT,,
25 | s3,s3,p1,C01,N701,TAAGGCGA,S505,GTAAGGAG,,
26 | s4,s4,p1,D01,N701,TAAGGCGA,S506,ACTGCATA,,
27 | s5,s5,p1,E01,N701,TAAGGCGA,S507,AAGGAGTA,,
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | .idea/
60 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Joe Brown
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # NextSeq .bcl Conversion
  2 | `bcl_to_fastq` runs bcl2fastq with optional modifications to the Sample Sheet and
  3 | concatenates reads across lanes into R1 and R2 by sample. By default,
  4 | Undetermined reads and per-lane reads are removed on success and
  5 | all reads are placed in the BaseCalls directory.
  6 | 
  7 | Tested on `bcl2fastq2` Conversion Software v2.17.1.14 and Python 2.7 and 3.5.
  8 | 
  9 | # Running
 10 | ```
 11 | $ cd /data/nextseq/170111_NS500409_0130_AHHGTMAFXX
 12 | $ bcl_to_fastq --reverse-complement --processing 80
 13 | [2017-01-14 19:07:57 - INFO] Using /data/nextseq/170111_NS500409_0130_AHHGTMAFXX/SampleSheet.csv
 14 | [2017-01-14 19:07:57 - INFO] Processing /data/nextseq/170111_NS500409_0130_AHHGTMAFXX/SampleSheet.csv
 15 | [2017-01-14 19:07:58 - INFO] Found 384 samples for run 170111_NS500409_0130_AHHGTMAFXX
 16 | [2017-01-14 19:07:58 - INFO] Run complete.
 17 | [2017-01-14 19:07:58 - INFO] Converting .bcl to .fastq using: $>bcl2fastq -r 12 -d 12 -p 80 -w 12 --barcode-mismatches 0 --no-lane-splitting -R .
 18 | [2017-01-14 20:54:02 - INFO] .bcl Conversion successful
 19 | [2017-01-14 20:54:02 - INFO] Generating demultiplexing stats file
 20 | ```
 21 | 
 22 | # Results
 23 | In the run folder, SampleSheet.csv.bak is a backup copy of the original
 24 | SampleSheet.csv and is accompanied by:
 25 | 
 26 | ## bcl2fastq.log
 27 | 
 28 | ```
 29 | $ head bcl2fastq.log
 30 | 2017-01-14 19:07:58 [f82880] INFO: Create FASTQs for index reads: NO
 31 | BCL to FASTQ file converter
 32 | bcl2fastq v2.17.1.14
 33 | Copyright (c) 2007-2015 Illumina, Inc.
 34 | 
 35 | 2017-01-14 19:07:58 [16f2880] Command-line invocation: bcl2fastq -r 12 -d 12 -p 80 -w 12 --barcode-mismatches 0 --no-lane-splitting -R .
 36 | 2017-01-14 19:07:58 [16f2880] INFO: Minimum log level: INFO
 37 | 2017-01-14 19:07:58 [16f2880] INFO: Sample sheet: './SampleSheet.csv'
 38 | 2017-01-14 19:07:59 [16f2880] INFO: Runfolder path: '.'
 39 | 2017-01-14 19:07:59 [16f2880] INFO: Input path: './Data/Intensities/BaseCalls/'
 40 | etc...
 41 | ```
 42 | 
 43 | ## demultiplexing_stats.csv
 44 | ```
 45 | $ head demultiplexing_stats.csv
 46 | AAA003-K10,102570
 47 | AAA007-J07,72566
 48 | AAA240-I17,146605
 49 | AAA240-J05,197833
 50 | etc...
 51 | ```
 52 | 
 53 | Fastq files (`<sample>_R?.fastq.gz`) are available in
 54 | RunFolder/Data/Intensities/BaseCalls along with a file named SAMPLES which
 55 | merely lists the sample IDs that were processed.
 56 | 
 57 | # Help
 58 | ```
 59 | $ bcl_to_fastq -h
 60 | Usage: bcl_to_fastq [OPTIONS]
 61 | 
 62 |   Runs bcl2fastq2, creating fastqs and concatenating fastqs across lanes.
 63 |   Original fastq files and Undetermined files are deleted.
 64 | 
 65 | Options:
 66 |   --runfolder TEXT              path to directory containing run data
 67 |                                 [default: .]
 68 |   --loading INTEGER             number of threads used for loading BCL data
 69 |                                 [default: 12]
 70 |   --demultiplexing INTEGER      number of threads used for demultiplexing
 71 |                                 [default: 12]
 72 |   --processing INTEGER          number of threads used for processing
 73 |                                 demultiplexed data  [default: 24]
 74 |   --writing INTEGER             number of threads used for writing FASTQ data
 75 |                                 [default: 12]
 76 |   --barcode-mismatches INTEGER  number of allowed mismatches per index
 77 |                                 [default: 0]
 78 |   --keep-tmp                    save fastqs across lanes as well as
 79 |                                 Undetermined  [default: False]
 80 |   --reverse-complement          reverse complement index 2 of the sample sheet
 81 |                                 [default: False]
 82 |   --no-wait                     process the run without checking its
 83 |                                 completion status  [default: False]
 84 |   -h, --help                    Show this message and exit.
 85 | ```
 86 | 
 87 | # Requires
 88 | + [click](http://click.pocoo.org/4/)
 89 | + [pandas](http://pandas.pydata.org/)
 90 | + [bcl2fastq2](http://support.illumina.com/downloads/bcl2fastq_conversion_software.html)
 91 | + matplotlib
 92 | + numpy
 93 | + seaborn
 94 | 
 95 | 
 96 | # Install
 97 | ```
 98 | git clone git@github.com:brwnj/bcl2fastq.git
 99 | cd bcl2fastq
100 | python setup.py install
101 | ```
102 | 


--------------------------------------------------------------------------------
/bcl2fastq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | 
  4 | from __future__ import print_function
  5 | 
  6 | import matplotlib
  7 | 
  8 | matplotlib.use("Agg")
  9 | 
 10 | import logging
 11 | import os
 12 | import shutil
 13 | import string
 14 | import subprocess as sp
 15 | import sys
 16 | import tempfile
 17 | import time
 18 | import warnings
 19 | from datetime import datetime
 20 | from glob import glob
 21 | from xml.etree import cElementTree as ET
 22 | 
 23 | import click
 24 | import matplotlib.pyplot as plt
 25 | import numpy as np
 26 | import pandas as pd
 27 | import seaborn as sns
 28 | import six
 29 | from matplotlib.cbook import MatplotlibDeprecationWarning
 30 | 
 31 | warnings.simplefilter("ignore", MatplotlibDeprecationWarning)
 32 | sns.set_context("paper")
 33 | sns.set_style("whitegrid", {"axes.linewidth": 1})
 34 | if six.PY2:
 35 |     _complement = string.maketrans("ATCG", "TAGC")
 36 | else:
 37 |     _complement = str.maketrans("ATCG", "TAGC")
 38 | complement = lambda seq: seq.translate(_complement)
 39 | logging.basicConfig(
 40 |     level=logging.INFO,
 41 |     format="[%(asctime)s - %(levelname)s] %(message)s",
 42 |     datefmt="%Y-%m-%d %H:%M:%S",
 43 | )
 44 | 
 45 | 
 46 | def get_samplesheet(path):
 47 |     ss = path
 48 |     if os.path.isdir(ss):
 49 |         ss = os.path.join(os.path.abspath(path), "SampleSheet.csv")
 50 |     logging.info("Using %s", ss)
 51 |     if not os.path.exists(ss):
 52 |         raise OSError(2, "No such file", ss)
 53 |     return ss
 54 | 
 55 | 
 56 | def get_file_sizes(output_dir):
 57 |     total_size = 0
 58 |     for f in glob(os.path.join(output_dir, "*.fastq.gz")):
 59 |         if os.path.basename(f).startswith("Undetermined_"):
 60 |             continue
 61 |         total_size += os.path.getsize(f)
 62 |     return total_size
 63 | 
 64 | 
 65 | def process_samplesheet(
 66 |     samplesheet, new_samplesheet, reverse_complement=False, determine=False
 67 | ):
 68 |     """Fix hidden characters in sample names and optional reverse complement
 69 |     the second index.
 70 | 
 71 |     Args:
 72 |         samplesheet (str): file path to SampleSheet.csv
 73 |         new_samplesheet (str): file path to new_sample_sheet.csv
 74 |         reverse_complement (bool): to reverse complement 'Index2'
 75 |         determine (bool): using several tiles, determine if it's better to reverse complement index2
 76 | 
 77 |     Returns:
 78 |         list
 79 |     """
 80 |     if not determine:
 81 |         logging.info("Processing %s", samplesheet)
 82 | 
 83 |     samples = []
 84 |     start = False
 85 |     index2_idx = None
 86 | 
 87 |     with open(samplesheet, "rU" if six.PY2 else "r") as ifh, open(
 88 |         new_samplesheet, "w"
 89 |     ) as ofh:
 90 |         for line in ifh:
 91 |             toks = line.strip().split(",")
 92 |             if not start:
 93 |                 # table header processing
 94 |                 if toks[0] == "Sample_ID":
 95 |                     start = True
 96 |                     if reverse_complement:
 97 |                         if "index2" in toks:
 98 |                             index2_idx = toks.index("index2")
 99 |                         elif "Index2" in toks:
100 |                             index2_idx = toks.index("Index2")
101 |                         else:
102 |                             logging.warn("There is no Index2 to reverse complement")
103 | 
104 |             elif toks[0]:
105 |                 # convert underscores to dashes
106 |                 toks[0] = toks[0].replace("_", "-").replace(".", "-")
107 |                 toks[1] = toks[0]
108 |                 samples.append(toks[0])
109 | 
110 |                 # only adjust on known index
111 |                 if reverse_complement and index2_idx:
112 |                     toks[index2_idx] = complement(toks[index2_idx])[::-1]
113 | 
114 |             # remove blank lines at end of table
115 |             else:
116 |                 break
117 |             print(*[t.strip() for t in toks], sep=",", file=ofh)
118 | 
119 |     run = os.path.basename(os.path.dirname(samplesheet))
120 |     if not determine:
121 |         logging.info("Found %d samples for run %s", len(samples), run)
122 |     return samples
123 | 
124 | 
def wait_for_completion(path, no_wait=True, delay=14400):
    """Block until ``RTAComplete.txt`` exists in ``path``, then sleep.

    Args:
        path (str): run folder polled for ``RTAComplete.txt``
        no_wait (bool): return immediately without polling or sleeping
        delay (int): seconds to sleep once the file has been found

    Returns:
        bool: always True
    """
    if not no_wait:
        marker = os.path.join(path, "RTAComplete.txt")

        # poll once a minute; the waiting message is logged a single time
        announced = False
        while not os.path.exists(marker):
            if not announced:
                logging.info(
                    "Waiting on run completion. [%s]"
                    % time.strftime("%Y-%m-%d %H:%M:%S")
                )
                announced = True
            time.sleep(60)
        logging.info("Run complete.")

        # give remaining files [delay] seconds to finish transferring
        time.sleep(delay)
    return True
145 | 
146 | 
def run_bcl2fastq(runfolder, args, determine=False):
    """Run a bcl2fastq command, capturing its output in ``bcl2fastq.log``.

    The log is written to ``runfolder`` and starts with one line taken from
    ``bcl2fastq --version``. The command itself is executed without a shell,
    so arguments containing spaces or shell metacharacters (for example a
    run folder path with a space) reach bcl2fastq unchanged.

    Args:
        runfolder (str): directory in which ``bcl2fastq.log`` is written
        args (list): the full command, program name first; every item is
            converted with ``str``
        determine (bool): only changes the wording of the log message

    Raises:
        subprocess.CalledProcessError: when a command exits non-zero
        OSError: when the program in ``args`` cannot be started
    """
    runlog = os.path.join(runfolder, "bcl2fastq.log")
    argv = [str(arg) for arg in args]
    # joined form is used for display only
    cmd = " ".join(argv)
    if determine:
        logging.info("Converting a subset to determine barcodes using: $>%s", cmd)
    else:
        logging.info("Converting .bcl to .fastq using: $>%s", cmd)
    with open(runlog, "w") as fh:
        # bcl2fastq version info... (constant pipeline, so a shell is safe here)
        sp.check_call(
            "bcl2fastq --version 2>&1 | tail -2 | head -1",
            stdout=fh,
            stderr=fh,
            shell=True,
        )
        sp.check_call(argv, stdout=fh, stderr=fh)
    logging.info(".bcl Conversion successful")
164 | 
165 | 
def _demultiplex_single_tile(
    input_dir,
    runfolder,
    output_dir,
    samplesheet,
    trial_samplesheet,
    reverse_complement,
    loading,
    processing,
    writing,
    barcode_mismatches,
):
    """Demultiplex one tile with a freshly written sample sheet.

    Returns a tuple of (sample IDs, total bytes of FASTQ output). The
    temporary output directory is removed after a successful run only; when
    bcl2fastq fails the exception propagates and the directory is left behind.
    """
    # set up a temporary working directory
    tmpd = tempfile.mkdtemp(dir=output_dir if output_dir else runfolder)
    samples = process_samplesheet(
        samplesheet,
        trial_samplesheet,
        reverse_complement=reverse_complement,
        determine=True,
    )

    # 1105 is small subset of tiles I found to generally work
    cmd_args = [
        "bcl2fastq",
        "--tiles",
        "1105",
        "--no-lane-splitting",
        "--runfolder-dir",
        runfolder,
        "--output-dir",
        tmpd,
        "--barcode-mismatches",
        barcode_mismatches,
        "--loading-threads",
        loading,
        "--processing-threads",
        processing,
        "--writing-threads",
        writing,
        "--sample-sheet",
        trial_samplesheet,
    ]
    if input_dir:
        cmd_args.extend(["--input-dir", input_dir])
    run_bcl2fastq(tmpd, cmd_args, determine=True)
    total_size = get_file_sizes(tmpd)
    shutil.rmtree(tmpd)
    return samples, total_size


def run_determination_step(
    input_dir,
    runfolder,
    output_dir,
    samplesheet,
    loading,
    processing,
    writing,
    barcode_mismatches,
):
    """Pick between the original and reverse complemented index2 barcodes.

    A single tile is demultiplexed twice, first with index2 reverse
    complemented and then with the barcodes as written. The sample sheet
    whose run produced the larger total size of non-Undetermined FASTQ
    files is kept and the other one is deleted.

    Args:
        input_dir (str): passed as ``--input-dir`` when set
        runfolder (str): passed as ``--runfolder-dir``; also hosts the
            temporary output when ``output_dir`` is not set
        output_dir (str): parent directory for the temporary output
        samplesheet (str): path of the sample sheet to test
        loading (int): value for ``--loading-threads``
        processing (int): value for ``--processing-threads``
        writing (int): value for ``--writing-threads``
        barcode_mismatches (int): value for ``--barcode-mismatches``

    Returns:
        tuple: (list of sample IDs, path of the sample sheet to use)

    Exits with status 1 when both runs yield the same total size; both
    trial sample sheets are left on disk in that case.
    """
    date = datetime.now().strftime("%Y-%m-%d-%H%M-%S")
    rc_samplesheet = "%s.%s.rc.csv" % (samplesheet, date)
    orig_samplesheet = "%s.%s.orig.csv" % (samplesheet, date)

    # run the reverse complemented set first
    _, rc_file_size = _demultiplex_single_tile(
        input_dir,
        runfolder,
        output_dir,
        samplesheet,
        rc_samplesheet,
        True,
        loading,
        processing,
        writing,
        barcode_mismatches,
    )

    # try the original
    samples, orig_file_size = _demultiplex_single_tile(
        input_dir,
        runfolder,
        output_dir,
        samplesheet,
        orig_samplesheet,
        False,
        loading,
        processing,
        writing,
        barcode_mismatches,
    )

    if rc_file_size > orig_file_size:
        logging.info("Using reverse complement of Index2 to demultiplex")
        os.remove(orig_samplesheet)
        return samples, rc_samplesheet
    elif rc_file_size == orig_file_size:
        logging.critical(
            (
                "The original and reverse complemented barcodes "
                "yielded the same number of demultiplexed reads."
            )
        )
        sys.exit(1)
    else:
        logging.info("Using the original barcodes to demultiplex")
        os.remove(rc_samplesheet)
        return samples, orig_samplesheet
263 | 
264 | 
def xml_to_df(stats_xml):
    """Load per-lane barcode counts from a DemultiplexingStats.xml file.

    ``Sample`` elements named ``all`` or ``unknown`` and ``Barcode`` elements
    named ``all`` are skipped. When a sample has several barcodes reporting
    the same lane number, the last count read is the one kept.

    Args:
        stats_xml (str): path to the XML file

    Returns:
        pandas.DataFrame: one column per sample, indexed by lane number, or
        None (after logging a warning) when the file does not exist
    """
    if not os.path.exists(stats_xml):
        logging.warning("Could not find file %s", stats_xml)
        return None

    logging.info("Generating demultiplexing stats file")
    root = ET.parse(stats_xml).getroot()
    counts = {}
    for sample in root.iter("Sample"):
        sample_name = sample.get("name")
        if sample_name in ("all", "unknown"):
            continue
        per_lane = counts[sample_name] = {}
        for barcode in sample.iter("Barcode"):
            if barcode.get("name") != "all":
                for lane in barcode.iter("Lane"):
                    per_lane[lane.get("number")] = int(
                        lane.findtext("BarcodeCount")
                    )
    return pd.DataFrame(counts)
287 | 
288 | 
def barplot_distribution(df, out_file):
    """Save a stacked bar chart of ``df`` to ``out_file``.

    The figure is 6 inches tall and at least 12 inches wide, growing by one
    inch for every ten rows of ``df``.

    Args:
        df (pandas.DataFrame): values to plot, one bar per row
        out_file (str): path the figure is written to
    """
    fig_width = max(len(df) / 10, 12)
    fig, axis = plt.subplots(figsize=(fig_width, 6))
    df.plot(kind="bar", stacked=True, ax=axis)
    fig.savefig(out_file, bbox_inches="tight")
    plt.close()
295 | 
296 | 
def Lc(x):
    """Compute the ordinary and generalized Lorenz curve of a list.

    Non-finite entries (NaN, inf) are dropped before the values are sorted.

    Args:
        x: non-empty sequence of non-negative numbers

    Returns:
        tuple: ``(p, L, Lg)`` arrays of equal length; ``p`` is the cumulative
        share of observations, ``L`` the cumulative share of the total and
        ``Lg`` is ``L`` scaled by the mean. Each starts with 0.

    Raises:
        ValueError: when a finite value is negative

    >>> import numpy as np
    >>> p, L, Lg = Lc([1, 2, np.nan, 7, 8])
    >>> len(p) == len(L) == len(Lg)
    True
    >>> bool(np.allclose(p, [0, 0.25, 0.5, 0.75, 1]))
    True
    >>> bool(np.allclose(Lg[1:4], [0.25, 0.75, 2.5]))
    True
    >>> Lc([1, 2, np.nan, 7, -8]) # doctest: +ELLIPSIS
    Traceback (most recent call last):
     ...
    ValueError: x contained negative number
    """
    assert len(x) > 0, "x is empty"
    values = np.array(x, dtype=float)
    values = values[np.isfinite(values)]
    if values.min() < 0:
        raise ValueError("x contained negative number")
    values.sort(kind="mergesort")
    count = float(len(values))
    # prepend the origin so every curve starts at 0
    p = np.append([0], np.arange(1, count + 1) / count)
    L = np.append([0], values.cumsum() / values.sum())
    Lg = L * np.mean(values)
    return p, L, Lg
330 | 
331 | 
def lorenz_curve(df, out_file):
    """Save a Lorenz curve of the row totals of ``df`` to ``out_file``.

    The curve is drawn together with a dashed black diagonal from (0, 0) to
    (1, 1) for reference.

    Args:
        df (pandas.DataFrame): counts; each row is summed across its columns
        out_file (str): path the figure is written to
    """
    # the generalized curve is not plotted
    p, L, _ = Lc(df.sum(axis=1).values)
    f, ax = plt.subplots(figsize=(8, 6))
    # draw on the axes created above rather than pyplot's current axes
    ax.plot(p, L)
    ax.plot([0, 1], color="black", linestyle="--")
    ax.set(title="Distribution of Barcode Mapped Reads")
    f.savefig(out_file, bbox_inches="tight")
    plt.close(f)
340 | 
341 | 
def compile_demultiplex_stats(runfolder, out_dir):
    """Summarize ``Stats/DemultiplexingStats.xml`` as a CSV and two plots.

    Writes ``demultiplexing_stats.csv`` (one ``sample,count`` line per
    sample, no header row), ``demultiplexing_distribution.pdf`` and
    ``demultiplexing_distribution_curve.pdf`` into ``runfolder``. Nothing is
    written when the XML file cannot be found.

    Args:
        runfolder (str): directory receiving the CSV and PDF files
        out_dir (str): bcl2fastq output directory containing ``Stats``
    """
    stats_xml = os.path.join(
        os.path.abspath(out_dir), "Stats", "DemultiplexingStats.xml"
    )
    df = xml_to_df(stats_xml)
    if df is not None:
        # header=False keeps the two-column layout on pandas versions whose
        # Series.to_csv writes a header row by default
        df.sum().to_csv(
            os.path.join(runfolder, "demultiplexing_stats.csv"), header=False
        )
        # plots cover samples only; errors="ignore" makes a missing
        # "Undetermined" row a no-op whichever exception drop would raise
        dft = df.transpose().drop("Undetermined", axis=0, errors="ignore")
        barplot_distribution(
            dft, os.path.join(runfolder, "demultiplexing_distribution.pdf")
        )
        lorenz_curve(
            dft, os.path.join(runfolder, "demultiplexing_distribution_curve.pdf")
        )
359 | 
360 | 
# Unknown options are not rejected: together with the UNPROCESSED
# `bcl2fastq_args` argument they are collected and forwarded to bcl2fastq.
@click.command(
    context_settings=dict(
        help_option_names=["-h", "--help"],
        ignore_unknown_options=True,
    )
)
@click.option(
    "-i",
    "--input-dir",
    default=None,
    show_default=True,
    help="path to input directory; default is RUNFOLDER-DIR/Data/Intensities/BaseCalls",
)
@click.option(
    "-R",
    "--runfolder-dir",
    default=os.path.realpath("."),
    show_default=True,
    help="path to directory containing run data",
)
@click.option(
    "-o",
    "--output-dir",
    default=None,
    help="path to demultiplexed output; default is same as INPUT-DIR",
)
@click.option(
    "--sample-sheet",
    default=None,
    help="file path to sample sheet; default is RUNFOLDER-DIR/SampleSheet.csv",
)
@click.option(
    "--loading",
    default=12,
    type=int,
    show_default=True,
    help="number of threads used for loading BCL data",
)
@click.option(
    "--processing",
    default=24,
    type=int,
    show_default=True,
    help="number of threads used for processing demultiplexed data",
)
@click.option(
    "--writing",
    default=12,
    type=int,
    show_default=True,
    help="number of threads used for writing FASTQ data",
)
@click.option(
    "--barcode-mismatches",
    default=0,
    type=int,
    show_default=True,
    help="number of allowed mismatches per index",
)
@click.option(
    "--keep-tmp",
    is_flag=True,
    default=False,
    show_default=True,
    help="save Undetermined reads",
)
@click.option(
    "--reverse-complement",
    is_flag=True,
    default=False,
    show_default=True,
    help="reverse complement index 2 of the sample sheet",
)
@click.option(
    "--no-wait",
    is_flag=True,
    default=False,
    show_default=True,
    help="process the run without checking its completion status",
)
@click.option(
    "--overwrite",
    is_flag=True,
    default=False,
    show_default=True,
    help="overwrite existing fastq files in the output directory",
)
@click.option(
    "--determine",
    is_flag=True,
    default=False,
    show_default=True,
    help="use barcodes in samplesheet as well as the reverse complement of index 2, then demultiplex with best",
)
@click.option(
    "--no-cleanup",
    is_flag=True,
    default=False,
    show_default=True,
    help="skip all cleaning up -- do not rename fastq output and do not delete undetermined files",
)
@click.option(
    "--delay",
    default=14400,
    type=int,
    show_default=True,
    help="number of seconds to sleep after finding RTAComplete.txt -- applies only when waiting for a run to complete",
)
@click.argument("bcl2fastq_args", nargs=-1, type=click.UNPROCESSED)
def bcl2fastq(
    input_dir,
    runfolder_dir,
    output_dir,
    sample_sheet,
    loading,
    processing,
    writing,
    barcode_mismatches,
    keep_tmp,
    reverse_complement,
    no_wait,
    overwrite,
    determine,
    no_cleanup,
    delay,
    bcl2fastq_args,
):
    """Runs bcl2fastq2, creating fastqs and concatenating fastqs across lanes.
    Undetermined files are deleted by default.

    Any arguments not matching those outlined below will be sent to the
    `bcl2fastq` call.
    """
    # NOTE: the docstring above is printed by click as the command's help
    # text, so editing it changes `bcl_to_fastq --help`.
    try:
        if not sample_sheet:
            samplesheet = get_samplesheet(runfolder_dir)
        else:
            samplesheet = get_samplesheet(sample_sheet)
    except OSError:
        logging.critical("Could not find sample sheet CSV.")
        sys.exit(1)

    # will wait on the run to complete without ever checking for samples in
    # the samplesheet
    if not determine:
        # new samplesheet written each time leaving the original unchanged
        date = datetime.now().strftime("%Y-%m-%d-%H%M-%S")
        new_samplesheet = "%s.%s.csv" % (samplesheet, date)
        if os.path.exists(os.path.join(runfolder_dir, "bcl2fastq.log")):
            # this run has already been converted, so don't reverse complement
            # just get the sample names
            if reverse_complement:
                logging.warning(
                    (
                        "reverse complementing has been skipped as "
                        "a log file (bcl2fastq.log) was found"
                    )
                )
            samples = process_samplesheet(samplesheet, new_samplesheet, False)
        else:
            samples = process_samplesheet(
                samplesheet, new_samplesheet, reverse_complement
            )
        if len(samples) == 0:
            logging.critical(
                (
                    "No samples were found in the SampleSheet. "
                    "Please check its formatting."
                )
            )
            sys.exit(1)

    # check for RTAComplete.txt and sleep for delay
    completion_success = wait_for_completion(runfolder_dir, no_wait, delay)
    if not completion_success:
        logging.critical("Run did not complete as planned. Exiting.")
        sys.exit(1)

    # set where we're going to dump the result files
    if output_dir:
        fastq_dir = os.path.realpath(output_dir)
    else:
        fastq_dir = os.path.abspath(
            os.path.join(runfolder_dir, "Data", "Intensities", "BaseCalls")
        )

    # check original and reverse complement using a few tiles; this branch
    # supplies `samples` and `new_samplesheet` when --determine is set
    if determine:
        # need to add input_dir, output_dir
        samples, new_samplesheet = run_determination_step(
            input_dir,
            runfolder_dir,
            output_dir,
            samplesheet,
            loading,
            processing,
            writing,
            barcode_mismatches,
        )

    # run bcl2fastq on the run folder
    cmd_args = [
        "bcl2fastq",
        "--sample-sheet",
        new_samplesheet,
        "--loading-threads",
        loading,
        "--processing-threads",
        processing,
        "--writing-threads",
        writing,
        "--barcode-mismatches",
        barcode_mismatches,
        "--no-lane-splitting",
        "--runfolder-dir",
        runfolder_dir,
    ] + list(bcl2fastq_args)
    if output_dir:
        cmd_args.extend(["--output-dir", output_dir])
    if input_dir:
        cmd_args.extend(["--input-dir", input_dir])
    run_bcl2fastq(runfolder_dir, cmd_args)

    # parse DemultiplexingStats.xml into a csv with summary plots
    compile_demultiplex_stats(runfolder_dir, fastq_dir)

    # TODO: deprecate!
    # write file with sample names for downstream parallelization
    with open(os.path.join(fastq_dir, "SAMPLES"), "w") as ofh:
        print(*samples, sep="\n", file=ofh)

    # cleanup the output directory
    if not no_cleanup:
        logging.info("Cleaning up output directory [%s]", fastq_dir)
        for f in glob(os.path.join(fastq_dir, "*.fastq*")):
            # the glob also matches names such as x.fastq.tmp; skip those
            if not f.endswith(".gz") and not f.endswith(".fastq"):
                continue
            filename = os.path.basename(f)
            if filename.startswith("Undetermined_") and not keep_tmp:
                logging.info("Deleting %s", filename)
                os.remove(f)
            else:
                try:
                    # AD-332-A10_S1_R1_001.fastq.gz --> AD-332-A10_R1.fastq.gz
                    # any other number of "_" separated parts raises ValueError
                    sample_name, _, read_index, ext = filename.split("_")
                    # munge the file name
                    new_file_name = "%s_%s.%s" % (
                        sample_name,
                        read_index,
                        ext.partition(".")[-1],
                    )
                    # prepend the path
                    new_file_name = os.path.join(os.path.dirname(f), new_file_name)
                    if overwrite and os.path.exists(new_file_name):
                        os.remove(new_file_name)
                    os.rename(f, new_file_name)
                except ValueError:
                    logging.warning(
                        "Renaming skipped: the output dir contains conflicting FASTQ file for %s",
                        f,
                    )
623 | 
# Invoke the click command when this file is executed as a script.
if __name__ == "__main__":
    bcl2fastq()
626 | 


--------------------------------------------------------------------------------