├── .gitignore ├── LICENSE.txt ├── README.md ├── Snakefile.count_and_push ├── Snakefile.count_and_push_test ├── Snakefile.count_dups ├── bcl_direct_reader.py ├── cbcl_read.py ├── count_well_duplicates.py ├── doit.sh ├── dump_slocs.py ├── get_cached_targets.sh ├── parse_slocs.py ├── plan.md ├── prepare_cluster_indexes.py ├── summary_to_wiki.py ├── summary_to_wiki2.py ├── target.py └── test ├── __init__.py ├── bad1.list ├── bad2.list ├── old └── test_bcl_direct_reader.py ├── small.list ├── test_count_well_duplicates.py └── test_target.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | tmp 4 | tmp_* 5 | *.list 6 | sge_output 7 | .nfs* 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Edinburgh Genomics 2 | Authors: Judith Risse , 3 | Timothy Booth 4 | 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Detection of Duplicates on Patterned Flowcells 2 | ============================================== 3 | 4 | ***THE PLN:*** Directly detect local duplicates in patterned flow cells without alignment or demultiplexing. 5 | 6 | Overview 7 | -------- 8 | 9 | We sample a number targets on each tile of an Illumina Hiseq 4000 (or Hiseq X) flowcell to scan for duplicates. A target is a specific well and the surrounding wells out to a pre-defined distance (say, five layers out in the honeycomb). For each target we read a substring of the sequence for every well directly from the raw BCL files. Then we count the duplicates found by looking for wells that match the centre sequence. 10 | 11 | Usage 12 | ----- 13 | 14 | ```prepare_cluster_indexes.py``` will come up with a list of cluster locations (targets) to be sampled, and work out the co-ordinates of all the surrounding wells. It parses the standard .locs file found in the Data directory for every Illumina run. 
Note that the layout of wells is specific to the generation of flowcell rather than being specific to the machine, so watch out if you are planning to use the same locations file for scanning multiple flowcells - check that the .locs files are indeed the same. 15 | 16 | ```count_well_duplicates.py``` will read the data from your BCL files and output duplication stats. It needs to be supplied with a run to be analysed and also a targets file produced with the ```prepare_cluster_indexes.py``` script. 17 | 18 | Results 19 | ------- 20 | 21 | A summary of repeats found will be printed for each tile, with grand totals per lane at the end. For the per-lane stats, proportions will also be calculated. 22 | 23 | Below is the end of the output for a sample run, showing the last two tiles and the summary for lane 8. 24 | 25 | ... 26 | Lane: 8 Tile: 2223 Targets: 1712/2500 27 | Level: 1 Wells: 10270 Dups: 6 Hit: 6 AccO: 6 AccI: 32 28 | Level: 2 Wells: 20539 Dups: 13 Hit: 13 AccO: 19 AccI: 26 29 | Level: 3 Wells: 30800 Dups: 3 Hit: 3 AccO: 22 AccI: 13 30 | Level: 4 Wells: 41058 Dups: 5 Hit: 5 AccO: 27 AccI: 10 31 | Level: 5 Wells: 51318 Dups: 5 Hit: 5 AccO: 32 AccI: 5 32 | Lane: 8 Tile: 2224 Targets: 1697/2500 33 | Level: 1 Wells: 10180 Dups: 11 Hit: 9 AccO: 9 AccI: 32 34 | Level: 2 Wells: 20359 Dups: 12 Hit: 10 AccO: 18 AccI: 24 35 | Level: 3 Wells: 30530 Dups: 8 Hit: 7 AccO: 24 AccI: 15 36 | Level: 4 Wells: 40703 Dups: 7 Hit: 4 AccO: 27 AccI: 9 37 | Level: 5 Wells: 50876 Dups: 13 Hit: 6 AccO: 32 AccI: 6 38 | LaneSummary: 8 Tiles: 96 Targets: 157089/240000 39 | Level: 1 Wells: 942452 Dups: 583 (0.001) Hit: 553 (0.004) AccO: 553 (0.004) AccI: 1933 (0.012) 40 | Level: 2 Wells: 1884692 Dups: 498 (0.000) Hit: 449 (0.003) AccO: 988 (0.006) AccI: 1400 (0.009) 41 | Level: 3 Wells: 2826472 Dups: 434 (0.000) Hit: 371 (0.002) AccO: 1344 (0.009) AccI: 966 (0.006) 42 | Level: 4 Wells: 3768293 Dups: 376 (0.000) Hit: 308 (0.002) AccO: 1637 (0.010) AccI: 608 (0.004) 43 | Level: 5 Wells: 4709687 Dups: 393 (0.000) Hit: 309 (0.002) AccO: 1933 (0.012) AccI: 309 (0.002) 44 | 45 | So in the above printout: 46 | 47 | * 2500 targets (as generated by prepare_cluster_indexes) were scanned 48 | * The targets extended to 5 levels away from each centre well 49 | * 96 tiles were inspected, totalling 240000 potential targets 50 | * On tile 2223, 1712 of the targets had a good centre sequence and thus were actually counted 51 | * In total, 157089 of the 240000 targets were counted 52 | * Looking just at adjacent wells (level 1), in summary 942452 wells were inspected... 53 | * of which 583 were found to match the centre sequence (0.1% of wells) 54 | * which equated to 553 targets (0.4% of targets) if each target was counted just once 55 | * Totting up cumulative summary counts across all five levels... 56 | * 988 of the 157089 targets counted had a duplicate at level 1 or 2 57 | * 1344 of the 157089 targets had a duplicate at level 1, 2 or 3 58 | * 1933 of the targets had a duplicate well at any level 59 | * 309 of the targets (0.2%) had a duplicate well at level 5 60 | * 608 (0.4%) had a duplicate in level 5 or 4 61 | 62 | BCL Direct Reader 63 | ----------------- 64 | 65 | The file ```bcl_direct_reader.py``` contains pure Python code for retrieving sequence from raw BCL files. For our purposes there is little to be gained from porting this to C as most of the time is spent Gunzipping the data. 66 | 67 | Run ```pydoc ./bcl_direct_reader.py``` for more info. 
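As a quick illustration, the reader can also be driven from your own Python code. This sketch mirrors the synopsis in the module docstring; the run path is a placeholder and the cluster indices are just example values.

```
from bcl_direct_reader import BCLReader

proj = BCLReader("/path/to/run_folder")   # the folder containing Data/ and RunInfo.xml
tile = proj.get_tile(1, 1101)             # lane 1, tile 1101

# Fetch a 20-cycle slice (cycles 20 to 39, counting from 0) for three clusters in one call,
# which is far more efficient than calling get_seq() per cluster.
all_seqs = tile.get_seqs([70657, 70658, 70659], start=20, end=40)

for idx, (seq, passed_filter) in sorted(all_seqs.items()):
    print(idx, seq, passed_filter)
```

Here ```passed_filter``` is the accept/reject flag taken from the corresponding .filter file for the tile.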
68 | 69 | Health Warning 70 | -------------- 71 | 72 | This code is not yet well tested, specifically it has not been validated on Hiseq X data, though it is designed to work on those flowcells. 73 | -------------------------------------------------------------------------------- /Snakefile.count_and_push: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: ft=python 3 | 4 | # This script is very specific to Edinburgh Genomics and our use of Confluence 5 | # Wiki. External users should make use of Snakefile.count_dups instead. 6 | 7 | # If $DATADIR is set, reads from that folder, else CWD 8 | # If $WORKDIR is set, works in that folder, else 9 | # ../../runqc/WellDuplicates/`basename $PWD` 10 | 11 | # Contents >>> 12 | # + Embedded BASH script to bootstrap the workflow 13 | # + Initialisation and configuration 14 | # + Helper functions 15 | # + The rules specific to this workflow 16 | # + More generic rules 17 | 18 | """true" ### Begin shell script part 19 | set -e ; set -u 20 | 21 | threads=${SNAKE_THREADS:-8} 22 | 23 | #You have to set CLUSTER_QUEUE. There is no default now! 24 | #Normally at EG it will be "casava" 25 | queue=${CLUSTER_QUEUE} 26 | 27 | #Set this if you want to enable re-running on an existing workdir 28 | rerun=${SNAKE_RERUN:-0} 29 | 30 | datadir="${DATADIR:-`pwd`}" 31 | 32 | #Don't run until there is a RTARead1Complete.txt touch file, 33 | #and a RunInfo.xml file (on NovoSeq we just have RTAComplete.txt?!) 34 | #And as we need to write to the Wiki, we need the main pipeline to 35 | #have already made the run page for us too. 36 | set -x 37 | compgen -G "$datadir/*upload_run_info_on_wiki*" >/dev/null 38 | test -e "$datadir"/RTARead1Complete.txt || test -e "$datadir"/RTAComplete.txt 39 | test -e "$datadir"/Data/Intensities/s.locs 40 | test -e "$datadir"/RunInfo.xml 41 | set +x 42 | 43 | workdir="${WORKDIR:-$datadir/../../runqc/WellDuplicates/`basename $datadir`}" 44 | workdir="`readlink -f "$workdir"`" 45 | #Make $workdir and in the process ensure the script only runs once. 46 | #Link the $datadir back to the $workdir 47 | if [ "$rerun" != 0 ] ; then set +e ; fi 48 | mkdir "$workdir" 49 | ln -sr "$datadir" "$workdir/datadir" 50 | 51 | #Before changing dir, get the real path to this file on the assumption that 52 | #count_well_duplicates.py may well be there. 53 | #I can't just prepend it to the PATH here as it won't carry across to cluster jobs. 54 | scriptdir="`dirname $0`" 55 | 56 | 57 | ## And off we go. 58 | if [ "${queue}" = none ] ; then 59 | set +e 60 | snakemake -s "$0" -j 1 --config workdir="$workdir" scriptdir="$scriptdir" -- "$@" 61 | else 62 | ## Settings specific to SLURM vs. SGE 63 | if [ -e /lustre/software ] ; then 64 | drmaa_args=" -p $queue" 65 | else 66 | drmaa_args=" -q $queue -S /bin/bash -p -10 -V \ 67 | -o "$workdir"/sge_output -e "$workdir"/sge_output" 68 | 69 | ## Ensure the cluster output is going to the right place. 70 | mkdir -p "$workdir"/sge_output 71 | fi 72 | 73 | set +e 74 | 75 | #Annoyingly I'm getting intermittent failures, so enable 3 retries as a crude 76 | #workaround. 
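#Note there is no early exit on success - each pass simply re-invokes Snakemake, which will find nothing left to do once all the targets are up to date.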
77 | for try in 1 2 3 4 ; do 78 | snakemake \ 79 | -s "$0" -j $threads -T \ 80 | --config workdir="$workdir" scriptdir="$scriptdir" \ 81 | -p --jobname "{rulename}.snakejob.{jobid}.sh" --rerun-incomplete \ 82 | --drmaa "$drmaa_args" \ 83 | "$@" 84 | sleep 5 ; done 85 | fi 86 | 87 | "exit""" ### End of shell script part 88 | 89 | #!/usr/bin/env snakemake 90 | from snakemake.utils import format 91 | import xml.etree.ElementTree as ET 92 | 93 | #Regular glob() is useful but it can be improved like so. 94 | import os 95 | from glob import glob as _glob 96 | glob = lambda pathname: sorted(_glob(os.path.expanduser(pathname))) 97 | 98 | workdir: config['workdir'] 99 | 100 | #Configuration options as constants 101 | 102 | TARGETS_TO_SAMPLE = 2500 103 | # LANES_TO_SAMPLE = "1 2 3 4 5 6 7 8" # see below 104 | READ_LENGTH = 50 105 | LEVELS_TO_SCAN = 5 106 | REPORT_VERBOSE = True 107 | 108 | ### Calculate some derived options 109 | 110 | #Find the scripts if they are in the same folder as this one, 111 | #even if it's not in the default PATH. 112 | if 'scriptdir' in config: 113 | _PATHSET = 'PATH=\'%s\'":$PATH" ' % config['scriptdir'] 114 | else: 115 | _PATHSET = '' 116 | 117 | PREP_INDICES = _PATHSET + "prepare_cluster_indexes.py" 118 | COUNT_WELL_DUPL = _PATHSET + "count_well_duplicates.py" 119 | SUMMARY_TO_WIKI = _PATHSET + "summary_to_wiki.py" 120 | SUMMARY_TO_WIKI2 = _PATHSET + "summary_to_wiki2.py" 121 | 122 | # Per-cluster config 123 | if os.path.exists('/lustre/software'): 124 | #Disabled on GSEG for now 125 | UPLOAD_TO_WIKI = 'echo upload_file_to_wiki' 126 | else: 127 | #It's essential this does not call the default system python2 128 | UPLOAD_TO_WIKI = 'env PYTHONPATH="/ifs/software/linux_x86_64/wiki_communication/current"' + \ 129 | ' /ifs/software/linux_x86_64/bin/python2' + \ 130 | ' /ifs/software/linux_x86_64/wiki_communication/current/bin/upload_file_to_wiki.py --real' 131 | 132 | #Get the run info 133 | run_info_root = ET.parse("datadir/RunInfo.xml").getroot() 134 | 135 | # Machine type is only used to determine the highest tile, so I'll make it 136 | # that you can just pass the tile number directly. Note the tiles in the XML 137 | # are not in order so I can't just take the last one! 138 | LAST_LANE, LAST_TILE = max(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_') 139 | 140 | # Lanes to sample is now variable sinde the arrival of Novaseq, so get it from 141 | # RunInfo.xml... 142 | LANES_TO_SAMPLE = range(1, int(LAST_LANE) + 1) 143 | 144 | # For most runs we want to start at read 20, but some runs only have 51 145 | # cycles in read1. 146 | num_cycles = int(run_info_root.find("Run/Reads/Read[@Number='1']").get('NumCycles')) 147 | if(num_cycles > READ_LENGTH + 20): 148 | START_POS = 20 149 | else: 150 | assert num_cycles > READ_LENGTH 151 | START_POS = 0 152 | END_POS = READ_LENGTH + START_POS 153 | 154 | ### Specific rules 155 | localrules: main, summarize_all_lanes, send_to_wiki, format_for_wiki, send_to_wiki2, format_for_wiki2 156 | 157 | """Main rule just defines everything to be generated. 158 | The shell script should have made me a new working folder with datadir 159 | being a symlink to the sequencer output directory. 
160 | """ 161 | rule main: 162 | input: txt = format("{TARGETS_TO_SAMPLE}targets_all_lanes.txt"), 163 | wiki = format("{TARGETS_TO_SAMPLE}targets_uploaded_to_wiki.touch"), 164 | wiki2 = format("{TARGETS_TO_SAMPLE}targets_acci1_to_wiki.touch") 165 | 166 | rule summarize_all_lanes: 167 | #sleep 10 crudely prevents re-running due to clock skew 168 | output: "{targets}targets_all_lanes.txt" 169 | input: 170 | expand( "{{targets}}targets_lane{lane}.txt", 171 | lane=LANES_TO_SAMPLE ) 172 | shell: "sleep 10 ; tail -n $(( {LEVELS_TO_SCAN} + 4 )) {input} > {output}" 173 | 174 | rule count_well_dupl: 175 | output: "{targets}targets_lane{lane}.txt" 176 | input: targfile = "{targets}clusters.list" 177 | params: summary = '-S' if not REPORT_VERBOSE else '' 178 | shell: 179 | "{COUNT_WELL_DUPL} -f {input.targfile} -n {wildcards.targets} -s {LAST_TILE} -r datadir" + 180 | " -i {wildcards.lane} -l {LEVELS_TO_SCAN} -x {START_POS} -y {END_POS}" + 181 | " {params.summary} > {output}" 182 | 183 | rule format_for_wiki: 184 | #Makes a Wiki page (in Wiki markup) that can go as a sub-page of the runpage 185 | #sleep 10 prevents re-running due to clock skew 186 | output: "{targets}targets_all_lanes.wiki" 187 | input: "{targets}targets_all_lanes.txt" 188 | shell: "sleep 10 ; {SUMMARY_TO_WIKI} < {input} > {output}" 189 | 190 | rule format_for_wiki2: 191 | #Makes a comment (in HTML markup) that can be attached to the runpage 192 | #sleep 10 prevents re-running due to clock skew 193 | output: "{targets}targets_acci1.wiki" 194 | input: "{targets}targets_all_lanes.txt" 195 | shell: "sleep 10 ; {SUMMARY_TO_WIKI2} < {input} > {output}" 196 | 197 | rule send_to_wiki: 198 | output: "{targets}targets_uploaded_to_wiki.touch" 199 | input: "{targets}targets_all_lanes.wiki" 200 | params: run_page = os.path.basename(os.path.realpath("datadir")), 201 | page_title = "well_duplicates_" + os.path.basename(os.path.realpath("datadir")) 202 | shell: 203 | #This rule will push the result to the wiki, creating the .touch file 204 | #which serves as a proxy for a successful upload. 205 | "{UPLOAD_TO_WIKI} --overwrite -f {input} -t '{params.page_title}' -p '{params.run_page}' > {output}" 206 | 207 | # Send just the abbreviated results as a table in the comments, as requested by Karim 208 | rule send_to_wiki2: 209 | output: "{targets}targets_acci1_to_wiki.touch" 210 | input: "{targets}targets_acci1.wiki" 211 | params: run_page = os.path.basename(os.path.realpath("datadir")) 212 | shell: 213 | #This rule will push the result to the wiki, creating the .touch file 214 | #which serves as a proxy for a successful upload. 215 | "{UPLOAD_TO_WIKI} -f {input} --comment -t '{params.run_page}' > {output}" 216 | 217 | rule prep_indices: 218 | output: "{targets}clusters.list" 219 | run: 220 | #We don't want to re-calculate indices every time, but we don't 221 | #want to assume to locs files are all identical. So let's have 222 | #a shared pool of cluster lists based on the md5sum of the s.locs 223 | slocs = "datadir/Data/Intensities/s.locs" 224 | 225 | if os.path.exists("../cluster_lists"): 226 | md5, = [ l.split()[0] for l in 227 | shell("md5sum {slocs}", iterable=True) ] 228 | cached_list = format("../cluster_lists/{wildcards.targets}clusters_{md5}.list") 229 | 230 | if not os.path.exists(format("{cached_list}.done")): 231 | #Make it now. Slight paranoia regarding race condition on the file. 232 | #If two processes try to generate the .list file at once then one should 233 | #at least fail with a modicum of grace. 
If PREP_INDICES fails for 234 | #some reason, future jobs will fail until you fix or delete the partial output. 235 | shell("set -o noclobber ;" + 236 | " {PREP_INDICES} -n {wildcards.targets} -f {slocs} > {cached_list}") 237 | shell("touch {cached_list}.done") 238 | shell("ln -sr {cached_list} {output}") 239 | else: 240 | #No-cache mode it is, then 241 | shell("{PREP_INDICES} -n {wildcards.targets} -f {slocs} > {output}") 242 | 243 | 244 | ### Generic rules 245 | 246 | # none 247 | -------------------------------------------------------------------------------- /Snakefile.count_and_push_test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: ft=python 3 | 4 | # This script is very specific to Edinburgh Genomics and our use of Confluence 5 | # Wiki. External users should make use of Snakefile.count_dups instead. 6 | 7 | # If $DATADIR is set, reads from that folder, else CWD 8 | # If $WORKDIR is set, works in that folder, else 9 | # ../../runqc/WellDuplicates/`basename $PWD` 10 | 11 | # Contents >>> 12 | # + Embedded BASH script to bootstrap the workflow 13 | # + Initialisation and configuration 14 | # + Helper functions 15 | # + The rules specific to this workflow 16 | # + More generic rules 17 | 18 | """true" ### Begin shell script part 19 | set -e ; set -u 20 | 21 | threads=${SNAKE_THREADS:-8} 22 | 23 | #You have to set CLUSTER_QUEUE. There is no default now! 24 | #Normally at EG it will be "casava" 25 | queue=${CLUSTER_QUEUE} 26 | 27 | #Set this if you want to enable re-running on an existing workdir 28 | rerun=${SNAKE_RERUN:-0} 29 | 30 | datadir="${DATADIR:-`pwd`}" 31 | 32 | #Don't run until there is a RTARead1Complete.txt touch file, 33 | #and a RunInfo.xml file (on NovoSeq we just have RTAComplete.txt?!) 34 | #And as we need to write to the Wiki, we need the main pipeline to 35 | #have already made the run page for us too. 36 | set -x 37 | compgen -G "$datadir/*upload_run_info_on_wiki*" >/dev/null 38 | test -e "$datadir"/RTARead1Complete.txt || test -e "$datadir"/RTAComplete.txt 39 | test -e "$datadir"/Data/Intensities/s.locs 40 | test -e "$datadir"/RunInfo.xml 41 | set +x 42 | 43 | workdir="${WORKDIR:-$datadir/../../runqc/WellDuplicates/`basename $datadir`}" 44 | workdir="`readlink -f "$workdir"`" 45 | #Make $workdir and in the process ensure the script only runs once. 46 | #Link the $datadir back to the $workdir 47 | if [ "$rerun" != 0 ] ; then set +e ; fi 48 | mkdir "$workdir" 49 | ln -sr "$datadir" "$workdir/datadir" 50 | 51 | #Before changing dir, get the real path to this file on the assumption that 52 | #count_well_duplicates.py may well be there. 53 | #I can't just prepend it to the PATH here as it won't carry across to cluster jobs. 54 | scriptdir="`dirname $0`" 55 | 56 | 57 | ## And off we go. 58 | if [ "${queue}" = none ] ; then 59 | set +e 60 | snakemake -s "$0" -j 1 --config workdir="$workdir" scriptdir="$scriptdir" -- "$@" 61 | else 62 | ## Settings specific to SLURM vs. SGE 63 | if [ -e /lustre/software ] ; then 64 | drmaa_args=" -p $queue" 65 | else 66 | drmaa_args=" -q $queue -S /bin/bash -p -10 -V \ 67 | -o "$workdir"/sge_output -e "$workdir"/sge_output" 68 | 69 | ## Ensure the cluster output is going to the right place. 70 | mkdir -p "$workdir"/sge_output 71 | fi 72 | 73 | set +e 74 | 75 | #Annoyingly I'm getting intermittent failures, so enable 3 retries as a crude 76 | #workaround. 
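#(In this test copy the loop below only makes a single pass, so effectively there are no retries.)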
77 | for try in 1 ; do 78 | snakemake \ 79 | -s "$0" -j $threads -T \ 80 | --config workdir="$workdir" scriptdir="$scriptdir" \ 81 | -p --jobname "{rulename}.snakejob.{jobid}.sh" --rerun-incomplete \ 82 | --drmaa "$drmaa_args" \ 83 | "$@" 84 | sleep 5 ; done 85 | fi 86 | 87 | "exit""" ### End of shell script part 88 | 89 | #!/usr/bin/env snakemake 90 | from snakemake.utils import format 91 | import xml.etree.ElementTree as ET 92 | 93 | #Regular glob() is useful but it can be improved like so. 94 | import os 95 | from glob import glob as _glob 96 | glob = lambda pathname: sorted(_glob(os.path.expanduser(pathname))) 97 | 98 | workdir: config['workdir'] 99 | 100 | #Configuration options as constants 101 | 102 | TARGETS_TO_SAMPLE = 2500 103 | # LANES_TO_SAMPLE = "1 2 3 4 5 6 7 8" # see below 104 | READ_LENGTH = 50 105 | LEVELS_TO_SCAN = 5 106 | REPORT_VERBOSE = True 107 | 108 | ### Calculate some derived options 109 | 110 | #Find the scripts if they are in the same folder as this one, 111 | #even if it's not in the default PATH. 112 | if 'scriptdir' in config: 113 | _PATHSET = 'PATH=\'%s\'":$PATH" ' % config['scriptdir'] 114 | else: 115 | _PATHSET = '' 116 | 117 | PREP_INDICES = _PATHSET + "prepare_cluster_indexes.py" 118 | COUNT_WELL_DUPL = _PATHSET + "count_well_duplicates.py" 119 | SUMMARY_TO_WIKI = _PATHSET + "summary_to_wiki.py" 120 | SUMMARY_TO_WIKI2 = _PATHSET + "summary_to_wiki2.py" 121 | 122 | # Per-cluster config 123 | # DEBUG... 124 | if True or os.path.exists('/lustre/software'): 125 | #Disabled on GSEG for now 126 | UPLOAD_TO_WIKI = 'echo upload_file_to_wiki' 127 | else: 128 | #It's essential this does not call the default system python2 129 | UPLOAD_TO_WIKI = 'env PYTHONPATH="/ifs/software/linux_x86_64/wiki_communication/current"' + \ 130 | ' /ifs/software/linux_x86_64/bin/python2' + \ 131 | ' /ifs/software/linux_x86_64/wiki_communication/current/bin/upload_file_to_wiki.py --real' 132 | 133 | #Get the run info 134 | run_info_root = ET.parse("datadir/RunInfo.xml").getroot() 135 | 136 | # Machine type is only used to determine the highest tile, so I'll make it 137 | # that you can just pass the tile number directly. Note the tiles in the XML 138 | # are not in order so I can't just take the last one! 139 | LAST_LANE, LAST_TILE = max(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_') 140 | 141 | # Lanes to sample is now variable since the arrival of NovaSeq, so get it from 142 | # RunInfo.xml... 143 | LANES_TO_SAMPLE = range(1, int(LAST_LANE) + 1) 144 | 145 | # Special option for NovaSeq (and a hacky way to activate it) 146 | if LAST_TILE >= '2400': 147 | LANES_TO_SAMPLE = [ '{}{}'.format(l,s) for l in LANES_TO_SAMPLE for s in 'TB' ] 148 | 149 | # For most runs we want to start at read 20, but some runs only have 51 150 | # cycles in read1. 151 | num_cycles = int(run_info_root.find("Run/Reads/Read[@Number='1']").get('NumCycles')) 152 | if(num_cycles > READ_LENGTH + 20): 153 | START_POS = 20 154 | else: 155 | assert num_cycles > READ_LENGTH 156 | START_POS = 0 157 | END_POS = READ_LENGTH + START_POS 158 | 159 | ### Specific rules 160 | localrules: main, summarize_all_lanes, send_to_wiki, format_for_wiki, send_to_wiki2, format_for_wiki2 161 | 162 | """Main rule just defines everything to be generated. 163 | The shell script should have made me a new working folder with datadir 164 | being a symlink to the sequencer output directory. 
165 | """ 166 | rule main: 167 | input: txt = format("{TARGETS_TO_SAMPLE}targets_all_lanes.txt"), 168 | wiki = format("{TARGETS_TO_SAMPLE}targets_uploaded_to_wiki.touch"), 169 | wiki2 = format("{TARGETS_TO_SAMPLE}targets_acci1_to_wiki.touch") 170 | 171 | rule summarize_all_lanes: 172 | #sleep 2 crudely prevents re-running due to clock skew 173 | output: "{targets}targets_all_lanes.txt" 174 | input: 175 | expand( "{{targets}}targets_lane{lane}.txt", 176 | lane=LANES_TO_SAMPLE ) 177 | shell: "sleep 2 ; tail -n $(( {LEVELS_TO_SCAN} + 5 )) {input} > {output}" 178 | 179 | rule count_well_dupl: 180 | output: "{targets}targets_lane{lane,\d}{surface,[TB]?}.txt" 181 | input: targfile = "{targets}clusters.list" 182 | params: 183 | summary = '-S' if not REPORT_VERBOSE else '', 184 | tile = lambda wc: '-t "1..[02468]"' if wc.surface == 'T' else '-t "2..[02468]"' if wc.surface == 'B' else '' 185 | shell: 186 | "{COUNT_WELL_DUPL} -f {input.targfile} -n {wildcards.targets} -s {LAST_TILE} -r datadir" + 187 | " -i {wildcards.lane} -l {LEVELS_TO_SCAN} -x {START_POS} -y {END_POS} {params.tile}" + 188 | " {params.summary} > {output}" 189 | 190 | rule format_for_wiki: 191 | #Makes a Wiki page (in Wiki markup) that can go as a sub-page of the runpage 192 | #sleep 2 crudely prevents re-running due to clock skew 193 | output: "{targets}targets_all_lanes.wiki" 194 | input: "{targets}targets_all_lanes.txt" 195 | shell: "sleep 2 ; {SUMMARY_TO_WIKI} < {input} > {output}" 196 | 197 | rule format_for_wiki2: 198 | #Makes a comment (in HTML markup) that can be attached to the runpage 199 | #sleep 2 prevents re-running due to clock skew 200 | output: "{targets}targets_acci1.wiki" 201 | input: "{targets}targets_all_lanes.txt" 202 | shell: "sleep 2 ; {SUMMARY_TO_WIKI2} < {input} > {output}" 203 | 204 | rule send_to_wiki: 205 | output: "{targets}targets_uploaded_to_wiki.touch" 206 | input: "{targets}targets_all_lanes.wiki" 207 | params: run_page = os.path.basename(os.path.realpath("datadir")), 208 | page_title = "well_duplicates_" + os.path.basename(os.path.realpath("datadir")) 209 | shell: 210 | #This rule will push the result to the wiki, creating the .touch file 211 | #which serves as a proxy for a successful upload. 212 | "{UPLOAD_TO_WIKI} --overwrite -f {input} -t '{params.page_title}' -p '{params.run_page}' > {output}" 213 | 214 | # Send just the abbreviated results as a table in the comments, as requested by Karim 215 | rule send_to_wiki2: 216 | output: "{targets}targets_acci1_to_wiki.touch" 217 | input: "{targets}targets_acci1.wiki" 218 | params: run_page = os.path.basename(os.path.realpath("datadir")) 219 | shell: 220 | #This rule will push the result to the wiki, creating the .touch file 221 | #which serves as a proxy for a successful upload. 222 | "{UPLOAD_TO_WIKI} -f {input} --comment -t '{params.run_page}' > {output}" 223 | 224 | rule prep_indices: 225 | output: "{targets}clusters.list" 226 | run: 227 | #We don't want to re-calculate indices every time, but we don't 228 | #want to assume to locs files are all identical. So let's have 229 | #a shared pool of cluster lists based on the md5sum of the s.locs 230 | slocs = "datadir/Data/Intensities/s.locs" 231 | 232 | if os.path.exists("../cluster_lists"): 233 | md5, = [ l.split()[0] for l in 234 | shell("md5sum {slocs}", iterable=True) ] 235 | cached_list = format("../cluster_lists/{wildcards.targets}clusters_{md5}.list") 236 | 237 | if not os.path.exists(format("{cached_list}.done")): 238 | #Make it now. Slight paranoia regarding race condition on the file. 
239 | #If two processes try to generate the .list file at once then one should 240 | #at least fail with a modicum of grace. If PREP_INDICES fails for 241 | #some reason, future jobs will fail until you fix or delete the partial output. 242 | shell("set -o noclobber ;" + 243 | " {PREP_INDICES} -n {wildcards.targets} -f {slocs} > {cached_list}") 244 | shell("touch {cached_list}.done") 245 | shell("ln -sr {cached_list} {output}") 246 | else: 247 | #No-cache mode it is, then 248 | shell("{PREP_INDICES} -n {wildcards.targets} -f {slocs} > {output}") 249 | 250 | 251 | ### Generic rules 252 | 253 | # none 254 | -------------------------------------------------------------------------------- /Snakefile.count_dups: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: ft=python 3 | 4 | # Execution script for count_well_duplicates.py based on Snakemake. 5 | # Requires Snakemake in your default PATH, and the well_duplicates Python scripts 6 | # to be in the same dir as this script or in your default PATH (remember 7 | # temporary settings of $PATH don't transfer if you spawn jobs over the cluster). 8 | # If $DATADIR is set, reads from that folder, else CWD 9 | # If $WORKDIR is set, works in that folder, else 10 | # ../../runqc/WellDuplicates/`basename $PWD` 11 | 12 | # You don't need a cluster environment to run this, but in any case you'll need 13 | # to set $CLUSTER_QUEUE to something, even if you set it to "none". 14 | 15 | # Contents >>> 16 | # + Embedded BASH script to bootstrap the workflow 17 | # + Initialisation and configuration 18 | # + Helper functions 19 | # + The rules specific to this workflow 20 | # + More generic rules 21 | 22 | """true" ### Begin shell script part 23 | set -e ; set -u ; set -x 24 | 25 | threads=${SNAKE_THREADS:-8} 26 | 27 | #You have to set CLUSTER_QUEUE. There is no default now! 28 | #Normally at EG it will be "casava" 29 | #It can be "none" if you want to just run on the local machine. 30 | queue=${CLUSTER_QUEUE} 31 | 32 | datadir="${DATADIR:-`pwd`}" 33 | 34 | #Set SNAKE_RERUN=1 if you want to enable re-running on an existing workdir 35 | rerun=${SNAKE_RERUN:-0} 36 | 37 | #Don't run until there is a RTARead1Complete.txt touch file, 38 | #and a RunInfo.xml 39 | test -e "$datadir"/RTARead1Complete.txt 40 | test -e "$datadir"/Data/Intensities/s.locs 41 | test -e "$datadir"/RunInfo.xml 42 | 43 | workdir="${WORKDIR:-$datadir/../../runqc/WellDuplicates/`basename $datadir`}" 44 | workdir="`readlink -f "$workdir"`" 45 | #Make $workdir and in the process ensure the script only runs once. 46 | #Link the $datadir back to the $workdir 47 | if [ "$rerun" != 0 ] ; then set +e ; fi 48 | mkdir "$workdir" 49 | ln -sr "$datadir" "$workdir/datadir" 50 | 51 | #Before changing dir, get the real path to this file on the assumption that 52 | #count_well_duplicates.py may well be there. 53 | #I can't just prepend it to the PATH here as it won't carry across to cluster jobs. 54 | scriptdir="`dirname $0`" 55 | 56 | ## And off we go. 57 | cd /tmp 58 | if [ "${queue}" = none ] ; then 59 | set +e 60 | snakemake -s "$0" -j 1 --config workdir="$workdir" scriptdir="$scriptdir" -- "$@" 61 | else 62 | ## Ensure the cluster output is going to the right place. 63 | mkdir -p "$workdir"/sge_output 64 | 65 | set +e 66 | 67 | #Annoyingly I'm getting intermittent failures, so enable 3 retries as a crude 68 | #workaround. 
69 | for try in 1 2 3 ; do 70 | snakemake \ 71 | -s "$0" -j $threads -T \ 72 | --config workdir="$workdir" scriptdir="$scriptdir" \ 73 | -p --jobname "{rulename}.snakejob.{jobid}.sh" \ 74 | --drmaa " -q $queue -S /bin/bash -p -10 -V \ 75 | -o "$workdir"/sge_output -e "$workdir"/sge_output \ 76 | " \ 77 | -- "$@" 78 | done 79 | fi 80 | 81 | "exit""" ### End of shell script part 82 | 83 | #!/usr/bin/env snakemake 84 | from snakemake.utils import format 85 | import xml.etree.ElementTree as ET 86 | 87 | #Regular glob() is useful but it can be improved like so. 88 | import os 89 | from glob import glob as _glob 90 | glob = lambda pathname: sorted(_glob(os.path.expanduser(pathname))) 91 | 92 | workdir: config['workdir'] 93 | 94 | #Configuration options as constants 95 | 96 | TARGETS_TO_SAMPLE = 2500 97 | #LANES_TO_SAMPLE = "1 2 3 4 5 6 7 8" #see below 98 | READ_LENGTH = 50 99 | LEVELS_TO_SCAN = 5 100 | REPORT_VERBOSE = True 101 | 102 | ### Calculate some derived options 103 | 104 | #Find the scripts if they are in the same folder as this one, 105 | #even if it's not in the default PATH. 106 | if 'scriptdir' in config: 107 | _PATHSET = 'PATH=\'%s\'":$PATH" ' % config['scriptdir'] 108 | else: 109 | _PATHSET = '' 110 | 111 | PREP_INDICES =_PATHSET + "prepare_cluster_indexes.py" 112 | COUNT_WELL_DUPL =_PATHSET + "count_well_duplicates.py" 113 | 114 | #Get the run info 115 | run_info_root = ET.parse("datadir/RunInfo.xml").getroot() 116 | 117 | # Machine type is only used to determine the highest tile, so I'll make it 118 | # that you can just pass the tile number directly. Note the tiles in the XML 119 | # are not in order so I can't just take the last one! 120 | LAST_LANE, LAST_TILE = max(te.text for te in run_info_root.findall(".//Tiles/Tile")).split('_') 121 | 122 | # Lanes to sample is now variable since the arrival of NovaSeq, so get it from 123 | # RunInfo.xml... 124 | LANES_TO_SAMPLE = range(1, int(LAST_LANE) + 1) 125 | 126 | # For most runs we want to start at read 20, but some runs only have 51 127 | # cycles in read1. 128 | num_cycles = int(run_info_root.find("Run/Reads/Read[@Number='1']").get('NumCycles')) 129 | if(num_cycles > READ_LENGTH + 20): 130 | START_POS = 20 131 | else: 132 | assert num_cycles > READ_LENGTH 133 | START_POS = 0 134 | END_POS = READ_LENGTH + START_POS 135 | 136 | ### Specific rules 137 | localrules: main, summarize_all_lanes 138 | 139 | """Main rule just defines everything to be generated. 140 | The shell script should have made me a new working folder with datadir 141 | being a symlink to the sequencer output directory. 142 | """ 143 | rule main: 144 | input: format("{TARGETS_TO_SAMPLE}targets_all_lanes.txt") 145 | 146 | rule summarize_all_lanes: 147 | output: "{targets}targets_all_lanes.txt" 148 | input: 149 | expand( "{{targets}}targets_lane{lane}.txt", 150 | lane=LANES_TO_SAMPLE ) 151 | shell: "tail -n $(( {LEVELS_TO_SCAN} + 1 )) {input} > {output}" 152 | 153 | rule count_well_dupl: 154 | output: "{targets}targets_lane{lane}.txt" 155 | input: targfile = "{targets}clusters.list" 156 | params: summary = '-S' if not REPORT_VERBOSE else '' 157 | shell: 158 | "{COUNT_WELL_DUPL} -f {input.targfile} -n {wildcards.targets} -s {LAST_TILE} -r datadir" + 159 | " -i {wildcards.lane} -l {LEVELS_TO_SCAN} --cycles {START_POS}-{END_POS}" + 160 | " {params.summary} > {output}" 161 | 162 | rule prep_indices: 163 | output: "{targets}clusters.list" 164 | run: 165 | #We don't want to re-calculate indices every time, but we don't 166 | #want to assume the locs files are all identical.
So let's have 167 | #a shared pool of cluster lists based on the md5sum of the s.locs 168 | slocs = "datadir/Data/Intensities/s.locs" 169 | 170 | if os.path.exists("../cluster_lists"): 171 | md5, = [ l.split()[0] for l in 172 | shell("md5sum {slocs}", iterable=True) ] 173 | cached_list = format("../cluster_lists/{wildcards.targets}clusters_{md5}.list") 174 | 175 | if not os.path.exists(format("{cached_list}.done")): 176 | #Make it now. Slight paranoia regarding race condition on the file. 177 | #If two processes try to generate the .list file at once then one should 178 | #at least fail with a modicum of grace. If PREP_INDICES fails for 179 | #some reason, future jobs will fail until you fix or delete the partial output. 180 | shell("set -o noclobber ;" + 181 | " {PREP_INDICES} -n {wildcards.targets} -f {slocs} > {cached_list}") 182 | shell("touch {cached_list}.done") 183 | shell("ln -sr {cached_list} {output}") 184 | else: 185 | #No-cache mode it is, then 186 | shell("{PREP_INDICES} -n {wildcards.targets} -f {slocs} > {output}") 187 | 188 | 189 | ### Generic rules 190 | 191 | # none 192 | -------------------------------------------------------------------------------- /bcl_direct_reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | A module to grab sequence reads direct from the .bcl and .filter files 4 | outputted by Illumina. The motivation is that for some QC tasks we want 5 | to grab a small subsample of reads, and getting these from the FASTQ is 6 | very inefficient, especially once everything is demultiplexed and zipped. 7 | 8 | We already have a C implementation of this in the bcl2fastq source and 9 | a Java implementation in Picard Tools, but the world needs Python. 10 | 11 | We assume not only the BCL format but also the standard directory 12 | layout as specified in 13 | https://support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf 14 | 15 | This module can take advantage of the fact that the BCL files have a fixed record length 16 | and use seek() to jump to the location where the position of interest is stored. 17 | Unfortunately for GZipped files this does not give much of an advantage, as the file 18 | must be decompressed internally to perform the seek(). For reading several sequences 19 | at once a simple load to memory turns out to be faster, so the reader will do 20 | that. 21 | 22 | For max efficiency you should call get_seqs() just once per tile with 23 | all the locations you want to extract. 24 | 25 | Synopsis: 26 | 27 | proj = BCLReader("/your/project/dir") 28 | tile = proj.get_tile(1, 1101) 29 | 30 | all_seqs = tile.get_seqs([70657,70658,70659], start=20, end=40) 31 | seq1, flag1 = all_seqs[70567] 32 | seq2, flag2 = all_seqs[70568] 33 | seq3, flag3 = all_seqs[70569] 34 | 35 | how_many_valid = sum([flag1,flag2,flag3]) 36 | 37 | On 24th Oct 2017: 38 | We'd also like this module to be able to read .cbcl files, which are concatenated BCL files 39 | (aka. indexed gzip files). Reading these efficiently might require changing the API a little. 40 | 41 | """ 42 | 43 | __version__ = 1.2 44 | __author__ = 'Tim Booth, Edinburgh Genomics ' 45 | 46 | import os, sys, re 47 | import struct 48 | import gzip 49 | 50 | # This now works only in Python3 - byte semantics are totally different 51 | assert sys.version >= '3' 52 | 53 | # Callers may find these constants useful when dealing with results. 
54 | SEQUENCE = 0 55 | QUAL_FLAG = 1 56 | 57 | class BCLReader(object): 58 | 59 | def __init__(self, location="."): 60 | """Creates a BCLReader instance that reads from a single run. 61 | location: The top level data directory for the run. 62 | This should be the one that contains the Data directory and the 63 | RunInfo.xml file. 64 | """ 65 | # Just check that we can read the expected files at this 66 | # location. 67 | basecalls_dir = os.listdir( os.path.join(location, "Data", "Intensities", "BaseCalls") ) 68 | self.lanes = [ d for d in basecalls_dir if re.match('L\d\d\d$', d) ] 69 | 70 | self.location = location 71 | 72 | 73 | def get_seq(self, lane, tile, cluster_index, start=0, end=None): 74 | """Fetches a single sequence from a specified tile. 75 | lane: lane number (see get_tile) 76 | tile: tile number (see get_tile) 77 | cluster_index: cluster number counting from 0. To determine 78 | this from the standard position co-ordinates you need to 79 | parse the .locs file for the run. 80 | ** This is going to be very inefficient for fetching multiple sequences. ** 81 | Use get_tile(...).get_seqs([0,2,4,6]) instead. 82 | """ 83 | tile = self.get_tile(lane, tile) 84 | 85 | result = tile.get_seqs([cluster_index], start, end) 86 | 87 | # return nuc_string, flag 88 | return result[cluster_index] 89 | 90 | def get_tile(self, lane, tile, in_memory=False): 91 | """Opens a tile for reading. You can then call get_seqs() to actually 92 | fetch the data. 93 | Lane and tile should be specified as per the Illumina file structure, 94 | so lanes are 1 to 8 and tiles are eg. [12][12]{01-28} (for HiSeq 4000). 95 | """ 96 | lane_dir = str(lane) 97 | if lane_dir not in self.lanes: 98 | lane_dir = 'L%03d' % int(lane_dir) 99 | 100 | data_dir = os.path.join(self.location, "Data", "Intensities", "BaseCalls", lane_dir) 101 | 102 | if in_memory: 103 | raise RuntimeError("Preloading into memory not implemented yet") 104 | 105 | return Tile(data_dir, tile) 106 | 107 | 108 | class Tile(object): 109 | 110 | def __init__(self, data_dir, tile): 111 | """Fetches sequences from a single tile. 112 | You would not normally instantiate these directly. Create a 113 | BCLReader and call get_tile() instead. 114 | """ 115 | 116 | # Find the file prefix I need to be looking at. Could infer it 117 | # from the lane number but instead I'll do it by looking for the 118 | # matching .filter file 119 | self.bcl_filename = None 120 | self.data_dir = data_dir 121 | self.tile = tile 122 | 123 | data_dir_listing = os.listdir(data_dir) 124 | for filt in data_dir_listing: 125 | amatch = re.match('(.+_%s).filter' % tile, filt) 126 | if amatch: 127 | self.bcl_filename = amatch.group(1) + '.bcl.gz' 128 | self.filter_file = os.path.join(data_dir, filt) 129 | break 130 | 131 | if not self.bcl_filename: 132 | raise RuntimeError("Cannot find a .filter file for tile %s" % tile) 133 | 134 | # The CBCL filename is in the format "L00{lane}_{surface}", where the prefix 135 | # should have the same name as the data_dir, and the surface is the first 136 | # digit of the tile name. 137 | self.cbcl_filename = "%s_%s.cbcl" % (os.path.basename(data_dir), str(tile)[0]) 138 | 139 | # Also work out the number of cycle folders. Should be 308 140 | # for HiSeq 4000 141 | cycle_dirs = [ f for f in data_dir_listing if re.match('C\d+.1$', f) ] 142 | self.num_cycles = len(cycle_dirs) 143 | 144 | # And also the number of clusters, which should be 4309650 145 | # for HiSeq 4000. 
We need to snag this from the top of the .filter file 146 | with open(self.filter_file, 'rb') as filt_fh: 147 | 148 | filt_header = struct.unpack('= self.num_clusters: 187 | raise IndexError("Requested cluster %i is out of range. Highest on this tile is %i." % 188 | (sorted_keys[-1], self.num_clusters-1) ) 189 | 190 | # And just to be sure, no key should be negative 191 | if sorted_keys[0] < 0: 192 | raise IndexError("Requested cluster %i is a negative number." % sorted_keys[0]) 193 | 194 | # Get the accept/reject flag from the .filter file 195 | fo = self._get_filter_offsets() 196 | for idx in sorted_keys: 197 | flag_collector[idx] = (fo[idx] != -1) 198 | 199 | # Now the actual basecalls 200 | for cycle in range(start, end): 201 | cycle_dir = os.path.join(self.data_dir, 'C%i.1' % (cycle + 1)) 202 | 203 | # Now are we looking at .bcl.gz files or NovaSeq .cbcl files?? 204 | cycle_file = os.path.join(cycle_dir, self.bcl_filename) 205 | cbcl_file = os.path.join(cycle_dir, self.cbcl_filename) 206 | 207 | try: 208 | with gzip.open(cycle_file, 'rb') as bcl_fh: 209 | self._get_seqs_from_bcl(bcl_fh, cycle - start, sorted_keys, seq_collector) 210 | except FileNotFoundError: 211 | # Try the cbcl file. If this fails allow the stack trace which will report both 212 | # missing files. 213 | # Note that this does result in opening the same CBCL file again and again 214 | # for each tile, but each chunk is only unzipped once. 215 | with open(cbcl_file, 'rb') as fh: 216 | self._get_seqs_from_cbcl(fh, cycle - start, sorted_keys, seq_collector) 217 | 218 | # Remap the arrays into strings 219 | # return dict( idx : (nuc_string, flag) ) 220 | return { idx : ( ''.join(seq), flag_collector[idx] ) for idx, seq in seq_collector.items() } 221 | 222 | def _get_filter_offsets(self): 223 | """ Load the filter file, and convert it to a series of offsets. The actual 224 | offsets are only used when reading excluded CBCL files but the -1 entries 225 | will indicate the bad wells. 226 | The file must exist as we opened it earlier when reading self.num_clusters 227 | """ 228 | # Lazy load 229 | if self.filter_offsets: 230 | return self.filter_offsets 231 | 232 | with open(self.filter_file, 'rb') as filt_fh: 233 | 234 | filt_header = filt_fh.read(12) 235 | # We already saw this! 236 | assert tuple(struct.unpack(' 32 #Should actually be 5681 for all the current CBCL files 268 | assert h_basebits == 2 269 | assert h_qbits == 2 #6 is valid but we don't support it! 270 | assert h_bins == 4 #implied if h_qbits is 2 271 | 272 | # We don't care about the quality binning info but we do need the tile count, 273 | # even though we're pretty sure it will always be 352. 274 | qbin_and_tc_bytes = fh.read((h_bins * 4 * 2) + 4) 275 | tile_count, = struct.unpack('> 4 319 | else: 320 | # Take the low bits 321 | base_byte = zipdata[wellidx//2] & 0b00001111 322 | 323 | # Finally it's the same as for old BCL. 324 | if base_byte: 325 | seq_collector[welln][cycle_idx] = ('A', 'C', 'G', 'T')[base_byte & 0b00000011] 326 | 327 | def _get_seqs_from_bcl(self, fh, cycle_idx, sorted_keys, seq_collector): 328 | """ Reads from the fh, which is presumably a gzip stream handle, and 329 | adds the specified seqs to the seq_collector. 330 | This is intended for internal use only. 331 | And obviously it can only be called once per fh. 332 | """ 333 | bcl_header = fh.read(4) 334 | 335 | # The BCL header should be a fixed length depending on the machine type. 
336 | # This assertion checks that it is at least consistent with the filter 337 | # file for this tile. 338 | assert struct.unpack(' 10: 345 | slurped_file = fh.read() 346 | 347 | for idx in sorted_keys: 348 | base_byte = slurped_file[idx] 349 | 350 | # base = 'N' 351 | # qual = 0 352 | if base_byte: 353 | # The two lowest bits give us the base call 354 | base = ('A', 'C', 'G', 'T')[base_byte & 0b00000011] 355 | 356 | # And the high bits give us the quality, but we're not using 357 | # it here, other than the above test which catches no-calls. 358 | # qual = base_byte >> 2 359 | 360 | #Collect the base 361 | seq_collector[idx][cycle_idx] = base 362 | else: 363 | for idx in sorted_keys: 364 | fh.seek(idx + 4) 365 | # Is reading bytes 1 at a time slow? I'd imagine that internal 366 | # cacheing negates any need for chunked reads at this level. 367 | base_byte, = fh.read(1) 368 | 369 | # Copy-paste-ahoy! 370 | if base_byte: 371 | base = ('A', 'C', 'G', 'T')[base_byte & 0b00000011] 372 | seq_collector[idx][cycle_idx] = base 373 | 374 | 375 | -------------------------------------------------------------------------------- /cbcl_read.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os, sys, re 3 | import struct 4 | import gzip 5 | from itertools import chain 6 | 7 | # This is a stand-alone script to inspect a CBCL file - see the format description at 8 | # https://support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq2_guide_15051736_v2.pdf 9 | # I'll get this working then use it as the basis for the tile reader update. 10 | 11 | def main(cbcl_file): 12 | 13 | def p(key, val): print(" {}: {}".format(key, val)) 14 | def w(msg): print("!! {} !!".format(msg)) 15 | 16 | # First job is to open the file and inspect the header... 17 | with open(cbcl_file, 'rb') as fh: 18 | 19 | # First 12 bytes are fixed fields. 20 | header_bytes = fh.read(12) 21 | h_version, h_size, h_basebits, h_qbits, h_bins = struct.unpack(' {t}".format(n=n, fs=h_qbits, f=qbin_values[n*2], t=qbin_values[n*2+1])) 42 | else: 43 | w("expected to see some bins") 44 | 45 | # Convert bins to map (this works even with no bins) 46 | qual_bin_map = [0] * (max(qbin_values[::2]) + 1) 47 | for n in range(h_bins): 48 | qual_bin_map[qbin_values[n*2]] = qbin_values[n*2+1] 49 | 50 | # Now the number of tile records. We expect 352 for the novaseq 51 | tile_count, = struct.unpack(' qual_well_1 call_well_1 qual_well_0 call_well_0 156 | # By that logic, if any qual is 00 then the call must also be 00 - ie. 00110011 should 157 | # be impossible, or any other number where there is a pair of zeros not followed by 158 | # another pair of zeros. 159 | # I'll add an assertion to sanity-check this. 
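# Worked example: a zipdata byte of 0b01100111 packs two wells - the low nibble 0111 is the first well of the pair (qual bits 01, call bits 11 -> 'T') and the high nibble 0110 is the second (qual bits 01, call bits 10 -> 'G'); a nibble of 0000 is a no-call, reported as 'N'.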
160 | 161 | if wellidx == -1: 162 | # Excluded - though there may still be an underlying basecall 163 | # if excluded_flag was False - see above 164 | seq.append('-') 165 | 166 | elif wellidx % 2: 167 | # Take the high bits 168 | if zipdata[wellidx//2] & 0b11110000: 169 | assert zipdata[wellidx//2] & 0b11000000, \ 170 | "Found base call in high bits with 0 quality: {:08b}".format(zipdata[wellidx//2]) 171 | seq.append(basemap[ (zipdata[wellidx//2] & 0b00110000) >> 4 ]) 172 | else: 173 | seq.append('N') 174 | else: 175 | if zipdata[wellidx//2] & 0b00001111: 176 | assert zipdata[wellidx//2] & 0b00001100, \ 177 | "Found base call in low bits with 0 quality: {:08b}".format(zipdata[wellidx//2]) 178 | seq.append(basemap[ zipdata[wellidx//2] & 0b00000011 ]) 179 | else: 180 | seq.append('N') 181 | 182 | print( "Basecall in first {} wells of tile {} is {}".format(bases_from_start, tilenum, ''.join(seq[:bases_from_start])) ) 183 | print( " ...and the last {} wells of tile {} is {}".format(bases_from_end, tilenum, ''.join(seq[bases_from_start:])) ) 184 | 185 | def locate_and_load_filter_file(cbcl_file, tilenum): 186 | """ Locate and parse the filter file for this tile. 187 | Translate this into a list of offsets where the wells will be found in the excluded 188 | bcl blocks, so if the filter starts 000110101 then the lookup needs to be 189 | [ -1, -1, -1, 0, 1, -1, 2, -1, 3 ] 190 | Also return the number of passing wells (final offset + 1) 191 | """ 192 | # Filter files live one directory up and they have names like s_2_2488.filter ==> s_{lane}_{tile}.filter 193 | lane_dir = os.path.dirname(os.path.dirname(os.path.realpath(cbcl_file))) 194 | 195 | # lane is 1,2,3 or 4 196 | lane = lane_dir[-1] 197 | 198 | filter_file = "{d}/s_{lane}_{tile}.filter".format(d=lane_dir, lane=lane, tile=tilenum) 199 | 200 | with open(filter_file, 'rb') as filt_fh: 201 | 202 | filt_header = filt_fh.read(12) 203 | assert tuple(struct.unpack(' 0: 45 | #No, it's OK. Use the length of the first target. 46 | levels = len(atile[0]) 47 | break 48 | 49 | #Grand totals... 50 | # Targets that got sampled (ie. had a valid read at the centre 51 | tot_targets = 0 52 | # Wells that got examined, at each level 53 | tot_wells = [0] * levels 54 | # Dups found at each level (total wells) 55 | tot_dups = [0] * levels 56 | # Dups found at each level (counting 1 per target per level) 57 | tot_hits = [0] * levels 58 | # Accumulated hits counting from the inside out 59 | tot_acco = [0] * levels 60 | # and counting from the outside in 61 | tot_acci = [0] * levels 62 | 63 | for tile in sorted(lane_dupl.keys()): 64 | 65 | tile_counts = lane_dupl[tile] 66 | targets = len(tile_counts) 67 | tot_targets += targets 68 | 69 | if verbose: 70 | print("Lane: %s\tTile: %s\tTargets: %i/%i" % ( 71 | lane, tile, targets,sample_size)) 72 | 73 | #AccO and AccI require an explicit loop over targets. 74 | #I could tally the other things in this loop too but to me it makes 75 | #the code less readable. 76 | #Not that the code is pretty, but test coverage assures me it's good. 
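#For example, with 5 levels a target whose only hit is at level 3 contributes [0,0,1,1,1] to acco (a hit at this level or closer in) and [1,1,1,0,0] to acci (a hit at this level or further out).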
77 | acco = [0] * levels 78 | acci = [0] * levels 79 | for targ in tile_counts: 80 | seen_hit = 0 81 | for lev in range(levels): 82 | if targ[lev][TALLY]: 83 | seen_hit = 1 84 | acco[lev] += seen_hit 85 | seen_hit = 0 86 | for lev in reversed(range(levels)): 87 | if targ[lev][TALLY]: 88 | seen_hit = 1 89 | acci[lev] += seen_hit 90 | 91 | for lev in range(levels): 92 | 93 | wells = sum(targ[lev][LENGTH] for targ in tile_counts) 94 | dups = sum(targ[lev][TALLY] for targ in tile_counts) 95 | hits = sum(bool(targ[lev][TALLY]) for targ in tile_counts) 96 | 97 | if verbose: 98 | print("Level: %i\tWells: %i\tDups: %i\tHit: %i\tAccO: %i\tAccI: %i" % ( 99 | lev+1, wells, dups, hits, acco[lev],acci[lev])) 100 | 101 | tot_wells[lev] += wells 102 | tot_dups[lev] += dups 103 | tot_hits[lev] += hits 104 | 105 | tot_acco[lev] += acco[lev] 106 | tot_acci[lev] += acci[lev] 107 | 108 | #And finally the Picard-scaled percentage 109 | #I have no real justification for this calculation, other than it looked reasonable 110 | #at the time. 111 | if tot_acci: 112 | grand_tot_hits = tot_acci[0] 113 | grand_tot_dups = sum(tot_dups) 114 | 115 | peds = ( grand_tot_hits * 116 | ( 1 - grand_tot_hits / ( grand_tot_dups + grand_tot_hits ) ) / 117 | tot_targets ) 118 | 119 | #Judith came up with this: 1-1/(2imcs-2) 120 | #Which simplifies to... 121 | peds2= ( grand_tot_hits * 122 | ( 1 - grand_tot_hits / ( 2 * grand_tot_dups ) ) / 123 | tot_targets ) 124 | else: 125 | grand_tot_hits = peds = peds2 = 0 126 | 127 | 128 | #And report 129 | print("LaneSummary: %s\tTiles: %i\tTargets: %i/%i" % ( 130 | lane, len(lane_dupl), 131 | tot_targets, 132 | sample_size*len(lane_dupl) )) 133 | 134 | for lev in range(levels): 135 | print("Level: %i\tWells: %i\tDups: %i (%.5f)\t" % ( 136 | lev+1, tot_wells[lev], 137 | tot_dups[lev], 138 | tot_dups[lev] / tot_wells[lev]) + 139 | "Hit: %i (%.5f)\tAccO: %i (%.5f)\tAccI: %i (%.5f)" % ( 140 | tot_hits[lev], 141 | tot_hits[lev] / tot_targets, 142 | tot_acco[lev], 143 | tot_acco[lev] / tot_targets, 144 | tot_acci[lev], 145 | tot_acci[lev] / tot_targets) 146 | ) 147 | 148 | raw_dup_rate = grand_tot_hits/tot_targets if grand_tot_hits else 0.0 149 | 150 | print() 151 | print("Overall duplication (Acc/Targets): {:.2%}".format(raw_dup_rate)) 152 | print("Picard-equivalent duplication v1: {:.2%}".format(peds)) 153 | print("Picard-equivalent duplication v2: {:.2%}".format(peds2)) 154 | 155 | 156 | def main(): 157 | # Setup options 158 | args = parse_args() 159 | 160 | if args.quiet: 161 | global log 162 | log = lambda *args: None 163 | 164 | lanes = args.lane.split(',') if args.lane else range(1, 8+1) 165 | 166 | max_tile = 24 #Works for Highseq X 167 | max_swath = 22 #Works for X and 4000 168 | if args.stype == HISEQ_4000: 169 | max_tile = 28 170 | else: 171 | try: 172 | max_tile = int(args.stype) % 100 173 | max_swath = int(args.stype) // 100 or 22 174 | except ValueError: 175 | pass # Never mind. Stick with 24/22. 176 | 177 | # Build a list of tiles we expect to see. Swaths for the older machines are [11, 12, 21, 22] but 178 | # in general and to handle the Novoseq we can infer the list from the max_swath value. 179 | tiles = [] 180 | for swath in [ '{}{}'.format(s, n) for s in range(1,max_swath//10+1) for n in range(1,max_swath%10+1) ]: 181 | for tile in range(1,max_tile+1): 182 | tiles.append("%s%02d" % (swath, tile)) 183 | 184 | # If tiles are specified check that all are valid. 
185 | if args.tile_id: 186 | filtered_tiles = [] 187 | for tpat in args.tile_id.split(','): 188 | t_match = [t for t in tiles if re.match('^'+tpat+'$', t)] 189 | assert t_match, "%s matches no tile identifiers for a %s" % (tpat, args.stype) 190 | filtered_tiles.extend(t_match) 191 | tiles = sorted(set(filtered_tiles)) 192 | 193 | # And set cycles based on either --start/--end or --cycles 194 | cycles = [(args.start, args.end)] 195 | if args.cycles: 196 | #Minimal validation - user will get cryptic messages on bad values 197 | cycles = [ (int(s), int(e)) for r in args.cycles.split(',') for s, e in (r.split('-'),) ] 198 | 199 | # Decide how we are calculating edit distances 200 | get_edit_distance = Levenshtein.hamming if args.hamming else Levenshtein.distance 201 | 202 | targets = load_targets( filename = args.coord_file, 203 | levels = args.level+1, 204 | limit = args.sample_size) 205 | bcl_reader = bcl_direct_reader.BCLReader(args.run) 206 | 207 | for lane in lanes: 208 | 209 | lane_dupl = {} 210 | for tile in tiles: 211 | log("Reading tile %s in lane %s" % (tile, lane)) 212 | tile_bcl = bcl_reader.get_tile(lane, tile) 213 | 214 | #This actually reads the sequence data from the BCL into RAM. 215 | #Now that we support ranges, we might have to do this two or more times. 216 | seq_objs = [] 217 | for r in cycles: 218 | seq_objs.append( tile_bcl.get_seqs(targets.get_all_indices(), *r) ) 219 | 220 | log("Got %i sequences from %i contiguous cycle ranges." % ( 221 | sum(len(s) for s in seq_objs), 222 | len(seq_objs) )) 223 | 224 | #Each entry in lane_dupl dict is a list of valid (ie. centre seq passed QC) 225 | #targets for this tile. 226 | lane_dupl[tile] = [] 227 | 228 | for target in targets: 229 | 230 | center = target.get_centre() 231 | #log("Center: %s"%center) 232 | 233 | # if the center sequence does not pass the pass filter we don't assess edit distance, 234 | # as a large number of Ns compared to other reads with a large number of Ns results in a 235 | # small edit distance 236 | if not seq_objs[0][center][QUAL_FLAG]: 237 | continue 238 | center_seq = ''.join(s[center][SEQUENCE] for s in seq_objs) 239 | 240 | #Add a placeholder for the new stats 241 | target_stats = [None] * args.level 242 | lane_dupl[tile].append(target_stats) 243 | 244 | for level in range(args.level): 245 | #The level variable now runs from 0, but the target levels run from 246 | #1 because 0 is the centre, so be careful! 247 | dups = 0 248 | well_indices = list(target.get_indices(level+1)) 249 | assert len(well_indices) > 0 250 | for well_index in well_indices: 251 | well_seq = ''.join(s[well_index][SEQUENCE] for s in seq_objs) 252 | dist = get_edit_distance(center_seq, well_seq) 253 | 254 | #Log all the duplicates. This might get fairly large! 255 | #Note that to locate the matching sequence header in a FASTQ file you need to 256 | #convert the well number into co-ords.
Eg for location 123456: 257 | # $ dump_slocs.py datadir/Data/Intensities/s.locs | grep ^0123456 258 | if dist <= args.edit_distance: 259 | dups += 1 260 | log("center seq at {:>07}: {}".format(center, center_seq)) 261 | log("well seq at {:>07}: {}".format(well_index, well_seq)) 262 | log("edit distance: {}".format(dist)) 263 | 264 | #Save a tuple of (TALLY, LENGTH) 265 | target_stats[level] = (dups, len(well_indices)) 266 | 267 | #log(lane_dupl) 268 | #Write output per lane 269 | output_writer(lane, len(targets), lane_dupl, verbose = not args.summary_only) 270 | 271 | 272 | def parse_args(): 273 | description = """This script creates or executes commands that will assess well duplicates 274 | in a run without mapping. Reads within level l of a selected reads from the coordinate file 275 | will be assessed for Levenshtein (edit) distance. 276 | """ 277 | 278 | parser = ArgumentParser(description=description, formatter_class=ArgumentDefaultsHelpFormatter) 279 | 280 | parser.add_argument("-f", "--coord_file", dest="coord_file", required=True, 281 | help="The file containing the random sample per tile.") 282 | parser.add_argument("-e", "--edit_distance", dest="edit_distance", type=int, default=2, 283 | help="max edit distance between two reads to count as duplicate") 284 | parser.add_argument("-n", "--sample_size", dest="sample_size", type=int, default=2500, 285 | help="number of reads to be tested for well duplicates (max number" + 286 | " of prepared clusters is 10000 at the moment)") 287 | parser.add_argument("-l", "--level", dest="level", type=int, default=3, 288 | help="levels around central spot to test, max = 5") 289 | parser.add_argument("-s", "--stype", dest="stype", required=True, 290 | help=("Sequencer model. Can be {HISEQ_4000} or {HISEQ_X} or else the highest tile" + 291 | " number in which case the tile/swath configuration will be inferred.").format(**globals())) 292 | parser.add_argument("-r", "--run", dest="run", required=True, 293 | help="path to base of run, i.e /ifs/seqdata/150715_K00169_0016_BH3FGFBBXX") 294 | parser.add_argument("-t", "--tile", dest="tile_id", type=str, 295 | help="comma-separated list of specific tiles on a lane to analyse." + 296 | " Four digits, using Illumina tile numbering convention. You can also use a" + 297 | " regex match so 1... for top surface only, or 1..[02468] to sample only even" + 298 | " tiles on the top surface.") 299 | parser.add_argument("-i", "--lane", dest="lane", type=str, 300 | help="comma-separated list of specific lanes to analyse, 1-8") 301 | parser.add_argument("-x", "--start", dest="start", type=int, default=50, 302 | help="Starting cycle/base position for the slice of read to be examined") 303 | parser.add_argument("-y", "--end", dest="end", type=int, default=100, 304 | help="Final cycle/base position for the slice of read to be examined") 305 | parser.add_argument("--cycles", 306 | help="Specify cycles/bases to scan as a list of ranges, eg. 10-50,100-120. Note" + 307 | " that this will override -x/-y if specified. 
You'll need to work out for" + 308 | " yourself which cycles correspond to which read.") 309 | parser.add_argument("--hamming", action="store_true", 310 | help="Compare sequences using the Hamming distance rather than the Levenshtein edit distance.") 311 | parser.add_argument("-S", "--summary-only", action="store_true", 312 | help="Only print the summary per lane, not for every tile") 313 | parser.add_argument("-q", "--quiet", action="store_true", 314 | help="No log output") 315 | parser.add_argument("--version", action="version", version=str(__VERSION__)) 316 | 317 | return parser.parse_args() 318 | 319 | if __name__ == "__main__": 320 | main() 321 | -------------------------------------------------------------------------------- /doit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | set -e ; set -u 3 | 4 | ### Note: This script is unlikely to be useful outside of Edinburgh Genomics ### 5 | ### Note2: see cron_o_matic.sh for a script that handles the locking/logging in a 6 | ### nice generic way - I based it on this. 7 | ### https://gist.github.com/tbooth/b09608aa1b44b82097860ebbb812923d 8 | 9 | # This script aims to be something that can be executed as a cron job at, say, 10 | # 15 minute intervals. Therefore it needs to: 11 | # 1) Run very fast if there is nothing to do - CHECK 12 | # 2) Not get in a tizz if two instances start at once - CHECK 13 | # 3) Not get stuck in a retry-loop if a job repeatedly fails - Handled in Snakefile 14 | # 4) Push summary data to web1 where we can look upon it - CHECK 15 | # 5) Add results to the Wiki as a sub-page of the run - Handled in Snakefile 16 | # 6) Refuse to run on the backup headnode - CHECK 17 | # 7) Log to a sensible location - CHECK 18 | 19 | # Here's the quick fix for 6... 20 | # Refuse to run on headnode2 21 | if [[ "${HOSTNAME%%.*}" == headnode2 ]] ; then 22 | echo "This script should not be run on headnode2" >&2 23 | exit 1 24 | fi 25 | 26 | # Settings specific to old/new clusters. Note that this stand-alone script should soon 27 | # be superceded on the new cluster by a QC stage incorporated into Illuminatus. 28 | if [ -e /lustre/software ] ; then 29 | WORKDIR_ROOT="$HOME/WellDuplicates" 30 | SEQDATA=/lustre/seqdata 31 | else 32 | WORKDIR_ROOT=/ifs/runqc/WellDuplicates 33 | SEQDATA=/ifs/seqdata 34 | fi 35 | 36 | #If run in a TTY, log to the screen, else log to the log file. 37 | # ie. to foce logging to a file, run ./doit.sh >/dev/null 38 | if [ -z "`tty`" ] ; then 39 | LOGFILE="$WORKDIR_ROOT/logs/autorun.`date +%Y%m%d`.log" 40 | fi 41 | 42 | # Previously the script checked 'ps' for other instances, but I'm switching to the 43 | # recommended 'flock' mechanism. 44 | FLOCK_FILE="${TMPDIR:-/tmp}/flock_$(readlink -f "$0" | md5sum | awk '{print $1}')" 45 | if [ "${FLOCK_ON:-0}" = 0 ] ; then 46 | # echo "Locking exclusively on $FLOCK_FILE, PID=$$" 47 | ( flock -n 9 || exit 33 48 | export FLOCK_ON=9 49 | source "$0" "$@" 50 | ) 9>"$FLOCK_FILE" ; rc="$?" 51 | if [ "$rc" = 33 ] ; then 52 | #Just means the previous run is still going. 53 | slmsg="*** Failed to gain shared lock, PID=$$" 54 | if [ -n "${LOGFILE:-}" ] ; then 55 | echo "$slmsg" >> "$LOGFILE" 56 | else 57 | echo "$slmsg" >&2 58 | fi 59 | elif [ "$rc" != 0 ] ; then 60 | #This should trigger an e-mail to the cron manager 61 | echo "Script exited with error $rc" >&2 62 | fi 63 | #Else, spawned copy ran, nothing more to do. 
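    #To spell out the flow: when FLOCK_ON is unset this outer invocation re-sources
    #itself inside a subshell holding an exclusive lock on fd 9, so by this point
    #"$rc" is either the locked copy's exit status, or 33 if the lock was already taken.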
64 | #echo "Exiting unlocked script, PID=$$" 65 | exit "$rc" 66 | fi 67 | #echo "Locked on $FLOCK_FILE, PID=$$" 68 | 69 | # 4) This script already runs with "bash -l" in order to set up the SGE environment 70 | # and the extended PATH. Otherwise I'd have to source /etc/profile.d/sge.sh and 71 | # add to the PATH here. 72 | 73 | # Now we can send any further output to the log 74 | if [ -n "${LOGFILE:-}" ] ; then 75 | exec >>"$LOGFILE" 76 | fi 77 | 78 | # Most things are dealt with by the Snakefile. I need to run it over all the runs 79 | # and push any new results that appear to web1. 80 | # I also need to handle logging. 81 | SNAKEFILE="$(dirname $(readlink -f $0))"/Snakefile.count_and_push 82 | export CLUSTER_QUEUE=casava 83 | 84 | echo "=== Running at `date`. PID=$$, SNAKEFILE=$SNAKEFILE, CLUSTER_QUEUE=$CLUSTER_QUEUE ===" 85 | 86 | for f in "$SEQDATA"/??????_[AKE]00* ; do 87 | echo "Trying to process $f" 88 | export WORKDIR="$WORKDIR_ROOT/`basename $f`" 89 | 90 | #If processing the run fails we do want to continue. 91 | #This makes it annoying if you want to cancel the whole thing but is important 92 | #to ensure one problem run doesn't gum up the whole pipeline. 93 | if ( cd "$f" && "$SNAKEFILE" 2>&1 ) ; then 94 | if [ "${DO_JUST_ONE:-0}" != 0 ] ; then 95 | echo "Exiting as DO_JUST_ONE was set." 96 | exit 0 97 | fi 98 | fi 99 | 100 | done 101 | 102 | # Copying to web1 has been removed. See GIT on 21/2/17 for the old version. 103 | 104 | # That should doit. 105 | echo "=== Finished run at `date`. PID=$$ ===" 106 | -------------------------------------------------------------------------------- /dump_slocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import struct 4 | import math 5 | import sys 6 | 7 | #Python normally complains about being killed by SIG_PIPE, but we just want 8 | #exit gracefully if that happens (eg. if output is piped to head) 9 | #(http://docs.python.org/library/signal.html) 10 | from signal import signal, SIGPIPE, SIG_DFL 11 | signal(SIGPIPE,SIG_DFL) 12 | 13 | def main(): 14 | f = None 15 | try: 16 | f = open(sys.argv[1], 'rb') 17 | except IndexError: 18 | f = sys.stdin 19 | 20 | o = sys.stdout 21 | 22 | try: 23 | header = yield_header(f) 24 | o.write("##%s\n"%(str(header))) 25 | clusternumpad = len(str(header[2])) 26 | 27 | for coords in enumerate(yield_coords(f)): 28 | #Write out "%i %i:%i\n"% (clusternum,x,y) 29 | #But I made it ugly by also interpolating the padding size 30 | o.write(("%%0%ii %%i:%%i\n"%clusternumpad) % (coords[:1] + coords[1][:])) 31 | #o.write(str(coords) + "\n") 32 | 33 | finally: 34 | o.close() 35 | f.close() 36 | 37 | def yield_header(f): 38 | #First 3*4 bytes are the header 39 | #Only the third element is really useful as this is the number of locations 40 | #that should be in the file. 
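    #(parse_slocs.py in this repo reads the same 12-byte header with
    # struct.unpack('=ifI', buf); the third field is that location count.)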
41 | buf = f.read(12) 42 | return struct.unpack('= 12 47 | 48 | buf = f.read(8) 49 | clusternum = 0 50 | while len(buf) == 8: 51 | # Each following 8 bytes are a co-ordinate pair as detailed in 52 | # https://broadinstitute.github.io/picard/javadoc/picard/picard/illumina/parser/readers/LocsFileReader.html 53 | # and 54 | # https://www.biostars.org/p/51681/ 55 | t = struct.unpack(' '" EXIT 15 | locs_file="$1" 16 | target_count="$2" 17 | output_file="$3" 18 | trap - EXIT 19 | 20 | # First ensure I'll run the corresponding well dups scripts 21 | WD_ROOT="$(dirname $(readlink -f $0))" 22 | PATH="$WD_ROOT:$PATH" 23 | 24 | # Then work out where the cache directory is at 25 | CLUSTER_LISTS="${CLUSTER_LISTS:-$WD_ROOT/cluster_lists}" 26 | 27 | if [ -d "$CLUSTER_LISTS" ] ; then 28 | md5=`md5sum "$locs_file" | awk '{print $1}'` 29 | 30 | cached_list="$CLUSTER_LISTS/${target_count}clusters_${md5}.list" 31 | 32 | if ! [ -e "${cached_list}.done" ] ; then 33 | trap "rm '$cached_list'" EXIT 34 | prepare_cluster_indexes.py -n "$target_count" -f "$locs_file" > "$cached_list" 35 | touch "${cached_list}.done" 36 | trap - EXIT 37 | fi 38 | # Shall the link be relative? 39 | if [ "${cached_list:0:1}" = / ] ; then 40 | ln -s "$cached_list" "$output_file" 41 | else 42 | ln -sr "$cached_list" "$output_file" 43 | fi 44 | else 45 | # No-cache mode it is, then 46 | prepare_cluster_indexes.py -n "$target_count" -f "$locs_file" > "$output_file" 47 | fi 48 | -------------------------------------------------------------------------------- /parse_slocs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import struct 4 | import sys 5 | 6 | fn = sys.argv[1] 7 | sys.stderr.write(fn) 8 | f = open(fn, 'rb') 9 | o = open(sys.argv[2],'w') 10 | sys.stderr.write(sys.argv[2]) 11 | try: 12 | buf = f.read(12) 13 | o.write("##%s\n"%(str(struct.unpack('=ifI', buf)))) 14 | buf = f.read(8) 15 | while len(buf) == 8: 16 | t = struct.unpack('=ff', buf) 17 | x=int(round( 10 * t[0] + 1000)) 18 | y=int(round( 10 * t[1] + 1000)) 19 | o.write("%s\t%s\n"%(x,y)) 20 | buf = f.read(8) 21 | finally: 22 | o.close() 23 | f.close() 24 | -------------------------------------------------------------------------------- /plan.md: -------------------------------------------------------------------------------- 1 | THE PLN 2 | ======= 3 | 4 | Detect well duplicates in patterned flow cells without alignment 5 | 6 | Setup stage 7 | ----------- 8 | 9 | input: 10 | 11 | * s.locs file 12 | * sample size N 13 | 14 | output: file with cluster_index => x.coord, y.coord 15 | 16 | open s.locs 17 | generate N random numbers between 0..max_num of clusters 18 | for (1..N) 19 | find cluster_index entry in s.locs 20 | convert to x.coord, y.coord 21 | find adjacent 6,12,18 clusters 22 | get their cluster_index 23 | convert to x.coord, y.coord 24 | add to output 25 | 26 | Hiseq 4000 s.locs coordinates: 27 | 28 | | pixel_distance | x | y | rownr | row_distance | max_pix_dist | pixel_distance | x | y | rownr | row_distance | max_pix_dist | pixel_distance | x | y | rownr | row_distance | max_pix_dist | 29 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 30 | | 0 | 9536 | 6589 | 500000 | 0 | | 0 | 18101 | 12181 | 1000000 | 0 | | 0 | 21268 | 998 | 1000 | 0 | 31 | | 20.5913 | 9546 | 6607 | 501571 | -1571 | | 19.7231 | 18111 | 12198 | 1001571 | -1571 | | 19.7231 | 21278 | 1015 | 2571 | -1571 | 32 | | 20.5913 | 9526 | 6607 | 501570 | -1570 | | 19.7231 | 
18091 | 12198 | 1001570 | -1570 | | 20.2485 | 21257 | 1015 | 2570 | -1570 | 33 | | 21 | 9557 | 6589 | 500001 | -1 | | 21 | 18122 | 12181 | 1000001 | -1 | | 20 | 21288 | 998 | 1001 | -1 | 34 | | 20 | 9516 | 6589 | 499999 | 1 | | 20 | 18081 | 12181 | 999999 | 1 | | 21 | 21247 | 998 | 999 | 1 | 35 | | 19.7231 | 9546 | 6572 | 498429 | 1571 | | 20.5913 | 18111 | 12163 | 998429 | 1571 | | | | | | | 36 | | 19.7231 | 9526 | 6572 | 498428 | 1572 | 22 | 20.5913 | 18091 | 12163 | 998428 | 1572 | | | | | | | 37 | | 40.8167 | 9557 | 6624 | 503143 | -3143 | | 40.8167 | 18122 | 12216 | 1003143 | -3143 | | 40.3113 | 21288 | 1033 | 4143 | -3143 | 38 | | 35 | 9536 | 6624 | 503142 | -3142 | | 35 | 18101 | 12216 | 1003142 | -3142 | | 35 | 21268 | 1033 | 4142 | -3142 | 39 | | 40.3113 | 9516 | 6624 | 503141 | -3141 | | 40.3113 | 18081 | 12216 | 1003141 | -3141 | | 40.8167 | 21247 | 1033 | 4141 | -3141 | 40 | | 35.8469 | 9567 | 6607 | 501572 | -1572 | | 35.3553 | 18132 | 12198 | 1001572 | -1572 | | 34.4819 | 21298 | 1015 | 2572 | -1572 | 41 | | 34.9857 | 9506 | 6607 | 501569 | -1569 | | 34.4819 | 18071 | 12198 | 1001569 | -1569 | | 35.3553 | 21237 | 1015 | 2569 | -1569 | 42 | | 41 | 9577 | 6589 | 500002 | -2 | | 41 | 18142 | 12181 | 1000002 | -2 | | 40 | 21308 | 998 | 1002 | -2 | 43 | | 40 | 9496 | 6589 | 499998 | 2 | | 40 | 18061 | 12181 | 999998 | 2 | | 41 | 21227 | 998 | 998 | 2 | 44 | | 35.3553 | 9567 | 6572 | 498430 | 1570 | | 35.8469 | 18132 | 12163 | 998430 | 1570 | | | | | | | 45 | | 34.4819 | 9506 | 6572 | 498427 | 1573 | | 34.9857 | 18071 | 12163 | 998427 | 1573 | | | | | | | 46 | | 40.8167 | 9557 | 6554 | 496859 | 3141 | | 40.8167 | 18122 | 12146 | 996859 | 3141 | | | | | | | 47 | | 35 | 9536 | 6554 | 496858 | 3142 | | 35 | 18101 | 12146 | 996858 | 3142 | | | | | | | 48 | | 40.3113 | 9516 | 6554 | 496857 | 3143 | 42 | 40.3113 | 18081 | 12146 | 996857 | 3143 | | | | | | | 49 | | 61.4003 | 9567 | 6642 | 504714 | -4714 | | 61.4003 | 18132 | 12234 | 1004714 | -4714 | | 60.0333 | 21298 | 1050 | 5714 | -4714 | 50 | | 53.9351 | 9546 | 6642 | 504713 | -4713 | | 53.9351 | 18111 | 12234 | 1004713 | -4713 | | 52.9528 | 21278 | 1050 | 5713 | -4713 | 51 | | 53.9351 | 9526 | 6642 | 504712 | -4712 | | 53.9351 | 18091 | 12234 | 1004712 | -4712 | | 53.1507 | 21257 | 1050 | 5712 | -4712 | 52 | | 60.9016 | 9506 | 6642 | 504711 | -4711 | | 60.9016 | 18071 | 12234 | 1004711 | -4711 | | 60.5392 | 21237 | 1050 | 5711 | -4711 | 53 | | 53.9073 | 9577 | 6624 | 503144 | -3144 | | 53.9073 | 18142 | 12216 | 1003144 | -3144 | | 53.1507 | 21308 | 1033 | 4144 | -3144 | 54 | | 53.1507 | 9496 | 6624 | 503140 | -3140 | | 53.1507 | 18061 | 12216 | 1003140 | -3140 | | 53.9073 | 21227 | 1033 | 4140 | -3140 | 55 | | 54.0833 | 9587 | 6607 | 501573 | -1573 | | 53.7587 | 18152 | 12198 | 1001573 | -1573 | | 52.811 | 21318 | 1015 | 2573 | -1573 | 56 | | 54.0833 | 9485 | 6607 | 501568 | -1568 | | 52.811 | 18051 | 12198 | 1001568 | -1568 | | 53.7587 | 21217 | 1015 | 2568 | -1568 | 57 | | 61 | 9597 | 6589 | 500003 | -3 | | 61 | 18162 | 12181 | 1000003 | -3 | | 60 | 21328 | 998 | 1003 | -3 | 58 | | 61 | 9475 | 6589 | 499997 | 3 | | 61 | 18040 | 12181 | 999997 | 3 | | 61 | 21207 | 998 | 997 | 3 | 59 | | 53.7587 | 9587 | 6572 | 498431 | 1569 | | 54.0833 | 18152 | 12163 | 998431 | 1569 | | | | | | | 60 | | 53.7587 | 9485 | 6572 | 498426 | 1574 | | 53.1413 | 18051 | 12163 | 998426 | 1574 | | | | | | | 61 | | 53.9073 | 9577 | 6554 | 496860 | 3140 | | 53.9073 | 18142 | 12146 | 996860 | 3140 | | | | | | | 62 | | 53.1507 | 9496 | 6554 | 496856 | 3144 | | 
53.1507 | 18061 | 12146 | 996856 | 3144 | | | | | | | 63 | | 61.4003 | 9567 | 6536 | 495288 | 4712 | | 61.4003 | 18132 | 12128 | 995288 | 4712 | | | | | | | 64 | | 53.9351 | 9546 | 6536 | 495287 | 4713 | | 53.9351 | 18111 | 12128 | 995287 | 4713 | | | | | | | 65 | | 53.9351 | 9526 | 6536 | 495286 | 4714 | | 53.9351 | 18091 | 12128 | 995286 | 4714 | | | | | | | 66 | | 60.9016 | 9506 | 6536 | 495285 | 4715 | 62 | 60.9016 | 18071 | 12128 | 995285 | 4715 | | | | | | | 67 | 68 | Run stage 69 | --------- 70 | 71 | on each run in the QC pipeline stage 72 | 73 | get bcl reader 74 | for each lane 75 | for each tile 76 | for each cluster_index in file 77 | get central read (nuc_seq, pass) 78 | get 6,12,18 surrounding reads (nuc_seq, pass) 79 | check_edit_distance (LD<=2 -> duplicate read) 80 | calculate %duplicates 81 | 82 | 83 | -------------------------------------------------------------------------------- /prepare_cluster_indexes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | 4 | input: sample_size n, sequencer_type (hiseq4000, hiseqx), levels (max 3) 5 | return: dictionary of surrounding cluster indexes for n randomly selected wells 6 | """ 7 | __AUTHORS__ = ['Judith Risse', 'Tim Booth'] 8 | __VERSION__ = 0.2 9 | 10 | import random 11 | import sys 12 | import struct 13 | import math 14 | # from dump_slocs import yield_coords 15 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 16 | 17 | # maximum pixel distence between wells at a given level, required for edges of the flow cell 18 | # going out past 5 steps doesn't work properly! 19 | MAX_DISTS = [1, 22, 42, 62, 82, 102] 20 | 21 | DEF_SEED = 13 22 | 23 | DEF_SAMPLE_SIZE = 2500 24 | 25 | 26 | def get_random_array(r_max, r_l, seed): 27 | if seed: 28 | random.seed(seed) 29 | ra = random.sample(range(r_max),r_l) 30 | return ra 31 | 32 | def get_distance(x1, y1, x2, y2): 33 | 34 | dist = math.sqrt((x2-x1)**2+(y2-y1)**2) 35 | return dist 36 | 37 | 38 | def get_indexes(cluster_coord, cluster_x, cluster_y, slocs_fh, levels=5): 39 | """ 40 | 41 | :rtype: object 42 | """ 43 | MAX_SEARCH_AREA = 20000 44 | 45 | # I tried replacing this with "[[]] * levels" but that yields a list of N 46 | # references to a single list. Oops. 47 | l_index = [[] for l in range(levels)] 48 | 49 | # reset slocs file handle to position 12 bytes (i.e. after the header)i 50 | # or 5000*8bytes (0 or 5000 lines) before cluster_coord 51 | # TODO max distance for hiseq 4000, need to check for X 52 | offset_coord = max([0, cluster_coord - MAX_SEARCH_AREA]) 53 | offset = offset_coord * 8 + 12 #Byte offset in the file 54 | log(offset) 55 | slocs_fh.seek(offset) 56 | for coords in enumerate(yield_coords(slocs_fh)): 57 | (x, y) = coords[1] 58 | cluster_index = coords[0] + offset_coord 59 | dist = get_distance(cluster_x, cluster_y, x, y) 60 | 61 | for lev in range(levels): # 0,1,2,3,4 62 | if MAX_DISTS[lev] < dist <= MAX_DISTS[lev+1]: 63 | l_index[lev].append(cluster_index) 64 | 65 | #Stop searching when we get over 20000 records away 66 | if cluster_index > cluster_coord + MAX_SEARCH_AREA: 67 | break 68 | 69 | #Ensure we got something at every level 70 | for lev, wells in enumerate(l_index): 71 | if not l_index[lev]: 72 | raise RuntimeError( 73 | "Got no wells for cluster %s at (%s,%s) level %s", 74 | (cluster_coord, 75 | cluster_x, 76 | cluster_y,lev) ) 77 | 78 | return l_index 79 | 80 | 81 | def parse_args(): 82 | """Prepare argparser object. 
New options will be added in this 83 | function first. 84 | """ 85 | description = """This Script creates a list of n random cluster coordinates and 86 | the the indexes of surrounding coordinates with distances 1-3 from a HiSeq 4000 or HiSeqX s.locs file. 87 | """ 88 | 89 | parser = ArgumentParser(description=description, formatter_class=ArgumentDefaultsHelpFormatter) 90 | parser.add_argument("-f", "--slocs", dest="slocs", type=str, required=True, 91 | help="The slocs file to analyse.") 92 | parser.add_argument("-s", "--seed", dest="seed", type=int, default=None, 93 | help="Seed for the random read selection") 94 | parser.add_argument("-n", "--sample_size", dest="sample_size", type=int, default=DEF_SAMPLE_SIZE, 95 | help="number of n random clusters") 96 | 97 | return parser.parse_args() 98 | 99 | def yield_coords(f): 100 | # You must have read the header first 101 | assert f.tell() >= 12 102 | 103 | buf = f.read(8) 104 | clusternum = 0 105 | while len(buf) == 8: 106 | # Each following 8 bytes are a co-ordinate pair as detailed in 107 | # https://broadinstitute.github.io/picard/javadoc/picard/picard/illumina/parser/readers/LocsFileReader.html 108 | # and 109 | # https://www.biostars.org/p/51681/ 110 | t = struct.unpack(' Headings <== comes from. 11 | I could make the code output Wiki markup directly but I don't want to have 12 | that in the scripts, so this is done in the style of old school "perl -p" 13 | regex-powered munging. 14 | """ 15 | 16 | def munge(line): 17 | header_mo = re.search(r"==> [\w/]*?(\d+)targets_lane(\d+)([TB]?)\.txt <==", line) 18 | if header_mo: 19 | return "h3. %s targets per tile on lane %s%s" % header_mo.groups() 20 | 21 | sum_mo = re.match(r"LaneSummary:.*(Tiles:.*)", line) 22 | if sum_mo: 23 | return re.sub(r"\t", r" ", sum_mo.group(1)) 24 | 25 | lev1_mo = re.match("Level: 1\s", line) 26 | if lev1_mo: 27 | headings = [i.split(": ")[0] for i in line.split("\t")] 28 | return "||".join([""] + headings + [""]) + "\n" + fmtline(line) 29 | 30 | levn_mo = re.match("Level:", line) 31 | if levn_mo: 32 | return fmtline(line) 33 | 34 | def fmtline(line): 35 | """Formats a Level:.* line 36 | On line 1, hilights the last box, as that's the figure we care about 37 | """ 38 | levn_mo = re.match("Level: (\d+)", line) 39 | vals = [i.split(": ",1)[1] for i in line.split("\t")] 40 | 41 | if levn_mo.group(1) == '1': 42 | vals[-1] = "{color:red}%s{color}" % vals[-1] 43 | 44 | return "|".join([""] + vals + [""]) 45 | 46 | if __name__ == '__main__': 47 | #pipeline mode 48 | from signal import signal, SIGPIPE, SIG_DFL 49 | signal(SIGPIPE,SIG_DFL) 50 | 51 | for line in (x.rstrip("\n") for x in sys.stdin): 52 | munged = munge(line) 53 | if munged is not None: print(munged) 54 | 55 | -------------------------------------------------------------------------------- /summary_to_wiki2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### NOTE : This script is unlikely to be useful outside of Edinburgh Genomics. ### 3 | 4 | import os, sys, re 5 | 6 | """Like summary_to_wiki.py. Reads the 10000targets_all_lanes.txt summary text file 7 | as generated by the Snakefile and spits out just the fraction of repeats for 8 | each lane, in a 2-column table. 9 | Note the file is actually created by running 'tail' on the individual results 10 | which is where the ==> Headings <== comes from. 
11 | I could make the code output Wiki markup directly but I don't want to have 12 | that in the scripts, so this is done in the style of old-school "perl -p" 13 | regex-powered munging. 14 | The output is going to be added as comments so I need to generate HTML markup. 15 | """ 16 | 17 | lane = "0" 18 | raw_perc_dup = "-" 19 | 20 | def munge(line): 21 | global lane, raw_perc_dup 22 | 23 | sum_mo = re.search(r"(\d+[TB]?)\.txt <==$", line) 24 | if sum_mo: 25 | lane = sum_mo.group(1) 26 | 27 | """ 28 | lev1_mo = re.match("Level: 1\s", line) 29 | if lev1_mo: 30 | #headings = [i.split(": ")[0] for i in line.split("\t")] 31 | vals = [i.split(": ",1)[1] for i in line.split("\t")] 32 | 33 | frac_dup = re.search(r"\((.*)\)", vals[-1]).group(1) 34 | perc_dup = round(float(frac_dup) * 100, len(frac_dup)) 35 | 36 | return trow(lane, "%s %%" % perc_dup) 37 | """ 38 | dup_mo = re.match(r"Overall duplication .*: ([0-9.%]+)", line) 39 | if dup_mo: 40 | raw_perc_dup = dup_mo.group(1).replace("%", " %") 41 | 42 | dup_mo = re.match(r"Picard-equivalent duplication v1: *([0-9.%]+)", line) 43 | if dup_mo: 44 | #Put a space before the % sign 45 | perc_dup = dup_mo.group(1).replace("%", " %") 46 | _rpd = raw_perc_dup 47 | raw_perc_dup = "-" 48 | return trow(lane, _rpd, perc_dup) 49 | 50 | #HTML silliness 51 | def trow(*args): 52 | return "" + ''.join("%s" % a for a in args) + "" 53 | 54 | def h3(*args): 55 | return ''.join("

<h3>%s</h3>\n" % a for a in args)
56 | 
57 | def tstart(*headers):
58 |     return '<table>\n<tr>' + ''.join("<th>%s</th>" % h for h in headers) + "</tr>"
59 | 
60 | def tend():
61 |     return '</table>
' 62 | 63 | if __name__ == '__main__': 64 | #pipeline mode 65 | from signal import signal, SIGPIPE, SIG_DFL 66 | signal(SIGPIPE,SIG_DFL) 67 | 68 | print(h3("Well Duplicates Summary")) 69 | print(tstart("Lane","Est. Duplication","P.E. Scaled")) 70 | 71 | for line in (x.rstrip("\n") for x in sys.stdin): 72 | munged = munge(line) 73 | if munged is not None: 74 | print(munged) 75 | 76 | print(tend()) 77 | -------------------------------------------------------------------------------- /target.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | 3 | from itertools import chain 4 | from collections import defaultdict 5 | 6 | def load_targets(filename, levels=None, limit=None): 7 | """Loads the target coordinates from a CSV file. This function will now infer 8 | the number of levels represented in the file, but you can opt to load just a 9 | subset. 10 | filename: File to open 11 | levels: Number of levels to load inclusive of the centre, 12 | else all levels in the file will be loaded. 13 | """ 14 | 15 | all_targets = AllTargets() 16 | 17 | with open(filename, 'r') as coord_fh: 18 | 19 | #This used to read the file N lines at a time but now it just keeps reading until 20 | #it finds a single number which it assumes must be the centre of a new target. 21 | targ_lines = None 22 | 23 | #Silly way to get all the lines in a file followed by a blank, 24 | #without reading the whole file into a list. 25 | for aline in chain( (x.rstrip() for x in coord_fh), ['']): 26 | 27 | if ',' not in aline: 28 | #We've either hit EOF or the start of the next record. 29 | if targ_lines: 30 | all_targets.add_target([ 31 | [int(x) for x in l.split(',')] for l in targ_lines[:levels] 32 | ]) 33 | if limit and len(all_targets) == limit: 34 | break 35 | 36 | targ_lines = [] 37 | 38 | targ_lines.append(aline) 39 | 40 | return all_targets 41 | 42 | class AllTargets: 43 | """A holder for a bunch of Target objects. Normally produced by a 44 | call to load_targets. 45 | """ 46 | 47 | def __init__(self): 48 | self._target_dict = dict() 49 | 50 | # This will contain [ index : [ target, target, ... ] ] 51 | # So a natural use for a defaultdict of lists 52 | self._reverse_lookup = defaultdict(list) 53 | 54 | self.levels = None 55 | 56 | def __len__(self): 57 | return len(self._target_dict) 58 | 59 | def __iter__(self): 60 | """Iteration yields a list of target objects""" 61 | return self._target_dict.values().__iter__() 62 | 63 | def get_target_by_centre(self, centre): 64 | 65 | return self._target_dict[centre] 66 | 67 | def add_target(self, coords): 68 | 69 | new_target = Target(coords) 70 | 71 | #You shouldn't add the same target twice 72 | assert new_target.get_centre() not in self._target_dict 73 | 74 | #All targets should be the same size 75 | if self.levels is None: 76 | self.levels = new_target.get_levels() 77 | else: 78 | assert self.levels == new_target.get_levels() 79 | 80 | self._target_dict[new_target.get_centre()] = new_target 81 | 82 | #As well as indexing by centre, hash all indices 83 | for idx in new_target.get_indices(): 84 | self._reverse_lookup[idx].append(new_target) 85 | 86 | def get_all_indices(self, level=None): 87 | """Returns all the indices held in all targets. 88 | Optionally limit to level. 
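        For example, get_all_indices(0) returns just the centre index of every
        target, while get_all_indices() with no arguments returns every index seen
        at any level, each listed once.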
89 | """ 90 | if level == 0: 91 | #Do it the quick way 92 | return list(self._target_dict.keys()) 93 | elif level is None: 94 | #Use the reverse lookup dict 95 | return list(self._reverse_lookup.keys()) 96 | else: 97 | #Scan all targets and flatten the list (standard Python-ism) 98 | return [ y for x in self for y in x.get_indices(level) ] 99 | 100 | def get_from_index(self, index): 101 | 102 | res = [] 103 | for target in self._reverse_lookup[index]: 104 | res.append( (target, target.get_level_from_index(index)) ) 105 | 106 | return res 107 | 108 | class Target: 109 | def __init__(self, coords): 110 | """Parse the data which is in the format of 111 | [[n],[n,n,...],[n,n,n,n,n,...]] 112 | """ 113 | assert len(coords[0]) == 1, "Centre of target must be a single int, not " + str(coords) 114 | #centre = coords[0][0] 115 | 116 | self.coords = coords 117 | 118 | def get_indices(self, level = None): 119 | 120 | if level is None: 121 | #Flatten the list (standard Python-ism) 122 | return [ y for x in self.coords for y in x ] 123 | else: 124 | return self.coords[level] 125 | 126 | def get_centre(self): 127 | 128 | return self.get_indices(0)[0] 129 | 130 | def get_levels(self): 131 | """Get the target size 132 | """ 133 | return len(self.coords) 134 | 135 | def get_level_from_index(self, index): 136 | 137 | for lev, arr in enumerate(self.coords): 138 | if index in arr: 139 | return lev 140 | 141 | #Do we want this?? 142 | #raise Exception("No such index") 143 | # or this? 144 | return None 145 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghGenomics/well_duplicates/7d1a68dd359edf3c09937ca00a32ecd0b2549f58/test/__init__.py -------------------------------------------------------------------------------- /test/bad1.list: -------------------------------------------------------------------------------- 1 | 1998850 2 | 1997278,1997279,1998849,1998851,2000420,2000421 3 | 1995707,1995708,1995709,1997277,1997280,1998848,1998852,2000419,2000422,2001991,2001992,2001993 4 | 1994135,1994136,1994137,1994138,1995706,1995710,1997276,1997281,1998847,1998853,2000418,2000423,2001990,2001994,2003561,2003562,2003563,2003564 5 | 3178500 6 | 3176929,3176930,3178499,3178501,3180071,3180072 7 | 3175357,3175358,3175359,3176928,3176931,3178498,3178502,3180070,3180073,3181641,3181642,3181643 8 | 3173786,3173787,3173788,3173789,3175356,3175360,3176927,3176932,3178497,3178503,3180069,3180074,3181640,3181644,3183212,3183213,3183214,3183215 9 | 1605639 10 | 1604067,1604068,1605638,1605640,1607209,1607210 11 | 1602496,1602497,1602498,1604066,1604069,1605637,1605641,1607208,1607211,1608780,1608781,1608782 12 | 1600924,1600925,1600926,1600927,1602495,1602499,1604065,1604070,1605636,1605642,1607207,1607212,1608779,1608783,1610350,1610351,1610352,1610353 13 | 1033607 14 | 1032036,1032037,1033606,1033608,1035178,1035179 15 | 1030464,1030465,1030466,1032035,1032038,1033605,1033609,1035177,1035180,1036748,1036749,1036750 16 | 1028893,1028894,1028895,1028896,1030463,1030467,1032034,1032039,1033604,1033610,1035176,1035181,1036747,1036751,1038319,1038320,1038321,1038322 17 | 1033610 18 | 1032039,1032040,1033609,1033611,1035181,1035182 19 | 1030467,1030468,1030469,1032038,1032041,1033608,1033612,1035180,1035183,1036751,1036752,1036753 20 | 
1028896,1028897,1028898,1028899,1030466,1030470,1032037,1032042,1033607,1033613,1035179,1035184,1036750,1036754,1038322,1038323,1038324,1038325 21 | 1032036 22 | 1030464,1030465,1032035,1032037,1033606,1033607 23 | 1028893,1028894,1028895,1030463,1030466,1032034,1032038,1033605,1033608,1035177,1035178,1035179 24 | 1027321,1027322,1027323,1027324,1028892,1028896,1030462,1030467,1032033,1032039,1033604,1033609,1035176,1035180,1036747,1036748,1036749,1036750 25 | 196654 26 | 195083,195084,196653,196655,198225,198226 27 | 193511,193512,193513,195082,195085,196652,196656,198224,198227,199795,199796,199797 28 | 191940,191941,191942,191943,193510,193514,195081,195086,196651,196657,198223,198228,199794,199798,201366,201367,201368,201369 29 | 30 | -------------------------------------------------------------------------------- /test/bad2.list: -------------------------------------------------------------------------------- 1 | 1998850 2 | 1997278,1997279,1998849,1998851,2000420,2000421 3 | 1995707,1995708,1995709,1997277,1997280,1998848,1998852,2000419,2000422,2001991,2001992,2001993 4 | 1994135,1994136,1994137,1994138,1995706,1995710,1997276,1997281,1998847,1998853,2000418,2000423,2001990,2001994,2003561,2003562,2003563,2003564 5 | 3178500 6 | 3176929,3176930,3178499,3178501,3180071,3180072 7 | 3175357,3175358,3175359,3176928,3176931,3178498,3178502,3180070,3180073,3181641,3181642,3181643 8 | 3173786,3173787,3173788,3173789,3175356,3175360,3176927,3176932,3178497,3178503,3180069,3180074,3181640,3181644,3183212,3183213,3183214,3183215 9 | 1605639 10 | 1604067,1604068,1605638,1605640,1607209,1607210 11 | 1602496,1602497,1602498,1604066,1604069,1605637,1605641,1607208,1607211,1608780,1608781,1608782 12 | 1600924,1600925,1600926,1600927,1602495,1602499,1604065,1604070,1605636,1605642,1607207,1607212,1608779,1608783,1610350,1610351,1610352,1610353 13 | 1033607 14 | 1032036,1032037,1033606,1033608,1035178,1035179 15 | 1030464,1030465,1030466,1032035,1032038,1033605,1033609,1035177,1035180,1036748,1036749,1036750 16 | 1028893,1028894,1028895,1028896,1030463,1030467,1032034,1032039,1033604,1033610,1035176,1035181,1036747,1036751,1038319,1038320,1038321,1038322 17 | 1033610 18 | 1032039,1032040,1033609,1033611,1035181,1035182 19 | 1030467,1030468,1030469,1032038,1032041,1033608,1033612,1035180,1035183,1036751,1036752,1036753 20 | 1028896,1028897,1028898,1028899,1030466,1030470,1032037,1032042,1033607,1033613,1035179,1035184,1036750,1036754,1038322,1038323,1038324,1038325 21 | 1032036 22 | 1030464,1030465,1032035,1032037,1033606,1033607 23 | 1028893,1028894,1028895,1030463,1030466,1032034,1032038,1033605,1033608,1035177,1035178,1035179 24 | 1027321,1027322,1027323,1027324,1028892,1028896,1030462,1030467,1032033,1032039,1033604,1033609,1035176,1035180,1036747,1036748,1036749,1036750 25 | 196654 26 | 195083,195084,196653,196655,198225,198226 27 | 193511,193512,193513,195082,195085,196652,196656,198224,198227,199795,199796,199797 28 | -------------------------------------------------------------------------------- /test/old/test_bcl_direct_reader.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import print_function, division, absolute_import 3 | 4 | import sys 5 | import unittest 6 | import time 7 | import random 8 | 9 | try: 10 | from bcl_direct_reader import BCLReader 11 | except: 12 | #If this fails, you is probably running the tests wrongly 13 | print("****", 14 | "You want to run these tests by using:", 15 | " python -m unittest 
test.test_bcl_direct_reader", 16 | "or even", 17 | " python -m unittest discover", 18 | "****", 19 | sep="\n") 20 | raise 21 | 22 | # Note that these tests are dependent on specific files being present in 23 | # /ifs/seqdata/150715_K00169_0016_BH3FGFBBXX. 24 | 25 | # Update on 3/5/2016 - these files have gone, so these tests will never 26 | # work again. 27 | 28 | TEST_PROJ = '/ifs/seqdata/150715_K00169_0016_BH3FGFBBXX' 29 | 30 | class TestBCLReader(unittest.TestCase): 31 | 32 | def test_invalid_project(self): 33 | #What if we open the wrong folder? 34 | self.assertRaises(OSError, BCLReader, '/not/a/folder') 35 | 36 | #Or point to the Data subfolder in a valid project? 37 | self.assertRaises(OSError, BCLReader, TEST_PROJ + '/Data') 38 | 39 | def test_open_project(self): 40 | 41 | proj = BCLReader(TEST_PROJ) 42 | 43 | #For our test project we should see 8 lanes 44 | self.assertEqual(len(proj.lanes), 8) 45 | 46 | def test_get_seq(self): 47 | """Tests fetching a single sequence 48 | """ 49 | 50 | #For this project, we can look at /ifs/seqarchive/150715_K00169_0016_BH3FGFBBXX/150715_K00169_0016_BH3FGFBBXX_1_1.sanfastq.gz 51 | #and see that the first sequence is: 52 | # @K00169:16:H3FGFBBXX:1:1101:1162:1791 1:N:0:ANNTCA 53 | # AATAGTCAGGTTAAATTTAATGTGACNNNTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGANTNAGNNNNN... 54 | 55 | # So the cluster co-ordinates are (1162:1791). Running 56 | # $ dump_slocs.py s.locs | grep '1162:1791' 57 | # Gives us: 58 | # 0070657 1162:1791 (counting clusters from 0, not 1) 59 | 60 | known_seq = 'AATAGTCAGGTTAAATTTAATGTGACNNNTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGANTNAGNNNNN' 61 | 62 | proj = BCLReader(TEST_PROJ) 63 | 64 | #For speed, just get the first 30 bases 65 | got_seq, got_accept = proj.get_seq(1, 1101, 70657, end=30) 66 | 67 | self.assertEqual(got_seq, known_seq[0:30]) 68 | self.assertEqual(got_accept, True) 69 | 70 | #And test the start/end by looking at bases 20-40 71 | got_seq, got_accept = proj.get_seq(1, 1101, 70657, start=20, end=40) 72 | 73 | self.assertEqual(got_seq, known_seq[20:40]) 74 | self.assertEqual(got_accept, True) 75 | 76 | #This takes about a minute the first time, and about 30 seconds on subsequent 77 | #reads from the same tile. Comment out the next line to try it. 78 | @unittest.skip("slow test") 79 | def test_worst_case_speed(self): 80 | 81 | #The slowest single read is the full sequence of the very last cluster 82 | #on a tile, which on our test data is 4309650-1 83 | 84 | proj = BCLReader(TEST_PROJ) 85 | 86 | start_time = time.time() 87 | 88 | tile = proj.get_tile(1, 1101) 89 | slow_seq, _ = tile.get_seqs([4309649])[4309649] 90 | 91 | self.assertEqual(len(slow_seq), tile.num_cycles) 92 | 93 | print("\n*** Slow read took %.2f seconds." % (time.time() - start_time)) 94 | 95 | @unittest.skip("slow test") 96 | def test_multi_read_speed(self): 97 | 98 | # Due to the way I read the data, getting bases from many spots should 99 | # be fairly efficient. 100 | proj = BCLReader(TEST_PROJ) 101 | tile = proj.get_tile(1, 1101) 102 | 103 | # How to generate some consistent lists? 104 | fetchlists = {} 105 | sample_sizes = [1,10,100,1000,10000,100000] 106 | 107 | start_time = time.time() 108 | for count in sample_sizes: 109 | random.seed(0) 110 | fetchlists[count] = random.sample(range(4309650), count) 111 | print("\n*** Making the number lists took %.2f seconds." 
% (time.time() - start_time)) 112 | 113 | #Now see about fetching 114 | for count in sample_sizes: 115 | start_time = time.time() 116 | res = tile.get_seqs(fetchlists[count], start=20, end=40) 117 | print("\n*** Fetching 20 bases from %i sequences took %.2f seconds." % 118 | (count, (time.time() - start_time)) 119 | ) 120 | 121 | #In my mind, fetching the first 10000 seqs should be faster than fetching just the last 122 | #in the file, but maybe not... 123 | start_time = time.time() 124 | res = tile.get_seqs([4309649], start=20, end=40) 125 | print("\n*** Fetching 20 bases from sequence 4309649 took %.2f seconds." % (time.time() - start_time)) 126 | 127 | start_time = time.time() 128 | res = tile.get_seqs(range(100000), start=20, end=40) 129 | print("\n*** Fetching 20 bases from the first 100000 seqs took %.2f seconds." % (time.time() - start_time)) 130 | 131 | def test_invalid_get_seqs(self): 132 | 133 | #What if I ask for a sequence that's out of range? 134 | proj = BCLReader(TEST_PROJ) 135 | 136 | self.assertRaises(IndexError, proj.get_seq, 1, 1101, 5000000) 137 | 138 | def test_multiple_get_seqs(self): 139 | """Tests batch sequence fetching, and also tests the accept/reject flag 140 | """ 141 | 142 | #The library was designed to extract multiple sequences at once, so test this. 143 | #Also test that the accept/reject flag is being got correctly. Again I'll use 144 | #sequences from the file above, tile 1101 145 | good_seqs = ( ( 70657 , 'AATAGTCAGG' ), # at 1162:1791 146 | ( 70658 , 'TGTGGCATTT' ), # at 1182:1791 147 | ( 70660 , 'TCAGAATCAG' ), # at 1223:1791 148 | ( 89638 , 'TTGCTTATCA' ), # at 4024:2002 (line 50001 in the .sanfasq) 149 | ( 89639 , 'GCCTTATGGC' ), # at 4045:2002 150 | ( 89641 , 'TACTGAGAAG' )) # at 4085:2002 151 | 152 | #I can't find these in the FASTQ so assumed they were rejected reads 153 | #The sequences were obtained from running bcl_direct_reader.py so unlike 154 | #the good_seqs above are only valid for regression testing. 
155 | bad_seqs = ( ( 70659, 'TTGGACGAGG' ), 156 | ( 89640, 'CCCNNNNNNN' ), 157 | ( 0, 'NNNNNNNNNN' ), 158 | ( 1, 'NNNNNNNNNN' ), 159 | ( 2, 'NNNNNNNNNN' )) 160 | 161 | #Synthesize the expected result: 162 | expected_result = { clus[0] : ( clus[1], True ) 163 | for clus in good_seqs } 164 | 165 | expected_result.update( { clus[0] : (clus[1], False) 166 | for clus in bad_seqs } ) 167 | 168 | #print(expected_result) 169 | 170 | proj = BCLReader(TEST_PROJ) 171 | tile = proj.get_tile(1, 1101) 172 | 173 | result = tile.get_seqs([clus[0] for clus in good_seqs + bad_seqs], end=10) 174 | 175 | self.assertEqual(result, expected_result) 176 | 177 | #TODO - if I implement any caching then test that I get the same result on a second call 178 | 179 | -------------------------------------------------------------------------------- /test/small.list: -------------------------------------------------------------------------------- 1 | 1998850 2 | 1997278,1997279,1998849,1998851,2000420,2000421 3 | 1995707,1995708,1995709,1997277,1997280,1998848,1998852,2000419,2000422,2001991,2001992,2001993 4 | 1994135,1994136,1994137,1994138,1995706,1995710,1997276,1997281,1998847,1998853,2000418,2000423,2001990,2001994,2003561,2003562,2003563,2003564 5 | 3178500 6 | 3176929,3176930,3178499,3178501,3180071,3180072 7 | 3175357,3175358,3175359,3176928,3176931,3178498,3178502,3180070,3180073,3181641,3181642,3181643 8 | 3173786,3173787,3173788,3173789,3175356,3175360,3176927,3176932,3178497,3178503,3180069,3180074,3181640,3181644,3183212,3183213,3183214,3183215 9 | 1605639 10 | 1604067,1604068,1605638,1605640,1607209,1607210 11 | 1602496,1602497,1602498,1604066,1604069,1605637,1605641,1607208,1607211,1608780,1608781,1608782 12 | 1600924,1600925,1600926,1600927,1602495,1602499,1604065,1604070,1605636,1605642,1607207,1607212,1608779,1608783,1610350,1610351,1610352,1610353 13 | 1033607 14 | 1032036,1032037,1033606,1033608,1035178,1035179 15 | 1030464,1030465,1030466,1032035,1032038,1033605,1033609,1035177,1035180,1036748,1036749,1036750 16 | 1028893,1028894,1028895,1028896,1030463,1030467,1032034,1032039,1033604,1033610,1035176,1035181,1036747,1036751,1038319,1038320,1038321,1038322 17 | 1033610 18 | 1032039,1032040,1033609,1033611,1035181,1035182 19 | 1030467,1030468,1030469,1032038,1032041,1033608,1033612,1035180,1035183,1036751,1036752,1036753 20 | 1028896,1028897,1028898,1028899,1030466,1030470,1032037,1032042,1033607,1033613,1035179,1035184,1036750,1036754,1038322,1038323,1038324,1038325 21 | 1032036 22 | 1030464,1030465,1032035,1032037,1033606,1033607 23 | 1028893,1028894,1028895,1030463,1030466,1032034,1032038,1033605,1033608,1035177,1035178,1035179 24 | 1027321,1027322,1027323,1027324,1028892,1028896,1030462,1030467,1032033,1032039,1033604,1033609,1035176,1035180,1036747,1036748,1036749,1036750 25 | 196654 26 | 195083,195084,196653,196655,198225,198226 27 | 193511,193512,193513,195082,195085,196652,196656,198224,198227,199795,199796,199797 28 | 191940,191941,191942,191943,193510,193514,195081,195086,196651,196657,198223,198228,199794,199798,201366,201367,201368,201369 29 | -------------------------------------------------------------------------------- /test/test_count_well_duplicates.py: -------------------------------------------------------------------------------- 1 | #!/urs/bin/env python3 2 | 3 | import sys 4 | import re 5 | import io 6 | import unittest 7 | from unittest.mock import patch 8 | 9 | try: 10 | from count_well_duplicates import output_writer, TALLY, LENGTH 11 | except: 12 | #If this fails, you is 
probably running the tests wrongly 13 | print("****", 14 | "You want to run these tests from the top-level source folder by using:", 15 | " python3 -m unittest test.test_count_well_duplicates", 16 | "or even", 17 | " python3 -m unittest discover", 18 | "****", 19 | sep="\n") 20 | raise 21 | 22 | #Ensure out assumptions about constants are right 23 | assert TALLY == 0 24 | assert LENGTH == 1 25 | 26 | # dups_found = TILE_DUPL[tile][target][level][TALLY] 27 | # targets_inspected = TILE_DUPL[tile][target][level][LENGTH] 28 | 29 | # erm erm erm 30 | # A lane has ~100 tiles. A tile has maybe 2500 targets. 31 | # A target has 5 levels (not including the centre). 32 | # But here we have a lane with 1 tile, and the tile has 4 targets, 33 | # and the target has 3 levels. 34 | # See notebook sketch, which I will try to add add as a PNG. 35 | 36 | # Each col = 1 level of target. We're not recording lev 0! 37 | LANE_DUPL = {'1208' : [ 38 | # Level 1 T/L 2 T/L 3 T/L 39 | [ ( 0, 6), ( 0,12), ( 0,18) ], #<-- 1 row = 1 target of this tile 40 | [ ( 2, 6), ( 1,10), ( 0,12) ], 41 | [ ( 3, 6), ( 1,10), ( 1,12) ], 42 | [ ( 0, 6), ( 1,12), ( 0,18) ] ] } 43 | 44 | EXPECTED_OUT_1 = """ 45 | Lane: 1 Tile: 1208 Targets: 4/4 46 | Level: 1 Wells: 24 Dups: 5 Hit: 2 AccO: 2 AccI: 3 47 | Level: 2 Wells: 44 Dups: 3 Hit: 3 AccO: 3 AccI: 3 48 | Level: 3 Wells: 60 Dups: 1 Hit: 1 AccO: 3 AccI: 1 49 | LaneSummary: 1 Tiles: 1 Targets: 4/4 50 | Level: 1 Wells: 24 Dups: 5 (0.20833) Hit: 2 (0.50000) AccO: 2 (0.50000) AccI: 3 (0.75000) 51 | Level: 2 Wells: 44 Dups: 3 (0.06818) Hit: 3 (0.75000) AccO: 3 (0.75000) AccI: 3 (0.75000) 52 | Level: 3 Wells: 60 Dups: 1 (0.01667) Hit: 1 (0.25000) AccO: 3 (0.75000) AccI: 1 (0.25000) 53 | """ 54 | 55 | BAD_TILE_LANE = {'1209': [ ] } #No valid targets. 56 | BAD_TILE_LANE.update(LANE_DUPL) 57 | 58 | # If we add the bad tile, what do we get? 59 | 60 | EXPECTED_OUT_2 = """ 61 | Lane: 1 Tile: 1208 Targets: 4/4 62 | Level: 1 Wells: 24 Dups: 5 Hit: 2 AccO: 2 AccI: 3 63 | Level: 2 Wells: 44 Dups: 3 Hit: 3 AccO: 3 AccI: 3 64 | Level: 3 Wells: 60 Dups: 1 Hit: 1 AccO: 3 AccI: 1 65 | Lane: 1 Tile: 1209 Targets: 0/4 66 | Level: 1 Wells: 0 Dups: 0 Hit: 0 AccO: 0 AccI: 0 67 | Level: 2 Wells: 0 Dups: 0 Hit: 0 AccO: 0 AccI: 0 68 | Level: 3 Wells: 0 Dups: 0 Hit: 0 AccO: 0 AccI: 0 69 | LaneSummary: 1 Tiles: 2 Targets: 4/8 70 | Level: 1 Wells: 24 Dups: 5 (0.20833) Hit: 2 (0.50000) AccO: 2 (0.50000) AccI: 3 (0.75000) 71 | Level: 2 Wells: 44 Dups: 3 (0.06818) Hit: 3 (0.75000) AccO: 3 (0.75000) AccI: 3 (0.75000) 72 | Level: 3 Wells: 60 Dups: 1 (0.01667) Hit: 1 (0.25000) AccO: 3 (0.75000) AccI: 1 (0.25000) 73 | """ 74 | 75 | # If we just ask for 2 levels? 76 | # FIXME - my test dataset doesn't capture the case where AccI would be different at 77 | # level 2 if there were a dupe at level 3 only. 78 | EXPECTED_OUT_3 = """ 79 | Lane: 1 Tile: 1208 Targets: 4/4 80 | Level: 1 Wells: 24 Dups: 5 Hit: 2 AccO: 2 AccI: 3 81 | Level: 2 Wells: 44 Dups: 3 Hit: 3 AccO: 3 AccI: 3 82 | LaneSummary: 1 Tiles: 1 Targets: 4/4 83 | Level: 1 Wells: 24 Dups: 5 (0.20833) Hit: 2 (0.50000) AccO: 2 (0.50000) AccI: 3 (0.75000) 84 | Level: 2 Wells: 44 Dups: 3 (0.06818) Hit: 3 (0.75000) AccO: 3 (0.75000) AccI: 3 (0.75000) 85 | """ 86 | 87 | # Empty output when the lane is totally bad and no targets are read at all. 88 | EXPECTED_OUT_4 = """ 89 | Lane: 1 Tile: 1222 Targets: 0/4 90 | LaneSummary: 1 Tiles: 1 Targets: 0/4 91 | """ 92 | 93 | class TestCountWellDuplicates(unittest.TestCase): 94 | 95 | #Capture sys.stdout - standard Mock procedure. 
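    #Each @patch below swaps sys.stdout for a fresh io.StringIO and passes it in as
    #mock_stdout, so everything output_writer() prints can be read back with
    #getvalue() and compared against the EXPECTED_OUT_* strings by _rescmp().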
96 | @patch('sys.stdout', new_callable=io.StringIO) 97 | def test_output_writer_1lane_full(self, mock_stdout): 98 | 99 | output_writer(1, 4, LANE_DUPL, verbose=1) 100 | 101 | self._rescmp(mock_stdout, EXPECTED_OUT_1) 102 | 103 | @patch('sys.stdout', new_callable=io.StringIO) 104 | def test_output_writer_badlane_full(self, mock_stdout): 105 | 106 | output_writer(1, 4, BAD_TILE_LANE, verbose=1) 107 | 108 | self._rescmp(mock_stdout, EXPECTED_OUT_2) 109 | 110 | @patch('sys.stdout', new_callable=io.StringIO) 111 | def test_output_writer_badlane_brief(self, mock_stdout): 112 | 113 | output_writer(1, 4, BAD_TILE_LANE, verbose=0) 114 | 115 | #Basically we expect to see just the last 4 lines 116 | self._rescmp(mock_stdout, EXPECTED_OUT_2, -4) 117 | 118 | @patch('sys.stdout', new_callable=io.StringIO) 119 | def test_output_writer_limited_levels(self, mock_stdout): 120 | 121 | output_writer(1, 4, LANE_DUPL, verbose=1, levels=2) 122 | 123 | self._rescmp(mock_stdout, EXPECTED_OUT_3) 124 | 125 | @patch('sys.stdout', new_callable=io.StringIO) 126 | def test_output_writer_empty_data(self, mock_stdout): 127 | 128 | #Note if you try to specify levels you'll get a div by zero 129 | #error, but otherwise you'll just get a blank result. 130 | output_writer(1, 4, {'1222': [ ] }, verbose=1) 131 | 132 | self._rescmp(mock_stdout, EXPECTED_OUT_4) 133 | 134 | def _rescmp(self, ioobj, astring, start=0, end=None): 135 | """This just helps you to compare the thing that got printed 136 | out with the string that holds the expected result. 137 | """ 138 | 139 | #The thing that got printed... 140 | lines1 = ioobj.getvalue().rstrip("\n").split("\n") 141 | 142 | #The string we expected, ignoring leading newline 143 | #and swapping consecutive spaces for tabs. 144 | lines2 = [ re.sub('\s\s+', '\t', s) for s in astring.lstrip().rstrip("\n").split("\n") ] 145 | lines2 = lines2[start:end] 146 | 147 | #And now we can compare! 148 | self.assertEqual(lines1, lines2) 149 | -------------------------------------------------------------------------------- /test/test_target.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from __future__ import print_function, division, absolute_import 3 | 4 | import sys 5 | import unittest 6 | import time 7 | 8 | try: 9 | # Adding this to sys.path helps the test work if you just run it directly. 
10 | sys.path.insert(0,'.') 11 | from target import load_targets 12 | except: 13 | #If this fails, you is probably running the tests wrongly 14 | print("****", 15 | "You want to run these tests from the top-level source folder by using:", 16 | " python -m unittest test.test_target", 17 | "or even", 18 | " python -m unittest discover", 19 | "****", 20 | sep="\n") 21 | raise 22 | 23 | # These tests use a subset of the file hiseq_4000_10000clusters.list 24 | 25 | TEST_FILE = 'test/small.list' 26 | BAD_FILE_1 = 'test/bad1.list' 27 | BAD_FILE_2 = 'test/bad2.list' 28 | 29 | class TestTargetReader(unittest.TestCase): 30 | 31 | all_targets = None 32 | 33 | #Load the targets each time as the file is so small 34 | def setUp(self): 35 | self.all_targets = load_targets(TEST_FILE) 36 | 37 | def test_load_subset(self): 38 | sub_targets = load_targets(TEST_FILE, levels=2) 39 | 40 | self.assertEqual(sub_targets.levels, 2) 41 | 42 | def test_load_limit(self): 43 | lim_targets = load_targets(TEST_FILE, limit=2) 44 | 45 | self.assertEqual(len(lim_targets), 2) 46 | 47 | #Test iteration over all targets 48 | count = 0 49 | for targ in lim_targets: 50 | count +=1 51 | 52 | self.assertEqual(count, 2) 53 | 54 | def test_get_all_indices(self): 55 | #Use the first two targets, as above 56 | lim_targets = load_targets(TEST_FILE, levels=3, limit=2) 57 | 58 | self.assertEqual( 59 | set(lim_targets.get_all_indices(0)), 60 | set((1998850, 3178500)) 61 | ) 62 | 63 | self.assertEqual( 64 | set(lim_targets.get_all_indices(1)), 65 | set(map(int,( 66 | "1997278,1997279,1998849,1998851,2000420,2000421," + 67 | "3176929,3176930,3178499,3178501,3180071,3180072" 68 | ).split(","))) 69 | ) 70 | 71 | self.assertEqual( 72 | set(lim_targets.get_all_indices(None)), 73 | set(map(int,( 74 | "1998850,1997278,1997279,1998849,1998851,2000420," + 75 | "2000421,1995707,1995708,1995709,1997277,1997280," + 76 | "1998848,1998852,2000419,2000422,2001991,2001992," + 77 | "2001993,3178500,3176929,3176930,3178499,3178501," + 78 | "3180071,3180072,3175357,3175358,3175359,3176928," + 79 | "3176931,3178498,3178502,3180070,3180073,3181641," + 80 | "3181642,3181643" 81 | ).split(","))) 82 | ) 83 | 84 | def test_load_badfile(self): 85 | 86 | #bad1 has a blank line at the end 87 | self.assertRaises(ValueError, load_targets, BAD_FILE_1) 88 | 89 | #bad2 has the last line missing 90 | self.assertRaises(AssertionError, load_targets, BAD_FILE_2) 91 | 92 | def test_num_levels(self): 93 | all_targets = self.all_targets 94 | 95 | #I know there are 4 levels in the test file 96 | self.assertEqual(all_targets.levels, 4) 97 | 98 | self.assertEqual(all_targets.get_target_by_centre(196654).get_levels(), 4) 99 | 100 | def test_num_targets(self): 101 | all_targets = self.all_targets 102 | 103 | #There should be 7 of them 104 | self.assertEqual(len(all_targets), 7) 105 | 106 | #Ditto if we do it this way 107 | self.assertEqual(len(set(all_targets.get_all_indices(0))), 7) 108 | 109 | def test_lookups(self): 110 | all_targets = self.all_targets 111 | 112 | #If I look up point 196654 it should be the centre of cluster 6 113 | res = all_targets.get_from_index(196654) 114 | 115 | target_for_196654 = res[0][0] 116 | self.assertEqual(res, [ ( target_for_196654, 0 ) ]) 117 | 118 | self.assertEqual(target_for_196654.get_centre(), 196654) 119 | 120 | res2 = target_for_196654.get_indices(1) 121 | self.assertEqual(res2, list(map(int,'195083,195084,196653,196655,198225,198226'.split(',')))) 122 | 123 | #Getting all indices should be the same as getting them by level 124 | 
gathered_indices = set() 125 | for lev in range(4): 126 | gathered_indices.update(target_for_196654.get_indices(lev)) 127 | self.assertEqual(gathered_indices, set(target_for_196654.get_indices())) 128 | 129 | def test_multiple_appearances(self): 130 | all_targets = self.all_targets 131 | 132 | #The file should contain exactly 213 distinct spots, as revealed by 133 | #sed 's/,/\n/g' test/small.list | sort -u | wc 134 | self.assertEqual(len(set(all_targets.get_all_indices())), 213) 135 | 136 | #Point 1030466 should pop up twice in rank 2 and once in rank 3 137 | res = all_targets.get_from_index(1030466) 138 | 139 | self.assertEqual(len(res), 3) 140 | 141 | self.assertEqual(sorted([ x[1] for x in res ]), [2,2,3]) 142 | 143 | def test_bad_add(self): 144 | all_targets = self.all_targets 145 | 146 | self.assertRaises(Exception, all_targets.add_target, [(1, 2), (3, 4)]) 147 | 148 | #This should complain about the number of levels 149 | self.assertRaises(AssertionError, all_targets.add_target, [(111,), (112, 113, 114, 115)]) 150 | 151 | #This should work, but only once 152 | sub_targets = load_targets(TEST_FILE, 2) 153 | sub_targets.add_target( [(111,), (112, 113, 114, 115)] ) 154 | self.assertRaises(Exception, sub_targets.add_target, [(111,), (112, 113, 114, 115)]) 155 | 156 | if __name__ == '__main__': 157 | unittest.main() 158 | --------------------------------------------------------------------------------