├── .gitignore
├── .gitlab-ci.yml
├── .gitmodules
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── README.md
├── conda
    ├── build.sh
    └── meta.yaml
├── src
    ├── bamindex
    │   ├── build_main.c
    │   ├── dump_main.c
    │   ├── fetch_main.c
    │   ├── index.c
    │   ├── index.h
    │   └── main.c
    ├── bamstats
    │   ├── args.c
    │   ├── args.h
    │   ├── bamiter.c
    │   ├── bamiter.h
    │   ├── main.c
    │   ├── readstats.c
    │   └── readstats.h
    ├── common.c
    ├── common.h
    ├── fastcat
    │   ├── args.c
    │   ├── args.h
    │   ├── main.c
    │   ├── writer.c
    │   └── writer.h
    ├── fastqcomments.c
    ├── fastqcomments.h
    ├── hts_defs.h
    ├── kh_counter.c
    ├── kh_counter.h
    ├── regiter.c
    ├── regiter.h
    ├── stats.c
    ├── stats.h
    ├── version.c
    └── version.h
└── test
    ├── bamindex
        └── 400.bam
    ├── bamstats
        ├── 310dx.bam
        ├── 310dx.bam.bai
        ├── 400ecoli-with-qcfail.bam
        ├── 400ecoli.bam
        ├── 400ecoli.bam.bai
        ├── RCS-100A.bam
        ├── RCS-100A.bam.bai
        └── RCS-100A.bam.polya.hist
    ├── bamstats_badNM
        └── test.sam
    ├── bamstats_zeroNM
        └── test.sam
    ├── data
        ├── bc0.fastq.gz
        ├── bc1.fastq.gz
        ├── bc2.fastq.gz
        ├── bcEmpty.fastq.gz
        ├── bcMangled.fastq.gz
        └── samtoolsfastq.fastq.gz
    ├── fastcat_expected_results
        ├── concat.reheader.sorted.fastq.gz
        ├── concat.sorted.fastq.gz
        ├── per-file-stats.tsv
        └── per-read-stats.tsv
    ├── parse_rd
        ├── RD-first-tag-and-no-RG-CW-4285.fastq
        ├── RD-first-tag-and-no-RG-CW-4285.fastq.runids
        ├── empty-RD-CW-4299.fastq
        └── empty-RD-CW-4299.fastq.runids
    ├── parse_rg
        ├── bad-ones.bam
        ├── bad-ones.bam.bai
        ├── bad-ones.bam.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam
        ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.bai
        ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz
        ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.bai
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.bai
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.bai
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.callers
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz
        ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz.callers
        ├── mixed.bam
        ├── mixed.bam.callers
        ├── mixed.bam.fastq.gz
        ├── mixed.bam.fastq.gz.callers
        ├── mixed_basecaller_model_key.fastq.gz
        └── mixed_basecaller_model_key.fastq.gz.callers
    ├── rg_parse.c
    ├── sam2fastq
        ├── wf_basecalling_demo.fastq
        └── wf_basecalling_demo.sam
    └── sort-sam.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | src/**/*.o
4 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | include:
 2 |     - project: "epi2melabs/ci-templates"
 3 |       file: "push-github.yaml"
 4 |     - project: "epi2melabs/ci-templates"
 5 |       file: "push-conda.yaml"
 6 |     - project: "epi2melabs/ci-templates"
 7 |       file: "snippets.yaml"
 8 | 
 9 | image: ${UBUNTUIMAGE}:20.04
10 | 
11 | variables:
12 |     GIT_SUBMODULE_STRATEGY: recursive
13 | 
14 | 
15 | .prep-image: &prep-image |  # shared apt bootstrap, aliased into before_script
16 |     export DEBIAN_FRONTEND=noninteractive
17 |     apt update -qq
18 |     apt install -y --no-install-recommends gcc autoconf automake valgrind make curl wget zlib1g-dev libbz2-dev libreadline-dev libssl-dev libffi-dev liblzma-dev libcurl4-gnutls-dev
19 | 
20 | stages:
21 |     - test
22 |     - prerelease
23 |     - release
24 | 
25 | build:
26 |     stage: test
27 |     before_script:
28 |         - *prep-image
29 |     artifacts:
30 |         when: always  # keep test outputs even when the job fails
31 |         paths:
32 |             - test/*
33 |     script:
34 |         - make
35 |         - ./fastcat --help
36 |         - ./bamstats --help
37 |         - ./bamindex build --help
38 |         - PEPPER=1 make test  # PEPPER=1 makes the Makefile run tests under valgrind
39 | 
40 | deploy-checks:
41 |     stage: prerelease
42 |     script:
43 |         - !reference [.check, argp-c-version]
44 |         - !reference [.check, changelog]
45 |         - export LICENSE_FILE="BSD-4-CLAUSE"
46 |         - !reference [.check, license]
47 |     rules:
48 |         - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/'
49 | 
50 | 
51 | .before-script: &before-script |
52 |     export CONDA_PKG=${CI_PROJECT_NAME}
53 |     export CONDA_PKG_VERSION=${CI_COMMIT_TAG/v/}
54 |     cd conda
55 | 
56 | conda:
57 |     extends: .deploy-conda
58 |     before_script:
59 |         - *prep-image
60 |         - *before-script
61 | 
62 | conda-arm:
63 |     extends: .deploy-conda-linux-arm
64 |     before_script:
65 |         - *prep-image
66 |         - *before-script
67 | 
68 | conda-mac:
69 |     extends: .deploy-conda-mac
70 |     before_script:
71 |         - *before-script
72 | 
73 | conda-mac-arm:
74 |     extends: .deploy-conda-mac-arm
75 |     before_script:
76 |         - *before-script
77 | 
78 | test-conda-arm:
79 |     stage: test
80 |     extends: .deploy-conda-linux-arm
81 |     variables:
82 |         UPLOAD: "no"
83 |     before_script:
84 |         - *prep-image
85 |         - *before-script
86 |     rules:
87 |         - if: $CI_COMMIT_BRANCH != null
88 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "htslib"]
2 | 	path = htslib
3 | 	url = https://github.com/samtools/htslib.git
4 | [submodule "zlib-ng"]
5 | 	path = zlib-ng
6 | 	url = https://github.com/zlib-ng/zlib-ng.git
7 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | All notable changes to this project will be documented in this file.
  3 | 
  4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
  6 | 
  7 | ## [v0.22.0]
  8 | ### Changed
  9 | - Bumped htslib to 1.21.
 10 | ### Added
 11 | - `fastcat` now has a verbose option. Logging of filepaths processed is suppressed without `--verbose`.
 12 | 
 13 | ## [v0.21.0]
 14 | ### Added
 15 | - `fastcat` can now optionally output BAM files, carrying across meta information into tags.
 16 | 
 17 | ## [v0.20.0]
 18 | ### Changed
 19 | - Tightened up checks on provided regions.
 20 | - Histogram files were not written when input was empty; they now are, restoring previous behaviour.
 21 | ### Fixed
 22 | - Read mean quality was not recomputed when qs tag was not present in BAM.
 23 | 
 24 | ## [v0.19.1]
 25 | ### Fixed
 26 | - Compilation on macOS with clang.
 27 | - Segmentation fault with bad read group information.
 28 | 
 29 | ## [v0.19.0]
 30 | ### Added
 31 | - Regions can now be provided in a three (or more) column BED file to `bamstats` to calculate per-region statistics.
 32 | ### Fixed
 33 | - A segmentation fault whilst writing statistics files when demultiplexing is enabled.
 34 | - A memory leak occurring in `bamstats` when qcfail reads are encountered in the input.
 35 | ### Changed
 36 | - Retrieve mean quality score 'qs' tag from BAM record, rather than recomputing. Previous behaviour can be restored with the `--recalc_qual` option.
 37 | - Get basecall model version id from `model_version_id` in FASTQ header (in addition to `basecaller_model_version_id`)
 38 | - Parse required bam tags in one pass rather than piecemeal as required.
 39 | 
 40 | ## [v0.18.6]
 41 | ### Changed
 42 | - Ill-advised parsing of RG ID field has been extended to additionally allow for protocol_run_id style (uuid) Run IDs, as well as standalone acquisition_id (sha1) Run IDs.
 43 | 
 44 | ## [v0.18.5]
 45 | ### Changed
 46 | - Parsing of RG ID field containing a modified base model, now returns only the core basecaller model.
 47 | ### Fixed
 48 | - Workaround samtools "bug" where RG ID suffix is not fixed width.
 49 | 
 50 | ## [v0.18.4]
 51 | ### Fixed
 52 | - Segfault on SAM-style tags without values in the FASTQ header.
 53 | 
 54 | ## [v0.18.3]
 55 | ### Fixed
 56 | - Bug causing segfault on unlikely RG SAM tags in FASTQ header comments.
 57 | - SAM parsing of FASTQ header not enabled if only either of RG or RD tag is present and at the beginning of the header comment.
 58 | 
 59 | ## [v0.18.2]
 60 | ### Fixed
 61 | - 'run_id' instead of 'basecaller' as column name in bamstats basecaller summary output header line.
 62 | 
 63 | ## [v0.18.1]
 64 | ### Fixed
 65 | - 'run_id' instead of 'basecaller' as column name in basecaller summary output header line.
 66 | - `(null)` in FASTQ header comments when run with `-H` on files that had `basecall_model_version_id=...` as only header comment.
 67 | 
 68 | ## [v0.18.0]
 69 | ### Added
 70 | - Basecaller summary information similar to runid summary.
 71 | - RNA poly-A tail length histogram output.
 72 | ### Fixed
 73 | - Random output for runid when not found in header.
 74 | 
 75 | ## [v0.17.1]
 76 | ### Added
 77 | - `--runids` option to `bamstats` for enumerating detected run identifiers.
 78 | 
 79 | ## [v0.17.0]
 80 | ### Added
 81 | - `--reads_per_file` option can split inputs into batched files when demultiplexing. Users should use Unix `split` with piped output.
 82 | - `--runids` option to output a file enumerating detected run identifiers.
 83 | ### Changed
 84 | - Per-file read statistics now relate to filtered reads only.
 85 | - Link `fastcat` against zlib-ng for an even faster cat.
 86 | 
 87 | ## [v0.16.8]
 88 | ### Changed
 89 | - `fastcat` reverts to using a space separator (introduced in v0.16.0) between the Read ID and comment when outputting FASTQ comments that are not SAM tags
 90 | 
 91 | ## [v0.16.7]
 92 | ### Fixed
 93 | - Modification of BAM record with strtok when inferring Run ID from RG aux tag causing missing NM tag
 94 | 
 95 | ## [v0.16.6]
 96 | ### Fixed
 97 | - Additional spurious "contains non-integer 'NM' tag type" errors by checking EINVAL only when NM appears to be zero, and clearing errno first
 98 | 
 99 | ## [v0.16.5]
100 | ### Fixed
101 | - Spurious "contains non-integer 'NM' tag type" errors by checking EINVAL only when NM appears to be zero
102 | 
103 | ## [v0.16.4]
104 | ### Added
105 | - `bamstats` now saves histograms for unmapped reads when `--unmapped` is provided.
106 | 
107 | ## [v0.16.3]
108 | ### Fixed
109 | - Incorrect sanity check of NM.
110 | 
111 | ## [v0.16.2]
112 | ### Fixed
113 | - Prevent reads with implausible NM tag leading to illegal memory access in add_qual_count
114 | 
115 | ## [v0.16.1]
116 | ### Changed
117 | - Extended FASTQ SAM tag parsing to comment lines that include the RD tag (as well as RG).
118 | 
119 | ## [v0.16.0]
120 | ### Added
121 | - Support for reading SAM tags from FASTQ headers.
122 | ### Changed
123 | - `fastcat` will output a tab between the Read ID and the SAM tags rather than a space to match samtools convention.
124 | - `bamstats` uses `bam_get_tag_caseinsensitive` wrapper to get SAM tags with case insensitivity.
125 | - `fastcat` and `bamstats` will infer a Run ID from the `RG` tag if `RD` is not available.
126 | - Bumped version of htslib used to 1.19.
127 | ### Fixed
128 | - Incorrectly capitalised ONT SAM tags are now output in lowercase by fastcat: `ch`, `rn`, `st`.
129 | 
130 | ## [v0.15.2]
131 | ### Fixed
132 | - Duplicated recipe name in Makefile.
133 | 
134 | ### Added
135 | - Section explaining `bamstats` output columns to README.
136 | 
137 | ## [v0.15.1]
138 | ### Fixed
139 | - Decimal precision of histogram outputs.
140 | 
141 | ## [v0.15.0]
142 | ### Added
143 | - Calculation of read length and quality histograms to `fastcat` and `bamstats`.
144 | - Calculation of alignment accuracy and alignment read coverage to `bamstats`.
145 | 
146 | ## [v0.14.1]
147 | ### Fixed
148 | - Missing compilation of conda aarch64 package
149 | 
150 | ## [v0.14.0]
151 | ### Added
152 | - `bamstats --duplex` option allows counting the number of duplex reads and
153 |   duplex-forming reads.
154 | 
155 | ## [v0.13.2]
156 | ### Fixed
157 | - Bug writing long reads to demultiplexed gzipped outputs.
158 | 
159 | ## [v0.13.1]
160 | ### Fixed
161 | - Bug writing `UINTMAX_MAX` for `min_length` and `nan` for `mean_quality` of a
162 |   file in fastcat per-file stats if there were no reads in that file.
163 | 
164 | ## [v0.13.0]
165 | ### Added
166 | - Column with start time from MinKNOW header to `bamstats` output.
167 | 
168 | ### Changed
169 | - `bamstats` now prints `mean_quality`, `iden`, and `acc` values with 2 decimal
170 |   places instead of 3 (the reason being that `fastcat` already uses 2 decimal
171 |   places for `mean_quality` and more precision is unnecessary).
172 | 
173 | ## [v0.12.0]
174 | ### Added
175 | - Column with run ID from MinKNOW header to `fastcat` per-read stats and
176 |   `bamstats` output.
177 | 
178 | ## [v0.11.2]
179 | ### Changed
180 | - Reverted the change of the default value of the `start_time` field to an empty
181 |   string (it had been set to `"2000-01-01T00:00:00Z"` in v0.11.1).
182 | 
183 | ## [v0.11.1]
184 | ### Fixed
185 | - Bug in `fastcat` per-read summary stats.
186 | 
187 | ## [v0.11.0]
188 | ### Changed
189 | - Bamstats can now be run without a BAM index.
190 | - `fastcat -H` now wraps all known header fields into SAM tags regardless of
191 |   whether the header was "valid" (i.e. all expected fields were present) or not.
192 | 
193 | ## [v0.10.2]
194 | ### Added
195 | - Linux and macOS ARM conda packages.
196 | 
197 | ## [v0.10.1]
198 | ### Fixed
199 | - bamindex program missing from conda package.
200 | 
201 | ## [v0.10.0]
202 | ### Added
203 | - Create bamindex program to index unaligned BAMs for horizontal-parallel processing.
204 | 
205 | ## [v0.9.0]
206 | ### Fixed
207 | - Ensure reheadered fastq is indeed formatted as a valid SAM tag(s).
208 | 
209 | ## [v0.8.0]
210 | ### Added
211 | - Option to bamstats to add 'sample_name' column equivalent to fastcat.
212 | 
213 | ## [v0.7.0]
214 | ### Added
215 | - Option to report unmapped alignments in per read and summary files.
216 | 
217 | ## [v0.6.1]
218 | ### Fixed
219 | - Min read length in per-file statistics.
220 | 
221 | ## [v0.6.0]
222 | ### Added
223 | - `mean_quality` column to bamstats output, equivalent to that from fastcat.
224 | - optional per-reference summary file for bamstats similar to samtools flagstats.
225 | 
226 | ## [v0.5.0]
227 | ### Changed
228 | - Behaviour of `-x/--recurse`. Top-level directory input will always be searched for
229 |   data. Turning on recursion now exclusively refers to descending into child (and
230 |   subsequent) directories.
231 | 
232 | ## [v0.4.12]
233 | ### Fixed
234 | - Updated kseq.h to allow exit on broken fastq/a stream.
235 | 
236 | ## [v0.4.11]
237 | ### Changed
238 | - `fastcat` will exit non-zero if an input file (named or recursed) cannot be opened
239 | 
240 | ## [v0.4.10]
241 | ### Fixed
242 | - Use of uninitialized memory in thread pool init, leading to memory leak.
243 | 
244 | ## [v0.4.9]
245 | ### Fixed
246 | - Handle BAM_CEQUAL and BAM_CDIFF that some aligners like to use.
247 | 
248 | ## [v0.4.8]
249 | ### Fixed
250 | - Doubled tab in output header.
251 | 
252 | ## [v0.4.7]
253 | ### Changed
254 | - Build conda package using bioconda's htslib.
255 | ### Fixed
256 | - Occasional hanging on exit.
257 | 
258 | ## [v0.4.6]
259 | ### Fixed
260 | - Missing tab character in output header.
261 | 
262 | ## [v0.4.5]
263 | ### Changed
264 | - Pin openssl version in conda build to one that works across Python versions.
265 | 
266 | ## [v0.4.4]
267 | ### Fixed
268 | - Removed libdeflate from conda build which caused issues with threading.
269 | 
270 | ## [v0.4.3]
271 | ### Changed
272 | - Only multithread BAM decompression.
273 | 
274 | ## [v0.4.2]
275 | ### Added
276 | - Multithreading to `bamstats` for improved throughput.
277 | 
278 | ## [v0.4.1]
279 | ### Changed
280 | - Improved performance of `bamstats` for many-target bams.
281 | 
282 | ## [v0.4.0]
283 | ### Added
284 | - `bamstats` program for summarising (primary) alignment information.
285 | 
286 | ## [v0.3.8]
287 | ### Fixed
288 | - Reformatted header tags were space separated, fixed to tab separated.
289 | 
290 | ## [v0.3.7]
291 | ### Added
292 | - Option to reformat fastq headers as SAM-style tags for minimap2 passthrough.
293 | 
294 | ## [v0.3.6]
295 | ### Fixed
296 | - Per-file summary file created with broken header.
297 | 
298 | ## [v0.3.5]
299 | ### Fixed
300 | - Per-read summary file created incorrectly when `-s` option provided.
301 | 
302 | ## [v0.3.4]
303 | ### Fixed
304 | - Program hang when directory input given without trailing `/`.
305 | 
306 | ## [v0.3.3]
307 | ### Added
308 | - Transpose read number, channel, and start time from fastq headers to summary.
309 | ### Changed
310 | - Additional columns in per-read summary file as above. These will be present,
311 |   regardless of whether header information is present or not.
312 | 
313 | ## [v0.3.2]
314 | ### Fixed
315 | - Changed erroneously small MAX_BARCODE define; added runtime check to avoid
316 |   invalid memory access.
317 | 
318 | ## [v0.3.1]
319 | ### Fixed
320 | - Updated CI release scripts.
321 | 
322 | ## [v0.3.0]
323 | ### Added
324 | - Parsing Guppy/MinKNOW fastq key=value header comments.
325 | - Ability to demultiplex inputs based on "barcode" key in headers.
326 | ### Changed
327 | - Per-read and per-file summary files now optional.
328 | 
329 | ## [v0.2.1]
330 | ### Added
331 | - Read length and read quality output filtering.
332 | ### Changed
333 | - Average qualities computed with Kahan summation.
334 | 
335 | ## [v0.2.0]
336 | ### Fixed
337 | - Program hang when input file was non-existent or a directory.
338 | ### Added
339 | - Ability to traverse a directory input.
340 | 
341 | ## [v0.1.0]
342 | ### Added
343 | - Ability to read input files from stdin.
344 | 
345 | ## [v0.0.3]
346 | ### Changed
347 | - Moved output files to optional arguments.
348 | ### Added
349 | - `-s` option to add in a `sample_name` column to outputs.
350 | 
351 | ## [v0.0.2]
352 | ### Changed
353 | - No end-user changes.
354 | 
355 | ## [v0.0.1]
356 | ### Added
357 | - Per-read and per-file summarising of fastq data.
358 | 
359 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2020-, Oxford Nanopore Technologies Plc. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are met:
 5 | 
 6 | * Redistributions of source code must retain the above copyright notice, this
 7 |   list of conditions and the following disclaimer.
 8 | 
 9 | * Redistributions in binary form must reproduce the above copyright notice,
10 |   this list of conditions and the following disclaimer in the documentation
11 |   and/or other materials provided with the distribution.
12 | 
13 | * All advertising materials mentioning features or use of this software must
14 |   display the following acknowledgement: This product includes software
15 |   developed by Oxford Nanopore Technologies Plc.
16 | 
17 | * Neither the name of Oxford Nanopore Technologies Plc. nor the names of
18 |   its contributors may be used to endorse or promote products derived from this
19 |   software without specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY Oxford Nanopore Technologies Plc. AS IS AND ANY
22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL Oxford Nanopore Technologies Plc. BE LIABLE FOR ANY DIRECT,
25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
29 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | OS := $(shell uname)
  2 | ifeq ($(OS), Darwin)
  3 |     # mainly for dev builds using homebrew things
  4 |     EXTRA_LDFLAGS ?= -L$(shell brew --prefix openssl@1.1)/lib -L$(shell brew --prefix curl)/lib
  5 |     ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a
  6 |     ARGP_INC ?= -I$(shell brew --prefix argp-standalone)/include
  7 |     CFLAGS ?= -fpic -O3 ${ARGP_INC}
  8 |     ZCAT = "gzcat"
  9 | else
 10 |     ARGP ?=
 11 |     ARGP_INC ?=
 12 |     CFLAGS ?= -fpic -msse3 -O3 ${ARGP_INC}
 13 |     ZCAT = "zcat"
 14 | endif
 15 | 
 16 | VALGRIND ?= valgrind
 17 | 
 18 | CC ?= gcc
 19 | STATIC_HTSLIB ?= htslib/libhts.a
 20 | EXTRA_CFLAGS ?=
 21 | EXTRA_LDFLAGS ?=
 22 | EXTRA_LIBS ?=
 23 | # extra arguments passed to htslib's ./configure in the htslib/libhts.a rule
 24 | HTS_CONF_ARGS ?=
 25 | NOTHREADS ?=
 26 | ifeq ($(NOTHREADS), 1)
 27 |     CFLAGS += -DNOTHREADS
 28 | endif
 29 | 
 30 | # we can't do pedantic because min/max macros lead to:
 31 | #     "ISO C forbids braced-groups within expressions [-Werror=pedantic]"
 32 | ifeq ($(shell $(CC) --version | grep clang | wc -l), 0)
 33 |     WARNINGS = -Werror -Wall -Wextra -Wno-incompatible-pointer-types
 34 | else
 35 |     WARNINGS = -Werror -Wall -Wextra -Wpedantic -Wno-language-extension-token -Wno-gnu-statement-expression -Wno-incompatible-function-pointer-types
 36 | endif
 37 | 
 38 | GRIND = $(VALGRIND) --error-exitcode=1 --tool=memcheck --leak-check=full --show-leak-kinds=all -s
 39 | ifeq ($(OS), Darwin)
 40 | 	GRIND =
 41 | endif
 42 | # optionally run all tests under valgrind
 43 | ifeq ($(PEPPER), 1)
 44 | 	PEPPER = $(GRIND)
 45 | else
 46 | 	PEPPER = 
 47 | endif
 48 | 
 49 | 
 50 | .PHONY: default
 51 | default: fastcat bamstats bamindex
 52 | 
 53 | .PHONY: test  # must be phony: a directory named "test" exists in the repository
 54 | test: test_fastcat test_bamstats test_meta test_bamindex
 55 | 
 56 | .PHONY: test_memory
 57 | test_memory: mem_check_fastcat mem_check_bamstats mem_check_bamindex-build mem_check_bamindex-dump mem_check_bamindex-fetch
 58 | 
 59 | .PHONY: clean
 60 | clean:
 61 | 	rm -rf fastcat bamstats bamindex src/fastcat/*.o src/bamstats/*.o src/bamindex/*.o src/*.o
 62 | 
 63 | .PHONY: clean_htslib
 64 | clean_htslib:
 65 | 	cd htslib && make clean
 66 | 
 67 | 
 68 | ###
 69 | # build stages
 70 | 
 71 | htslib/libhts.a:
 72 | 	@echo Compiling $(@F)
 73 | 	cd htslib/ \
 74 | 		&& autoheader \
 75 | 		&& autoconf \
 76 | 		&& autoreconf --install \
 77 | 		&& CFLAGS="$(CFLAGS) $(EXTRA_CFLAGS)" ./configure $(HTS_CONF_ARGS) \
 78 | 		&& make -j 4
 79 | 
 80 | # just for testing
 81 | SAMVER=1.21
 82 | samtools:
 83 | 	curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2;
 84 | 	tar -xjf samtools-${SAMVER}.tar.bz2;
 85 | 	rm samtools-${SAMVER}.tar.bz2
 86 | 	cd samtools-${SAMVER} && make -j 4
 87 | 	cp samtools-${SAMVER}/samtools $@
 88 | 
 89 | #TODO: for conda we could use zlib-ng from conda-forge
 90 | 
 91 | zlib-ng/zlib.h:
 92 | 	@echo Configuring zlib-ng
 93 | 	cd zlib-ng/ \
 94 | 		&& CFLAGS="$(CFLAGS) $(EXTRA_CFLAGS)" ./configure --zlib-compat
 95 | 
 96 | zlib-ng/libz.a: zlib-ng/zlib.h
 97 | 	@echo Compiling $(@F)
 98 | 	cd zlib-ng/ \
 99 | 		&& make -j 4 libz.a
100 | 
101 | src/%.o: src/%.c zlib-ng/zlib.h
102 | 	$(CC) -Isrc -Ihtslib -Izlib-ng -c -pthread $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \
103 | 		$(CFLAGS) $(EXTRA_CFLAGS) $< -o $@
104 | 
105 | fastcat: src/version.o src/fastcat/main.o src/fastcat/args.o src/fastcat/writer.o src/fastqcomments.o src/common.o src/stats.o src/kh_counter.o $(STATIC_HTSLIB) zlib-ng/libz.a
106 | 	$(CC) -Isrc -Izlib-ng $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \
107 | 		$(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \
108 | 		$^ $(ARGP) \
109 | 		-lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \
110 | 		-o $@
111 | 
112 | bamstats: src/version.o src/bamstats/main.o src/bamstats/args.o src/bamstats/readstats.o src/bamstats/bamiter.o src/fastqcomments.o src/common.o src/regiter.o src/stats.o src/kh_counter.o $(STATIC_HTSLIB)
113 | 	$(CC) -Isrc -Ihtslib $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \
114 | 		$(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \
115 | 		$^ $(ARGP) \
116 | 		-lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \
117 | 		-o $@
118 | 
119 | bamindex: src/version.o src/bamindex/main.o src/bamindex/build_main.o src/bamindex/fetch_main.o src/bamindex/dump_main.o src/bamindex/index.o $(STATIC_HTSLIB)
120 | 	$(CC) -Isrc -Ihtslib $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \
121 | 		$(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \
122 | 		$^ $(ARGP) \
123 | 		-lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \
124 | 		-o $@
125 | 
126 | test/rg_parse: src/version.o test/rg_parse.o src/common.o 
127 | 	$(CC) -Isrc $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \
128 | 		$(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \
129 | 		$^ $(ARGP) \
130 | 		-lm $(EXTRA_LIBS) \
131 | 		-o $@
132 | 
133 | 
134 | ###
135 | # fastcat tests
136 | 
137 | .PHONY: test_fastcat
138 | test_fastcat: mem_check_fastcat mem_check_fastcat_demultiplex mem_check_fastcat_bam mem_check_fastcat_demultiplex_bam test_fastcat_bam_equivalent
139 | 
140 | .PHONY: mem_check_fastcat
141 | mem_check_fastcat: fastcat
142 | 	rm -rf fastcat-histograms
143 | 	$(GRIND) ./fastcat test/data/*.fastq.gz > /dev/null
144 | 
145 | .PHONY: mem_check_fastcat_bam
146 | mem_check_fastcat_bam: fastcat
147 | 	rm -rf fastcat-histograms
148 | 	$(GRIND) ./fastcat test/data/*.fastq.gz -B > /dev/null
149 | 
150 | .PHONY: mem_check_fastcat_demultiplex
151 | mem_check_fastcat_demultiplex: fastcat
152 | 	rm -rf demultiplex
153 | 	$(GRIND) ./fastcat test/data/*.fastq.gz --demultiplex demultiplex > /dev/null
154 | 
155 | .PHONY: mem_check_fastcat_demultiplex_bam
156 | mem_check_fastcat_demultiplex_bam: fastcat
157 | 	rm -rf demultiplex
158 | 	$(GRIND) ./fastcat test/data/*.fastq.gz --demultiplex demultiplex -B > /dev/null
159 | 
160 | .PHONY: test_fastcat_bam_equivalent fastcat_bam_equivalent
161 | test_fastcat_bam_equivalent fastcat_bam_equivalent: fastcat bamstats samtools  # old un-prefixed name kept as an alias
162 | 	@echo ""
163 | 	@echo "Testing fastcat bam equivalence"
164 | 	rm -rf test/test-tmp-fcb-equiv-van*
165 | 	rm -rf test/test-tmp-fcb-equiv-bam*
166 | 	$(PEPPER) ./fastcat test/data/*.fastq.gz --histograms test/test-tmp-fcb-equiv-van --reheader | ./samtools import -T '*' - | ./test/sort-sam.py > test/test-tmp-fcb-equiv-van.sam && \
167 | 	$(PEPPER) ./fastcat test/data/*.fastq.gz --histograms test/test-tmp-fcb-equiv-bam -B | ./samtools view | ./test/sort-sam.py > test/test-tmp-fcb-equiv-bam.sam && \
168 | 	diff test/test-tmp-fcb-equiv-van.sam test/test-tmp-fcb-equiv-bam.sam
169 | 
170 | 
171 | ###
172 | # bamstats tests
173 | 
174 | .PHONY: test_bamstats
175 | test_bamstats: test_bamstats_NM test_bamstats_polya mem_check_bamstats
176 | 
177 | .PHONY: test_bamstats_NM
178 | test_bamstats_NM: bamstats
179 | 	rm -rf test/test-tmp-bs-nm
180 | 	mkdir test/test-tmp-bs-nm && \
181 | 	cd test/test-tmp-bs-nm && \
182 | 	$(PEPPER) ../../bamstats ../bamstats_badNM/test.sam 2> err || grep "appears to contain implausible alignment information" err && rm -rf bamstats-histograms-bs-nm && \
183 | 	rm -rf bamstats-histograms && \
184 | 	$(PEPPER) ../../bamstats ../bamstats_zeroNM/test.sam
185 | 	rm -r test/test-tmp-bs-nm
186 | 
187 | .PHONY: test_bamstats_polya
188 | test_bamstats_polya: bamstats
189 | 	rm -rf test/test-tmp-bs-pa
190 | 	mkdir test/test-tmp-bs-pa && \
191 | 	cd test/test-tmp-bs-pa && \
192 | 	$(PEPPER) ../../bamstats ../bamstats/RCS-100A.bam --poly_a > /dev/null && \
193 | 	diff bamstats-histograms/polya.hist ../bamstats/RCS-100A.bam.polya.hist
194 | 	rm -r test/test-tmp-bs-pa
195 | 
196 | .PHONY: mem_check_bamstats
197 | mem_check_bamstats: bamstats
198 | 	@echo "Memcheck bamstats with good data"
199 | 	rm -rf bamstats-histograms
200 | 	$(GRIND) ./bamstats test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam > /dev/null
201 | 	@echo ""
202 | 	@echo "Memcheck bamstats with bad data"
203 | 	rm -rf bamstats-histograms
204 | 	$(GRIND) ./bamstats test/parse_rg/bad-ones.bam > /dev/null
205 | 	@echo ""
206 | 	@echo "Memcheck bamstats with qcfails"
207 | 	rm -rf bamstats-histograms
208 | 	$(GRIND) ./bamstats test/bamstats/400ecoli-with-qcfail.bam > /dev/null
209 | 	@echo ""
210 | 	@echo "Memcheck bamstats duplex"
211 | 	rm -rf bamstats-histograms
212 | 	$(GRIND) ./bamstats test/bamstats/310dx.bam
213 | 
214 | 
215 | ###
216 | # meta data tests (both fastcat and bamstats)
217 | 
218 | .PHONY: test_meta
219 | test_meta: test_meta_fastcat test_meta_bamstats
220 | 
221 | .PHONY: test_meta_fastcat
222 | test_meta_fastcat: fastcat  # compares the run-ID summary written by -i against <input>.runids
223 | 	rm -rf test/test-tmp-meta-fastq
224 | 	set -e; \
225 | 	mkdir test/test-tmp-meta-fastq; \
226 | 	cd test/test-tmp-meta-fastq; \
227 | 	for i in ../parse_rd/*.fastq; do \
228 | 		echo $$i; \
229 | 		$(PEPPER) ../../fastcat $$i --histograms hist -i rd > /dev/null; \
230 | 		diff rd $$i.runids || exit 1; \
231 | 		rm -rf hist rd; \
232 | 	done;
233 | 	rm -r test/test-tmp-meta-fastq
234 | 
235 | .PHONY: test_meta_bamstats
236 | test_meta_bamstats: bamstats  # compares the summary written by -l against <input>.callers
237 | 	rm -rf test/test-tmp-meta-bam
238 | 	set -e; \
239 | 	mkdir test/test-tmp-meta-bam; \
240 | 	cd test/test-tmp-meta-bam; \
241 | 	for i in ../parse_rg/*.bam; do \
242 | 		$(PEPPER) ../../bamstats $$i --histograms hist -l rg \
243 | 			> /dev/null; \
244 | 		diff rg $$i.callers || exit 1; \
245 | 		rm -rf hist rg; \
246 | 	done;
247 | 	rm -r test/test-tmp-meta-bam
248 | 
249 | .PHONY: regression_test_rg_parsing
250 | regression_test_rg_parsing: test/rg_parse
251 | 	$(PEPPER) ./test/rg_parse
252 | 
253 | 
254 | ###
255 | # bamindex tests
256 | 
257 | .PHONY: test_bamindex
258 | test_bamindex: mem_check_bamindex-build mem_check_bamindex-dump mem_check_bamindex-fetch
259 | 
260 | .PHONY: mem_check_bamindex-build
261 | mem_check_bamindex-build: bamindex
262 | 	$(GRIND) ./bamindex build test/bamindex/400.bam
263 | 
264 | .PHONY: mem_check_bamindex-dump
265 | mem_check_bamindex-dump: bamindex mem_check_bamindex-build
266 | 	$(GRIND) ./bamindex dump test/bamindex/400.bam.bci > /dev/null
267 | 
268 | .PHONY: mem_check_bamindex-fetch
269 | mem_check_bamindex-fetch: bamindex mem_check_bamindex-build
270 | 	$(GRIND) ./bamindex fetch test/bamindex/400.bam --chunk 5 > /dev/null
271 | 
272 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # fastcat
  2 | 
  3 | A set of simple utilities for creating summaries from standard bioinformatics formats.
  4 | 
  5 | ### Installation
  6 | 
  7 | All tools are distributed in a single package from our conda channel, they can be installed
  8 | into an isolated conda environment with:
  9 | 
 10 | ```
 11 | mamba create -n fastcat -c conda-forge -c bioconda -c nanoporetech fastcat
 12 | ```
 13 | 
 14 | #### Compilation
 15 | 
 16 | Although not recommended, compilation from source is via make:
 17 | 
 18 | ```
 19 | make fastcat bamstats bamindex
 20 | ```
 21 | 
 22 | Several libraries are assumed to be present on the system for linking.
 23 | 
 24 | ### fastcat
 25 | 
 26 | This eponymous tool concatenates .fastq(.gz) files whilst creating a summary
 27 | of the sequences. Can also demultiplex reads according to Guppy/MinKNOW
 28 | .fastq record headers.
 29 | 
 30 | ```
 31 | Usage: fastcat [OPTION...]
 32 |             reads1.fastq(.gz) reads2.fastq(.gz) dir-with-fastq ...
 33 | fastcat -- concatenate and summarise .fastq(.gz) files.
 34 | 
 35 |   -a, --min_length=MIN READ LENGTH
 36 |                              minimum read length to output (excluded reads
 37 |                              remain listed in summaries).
 38 |   -b, --max_length=MAX READ LENGTH
 39 |                              maximum read length to output (excluded reads
 40 |                              remain listed in summaries).
 41 |   -d, --demultiplex=OUT DIR  Separate barcoded samples using fastq header
 42 |                              information. Option value is top-level output
 43 |                              directory.
 44 |   -f, --file=FILE SUMMARY    Per-file summary output
 45 |       --histograms=DIRECTORY Directory for outputting histogram information.
 46 |                              When --demultiplex is enabled histograms are
 47 |                              written to per-sample demultiplexed output
 48 |                              directories. (default: fastcat-histograms)
 49 |   -H, --reheader             Rewrite fastq header comments as SAM tags (useful
 50 |                              for passing through minimap2).
 51 |   -q, --min_qscore=MIN READ QSCOROE
 52 |                              minimum read Qscore to output (excluded reads
 53 |                              remain listed in summaries).
 54 |   -r, --read=READ SUMMARY    Per-read summary output
 55 |   -s, --sample=SAMPLE NAME   Sample name (if given, adds a 'sample_name'
 56 |                              column).
 57 |   -x, --recurse              Search directories recursively for '.fastq',
 58 |                              '.fq', '.fastq.gz', and '.fq.gz' files.
 59 |   -?, --help                 Give this help list
 60 |       --usage                Give a short usage message
 61 |   -V, --version              Print program version
 62 | 
 63 | Mandatory or optional arguments to long options are also mandatory or optional
 64 | for any corresponding short options.
 65 | 
 66 | Input files may be given on stdin by specifing the input as '-'. Also accepts
 67 | directories as input and looks for .fastq(.gz) files in the top-level
 68 | directory. Recurses into sub-directories when the -x option is given. The
 69 | command will exit non-zero if any file encountered cannot be read.
 70 | ```
 71 | 
 72 | The program writes the input sequences to `stdout` in .fastq format to be
 73 | recompressed with `gzip` (or more usefully `bgzip`).
 74 | 
 75 | The `per-read.txt` is a tab-separated file with columns:
 76 | 
 77 | ```
 78 | read_id        filename                read_length  mean_quality
 79 | SRR12447496.1  SRR12447496_1.fastq.gz  531          14.03
 80 | SRR12447496.2  SRR12447496_1.fastq.gz  513          13.91
 81 | SRR12447496.3  SRR12447496_1.fastq.gz  473          14.70
 82 | ...
 83 | ```
 84 | 
 85 | The mean quality is defined as:
 86 | ```
 87 | -10 * log10(mean(10^(Q/-10)))
 88 | ```
 89 | 
 90 | where `Q` are the set of all per-base quality scores for the read.
 91 | 
 92 | The `per-file.txt` is also a tab-separated file with columns:
 93 | 
 94 | ```
 95 | filename                n_seqs  n_bases  min_length  max_length  mean_quality
 96 | SRR12447496_1.fastq.gz  16048   8090160  434         697         13.10
 97 | SRR12447498_1.fastq.gz  16203   8049713  421         697         13.25
 98 | SRR12447499_1.fastq.gz  15484   7812439  424         612         13.16
 99 | ...
100 | ```
101 | where the `mean_quality` column is the mean of the per-read `mean_quality` values.
102 | 
103 | Additionally, as it's a common thing to want to do, the program will write
104 | the two files:
105 | 
106 | * `length.hist` - read length histogram, and
107 | * `quality.hist` - read mean base-quality score histogram.
108 | 
109 | When data is demultiplexed one such file will be written to the demultiplexed
110 | samples' directories. When demultiplexing is not enabled the files will be
111 | placed in a directory according to the `--histograms` option. The format of the
112 | histogram files is a tab-separated file of sparse, ordered intervals `[lower, upper)`:
113 | 
114 | ```
115 | lower    upper    count
116 | ```
117 | 
118 | The final bin may be unbounded, which is signified by a `0` entry for the upper
119 | bin edge.
120 | 
121 | 
122 | ### bamstats
123 | 
124 | The `bamstats` utility is a re-implementation of the `stats_from_bam` program
125 | from [pomoxis](https://github.com/nanoporetech/pomoxis). It creates read-level summary
126 | statistics of alignments found in a BAM file and reports these in a TSV file.
127 | 
128 | Additionally, as it's a common thing to want to do, the program will write
129 | the four files:
130 | 
131 | * `length.hist` - read length histogram,
132 | * `quality.hist` - read mean base-quality score histogram,
133 | * `accuracy.hist` - read alignment accuracy histogram, and
134 | * `coverage.hist` - read alignment coverage histogram.
135 | 
136 | These files are as described for the `fastcat` program.
137 | 
138 | ```
139 | Usage: bamstats [OPTION...] <reads.bam>
140 | bamstats -- summarise rears/alignments in one or more BAM files.
141 | 
142 |  General options:
143 |   -f, --flagstats=FLAGSTATS  File for outputting alignment flag counts.
144 |       --histograms=DIRECTORY Directory for outputting histogram information.
145 |                              (default: bamstats-histograms)
146 |   -r, --region=chr:start-end Genomic region to process.
147 |   -s, --sample=SAMPLE NAME   Sample name (if given, adds a 'sample_name'
148 |                              column).
149 |   -t, --threads=THREADS      Number of threads for BAM processing.
150 | 
151 |  Read filtering options:
152 | 
153 |   -g, --read_group=RG        Only process reads from given read group.
154 |       --haplotype=VAL        Only process reads from a given haplotype.
155 |                              Equivalent to --tag_name HP --tag_value VAL.
156 |       --tag_name=TN          Only process reads with a given tag (see
157 |                              --tag_value).
158 |       --tag_value=VAL        Only process reads with a given tag value.
159 |   -u, --unmapped             Include unmapped/unplaced reads in output.
160 | 
161 |   -?, --help                 Give this help list
162 |       --usage                Give a short usage message
163 |   -V, --version              Print program version
164 | 
165 | Mandatory or optional arguments to long options are also mandatory or optional
166 | for any corresponding short options.
167 | 
168 | The program creates a simple TSV file containing statistics for each primary
169 | alignment stored within the input BAM files.
170 | ```
171 | 
172 | #### Output format
173 | 
174 | The `bamstats` output is a tab-separated text file with columns as in the table
175 | below. The `q` prefix to columns names relates to the so-called "query"
176 | sequence, i.e. the sequencing read. The `r` prefix relates to the reference
177 | sequence. Not all column names where properties are quoted for both the query
178 | and reference follow this convention; this is an unfortunate historical wart.
179 | 
180 | All coordinates are given as zero-based, end exclusive.
181 | In sequence alignment jargon the term "match" means any pair of bases
182 | (one each from the query and reference) which are aligned to each other.
183 | The term does not convey its common English meaning that the two bases
184 | have the same identity. An 'A' base from the query can match (be aligned to)
185 | a 'C' base from the reference.
186 | 
187 | | index | name | description
188 | | - | - | -
189 | | 1 | `name` | Read identifier (column 1 from a SAM file).
190 | | 2 | `runid` | Sequencing run identifier (from the `RD` tag of the SAM record).
191 | | 3 | `sample_name` | Sample name (optional, provided as input by the user).
192 | | 4 | `ref` | Reference sequence name (column 3 from a SAM file).
193 | | 5 | `coverage` | Proportion of read spanned by the alignment.
194 | | 6 | `ref_coverage` | Proportion of reference spanned by the alignment.
195 | | 7 | `qstart` | Alignment start coordinate on the query (tantamount to the total left-hand clipping in [SAM terminology](https://samtools.github.io/hts-specs/)).
196 | | 8 | `qend` | Alignment end coordinate on the query (see `qstart`).
197 | | 9 | `rstart` | Alignment start coordinate on the reference (column 4 of SAM).
198 | | 10 | `rend` | Alignment end coordinate on the reference.
199 | | 11 | `aligned_ref_len` | Length of alignment on reference (simply `rend - rstart`).
200 | | 12 | `direction` | Alignment direction. `+` for forward reference sequence, `-` for reverse complement.
201 | | 13 | `length` | Total length of the alignment including all insertions.
202 | | 14 | `read_length` | Length of query sequence (as stored in the input file).
203 | | 15 | `mean_quality` | Mean per-base quality of the query sequence expressed on Phred scale. See discussion in `fastcat` section above.
204 | | 16 | `start_time` | Sequencing start time for the read (from the `ST` tag of the SAM record).
205 | | 17 | `match` | Number of matches in the alignment (see description above).
206 | | 18 | `ins` | Number of inserted bases in alignment.
207 | | 19 | `del` | Number of deleted bases in alignment.
208 | | 20 | `sub` | Number of substitutions (mismatches) in alignment.
209 | | 21 | `iden` | Proportion of matches which are not mismatches: `(match - sub) / match`.
210 | | 22 | `acc` | Alignment accuracy: `(length - ins - del - sub) / length`. Sometimes also referred to as [BLAST-identity](https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity).
211 | | 23 | `duplex` | Whether the read was simplex (`0`), duplex (`1`), or duplex-forming (`-1`). See [dorado documentation](https://github.com/nanoporetech/dorado?tab=readme-ov-file#duplex).
212 | 
213 | 
214 | ### bamindex
215 | 
216 | The `bamindex` program is a rather curious program that will create a positional index
217 | of alignments in a BAM file. It is intended to be used within workflows to allow
218 | parallel processing of records in a BAM file, each worker processing a contiguous chunk
219 | of the file. This is most useful with unaligned BAM files.
220 | 
221 | The program was inspired by [bri](https://github.com/jts/bri) by Jared Simpson at [OICR](https://oicr.on.ca/);
222 | which is far cooler.
223 | 
224 | There are three subcommands:
225 | 
226 | **bamindex build**
227 | 
228 | ```
229 | $ ./bamindex build --help
230 | Usage: build [OPTION...] <reads.bam>
231 | bamindex build -- create a BAM index corresponding to batches of records.
232 | 
233 |  General options:
234 |   -c, --chunk_size=SIZE      Number of records in a chunk.
235 |   -t, --threads=THREADS      Number of threads for BAM processing.
236 | 
237 |   -?, --help                 Give this help list
238 |       --usage                Give a short usage message
239 |   -V, --version              Print program version
240 | 
241 | Mandatory or optional arguments to long options are also mandatory or optional
242 | for any corresponding short options.
243 | 
244 | The program creates a simple index of file offsets for each of every (n * M)th
245 | alignment record. No care is taken to keep records corresponding to the same
246 | query together, or any other such niceities. Its intended to be used simply
247 | with unaligned, unsorted BAMs.
248 | ```
249 | 
250 | **bamindex fetch**
251 | 
252 | ```
253 | $ ./bamindex fetch --help
254 | Usage: fetch [OPTION...] <reads.bam.bci>
255 | bamindex fetch -- fetch records from a BAM according to an index.
256 | 
257 |  General options:
258 |   -c, --chunk=SIZE           Chunk index to retrieve.
259 |   -t, --threads=THREADS      Number of threads for BAM processing.
260 | 
261 |   -?, --help                 Give this help list
262 |       --usage                Give a short usage message
263 |   -V, --version              Print program version
264 | 
265 | Mandatory or optional arguments to long options are also mandatory or optional
266 | for any corresponding short options.
267 | 
268 | The program simply will fetch a batch of records from a BAM fileusing and index
269 | and a chunk ID.
270 | ```
271 | 
272 | **bamindex dump**
273 | 
274 | ```
275 | $ ./bamindex dump --help 
276 | Usage: dump [OPTION...] <reads.bam.bci>
277 | bamindex dump -- dump a BAM chunk index to stdout as text.
278 | 
279 |   -?, --help                 Give this help list
280 |       --usage                Give a short usage message
281 |   -V, --version              Print program version
282 | 
283 | The program simply writes the contents of an index to stdout for human
284 | inspection. It has no other purpose.
285 | ```
286 | 


--------------------------------------------------------------------------------
/conda/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | NAME=fastcat
 4 | # Stop at the first failing command so a broken `make` is never followed by `cp`.
 5 | set -e
 6 | # don't enable libdeflate -- seems to cause hangs when used with threaded decompression
 7 | #export HTS_CONF_ARGS="--prefix=${PREFIX} --enable-libcurl --enable-plugins --enable-gcs --enable-s3"
 8 | # ignore that, just link to htslib from bioconda
 9 | export EXTRA_CFLAGS="-I$PREFIX/include"
10 | export STATIC_HTSLIB=""
11 | export EXTRA_LDFLAGS="-L$PREFIX/lib"
12 | export EXTRA_LIBS="-ldl -lhts"
13 | 
14 | OS=$(uname)
15 | if [[ "$OS" == "Darwin" ]]; then
16 |     echo "Setting Darwin args"
17 |     export ARGP="${PREFIX}/lib/libargp.a"
18 |     export EXTRA_CFLAGS="${EXTRA_CFLAGS} -isysroot ${CONDA_BUILD_SYSROOT} -mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET}"
19 | fi
20 | 
21 | make clean clean_htslib
22 | 
23 | # Build each tool and install it into the environment's bin directory.
24 | mkdir -p "$PREFIX/bin"
25 | for binary in fastcat bamstats bamindex; do
26 |     make "$binary"
27 |     cp "$binary" "$PREFIX/bin" && chmod +x "$PREFIX/bin/$binary"
28 | done
28 | 


--------------------------------------------------------------------------------
/conda/meta.yaml:
--------------------------------------------------------------------------------
 1 | package:
 2 |   name: {{ environ.get('CONDA_PKG') }}
 3 |   version: {{ environ.get('CONDA_PKG_VERSION') }}
 4 | 
 5 | source:
 6 |     path: ../
 7 | 
 8 | build:
 9 |     number: 0
10 | 
11 | requirements:
12 |     build:
13 |         - {{ compiler('c') }}
14 |     host:
15 |         - argp-standalone # [osx]
16 |         # not sure why zlib needs to be explicitly listed here,
17 |         # bioconda::samtools does it too, and conda build can't find it otherwise
18 |         # despite it getting installed into the build env
19 |         - htslib >=1.20
20 |         - zlib
21 |         - xz
22 |     run:
23 |         - htslib >=1.20
24 |         - zlib
25 |         - xz
26 | test:
27 |     # Must be a YAML list: without the leading dashes the three lines fold
28 |     # into a single scalar and only the first command would really be run.
29 |     commands:
30 |         - fastcat --help
31 |         - bamstats --help
32 |         - bamindex build --help
33 | 
34 | about:
35 |     home: "https://github.com/epi2me-labs/fastcat"
36 |     license: Mozilla Public License 2.0
37 |     license_family: OTHER
38 |     license_file: LICENSE
39 |     summary: "Concatenate fast/a/q/gz and calculate basic statistics"
40 |     doc_url: https://github.com/epi2me-labs/fastcat
41 |     dev_url: https://github.com/epi2me-labs/fastcat
42 | 
43 | extra:
44 |     recipe-maintainers:
45 |         - cjw85
44 | 
45 | 


--------------------------------------------------------------------------------
/src/bamindex/build_main.c:
--------------------------------------------------------------------------------
  1 | // bamindex build program
  2 | 
  3 | #include <err.h>
  4 | #include <string.h>
  5 | #include <stdbool.h>
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <sys/resource.h>
  9 | #include <time.h>
 10 | #include "htslib/faidx.h"
 11 | #include "htslib/sam.h"
 12 | #include "htslib/thread_pool.h"
 13 | #include "htslib/bgzf.h"
 14 | 
 15 | #include "index.h"
 16 | #include "../version.h"
 17 | 
 18 | #include <argp.h>
 19 | 
 20 | typedef struct arguments {
 21 |     const char* bam;
 22 |     int threads;
 23 |     int chunk_size;
 24 | } arguments_t;
 25 | 
 26 | static char doc[] = 
 27 | "bamindex build -- create a BAM index corresponding to batches of records.\
 28 | \vThe program creates a simple index of file offsets for each of every \
 29 | (n * M)th alignment record. No care is taken to keep records corresponding \
 30 | to the same query together, or any other such niceities. Its intended to \
 31 | be used simply with unaligned, unsorted BAMs.";
 32 | static char args_doc[] = "<reads.bam>";
 33 | static struct argp_option options[] = {
 34 |     {0, 0, 0, 0,
 35 |         "General options:", 0},
 36 |     {"threads", 't', "THREADS", 0,
 37 |         "Number of threads for BAM processing.", 0},
 38 |     {"chunk_size", 'c', "SIZE", 0,
 39 |         "Number of records in a chunk.", 0},
 40 |     { 0 }
 41 | };
 42 | 
 43 | /**
 44 |  * argp callback for `bamindex build`.
 45 |  *
 46 |  * Stores option values and the single positional BAM path in the
 47 |  * arguments_t passed through state->input.
 48 |  *
 49 |  * Returns 0 for handled keys and ARGP_ERR_UNKNOWN for anything else.
 50 |  */
 51 | static error_t parse_opt (int key, char *arg, struct argp_state *state) {
 52 |     arguments_t *arguments = state->input;
 53 |     switch (key) {
 54 |         case 't':
 55 |             arguments->threads = atoi(arg);
 56 |             break;
 57 |         case 'c':
 58 |             arguments->chunk_size = atoi(arg);
 59 |             // index_build() evaluates `i % chunk_size`: zero would divide by
 60 |             // zero and a negative value would wrap when widened to size_t.
 61 |             if (arguments->chunk_size < 1) {
 62 |                 argp_error (state, "chunk_size must be a positive integer (got '%s').", arg);
 63 |             }
 64 |             break;
 65 |         case ARGP_KEY_NO_ARGS:
 66 |             argp_usage (state);
 67 |             break;
 68 |         case ARGP_KEY_ARG:
 69 |             // only the first positional argument is kept; the count is
 70 |             // enforced once parsing finishes (ARGP_KEY_END)
 71 |             if (state->arg_num == 0) {
 72 |                 arguments->bam = arg;
 73 |             }
 74 |             break;
 75 |         case ARGP_KEY_END:
 76 |             if (state->arg_num != 1)
 77 |                 argp_usage (state);
 78 |             break;
 79 |         default:
 80 |             return ARGP_ERR_UNKNOWN;
 81 |     }
 82 |     return 0;
 83 | }
 70 | 
 71 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0};
 72 | 
 73 | // Run argp over the command line, starting from the defaults below.
 74 | static arguments_t parse_arguments(int argc, char** argv) {
 75 |     arguments_t args = {
 76 |         .bam = NULL,
 77 |         .threads = 1,
 78 |         .chunk_size = 1,
 79 |     };
 80 |     argp_parse(&argp, argc, argv, 0, 0, &args);
 81 |     return args;
 82 | }
 81 | 
 82 | /**
 83 |  * Build a chunk index for a BAM file.
 84 |  *
 85 |  * Records the starting file offset and query name of every `every`-th
 86 |  * record of `filename` and writes them to `output_fname`.
 87 |  *
 88 |  * @param filename      input BAM path.
 89 |  * @param output_fname  index file to create (truncated if it exists).
 90 |  * @param threads       values > 1 enable an htslib thread pool of that size.
 91 |  * @param every         chunk size: one index entry per `every` records.
 92 |  *
 93 |  * Exits the process with EXIT_FAILURE on any open, read, write or
 94 |  * allocation failure.
 95 |  */
 96 | void index_build(const char* filename, const char* output_fname, int threads, size_t every) {
 97 |     if (every == 0) {
 98 |         // guards the `n_records % every` below
 99 |         fprintf(stderr, "Chunk size must be at least 1.\n");
100 |         exit(EXIT_FAILURE);
101 |     }
102 |     htsFile *fp = hts_open(filename, "r");
103 |     bam_hdr_t *h = (fp == NULL) ? NULL : sam_hdr_read(fp);
104 |     if(h == NULL) {
105 |         fprintf(stderr, "Could not open %s\n", filename);
106 |         exit(EXIT_FAILURE);
107 |     }
108 | 
109 |     FILE* out_fp = fopen(output_fname, "wb");
110 |     if (out_fp == NULL) {
111 |         fprintf(stderr, "Could not open %s for writing\n", output_fname);
112 |         exit(EXIT_FAILURE);
113 |     }
114 |     bc_idx_t *idx = bc_idx_init1(every);
115 |     if (idx == NULL) {
116 |         fprintf(stderr, "Failed to allocate index.\n");
117 |         exit(EXIT_FAILURE);
118 |     }
119 |     // first header write; it is written again below once the number of
120 |     // chunks is known
121 |     size_t rtn;
122 |     if ((rtn = bc_idx_write_header(out_fp, idx)) > 0) {
123 |         fprintf(stderr, "Failed to write header to index. Error %zu.\n", rtn);
124 |         exit(EXIT_FAILURE);
125 |     }
126 | 
127 |     htsThreadPool p = {NULL, 0};
128 |     if (threads > 1 ) {
129 |         p.pool = hts_tpool_init(threads);
130 |         hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
131 |     }
132 | 
133 |     int ret = 0;
134 |     size_t n_records = 0;
135 |     bam1_t* b = bam_init1();
136 |     if (b == NULL) {
137 |         fprintf(stderr, "Failed to allocate BAM record.\n");
138 |         exit(EXIT_FAILURE);
139 |     }
140 |     // offset of the record about to be read, i.e. where a chunk would start
141 |     size_t file_offset = bgzf_tell(fp->fp.bgzf);
142 |     while ((ret = sam_read1(fp, h, b)) >= 0) {
143 |         if ((n_records % every) == 0) {
144 |             if (n_records % 100000 == 0) {
145 |                 fprintf(stderr, "Record %zu %zu\n", n_records, file_offset);
146 |             }
147 |             if (bc_idx_write(out_fp, idx, file_offset, bam_get_qname(b)) < 0) {
148 |                 fprintf(stderr, "Failed to write records to index.\n");
149 |                 exit(EXIT_FAILURE);
150 |             }
151 |         }
152 |         file_offset = bgzf_tell(fp->fp.bgzf);
153 |         n_records++;
154 |     }
155 |     // sam_read1 returns -1 at end of file; anything lower is a read error
156 |     if (ret < -1) {
157 |         fprintf(stderr, "Error reading %s; index would be incomplete.\n", filename);
158 |         exit(EXIT_FAILURE);
159 |     }
160 | 
161 |     bam_hdr_destroy(h);
162 |     bam_destroy1(b);
163 |     hts_close(fp);
164 |     if (p.pool) { // must be after fp
165 |         hts_tpool_destroy(p.pool);
166 |     }
167 | 
168 |     // fill in how many records we wrote
169 |     if (bc_idx_write_header(out_fp, idx) > 0) {
170 |         fprintf(stderr, "Failed to write header to index.\n");
171 |         exit(EXIT_FAILURE);
172 |     }
173 |     if (fclose(out_fp) != 0) {
174 |         fprintf(stderr, "Failed to close index file %s.\n", output_fname);
175 |         exit(EXIT_FAILURE);
176 |     }
177 |     fprintf(stderr, "Written %zu/%zu records to index.\n", idx->n_chunks, n_records);
178 |     bc_idx_destroy(idx);
179 | }
139 | 
140 | 
141 | // Entry point for `bamindex build`: derive the index name from the BAM path,
142 | // build the index and report the CPU time used.
143 | int main_build(int argc, char *argv[]) {
144 |     const clock_t started = clock();
145 |     arguments_t args = parse_arguments(argc, argv);
146 | #ifdef NOTHREADS
147 |     if (args.threads != 1) {
148 |         fprintf(stderr, "--threads set to %d, but threading not supported by this build.\n", args.threads);
149 |     }
150 | #endif
151 | 
152 |     char* index_fname = generate_index_filename(args.bam, NULL);
153 |     index_build(args.bam, index_fname, args.threads, args.chunk_size);
154 |     free(index_fname);
155 | 
156 |     const clock_t finished = clock();
157 |     const double cpu_seconds = (double)(finished - started) / CLOCKS_PER_SEC;
158 |     fprintf(stderr, "Total CPU time: %fs\n", cpu_seconds);
159 |     return EXIT_SUCCESS;
160 | }
160 | 


--------------------------------------------------------------------------------
/src/bamindex/dump_main.c:
--------------------------------------------------------------------------------
 1 | // bamindex dump program
 2 | 
 3 | #include <err.h>
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <sys/stat.h>
 7 | #include <time.h>
 8 | 
 9 | #include "index.h"
10 | 
11 | #include <argp.h>
12 | 
13 | static char doc[] = 
14 | "bamindex dump -- dump a BAM chunk index to stdout as text.\
15 | \vThe program simply writes the contents of an index to stdout for human \
16 | inspection. It has no other purpose.";
17 | static char args_doc[] = "<reads.bam.bci>";
18 | static struct argp_option options[] = {
19 |     { 0 }
20 | };
21 | 
22 | typedef struct arguments {
23 |     const char* index;
24 | } arguments_t;
25 | 
26 | // argp callback for `bamindex dump`: accepts exactly one positional
27 | // argument (the index file) and no options.
28 | static error_t parse_opt (int key, char *arg, struct argp_state *state) {
29 |     arguments_t *parsed = state->input;
30 |     if (key == ARGP_KEY_ARG) {
31 |         // keep the first positional argument only
32 |         if (state->arg_num == 0) {
33 |             parsed->index = arg;
34 |         }
35 |         return 0;
36 |     }
37 |     if (key == ARGP_KEY_NO_ARGS) {
38 |         argp_usage (state);
39 |         return 0;
40 |     }
41 |     if (key == ARGP_KEY_END) {
42 |         if (state->arg_num != 1) {
43 |             argp_usage (state);
44 |         }
45 |         return 0;
46 |     }
47 |     return ARGP_ERR_UNKNOWN;
48 | }
47 | 
48 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0};
49 | 
50 | // Run argp over the command line; `index` stays NULL until argp sets it.
51 | static arguments_t parse_arguments(int argc, char** argv) {
52 |     arguments_t args = { .index = NULL };
53 |     argp_parse(&argp, argc, argv, 0, 0, &args);
54 |     return args;
55 | }
56 | 
57 | 
58 | /**
59 |  * Print the contents of a chunk index to stdout.
60 |  *
61 |  * Writes a summary line followed by one "<file offset> <query name>" line
62 |  * per chunk. Exits the process with a non-zero status if the file cannot be
63 |  * opened or parsed.
64 |  */
65 | void index_dump(const char* filename) {
66 |     FILE *fp = fopen(filename, "rb");
67 |     if (fp == NULL) {
68 |         // err() appends the errno description and the newline, then exits
69 |         err(1, "Cannot open index file %s", filename);
70 |     }
71 | 
72 |     bc_idx_t *idx = bc_idx_read(fp);
73 |     fclose(fp);
74 |     if (idx == NULL) {
75 |         fprintf(stderr, "Couldn't read index file: %s.\n", filename);
76 |         exit(EXIT_FAILURE);
77 |     }
78 |     fprintf(stdout, "Index contains %zu chunks of size %zu.\n", idx->n_chunks, idx->chunk_size);
79 |     for (size_t i=0; i<idx->n_chunks; ++i){
80 |         fprintf(stdout, "%zu %s\n", (idx->recs[i]).file_offset, (idx->recs[i]).qname);
81 |     }
82 |     bc_idx_destroy(idx);
83 | }
78 | 
79 | 
80 | // Entry point for `bamindex dump`: print the index and the CPU time used.
81 | int main_dump(int argc, char *argv[]) {
82 |     const clock_t started = clock();
83 |     arguments_t args = parse_arguments(argc, argv);
84 |     index_dump(args.index);
85 |     const clock_t finished = clock();
86 |     const double cpu_seconds = (double)(finished - started) / CLOCKS_PER_SEC;
87 |     fprintf(stderr, "Total CPU time: %fs\n", cpu_seconds);
88 |     return EXIT_SUCCESS;
89 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/src/bamindex/fetch_main.c:
--------------------------------------------------------------------------------
  1 | // bamindex fetch program
  2 | 
  3 | #include <err.h>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <sys/stat.h>
  7 | #include <time.h>
  8 | 
  9 | #include "htslib/faidx.h"
 10 | #include "htslib/sam.h"
 11 | #include "htslib/thread_pool.h"
 12 | #include "htslib/bgzf.h"
 13 | 
 14 | #include "index.h"
 15 | 
 16 | #include <argp.h>
 17 | 
 18 | // Help text for `bamindex fetch --help`; the part after \v is printed
 19 | // below the option list.
 20 | static char doc[] =
 21 | "bamindex fetch -- fetch records from a BAM according to an index."
 22 | "\vThe program simply will fetch a batch of records from a BAM file "
 23 | "using an index and a chunk ID. Output is written as uncompressed "
 24 | "BAM to stdout.";
 25 | // The positional argument is the BAM itself; parse_arguments() derives the
 26 | // index path from it.
 27 | static char args_doc[] = "<reads.bam>";
 28 | static struct argp_option options[] = {
 29 |     {0, 0, 0, 0,
 30 |         "General options:", 0},
 31 |     {"threads", 't', "THREADS", 0,
 32 |         "Number of threads for BAM processing.", 0},
 33 |     {"chunk", 'c', "INDEX", 0,
 34 |         "Chunk index to retrieve.", 0},
 35 |     { 0 }
 36 | };
 33 | 
 34 | typedef struct arguments {
 35 |     const char* bam;
 36 |     const char* index;
 37 |     int chunk_idx;
 38 |     int threads;
 39 | } arguments_t;
 40 | 
 41 | /**
 42 |  * argp callback for `bamindex fetch`.
 43 |  *
 44 |  * Stores option values and the single positional BAM path in the
 45 |  * arguments_t passed through state->input.
 46 |  *
 47 |  * Returns 0 for handled keys and ARGP_ERR_UNKNOWN for anything else.
 48 |  */
 49 | static error_t parse_opt (int key, char *arg, struct argp_state *state) {
 50 |     arguments_t *arguments = state->input;
 51 |     switch (key) {
 52 |         case 't':
 53 |             arguments->threads = atoi(arg);
 54 |             break;
 55 |         case 'c':
 56 |             arguments->chunk_idx = atoi(arg);
 57 |             // the value is used to subscript the index records, so a
 58 |             // negative number can never be valid
 59 |             if (arguments->chunk_idx < 0) {
 60 |                 argp_error (state, "chunk must be a non-negative integer (got '%s').", arg);
 61 |             }
 62 |             break;
 63 |         case ARGP_KEY_NO_ARGS:
 64 |             argp_usage (state);
 65 |             break;
 66 |         case ARGP_KEY_ARG:
 67 |             // only the first positional argument is kept; the count is
 68 |             // enforced once parsing finishes (ARGP_KEY_END)
 69 |             if (state->arg_num == 0) {
 70 |                 arguments->bam = arg;
 71 |             }
 72 |             break;
 73 |         case ARGP_KEY_END:
 74 |             if (state->arg_num != 1)
 75 |                 argp_usage (state);
 76 |             break;
 77 |         default:
 78 |             return ARGP_ERR_UNKNOWN;
 79 |     }
 80 |     return 0;
 81 | }
 68 | 
 69 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0};
 70 | 
 71 | // Run argp over the command line, then derive the index filename from the
 72 | // BAM path. The returned `index` string is heap-allocated.
 73 | static arguments_t parse_arguments(int argc, char** argv) {
 74 |     arguments_t args = {
 75 |         .bam = NULL,
 76 |         .index = NULL,
 77 |         .chunk_idx = 0,
 78 |         .threads = 1,
 79 |     };
 80 |     argp_parse(&argp, argc, argv, 0, 0, &args);
 81 |     args.index = generate_index_filename(args.bam, args.index);
 82 |     return args;
 83 | }
 81 | 
 82 | 
 83 | /**
 84 |  * Write one chunk of records from a BAM to stdout as uncompressed BAM.
 85 |  *
 86 |  * @param bam_fname    input BAM path.
 87 |  * @param index_fname  chunk index belonging to `bam_fname`.
 88 |  * @param chunk        zero-based chunk number; must be < the index's n_chunks.
 89 |  * @param threads      values > 1 enable an htslib thread pool of that size.
 90 |  *
 91 |  * At most `chunk_size` records (as stored in the index) are written,
 92 |  * fewer if the BAM ends first. Exits the process with a non-zero status
 93 |  * on any open, seek, read or write failure, or if `chunk` is out of range.
 94 |  */
 95 | void index_fetch(const char* bam_fname, const char* index_fname, int chunk, int threads) {
 96 |     htsFile *fp = hts_open(bam_fname, "r");
 97 |     bam_hdr_t *h = (fp == NULL) ? NULL : sam_hdr_read(fp);
 98 |     if(h == NULL) {
 99 |         fprintf(stderr, "Could not open %s\n", bam_fname);
100 |         exit(EXIT_FAILURE);
101 |     }
102 |     htsThreadPool p = {NULL, 0};
103 |     if (threads > 1 ) {
104 |         p.pool = hts_tpool_init(threads);
105 |         hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
106 |     }
107 | 
108 |     FILE* idx_fp = fopen(index_fname, "rb");
109 |     if (idx_fp == NULL) {
110 |         // err() appends the errno description and the newline, then exits
111 |         err(1, "Cannot open index file %s", index_fname);
112 |     }
113 |     bc_idx_t *idx = bc_idx_read(idx_fp);
114 |     fclose(idx_fp);
115 |     if (idx == NULL) {
116 |         fprintf(stderr, "Couldn't read index file: %s.\n", index_fname);
117 |         exit(EXIT_FAILURE);
118 |     }
119 |     // reject chunk numbers that would read outside idx->recs
120 |     if (chunk < 0 || (size_t)chunk >= idx->n_chunks) {
121 |         fprintf(stderr, "Chunk %d is out of range: index contains %zu chunks.\n",
122 |             chunk, idx->n_chunks);
123 |         exit(EXIT_FAILURE);
124 |     }
125 |     bc_rec_t rec = idx->recs[chunk];
126 | 
127 |     fprintf(stderr, "Starting from: %zu %s\n", rec.file_offset, rec.qname);
128 |     fprintf(stderr, "Reading %zu records from bam.\n", idx->chunk_size);
129 | 
130 |     if(bgzf_seek(fp->fp.bgzf, rec.file_offset, SEEK_SET) != 0) {
131 |         fprintf(stderr, "Failed to seek to first record.\n");
132 |         exit(EXIT_FAILURE);
133 |     }
134 | 
135 |     size_t written = 0;
136 |     bam1_t* b = bam_init1();
137 |     if (b == NULL) {
138 |         fprintf(stderr, "Failed to allocate BAM record.\n");
139 |         exit(EXIT_FAILURE);
140 |     }
141 |     htsFile * out_fp;
142 |     if ((out_fp = hts_open("-", "wb0")) == 0) {
143 |         fprintf(stderr, "Failed to open standard output for writing.\n");
144 |         exit(EXIT_FAILURE);
145 |     }
146 | 
147 |     // TODO: fill in the NULLs here
148 |     if(sam_hdr_add_pg(h, "bamindex.fetch", "VN", argp_program_version, NULL, NULL, NULL) != 0){
149 |         fprintf(stderr, "Failed to add PG line to the header.\n");
150 |         exit(EXIT_FAILURE);
151 |     }
152 |     if(sam_hdr_write(out_fp, h) != 0) {
153 |         fprintf(stderr, "Failed to write the SAM header.\n");
154 |         exit(EXIT_FAILURE);
155 |     }
156 |     // test the count first so no record beyond the chunk is read
157 |     int ret = 0;
158 |     while ((written < idx->chunk_size) && ((ret = sam_read1(fp, h, b)) >= 0)) {
159 |         if((sam_write1(out_fp, h, b) < 0)) {
160 |             fprintf(stderr, "Failed to write output record.");
161 |             exit(EXIT_FAILURE);
162 |         }
163 |         written++;
164 |     }
165 |     // sam_read1 returns -1 at end of file; anything lower is a read error
166 |     if (ret < -1) {
167 |         fprintf(stderr, "Error reading records from %s.\n", bam_fname);
168 |         exit(EXIT_FAILURE);
169 |     }
170 |     if (hts_close(out_fp) != 0) {
171 |         fprintf(stderr, "Failed to close standard output.\n");
172 |         exit(EXIT_FAILURE);
173 |     }
174 | 
175 |     bam_hdr_destroy(h);
176 |     bam_destroy1(b);
177 |     hts_close(fp);
178 |     if (p.pool) { // must be after fp
179 |         hts_tpool_destroy(p.pool);
180 |     }
181 | 
182 |     bc_idx_destroy(idx);
183 |     fprintf(stderr, "Written %zu records to output.\n", written);
184 | }
155 | 
156 | 
157 | // Entry point for `bamindex fetch`: fetch the requested chunk, release the
158 | // heap-allocated index filename and report the CPU time used.
159 | int main_fetch(int argc, char *argv[]) {
160 |     const clock_t started = clock();
161 |     arguments_t args = parse_arguments(argc, argv);
162 |     index_fetch(args.bam, args.index, args.chunk_idx, args.threads);
163 |     free((char*)args.index);
164 |     const clock_t finished = clock();
165 |     const double cpu_seconds = (double)(finished - started) / CLOCKS_PER_SEC;
166 |     fprintf(stderr, "Total CPU time: %fs\n", cpu_seconds);
167 |     return EXIT_SUCCESS;
168 | }
166 | 


--------------------------------------------------------------------------------
/src/bamindex/index.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <string.h>
  3 | #include <stdio.h>
  4 | 
  5 | #include "htslib/sam.h"
  6 | 
  7 | #include "index.h"
  8 | 
  9 | const size_t MAGIC_LEN = 5;
 10 | const char* FILE_MAGIC = "FANZ\0";
 11 | 
 12 | 
 13 | // Return a newly allocated index filename: a copy of `input_index` when one
 14 | // is given, otherwise `input_bam` with ".bci" appended. The caller frees the
 15 | // result. Exits the process if the allocation fails.
 16 | char* generate_index_filename(const char* input_bam, const char* input_index) {
 17 |     const char* stem = (input_index != NULL) ? input_index : input_bam;
 18 |     const char* suffix = (input_index != NULL) ? "" : ".bci";
 19 | 
 20 |     // +1 for the terminating NUL
 21 |     char* out_fn = calloc(strlen(stem) + strlen(suffix) + 1, sizeof(char));
 22 |     if(out_fn == NULL) {
 23 |         exit(EXIT_FAILURE);
 24 |     }
 25 |     strcpy(out_fn, stem);
 26 |     strcat(out_fn, suffix);
 27 |     return out_fn;
 28 | }
 32 | 
 33 | // Allocate a zero-filled index structure; returns NULL if allocation fails.
 34 | bc_idx_t *bc_idx_init(void) {
 35 |     return (bc_idx_t*)calloc(1, sizeof(bc_idx_t));
 36 | }
 39 | 
 40 | bc_idx_t *bc_idx_init1(const size_t chunk_size) {
 41 |     bc_idx_t *idx = bc_idx_init();
 42 |     if (idx == NULL) return NULL;
 43 |     idx->version = 1;
 44 |     idx->chunk_size = chunk_size;
 45 |     idx->n_chunks = 0;
 46 |     idx->stored = 0;
 47 |     return idx;
 48 | }
 49 | 
 50 | void bc_idx_destroy(bc_idx_t *h) {
 51 |     if (h->stored > 0) {
 52 |         for (size_t i=0; i<h->stored; ++i) {
 53 |             if (h->recs[i].qname != NULL) {
 54 |                 free(h->recs[i].qname);
 55 |             }
 56 |         }
 57 |         if (h->recs != NULL) {
 58 |             free(h->recs);
 59 |         }
 60 |     }
 61 |     free(h);
 62 | }
 63 | 
 64 | bc_idx_t *bc_idx_read(FILE *fp) {
 65 |     char buf[MAGIC_LEN];
 66 |     size_t magic_len = fread(&(buf), sizeof(char), MAGIC_LEN, fp);
 67 |     if (magic_len != MAGIC_LEN || memcmp(buf, FILE_MAGIC, MAGIC_LEN)) {
 68 |         fprintf(stderr, "Invalid BAM chunk index binary header.\n");
 69 |         return NULL;
 70 |     }
 71 |     bc_idx_t *h = bc_idx_init();
 72 |     if(h == NULL) {
 73 |         fprintf(stderr, "Failed to allocate header.\n");
 74 |         return NULL;
 75 |     }
 76 |     size_t items = 0;
 77 |     items += fread(&(h->version), sizeof(h->version), 1, fp);
 78 |     items += fread(&(h->chunk_size), sizeof(h->chunk_size), 1, fp);
 79 |     items += fread(&(h->n_chunks), sizeof(h->n_chunks), 1, fp);
 80 |     if (items != 3) {
 81 |         bc_idx_destroy(h);
 82 |         fprintf(stderr, "Invalid BAM chunk index binary header.\n");
 83 |         return NULL;
 84 |     }
 85 | 
 86 |     h->stored = h->n_chunks;
 87 |     h->recs = (bc_rec_t*)calloc(h->stored, sizeof(bc_rec_t));
 88 | 
 89 |     size_t valid = 0;
 90 |     char *msg = "Failed to read index contents. File is currupt.\n";
 91 |     for (size_t i=0; i<h->n_chunks; ++i, ++valid) {
 92 |         bc_rec_t *r = &(h->recs[i]);
 93 |         if (fread(&(r->file_offset), sizeof(r->file_offset), 1, fp) != 1) {
 94 |             fputs(msg, stderr); break;
 95 |         }
 96 |         if (fread(&(r->lqname), sizeof(r->lqname), 1, fp) != 1) {
 97 |             fputs(msg, stderr); break;
 98 |         }
 99 |         r->qname = (char*)calloc(r->lqname, sizeof(char));
100 |         if (fread(r->qname, sizeof(char), r->lqname, fp) != r->lqname) {
101 |             fputs(msg, stderr); break;
102 |         }
103 |     }
104 |     if (valid != h->stored) {
105 |         bc_idx_destroy(h);
106 |         return NULL;
107 |     }
108 |     return h;
109 | }
110 | 
111 | int bc_idx_write_header(FILE* fp, bc_idx_t* idx) {
112 |     int rtn = 0;
113 |     fseek(fp, 0, SEEK_SET);
114 |     if (fwrite(FILE_MAGIC, sizeof(char), MAGIC_LEN, fp) != MAGIC_LEN) rtn = 1;
115 |     if (fwrite(&(idx->version), sizeof(idx->version), 1, fp) != 1) rtn = 2;
116 |     if (fwrite(&(idx->chunk_size), sizeof(idx->chunk_size), 1, fp) != 1) rtn = 3;
117 |     if (fwrite(&(idx->n_chunks), sizeof(idx->n_chunks), 1, fp) != 1) rtn = 4;
118 |     fseek(fp, 0, SEEK_END);
119 |     return rtn;
120 | }
121 | 
122 | int bc_idx_write(FILE* fp, bc_idx_t* idx, size_t offset, char* qname) {
123 |     // write: file offset, length qname, qname
124 |     size_t l_qname = strlen(qname) + 1;
125 |     if (fwrite(&offset, sizeof(offset), 1, fp) != 1) return -1;
126 |     if (fwrite(&l_qname, sizeof(l_qname), 1, fp) != 1) return -1; 
127 |     if (fwrite(qname, sizeof(char), l_qname, fp) != l_qname) return -1;
128 |     (idx->n_chunks)++;
129 |     return (int)(idx->n_chunks);
130 | }
131 | 


--------------------------------------------------------------------------------
/src/bamindex/index.h:
--------------------------------------------------------------------------------
 1 | #ifndef _BAM_INDEX_INDEX_H
 2 | #define _BAM_INDEX_INDEX_H
 3 | 
 4 | 
// One index record, as written by bc_idx_write and read back by bc_idx_read.
typedef struct bc_rec_t {
    size_t file_offset;  // the `offset` value passed to bc_idx_write
    size_t lqname;       // number of name bytes stored in the file; bc_idx_write counts the NUL terminator
    char *qname;         // heap-allocated name data, freed by bc_idx_destroy
} bc_rec_t;
10 | 
11 | 
// Index header plus (optionally) the records loaded in memory.
typedef struct bc_idx_t {
    size_t version;     // format version; bc_idx_init1 sets 1
    size_t chunk_size;  // value given to bc_idx_init1 -- presumably records per chunk, TODO confirm
    size_t n_chunks;    // number of records; incremented by each successful bc_idx_write
    size_t stored;  // TODO: disentangle header from contents
                    // number of entries of recs that bc_idx_destroy walks; set by bc_idx_read
    bc_rec_t *recs;     // record array filled by bc_idx_read; stays NULL after bc_idx_init/bc_idx_init1
} bc_idx_t;
19 | 
20 | 
// Returns a newly allocated copy of input_index, or input_bam + ".bci" when
// input_index is NULL. The caller frees the result.
char* generate_index_filename(const char* input_bam, const char* input_index);
// Allocates a zero-filled index; NULL on allocation failure.
bc_idx_t *bc_idx_init(void);
// As bc_idx_init, with version set to 1 and chunk_size set to `every`
// (the definition names this parameter chunk_size).
bc_idx_t *bc_idx_init1(size_t every);
// Frees an index, its record array and the record names.
void bc_idx_destroy(bc_idx_t *h);
// Reads a whole index from fp; NULL if the file is invalid or truncated.
bc_idx_t *bc_idx_read(FILE *fp);
// Rewrites the header at the start of fp, then seeks back to the end.
// Returns 0 on success, non-zero if a write failed.
int bc_idx_write_header(FILE* fp, bc_idx_t* idx);
// Appends one record at the current position of fp and increments
// idx->n_chunks. Returns the new count, or -1 on write failure.
int bc_idx_write(FILE* fp, bc_idx_t* idx, size_t offset, char* qname);
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/src/bamindex/main.c:
--------------------------------------------------------------------------------
// bamindex program
  2 | 
  3 | #include <err.h>
  4 | #include <string.h>
  5 | #include <stdbool.h>
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <sys/resource.h>
  9 | 
 10 | 
 11 | enum command_mode {
 12 |     MODE_HELP = 0,
 13 |     MODE_BUILD,
 14 |     MODE_FETCH,
 15 |     MODE_DUMP,
 16 |     MODE_INVALID };
 17 | static const enum command_mode ncommand = MODE_INVALID;
 18 | 
 19 | enum command_mode get_mode(const char *modestr) {
 20 |     if (0 == strcmp(modestr, "help")) {return MODE_HELP;}
 21 |     if (0 == strcmp(modestr, "build")) {return MODE_BUILD;}
 22 |     if (0 == strcmp(modestr, "fetch")) {return MODE_FETCH;}
 23 |     if (0 == strcmp(modestr, "dump")) {return MODE_DUMP;}
 24 |     return MODE_INVALID;
 25 | }
 26 | 
 27 | void fprint_commands(void);
 28 | 
 29 | const char *mode_string(const enum command_mode mode) {
 30 |     switch (mode) {
 31 |     case MODE_HELP:
 32 |         return "help";
 33 |     case MODE_BUILD:
 34 |         return "build";
 35 |     case MODE_FETCH:
 36 |         return "fetch";
 37 |     case MODE_DUMP:
 38 |         return "dump";
 39 |     case MODE_INVALID:
 40 |         fprint_commands();
 41 |         errx(EXIT_FAILURE, "Invalid subcommand\n");
 42 |     default:
 43 |         errx(EXIT_FAILURE, "bamindex failure -- report bug\n");
 44 |     }
 45 | 
 46 |     return NULL;
 47 | }
 48 | 
 49 | const char *mode_description(const enum command_mode mode) {
 50 |     switch (mode) {
 51 |     case MODE_HELP:
 52 |         return "Print general help or help about a subcommand.";
 53 |     case MODE_BUILD:
 54 |         return "Build a BAM index.";
 55 |     case MODE_FETCH:
 56 |         return "Fetch records from a BAM using an index.";
 57 |     case MODE_DUMP:
 58 |         return "Dump an index fetch to text.";
 59 |     case MODE_INVALID:
 60 |         fprint_commands();
 61 |         errx(EXIT_FAILURE, "Invalid subcommand\n");
 62 |     default:
 63 |         errx(EXIT_FAILURE, "bamindex failure -- report bug\n");
 64 |     }
 65 | 
 66 |     return NULL;
 67 | }
 68 | 
 69 | void fprint_commands(void) {
 70 |     for (enum command_mode i = 0; i < ncommand; i++) {
 71 |         fprintf(
 72 |             stderr, "* bamindex %-14s%s\n", mode_string(i), mode_description(i));
 73 |     }
 74 | }
 75 | int main_build(int argc, char *argv[]);
 76 | int main_fetch(int argc, char *argv[]);
 77 | int main_dump(int argc, char *argv[]);
 78 | 
 79 | int main(int argc, char *argv[]) {
 80 | 
 81 |     if (argc == 1) {
 82 |         // Called as program name on it's own
 83 |         fprint_commands();
 84 |         return EXIT_SUCCESS;
 85 |     }
 86 | 
 87 |     int ret = EXIT_FAILURE;
 88 |     switch (get_mode(argv[1])) {
 89 |     case MODE_HELP:
 90 |         fprint_commands();
 91 |         break;
 92 |     case MODE_BUILD:
 93 |         ret = main_build(argc - 1, argv + 1);
 94 |         break;
 95 |     case MODE_FETCH:
 96 |         ret = main_fetch(argc - 1, argv + 1);
 97 |         break;
 98 |     case MODE_DUMP:
 99 |         ret = main_dump(argc - 1, argv + 1);
100 |         break;
101 |     default:
102 |         ret = EXIT_FAILURE;
103 |         warnx("Unrecognised subcommand %s\n", argv[1]);
104 |     }
105 | 
106 |     return ret;
107 | }
108 | 
109 | 


--------------------------------------------------------------------------------
/src/bamstats/args.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <sys/stat.h>
  5 | #include <argp.h>
  6 | 
  7 | #include "htslib/sam.h"
  8 | #include "htslib/faidx.h"
  9 | #include "args.h"
 10 | #include "../version.h"
 11 | 
 12 | const char *argp_program_bug_address = "chris.wright@nanoporetech.com";
 13 | static char doc[] = 
 14 | "bamstats -- summarise rears/alignments in one or more BAM files.\
 15 | \vThe program creates a simple TSV file containing statistics for \
 16 | each primary alignment stored within the input BAM files.";
 17 | static char args_doc[] = "<reads.bam>";
// Command-line options understood by bamstats.
// Each option entry gives: long name, key, argument placeholder (0 for a
// flag), flags, help text and group number. The key is the value that
// parse_opt receives in its switch; the 0x100-0x900 keys are matched there
// by the same numeric constants. Entries whose name and key are both 0 carry
// a section heading for the help output.
static struct argp_option options[] = {
    // --- section heading ---
    {0, 0, 0, 0,
        "General options:", 0},
    {"region", 'r', "chr:start-end", 0,
        "Genomic region to process.", 0},
    {"bed", 'b', "BEDFILE", 0,
        "BED file for regions to process.", 0},
    {"threads", 't', "THREADS", 0,
        "Number of threads for BAM processing.", 0},
    {"sample", 's',"SAMPLE NAME",   0,
        "Sample name (if given, adds a 'sample_name' column).", 0},
    {"flagstats", 'f', "FLAGSTATS", 0,
        "File for outputting alignment flag counts.", 0},
    {"runids", 'i', "ID SUMMARY",  0,
        "Run ID summary output", 0},
    {"basecallers", 'l', "BASECALLERS", 0,
        "Basecaller summary output", 0},
    {"histograms", 0x400, "DIRECTORY", 0,
        "Directory for outputting histogram information. (default: bamstats-histograms)", 0},
    {"recalc_qual", 0x900, 0, 0,
        "Force recomputing mean quality, else use 'qs' tag in BAM if present.", 0},
    // --- section heading ---
    {0, 0, 0, 0,
        "Read filtering options:", 0},
    {"unmapped", 'u', 0, 0,
        "Include unmapped/unplaced reads in output.", 3},
    {"read_group", 'g', "RG", 0,
        "Only process reads from given read group.", 3},
    {"tag_name", 0x100, "TN", 0,
        "Only process reads with a given tag (see --tag_value).", 3},
    {"tag_value", 0x200, "VAL", 0,
        "Only process reads with a given tag value.", 3},
    {"haplotype", 0x300, "VAL", 0,
        "Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3},
    // --- section heading ---
    {0, 0, 0, 0,
        "Poly-A Options:", 0},
    {"poly_a", 0x500, 0, 0,
        "Enable poly-A tail length histogram.", 5},
    {"poly_a_cover", 0x600, "PCT_COVERAGE", 0,
        "Reference alignment coverage for acceptance of read. (default: 95)", 5},
    {"poly_a_qual", 0x700, "QUAL", 0,
        "Read mean Q score for acceptance of read. (default: 10)", 5},
    {"poly_a_rev", 0x800, 0, 0,
        "Allow reverse alignments (useful for cDNA, default is appropriate for direct RNA seq).", 5},
    // all-zero entry terminates the table
    { 0 }
};
 63 | 
 64 | bool file_exists(char* filename) {
 65 |     struct stat st;
 66 |     return (stat(filename, &st) == 0);
 67 | }
 68 | 
 69 | static int tag_items = 0;
 70 | static bool tag_given = false;
 71 | static bool hp_given = false;
 72 | static error_t parse_opt (int key, char *arg, struct argp_state *state) {
 73 |     arguments_t *arguments = state->input;
 74 |     switch (key) {
 75 |         case 'r':
 76 |             arguments->region = arg;
 77 |             break;
 78 |         case 'b':
 79 |             arguments->bed = arg;
 80 |             break;
 81 |         case 'g':
 82 |             arguments->read_group = arg;
 83 |             break;
 84 |         case 'f':
 85 |             arguments->flagstats = arg;
 86 |             break;
 87 |         case 'i':
 88 |             arguments->runids = arg;
 89 |             break;
 90 |         case 'l':
 91 |             arguments->basecallers = arg;
 92 |             break;
 93 |         case 0x400:
 94 |             arguments->histograms = arg;
 95 |             break;
 96 |         case 0x500:
 97 |             arguments->poly_a = true;
 98 |             break;
 99 |         case 0x600:
100 |             arguments->poly_a_cover = atof(arg);
101 |             break;
102 |         case 0x700:
103 |             arguments->poly_a_qual = atof(arg);
104 |             break;
105 |         case 0x800:
106 |             arguments->poly_a_rev = true;
107 |             break;
108 |         case 's':
109 |             arguments->sample = arg;
110 |             break;
111 |         case 'u':
112 |             arguments->unmapped = true;
113 |             break;
114 |         case 0x100:
115 |             if (strlen(arg) > 2) {
116 |                 argp_error(state, "Tag name should be a two-letter code, received: '%s'.", arg);
117 |             }
118 |             memcpy(arguments->tag_name, arg, 2 *sizeof(char));
119 |             tag_items += 1;
120 |             tag_given = true;
121 |             break;
122 |         case 0x200:
123 |             arguments->tag_value = atoi(arg);
124 |             tag_items += 1;
125 |             tag_given = true;
126 |             break;
127 |         case 0x300:
128 |             memcpy(arguments->tag_name, "HP", 2 * sizeof(char));
129 |             arguments->tag_value = atoi(arg);
130 |             tag_items += 2;
131 |             hp_given = true;
132 |             break;
133 |         case 't':
134 |             arguments->threads = atoi(arg);
135 |             break;
136 |         case 0x900:
137 |             arguments->force_recalc_qual = true;
138 |             break;
139 |         case ARGP_KEY_NO_ARGS:
140 |             argp_usage (state);
141 |             break;
142 |         case ARGP_KEY_ARG:
143 |             if (state->arg_num == 0) {
144 |                 arguments->bam = (const char**)(&state->argv[state->next - 1]);
145 |                 state->next = state->argc;
146 |                 break;
147 |             }
148 |             break;
149 |         case ARGP_KEY_END:
150 |             if (state->arg_num != 1)
151 |                 argp_usage (state);
152 |             break;
153 |         default:
154 |             return ARGP_ERR_UNKNOWN;
155 |     }
156 |     return 0;
157 | }
158 | 
159 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0};
160 | 
161 | arguments_t parse_arguments(int argc, char** argv) {
162 |     arguments_t args;
163 |     args.bam = NULL;
164 |     args.flagstats = NULL;
165 |     args.runids = NULL;
166 |     args.basecallers = NULL;
167 |     args.histograms = "bamstats-histograms";
168 |     args.poly_a = false;
169 |     args.poly_a_cover = 95;
170 |     args.poly_a_qual = 10;
171 |     args.poly_a_rev = false;
172 |     args.sample = NULL;
173 |     args.ref = NULL;
174 |     args.region = NULL;
175 |     args.bed = NULL;
176 |     args.unmapped = false;
177 |     args.read_group = NULL;
178 |     args.tag_name[0] = '\0';
179 |     args.tag_value = -1;
180 |     args.threads = 1;
181 |     args.force_recalc_qual = false;
182 |     argp_parse(&argp, argc, argv, 0, 0, &args);
183 |     if (tag_items % 2 > 0) {
184 |         fprintf(stderr, "ERROR: Both or neither of --tag_name and --tag_value must be given.\n");
185 |         exit(EXIT_FAILURE);
186 |     }
187 |     if (tag_given && hp_given) {
188 |         fprintf(stderr, "ERROR: If --haplotype is given neither of --tag_name or --tag_value should be provided.\n");
189 |         exit(EXIT_FAILURE);
190 |     }
191 |     return args;
192 | }
193 | 


--------------------------------------------------------------------------------
/src/bamstats/args.h:
--------------------------------------------------------------------------------
 1 | #ifndef _MODBAMBED_ARGS_H
 2 | #define _MODBAMBED_ARGS_H
 3 | 
 4 | #include <stdbool.h>
 5 | 
 6 | 
// Parsed bamstats command-line options (filled by parse_arguments).
typedef struct arguments {
    const char** bam;        // points into argv at the first positional argument
    char* flagstats;         // file for alignment flag counts (-f), NULL if not requested
    char* runids;            // run ID summary output (-i), NULL if not requested
    char* basecallers;       // basecaller summary output (-l), NULL if not requested
    char* histograms;        // histogram output directory; default "bamstats-histograms"
    bool poly_a;             // enable the poly-A tail length histogram
    float poly_a_cover;      // reference coverage threshold for poly-A; default 95
    float poly_a_qual;       // mean Q score threshold for poly-A; default 10
    bool poly_a_rev;         // allow reverse alignments for poly-A
    char *sample;            // sample name (-s); adds a 'sample_name' column when set
    char* ref;               // initialised to NULL; no option in parse_opt sets it
    char* region;            // region string (-r), NULL for none
    char* bed;               // BED file of regions (-b), NULL for none
    char* read_group;        // only process this read group (-g), NULL for all
    char tag_name[2];        // two-character tag to filter on; NOT NUL-terminated, first byte '\0' when unset
    int tag_value;           // required integer value of tag_name; default -1
    int threads;             // BAM processing threads; default 1
    bool unmapped;           // include unmapped/unplaced reads (-u)
    bool force_recalc_qual;  // recompute mean quality rather than using the 'qs' tag
} arguments_t;
28 | 
// Parse the command line into an arguments_t, applying defaults for options
// that are not given. Exits the process on inconsistent tag-filter options.
arguments_t parse_arguments(int argc, char** argv);
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/bamstats/bamiter.c:
--------------------------------------------------------------------------------
  1 | #include <ctype.h>
  2 | #include <errno.h>
  3 | #include <string.h>
  4 | 
  5 | #include "bamiter.h"
  6 | #include "common.h"
  7 | 
  8 | /** Set up a bam file for reading (filtered) records.
  9 |  *  
 10 |  *  @param fp htsFile pointer
 11 |  *  @param idx hts_idx_t pointer
 12 |  *  @param hdr sam_hdr_t pointer
 13 |  *  @param chr bam target name.
 14 |  *  @param start start position of chr to consider.
 15 |  *  @param end end position of chr to consider.
 16 |  *  @param overlap_start whether reads overhanging start should be included.
 17 |  *  @param read_group by which to filter alignments.
 18 |  *  @param tag_name by which to filter alignments.
 19 |  *  @param tag_value associated with tag_name.
 20 |  *
 21 |  *  The return value can be freed with destroy_bam_iter_data.
 22 |  *
 23 |  */
 24 | mplp_data *create_bam_iter_data(
 25 |         htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr,
 26 |         const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
 27 |         const char *read_group, const char tag_name[2], const int tag_value) {
 28 | 
 29 |     mplp_data *data = xalloc(1, sizeof(mplp_data), "pileup init data");
 30 | 
 31 |     // find the target index for query below
 32 |     if (chr == NULL) {  // all reads
 33 |         data->iter = NULL;
 34 |     } else {
 35 |         int mytid;
 36 |         if (strcmp(chr, "*") == 0) { // unplaced
 37 |             mytid = HTS_IDX_NOCOOR;
 38 |             start = 0; end = INT64_MAX;
 39 |         } else {
 40 |             mytid = sam_hdr_name2tid(hdr, chr);
 41 |             if (mytid < 0) {
 42 |                 fprintf(stderr, "Failed to find reference sequence '%s' in bam.\n", chr);
 43 |                 free(data);
 44 |                 return NULL;
 45 |             }
 46 |         }
 47 |         data->iter = bam_itr_queryi(idx, mytid, start, end);
 48 |     }
 49 | 
 50 |     // setup bam interator
 51 |     data->fp = fp; data->idx = idx; data->hdr = hdr;
 52 |     data->min_start = overlap_start ? -1 : start; // unmapped reads have pos -1
 53 |     memcpy(data->tag_name, tag_name, 2); data->tag_value = tag_value;
 54 |     data->min_mapQ = 0; data->read_group = read_group;
 55 | 
 56 |     return data;
 57 | }
 58 | 
 59 | /** Clean up auxiliary bam reading data.
 60 |  *
 61 |  *  @param data auxiliary structure to clean.
 62 |  *
 63 |  */
 64 | void destroy_bam_iter_data(mplp_data *data) {
 65 |     bam_itr_destroy(data->iter);
 66 |     free(data);
 67 | }
 68 | 
 69 | 
 70 | /** Read a bam record.
 71 |  *
 72 |  *  @param data an mplp_data encoding the bam file to read with filter options.
 73 |  *  @param b output pointer.
 74 |  *
 75 |  */
 76 | int read_bam(void *data, bam1_t *b) {
 77 |     mplp_data *aux = (mplp_data*) data;
 78 |     uint8_t *tag;
 79 |     bool check_tag = (strcmp(aux->tag_name, "") != 0);
 80 |     bool have_rg = (aux->read_group != NULL);
 81 |     uint8_t *rg;
 82 |     char *rg_val;
 83 |     int ret;
 84 |     while (1) {
 85 |         ret = aux->iter ? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
 86 |         if (ret<0) break;
 87 |         // only take primary alignments
 88 |         //if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FQCFAIL | BAM_FDUP)) continue;
 89 |         // maybe remove reads overlapping start
 90 |         if (b->core.pos < aux->min_start) continue;
 91 |         // filter by mapping quality
 92 |         if ((int)b->core.qual < aux->min_mapQ) continue;
 93 |         // filter by tag
 94 |         if (check_tag) {
 95 |             tag = bam_get_tag_caseinsensitive((const bam1_t*) b, aux->tag_name);
 96 |             if (tag == NULL){ // tag isn't present or is currupt
 97 |                 if (aux->keep_missing) {
 98 |                     break;
 99 |                 } else {
100 |                     continue;
101 |                 }
102 |             }
103 |             errno = 0;
104 |             int tag_value = bam_aux2i(tag);
105 |             if (tag_value == 0 && errno == EINVAL) continue; // tag was not integer
106 |             if (tag_value != aux->tag_value) continue;
107 |         }
108 |         // filter by RG (read group):
109 |         if (have_rg) {
110 |             rg = bam_get_tag_caseinsensitive((const bam1_t*) b, "RG");
111 |             if (rg == NULL) continue;  // missing
112 |             rg_val = bam_aux2Z(rg);
113 |             if (rg_val == 0 && errno == EINVAL) continue;  // bad parse
114 |             if (strcmp(aux->read_group, rg_val) != 0) continue;  // not wanted
115 |         }
116 |         break;
117 |     }
118 |     return ret;
119 | }
120 | 
121 | 
122 | /** Create an map of query position to reference position
123 |  *
124 |  *  @param b alignment record
125 |  *
126 |  *  The length of the returned array is b->core->l_qlen.
127 |  */
128 | int *qpos2rpos(bam1_t *b) {
129 |     // we only deal in primary/soft-clipped alignments so length
130 |     // of qseq member is the length of the intact query sequence.
131 |     // TODO: add check for alignment being primary / no hard clipping
132 |     uint32_t qlen = b->core.l_qseq;
133 |     uint32_t *cigar = bam_get_cigar(b);
134 |     int *posmap = xalloc(qlen, sizeof(uint32_t), "pos_map");
135 |     for (size_t i = 0; i < qlen; ++i) posmap[i] = -1;  // unaligned
136 |     int qpos = 0, rpos = b->core.pos;
137 |     for (size_t i = 0; i < b->core.n_cigar; ++i){
138 |         uint32_t op = bam_cigar_op(cigar[i]);
139 |         uint32_t len = bam_cigar_oplen(cigar[i]);
140 |         uint32_t take = bam_cigar_type(op);
141 |         if (((take&0x1)>0) & ((take&0x2)>0)) {
142 |             // consumes query and ref
143 |             for (size_t j = 0; j < len; ++j, ++qpos, ++rpos) {
144 |                 posmap[qpos] = rpos;
145 |             }
146 |         }
147 |         else if ((take&0x1)>0) {
148 |             // consumes query only
149 |             qpos += len;
150 |         }
151 |         else {
152 |             // consumes ref
153 |             rpos += len;
154 |         }
155 |     }
156 |     return posmap;
157 | }
158 | 
159 | /** Fetch a BAM tag with case insensitivity
160 |  *
161 |  *  @param b BAM record
162 |  *  @param tag Tag to fetch via bam_aux_get
163 |  *
164 |  */
165 | uint8_t* bam_get_tag_caseinsensitive(const bam1_t* b, char* tag) {
166 | 
167 |     uint8_t* ret;
168 |     char upper_tag[3];
169 |     char lower_tag[3];
170 |     upper_tag[2] = '\0';
171 |     lower_tag[2] = '\0';
172 |     for (int i = 0; i < 2; i++) {
173 |         upper_tag[i] = toupper(tag[i]);
174 |         lower_tag[i] = tolower(tag[i]);
175 |     }
176 |     // Try uppercase variant
177 |     ret = bam_aux_get((const bam1_t*) b, upper_tag);
178 |     if (ret == NULL){
179 |         // Try lowercase variant
180 |         ret = bam_aux_get((const bam1_t*) b, lower_tag);
181 |     }
182 |     return ret;
183 | }
184 | 


--------------------------------------------------------------------------------
/src/bamstats/bamiter.h:
--------------------------------------------------------------------------------
 1 | #ifndef _MODBAMBED_BAMITER_H
 2 | #define _MODBAMBED_BAMITER_H
 3 | 
 4 | #include <stdint.h>
 5 | #include <stdbool.h>
 6 | #include "htslib/sam.h"
 7 | 
// parameters for bam iteration
// Built by create_bam_iter_data, consumed by read_bam and released with
// destroy_bam_iter_data.
typedef struct {
    htsFile *fp;             // file to read records from
    hts_idx_t *idx;          // index handed to create_bam_iter_data
    sam_hdr_t *hdr;          // header, used when reading without an iterator
    hts_itr_t *iter;         // region iterator; NULL means read every record in the file
    int min_start;           // records with pos below this are skipped; -1 when overhanging reads are kept
    int min_mapQ;            // records with mapping quality below this are skipped; create_bam_iter_data sets 0
    char tag_name[2];        // two-character tag to filter on, not NUL-terminated; first byte '\0' disables the filter
    int tag_value;           // integer value tag_name must have
    bool keep_missing;       // when the tag is absent: true returns the record, false skips it
    const char *read_group;  // only records with this RG pass; NULL disables the filter
} mplp_data;
21 | 
22 | /** Set up a bam file for reading (filtered) records.
23 |  *
24 |  *  @param fp htsFile pointer
25 |  *  @param idx hts_idx_t pointer
26 |  *  @param hdr sam_hdr_t pointer
27 |  *  @param chr bam target name.
28 |  *  @param start start position of chr to consider.
29 |  *  @param end end position of chr to consider.
30 |  *  @param overlap_start whether reads overhanging start should be included.
31 |  *  @param read_group by which to filter alignments.
32 |  *  @param tag_name by which to filter alignments.
33 |  *  @param tag_value associated with tag_name.
34 |  *
35 |  *  The return value can be freed with destroy_bam_iter_data.
36 |  *
37 |  */
38 | mplp_data *create_bam_iter_data(
39 |     htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr,
40 |     const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
41 |     const char *read_group, const char tag_name[2], const int tag_value);
42 | 
43 | /** Clean up auxiliary bam reading data.
44 |  *
45 |  *  @param data auxiliary structure to clean.
46 |  *
47 |  */
48 | void destroy_bam_iter_data(mplp_data *data);
49 | 
50 | /** Read a bam record.
51 |  *
52 |  *  @param data an mplp_data encoding the bam file to read with filter options.
53 |  *  @param b output pointer.
54 |  *
55 |  */
56 | int read_bam(void *data, bam1_t *b);
57 | 
/** Create a map of query position to reference position
 *
 *  @param b alignment record
 *
 *  The length of the returned array is b->core.l_qseq.
 */
64 | int *qpos2rpos(bam1_t *b);
65 | 
66 | /** Fetch a BAM tag with case insensitivity
67 |  *
68 |  *  @param b BAM record
69 |  *  @param tag Tag to fetch via bam_aux_get
70 |  *
71 |  */
72 | uint8_t* bam_get_tag_caseinsensitive(const bam1_t* b, char* tag);
73 | 
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/src/bamstats/main.c:
--------------------------------------------------------------------------------
  1 | // bamstats program
  2 | 
  3 | #include <string.h>
  4 | #include <stdbool.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <sys/resource.h>
  8 | #include <time.h>
  9 | #include "htslib/faidx.h"
 10 | #include "htslib/sam.h"
 11 | #include "htslib/thread_pool.h"
 12 | 
 13 | #include "args.h"
 14 | #include "common.h"
 15 | #include "readstats.h"
 16 | #include "regiter.h"
 17 | 
 18 | 
 19 | void write_header(const char* sample) {
 20 |     char *sn = sample == NULL ? "" : "\tsample_name";
 21 |     fprintf(stdout, 
 22 |         "name\trunid%s\tref\tcoverage\tref_coverage\t"\
 23 |         "qstart\tqend\trstart\trend\t"\
 24 |         "aligned_ref_len\tdirection\tlength\tread_length\tmean_quality\tstart_time\t"\
 25 |         "match\tins\tdel\tsub\tiden\tacc\tduplex\n",
 26 |         sn);
 27 | }
 28 | 
 29 | // stats array should have 8 entries
 30 | // total, primary, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, BAM_FDUP, unused
 31 | // note: HTS spec makes a distinction between "unmapped" (flag & 4) and "unplaced". Unplaced
 32 | //       are not necessarily unmapped but lack definitive coords, this is mainly for paired-end
 33 | //       but we'll keep the distinction here.
 34 | void write_stats_header(FILE* fh, const char* sample) {
 35 |     char *sn = sample == NULL ? "" : "\tsample_name";
 36 |     fprintf(fh, "ref%s\ttotal\tprimary\tsecondary\tsupplementary\tunmapped\tqcfail\tduplicate\tduplex\tduplex_forming\n", sn);
 37 | }
 38 | 
 39 | static inline void write_stats(size_t *stats, const char* chr, const char* sample, FILE* fh) {
 40 |     if (fh != NULL) {
 41 |         if (sample == NULL) {
 42 |             fprintf(fh,
 43 |                 "%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n",
 44 |                 chr, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6], stats[7], stats[8]
 45 |             );
 46 |         } else {
 47 |             fprintf(fh,
 48 |                 "%s\t%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n",
 49 |                 chr, sample, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6], stats[7], stats[8]
 50 |             );
 51 |         }
 52 |     }
 53 | }
 54 | 
 55 | static inline void write_counter(const char* fname, kh_counter_t *counter, const char* sample, const char* bam_fname, const char* column_name) {
 56 |     FILE* stats_fp = fopen(fname, "w");
 57 |     fprintf(stats_fp, "filename\t");
 58 |     if (sample != NULL) fprintf(stats_fp, "sample_name\t");
 59 |     fprintf(stats_fp, "%s\tcount\n", column_name);
 60 |     for (khiter_t k = 0; k < kh_end(counter); ++k) {
 61 |         if (kh_exist(counter, k)) {
 62 |             fprintf(stats_fp, "%s\t", bam_fname);
 63 |             if (sample != NULL) fprintf(stats_fp, "%s\t", sample);
 64 |             fprintf(stats_fp, "%s\t%d\n", kh_key(counter, k), kh_val(counter, k));
 65 |         }
 66 |     }
 67 |     fclose(stats_fp);
 68 | }
 69 | 
 70 | 
 71 | void write_hist_stats(read_stats* stats, char* prefix, char* name) {
 72 |     char* path = calloc(strlen(prefix) + strlen(name) + 2, sizeof(char));
 73 |     sprintf(path, "%s/%s", prefix, name);
 74 |     FILE* fp = fopen(path, "w");
 75 |     print_stats(stats, false, true, fp);
 76 |     fclose(fp); free(path);
 77 | }
 78 | 
 79 | 
 80 | int main(int argc, char *argv[]) {
 81 |     clock_t begin = clock();
 82 |     arguments_t args = parse_arguments(argc, argv);
 83 | #ifdef NOTHREADS
 84 |     if (args.threads != 1) {
 85 |         fprintf(
 86 |             stderr,
 87 |             "--threads set to %d, but threading not supported by this build.\n", args.threads);
 88 |     }
 89 | #endif
 90 | 
 91 |     // large basecaller runs can produce more files than a single
 92 |     // process can open, check this ahead of time.
 93 | #ifndef WASM
 94 |     struct rlimit reslimit;
 95 |     size_t nfile = 0; for (; args.bam[nfile]; nfile++);
 96 |     if (getrlimit(RLIMIT_NOFILE, &reslimit) == 0) {
 97 |         if (nfile * args.threads > reslimit.rlim_cur - 100) {
 98 |             fprintf(stderr,
 99 |                 "ERROR: Too many BAM files provided (%zu). Try running "
100 |                 "samtools merge on subsets of files to produce fewer files", nfile);
101 |             exit(EXIT_FAILURE);
102 |         }
103 |     }
104 | #endif
105 |     
106 |     int rtn = mkdir_hier(args.histograms);
107 |     if (rtn == -1) {
108 |         fprintf(stderr,
109 |            "Error: Cannot create output directory '%s'. Check location is writeable and directory does not exist.\n",
110 |            args.histograms);
111 |         exit(EXIT_FAILURE);
112 |     }
113 | 
114 |     if (nfile > 1) {
115 |         fprintf(stderr, "ERROR: Multiple input files detected, this program currently supports only a single file.\n");
116 |         exit(EXIT_FAILURE);
117 |     }
118 | 
119 |     write_header(args.sample);
120 | 
121 |     htsFile *fp = hts_open(args.bam[0], "rb");
122 |     sam_hdr_t *hdr = sam_hdr_read(fp);
123 |     if (hdr == 0 || fp == 0) {
124 |         fprintf(stderr, "Failed to read .bam file '%s'.\n", args.bam[0]);
125 |         exit(EXIT_FAILURE);
126 |     }
127 | 
128 |     htsThreadPool p = {NULL, 0};
129 |     if (args.threads > 1 ) {
130 |         fprintf(stderr, "Using %d threads\n", args.threads);
131 |         p.pool = hts_tpool_init(args.threads);
132 |         hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
133 |     }
134 | 
135 |     FILE* flagstats = NULL;
136 |     flag_stats* flag_counts = NULL;
137 |     if (args.flagstats != NULL) {
138 |         flagstats = fopen(args.flagstats, "w");
139 |         write_stats_header(flagstats, args.sample);
140 |         flag_counts = create_flag_stats(
141 |             args.region == NULL ? hdr->n_targets : 1, args.unmapped
142 |         );
143 |     }
144 | 
145 |     kh_counter_t *run_ids = kh_counter_init();
146 |     kh_counter_t *basecallers = kh_counter_init();
147 |     read_stats* length_stats = create_length_stats();
148 |     read_stats* qual_stats = create_qual_stats(QUAL_HIST_WIDTH);
149 |     read_stats* acc_stats = create_qual_stats(ACC_HIST_WIDTH);
150 |     read_stats* cov_stats = create_qual_stats(COV_HIST_WIDTH);
151 |     read_stats* polya_stats = args.poly_a ? create_length_stats() : NULL;
152 | 
153 |     // Prepare also for the unmapped reads
154 |     read_stats* length_stats_unmapped = create_length_stats();
155 |     read_stats* qual_stats_unmapped = create_qual_stats(QUAL_HIST_WIDTH);
156 | 
157 |     if (args.region == NULL && args.bed == NULL) {
158 |         // iterate over the entire file
159 |         process_bams(
160 |             fp, NULL, hdr, args.sample,
161 |             NULL, 0, INT64_MAX, true,
162 |             args.read_group, args.tag_name, args.tag_value,
163 |             flag_counts, args.unmapped,
164 |             length_stats, qual_stats, acc_stats, cov_stats,
165 |             length_stats_unmapped, qual_stats_unmapped,
166 |             polya_stats, args.poly_a_cover, args.poly_a_qual, args.poly_a_rev,
167 |             run_ids, basecallers, args.force_recalc_qual);
168 | 
169 |         // write flagstat counts if requested
170 |         if (flag_counts != NULL) {
171 |             for (int i=0; i < hdr->n_targets; ++i) {
172 |                 const char* chr = sam_hdr_tid2name(hdr, i);
173 |                 write_stats(flag_counts->counts[i], chr, args.sample, flagstats);
174 |             }
175 |             if (args.unmapped) {
176 |                 write_stats(flag_counts->unmapped, "*", args.sample, flagstats);
177 |             }
178 |         }
179 |     } else {
180 |         // process given region / BED
181 |         hts_idx_t *idx = sam_index_load(fp, args.bam[0]);
182 |         if (idx == 0){
183 |             fprintf(stderr, "Cannot find index file for '%s', which is required for processing by region.\n", args.bam[0]);
184 |             exit(EXIT_FAILURE);
185 |         }
186 | 
187 |         regiter rit = init_region_iterator(args.bed, args.region, hdr);
188 |         int check = 0;
189 |         while ((check = next_region(&rit)) != -1) {
190 |             if (check == -2 && args.bed == NULL) {
191 |                 // we were given only a region, not a bed, and that region was garbage
192 |                 // => user error, should stop immediately
193 |                 exit(EXIT_FAILURE);
194 |             }
195 |             if (check != 0) continue;  // skip other errors
196 | 
197 |             process_bams(
198 |                 fp, idx, hdr, args.sample,
199 |                 rit.chr, rit.start, rit.end, true,
200 |                 args.read_group, args.tag_name, args.tag_value,
201 |                 flag_counts, args.unmapped,
202 |                 length_stats, qual_stats, acc_stats, cov_stats,
203 |                 length_stats_unmapped, qual_stats_unmapped,
204 |                 polya_stats, args.poly_a_cover, args.poly_a_qual, args.poly_a_rev,
205 |                 run_ids, basecallers, args.force_recalc_qual);
206 |             if (flag_counts != NULL) {
207 |                 // TODO: regions might not be whole chromosomes...
208 |                 write_stats(flag_counts->counts[0], rit.chr, args.sample, flagstats);
209 |             }
210 |         }
211 |         fprintf(stderr, "Processed %d regions\n", rit.n_regions);
212 |         
213 |         destroy_region_iterator(&rit);
214 |         hts_idx_destroy(idx);
215 |     }
216 | 
217 |     write_hist_stats(length_stats, args.histograms, "length.hist");
218 |     write_hist_stats(qual_stats, args.histograms, "quality.hist");
219 |     write_hist_stats(acc_stats, args.histograms, "accuracy.hist");
220 |     write_hist_stats(cov_stats, args.histograms, "coverage.hist");
221 |     if (polya_stats != NULL) {
222 |         write_hist_stats(polya_stats, args.histograms, "polya.hist");
223 |     } 
224 | 
225 |     // Save also histograms for the unmapped reads if requested
226 |     // and if the user is not asking for a region
227 |     if (args.unmapped && args.region == NULL){
228 |         write_hist_stats(length_stats_unmapped, args.histograms, "length.unmap.hist");
229 |         write_hist_stats(qual_stats_unmapped, args.histograms, "quality.unmap.hist");
230 |     }
231 | 
232 |     // write runids summary
233 |     if (args.runids != NULL) {
234 |         write_counter(args.runids, run_ids, args.sample, args.bam[0], "run_id");
235 |     } 
236 |     // write basecallers summary
237 |     if (args.basecallers != NULL) {
238 |         write_counter(args.basecallers, basecallers, args.sample, args.bam[0], "basecaller");
239 |     } 
240 | 
241 |     destroy_length_stats(length_stats);
242 |     destroy_qual_stats(qual_stats);
243 |     destroy_qual_stats(acc_stats);
244 |     destroy_qual_stats(cov_stats);
245 |     destroy_length_stats(length_stats_unmapped);
246 |     destroy_qual_stats(qual_stats_unmapped);
247 |     destroy_length_stats(polya_stats);
248 |     kh_counter_destroy(basecallers);
249 |     kh_counter_destroy(run_ids);
250 | 
251 |     if (flagstats != NULL) {
252 |         fclose(flagstats);
253 |     }
254 | 
255 |     if (flag_counts != NULL) destroy_flag_stats(flag_counts);
256 |     sam_hdr_destroy(hdr);
257 |     hts_close(fp);
258 |     if (p.pool) { // must be after fp
259 |         hts_tpool_destroy(p.pool);
260 |     }
261 | 
262 |     clock_t end = clock();
263 |     fprintf(stderr, "Total CPU time: %fs\n", (double)(end - begin) / CLOCKS_PER_SEC);
264 |     exit(EXIT_SUCCESS);
265 | }
266 | 


--------------------------------------------------------------------------------
/src/bamstats/readstats.c:
--------------------------------------------------------------------------------
  1 | #define _GNU_SOURCE
  2 | #include <assert.h>
  3 | #include <ctype.h>
  4 | #include <errno.h>
  5 | #include <math.h>
  6 | #include <pthread.h>
  7 | #include <string.h>
  8 | #include <stdbool.h>
  9 | #include <stdio.h>
 10 | #include <stdint.h>
 11 | #include <stdlib.h>
 12 | #include <unistd.h>
 13 | #include "htslib/sam.h"
 14 | #include "htslib/faidx.h"
 15 | #include "thread_pool_internal.h"
 16 | 
 17 | #include "../common.h"
 18 | #include "../stats.h"
 19 | #include "../kh_counter.h"
 20 | #include "bamiter.h"
 21 | #include "readstats.h"
 22 | #include "args.h"
 23 | 
 24 | #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
 25 | #define bam1_seqi(s, i) (bam_seqi((s), (i)))
 26 | #define bam_nt16_rev_table seq_nt16_str
 27 | #define bam_nt16_table seq_nt16_table
 28 | 
 29 | 
// Any of these flag bits means the record is not counted as a primary alignment.
static const int NOTPRIMARY = BAM_FSUPPLEMENTARY | BAM_FSECONDARY | BAM_FUNMAP;
// counting alignment flags
// total, primary, ..., unused
// Indexed by flagstat column: entries 0 and 1 (total, primary) and entry 7
// carry no mask, entries 2-6 hold the BAM flag bit tested for that column.
static const size_t FLAG_MASK[8] = {
    0, 0, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, BAM_FDUP, 0
};
 36 | 
 37 | 
 38 | /** Initialise flagstat counts struct for a BAM file.
 39 |  *
 40 |  *  @param n_refs number of reference sequences.
 41 |  *
 42 |  */
 43 | flag_stats* create_flag_stats(size_t n_refs, bool store_unmapped) {
 44 |     flag_stats* stats = xalloc(1, sizeof(flag_stats), "flagstat");
 45 |     stats->n_refs = n_refs;
 46 |     stats->counts = xalloc(n_refs, sizeof(size_t*), "flagstat");
 47 |     stats->unmapped = store_unmapped ? xalloc(8, sizeof(size_t), "flagstat") : NULL;
 48 | 
 49 |     for (size_t i = 0; i < n_refs; i++) {
 50 |         stats->counts[i] = xalloc(8, sizeof(size_t), "flagstat");
 51 |     }
 52 | 
 53 |     return stats;
 54 | }
 55 | 
 56 | /** Clean up flagstat counts.
 57 |  *
 58 |  *  @param stats flagstat counts structure to clean.
 59 |  *
 60 |  */
 61 | void destroy_flag_stats(flag_stats* stats) {
 62 |     for (size_t i = 0; i < stats->n_refs; i++) {
 63 |         free(stats->counts[i]);
 64 |     }
 65 |     free(stats->counts);
 66 |     free(stats->unmapped);
 67 |     free(stats);
 68 | }
 69 | 
 70 | 
 71 | // Count number of each cigar operation in an alignment
 72 | inline size_t* create_cigar_stats (bam1_t* b) {
 73 | 	static const size_t NCODES = 10;
 74 |     size_t* stats = xalloc(NCODES, sizeof(size_t*), "read stats");
 75 |     uint32_t *cigar = bam_get_cigar(b);
 76 |     for (size_t i = 0; i < b->core.n_cigar; ++i){
 77 |         uint32_t op = bam_cigar_op(cigar[i]);
 78 |         uint32_t len = bam_cigar_oplen(cigar[i]);
 79 | 		stats[op] += len;
 80 | 	}
 81 | 	return stats;
 82 | }
 83 | 
 84 | // Find the first aligned position of the query sequence
 85 | inline size_t get_query_start (bam1_t* b) {
 86 |     uint32_t start_offset = 0;
 87 |     uint32_t qlen = b->core.l_qseq;
 88 |     uint32_t *cigar = bam_get_cigar(b);
 89 |     for (size_t i = 0; i < b->core.n_cigar; ++i){
 90 |         uint32_t op = bam_cigar_op(cigar[i]);
 91 |         if (op == BAM_CHARD_CLIP) {
 92 |             if ((start_offset != 0) && (start_offset != qlen)) {
 93 |                 fprintf(stderr, "Invalid clipping in cigar string.\n");
 94 | 			    exit(EXIT_FAILURE);
 95 |             }
 96 | 		} else if (op == BAM_CSOFT_CLIP) {
 97 |             start_offset += bam_cigar_oplen(cigar[i]);
 98 |         } else {
 99 |             break;
100 | 		}
101 | 	}
102 |     return start_offset;
103 | }
104 | 
105 | // Find the last aligned position of the query sequence
106 | inline size_t get_query_end(bam1_t* b) {
107 |     // TODO: assume l_qseq correct
108 |     uint32_t end_offset = b->core.l_qseq;
109 |     uint32_t qlen = end_offset;
110 |     uint32_t *cigar = bam_get_cigar(b);
111 |     for (int i=b->core.n_cigar - 1; i >= 0; --i){
112 |         uint32_t op = bam_cigar_op(cigar[i]);
113 |         if (op == BAM_CHARD_CLIP) {
114 |             if (end_offset != qlen) {
115 |                 fprintf(stderr, "Invalid clipping in cigar string.\n");
116 | 			    exit(EXIT_FAILURE);
117 |             }
118 | 		} else if (op == BAM_CSOFT_CLIP) {
119 |             end_offset -= bam_cigar_oplen(cigar[i]);
120 |         } else {
121 |             break;
122 | 		}
123 | 	}
124 |     return end_offset;
125 | }
126 | 
127 | 
128 | static inline void process_flagstat_counts(const bam1_t* b, size_t* counts, const int duplex_code ) {
129 |     counts[0] += 1;
130 |     counts[1] += ((b->core.flag & (NOTPRIMARY)) == 0);
131 |     for (size_t i=2; i<6; ++i){
132 |         counts[i] += ((b->core.flag & FLAG_MASK[i]) != 0);
133 |     }
134 |     counts[7] += (duplex_code == 1);
135 |     counts[8] += (duplex_code == -1);
136 | }
137 | 
138 | 
// see section 4.2.4 of the SAM spec for more details
// True when the aux type character `t` denotes one of the integer encodings.
#define IS_INTEGER_TAG(t) ((t) == 'i' || (t) == 'I' || (t) == 'c' || (t) == 'C' || (t) == 's' || (t) == 'S')

// Number of members in bam_tags_t.
// NOTE(review): not referenced elsewhere in this file -- confirm it is still needed.
#define N_TAGS 8
// Values of the aux tags this program cares about, as collected by
// fetch_bam_tags(). Tag names are compared after upper-casing. String members
// are heap copies released by free_bam_tags(); they are NULL when absent.
// Numeric members other than `dx` hold -1 when absent.
typedef struct {
    char *RG;  // read group
    char *RD;  // read group (old skool); used as the run id when RG is absent
    char *st;  // start time
    int NM;    // edit distance
    int pi;    // parent read; -1 is taken to mean the read was not split
    int pt;    // poly-t/a tail length
    float qs;  // quality score
    int dx;    // duplex; defaults to 0 (simple read)
} bam_tags_t;
153 | 
154 | 
155 | // Function to fetch tags from a bam1_t record
156 | bam_tags_t fetch_bam_tags(const bam1_t *b, const bam_hdr_t *header) {
157 |     // default duplex tag to simple read, everything else as invalid
158 |     bam_tags_t tags = {NULL, NULL, NULL, -1, -1, -1, -1, 0};  
159 | 
160 |     uint8_t *aux = bam_aux_first(b);
161 |     int n_tags = 0;
162 |     while (n_tags < 100 && aux != NULL) {
163 |         n_tags++;
164 |         const char *t = bam_aux_tag(aux);
165 |         char tag[3];
166 |         tag[2] = '\0';
167 |         for (int i = 0; i < 2; i++) {
168 |             tag[i] = toupper(t[i]);
169 |         }
170 |         uint8_t type = bam_aux_type(aux);
171 | 
172 |         // do this here to avoid repeating below
173 |         int ival = -1;
174 |         bool ierr = false;
175 |         if (IS_INTEGER_TAG(type)) {
176 |             errno = 0;
177 |             ival = bam_aux2i(aux);
178 |             ierr = (ival == 0 && errno == EINVAL);
179 |         }
180 |         
181 |         if ((strcmp(tag, "RG") == 0) && (tags.RG == NULL) && type == 'Z') {
182 |             tags.RG = strdup(bam_aux2Z(aux));
183 |         } else if ((strcmp(tag, "RD") == 0) && (tags.RD == NULL) && type == 'Z') {
184 |             tags.RD = strdup(bam_aux2Z(aux));
185 |         } else if ((strcmp(tag, "ST") == 0) && (tags.st == NULL) && type == 'Z') {
186 |             tags.st = strdup(bam_aux2Z(aux));
187 |         } else if (strcmp(tag, "NM") == 0 && !ierr) {
188 |             tags.NM = ival;
189 |         } else if (strcmp(tag, "PI") == 0 && !ierr) {
190 |             tags.pi = ival;
191 |         } else if (strcmp(tag, "PT") == 0 && !ierr) {
192 |             tags.pt = ival;
193 |         } else if (strcmp(tag, "DX") == 0 && !ierr) {
194 |             tags.dx = ival;
195 |         } else if (strcmp(tag, "QS") == 0 && (type == 'f')) {
196 |             errno = 0;
197 |             tags.qs = bam_aux2f(aux);
198 |             if (tags.qs == 0 && errno == EINVAL) {
199 |                 tags.qs = -1;
200 |             }
201 |         }
202 |         else {
203 |             // we added above when we shouldn't have
204 |             n_tags--;
205 |         }
206 | 
207 |         aux = bam_aux_next(b, aux);
208 |     }
209 | 
210 |     // Check we have all the tags we need
211 |     // note theres weird corner case of duplicate tags, but when does that happen?
212 |     bool good_align = ((b->core.flag & (NOTPRIMARY | BAM_FQCFAIL | BAM_FDUP)) == 0);
213 |     if (good_align && (tags.NM == -1)) {
214 |         fprintf(stderr, "Read '%s' does not contain an integer 'NM' tag.\n", bam_get_qname(b));
215 |         kstring_t rec = {0, 0, NULL};
216 |         if (sam_format1(header, b, &rec) < 0) {
217 |             fprintf(stderr, "Failed to format record for error message.\n");
218 |         } else {
219 |             fprintf(stderr, "%s\n", rec.s);
220 |         }
221 |         ks_free(&rec);
222 |         exit(EXIT_FAILURE);
223 |     }
224 |     return tags;
225 | }
226 | 
227 | 
228 | void free_bam_tags(bam_tags_t *tags) {
229 |     if (!tags) return;
230 |     if (tags->RG) free(tags->RG);
231 |     if (tags->RD) free(tags->RD);
232 |     if (tags->st) free(tags->st);
233 |     //free(tags);  // we stack allocate all these now
234 | }
235 | 
236 | 
237 | // Do all-the-things
238 | void process_bams(
239 |         htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample,
240 |         const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
241 |         const char *read_group, const char tag_name[2], const int tag_value,
242 |         flag_stats *flag_counts, bool unmapped,
243 |         read_stats* length_stats, read_stats* qual_stats, read_stats* acc_stats, read_stats* cov_stats,
244 |         read_stats* length_stats_unmapped, read_stats* qual_stats_unmapped,
245 |         read_stats* polya_stats, float polya_cover, float polya_qual, bool polya_rev,
246 |         kh_counter_t* runids, kh_counter_t* basecallers,
247 |         bool force_recalc_qual) {
248 |     if (chr != NULL) {
249 |         if (strcmp(chr, "*") == 0) {
250 |             fprintf(stderr, "Processing: Unplaced reads\n");
251 |         } else {
252 |             fprintf(stderr, "Processing: %s:%zu-%zu\n", chr, (size_t)start, (size_t)end);
253 |         }
254 |     }
255 | 
256 |     // setup bam reading - reuse our pileup structure, but actually just need iterator
257 |     mplp_data* bam = create_bam_iter_data(
258 |         fp, idx, hdr,
259 |         chr, start, end, overlap_start,
260 |         read_group, tag_name, tag_value);
261 |     if (bam == NULL) return;
262 | 
263 |     int res;
264 |     bam1_t *b = bam_init1();
265 |     readgroup* rg_info = NULL;
266 |     char *runid = NULL;
267 |     char *basecaller = NULL;
268 |     char *start_time = NULL;
269 | 
270 |     while ((res = read_bam(bam, b) >= 0)) {
271 |         // get all our tags
272 |         bam_tags_t tags = fetch_bam_tags(b, hdr);
273 | 
274 |         // get info from readgroup, note we could use subitems from readgroup
275 |         // here more directly, but this is to be consistent with fastcat where
276 |         // we only have the readgroup ID string to play with
277 |         runid = "";
278 |         basecaller = "";
279 |         start_time = "";
280 |         if (tags.RG != NULL) {
281 |             rg_info = create_rg_info(tags.RG);
282 |             if (rg_info->runid != NULL) {
283 |                 runid = rg_info->runid;
284 |             }
285 |             if (rg_info->basecaller != NULL) {
286 |                 basecaller = rg_info->basecaller;
287 |             }
288 |         } else if (tags.RD != NULL) {
289 |             runid = tags.RD;
290 |         }
291 | 
292 |         if (tags.st != NULL) {
293 |             start_time = tags.st;
294 |         }
295 |         kh_counter_increment(runids, runid);
296 |         kh_counter_increment(basecallers, basecaller);
297 | 
298 |         // write a record for unmapped/unplaced
299 |         if (b->core.flag & BAM_FUNMAP) {
300 |             if (unmapped) {
301 |                 // an unmapped read can still have a RNAME and POS, but we
302 |                 // ignore that here, because its not a thing we care about
303 |                 char* qname = bam_get_qname(b);
304 |                 uint32_t read_length = b->core.l_qseq;
305 |                 float mean_quality = mean_qual_from_bam(bam_get_qual(b), read_length);
306 |                 if (sample == NULL) {
307 |                     fprintf(stdout,
308 |                         "%s\t%s\t*\tnan\tnan\t" \
309 |                         "nan\tnan\tnan\tnan\t" \
310 |                         "0\t*\t0\t" \
311 |                         "%u\t%.2f\t%s\t" \
312 |                         "0\t0\t0\t0\tnan\tnan\t%d\n",
313 |                         qname, runid, //chr, coverage, ref_cover,
314 |                         //qstart, qend, rstart, rend,
315 |                         //aligned_ref_len, direction, length,
316 |                         read_length, mean_quality, start_time,
317 |                         //match, ins, delt, sub, iden, acc
318 |                         tags.dx
319 |                     );
320 |                 } else {
321 |                     fprintf(stdout,
322 |                         "%s\t%s\t%s\t*\tnan\tnan\t" \
323 |                         "nan\tnan\tnan\tnan\t" \
324 |                         "0\t*\t0\t" \
325 |                         "%u\t%.2f\t%s\t" \
326 |                         "0\t0\t0\t0\tnan\tnan\t%d\n",
327 |                         qname, runid, sample, //chr, coverage, ref_cover,
328 |                         //qstart, qend, rstart, rend,
329 |                         //aligned_ref_len, direction, length,
330 |                         read_length, mean_quality, start_time,
331 |                         //match, ins, delt, sub, iden, acc
332 |                         tags.dx
333 |                     );
334 |                 }
335 |                 // add to flagstat counts if required
336 |                 if (flag_counts != NULL) {
337 |                     process_flagstat_counts(b, flag_counts->unmapped, tags.dx);
338 |                 }
339 | 
340 |                 // accumulate stats into histogram
341 |                 add_length_count(length_stats_unmapped, read_length);
342 |                 add_qual_count(qual_stats_unmapped, mean_quality);
343 |             }
344 |             goto FINISH_READ;
345 |         }
346 | 
347 |         if (flag_counts != NULL) {
348 |             // when we have a target region (as opposed to looping over the whole file),
349 |             // `flag_counts` will only contain one (dynamic) array of counts; otherwise
350 |             // there will be as many dynamic arrays as references in the BAM header
351 |             size_t* counts = (chr != NULL) ? flag_counts->counts[0]
352 |                                            : flag_counts->counts[b->core.tid];
353 |             process_flagstat_counts(b, counts, tags.dx);
354 |         }
355 | 
356 |         // only take "good" primary alignments for further processing
357 |         if (b->core.flag & (NOTPRIMARY | BAM_FQCFAIL | BAM_FDUP)) {
358 |             goto FINISH_READ;
359 |         }
360 |         char* qname = bam_get_qname(b);
361 | 
362 |         size_t* stats = create_cigar_stats(b);
363 |         size_t match, ins, delt;
364 |         // some aligners like to get fancy
365 |         match = stats[BAM_CMATCH] + stats[BAM_CEQUAL] + stats[BAM_CDIFF];
366 |         ins = stats[BAM_CINS];
367 |         delt = stats[BAM_CDEL];
368 |         size_t sub = tags.NM - ins - delt;
369 |         size_t length = match + ins + delt;
370 |         float iden = 100 * ((float)(match - sub)) / match;
371 |         float acc = 100 - 100 * ((float)(tags.NM)) / length;
372 |         // some things we've seen go wrong
373 |         // explode now because there is almost certainly something wrong with the tags
374 |         // and calling add_qual_count with a value less than zero will cause a segfault
375 |         if (iden < 0.0 || acc < 0.0 || (size_t)tags.NM > length) {
376 |             fprintf(stderr, "Read '%s' appears to contain implausible alignment information\n", qname);
377 |             exit(EXIT_FAILURE);
378 |         }
379 |         // we only deal in primary/soft-clipped alignments so length
380 |         // of qseq member is the length of the intact query sequence.
381 |         uint32_t read_length = b->core.l_qseq;
382 |         size_t qstart = get_query_start(b);
383 |         size_t qend = get_query_end(b);
384 |         // get mean quality score, from tag or recompute
385 |         float mean_quality = tags.qs;
386 |         if (mean_quality == -1 || force_recalc_qual) {
387 |             mean_quality = mean_qual_from_bam_naive(bam_get_qual(b), read_length);
388 |         }
389 | 
390 |         float coverage = 100 * ((float)(qend - qstart)) / read_length;
391 |         size_t rstart = b->core.pos;
392 |         size_t rend = bam_endpos(b);
393 |         size_t aligned_ref_len = rend - rstart;
394 |         size_t ref_length = sam_hdr_tid2len(hdr, b->core.tid);
395 |         float ref_cover = 100 * ((float)(aligned_ref_len)) / ref_length;
396 |         char direction = "+-"[bam_is_rev(b)];
397 | 
398 |         // accumulate stats into histogram
399 |         add_length_count(length_stats, read_length);
400 |         add_qual_count(qual_stats, mean_quality);
401 |         add_qual_count(acc_stats, acc);
402 |         add_qual_count(cov_stats, coverage);
403 | 
404 |         // get poly-A tail length. For now we require:
405 |         //    i) "good" coverage on reference, i.e. "full length"
406 |         //   ii) read is sense strand, i.e. fwd alignment
407 |         //  iii) "good" mean quality
408 |         //   iv) no split reads
409 |         if (polya_stats != NULL) {
410 |             int polya_len = -1;
411 |             if ((ref_cover >= polya_cover)
412 |                     && (!bam_is_rev(b) || polya_rev)
413 |                     && mean_quality >= polya_qual) {
414 |                 if (tags.pi == -1 && tags.pt >= 0) {
415 |                     polya_len = tags.pt;
416 |                 }
417 |             }
418 |             if (polya_len >= 0) {
419 |                 add_length_count(polya_stats, polya_len);
420 |             }
421 |         }
422 | 
423 |         if (sample == NULL) {
424 |             fprintf(stdout,
425 |                 "%s\t%s\t%s\t" \
426 |                 "%.4f\t%.4f\t" \
427 |                 "%lu\t%lu\t%lu\t%lu\t" \
428 |                 "%lu\t%c\t%lu\t%u\t%.2f\t%s\t" \
429 |                 "%lu\t%lu\t%lu\t%lu\t%.2f\t%.2f\t%d\n",
430 |                 qname, runid, (chr != NULL) ? chr : sam_hdr_tid2name(hdr, b->core.tid),
431 |                 coverage, ref_cover,
432 |                 qstart, qend, rstart, rend,
433 |                 aligned_ref_len, direction, length, read_length, mean_quality, start_time,
434 |                 match, ins, delt, sub, iden, acc, tags.dx);
435 |         } else {
436 |             fprintf(stdout,
437 |                 "%s\t%s\t%s\t%s\t" \
438 |                 "%.4f\t%.4f\t" \
439 |                 "%lu\t%lu\t%lu\t%lu\t" \
440 |                 "%lu\t%c\t%lu\t%u\t%.2f\t%s\t" \
441 |                 "%lu\t%lu\t%lu\t%lu\t%.2f\t%.2f\t%d\n",
442 |                 qname, runid, sample, (chr != NULL) ? chr : sam_hdr_tid2name(hdr, b->core.tid),
443 |                 coverage, ref_cover,
444 |                 qstart, qend, rstart, rend,
445 |                 aligned_ref_len, direction, length, read_length, mean_quality, start_time,
446 |                 match, ins, delt, sub, iden, acc, tags.dx);
447 |         }
448 | 		free(stats);
449 | 
450 | FINISH_READ:
451 |         destroy_rg_info(rg_info);
452 |         rg_info = NULL;
453 |         runid = NULL;
454 |         basecaller = NULL;
455 |         start_time = NULL;
456 |         free_bam_tags(&tags);
457 |     }
458 | 
459 |     destroy_bam_iter_data(bam);
460 |     bam_destroy1(b);
461 | 
462 |     return;
463 | }
464 | 


--------------------------------------------------------------------------------
/src/bamstats/readstats.h:
--------------------------------------------------------------------------------
 1 | #ifndef _BAMSTATS_STATS_H
 2 | #define _BAMSTATS_STATS_H
 3 | 
 4 | #include <stdbool.h>
 5 | #include "htslib/sam.h"
 6 | 
 7 | #include "args.h"
 8 | #include "../stats.h"
 9 | #include "../kh_counter.h"
10 | 
11 | 
// struct for flagstat counts
typedef struct {
    size_t n_refs;     // number of arrays held in `counts`
    size_t** counts;   // counts[i] is the array of flag tallies for entry i
    size_t* unmapped;  // flag tallies for unmapped reads, NULL when not stored
} flag_stats;
18 | 
19 | /** Create flagstat counts struct for a BAM file.
20 |  *
21 |  *  @param n_refs number of reference sequences.
22 |  *  @param store_unmapped whether to count unmapped reads.
23 |  *
24 |  */
25 | flag_stats* create_flag_stats(size_t n_refs, bool store_unmapped);
26 | 
27 | /** Clean up flagstat counts.
28 |  *
29 |  *  @param stats flagstat counts structure to clean.
30 |  *
31 |  */
32 | void destroy_flag_stats(flag_stats* stats);
33 | 
34 | 
35 | /** Generates alignment stats from a region of a bam.
36 |  *
37 |  *  @param fp htsFile pointer
38 |  *  @param idx hts_idx_t pointer
39 |  *  @param hdr sam_hdr_t pointer
40 |  *  @param sample sample name.
41 |  *  @param chr bam target name.
42 |  *  @param start start position of chr to consider.
43 |  *  @param end end position of chr to consider.
44 |  *  @param overlap_start whether reads overhanging start should be included.
45 |  *  @param read_group by which to filter alignments.
46 |  *  @param tag_name by which to filter alignments.
47 |  *  @param tag_value associated with tag_name.
48 |  *  @param flag_counts flag_stats pointer.
49 |  *  @param unmapped bool include unmapped reads in output.
50 |  *  @param length_stats read_stats* for accumulating read length information.
51 |  *  @param qual_stats read_stats* for accumulating read quality information.
52 |  *  @param acc_stats read_stats* for accumulating read alignment accuracy information.
53 |  *  @param cov_stats read_stats* for accumulating read alignment coverage information.
54 |  *  @param length_stats_unmapped read_stats* for accumulating read length information for unmapped reads.
55 |  *  @param qual_stats_unmapped read_stats* for accumulating read quality information for unmapped reads.
56 |  *  @param polya_stats read_stats* for accumulating polyA tail length information.
57 |  *  @param polya_cover minimum reference coverage for polyA tail length to be considered.
58 |  *  @param polya_qual minimum mean quality for polyA tail length to be considered.
59 |  *  @param polya_rev whether to allow reverse alignments for polyA tail length.
60 |  *  @param runids kh_counter_t* for accumulating runid information.
61 |  *  @param basecallers kh_counter_t* for accumulating basecaller information.
62 |  *  @param force_recalc_quality whether to recalculate mean quality from phred scores.
63 |  *  @returns void. Prints output to stdout.
64 |  *
65 |  */
66 | void process_bams(
67 |     htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample,
68 |     const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start,
69 |     const char *read_group, const char tag_name[2], const int tag_value,
70 |     flag_stats *flag_counts, bool unmapped,
71 |     read_stats* length_stats, read_stats* qual_stats, read_stats* acc_stats, read_stats* cov_stats,
72 |     read_stats* length_stats_unmapped, read_stats* qual_stats_unmapped,
73 |     read_stats* polya_stats, float polya_cover, float polya_qual, bool polya_rev,
74 |     kh_counter_t* runids, kh_counter_t* basecallers,
75 |     bool force_recalc_quality);
76 | 
77 | #endif
78 | 


--------------------------------------------------------------------------------
/src/common.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <errno.h>
  3 | #include <math.h>
  4 | #include <regex.h>
  5 | #include <string.h>
  6 | #include <stdbool.h>
  7 | #include <stdio.h>
  8 | #include <stdint.h>
  9 | #include <stdlib.h>
 10 | #include <string.h>
 11 | #include <sys/stat.h>
 12 | #include <unistd.h>
 13 | 
 14 | #include "common.h"
 15 | 
 16 | 
 17 | /* The following two functions were adpated from:
 18 |  * https://gist.github.com/JonathonReinhart/8c0d90191c38af2dcadb102c4e202950
 19 |  */
 20 | static int maybe_mkdir(const char* path, mode_t mode) {
 21 |     struct stat st;
 22 |     errno = 0;
 23 | 
 24 |     // Try to make the directory
 25 |     if (mkdir(path, mode) == 0)
 26 |         return 0;
 27 | 
 28 |     // If it fails for any reason but EEXIST, fail
 29 |     if (errno != EEXIST)
 30 |         return -1;
 31 | 
 32 |     // Check if the existing path is a directory
 33 |     if (stat(path, &st) != 0)
 34 |         return -1;
 35 | 
 36 |     // If not, fail with ENOTDIR
 37 |     if (!S_ISDIR(st.st_mode)) {
 38 |         errno = ENOTDIR;
 39 |         return -1;
 40 |     }
 41 | 
 42 |     errno = 0;
 43 |     return 0;
 44 | }
 45 | 
 46 | /** mkdir a directory structure recursively, but fail if pre-exists.
 47 |  *
 48 |  * @param path directory path to ensure exists
 49 |  *
 50 |  */
 51 | int mkdir_hier(char *path) {
 52 | 
 53 |     char *_path = NULL;
 54 |     char *p; 
 55 |     int result = -1;
 56 |     mode_t mode = 0700;
 57 |     errno = 0;
 58 | 
 59 |     // if we can just make the directory, fine. If it exists
 60 |     // already then exit
 61 |     if (mkdir(path, mode) != 0 && errno == EEXIST)
 62 |         return -1;
 63 |         
 64 |     _path = strdup(path);
 65 |     if (_path == NULL)
 66 |         goto out;
 67 | 
 68 |     for (p = _path + 1; *p; p++) {
 69 |         if (*p == '/') {
 70 |             *p = '\0';
 71 |             if (maybe_mkdir(_path, mode) != 0)
 72 |                 goto out;
 73 |             *p = '/';
 74 |         }
 75 |     }   
 76 | 
 77 |     if (maybe_mkdir(_path, mode) != 0)
 78 |         goto out;
 79 | 
 80 |     result = 0;
 81 | out:
 82 |     free(_path);
 83 |     return result;
 84 | }
 85 | 
 86 | 
 87 | 
 88 | 
 89 | /** Allocates zero-initialised memory with a message on failure.
 90 |  *
 91 |  *  @param num number of elements to allocate.
 92 |  *  @param size size of each element.
 93 |  *  @param msg message to describe allocation on failure.
 94 |  *  @returns pointer to allocated memory
 95 |  *
 96 |  */
 97 | void *xalloc(size_t num, size_t size, char* msg){
 98 |     void *res = calloc(num, size);
 99 |     if (res == NULL){
100 |         fprintf(stderr, "Failed to allocate mem for %s\n", msg);
101 |         exit(1);
102 |     }
103 |     return res;
104 | }
105 | 
106 | 
/** realloc() wrapper that exits instead of returning NULL.
 *
 *  When realloc() returns NULL a message naming `msg` is written to stderr
 *  and the process exits with status 1.
 *
 *  @param ptr pointer to realloc.
 *  @param size new size of the allocation in bytes (passed to realloc()).
 *  @param msg description of the allocation, used in the failure message.
 *  @returns pointer to the resized allocation
 *
 */
void *xrealloc(void *ptr, size_t size, char* msg){
    void *resized = realloc(ptr, size);
    if (resized != NULL) {
        return resized;
    }
    fprintf(stderr, "Failed to reallocate mem for %s\n", msg);
    exit(1);
}
123 | 
124 | 
/** Retrieves a substring.
 *
 *  The range [position, position + length) is copied without bounds
 *  checking: the caller must ensure it lies within `string`.
 *
 *  @param string input string.
 *  @param position start position of substring.
 *  @param length length of substring required.
 *  @returns newly allocated, NUL-terminated copy of the requested range;
 *      the caller is responsible for free()ing it. Allocation failure is
 *      fatal (message on stderr, exit status 1).
 *
 */
char *substring(char *string, size_t position, size_t length) {
   char *ptr = malloc(length + 1);
   if (ptr == NULL) {
      // same failure policy as the other allocation helpers in this file
      fprintf(stderr, "Failed to allocate mem for %s\n", "substring");
      exit(1);
   }

   memcpy(ptr, string + position, length);
   ptr[length] = '\0';
   return ptr;
}
147 | 
/** Globally replace a char in a char*
 *
 * The string is modified in place.
 *
 * @param str char* source string (NULL is treated as having no matches)
 * @param orig original character
 * @param rep replacement
 * @returns number of times replacement made
 *
 */
int replace_char(char *str, char orig, char rep) {
    // strchr() also matches the terminating NUL: replacing it would
    // unterminate the string and send the scan past the end of the buffer.
    if (str == NULL || orig == '\0') {
        return 0;
    }
    int n = 0;
    for (char *ix = strchr(str, orig); ix != NULL; ix = strchr(ix + 1, orig)) {
        *ix = rep;
        n++;
    }
    return n;
}
157 | 
// Lookup table of error probabilities for integer Phred quality scores:
// qprobs[q] == 10^(-q / 10) for q = 0 .. 99 (e.g. q=10 -> 0.1, q=20 -> 0.01).
// The mean_qual* functions below index it with the decoded quality value.
const double qprobs[100] = {
    1.00000000e+00, 7.94328235e-01, 6.30957344e-01, 5.01187234e-01,
    3.98107171e-01, 3.16227766e-01, 2.51188643e-01, 1.99526231e-01,
    1.58489319e-01, 1.25892541e-01, 1.00000000e-01, 7.94328235e-02,
    6.30957344e-02, 5.01187234e-02, 3.98107171e-02, 3.16227766e-02,
    2.51188643e-02, 1.99526231e-02, 1.58489319e-02, 1.25892541e-02,
    1.00000000e-02, 7.94328235e-03, 6.30957344e-03, 5.01187234e-03,
    3.98107171e-03, 3.16227766e-03, 2.51188643e-03, 1.99526231e-03,
    1.58489319e-03, 1.25892541e-03, 1.00000000e-03, 7.94328235e-04,
    6.30957344e-04, 5.01187234e-04, 3.98107171e-04, 3.16227766e-04,
    2.51188643e-04, 1.99526231e-04, 1.58489319e-04, 1.25892541e-04,
    1.00000000e-04, 7.94328235e-05, 6.30957344e-05, 5.01187234e-05,
    3.98107171e-05, 3.16227766e-05, 2.51188643e-05, 1.99526231e-05,
    1.58489319e-05, 1.25892541e-05, 1.00000000e-05, 7.94328235e-06,
    6.30957344e-06, 5.01187234e-06, 3.98107171e-06, 3.16227766e-06,
    2.51188643e-06, 1.99526231e-06, 1.58489319e-06, 1.25892541e-06,
    1.00000000e-06, 7.94328235e-07, 6.30957344e-07, 5.01187234e-07,
    3.98107171e-07, 3.16227766e-07, 2.51188643e-07, 1.99526231e-07,
    1.58489319e-07, 1.25892541e-07, 1.00000000e-07, 7.94328235e-08,
    6.30957344e-08, 5.01187234e-08, 3.98107171e-08, 3.16227766e-08,
    2.51188643e-08, 1.99526231e-08, 1.58489319e-08, 1.25892541e-08,
    1.00000000e-08, 7.94328235e-09, 6.30957344e-09, 5.01187234e-09,
    3.98107171e-09, 3.16227766e-09, 2.51188643e-09, 1.99526231e-09,
    1.58489319e-09, 1.25892541e-09, 1.00000000e-09, 7.94328235e-10,
    6.30957344e-10, 5.01187234e-10, 3.98107171e-10, 3.16227766e-10,
    2.51188643e-10, 1.99526231e-10, 1.58489319e-10, 1.25892541e-10};
184 | 
185 | 
/** Add `term` to a running total using Kahan (compensated) summation.
 *
 *  https://en.wikipedia.org/wiki/Kahan_summation_algorithm
 *
 *  @param sum running total, updated in place.
 *  @param term value to add.
 *  @param c running compensation for lost low-order bits, updated in place;
 *      initialise to 0 before the first call and pass the same variable
 *      for every term of a given sum.
 *
 *  Defined without `inline` so that an external definition is always
 *  emitted for the plain prototype declared in common.h.
 */
void kahan_sum(double* sum, double term, double* c) {
    // `*c` holds (rounded increment - intended increment) from the previous
    // step, i.e. the negative of what was lost, so it must be subtracted.
    double y = term - *c;
    double t = *sum + y;
    *c = (t - *sum) - y;
    *sum = t;
}
192 | 
193 | 
194 | inline float mean_qual(char* qual, size_t len) {
195 |     if (len == 0 ) return nanf("");
196 |     double qsum = 0;
197 |     double c = 0;
198 |     for (size_t i=0; i<len; ++i) {
199 |         int q = (int)(qual[i]) - 33;
200 |         kahan_sum(&qsum, qprobs[q], &c);
201 |     }
202 |     qsum /= len;
203 |     return -10 * log10(qsum);
204 | }
205 | 
206 | 
207 | inline float mean_qual_from_bam(uint8_t* qual, size_t len) {
208 |     if (len == 0 || qual[0] == 0xff ) return nanf("");
209 |     double qsum = 0;
210 |     double c = 0;
211 |     for (size_t i=0; i<len; ++i) {
212 |         int q = (int)(qual[i]);
213 |         kahan_sum(&qsum, qprobs[q], &c);
214 |     }
215 |     qsum /= len;
216 |     return -10 * log10(qsum);
217 | }
218 | 
219 | 
220 | inline float mean_qual_naive(char* qual, size_t len) {
221 |     if (len == 0 ) return nanf("");
222 |     double qsum = 0;
223 |     for (size_t i=0; i<len; ++i) {
224 |         int q = (int)(qual[i]) - 33;
225 |         qsum += qprobs[q];
226 |     }
227 |     qsum /= len;
228 |     return -10 * log10(qsum);
229 | }
230 | 
231 | 
232 | inline float mean_qual_from_bam_naive(uint8_t* qual, size_t len) {
233 |     if (len == 0 || qual[0] == 0xff ) return nanf("");
234 |     double qsum = 0;
235 |     for (size_t i=0; i<len; ++i) {
236 |         int q = (int)(qual[i]);
237 |         qsum += qprobs[q];
238 |     }
239 |     qsum /= len;
240 |     return -10 * log10(qsum);
241 | }
242 | 
243 | 
// Strip hexadecimal suffixes that samtools merge can add to RG IDs
//
// samtools formats this as "%s-%0lX" which is a long formatted
// as hex with leading zeros. BUT there's no field width specified!
// https://github.com/samtools/samtools/issues/2086
// This will strip even a solitary `-` from the end of the string,
// we could be probabilistic and require say 2 hex digits leaving
// the 16 edge cases of 0 ("") to F. No one should have an RG ID
// ending in "-", right?
//
// The string is truncated in place when it ends in `-` followed by zero to
// eight hex digits (the pattern "-[0-9A-Fa-f]{0,8}$"); otherwise it is left
// untouched. The match is done with a short backwards scan rather than a
// regex so that nothing has to be compiled on every call.
void strip_hex_suffix(char *str) {
    if (str == NULL) {
        return;
    }
    size_t len = strlen(str);

    // count trailing hex digits, giving up once there are more than eight
    size_t ndigits = 0;
    while (ndigits < len && ndigits <= 8) {
        char ch = str[len - 1 - ndigits];
        int is_hex = (ch >= '0' && ch <= '9')
            || (ch >= 'A' && ch <= 'F')
            || (ch >= 'a' && ch <= 'f');
        if (!is_hex) {
            break;
        }
        ++ndigits;
    }

    // too many digits, or the whole string is hex digits (no room for '-')
    if (ndigits > 8 || ndigits == len) {
        return;
    }

    // the character preceding the digits must be the '-' separator
    char *sep = str + (len - 1 - ndigits);
    if (*sep == '-') {
        *sep = '\0';
    }
}
268 | 
269 | 
270 | void destroy_rg_info(readgroup* rg) {
271 |     if (rg != NULL) {
272 |         free(rg->readgroup);
273 |         free(rg);
274 |         rg = NULL;
275 |     }
276 | }
277 | 
278 | // rg is of the form:
279 | // <runid>_<basecalling_model>_<barcode_arrangement>
280 | //
281 | // where:
282 | //   - runid is either (see CW-4704):
283 | //        - a 40 character string representing an acquisition_id sha
284 | //        - a 36 character string representing a protocol_run_id uuid
285 | //   - basecalling_model is a string maybe containing `_`, and containing one or more `@`
286 | //   -    mod_caller is optional part of this starting with `_` after the first `@`
287 | //   - barcode_arrangement is a optional(!) string with an unknown format, but hopefully no `@`
288 | //
289 | // The function always returns an object with a copy of the input. The subfields may be
290 | // NULL pointers if the parsing was incomplete. Client code should therefore always check that
291 | // members are not NULL before use.
292 | //
293 | readgroup* create_rg_info(char* rg) {
294 |     readgroup* rg_info = xalloc(1, sizeof(readgroup), "readgroup");
295 |     rg_info->readgroup = strdup(rg);
296 |     rg_info->runid = NULL;
297 |     rg_info->basecaller = NULL;
298 |     rg_info->modcaller = NULL;
299 |     rg_info->barcode = NULL;
300 | 
301 |     // first strip of `-ABCDEF` from the end
302 |     strip_hex_suffix(rg_info->readgroup);
303 | 
304 |     // I tried to do this with regex, but even chatGPT couldn't give me a
305 |     // POSIX regex that would work. So we'll do it manually (and somewhat
306 |     // more understandably/controllably)
307 | 
308 |     // runid runs to first `_`
309 |     rg_info->runid = rg_info->readgroup;
310 |     char* delim = strchr(rg_info->runid, '_');
311 |     if (delim == NULL) {
312 |         return rg_info;
313 |     }
314 |     delim[0] = '\0';
315 |     // ensure runid is long enough to be an acquisition sha or protocol uuid
316 |     int runid_l = strlen(rg_info->runid);
317 |     if (runid_l != 36 && runid_l != 40) {
318 |         // free the mutated copy, and reset
319 |         free(rg_info->readgroup);
320 |         rg_info->readgroup = strdup(rg);
321 |         rg_info->runid = NULL;
322 |         return rg_info;
323 |     }
324 |     // basecaller + modcaller runs to first `_` after last `@`
325 |     // though barcode is optional, so there may not
326 |     // be a `_` after the last `@`
327 |     rg_info->basecaller = delim + 1;
328 |     delim = strrchr(rg_info->basecaller, '@');
329 |     if (delim == NULL) {
330 |         rg_info->basecaller = NULL;
331 |         return rg_info;
332 |     }
333 |     // modcaller is optional, we can detect its presence by more than one `@` in basecaller
334 |     char* delim1 = strchr(rg_info->basecaller, '@');
335 |     if (delim1 == delim) {  // only one `@`
336 |         rg_info->modcaller = NULL;
337 |     } else {
338 |         delim1 = strchr(delim1, '_'); // modcaller starts at `_` after first `@`
339 |         delim1[0] = '\0';
340 |         rg_info->modcaller = delim1 + 1;
341 |     }
342 |     delim = strchr(delim, '_');
343 |     // barcode is optional
344 |     if (delim) {
345 |         delim[0] = '\0';
346 |         rg_info->barcode = delim + 1;
347 |     }
348 |     return rg_info;
349 | }
350 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef _FASTCAT_COMMON_H
 2 | #define _FASTCAT_COMMON_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | 
 7 | /** Simple min/max
 8 |  * @param a
 9 |  * @param b
10 |  *
11 |  * @returns the min/max of a and b
12 |  *
13 |  */
14 | #define min(a, b) ({ \
15 |     typeof (a) _a = (a); \
16 |     typeof (b) _b = (b); \
17 |     _a < _b ? _a : _b; \
18 | })
19 | #define max(a, b) ({ \
20 |     typeof (a) _a = (a); \
21 |     typeof (b) _b = (b); \
22 |     _a > _b ? _a : _b; \
23 | })
24 | 
/** mkdir a directory structure recursively, but fail if pre-exists.
 *
 * @param path directory path to ensure exists
 * @returns 0 on success, -1 on failure (including when path already exists)
 *
 */
int mkdir_hier(char* path);

/** Allocates zero-initialised memory with a message on failure.
 *
 *  On failure the message is printed to stderr and the process exits.
 *
 *  @param num number of elements to allocate.
 *  @param size size of each element.
 *  @param msg message to describe allocation on failure.
 *  @returns pointer to allocated memory
 *
 */
void *xalloc(size_t num, size_t size, char* msg);


/** Reallocates memory with a message on failure.
 *
 *  On failure the message is printed to stderr and the process exits.
 *
 *  @param ptr pointer to realloc.
 *  @param size new size of the allocation in bytes.
 *  @param msg message to describe allocation on failure.
 *  @returns pointer to allocated memory
 *
 */
void *xrealloc(void *ptr, size_t size, char* msg);


/** Retrieves a substring.
 *
 *  @param string input string.
 *  @param position start position of substring.
 *  @param length length of substring required.
 *  @returns string pointer (newly allocated, NUL-terminated copy).
 *
 */
char *substring(char *string, size_t position, size_t length);

/** Globally replace a char in a char*
 *
 * @param str char* source string
 * @param orig original character
 * @param rep replacement
 * @returns number of times replacement made
 *
 */
int replace_char(char *str, char orig, char rep);


/** Add a term to a running sum with Kahan (compensated) summation.
 *
 *  https://en.wikipedia.org/wiki/Kahan_summation_algorithm
 *
 *  @param sum running total, updated in place.
 *  @param term value to add.
 *  @param c running compensation term, updated in place; callers in this
 *      project initialise it to 0 before the first call.
 *
 */
void kahan_sum(double* sum, double term, double* c);

/** Mean read quality.
 *
 *  All four functions average the per-base error probabilities and convert
 *  the mean back to a Phred score; they return NaN when len is 0.
 *
 *  mean_qual / mean_qual_naive take a Phred+33 encoded quality string.
 *  mean_qual_from_bam / mean_qual_from_bam_naive take raw quality values
 *  (no offset) and also return NaN when the first value is 0xff.
 *  The `_naive` variants use plain summation instead of kahan_sum().
 *
 *  @param qual quality values.
 *  @param len number of quality values.
 *  @returns mean quality score.
 *
 */
float mean_qual(char* qual, size_t len);
float mean_qual_naive(char* qual, size_t len);
float mean_qual_from_bam(uint8_t* qual, size_t len);
float mean_qual_from_bam_naive(uint8_t* qual, size_t len);
82 | 
/** Parsed read group (RG) identifier.
 *
 *  `readgroup` owns the only allocation: a private copy of the input string
 *  that create_rg_info() splits in place. The remaining members are either
 *  NULL (component not found) or pointers into that copy, so they must not
 *  be freed individually and are invalid once destroy_rg_info() is called.
 */
typedef struct readgroup {
    char* readgroup;   // owned copy of the RG string (split in place by parsing)
    char* runid;       // run identifier component, or NULL
    char* basecaller;  // basecalling model component, or NULL
    char* modcaller;   // optional modified-base caller component, or NULL
    char* barcode;     // optional barcode arrangement component, or NULL
} readgroup;

// Parse `rg` into a newly allocated readgroup; never returns NULL, but any
// member other than `readgroup` may be NULL if parsing was incomplete.
readgroup* create_rg_info(char* rg);
// Free a readgroup returned by create_rg_info(); accepts NULL.
void destroy_rg_info(readgroup* rg);
93 | 
94 | #endif
95 | 


--------------------------------------------------------------------------------
/src/fastcat/args.c:
--------------------------------------------------------------------------------
#include <argp.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

#include "args.h"
#include "../version.h"
  6 | 
  7 | const char *argp_program_bug_address = "chris.wright@nanoporetech.com";
  8 | static char doc[] = 
  9 | "fastcat -- concatenate and summarise .fastq(.gz) files.\
 10 | \vInput files may be given on stdin by specifing the input as '-'. \
 11 | Also accepts directories as input and looks for .fastq(.gz) files in \
 12 | the top-level directory. Recurses into sub-directories when the \
 13 | -x option is given. The command \
 14 | will exit non-zero if any file encountered cannot be read.";
 15 | static char args_doc[] = "reads1.fastq(.gz) reads2.fastq(.gz) dir-with-fastq ...";
 16 | static struct argp_option options[] = {
 17 |     {0, 0, 0, 0,
 18 |         "General options:", 0},
 19 |     {"recurse", 'x', 0, 0,
 20 |         "Search directories recursively for '.fastq', '.fq', '.fastq.gz', and '.fq.gz' files.", 0},
 21 |     {"threads", 't', "THREADS", 0,
 22 |         "Number of threads for output compression (only with --bam_out.", 0},
 23 |     {0, 0, 0, 0,
 24 |         "Output options:", 0},
 25 |     {"sample", 's', "SAMPLE NAME",   0,
 26 |         "Sample name (if given, adds a 'sample_name' column).", 0},
 27 |     {"reads_per_file", 'c', "NUM", 0,
 28 |         "Split reads into files with a set number of reads (default: single file).", 0},
 29 |     {"reheader", 'H', 0, 0,
 30 |         "Rewrite fastq header comments as SAM tags (useful for passing through minimap2).", 0},
 31 |     {"bam_out", 'B', 0, 0,
 32 |         "Output data as unaligned BAM.", 0},
 33 |     {"verbose", 'v', 0, 0,
 34 |         "Verbose output.", 0},
 35 |     {0, 0, 0, 0,
 36 |         "Output file selection:", 0},
 37 |     {"read", 'r', "READ SUMMARY",  0,
 38 |         "Per-read summary output", 0},
 39 |     {"file", 'f', "FILE SUMMARY",  0,
 40 |         "Per-file summary output", 0},
 41 |     {"runids", 'i', "ID SUMMARY",  0,
 42 |         "Run ID summary output", 0},
 43 |     {"basecallers", 'l', "CALLER SUMMARY",  0,
 44 |         "Basecaller mode summary output", 0},
 45 |     {"demultiplex", 'd', "OUT DIR",  0,
 46 |         "Separate barcoded samples using fastq header information. Option value is top-level output directory.", 0},
 47 |     {"histograms", 0x400, "DIRECTORY", 0,
 48 |         "Directory for outputting histogram information. When --demultiplex is enabled histograms are written to per-sample demultiplexed output directories. (default: fastcat-histograms)", 0},
 49 |     {0, 0, 0, 0,
 50 |         "Read filtering options:", 0},
 51 |     {"min_length", 'a', "MIN READ LENGTH", 0,
 52 |         "minimum read length to output (excluded reads remain listed in summaries).", 0},
 53 |     {"max_length", 'b', "MAX READ LENGTH", 0,
 54 |         "maximum read length to output (excluded reads remain listed in summaries).", 0},
 55 |     {"min_qscore", 'q', "MIN READ QSCOROE", 0,
 56 |         "minimum read Qscore to output (excluded reads remain listed in summaries).", 0},
 57 |     { 0 }
 58 | };
 59 | 
 60 | 
 61 | static error_t parse_opt (int key, char *arg, struct argp_state *state) {
 62 |     arguments_t *arguments = state->input;
 63 |     switch (key) {
 64 |         case 'r':
 65 |             arguments->perread = arg;
 66 |             break;
 67 |         case 'f':
 68 |             arguments->perfile = arg;
 69 |             break;
 70 |         case 'i':
 71 |             arguments->runids = arg;
 72 |             break;
 73 |         case 'l':
 74 |             arguments->basecallers = arg;
 75 |             break;
 76 |         case 's':
 77 |             arguments->sample = arg;
 78 |             break;
 79 |         case 'a':
 80 |             arguments->min_length = atoi(arg);
 81 |             break;
 82 |         case 'b':
 83 |             arguments->max_length = atoi(arg);
 84 |             break;
 85 |         case 'c':
 86 |             arguments->reads_per_file = atoi(arg);
 87 |             break;
 88 |         case 'd':
 89 |             arguments->demultiplex_dir = arg;
 90 |             break;
 91 |         case 0x400:
 92 |             arguments->histograms = arg;
 93 |             break;
 94 |         case 'q':
 95 |             arguments->min_qscore = (float)atof(arg);
 96 |             break;
 97 |         case 'x':
 98 |             arguments->recurse = -1;  // 0: stops recursion
 99 |             break;
100 |         case 'H':
101 |             arguments->reheader = 1;
102 |             break;
103 |         case 'B':
104 |             arguments->write_bam = 1;
105 |             break;
106 |         case 'v':
107 |             arguments->verbose = 1;
108 |             break;
109 |         case 't':
110 |             arguments->threads = atoi(arg);
111 |             break;
112 |         case ARGP_KEY_NO_ARGS:
113 |             argp_usage (state);
114 |             break;
115 |         case ARGP_KEY_ARG:
116 |             arguments->files = &state->argv[state->next - 1];
117 |             state->next = state->argc;
118 |             break;
119 |         default:
120 |             return ARGP_ERR_UNKNOWN;
121 |     }
122 |     return 0;
123 | }
124 | 
125 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0};
126 | 
127 | 
128 | arguments_t parse_arguments(int argc, char** argv) {
129 |     arguments_t args;
130 |     args.perread = NULL;
131 |     args.perfile = NULL;
132 |     args.runids = NULL;
133 |     args.basecallers = NULL;
134 |     args.sample = "";
135 |     args.min_length = 0;
136 |     args.max_length = (size_t)-1;
137 |     args.min_qscore = 0;
138 |     args.recurse = 1; // always allow descent into TLD
139 |     args.demultiplex_dir = NULL;
140 |     args.histograms = "fastcat-histograms";
141 |     args.reheader = 0;
142 |     args.write_bam = 0;
143 |     args.threads = 1;
144 |     args.reads_per_file = 0;
145 |     argp_parse(&argp, argc, argv, 0, 0, &args);
146 |     return args;
147 | }
148 | 


--------------------------------------------------------------------------------
/src/fastcat/args.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTCAT_ARGS_H
 2 | #define FASTCAT_ARGS_H
 3 | #include <stdbool.h>
 4 | 
 5 | 
// Parsed command line options for fastcat. String members point either at
// argv entries or at string literals (defaults), so none of them is owned.
typedef struct arguments {
    char *perread;          // --read: per-read summary output path (NULL if not given)
    char *perfile;          // --file: per-file summary output path (NULL if not given)
    char *runids;           // --runids: run ID summary output path (NULL if not given)
    char *basecallers;      // --basecallers: basecaller summary output path (NULL if not given)
    char *sample;           // --sample: sample name (default "")
    size_t min_length;      // --min_length: minimum read length to output (default 0)
    size_t max_length;      // --max_length: maximum read length to output (default (size_t)-1)
    float min_qscore;       // --min_qscore: minimum mean read quality to output (default 0)
    int recurse;            // directory descent budget: 1 by default, -1 with --recurse; 0 stops descent
    size_t reheader;        // --reheader flag (0 or 1)
    size_t write_bam;       // --bam_out flag (0 or 1)
    char* demultiplex_dir;  // --demultiplex: top-level output directory (NULL if not given)
    char* histograms;       // --histograms: histogram directory (default "fastcat-histograms")
    char **files;           // first positional argument within argv; NULL-terminated like argv
    size_t reads_per_file;  // --reads_per_file: reads per output file (default 0)
    int threads;            // --threads: threads for output compression (default 1)
    bool verbose;           // --verbose flag
} arguments_t;

// Parse argc/argv into an arguments_t.
arguments_t parse_arguments(int argc, char** argv);
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/src/fastcat/main.c:
--------------------------------------------------------------------------------
  1 | #include <sys/stat.h>
  2 | #include <sys/types.h>
  3 | #include <dirent.h>
  4 | #include <errno.h>
  5 | 
  6 | #include <zlib.h>
  7 | #include <stdio.h>
  8 | #include <string.h>
  9 | #include <math.h>
 10 | 
 11 | #include "htslib/kseq.h"
 12 | KSEQ_INIT(gzFile, gzread)
 13 | #define KSEQ_DECLARED
 14 | 
 15 | #include "../common.h"
 16 | #include "../fastqcomments.h"
 17 | #include "../kh_counter.h"
 18 | #include "args.h"
 19 | #include "writer.h"
 20 | 
 21 | 
 22 | const char filetypes[4][9] = {".fastq", ".fq", ".fastq.gz", ".fq.gz"};
 23 | size_t nfiletypes = 4;
 24 | 
 25 | // defined below -- recursion
 26 | int process_file(char* fname, writer writer, arguments_t *args, int recurse);
 27 | 
 28 | int process_dir(const char *name, writer writer, arguments_t *args, int recurse) {
 29 |     int status = 0;
 30 |     DIR *dir;
 31 |     struct dirent *entry;
 32 |     char* search;
 33 | 
 34 |     // read all files in directory
 35 |     if (!(dir = opendir(name))) {
 36 |         fprintf(stderr, "Error: could not process directory %s: %s\n", name, strerror(errno));
 37 |         return errno;
 38 |     }
 39 |     while ((entry = readdir(dir)) != NULL) {
 40 |         char *path = calloc(strlen(name) + strlen(entry->d_name) + 2, sizeof(char));
 41 |         sprintf(path, "%s/%s", name, entry->d_name);
 42 |         if ((entry->d_type == DT_DIR) && (recurse != 0)) {
 43 |             // skip
 44 |         } else {
 45 |             for (size_t i=0; i<nfiletypes; ++i) {
 46 |                 search = strstr(entry->d_name, filetypes[i]);
 47 |                 if (search != NULL) {
 48 |                     if (args->verbose) {
 49 |                         fprintf(stderr, "Processing %s\n", path);
 50 |                     }
 51 |                     int rtn = process_file(path, writer, args, recurse - 1);
 52 |                     status = max(status, rtn);
 53 |                     break;
 54 |                 }
 55 |             }
 56 |         }
 57 |         free(path);
 58 |     }
 59 |     closedir(dir);
 60 | 
 61 |     // start again and look at child directories
 62 |     if (!(dir = opendir(name))) {
 63 |         fprintf(stderr, "Error: could not process directory %s: %s\n", name, strerror(errno));
 64 |         return errno;
 65 |     }
 66 |     while ((entry = readdir(dir)) != NULL) {
 67 |         char *path = calloc(strlen(name) + strlen(entry->d_name) + 2, sizeof(char));
 68 |         sprintf(path, "%s/%s", name, entry->d_name);
 69 |         if ((entry->d_type == DT_DIR) && (recurse != 0)) {
 70 |             if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
 71 |                 free(path);
 72 |                 continue;
 73 |             }
 74 |             int rtn = process_dir(path, writer, args, recurse - 1);
 75 |             status = max(status, rtn);
 76 |         } else {
 77 |             // skip
 78 |         }
 79 |         free(path);
 80 |     }
 81 |     closedir(dir);
 82 | 
 83 |     return status;
 84 | }
 85 | 
 86 | 
 87 | int process_file(char* fname, writer writer, arguments_t* args, int recurse) {
 88 |     int status = 0;
 89 |     struct stat finfo;
 90 |     int res = stat(fname, &finfo);
 91 |     if (res == -1) {
 92 |         fprintf(stderr, "Error: could not process file %s: %s\n", fname, strerror(errno));
 93 |         return errno;
 94 |     }
 95 | 
 96 |     // handle directory input
 97 |     if ((finfo.st_mode & S_IFMT) == S_IFDIR) {
 98 |         if (recurse != 0) {
 99 |             char* sfname = strip_path(fname);
100 |             int rtn = process_dir(sfname, writer, args, recurse - 1);
101 |             status = max(status, rtn);
102 |             free(sfname);
103 |         }
104 |         return status;
105 |     }
106 | 
107 |     gzFile fp;
108 |     kseq_t *seq;
109 | 
110 |     fp = gzopen(fname, "r");
111 |     seq = kseq_init(fp);
112 |     size_t n = 0, slen = 0;
113 |     size_t minl = UINTMAX_MAX, maxl = 0;
114 |     double meanq = 0.0, c = 0.0;
115 |     status = 0;
116 |     kh_counter_t *run_ids = kh_counter_init();
117 |     kh_counter_t *basecallers = kh_counter_init();
118 |     while ((status = kseq_read(seq)) >= 0) {
119 |         // accumulate stats only for reads within length and quality thresholds
120 |         if (seq->qual.l == 0) { status = -99; break; }
121 |         if ((seq->seq.l >= args->min_length) && (seq->seq.l <= args->max_length)) {
122 |             float mean_q = mean_qual_naive(seq->qual.s, seq->qual.l);
123 |             if (mean_q < args->min_qscore) continue;
124 |             ++n ; slen += seq->seq.l;
125 |             minl = min(minl, seq->seq.l);
126 |             maxl = max(maxl, seq->seq.l);
127 |             kahan_sum(&meanq, mean_q, &c);
128 |             read_meta meta = parse_read_meta(seq->comment);
129 |             write_read(writer, seq, meta, mean_q, fname);
130 |             kh_counter_increment(run_ids, meta->runid);
131 |             kh_counter_increment(basecallers, meta->basecaller);
132 |             destroy_read_meta(meta);
133 |         }
134 |     }
135 | 
136 |     // handle errors
137 |     switch (status) {
138 |         case -1:
139 |             status = EXIT_SUCCESS;
140 |             break;
141 |         case -2:
142 |             status = EXIT_FAILURE;
143 |             fprintf(stderr, "Truncated quality string found for record in file '%s'.\n", fname);
144 |             break;
145 |         case -3:
146 |             status = EXIT_FAILURE;
147 |             fprintf(stderr, "Error reading file '%s', possibly truncated\n", fname);
148 |             break;
149 |         case -99:
150 |             status = EXIT_FAILURE;
151 |             fprintf(stderr, "No quality string found for record in file '%s' (FASTA is unsupported).\n", fname);
152 |             break;
153 |         default:
154 |             status = EXIT_FAILURE;
155 |             fprintf(stderr, "Unknown error reading file '%s'.\n", fname);
156 |     }
157 | 
158 |     // summary entries
159 |     if(writer->perfile != NULL) {
160 |         fprintf(writer->perfile, "%s\t", fname);
161 |         if (writer->sample != NULL) fprintf(writer->perfile, "%s\t", args->sample);
162 |         if (n == 0) {
163 |             // there were no reads in the input file
164 |             fprintf(writer->perfile, "0\t0\t0\t0\t0.00\n");
165 |         } else {
166 |             fprintf(writer->perfile, "%zu\t%zu\t%zu\t%zu\t%.2f\n",
167 |                 n, slen, minl, maxl, meanq/n
168 |             );
169 |         }
170 |     }
171 |     if(writer->runids != NULL) {
172 |         for (khiter_t k = 0; k < kh_end(run_ids); ++k) {
173 |             if (kh_exist(run_ids, k)) {
174 |                 fprintf(writer->runids, "%s\t", fname);
175 |                 if (writer->sample != NULL) fprintf(writer->runids, "%s\t", args->sample);
176 |                 fprintf(writer->runids, "%s\t%d\n", kh_key(run_ids, k), kh_val(run_ids, k));
177 |             }
178 |         }
179 |     }
180 |     if(writer->basecallers != NULL) {
181 |         for (khiter_t k = 0; k < kh_end(basecallers); ++k) {
182 |             if (kh_exist(basecallers, k)) {
183 |                 fprintf(writer->basecallers, "%s\t", fname);
184 |                 if (writer->sample != NULL) fprintf(writer->basecallers, "%s\t", args->sample);
185 |                 fprintf(writer->basecallers, "%s\t%d\n", kh_key(basecallers, k), kh_val(basecallers, k));
186 |             }
187 |         }
188 |     }
189 | 
190 |     // cleanup
191 |     kh_counter_destroy(basecallers);
192 |     kh_counter_destroy(run_ids);
193 |     kseq_destroy(seq);
194 |     gzclose(fp);
195 |     return status;
196 | }
197 | 
198 | 
199 | int main(int argc, char **argv) {
200 |     arguments_t args = parse_arguments(argc, argv);
201 | 
202 |     writer writer = initialize_writer(
203 |         args.demultiplex_dir, args.histograms, args.perread, args.perfile,
204 |         args.runids, args.basecallers, args.sample,
205 |         args.reheader, args.write_bam, args.reads_per_file,
206 |         args.threads);
207 |     if (writer == NULL) exit(1);
208 | 
209 |     size_t nfile = 0;
210 |     int status = 0;
211 |     for( ; args.files[nfile] ; nfile++);
212 | 
213 |     if (nfile==1 && strcmp(args.files[0], "-") == 0) {
214 |         char *ln = NULL;
215 |         size_t n = 0;
216 |         ssize_t nchr = 0;
217 |         int recurse = 0;
218 |         while ((nchr = getline (&ln, &n, stdin)) != -1) {
219 |             ln[strcspn(ln, "\r\n")] = 0;
220 |             int rtn = process_file(ln, writer, &args, recurse);
221 |             status = max(status, rtn);
222 |         }
223 |         free(ln);
224 |     } else {
225 |         for (size_t i=0; i<nfile; ++i) {
226 |             int rtn = process_file(args.files[i], writer, &args, args.recurse);
227 |             status = max(status, rtn);
228 |         }
229 |     }
230 |     destroy_writer(writer);
231 | 
232 |     if (status != 0) {
233 |         fprintf(stderr, "Completed processing with errors. Outputs may be incomplete.\n");
234 |         return EXIT_FAILURE;
235 |     }
236 |     return EXIT_SUCCESS;
237 | }
238 | 


--------------------------------------------------------------------------------
/src/fastcat/writer.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTCAT_WRITER_H
 2 | #define FASTCAT_WRITER_H
 3 | 
 4 | #include <zlib.h>
 5 | 
 6 | 
 7 | // this gives us kseq_t for below
 8 | #ifndef KSEQ_DECLARED
 9 | #include "htslib/kseq.h"
10 | KSEQ_DECLARE(gzFile)
11 | #endif
12 | 
13 | #include <htslib/sam.h> // HTSlib for BAM output
14 | 
15 | #include "../stats.h"
16 | #include "../fastqcomments.h"
17 | 
// barcode 0 is reserved for "unclassified"
#define MAX_BARCODES 1025

// State shared by all output produced by fastcat.
// NOTE(review): the implementation (writer.c) is not visible here; member
// descriptions marked "presumably" are inferred from names and from the
// parameters of initialize_writer() and should be confirmed.
typedef struct {
    char* output;            // presumably the demultiplexing output directory
    char* histograms;        // presumably the histogram output directory
    gzFile* handles;         // presumably one gzip output handle per barcode
    size_t* nreads;          // presumably per-barcode read counts
    size_t* reads_written;   // presumably per-barcode counts for the current output file
    size_t* file_index;      // presumably per-barcode index of the current output file
    read_stats** l_stats;    // presumably per-barcode read length statistics
    read_stats** q_stats;    // presumably per-barcode read quality statistics
    FILE* perread;           // per-read summary stream, NULL when disabled
    FILE* perfile;           // per-file summary stream, NULL when disabled
    FILE* runids;            // run ID summary stream, NULL when disabled
    FILE* basecallers;       // basecaller summary stream, NULL when disabled
    char* sample;            // sample name; when non-NULL summaries get a sample column
    size_t reheader;         // non-zero: rewrite fastq header comments as SAM tags
    size_t reads_per_file;   // reads per output file (0 presumably means a single file)
    // optional BAM conversion
    int write_bam;           // non-zero: output unaligned BAM
    htsFile** bam_files;     // presumably one BAM output handle per barcode
    bam_hdr_t* bam_hdr;      // header used for BAM output
    htsThreadPool hts_pool;  // thread pool used for BAM output compression
} _writer;

typedef _writer* writer;

// Returns a newly allocated string derived from `input`; the caller frees it.
// NOTE(review): exact transformation not visible here — presumably removes
// trailing path separators; confirm against writer.c.
char* strip_path(char* input);

// Create a writer from the command line settings. Returns NULL on failure.
writer initialize_writer(
        char* output_dir, char* histograms, char* perread, char* perfile,
        char* runids, char* basecallers, char* sample,
        size_t reheader, size_t write_bam, size_t reads_per_file,
        int threads);

// Release a writer created by initialize_writer().
void destroy_writer(writer writer);

// Output one read together with its parsed header metadata and mean quality;
// `fname` is the name of the file the read came from.
void write_read(writer writer, kseq_t* seq, read_meta meta, float mean_q, char* fname);
57 | 
58 | #endif
59 | 


--------------------------------------------------------------------------------
/src/fastqcomments.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include <stdbool.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | 
  6 | #include "common.h"
  7 | #include "fastqcomments.h"
  8 | 
  9 | 
 10 | // like `ksprintf()`, but will put the optional delimiter `d` before the added string if
 11 | // `s` is not empty (and skip the delimiter otherwise)
 12 | #define ksprintf_with_opt_delim(s, d, fmt, ...) \
 13 |     ksprintf(s, "%s" fmt, s->l == 0 ? "" : d, __VA_ARGS__)
 14 | 
 15 | 
 16 | read_meta create_read_meta(const kstring_t* comment) {
 17 |     read_meta meta = xalloc(1, sizeof(_read_meta), "meta");
 18 |     meta->comment = xalloc(comment->l + 1, sizeof(char), "meta->comment");
 19 |     strncpy(meta->comment, comment->s, comment->l);
 20 |     meta->rg = "";
 21 |     meta->rg_info = NULL;
 22 |     meta->runid = "";
 23 |     meta->basecaller = "";
 24 |     meta->flow_cell_id = "";
 25 |     meta->barcode = "";
 26 |     meta->ibarcode = 0;
 27 |     meta->barcode_alias = "";
 28 |     meta->start_time = "";
 29 |     meta->read_number = 0;
 30 |     meta->channel = 0;
 31 |     meta->rest = xalloc(1, sizeof(kstring_t), "meta->rest");
 32 |     ks_initialize(meta->rest);
 33 |     meta->tags_str = xalloc(1, sizeof(kstring_t), "meta->tags_str");
 34 |     ks_initialize(meta->tags_str);
 35 | 
 36 |     return meta;
 37 | }
 38 | 
 39 | void destroy_read_meta(read_meta meta) {
 40 |     free(meta->comment);
 41 |     destroy_rg_info(meta->rg_info);
 42 |     free(meta->rest->s);
 43 |     free(meta->rest);
 44 |     free(meta->tags_str->s);
 45 |     free(meta->tags_str);
 46 |     free(meta);
 47 | }
 48 | 
 49 | // The caller is responsible for calling destroy_read_meta on the returned object.
 50 | read_meta parse_read_meta(kstring_t comment) {
 51 |     read_meta meta = create_read_meta(&comment);
 52 | 
 53 |     // if an RG or RD tag appears in the seq->comment, assume there are SAM tags to parse
 54 |     char* res = NULL;
 55 |     bool sam_tags = false;
 56 |     if (strlen(meta->comment) > 0) {
 57 |         // check if comment starts with "RG:Z:" or "RD:Z:"
 58 |         if (!strncmp(meta->comment, "RG:Z:", 5) || !strncmp(meta->comment, "RD:Z:", 5)) {
 59 |             sam_tags = true;
 60 |         }
 61 |         // RG or RD could also appear later in the comment (we include '\t' in the check
 62 |         // to be extra stringent)
 63 |         res = strstr(meta->comment, "\tRG:Z:");
 64 |         if (res != NULL) {
 65 |             sam_tags = true;
 66 |         }
 67 |         res = strstr(meta->comment, "\tRD:Z:");
 68 |         if (res != NULL) {
 69 |             sam_tags = true;
 70 |         }
 71 |     }
 72 | 
 73 |     char *pch=NULL, *p1=NULL, *p2=NULL;
 74 |     char *key=NULL, *keytype=NULL, *value=NULL;
 75 | 
 76 |     char sam_token[2] = "\t";
 77 |     char fq_token[2] = " ";
 78 |     char* token = fq_token;
 79 |     if (sam_tags) {
 80 |         token = sam_token;
 81 |     }
 82 |     pch = strtok_r(meta->comment, token, &p1);
 83 |     while (pch != NULL) {
 84 | 
 85 |         if (sam_tags) {
 86 |             // split to tag:type:value
 87 |             key = strtok_r(pch, ":", &p2);
 88 |             keytype = strtok_r(NULL, ":", &p2);
 89 |             value = strtok_r(NULL, "", &p2);
 90 |             // we allow empty tags (e.g. 'RG:Z:'); in this case, `keytype` will be
 91 |             // non-null, but `value` will be null; we set it to ""
 92 |             if (keytype != NULL && value == NULL) value = "";
 93 |         }
 94 |         else {
 95 |             // split words on `=`
 96 |             key = strtok_r(pch, "=", &p2);
 97 |             keytype = NULL;
 98 |             value = strtok_r(NULL, "", &p2);
 99 |         }
100 | 
101 |         // if there was no delimiter in the word, value will be NULL --> add word to `rest`
102 |         if (value == NULL) {
103 |             ksprintf_with_opt_delim(meta->rest, " ", "%s", key);
104 |         } else {
105 |             if (!strcmp(key, "runid") || !strcmp(key, "RD")) {
106 |                 // we'll output RD depending on the value of RG, later
107 |                 meta->runid = value;
108 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "RD:Z:%s", meta->runid);
109 |             }
110 |             else if (!strcmp(key, "RG")) {
111 |                 meta->rg = value;
112 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "RG:Z:%s", value);
113 |             }
114 |             // CW-4766 - inconsistent naming of basecall model version id by guppy/minknow/dorado
115 |             else if (!strcmp(key, "basecall_model_version_id") || !strcmp(key, "model_version_id")) {
116 |                 meta->basecaller = value;
117 |                 // there's no discrete tag defined by guppy/minknow/doroado
118 |                 // for this; so not added to `tags_str` (but to `rest` instead)
119 |                 ksprintf_with_opt_delim(meta->rest, " ", "%s=%s", key, value);
120 |             }
121 |             else if (!strcmp(key, "flow_cell_id") || !strcmp(key, "FC")) {
122 |                 meta->flow_cell_id = value;
123 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "FC:Z:%s", value);
124 |             }
125 |             else if (!strcmp(key, "barcode") || !strcmp(key, "BC")) {
126 |                 meta->barcode = value;
127 |                 meta->ibarcode = atoi(value+7);  // "unclassified" -> 0
128 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "BC:Z:%s", value);
129 |             }
130 |             else if (!strcmp(key, "barcode_alias") || !strcmp(key, "BA")) {
131 |                 meta->barcode_alias = value;
132 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "BA:Z:%s", value);
133 |             }
134 |             else if (!strcmp(key, "read") || !strcmp(key, "RN") || !strcmp(key, "rn")) {
135 |                 meta->read_number = atoi(value);
136 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "rn:i:%s", value);
137 |             }
138 |             else if (!strcmp(key, "CH") || !strcmp(key, "ch")) {
139 |                 meta->channel = atoi(value);
140 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "ch:i:%s", value);
141 |             }
142 |             else if (!strcmp(key, "start_time") || !strcmp(key, "ST") || !strcmp(key, "st")) {
143 |                 meta->start_time = value;
144 |                 ksprintf_with_opt_delim(meta->tags_str, "\t", "st:Z:%s", value);
145 |             } else {
146 |                 if (sam_tags) {
147 |                     // pass through all other tags
148 |                     ksprintf_with_opt_delim(meta->tags_str, "\t", "%s:%s:%s", key, keytype, value);
149 |                 }
150 |                 else {
151 |                     // long form key=value was not mapped to a SAM tag, send it to CO via meta->rest
152 |                     ksprintf_with_opt_delim(meta->rest, " ", "%s=%s", key, value);
153 |                 }
154 |             }
155 |         }
156 |         pch = strtok_r(NULL, token, &p1);
157 |     }
158 | 
159 |     // if there is a `rest`
160 |     // (also check that the first char of rest is not ' ', in which case something must have gone wrong)
161 |     // first replace all tabs with space to avoid all manner of confusion with Martin
162 |     for (size_t i=0; i<meta->rest->l; i++) {
163 |         if (meta->rest->s[i] == '\t') {
164 |             meta->rest->s[i] = ' ';
165 |         }
166 |     }
167 |     if (meta->rest->l != 0 && meta->rest->s[0] != ' ') {
168 |         ksprintf_with_opt_delim(meta->tags_str, "\t", "CO:Z:%s", meta->rest->s);
169 |     }
170 | 
171 |     bool need_run_id = strlen(meta->runid) == 0;
172 |     bool need_basecaller = strlen(meta->basecaller) == 0;
173 |     if(strlen(meta->rg) > 0 && (need_run_id || need_basecaller)) {
174 |         readgroup* rg_info = create_rg_info(meta->rg);
175 |         if (need_run_id && rg_info->runid != NULL) {
176 |             meta->runid = rg_info->runid;
177 |             ksprintf_with_opt_delim(meta->tags_str, "\t", "RD:Z:%s", rg_info->runid);
178 |         }
179 |         if (need_basecaller && rg_info->basecaller != NULL) {
180 |             meta->basecaller = rg_info->basecaller;
181 |         }
182 |         meta->rg_info = rg_info;
183 |     }
184 | 
185 |     return meta;
186 | }
187 | 


--------------------------------------------------------------------------------
/src/fastqcomments.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTCAT_FASTQCOMMENTS_H
 2 | #define FASTCAT_FASTQCOMMENTS_H
 3 | 
 4 | #include "common.h"
 5 | #include "htslib/kstring.h"
 6 | 
// Per-read metadata extracted from a FASTQ header comment by parse_read_meta().
// Apart from `comment`, the char* members are not individually allocated:
// they are either "" (the initial value), pointers into `comment`, or (for
// `runid` / `basecaller`) pointers into `rg_info`. destroy_read_meta() therefore
// only frees `comment`, `rg_info`, `rest` and `tags_str`.
typedef struct {
    char* comment;        // private copy of the input comment; tokenised in place by the parser
    char* rg;             // value of the RG tag
    readgroup* rg_info;   // NULL, or the result of create_rg_info(rg)
    char* runid;          // from `runid=` / RD, otherwise from rg_info
    char* basecaller;     // from `basecall_model_version_id=` / `model_version_id=`, otherwise from rg_info
    char* flow_cell_id;   // from `flow_cell_id=` / FC
    char* barcode;        // from `barcode=` / BC
    size_t ibarcode;      // atoi() of the barcode value after its first 7 characters
    char* barcode_alias;  // from `barcode_alias=` / BA
    char* start_time;     // from `start_time=` / ST / st
    size_t read_number;   // atoi() of `read=` / RN / rn
    size_t channel;       // atoi() of CH / ch
    kstring_t* rest;      // space-separated words that were not turned into a tag
    kstring_t* tags_str;  // tab-separated SAM-style tags, ending with CO:Z:<rest> if any
} _read_meta;
23 | 
24 | typedef _read_meta* read_meta;
25 | 
26 | 
27 | // constructor
28 | read_meta create_read_meta(const kstring_t* comment);
29 | 
30 | // destructor
31 | void destroy_read_meta(read_meta meta);
32 | 
33 | // parser
34 | read_meta parse_read_meta(kstring_t comment);
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/hts_defs.h:
--------------------------------------------------------------------------------
  1 | /*  hts_defs.h -- Miscellaneous definitions.
  2 | 
  3 |     Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd.
  4 | 
  5 |     Author: John Marshall <jm18@sanger.ac.uk>
  6 | 
  7 | Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | of this software and associated documentation files (the "Software"), to deal
  9 | in the Software without restriction, including without limitation the rights
 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | copies of the Software, and to permit persons to whom the Software is
 12 | furnished to do so, subject to the following conditions:
 13 | 
 14 | The above copyright notice and this permission notice shall be included in
 15 | all copies or substantial portions of the Software.
 16 | 
 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 20 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 23 | DEALINGS IN THE SOFTWARE.  */
 24 | 
 25 | #ifndef HTSLIB_HTS_DEFS_H
 26 | #define HTSLIB_HTS_DEFS_H
 27 | 
 28 | #if defined __MINGW32__
 29 | #include <stdio.h>     // For __MINGW_PRINTF_FORMAT macro
 30 | #endif
 31 | 
 32 | #ifdef __clang__
 33 | #ifdef __has_attribute
 34 | #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute)
 35 | #endif
 36 | 
 37 | #elif defined __GNUC__
 38 | #define HTS_GCC_AT_LEAST(major, minor) \
 39 |     (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
 40 | #endif
 41 | 
 42 | #ifndef HTS_COMPILER_HAS
 43 | #define HTS_COMPILER_HAS(attribute) 0
 44 | #endif
 45 | #ifndef HTS_GCC_AT_LEAST
 46 | #define HTS_GCC_AT_LEAST(major, minor) 0
 47 | #endif
 48 | 
 49 | #if HTS_COMPILER_HAS(__nonstring__) || HTS_GCC_AT_LEAST(8,1)
 50 | #define HTS_NONSTRING __attribute__ ((__nonstring__))
 51 | #else
 52 | #define HTS_NONSTRING
 53 | #endif
 54 | 
 55 | #if HTS_COMPILER_HAS(__noreturn__) || HTS_GCC_AT_LEAST(3,0)
 56 | #define HTS_NORETURN __attribute__ ((__noreturn__))
 57 | #else
 58 | #define HTS_NORETURN
 59 | #endif
 60 | 
 61 | // GCC introduced warn_unused_result in 3.4 but added -Wno-unused-result later
 62 | #if HTS_COMPILER_HAS(__warn_unused_result__) || HTS_GCC_AT_LEAST(4,5)
 63 | #define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
 64 | #else
 65 | #define HTS_RESULT_USED
 66 | #endif
 67 | 
 68 | #if HTS_COMPILER_HAS(__unused__) || HTS_GCC_AT_LEAST(3,0)
 69 | #define HTS_UNUSED __attribute__ ((__unused__))
 70 | #else
 71 | #define HTS_UNUSED
 72 | #endif
 73 | 
 74 | #if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(4,5)
 75 | #define HTS_DEPRECATED(message) __attribute__ ((__deprecated__ (message)))
 76 | #elif HTS_GCC_AT_LEAST(3,1)
 77 | #define HTS_DEPRECATED(message) __attribute__ ((__deprecated__))
 78 | #else
 79 | #define HTS_DEPRECATED(message)
 80 | #endif
 81 | 
 82 | #if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4)
 83 | #define HTS_DEPRECATED_ENUM(message) __attribute__ ((__deprecated__ (message)))
 84 | #else
 85 | #define HTS_DEPRECATED_ENUM(message)
 86 | #endif
 87 | 
 88 | // On mingw the "printf" format type doesn't work.  It needs "gnu_printf"
 89 | // in order to check %lld and %z, otherwise it defaults to checking against
 90 | // the Microsoft library printf format options despite linking against the
 91 | // GNU posix implementation of printf.  The __MINGW_PRINTF_FORMAT macro
 92 | // expands to printf or gnu_printf as required, but obviously may not
 93 | // exist
 94 | #ifdef __MINGW_PRINTF_FORMAT
 95 | #define HTS_PRINTF_FMT __MINGW_PRINTF_FORMAT
 96 | #else
 97 | #define HTS_PRINTF_FMT printf
 98 | #endif
 99 | 
100 | #if HTS_COMPILER_HAS(__format__) || HTS_GCC_AT_LEAST(3,0)
101 | #define HTS_FORMAT(type, idx, first) __attribute__((__format__ (type, idx, first)))
102 | #else
103 | #define HTS_FORMAT(type, idx, first)
104 | #endif
105 | 
106 | #if defined(_WIN32) || defined(__CYGWIN__)
107 | #if defined(HTS_BUILDING_LIBRARY)
108 | #define HTSLIB_EXPORT __declspec(dllexport)
109 | #else
110 | #define HTSLIB_EXPORT
111 | #endif
112 | #elif HTS_COMPILER_HAS(__visibility__) || HTS_GCC_AT_LEAST(4,0)
113 | #define HTSLIB_EXPORT __attribute__((__visibility__("default")))
114 | #elif defined(__SUNPRO_C) && __SUNPRO_C >= 0x550
115 | #define HTSLIB_EXPORT __global
116 | #else
117 | #define HTSLIB_EXPORT
118 | #endif
119 | 
120 | #endif
121 | 


--------------------------------------------------------------------------------
/src/kh_counter.c:
--------------------------------------------------------------------------------
// Wrap khash to make it more concise to use
 2 | 
 3 | #define _GNU_SOURCE
 4 | #include <stdio.h>
 5 | #include <string.h>
 6 | #include "kh_counter.h"
 7 | 
 8 | /* Implementation of a counter of strings (increasing only)
 9 |  *
10 |  * kh_counter_t *counter = kh_counter_init();
11 |  * kh_counter_increment(counter, "one");
12 |  * kh_counter_increment(counter, "two");
13 |  * kh_counter_increment(counter, "two");
14 |  * kh_counter_add(counter, "three", 2);
15 |  * kh_counter_increment(counter, "three");
 * kh_counter_destroy(counter);
17 |  *
18 |  */
19 | 
20 | 
21 | kh_counter_t *kh_counter_init(void) {
22 |     kh_counter_t *h = kh_init(KH_COUNTER);
23 |     return h;
24 | }
25 | 
26 | int kh_counter_val(kh_counter_t *hash, char *key) {
27 |     khiter_t k = kh_get(KH_COUNTER, hash, key);
28 |     int val = k != kh_end(hash) ? kh_val(hash, k) : 0;
29 |     return val;
30 | }
31 | 
32 | size_t kh_counter_add(kh_counter_t *hash, char *key, int val) {
33 |     if (key == NULL) {return -1;}
34 |     // note: key is copied so no need for caller to hold on to it
35 |     int ret;
36 |     khiter_t k = kh_put(KH_COUNTER, hash, key, &ret);
37 |     if (ret == 1) { // new key
38 |         kh_key(hash, k) = strdup(key);
39 |         kh_value(hash, k) = val;
40 |     } else if (ret == 0) {  // exists
41 |         // get value and add
42 |         int cur = kh_val(hash, k);
43 |         kh_value(hash, k) = cur + val;
44 |     } else {
45 |         // shouldnt get here - previously deleted key
46 |     }
47 |     return ret;
48 | }
49 | 
50 | size_t kh_counter_sub(kh_counter_t *hash, char *key, int val) {
51 |     if (key == NULL) {return -1;}
52 |     // note: key is copied so no need for caller to hold on to it
53 |     int ret;
54 |     khiter_t k = kh_put(KH_COUNTER, hash, key, &ret);
55 |     if (ret == 1) { // new key
56 |         kh_key(hash, k) = strdup(key);
57 |         kh_value(hash, k) = -val;
58 |     } else if (ret == 0) {  // exists
59 |         // get value and add
60 |         int cur = kh_val(hash, k);
61 |         kh_value(hash, k) = cur - val;
62 |     } else {
63 |         // shouldnt get here - previously deleted key
64 |     }
65 |     return ret;
66 | }
67 | 
68 | size_t kh_counter_increment(kh_counter_t *hash, char *key) {
69 |     return kh_counter_add(hash, key, 1);
70 | }
71 | 
72 | void kh_counter_destroy(kh_counter_t *hash) {
73 |     for (khiter_t k = 0; k < kh_end(hash); k++){
74 |         if (kh_exist(hash, k)) {
75 |             free((char*) kh_key(hash, k));
76 |         }
77 |     }
78 |     kh_destroy(KH_COUNTER, hash);
79 | }
80 | 


--------------------------------------------------------------------------------
/src/kh_counter.h:
--------------------------------------------------------------------------------
 1 | #ifndef _KHCOUNTER_H
 2 | #define _KHCOUNTER_H
 3 | 
 4 | #include "htslib/khash.h"
 5 | 
 6 | 
 7 | KHASH_MAP_INIT_STR(KH_COUNTER, int)
 8 | #define kh_counter_t khash_t(KH_COUNTER)
 9 | 
10 | // create a counter
11 | kh_counter_t *kh_counter_init(void);
12 | 
13 | // Get a value from a counter 
14 | int kh_counter_val(kh_counter_t *hash, char *key);
15 | 
16 | // Clean up a counter
17 | void kh_counter_destroy(kh_counter_t *hash);
18 | 
19 | // Increment a counter by one
20 | size_t kh_counter_increment(kh_counter_t *hash, char *key);
21 | 
// Decrement a counter by a given amount
23 | size_t kh_counter_sub(kh_counter_t *hash, char *key, int val);
24 | 
25 | // Increment a counter by a given amount
26 | size_t kh_counter_add(kh_counter_t *hash, char *key, int val);
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/src/regiter.c:
--------------------------------------------------------------------------------
  1 | #include "regiter.h"
  2 | #include "common.h"
  3 | 
  4 | int region_from_string(char* input, char** chr, int* start, int* end) {
  5 |     *chr = xalloc(strlen(input) + 1, sizeof(char), "chr");
  6 |     strcpy(*chr, input);
  7 |     char *reg_chr = (char *) hts_parse_reg(input, start, end);
  8 |     int rtn = 0;
  9 |     if (reg_chr) {
 10 |         *reg_chr = '\0';  // sets chr to be terminated at correct point
 11 |     } else {
 12 |         rtn = -1;
 13 |     }
 14 |     return rtn;
 15 | }
 16 | 
 17 | 
 18 | int region_from_bed(FILE* bed_fp, char** chr, int* start, int* end) {
 19 |     char* line = NULL;
 20 |     char* line_copy = NULL;
 21 |     size_t len = 0;
 22 |     ssize_t read;
 23 |     int rtn = 0;
 24 | 
 25 |     *start = -1;
 26 |     *end = -1;
 27 | 
 28 |     if ((read = getline(&line, &len, bed_fp)) != -1) {
 29 |         char *newline_pos = strchr(line, '\n');
 30 |         if (newline_pos != NULL) {
 31 |             *newline_pos = '\0';  // Null-terminate the string at the newline
 32 |         }
 33 |         line_copy = strdup(line);  // Copy line for error reporting
 34 | 
 35 |         // get chromosome
 36 |         char* tok = strtok(line, "\t");
 37 |         if (tok == NULL) {
 38 |             fprintf(stderr, "WARNING: Missing chromosome field in BED file line: '%s'.\n", line_copy);
 39 |             rtn = -2;
 40 |             goto cleanup;
 41 |         }
 42 |         *chr = xrealloc(*chr, (strlen(tok) + 1) * sizeof(char), "chr");
 43 |         strcpy(*chr, tok);
 44 | 
 45 |         // get start coordinate
 46 |         tok = strtok(NULL, "\t");
 47 |         if (tok == NULL) {
 48 |             fprintf(stderr, "WARNING: Missing start field in BED file line: '%s'.\n", line_copy);
 49 |             rtn = -2;
 50 |             goto cleanup;
 51 |         }
 52 |         char* endptr;
 53 |         *start = strtol(tok, &endptr, 10);
 54 |         if (*endptr != '\0') {
 55 |             fprintf(stderr, "WARNING: Invalid start field in BED file line: '%s'.\n", line_copy);
 56 |             rtn = -2;
 57 |             goto cleanup;
 58 |         }
 59 | 
 60 |         // get end coordinate
 61 |         tok = strtok(NULL, "\t");
 62 |         if (tok == NULL) {
 63 |             fprintf(stderr, "WARNING: Missing end field in BED file line: '%s'.\n", line_copy);
 64 |             rtn = -2;
 65 |             goto cleanup;
 66 |         }
 67 |         *end = strtol(tok, &endptr, 10);
 68 |         if (*endptr != '\0') {
 69 |             fprintf(stderr, "WARNING: Invalid end field in BED file line: '%s'.\n", line_copy);
 70 |             rtn = -2;
 71 |             goto cleanup;
 72 |         }
 73 | 
 74 |         // Validate start and end
 75 |         if (*start < 0 || *end < 0 || *start >= *end) {
 76 |             fprintf(stderr, "WARNING: Invalid region in BED file line: '%s'.\n", line_copy);
 77 |             rtn = -2;
 78 |             goto cleanup;
 79 |         }
 80 |     } else {
 81 |         rtn = -1;  // EOF
 82 |     }
 83 | 
 84 | cleanup:
 85 |     free(line);
 86 |     free(line_copy);
 87 |     return rtn;
 88 | }
 89 | 
 90 | 
 91 | 
 92 | // Initialize the region iterator
 93 | regiter init_region_iterator(const char *bed_file, const char *single_region, sam_hdr_t *hdr) {
 94 |     regiter it = {0};
 95 |     it.hdr = hdr;
 96 |     if (bed_file != NULL) {
 97 |         it.bed_fp = fopen(bed_file, "r");
 98 |         if (it.bed_fp == NULL) {
 99 |             fprintf(stderr, "ERROR: Unable to open BED file: %s\n", bed_file);
100 |             it.error |= 1;
101 |         }
102 |     } else if (single_region != NULL) {
103 |         it.single_region = strdup(single_region);
104 |         it.mode = 1; // we'll process this first then switch to BED file mode
105 |     }
106 |     return it;
107 | }
108 | 
109 | 
110 | // Clean up the iterator
111 | void destroy_region_iterator(regiter *it) {
112 |     if (it->chr != NULL) free(it->chr);
113 |     if (it->single_region != NULL) free(it->single_region);
114 |     if (it->bed_fp != NULL) fclose(it->bed_fp);
115 | }
116 | 
117 | // Get the next region
118 | // returns:
119 | //      0 successful
120 | //     -1 if no more regions
121 | //     -2 if error parsing region string
122 | //     -3 if reference not found in BAM header
123 | int next_region(regiter *it) {
124 |     int rtn = 0;
125 |     if (it->mode == 0) {
126 |         rtn = region_from_bed(it->bed_fp, &it->chr, &it->start, &it->end);
127 |     } else if (it->mode == 1) {
128 |         it->mode = 0;
129 |         if (it->single_region) {
130 |             rtn = region_from_string(it->single_region, &it->chr, &it->start, &it->end);
131 |             if (rtn == -1) {
132 |                 fprintf(stderr, "WARNING: Failed to parse region string: %s\n", it->single_region);
133 |                 it->error |= 2;
134 |                 rtn = -2;
135 |             }
136 |         }
137 |     }
138 | 
139 |     if (rtn == 0) {
140 |         // check reference exists and tidy up length
141 |         int tid = sam_hdr_name2tid(it->hdr, it->chr);
142 |         if (tid < 0) {
143 |             fprintf(stderr, "WARNING: Failed to find reference '%s' in BAM header.\n", it->chr);
144 |             rtn = -3;
145 |         }
146 |         else {
147 |             size_t ref_length = (size_t)sam_hdr_tid2len(it->hdr, tid);
148 |             int ns = min(it->start, (int)ref_length);
149 |             int ne = min(it->end, (int)ref_length);
150 |             if (ns >= ne) {
151 |                 fprintf(stderr, "WARNING: Zero-length region created after truncating to reference length (%ld) '%s:%d-%d'.\n", ref_length, it->chr, it->start, it->end);
152 |                 rtn = -2;
153 |             }
154 |             else {
155 |                 it->start = ns;
156 |                 it->end = ne;
157 |             }
158 |             it->n_regions++;
159 |         }
160 |     }
161 |     return rtn;
162 | }
163 | 


--------------------------------------------------------------------------------
/src/regiter.h:
--------------------------------------------------------------------------------
 1 | #ifndef _FASTCAT_REGITER_H
 2 | #define _FASTCAT_REGITER_H
 3 | 
 4 | #include "htslib/sam.h"
 5 | 
 6 | typedef struct {
 7 |     char *chr;
 8 |     int start;
 9 |     int end;
10 |     FILE *bed_fp;
11 |     char *single_region;
12 |     int n_regions;
13 |     sam_hdr_t *hdr;
14 |     int mode; // 0: BED file, 1: single region
15 |     int error;  // &1: couldn't open BED,
16 |                 // &2: couldn't parse single_region
17 | } regiter;
18 | 
19 | 
20 | int region_from_string(char* input, char** chr, int* start, int* end);
21 | int region_from_bed(FILE* bed_fp, char** chr, int* start, int* end);
22 | regiter init_region_iterator(const char *bed_file, const char *single_region, sam_hdr_t *hdr);
23 | void destroy_region_iterator(regiter *it);
24 | int next_region(regiter *it);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/stats.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <math.h>
  5 | #include <string.h>
  6 | 
  7 | #include "stats.h"
  8 | #include "common.h"
  9 | 
 10 | 
 11 | read_stats* create_length_stats(void) {
 12 |     read_stats* stats = (read_stats*) xalloc(1, sizeof(read_stats), "length_stats");
 13 | 
 14 |     bin_groups* bins = (bin_groups*) xalloc(1, sizeof(bin_groups), "bin_groups");
 15 |     stats->buckets = bins;
 16 |     bins->n = 1;
 17 |     bins->groups = (size_t*) xalloc(1, 3*bins->n*sizeof(size_t), "groups");
 18 |     size_t* grps = bins->groups;
 19 |     // - end exclusive upper edges
 20 |     // - 0 is first lower edge
 21 |     // - final >x bucket
 22 |     //grps[0] =   50000; grps[1] =     1;
 23 |     //grps[3] =  100000; grps[4] =    10;
 24 |     //grps[6] = 1000000; grps[7] = 1000;
 25 |     // just do one massive bucket up to 10M - 76Mbytes
 26 |     grps[0] = 10000000; grps[1] = 1;
 27 | 
 28 | 
 29 |     // count the total number of bins across all groups
 30 |     stats->n = 0;
 31 |     size_t lower = 0;
 32 |     for (size_t i=0; i<bins->n; i++) {
 33 |         size_t upper = bins->groups[3*i];
 34 |         size_t step = bins->groups[3*i + 1];
 35 |         size_t nbins = (upper - lower) / step;
 36 |         bins->groups[3*i + 2] = nbins;
 37 |         stats->n += nbins;
 38 |         lower = upper;
 39 |     }
 40 |     stats->n++;
 41 | 
 42 |     // fill in all the edges
 43 |     stats->width = 0;
 44 |     stats->edges = xalloc(stats->n, sizeof(size_t), "edges");
 45 |     stats->counts = xalloc(stats->n, sizeof(size_t), "counts");
 46 |     size_t i=0;
 47 |     lower = 0;
 48 |     size_t upper = 0;
 49 |     for (size_t b=0; b<bins->n; b++) {
 50 |         upper = bins->groups[3*b];
 51 |         size_t step = bins->groups[3*b + 1];
 52 |         for (size_t j=lower; j<upper; j+=step, i++) {
 53 |             stats->edges[i] = j;
 54 |         }
 55 |         lower = upper;
 56 |     }
 57 |     stats->edges[i] = upper;
 58 |     return stats;
 59 | }
 60 | 
 61 | void destroy_length_stats(read_stats* stats) {
 62 |     if (stats != NULL) {
 63 |         free(stats->buckets->groups);
 64 |         free(stats->buckets);
 65 |         free(stats->edges);
 66 |         free(stats->counts);
 67 |         free(stats);
 68 |     }
 69 | }
 70 | 
 71 | void add_length_count(read_stats* stats, size_t x) {
 72 |     size_t lower = 0;
 73 |     size_t cum_bin = 0;
 74 |     bool done = false;
 75 |     for (size_t i=0; i<stats->buckets->n; i++) {
 76 |         size_t upper = stats->buckets->groups[3*i];
 77 |         if (x < upper) {
 78 |             size_t step = stats->buckets->groups[3*i + 1];
 79 |             stats->counts[cum_bin + (x - lower) / step]++;
 80 |             done = true;
 81 |             break;
 82 |         }
 83 |         lower = upper;
 84 |         cum_bin += stats->buckets->groups[3*i + 2];
 85 |     }
 86 |     if (!done) {
 87 |         stats->counts[cum_bin]++;
 88 |     }
 89 | }
 90 | 
 91 | 
 92 | read_stats* create_qual_stats(float width) {
 93 |     read_stats* stats = (read_stats*) xalloc(1, sizeof(read_stats), "quality stats");
 94 |     stats->width = width;
 95 |     // this fixes the range to [0, 100], good for both QUAL and %age acc
 96 |     stats->n = (size_t) (100.0 / stats->width) + 1;
 97 |     stats->counts = xalloc(stats->n, sizeof(size_t), "counts");
 98 |     return stats;
 99 | }
100 | 
101 | void destroy_qual_stats(read_stats* stats) {
102 |     if (stats != NULL) {
103 |         free(stats->counts);
104 |         free(stats);
105 |     }
106 | }
107 | 
108 | void add_qual_count(read_stats* stats, float q) {
109 |     q = fmin(q, 100.0);
110 |     stats->counts[(int) (q / stats->width)]++;
111 | }
112 | 
113 | void print_stats(read_stats* stats, bool zeroes, bool tsv, FILE* fp) {
114 |     if (fp == NULL) {
115 |         fp = stderr;
116 |     }
117 |     if (stats->width == 0) {
118 |         for (size_t i=0; i<stats->n; i++) {
119 |             if (stats->counts[i] == 0 && !zeroes) continue;
120 |             if (tsv) {
121 |                 fprintf(fp, "%zu\t%zu\t%zu\n", stats->edges[i], stats->edges[i+1], stats->counts[i]);
122 |             }
123 |             else {
124 |                 fprintf(fp, "[%zu, %zu)\t%zu\n", stats->edges[i], stats->edges[i+1], stats->counts[i]);
125 |             }
126 |         }
127 |     }
128 |     else {
129 |         if (tsv) {
130 |             size_t decimals = _leading_decimals(stats->width);
131 |             char fmt[64] = {0};  // my brain hurts and 64 seems big enough
132 |             snprintf(fmt, 63, "%%.%zuf\t%%.%zuf\t%%zu\n", decimals, decimals);
133 |             for (size_t i=0; i<stats->n; i++) {
134 |                 if (stats->counts[i] == 0 && !zeroes) continue;
135 |                 fprintf(fp, fmt, (float) i * stats->width, (float) (i+1) * stats->width, stats->counts[i]);
136 |             }
137 |         }
138 |         else {
139 |             size_t decimals = _leading_decimals(stats->width);
140 |             char fmt[64] = {0};
141 |             snprintf(fmt, 63, "[%%.%zuf, %%.%zuf)\t%%zu\n", decimals, decimals);
142 |             for (size_t i=0; i<stats->n; i++) {
143 |                 if (stats->counts[i] == 0 && !zeroes) continue;
144 |                 fprintf(fp, fmt, (float) i * stats->width, (float) (i+1) * stats->width, stats->counts[i]);
145 |             }
146 |         }
147 |     }
148 | }
149 | 
// Number of decimal places needed to show the first non-zero decimal of
// `number`, e.g. 0.001 -> 3, 0.02 -> 2.
// Works on the "%f" rendering of the value: one plus the count of zeros that
// directly follow the decimal separator.
size_t _leading_decimals(float number) {
    char str[64] = { 0 };
    snprintf(str, sizeof(str), "%f", number);
    const char* point = strchr(str, '.');
    if (point == NULL) {
        // no '.' in the output (e.g. a locale with a different decimal separator,
        // or "inf"/"nan"): treat the first character after the sign and integer
        // digits as the separator instead of dereferencing a NULL pointer
        point = str + strspn(str, "+-0123456789");
        if (*point == '\0') {
            return 1;
        }
    }
    return 1 + strspn(point + 1, "0");
}
157 | 
158 | //int main(int argc, char **argv) {
159 | //    read_stats* stats = create_length_stats();
160 | //    // bins every 1
161 | //    add_length_count(stats, 1);
162 | //    add_length_count(stats, 4);
163 | //    add_length_count(stats, 950);
164 | //    add_length_count(stats, 998);
165 | //    add_length_count(stats, 999);
166 | //
167 | //    //// changing to bins every 10
168 | //    add_length_count(stats, 1000);
169 | //    add_length_count(stats, 1001);
170 | //    add_length_count(stats, 1009);
171 | //    add_length_count(stats, 1010);
172 | //    add_length_count(stats, 1045);
173 | //    add_length_count(stats, 1050);
174 | //
175 | //    // changing to bins every 100
176 | //    add_length_count(stats, 9845);
177 | //    add_length_count(stats, 9900);
178 | //    add_length_count(stats, 9901);
179 | //    add_length_count(stats, 9909);
180 | //    add_length_count(stats, 9910);
181 | //    add_length_count(stats, 9999);
182 | //    add_length_count(stats, 10000);
183 | //    add_length_count(stats, 10001);
184 | //    add_length_count(stats, 10010);
185 | //    add_length_count(stats, 10100);
186 | //    add_length_count(stats, 10150);
187 | //    add_length_count(stats, 10199);
188 | //    add_length_count(stats, 10200);
189 | //    add_length_count(stats, 10210);
190 | //
191 | //    // changing to bins every 1000
192 | //    add_length_count(stats, 99100); 
193 | //    add_length_count(stats, 99150); 
194 | //    add_length_count(stats, 99500); 
195 | //    add_length_count(stats, 99999); 
196 | //    add_length_count(stats, 100000); 
197 | //    add_length_count(stats, 100001); 
198 | //    add_length_count(stats, 100999); 
199 | //    add_length_count(stats, 101000); 
200 | //    add_length_count(stats, 102000); 
201 | //
202 | //    add_length_count(stats,  999999);
203 | //    add_length_count(stats, 1000000);
204 | //    add_length_count(stats, 2000000);
205 | //
206 | //    print_stats(stats, false, false, stderr);
207 | //    destroy_length_stats(stats);
208 | //
209 | //    float width = 0.02;
210 | //    read_stats* qstats = create_qual_stats(width);
211 | //    
212 | //    add_qual_count(qstats, 9.89);
213 | //    add_qual_count(qstats, 9.89);
214 | //    add_qual_count(qstats, 99.5);
215 | //    add_qual_count(qstats, 99.9);
216 | //    add_qual_count(qstats, 100);
217 | //    add_qual_count(qstats, 101);
218 | //    add_qual_count(qstats, 101);
219 | //    add_qual_count(qstats, 101);
220 | //    print_stats(qstats, false, false, stderr);
221 | //    destroy_qual_stats(qstats);
222 | //
223 | //}
224 | 


--------------------------------------------------------------------------------
/src/stats.h:
--------------------------------------------------------------------------------
 1 | #ifndef FASTCAT_STATS_H
 2 | #define FASTCAT_STATS_H
 3 | 
 4 | // Self-contained header: the declarations below use bool, size_t and FILE,
 5 | // so include the standard headers that define them instead of depending on
 6 | // whatever the including file happened to include first.
 7 | #include <stdbool.h>
 8 | #include <stddef.h>
 9 | #include <stdio.h>
10 | 
11 | // Histogram widths passed to create_qual_stats().
12 | // use only 1 non-zero decimal place in these -- see _leading_decimals()
13 | #define QUAL_HIST_WIDTH 0.02   // QUAL is log10
14 | #define ACC_HIST_WIDTH 0.0001  // ACC is linear %age, Q60
15 | #define COV_HIST_WIDTH 0.01    // COV is linear %age
16 | 
17 | // Grouping information for histogram bins.
18 | // NOTE(review): n is presumably the number of groups described by `groups`
19 | // -- confirm against stats.c.
20 | typedef struct {
21 |     size_t n;
22 |     size_t* groups;  // 3 items: upper, step, number
23 | } bin_groups;
24 | 
25 | 
26 | // Histogram state shared by the length and quality accumulators below.
27 | // NOTE(review): n presumably counts the entries of `edges`/`counts`, and
28 | // `buckets` presumably describes how the bins are grouped -- confirm
29 | // against stats.c.
30 | typedef struct {
31 |     size_t n;
32 |     float width;  // for fixed width
33 |     size_t* edges;
34 |     size_t* counts;
35 |     bin_groups* buckets;
36 | } read_stats;
37 | 
38 | 
39 | // Read-length histogram: allocate, free, and record a single read length.
40 | read_stats* create_length_stats(void);
41 | void destroy_length_stats(read_stats* stats);
42 | void add_length_count(read_stats* stats, size_t length);
43 | 
44 | // uses read_stats with a fixed (and rescaled) grid
45 | // `width` is the histogram width, e.g. one of the *_HIST_WIDTH macros above.
46 | read_stats* create_qual_stats(float width);
47 | void destroy_qual_stats(read_stats* stats);
48 | void add_qual_count(read_stats* stats, float q);
49 | 
50 | // Write the histogram held in `stats` to `fp`.
51 | // NOTE(review): `zeroes` presumably controls whether empty bins are written
52 | // and `tsv` presumably selects tab-separated output -- confirm in stats.c.
53 | void print_stats(read_stats* stats, bool zeroes, bool tsv, FILE* fp);
54 | 
55 | // Helper behind the "1 non-zero decimal place" rule for the widths above.
56 | // Judging by the end of its implementation, it returns 1 plus the number of
57 | // zeros directly after the decimal point of the "%f" rendering of the value
58 | // (0.02 -> 2, 0.0001 -> 4).
59 | size_t _leading_decimals(float num);
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/version.c:
--------------------------------------------------------------------------------
1 | #include "version.h"
2 | // Definition of the version string declared extern in version.h. NOTE(review): the name matches the GNU argp `argp_program_version` hook reported for --version -- confirm the tools parse arguments with argp.
3 | const char *argp_program_version = "0.22.0";
4 | 


--------------------------------------------------------------------------------
/src/version.h:
--------------------------------------------------------------------------------
1 | #ifndef FASTCAT_VERSION_H
2 | #define FASTCAT_VERSION_H
3 | // Program version string; the single definition lives in version.c.
4 | extern const char *argp_program_version;
5 | 
6 | #endif  // FASTCAT_VERSION_H
7 | 


--------------------------------------------------------------------------------
/test/bamindex/400.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamindex/400.bam


--------------------------------------------------------------------------------
/test/bamstats/310dx.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/310dx.bam


--------------------------------------------------------------------------------
/test/bamstats/310dx.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/310dx.bam.bai


--------------------------------------------------------------------------------
/test/bamstats/400ecoli-with-qcfail.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/400ecoli-with-qcfail.bam


--------------------------------------------------------------------------------
/test/bamstats/400ecoli.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/400ecoli.bam


--------------------------------------------------------------------------------
/test/bamstats/400ecoli.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/400ecoli.bam.bai


--------------------------------------------------------------------------------
/test/bamstats/RCS-100A.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/RCS-100A.bam


--------------------------------------------------------------------------------
/test/bamstats/RCS-100A.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/RCS-100A.bam.bai


--------------------------------------------------------------------------------
/test/bamstats/RCS-100A.bam.polya.hist:
--------------------------------------------------------------------------------
  1 | 7	8	2
  2 | 8	9	1
  3 | 10	11	1
  4 | 12	13	1
  5 | 14	15	1
  6 | 34	35	1
  7 | 62	63	2
  8 | 68	69	1
  9 | 69	70	1
 10 | 82	83	1
 11 | 83	84	1
 12 | 85	86	2
 13 | 88	89	1
 14 | 89	90	1
 15 | 90	91	2
 16 | 94	95	2
 17 | 95	96	3
 18 | 97	98	4
 19 | 98	99	2
 20 | 99	100	5
 21 | 100	101	7
 22 | 101	102	3
 23 | 102	103	7
 24 | 103	104	7
 25 | 104	105	4
 26 | 105	106	10
 27 | 106	107	5
 28 | 107	108	12
 29 | 108	109	6
 30 | 109	110	12
 31 | 110	111	14
 32 | 111	112	2
 33 | 112	113	9
 34 | 113	114	8
 35 | 114	115	6
 36 | 115	116	6
 37 | 116	117	6
 38 | 117	118	6
 39 | 118	119	3
 40 | 119	120	6
 41 | 120	121	1
 42 | 121	122	1
 43 | 123	124	6
 44 | 124	125	3
 45 | 125	126	6
 46 | 126	127	2
 47 | 127	128	1
 48 | 130	131	3
 49 | 131	132	1
 50 | 132	133	1
 51 | 133	134	2
 52 | 134	135	2
 53 | 135	136	1
 54 | 136	137	1
 55 | 137	138	2
 56 | 138	139	2
 57 | 139	140	1
 58 | 140	141	1
 59 | 141	142	2
 60 | 142	143	2
 61 | 146	147	1
 62 | 147	148	1
 63 | 149	150	1
 64 | 150	151	2
 65 | 151	152	3
 66 | 152	153	1
 67 | 153	154	1
 68 | 154	155	1
 69 | 155	156	1
 70 | 157	158	3
 71 | 158	159	1
 72 | 159	160	2
 73 | 160	161	1
 74 | 163	164	1
 75 | 165	166	1
 76 | 167	168	1
 77 | 171	172	1
 78 | 172	173	1
 79 | 174	175	2
 80 | 175	176	1
 81 | 176	177	3
 82 | 181	182	1
 83 | 183	184	1
 84 | 184	185	1
 85 | 187	188	1
 86 | 188	189	1
 87 | 189	190	1
 88 | 190	191	2
 89 | 191	192	1
 90 | 193	194	2
 91 | 196	197	2
 92 | 197	198	1
 93 | 198	199	1
 94 | 199	200	2
 95 | 200	201	1
 96 | 208	209	2
 97 | 210	211	1
 98 | 213	214	2
 99 | 216	217	2
100 | 217	218	1
101 | 218	219	1
102 | 219	220	1
103 | 220	221	1
104 | 222	223	1
105 | 223	224	1
106 | 225	226	2
107 | 228	229	1
108 | 229	230	2
109 | 230	231	1
110 | 232	233	1
111 | 237	238	1
112 | 238	239	2
113 | 240	241	1
114 | 241	242	1
115 | 242	243	1
116 | 244	245	1
117 | 246	247	1
118 | 247	248	1
119 | 248	249	1
120 | 253	254	1
121 | 254	255	1
122 | 267	268	1
123 | 269	270	1
124 | 273	274	2
125 | 276	277	2
126 | 277	278	1
127 | 278	279	1
128 | 279	280	1
129 | 284	285	1
130 | 287	288	1
131 | 289	290	2
132 | 292	293	1
133 | 293	294	1
134 | 295	296	1
135 | 297	298	1
136 | 301	302	1
137 | 302	303	1
138 | 314	315	1
139 | 317	318	1
140 | 319	320	1
141 | 320	321	1
142 | 329	330	1
143 | 335	336	1
144 | 344	345	1
145 | 347	348	1
146 | 352	353	1
147 | 356	357	1
148 | 357	358	1
149 | 358	359	1
150 | 359	360	1
151 | 365	366	1
152 | 373	374	1
153 | 375	376	1
154 | 376	377	1
155 | 380	381	1
156 | 386	387	1
157 | 387	388	2
158 | 407	408	1
159 | 411	412	1
160 | 412	413	2
161 | 416	417	1
162 | 419	420	1
163 | 421	422	1
164 | 423	424	1
165 | 428	429	1
166 | 430	431	1
167 | 431	432	1
168 | 434	435	2
169 | 435	436	1
170 | 440	441	1
171 | 441	442	2
172 | 457	458	2
173 | 461	462	1
174 | 465	466	1
175 | 473	474	2
176 | 485	486	1
177 | 492	493	1
178 | 498	499	1
179 | 503	504	1
180 | 509	510	1
181 | 513	514	1
182 | 548	549	1
183 | 


--------------------------------------------------------------------------------
/test/bamstats_badNM/test.sam:
--------------------------------------------------------------------------------
1 | @HD	VN:1.6	SO:coordinate
2 | @SQ	SN:chr1	LN:100000
3 | 39c3c681-3193-4613-8ffb-1045c7568f70	0	chr1	1000	39	1S	*	0	0	T	C	ms:i:176	AS:i:174	nn:i:0	tp:A:P	cm:i:8	s1:i:64	s2:i:0	de:f:0.069	rl:i:0	qs:i:11	du:f:0.474	ns:i:570	ts:i:570	mx:i:1	ch:i:1309	st:Z:2024-01-24T17:15:41.615+00:00	rn:i:-1	sm:f:-736.172	sd:f:0.00798761	sv:Z:	dx:i:0	sp:i:6852	MN:i:166	NM:i:10	BC:Z:barcode09
4 | 


--------------------------------------------------------------------------------
/test/bamstats_zeroNM/test.sam:
--------------------------------------------------------------------------------
1 | @HD	VN:1.6	SO:coordinate
2 | @SQ	SN:chrX	LN:156040895
3 | 9ad9daf6-5d60-44c3-9535-d641ce144eff	16	chrX	43690	29	9S480M36S	*	0	525	AACGTATTGCTGCATTGGGCCTGGGTCTCATTGAGGACAGATAGCGACCAGACTGTGCAACCTTTAGAGTCTGCATTGGGCTTAGGTCTCATTGAGGGCAGTTAGAGAGCAGACTGTGCAACCTTTAGAGTCTGCATTGGGCCTAGGTCTCATTGAGAGCAGATAGAGAGCACACTGTGCAACCTCTAGAGTCGGCATTGGGCCTAGGTCTCATTGAGGACAGATAGAGACCAGACTGTTGAAACTTTAGAGTCTGCATTGGGCCTAGGTCTCATTGAGGACAGATAGAGGGCAGACTGTGCAACCTTTAGAGTCTACAATGGGCCTAGGTATCAGTGAGGACAAATAGAGAGGAGACTGTGCAACCTTTAGAGTCTGCACTGGCCCTAGGTCTCTTTGAGGACAGACAGAGAGCAGAATGTGCAAACTTTAGAGTCTGCACTGGGCCTAGGTGTCATTGAGGACAGATAGAGACCAGACTGTGCAACCAGCAATACGTAACTGAACGAAGTACATGTACATAAC	()*0222332333?FB@>>DFCFF@@??@IFCCCDGHGFDC===>ABB<,+++,599<KHLC@@?0////0CBBCDDDECEEDBCCEGSJSEBABACCBDACEDCDCCBBCCDDCCDBBCCDCCEFGIGFFECBBCBCCC==<<844469<=<<=CDJBBAAGIECDCFDEDEFEDDBBBBBBBCEDGB????@BCBEFGGGHEHGGHHFGGGHIHEFFEHEECDDFFHGFGQJ{I{IQMHGHJHHGGIOJBBA@?BDEEFGHHJGMGIGFFFGHGFEFIEEECBDGEDFGDEFEGDCCCCEDCDBABCFHJFCCAABCCHGHE***DEDDGFHHKHGHGJKGGFGECGHIHDDDDEEFEDDFFHHIEA?@BGFKIFFFGHDCC@?@@CDDKJKLKHHFHEIGFFDDDDDEEEFGFGGHGEGEFGHEFDCDFFGGFIGFGEDCCC@////8ABIFIFJGFFGHIFGFFEGGGDCDDHHIHEFADDCDCDBAA@=>?>@GIFFEB@0/'(''&&$$$%&)&%$$##	AS:i:960	cm:i:92	de:f:0	ms:i:960	nn:i:0	s1:i:478	s2:i:474	tp:A:P	MM:Z:C+m?,2,1,88,41,13;	ML:B:C,1,2,255,2,4	qs:i:18	mx:i:2	ch:i:2213	rn:i:3733	st:Z:2022-08-18T20:18:39Z	f5:Z:PAM87936_2807d237_503.fast5	ns:i:8675	ts:i:0	MD:Z:480	NM:i:0	RG:Z:2807d2374ca5985b978a023fba4f92836a5ce559_dna_r10.4.1_e8.2_sup@v3.5.1
4 | 


--------------------------------------------------------------------------------
/test/data/bc0.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bc0.fastq.gz


--------------------------------------------------------------------------------
/test/data/bc1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bc1.fastq.gz


--------------------------------------------------------------------------------
/test/data/bc2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bc2.fastq.gz


--------------------------------------------------------------------------------
/test/data/bcEmpty.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bcEmpty.fastq.gz


--------------------------------------------------------------------------------
/test/data/bcMangled.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bcMangled.fastq.gz


--------------------------------------------------------------------------------
/test/data/samtoolsfastq.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/samtoolsfastq.fastq.gz


--------------------------------------------------------------------------------
/test/fastcat_expected_results/concat.reheader.sorted.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/fastcat_expected_results/concat.reheader.sorted.fastq.gz


--------------------------------------------------------------------------------
/test/fastcat_expected_results/concat.sorted.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/fastcat_expected_results/concat.sorted.fastq.gz


--------------------------------------------------------------------------------
/test/fastcat_expected_results/per-file-stats.tsv:
--------------------------------------------------------------------------------
1 | filename	sample_name	n_seqs	n_bases	min_length	max_length	mean_quality
2 | ../data/bc1.fastq.gz	sample	10	5904	281	878	12.86
3 | ../data/bc0.fastq.gz	sample	10	4599	191	965	9.86
4 | ../data/bc2.fastq.gz	sample	10	5812	259	990	12.19
5 | ../data/bcEmpty.fastq.gz	sample	10	4599	191	965	9.86
6 | ../data/bcMangled.fastq.gz	sample	20	9990	470	517	13.27
7 | ../data/samtoolsfastq.fastq.gz	sample	20	261268	247	43346	18.90
8 | 


--------------------------------------------------------------------------------
/test/fastcat_expected_results/per-read-stats.tsv:
--------------------------------------------------------------------------------
 1 | read_id	filename	runid	sample_name	read_length	mean_quality	channel	read_number	start_time
 2 | 959b96ca-5864-427a-ab22-53154ecb3511	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	672	12.98	81	15	2021-04-20T17:00:40Z
 3 | 28f9c419-4d76-462e-9151-a746969ab2f2	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	694	10.05	412	19	2021-04-20T17:00:41Z
 4 | cad9e07d-b2b2-47ad-a51c-2b5727a88552	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	562	12.94	363	18	2021-04-20T17:00:41Z
 5 | 4916f252-0f9d-428d-94af-69b42e4734aa	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	283	8.91	210	33	2021-04-20T17:00:42Z
 6 | 3533a366-8bdd-4a12-82ec-280562b04527	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	852	14.23	222	26	2021-04-20T17:00:42Z
 7 | 533ce526-194b-43af-91e8-13c6e95a7645	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	449	15.32	396	35	2021-04-20T17:00:43Z
 8 | 450143b6-075b-4bd7-8585-cc9ba8128c02	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	531	14.09	503	61	2021-04-20T17:00:44Z
 9 | efe4925d-b0b4-4939-b3cb-7af80a70c036	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	281	12.37	130	40	2021-04-20T17:00:45Z
10 | 2b1179cc-be33-4d83-b902-7ccdf9c03f38	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	702	13.34	328	78	2021-04-20T17:00:48Z
11 | b0de027e-fc5f-481d-a4e6-3a1b8521d39c	../data/bc1.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	878	14.34	185	69	2021-04-20T17:00:48Z
12 | 32e13a1c-4171-4706-b6ce-a32c0f65fa16	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	326	12.31	282	9	2021-04-20T17:00:40Z
13 | b87f011e-b802-4993-8f56-fd240b2e784f	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	407	9.13	213	19	2021-04-20T17:00:41Z
14 | 6f64aedb-bb8e-4777-b494-43e661841e06	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	355	9.98	67	13	2021-04-20T17:00:41Z
15 | c372fb2c-dd45-4feb-81b2-c167c3d1ce93	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	317	8.14	337	18	2021-04-20T17:00:41Z
16 | 18d04e8d-2816-4986-8e1b-e5be676837fc	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	965	7.57	507	18	2021-04-20T17:00:41Z
17 | aa81ca34-9310-42fd-9893-33112e283acc	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	691	12.05	244	19	2021-04-20T17:00:41Z
18 | c746fb2f-78f6-4a0a-9c75-39465c855c8d	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	191	8.99	379	35	2021-04-20T17:00:42Z
19 | 99a108d2-8e72-42bf-bebf-ad8373cfe450	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	456	8.61	177	38	2021-04-20T17:00:42Z
20 | 5d01447f-f17b-4acb-b87e-d60d8aeeccc8	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	659	12.16	417	21	2021-04-20T17:00:41Z
21 | b0279f8e-e988-44c5-895f-201b68217623	../data/bc0.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	232	9.62	435	32	2021-04-20T17:00:43Z
22 | 1d0fded8-95e7-4b53-858f-8391da5d6537	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	259	7.77	190	13	2021-04-20T17:00:40Z
23 | 0e9a279b-b513-4ecf-96d9-7403749cb7e8	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	772	10.60	216	41	2021-04-20T17:00:43Z
24 | 50664f3b-7ba8-4d9e-b710-8f14cbec660f	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	485	12.28	143	65	2021-04-20T17:00:45Z
25 | 6a9931a5-783b-4ffa-9a98-c9e3bd8c985e	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	357	13.76	236	32	2021-04-20T17:00:45Z
26 | 09ab577f-c1d9-4fad-b08a-67de01d26b12	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	728	13.02	469	54	2021-04-20T17:00:45Z
27 | a243792f-4322-4b22-b99f-fb5163642791	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	990	12.54	324	48	2021-04-20T17:00:45Z
28 | c1ee6f3c-6c2e-4701-a8e9-567557463305	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	632	13.21	62	48	2021-04-20T17:00:46Z
29 | ed89256b-9cce-45b4-866d-c7033fc6f901	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	367	12.43	279	82	2021-04-20T17:00:46Z
30 | 3124fa05-5b9a-4b78-ab24-168c4110ee8b	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	712	15.69	442	67	2021-04-20T17:00:46Z
31 | 8e042016-4de9-4d95-ae78-d671855c9416	../data/bc2.fastq.gz	5a21d8a6996146deceeaea3784244c52741cae93	sample	510	10.63	269	78	2021-04-20T17:00:47Z
32 | 32e13a1c-4171-4706-b6ce-a32c0f65fa16	../data/bcEmpty.fastq.gz		sample	326	12.31	0	0	
33 | b87f011e-b802-4993-8f56-fd240b2e784f	../data/bcEmpty.fastq.gz		sample	407	9.13	0	0	
34 | 6f64aedb-bb8e-4777-b494-43e661841e06	../data/bcEmpty.fastq.gz		sample	355	9.98	0	0	
35 | c372fb2c-dd45-4feb-81b2-c167c3d1ce93	../data/bcEmpty.fastq.gz		sample	317	8.14	0	0	
36 | 18d04e8d-2816-4986-8e1b-e5be676837fc	../data/bcEmpty.fastq.gz		sample	965	7.57	0	0	
37 | aa81ca34-9310-42fd-9893-33112e283acc	../data/bcEmpty.fastq.gz		sample	691	12.05	0	0	
38 | c746fb2f-78f6-4a0a-9c75-39465c855c8d	../data/bcEmpty.fastq.gz		sample	191	8.99	0	0	
39 | 99a108d2-8e72-42bf-bebf-ad8373cfe450	../data/bcEmpty.fastq.gz		sample	456	8.61	0	0	
40 | 5d01447f-f17b-4acb-b87e-d60d8aeeccc8	../data/bcEmpty.fastq.gz		sample	659	12.16	0	0	
41 | b0279f8e-e988-44c5-895f-201b68217623	../data/bcEmpty.fastq.gz		sample	232	9.62	0	0	
42 | SRR12480552.4	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	489	13.57	0	0	
43 | SRR12480552.12	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	470	10.86	0	0	
44 | SRR12480552.6	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	499	12.98	0	0	
45 | SRR12480552.25	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	517	15.36	0	0	
46 | SRR12480552.30	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	486	13.94	0	0	
47 | SRR12480552.33	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	502	11.66	0	0	
48 | SRR12480552.48	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	512	14.60	0	0	
49 | SRR12480552.69	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	493	14.11	0	0	
50 | SRR12480552.75	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	515	10.33	0	0	
51 | SRR12480552.13	../data/bcMangled.fastq.gz	0000000000000000000000000000000000000000	sample	512	15.31	0	0	
52 | SRR12480552.4	../data/bcMangled.fastq.gz		sample	489	13.57	0	0	
53 | SRR12480552.12	../data/bcMangled.fastq.gz		sample	470	10.86	0	0	
54 | SRR12480552.6	../data/bcMangled.fastq.gz		sample	499	12.98	0	0	
55 | SRR12480552.25	../data/bcMangled.fastq.gz		sample	517	15.36	0	0	
56 | SRR12480552.30	../data/bcMangled.fastq.gz		sample	486	13.94	0	0	
57 | SRR12480552.33	../data/bcMangled.fastq.gz		sample	502	11.66	0	0	
58 | SRR12480552.48	../data/bcMangled.fastq.gz		sample	512	14.60	0	0	
59 | SRR12480552.69	../data/bcMangled.fastq.gz		sample	493	14.11	0	0	
60 | SRR12480552.75	../data/bcMangled.fastq.gz		sample	515	10.33	0	0	
61 | SRR12480552.13	../data/bcMangled.fastq.gz		sample	512	15.32	0	0	
62 | 02f8ea0a-48a3-43e3-b3b4-4832304a1d63	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	874	18.23	388	6040	2022-09-27T16:54:30.522+00:00
63 | 049cbf74-5247-4924-9d40-8b13e06d2ce1	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	382	18.45	68	6052	2022-09-27T16:53:36.712+00:00
64 | 0a43aaf1-eb93-4546-b7c2-26d6cad0495a	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	1086	17.68	1311	3548	2022-09-27T16:54:04.381+00:00
65 | 0d6b275d-0988-415e-9e52-584605eac9c7	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	1212	18.00	359	3193	2022-09-27T16:54:31.133+00:00
66 | 06d1b46a-f572-4322-bb4b-125cc04356da	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	567	15.03	329	3280	2022-09-27T16:55:12.575+00:00
67 | 00daaf65-afcc-40cb-9b19-aa2f08431e58	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	1788	20.12	94	5128	2022-09-27T16:54:04.401+00:00
68 | 07b41e60-d563-48e3-add7-c8df26e1b05f	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	528	16.09	76	3965	2022-09-27T16:53:47.919+00:00
69 | 07edc020-dc45-41ca-b3e9-d604a583f2b3	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	625	21.26	98	4130	2022-09-27T16:54:10.177+00:00
70 | 0f885800-dd67-436b-8388-f1eb11c08283	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	441	19.59	552	5436	2022-09-27T16:54:05.459+00:00
71 | 005af53d-c9bc-4036-ba0e-75d0a65915e2	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	15165	17.84	374	4128	2022-09-27T16:53:06.188+00:00
72 | 001a54f0-480c-4921-a9ac-bb56ac265b53	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	31127	14.62	431	1364	2022-09-27T16:53:46.787+00:00
73 | 0006f7f5-10ec-46c5-826a-e29ccbabd157	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	43346	24.49	421	3305	2022-09-27T16:53:08.748+00:00
74 | 00f5b0a4-9bcc-4d3b-b28c-82ee8a225d96	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	25675	20.28	2666	5266	2022-09-27T16:52:52.092+00:00
75 | 0137de1f-d6ca-4ff2-a057-4a945931e65b	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	3766	22.28	1407	12552	2022-09-27T16:54:20.749+00:00
76 | 01141689-0761-4a0c-9d4e-0e52321d30e3	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	21861	22.55	340	3399	2022-09-27T16:53:32.512+00:00
77 | 007ecbd6-47dc-46b9-9a65-b73e0dbbc7fd	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	42473	22.32	9	1128	2022-09-27T16:53:22.726+00:00
78 | 007f0041-5024-4d59-8ed6-c3871842f74f	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	39158	15.30	147	5862	2022-09-27T16:53:03.892+00:00
79 | 01b3eaa4-c93b-4f48-948f-ef2974ade8d5	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	247	12.14	86	4014	2022-09-27T16:53:50.280+00:00
80 | 0154f130-64f2-4cd4-8f81-8a9d9c3c9c33	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	12531	20.13	415	1830	2022-09-27T16:53:55.209+00:00
81 | 014199a8-94ce-4d85-b5dd-b922e37d0fab	../data/samtoolsfastq.fastq.gz	ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c	sample	18416	21.62	576	3293	2022-09-27T16:52:39.735+00:00
82 | 


--------------------------------------------------------------------------------
/test/parse_rd/RD-first-tag-and-no-RG-CW-4285.fastq:
--------------------------------------------------------------------------------
1 | @read-id	RD:Z:dummy_run_id	rn:i:1234
2 | AAAAAAAAAAAAAAAAA
3 | +
4 | AAAAAAAAAAAAAAAAA
5 | 


--------------------------------------------------------------------------------
/test/parse_rd/RD-first-tag-and-no-RG-CW-4285.fastq.runids:
--------------------------------------------------------------------------------
1 | filename	run_id	count
2 | ../parse_rd/RD-first-tag-and-no-RG-CW-4285.fastq	dummy_run_id	1
3 | 


--------------------------------------------------------------------------------
/test/parse_rd/empty-RD-CW-4299.fastq:
--------------------------------------------------------------------------------
1 | @9ae4818c-61e9-4011-bf40-ca5721922da4	RD:Z:
2 | AAAAAAAAAAAAAAAAAA
3 | +
4 | AAAAAAAAAAAAAAAAAA
5 | 


--------------------------------------------------------------------------------
/test/parse_rd/empty-RD-CW-4299.fastq.runids:
--------------------------------------------------------------------------------
1 | filename	run_id	count
2 | ../parse_rd/empty-RD-CW-4299.fastq		1
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/bad-ones.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/bad-ones.bam


--------------------------------------------------------------------------------
/test/parse_rg/bad-ones.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/bad-ones.bam.bai


--------------------------------------------------------------------------------
/test/parse_rg/bad-ones.bam.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/bad-ones.bam		100
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.bai:
--------------------------------------------------------------------------------
1 | BAI           


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam	dna_r10.4.1_e8.2_400bps_hac@v4.3.0	21
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz	dna_r10.4.1_e8.2_400bps_hac@v4.3.0	21
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.bai:
--------------------------------------------------------------------------------
1 | BAI    3       


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	21
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	21
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.bai:
--------------------------------------------------------------------------------
1 | BAI           


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	21
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	21
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.bai:
--------------------------------------------------------------------------------
1 | BAI    }      


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	1661
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz


--------------------------------------------------------------------------------
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	1661
3 | 


--------------------------------------------------------------------------------
/test/parse_rg/mixed.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/mixed.bam


--------------------------------------------------------------------------------
/test/parse_rg/mixed.bam.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/mixed.bam	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	1703
3 | ../parse_rg/mixed.bam	dna_r10.4.1_e8.2_400bps_hac@v4.3.0	21
4 | 


--------------------------------------------------------------------------------
/test/parse_rg/mixed.bam.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/mixed.bam.fastq.gz


--------------------------------------------------------------------------------
/test/parse_rg/mixed.bam.fastq.gz.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/mixed.bam.fastq.gz	dna_r10.4.1_e8.2_400bps_hac@v5.0.0	1703
3 | ../parse_rg/mixed.bam.fastq.gz	dna_r10.4.1_e8.2_400bps_hac@v4.3.0	21
4 | 


--------------------------------------------------------------------------------
/test/parse_rg/mixed_basecaller_model_key.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/mixed_basecaller_model_key.fastq.gz


--------------------------------------------------------------------------------
/test/parse_rg/mixed_basecaller_model_key.fastq.gz.callers:
--------------------------------------------------------------------------------
1 | filename	basecaller	count
2 | ../parse_rg/mixed_basecaller_model_key.fastq.gz	dna_r10.4.1_e8.2_400bps_sup@2023-12-15_episeq-cs	12
3 | 


--------------------------------------------------------------------------------
/test/rg_parse.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | 
  5 | 
  6 | #include "../src/common.h"
  7 | 
  8 | typedef struct {  /* Parts of one read group ID, doubling as the expected parse results; NULL = part omitted. */
  9 |     char* runid;           /* copied first, unconditionally; compared with info->runid */
 10 |     char* basecall_model;  /* appended after "_" if non-NULL; compared with info->basecaller */
 11 |     char* mod_model;       /* appended after "_" if non-NULL; compared with info->modcaller */
 12 |     char* barcode;         /* appended after "_" if non-NULL; compared with info->barcode */
 13 |     char* suffix;          /* appended with no separator if non-NULL; never compared */
 14 | } TestCase;
 15 | 
 16 | 
 17 | int compare(const char* str1, const char* str2) {
 18 |     /* Returns 0 when both are NULL or hold equal text, non-zero otherwise. */
 19 |     if (str1 == NULL || str2 == NULL) {
 20 |         /* A missing string only matches another missing string. */
 21 |         return str1 != str2;
 22 |     }
 23 |     /* Bounding strncmp by the longer length is plain strcmp, minus the macro. */
 24 |     return strcmp(str1, str2);
 25 | }
 26 | 
 27 | 
 28 | int main() {
 29 |     char *runid_acquisition = "ef1af1ab8967cb20ca30dbeca93fd66592bf4619";
 30 |     char *runid_protocol = "c886531d-28f5-41f6-b948-948e8cb78e5e";
 31 |     char *basecall_model = "basecall_model_name@v1.2.3";
 32 |     char *mod_model_name = "basecall_model_name@v1.2.3_5mCG_5hmCG@v1";
 33 |     char *barcode = "barcode01";
 34 |     char *suffix = "-1A2B3C4D";
 35 | 
 36 |     TestCase cases[] = {
 37 |         {runid_acquisition, basecall_model, mod_model_name, barcode, suffix},
 38 |         {runid_acquisition, basecall_model, mod_model_name, barcode, NULL},
 39 |         {runid_acquisition, basecall_model, mod_model_name, NULL, suffix},
 40 |         {runid_acquisition, basecall_model, mod_model_name, NULL, NULL},
 41 |         {runid_acquisition, basecall_model, NULL, barcode, suffix},
 42 |         {runid_acquisition, basecall_model, NULL, barcode, NULL},
 43 |         {runid_acquisition, basecall_model, NULL, NULL, suffix},
 44 |         {runid_acquisition, basecall_model, NULL, NULL, NULL},
 45 |         {runid_protocol, basecall_model, mod_model_name, barcode, suffix},
 46 |         {runid_protocol, basecall_model, mod_model_name, barcode, NULL},
 47 |         {runid_protocol, basecall_model, mod_model_name, NULL, suffix},
 48 |         {runid_protocol, basecall_model, mod_model_name, NULL, NULL},
 49 |         {runid_protocol, basecall_model, NULL, barcode, suffix},
 50 |         {runid_protocol, basecall_model, NULL, barcode, NULL},
 51 |         {runid_protocol, basecall_model, NULL, NULL, suffix},
 52 |         {runid_protocol, basecall_model, NULL, NULL, NULL},
 53 |     };
 54 | 
 55 |     int fails = 0;
 56 |     for (int i = 0; i < sizeof(cases)/sizeof(TestCase); i++) {
 57 | 
 58 |         char* read_group = calloc(400, sizeof(char));
 59 |         read_group = strcpy(read_group, cases[i].runid);
 60 |         if (cases[i].basecall_model != NULL) {
 61 |             read_group = strcat(read_group, "_");
 62 |             read_group = strcat(read_group, cases[i].basecall_model);
 63 |         }
 64 |         if (cases[i].mod_model != NULL) {
 65 |             read_group = strcat(read_group, "_");
 66 |             read_group = strcat(read_group, cases[i].mod_model);
 67 |         }
 68 |         if (cases[i].barcode != NULL) {
 69 |             read_group = strcat(read_group, "_");
 70 |             read_group = strcat(read_group, cases[i].barcode);
 71 |         }
 72 |         if (cases[i].suffix != NULL) {
 73 |             read_group = strcat(read_group, cases[i].suffix);
 74 |         }
 75 |         printf("Test case %d: %s\n", i, read_group);
 76 | 
 77 |         readgroup* info = create_rg_info(read_group);
 78 | 
 79 |         int fail = 0;
 80 |         fail += compare(info->runid, cases[i].runid) != 0;
 81 |         fail += compare(info->basecaller, cases[i].basecall_model) != 0;
 82 |         fail += compare(info->modcaller, cases[i].mod_model) != 0;
 83 |         fail += compare(info->barcode, cases[i].barcode) != 0;
 84 | 
 85 |         if (fail) {
 86 |             fails++;
 87 |             printf("  Failed\n");
 88 |             printf("    Expected: %s %s %s %s\n", cases[i].runid, cases[i].basecall_model, cases[i].mod_model, cases[i].barcode);
 89 |             printf("         Got: %s %s %s %s\n", info->runid, info->basecaller, info->modcaller, info->barcode);
 90 |         }
 91 | 
 92 |         free(info);
 93 |         free(read_group);
 94 |         printf("\n");
 95 |     }
 96 | 
 97 |     if (fails == 0) {
 98 |         printf("All tests passed\n");
 99 |     } else {
100 |         printf("%d tests failed\n", fails);
101 |     }
102 |     return fails != 0;
103 | }
104 | 


--------------------------------------------------------------------------------
/test/sort-sam.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Reads SAM from stdin and writes to STDOUT, sorting auxiliary tags by key."""
 3 | import sys
 4 | import re
 5 | 
 6 | if __name__ == "__main__":
 7 |     for line in sys.stdin:
 8 |         if line.startswith("@"):
 9 |             continue
10 |         else:
11 |             fields = line.strip().split("\t")
12 |             if len(fields) < 11:
13 |                 continue
14 |             core_fields = fields[:11]
15 |             aux_fields = fields[11:]
16 |             aux_fields.sort()
17 |             print("\t".join(core_fields + aux_fields))
18 | 


--------------------------------------------------------------------------------