├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── conda ├── build.sh └── meta.yaml ├── src ├── bamindex │ ├── build_main.c │ ├── dump_main.c │ ├── fetch_main.c │ ├── index.c │ ├── index.h │ └── main.c ├── bamstats │ ├── args.c │ ├── args.h │ ├── bamiter.c │ ├── bamiter.h │ ├── main.c │ ├── readstats.c │ └── readstats.h ├── common.c ├── common.h ├── fastcat │ ├── args.c │ ├── args.h │ ├── main.c │ ├── writer.c │ └── writer.h ├── fastqcomments.c ├── fastqcomments.h ├── hts_defs.h ├── kh_counter.c ├── kh_counter.h ├── regiter.c ├── regiter.h ├── stats.c ├── stats.h ├── version.c └── version.h └── test ├── bamindex └── 400.bam ├── bamstats ├── 310dx.bam ├── 310dx.bam.bai ├── 400ecoli-with-qcfail.bam ├── 400ecoli.bam ├── 400ecoli.bam.bai ├── RCS-100A.bam ├── RCS-100A.bam.bai └── RCS-100A.bam.polya.hist ├── bamstats_badNM └── test.sam ├── bamstats_zeroNM └── test.sam ├── data ├── bc0.fastq.gz ├── bc1.fastq.gz ├── bc2.fastq.gz ├── bcEmpty.fastq.gz ├── bcMangled.fastq.gz └── samtoolsfastq.fastq.gz ├── fastcat_expected_results ├── concat.reheader.sorted.fastq.gz ├── concat.sorted.fastq.gz ├── per-file-stats.tsv └── per-read-stats.tsv ├── parse_rd ├── RD-first-tag-and-no-RG-CW-4285.fastq ├── RD-first-tag-and-no-RG-CW-4285.fastq.runids ├── empty-RD-CW-4299.fastq └── empty-RD-CW-4299.fastq.runids ├── parse_rg ├── bad-ones.bam ├── bad-ones.bam.bai ├── bad-ones.bam.callers ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.bai ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.callers ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz ├── dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz.callers ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.bai ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.callers ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz.callers ├── 
dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.bai ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.callers ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz.callers ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.bai ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.callers ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz ├── dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz.callers ├── mixed.bam ├── mixed.bam.callers ├── mixed.bam.fastq.gz ├── mixed.bam.fastq.gz.callers ├── mixed_basecaller_model_key.fastq.gz └── mixed_basecaller_model_key.fastq.gz.callers ├── rg_parse.c ├── sam2fastq ├── wf_basecalling_demo.fastq └── wf_basecalling_demo.sam └── sort-sam.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | src/**/*.o 4 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | include: 2 | - project: "epi2melabs/ci-templates" 3 | file: "push-github.yaml" 4 | - project: "epi2melabs/ci-templates" 5 | file: "push-conda.yaml" 6 | - project: "epi2melabs/ci-templates" 7 | file: "snippets.yaml" 8 | 9 | image: ${UBUNTUIMAGE}:20.04 10 | 11 | variables: 12 | GIT_SUBMODULE_STRATEGY: recursive 13 | 14 | 15 | .prep-image: &prep-image | 16 | DEBIAN_FRONTEND=noninteractive 17 | apt update -qq 18 | apt install -y --no-install-recommends gcc autoconf automake valgrind make curl wget zlib1g-dev libbz2-dev libreadline-dev libssl-dev libffi-dev liblzma-dev libcurl4-gnutls-dev 19 | 20 | stages: 21 | - test 22 | - prerelease 23 | - release 24 | 25 | build: 26 | stage: test 27 | before_script: 28 | - *prep-image 29 | artifacts: 30 | when: always 31 | 
paths: 32 | - test/* 33 | script: 34 | - make 35 | - ./fastcat --help 36 | - ./bamstats --help 37 | - ./bamindex build --help 38 | - PEPPER=1 make test 39 | 40 | deploy-checks: 41 | stage: prerelease 42 | script: 43 | - !reference [.check, argp-c-version] 44 | - !reference [.check, changelog] 45 | - export LICENSE_FILE="BSD-4-CLAUSE" 46 | - !reference [.check, license] 47 | rules: 48 | - if: '$CI_COMMIT_TAG =~ /^v[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$/' 49 | 50 | 51 | .before-script: &before-script | 52 | export CONDA_PKG=${CI_PROJECT_NAME} 53 | export CONDA_PKG_VERSION=${CI_COMMIT_TAG/v/} 54 | cd conda 55 | 56 | conda: 57 | extends: .deploy-conda 58 | before_script: 59 | - *prep-image 60 | - *before-script 61 | 62 | conda-arm: 63 | extends: .deploy-conda-linux-arm 64 | before_script: 65 | - *prep-image 66 | - *before-script 67 | 68 | conda-mac: 69 | extends: .deploy-conda-mac 70 | before_script: 71 | - *before-script 72 | 73 | conda-mac-arm: 74 | extends: .deploy-conda-mac-arm 75 | before_script: 76 | - *before-script 77 | 78 | test-conda-arm: 79 | stage: test 80 | extends: .deploy-conda-linux-arm 81 | variables: 82 | UPLOAD: "no" 83 | before_script: 84 | - *prep-image 85 | - *before-script 86 | rules: 87 | - if: $CI_COMMIT_BRANCH != null 88 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "htslib"] 2 | path = htslib 3 | url = https://github.com/samtools/htslib.git 4 | [submodule "zlib-ng"] 5 | path = zlib-ng 6 | url = https://github.com/zlib-ng/zlib-ng.git 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 
3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v0.22.0] 8 | ### Changed 9 | - Bumped htslib to 1.21. 10 | ### Added 11 | - `fastcat` now has a verbose option. Logging of filepaths processed is suppressed without `--verbose`. 12 | 13 | ## [v0.21.0] 14 | ### Added 15 | - `fastcat` can now optionally output BAM files, carrying across meta information into tags. 16 | 17 | ## [v0.20.0] 18 | ### Changed 19 | - Tightened up checks on provided regions. 20 | - Histogram files were not written when input was empty, they now are restoring previous behaviour. 21 | ### Fixed 22 | - Read mean quality was not recomputed when qs tag was not present in BAM. 23 | 24 | ## [v0.19.1] 25 | ### Fixed 26 | - Compilation on macOS with clang. 27 | - Segmentation fault with bad read group information. 28 | 29 | ## [v0.19.0] 30 | ### Added 31 | - Regions can now be provided in a three (or more) column BED file to `bamstats` to calculate per-region statistics. 32 | ### Fixed 33 | - A segmentation fault whilst writing statistics files when demultiplexing is enabled. 34 | - A memory leak occurring in `bamstats` when qcfail reads are encountered in the input. 35 | ### Changed 36 | - Retrieve mean quality score 'qs' tag from BAM record, rather than recomputing. Previous behaviour can be restored with the `--recalc_qual` option. 37 | - Get basecall model version id from `model_version_id` in FASTQ header (in addition to `basecaller_model_version_id`) 38 | - Parse required bam tags in one pass rather than piecemeal as required. 
39 | 40 | ## [v0.18.6] 41 | ### Changed 42 | - Ill advised parsing of RG ID field has been extended to additionally allow for protocol_run_id style (uuid) Run IDs, as well as standalone acquisition_id (sha1) Run IDs 43 | 44 | ## [v0.18.5] 45 | ### Changed 46 | - Parsing of RG ID field containing a modified base model, now returns only the core basecaller model. 47 | ### Fixed 48 | - Workaround samtools "bug" where RG ID suffix is not fixed width. 49 | 50 | ## [v0.18.4] 51 | ### Fixed 52 | - Segfault on SAM-style tags without values in the FASTQ header. 53 | 54 | ## [v0.18.3] 55 | ### Fixed 56 | - Bug causing segfault on unlikely RG SAM tags in FASTQ header comments. 57 | - SAM parsing of FASTQ header not enabled if only either of RG or RD tag is present and at the beginning of the header comment. 58 | 59 | ## [v0.18.2] 60 | ### Fixed 61 | - 'run_id' instead of 'basecaller' as column name in bamstats basecaller summary output header line. 62 | 63 | ## [v0.18.1] 64 | ### Fixed 65 | - 'run_id' instead of 'basecaller' as column name in basecaller summary output header line. 66 | - `(null)` in FASTQ header comments when run with `-H` on files that had `basecall_model_version_id=...` as only header comment. 67 | 68 | ## [v0.18.0] 69 | ### Added 70 | - Basecaller summary information similar to runid summary. 71 | - RNA poly-A tail length histogram output. 72 | ### Fixed 73 | - Random output for runid when not found in header. 74 | 75 | ## [v0.17.1] 76 | ### Added 77 | - `--runids` option to `bamstats` for enumerating detected run identifiers. 78 | 79 | ## [v0.17.0] 80 | ### Added 81 | - `--reads_per_file` option can split inputs into batched files when demultiplexing. Users should use Unix `split` with piped output. 82 | - `--runids` option to output a file enumerating detected run identifiers. 83 | ### Changed 84 | - Per-file read statistics now relate to filtered reads only. 85 | - Link `fastcat` against zlib-ng for an even faster cat. 
86 | 87 | ## [v0.16.8] 88 | ### Changed 89 | - `fastcat` reverts to using a space separator (introduced in v0.16.0) between the Read ID and comment when outputting FASTQ comments that are not SAM tags 90 | 91 | ## [v0.16.7] 92 | ### Fixed 93 | - Modification of BAM record with strtok when inferring Run ID from RG aux tag causing missing NM tag 94 | 95 | ## [v0.16.6] 96 | ### Fixed 97 | - Additional spurious "contains non-integer 'NM' tag type" errors by checking EINVAL only when NM appears to be zero, and clearing errno first 98 | 99 | ## [v0.16.5] 100 | ### Fixed 101 | - Spurious "contains non-integer 'NM' tag type" errors by checking EINVAL only when NM appears to be zero 102 | 103 | ## [v0.16.4] 104 | ### Added 105 | - `bamstats` now saves histograms for unmapped reads when `--unmapped` is provided. 106 | 107 | ## [v0.16.3] 108 | ### Fixed 109 | - Incorrect sanity check of NM. 110 | 111 | ## [v0.16.2] 112 | ### Fixed 113 | - Prevent reads with implausible NM tag leading to illegal memory access in add_qual_count 114 | 115 | ## [v0.16.1] 116 | ### Changed 117 | - Extended FASTQ SAM tag parsing to comment lines that include the RD tag (as well as RG). 118 | 119 | ## [v0.16.0] 120 | ### Added 121 | - Support for reading SAM tags from FASTQ headers. 122 | ### Changed 123 | - `fastcat` will output a tab between the Read ID and the SAM tags rather than a space to match samtools convention. 124 | - `bamstats` uses `bam_get_tag_caseinsensitive` wrapper to get SAM tags with case insensitivity. 125 | - `fastcat` and `bamstats` will infer a Run ID from the `RG` tag if `RD` is not available. 126 | - Bumped version of htslib used to 1.19. 127 | ### Fixed 128 | - Incorrectly capitalised ONT SAM tags are now output in lowercase by fastcat: `ch`, `rn`, `st`. 129 | 130 | ## [v0.15.2] 131 | ### Fixed 132 | - Duplicated recipe name in Makefile. 133 | 134 | ### Added 135 | - Section explaining `bamstats` output columns to README. 
136 | 137 | ## [v0.15.1] 138 | ### Fixed 139 | - Decimal precision of histogram outputs. 140 | 141 | ## [v0.15.0] 142 | ### Added 143 | - Calculation of read length and quality histograms to `fastcat` and `bamstats`. 144 | - Calculation of alignment accuracy and alignment read coverage to `bamstats`. 145 | 146 | ## [v0.14.1] 147 | ### Fixed 148 | - Missing compilation of conda aarch64 package 149 | 150 | ## [v0.14.0] 151 | ### Added 152 | - `bamstats --duplex` option allows to count the number of duplex reads and 153 | duplex-forming reads. 154 | 155 | ## [v0.13.2] 156 | ### Fixed 157 | - Bug writing long reads to demultiplexed gzipped outputs. 158 | 159 | ## [v0.13.1] 160 | ### Fixed 161 | - Bug writing `UINTMAX_MAX` for `min_length` and `nan` for `mean_quality` of a 162 | file in fastcat per-file stats if there were no reads in that file. 163 | 164 | ## [v0.13.0] 165 | ### Added 166 | - Column with start time from MinKNOW header to `bamstats` output. 167 | 168 | ### Changed 169 | - `bamstats` now prints `mean_quality`, `iden`, and `acc` values with 2 decimal 170 | places instead of 3 (the reason being that `fastcat` already uses 2 decimal 171 | places for `mean_quality` and more precision is unnecessary). 172 | 173 | ## [v0.12.0] 174 | ### Added 175 | - Column with run ID from MinKNOW header to `fastcat` per-read stats and 176 | `bamstats` output. 177 | 178 | ## [v0.11.2] 179 | ### Changed 180 | - Reverted the change of the default value of the `start_time` field to an empty 181 | string (it had been set to `"2000-01-01T00:00:00Z"` in v0.11.1). 182 | 183 | ## [v0.11.1] 184 | ### Fixed 185 | - Bug in `fastcat` per-read summary stats. 186 | 187 | ## [v0.11.0] 188 | ### Changed 189 | - Bamstats can now be run without a BAM index. 190 | - `fastcat -H` now wraps all known header fields into SAM tags regardless of 191 | whether the header was "valid" (i.e. all expected fields were present) or not. 
192 | 193 | ## [v0.10.2] 194 | ### Added 195 | - Linux and macOS ARM conda packages. 196 | 197 | ## [v0.10.1] 198 | ### Fixed 199 | - bamindex program missing from conda package. 200 | 201 | ## [v0.10.0] 202 | ### Added 203 | - Create bamindex program to index unaligned BAMs for horizontal-parallel processing. 204 | 205 | ## [v0.9.0] 206 | ### Fixed 207 | - Ensure reheadered fastq is indeed formatted as a valid SAM tag(s). 208 | 209 | ## [v0.8.0] 210 | ### Added 211 | - Option to bamstats to add 'sample_name' column equivalent to fastcat. 212 | 213 | ## [v0.7.0] 214 | ### Added 215 | - Option to report unmapped alignments in per read and summary files. 216 | 217 | ## [v0.6.1] 218 | ### Fixed 219 | - Min read length in per-file statistics. 220 | 221 | ## [v0.6.0] 222 | ### Added 223 | - `mean_quality` column to bamstats output, equivalent to that from fastcat. 224 | - optional per-reference summary file for bamstats similar to samtools flagstats. 225 | 226 | ## [v0.5.0] 227 | ### Changed 228 | - Behaviour of `-x/--recurse`. Top-level directory input will always be searched for 229 | data. Turning on recursion now exclusively refers to descending into child (and 230 | subsequent) directories. 231 | 232 | ## [v0.4.12] 233 | ### Fixed 234 | - Updated kseq.h to allow exit on broken fastq/a stream. 235 | 236 | ## [v0.4.11] 237 | ### Changed 238 | - `fastcat` will exit non-zero if an input file (named or recursed) cannot be opened 239 | 240 | ## [v0.4.10] 241 | ### Fixed 242 | - Use of uninitialized memory in thread pool init, leading to memory leak. 243 | 244 | ## [v0.4.9] 245 | ### Fixed 246 | - Handle BAM_CEQUAL and BAM_CDIFF that some aligners like to use. 247 | 248 | ## [v0.4.8] 249 | ### Fixed 250 | - Doubled tab in output header. 251 | 252 | ## [v0.4.7] 253 | ### Changed 254 | - Build conda package using bioconda's htslib. 255 | ### Fixed 256 | - Occasional hanging on exit. 257 | 258 | ## [v0.4.6] 259 | ### Fixed 260 | - Missing tab character in output header. 
261 | 262 | ## [v0.4.5] 263 | ### Changed 264 | - Pin openssl version in conda build to that which works across Python versions. 265 | 266 | ## [v0.4.4] 267 | ### Fixed 268 | - Removed libdeflate from conda build which caused issues with threading. 269 | 270 | ## [v0.4.3] 271 | ### Changed 272 | - Only multithread BAM decompression. 273 | 274 | ## [v0.4.2] 275 | ### Added 276 | - Multithreading to `bamstats` for improved throughput. 277 | 278 | ## [v0.4.1] 279 | ### Changed 280 | - Improved performance of `bamstats` for many-target bams. 281 | 282 | ## [v0.4.0] 283 | ### Added 284 | - `bamstats` program for summarising (primary) alignment information. 285 | 286 | ## [v0.3.8] 287 | ### Fixed 288 | - Reformatted header tags were space separated, fixed to tab separated. 289 | 290 | ## [v0.3.7] 291 | ### Added 292 | - Option to reformat fastq headers as SAM-style tags for minimap2 passthrough. 293 | 294 | ## [v0.3.6] 295 | ### Fixed 296 | - Per-file summary file created with broken header. 297 | 298 | ## [v0.3.5] 299 | ### Fixed 300 | - Per-read summary file created incorrectly when `-s` option provided. 301 | 302 | ## [v0.3.4] 303 | ### Fixed 304 | - Program hang when directory input given without trailing `/`. 305 | 306 | ## [v0.3.3] 307 | ### Added 308 | - Transpose read number, channel, and start time from fastq headers to summary. 309 | ### Changed 310 | - Additional columns in per-read summary file as above. These will be present, 311 | regardless of whether header information is present or not. 312 | 313 | ## [v0.3.2] 314 | ### Fixed 315 | - Changed erroneously small MAX_BARCODE define; added runtime check to avoid 316 | invalid memory access. 317 | 318 | ## [v0.3.1] 319 | ### Fixed 320 | - Updated CI release scripts. 321 | 322 | ## [v0.3.0] 323 | ### Added 324 | - Parsing Guppy/MinKNOW fastq key=value header comments. 325 | - Ability to demultiplex inputs based on "barcode" key in headers. 
326 | ### Changed 327 | - Per-read and per-file summary files now optional. 328 | 329 | ## [v0.2.1] 330 | ### Added 331 | - Read length and read quality output filtering. 332 | ### Changed 333 | - Average qualities computed with Kahan summation. 334 | 335 | ## [v0.2.0] 336 | ### Fixed 337 | - Program hang when input file was non-existent or a directory. 338 | ### Added 339 | - Ability to traverse a directory input. 340 | 341 | ## [v0.1.0] 342 | ### Added 343 | - Ability to read input files from stdin. 344 | 345 | ## [v0.0.3] 346 | ### Changed 347 | - Moved output files to optional arguments. 348 | ### Added 349 | - `-s` option to add in a `sample_name` column to outputs. 350 | 351 | ## [v0.0.2] 352 | ### Changed 353 | - No end-user changes. 354 | 355 | ## [v0.0.1] 356 | ### Added 357 | - Per-read and per-file summarising of fastq data. 358 | 359 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020-, Oxford Nanopore Technologies Plc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | * All advertising materials mentioning features or use of this software must 14 | display the following acknowledgement: This product includes software 15 | developed by Oxford Nanopore Technologies Plc. 16 | 17 | * Neither the name of Oxford Nanopore Technologies Plc. 
nor the names of 18 | its contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY Oxford Nanopore Technologies Plc. AS IS AND ANY 22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL Oxford Nanopore Technologies Plc. BE LIABLE FOR ANY DIRECT, 25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 29 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 30 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OS := $(shell uname) 2 | ifeq ($(OS), Darwin) 3 | # mainly for dev builds using homebrew things 4 | EXTRA_LDFLAGS ?= -L$(shell brew --prefix openssl@1.1)/lib -L$(shell brew --prefix curl)/lib 5 | ARGP ?= $(shell brew --prefix argp-standalone)/lib/libargp.a 6 | ARGP_INC ?= -I$(shell brew --prefix argp-standalone)/include 7 | CFLAGS ?= -fpic -O3 ${ARGP_INC} 8 | ZCAT = "gzcat" 9 | else 10 | ARGP ?= 11 | ARGP_INC ?= 12 | CFLAGS ?= -fpic -msse3 -O3 ${ARGP_INC} 13 | ZCAT = "zcat" 14 | endif 15 | 16 | VALGRIND ?= valgrind 17 | 18 | CC ?= gcc 19 | STATIC_HTSLIB ?= htslib/libhts.a 20 | EXTRA_CFLAGS ?= 21 | EXTRA_LDFLAGS ?= 22 | EXTRA_LIBS ?= 23 | EXTRA_LIBS ?= 24 | HTS_CONF_ARGS ?= 25 | NOTHREADS ?= 26 | ifeq ($(NOTHREADS), 1) 27 | CFLAGS += -DNOTHREADS 28 | endif 29 | 30 | # we can't do pedantic because min/max macros lead to: 31 | # "ISO C forbids 
braced-groups within expressions [-Werror=pedantic]" 32 | ifeq ($(shell $(CC) --version | grep clang | wc -l), 0) 33 | WARNINGS = -Werror -Wall -Wextra -Wno-incompatible-pointer-types 34 | else 35 | WARNINGS = -Werror -Wall -Wextra -Wpedantic -Wno-language-extension-token -Wno-gnu-statement-expression -Wno-incompatible-function-pointer-types 36 | endif 37 | 38 | GRIND = $(VALGRIND) --error-exitcode=1 --tool=memcheck --leak-check=full --show-leak-kinds=all -s 39 | ifeq ($(OS), Darwin) 40 | GRIND = 41 | endif 42 | # optionally run all tests under valgrind 43 | ifeq ($(PEPPER), 1) 44 | PEPPER = $(GRIND) 45 | else 46 | PEPPER = 47 | endif 48 | 49 | # entry-point targets are declared phony so same-named paths (e.g. the test/ directory) cannot shadow them 50 | .PHONY: default 51 | default: fastcat bamstats bamindex 52 | 53 | .PHONY: test 54 | test: test_fastcat test_bamstats test_meta test_bamindex 55 | 56 | .PHONY: test_memory 57 | test_memory: mem_check_fastcat mem_check_bamstats mem_check_bamindex-build mem_check_bamindex-dump mem_check_bamindex-fetch 58 | 59 | .PHONY: clean 60 | clean: 61 | rm -rf fastcat bamstats bamindex src/fastcat/*.o src/bamstats/*.o src/bamindex/*.o src/*.o 62 | 63 | .PHONY: clean_htslib 64 | clean_htslib: 65 | cd htslib && make clean 66 | 67 | 68 | ### 69 | # build stages 70 | 71 | htslib/libhts.a: 72 | @echo Compiling $(@F) 73 | cd htslib/ \ 74 | && autoheader \ 75 | && autoconf \ 76 | && autoreconf --install \ 77 | && CFLAGS="$(CFLAGS) $(EXTRA_CFLAGS)" ./configure $(HTS_CONF_ARGS) \ 78 | && make -j 4 79 | 80 | # just for testing 81 | SAMVER=1.21 82 | samtools: 83 | curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; 84 | tar -xjf samtools-${SAMVER}.tar.bz2; 85 | rm samtools-${SAMVER}.tar.bz2 86 | cd samtools-${SAMVER} && make -j 4 87 | cp samtools-${SAMVER}/samtools $@ 88 | 89 | #TODO: for conda we could use zlib-ng from conda-forge 90 | 91 | zlib-ng/zlib.h: 92 | @echo Configuring zlib-ng 93 | cd zlib-ng/ \ 94 | && CFLAGS="$(CFLAGS) $(EXTRA_CFLAGS)" ./configure --zlib-compat 95 | 96 | zlib-ng/libz.a: zlib-ng/zlib.h 97 | @echo Compiling $(@F) 98 | cd 
zlib-ng/ \ 99 | && make -j 4 libz.a 100 | 101 | src/%.o: src/%.c zlib-ng/zlib.h 102 | $(CC) -Isrc -Ihtslib -Izlib-ng -c -pthread $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \ 103 | $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ 104 | 105 | fastcat: src/version.o src/fastcat/main.o src/fastcat/args.o src/fastcat/writer.o src/fastqcomments.o src/common.o src/stats.o src/kh_counter.o $(STATIC_HTSLIB) zlib-ng/libz.a 106 | $(CC) -Isrc -Izlib-ng $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \ 107 | $(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \ 108 | $^ $(ARGP) \ 109 | -lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \ 110 | -o $@ 111 | 112 | bamstats: src/version.o src/bamstats/main.o src/bamstats/args.o src/bamstats/readstats.o src/bamstats/bamiter.o src/fastqcomments.o src/common.o src/regiter.o src/stats.o src/kh_counter.o $(STATIC_HTSLIB) 113 | $(CC) -Isrc -Ihtslib $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \ 114 | $(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \ 115 | $^ $(ARGP) \ 116 | -lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \ 117 | -o $@ 118 | 119 | bamindex: src/version.o src/bamindex/main.o src/bamindex/build_main.o src/bamindex/fetch_main.o src/bamindex/dump_main.o src/bamindex/index.o $(STATIC_HTSLIB) 120 | $(CC) -Isrc -Ihtslib $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \ 121 | $(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \ 122 | $^ $(ARGP) \ 123 | -lm -lz -llzma -lbz2 -lpthread -lcurl -lcrypto $(EXTRA_LIBS) \ 124 | -o $@ 125 | 126 | test/rg_parse: src/version.o test/rg_parse.o src/common.o 127 | $(CC) -Isrc $(WARNINGS) -fstack-protector-strong -D_FORTIFY_SOURCE=2 \ 128 | $(CFLAGS) $(EXTRA_CFLAGS) $(EXTRA_LDFLAGS) \ 129 | $^ $(ARGP) \ 130 | -lm $(EXTRA_LIBS) \ 131 | -o $@ 132 | 133 | 134 | ### 135 | # fastcat tests 136 | 137 | .PHONY: 138 | test_fastcat: mem_check_fastcat mem_check_fastcat_demultiplex mem_check_fastcat_bam mem_check_fastcat_demultiplex_bam test_fastcat_bam_equivalent 139 | 140 | 
.PHONY: mem_check_fastcat 141 | mem_check_fastcat: fastcat 142 | rm -rf fastcat-histograms 143 | $(GRIND) ./fastcat test/data/*.fastq.gz > /dev/null 144 | 145 | .PHONY: mem_check_fastcat_bam 146 | mem_check_fastcat_bam: fastcat 147 | rm -rf fastcat-histograms 148 | $(GRIND) ./fastcat test/data/*.fastq.gz -B > /dev/null 149 | 150 | .PHONY: mem_check_fastcat_demultiplex 151 | mem_check_fastcat_demultiplex: fastcat 152 | rm -rf demultiplex 153 | $(GRIND) ./fastcat test/data/*.fastq.gz --demultiplex demultiplex > /dev/null 154 | 155 | .PHONY: mem_check_fastcat_demultiplex_bam 156 | mem_check_fastcat_demultiplex_bam: fastcat 157 | rm -rf demultiplex 158 | $(GRIND) ./fastcat test/data/*.fastq.gz --demultiplex demultiplex -B > /dev/null 159 | # rule name must match the test_fastcat prerequisite or the recipe never runs; needs ./samtools and test/sort-sam.py 160 | .PHONY: test_fastcat_bam_equivalent 161 | test_fastcat_bam_equivalent: fastcat bamstats samtools 162 | @echo "" 163 | @echo "Testing fastcat bam equivalence" 164 | rm -rf test/test-tmp-fcb-equiv-van* 165 | rm -rf test/test-tmp-fcb-equiv-bam* 166 | $(PEPPER) ./fastcat test/data/*.fastq.gz --histograms test/test-tmp-fcb-equiv-van --reheader | ./samtools import -T '*' - | ./test/sort-sam.py > test/test-tmp-fcb-equiv-van.sam && \ 167 | $(PEPPER) ./fastcat test/data/*.fastq.gz --histograms test/test-tmp-fcb-equiv-bam -B | ./samtools view | ./test/sort-sam.py > test/test-tmp-fcb-equiv-bam.sam && \ 168 | diff test/test-tmp-fcb-equiv-van.sam test/test-tmp-fcb-equiv-bam.sam 169 | 170 | 171 | ### 172 | # bamstats tests 173 | 174 | .PHONY: test_bamstats 175 | test_bamstats: test_bamstats_NM test_bamstats_polya mem_check_bamstats 176 | 177 | .PHONY: test_bamstats_NM 178 | test_bamstats_NM: bamstats 179 | rm -rf test/test-tmp-bs-nm 180 | mkdir test/test-tmp-bs-nm && \ 181 | cd test/test-tmp-bs-nm && \ 182 | $(PEPPER) ../../bamstats ../bamstats_badNM/test.sam 2> err || grep "appears to contain implausible alignment information" err && rm -rf bamstats-histograms-bs-nm && \ 183 | rm -rf bamstats-histograms && \ 184 | $(PEPPER) ../../bamstats 
../bamstats_zeroNM/test.sam 185 | rm -r test/test-tmp-bs-nm 186 | 187 | .PHONY: test_bamstats_polya 188 | test_bamstats_polya: bamstats 189 | rm -rf test/test-tmp-bs-pa 190 | mkdir test/test-tmp-bs-pa && \ 191 | cd test/test-tmp-bs-pa && \ 192 | $(PEPPER) ../../bamstats ../bamstats/RCS-100A.bam --poly_a > /dev/null && \ 193 | diff bamstats-histograms/polya.hist ../bamstats/RCS-100A.bam.polya.hist 194 | rm -r test/test-tmp-bs-pa 195 | 196 | .PHONY: mem_check_bamstats 197 | mem_check_bamstats: bamstats 198 | @echo "Memcheck bamstats with good data" 199 | rm -rf bamstats-histograms 200 | $(GRIND) ./bamstats test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam > /dev/null 201 | @echo "" 202 | @echo "Memcheck bamstats with bad data" 203 | rm -rf bamstats-histograms 204 | $(GRIND) ./bamstats test/parse_rg/bad-ones.bam > /dev/null 205 | @echo "" 206 | @echo "Memcheck bamstats with qcfails" 207 | rm -rf bamstats-histograms 208 | $(GRIND) ./bamstats test/bamstats/400ecoli-with-qcfail.bam > /dev/null 209 | @echo "" 210 | @echo "Memcheck bamstats duplex" 211 | rm -rf bamstats-histograms 212 | $(GRIND) ./bamstats test/bamstats/310dx.bam 213 | 214 | 215 | ### 216 | # meta data tests (both fastcat and bamstats) 217 | 218 | .PHONY: test_meta 219 | test_meta: test_meta_fastcat test_meta_bamstats 220 | # each parse_rd FASTQ is run through fastcat and the file written by `-i rd` is diffed against its .runids fixture 221 | .PHONY: test_meta_fastcat 222 | test_meta_fastcat: fastcat 223 | rm -rf test/test-tmp-meta-fastq 224 | mkdir test/test-tmp-meta-fastq && \ 225 | cd test/test-tmp-meta-fastq && \ 226 | set -e; \ 227 | for i in ../parse_rd/*.fastq; do \ 228 | echo $$i; \ 229 | $(PEPPER) ../../fastcat $$i --histograms hist -i rd > /dev/null; \ 230 | diff rd $$i.runids || exit 1; \ 231 | rm -rf hist rd; \ 232 | done; 233 | rm -r test/test-tmp-meta-fastq 234 | 235 | .PHONY: test_meta_bamstats 236 | test_meta_bamstats: bamstats 237 | rm -rf test/test-tmp-meta-bam 238 | mkdir test/test-tmp-meta-bam && \ 239 | cd test/test-tmp-meta-bam && \ 240 | set -e; \ 241 | for i in ../parse_rg/*.bam; do \ 242 | $(PEPPER) ../../bamstats $$i 
--histograms hist -l rg \ 243 | > /dev/null; \ 244 | diff rg $$i.callers || exit 1; \ 245 | rm -rf hist rg; \ 246 | done; 247 | rm -r test/test-tmp-meta-bam 248 | 249 | .PHONY: regression_test_rg_parsing 250 | regression_test_rg_parsing: test/rg_parse 251 | $(PEPPER) ./test/rg_parse 252 | 253 | 254 | ### 255 | # bamindex tests 256 | 257 | .PHONY: 258 | test_bamindex: mem_check_bamindex-build mem_check_bamindex-dump mem_check_bamindex-fetch 259 | 260 | .PHONY: mem_check_bamindex-build 261 | mem_check_bamindex-build: bamindex 262 | $(GRIND) ./bamindex build test/bamindex/400.bam 263 | 264 | .PHONY: mem_check_bamindex-dump 265 | mem_check_bamindex-dump: bamindex mem_check_bamindex-build 266 | $(GRIND) ./bamindex dump test/bamindex/400.bam.bci > /dev/null 267 | 268 | .PHONY: mem_check_bamindex-fetch 269 | mem_check_bamindex-fetch: bamindex mem_check_bamindex-build 270 | $(GRIND) ./bamindex fetch test/bamindex/400.bam --chunk 5 > /dev/null 271 | 272 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastcat 2 | 3 | A set of simple utilities for creating summaries from standard bioinformatics formats. 4 | 5 | ### Installation 6 | 7 | All tools are distributed in a single package from our conda channel; they can be installed 8 | into an isolated conda environment with: 9 | 10 | ``` 11 | mamba create -n fastcat -c conda-forge -c bioconda -c nanoporetech fastcat 12 | ``` 13 | 14 | #### Compilation 15 | 16 | Although not recommended, compilation from source is via make: 17 | 18 | ``` 19 | make fastcat bamstats bamindex 20 | ``` 21 | 22 | Several libraries are assumed to be present on the system for linking. 23 | 24 | ### fastcat 25 | 26 | This eponymous tool concatenates .fastq(.gz) files whilst creating a summary 27 | of the sequences. Can also demultiplex reads according to Guppy/MinKNOW 28 | .fastq record headers. 
29 | 30 | ``` 31 | Usage: fastcat [OPTION...] 32 | reads1.fastq(.gz) reads2.fastq(.gz) dir-with-fastq ... 33 | fastcat -- concatenate and summarise .fastq(.gz) files. 34 | 35 | -a, --min_length=MIN READ LENGTH 36 | minimum read length to output (excluded reads 37 | remain listed in summaries). 38 | -b, --max_length=MAX READ LENGTH 39 | maximum read length to output (excluded reads 40 | remain listed in summaries). 41 | -d, --demultiplex=OUT DIR Separate barcoded samples using fastq header 42 | information. Option value is top-level output 43 | directory. 44 | -f, --file=FILE SUMMARY Per-file summary output 45 | --histograms=DIRECTORY Directory for outputting histogram information. 46 | When --demultiplex is enabled histograms are 47 | written to per-sample demultiplexed output 48 | directories. (default: fastcat-histograms) 49 | -H, --reheader Rewrite fastq header comments as SAM tags (useful 50 | for passing through minimap2). 51 | -q, --min_qscore=MIN READ QSCOROE 52 | minimum read Qscore to output (excluded reads 53 | remain listed in summaries). 54 | -r, --read=READ SUMMARY Per-read summary output 55 | -s, --sample=SAMPLE NAME Sample name (if given, adds a 'sample_name' 56 | column). 57 | -x, --recurse Search directories recursively for '.fastq', 58 | '.fq', '.fastq.gz', and '.fq.gz' files. 59 | -?, --help Give this help list 60 | --usage Give a short usage message 61 | -V, --version Print program version 62 | 63 | Mandatory or optional arguments to long options are also mandatory or optional 64 | for any corresponding short options. 65 | 66 | Input files may be given on stdin by specifing the input as '-'. Also accepts 67 | directories as input and looks for .fastq(.gz) files in the top-level 68 | directory. Recurses into sub-directories when the -x option is given. The 69 | command will exit non-zero if any file encountered cannot be read. 
70 | ``` 71 | 72 | The program writes the input sequences to `stdout` in .fastq format to be 73 | recompressed with `gzip` (or more usefully `bgzip`). 74 | 75 | The `per-read.txt` is a tab-separated file with columns: 76 | 77 | ``` 78 | read_id filename read_length mean_quality 79 | SRR12447496.1 SRR12447496_1.fastq.gz 531 14.03 80 | SRR12447496.2 SRR12447496_1.fastq.gz 513 13.91 81 | SRR12447496.3 SRR12447496_1.fastq.gz 473 14.70 82 | ... 83 | ``` 84 | 85 | The mean quality is defined as: 86 | ``` 87 | -10 * log10(mean(10^(Q/-10))) 88 | ``` 89 | 90 | where `Q` is the set of all per-base quality scores for the read. 91 | 92 | The `per-file.txt` is also a tab-separated file with columns: 93 | 94 | ``` 95 | filename n_seqs n_bases min_length max_length mean_quality 96 | SRR12447496_1.fastq.gz 16048 8090160 434 697 13.10 97 | SRR12447498_1.fastq.gz 16203 8049713 421 697 13.25 98 | SRR12447499_1.fastq.gz 15484 7812439 424 612 13.16 99 | ... 100 | ``` 101 | where the `mean_quality` column is the mean of the per-read `mean_quality` values. 102 | 103 | Additionally, as it's a common thing to want to do, the program will write 104 | the two files: 105 | 106 | * `length.hist` - read length histogram, and 107 | * `quality.hist` - read mean base-quality score histogram. 108 | 109 | When data is demultiplexed one such file will be written to the demultiplexed 110 | samples' directories. When demultiplexing is not enabled the files will be 111 | placed in a directory according to the `--histograms` option. The format of the 112 | histogram files is a tab-separated file of sparse, ordered intervals `[lower, upper)`: 113 | 114 | ``` 115 | lower upper count 116 | ``` 117 | 118 | The final bin may be unbounded, which is signified by a `0` entry for the upper 119 | bin edge. 120 | 121 | 122 | ### bamstats 123 | 124 | The `bamstats` utility is a re-implementation of the `stats_from_bam` program 125 | from [pomoxis](https://github.com/nanoporetech/pomoxis).
It creates read-level summary 126 | statistics of alignments found in a BAM file and reports these in a TSV file. 127 | 128 | Additionally, as it's a common thing to want to do, the program will write 129 | the four files: 130 | 131 | * `length.hist` - read length histogram, 132 | * `quality.hist` - read mean base-quality score histogram, 133 | * `accuracy.hist` - read alignment accuracy histogram, and 134 | * `coverage.hist` - read alignment coverage histogram. 135 | 136 | These files are as described for the `fastcat` program. 137 | 138 | ``` 139 | Usage: bamstats [OPTION...] 140 | bamstats -- summarise rears/alignments in one or more BAM files. 141 | 142 | General options: 143 | -f, --flagstats=FLAGSTATS File for outputting alignment flag counts. 144 | --histograms=DIRECTORY Directory for outputting histogram information. 145 | (default: bamstats-histograms) 146 | -r, --region=chr:start-end Genomic region to process. 147 | -s, --sample=SAMPLE NAME Sample name (if given, adds a 'sample_name' 148 | column). 149 | -t, --threads=THREADS Number of threads for BAM processing. 150 | 151 | Read filtering options: 152 | 153 | -g, --read_group=RG Only process reads from given read group. 154 | --haplotype=VAL Only process reads from a given haplotype. 155 | Equivalent to --tag_name HP --tag_value VAL. 156 | --tag_name=TN Only process reads with a given tag (see 157 | --tag_value). 158 | --tag_value=VAL Only process reads with a given tag value. 159 | -u, --unmapped Include unmapped/unplaced reads in output. 160 | 161 | -?, --help Give this help list 162 | --usage Give a short usage message 163 | -V, --version Print program version 164 | 165 | Mandatory or optional arguments to long options are also mandatory or optional 166 | for any corresponding short options. 167 | 168 | The program creates a simple TSV file containing statistics for each primary 169 | alignment stored within the input BAM files.
170 | ``` 171 | 172 | #### Output format 173 | 174 | The `bamstats` output is a tab-separated text file with columns as in the table 175 | below. The `q` prefix to column names relates to the so-called "query" 176 | sequence, i.e. the sequencing read. The `r` prefix relates to the reference 177 | sequence. Not all column names where properties are quoted for both the query 178 | and reference follow this convention; this is an unfortunate historical wart. 179 | 180 | All coordinates are given as zero-based, end exclusive. 181 | In sequence alignment jargon the term "match" means any pair of bases 182 | (one each from the query and reference) which are aligned to each other. 183 | The term does not convey its common English meaning that the two bases 184 | have the same identity. An 'A' base from the query can match (be aligned to) 185 | a 'C' base from the reference. 186 | 187 | | index | name | description 188 | | - | - | - 189 | | 1 | `name` | Read identifier (column 1 from a SAM file). 190 | | 2 | `runid` | Sequencing run identifier (from the `RD` tag of the SAM record). 191 | | 3 | `sample_name` | Sample name (optional, provided as input by the user). 192 | | 4 | `ref` | Reference sequence name (column 3 from a SAM file). 193 | | 5 | `coverage` | Proportion of read spanned by the alignment. 194 | | 6 | `ref_coverage` | Proportion of reference spanned by the alignment. 195 | | 7 | `qstart` | Alignment start coordinate on the query (tantamount to the total left-hand clipping in [SAM terminology](https://samtools.github.io/hts-specs/)). 196 | | 8 | `qend` | Alignment end coordinate on the query (see `qstart`). 197 | | 9 | `rstart` | Alignment start coordinate on the reference (column 4 of SAM). 198 | | 10 | `rend` | Alignment end coordinate on the reference. 199 | | 11 | `aligned_ref_len` | Length of alignment on reference (simply `rend - rstart`). 200 | | 12 | `direction` | Alignment direction. `+` for forward reference sequence, `-` for reverse complement.
201 | | 13 | `length` | Total length of the alignment including all insertions. 202 | | 14 | `read_length` | Length of query sequence (as stored in the input file). 203 | | 15 | `mean_quality` | Mean per-base quality of the query sequence expressed on Phred scale. See discussion in `fastcat` section above. 204 | | 16 | `start_time` | Sequencing start time for the read (from the `ST` tag of the SAM record). 205 | | 17 | `match` | Number of matches in the alignment (see description above). 206 | | 18 | `ins` | Number of inserted bases in alignment. 207 | | 19 | `del` | Number of deleted bases in alignment. 208 | | 20 | `sub` | Number of substitutions (mismatches) in alignment. 209 | | 21 | `iden` | Proportion of matches which are not mismatches: `(match - sub) / match`. 210 | | 22 | `acc` | Alignment accuracy: `(length - ins - del - sub) / length`. Sometimes also referred to as [BLAST-identity](https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity). 211 | | 23 | `duplex` | Whether the read was simplex (`0`), duplex (`1`), or duplex-forming (`-1`). See [dorado documentation](https://github.com/nanoporetech/dorado?tab=readme-ov-file#duplex). 212 | 213 | 214 | ### bamindex 215 | 216 | The `bamindex` program is a rather curious program that will create a positional index 217 | of alignments in a BAM file. It is intended to be used within workflows to allow 218 | parallel processing of records in a BAM file, each worker processing a contiguous chunk 219 | of the file. This is most useful with unaligned BAM files. 220 | 221 | The program was inspired by [bri](https://github.com/jts/bri) by Jared Simpson at [OICR](https://oicr.on.ca/), 222 | which is far cooler. 223 | 224 | There are three subcommands: 225 | 226 | **bamindex build** 227 | 228 | ``` 229 | $ ./bamindex build --help 230 | Usage: build [OPTION...] 231 | bamindex build -- create a BAM index corresponding to batches of records.
232 | 233 | General options: 234 | -c, --chunk_size=SIZE Number of records in a chunk. 235 | -t, --threads=THREADS Number of threads for BAM processing. 236 | 237 | -?, --help Give this help list 238 | --usage Give a short usage message 239 | -V, --version Print program version 240 | 241 | Mandatory or optional arguments to long options are also mandatory or optional 242 | for any corresponding short options. 243 | 244 | The program creates a simple index of file offsets for each of every (n * M)th 245 | alignment record. No care is taken to keep records corresponding to the same 246 | query together, or any other such niceities. Its intended to be used simply 247 | with unaligned, unsorted BAMs. 248 | ``` 249 | 250 | **bamindex fetch** 251 | 252 | ``` 253 | $ ./bamindex fetch --help 254 | Usage: fetch [OPTION...] 255 | bamindex fetch -- fetch records from a BAM according to an index. 256 | 257 | General options: 258 | -c, --chunk=SIZE Chunk index to retrieve. 259 | -t, --threads=THREADS Number of threads for BAM processing. 260 | 261 | -?, --help Give this help list 262 | --usage Give a short usage message 263 | -V, --version Print program version 264 | 265 | Mandatory or optional arguments to long options are also mandatory or optional 266 | for any corresponding short options. 267 | 268 | The program simply will fetch a batch of records from a BAM fileusing and index 269 | and a chunk ID. 270 | ``` 271 | 272 | **bamindex dump** 273 | 274 | ``` 275 | $ ./bamindex dump --help 276 | Usage: dump [OPTION...] 277 | bamindex dump -- dump a BAM chunk index to stdout as text. 278 | 279 | -?, --help Give this help list 280 | --usage Give a short usage message 281 | -V, --version Print program version 282 | 283 | The program simply writes the contents of an index to stdout for human 284 | inspection. It has no other purpose. 
285 | ``` 286 | -------------------------------------------------------------------------------- /conda/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=fastcat 4 | 5 | 6 | # don't enable libdeflate -- seems to cause hangs when used with threaded decompression 7 | #export HTS_CONF_ARGS="--prefix=${PREFIX} --enable-libcurl --enable-plugins --enable-gcs --enable-s3" 8 | # ignore that, just link to htslib from bioconda 9 | export EXTRA_CFLAGS="-I$PREFIX/include" 10 | export STATIC_HTSLIB="" 11 | export EXTRA_LDFLAGS="-L$PREFIX/lib" 12 | export EXTRA_LIBS="-ldl -lhts" 13 | 14 | OS=$(uname) 15 | if [[ "$OS" == "Darwin" ]]; then 16 | echo "Setting Darwin args" 17 | export ARGP=${PREFIX}/lib/libargp.a 18 | export EXTRA_CFLAGS="${EXTRA_CFLAGS} -isysroot ${CONDA_BUILD_SYSROOT} -mmacosx-version-min=${MACOSX_DEPLOYMENT_TARGET}" 19 | fi 20 | 21 | make clean clean_htslib 22 | 23 | mkdir -p $PREFIX/bin 24 | for binary in fastcat bamstats bamindex; do 25 | make $binary 26 | cp $binary $PREFIX/bin && chmod +x $PREFIX/bin/$binary 27 | done 28 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: {{ environ.get('CONDA_PKG') }} 3 | version: {{ environ.get('CONDA_PKG_VERSION') }} 4 | 5 | source: 6 | path: ../ 7 | 8 | build: 9 | number: 0 10 | 11 | requirements: 12 | build: 13 | - {{ compiler('c') }} 14 | host: 15 | - argp-standalone # [osx] 16 | # not sure why zlib needs to be explicitly listed here, 17 | # bioconda::samtools does it too, and conda build can't find it otherwise 18 | # despite it getting installed into the build env 19 | - htslib >=1.20 20 | - zlib 21 | - xz 22 | run: 23 | - htslib >=1.20 24 | - zlib 25 | - xz 26 | test: 27 | commands: 28 | fastcat --help 29 | bamstats --help 30 | bamindex build --help 31 | 32 | about: 33 | home: 
"https://github.com/epi2me-labs/fastcat" 34 | license: Mozilla Public License 2.0 35 | license_family: OTHER 36 | license_file: LICENSE 37 | summary: "Concatenate fast/a/q/gz and calculate basic statistics" 38 | doc_url: https://github.com/epi2me-labs/fastcat 39 | dev_url: https://github.com/epi2me-labs/fastcat 40 | 41 | extra: 42 | recipe-maintainers: 43 | - cjw85 44 | 45 | -------------------------------------------------------------------------------- /src/bamindex/build_main.c: -------------------------------------------------------------------------------- 1 | // bamindex build program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "htslib/faidx.h" 11 | #include "htslib/sam.h" 12 | #include "htslib/thread_pool.h" 13 | #include "htslib/bgzf.h" 14 | 15 | #include "index.h" 16 | #include "../version.h" 17 | 18 | #include 19 | 20 | typedef struct arguments { 21 | const char* bam; 22 | int threads; 23 | int chunk_size; 24 | } arguments_t; 25 | 26 | static char doc[] = 27 | "bamindex build -- create a BAM index corresponding to batches of records.\ 28 | \vThe program creates a simple index of file offsets for each of every \ 29 | (n * M)th alignment record. No care is taken to keep records corresponding \ 30 | to the same query together, or any other such niceities. 
Its intended to \ 31 | be used simply with unaligned, unsorted BAMs."; 32 | static char args_doc[] = ""; 33 | static struct argp_option options[] = { 34 | {0, 0, 0, 0, 35 | "General options:", 0}, 36 | {"threads", 't', "THREADS", 0, 37 | "Number of threads for BAM processing.", 0}, 38 | {"chunk_size", 'c', "SIZE", 0, 39 | "Number of records in a chunk.", 0}, 40 | { 0 } 41 | }; 42 | 43 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 44 | arguments_t *arguments = state->input; 45 | switch (key) { 46 | case 't': 47 | arguments->threads = atoi(arg); 48 | break; 49 | case 'c': 50 | arguments->chunk_size = atoi(arg); 51 | break; 52 | case ARGP_KEY_NO_ARGS: 53 | argp_usage (state); 54 | break; 55 | case ARGP_KEY_ARG: 56 | if (state->arg_num == 0) { 57 | arguments->bam = arg; 58 | break; 59 | } 60 | break; 61 | case ARGP_KEY_END: 62 | if (state->arg_num != 1) 63 | argp_usage (state); 64 | break; 65 | default: 66 | return ARGP_ERR_UNKNOWN; 67 | } 68 | return 0; 69 | } 70 | 71 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0}; 72 | 73 | static arguments_t parse_arguments(int argc, char** argv) { 74 | arguments_t args; 75 | args.bam = NULL; 76 | args.threads = 1; 77 | args.chunk_size = 1; 78 | argp_parse(&argp, argc, argv, 0, 0, &args); 79 | return args; 80 | } 81 | 82 | void index_build(const char* filename, const char* output_fname, int threads, size_t every) { 83 | htsFile *fp = hts_open(filename, "r"); 84 | bam_hdr_t *h = sam_hdr_read(fp); 85 | if(fp == NULL || h == NULL) { 86 | fprintf(stderr, "Could not open %s\n", filename); 87 | exit(EXIT_FAILURE); 88 | } 89 | 90 | FILE* out_fp = fopen(output_fname, "wb"); 91 | bc_idx_t *idx = bc_idx_init1(every); 92 | size_t rtn; 93 | if ((rtn = bc_idx_write_header(out_fp, idx)) > 0) { 94 | fprintf(stderr, "Failed to write header to index. 
Error %zu.\n", rtn); 95 | } 96 | 97 | htsThreadPool p = {NULL, 0}; 98 | if (threads > 1 ) { 99 | p.pool = hts_tpool_init(threads); 100 | hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); 101 | } 102 | 103 | int ret = 0; 104 | int i = 0; 105 | bam1_t* b = bam_init1(); 106 | size_t file_offset = bgzf_tell(fp->fp.bgzf); 107 | while ((ret = sam_read1(fp, h, b)) >= 0) { 108 | if ((i % every) != 0) { 109 | file_offset = bgzf_tell(fp->fp.bgzf); 110 | i++; 111 | continue; 112 | } 113 | if (i % 100000 == 0) { 114 | fprintf(stderr, "Record %d %zu\n", i, file_offset); 115 | } 116 | if (bc_idx_write(out_fp, idx, file_offset, bam_get_qname(b)) < 0) { 117 | fprintf(stderr, "Failed to write records to index.\n"); 118 | exit(EXIT_FAILURE); 119 | } 120 | file_offset = bgzf_tell(fp->fp.bgzf); 121 | i++; 122 | } 123 | 124 | bam_hdr_destroy(h); 125 | bam_destroy1(b); 126 | hts_close(fp); 127 | if (p.pool) { // must be after fp 128 | hts_tpool_destroy(p.pool); 129 | } 130 | 131 | // fill in how many records we wrote 132 | if (bc_idx_write_header(out_fp, idx) > 0) { 133 | fprintf(stderr, "Failed to write header to index.\n"); 134 | } 135 | fclose(out_fp); 136 | fprintf(stderr, "Written %zu/%d records to index.\n", idx->n_chunks, i); 137 | bc_idx_destroy(idx); 138 | } 139 | 140 | 141 | int main_build(int argc, char *argv[]) { 142 | clock_t begin = clock(); 143 | arguments_t args = parse_arguments(argc, argv); 144 | #ifdef NOTHREADS 145 | if (args.threads != 1) { 146 | fprintf( 147 | stderr, 148 | "--threads set to %d, but threading not supported by this build.\n", args.threads); 149 | } 150 | #endif 151 | 152 | char* index_fname = generate_index_filename(args.bam, NULL); 153 | index_build(args.bam, index_fname, args.threads, args.chunk_size); 154 | free(index_fname); 155 | 156 | clock_t end = clock(); 157 | fprintf(stderr, "Total CPU time: %fs\n", (double)(end - begin) / CLOCKS_PER_SEC); 158 | return EXIT_SUCCESS; 159 | } 160 | 
-------------------------------------------------------------------------------- /src/bamindex/dump_main.c: -------------------------------------------------------------------------------- 1 | // bamindex dump program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "index.h" 10 | 11 | #include 12 | 13 | static char doc[] = 14 | "bamindex dump -- dump a BAM chunk index to stdout as text.\ 15 | \vThe program simply writes the contents of an index to stdout for human \ 16 | inspection. It has no other purpose."; 17 | static char args_doc[] = ""; 18 | static struct argp_option options[] = { 19 | { 0 } 20 | }; 21 | 22 | typedef struct arguments { 23 | const char* index; 24 | } arguments_t; 25 | 26 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 27 | arguments_t *arguments = state->input; 28 | switch (key) { 29 | case ARGP_KEY_NO_ARGS: 30 | argp_usage (state); 31 | break; 32 | case ARGP_KEY_ARG: 33 | if (state->arg_num == 0) { 34 | arguments->index = arg; 35 | break; 36 | } 37 | break; 38 | case ARGP_KEY_END: 39 | if (state->arg_num != 1) 40 | argp_usage (state); 41 | break; 42 | default: 43 | return ARGP_ERR_UNKNOWN; 44 | } 45 | return 0; 46 | } 47 | 48 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0}; 49 | 50 | static arguments_t parse_arguments(int argc, char** argv) { 51 | arguments_t args; 52 | args.index = NULL; 53 | argp_parse(&argp, argc, argv, 0, 0, &args); 54 | return args; 55 | } 56 | 57 | 58 | void index_dump(const char* filename) { 59 | struct stat st; 60 | if (stat(filename, &st) != 0) { 61 | errx(1, "Cannot open index file %s\n", filename); 62 | exit(EXIT_FAILURE); 63 | } 64 | 65 | FILE *fp = fopen(filename, "rb"); 66 | bc_idx_t *idx; 67 | if((idx = bc_idx_read(fp)) == NULL) { 68 | fprintf(stderr, "Couldn't read index file: %s.\n", filename); 69 | exit(EXIT_FAILURE); 70 | } 71 | fprintf(stdout, "Index contains %zu chunks of size %zu.\n", idx->n_chunks, 
idx->chunk_size); 72 | for (size_t i=0; in_chunks; ++i){ 73 | fprintf(stdout, "%zu %s\n", (idx->recs[i]).file_offset, (idx->recs[i]).qname); 74 | } 75 | bc_idx_destroy(idx); 76 | fclose(fp); 77 | } 78 | 79 | 80 | int main_dump(int argc, char *argv[]) { 81 | clock_t begin = clock(); 82 | arguments_t args = parse_arguments(argc, argv); 83 | index_dump(args.index); 84 | clock_t end = clock(); 85 | fprintf(stderr, "Total CPU time: %fs\n", (double)(end - begin) / CLOCKS_PER_SEC); 86 | return EXIT_SUCCESS; 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/bamindex/fetch_main.c: -------------------------------------------------------------------------------- 1 | // bamindex fetch program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "htslib/faidx.h" 10 | #include "htslib/sam.h" 11 | #include "htslib/thread_pool.h" 12 | #include "htslib/bgzf.h" 13 | 14 | #include "index.h" 15 | 16 | #include 17 | 18 | static char doc[] = 19 | "bamindex fetch -- fetch records from a BAM according to an index.\ 20 | \vThe program simply will fetch a batch of records from a BAM file" \ 21 | "using and index and a chunk ID. 
Output is written as uncompressed "\ 22 | "BAM to stdout."; 23 | static char args_doc[] = ""; 24 | static struct argp_option options[] = { 25 | {0, 0, 0, 0, 26 | "General options:", 0}, 27 | {"threads", 't', "THREADS", 0, 28 | "Number of threads for BAM processing.", 0}, 29 | {"chunk", 'c', "SIZE", 0, 30 | "Chunk index to retrieve.", 0}, 31 | { 0 } 32 | }; 33 | 34 | typedef struct arguments { 35 | const char* bam; 36 | const char* index; 37 | int chunk_idx; 38 | int threads; 39 | } arguments_t; 40 | 41 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 42 | arguments_t *arguments = state->input; 43 | switch (key) { 44 | case 't': 45 | arguments->threads = atoi(arg); 46 | break; 47 | case 'c': 48 | arguments->chunk_idx = atoi(arg); 49 | break; 50 | case ARGP_KEY_NO_ARGS: 51 | argp_usage (state); 52 | break; 53 | case ARGP_KEY_ARG: 54 | if (state->arg_num == 0) { 55 | arguments->bam = arg; 56 | break; 57 | } 58 | break; 59 | case ARGP_KEY_END: 60 | if (state->arg_num != 1) 61 | argp_usage (state); 62 | break; 63 | default: 64 | return ARGP_ERR_UNKNOWN; 65 | } 66 | return 0; 67 | } 68 | 69 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0}; 70 | 71 | static arguments_t parse_arguments(int argc, char** argv) { 72 | arguments_t args; 73 | args.bam = NULL; 74 | args.index = NULL; 75 | args.threads = 1; 76 | args.chunk_idx = 0; 77 | argp_parse(&argp, argc, argv, 0, 0, &args); 78 | args.index = generate_index_filename(args.bam, args.index); 79 | return args; 80 | } 81 | 82 | 83 | void index_fetch(const char* bam_fname, const char* index_fname, int chunk, int threads) { 84 | htsFile *fp = hts_open(bam_fname, "r"); 85 | bam_hdr_t *h = sam_hdr_read(fp); 86 | if(fp == NULL || h == NULL) { 87 | fprintf(stderr, "Could not open %s\n", bam_fname); 88 | exit(EXIT_FAILURE); 89 | } 90 | htsThreadPool p = {NULL, 0}; 91 | if (threads > 1 ) { 92 | p.pool = hts_tpool_init(threads); 93 | hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); 94 | } 95 | 
96 | struct stat st; 97 | if (stat(index_fname, &st) != 0) { 98 | errx(1, "Cannot open index file %s\n", index_fname); 99 | exit(EXIT_FAILURE); 100 | } 101 | FILE* idx_fp = fopen(index_fname, "rb"); 102 | bc_idx_t *idx; 103 | if((idx = bc_idx_read(idx_fp)) == NULL) { 104 | fprintf(stderr, "Couldn't read index file: %s.\n", index_fname); 105 | exit(EXIT_FAILURE); 106 | } 107 | fclose(idx_fp); 108 | bc_rec_t rec = idx->recs[chunk]; 109 | 110 | fprintf(stderr, "Starting from: %zu %s\n", rec.file_offset, rec.qname); 111 | fprintf(stderr, "Reading %zu records from bam.\n", idx->chunk_size); 112 | 113 | 114 | if(bgzf_seek(fp->fp.bgzf, rec.file_offset, SEEK_SET) != 0) { 115 | fprintf(stderr, "Failed to seek to first record.\n"); 116 | exit(EXIT_FAILURE); 117 | } 118 | 119 | size_t written = 0; 120 | bam1_t* b = bam_init1(); 121 | htsFile * out_fp; 122 | if ((out_fp = hts_open("-", "wb0")) == 0) { 123 | fprintf(stderr, "Failed to open standard output for writing.\n"); 124 | exit(EXIT_FAILURE); 125 | } 126 | 127 | // TODO: fill in the NULLs here 128 | if(sam_hdr_add_pg(h, "bamindex.fetch", "VN", argp_program_version, NULL, NULL, NULL) != 0){ 129 | fprintf(stderr, "Failed to add PG line to the header.\n"); 130 | exit(EXIT_FAILURE); 131 | } 132 | if(sam_hdr_write(out_fp, h) != 0) { 133 | fprintf(stderr, "Failed to write the SAM header.\n"); 134 | exit(EXIT_FAILURE); 135 | } 136 | while ((sam_read1(fp, h, b) >= 0) && (written < (idx->chunk_size))) { 137 | if((sam_write1(out_fp, h, b) < 0)) { 138 | fprintf(stderr, "Failed to write output record."); 139 | exit(EXIT_FAILURE); 140 | } 141 | written++; 142 | } 143 | hts_close(out_fp); 144 | 145 | bam_hdr_destroy(h); 146 | bam_destroy1(b); 147 | hts_close(fp); 148 | if (p.pool) { // must be after fp 149 | hts_tpool_destroy(p.pool); 150 | } 151 | 152 | bc_idx_destroy(idx); 153 | fprintf(stderr, "Written %zu records to output.\n", written); 154 | } 155 | 156 | 157 | int main_fetch(int argc, char *argv[]) { 158 | clock_t begin = 
clock(); 159 | arguments_t args = parse_arguments(argc, argv); 160 | index_fetch(args.bam, args.index, args.chunk_idx, args.threads); 161 | free((char*)args.index); 162 | clock_t end = clock(); 163 | fprintf(stderr, "Total CPU time: %fs\n", (double)(end - begin) / CLOCKS_PER_SEC); 164 | return EXIT_SUCCESS; 165 | } 166 | -------------------------------------------------------------------------------- /src/bamindex/index.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "htslib/sam.h" 6 | 7 | #include "index.h" 8 | 9 | const size_t MAGIC_LEN = 5; 10 | const char* FILE_MAGIC = "FANZ\0"; 11 | 12 | 13 | char* generate_index_filename(const char* input_bam, const char* input_index) { 14 | char* out_fn; 15 | 16 | if(input_index != NULL) { 17 | out_fn = calloc(strlen(input_index) + 1, sizeof(char)); 18 | if(out_fn == NULL) { 19 | exit(EXIT_FAILURE); 20 | } 21 | strcpy(out_fn, input_index); 22 | } else { 23 | out_fn = calloc(strlen(input_bam) + 5, sizeof(char)); 24 | if(out_fn == NULL) { 25 | exit(EXIT_FAILURE); 26 | } 27 | strcpy(out_fn, input_bam); 28 | strcat(out_fn, ".bci"); 29 | } 30 | return out_fn; 31 | } 32 | 33 | bc_idx_t *bc_idx_init(void) { 34 | bc_idx_t *h = (bc_idx_t*)calloc(1, sizeof(bc_idx_t)); 35 | if (h == NULL) return NULL; 36 | // any init? 
37 | return h; 38 | } 39 | 40 | bc_idx_t *bc_idx_init1(const size_t chunk_size) { 41 | bc_idx_t *idx = bc_idx_init(); 42 | if (idx == NULL) return NULL; 43 | idx->version = 1; 44 | idx->chunk_size = chunk_size; 45 | idx->n_chunks = 0; 46 | idx->stored = 0; 47 | return idx; 48 | } 49 | 50 | void bc_idx_destroy(bc_idx_t *h) { 51 | if (h->stored > 0) { 52 | for (size_t i=0; istored; ++i) { 53 | if (h->recs[i].qname != NULL) { 54 | free(h->recs[i].qname); 55 | } 56 | } 57 | if (h->recs != NULL) { 58 | free(h->recs); 59 | } 60 | } 61 | free(h); 62 | } 63 | 64 | bc_idx_t *bc_idx_read(FILE *fp) { 65 | char buf[MAGIC_LEN]; 66 | size_t magic_len = fread(&(buf), sizeof(char), MAGIC_LEN, fp); 67 | if (magic_len != MAGIC_LEN || memcmp(buf, FILE_MAGIC, MAGIC_LEN)) { 68 | fprintf(stderr, "Invalid BAM chunk index binary header.\n"); 69 | return NULL; 70 | } 71 | bc_idx_t *h = bc_idx_init(); 72 | if(h == NULL) { 73 | fprintf(stderr, "Failed to allocate header.\n"); 74 | return NULL; 75 | } 76 | size_t items = 0; 77 | items += fread(&(h->version), sizeof(h->version), 1, fp); 78 | items += fread(&(h->chunk_size), sizeof(h->chunk_size), 1, fp); 79 | items += fread(&(h->n_chunks), sizeof(h->n_chunks), 1, fp); 80 | if (items != 3) { 81 | bc_idx_destroy(h); 82 | fprintf(stderr, "Invalid BAM chunk index binary header.\n"); 83 | return NULL; 84 | } 85 | 86 | h->stored = h->n_chunks; 87 | h->recs = (bc_rec_t*)calloc(h->stored, sizeof(bc_rec_t)); 88 | 89 | size_t valid = 0; 90 | char *msg = "Failed to read index contents. 
File is currupt.\n"; 91 | for (size_t i=0; in_chunks; ++i, ++valid) { 92 | bc_rec_t *r = &(h->recs[i]); 93 | if (fread(&(r->file_offset), sizeof(r->file_offset), 1, fp) != 1) { 94 | fputs(msg, stderr); break; 95 | } 96 | if (fread(&(r->lqname), sizeof(r->lqname), 1, fp) != 1) { 97 | fputs(msg, stderr); break; 98 | } 99 | r->qname = (char*)calloc(r->lqname, sizeof(char)); 100 | if (fread(r->qname, sizeof(char), r->lqname, fp) != r->lqname) { 101 | fputs(msg, stderr); break; 102 | } 103 | } 104 | if (valid != h->stored) { 105 | bc_idx_destroy(h); 106 | return NULL; 107 | } 108 | return h; 109 | } 110 | 111 | int bc_idx_write_header(FILE* fp, bc_idx_t* idx) { 112 | int rtn = 0; 113 | fseek(fp, 0, SEEK_SET); 114 | if (fwrite(FILE_MAGIC, sizeof(char), MAGIC_LEN, fp) != MAGIC_LEN) rtn = 1; 115 | if (fwrite(&(idx->version), sizeof(idx->version), 1, fp) != 1) rtn = 2; 116 | if (fwrite(&(idx->chunk_size), sizeof(idx->chunk_size), 1, fp) != 1) rtn = 3; 117 | if (fwrite(&(idx->n_chunks), sizeof(idx->n_chunks), 1, fp) != 1) rtn = 4; 118 | fseek(fp, 0, SEEK_END); 119 | return rtn; 120 | } 121 | 122 | int bc_idx_write(FILE* fp, bc_idx_t* idx, size_t offset, char* qname) { 123 | // write: file offset, length qname, qname 124 | size_t l_qname = strlen(qname) + 1; 125 | if (fwrite(&offset, sizeof(offset), 1, fp) != 1) return -1; 126 | if (fwrite(&l_qname, sizeof(l_qname), 1, fp) != 1) return -1; 127 | if (fwrite(qname, sizeof(char), l_qname, fp) != l_qname) return -1; 128 | (idx->n_chunks)++; 129 | return (int)(idx->n_chunks); 130 | } 131 | -------------------------------------------------------------------------------- /src/bamindex/index.h: -------------------------------------------------------------------------------- 1 | #ifndef _BAM_INDEX_INDEX_H 2 | #define _BAM_INDEX_INDEX_H 3 | 4 | 5 | typedef struct bc_rec_t { 6 | size_t file_offset; 7 | size_t lqname; 8 | char *qname; 9 | } bc_rec_t; 10 | 11 | 12 | typedef struct bc_idx_t { 13 | size_t version; 14 | size_t chunk_size; 15 
| size_t n_chunks; 16 | size_t stored; // TODO: disentangle header from contents 17 | bc_rec_t *recs; 18 | } bc_idx_t; 19 | 20 | 21 | char* generate_index_filename(const char* input_bam, const char* input_index); 22 | bc_idx_t *bc_idx_init(void); 23 | bc_idx_t *bc_idx_init1(size_t every); 24 | void bc_idx_destroy(bc_idx_t *h); 25 | bc_idx_t *bc_idx_read(FILE *fp); 26 | int bc_idx_write_header(FILE* fp, bc_idx_t* idx); 27 | int bc_idx_write(FILE* fp, bc_idx_t* idx, size_t offset, char* qname); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/bamindex/main.c: -------------------------------------------------------------------------------- 1 | // bamstats program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | enum command_mode { 12 | MODE_HELP = 0, 13 | MODE_BUILD, 14 | MODE_FETCH, 15 | MODE_DUMP, 16 | MODE_INVALID }; 17 | static const enum command_mode ncommand = MODE_INVALID; 18 | 19 | enum command_mode get_mode(const char *modestr) { 20 | if (0 == strcmp(modestr, "help")) {return MODE_HELP;} 21 | if (0 == strcmp(modestr, "build")) {return MODE_BUILD;} 22 | if (0 == strcmp(modestr, "fetch")) {return MODE_FETCH;} 23 | if (0 == strcmp(modestr, "dump")) {return MODE_DUMP;} 24 | return MODE_INVALID; 25 | } 26 | 27 | void fprint_commands(void); 28 | 29 | const char *mode_string(const enum command_mode mode) { 30 | switch (mode) { 31 | case MODE_HELP: 32 | return "help"; 33 | case MODE_BUILD: 34 | return "build"; 35 | case MODE_FETCH: 36 | return "fetch"; 37 | case MODE_DUMP: 38 | return "dump"; 39 | case MODE_INVALID: 40 | fprint_commands(); 41 | errx(EXIT_FAILURE, "Invalid subcommand\n"); 42 | default: 43 | errx(EXIT_FAILURE, "bamindex failure -- report bug\n"); 44 | } 45 | 46 | return NULL; 47 | } 48 | 49 | const char *mode_description(const enum command_mode mode) { 50 | switch (mode) { 51 | case MODE_HELP: 52 | return "Print general help or help about a 
subcommand."; 53 | case MODE_BUILD: 54 | return "Build a BAM index."; 55 | case MODE_FETCH: 56 | return "Fetch records from a BAM using an index."; 57 | case MODE_DUMP: 58 | return "Dump an index fetch to text."; 59 | case MODE_INVALID: 60 | fprint_commands(); 61 | errx(EXIT_FAILURE, "Invalid subcommand\n"); 62 | default: 63 | errx(EXIT_FAILURE, "bamindex failure -- report bug\n"); 64 | } 65 | 66 | return NULL; 67 | } 68 | 69 | void fprint_commands(void) { 70 | for (enum command_mode i = 0; i < ncommand; i++) { 71 | fprintf( 72 | stderr, "* bamindex %-14s%s\n", mode_string(i), mode_description(i)); 73 | } 74 | } 75 | int main_build(int argc, char *argv[]); 76 | int main_fetch(int argc, char *argv[]); 77 | int main_dump(int argc, char *argv[]); 78 | 79 | int main(int argc, char *argv[]) { 80 | 81 | if (argc == 1) { 82 | // Called as program name on it's own 83 | fprint_commands(); 84 | return EXIT_SUCCESS; 85 | } 86 | 87 | int ret = EXIT_FAILURE; 88 | switch (get_mode(argv[1])) { 89 | case MODE_HELP: 90 | fprint_commands(); 91 | break; 92 | case MODE_BUILD: 93 | ret = main_build(argc - 1, argv + 1); 94 | break; 95 | case MODE_FETCH: 96 | ret = main_fetch(argc - 1, argv + 1); 97 | break; 98 | case MODE_DUMP: 99 | ret = main_dump(argc - 1, argv + 1); 100 | break; 101 | default: 102 | ret = EXIT_FAILURE; 103 | warnx("Unrecognised subcommand %s\n", argv[1]); 104 | } 105 | 106 | return ret; 107 | } 108 | 109 | -------------------------------------------------------------------------------- /src/bamstats/args.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "htslib/sam.h" 8 | #include "htslib/faidx.h" 9 | #include "args.h" 10 | #include "../version.h" 11 | 12 | const char *argp_program_bug_address = "chris.wright@nanoporetech.com"; 13 | static char doc[] = 14 | "bamstats -- summarise rears/alignments in one or more BAM files.\ 15 | \vThe program creates a 
simple TSV file containing statistics for \ 16 | each primary alignment stored within the input BAM files."; 17 | static char args_doc[] = ""; 18 | static struct argp_option options[] = { 19 | {0, 0, 0, 0, 20 | "General options:", 0}, 21 | {"region", 'r', "chr:start-end", 0, 22 | "Genomic region to process.", 0}, 23 | {"bed", 'b', "BEDFILE", 0, 24 | "BED file for regions to process.", 0}, 25 | {"threads", 't', "THREADS", 0, 26 | "Number of threads for BAM processing.", 0}, 27 | {"sample", 's',"SAMPLE NAME", 0, 28 | "Sample name (if given, adds a 'sample_name' column).", 0}, 29 | {"flagstats", 'f', "FLAGSTATS", 0, 30 | "File for outputting alignment flag counts.", 0}, 31 | {"runids", 'i', "ID SUMMARY", 0, 32 | "Run ID summary output", 0}, 33 | {"basecallers", 'l', "BASECALLERS", 0, 34 | "Basecaller summary output", 0}, 35 | {"histograms", 0x400, "DIRECTORY", 0, 36 | "Directory for outputting histogram information. (default: bamstats-histograms)", 0}, 37 | {"recalc_qual", 0x900, 0, 0, 38 | "Force recomputing mean quality, else use 'qs' tag in BAM if present.", 0}, 39 | {0, 0, 0, 0, 40 | "Read filtering options:", 0}, 41 | {"unmapped", 'u', 0, 0, 42 | "Include unmapped/unplaced reads in output.", 3}, 43 | {"read_group", 'g', "RG", 0, 44 | "Only process reads from given read group.", 3}, 45 | {"tag_name", 0x100, "TN", 0, 46 | "Only process reads with a given tag (see --tag_value).", 3}, 47 | {"tag_value", 0x200, "VAL", 0, 48 | "Only process reads with a given tag value.", 3}, 49 | {"haplotype", 0x300, "VAL", 0, 50 | "Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3}, 51 | {0, 0, 0, 0, 52 | "Poly-A Options:", 0}, 53 | {"poly_a", 0x500, 0, 0, 54 | "Enable poly-A tail length histogram.", 5}, 55 | {"poly_a_cover", 0x600, "PCT_COVERAGE", 0, 56 | "Reference alignment coverage for acceptance of read. (default: 95)", 5}, 57 | {"poly_a_qual", 0x700, "QUAL", 0, 58 | "Read mean Q score for acceptance of read. 
(default: 10)", 5}, 59 | {"poly_a_rev", 0x800, 0, 0, 60 | "Allow reverse alignments (useful for cDNA, default is appropriate for direct RNA seq).", 5}, 61 | { 0 } 62 | }; 63 | 64 | bool file_exists(char* filename) { 65 | struct stat st; 66 | return (stat(filename, &st) == 0); 67 | } 68 | 69 | static int tag_items = 0; 70 | static bool tag_given = false; 71 | static bool hp_given = false; 72 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 73 | arguments_t *arguments = state->input; 74 | switch (key) { 75 | case 'r': 76 | arguments->region = arg; 77 | break; 78 | case 'b': 79 | arguments->bed = arg; 80 | break; 81 | case 'g': 82 | arguments->read_group = arg; 83 | break; 84 | case 'f': 85 | arguments->flagstats = arg; 86 | break; 87 | case 'i': 88 | arguments->runids = arg; 89 | break; 90 | case 'l': 91 | arguments->basecallers = arg; 92 | break; 93 | case 0x400: 94 | arguments->histograms = arg; 95 | break; 96 | case 0x500: 97 | arguments->poly_a = true; 98 | break; 99 | case 0x600: 100 | arguments->poly_a_cover = atof(arg); 101 | break; 102 | case 0x700: 103 | arguments->poly_a_qual = atof(arg); 104 | break; 105 | case 0x800: 106 | arguments->poly_a_rev = true; 107 | break; 108 | case 's': 109 | arguments->sample = arg; 110 | break; 111 | case 'u': 112 | arguments->unmapped = true; 113 | break; 114 | case 0x100: 115 | if (strlen(arg) > 2) { 116 | argp_error(state, "Tag name should be a two-letter code, received: '%s'.", arg); 117 | } 118 | memcpy(arguments->tag_name, arg, 2 *sizeof(char)); 119 | tag_items += 1; 120 | tag_given = true; 121 | break; 122 | case 0x200: 123 | arguments->tag_value = atoi(arg); 124 | tag_items += 1; 125 | tag_given = true; 126 | break; 127 | case 0x300: 128 | memcpy(arguments->tag_name, "HP", 2 * sizeof(char)); 129 | arguments->tag_value = atoi(arg); 130 | tag_items += 2; 131 | hp_given = true; 132 | break; 133 | case 't': 134 | arguments->threads = atoi(arg); 135 | break; 136 | case 0x900: 137 | 
arguments->force_recalc_qual = true; 138 | break; 139 | case ARGP_KEY_NO_ARGS: 140 | argp_usage (state); 141 | break; 142 | case ARGP_KEY_ARG: 143 | if (state->arg_num == 0) { 144 | arguments->bam = (const char**)(&state->argv[state->next - 1]); 145 | state->next = state->argc; 146 | break; 147 | } 148 | break; 149 | case ARGP_KEY_END: 150 | if (state->arg_num != 1) 151 | argp_usage (state); 152 | break; 153 | default: 154 | return ARGP_ERR_UNKNOWN; 155 | } 156 | return 0; 157 | } 158 | 159 | static struct argp argp = {options, parse_opt, args_doc, doc, 0, 0, 0}; 160 | 161 | arguments_t parse_arguments(int argc, char** argv) { 162 | arguments_t args; 163 | args.bam = NULL; 164 | args.flagstats = NULL; 165 | args.runids = NULL; 166 | args.basecallers = NULL; 167 | args.histograms = "bamstats-histograms"; 168 | args.poly_a = false; 169 | args.poly_a_cover = 95; 170 | args.poly_a_qual = 10; 171 | args.poly_a_rev = false; 172 | args.sample = NULL; 173 | args.ref = NULL; 174 | args.region = NULL; 175 | args.bed = NULL; 176 | args.unmapped = false; 177 | args.read_group = NULL; 178 | args.tag_name[0] = '\0'; 179 | args.tag_value = -1; 180 | args.threads = 1; 181 | args.force_recalc_qual = false; 182 | argp_parse(&argp, argc, argv, 0, 0, &args); 183 | if (tag_items % 2 > 0) { 184 | fprintf(stderr, "ERROR: Both or neither of --tag_name and --tag_value must be given.\n"); 185 | exit(EXIT_FAILURE); 186 | } 187 | if (tag_given && hp_given) { 188 | fprintf(stderr, "ERROR: If --haplotype is given neither of --tag_name or --tag_value should be provided.\n"); 189 | exit(EXIT_FAILURE); 190 | } 191 | return args; 192 | } 193 | -------------------------------------------------------------------------------- /src/bamstats/args.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_ARGS_H 2 | #define _MODBAMBED_ARGS_H 3 | 4 | #include 5 | 6 | 7 | typedef struct arguments { 8 | const char** bam; 9 | char* flagstats; 10 | char* runids; 
11 | char* basecallers; 12 | char* histograms; 13 | bool poly_a; 14 | float poly_a_cover; 15 | float poly_a_qual; 16 | bool poly_a_rev; 17 | char *sample; 18 | char* ref; 19 | char* region; 20 | char* bed; 21 | char* read_group; 22 | char tag_name[2]; 23 | int tag_value; 24 | int threads; 25 | bool unmapped; 26 | bool force_recalc_qual; 27 | } arguments_t; 28 | 29 | arguments_t parse_arguments(int argc, char** argv); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/bamstats/bamiter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "bamiter.h" 6 | #include "common.h" 7 | 8 | /** Set up a bam file for reading (filtered) records. 9 | * 10 | * @param fp htsFile pointer 11 | * @param idx hts_idx_t pointer 12 | * @param hdr sam_hdr_t pointer 13 | * @param chr bam target name. 14 | * @param start start position of chr to consider. 15 | * @param end end position of chr to consider. 16 | * @param overlap_start whether reads overhanging start should be included. 17 | * @param read_group by which to filter alignments. 18 | * @param tag_name by which to filter alignments. 19 | * @param tag_value associated with tag_name. 20 | * 21 | * The return value can be freed with destroy_bam_iter_data. 
22 | * 23 | */ 24 | mplp_data *create_bam_iter_data( 25 | htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, 26 | const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start, 27 | const char *read_group, const char tag_name[2], const int tag_value) { 28 | 29 | mplp_data *data = xalloc(1, sizeof(mplp_data), "pileup init data"); 30 | 31 | // find the target index for query below 32 | if (chr == NULL) { // all reads 33 | data->iter = NULL; 34 | } else { 35 | int mytid; 36 | if (strcmp(chr, "*") == 0) { // unplaced 37 | mytid = HTS_IDX_NOCOOR; 38 | start = 0; end = INT64_MAX; 39 | } else { 40 | mytid = sam_hdr_name2tid(hdr, chr); 41 | if (mytid < 0) { 42 | fprintf(stderr, "Failed to find reference sequence '%s' in bam.\n", chr); 43 | free(data); 44 | return NULL; 45 | } 46 | } 47 | data->iter = bam_itr_queryi(idx, mytid, start, end); 48 | } 49 | 50 | // setup bam interator 51 | data->fp = fp; data->idx = idx; data->hdr = hdr; 52 | data->min_start = overlap_start ? -1 : start; // unmapped reads have pos -1 53 | memcpy(data->tag_name, tag_name, 2); data->tag_value = tag_value; 54 | data->min_mapQ = 0; data->read_group = read_group; 55 | 56 | return data; 57 | } 58 | 59 | /** Clean up auxiliary bam reading data. 60 | * 61 | * @param data auxiliary structure to clean. 62 | * 63 | */ 64 | void destroy_bam_iter_data(mplp_data *data) { 65 | bam_itr_destroy(data->iter); 66 | free(data); 67 | } 68 | 69 | 70 | /** Read a bam record. 71 | * 72 | * @param data an mplp_data encoding the bam file to read with filter options. 73 | * @param b output pointer. 74 | * 75 | */ 76 | int read_bam(void *data, bam1_t *b) { 77 | mplp_data *aux = (mplp_data*) data; 78 | uint8_t *tag; 79 | bool check_tag = (strcmp(aux->tag_name, "") != 0); 80 | bool have_rg = (aux->read_group != NULL); 81 | uint8_t *rg; 82 | char *rg_val; 83 | int ret; 84 | while (1) { 85 | ret = aux->iter ? 
sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); 86 | if (ret<0) break; 87 | // only take primary alignments 88 | //if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FQCFAIL | BAM_FDUP)) continue; 89 | // maybe remove reads overlapping start 90 | if (b->core.pos < aux->min_start) continue; 91 | // filter by mapping quality 92 | if ((int)b->core.qual < aux->min_mapQ) continue; 93 | // filter by tag 94 | if (check_tag) { 95 | tag = bam_get_tag_caseinsensitive((const bam1_t*) b, aux->tag_name); 96 | if (tag == NULL){ // tag isn't present or is currupt 97 | if (aux->keep_missing) { 98 | break; 99 | } else { 100 | continue; 101 | } 102 | } 103 | errno = 0; 104 | int tag_value = bam_aux2i(tag); 105 | if (tag_value == 0 && errno == EINVAL) continue; // tag was not integer 106 | if (tag_value != aux->tag_value) continue; 107 | } 108 | // filter by RG (read group): 109 | if (have_rg) { 110 | rg = bam_get_tag_caseinsensitive((const bam1_t*) b, "RG"); 111 | if (rg == NULL) continue; // missing 112 | rg_val = bam_aux2Z(rg); 113 | if (rg_val == 0 && errno == EINVAL) continue; // bad parse 114 | if (strcmp(aux->read_group, rg_val) != 0) continue; // not wanted 115 | } 116 | break; 117 | } 118 | return ret; 119 | } 120 | 121 | 122 | /** Create an map of query position to reference position 123 | * 124 | * @param b alignment record 125 | * 126 | * The length of the returned array is b->core->l_qlen. 127 | */ 128 | int *qpos2rpos(bam1_t *b) { 129 | // we only deal in primary/soft-clipped alignments so length 130 | // of qseq member is the length of the intact query sequence. 
131 | // TODO: add check for alignment being primary / no hard clipping 132 | uint32_t qlen = b->core.l_qseq; 133 | uint32_t *cigar = bam_get_cigar(b); 134 | int *posmap = xalloc(qlen, sizeof(uint32_t), "pos_map"); 135 | for (size_t i = 0; i < qlen; ++i) posmap[i] = -1; // unaligned 136 | int qpos = 0, rpos = b->core.pos; 137 | for (size_t i = 0; i < b->core.n_cigar; ++i){ 138 | uint32_t op = bam_cigar_op(cigar[i]); 139 | uint32_t len = bam_cigar_oplen(cigar[i]); 140 | uint32_t take = bam_cigar_type(op); 141 | if (((take&0x1)>0) & ((take&0x2)>0)) { 142 | // consumes query and ref 143 | for (size_t j = 0; j < len; ++j, ++qpos, ++rpos) { 144 | posmap[qpos] = rpos; 145 | } 146 | } 147 | else if ((take&0x1)>0) { 148 | // consumes query only 149 | qpos += len; 150 | } 151 | else { 152 | // consumes ref 153 | rpos += len; 154 | } 155 | } 156 | return posmap; 157 | } 158 | 159 | /** Fetch a BAM tag with case insensitivity 160 | * 161 | * @param b BAM record 162 | * @param tag Tag to fetch via bam_aux_get 163 | * 164 | */ 165 | uint8_t* bam_get_tag_caseinsensitive(const bam1_t* b, char* tag) { 166 | 167 | uint8_t* ret; 168 | char upper_tag[3]; 169 | char lower_tag[3]; 170 | upper_tag[2] = '\0'; 171 | lower_tag[2] = '\0'; 172 | for (int i = 0; i < 2; i++) { 173 | upper_tag[i] = toupper(tag[i]); 174 | lower_tag[i] = tolower(tag[i]); 175 | } 176 | // Try uppercase variant 177 | ret = bam_aux_get((const bam1_t*) b, upper_tag); 178 | if (ret == NULL){ 179 | // Try lowercase variant 180 | ret = bam_aux_get((const bam1_t*) b, lower_tag); 181 | } 182 | return ret; 183 | } 184 | -------------------------------------------------------------------------------- /src/bamstats/bamiter.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODBAMBED_BAMITER_H 2 | #define _MODBAMBED_BAMITER_H 3 | 4 | #include 5 | #include 6 | #include "htslib/sam.h" 7 | 8 | // parameters for bam iteration 9 | typedef struct { 10 | htsFile *fp; 11 | hts_idx_t *idx; 
12 | sam_hdr_t *hdr; 13 | hts_itr_t *iter; 14 | int min_start; 15 | int min_mapQ; 16 | char tag_name[2]; 17 | int tag_value; 18 | bool keep_missing; 19 | const char *read_group; 20 | } mplp_data; 21 | 22 | /** Set up a bam file for reading (filtered) records. 23 | * 24 | * @param fp htsFile pointer 25 | * @param idx hts_idx_t pointer 26 | * @param hdr sam_hdr_t pointer 27 | * @param chr bam target name. 28 | * @param start start position of chr to consider. 29 | * @param end end position of chr to consider. 30 | * @param overlap_start whether reads overhanging start should be included. 31 | * @param read_group by which to filter alignments. 32 | * @param tag_name by which to filter alignments. 33 | * @param tag_value associated with tag_name. 34 | * 35 | * The return value can be freed with destroy_bam_iter_data. 36 | * 37 | */ 38 | mplp_data *create_bam_iter_data( 39 | htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, 40 | const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start, 41 | const char *read_group, const char tag_name[2], const int tag_value); 42 | 43 | /** Clean up auxiliary bam reading data. 44 | * 45 | * @param data auxiliary structure to clean. 46 | * 47 | */ 48 | void destroy_bam_iter_data(mplp_data *data); 49 | 50 | /** Read a bam record. 51 | * 52 | * @param data an mplp_data encoding the bam file to read with filter options. 53 | * @param b output pointer. 54 | * 55 | */ 56 | int read_bam(void *data, bam1_t *b); 57 | 58 | /** Create an map of query position to reference position 59 | * 60 | * @param b alignment record 61 | * 62 | * The length of the returned array is b->core->l_qlen. 
63 | */ 64 | int *qpos2rpos(bam1_t *b); 65 | 66 | /** Fetch a BAM tag with case insensitivity 67 | * 68 | * @param b BAM record 69 | * @param tag Tag to fetch via bam_aux_get 70 | * 71 | */ 72 | uint8_t* bam_get_tag_caseinsensitive(const bam1_t* b, char* tag); 73 | 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /src/bamstats/main.c: -------------------------------------------------------------------------------- 1 | // bamstats program 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "htslib/faidx.h" 10 | #include "htslib/sam.h" 11 | #include "htslib/thread_pool.h" 12 | 13 | #include "args.h" 14 | #include "common.h" 15 | #include "readstats.h" 16 | #include "regiter.h" 17 | 18 | 19 | void write_header(const char* sample) { 20 | char *sn = sample == NULL ? "" : "\tsample_name"; 21 | fprintf(stdout, 22 | "name\trunid%s\tref\tcoverage\tref_coverage\t"\ 23 | "qstart\tqend\trstart\trend\t"\ 24 | "aligned_ref_len\tdirection\tlength\tread_length\tmean_quality\tstart_time\t"\ 25 | "match\tins\tdel\tsub\tiden\tacc\tduplex\n", 26 | sn); 27 | } 28 | 29 | // stats array should have 8 entries 30 | // total, primary, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, BAM_FDUP, unused 31 | // note: HTS spec makes a distinction between "unmapped" (flag & 4) and "unplaced". Unplaced 32 | // are not necessarily unmapped but lack definitive coords, this is mainly for paired-end 33 | // but we'll keep the distinction here. 34 | void write_stats_header(FILE* fh, const char* sample) { 35 | char *sn = sample == NULL ? 
"" : "\tsample_name"; 36 | fprintf(fh, "ref%s\ttotal\tprimary\tsecondary\tsupplementary\tunmapped\tqcfail\tduplicate\tduplex\tduplex_forming\n", sn); 37 | } 38 | 39 | static inline void write_stats(size_t *stats, const char* chr, const char* sample, FILE* fh) { 40 | if (fh != NULL) { 41 | if (sample == NULL) { 42 | fprintf(fh, 43 | "%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n", 44 | chr, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6], stats[7], stats[8] 45 | ); 46 | } else { 47 | fprintf(fh, 48 | "%s\t%s\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\t%zu\n", 49 | chr, sample, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5], stats[6], stats[7], stats[8] 50 | ); 51 | } 52 | } 53 | } 54 | 55 | static inline void write_counter(const char* fname, kh_counter_t *counter, const char* sample, const char* bam_fname, const char* column_name) { 56 | FILE* stats_fp = fopen(fname, "w"); 57 | fprintf(stats_fp, "filename\t"); 58 | if (sample != NULL) fprintf(stats_fp, "sample_name\t"); 59 | fprintf(stats_fp, "%s\tcount\n", column_name); 60 | for (khiter_t k = 0; k < kh_end(counter); ++k) { 61 | if (kh_exist(counter, k)) { 62 | fprintf(stats_fp, "%s\t", bam_fname); 63 | if (sample != NULL) fprintf(stats_fp, "%s\t", sample); 64 | fprintf(stats_fp, "%s\t%d\n", kh_key(counter, k), kh_val(counter, k)); 65 | } 66 | } 67 | fclose(stats_fp); 68 | } 69 | 70 | 71 | void write_hist_stats(read_stats* stats, char* prefix, char* name) { 72 | char* path = calloc(strlen(prefix) + strlen(name) + 2, sizeof(char)); 73 | sprintf(path, "%s/%s", prefix, name); 74 | FILE* fp = fopen(path, "w"); 75 | print_stats(stats, false, true, fp); 76 | fclose(fp); free(path); 77 | } 78 | 79 | 80 | int main(int argc, char *argv[]) { 81 | clock_t begin = clock(); 82 | arguments_t args = parse_arguments(argc, argv); 83 | #ifdef NOTHREADS 84 | if (args.threads != 1) { 85 | fprintf( 86 | stderr, 87 | "--threads set to %d, but threading not supported by this build.\n", args.threads); 88 | 
} 89 | #endif 90 | 91 | // large basecaller runs can produce more files than a single 92 | // process can open, check this ahead of time. 93 | #ifndef WASM 94 | struct rlimit reslimit; 95 | size_t nfile = 0; for (; args.bam[nfile]; nfile++); 96 | if (getrlimit(RLIMIT_NOFILE, &reslimit) == 0) { 97 | if (nfile * args.threads > reslimit.rlim_cur - 100) { 98 | fprintf(stderr, 99 | "ERROR: Too many BAM files provided (%zu). Try running " 100 | "samtools merge on subsets of files to produce fewer files", nfile); 101 | exit(EXIT_FAILURE); 102 | } 103 | } 104 | #endif 105 | 106 | int rtn = mkdir_hier(args.histograms); 107 | if (rtn == -1) { 108 | fprintf(stderr, 109 | "Error: Cannot create output directory '%s'. Check location is writeable and directory does not exist.\n", 110 | args.histograms); 111 | exit(EXIT_FAILURE); 112 | } 113 | 114 | if (nfile > 1) { 115 | fprintf(stderr, "ERROR: Multiple input files detected, this program currently supports only a single file.\n"); 116 | exit(EXIT_FAILURE); 117 | } 118 | 119 | write_header(args.sample); 120 | 121 | htsFile *fp = hts_open(args.bam[0], "rb"); 122 | sam_hdr_t *hdr = sam_hdr_read(fp); 123 | if (hdr == 0 || fp == 0) { 124 | fprintf(stderr, "Failed to read .bam file '%s'.\n", args.bam[0]); 125 | exit(EXIT_FAILURE); 126 | } 127 | 128 | htsThreadPool p = {NULL, 0}; 129 | if (args.threads > 1 ) { 130 | fprintf(stderr, "Using %d threads\n", args.threads); 131 | p.pool = hts_tpool_init(args.threads); 132 | hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p); 133 | } 134 | 135 | FILE* flagstats = NULL; 136 | flag_stats* flag_counts = NULL; 137 | if (args.flagstats != NULL) { 138 | flagstats = fopen(args.flagstats, "w"); 139 | write_stats_header(flagstats, args.sample); 140 | flag_counts = create_flag_stats( 141 | args.region == NULL ? 
hdr->n_targets : 1, args.unmapped 142 | ); 143 | } 144 | 145 | kh_counter_t *run_ids = kh_counter_init(); 146 | kh_counter_t *basecallers = kh_counter_init(); 147 | read_stats* length_stats = create_length_stats(); 148 | read_stats* qual_stats = create_qual_stats(QUAL_HIST_WIDTH); 149 | read_stats* acc_stats = create_qual_stats(ACC_HIST_WIDTH); 150 | read_stats* cov_stats = create_qual_stats(COV_HIST_WIDTH); 151 | read_stats* polya_stats = args.poly_a ? create_length_stats() : NULL; 152 | 153 | // Prepare also for the unmapped reads 154 | read_stats* length_stats_unmapped = create_length_stats(); 155 | read_stats* qual_stats_unmapped = create_qual_stats(QUAL_HIST_WIDTH); 156 | 157 | if (args.region == NULL && args.bed == NULL) { 158 | // iterate over the entire file 159 | process_bams( 160 | fp, NULL, hdr, args.sample, 161 | NULL, 0, INT64_MAX, true, 162 | args.read_group, args.tag_name, args.tag_value, 163 | flag_counts, args.unmapped, 164 | length_stats, qual_stats, acc_stats, cov_stats, 165 | length_stats_unmapped, qual_stats_unmapped, 166 | polya_stats, args.poly_a_cover, args.poly_a_qual, args.poly_a_rev, 167 | run_ids, basecallers, args.force_recalc_qual); 168 | 169 | // write flagstat counts if requested 170 | if (flag_counts != NULL) { 171 | for (int i=0; i < hdr->n_targets; ++i) { 172 | const char* chr = sam_hdr_tid2name(hdr, i); 173 | write_stats(flag_counts->counts[i], chr, args.sample, flagstats); 174 | } 175 | if (args.unmapped) { 176 | write_stats(flag_counts->unmapped, "*", args.sample, flagstats); 177 | } 178 | } 179 | } else { 180 | // process given region / BED 181 | hts_idx_t *idx = sam_index_load(fp, args.bam[0]); 182 | if (idx == 0){ 183 | fprintf(stderr, "Cannot find index file for '%s', which is required for processing by region.\n", args.bam[0]); 184 | exit(EXIT_FAILURE); 185 | } 186 | 187 | regiter rit = init_region_iterator(args.bed, args.region, hdr); 188 | int check = 0; 189 | while ((check = next_region(&rit)) != -1) { 190 | if (check 
== -2 && args.bed == NULL) { 191 | // we were given only a region, not a bed, and that region was garbage 192 | // => user error, should stop immediately 193 | exit(EXIT_FAILURE); 194 | } 195 | if (check != 0) continue; // skip other errors 196 | 197 | process_bams( 198 | fp, idx, hdr, args.sample, 199 | rit.chr, rit.start, rit.end, true, 200 | args.read_group, args.tag_name, args.tag_value, 201 | flag_counts, args.unmapped, 202 | length_stats, qual_stats, acc_stats, cov_stats, 203 | length_stats_unmapped, qual_stats_unmapped, 204 | polya_stats, args.poly_a_cover, args.poly_a_qual, args.poly_a_rev, 205 | run_ids, basecallers, args.force_recalc_qual); 206 | if (flag_counts != NULL) { 207 | // TODO: regions might not be whole chromosomes... 208 | write_stats(flag_counts->counts[0], rit.chr, args.sample, flagstats); 209 | } 210 | } 211 | fprintf(stderr, "Processed %d regions\n", rit.n_regions); 212 | 213 | destroy_region_iterator(&rit); 214 | hts_idx_destroy(idx); 215 | } 216 | 217 | write_hist_stats(length_stats, args.histograms, "length.hist"); 218 | write_hist_stats(qual_stats, args.histograms, "quality.hist"); 219 | write_hist_stats(acc_stats, args.histograms, "accuracy.hist"); 220 | write_hist_stats(cov_stats, args.histograms, "coverage.hist"); 221 | if (polya_stats != NULL) { 222 | write_hist_stats(polya_stats, args.histograms, "polya.hist"); 223 | } 224 | 225 | // Save also histograms for the unmapped reads if requested 226 | // and if the user is not asking for a region 227 | if (args.unmapped && args.region == NULL){ 228 | write_hist_stats(length_stats_unmapped, args.histograms, "length.unmap.hist"); 229 | write_hist_stats(qual_stats_unmapped, args.histograms, "quality.unmap.hist"); 230 | } 231 | 232 | // write runids summary 233 | if (args.runids != NULL) { 234 | write_counter(args.runids, run_ids, args.sample, args.bam[0], "run_id"); 235 | } 236 | // write basecallers summary 237 | if (args.basecallers != NULL) { 238 | write_counter(args.basecallers, 
basecallers, args.sample, args.bam[0], "basecaller"); 239 | } 240 | 241 | destroy_length_stats(length_stats); 242 | destroy_qual_stats(qual_stats); 243 | destroy_qual_stats(acc_stats); 244 | destroy_qual_stats(cov_stats); 245 | destroy_length_stats(length_stats_unmapped); 246 | destroy_qual_stats(qual_stats_unmapped); 247 | destroy_length_stats(polya_stats); 248 | kh_counter_destroy(basecallers); 249 | kh_counter_destroy(run_ids); 250 | 251 | if (flagstats != NULL) { 252 | fclose(flagstats); 253 | } 254 | 255 | if (flag_counts != NULL) destroy_flag_stats(flag_counts); 256 | sam_hdr_destroy(hdr); 257 | hts_close(fp); 258 | if (p.pool) { // must be after fp 259 | hts_tpool_destroy(p.pool); 260 | } 261 | 262 | clock_t end = clock(); 263 | fprintf(stderr, "Total CPU time: %fs\n", (double)(end - begin) / CLOCKS_PER_SEC); 264 | exit(EXIT_SUCCESS); 265 | } 266 | -------------------------------------------------------------------------------- /src/bamstats/readstats.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "htslib/sam.h" 14 | #include "htslib/faidx.h" 15 | #include "thread_pool_internal.h" 16 | 17 | #include "../common.h" 18 | #include "../stats.h" 19 | #include "../kh_counter.h" 20 | #include "bamiter.h" 21 | #include "readstats.h" 22 | #include "args.h" 23 | 24 | #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) 25 | #define bam1_seqi(s, i) (bam_seqi((s), (i))) 26 | #define bam_nt16_rev_table seq_nt16_str 27 | #define bam_nt16_table seq_nt16_table 28 | 29 | 30 | static const int NOTPRIMARY = BAM_FSUPPLEMENTARY | BAM_FSECONDARY | BAM_FUNMAP; 31 | // counting alignment flags 32 | // total, primary, ..., unused 33 | static const size_t FLAG_MASK[8] = { 34 | 0, 0, BAM_FSECONDARY, BAM_FSUPPLEMENTARY, BAM_FUNMAP, BAM_FQCFAIL, 
BAM_FDUP, 0 35 | }; 36 | 37 | 38 | /** Initialise flagstat counts struct for a BAM file. 39 | * 40 | * @param n_refs number of reference sequences. 41 | * 42 | */ 43 | flag_stats* create_flag_stats(size_t n_refs, bool store_unmapped) { 44 | flag_stats* stats = xalloc(1, sizeof(flag_stats), "flagstat"); 45 | stats->n_refs = n_refs; 46 | stats->counts = xalloc(n_refs, sizeof(size_t*), "flagstat"); 47 | stats->unmapped = store_unmapped ? xalloc(8, sizeof(size_t), "flagstat") : NULL; 48 | 49 | for (size_t i = 0; i < n_refs; i++) { 50 | stats->counts[i] = xalloc(8, sizeof(size_t), "flagstat"); 51 | } 52 | 53 | return stats; 54 | } 55 | 56 | /** Clean up flagstat counts. 57 | * 58 | * @param stats flagstat counts structure to clean. 59 | * 60 | */ 61 | void destroy_flag_stats(flag_stats* stats) { 62 | for (size_t i = 0; i < stats->n_refs; i++) { 63 | free(stats->counts[i]); 64 | } 65 | free(stats->counts); 66 | free(stats->unmapped); 67 | free(stats); 68 | } 69 | 70 | 71 | // Count number of each cigar operation in an alignment 72 | inline size_t* create_cigar_stats (bam1_t* b) { 73 | static const size_t NCODES = 10; 74 | size_t* stats = xalloc(NCODES, sizeof(size_t*), "read stats"); 75 | uint32_t *cigar = bam_get_cigar(b); 76 | for (size_t i = 0; i < b->core.n_cigar; ++i){ 77 | uint32_t op = bam_cigar_op(cigar[i]); 78 | uint32_t len = bam_cigar_oplen(cigar[i]); 79 | stats[op] += len; 80 | } 81 | return stats; 82 | } 83 | 84 | // Find the first aligned position of the query sequence 85 | inline size_t get_query_start (bam1_t* b) { 86 | uint32_t start_offset = 0; 87 | uint32_t qlen = b->core.l_qseq; 88 | uint32_t *cigar = bam_get_cigar(b); 89 | for (size_t i = 0; i < b->core.n_cigar; ++i){ 90 | uint32_t op = bam_cigar_op(cigar[i]); 91 | if (op == BAM_CHARD_CLIP) { 92 | if ((start_offset != 0) && (start_offset != qlen)) { 93 | fprintf(stderr, "Invalid clipping in cigar string.\n"); 94 | exit(EXIT_FAILURE); 95 | } 96 | } else if (op == BAM_CSOFT_CLIP) { 97 | start_offset += 
bam_cigar_oplen(cigar[i]); 98 | } else { 99 | break; 100 | } 101 | } 102 | return start_offset; 103 | } 104 | 105 | // Find the last aligned position of the query sequence 106 | inline size_t get_query_end(bam1_t* b) { 107 | // TODO: assume l_qseq correct 108 | uint32_t end_offset = b->core.l_qseq; 109 | uint32_t qlen = end_offset; 110 | uint32_t *cigar = bam_get_cigar(b); 111 | for (int i=b->core.n_cigar - 1; i >= 0; --i){ 112 | uint32_t op = bam_cigar_op(cigar[i]); 113 | if (op == BAM_CHARD_CLIP) { 114 | if (end_offset != qlen) { 115 | fprintf(stderr, "Invalid clipping in cigar string.\n"); 116 | exit(EXIT_FAILURE); 117 | } 118 | } else if (op == BAM_CSOFT_CLIP) { 119 | end_offset -= bam_cigar_oplen(cigar[i]); 120 | } else { 121 | break; 122 | } 123 | } 124 | return end_offset; 125 | } 126 | 127 | 128 | static inline void process_flagstat_counts(const bam1_t* b, size_t* counts, const int duplex_code ) { 129 | counts[0] += 1; 130 | counts[1] += ((b->core.flag & (NOTPRIMARY)) == 0); 131 | for (size_t i=2; i<6; ++i){ 132 | counts[i] += ((b->core.flag & FLAG_MASK[i]) != 0); 133 | } 134 | counts[7] += (duplex_code == 1); 135 | counts[8] += (duplex_code == -1); 136 | } 137 | 138 | 139 | // see section 4.2.4 of the SAM spec for more details 140 | #define IS_INTEGER_TAG(t) ((t) == 'i' || (t) == 'I' || (t) == 'c' || (t) == 'C' || (t) == 's' || (t) == 'S') 141 | 142 | #define N_TAGS 8 143 | typedef struct { 144 | char *RG; // read group 145 | char *RD; // read group (old skool) 146 | char *st; // start time 147 | int NM; // edit distance 148 | int pi; // parent read 149 | int pt; // poly-t/a tail length 150 | float qs; // quality score 151 | int dx; // duplex 152 | } bam_tags_t; 153 | 154 | 155 | // Function to fetch tags from a bam1_t record 156 | bam_tags_t fetch_bam_tags(const bam1_t *b, const bam_hdr_t *header) { 157 | // default duplex tag to simple read, everything else as invalid 158 | bam_tags_t tags = {NULL, NULL, NULL, -1, -1, -1, -1, 0}; 159 | 160 | uint8_t *aux 
= bam_aux_first(b); 161 | int n_tags = 0; 162 | while (n_tags < 100 && aux != NULL) { 163 | n_tags++; 164 | const char *t = bam_aux_tag(aux); 165 | char tag[3]; 166 | tag[2] = '\0'; 167 | for (int i = 0; i < 2; i++) { 168 | tag[i] = toupper(t[i]); 169 | } 170 | uint8_t type = bam_aux_type(aux); 171 | 172 | // do this here to avoid repeating below 173 | int ival = -1; 174 | bool ierr = false; 175 | if (IS_INTEGER_TAG(type)) { 176 | errno = 0; 177 | ival = bam_aux2i(aux); 178 | ierr = (ival == 0 && errno == EINVAL); 179 | } 180 | 181 | if ((strcmp(tag, "RG") == 0) && (tags.RG == NULL) && type == 'Z') { 182 | tags.RG = strdup(bam_aux2Z(aux)); 183 | } else if ((strcmp(tag, "RD") == 0) && (tags.RD == NULL) && type == 'Z') { 184 | tags.RD = strdup(bam_aux2Z(aux)); 185 | } else if ((strcmp(tag, "ST") == 0) && (tags.st == NULL) && type == 'Z') { 186 | tags.st = strdup(bam_aux2Z(aux)); 187 | } else if (strcmp(tag, "NM") == 0 && !ierr) { 188 | tags.NM = ival; 189 | } else if (strcmp(tag, "PI") == 0 && !ierr) { 190 | tags.pi = ival; 191 | } else if (strcmp(tag, "PT") == 0 && !ierr) { 192 | tags.pt = ival; 193 | } else if (strcmp(tag, "DX") == 0 && !ierr) { 194 | tags.dx = ival; 195 | } else if (strcmp(tag, "QS") == 0 && (type == 'f')) { 196 | errno = 0; 197 | tags.qs = bam_aux2f(aux); 198 | if (tags.qs == 0 && errno == EINVAL) { 199 | tags.qs = -1; 200 | } 201 | } 202 | else { 203 | // we added above when we shouldn't have 204 | n_tags--; 205 | } 206 | 207 | aux = bam_aux_next(b, aux); 208 | } 209 | 210 | // Check we have all the tags we need 211 | // note theres weird corner case of duplicate tags, but when does that happen? 
212 | bool good_align = ((b->core.flag & (NOTPRIMARY | BAM_FQCFAIL | BAM_FDUP)) == 0); 213 | if (good_align && (tags.NM == -1)) { 214 | fprintf(stderr, "Read '%s' does not contain an integer 'NM' tag.\n", bam_get_qname(b)); 215 | kstring_t rec = {0, 0, NULL}; 216 | if (sam_format1(header, b, &rec) < 0) { 217 | fprintf(stderr, "Failed to format record for error message.\n"); 218 | } else { 219 | fprintf(stderr, "%s\n", rec.s); 220 | } 221 | ks_free(&rec); 222 | exit(EXIT_FAILURE); 223 | } 224 | return tags; 225 | } 226 | 227 | 228 | void free_bam_tags(bam_tags_t *tags) { 229 | if (!tags) return; 230 | if (tags->RG) free(tags->RG); 231 | if (tags->RD) free(tags->RD); 232 | if (tags->st) free(tags->st); 233 | //free(tags); // we stack allocate all these now 234 | } 235 | 236 | 237 | // Do all-the-things 238 | void process_bams( 239 | htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample, 240 | const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start, 241 | const char *read_group, const char tag_name[2], const int tag_value, 242 | flag_stats *flag_counts, bool unmapped, 243 | read_stats* length_stats, read_stats* qual_stats, read_stats* acc_stats, read_stats* cov_stats, 244 | read_stats* length_stats_unmapped, read_stats* qual_stats_unmapped, 245 | read_stats* polya_stats, float polya_cover, float polya_qual, bool polya_rev, 246 | kh_counter_t* runids, kh_counter_t* basecallers, 247 | bool force_recalc_qual) { 248 | if (chr != NULL) { 249 | if (strcmp(chr, "*") == 0) { 250 | fprintf(stderr, "Processing: Unplaced reads\n"); 251 | } else { 252 | fprintf(stderr, "Processing: %s:%zu-%zu\n", chr, (size_t)start, (size_t)end); 253 | } 254 | } 255 | 256 | // setup bam reading - reuse our pileup structure, but actually just need iterator 257 | mplp_data* bam = create_bam_iter_data( 258 | fp, idx, hdr, 259 | chr, start, end, overlap_start, 260 | read_group, tag_name, tag_value); 261 | if (bam == NULL) return; 262 | 263 | int res; 264 | bam1_t *b = 
bam_init1(); 265 | readgroup* rg_info = NULL; 266 | char *runid = NULL; 267 | char *basecaller = NULL; 268 | char *start_time = NULL; 269 | 270 | while ((res = read_bam(bam, b) >= 0)) { 271 | // get all our tags 272 | bam_tags_t tags = fetch_bam_tags(b, hdr); 273 | 274 | // get info from readgroup, note we could use subitems from readgroup 275 | // here more directly, but this is to be consistent with fastcat where 276 | // we only have the readgroup ID string to play with 277 | runid = ""; 278 | basecaller = ""; 279 | start_time = ""; 280 | if (tags.RG != NULL) { 281 | rg_info = create_rg_info(tags.RG); 282 | if (rg_info->runid != NULL) { 283 | runid = rg_info->runid; 284 | } 285 | if (rg_info->basecaller != NULL) { 286 | basecaller = rg_info->basecaller; 287 | } 288 | } else if (tags.RD != NULL) { 289 | runid = tags.RD; 290 | } 291 | 292 | if (tags.st != NULL) { 293 | start_time = tags.st; 294 | } 295 | kh_counter_increment(runids, runid); 296 | kh_counter_increment(basecallers, basecaller); 297 | 298 | // write a record for unmapped/unplaced 299 | if (b->core.flag & BAM_FUNMAP) { 300 | if (unmapped) { 301 | // an unmapped read can still have a RNAME and POS, but we 302 | // ignore that here, because its not a thing we care about 303 | char* qname = bam_get_qname(b); 304 | uint32_t read_length = b->core.l_qseq; 305 | float mean_quality = mean_qual_from_bam(bam_get_qual(b), read_length); 306 | if (sample == NULL) { 307 | fprintf(stdout, 308 | "%s\t%s\t*\tnan\tnan\t" \ 309 | "nan\tnan\tnan\tnan\t" \ 310 | "0\t*\t0\t" \ 311 | "%u\t%.2f\t%s\t" \ 312 | "0\t0\t0\t0\tnan\tnan\t%d\n", 313 | qname, runid, //chr, coverage, ref_cover, 314 | //qstart, qend, rstart, rend, 315 | //aligned_ref_len, direction, length, 316 | read_length, mean_quality, start_time, 317 | //match, ins, delt, sub, iden, acc 318 | tags.dx 319 | ); 320 | } else { 321 | fprintf(stdout, 322 | "%s\t%s\t%s\t*\tnan\tnan\t" \ 323 | "nan\tnan\tnan\tnan\t" \ 324 | "0\t*\t0\t" \ 325 | "%u\t%.2f\t%s\t" \ 326 | 
"0\t0\t0\t0\tnan\tnan\t%d\n", 327 | qname, runid, sample, //chr, coverage, ref_cover, 328 | //qstart, qend, rstart, rend, 329 | //aligned_ref_len, direction, length, 330 | read_length, mean_quality, start_time, 331 | //match, ins, delt, sub, iden, acc 332 | tags.dx 333 | ); 334 | } 335 | // add to flagstat counts if required 336 | if (flag_counts != NULL) { 337 | process_flagstat_counts(b, flag_counts->unmapped, tags.dx); 338 | } 339 | 340 | // accumulate stats into histogram 341 | add_length_count(length_stats_unmapped, read_length); 342 | add_qual_count(qual_stats_unmapped, mean_quality); 343 | } 344 | goto FINISH_READ; 345 | } 346 | 347 | if (flag_counts != NULL) { 348 | // when we have a target region (as opposed to looping over the whole file), 349 | // `flag_counts` will only contain one (dynamic) array of counts; otherwise 350 | // there will be as many dynamic arrays as references in the BAM header 351 | size_t* counts = (chr != NULL) ? flag_counts->counts[0] 352 | : flag_counts->counts[b->core.tid]; 353 | process_flagstat_counts(b, counts, tags.dx); 354 | } 355 | 356 | // only take "good" primary alignments for further processing 357 | if (b->core.flag & (NOTPRIMARY | BAM_FQCFAIL | BAM_FDUP)) { 358 | goto FINISH_READ; 359 | } 360 | char* qname = bam_get_qname(b); 361 | 362 | size_t* stats = create_cigar_stats(b); 363 | size_t match, ins, delt; 364 | // some aligners like to get fancy 365 | match = stats[BAM_CMATCH] + stats[BAM_CEQUAL] + stats[BAM_CDIFF]; 366 | ins = stats[BAM_CINS]; 367 | delt = stats[BAM_CDEL]; 368 | size_t sub = tags.NM - ins - delt; 369 | size_t length = match + ins + delt; 370 | float iden = 100 * ((float)(match - sub)) / match; 371 | float acc = 100 - 100 * ((float)(tags.NM)) / length; 372 | // some things we've seen go wrong 373 | // explode now because there is almost certainly something wrong with the tags 374 | // and calling add_qual_count with a value less than zero will cause a segfault 375 | if (iden < 0.0 || acc < 0.0 || 
(size_t)tags.NM > length) { 376 | fprintf(stderr, "Read '%s' appears to contain implausible alignment information\n", qname); 377 | exit(EXIT_FAILURE); 378 | } 379 | // we only deal in primary/soft-clipped alignments so length 380 | // of qseq member is the length of the intact query sequence. 381 | uint32_t read_length = b->core.l_qseq; 382 | size_t qstart = get_query_start(b); 383 | size_t qend = get_query_end(b); 384 | // get mean quality score, from tag or recompute 385 | float mean_quality = tags.qs; 386 | if (mean_quality == -1 || force_recalc_qual) { 387 | mean_quality = mean_qual_from_bam_naive(bam_get_qual(b), read_length); 388 | } 389 | 390 | float coverage = 100 * ((float)(qend - qstart)) / read_length; 391 | size_t rstart = b->core.pos; 392 | size_t rend = bam_endpos(b); 393 | size_t aligned_ref_len = rend - rstart; 394 | size_t ref_length = sam_hdr_tid2len(hdr, b->core.tid); 395 | float ref_cover = 100 * ((float)(aligned_ref_len)) / ref_length; 396 | char direction = "+-"[bam_is_rev(b)]; 397 | 398 | // accumulate stats into histogram 399 | add_length_count(length_stats, read_length); 400 | add_qual_count(qual_stats, mean_quality); 401 | add_qual_count(acc_stats, acc); 402 | add_qual_count(cov_stats, coverage); 403 | 404 | // get poly-A tail length. For now we require: 405 | // i) "good" coverage on reference, i.e. "full length" 406 | // ii) read is sense strand, i.e. 
fwd alignment 407 | // iii) "good" mean quality 408 | // iv) no split reads 409 | if (polya_stats != NULL) { 410 | int polya_len = -1; 411 | if ((ref_cover >= polya_cover) 412 | && (!bam_is_rev(b) || polya_rev) 413 | && mean_quality >= polya_qual) { 414 | if (tags.pi == -1 && tags.pt >= 0) { 415 | polya_len = tags.pt; 416 | } 417 | } 418 | if (polya_len >= 0) { 419 | add_length_count(polya_stats, polya_len); 420 | } 421 | } 422 | 423 | if (sample == NULL) { 424 | fprintf(stdout, 425 | "%s\t%s\t%s\t" \ 426 | "%.4f\t%.4f\t" \ 427 | "%lu\t%lu\t%lu\t%lu\t" \ 428 | "%lu\t%c\t%lu\t%u\t%.2f\t%s\t" \ 429 | "%lu\t%lu\t%lu\t%lu\t%.2f\t%.2f\t%d\n", 430 | qname, runid, (chr != NULL) ? chr : sam_hdr_tid2name(hdr, b->core.tid), 431 | coverage, ref_cover, 432 | qstart, qend, rstart, rend, 433 | aligned_ref_len, direction, length, read_length, mean_quality, start_time, 434 | match, ins, delt, sub, iden, acc, tags.dx); 435 | } else { 436 | fprintf(stdout, 437 | "%s\t%s\t%s\t%s\t" \ 438 | "%.4f\t%.4f\t" \ 439 | "%lu\t%lu\t%lu\t%lu\t" \ 440 | "%lu\t%c\t%lu\t%u\t%.2f\t%s\t" \ 441 | "%lu\t%lu\t%lu\t%lu\t%.2f\t%.2f\t%d\n", 442 | qname, runid, sample, (chr != NULL) ? 
chr : sam_hdr_tid2name(hdr, b->core.tid), 443 | coverage, ref_cover, 444 | qstart, qend, rstart, rend, 445 | aligned_ref_len, direction, length, read_length, mean_quality, start_time, 446 | match, ins, delt, sub, iden, acc, tags.dx); 447 | } 448 | free(stats); 449 | 450 | FINISH_READ: 451 | destroy_rg_info(rg_info); 452 | rg_info = NULL; 453 | runid = NULL; 454 | basecaller = NULL; 455 | start_time = NULL; 456 | free_bam_tags(&tags); 457 | } 458 | 459 | destroy_bam_iter_data(bam); 460 | bam_destroy1(b); 461 | 462 | return; 463 | } 464 | -------------------------------------------------------------------------------- /src/bamstats/readstats.h: -------------------------------------------------------------------------------- 1 | #ifndef _BAMSTATS_STATS_H 2 | #define _BAMSTATS_STATS_H 3 | 4 | #include 5 | #include "htslib/sam.h" 6 | 7 | #include "args.h" 8 | #include "../stats.h" 9 | #include "../kh_counter.h" 10 | 11 | 12 | // struct for flagstat counts 13 | typedef struct { 14 | size_t n_refs; 15 | size_t** counts; 16 | size_t* unmapped; 17 | } flag_stats; 18 | 19 | /** Create flagstat counts struct for a BAM file. 20 | * 21 | * @param n_refs number of reference sequences. 22 | * @param store_unmapped whether to count unmapped reads. 23 | * 24 | */ 25 | flag_stats* create_flag_stats(size_t n_refs, bool store_unmapped); 26 | 27 | /** Clean up flagstat counts. 28 | * 29 | * @param stats flagstat counts structure to clean. 30 | * 31 | */ 32 | void destroy_flag_stats(flag_stats* stats); 33 | 34 | 35 | /** Generates alignment stats from a region of a bam. 36 | * 37 | * @param fp htsFile pointer 38 | * @param idx hts_idx_t pointer 39 | * @param hdr sam_hdr_t pointer 40 | * @param sample sample name. 41 | * @param chr bam target name. 42 | * @param start start position of chr to consider. 43 | * @param end end position of chr to consider. 44 | * @param overlap_start whether reads overhanging start should be included. 45 | * @param read_group by which to filter alignments. 
46 | * @param tag_name by which to filter alignments. 47 | * @param tag_value associated with tag_name. 48 | * @param flag_counts flag_stats pointer. 49 | * @param unmapped bool include unmapped reads in output. 50 | * @param length_stats read_stats* for accumulating read length information. 51 | * @param qual_stats read_stats* for accumulating read quality information. 52 | * @param acc_stats read_stats* for accumulating read alignment accuracy information. 53 | * @param cov_stats read_stats* for accumulating read alignment coverage information. 54 | * @param length_stats_unmapped read_stats* for accumulating read length information for unmapped reads. 55 | * @param qual_stats_unmapped read_stats* for accumulating read quality information for unmapped reads. 56 | * @param polya_stats read_stats* for accumulating polyA tail length information. 57 | * @param polya_cover minimum reference coverage for polyA tail length to be considered. 58 | * @param polya_qual minimum mean quality for polyA tail length to be considered. 59 | * @param polya_rev whether to allow reverse alignments for polyA tail length. 60 | * @param runids kh_counter_t* for accumulating runid information. 61 | * @param basecallers kh_counter_t* for accumulating basecaller information. 62 | * @param force_recalc_quality whether to recalculate mean quality from phred scores. 63 | * @returns void. Prints output to stdout. 
64 | * 65 | */ 66 | void process_bams( 67 | htsFile *fp, hts_idx_t *idx, sam_hdr_t *hdr, const char *sample, 68 | const char *chr, hts_pos_t start, hts_pos_t end, bool overlap_start, 69 | const char *read_group, const char tag_name[2], const int tag_value, 70 | flag_stats *flag_counts, bool unmapped, 71 | read_stats* length_stats, read_stats* qual_stats, read_stats* acc_stats, read_stats* cov_stats, 72 | read_stats* length_stats_unmapped, read_stats* qual_stats_unmapped, 73 | read_stats* polya_stats, float polya_cover, float polya_qual, bool polya_rev, 74 | kh_counter_t* runids, kh_counter_t* basecallers, 75 | bool force_recalc_quality); 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /src/common.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "common.h" 15 | 16 | 17 | /* The following two functions were adpated from: 18 | * https://gist.github.com/JonathonReinhart/8c0d90191c38af2dcadb102c4e202950 19 | */ 20 | static int maybe_mkdir(const char* path, mode_t mode) { 21 | struct stat st; 22 | errno = 0; 23 | 24 | // Try to make the directory 25 | if (mkdir(path, mode) == 0) 26 | return 0; 27 | 28 | // If it fails for any reason but EEXIST, fail 29 | if (errno != EEXIST) 30 | return -1; 31 | 32 | // Check if the existing path is a directory 33 | if (stat(path, &st) != 0) 34 | return -1; 35 | 36 | // If not, fail with ENOTDIR 37 | if (!S_ISDIR(st.st_mode)) { 38 | errno = ENOTDIR; 39 | return -1; 40 | } 41 | 42 | errno = 0; 43 | return 0; 44 | } 45 | 46 | /** mkdir a directory structure recursively, but fail if pre-exists. 
47 | * 48 | * @param path directory path to ensure exists 49 | * 50 | */ 51 | int mkdir_hier(char *path) { 52 | 53 | char *_path = NULL; 54 | char *p; 55 | int result = -1; 56 | mode_t mode = 0700; 57 | errno = 0; 58 | 59 | // if we can just make the directory, fine. If it exists 60 | // already then exit 61 | if (mkdir(path, mode) != 0 && errno == EEXIST) 62 | return -1; 63 | 64 | _path = strdup(path); 65 | if (_path == NULL) 66 | goto out; 67 | 68 | for (p = _path + 1; *p; p++) { 69 | if (*p == '/') { 70 | *p = '\0'; 71 | if (maybe_mkdir(_path, mode) != 0) 72 | goto out; 73 | *p = '/'; 74 | } 75 | } 76 | 77 | if (maybe_mkdir(_path, mode) != 0) 78 | goto out; 79 | 80 | result = 0; 81 | out: 82 | free(_path); 83 | return result; 84 | } 85 | 86 | 87 | 88 | 89 | /** Allocates zero-initialised memory with a message on failure. 90 | * 91 | * @param num number of elements to allocate. 92 | * @param size size of each element. 93 | * @param msg message to describe allocation on failure. 94 | * @returns pointer to allocated memory 95 | * 96 | */ 97 | void *xalloc(size_t num, size_t size, char* msg){ 98 | void *res = calloc(num, size); 99 | if (res == NULL){ 100 | fprintf(stderr, "Failed to allocate mem for %s\n", msg); 101 | exit(1); 102 | } 103 | return res; 104 | } 105 | 106 | 107 | /** Reallocates memory with a message on failure. 108 | * 109 | * @param ptr pointer to realloc. 110 | * @param size size of each element. 111 | * @param msg message to describe allocation on failure. 112 | * @returns pointer to allocated memory 113 | * 114 | */ 115 | void *xrealloc(void *ptr, size_t size, char* msg){ 116 | void *res = realloc(ptr, size); 117 | if (res == NULL){ 118 | fprintf(stderr, "Failed to reallocate mem for %s\n", msg); 119 | exit(1); 120 | } 121 | return res; 122 | } 123 | 124 | 125 | /** Retrieves a substring. 126 | * 127 | * @param string input string. 128 | * @param postion start position of substring. 129 | * @param length length of substring required. 
/** Retrieves a substring.
 *
 *  @param string input string.
 *  @param position start position of substring.
 *  @param length length of substring required.
 *  @returns newly allocated, NUL-terminated copy of `length` bytes starting
 *      at `string + position`, to be released with free(); NULL if `string`
 *      is NULL or the allocation fails.
 *
 *  The caller must ensure that `position + length` does not run past the end
 *  of `string`; no bounds check is made here.
 */
char *substring(char *string, size_t position, size_t length) {
    // length + 1 must not wrap around to a zero-byte allocation
    if (string == NULL || length == SIZE_MAX) {
        return NULL;
    }

    char *ptr = malloc(length + 1);
    if (ptr == NULL) {
        return NULL;
    }

    memcpy(ptr, string + position, length);
    ptr[length] = '\0';
    return ptr;
}

/** Globally replace a char in a char*
 *
 *  @param str char* source string, modified in place.
 *  @param orig original character
 *  @param rep replacement
 *  @returns number of times replacement made
 *
 *  Searching for the terminator itself (`orig == '\0'`) is rejected and
 *  reports zero replacements: strchr() would match the terminating NUL,
 *  overwriting it and continuing the scan beyond the end of the string.
 */
int replace_char(char *str, char orig, char rep) {
    if (str == NULL || orig == '\0') {
        return 0;
    }

    int n = 0;
    char *ix = str;
    while ((ix = strchr(ix, orig)) != NULL) {
        *ix++ = rep;
        n++;
    }
    return n;
}
/** Add a term to a running total using Kahan (compensated) summation.
 *
 *  @param sum running total, updated in place.
 *  @param term value to add.
 *  @param c running compensation, updated in place; must be initialised to
 *      zero before the first call and passed unchanged between calls.
 *
 *  After each call `*c` holds the rounding error of the addition, i.e.
 *  (what was actually added to the total) - (what should have been added).
 *  That error is therefore *subtracted* from the next term so that the
 *  low-order bits lost in one step are restored in the following one.
 *
 *  The function is an ordinary external definition (no bare `inline`), so
 *  that a definition is always emitted for callers in other translation
 *  units.
 */
void kahan_sum(double* sum, double term, double* c) {
    double y = term - *c;     // term corrected by the previously lost part
    double t = *sum + y;      // low-order bits of y may be lost here
    *c = (t - *sum) - y;      // recover (negated) what was lost
    *sum = t;
}
292 | // 293 | readgroup* create_rg_info(char* rg) { 294 | readgroup* rg_info = xalloc(1, sizeof(readgroup), "readgroup"); 295 | rg_info->readgroup = strdup(rg); 296 | rg_info->runid = NULL; 297 | rg_info->basecaller = NULL; 298 | rg_info->modcaller = NULL; 299 | rg_info->barcode = NULL; 300 | 301 | // first strip of `-ABCDEF` from the end 302 | strip_hex_suffix(rg_info->readgroup); 303 | 304 | // I tried to do this with regex, but even chatGPT couldn't give me a 305 | // POSIX regex that would work. So we'll do it manually (and somewhat 306 | // more understandably/controllably) 307 | 308 | // runid runs to first `_` 309 | rg_info->runid = rg_info->readgroup; 310 | char* delim = strchr(rg_info->runid, '_'); 311 | if (delim == NULL) { 312 | return rg_info; 313 | } 314 | delim[0] = '\0'; 315 | // ensure runid is long enough to be an acquisition sha or protocol uuid 316 | int runid_l = strlen(rg_info->runid); 317 | if (runid_l != 36 && runid_l != 40) { 318 | // free the mutated copy, and reset 319 | free(rg_info->readgroup); 320 | rg_info->readgroup = strdup(rg); 321 | rg_info->runid = NULL; 322 | return rg_info; 323 | } 324 | // basecaller + modcaller runs to first `_` after last `@` 325 | // though barcode is optional, so there may not 326 | // be a `_` after the last `@` 327 | rg_info->basecaller = delim + 1; 328 | delim = strrchr(rg_info->basecaller, '@'); 329 | if (delim == NULL) { 330 | rg_info->basecaller = NULL; 331 | return rg_info; 332 | } 333 | // modcaller is optional, we can detect its presence by more than one `@` in basecaller 334 | char* delim1 = strchr(rg_info->basecaller, '@'); 335 | if (delim1 == delim) { // only one `@` 336 | rg_info->modcaller = NULL; 337 | } else { 338 | delim1 = strchr(delim1, '_'); // modcaller starts at `_` after first `@` 339 | delim1[0] = '\0'; 340 | rg_info->modcaller = delim1 + 1; 341 | } 342 | delim = strchr(delim, '_'); 343 | // barcode is optional 344 | if (delim) { 345 | delim[0] = '\0'; 346 | rg_info->barcode = delim 
+ 1; 347 | } 348 | return rg_info; 349 | } 350 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef _FASTCAT_COMMON_H 2 | #define _FASTCAT_COMMON_H 3 | 4 | #include 5 | 6 | 7 | /** Simple min/max 8 | * @param a 9 | * @param b 10 | * 11 | * @returns the min/max of a and b 12 | * 13 | */ 14 | #define min(a, b) ({ \ 15 | typeof (a) _a = (a); \ 16 | typeof (b) _b = (b); \ 17 | _a < _b ? _a : _b; \ 18 | }) 19 | #define max(a, b) ({ \ 20 | typeof (a) _a = (a); \ 21 | typeof (b) _b = (b); \ 22 | _a > _b ? _a : _b; \ 23 | }) 24 | 25 | /** mkdir a directory structure recursively, but fail if pre-exists. 26 | * 27 | * @param path directory path to ensure exists 28 | * 29 | */ 30 | int mkdir_hier(char* path); 31 | 32 | /** Allocates zero-initialised memory with a message on failure. 33 | * 34 | * @param num number of elements to allocate. 35 | * @param size size of each element. 36 | * @param msg message to describe allocation on failure. 37 | * @returns pointer to allocated memory 38 | * 39 | */ 40 | void *xalloc(size_t num, size_t size, char* msg); 41 | 42 | 43 | /** Reallocates memory with a message on failure. 44 | * 45 | * @param ptr pointer to realloc. 46 | * @param size size of each element. 47 | * @param msg message to describe allocation on failure. 48 | * @returns pointer to allocated memory 49 | * 50 | */ 51 | void *xrealloc(void *ptr, size_t size, char* msg); 52 | 53 | 54 | /** Retrieves a substring. 55 | * 56 | * @param string input string. 57 | * @param postion start position of substring. 58 | * @param length length of substring required. 59 | * @returns string pointer. 
60 | * 61 | */ 62 | char *substring(char *string, size_t position, size_t length); 63 | 64 | /** Globally replace a char in a char* 65 | * 66 | * @param str char* source string 67 | * @param orig original character 68 | * @param rep replacement 69 | * @returns number of times replacement made 70 | * 71 | */ 72 | int replace_char(char *str, char orig, char rep); 73 | 74 | 75 | // https://en.wikipedia.org/wiki/Kahan_summation_algorithm 76 | void kahan_sum(double* sum, double term, double* c); 77 | 78 | float mean_qual(char* qual, size_t len); 79 | float mean_qual_naive(char* qual, size_t len); 80 | float mean_qual_from_bam(uint8_t* qual, size_t len); 81 | float mean_qual_from_bam_naive(uint8_t* qual, size_t len); 82 | 83 | typedef struct readgroup { 84 | char* readgroup; 85 | char* runid; 86 | char* basecaller; 87 | char* modcaller; 88 | char* barcode; 89 | } readgroup; 90 | 91 | readgroup* create_rg_info(char* rg); 92 | void destroy_rg_info(readgroup* rg); 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- /src/fastcat/args.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "args.h" 5 | #include "../version.h" 6 | 7 | const char *argp_program_bug_address = "chris.wright@nanoporetech.com"; 8 | static char doc[] = 9 | "fastcat -- concatenate and summarise .fastq(.gz) files.\ 10 | \vInput files may be given on stdin by specifing the input as '-'. \ 11 | Also accepts directories as input and looks for .fastq(.gz) files in \ 12 | the top-level directory. Recurses into sub-directories when the \ 13 | -x option is given. 
The command \ 14 | will exit non-zero if any file encountered cannot be read."; 15 | static char args_doc[] = "reads1.fastq(.gz) reads2.fastq(.gz) dir-with-fastq ..."; 16 | static struct argp_option options[] = { 17 | {0, 0, 0, 0, 18 | "General options:", 0}, 19 | {"recurse", 'x', 0, 0, 20 | "Search directories recursively for '.fastq', '.fq', '.fastq.gz', and '.fq.gz' files.", 0}, 21 | {"threads", 't', "THREADS", 0, 22 | "Number of threads for output compression (only with --bam_out.", 0}, 23 | {0, 0, 0, 0, 24 | "Output options:", 0}, 25 | {"sample", 's', "SAMPLE NAME", 0, 26 | "Sample name (if given, adds a 'sample_name' column).", 0}, 27 | {"reads_per_file", 'c', "NUM", 0, 28 | "Split reads into files with a set number of reads (default: single file).", 0}, 29 | {"reheader", 'H', 0, 0, 30 | "Rewrite fastq header comments as SAM tags (useful for passing through minimap2).", 0}, 31 | {"bam_out", 'B', 0, 0, 32 | "Output data as unaligned BAM.", 0}, 33 | {"verbose", 'v', 0, 0, 34 | "Verbose output.", 0}, 35 | {0, 0, 0, 0, 36 | "Output file selection:", 0}, 37 | {"read", 'r', "READ SUMMARY", 0, 38 | "Per-read summary output", 0}, 39 | {"file", 'f', "FILE SUMMARY", 0, 40 | "Per-file summary output", 0}, 41 | {"runids", 'i', "ID SUMMARY", 0, 42 | "Run ID summary output", 0}, 43 | {"basecallers", 'l', "CALLER SUMMARY", 0, 44 | "Basecaller mode summary output", 0}, 45 | {"demultiplex", 'd', "OUT DIR", 0, 46 | "Separate barcoded samples using fastq header information. Option value is top-level output directory.", 0}, 47 | {"histograms", 0x400, "DIRECTORY", 0, 48 | "Directory for outputting histogram information. When --demultiplex is enabled histograms are written to per-sample demultiplexed output directories. 
(default: fastcat-histograms)", 0}, 49 | {0, 0, 0, 0, 50 | "Read filtering options:", 0}, 51 | {"min_length", 'a', "MIN READ LENGTH", 0, 52 | "minimum read length to output (excluded reads remain listed in summaries).", 0}, 53 | {"max_length", 'b', "MAX READ LENGTH", 0, 54 | "maximum read length to output (excluded reads remain listed in summaries).", 0}, 55 | {"min_qscore", 'q', "MIN READ QSCOROE", 0, 56 | "minimum read Qscore to output (excluded reads remain listed in summaries).", 0}, 57 | { 0 } 58 | }; 59 | 60 | 61 | static error_t parse_opt (int key, char *arg, struct argp_state *state) { 62 | arguments_t *arguments = state->input; 63 | switch (key) { 64 | case 'r': 65 | arguments->perread = arg; 66 | break; 67 | case 'f': 68 | arguments->perfile = arg; 69 | break; 70 | case 'i': 71 | arguments->runids = arg; 72 | break; 73 | case 'l': 74 | arguments->basecallers = arg; 75 | break; 76 | case 's': 77 | arguments->sample = arg; 78 | break; 79 | case 'a': 80 | arguments->min_length = atoi(arg); 81 | break; 82 | case 'b': 83 | arguments->max_length = atoi(arg); 84 | break; 85 | case 'c': 86 | arguments->reads_per_file = atoi(arg); 87 | break; 88 | case 'd': 89 | arguments->demultiplex_dir = arg; 90 | break; 91 | case 0x400: 92 | arguments->histograms = arg; 93 | break; 94 | case 'q': 95 | arguments->min_qscore = (float)atof(arg); 96 | break; 97 | case 'x': 98 | arguments->recurse = -1; // 0: stops recursion 99 | break; 100 | case 'H': 101 | arguments->reheader = 1; 102 | break; 103 | case 'B': 104 | arguments->write_bam = 1; 105 | break; 106 | case 'v': 107 | arguments->verbose = 1; 108 | break; 109 | case 't': 110 | arguments->threads = atoi(arg); 111 | break; 112 | case ARGP_KEY_NO_ARGS: 113 | argp_usage (state); 114 | break; 115 | case ARGP_KEY_ARG: 116 | arguments->files = &state->argv[state->next - 1]; 117 | state->next = state->argc; 118 | break; 119 | default: 120 | return ARGP_ERR_UNKNOWN; 121 | } 122 | return 0; 123 | } 124 | 125 | static struct argp argp 
= {options, parse_opt, args_doc, doc, 0, 0, 0}; 126 | 127 | 128 | arguments_t parse_arguments(int argc, char** argv) { 129 | arguments_t args; 130 | args.perread = NULL; 131 | args.perfile = NULL; 132 | args.runids = NULL; 133 | args.basecallers = NULL; 134 | args.sample = ""; 135 | args.min_length = 0; 136 | args.max_length = (size_t)-1; 137 | args.min_qscore = 0; 138 | args.recurse = 1; // always allow descent into TLD 139 | args.demultiplex_dir = NULL; 140 | args.histograms = "fastcat-histograms"; 141 | args.reheader = 0; 142 | args.write_bam = 0; 143 | args.threads = 1; 144 | args.reads_per_file = 0; 145 | argp_parse(&argp, argc, argv, 0, 0, &args); 146 | return args; 147 | } 148 | -------------------------------------------------------------------------------- /src/fastcat/args.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTCAT_ARGS_H 2 | #define FASTCAT_ARGS_H 3 | #include 4 | 5 | 6 | typedef struct arguments { 7 | char *perread; 8 | char *perfile; 9 | char *runids; 10 | char *basecallers; 11 | char *sample; 12 | size_t min_length; 13 | size_t max_length; 14 | float min_qscore; 15 | int recurse; 16 | size_t reheader; 17 | size_t write_bam; 18 | char* demultiplex_dir; 19 | char* histograms; 20 | char **files; 21 | size_t reads_per_file; 22 | int threads; 23 | bool verbose; 24 | } arguments_t; 25 | 26 | arguments_t parse_arguments(int argc, char** argv); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/fastcat/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "htslib/kseq.h" 12 | KSEQ_INIT(gzFile, gzread) 13 | #define KSEQ_DECLARED 14 | 15 | #include "../common.h" 16 | #include "../fastqcomments.h" 17 | #include "../kh_counter.h" 18 | #include "args.h" 19 | #include "writer.h" 20 | 21 | 
22 | const char filetypes[4][9] = {".fastq", ".fq", ".fastq.gz", ".fq.gz"}; 23 | size_t nfiletypes = 4; 24 | 25 | // defined below -- recursion 26 | int process_file(char* fname, writer writer, arguments_t *args, int recurse); 27 | 28 | int process_dir(const char *name, writer writer, arguments_t *args, int recurse) { 29 | int status = 0; 30 | DIR *dir; 31 | struct dirent *entry; 32 | char* search; 33 | 34 | // read all files in directory 35 | if (!(dir = opendir(name))) { 36 | fprintf(stderr, "Error: could not process directory %s: %s\n", name, strerror(errno)); 37 | return errno; 38 | } 39 | while ((entry = readdir(dir)) != NULL) { 40 | char *path = calloc(strlen(name) + strlen(entry->d_name) + 2, sizeof(char)); 41 | sprintf(path, "%s/%s", name, entry->d_name); 42 | if ((entry->d_type == DT_DIR) && (recurse != 0)) { 43 | // skip 44 | } else { 45 | for (size_t i=0; id_name, filetypes[i]); 47 | if (search != NULL) { 48 | if (args->verbose) { 49 | fprintf(stderr, "Processing %s\n", path); 50 | } 51 | int rtn = process_file(path, writer, args, recurse - 1); 52 | status = max(status, rtn); 53 | break; 54 | } 55 | } 56 | } 57 | free(path); 58 | } 59 | closedir(dir); 60 | 61 | // start again and look at child directories 62 | if (!(dir = opendir(name))) { 63 | fprintf(stderr, "Error: could not process directory %s: %s\n", name, strerror(errno)); 64 | return errno; 65 | } 66 | while ((entry = readdir(dir)) != NULL) { 67 | char *path = calloc(strlen(name) + strlen(entry->d_name) + 2, sizeof(char)); 68 | sprintf(path, "%s/%s", name, entry->d_name); 69 | if ((entry->d_type == DT_DIR) && (recurse != 0)) { 70 | if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { 71 | free(path); 72 | continue; 73 | } 74 | int rtn = process_dir(path, writer, args, recurse - 1); 75 | status = max(status, rtn); 76 | } else { 77 | // skip 78 | } 79 | free(path); 80 | } 81 | closedir(dir); 82 | 83 | return status; 84 | } 85 | 86 | 87 | int process_file(char* fname, writer 
writer, arguments_t* args, int recurse) { 88 | int status = 0; 89 | struct stat finfo; 90 | int res = stat(fname, &finfo); 91 | if (res == -1) { 92 | fprintf(stderr, "Error: could not process file %s: %s\n", fname, strerror(errno)); 93 | return errno; 94 | } 95 | 96 | // handle directory input 97 | if ((finfo.st_mode & S_IFMT) == S_IFDIR) { 98 | if (recurse != 0) { 99 | char* sfname = strip_path(fname); 100 | int rtn = process_dir(sfname, writer, args, recurse - 1); 101 | status = max(status, rtn); 102 | free(sfname); 103 | } 104 | return status; 105 | } 106 | 107 | gzFile fp; 108 | kseq_t *seq; 109 | 110 | fp = gzopen(fname, "r"); 111 | seq = kseq_init(fp); 112 | size_t n = 0, slen = 0; 113 | size_t minl = UINTMAX_MAX, maxl = 0; 114 | double meanq = 0.0, c = 0.0; 115 | status = 0; 116 | kh_counter_t *run_ids = kh_counter_init(); 117 | kh_counter_t *basecallers = kh_counter_init(); 118 | while ((status = kseq_read(seq)) >= 0) { 119 | // accumulate stats only for reads within length and quality thresholds 120 | if (seq->qual.l == 0) { status = -99; break; } 121 | if ((seq->seq.l >= args->min_length) && (seq->seq.l <= args->max_length)) { 122 | float mean_q = mean_qual_naive(seq->qual.s, seq->qual.l); 123 | if (mean_q < args->min_qscore) continue; 124 | ++n ; slen += seq->seq.l; 125 | minl = min(minl, seq->seq.l); 126 | maxl = max(maxl, seq->seq.l); 127 | kahan_sum(&meanq, mean_q, &c); 128 | read_meta meta = parse_read_meta(seq->comment); 129 | write_read(writer, seq, meta, mean_q, fname); 130 | kh_counter_increment(run_ids, meta->runid); 131 | kh_counter_increment(basecallers, meta->basecaller); 132 | destroy_read_meta(meta); 133 | } 134 | } 135 | 136 | // handle errors 137 | switch (status) { 138 | case -1: 139 | status = EXIT_SUCCESS; 140 | break; 141 | case -2: 142 | status = EXIT_FAILURE; 143 | fprintf(stderr, "Truncated quality string found for record in file '%s'.\n", fname); 144 | break; 145 | case -3: 146 | status = EXIT_FAILURE; 147 | fprintf(stderr, "Error 
reading file '%s', possibly truncated\n", fname); 148 | break; 149 | case -99: 150 | status = EXIT_FAILURE; 151 | fprintf(stderr, "No quality string found for record in file '%s' (FASTA is unsupported).\n", fname); 152 | break; 153 | default: 154 | status = EXIT_FAILURE; 155 | fprintf(stderr, "Unknown error reading file '%s'.\n", fname); 156 | } 157 | 158 | // summary entries 159 | if(writer->perfile != NULL) { 160 | fprintf(writer->perfile, "%s\t", fname); 161 | if (writer->sample != NULL) fprintf(writer->perfile, "%s\t", args->sample); 162 | if (n == 0) { 163 | // there were no reads in the input file 164 | fprintf(writer->perfile, "0\t0\t0\t0\t0.00\n"); 165 | } else { 166 | fprintf(writer->perfile, "%zu\t%zu\t%zu\t%zu\t%.2f\n", 167 | n, slen, minl, maxl, meanq/n 168 | ); 169 | } 170 | } 171 | if(writer->runids != NULL) { 172 | for (khiter_t k = 0; k < kh_end(run_ids); ++k) { 173 | if (kh_exist(run_ids, k)) { 174 | fprintf(writer->runids, "%s\t", fname); 175 | if (writer->sample != NULL) fprintf(writer->runids, "%s\t", args->sample); 176 | fprintf(writer->runids, "%s\t%d\n", kh_key(run_ids, k), kh_val(run_ids, k)); 177 | } 178 | } 179 | } 180 | if(writer->basecallers != NULL) { 181 | for (khiter_t k = 0; k < kh_end(basecallers); ++k) { 182 | if (kh_exist(basecallers, k)) { 183 | fprintf(writer->basecallers, "%s\t", fname); 184 | if (writer->sample != NULL) fprintf(writer->basecallers, "%s\t", args->sample); 185 | fprintf(writer->basecallers, "%s\t%d\n", kh_key(basecallers, k), kh_val(basecallers, k)); 186 | } 187 | } 188 | } 189 | 190 | // cleanup 191 | kh_counter_destroy(basecallers); 192 | kh_counter_destroy(run_ids); 193 | kseq_destroy(seq); 194 | gzclose(fp); 195 | return status; 196 | } 197 | 198 | 199 | int main(int argc, char **argv) { 200 | arguments_t args = parse_arguments(argc, argv); 201 | 202 | writer writer = initialize_writer( 203 | args.demultiplex_dir, args.histograms, args.perread, args.perfile, 204 | args.runids, args.basecallers, args.sample, 
205 | args.reheader, args.write_bam, args.reads_per_file, 206 | args.threads); 207 | if (writer == NULL) exit(1); 208 | 209 | size_t nfile = 0; 210 | int status = 0; 211 | for( ; args.files[nfile] ; nfile++); 212 | 213 | if (nfile==1 && strcmp(args.files[0], "-") == 0) { 214 | char *ln = NULL; 215 | size_t n = 0; 216 | ssize_t nchr = 0; 217 | int recurse = 0; 218 | while ((nchr = getline (&ln, &n, stdin)) != -1) { 219 | ln[strcspn(ln, "\r\n")] = 0; 220 | int rtn = process_file(ln, writer, &args, recurse); 221 | status = max(status, rtn); 222 | } 223 | free(ln); 224 | } else { 225 | for (size_t i=0; i 5 | 6 | 7 | // this gives us kseq_t for below 8 | #ifndef KSEQ_DECLARED 9 | #include "htslib/kseq.h" 10 | KSEQ_DECLARE(gzFile) 11 | #endif 12 | 13 | #include // HTSlib for BAM output 14 | 15 | #include "../stats.h" 16 | #include "../fastqcomments.h" 17 | 18 | // barcode 0 is reserved for "unclassified" 19 | #define MAX_BARCODES 1025 20 | 21 | typedef struct { 22 | char* output; 23 | char* histograms; 24 | gzFile* handles; 25 | size_t* nreads; 26 | size_t* reads_written; 27 | size_t* file_index; 28 | read_stats** l_stats; 29 | read_stats** q_stats; 30 | FILE* perread; 31 | FILE* perfile; 32 | FILE* runids; 33 | FILE* basecallers; 34 | char* sample; 35 | size_t reheader; 36 | size_t reads_per_file; 37 | // optional BAM conversion 38 | int write_bam; 39 | htsFile** bam_files; 40 | bam_hdr_t* bam_hdr; 41 | htsThreadPool hts_pool; 42 | } _writer; 43 | 44 | typedef _writer* writer; 45 | 46 | char* strip_path(char* input); 47 | 48 | writer initialize_writer( 49 | char* output_dir, char* histograms, char* perread, char* perfile, 50 | char* runids, char* basecallers, char* sample, 51 | size_t reheader, size_t write_bam, size_t reads_per_file, 52 | int threads); 53 | 54 | void destroy_writer(writer writer); 55 | 56 | void write_read(writer writer, kseq_t* seq, read_meta meta, float mean_q, char* fname); 57 | 58 | #endif 59 | 
-------------------------------------------------------------------------------- /src/fastqcomments.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "common.h" 7 | #include "fastqcomments.h" 8 | 9 | 10 | // like `ksprintf()`, but will put the optional delimiter `d` before the added string if 11 | // `s` is not empty (and skip the delimiter otherwise) 12 | #define ksprintf_with_opt_delim(s, d, fmt, ...) \ 13 | ksprintf(s, "%s" fmt, s->l == 0 ? "" : d, __VA_ARGS__) 14 | 15 | 16 | read_meta create_read_meta(const kstring_t* comment) { 17 | read_meta meta = xalloc(1, sizeof(_read_meta), "meta"); 18 | meta->comment = xalloc(comment->l + 1, sizeof(char), "meta->comment"); 19 | strncpy(meta->comment, comment->s, comment->l); 20 | meta->rg = ""; 21 | meta->rg_info = NULL; 22 | meta->runid = ""; 23 | meta->basecaller = ""; 24 | meta->flow_cell_id = ""; 25 | meta->barcode = ""; 26 | meta->ibarcode = 0; 27 | meta->barcode_alias = ""; 28 | meta->start_time = ""; 29 | meta->read_number = 0; 30 | meta->channel = 0; 31 | meta->rest = xalloc(1, sizeof(kstring_t), "meta->rest"); 32 | ks_initialize(meta->rest); 33 | meta->tags_str = xalloc(1, sizeof(kstring_t), "meta->tags_str"); 34 | ks_initialize(meta->tags_str); 35 | 36 | return meta; 37 | } 38 | 39 | void destroy_read_meta(read_meta meta) { 40 | free(meta->comment); 41 | destroy_rg_info(meta->rg_info); 42 | free(meta->rest->s); 43 | free(meta->rest); 44 | free(meta->tags_str->s); 45 | free(meta->tags_str); 46 | free(meta); 47 | } 48 | 49 | // The caller is responsible for calling destroy_read_meta on the returned object. 
50 | read_meta parse_read_meta(kstring_t comment) { 51 | read_meta meta = create_read_meta(&comment); 52 | 53 | // if an RG or RD tag appears in the seq->comment, assume there are SAM tags to parse 54 | char* res = NULL; 55 | bool sam_tags = false; 56 | if (strlen(meta->comment) > 0) { 57 | // check if comment starts with "RG:Z:" or "RD:Z:" 58 | if (!strncmp(meta->comment, "RG:Z:", 5) || !strncmp(meta->comment, "RD:Z:", 5)) { 59 | sam_tags = true; 60 | } 61 | // RG or RD could also appear later in the comment (we include '\t' in the check 62 | // to be extra stringent) 63 | res = strstr(meta->comment, "\tRG:Z:"); 64 | if (res != NULL) { 65 | sam_tags = true; 66 | } 67 | res = strstr(meta->comment, "\tRD:Z:"); 68 | if (res != NULL) { 69 | sam_tags = true; 70 | } 71 | } 72 | 73 | char *pch=NULL, *p1=NULL, *p2=NULL; 74 | char *key=NULL, *keytype=NULL, *value=NULL; 75 | 76 | char sam_token[2] = "\t"; 77 | char fq_token[2] = " "; 78 | char* token = fq_token; 79 | if (sam_tags) { 80 | token = sam_token; 81 | } 82 | pch = strtok_r(meta->comment, token, &p1); 83 | while (pch != NULL) { 84 | 85 | if (sam_tags) { 86 | // split to tag:type:value 87 | key = strtok_r(pch, ":", &p2); 88 | keytype = strtok_r(NULL, ":", &p2); 89 | value = strtok_r(NULL, "", &p2); 90 | // we allow empty tags (e.g. 
'RG:Z:'); in this case, `keytype` will be 91 | // non-null, but `value` will be null; we set it to "" 92 | if (keytype != NULL && value == NULL) value = ""; 93 | } 94 | else { 95 | // split words on `=` 96 | key = strtok_r(pch, "=", &p2); 97 | keytype = NULL; 98 | value = strtok_r(NULL, "", &p2); 99 | } 100 | 101 | // if there was no delimiter in the word, value will be NULL --> add word to `rest` 102 | if (value == NULL) { 103 | ksprintf_with_opt_delim(meta->rest, " ", "%s", key); 104 | } else { 105 | if (!strcmp(key, "runid") || !strcmp(key, "RD")) { 106 | // we'll output RD depending on the value of RG, later 107 | meta->runid = value; 108 | ksprintf_with_opt_delim(meta->tags_str, "\t", "RD:Z:%s", meta->runid); 109 | } 110 | else if (!strcmp(key, "RG")) { 111 | meta->rg = value; 112 | ksprintf_with_opt_delim(meta->tags_str, "\t", "RG:Z:%s", value); 113 | } 114 | // CW-4766 - inconsistent naming of basecall model version id by guppy/minknow/dorado 115 | else if (!strcmp(key, "basecall_model_version_id") || !strcmp(key, "model_version_id")) { 116 | meta->basecaller = value; 117 | // there's no discrete tag defined by guppy/minknow/doroado 118 | // for this; so not added to `tags_str` (but to `rest` instead) 119 | ksprintf_with_opt_delim(meta->rest, " ", "%s=%s", key, value); 120 | } 121 | else if (!strcmp(key, "flow_cell_id") || !strcmp(key, "FC")) { 122 | meta->flow_cell_id = value; 123 | ksprintf_with_opt_delim(meta->tags_str, "\t", "FC:Z:%s", value); 124 | } 125 | else if (!strcmp(key, "barcode") || !strcmp(key, "BC")) { 126 | meta->barcode = value; 127 | meta->ibarcode = atoi(value+7); // "unclassified" -> 0 128 | ksprintf_with_opt_delim(meta->tags_str, "\t", "BC:Z:%s", value); 129 | } 130 | else if (!strcmp(key, "barcode_alias") || !strcmp(key, "BA")) { 131 | meta->barcode_alias = value; 132 | ksprintf_with_opt_delim(meta->tags_str, "\t", "BA:Z:%s", value); 133 | } 134 | else if (!strcmp(key, "read") || !strcmp(key, "RN") || !strcmp(key, "rn")) { 135 | 
meta->read_number = atoi(value); 136 | ksprintf_with_opt_delim(meta->tags_str, "\t", "rn:i:%s", value); 137 | } 138 | else if (!strcmp(key, "CH") || !strcmp(key, "ch")) { 139 | meta->channel = atoi(value); 140 | ksprintf_with_opt_delim(meta->tags_str, "\t", "ch:i:%s", value); 141 | } 142 | else if (!strcmp(key, "start_time") || !strcmp(key, "ST") || !strcmp(key, "st")) { 143 | meta->start_time = value; 144 | ksprintf_with_opt_delim(meta->tags_str, "\t", "st:Z:%s", value); 145 | } else { 146 | if (sam_tags) { 147 | // pass through all other tags 148 | ksprintf_with_opt_delim(meta->tags_str, "\t", "%s:%s:%s", key, keytype, value); 149 | } 150 | else { 151 | // long form key=value was not mapped to a SAM tag, send it to CO via meta->rest 152 | ksprintf_with_opt_delim(meta->rest, " ", "%s=%s", key, value); 153 | } 154 | } 155 | } 156 | pch = strtok_r(NULL, token, &p1); 157 | } 158 | 159 | // if there is a `rest` 160 | // (also check that the first char of rest is not ' ', in which case something must have gone wrong) 161 | // first replace all tabs with space to avoid all manner of confusion with Martin 162 | for (size_t i=0; irest->l; i++) { 163 | if (meta->rest->s[i] == '\t') { 164 | meta->rest->s[i] = ' '; 165 | } 166 | } 167 | if (meta->rest->l != 0 && meta->rest->s[0] != ' ') { 168 | ksprintf_with_opt_delim(meta->tags_str, "\t", "CO:Z:%s", meta->rest->s); 169 | } 170 | 171 | bool need_run_id = strlen(meta->runid) == 0; 172 | bool need_basecaller = strlen(meta->basecaller) == 0; 173 | if(strlen(meta->rg) > 0 && (need_run_id || need_basecaller)) { 174 | readgroup* rg_info = create_rg_info(meta->rg); 175 | if (need_run_id && rg_info->runid != NULL) { 176 | meta->runid = rg_info->runid; 177 | ksprintf_with_opt_delim(meta->tags_str, "\t", "RD:Z:%s", rg_info->runid); 178 | } 179 | if (need_basecaller && rg_info->basecaller != NULL) { 180 | meta->basecaller = rg_info->basecaller; 181 | } 182 | meta->rg_info = rg_info; 183 | } 184 | 185 | return meta; 186 | } 187 | 
-------------------------------------------------------------------------------- /src/fastqcomments.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTCAT_FASTQCOMMENTS_H 2 | #define FASTCAT_FASTQCOMMENTS_H 3 | 4 | #include "common.h" 5 | #include "htslib/kstring.h" 6 | 7 | typedef struct { 8 | char* comment; 9 | char* rg; 10 | readgroup* rg_info; 11 | char* runid; 12 | char* basecaller; 13 | char* flow_cell_id; 14 | char* barcode; 15 | size_t ibarcode; 16 | char* barcode_alias; 17 | char* start_time; 18 | size_t read_number; 19 | size_t channel; 20 | kstring_t* rest; 21 | kstring_t* tags_str; 22 | } _read_meta; 23 | 24 | typedef _read_meta* read_meta; 25 | 26 | 27 | // constructor 28 | read_meta create_read_meta(const kstring_t* comment); 29 | 30 | // destructor 31 | void destroy_read_meta(read_meta meta); 32 | 33 | // parser 34 | read_meta parse_read_meta(kstring_t comment); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/hts_defs.h: -------------------------------------------------------------------------------- 1 | /* hts_defs.h -- Miscellaneous definitions. 2 | 3 | Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd. 4 | 5 | Author: John Marshall 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. */ 24 | 25 | #ifndef HTSLIB_HTS_DEFS_H 26 | #define HTSLIB_HTS_DEFS_H 27 | 28 | #if defined __MINGW32__ 29 | #include // For __MINGW_PRINTF_FORMAT macro 30 | #endif 31 | 32 | #ifdef __clang__ 33 | #ifdef __has_attribute 34 | #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute) 35 | #endif 36 | 37 | #elif defined __GNUC__ 38 | #define HTS_GCC_AT_LEAST(major, minor) \ 39 | (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) 40 | #endif 41 | 42 | #ifndef HTS_COMPILER_HAS 43 | #define HTS_COMPILER_HAS(attribute) 0 44 | #endif 45 | #ifndef HTS_GCC_AT_LEAST 46 | #define HTS_GCC_AT_LEAST(major, minor) 0 47 | #endif 48 | 49 | #if HTS_COMPILER_HAS(__nonstring__) || HTS_GCC_AT_LEAST(8,1) 50 | #define HTS_NONSTRING __attribute__ ((__nonstring__)) 51 | #else 52 | #define HTS_NONSTRING 53 | #endif 54 | 55 | #if HTS_COMPILER_HAS(__noreturn__) || HTS_GCC_AT_LEAST(3,0) 56 | #define HTS_NORETURN __attribute__ ((__noreturn__)) 57 | #else 58 | #define HTS_NORETURN 59 | #endif 60 | 61 | // GCC introduced warn_unused_result in 3.4 but added -Wno-unused-result later 62 | #if HTS_COMPILER_HAS(__warn_unused_result__) || HTS_GCC_AT_LEAST(4,5) 63 | #define HTS_RESULT_USED __attribute__ ((__warn_unused_result__)) 64 | #else 65 | #define HTS_RESULT_USED 66 | #endif 67 | 68 | #if HTS_COMPILER_HAS(__unused__) || HTS_GCC_AT_LEAST(3,0) 69 | #define HTS_UNUSED __attribute__ ((__unused__)) 70 | #else 71 | #define HTS_UNUSED 72 | #endif 73 | 74 | #if 
HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(4,5) 75 | #define HTS_DEPRECATED(message) __attribute__ ((__deprecated__ (message))) 76 | #elif HTS_GCC_AT_LEAST(3,1) 77 | #define HTS_DEPRECATED(message) __attribute__ ((__deprecated__)) 78 | #else 79 | #define HTS_DEPRECATED(message) 80 | #endif 81 | 82 | #if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4) 83 | #define HTS_DEPRECATED_ENUM(message) __attribute__ ((__deprecated__ (message))) 84 | #else 85 | #define HTS_DEPRECATED_ENUM(message) 86 | #endif 87 | 88 | // On mingw the "printf" format type doesn't work. It needs "gnu_printf" 89 | // in order to check %lld and %z, otherwise it defaults to checking against 90 | // the Microsoft library printf format options despite linking against the 91 | // GNU posix implementation of printf. The __MINGW_PRINTF_FORMAT macro 92 | // expands to printf or gnu_printf as required, but obviously may not 93 | // exist 94 | #ifdef __MINGW_PRINTF_FORMAT 95 | #define HTS_PRINTF_FMT __MINGW_PRINTF_FORMAT 96 | #else 97 | #define HTS_PRINTF_FMT printf 98 | #endif 99 | 100 | #if HTS_COMPILER_HAS(__format__) || HTS_GCC_AT_LEAST(3,0) 101 | #define HTS_FORMAT(type, idx, first) __attribute__((__format__ (type, idx, first))) 102 | #else 103 | #define HTS_FORMAT(type, idx, first) 104 | #endif 105 | 106 | #if defined(_WIN32) || defined(__CYGWIN__) 107 | #if defined(HTS_BUILDING_LIBRARY) 108 | #define HTSLIB_EXPORT __declspec(dllexport) 109 | #else 110 | #define HTSLIB_EXPORT 111 | #endif 112 | #elif HTS_COMPILER_HAS(__visibility__) || HTS_GCC_AT_LEAST(4,0) 113 | #define HTSLIB_EXPORT __attribute__((__visibility__("default"))) 114 | #elif defined(__SUNPRO_C) && __SUNPRO_C >= 0x550 115 | #define HTSLIB_EXPORT __global 116 | #else 117 | #define HTSLIB_EXPORT 118 | #endif 119 | 120 | #endif 121 | -------------------------------------------------------------------------------- /src/kh_counter.c: -------------------------------------------------------------------------------- 1 | // 
Wrap khash to make it more consise to use 2 | 3 | #define _GNU_SOURCE 4 | #include 5 | #include 6 | #include "kh_counter.h" 7 | 8 | /* Implementation of a counter of strings (increasing only) 9 | * 10 | * kh_counter_t *counter = kh_counter_init(); 11 | * kh_counter_increment(counter, "one"); 12 | * kh_counter_increment(counter, "two"); 13 | * kh_counter_increment(counter, "two"); 14 | * kh_counter_add(counter, "three", 2); 15 | * kh_counter_increment(counter, "three"); 16 | * kh_counter_destroy(h); 17 | * 18 | */ 19 | 20 | 21 | kh_counter_t *kh_counter_init(void) { 22 | kh_counter_t *h = kh_init(KH_COUNTER); 23 | return h; 24 | } 25 | 26 | int kh_counter_val(kh_counter_t *hash, char *key) { 27 | khiter_t k = kh_get(KH_COUNTER, hash, key); 28 | int val = k != kh_end(hash) ? kh_val(hash, k) : 0; 29 | return val; 30 | } 31 | 32 | size_t kh_counter_add(kh_counter_t *hash, char *key, int val) { 33 | if (key == NULL) {return -1;} 34 | // note: key is copied so no need for caller to hold on to it 35 | int ret; 36 | khiter_t k = kh_put(KH_COUNTER, hash, key, &ret); 37 | if (ret == 1) { // new key 38 | kh_key(hash, k) = strdup(key); 39 | kh_value(hash, k) = val; 40 | } else if (ret == 0) { // exists 41 | // get value and add 42 | int cur = kh_val(hash, k); 43 | kh_value(hash, k) = cur + val; 44 | } else { 45 | // shouldnt get here - previously deleted key 46 | } 47 | return ret; 48 | } 49 | 50 | size_t kh_counter_sub(kh_counter_t *hash, char *key, int val) { 51 | if (key == NULL) {return -1;} 52 | // note: key is copied so no need for caller to hold on to it 53 | int ret; 54 | khiter_t k = kh_put(KH_COUNTER, hash, key, &ret); 55 | if (ret == 1) { // new key 56 | kh_key(hash, k) = strdup(key); 57 | kh_value(hash, k) = -val; 58 | } else if (ret == 0) { // exists 59 | // get value and add 60 | int cur = kh_val(hash, k); 61 | kh_value(hash, k) = cur - val; 62 | } else { 63 | // shouldnt get here - previously deleted key 64 | } 65 | return ret; 66 | } 67 | 68 | size_t 
kh_counter_increment(kh_counter_t *hash, char *key) { 69 | return kh_counter_add(hash, key, 1); 70 | } 71 | 72 | void kh_counter_destroy(kh_counter_t *hash) { 73 | for (khiter_t k = 0; k < kh_end(hash); k++){ 74 | if (kh_exist(hash, k)) { 75 | free((char*) kh_key(hash, k)); 76 | } 77 | } 78 | kh_destroy(KH_COUNTER, hash); 79 | } 80 | -------------------------------------------------------------------------------- /src/kh_counter.h: -------------------------------------------------------------------------------- 1 | #ifndef _KHCOUNTER_H 2 | #define _KHCOUNTER_H 3 | 4 | #include "htslib/khash.h" 5 | 6 | 7 | KHASH_MAP_INIT_STR(KH_COUNTER, int) 8 | #define kh_counter_t khash_t(KH_COUNTER) 9 | 10 | // create a counter 11 | kh_counter_t *kh_counter_init(void); 12 | 13 | // Get a value from a counter 14 | int kh_counter_val(kh_counter_t *hash, char *key); 15 | 16 | // Clean up a counter 17 | void kh_counter_destroy(kh_counter_t *hash); 18 | 19 | // Increment a counter by one 20 | size_t kh_counter_increment(kh_counter_t *hash, char *key); 21 | 22 | // Decrement a counter by one 23 | size_t kh_counter_sub(kh_counter_t *hash, char *key, int val); 24 | 25 | // Increment a counter by a given amount 26 | size_t kh_counter_add(kh_counter_t *hash, char *key, int val); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/regiter.c: -------------------------------------------------------------------------------- 1 | #include "regiter.h" 2 | #include "common.h" 3 | 4 | int region_from_string(char* input, char** chr, int* start, int* end) { 5 | *chr = xalloc(strlen(input) + 1, sizeof(char), "chr"); 6 | strcpy(*chr, input); 7 | char *reg_chr = (char *) hts_parse_reg(input, start, end); 8 | int rtn = 0; 9 | if (reg_chr) { 10 | *reg_chr = '\0'; // sets chr to be terminated at correct point 11 | } else { 12 | rtn = -1; 13 | } 14 | return rtn; 15 | } 16 | 17 | 18 | int region_from_bed(FILE* bed_fp, char** chr, int* start, int* end) { 
19 | char* line = NULL; 20 | char* line_copy = NULL; 21 | size_t len = 0; 22 | ssize_t read; 23 | int rtn = 0; 24 | 25 | *start = -1; 26 | *end = -1; 27 | 28 | if ((read = getline(&line, &len, bed_fp)) != -1) { 29 | char *newline_pos = strchr(line, '\n'); 30 | if (newline_pos != NULL) { 31 | *newline_pos = '\0'; // Null-terminate the string at the newline 32 | } 33 | line_copy = strdup(line); // Copy line for error reporting 34 | 35 | // get chromosome 36 | char* tok = strtok(line, "\t"); 37 | if (tok == NULL) { 38 | fprintf(stderr, "WARNING: Missing chromosome field in BED file line: '%s'.\n", line_copy); 39 | rtn = -2; 40 | goto cleanup; 41 | } 42 | *chr = xrealloc(*chr, (strlen(tok) + 1) * sizeof(char), "chr"); 43 | strcpy(*chr, tok); 44 | 45 | // get start coordinate 46 | tok = strtok(NULL, "\t"); 47 | if (tok == NULL) { 48 | fprintf(stderr, "WARNING: Missing start field in BED file line: '%s'.\n", line_copy); 49 | rtn = -2; 50 | goto cleanup; 51 | } 52 | char* endptr; 53 | *start = strtol(tok, &endptr, 10); 54 | if (*endptr != '\0') { 55 | fprintf(stderr, "WARNING: Invalid start field in BED file line: '%s'.\n", line_copy); 56 | rtn = -2; 57 | goto cleanup; 58 | } 59 | 60 | // get end coordinate 61 | tok = strtok(NULL, "\t"); 62 | if (tok == NULL) { 63 | fprintf(stderr, "WARNING: Missing end field in BED file line: '%s'.\n", line_copy); 64 | rtn = -2; 65 | goto cleanup; 66 | } 67 | *end = strtol(tok, &endptr, 10); 68 | if (*endptr != '\0') { 69 | fprintf(stderr, "WARNING: Invalid end field in BED file line: '%s'.\n", line_copy); 70 | rtn = -2; 71 | goto cleanup; 72 | } 73 | 74 | // Validate start and end 75 | if (*start < 0 || *end < 0 || *start >= *end) { 76 | fprintf(stderr, "WARNING: Invalid region in BED file line: '%s'.\n", line_copy); 77 | rtn = -2; 78 | goto cleanup; 79 | } 80 | } else { 81 | rtn = -1; // EOF 82 | } 83 | 84 | cleanup: 85 | free(line); 86 | free(line_copy); 87 | return rtn; 88 | } 89 | 90 | 91 | 92 | // Initialize the region iterator 93 
| regiter init_region_iterator(const char *bed_file, const char *single_region, sam_hdr_t *hdr) { 94 | regiter it = {0}; 95 | it.hdr = hdr; 96 | if (bed_file != NULL) { 97 | it.bed_fp = fopen(bed_file, "r"); 98 | if (it.bed_fp == NULL) { 99 | fprintf(stderr, "ERROR: Unable to open BED file: %s\n", bed_file); 100 | it.error |= 1; 101 | } 102 | } else if (single_region != NULL) { 103 | it.single_region = strdup(single_region); 104 | it.mode = 1; // we'll process this first then switch to BED file mode 105 | } 106 | return it; 107 | } 108 | 109 | 110 | // Clean up the iterator 111 | void destroy_region_iterator(regiter *it) { 112 | if (it->chr != NULL) free(it->chr); 113 | if (it->single_region != NULL) free(it->single_region); 114 | if (it->bed_fp != NULL) fclose(it->bed_fp); 115 | } 116 | 117 | // Get the next region 118 | // returns: 119 | // 0 successful 120 | // -1 if no more regions 121 | // -2 if error parsing region string 122 | // -3 if reference not found in BAM header 123 | int next_region(regiter *it) { 124 | int rtn = 0; 125 | if (it->mode == 0) { 126 | rtn = region_from_bed(it->bed_fp, &it->chr, &it->start, &it->end); 127 | } else if (it->mode == 1) { 128 | it->mode = 0; 129 | if (it->single_region) { 130 | rtn = region_from_string(it->single_region, &it->chr, &it->start, &it->end); 131 | if (rtn == -1) { 132 | fprintf(stderr, "WARNING: Failed to parse region string: %s\n", it->single_region); 133 | it->error |= 2; 134 | rtn = -2; 135 | } 136 | } 137 | } 138 | 139 | if (rtn == 0) { 140 | // check reference exists and tidy up length 141 | int tid = sam_hdr_name2tid(it->hdr, it->chr); 142 | if (tid < 0) { 143 | fprintf(stderr, "WARNING: Failed to find reference '%s' in BAM header.\n", it->chr); 144 | rtn = -3; 145 | } 146 | else { 147 | size_t ref_length = (size_t)sam_hdr_tid2len(it->hdr, tid); 148 | int ns = min(it->start, (int)ref_length); 149 | int ne = min(it->end, (int)ref_length); 150 | if (ns >= ne) { 151 | fprintf(stderr, "WARNING: Zero-length 
region created after truncating to reference length (%ld) '%s:%d-%d'.\n", ref_length, it->chr, it->start, it->end); 152 | rtn = -2; 153 | } 154 | else { 155 | it->start = ns; 156 | it->end = ne; 157 | } 158 | it->n_regions++; 159 | } 160 | } 161 | return rtn; 162 | } 163 | -------------------------------------------------------------------------------- /src/regiter.h: -------------------------------------------------------------------------------- 1 | #ifndef _FASTCAT_REGITER_H 2 | #define _FASTCAT_REGITER_H 3 | 4 | #include "htslib/sam.h" 5 | 6 | typedef struct { 7 | char *chr; 8 | int start; 9 | int end; 10 | FILE *bed_fp; 11 | char *single_region; 12 | int n_regions; 13 | sam_hdr_t *hdr; 14 | int mode; // 0: BED file, 1: single region 15 | int error; // &1: couldn't open BED, 16 | // &2: couldn't parse single_region 17 | } regiter; 18 | 19 | 20 | int region_from_string(char* input, char** chr, int* start, int* end); 21 | int region_from_bed(FILE* bed_fp, char** chr, int* start, int* end); 22 | regiter init_region_iterator(const char *bed_file, const char *single_region, sam_hdr_t *hdr); 23 | void destroy_region_iterator(regiter *it); 24 | int next_region(regiter *it); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/stats.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "stats.h" 8 | #include "common.h" 9 | 10 | 11 | read_stats* create_length_stats(void) { 12 | read_stats* stats = (read_stats*) xalloc(1, sizeof(read_stats), "length_stats"); 13 | 14 | bin_groups* bins = (bin_groups*) xalloc(1, sizeof(bin_groups), "bin_groups"); 15 | stats->buckets = bins; 16 | bins->n = 1; 17 | bins->groups = (size_t*) xalloc(1, 3*bins->n*sizeof(size_t), "groups"); 18 | size_t* grps = bins->groups; 19 | // - end exclusive upper edges 20 | // - 0 is first lower edge 21 | // - final >x bucket 22 | //grps[0] = 
50000; grps[1] = 1; 23 | //grps[3] = 100000; grps[4] = 10; 24 | //grps[6] = 1000000; grps[7] = 1000; 25 | // just do one massive bucket up to 10M - 76Mbytes 26 | grps[0] = 10000000; grps[1] = 1; 27 | 28 | 29 | // count the total number of bins across all groups 30 | stats->n = 0; 31 | size_t lower = 0; 32 | for (size_t i=0; in; i++) { 33 | size_t upper = bins->groups[3*i]; 34 | size_t step = bins->groups[3*i + 1]; 35 | size_t nbins = (upper - lower) / step; 36 | bins->groups[3*i + 2] = nbins; 37 | stats->n += nbins; 38 | lower = upper; 39 | } 40 | stats->n++; 41 | 42 | // fill in all the edges 43 | stats->width = 0; 44 | stats->edges = xalloc(stats->n, sizeof(size_t), "edges"); 45 | stats->counts = xalloc(stats->n, sizeof(size_t), "counts"); 46 | size_t i=0; 47 | lower = 0; 48 | size_t upper = 0; 49 | for (size_t b=0; bn; b++) { 50 | upper = bins->groups[3*b]; 51 | size_t step = bins->groups[3*b + 1]; 52 | for (size_t j=lower; jedges[i] = j; 54 | } 55 | lower = upper; 56 | } 57 | stats->edges[i] = upper; 58 | return stats; 59 | } 60 | 61 | void destroy_length_stats(read_stats* stats) { 62 | if (stats != NULL) { 63 | free(stats->buckets->groups); 64 | free(stats->buckets); 65 | free(stats->edges); 66 | free(stats->counts); 67 | free(stats); 68 | } 69 | } 70 | 71 | void add_length_count(read_stats* stats, size_t x) { 72 | size_t lower = 0; 73 | size_t cum_bin = 0; 74 | bool done = false; 75 | for (size_t i=0; ibuckets->n; i++) { 76 | size_t upper = stats->buckets->groups[3*i]; 77 | if (x < upper) { 78 | size_t step = stats->buckets->groups[3*i + 1]; 79 | stats->counts[cum_bin + (x - lower) / step]++; 80 | done = true; 81 | break; 82 | } 83 | lower = upper; 84 | cum_bin += stats->buckets->groups[3*i + 2]; 85 | } 86 | if (!done) { 87 | stats->counts[cum_bin]++; 88 | } 89 | } 90 | 91 | 92 | read_stats* create_qual_stats(float width) { 93 | read_stats* stats = (read_stats*) xalloc(1, sizeof(read_stats), "quality stats"); 94 | stats->width = width; 95 | // this fixes the 
range to [0, 100], good for both QUAL and %age acc 96 | stats->n = (size_t) (100.0 / stats->width) + 1; 97 | stats->counts = xalloc(stats->n, sizeof(size_t), "counts"); 98 | return stats; 99 | } 100 | 101 | void destroy_qual_stats(read_stats* stats) { 102 | if (stats != NULL) { 103 | free(stats->counts); 104 | free(stats); 105 | } 106 | } 107 | 108 | void add_qual_count(read_stats* stats, float q) { 109 | q = fmin(q, 100.0); 110 | stats->counts[(int) (q / stats->width)]++; 111 | } 112 | 113 | void print_stats(read_stats* stats, bool zeroes, bool tsv, FILE* fp) { 114 | if (fp == NULL) { 115 | fp = stderr; 116 | } 117 | if (stats->width == 0) { 118 | for (size_t i=0; in; i++) { 119 | if (stats->counts[i] == 0 && !zeroes) continue; 120 | if (tsv) { 121 | fprintf(fp, "%zu\t%zu\t%zu\n", stats->edges[i], stats->edges[i+1], stats->counts[i]); 122 | } 123 | else { 124 | fprintf(fp, "[%zu, %zu)\t%zu\n", stats->edges[i], stats->edges[i+1], stats->counts[i]); 125 | } 126 | } 127 | } 128 | else { 129 | if (tsv) { 130 | size_t decimals = _leading_decimals(stats->width); 131 | char fmt[64] = {0}; // my brain hurts and 64 seems big enough 132 | snprintf(fmt, 63, "%%.%zuf\t%%.%zuf\t%%zu\n", decimals, decimals); 133 | for (size_t i=0; in; i++) { 134 | if (stats->counts[i] == 0 && !zeroes) continue; 135 | fprintf(fp, fmt, (float) i * stats->width, (float) (i+1) * stats->width, stats->counts[i]); 136 | } 137 | } 138 | else { 139 | size_t decimals = _leading_decimals(stats->width); 140 | char fmt[64] = {0}; 141 | snprintf(fmt, 63, "[%%.%zuf, %%.%zuf)\t%%zu\n", decimals, decimals); 142 | for (size_t i=0; in; i++) { 143 | if (stats->counts[i] == 0 && !zeroes) continue; 144 | fprintf(fp, fmt, (float) i * stats->width, (float) (i+1) * stats->width, stats->counts[i]); 145 | } 146 | } 147 | } 148 | } 149 | 150 | // nasty function to e.g. 
0.001 -> 3 151 | size_t _leading_decimals(float number) { 152 | char str[64] = { 0 }; 153 | snprintf(str, sizeof(str), "%f", number); 154 | char* point = strchr(str, '.'); 155 | return 1 + strspn(point + 1, "0"); 156 | } 157 | 158 | //int main(int argc, char **argv) { 159 | // read_stats* stats = create_length_stats(); 160 | // // bins every 1 161 | // add_length_count(stats, 1); 162 | // add_length_count(stats, 4); 163 | // add_length_count(stats, 950); 164 | // add_length_count(stats, 998); 165 | // add_length_count(stats, 999); 166 | // 167 | // //// changing to bins every 10 168 | // add_length_count(stats, 1000); 169 | // add_length_count(stats, 1001); 170 | // add_length_count(stats, 1009); 171 | // add_length_count(stats, 1010); 172 | // add_length_count(stats, 1045); 173 | // add_length_count(stats, 1050); 174 | // 175 | // // changing to bins every 100 176 | // add_length_count(stats, 9845); 177 | // add_length_count(stats, 9900); 178 | // add_length_count(stats, 9901); 179 | // add_length_count(stats, 9909); 180 | // add_length_count(stats, 9910); 181 | // add_length_count(stats, 9999); 182 | // add_length_count(stats, 10000); 183 | // add_length_count(stats, 10001); 184 | // add_length_count(stats, 10010); 185 | // add_length_count(stats, 10100); 186 | // add_length_count(stats, 10150); 187 | // add_length_count(stats, 10199); 188 | // add_length_count(stats, 10200); 189 | // add_length_count(stats, 10210); 190 | // 191 | // // changing to bins every 1000 192 | // add_length_count(stats, 99100); 193 | // add_length_count(stats, 99150); 194 | // add_length_count(stats, 99500); 195 | // add_length_count(stats, 99999); 196 | // add_length_count(stats, 100000); 197 | // add_length_count(stats, 100001); 198 | // add_length_count(stats, 100999); 199 | // add_length_count(stats, 101000); 200 | // add_length_count(stats, 102000); 201 | // 202 | // add_length_count(stats, 999999); 203 | // add_length_count(stats, 1000000); 204 | // add_length_count(stats, 
2000000); 205 | // 206 | // print_stats(stats, false, false, stderr); 207 | // destroy_length_stats(stats); 208 | // 209 | // float width = 0.02; 210 | // read_stats* qstats = create_q_stats(width); 211 | // 212 | // add_q_count(qstats, 9.89); 213 | // add_q_count(qstats, 9.89); 214 | // add_q_count(qstats, 99.5); 215 | // add_q_count(qstats, 99.9); 216 | // add_q_count(qstats, 100); 217 | // add_q_count(qstats, 101); 218 | // add_q_count(qstats, 101); 219 | // add_q_count(qstats, 101); 220 | // print_stats(qstats, false, false, stderr); 221 | // destroy_q_stats(qstats); 222 | // 223 | //} 224 | -------------------------------------------------------------------------------- /src/stats.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTCAT_STATS_H 2 | #define FASTCAT_STATS_H 3 | 4 | #include "stdbool.h" 5 | // use only 1 non-zero decimal place in these -- see _leading_decimals() 6 | #define QUAL_HIST_WIDTH 0.02 // QUAL is log10 7 | #define ACC_HIST_WIDTH 0.0001 // ACC is linear %age, Q60 8 | #define COV_HIST_WIDTH 0.01 // COV is linear %age 9 | 10 | typedef struct { 11 | size_t n; 12 | size_t* groups; // 3 items: upper, step, number 13 | } bin_groups; 14 | 15 | 16 | typedef struct { 17 | size_t n; 18 | float width; // for fixed width 19 | size_t* edges; 20 | size_t* counts; 21 | bin_groups* buckets; 22 | } read_stats; 23 | 24 | 25 | read_stats* create_length_stats(void); 26 | void destroy_length_stats(read_stats* stats); 27 | void add_length_count(read_stats* stats, size_t); 28 | 29 | // uses read_stats with a fixed (and rescaled grid) 30 | read_stats* create_qual_stats(float width); 31 | void destroy_qual_stats(read_stats* stats); 32 | void add_qual_count(read_stats* stats, float q); 33 | 34 | void print_stats(read_stats* stats, bool zeroes, bool tsv, FILE* fp); 35 | 36 | size_t _leading_decimals(float num); 37 | #endif 38 | -------------------------------------------------------------------------------- 
/src/version.c: -------------------------------------------------------------------------------- 1 | #include "version.h" 2 | 3 | /* Definition of the program version string that version.h declares extern. */ const char *argp_program_version = "0.22.0"; 4 | -------------------------------------------------------------------------------- /src/version.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTCAT_VERSION_H 2 | #define FASTCAT_VERSION_H 3 | 4 | /* Program version string; the definition lives in version.c. */ extern const char *argp_program_version; 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /test/bamindex/400.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamindex/400.bam -------------------------------------------------------------------------------- /test/bamstats/310dx.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/310dx.bam -------------------------------------------------------------------------------- /test/bamstats/310dx.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/310dx.bam.bai -------------------------------------------------------------------------------- /test/bamstats/400ecoli-with-qcfail.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/400ecoli-with-qcfail.bam -------------------------------------------------------------------------------- /test/bamstats/400ecoli.bam: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/400ecoli.bam -------------------------------------------------------------------------------- /test/bamstats/400ecoli.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/400ecoli.bam.bai -------------------------------------------------------------------------------- /test/bamstats/RCS-100A.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/RCS-100A.bam -------------------------------------------------------------------------------- /test/bamstats/RCS-100A.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/bamstats/RCS-100A.bam.bai -------------------------------------------------------------------------------- /test/bamstats/RCS-100A.bam.polya.hist: -------------------------------------------------------------------------------- 1 | 7 8 2 2 | 8 9 1 3 | 10 11 1 4 | 12 13 1 5 | 14 15 1 6 | 34 35 1 7 | 62 63 2 8 | 68 69 1 9 | 69 70 1 10 | 82 83 1 11 | 83 84 1 12 | 85 86 2 13 | 88 89 1 14 | 89 90 1 15 | 90 91 2 16 | 94 95 2 17 | 95 96 3 18 | 97 98 4 19 | 98 99 2 20 | 99 100 5 21 | 100 101 7 22 | 101 102 3 23 | 102 103 7 24 | 103 104 7 25 | 104 105 4 26 | 105 106 10 27 | 106 107 5 28 | 107 108 12 29 | 108 109 6 30 | 109 110 12 31 | 110 111 14 32 | 111 112 2 33 | 112 113 9 34 | 113 114 8 35 | 114 115 6 36 | 115 116 6 37 | 116 117 6 38 | 117 118 6 39 | 118 119 3 40 | 119 120 6 41 | 120 121 1 42 | 121 122 1 43 | 123 124 6 44 | 124 125 3 45 | 125 126 6 46 | 126 127 2 47 | 127 128 1 48 | 130 131 3 49 | 131 132 1 50 | 132 133 1 51 | 133 134 2 52 
| 134 135 2 53 | 135 136 1 54 | 136 137 1 55 | 137 138 2 56 | 138 139 2 57 | 139 140 1 58 | 140 141 1 59 | 141 142 2 60 | 142 143 2 61 | 146 147 1 62 | 147 148 1 63 | 149 150 1 64 | 150 151 2 65 | 151 152 3 66 | 152 153 1 67 | 153 154 1 68 | 154 155 1 69 | 155 156 1 70 | 157 158 3 71 | 158 159 1 72 | 159 160 2 73 | 160 161 1 74 | 163 164 1 75 | 165 166 1 76 | 167 168 1 77 | 171 172 1 78 | 172 173 1 79 | 174 175 2 80 | 175 176 1 81 | 176 177 3 82 | 181 182 1 83 | 183 184 1 84 | 184 185 1 85 | 187 188 1 86 | 188 189 1 87 | 189 190 1 88 | 190 191 2 89 | 191 192 1 90 | 193 194 2 91 | 196 197 2 92 | 197 198 1 93 | 198 199 1 94 | 199 200 2 95 | 200 201 1 96 | 208 209 2 97 | 210 211 1 98 | 213 214 2 99 | 216 217 2 100 | 217 218 1 101 | 218 219 1 102 | 219 220 1 103 | 220 221 1 104 | 222 223 1 105 | 223 224 1 106 | 225 226 2 107 | 228 229 1 108 | 229 230 2 109 | 230 231 1 110 | 232 233 1 111 | 237 238 1 112 | 238 239 2 113 | 240 241 1 114 | 241 242 1 115 | 242 243 1 116 | 244 245 1 117 | 246 247 1 118 | 247 248 1 119 | 248 249 1 120 | 253 254 1 121 | 254 255 1 122 | 267 268 1 123 | 269 270 1 124 | 273 274 2 125 | 276 277 2 126 | 277 278 1 127 | 278 279 1 128 | 279 280 1 129 | 284 285 1 130 | 287 288 1 131 | 289 290 2 132 | 292 293 1 133 | 293 294 1 134 | 295 296 1 135 | 297 298 1 136 | 301 302 1 137 | 302 303 1 138 | 314 315 1 139 | 317 318 1 140 | 319 320 1 141 | 320 321 1 142 | 329 330 1 143 | 335 336 1 144 | 344 345 1 145 | 347 348 1 146 | 352 353 1 147 | 356 357 1 148 | 357 358 1 149 | 358 359 1 150 | 359 360 1 151 | 365 366 1 152 | 373 374 1 153 | 375 376 1 154 | 376 377 1 155 | 380 381 1 156 | 386 387 1 157 | 387 388 2 158 | 407 408 1 159 | 411 412 1 160 | 412 413 2 161 | 416 417 1 162 | 419 420 1 163 | 421 422 1 164 | 423 424 1 165 | 428 429 1 166 | 430 431 1 167 | 431 432 1 168 | 434 435 2 169 | 435 436 1 170 | 440 441 1 171 | 441 442 2 172 | 457 458 2 173 | 461 462 1 174 | 465 466 1 175 | 473 474 2 176 | 485 486 1 177 | 492 493 1 178 | 498 499 1 179 | 503 504 1 
180 | 509 510 1 181 | 513 514 1 182 | 548 549 1 183 | -------------------------------------------------------------------------------- /test/bamstats_badNM/test.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 SO:coordinate 2 | @SQ SN:chr1 LN:100000 3 | 39c3c681-3193-4613-8ffb-1045c7568f70 0 chr1 1000 39 1S * 0 0 T C ms:i:176 AS:i:174 nn:i:0 tp:A:P cm:i:8 s1:i:64 s2:i:0 de:f:0.069 rl:i:0 qs:i:11 du:f:0.474 ns:i:570 ts:i:570 mx:i:1 ch:i:1309 st:Z:2024-01-24T17:15:41.615+00:00 rn:i:-1 sm:f:-736.172 sd:f:0.00798761 sv:Z: dx:i:0 sp:i:6852 MN:i:166 NM:i:10 BC:Z:barcode09 4 | -------------------------------------------------------------------------------- /test/bamstats_zeroNM/test.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 SO:coordinate 2 | @SQ SN:chrX LN:156040895 3 | 9ad9daf6-5d60-44c3-9535-d641ce144eff 16 chrX 43690 29 9S480M36S * 0 525 AACGTATTGCTGCATTGGGCCTGGGTCTCATTGAGGACAGATAGCGACCAGACTGTGCAACCTTTAGAGTCTGCATTGGGCTTAGGTCTCATTGAGGGCAGTTAGAGAGCAGACTGTGCAACCTTTAGAGTCTGCATTGGGCCTAGGTCTCATTGAGAGCAGATAGAGAGCACACTGTGCAACCTCTAGAGTCGGCATTGGGCCTAGGTCTCATTGAGGACAGATAGAGACCAGACTGTTGAAACTTTAGAGTCTGCATTGGGCCTAGGTCTCATTGAGGACAGATAGAGGGCAGACTGTGCAACCTTTAGAGTCTACAATGGGCCTAGGTATCAGTGAGGACAAATAGAGAGGAGACTGTGCAACCTTTAGAGTCTGCACTGGCCCTAGGTCTCTTTGAGGACAGACAGAGAGCAGAATGTGCAAACTTTAGAGTCTGCACTGGGCCTAGGTGTCATTGAGGACAGATAGAGACCAGACTGTGCAACCAGCAATACGTAACTGAACGAAGTACATGTACATAAC ()*0222332333?FB@>>DFCFF@@??@IFCCCDGHGFDC===>ABB<,+++,599?>@GIFFEB@0/'(''&&$$$%&)&%$$## AS:i:960 cm:i:92 de:f:0 ms:i:960 nn:i:0 s1:i:478 s2:i:474 tp:A:P MM:Z:C+m?,2,1,88,41,13; ML:B:C,1,2,255,2,4 qs:i:18 mx:i:2 ch:i:2213 rn:i:3733 st:Z:2022-08-18T20:18:39Z f5:Z:PAM87936_2807d237_503.fast5 ns:i:8675 ts:i:0 MD:Z:480 NM:i:0 RG:Z:2807d2374ca5985b978a023fba4f92836a5ce559_dna_r10.4.1_e8.2_sup@v3.5.1 4 | -------------------------------------------------------------------------------- /test/data/bc0.fastq.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bc0.fastq.gz -------------------------------------------------------------------------------- /test/data/bc1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bc1.fastq.gz -------------------------------------------------------------------------------- /test/data/bc2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bc2.fastq.gz -------------------------------------------------------------------------------- /test/data/bcEmpty.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bcEmpty.fastq.gz -------------------------------------------------------------------------------- /test/data/bcMangled.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/bcMangled.fastq.gz -------------------------------------------------------------------------------- /test/data/samtoolsfastq.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/data/samtoolsfastq.fastq.gz -------------------------------------------------------------------------------- /test/fastcat_expected_results/concat.reheader.sorted.fastq.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/fastcat_expected_results/concat.reheader.sorted.fastq.gz -------------------------------------------------------------------------------- /test/fastcat_expected_results/concat.sorted.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/fastcat_expected_results/concat.sorted.fastq.gz -------------------------------------------------------------------------------- /test/fastcat_expected_results/per-file-stats.tsv: -------------------------------------------------------------------------------- 1 | filename sample_name n_seqs n_bases min_length max_length mean_quality 2 | ../data/bc1.fastq.gz sample 10 5904 281 878 12.86 3 | ../data/bc0.fastq.gz sample 10 4599 191 965 9.86 4 | ../data/bc2.fastq.gz sample 10 5812 259 990 12.19 5 | ../data/bcEmpty.fastq.gz sample 10 4599 191 965 9.86 6 | ../data/bcMangled.fastq.gz sample 20 9990 470 517 13.27 7 | ../data/samtoolsfastq.fastq.gz sample 20 261268 247 43346 18.90 8 | -------------------------------------------------------------------------------- /test/fastcat_expected_results/per-read-stats.tsv: -------------------------------------------------------------------------------- 1 | read_id filename runid sample_name read_length mean_quality channel read_number start_time 2 | 959b96ca-5864-427a-ab22-53154ecb3511 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 672 12.98 81 15 2021-04-20T17:00:40Z 3 | 28f9c419-4d76-462e-9151-a746969ab2f2 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 694 10.05 412 19 2021-04-20T17:00:41Z 4 | cad9e07d-b2b2-47ad-a51c-2b5727a88552 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 562 12.94 363 18 2021-04-20T17:00:41Z 5 | 4916f252-0f9d-428d-94af-69b42e4734aa ../data/bc1.fastq.gz 
5a21d8a6996146deceeaea3784244c52741cae93 sample 283 8.91 210 33 2021-04-20T17:00:42Z 6 | 3533a366-8bdd-4a12-82ec-280562b04527 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 852 14.23 222 26 2021-04-20T17:00:42Z 7 | 533ce526-194b-43af-91e8-13c6e95a7645 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 449 15.32 396 35 2021-04-20T17:00:43Z 8 | 450143b6-075b-4bd7-8585-cc9ba8128c02 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 531 14.09 503 61 2021-04-20T17:00:44Z 9 | efe4925d-b0b4-4939-b3cb-7af80a70c036 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 281 12.37 130 40 2021-04-20T17:00:45Z 10 | 2b1179cc-be33-4d83-b902-7ccdf9c03f38 ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 702 13.34 328 78 2021-04-20T17:00:48Z 11 | b0de027e-fc5f-481d-a4e6-3a1b8521d39c ../data/bc1.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 878 14.34 185 69 2021-04-20T17:00:48Z 12 | 32e13a1c-4171-4706-b6ce-a32c0f65fa16 ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 326 12.31 282 9 2021-04-20T17:00:40Z 13 | b87f011e-b802-4993-8f56-fd240b2e784f ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 407 9.13 213 19 2021-04-20T17:00:41Z 14 | 6f64aedb-bb8e-4777-b494-43e661841e06 ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 355 9.98 67 13 2021-04-20T17:00:41Z 15 | c372fb2c-dd45-4feb-81b2-c167c3d1ce93 ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 317 8.14 337 18 2021-04-20T17:00:41Z 16 | 18d04e8d-2816-4986-8e1b-e5be676837fc ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 965 7.57 507 18 2021-04-20T17:00:41Z 17 | aa81ca34-9310-42fd-9893-33112e283acc ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 691 12.05 244 19 2021-04-20T17:00:41Z 18 | c746fb2f-78f6-4a0a-9c75-39465c855c8d ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 191 8.99 379 35 
2021-04-20T17:00:42Z 19 | 99a108d2-8e72-42bf-bebf-ad8373cfe450 ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 456 8.61 177 38 2021-04-20T17:00:42Z 20 | 5d01447f-f17b-4acb-b87e-d60d8aeeccc8 ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 659 12.16 417 21 2021-04-20T17:00:41Z 21 | b0279f8e-e988-44c5-895f-201b68217623 ../data/bc0.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 232 9.62 435 32 2021-04-20T17:00:43Z 22 | 1d0fded8-95e7-4b53-858f-8391da5d6537 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 259 7.77 190 13 2021-04-20T17:00:40Z 23 | 0e9a279b-b513-4ecf-96d9-7403749cb7e8 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 772 10.60 216 41 2021-04-20T17:00:43Z 24 | 50664f3b-7ba8-4d9e-b710-8f14cbec660f ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 485 12.28 143 65 2021-04-20T17:00:45Z 25 | 6a9931a5-783b-4ffa-9a98-c9e3bd8c985e ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 357 13.76 236 32 2021-04-20T17:00:45Z 26 | 09ab577f-c1d9-4fad-b08a-67de01d26b12 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 728 13.02 469 54 2021-04-20T17:00:45Z 27 | a243792f-4322-4b22-b99f-fb5163642791 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 990 12.54 324 48 2021-04-20T17:00:45Z 28 | c1ee6f3c-6c2e-4701-a8e9-567557463305 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 632 13.21 62 48 2021-04-20T17:00:46Z 29 | ed89256b-9cce-45b4-866d-c7033fc6f901 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 367 12.43 279 82 2021-04-20T17:00:46Z 30 | 3124fa05-5b9a-4b78-ab24-168c4110ee8b ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 712 15.69 442 67 2021-04-20T17:00:46Z 31 | 8e042016-4de9-4d95-ae78-d671855c9416 ../data/bc2.fastq.gz 5a21d8a6996146deceeaea3784244c52741cae93 sample 510 10.63 269 78 2021-04-20T17:00:47Z 32 | 32e13a1c-4171-4706-b6ce-a32c0f65fa16 
../data/bcEmpty.fastq.gz sample 326 12.31 0 0 33 | b87f011e-b802-4993-8f56-fd240b2e784f ../data/bcEmpty.fastq.gz sample 407 9.13 0 0 34 | 6f64aedb-bb8e-4777-b494-43e661841e06 ../data/bcEmpty.fastq.gz sample 355 9.98 0 0 35 | c372fb2c-dd45-4feb-81b2-c167c3d1ce93 ../data/bcEmpty.fastq.gz sample 317 8.14 0 0 36 | 18d04e8d-2816-4986-8e1b-e5be676837fc ../data/bcEmpty.fastq.gz sample 965 7.57 0 0 37 | aa81ca34-9310-42fd-9893-33112e283acc ../data/bcEmpty.fastq.gz sample 691 12.05 0 0 38 | c746fb2f-78f6-4a0a-9c75-39465c855c8d ../data/bcEmpty.fastq.gz sample 191 8.99 0 0 39 | 99a108d2-8e72-42bf-bebf-ad8373cfe450 ../data/bcEmpty.fastq.gz sample 456 8.61 0 0 40 | 5d01447f-f17b-4acb-b87e-d60d8aeeccc8 ../data/bcEmpty.fastq.gz sample 659 12.16 0 0 41 | b0279f8e-e988-44c5-895f-201b68217623 ../data/bcEmpty.fastq.gz sample 232 9.62 0 0 42 | SRR12480552.4 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 489 13.57 0 0 43 | SRR12480552.12 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 470 10.86 0 0 44 | SRR12480552.6 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 499 12.98 0 0 45 | SRR12480552.25 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 517 15.36 0 0 46 | SRR12480552.30 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 486 13.94 0 0 47 | SRR12480552.33 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 502 11.66 0 0 48 | SRR12480552.48 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 512 14.60 0 0 49 | SRR12480552.69 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 493 14.11 0 0 50 | SRR12480552.75 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 515 10.33 0 0 51 | SRR12480552.13 ../data/bcMangled.fastq.gz 0000000000000000000000000000000000000000 sample 512 15.31 0 0 52 | SRR12480552.4 ../data/bcMangled.fastq.gz sample 489 13.57 0 0 53 | 
SRR12480552.12 ../data/bcMangled.fastq.gz sample 470 10.86 0 0 54 | SRR12480552.6 ../data/bcMangled.fastq.gz sample 499 12.98 0 0 55 | SRR12480552.25 ../data/bcMangled.fastq.gz sample 517 15.36 0 0 56 | SRR12480552.30 ../data/bcMangled.fastq.gz sample 486 13.94 0 0 57 | SRR12480552.33 ../data/bcMangled.fastq.gz sample 502 11.66 0 0 58 | SRR12480552.48 ../data/bcMangled.fastq.gz sample 512 14.60 0 0 59 | SRR12480552.69 ../data/bcMangled.fastq.gz sample 493 14.11 0 0 60 | SRR12480552.75 ../data/bcMangled.fastq.gz sample 515 10.33 0 0 61 | SRR12480552.13 ../data/bcMangled.fastq.gz sample 512 15.32 0 0 62 | 02f8ea0a-48a3-43e3-b3b4-4832304a1d63 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 874 18.23 388 6040 2022-09-27T16:54:30.522+00:00 63 | 049cbf74-5247-4924-9d40-8b13e06d2ce1 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 382 18.45 68 6052 2022-09-27T16:53:36.712+00:00 64 | 0a43aaf1-eb93-4546-b7c2-26d6cad0495a ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 1086 17.68 1311 3548 2022-09-27T16:54:04.381+00:00 65 | 0d6b275d-0988-415e-9e52-584605eac9c7 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 1212 18.00 359 3193 2022-09-27T16:54:31.133+00:00 66 | 06d1b46a-f572-4322-bb4b-125cc04356da ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 567 15.03 329 3280 2022-09-27T16:55:12.575+00:00 67 | 00daaf65-afcc-40cb-9b19-aa2f08431e58 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 1788 20.12 94 5128 2022-09-27T16:54:04.401+00:00 68 | 07b41e60-d563-48e3-add7-c8df26e1b05f ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 528 16.09 76 3965 2022-09-27T16:53:47.919+00:00 69 | 07edc020-dc45-41ca-b3e9-d604a583f2b3 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 625 21.26 98 4130 2022-09-27T16:54:10.177+00:00 70 | 
0f885800-dd67-436b-8388-f1eb11c08283 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 441 19.59 552 5436 2022-09-27T16:54:05.459+00:00 71 | 005af53d-c9bc-4036-ba0e-75d0a65915e2 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 15165 17.84 374 4128 2022-09-27T16:53:06.188+00:00 72 | 001a54f0-480c-4921-a9ac-bb56ac265b53 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 31127 14.62 431 1364 2022-09-27T16:53:46.787+00:00 73 | 0006f7f5-10ec-46c5-826a-e29ccbabd157 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 43346 24.49 421 3305 2022-09-27T16:53:08.748+00:00 74 | 00f5b0a4-9bcc-4d3b-b28c-82ee8a225d96 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 25675 20.28 2666 5266 2022-09-27T16:52:52.092+00:00 75 | 0137de1f-d6ca-4ff2-a057-4a945931e65b ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 3766 22.28 1407 12552 2022-09-27T16:54:20.749+00:00 76 | 01141689-0761-4a0c-9d4e-0e52321d30e3 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 21861 22.55 340 3399 2022-09-27T16:53:32.512+00:00 77 | 007ecbd6-47dc-46b9-9a65-b73e0dbbc7fd ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 42473 22.32 9 1128 2022-09-27T16:53:22.726+00:00 78 | 007f0041-5024-4d59-8ed6-c3871842f74f ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 39158 15.30 147 5862 2022-09-27T16:53:03.892+00:00 79 | 01b3eaa4-c93b-4f48-948f-ef2974ade8d5 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 247 12.14 86 4014 2022-09-27T16:53:50.280+00:00 80 | 0154f130-64f2-4cd4-8f81-8a9d9c3c9c33 ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c sample 12531 20.13 415 1830 2022-09-27T16:53:55.209+00:00 81 | 014199a8-94ce-4d85-b5dd-b922e37d0fab ../data/samtoolsfastq.fastq.gz ee1ba89e9e8bdff1ed2f4b2a81d21464ff7c4f1c 
sample 18416 21.62 576 3293 2022-09-27T16:52:39.735+00:00 82 | -------------------------------------------------------------------------------- /test/parse_rd/RD-first-tag-and-no-RG-CW-4285.fastq: -------------------------------------------------------------------------------- 1 | @read-id RD:Z:dummy_run_id rn:i:1234 2 | AAAAAAAAAAAAAAAAA 3 | + 4 | AAAAAAAAAAAAAAAAA 5 | -------------------------------------------------------------------------------- /test/parse_rd/RD-first-tag-and-no-RG-CW-4285.fastq.runids: -------------------------------------------------------------------------------- 1 | filename run_id count 2 | ../parse_rd/RD-first-tag-and-no-RG-CW-4285.fastq dummy_run_id 1 3 | -------------------------------------------------------------------------------- /test/parse_rd/empty-RD-CW-4299.fastq: -------------------------------------------------------------------------------- 1 | @9ae4818c-61e9-4011-bf40-ca5721922da4 RD:Z: 2 | AAAAAAAAAAAAAAAAAA 3 | + 4 | AAAAAAAAAAAAAAAAAA 5 | -------------------------------------------------------------------------------- /test/parse_rd/empty-RD-CW-4299.fastq.runids: -------------------------------------------------------------------------------- 1 | filename run_id count 2 | ../parse_rd/empty-RD-CW-4299.fastq 1 3 | -------------------------------------------------------------------------------- /test/parse_rg/bad-ones.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/bad-ones.bam -------------------------------------------------------------------------------- /test/parse_rg/bad-ones.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/bad-ones.bam.bai -------------------------------------------------------------------------------- 
/test/parse_rg/bad-ones.bam.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/bad-ones.bam 100 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.bai: -------------------------------------------------------------------------------- 1 | BAI -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam dna_r10.4.1_e8.2_400bps_hac@v4.3.0 21 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v4.3.0.bam.fastq.gz dna_r10.4.1_e8.2_400bps_hac@v4.3.0 21 3 | -------------------------------------------------------------------------------- 
/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.bai: -------------------------------------------------------------------------------- 1 | BAI3 -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam dna_r10.4.1_e8.2_400bps_hac@v5.0.0 21 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0.bam.fastq.gz dna_r10.4.1_e8.2_400bps_hac@v5.0.0 21 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.bai: -------------------------------------------------------------------------------- 1 | BAI -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam dna_r10.4.1_e8.2_400bps_hac@v5.0.0 21 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG.bam.fastq.gz dna_r10.4.1_e8.2_400bps_hac@v5.0.0 21 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam 
-------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.bai: -------------------------------------------------------------------------------- 1 | BAI} -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam dna_r10.4.1_e8.2_400bps_hac@v5.0.0 1661 3 | -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz -------------------------------------------------------------------------------- /test/parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/dna_r10.4.1_e8.2_400bps_hac@v5.0.0_5mCG_5hmCG_rbk.bam.fastq.gz dna_r10.4.1_e8.2_400bps_hac@v5.0.0 1661 3 | -------------------------------------------------------------------------------- /test/parse_rg/mixed.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/mixed.bam -------------------------------------------------------------------------------- /test/parse_rg/mixed.bam.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | 
../parse_rg/mixed.bam dna_r10.4.1_e8.2_400bps_hac@v5.0.0 1703 3 | ../parse_rg/mixed.bam dna_r10.4.1_e8.2_400bps_hac@v4.3.0 21 4 | -------------------------------------------------------------------------------- /test/parse_rg/mixed.bam.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/mixed.bam.fastq.gz -------------------------------------------------------------------------------- /test/parse_rg/mixed.bam.fastq.gz.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/mixed.bam.fastq.gz dna_r10.4.1_e8.2_400bps_hac@v5.0.0 1703 3 | ../parse_rg/mixed.bam.fastq.gz dna_r10.4.1_e8.2_400bps_hac@v4.3.0 21 4 | -------------------------------------------------------------------------------- /test/parse_rg/mixed_basecaller_model_key.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/fastcat/398a95bda9d409d6e49497cb1a3ae21f9115d354/test/parse_rg/mixed_basecaller_model_key.fastq.gz -------------------------------------------------------------------------------- /test/parse_rg/mixed_basecaller_model_key.fastq.gz.callers: -------------------------------------------------------------------------------- 1 | filename basecaller count 2 | ../parse_rg/mixed_basecaller_model_key.fastq.gz dna_r10.4.1_e8.2_400bps_sup@2023-12-15_episeq-cs 12 3 | -------------------------------------------------------------------------------- /test/rg_parse.c: --------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


#include "../src/common.h"

/* One synthetic read group.
 *
 * The non-NULL members are concatenated by main() into a read-group string.
 * runid, basecall_model, mod_model and barcode double as the values the
 * parser is expected to return; suffix is only appended to the input string
 * and is never compared against the parsed result. */
typedef struct {
    char* runid;
    char* basecall_model;
    char* mod_model;
    char* barcode;
    char* suffix;
} TestCase;


/* Compare two strings, either of which may be NULL.
 *
 * Returns 0 when both are NULL or both hold identical text, 1 when exactly
 * one of them is NULL, and otherwise the (non-zero) strcmp() result.
 * Callers in this file only test the result against zero. */
int compare(char* str1, char* str2) {
    if (str1 == NULL && str2 == NULL) {
        return 0;
    }
    if (str1 == NULL || str2 == NULL) {
        return 1;
    }
    /* Both strings are NUL-terminated, so a plain strcmp() covers the whole
     * of the longer string without needing a length bound. */
    return strcmp(str1, str2);
}


/* Return `str`, or a placeholder when it is NULL, so the value can be passed
 * to a "%s" conversion (passing NULL to "%s" is undefined behaviour). */
static const char* printable(const char* str) {
    return str != NULL ? str : "(null)";
}


/* Build a read-group string for every combination of optional components,
 * parse each with create_rg_info() and check the parsed fields.
 *
 * Returns 0 when every case passes and 1 otherwise, so the process exit
 * status reports the test outcome. */
int main() {
    char *runid_acquisition = "ef1af1ab8967cb20ca30dbeca93fd66592bf4619";
    char *runid_protocol = "c886531d-28f5-41f6-b948-948e8cb78e5e";
    char *basecall_model = "basecall_model_name@v1.2.3";
    char *mod_model_name = "basecall_model_name@v1.2.3_5mCG_5hmCG@v1";
    char *barcode = "barcode01";
    char *suffix = "-1A2B3C4D";

    /* Both run-id styles crossed with presence/absence of the mod model,
     * the barcode and the suffix. */
    TestCase cases[] = {
        {runid_acquisition, basecall_model, mod_model_name, barcode, suffix},
        {runid_acquisition, basecall_model, mod_model_name, barcode, NULL},
        {runid_acquisition, basecall_model, mod_model_name, NULL, suffix},
        {runid_acquisition, basecall_model, mod_model_name, NULL, NULL},
        {runid_acquisition, basecall_model, NULL, barcode, suffix},
        {runid_acquisition, basecall_model, NULL, barcode, NULL},
        {runid_acquisition, basecall_model, NULL, NULL, suffix},
        {runid_acquisition, basecall_model, NULL, NULL, NULL},
        {runid_protocol, basecall_model, mod_model_name, barcode, suffix},
        {runid_protocol, basecall_model, mod_model_name, barcode, NULL},
        {runid_protocol, basecall_model, mod_model_name, NULL, suffix},
        {runid_protocol, basecall_model, mod_model_name, NULL, NULL},
        {runid_protocol, basecall_model, NULL, barcode, suffix},
        {runid_protocol, basecall_model, NULL, barcode, NULL},
        {runid_protocol, basecall_model, NULL, NULL, suffix},
        {runid_protocol, basecall_model, NULL, NULL, NULL},
    };
    const size_t n_cases = sizeof(cases) / sizeof(cases[0]);

    int fails = 0;
    /* size_t index: avoids comparing a signed int with the size_t count. */
    for (size_t i = 0; i < n_cases; i++) {

        /* 400 bytes comfortably holds the longest combination of the
         * literals above; calloc() zero-fills so the buffer starts as "". */
        char* read_group = calloc(400, sizeof(char));
        if (read_group == NULL) {
            fprintf(stderr, "Failed to allocate read group buffer\n");
            return 1;
        }
        strcpy(read_group, cases[i].runid);
        if (cases[i].basecall_model != NULL) {
            strcat(read_group, "_");
            strcat(read_group, cases[i].basecall_model);
        }
        if (cases[i].mod_model != NULL) {
            strcat(read_group, "_");
            strcat(read_group, cases[i].mod_model);
        }
        if (cases[i].barcode != NULL) {
            strcat(read_group, "_");
            strcat(read_group, cases[i].barcode);
        }
        if (cases[i].suffix != NULL) {
            /* The suffix is appended directly, with no "_" separator. */
            strcat(read_group, cases[i].suffix);
        }
        printf("Test case %zu: %s\n", i, read_group);

        readgroup* info = create_rg_info(read_group);
        if (info == NULL) {
            /* Treat a missing result as a failed case instead of
             * dereferencing a NULL pointer below. */
            fails++;
            printf(" Failed\n");
            printf(" create_rg_info returned NULL\n");
            free(read_group);
            printf("\n");
            continue;
        }

        /* Each comparison contributes 0 or 1, so `fail` counts mismatches. */
        int fail = 0;
        fail += compare(info->runid, cases[i].runid) != 0;
        fail += compare(info->basecaller, cases[i].basecall_model) != 0;
        fail += compare(info->modcaller, cases[i].mod_model) != 0;
        fail += compare(info->barcode, cases[i].barcode) != 0;

        if (fail) {
            fails++;
            printf(" Failed\n");
            /* Expected and parsed fields are frequently NULL here, so route
             * them through printable() before handing them to "%s". */
            printf(" Expected: %s %s %s %s\n",
                   printable(cases[i].runid),
                   printable(cases[i].basecall_model),
                   printable(cases[i].mod_model),
                   printable(cases[i].barcode));
            printf(" Got: %s %s %s %s\n",
                   printable(info->runid),
                   printable(info->basecaller),
                   printable(info->modcaller),
                   printable(info->barcode));
        }

        /* NOTE(review): this releases only the struct itself. If common.h
         * exposes a dedicated destructor for `readgroup`, it should probably
         * be used here instead -- confirm against src/common.h. */
        free(info);
        free(read_group);
        printf("\n");
    }

    if (fails == 0) {
        printf("All tests passed\n");
    } else {
        printf("%d tests failed\n", fails);
    }
    return fails != 0;
}

-------------------------------------------------------------------------------- /test/sort-sam.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Reads SAM from stdin and writes to STDOUT, sorting auxiliary tags by key."""
import sys
import re

if __name__ == "__main__":
    for line in sys.stdin:
        # Header lines (starting with "@") are dropped, not echoed.
        if line.startswith("@"):
            continue
        fields = line.strip().split("\t")
        # Lines with fewer than the 11 leading fields are skipped silently.
        if len(fields) < 11:
            continue
        core_fields = fields[:11]
        # Tags are sorted as whole strings; the key is the leading part of
        # each tag, so this orders them by key.
        aux_fields = sorted(fields[11:])
        print("\t".join(core_fields + aux_fields))

--------------------------------------------------------------------------------