├── version.cpp ├── .travis.yml ├── .github └── workflows │ └── makefile.yml ├── MolecularID.cpp ├── logging.hpp ├── version.h ├── iohts.cpp ├── Hash.hpp ├── logging.cpp ├── iohts.hpp ├── LICENSE.txt ├── install-dependencies.sh ├── MolecularID.hpp ├── common.cpp ├── bin ├── uvcnorm.sh ├── uvcSurrogateAlign.sh └── uvcTN.sh ├── scripts └── extract-barcodes.py ├── Makefile ├── instcode.hpp ├── uvcActiveRegion.cpp ├── grouping.hpp ├── common.hpp ├── README.md ├── debarcode_main.c ├── main_consensus.hpp ├── CmdLineArgs.hpp ├── main_conversion.hpp └── grouping.cpp /version.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | const char *GIT_DIFF_FULL = 4 | #include "gitdiff.txt" 5 | ; 6 | 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # https://docs.travis-ci.com/user/reference/overview/ : distributions xenial bionic focal # dist: xenial by default 2 | 3 | script: 4 | #- export CC=gcc-7 5 | #- export CXX=g++-7 6 | - ./install-dependencies.sh 7 | - make clean 8 | - make all -j4 9 | - make deploy 10 | -------------------------------------------------------------------------------- /.github/workflows/makefile.yml: -------------------------------------------------------------------------------- 1 | name: Makefile CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Install dependencies 18 | run: ./install-dependencies.sh 19 | - name: Run compile all 20 | run: make all -j8 21 | - name: Run deploy 22 | run: make deploy 23 | -------------------------------------------------------------------------------- /MolecularID.cpp: -------------------------------------------------------------------------------- 1 | #include 
"MolecularID.hpp" 2 | 3 | #include "Hash.hpp" 4 | 5 | uvc1_hash_t MolecularBarcode::calcHash() const { 6 | uvc1_hash_t ret = 0; 7 | ret += hash2hash(ret, hash2hash(beg_tidpos_pair.first, beg_tidpos_pair.second)); 8 | ret += hash2hash(ret, hash2hash(end_tidpos_pair.first, end_tidpos_pair.second)); 9 | ret += hash2hash(ret, strhash(qnamestring.c_str())); 10 | ret += hash2hash(ret, strhash(umistring.c_str())); 11 | ret += hash2hash(ret, duplexflag); 12 | ret += hash2hash(ret, dedup_idflag); 13 | return ret; 14 | } 15 | 16 | -------------------------------------------------------------------------------- /logging.hpp: -------------------------------------------------------------------------------- 1 | #ifndef logging_hpp_INCLUDED 2 | #define logging_hpp_INCLUDED 3 | 4 | #include 5 | #include 6 | 7 | #define LOG(level) \ 8 | if (level > Log::ReportingLevel()) ; \ 9 | else Log().Get(level) 10 | 11 | enum TLogLevel { logCRITICAL , logERROR , logWARNING , logINFO , logINFO2, logDEBUG , logDEBUG1 , logDEBUG2 , logDEBUG3 , logDEBUG4 }; 12 | 13 | class Log { 14 | public: 15 | Log() {}; 16 | virtual ~Log(); 17 | std::ostringstream& 18 | Get(TLogLevel level = logINFO2); 19 | public: 20 | static TLogLevel& ReportingLevel(); 21 | protected: 22 | std::ostringstream os; 23 | private: 24 | Log(const Log&); 25 | Log& operator =(const Log&); 26 | private: 27 | TLogLevel messageLevel; 28 | }; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /version.h: -------------------------------------------------------------------------------- 1 | #ifndef version_INCLUDED 2 | #define version_INCLUDED 3 | #include 4 | 5 | #define VERSION123 "0.15.1" 6 | 7 | #ifndef COMMIT_VERSION 8 | #define COMMIT_VERSION "NotVersionControlled" 9 | #endif 10 | 11 | #ifndef COMMIT_DIFF_SH 12 | #define COMMIT_DIFF_SH "NoVersion" 13 | #endif 14 | 15 | #define VERSION_CLEAN VERSION123 "." COMMIT_VERSION 16 | #define VERSION_DIRTY VERSION123 "." 
COMMIT_VERSION "-dirty" 17 | #define VERSION ((strlen(COMMIT_DIFF_SH) > 0) ? (VERSION_DIRTY) : (VERSION_CLEAN)) 18 | 19 | #define VERSION_DETAIL_CLEAN (VERSION_CLEAN " (" COMMIT_DIFF_SH ")") 20 | #define VERSION_DETAIL_DIRTY (VERSION_DIRTY " (" COMMIT_DIFF_SH ")") 21 | #define VERSION_DETAIL ((strlen(COMMIT_DIFF_SH) > 0) ? (VERSION_DETAIL_DIRTY) : (VERSION_DETAIL_CLEAN)) 22 | 23 | extern const char *GIT_DIFF_FULL; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /iohts.cpp: -------------------------------------------------------------------------------- 1 | #include "iohts.hpp" 2 | #include "common.hpp" 3 | 4 | #include "htslib/faidx.h" 5 | #include "htslib/sam.h" 6 | #include "htslib/vcf.h" 7 | 8 | #include 9 | 10 | bool BedLine::is_valid() { 11 | return ((tid >= 0) || (tname.size() > 0)) && (beg_pos < end_pos); 12 | } 13 | 14 | std::vector 15 | load_bam_records( 16 | int & sam_itr_ret, 17 | samFile * sam_infile, 18 | const hts_idx_t * hts_idx, 19 | const uvc1_refgpos_t query_tid, 20 | const uvc1_refgpos_t query_beg, 21 | const uvc1_refgpos_t query_end) { 22 | 23 | std::vector ret; 24 | bam1_t *aln = bam_init1(); 25 | hts_itr_t *hts_itr = sam_itr_queryi(hts_idx, query_tid, query_beg, query_end); 26 | int itr_result = 0; 27 | while ((itr_result = sam_itr_next(sam_infile, hts_itr, aln)) >= 0) { 28 | ret.push_back(bam_dup1(aln)); 29 | } 30 | sam_itr_ret = itr_result; 31 | sam_itr_destroy(hts_itr); 32 | bam_destroy1(aln); 33 | return ret; 34 | } 35 | 36 | -------------------------------------------------------------------------------- /Hash.hpp: -------------------------------------------------------------------------------- 1 | #ifndef Hashing_hpp_INLCUDED 2 | #define Hashing_hpp_INLCUDED 3 | 4 | #include "common.hpp" 5 | 6 | template 7 | inline 8 | uvc1_hash_t 9 | strnhash(const T *str, size_t n, const uvc1_hash_t base = 31UL) { 10 | uvc1_hash_t ret = 0; 11 | for (size_t i = 0; i < n && str[i]; i++) { 12 | ret = 
ret * base + ((uvc1_hash_t)str[i]); 13 | } 14 | return ret; 15 | } 16 | 17 | template 18 | inline 19 | uvc1_hash_t 20 | strnhash_rc(const T *str, size_t n, const uvc1_hash_t base = 31UL) { 21 | uvc1_hash_t ret = 0; 22 | for (size_t i = 0; i < n && str[i]; i++) { 23 | ret = ret * base + STATIC_REV_COMPLEMENT.data[((uvc1_hash_t)str[n-i-(size_t)1])]; 24 | } 25 | return ret; 26 | } 27 | 28 | template 29 | inline 30 | uvc1_hash_t 31 | strhash(const T *str, const uvc1_hash_t base = 31UL) { 32 | return strnhash(str, SIZE_MAX, base); 33 | } 34 | 35 | inline 36 | uvc1_hash_t 37 | hash2hash(uvc1_hash_t hash1, uvc1_hash_t hash2) { 38 | return hash1 * ((1UL<<(31UL)) - 1UL) + hash2; 39 | } 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /logging.cpp: -------------------------------------------------------------------------------- 1 | #include "logging.hpp" 2 | 3 | // Defined in header: enum TLogLevel{ logCRITICAL , logERROR , logWARNING , logINFO , logINFO2, logDEBUG , logDEBUG1 , logDEBUG2 , logDEBUG3 , logDEBUG4 }; 4 | const char *TLogLevelToString[10] = {"logCRITICAL", "logERROR", "logWARNING", "logINFO", "logINFO2", "logDEBUG", "logDEBUG1", "logDEBUG2", "logDEBUG3", "logDEBUG4"}; 5 | 6 | static TLogLevel globalMessageLevel = logINFO2; 7 | 8 | char * 9 | nowtime(char *buffer) { 10 | time_t rawtime; 11 | time(&rawtime); 12 | struct tm t; 13 | localtime_r(&rawtime, &t); 14 | strftime(buffer, 128, "%F %T %z", &t); 15 | return buffer; 16 | } 17 | 18 | TLogLevel& Log::ReportingLevel() { return globalMessageLevel; }; 19 | 20 | std::ostringstream& 21 | Log::Get(TLogLevel level) { 22 | char buffer[128]; 23 | os << "- " << nowtime(buffer); 24 | os << " " << TLogLevelToString[level] << ": "; 25 | messageLevel = level; 26 | return os; 27 | } 28 | 29 | Log::~Log() { 30 | if (messageLevel <= Log::ReportingLevel()) { 31 | os << std::endl; 32 | fprintf(stderr, "%s", os.str().c_str()); 33 | fflush(stderr); 34 | } 35 | } 36 | 37 | 
-------------------------------------------------------------------------------- /iohts.hpp: -------------------------------------------------------------------------------- 1 | #ifndef IS_IOHTS_INCLUDED 2 | #define IS_IOHTS_INCLUDED 3 | 4 | #include "common.hpp" 5 | 6 | #include "htslib/sam.h" 7 | #include "htslib/hts.h" 8 | 9 | #include 10 | #include 11 | 12 | #define BED_END_TO_END_BIT 0x1 13 | 14 | struct BedLine { 15 | std::string tname; // can be set to empty if tid != -1 16 | uvc1_refgpos_t tid; // can be set to -1 if !tname.empty() 17 | uvc1_refgpos_t beg_pos; 18 | uvc1_refgpos_t end_pos; 19 | uvc1_flag_t region_flag; 20 | uvc1_readnum_big_t n_reads; 21 | 22 | BedLine( 23 | uvc1_refgpos_t a_tid, 24 | uvc1_refgpos_t a_beg_pos, 25 | uvc1_refgpos_t a_end_pos, 26 | const uvc1_flag_t a_region_flag, 27 | const uvc1_readnum_big_t a_n_reads) { 28 | this->tid = a_tid; 29 | this->beg_pos = a_beg_pos; 30 | this->end_pos = a_end_pos; 31 | this->region_flag = a_region_flag; 32 | this->n_reads = a_n_reads; 33 | }; 34 | bool is_valid(); 35 | }; 36 | 37 | std::vector 38 | load_bam_records( 39 | int & sam_itr_queryi_ret, 40 | samFile *samfile, 41 | const hts_idx_t * hts_idx, 42 | const uvc1_refgpos_t query_tid, 43 | const uvc1_refgpos_t query_beg, 44 | const uvc1_refgpos_t query_end); 45 | 46 | #endif 47 | 48 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Genetron Health 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /install-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # cmdline param 1: option for this script 4 | # cmdline param 2: option for the configure script in bcftools and samtools 5 | 6 | set -evx 7 | currdir="${PWD}" 8 | 9 | if [ $(echo "${1}" | grep skip-bcftools | wc -l) -eq 0 ]; then 10 | mkdir -p "${currdir}/ext/" 11 | cd "${currdir}/ext/" 12 | if [ $(echo "${1}" | grep skip-downloading-bcftools | wc -l) -eq 0 ]; then 13 | wget -c https://github.com/samtools/bcftools/releases/download/1.11/bcftools-1.11.tar.bz2 14 | fi 15 | tar -xvf bcftools-1.11.tar.bz2 16 | cd "${currdir}/ext/bcftools-1.11" 17 | ./configure ${2} 18 | make -j 4 19 | cp bcftools "${currdir}/bin/" 20 | 21 | # htslib-*-lowdep is used for compiling UVC 22 | cp -r "${currdir}/ext/bcftools-1.11/htslib-1.11" "${currdir}/ext/htslib-1.11-lowdep" 23 | cd "${currdir}/ext/htslib-1.11-lowdep" 24 | ./configure -disable-plugins --disable-libcurl --disable-s3 --disable-largefile --without-libdeflate ${2} # --disable-bz2 and --disable-lzma are both for disabling CRAM files 25 | make -j 4 26 | 27 | # make install # this command may fail without root privilege, but it does not matter much as bcftools is in the PATH variable by exporting in uvcTN.sh 28 | fi 29 | 30 | # samtools is not directly used by UVC but is often used for processing the input to UVC. 
31 | if [ $(echo "${1}" | grep skip-samtools | wc -l) -eq 0 ]; then 32 | mkdir -p "${currdir}/ext/" 33 | cd "${currdir}/ext/" 34 | if [ $(echo "${1}" | grep skip-downloading-samtools | wc -l) -eq 0 ]; then 35 | wget -c https://github.com/samtools/samtools/releases/download/1.11/samtools-1.11.tar.bz2 36 | fi 37 | tar -xvf samtools-1.11.tar.bz2 38 | cd "${currdir}/ext/samtools-1.11" 39 | ./configure ${2} 40 | make -j 4 41 | cp samtools "${currdir}/bin/" 42 | fi 43 | 44 | if [ $(echo "${1}" | grep skip-parallel | wc -l) -eq 0 ]; then 45 | cd "${currdir}/ext/" 46 | wget -c http://ftp.gnu.org/gnu/parallel/parallel-20201122.tar.bz2 47 | tar -xvf parallel-20201122.tar.bz2 48 | cd "${currdir}/ext/parallel-20201122" 49 | ./configure 50 | make 51 | cp src/parallel "${currdir}/bin/" 52 | fi 53 | 54 | -------------------------------------------------------------------------------- /MolecularID.hpp: -------------------------------------------------------------------------------- 1 | #ifndef MolecularID_hpp_INCLUDED 2 | #define MolecularID_hpp_INCLUDED 3 | 4 | #include "common.hpp" 5 | 6 | #include 7 | 8 | struct MolecularBarcode { 9 | 10 | std::pair beg_tidpos_pair; 11 | std::pair end_tidpos_pair; 12 | std::string qnamestring = ""; 13 | std::string umistring = ""; 14 | 15 | uvc1_flag_t duplexflag = 0x0; 16 | uvc1_flag_t dedup_idflag = 0x0; 17 | 18 | uvc1_hash_t hashvalue; 19 | 20 | MolecularBarcode 21 | createKey() const { 22 | MolecularBarcode mb; 23 | 24 | mb.beg_tidpos_pair = std::make_pair(-1, -1); 25 | mb.end_tidpos_pair = std::make_pair(-1, -1); 26 | if (0x3 == (0x3 & dedup_idflag)) { 27 | auto min2 = MIN(this->beg_tidpos_pair, this->end_tidpos_pair); 28 | auto max2 = MAX(this->beg_tidpos_pair, this->end_tidpos_pair); 29 | mb.beg_tidpos_pair = min2; 30 | mb.end_tidpos_pair = max2; 31 | } else if (0x1 & dedup_idflag) { 32 | mb.beg_tidpos_pair = this->beg_tidpos_pair; 33 | } else if (0x2 & dedup_idflag) { 34 | mb.end_tidpos_pair = this->end_tidpos_pair; 35 | } 36 | 37 | if 
(0x4 & dedup_idflag) { 38 | mb.qnamestring = this->qnamestring; 39 | } else { 40 | mb.qnamestring = ""; 41 | } 42 | if (0x8 & dedup_idflag) { 43 | mb.umistring = this->umistring; 44 | } else { 45 | mb.umistring = ""; 46 | } 47 | 48 | mb.duplexflag = this->duplexflag; 49 | mb.dedup_idflag = this->dedup_idflag; 50 | return mb; 51 | } 52 | bool 53 | operator<(const MolecularBarcode & that) const { 54 | bool isdiff, isless; 55 | compare_diff_less(isdiff, isless, this->beg_tidpos_pair, that.beg_tidpos_pair); 56 | if (isdiff) { return isless; } 57 | compare_diff_less(isdiff, isless, this->end_tidpos_pair, that.end_tidpos_pair); 58 | if (isdiff) { return isless; } 59 | compare_diff_less(isdiff, isless, this->qnamestring, that.qnamestring); 60 | if (isdiff) { return isless; } 61 | compare_diff_less(isdiff, isless, this->umistring, that.umistring); 62 | if (isdiff) { return isless; } 63 | compare_diff_less(isdiff, isless, this->duplexflag, that.duplexflag); 64 | if (isdiff) { return isless; } 65 | compare_diff_less(isdiff, isless, this->dedup_idflag, that.dedup_idflag); 66 | if (isdiff) { return isless; } 67 | return (this->hashvalue < that.hashvalue); 68 | } 69 | uvc1_hash_t calcHash() const; 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.hpp" 2 | 3 | const auto _ASSAY_TYPE_TO_MSG = std::array({{ 4 | [ASSAY_TYPE_AUTO] = "Assay type of each molecule fragment will be automatically inferred from the data", 5 | [ASSAY_TYPE_CAPTURE] = "Data is generatd from a capture-based assay with selection by probe hybridization", 6 | [ASSAY_TYPE_AMPLICON] = "Data is generated from an amplicon-based assay with targeted amplification by PCR", 7 | }}); 8 | const std::vector ASSAY_TYPE_TO_MSG = std::vector(_ASSAY_TYPE_TO_MSG.begin(), _ASSAY_TYPE_TO_MSG.end()); 9 | 10 | const auto _MOLECULE_TAG_TO_MSG = std::array({{ 
11 | [MOLECULE_TAG_AUTO] = "Molecule tag of each molecule fragment will be automatically inferred from the data", 12 | [MOLECULE_TAG_NONE] = "Molecule is not tagged", 13 | [MOLECULE_TAG_BARCODING] = "Molecule is tagged with a unique molecular identifer (UMI) on one strand as in Safe-SeqS", 14 | [MOLECULE_TAG_DUPLEX] = "Molecule is tagged with a duplex UMI", 15 | }}); 16 | const std::vector MOLECULE_TAG_TO_MSG = std::vector(_MOLECULE_TAG_TO_MSG.begin(), _MOLECULE_TAG_TO_MSG.end()); 17 | 18 | const auto _SEQUENCING_PLATFORM_TO_MSG = std::array({{ 19 | [SEQUENCING_PLATFORM_AUTO] = "Unknown sequencing platform that will be automatically inferred from the data", 20 | [SEQUENCING_PLATFORM_ILLUMINA] = "Illumina sequencing platform (compatible with BGI and MGI)", 21 | [SEQUENCING_PLATFORM_IONTORRENT] = "IonTorrent sequencing platform by Life Technologies and ThermoFisher", 22 | [SEQUENCING_PLATFORM_OTHER] = "Other sequencing platform (for example, Nanopore)", 23 | }}); 24 | const std::vector SEQUENCING_PLATFORM_TO_MSG = std::vector(_SEQUENCING_PLATFORM_TO_MSG.begin(), _SEQUENCING_PLATFORM_TO_MSG.end()); 25 | 26 | const auto _SEQUENCING_PLATFORM_TO_NAME = std::array({{ 27 | [SEQUENCING_PLATFORM_AUTO] = "AUTO", 28 | [SEQUENCING_PLATFORM_ILLUMINA] = PLAT_ILLUMINA_LIKE, 29 | [SEQUENCING_PLATFORM_IONTORRENT] = PLAT_ION_LIKE, 30 | [SEQUENCING_PLATFORM_OTHER] = "OtherSequencingPlatform", 31 | }}); 32 | const std::vector SEQUENCING_PLATFORM_TO_NAME = std::vector( _SEQUENCING_PLATFORM_TO_NAME.begin(), _SEQUENCING_PLATFORM_TO_NAME.end()); 33 | 34 | const auto _PAIR_END_MERGE_TO_MSG = std::array ({{ 35 | [PAIR_END_MERGE_YES] = "paired-end sequenced segments are merged", 36 | [PAIR_END_MERGE_NO] = "paired-end sequenced segments are not merged", 37 | }}); 38 | const std::vector PAIR_END_MERGE_TO_MSG = std::vector(_PAIR_END_MERGE_TO_MSG.begin(), _PAIR_END_MERGE_TO_MSG.end()); 39 | 40 | -------------------------------------------------------------------------------- /bin/uvcnorm.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DEFAULT_NORM_FLAG="-m+both" 4 | DEFAULT_NUM_THREADS=4 5 | DEFAULT_MIN_SNV_QUAL=58.5 6 | DEFAULT_MIN_NON_SNV_QUAL=49.5 7 | DEFAULT_MIN_NLODQ=-9999 8 | 9 | scriptdir="$(dirname "$(which "$0")")" 10 | 11 | if [ $# -lt 2 ]; then 12 | echo "Usage: $0

[] [] []" 13 | echo " input-vcf: UVC-generated VCF with non-normalized variants. " 14 | echo " output-vcf: normalized VCF, which can be evaluated by tools such as RTG vcfeval for assessing somatic-variant-calling performance. " 15 | echo " multiallelic-control: snps|indels|both|any, same as the --multiallelics option in bcftools [${DEFAULT_NORM_FLAG}]. " 16 | echo " num-threads: the number of threads to use, same as the --threads option in bcftools [${DEFAULT_NUM_THREADS}]. " 17 | echo " min-SNV-qual: the minimum QUAL at each position below which the variant is always not merged [${DEFAULT_MIN_SNV_QUAL}]. " 18 | echo " min-non-SNV-qual: the minimum QUAL at each position below which the variant is always not merged [${DEFAULT_MIN_NON_SNV_QUAL}]. " 19 | echo " min-NLODQ: the minimum NLODQ at each position below which the variant is filtered out. " 20 | echo " This option can remove tumor SNV/InDel with InDel/SNV in the matched normal at the same position, respectively. " 21 | echo " Set to a very negative value to disable this filter [${DEFAULT_MIN_NLODQ}]. 
" 22 | exit 1 23 | fi 24 | 25 | if [ -z "${3}" ]; then 26 | normflag="${DEFAULT_NORM_FLAG}" 27 | else 28 | normflag="${3}" 29 | fi 30 | 31 | if [ -z "${4}" ]; then 32 | numthreads="${DEFAULT_NUM_THREADS}" 33 | else 34 | numthreads="${4}" 35 | fi 36 | 37 | if [ -z "${5}" ]; then 38 | minSNVqual="${DEFAULT_MIN_SNV_QUAL}" 39 | else 40 | minSNVqual="${5}" 41 | fi 42 | 43 | if [ -z "${6}" ]; then 44 | minNonSNVqual="${DEFAULT_MIN_NON_SNV_QUAL}" 45 | else 46 | minNonSNVqual="${6}" 47 | fi 48 | 49 | if [ -z "${7}" ]; then 50 | minNLODQ="${DEFAULT_MIN_NLODQ}" 51 | else 52 | minNLODQ="${7}" 53 | fi 54 | 55 | export PATH="${scriptdir}:${PATH}" # remove this line in the rare case that an important executable is shadowed by this command 56 | 57 | vcfnf=$(bcftools view --header-only "${1}" | tail -n1 | awk '{print NF}') 58 | if [ ${vcfnf} -eq 10 ]; then 59 | si=0 60 | elif [ ${vcfnf} -eq 11 ]; then 61 | si=1 62 | else 63 | echo "The input VCF ${1} has ${vcfnf} columns, but only 10 (tumor-only sample) or 11 (tumor and normal samples) are expected!" 64 | exit 1 65 | fi 66 | 67 | bcftools view --threads $numthreads -i \ 68 | " ALT != '*' 69 | && (vNLODQ[0:0] > ${minNLODQ} && vNLODQ[0:1] > ${minNLODQ}) 70 | && ((TYPE == 'snps' && QUAL >= ${minSNVqual}) || (TYPE != 'snps' && QUAL >= ${minNonSNVqual}) 71 | || ((cVQ1M[${si}:0] - cVQ2M[${si}:0] >= 0) && (cVQ1M[${si}:0] - cVQ1[${si}:1] == 0)) 72 | || ((cVQ1M[${si}:0] - cVQ2M[${si}:0] < 0) && (cVQ2M[${si}:0] - cVQ2[${si}:1] == 0)))" "${1}" \ 73 | | bcftools norm ${normflag} -Oz -o "${2}" 74 | bcftools index --threads $numthreads -ft "${2}" 75 | 76 | exit 0 77 | 78 | -------------------------------------------------------------------------------- /bin/uvcSurrogateAlign.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # WARNING: this script is still in alpha stage and not production-ready !!! 
4 | 5 | # params: outvcf invcf ref bam[,bed] InDel-length 6 | # output: outvcf: vcf with surrogate alignments 7 | 8 | set -evx 9 | 10 | scriptdir="$(dirname "$(which "$0")")" 11 | 12 | export PATH="${scriptdir}:${PATH}" # remove this line in the rare case that an important executable is shadowed by this command 13 | 14 | if [ -z "${UVC_BIN_EXE_FULL_NAME}" ]; then 15 | UVC_BIN_EXE_FULL_NAME="${scriptdir}/uvc1" 16 | else 17 | echo "WARNING: using UVC_BIN_EXE_FULL_NAME=${UVC_BIN_EXE_FULL_NAME} from environment variable." 18 | echo "Please enter the shell command (unset UVC_BIN_EXE_FULL_NAME) before running uvcTN.sh if the default uvc binary exe full path should be used." 19 | fi 20 | 21 | BWA_SURROGATE_PARAMS=" -A 3 -B 12 -O 18 -E 1 -L 18 " 22 | GROUNDTRUTH_ALLELE_IDENTITY_FLAG="both" 23 | 24 | outvcf="${1}" 25 | invcf="${2}" 26 | ref="${3}" 27 | bam=$(echo "${4}" | awk -F "," '{print $1}') 28 | bed=$(echo "${4}" | awk -F "," '{print $2}') 29 | indelsize=14 30 | 31 | outdir="${outvcf}.surrogate" 32 | #fq="${outdir}/surrogate.fastq.gz" 33 | fq0="${outdir}/surrogate.SE.fastq.gz" 34 | fq1="${outdir}/surrogate.R1.fastq.gz" 35 | fq2="${outdir}/surrogate.R2.fastq.gz" 36 | #sfq="${outdir}/surrogate.singleton.fastq.gz" 37 | 38 | ncpus=32 39 | nbams=8 40 | 41 | mkdir -p "${outdir}" 42 | if [ -z "${bed}" ]; then 43 | bed="${outdir}/superactive.bed" 44 | echo "Will generate the bed file ${bed} which contains superactive regions used for surrogate alignments. 
" 45 | printf "track name=superactive description=\"Containing super-active regions (regions with noisy alignments) for surrogate alignments.\"\n" > "${bed}" 46 | bcftools query -f '%CHROM\t%POS0\t%END\t%ID\n' -i "ALT = \"\"" "${invcf}" | bedtools slop -b 120 -g "${ref}.fai" -i - | bedtools merge -i - >> "${bed}" 47 | fi 48 | 49 | samtools view -@ ${nbams} -L "${bed}" "${bam}" -bhu | samtools sort -@ ${nbams} -n -u -o - | samtools fastq -@ ${nbams} - -s "${fq0}" -1 "${fq1}" -2 "${fq2}" 50 | bwa mem ${BWA_SURROGATE_PARAMS} -t ${ncpus} "${ref}" "${fq1}" "${fq2}" | samtools sort -@ ${nbams} - -o "${outdir}/surrogate.bam" 51 | samtools index -@ ${nbams} "${outdir}/surrogate.bam" 52 | 53 | sample=$(bcftools view "${invcf}" --header-only | tail -n1 |awk '{print $NF}') 54 | "${UVC_BIN_EXE_FULL_NAME}" -t ${ncpus} --outvar-flag 0xF -f "${ref}" -s "${sample}" -o "${outdir}/surrogate.vcf.gz" "${outdir}/surrogate.bam" 55 | bcftools index --threads ${nbams} -ft "${outdir}/surrogate.vcf.gz" 56 | 57 | #bcftools view --threads ${nbams} -i "TYPE != \"indel\" || (abs(strlen(ALT)-strlen(REF)) < ${indelsize})" "${invcf}" -Oz -o "${outdir}/original-filt.vcf.gz" 58 | 59 | bcftools view --threads ${nbams} "${invcf}" -Oz -o "${outdir}/original-filt.vcf.gz" 60 | bcftools index --threads ${nbams} "${outdir}/original-filt.vcf.gz" 61 | bcftools view --threads ${nbams} -i "TYPE = \"indel\" && (abs(strlen(ALT)-strlen(REF)) > ${indelsize}) && GERMLINE=1 && GT != \"ref\"" "${outdir}/surrogate.vcf.gz" -Oz -o "${outdir}/surrogate-filt.vcf.gz" 62 | bcftools index --threads ${nbams} "${outdir}/surrogate-filt.vcf.gz" 63 | 64 | bcftools concat --threads ${nbams} -a -d ${GROUNDTRUTH_ALLELE_IDENTITY_FLAG} "${outdir}/surrogate-filt.vcf.gz" "${outdir}/original-filt.vcf.gz" -Oz -o "${outvcf}" 65 | bcftools index --threads ${nbams} -ft "${outvcf}" 66 | 67 | -------------------------------------------------------------------------------- /scripts/extract-barcodes.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import gzip 4 | 5 | def iter_fq(fqfile): 6 | hdr, seq, comment, qual = ('', '', '', '') 7 | for i, line in enumerate(fqfile): 8 | if i % 4 == 0: 9 | assert line.startswith('@') 10 | hdr = line[1:] # SRR3478158.1565 11 | #hdrtokens = hdr.split()[0].split('.') 12 | #if len(hdrtokens) > 2 and hdrtokens[2] in ['1', '2']: 13 | # hdrtokens[2] = '12' 14 | # hdr = '.'.join(hdrtokens) 15 | elif i % 4 == 1: 16 | seq = line 17 | elif i % 4 == 2: 18 | assert line.startswith('+') 19 | comment = line[1:] 20 | else: 21 | qual = line 22 | yield hdr, seq, comment, qual 23 | 24 | def change_hdr(hdr, seq1, umi_beg1, umi_end1, seq2, umi_beg2, umi_end2, isduplex): 25 | hdr2 = hdr.rstrip().split() 26 | if (len(hdr2) > 1 and len(hdr2[1]) > 15 and len(hdr2[1].split(':')) > 2): 27 | hdr3 = hdr2[1] 28 | else: 29 | hdr3 = hdr2[0] 30 | hdrtokens = hdr3.split('.') 31 | if len(hdrtokens) > 2 and hdrtokens[2] in ['1', '2']: 32 | hdrtokens[2] = '12' 33 | hdr3 = '.'.join(hdrtokens) 34 | r1tag = (seq1[umi_beg1:umi_end1] if len(seq1) > umi_end1 else ''.join(['N' for _ in range(umi_end1-umi_beg1)])) 35 | r2tag = (seq2[umi_beg2:umi_end2] if len(seq2) > umi_end2 else ''.join(['N' for _ in range(umi_end2-umi_beg2)])) 36 | combsymbol = ('+' if isduplex else '-') 37 | hdr4 = hdr3 + '#' + ((r1tag + combsymbol + r2tag) if (r1tag != '' and r2tag != '') else (r1tag + r2tag)) 38 | return hdr4 + '\n' 39 | 40 | def proc(r1infile, r2infile, r1outfile, r2outfile, incluBeg1 = 0, excluEnd1 = 11, incluBeg2 = 0, excluEnd2 = 11, isduplex = False): 41 | g1, g2 = iter_fq(r1infile), iter_fq(r2infile) 42 | rec1 = next(g1, -1) 43 | rec2 = next(g2, -1) 44 | while rec1 != -1 and rec2 != -1: 45 | #hdr1 = change_hdr(rec1[0], rec1[1]) 46 | #hdr2 = change_hdr(rec2[0], rec1[1]) 47 | hdr1 = change_hdr(rec1[0], rec1[1], incluBeg1, excluEnd1, rec2[1], incluBeg2, excluEnd2, isduplex) 48 | hdr2 = 
change_hdr(rec2[0], rec1[1], incluBeg1, excluEnd1, rec2[1], incluBeg2, excluEnd2, isduplex) 49 | r1outfile.write('@{}{}+{}{}'.format(hdr1, rec1[1], rec1[2], rec1[3])) 50 | r2outfile.write('@{}{}+{}{}'.format(hdr2, rec2[1], rec2[2], rec2[3])) 51 | rec1 = next(g1, -1) 52 | rec2 = next(g2, -1) 53 | assert rec1 == -1 and rec2 == -1 54 | 55 | r1in = sys.argv[1] 56 | r2in = sys.argv[2] 57 | 58 | r1out = sys.argv[3] 59 | r2out = sys.argv[4] 60 | 61 | assert 4 == len(set([r1in, r2in, r1out, r2out])) or r2in == 'None' 62 | if r2in == 'None': r2in = r1in 63 | 64 | if len(sys.argv) > 6: 65 | incluBeg = int(sys.argv[5]) 66 | excluEnd = int(sys.argv[6]) 67 | else: 68 | incluBeg = 0 69 | excluEnd = 23 70 | 71 | if len(sys.argv) > 8: 72 | incluBeg2 = int(sys.argv[7]) 73 | excluEnd2 = int(sys.argv[8]) 74 | else: 75 | incluBeg2 = 0 76 | excluEnd2 = 0 77 | 78 | if len(sys.argv) > 9 and 'duplex' not in sys.argv[9]: 79 | isduplex = False 80 | else: 81 | isduplex = True 82 | 83 | with gzip.open(r1in) as r1infile, gzip.open(r2in) as r2infile, gzip.open(r1out, 'wb', compresslevel=1) as r1outfile, gzip.open(r2out, 'wb', compresslevel=1) as r2outfile: 84 | proc(r1infile, r2infile, r1outfile, r2outfile, incluBeg, excluEnd, incluBeg2, excluEnd2, isduplex) 85 | 86 | 87 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Example command to build: make all -j9 && make deploy 2 | 3 | release = debarcode uvc-1-fopenmp-thread uvc-1-cpp-std-thread 4 | release : $(release) 5 | debug = debarcode uvc-2-fopenmp-thread-asan.debug uvc-2-cpp-std-thread-asan.debug uvc-3-asan.debug uvc-4.debug 6 | debug : $(debug) 7 | debug-ub : debarcode uvc-2-fopenmp-thread-ubsan.debug uvc-2-cpp-std-thread-ubsan.debug uvc-3-ubsan.debug 8 | all : release debug 9 | ALL : all debug-ub 10 | 11 | HDR=CLI11-1.7.1/CLI11.hpp Hash.hpp main_conversion.hpp main_consensus.hpp \ 12 | CmdLineArgs.hpp 
common.hpp grouping.hpp iohts.hpp logging.hpp main.hpp MolecularID.hpp version.h 13 | SRC=CmdLineArgs.cpp common.cpp grouping.cpp iohts.cpp logging.cpp main.cpp MolecularID.cpp version.cpp 14 | DEP=bcf_formats.step1.hpp instcode.hpp Makefile 15 | 16 | HTSPATH=ext/htslib-1.11-lowdep/libhts.a 17 | HTSFLAGS=$(HTSPATH) -I ext/htslib-1.11-lowdep/ -pthread -lm -lz -lbz2 -llzma # -lcurl -lcrypto # can be changed depending on the specific installed components of htslib (please refer to the INSTALL file in htslib) 18 | CC=gcc # can be changed to clang or other compilers as needed 19 | CXX=g++ # can be changed to clang or other compilers as needed 20 | CXXFLAGS=-std=c++14 -static-libstdc++ -Wall 21 | COMMIT_VERSION=$(shell git rev-parse HEAD | head -c 7) 22 | COMMIT_DIFF_SH=$(shell git diff HEAD --shortstat) 23 | COMMIT_DIFF_FULL=$(shell echo "R\"ZXF_specQUOTE(\n $$(git diff HEAD | sed 's/ZXF_specQUOTE/ZXF_specquote/g') \n)ZXF_specQUOTE\"" > gitdiff.txt) 24 | VERFLAGS=-DCOMMIT_VERSION="\"$(COMMIT_VERSION)\"" -DCOMMIT_DIFF_SH="\"$(COMMIT_DIFF_SH)\"" -DCOMMIT_DIFF_FULL="\"$(COMMIT_DIFF_FULL)\"" 25 | # UVC_IN_DEBUG_MODE enables locus-specific diagnostic debugging info 26 | DEBUG_OPTS=-DUVC_IN_DEBUG_MODE -static-libasan 27 | UBSAN=--param=max-vartrack-size=640000000 -fsanitize=undefined 28 | 29 | debarcode : debarcode_main.c version.h Makefile 30 | $(CC) -O3 -o debarcode $(VERFLAGS) debarcode_main.c ${HTSFLAGS} 31 | 32 | # the main executable, uses OpenMP for multi-threading 33 | uvc-1-fopenmp-thread : $(HDR) $(SRC) $(DEP) 34 | $(CXX) -O3 -DNDEBUG -o uvc-1-fopenmp-thread $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) -fopenmp # -l htslib 35 | 36 | # the main executable, uses C++ standard template library thread for multi-threading, useful if OpenMP runtime is not available 37 | uvc-1-cpp-std-thread : $(HDR) $(SRC) $(DEP) 38 | $(CXX) -O3 -DNDEBUG -o uvc-1-cpp-std-thread $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) -DUSE_STDLIB_THREAD # -l htslib 39 | 40 | 
uvc-2-fopenmp-thread-asan.debug : $(HDR) $(SRC) $(DEP) 41 | $(CXX) -O2 -g -p -o uvc-2-fopenmp-thread-asan.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) -fopenmp -fsanitize=address 42 | uvc-2-fopenmp-thread-ubsan.debug : $(HDR) $(SRC) $(DEP) 43 | $(CXX) -O2 -g -p -o uvc-2-fopenmp-thread-ubsan.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) -fopenmp $(UBSAN) 44 | uvc-2-cpp-std-thread-asan.debug : $(HDR) $(SRC) $(DEP) 45 | $(CXX) -O2 -g -p -o uvc-2-cpp-std-thread-asan.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) -DUSE_STDLIB_THREAD -fsanitize=address 46 | uvc-2-cpp-std-thread-ubsan.debug : $(HDR) $(SRC) $(DEP) 47 | $(CXX) -O2 -g -p -o uvc-2-cpp-std-thread-ubsan.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) -DUSE_STDLIB_THREAD $(UBSAN) 48 | uvc-3-asan.debug : $(HDR) $(SRC) $(DEP) 49 | $(CXX) -O2 -g -p -o uvc-3-asan.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) -fsanitize=address 50 | uvc-3-ubsan.debug : $(HDR) $(SRC) $(DEP) 51 | $(CXX) -O2 -g -p -o uvc-3-ubsan.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) $(UBSAN) 52 | uvc-4.debug : $(HDR) $(SRC) $(DEP) 53 | $(CXX) -O0 -g -p -o uvc-4.debug $(CXXFLAGS) $(VERFLAGS) $(SRC) $(HTSFLAGS) $(DEBUG_OPTS) -Wextra -DENABLE_ASSERT_IN_UVC 54 | 55 | bcf_formats_generator1.out : bcf_formats_generator1.cpp version.h 56 | $(CXX) -o bcf_formats_generator1.out $(CXXFLAGS) bcf_formats_generator1.cpp 57 | 58 | bcf_formats.step1.hpp : bcf_formats_generator1.out 59 | ./bcf_formats_generator1.out > bcf_formats.step1.hpp # auto-generate the C++ code from the BCF-template generator 60 | 61 | .PHONY: release all debug debug-ub ALL clean deploy 62 | 63 | clean: 64 | rm bcf_formats_generator1.out bcf_formats.step1.hpp *.o *.debug uvc-1-fopenmp-thread uvc-1-cpp-std-thread *.gch debarcode || true 65 | 66 | deploy: 67 | cp uvc-1-fopenmp-thread bin/uvc1 # The default binary executable uses OpenMP threads, and uvc1 is used by uvcTN.sh 68 | cp $(release) 
bin/ || true 69 | cp $(debug) bin/ || true 70 | 71 | -------------------------------------------------------------------------------- /instcode.hpp: -------------------------------------------------------------------------------- 1 | 2 | #undef INDELTYPE 3 | #if INDEL_ID == 1 4 | #define INDELTYPE std::string 5 | #else 6 | #define INDELTYPE uvc1_readpos_t 7 | #endif 8 | 9 | // this code is instantiated multiple times, with INDELTYPE as instantiation parameter. 10 | std::array 11 | #if INDEL_ID == 1 12 | fill_by_indel_info2_1 13 | #else 14 | fill_by_indel_info2_2 15 | #endif 16 | (bcfrec::BcfFormat & fmt, 17 | const Symbol2CountCoverageSet & symbol2CountCoverageSet IGNORE_UNUSED_PARAM, 18 | const int strand, 19 | const uvc1_refgpos_t refpos, 20 | const AlignmentSymbol symbol, 21 | const std::map> & bq_tsum_depth, 22 | const std::map> & fq_tsum_depth, 23 | const std::map> & fq_tsum_depth_c2DP, 24 | const std::map> & fq_tsum_depth_c2dDP, 25 | 26 | const std::string & refchars IGNORE_UNUSED_PARAM, 27 | const uvc1_flag_t specialflag IGNORE_UNUSED_PARAM) { 28 | 29 | assertUVC(isSymbolIns(symbol) || isSymbolDel(symbol)); 30 | if (isSymbolIns(symbol)) { 31 | assertUVC (LINK_I1 == symbol || LINK_I2 == symbol || LINK_I3P == symbol || 32 | !fprintf(stderr, "Symbol %s does not match any of {%s, %s, %s}", 33 | SYMBOL_TO_DESC_ARR[symbol], SYMBOL_TO_DESC_ARR[LINK_I1], SYMBOL_TO_DESC_ARR[LINK_I2], SYMBOL_TO_DESC_ARR[LINK_I3P])); 34 | } else { 35 | assertUVC (LINK_D1 == symbol || LINK_D2 == symbol || LINK_D3P == symbol || 36 | !fprintf(stderr, "Symbol %s does not match any of {%s, %s, %s}", 37 | SYMBOL_TO_DESC_ARR[symbol], SYMBOL_TO_DESC_ARR[LINK_D1], SYMBOL_TO_DESC_ARR[LINK_D2], SYMBOL_TO_DESC_ARR[LINK_D3P])); 38 | } 39 | assertUVC(bq_tsum_depth.find(refpos) != bq_tsum_depth.end()); 40 | 41 | std::vector> bqfq_depth_mutform_tuples; 42 | for (auto indel2data4 : bq_tsum_depth.at(refpos)) { 43 | const auto indel = indel2data4.first; 44 | #if INDEL_ID == 1 45 | const std::string 
indelstring = indel2data4.first; 46 | #else 47 | const std::string indelstring = refchars.substr(refpos - symbol2CountCoverageSet.getUnifiedIncluBegPosition(), indel2data4.first); 48 | #endif 49 | if (indelstring.size() == 0) { 50 | continue; 51 | } 52 | 53 | const uvc1_readnum_t bqdata = posToIndelToData_get(bq_tsum_depth, refpos, indel); 54 | const uvc1_readnum_t fqdata = posToIndelToData_get(fq_tsum_depth, refpos, indel); 55 | const uvc1_readnum_t fqdata_c2DP = posToIndelToData_get(fq_tsum_depth_c2DP, refpos, indel); 56 | const uvc1_readnum_t fqdata_c2dDP = posToIndelToData_get(fq_tsum_depth_c2dDP, refpos, indel); 57 | assertUVC(bqdata > 0); 58 | bqfq_depth_mutform_tuples.push_back(std::make_tuple(fqdata, bqdata, fqdata_c2DP, fqdata_c2dDP, indelstring)); 59 | } 60 | uvc1_readnum_t gapbAD1sum = 0; 61 | uvc1_readnum_t gapcAD1sum = 0; 62 | std::sort(bqfq_depth_mutform_tuples.rbegin(), bqfq_depth_mutform_tuples.rend()); 63 | auto & gapN = ((0 == strand) ? fmt.gapNf : fmt.gapNr); 64 | gapN.push_back(bqfq_depth_mutform_tuples.size()); 65 | uvc1_readpos_t prev_gapseq_len = 0; 66 | uvc1_readnum_t prev_gap_cAD = 0; 67 | uvc1_readnum_t maxdiff = 0; 68 | for (auto bqfq_depth_mutform : bqfq_depth_mutform_tuples) { 69 | const auto gap_seq = std::get<2+2>(bqfq_depth_mutform); 70 | assertUVC(gap_seq.size() > 0); 71 | auto gap_cAD = std::get<0>(bqfq_depth_mutform); 72 | auto gap_cAD2 = std::get<2>(bqfq_depth_mutform); 73 | auto gap_cAD3 = std::get<3>(bqfq_depth_mutform); 74 | auto gap_bAD = std::get<1>(bqfq_depth_mutform); 75 | fmt.gapSeq.push_back(gap_seq); 76 | fmt.gapbAD1.push_back(gap_bAD); 77 | fmt.gapcAD1.push_back(gap_cAD); 78 | fmt.gc2AD.push_back(gap_cAD2); 79 | fmt.gc2dAD.push_back(gap_cAD3); 80 | if ((UNSIGN2SIGN(gap_seq.size()) != prev_gapseq_len) && (prev_gap_cAD > gap_cAD)) { 81 | maxdiff = MAX(maxdiff, prev_gap_cAD - gap_cAD); 82 | } 83 | prev_gapseq_len = gap_seq.size(); 84 | prev_gap_cAD = gap_cAD; 85 | gapbAD1sum += gap_bAD; 86 | gapcAD1sum += gap_cAD; 87 | } 
88 | return {MAX(maxdiff, prev_gap_cAD), gapcAD1sum}; 89 | 90 | // this is a rare case of indel inconsistency: 91 | /* 92 | if (fmt.gapbAD1[strand] != gapbAD1sum) { 93 | std::string msg = std::to_string(strand) + "\t" + std::to_string(refpos) + "\t" + std::to_string(symbol); 94 | bcfrec::streamAppendBcfFormat(msg, fmt); 95 | std::cerr << msg << "\n"; 96 | } 97 | */ 98 | // "/4+16" is a probabilistic check in the following code 99 | /* 100 | assertUVC(fmt.AD2[strand] >= gapAD2sum && (fmt.AD2[strand] <= gapAD2sum * 5 / 4 + 16) || 101 | !(std::cerr << fmt.AD2[strand] << " >= | <=5/4+16 " << gapAD2sum 102 | << " failed for AD2 and gapAD2sum for strand " << strand << " at position " << refpos << " for symbol " << SYMBOL_TO_DESC_ARR[symbol] 103 | << " and gapNum " << fmt.gapNum[strand] << " or equiv " << rawdu2_dedup_size1_mutform_tuples.size() << std::endl)); 104 | assertUVC(fmt.ADr[strand] >= gapADrsum && (fmt.ADr[strand] <= gapADrsum * 5 / 4 + 16) || 105 | !(std::cerr << fmt.ADr[strand] << " >= | <=5/4+16 " << gapADrsum 106 | << " failed for ADr and gapADrsum for strand " << strand << " at position " << refpos << " for symbol " << SYMBOL_TO_DESC_ARR[symbol] << " and gapNum " << fmt.gapNum[strand] << std::endl)); 107 | */ 108 | }; 109 | 110 | -------------------------------------------------------------------------------- /uvcActiveRegion.cpp: -------------------------------------------------------------------------------- 1 | // This code is used for generating a list of positions that may be of variant candidates in BED format. 2 | // This code is not actually used for calling somatic variants because the speedup gained from using this bed file is too low. 3 | // However, this code can still be useful in some scenarios (such as preparing a bed file for calling high-confidence variants in low-coverage bam file). 4 | // Thus, this source file is kept. 
5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "assert.h" 11 | #include "string.h" 12 | 13 | #include "htslib/faidx.h" 14 | #include "htslib/sam.h" 15 | 16 | struct _CharToSymbol { 17 | std::array data; 18 | _CharToSymbol() { 19 | for (int i = 0; i < 128; i++) { 20 | data[i] = 0; 21 | } 22 | data['A'] = data['a'] = 0x1; 23 | data['C'] = data['c'] = 0x2; 24 | data['G'] = data['g'] = 0x4; 25 | data['T'] = data['t'] = 0x8; 26 | data['I'] = data['i'] = 0x0; 27 | data['-'] = data['_'] = 0x0; 28 | } 29 | }; 30 | 31 | const auto CHAR_TO_SYMBOL = _CharToSymbol(); 32 | 33 | int 34 | updateDepthsByAln(std::vector & region_var_dp, std::vector & region_ref_dp, 35 | const bam1_t *const b, const unsigned int region_offset, const char* region_str, unsigned int region_str_len) { 36 | if (b->core.flag & 0x4) { return - 1; } 37 | if (0 == (b->core.pos % (1024*64))) { 38 | fprintf(stderr, "rpos=%d, region_offset=%d, region_var_dp.size()=%d, region_str_len=%d\n", b->core.pos, region_offset, region_var_dp.size(), region_str_len); 39 | } 40 | assert(region_var_dp.size() == region_ref_dp.size()); 41 | unsigned int qpos = 0; 42 | unsigned int rpos = b->core.pos; 43 | const uint32_t n_cigar = b->core.n_cigar; 44 | const uint32_t *cigar = bam_get_cigar(b); 45 | const uint8_t *bseq = bam_get_seq(b); 46 | for (unsigned int i = 0; i < n_cigar; i++) { 47 | uint32_t c = cigar[i]; 48 | unsigned int cigar_op = bam_cigar_op(c); 49 | unsigned int cigar_oplen = bam_cigar_oplen(c); 50 | if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF) { 51 | for (unsigned int i2 = 0; i2 < cigar_oplen; i2++) { 52 | unsigned int base4bit = bam_seqi(bseq, qpos); 53 | if (region_str[rpos-region_offset] != base4bit && region_str[rpos-region_offset] && bam_get_qual(b)[qpos] > 20) { 54 | region_var_dp[rpos-region_offset] += 1; 55 | } else { 56 | region_ref_dp[rpos-region_offset] += 1; 57 | } 58 | qpos += 1; 59 | rpos += 1; 60 | } 61 | } else if (cigar_op == BAM_CINS) { 62 | 
region_var_dp[rpos-region_offset-1] += 1; 63 | region_var_dp[rpos-region_offset-0] += 1; 64 | qpos += cigar_oplen; 65 | } else if (cigar_op == BAM_CDEL) { 66 | region_var_dp[rpos-region_offset-1] += 1; 67 | region_var_dp[rpos-region_offset+cigar_oplen] += 1; 68 | rpos += cigar_oplen; 69 | } else if (cigar_op == BAM_CREF_SKIP) { 70 | rpos += cigar_oplen; 71 | } else if (cigar_op == BAM_CSOFT_CLIP) { 72 | qpos += cigar_oplen; 73 | } else if (cigar_op == BAM_CHARD_CLIP) { 74 | // pass 75 | } else if (cigar_op == BAM_CPAD) { 76 | // pass 77 | } else if (cigar_op == BAM_CBACK) { 78 | throw -1; 79 | } else { 80 | throw -2; 81 | } 82 | } 83 | } 84 | 85 | int 86 | gen_bed_region(const char *tname, std::vector & region_var_dp, std::vector & region_ref_dp) { 87 | assert(region_var_dp.size() == region_ref_dp.size()); 88 | for (unsigned i = 0; i < region_var_dp.size(); i++) { 89 | if (region_var_dp[i] >= 4 && region_var_dp[i] * (200-1) > region_ref_dp[i]) { 90 | printf("%s\t%d\t%d\t%d/%d\n", tname, (i > 2 ? 
(i-2) : 0), (i+2)+1, region_var_dp[i], region_ref_dp[i]); 91 | } 92 | } 93 | } 94 | 95 | int main(int argc, char **argv) { 96 | faidx_t *fai = fai_load(argv[1]); 97 | 98 | samFile *sam_infile = sam_open(argv[2], "r"); 99 | bam_hdr_t * sam_header = sam_hdr_read(sam_infile); 100 | hts_idx_t * hts_idx = sam_index_load2(sam_infile, argv[2], NULL); 101 | 102 | for (unsigned int tid = 0; tid < sam_header->n_targets; tid++) { 103 | unsigned int seqlen = sam_header->target_len[tid]; 104 | std::vector region_var_dp(seqlen, 0); 105 | std::vector region_ref_dp(seqlen, 0); 106 | 107 | const char *tname = faidx_iseq(fai, tid); 108 | assert(0 == strcmp(tname, sam_header->target_name[tid])); 109 | int regionlen; 110 | char *fetchedseq = faidx_fetch_seq(fai, tname, 0, seqlen - 1, ®ionlen); 111 | assert(regionlen == seqlen); 112 | for (unsigned int i = 0; i < regionlen; i++) { 113 | fetchedseq[i] = CHAR_TO_SYMBOL.data[fetchedseq[i]]; 114 | } 115 | // std::string region_string = std::string(fetchedseq); 116 | 117 | hts_itr_t * hts_itr = sam_itr_queryi(hts_idx, tid, 0, seqlen); 118 | bam1_t *aln = bam_init1(); 119 | while (sam_itr_next(sam_infile, hts_itr, aln) >= 0) { 120 | updateDepthsByAln(region_var_dp, region_ref_dp, 121 | aln, 0, fetchedseq, regionlen); 122 | } 123 | bam_destroy1(aln); 124 | hts_itr_destroy(hts_itr); 125 | free(fetchedseq); 126 | 127 | gen_bed_region(tname, region_var_dp, region_ref_dp); 128 | } 129 | 130 | hts_idx_destroy(hts_idx); 131 | bam_hdr_destroy(sam_header); 132 | sam_close(sam_infile); 133 | } 134 | 135 | 136 | -------------------------------------------------------------------------------- /grouping.hpp: -------------------------------------------------------------------------------- 1 | #ifndef grouping_hpp_INCLUDED 2 | #define grouping_hpp_INCLUDED 3 | 4 | #include "CmdLineArgs.hpp" 5 | #include "common.hpp" 6 | #include "iohts.hpp" 7 | #include "MolecularID.hpp" 8 | 9 | #include "htslib/sam.h" 10 | 11 | #include 12 | #include 13 | #include 14 | 
#include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #define logDEBUGx1 logDEBUG // set to logINFO to enable it 26 | 27 | struct SamIter { 28 | const std::string input_bam_fname; 29 | const std::string & tier1_target_region; 30 | const std::string region_bed_fname; 31 | const int64_t bed_in_avg_sequencing_DP; 32 | const uvc1_flag_t bed_in_avg_sequencing_DP_n_from_t; 33 | const size_t nthreads; 34 | const int64_t mem_per_thread; 35 | const bool is_fastq_gen; 36 | samFile *sam_infile = NULL; 37 | bam_hdr_t *samheader = NULL; 38 | hts_idx_t *sam_idx = NULL; 39 | hts_itr_t *sam_itr = NULL; 40 | 41 | bam1_t *alnrecord = bam_init1(); 42 | uvc1_refgpos_t last_it_tid = -1; 43 | uvc1_refgpos_t last_it_beg = -1; 44 | uvc1_refgpos_t last_it_end = -1; 45 | 46 | std::vector _bedlines; 47 | size_t _bedregion_idx = 0; 48 | 49 | SamIter(const CommandLineArgs ¶mset): 50 | input_bam_fname(paramset.bam_input_fname), 51 | tier1_target_region(paramset.tier1_target_region), 52 | region_bed_fname(paramset.bed_region_fname), 53 | bed_in_avg_sequencing_DP(paramset.bed_in_avg_sequencing_DP), 54 | bed_in_avg_sequencing_DP_n_from_t(paramset.bed_in_avg_sequencing_DP_n_from_t), 55 | nthreads(paramset.max_cpu_num), 56 | mem_per_thread(paramset.mem_per_thread), 57 | is_fastq_gen(paramset.fam_consensus_out_fastq.size() > 0) { 58 | this->sam_infile = sam_open(input_bam_fname.c_str(), "r"); 59 | if (NULL == this->sam_infile) { 60 | fprintf(stderr, "Failed to open the file %s!", input_bam_fname.c_str()); 61 | abort(); 62 | } 63 | this->samheader = sam_hdr_read(sam_infile); 64 | if (NULL == this->samheader) { 65 | fprintf(stderr, "Failed to read the header of the file %s!", input_bam_fname.c_str()); 66 | abort(); 67 | } 68 | if (IS_PROVIDED(this->tier1_target_region)) { 69 | this->sam_idx = sam_index_load(this->sam_infile, input_bam_fname.c_str()); 70 | if (NULL == this->sam_idx) { 71 | fprintf(stderr, "Failed to load 
the index for the file %s!", input_bam_fname.c_str()); 72 | abort(); 73 | } 74 | this->sam_itr = sam_itr_querys(this->sam_idx, this->samheader, this->tier1_target_region.c_str()); 75 | if (NULL == this->sam_itr) { 76 | fprintf(stderr, "Failed to load the region %s in the indexed file %s!", tier1_target_region.c_str(), input_bam_fname.c_str()); 77 | abort(); 78 | } 79 | target_region_to_contigs(this->_bedlines, this->tier1_target_region, this->samheader); 80 | } else if (IS_PROVIDED(this->region_bed_fname)) { 81 | this->sam_idx = sam_index_load(this->sam_infile, input_bam_fname.c_str()); 82 | if (NULL == this->sam_idx) { 83 | fprintf(stderr, "Failed to load the index for the file %s!", input_bam_fname.c_str()); 84 | abort(); 85 | } 86 | bed_fname_to_contigs(this->_bedlines, this->region_bed_fname, this->samheader); 87 | } 88 | } 89 | ~SamIter() { 90 | bam_destroy1(alnrecord); 91 | if (NULL != sam_itr) { sam_itr_destroy(sam_itr); } 92 | if (NULL != sam_idx) { hts_idx_destroy(sam_idx); } 93 | bam_hdr_destroy(samheader); 94 | sam_close(sam_infile); 95 | } 96 | 97 | int 98 | bed_fname_to_contigs( 99 | std::vector & bedlines, 100 | const std::string & bed_fname, 101 | const bam_hdr_t *bam_hdr); 102 | 103 | int 104 | target_region_to_contigs( 105 | std::vector & bedlines, 106 | const std::string & tier1_target_region, 107 | const bam_hdr_t *bam_hdr); 108 | 109 | int64_t 110 | iternext( 111 | uvc1_flag_t & iter_ret_flag, 112 | std::vector & bedlines, 113 | const uvc1_flag_t specialflag IGNORE_UNUSED_PARAM); 114 | }; 115 | 116 | int 117 | samfname_to_tid_to_tname_tseq_tup_vec( 118 | std::vector> & tid_to_tname_tseqlen_tuple_vec, 119 | const std::string & bam_input_fname); 120 | 121 | int 122 | clean_fill_strand_umi_readset( 123 | std::vector>, 2>> &umi_strand_readset); 124 | 125 | int 126 | fill_strand_umi_readset_with_strand_to_umi_to_reads( 127 | std::vector>, 2>, MolecularBarcode>> &umi_strand_readset, 128 | std::map>, 2>, MolecularBarcode>> &umi_to_strand_to_reads, 129 
| const CommandLineArgs & paramset, 130 | uvc1_flag_t specialflag); 131 | 132 | std::array 133 | bamfname_to_strand_to_familyuid_to_reads( 134 | std::map>, 2>, MolecularBarcode>> &umi_to_strand_to_reads, 135 | uvc1_refgpos_t & extended_inclu_beg_pos, 136 | uvc1_refgpos_t & extended_exclu_end_pos, 137 | uvc1_refgpos_t tid, 138 | uvc1_refgpos_t fetch_tbeg, 139 | uvc1_refgpos_t fetch_tend, 140 | bool end2end, 141 | size_t regionbatch_ordinal, 142 | size_t regionbatch_tot_num, 143 | const std::string UMI_STRUCT_STRING, 144 | samFile *sam_file, 145 | const hts_idx_t * hts_idx, 146 | size_t thread_id, 147 | const CommandLineArgs & paramset, 148 | uvc1_flag_t specialflag); 149 | #endif 150 | 151 | -------------------------------------------------------------------------------- /bin/uvcTN.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | scriptdir="$(dirname "$(which "$0")")" 4 | if [ $# -lt 5 ]; then 5 | echo "Usage: $0 [,][/[,]/] [] [--tumor-params ] [--normal-params ]" 6 | echo " The output bgzipped vcf file is " 7 | echo " (with .byproduct/_T_uvc1.vcf.gz as intermediate tumor vcf) if is provided or " 8 | echo " /_N_uvc1.vcf.gz (with /_T_uvc1.vcf.gz as intermediate tumor vcf) if normalSampleName is not provided. " 9 | echo " is the number of processes corresponding to the number of chromosomes that are run concurrently in parallel, " 10 | echo " where 0 (zero, which is the default value) means no chromosome-level parallelization. " 11 | echo " is the number of threads used by bcftools. " 12 | echo " means the string parallel or qsub. " 13 | echo " The string parallel requires GNU parallel to be installed and the string qsub requires the variable UVC_QSUB_CMD to be set. " 14 | echo " For example, UVC_QSUB_CMD can be \"qsub -V -S /bin/sh\". " 15 | echo " Please note that the qsub option is deprecated because of two reasons: " 16 | echo " 1. 
The number of qsub commands is equal to the number of reference fasta sequences. " 17 | echo " 1. Afterwards, the user has to manually merge the VCF results generated from their respective reference fasta sequences. " 18 | echo " More commands result in more output files and more overhead. " 19 | echo " Therefore, please do not use the qsub option especially if the reference fasta file contains a lot of sequences. " 20 | echo " is the set of command-line parameters (i.e. arguments) to ${scriptdir}/uvc1 for both tumor and normal samples. " 21 | echo " --tumor-params is optional and is followed by the parameters to ${scriptdir}/uvc1 for only the tumor-sample. " 22 | echo " --normal-params is optional and is followed by the parameters to ${scriptdir}/uvc1 for only the normal-sample. " 23 | echo " For help on the usage of \"${scriptdir}/uvc1\", please enter \"${scriptdir}/uvc1\" -h " 24 | exit 1 25 | fi 26 | 27 | tstate=1 28 | nstate=1 29 | tparams=() 30 | nparams=() 31 | for p in "${@:6}"; do 32 | cstate=0 33 | if [[ "${p}" = "--tumor-params" ]]; then 34 | tstate=1 35 | nstate=0 36 | cstate=1 37 | elif [[ "${p}" = "--normal-params" ]]; then 38 | tstate=0 39 | nstate=1 40 | cstate=1 41 | fi 42 | if [ $cstate -eq 0 ]; then 43 | if [ $tstate -eq 1 ]; then 44 | tparams+=($p) 45 | fi 46 | if [ $nstate -eq 1 ]; then 47 | nparams+=($p) 48 | fi 49 | fi 50 | done 51 | 52 | echo "cmdLineArgParser.infoMessage.T: the command-line parameters are ( ${tparams[@]} ) for generating the tumor vcf" 53 | echo "cmdLineArgParser.infoMessage.N: the command-line parameters are ( ${nparams[@]} ) for generating the normal vcf" 54 | 55 | set -evx 56 | 57 | if [ -z "${UVC_BIN_EXE_FULL_NAME}" ]; then 58 | UVC_BIN_EXE_FULL_NAME="${scriptdir}/uvc1" 59 | else 60 | echo "WARNING: using UVC_BIN_EXE_FULL_NAME=${UVC_BIN_EXE_FULL_NAME} from environment variable." 
61 | echo "Please enter the shell command (unset UVC_BIN_EXE_FULL_NAME) before running uvcTN.sh if the default uvc binary exe full path should be used." 62 | fi 63 | 64 | ref="$1" 65 | tbam="$2" 66 | nbam="$3" 67 | samplename=$(echo "$5/0/parallel" | awk -F"/" '{print $1}') 68 | nprocsa=$(echo "$5/0/parallel" | awk -F"/" '{print $2}') 69 | nprocs=$(echo "${nprocsa}" | awk -F"," '{print $1}') 70 | nprocs2=$(echo "${nprocsa},4" | awk -F"," '{print $2}') 71 | paratool=$(echo "$5/0/parallel" | awk -F"/" '{print $3}') 72 | 73 | if [ $(echo "${samplename}" | awk -F "," '{print NF}') -eq 2 ]; then 74 | tsample=$(echo "${samplename}" | awk -F "," '{print $1}') 75 | nsample=$(echo "${samplename}" | awk -F "," '{print $2}') 76 | outdir="${4}.byproduct" 77 | nvcfgz="${4}" 78 | else 79 | tsample="${samplename}_T" 80 | nsample="${samplename}_N" 81 | outdir="$4" 82 | nvcfgz="${outdir}/${nsample}_uvc1.vcf.gz" 83 | fi 84 | nlog="${outdir}/${nsample}_uvc1.stderr" 85 | tvcfgz="${outdir}/${tsample}_uvc1.vcf.gz" 86 | tlog="${outdir}/${tsample}_uvc1.stderr" 87 | tbed="${outdir}/${tsample}_uvc1.bed" 88 | mkdir -p "${outdir}" 89 | 90 | export PATH="${scriptdir}:${PATH}" # remove this line in the rare case that an important executable is shadowed by this command 91 | 92 | if [ "${nprocs}" -gt 0 ]; then 93 | tnames=$(cat "${ref}.fai" | awk '{print $1}') 94 | if [ "${paratool}" = "parallel" ]; then 95 | mkdir -p "${outdir}/parallel-results/" 96 | for tname in ${tnames}; do 97 | echo "${0}" "${ref}" "${tbam}" "${nbam}" "${outdir}/parallel-results/${tname}.uvc1.vcf.gz" "${tsample},${nsample}" --targets "${tname}" "${@:6}" 98 | done > "${outdir}/run_parallel.sh" 99 | cat "${outdir}/run_parallel.sh" | parallel -j "${nprocs}" 100 | bcftools concat -n -Oz -o "${nvcfgz}" "${outdir}/parallel-results/"*".uvc1.vcf.gz" 101 | bcftools index -ft --threads "${nprocs2}" "${nvcfgz}" 102 | elif [ "${paratool}" = "qsub" ]; then 103 | echo "The use of the qsub command that is automatically generated by this 
script is deprecated! Please consider using either " 104 | echo " 1) the intrinsic multithreading capability in UVC with the -t option to the UVC binary executable or " 105 | echo " 2) GNU parallel for multiprocessing. " 106 | if [ -z "${UVC_QSUB_CMD}" ]; then 107 | echo "The variable UVC_QSUB_CMD must be set and exported in order to use qsub! " 108 | exit -2 109 | fi 110 | for tname in ${tnames}; do 111 | echo echo "${0}" "${ref}" "${tbam}" "${nbam}" "${outdir}/${tname}" "${samplename}" --targets "${tname}" "${@:6}" "|" "${UVC_QSUB_CMD}" -o "${outdir}" -e "${outdir}" -v JOB_NAME="${tname}.job" 112 | done > "${outdir}/run_qsub.sh" 113 | sh "${outdir}/run_qsub.sh" 114 | else 115 | echo "The multiprocessing tool ${paratool} is neither parallel nor qsub. " 116 | exit -1 117 | fi 118 | else 119 | date 120 | "${UVC_BIN_EXE_FULL_NAME}" -f "${ref}" -s "${tsample}" "${tbam}" -o "${tvcfgz}" --tn-is-paired 1 --bed-out-fname "${tbed}" "${tparams[@]}" 2> "${tlog}" 121 | date 122 | bcftools index -ft --threads "${nprocs2}" "${tvcfgz}" # or use tabix, requires htslib 1.6 or plus 123 | 124 | date 125 | "${UVC_BIN_EXE_FULL_NAME}" -f "${ref}" -s "${nsample}" "${nbam}" -o "${nvcfgz}" --tn-is-paired 1 --bed-in-fname "${tbed}" "${nparams[@]}" --tumor-vcf "${tvcfgz}" 2> "${nlog}" 126 | date 127 | bcftools index -ft --threads "${nprocs2}" "${nvcfgz}" # or use tabix, requires htslib 1.6 or plus 128 | fi 129 | 130 | date 131 | exit 0 132 | 133 | -------------------------------------------------------------------------------- /common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef common_hpp_INCLUDED 2 | #define common_hpp_INCLUDED 3 | 4 | #define COMPILATION_ENABLE_XMGOT 0 5 | #define COMPILATION_TRY_HIGH_DEPTH_POS_BIAS 0 6 | 7 | // #include "precompiled/precompiled_main.hpp" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #ifdef __GNUC__ 21 | 
#if __GNUC__ > 3 22 | #define IGNORE_UNUSED_PARAM __attribute__((unused)) 23 | #else 24 | #define IGNORE_UNUSED_PARAM // let warning pop up but still compile fine 25 | #endif 26 | #else 27 | #define IGNORE_UNUSED_PARAM 28 | #endif 29 | 30 | #define STRINGIZE_(x) #x 31 | #define STRINGIZE(x) STRINGIZE_(x) 32 | 33 | #define STATIC_ASSERT_WITH_DEFAULT_MSG(x) static_assert((x), "Static assertUVCion failed at " __FILE__ ":" STRINGIZE(__LINE__)); 34 | 35 | // #define assertUVC(x) (assert(x)) // assertion error prevents us from checking the stack-trace that is otherwise printed by enabling address/undefine-behavior sanitizer(s). 36 | #ifdef ENABLE_ASSERT_IN_UVC 37 | #define assertUVC(x) (assert(x)) 38 | #else 39 | #define assertUVC(x) 40 | #endif 41 | 42 | // constants 43 | 44 | #define MGVCF_REGION_MAX_SIZE 1000 45 | #define NUM_WORKING_UNITS_PER_THREAD 8 46 | 47 | #define OUTVAR_GERMLINE 0x1 48 | #define OUTVAR_SOMATIC 0x2 49 | #define OUTVAR_ANY 0x4 50 | #define OUTVAR_MGVCF 0x8 51 | #define OUTVAR_ADDITIONAL_INDEL_CANDIDATE 0x10 52 | #define OUTVAR_BASE_NN 0x20 53 | #define OUTVAR_LINK_NN 0x40 54 | 55 | #define IS_PROVIDED(x) (std::string("") != (x) && std::string(".") != (x)) 56 | #define ISNT_PROVIDED(x) (!IS_PROVIDED(x)) 57 | 58 | #define OPT_ONLY_PRINT_VCF_HEADER "/only-print-vcf-header/" 59 | #define OPT_ONLY_PRINT_DEBUG_DETAIL "/only-print-debug-detail/" 60 | #define PLAT_ILLUMINA_LIKE "Illumina/BGI" 61 | #define PLAT_ION_LIKE "IonTorrent/LifeTechnologies/ThermoFisher" 62 | 63 | #define MAX_STR_N_BASES 100 // doi: 10.1016/S1672-0229(07)60009-6 64 | #define MAX_INSERT_SIZE 2000 // https://doi.org/10.2147/AGG.S162531 65 | #define DBLFLT_EPS ((double)FLT_EPSILON) 66 | 67 | // substitutions 68 | 69 | #define SIGN2UNSIGN(x) ((x)) // disabled 70 | #define UNSIGN2SIGN(x) ((int64_t)(x)) 71 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) 72 | #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) 73 | #define INT64MUL(x, y) ((int64_t)(x) * (int64_t)(y)) 74 | 75 | #define NORM_INSERT_SIZE(b) { if (abs((b)->core.isize) >= MAX_INSERT_SIZE) { (b)->core.isize = 0; } } 76 | 77 | #define rtr_endpos(r) ((r).begpos + (r).tracklen) 78 | #define ispowerof2(num) (((num) & ((num)-1)) == 0) 79 | 80 | // conversion between Phred, nat, bit, frac, and states 81 | 82 | #define phred2nat(x) ((log(10.0)/10.0) * (x)) 83 | #define nat2phred(x) ((10.0/log(10.0)) * (x)) 84 | #define frac2phred(x) (-(10.0/log(10.0)) * log(x)) 85 | #define phred2frac(x) (pow(10.0, (-x)/10.0)) 86 | #define numstates2phred(x) ((10.0/log(10.0)) * log(x)) 87 | #define phred2numstates(x) (pow(10.0, (x)/10.0)) 88 | #define numstates2deciphred(x) ((uvc1_qual_t)round((100.0/log(10.0)) * log(x))) 89 | 90 | #define bam_get_strand(aln) ((((aln)->core.flag & 0x81) == 0x81) ? (!!((aln)->core.flag & 0x20)) : (!!((aln)->core.flag & 0x10))) 91 | 92 | #define ARE_INTERVALS_OVERLAPPING(int1min, int1max, int2min, int2max) (!(((int1max) <= (int2min)) || (((int2max) <= (int1min))))) 93 | 94 | typedef uint64_t uvc1_unsigned_int_t; // It seems that a bug in g++ causes compiling error if this type is defined as (unsigned int) 95 | 96 | typedef int32_t uvc1_qual_t; // quality (usually Phred-scaled) 97 | typedef int32_t uvc1_deciphred_t; // 10x Phred 98 | 99 | typedef int32_t uvc1_readnum_t; // depth of segment, fragment, family, etc. // max 2 billion reads 100 | typedef int32_t uvc1_readnum100x_t; // 100x depth of segment, fragment, etc. 
// max 20 million reads 101 | typedef int32_t uvc1_readpos_t; // position with respect to the read, fragment, or insert 102 | typedef int32_t uvc1_refgpos_t; // position with respect to the reference genome 103 | typedef int32_t uvc1_rp_diff_t; 104 | 105 | typedef int32_t uvc1_base_t; 106 | typedef int32_t uvc1_base1500x_t; 107 | 108 | typedef int64_t uvc1_readnum_big_t; 109 | typedef int64_t uvc1_readpos_big_t; 110 | typedef int64_t uvc1_refgpos_big_t; 111 | typedef int64_t uvc1_qual_big_t; // big qual 112 | 113 | typedef uint32_t uvc1_flag_t; 114 | typedef uint64_t uvc1_hash_t; 115 | 116 | enum AssayType { 117 | ASSAY_TYPE_AUTO, 118 | ASSAY_TYPE_CAPTURE, 119 | ASSAY_TYPE_AMPLICON, 120 | }; 121 | 122 | extern const std::vector ASSAY_TYPE_TO_MSG; 123 | 124 | enum MoleculeTag { 125 | MOLECULE_TAG_AUTO, 126 | MOLECULE_TAG_NONE, 127 | MOLECULE_TAG_BARCODING, 128 | MOLECULE_TAG_DUPLEX, 129 | }; 130 | 131 | extern const std::vector MOLECULE_TAG_TO_MSG; 132 | 133 | enum SequencingPlatform { 134 | SEQUENCING_PLATFORM_AUTO, 135 | SEQUENCING_PLATFORM_ILLUMINA, 136 | SEQUENCING_PLATFORM_IONTORRENT, 137 | SEQUENCING_PLATFORM_OTHER, 138 | }; 139 | 140 | extern const std::vector SEQUENCING_PLATFORM_TO_MSG; 141 | extern const std::vector SEQUENCING_PLATFORM_TO_NAME; 142 | 143 | enum PairEndMerge { 144 | PAIR_END_MERGE_YES, 145 | PAIR_END_MERGE_NO, 146 | }; 147 | 148 | extern const std::vector PAIR_END_MERGE_TO_MSG; 149 | 150 | struct RegionalTandemRepeat { 151 | uvc1_refgpos_t begpos = 0; 152 | uvc1_readpos_t tracklen = 0; 153 | uvc1_readpos_t unitlen = 0; 154 | 155 | uvc1_qual_t indelphred = 40 + 3; 156 | 157 | uvc1_refgpos_t anyTR_begpos = 0; 158 | uvc1_readpos_t anyTR_tracklen = 0; 159 | uvc1_readpos_t anyTR_unitlen = 0; 160 | }; 161 | 162 | struct RevComplement { 163 | char data[128]; 164 | char table16[16]; 165 | RevComplement() { 166 | for (int i = 0; i < 128; i++) { 167 | data[i] = (char)i; 168 | } 169 | data['A'] = 'T'; 170 | data['T'] = 'A'; 171 | data['C'] = 'G'; 172 | 
data['G'] = 'C'; 173 | data['a'] = 't'; 174 | data['t'] = 'a'; 175 | data['c'] = 'g'; 176 | data['g'] = 'c'; 177 | for (int i = 0; i < 16; i++) { 178 | table16[i] = (char)i; 179 | } 180 | table16[1] = 8/1; 181 | table16[2] = 8/2; 182 | table16[4] = 8/4; 183 | table16[8] = 8/8; 184 | } 185 | }; 186 | const static RevComplement STATIC_REV_COMPLEMENT; 187 | 188 | template 189 | inline 190 | T 191 | mathsquare(T x) { 192 | return x * x; 193 | } 194 | 195 | template 196 | inline 197 | auto 198 | non_neg_minus(T1 a, T2 b) { 199 | return (a > b ? (a - b) : 0); 200 | } 201 | 202 | template 203 | inline 204 | std::string 205 | anyuint2hexstring(T n) { 206 | static const char *hexnum2char = "0123456789ABCDEF"; 207 | T n1 = n; 208 | std::string ret; 209 | const size_t nchars = sizeof(T) * 2; 210 | ret.reserve(nchars); 211 | for (size_t i = 0; i < nchars; i++) { 212 | T n2 = (n1 & 0xF); 213 | ret.push_back(hexnum2char[n2]); 214 | n1 >>= 4UL; 215 | } 216 | std::reverse(ret.begin(), ret.end()); 217 | return ret; 218 | } 219 | 220 | template 221 | inline void 222 | compare_diff_less(bool & isdiff, bool & isless, const T & k1, const T & k2) { 223 | isdiff = (k1 != k2); 224 | isless = (k1 < k2); 225 | }; 226 | 227 | #endif 228 | 229 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | UVC is a very accurate and reasonably fast somatic variant caller. 2 | The executable uvc1 in the bin directory takes one BAM file as input and generates one block-gzipped VCF file as output. 3 | The script uvcTN.sh in the bin directory takes two BAM files corresponding to tumor and normal as input and generate two block-gzipped VCF files (tumor-variant VCF and normal-filtered VCF) as output. 
4 | 5 | # How to install 6 | 7 | UVC requires BASH 4.0+ (4.0 is the minimum version required) and a compiler that supports the C++14 standard (g++ 6.3.0 is the minimum version recommended). 8 | The Makefile in this directory compiles with g++, but the Makefile can be easily modified to use another compiler instead of g++ (for example, clang). 9 | To install from scratch, please run: (./install-dependencies.sh && make clean && make all -j4 && make deploy). 10 | Please note that ./install-dependencies.sh requires bzip2 to decompress the downloaded files with the (.tar.bz2) extension. 11 | UVC depends on git 2.12+, htslib 1.6+ and bcftools 1.6+ (lower versions of htslib and bcftools may also work, but are not tested). 12 | If htslib and bcftools are already installed, then install-dependencies.sh may not be necessary. 13 | For trouble-shooting with the installation of htslib and bcftools, please check their official repositories at https://github.com/samtools/htslib and https://github.com/samtools/bcftools. 14 | More specifically, if any error message containing "error while loading shared libraries" pops up, please use the command (./configure --disable-plugins --disable-bz2 --disable-lzma --disable-libcurl --disable-s3 --disable-largefile) to build the corresponding htslib required by UVC first, then build UVC. 15 | Although not required, it is highly recommended that bcftools is installed at a system location (a location that can be found in the PATH environment variable). 16 | The UVC binary uses multi-threading efficiently for up to 16 threads. 17 | After reaching 16 threads, adding more threads no longer significantly reduces wall-clock runtime. 18 | However, more efficient speed-up can still be gained by running with GNU parallel or qsub to use one job per chromosome. 19 | 20 | In total, the installation should take about 5 minutes. 21 | 22 | # How to use 23 | 24 | The script uvcTN.sh in the bin directory is used for analyzing tumor-normal pairs.
25 | Running uvcTN.sh without any command-line argument will display its usage help. 26 | The usage help for uvcTN.sh refers to the executable uvc1, which performs the actual variant calling. 27 | The executable uvc1 can perform each of the following tasks: 28 | 1. tumor-only variant call to generate a tumor-only bgzipped vcf with vcf.gz as file extension. 29 | 2. filtering of tumor variants in tumor-only bgzipped vcf with its matched normal. 30 | The script uvcTN.sh simply wraps around the binary executable uvc1. 31 | 32 | For UMI (unique molecular identifier, a.k.a. molecular barcode) to be detected, the read name (QNAME) in the input BAM file should be in the format of originalName#UMI. 33 | For example, the UMI-labeled read name can be 34 | 1. "H5G5ABBCC:4:1209:10114:63736#ACGTAACCA" (ACGTAACCA is the single-strand barcode) or 35 | 2. "H5G5ABBCC:1:3010:10412:33669#AGTA+TGGT" (AGTA+TGGT is the duplex barcode). 36 | The auxiliary tool debarcode can be used for appending UMI sequences to read names. 37 | Running debarcode without any command-line argument will display its usage help. 38 | 39 | It is recommended to manually check the following outlier variant candidates if very high precision is required: 40 | 1. for non-UMI data, variant candidates with FORMAT/FTS consisting of three or more filter strings (typically less than 1.5% of all variants). 41 | 2. for UMI data, variant candidates with FORMAT/FTS consisting of one or more filter strings (typically less than 6% of all variants). 42 | 43 | If manual checking is still too labor-intensive, then it is recommended to keep such an outlier variant candidate if the candidate 44 | 1. is at a hotspot (for example, if the candidate shows high-frequency occurrence in the COSMIC database) and 45 | 2. does not show germline risk (such as low-frequency occurrence or absence in dbSNP). 46 | 47 | Otherwise, it is recommended to reject such a variant candidate.
48 | 49 | # What to report if a runtime error arises 50 | 51 | In fact, uvc1 and some other executables all generate the same output given the same input. Their differences are as follows. 52 | 1. uvc1: the release version that runs the fastest with multi-threading (this binary exe file is identical to the file named uvc-1-cpp-std-thread.out). 53 | Because of extreme runtime optimization, this program probably will not generate any useful error message when an error arises. 54 | 2. uvc-2-omp-thread-asan.debug: the OpenMP-multi-thread debug version that runs with address sanitizer (Asan). 55 | If the release version encounters any runtime error, please use this debug version to generate some useful error message. 56 | The error message can then be used by the code maintainer(s) for debugging and/or testing. 57 | 3. uvc-3-asan.debug: the single-thread debug version that runs with address sanitizer (Asan). 58 | If the debug version with multi-threading encounters any runtime error obfuscated by multi-threading, please use this debug version to generate some useful error message. 59 | The error message can then be used by the code maintainer(s) for debugging and/or testing. 60 | 4. uvc-1-cpp-std-thread.out: similar to uvc1 except that uvc-1-cpp-std-thread.out uses C++14 STL thread instead of OpenMP for multi-threading. 61 | If the OpenMP library is not available, then this version can be used instead of uvc1. 62 | 63 | The diagnostic runtime error messages provided by uvc-2-omp-thread-ubsan.debug and/or uvc-3-ubsan.debug (UBsans) can provide additional debug information to the code maintainer(s). 64 | UBsans are not compiled by default because they take a long time to compile. 65 | If uvc-2-omp-thread-asan.debug and uvc-3-asan.debug do not provide any useful stack-trace, then please compile UBsans and run UBsans instead of Asans to generate some useful error message. 
66 | 67 | All bug reports, feature requests, and ideas for improvement are welcome (although not all of them may be addressed in time)! 68 | 69 | # Other things 70 | 71 | The environment variable ONE_STEP_UMI_STRUCT has a special meaning to UVC. 72 | Please make sure that ONE_STEP_UMI_STRUCT is either not set (e.g., by using the unset shell command) or set to the empty string before running UVC. 73 | The Python script extract-barcodes.py is obsolete and is replaced by debarcode. 74 | Compared with extract-barcodes.py, debarcode generates equivalent output but consumes only 40% of its runtime. 75 | The outputs of these two programs may not be the same in compressed space but are exactly the same in decompressed space. 76 | The script bin/uvcnorm.sh can be used for normalizing variants. 77 | By default, the normalization generates one SNV record per position and one InDel record per position. 78 | The script bin/uvcSurrogateAlign.sh is still under development and should not be used yet. 79 | 80 | For more information, please check the wiki. 81 | 82 | # References 83 | 84 | ## Publication 85 | 86 | Xiaofei Zhao, Allison C Hu, Sizhen Wang, Xiaoyue Wang, Calling small variants using universality with Bayes-factor-adjusted odds ratios, Briefings in Bioinformatics, 2021, bbab458, https://doi.org/10.1093/bib/bbab458 87 | 88 | Calling small variants with universality and Bayesian-frequentist hybridism 89 | Xiaofei Zhao, Allison Hu, Sizhen Wang, Xiaoyue Wang 90 | bioRxiv 2020.08.23.263749; doi: https://doi.org/10.1101/2020.08.23.263749 91 | 92 | ## Patent 93 | 94 | [1]赵霄飞, 王思振. 一种基于高通量测序的基因变异检测方法:, CN111243664A[P]. 2020. 
95 | 96 | # Contact: XiaoFei Zhao 97 | 98 | 99 | x43zhao AT uwaterloo DOT ca (first email to be reached at) 100 | 101 | cndfeifei AT gmail DOT com (if the above email does not work) 102 | 103 | xiaofei DOT zhao AT genetronhealth DOT com (if the above email does not work) 104 | 105 | -------------------------------------------------------------------------------- /debarcode_main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "unistd.h" 7 | #include "zlib.h" 8 | 9 | #include "htslib/kseq.h" 10 | 11 | #include "version.h" 12 | 13 | #define MAX_UMI_LEN (128) 14 | 15 | KSEQ_INIT(gzFile, gzread) 16 | 17 | typedef struct { 18 | char *in[2]; 19 | char *out[2]; 20 | unsigned int beg[2]; 21 | unsigned int end[2]; 22 | unsigned int isduplex; 23 | unsigned int use_comment_as_header; 24 | } CmdLineArgs; 25 | /* Print the program version banner followed by the description of every command-line option to stderr. argc is unused; argv[0] supplies the program name. */ 26 | void usage(int argc, char **argv) { 27 | const char *requiredhelp = "\n\tThis parameter is required."; 28 | const char *sehelp = "\n\tLeave this parameter empty if the input data is single-end."; 29 | /* the default is zero, so the accepted values are non-negative rather than positive */ const char *zerohelp = "\n\tThis parameter has a default value of zero and is of non-negative-integer type."; 30 | 31 | fprintf(stderr, "Program %s version %s\n", argv[0], VERSION_DETAIL); 32 | fprintf(stderr, " -h\tShow this help message\n"); 33 | fprintf(stderr, " -v\tShow the version information\n"); 34 | 35 | fprintf(stderr, " -i\tThe R1 input gzipped fastq filename of string type.%s\n", requiredhelp); 36 | fprintf(stderr, " -o\tThe R1 output gzipped fastq filename of string type.%s\n", requiredhelp); 37 | fprintf(stderr, " -b\tThe inclusive begin position of the UMI in the R1 input gzipped fastq filename.%s\n", zerohelp); 38 | fprintf(stderr, " -e\tThe exclusive end position of the UMI in the R1 input gzipped fastq filename.%s\n", zerohelp); 39 | 40 | fprintf(stderr, " -j\tThe R2 input gzipped fastq filename of string type.%s\n", sehelp); 41 | fprintf(stderr, " 
-p\tThe R2 output gzipped fastq filename of string type.%s\n", sehelp); 42 | fprintf(stderr, " -c\tThe inclusive begin position of the UMI in the R2 input gzipped fastq filename.%s\n", zerohelp); 43 | fprintf(stderr, " -f\tThe exclusive end position of the UMI in the R2 input gzipped fastq filename.%s\n", zerohelp); 44 | 45 | fprintf(stderr, " -C\tIf this switch is turned on, then use the first whitespace-separated token in the comment as sequence name.\n"); 46 | fprintf(stderr, " -D\tIf this switch is turned on, then assume that the input data is generated by duplex sequencing, and vice versa.\n"); 47 | } 48 | /* Fill *args_ptr from the command line with getopt. Returns 0 on success and -1 (after printing the usage) on an unknown option; -h and -v print their text and terminate the program with status 0. */ 49 | int parse(CmdLineArgs *args_ptr, int argc, char **argv) { 50 | int opt; 51 | while ((opt = getopt(argc, argv, "b:c:e:f:i:j:o:p:CDvh")) != -1) { 52 | switch(opt) { 53 | case 'b': args_ptr->beg[0] = atoi(optarg); /* printf("beg0 = %d\n", args_ptr->beg[0]); */ break; 54 | case 'c': args_ptr->beg[1] = atoi(optarg); /* printf("beg1 = %d\n", args_ptr->beg[1]); */ break; 55 | case 'e': args_ptr->end[0] = atoi(optarg); /* printf("end0 = %d\n", args_ptr->end[0]); */ break; 56 | case 'f': args_ptr->end[1] = atoi(optarg); /* printf("end1 = %d\n", args_ptr->end[1]); */ break; 57 | case 'i': args_ptr->in[0] = optarg; break; 58 | case 'j': args_ptr->in[1] = optarg; break; 59 | case 'o': args_ptr->out[0] = optarg; break; 60 | case 'p': args_ptr->out[1] = optarg; break; 61 | case 'C': args_ptr->use_comment_as_header = 1; break; 62 | case 'D': args_ptr->isduplex = 1; break; 63 | case 'v': fprintf(stderr, "Program %s version %s\n", argv[0], VERSION_DETAIL); exit(0); /* like -h: returning 0 here made main report missing input files and exit with 1 */ 64 | case 'h': usage(argc, argv); exit(0); 65 | default: usage(argc, argv); return -1; 66 | } 67 | } 68 | return 0; 69 | } 70 | /* Read the R1 (and, when r1r2num is 2, R2) gzipped FASTQ input record by record, append the UMI(s) taken from the configured read positions to each read name after '#', and write the records to the gzipped output file(s). */ 71 | int process(const CmdLineArgs args, const unsigned int r1r2num) { 72 | int ret = 0; 73 | 74 | char umis[2][MAX_UMI_LEN]; 75 | gzFile in[2]; 76 | kseq_t *seq[2] = {NULL, NULL}; 77 | gzFile out[2]; 78 | 79 | for (unsigned int r1r2idx = 0; r1r2idx < r1r2num; r1r2idx++) { 80 | 
memset(umis[r1r2idx], 0, MAX_UMI_LEN); 81 | in[r1r2idx] = gzopen(args.in[r1r2idx], "r"); 82 | seq[r1r2idx] = kseq_init(in[r1r2idx]); 83 | out[r1r2idx] = gzopen(args.out[r1r2idx], "wb1"); 84 | } 85 | 86 | int lens[2] = {0, 0}; 87 | for (uint64_t nrec = 1;; nrec++) { 88 | for (unsigned int r1r2idx = 0; r1r2idx < r1r2num; r1r2idx++) { 89 | unsigned int beg = args.beg[r1r2idx]; 90 | unsigned int end = args.end[r1r2idx]; 91 | lens[r1r2idx] = kseq_read(seq[r1r2idx]); 92 | if (end > beg) { 93 | if (seq[r1r2idx]->seq.l > end) { 94 | strncpy(umis[r1r2idx], &(seq[r1r2idx]->seq.s[beg]), end-beg); 95 | } else { 96 | for (unsigned int posidx = 0; posidx < end - beg; posidx++) { 97 | umis[r1r2idx][posidx] = 'N'; 98 | } 99 | } 100 | } else { 101 | umis[r1r2idx][0] = '\0'; 102 | } 103 | } 104 | if (lens[0] < 0 || lens[1] < 0) { 105 | break; 106 | } 107 | for (unsigned int r1r2idx = 0; r1r2idx < r1r2num; r1r2idx++) { 108 | gzwrite(out[r1r2idx], "@", 1); 109 | if (args.use_comment_as_header) { 110 | unsigned int hdrlen = 0; 111 | for (; seq[r1r2idx]->comment.s[hdrlen] > ' '; hdrlen++); 112 | gzwrite(out[r1r2idx], seq[r1r2idx]->comment.s, hdrlen); 113 | } else { 114 | gzwrite(out[r1r2idx], seq[r1r2idx]->name.s, seq[r1r2idx]->name.l); 115 | } 116 | gzwrite(out[r1r2idx], "#", 1); 117 | unsigned int umi_num = 0; 118 | for (unsigned int r1r2idx2 = 0; r1r2idx2 < r1r2num; r1r2idx2++) { 119 | if (umis[r1r2idx2][0] != '\0') { 120 | if (umi_num > 0) { 121 | gzwrite(out[r1r2idx], (args.isduplex ? 
"+" : "-"), 1); 122 | } 123 | gzwrite(out[r1r2idx], umis[r1r2idx2], strlen(umis[r1r2idx2])); 124 | umi_num++; 125 | } 126 | } 127 | gzwrite(out[r1r2idx], "\n", 1); 128 | gzwrite(out[r1r2idx], seq[r1r2idx]->seq.s, seq[r1r2idx]->seq.l); 129 | gzwrite(out[r1r2idx], "\n", 1); 130 | gzwrite(out[r1r2idx], "+", 1); 131 | gzwrite(out[r1r2idx], seq[r1r2idx]->name.s, seq[r1r2idx]->name.l); 132 | gzwrite(out[r1r2idx], " ", 1); 133 | gzwrite(out[r1r2idx], seq[r1r2idx]->comment.s, seq[r1r2idx]->comment.l); 134 | gzwrite(out[r1r2idx], "\n", 1); 135 | gzwrite(out[r1r2idx], seq[r1r2idx]->qual.s, seq[r1r2idx]->qual.l); 136 | gzwrite(out[r1r2idx], "\n", 1); 137 | } 138 | } 139 | 140 | for (unsigned int r1r2idx = 0; r1r2idx < r1r2num; r1r2idx++) { 141 | gzclose(out[r1r2idx]); 142 | kseq_destroy(seq[r1r2idx]); 143 | gzclose(in[r1r2idx]); 144 | if (lens[r1r2idx] != lens[0]) { 145 | fprintf(stderr, "Warning: last kseq_read return codes for R1 and R%d are %d and %d, implying R1 and R%d may have different number of records.\n", 146 | r1r2idx+1, lens[0], lens[r1r2idx], r1r2idx+1); 147 | ret -= 2; 148 | } 149 | } 150 | return ret; 151 | } 152 | 153 | int main(int argc, char **argv) { 154 | CmdLineArgs args; 155 | memset(&args, 0, sizeof(args)); 156 | 157 | int parse_res = parse(&args, argc, argv); 158 | if (parse_res != 0) { return parse_res; } 159 | 160 | if (NULL == args.in[0] || NULL == args.out[0]) { 161 | fprintf(stderr, "The R1 end must have both input gzipped FASTQ file and output gzipped FASTQ file.\n"); 162 | usage(argc, argv); 163 | return 1; 164 | } 165 | if (args.end[0] - args.beg[0] >= MAX_UMI_LEN) { 166 | fprintf(stderr, "The R1 end of input gzipped FASTQ file has UMI of length (%d-%d), but the maximum allowed UMI length is %d.\n", args.end[0], args.beg[0], MAX_UMI_LEN-1); 167 | usage(argc, argv); 168 | return 4; 169 | } 170 | 171 | unsigned int r1r2num = 1; 172 | if (NULL != args.in[1] || NULL != args.out[1]) { 173 | if (NULL == args.in[1] || NULL == args.out[1]) { 174 | 
fprintf(stderr, "The R2 end must have both input gzipped FASTQ file and output gzipped FASTQ file.\n"); 175 | usage(argc, argv); 176 | return 2; 177 | } 178 | if (args.end[1] - args.beg[1] >= MAX_UMI_LEN) { 179 | fprintf(stderr, "The R2 end of input gzipped FASTQ file has UMI of length (%d-%d), but the maximum allowed UMI length is %d.\n", args.end[1], args.beg[1], MAX_UMI_LEN-1); 180 | usage(argc, argv); 181 | return 8; 182 | } 183 | r1r2num = 2; 184 | } 185 | 186 | return process(args, r1r2num); 187 | } 188 | -------------------------------------------------------------------------------- /main_consensus.hpp: -------------------------------------------------------------------------------- 1 | #ifndef main_consensus_hpp_INCLUDED 2 | #define main_consensus_hpp_INCLUDED 3 | 4 | #include "main_conversion.hpp" 5 | #include "common.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | enum ConsensusBlockCigarType { 14 | CONSENSUS_BLOCK_CSOFT_CLIP_FIXED_LEFT_TO_VAR_RIGHT, 15 | CONSENSUS_BLOCK_CINS, 16 | CONSENSUS_BLOCK_CSOFT_CLIP_FIXED_RIGHT_TO_VAR_LEFT, 17 | CONSENSUS_BLOCK_END, 18 | }; 19 | #define NUM_CONSENSUS_BLOCK_CIGAR_TYPES 3 20 | static_assert(NUM_CONSENSUS_BLOCK_CIGAR_TYPES == CONSENSUS_BLOCK_END); 21 | #define ALL_CONSENSUS_BLOCK_CIGAR_TYPES (std::array \ 22 | {{ CONSENSUS_BLOCK_CSOFT_CLIP_FIXED_LEFT_TO_VAR_RIGHT, CONSENSUS_BLOCK_CINS, CONSENSUS_BLOCK_CSOFT_CLIP_FIXED_RIGHT_TO_VAR_LEFT }}) 23 | 24 | bool 25 | is_ConsensusBlockCigarType_right2left(ConsensusBlockCigarType cigartype) { 26 | return (CONSENSUS_BLOCK_CSOFT_CLIP_FIXED_RIGHT_TO_VAR_LEFT == cigartype); 27 | }; 28 | 29 | #define BASE2COUNT_BQ_SUM_IDX (BASE_NN+1) 30 | #define BASE2COUNT_NFRAGS_IDX (BASE_NN+2) 31 | 32 | struct FastqConsensusBase { 33 | char base; 34 | int8_t quality; 35 | uvc1_readnum_t family_size; 36 | uvc1_readnum_t family_identity; 37 | }; 38 | 39 | // typedef std::pair FastqBaseAndQual; 40 | typedef std::basic_string FastqConsensusSeqSegment; 41 | typedef 
std::array BaseToCount; 42 | typedef std::vector ConsensusBlock; 43 | /* Reverse the segment in place and replace every base by its entry in STATIC_REV_COMPLEMENT. */ 44 | void 45 | reverseAndComplement(FastqConsensusSeqSegment & fqrec) { 46 | std::reverse(fqrec.begin(), fqrec.end()); 47 | for (auto & baseBQ :fqrec) { 48 | baseBQ.base = STATIC_REV_COMPLEMENT.data[(size_t)baseBQ.base]; 49 | } 50 | } 51 | /* Return a copy of conblock cut at the first run of n_consec_positions_thres consecutive positions whose summed base count is below percDP_thres percent of the largest per-position sum in the block; the low-depth run itself is not part of the result. Without such a run the whole block is returned. */ 52 | ConsensusBlock 53 | ConsensusBlock_trim(const ConsensusBlock & conblock, uvc1_qual_t percDP_thres = 20, uvc1_readpos_t n_consec_positions_thres = 5) { 54 | uvc1_qual_t maxDP = 0; 55 | for (const auto & base2cnt : conblock) { 56 | uvc1_qual_t thisDP = 0; 57 | for (const AlignmentSymbol posbase : SYMBOL_TYPE_TO_NON_NN_SYMBOLS[BASE_SYMBOL]) { 58 | thisDP += base2cnt[posbase]; 59 | } 60 | UPDATE_MAX(maxDP, thisDP); 61 | } 62 | ConsensusBlock ret; 63 | uvc1_refgpos_t prev_pos = 0; /* 1-based index of the most recent low-depth position, 0 if none yet */ 64 | uvc1_refgpos_t curr_pos = 0; 65 | uvc1_refgpos_t n_consec_positions = 0; 66 | for (const auto & base2cnt : conblock) { 67 | curr_pos++; 68 | uvc1_qual_t thisDP = 0; 69 | for (const AlignmentSymbol posbase : SYMBOL_TYPE_TO_NON_NN_SYMBOLS[BASE_SYMBOL]) { 70 | thisDP += base2cnt[posbase]; 71 | } 72 | if (thisDP * 100 < maxDP * percDP_thres) { 73 | if (prev_pos + 1 == curr_pos) { n_consec_positions++ ;} 74 | else { n_consec_positions = 1; } /* prev_pos used to be advanced on every position, so the run length never reset and high-depth entries could be popped below */ prev_pos = curr_pos; 75 | if (n_consec_positions >= n_consec_positions_thres) { 76 | /* drop the earlier members of the current low-depth run; the current position has not been pushed */ for (uvc1_refgpos_t i = 1; i < n_consec_positions; i++) { 77 | ret.pop_back(); 78 | } 79 | return ret; 80 | } 81 | } 82 | 83 | ret.push_back(base2cnt); 84 | } 85 | return ret; 86 | } 87 | /* Convert a consensus block into one FastqConsensusBase per position, choosing the base with the highest count; positions are emitted from last to first when is_right2left is true. */ 88 | const FastqConsensusSeqSegment 89 | consensusBlockToSeqQual(const ConsensusBlock & cb1, bool is_right2left) { 90 | FastqConsensusSeqSegment ret; 91 | // ConsensusBlock & cb1 ; // = pos2conblock4it.first->second; 92 | for (size_t inspos1 = 0; inspos1 < cb1.size(); inspos1++) { 93 | size_t inspos = (is_right2left ? 
(cb1.size() - inspos1 - 1) : inspos1); 94 | AlignmentSymbol conbase = BASE_NN; 95 | uvc1_readnum_t concount = 0; 96 | uvc1_readnum_t totcount = 0; 97 | for (const AlignmentSymbol posbase : SYMBOL_TYPE_TO_NON_NN_SYMBOLS[BASE_SYMBOL]) { 98 | if (cb1[inspos][posbase] > concount) { 99 | conbase = posbase; 100 | concount = cb1[inspos][posbase]; 101 | } 102 | totcount += cb1[inspos][posbase]; 103 | } 104 | const char *desc = SYMBOL_TO_DESC_ARR[conbase]; 105 | assertUVC (strlen(desc) == 1); 106 | FastqConsensusBase fcb; 107 | fcb.base = desc[0]; 108 | fcb.quality = cb1[inspos][BASE2COUNT_BQ_SUM_IDX] / MAX(cb1[inspos][BASE2COUNT_NFRAGS_IDX], 1); 109 | fcb.family_size = totcount; 110 | fcb.family_identity = (double)concount / (double)MAX(totcount, 1); /* NOTE(review): the ratio lies in [0,1]; if uvc1_readnum_t is an integer type this stores only 0 or 1 - confirm the intended scale */ 111 | ret.push_back(fcb); 112 | } 113 | return ret; 114 | } 115 | /* Consensus blocks keyed by position, plus the orientation flag that returnSeqQualVec passes to consensusBlockToSeqQual. */ 116 | struct ConsensusBlockSet { 117 | std::map pos2conblock; 118 | void setIsRightToLeft(bool is_r2l) { is_right2left = is_r2l; } 119 | /* Merge one sequence with its per-base qualities into the block at pos (created and grown on demand): per offset, the symbol's cell and the BQ-sum cell are raised through UPDATE_MAX with the base quality, and the fragment-count cell is set to 1. */ 120 | void 121 | incByPosSeqQual(uvc1_readpos_t pos, const std::string & seq, const auto & qual) { 122 | assertUVC(seq.size() == qual.size()); 123 | auto pos2conblock4it = this->pos2conblock.insert(std::make_pair(pos, ConsensusBlock())); 124 | ConsensusBlock & cb2 = pos2conblock4it.first->second; 125 | while (cb2.size() < seq.size()) { 126 | cb2.push_back(std::array {{ 0 }}); 127 | } 128 | for (size_t inspos = 0; inspos < seq.size(); inspos++) { 129 | AlignmentSymbol posbase = CHAR_TO_SYMBOL.data[seq[inspos]]; 130 | UPDATE_MAX(cb2[inspos][posbase], qual[inspos]); 131 | UPDATE_MAX(cb2[inspos][BASE2COUNT_BQ_SUM_IDX], qual[inspos]); 132 | cb2[inspos][BASE2COUNT_NFRAGS_IDX] = 1; 133 | } 134 | }; 135 | /* Return the trimmed consensus sequence of the block stored at pos, or an empty segment when no block exists at pos. */ 136 | FastqConsensusSeqSegment 137 | returnSeqQualVec(uvc1_readpos_t pos, uvc1_qual_t percDP = 20, uvc1_refgpos_t n_consec_positions = 3) { 138 | 139 | auto pos2conblock4it = this->pos2conblock.find(pos); 140 | if (pos2conblock4it != this->pos2conblock.end()) { 141 | return 
consensusBlockToSeqQual(ConsensusBlock_trim(pos2conblock4it->second, percDP, n_consec_positions), is_right2left); 142 | } else { 143 | return FastqConsensusSeqSegment(); 144 | } 145 | }; 146 | 147 | // @deprecated this method has no real use-case so far 148 | void 149 | incByConsensusForSeq(const ConsensusBlockSet & cbset, uvc1_readnum_t incvalue = 1) { 150 | for (const auto & pos2conblock4it1 : cbset.pos2conblock) { 151 | uvc1_refgpos_t pos = pos2conblock4it1.first; 152 | auto pos2conblock4it = this->pos2conblock.insert(std::make_pair(pos, ConsensusBlock())); 153 | const ConsensusBlock & cb1 = cbset.pos2conblock.at(pos); 154 | ConsensusBlock & cb2 = pos2conblock4it.first->second; 155 | while (cb2.size() < cb1.size()) { 156 | cb2.push_back(std::array {{ 0 }}); 157 | } 158 | for (size_t inspos = 0; inspos < cb1.size(); inspos++) { 159 | AlignmentSymbol conbase = BASE_NN; 160 | uvc1_readnum_t concount = 0; 161 | uvc1_readnum_t totcount = 0; 162 | for (const AlignmentSymbol posbase : SYMBOL_TYPE_TO_NON_NN_SYMBOLS[BASE_SYMBOL]) { 163 | if (cb1[inspos][posbase] > concount) { 164 | conbase = posbase; 165 | concount = cb1[inspos][posbase]; 166 | } 167 | totcount += cb1[inspos][posbase]; /* the total must cover every symbol, as in incByMajorMinusMinor; it used to be accumulated only when a new maximum was found */ 168 | } 169 | cb2[inspos][conbase] += incvalue; 170 | cb2[inspos][BASE2COUNT_BQ_SUM_IDX] += non_neg_minus(concount * 2, totcount); // not totcount; 171 | cb2[inspos][BASE2COUNT_NFRAGS_IDX] += incvalue; 172 | } 173 | } 174 | }; 175 | /* For every block of cbset, add 1 to the count of the most frequent base at each offset, add non_neg_minus(2 * its count, total count) to the BQ-sum cell, and add 1 to the fragment-count cell. */ 176 | void 177 | incByMajorMinusMinor(const ConsensusBlockSet & cbset) { 178 | for (const auto & pos2conblock4it1 : cbset.pos2conblock) { 179 | uvc1_refgpos_t pos = pos2conblock4it1.first; 180 | auto pos2conblock4it = this->pos2conblock.insert(std::make_pair(pos, ConsensusBlock())); 181 | const ConsensusBlock & cb1 = cbset.pos2conblock.at(pos); 182 | ConsensusBlock & cb2 = pos2conblock4it.first->second; 183 | while (cb2.size() < cb1.size()) { 184 | cb2.push_back(std::array {{ 0 }}); 185 | } 186 | for (size_t inspos = 0; inspos < cb1.size(); inspos++) { 187 | 
AlignmentSymbol conbase = BASE_NN; 188 | uvc1_readnum_t concount = 0; 189 | uvc1_readnum_t totcount = 0; 190 | for (const AlignmentSymbol posbase : SYMBOL_TYPE_TO_NON_NN_SYMBOLS[BASE_SYMBOL]) { 191 | if (cb1[inspos][posbase] > concount) { 192 | conbase = posbase; 193 | concount = cb1[inspos][posbase]; 194 | } 195 | totcount += cb1[inspos][posbase]; 196 | } 197 | cb2[inspos][conbase] += 1; 198 | cb2[inspos][BASE2COUNT_BQ_SUM_IDX] += non_neg_minus(concount * 2, totcount); // totcount; 199 | cb2[inspos][BASE2COUNT_NFRAGS_IDX] += 1; // cb1[inspos][BASE2COUNT_NFRAGS_IDX]; 200 | } 201 | } 202 | }; 203 | 204 | /* 205 | void 206 | incBySummation(const ConsensusBlockSet & cbset) { 207 | for (const auto & pos2conblock4it1 : cbset.pos2conblock) { 208 | uvc1_refgpos_t pos = pos2conblock4it1.first; 209 | auto pos2conblock4it = this->pos2conblock.insert(std::make_pair(pos, ConsensusBlock())); 210 | const ConsensusBlock & cb1 = cbset.pos2conblock.at(pos); 211 | ConsensusBlock & cb2 = pos2conblock4it.first->second; 212 | while (cb2.size() < cb1.size()) { 213 | cb2.push_back(std::array {{ 0 }}); 214 | } 215 | for (size_t inspos = 0; inspos < cb1.size(); inspos++) { 216 | for (size_t posbase = 0; posbase < BASE_NN; posbase++) { 217 | cb2[inspos][posbase] += cb1[inspos][posbase]; 218 | } 219 | } 220 | } 221 | }; 222 | */ 223 | private: 224 | bool is_right2left = false; 225 | }; 226 | 227 | #endif 228 | -------------------------------------------------------------------------------- /CmdLineArgs.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CmdLineArgs_hpp_INCLUDED 2 | #define CmdLineArgs_hpp_INCLUDED 3 | 4 | #include "common.hpp" 5 | #include "CLI11-1.7.1/CLI11.hpp" 6 | #include 7 | #include 8 | #include 9 | 10 | #define DEBUG_NOTE_FLAG_BITMASK_BAQ_OFFSETARR 0x1 11 | 12 | // Three FASTQ files and three cluster files 13 | #define NUM_FQLIKE_CON_OUT_FILES (3) 14 | 15 | static const std::array FASTQ_LIKE_SUFFIXES = { 16 | 
"R1.fastq.gz", "R2.fastq.gz", "SE.fastq.gz" 17 | // ,"R1.group.gz", "R2.group.gz", "SE.group.gz" 18 | }; 19 | 20 | struct CommandLineArgs { 21 | 22 | const std::string NOT_PROVIDED = "."; 23 | // *** 00. frequently used parameters 24 | 25 | std::string bam_input_fname = NOT_PROVIDED; // not missing 26 | std::string fasta_ref_fname = NOT_PROVIDED; 27 | std::string vcf_out_pass_fname = "-"; 28 | 29 | std::string bed_region_fname = NOT_PROVIDED; // bcftools view -R 30 | std::string tier1_target_region = NOT_PROVIDED; // bcftools view -t 31 | std::string sample_name = "-"; 32 | 33 | size_t max_cpu_num = 8; 34 | size_t mem_per_thread = (1024 * 3 / 2); // MegaBytes, for samtools sort 1.11 the value is 768. Variant calling is more complex and mem-consuming than sorting. 35 | 36 | uvc1_flag_t outvar_flag = OUTVAR_SOMATIC + OUTVAR_ANY + OUTVAR_MGVCF + OUTVAR_BASE_NN + OUTVAR_ADDITIONAL_INDEL_CANDIDATE; 37 | bool should_output_all = false; 38 | bool should_output_all_germline = false; 39 | double vqual = (double)15; // set to 20 for less output 40 | 41 | AssayType assay_type = ASSAY_TYPE_AUTO; 42 | 43 | uvc1_qual_t fam_thres_highBQ_snv = 25; 44 | uvc1_qual_t fam_thres_highBQ_indel = 13; 45 | uvc1_readnum_t fam_thres_dup1add = 2; 46 | uvc1_readnum100x_t fam_thres_dup1perc = 80; 47 | uvc1_readnum_t fam_thres_dup2add = 3; 48 | uvc1_readnum100x_t fam_thres_dup2perc = 70; // 85 for more consensus specificity 49 | uvc1_readnum100x_t fam_thres_qseqlen = 75; 50 | std::string fam_consensus_out_fastq = ""; 51 | uvc1_readnum_t fam_consensus_out_fastq_thres_dup1add = 1; 52 | 53 | // *** 01. parameters of the names of files, samples, regions, etc. 54 | 55 | std::string vcf_tumor_fname = NOT_PROVIDED; 56 | std::string bed_out_fname = NOT_PROVIDED; 57 | std::string bed_in_fname = NOT_PROVIDED; 58 | uvc1_readnum_t bed_in_avg_sequencing_DP = -1; // infer from input BAM data 59 | uvc1_flag_t bed_in_avg_sequencing_DP_n_from_t = 0x0; 60 | // *** 02. 
parameters that control input, output, and logs (driven by computational requirements and resources) 61 | 62 | bool is_tumor_format_retrieved = true; 63 | 64 | // https://www.biostars.org/p/110670/ 65 | 66 | uvc1_readpos_t kept_aln_min_aln_len = 0; 67 | uvc1_qual_t kept_aln_min_mapqual = 0; // 40; // from GATK 68 | uvc1_readpos_t kept_aln_min_isize = 0; 69 | uvc1_readpos_t kept_aln_max_isize = INT32_MAX; 70 | bool kept_aln_is_zero_isize_discarded = false; 71 | 72 | uvc1_readnum_t min_altdp_thres = 2; 73 | 74 | uvc1_readnum_t vdp1 = 1000; 75 | uvc1_readnum_t vad1 = 4; 76 | double vfa1 = 0.002; 77 | uvc1_readnum_t vdp2 = 10000; 78 | uvc1_readnum_t vad2 = 8; 79 | double vfa2 = 0.0002; 80 | 81 | uvc1_readnum_t min_r_ad = 0; 82 | uvc1_readnum_t min_a_ad = 0; 83 | 84 | bool should_add_note = false; 85 | bool always_log = false; 86 | // *** 03. parameters that are driven by the properties of the assay 87 | 88 | MoleculeTag molecule_tag = MOLECULE_TAG_AUTO; 89 | SequencingPlatform sequencing_platform = SEQUENCING_PLATFORM_AUTO; 90 | 91 | // NOTE: these two inferred parameters are not shown (and cannot be passed in as parameters) on the command-line. 
92 | SequencingPlatform inferred_sequencing_platform = sequencing_platform; 93 | uvc1_qual_t inferred_maxMQ = 0; 94 | 95 | PairEndMerge pair_end_merge = PAIR_END_MERGE_YES; 96 | bool disable_duplex = false; 97 | uvc1_readpos_t primerlen = 0; // 23; // https://link.springer.com/chapter/10.1007/978-1-4020-6241-4_5 : 18 - 22 bps 98 | uvc1_readpos_t primerlen2 = 23; // https://genome.cshlp.org/content/3/3/S30.full.pdf : 18 - 24 bps 99 | uvc1_flag_t primer_flag = 0x0; 100 | uvc1_readpos_t central_readlen = 0; // estimate from the data 101 | uvc1_qual_t bq_phred_added_misma = 0; // estimate from the data 102 | uvc1_qual_t bq_phred_added_indel = 0; // estimate from the data 103 | 104 | // http://snap.stanford.edu/class/cs224w-2015/slides/04-powerlaws.pdf 105 | // https://cs.brynmawr.edu/Courses/cs380/spring2013/section02/slides/10_ScaleFreeNetworks.pdf 106 | double powlaw_exponent = 3.0; // universality constant 107 | double powlaw_anyvar_base = (double)(60+25+5); // universality constant 108 | double powlaw_amplicon_allele_fraction_coef = (5.0/8.0); 109 | 110 | uvc1_qual_t penal4lowdep = 37; 111 | uvc1_qual_t assay_sequencing_BQ_max = 37; 112 | uvc1_qual_t assay_sequencing_BQ_inc = 0; 113 | 114 | uvc1_readnum_t phasing_haplotype_max_count = 8; 115 | uvc1_readnum_t phasing_haplotype_min_ad = 1; 116 | uvc1_readnum_t phasing_haplotype_max_detail_cnt = 3; 117 | 118 | // *** 04. parameters for dedupping reads 119 | 120 | // PCR stutter noise at (di,tri,tetra,...)-nucleotide generates +-(2,3,4...) shift in read end position, 121 | // so more accurate dedupping requires us to consider these cases. This constant is good enough for the general case. 
122 | double dedup_center_mult = 5; 123 | //double dedup_amplicon_count_to_surrcount_ratio = 16; // can be 20 124 | //double dedup_amplicon_count_to_surrcount_ratio_twosided = 4; // can be 5 or 6 125 | double dedup_amplicon_end2end_ratio = 1.5; 126 | double dedup_amplicon_border_to_insert_cov_weak_avgDP_ratio = 5; // *1.5; 127 | double dedup_amplicon_border_to_insert_cov_strong_avgDP_ratio = 20; // *1.5; 128 | 129 | double dedup_amplicon_border_to_insert_cov_weak_totDP_ratio = 0.05; 130 | double dedup_amplicon_border_to_insert_cov_strong_totDP_ratio = 0.20; 131 | 132 | double dedup_amplicon_border_weak_minDP = 100; 133 | double dedup_amplicon_border_strong_minDP = 400; 134 | 135 | uvc1_flag_t dedup_flag = 0x0; 136 | 137 | // *** 05. parameters related to bias thresholds 138 | 139 | uvc1_qual_t bias_thres_highBQ = 20; 140 | uvc1_qual_t bias_thres_highBAQ = 20; // is 20+3 for SNVs 141 | 142 | uvc1_readpos_t bias_thres_aLPxT_add = 5; 143 | uvc1_readpos_t bias_thres_aLPxT_perc = 160; 144 | 145 | #if COMPILATION_ENABLE_XMGOT 146 | uvc1_base1500x_t bias_thres_PFXM1T_add = 130; // 35; // set very high to disable mismatch bias 147 | uvc1_base1500x_t bias_thres_PFXM2T_add = 20; 148 | uvc1_base1500x_t bias_thres_PFGO1T_add = 125; // set very high to disable gap bias 149 | uvc1_base1500x_t bias_thres_PFGO2T_add = 15; 150 | 151 | uvc1_readnum100x_t bias_thres_PFXM1T_perc = 50; 152 | uvc1_readnum100x_t bias_thres_PFXM2T_perc = 70; 153 | uvc1_readnum100x_t bias_thres_PFGO1T_perc = 50; 154 | uvc1_readnum100x_t bias_thres_PFGO2T_perc = 70; 155 | 156 | uvc1_readnum100x_t bias_thres_PFXM1NT_perc = 70; // for normal 157 | uvc1_readnum100x_t bias_thres_PFGO1NT_perc = 70; // for normal 158 | #endif 159 | 160 | uvc1_readpos_t bias_thres_aLRP1t_minus = 10; 161 | uvc1_readpos_t bias_thres_aLRP2t_minus = 5; 162 | uvc1_readpos_t bias_thres_aLRB1t_minus = 50; 163 | uvc1_readpos_t bias_thres_aLRB2t_minus = 25; 164 | 165 | uvc1_readnum100x_t bias_thres_aLRP1t_avgmul_perc = 100; 166 | 
uvc1_readnum100x_t bias_thres_aLRP2t_avgmul_perc = 100; 167 | uvc1_readnum100x_t bias_thres_aLRB1t_avgmul_perc = 100; 168 | uvc1_readnum100x_t bias_thres_aLRB2t_avgmul_perc = 100; 169 | 170 | uvc1_readnum100x_t bias_thres_aLRP1Nt_avgmul_perc = 80; // for normal 171 | uvc1_readnum100x_t bias_thres_aLRB1Nt_avgmul_perc = 80; // for normal 172 | 173 | uvc1_readnum100x_t bias_thres_aLRI1T_perc = 200; 174 | uvc1_readnum100x_t bias_thres_aLRI2T_perc = 150; 175 | uvc1_readnum100x_t bias_thres_aLRI1t_perc = 50; 176 | uvc1_readnum100x_t bias_thres_aLRI2t_perc = 67; 177 | 178 | uvc1_readnum100x_t bias_thres_aLRI1NT_perc = 250; // for normal 179 | uvc1_readnum100x_t bias_thres_aLRI1Nt_perc = 40; // for normal 180 | 181 | uvc1_readpos_t bias_thres_aLRI1T_add = 180; // 200; 182 | uvc1_readpos_t bias_thres_aLRI2T_add = 150; 183 | 184 | uvc1_qual_t bias_thres_PFBQ1 = 25; 185 | uvc1_qual_t bias_thres_PFBQ2 = 30; 186 | 187 | uvc1_base1500x_t bias_thres_aXM1T_add = 30; 188 | 189 | uvc1_readpos_t bias_thres_interfering_indel = 5; 190 | uvc1_qual_t bias_thres_interfering_indel_BQ = 21; 191 | uvc1_qual_t bias_thres_BAQ1 = 23; 192 | uvc1_qual_t bias_thres_BAQ2 = 33; 193 | 194 | uvc1_readpos_t bias_thres_strict_c2LRP0 = 5; 195 | 196 | double bias_thres_FTS_FA = 0.6; // 0.95+1e-5; 197 | bool bias_is_orientation_artifact_mixed_with_sequencing_error = false; 198 | double bias_orientation_min_effective_allelefrac = 0.004; 199 | 200 | // *** 06. parameters related to the priors of bias 201 | 202 | uvc1_readnum100x_t bias_prior_DPadd_perc = 50; 203 | 204 | double bias_priorfreq_pos = 40; // set very high to disable position bias, insert-end bias, strand bias, and orientation bias. 
205 | double bias_priorfreq_indel_in_read_div = 20; 206 | double bias_priorfreq_indel_in_var_div2 = 15; 207 | double bias_priorfreq_indel_in_str_div2 = 10; 208 | double bias_priorfreq_var_in_str_div2 = 5; 209 | 210 | double bias_prior_var_DP_mul = 1.25 + DBLFLT_EPS; 211 | 212 | uvc1_qual_t bias_priorfreq_ipos_snv = 60-15; // set very high to disable insert-end bias 213 | uvc1_qual_t bias_priorfreq_ipos_indel = 60-15; 214 | uvc1_qual_t bias_priorfreq_strand_snv_base = 10; // set very high to disable strand bias 215 | uvc1_qual_t bias_priorfreq_strand_indel = 60-15; 216 | 217 | double bias_FA_pseudocount_indel_in_read = 0.5/10.0; 218 | 219 | double bias_priorfreq_orientation_snv_base = 60-15; // set very high to disable orientation bias 220 | double bias_priorfreq_orientation_indel_base = 60-15; 221 | double bias_orientation_counter_avg_end_len = 20; 222 | 223 | uvc1_qual_t bias_FA_powerlaw_noUMI_phred_inc_snv = 5; 224 | uvc1_qual_t bias_FA_powerlaw_noUMI_phred_inc_indel = 7; // this is actually the intrinsic lower error rate of indel instead of the one after reduction by bias. 225 | uvc1_qual_t bias_FA_powerlaw_withUMI_phred_inc_snv = 5+3; 226 | uvc1_qual_t bias_FA_powerlaw_withUMI_phred_inc_indel = 7; 227 | 228 | uvc1_readnum_t bias_reduction_by_high_sequencingDP_min_n_totDepth = 800; 229 | uvc1_readnum_t bias_reduction_by_high_sequencingDP_min_n_altDepth = 3; 230 | 231 | uvc1_flag_t nobias_flag = 0x2; 232 | double nobias_pos_indel_lenfrac_thres = 2.0; // set very low to disable position bias for InDels 233 | uvc1_readpos_t nobias_pos_indel_str_track_len = 16; 234 | 235 | // *** 07. 
parameters related to read families 236 | 237 | uvc1_readnum_t fam_thres_emperr_all_flat_snv = 4; 238 | uvc1_readnum100x_t fam_thres_emperr_con_perc_snv = 67; 239 | uvc1_readnum_t fam_thres_emperr_all_flat_indel = 4; // can be 5 240 | uvc1_readnum100x_t fam_thres_emperr_con_perc_indel = 67; // can be 75 241 | 242 | uvc1_readnum_t fam_min_n_copies = 800; // 300 * 3; // 300 DNA copies per nanogram of DNA 243 | uvc1_readnum_t fam_min_n_copies_DPxAD = 20 * 1000; 244 | uvc1_readnum100x_t fam_min_overseq_perc = 200; // 250; // percent fold of over-sequencing 245 | uvc1_readnum100x_t fam_bias_overseq_perc = 150; // percent fold of over-sequencing 246 | uvc1_readnum100x_t fam_tier3DP_bias_overseq_perc = 350; 247 | uvc1_readnum100x_t fam_indel_nonUMI_phred_dec_per_fold_overseq = 9; 248 | 249 | // 10: error of 10 PCR cycles using low-fidelity polymerase, https://www.nature.com/articles/s41598-020-63102-8 250 | // 13: reduction in error by using high-fidelity polymerase for UMI assay, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3287198/ 251 | // https://www.bio-rad.com/webroot/web/pdf/lsr/literature/Bulletin_7076.pdf 252 | // uint16_t fam_phred_indel_err_red_by_high_fidelity_pol is 10; // 10 + 13; 253 | // https://www.nature.com/articles/s41598-018-31064-7 : All libraries included PCR steps totaling 37 cycles. 
During Step 4, at cycles 21, 23, 25, 27, 254 | // 14: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3111315/ : Following 25 additional cycles of PCR, There were 19 cycles of PCR 255 | uvc1_qual_t fam_phred_indel_inc_before_barcode_labeling = 14; // can be 13, 13 + 14, or 10 + 13 256 | // https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3616734/ : all major library-prep artifacts 257 | uvc1_qual_t fam_phred_sscs_transition_CG_TA = 40; // Cytosine deamination into Uracil, especially in FFPE samples, also by UV light radiation, more upstream 258 | uvc1_qual_t fam_phred_sscs_transition_AT_GC = 44; // https://en.wikipedia.org/wiki/DNA_oxidation, DNA synthesis error, more downstream 259 | uvc1_qual_t fam_phred_sscs_transversion_CG_AT = 48; // https://www.pnas.org/content/109/36/14508 : there can be C->A artifact 260 | uvc1_qual_t fam_phred_sscs_transversion_other = 48; 261 | uvc1_qual_t fam_phred_sscs_indel_open = 58; 262 | uvc1_qual_t fam_phred_sscs_indel_ext = 0; 263 | uvc1_qual_t fam_phred_dscs_all = 58; 264 | uvc1_qual_t fam_phred_dscs_max = 68; // theoretical max 265 | uvc1_qual_t fam_phred_dscs_inc_max = (68-48); 266 | 267 | // 10*log((2.7e-3-3.5e-5)/(1.5e-4-3.5e-5))/log(10)*3 is 41 from https://doi.org/10.1073/pnas.1208715109 PMC3437896 268 | // The -6 is to accommodate for the fact that most BQs are strictly above 30. 269 | uvc1_qual_t fam_phred_pow_sscs_transversion_AT_TA_origin = 44 - (41-6) + 4; // A:T > T:A somatic mutations are uncommon 270 | double fam_phred_pow_sscs_snv_origin = 44 - (41-6); 271 | double fam_phred_pow_sscs_indel_origin = fam_phred_sscs_indel_open - 9 * 3; 272 | double fam_phred_pow_dscs_all_origin = 0; 273 | uvc1_flag_t fam_flag = 0x0; // 0x1: set cap to BQ. 0x2: disable requirement of having UMI to consensus. 274 | 275 | // *** 08. 
parameters related to systematic errors 276 | 277 | uvc1_qual_t syserr_BQ_prior = 30; // https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-451 278 | 279 | uvc1_deciphred_t syserr_BQ_sbratio_q_add = 5; // note: this is 10*phred, or equivalently in deciphred 280 | uvc1_deciphred_t syserr_BQ_sbratio_q_max = 40; 281 | uvc1_deciphred_t syserr_BQ_xmratio_q_add = 5; 282 | uvc1_deciphred_t syserr_BQ_xmratio_q_max = 40; 283 | uvc1_deciphred_t syserr_BQ_bmratio_q_add = 5; 284 | uvc1_deciphred_t syserr_BQ_bmratio_q_max = 40; 285 | 286 | int syserr_BQ_strand_favor_mul = 3; 287 | 288 | uvc1_deciphred_t syserr_minABQ_pcr_snv = 0; // will be inferred from sequencing platform 289 | uvc1_deciphred_t syserr_minABQ_pcr_indel = 0; // will be inferred from sequencing platform 290 | uvc1_deciphred_t syserr_minABQ_cap_snv = 0; 291 | uvc1_deciphred_t syserr_minABQ_cap_indel = 0; 292 | 293 | uvc1_refgpos_t syserr_mut_region_n_bases = 11; 294 | 295 | uvc1_qual_t syserr_MQ_min = 0; 296 | uvc1_qual_t syserr_MQ_max = 60; // from bwa 297 | 298 | double syserr_MQ_NMR_expfrac = 0.03; // 23/750 299 | double syserr_MQ_NMR_altfrac_coef = 2.0; // base and exponent multiplicative factor for the ALT allele 300 | // SRR7890876_SRR7890881_fp_chr7_100955016_T_C in the MUC3A gene can be a true positive variant 301 | double syserr_MQ_NMR_nonaltfrac_coef = 2.0; // base and exponent multiplicative factor for the non-ALT alleles 302 | double syserr_MQ_NMR_pl_exponent = 3.0; // power-law exponent for penalty to the region of high-basecall-quality XM regions. 303 | double syserr_MQ_nonref_base = 40; // power-law exponent for penalty to the region of high-basecall-quality XM regions. 304 | 305 | // Make sure that, by default, all variants (which usually include hotspot variants) are found in the vcf output regardless of mapping quality. 306 | 307 | // *** 09. 
parameters related to germline vars // PMC4271055: probability of germline call error is between 1/100kb and 1/200kb 308 | 309 | double germ_hetero_FA = 0.47; 310 | 311 | // https://www.biostars.org/p/6177/ probability of hetero is 0.8e-3 for non-african, it should be 32 for african. 312 | uvc1_qual_t germ_phred_hetero_snp = 31; 313 | uvc1_qual_t germ_phred_hetero_indel = 41-1; 314 | uvc1_qual_t germ_phred_homalt_snp = 31+2; 315 | uvc1_qual_t germ_phred_homalt_indel = 41-1+2; 316 | uvc1_qual_t germ_phred_het3al_snp = 54+5; 317 | uvc1_qual_t germ_phred_het3al_indel = 41-1+9; 318 | 319 | // *** 10. parameters related to tumor-normal-pairs. 320 | 321 | uvc1_qual_t tn_q_inc_max = 9; 322 | uvc1_qual_t tn_q_inc_max_sscs_CG_AT = 0; 323 | uvc1_qual_t tn_q_inc_max_sscs_other = 5; 324 | 325 | // Phred-scaled likelihood that the observed allele fraction additively deviates from the expected allele fraction by a multiplicative factor of 2 326 | double tn_syserr_norm_devqual = 15.0; // can be (double)(12.5); 327 | uvc1_flag_t tn_is_paired = false; 328 | //uvc1_flag_t tn_flag = 0x0; 329 | 330 | // *** 11. parameters related to InDels. 
331 | 332 | uvc1_qual_t indel_BQ_max = 43-1; 333 | uvc1_readpos_t indel_str_repeatsize_max = 6; 334 | uvc1_readpos_t indel_vntr_repeatsize_max = 35; 335 | double indel_polymerase_size = 8.0; 336 | double indel_polymerase_slip_rate = 8.0; 337 | double indel_del_to_ins_err_ratio = 5.0; // https://www.ncbi.nlm.nih.gov/pmc/articles/PMC149199/ Table 1 homopolymer error 338 | uvc1_readpos_t indel_adj_tracklen_dist = 6; 339 | uvc1_readnum100x_t indel_adj_indellen_perc = 160; 340 | 341 | double indel_multiallele_samepos_penal = 11.0; 342 | double indel_multiallele_diffpos_penal = 8.0; 343 | double indel_multiallele_soma_penal_thres = 11.0; 344 | double indel_tetraallele_germline_penal_value = 8.0 * 2; 345 | double indel_tetraallele_germline_penal_thres = 22.0; 346 | 347 | uvc1_readpos_t indel_ins_penal_pseudocount = 16; 348 | 349 | // According to "A New Lossless DNA Compression Algorithm Based on A Single-Block Encoding Scheme" Table 7 Korea2009024, 350 | // there is 2*577/800 bits of info per nucleotide for the human genome. 351 | uvc1_qual_t indel_nonSTR_phred_per_base = 5; 352 | // https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1505-2 353 | uvc1_qual_t indel_str_phred_per_region = 5*2; // should be 15 but set to 10 to allow some correlation 354 | uvc1_readpos_t indel_filter_edge_dist = 5; 355 | 356 | // https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2734402/#bib41 : powlaw exponent of 1.5-1.6 for mut rate vs indel len. 357 | // https://pubmed.ncbi.nlm.nih.gov/18641631/ : SNV mutation rate near (up to a few hundred bp) heterozygous InDels are higher than expected. 358 | 359 | // *** 12. parameters related to contamination 360 | 361 | double contam_any_mul_frac = 0.02; // from the ContEst paper at https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3167057/ 362 | double contam_t2n_mul_frac = 0.05; // from the DeTiN paper at https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6528031/ 363 | 364 | // *** 13. 
parameters related to micro-adjustment (they do not have any clear theory support) 365 | int32_t microadjust_xm = 7; 366 | uvc1_readpos_t microadjust_cliplen = 5; 367 | uvc1_qual_t microadjust_delFAQmax = 10+9+30; // to override 368 | 369 | double microadjust_bias_pos_indel_fold = 2; 370 | double microadjust_bias_pos_indel_misma_to_indel_ratio = 4 * (1.0 - DBL_EPSILON); 371 | 372 | double microadjust_nobias_pos_indel_misma_to_indel_ratio = 4 * (1.0 - DBL_EPSILON); 373 | uvc1_readpos_t microadjust_nobias_pos_indel_maxlen = 16; 374 | uvc1_qual_t microadjust_nobias_pos_indel_bMQ = 50; 375 | uvc1_readnum100x_t microadjust_nobias_pos_indel_perc = 50; 376 | double microadjust_nobias_strand_all_fold = 5; // it was reduced from 20 377 | double microadjust_refbias_indel_max = 2.0; 378 | 379 | double microadjust_counterbias_pos_odds_ratio = 3.5; 380 | double microadjust_counterbias_pos_fold_ratio = 5.0; 381 | 382 | uvc1_qual_t microadjust_fam_binom_qual_halving_thres = 70; // 22; // =x where x means halved effect of read support at one-FP/(10^(x/10)) base-pairs 383 | int32_t microadjust_fam_lowfreq_invFA = 1000; 384 | uvc1_qual_t microadjust_ref_MQ_dec_max = 15; 385 | 386 | uvc1_qual_t microadjust_syserr_MQ_NMR_tn_syserr_no_penal_qual_min = 30; 387 | uvc1_qual_t microadjust_syserr_MQ_NMR_tn_syserr_no_penal_qual_max = 30+12; 388 | uvc1_readpos_t microadjust_near_clip_dist = 2; 389 | 390 | uvc1_readpos_t microadjust_longfrag_sidelength_min = 300; // both sides span at least one exon 391 | uvc1_readpos_t microadjust_longfrag_sidelength_max = 600; 392 | double microadjust_longfrag_sidelength_zeroMQpenalty = 300; 393 | 394 | uvc1_readpos_t microadjust_alignment_clip_min_len = 12; 395 | double microadjust_alignment_clip_min_frac = 0.05; 396 | uvc1_readpos_t microadjust_alignment_clip_min_count = 2; 397 | uvc1_readpos_t microadjust_alignment_tracklen_min = 25; 398 | 399 | uvc1_qual_t microadjust_germline_mix_with_del_snv_penalty = 9; 400 | uvc1_flag_t 
microadjust_padded_deletion_flag = 0x2; 401 | 402 | uvc1_readnum_t microadjust_strand_orientation_absence_DP_fold = 5; 403 | uvc1_qual_t microadjust_orientation_absence_snv_penalty = 4; 404 | uvc1_qual_t microadjust_strand_absence_snv_penalty = 4; 405 | uvc1_qual_t microadjust_dedup_absence_indel_penalty = 1; // 8; 406 | 407 | uvc1_readpos_t microadjust_median_readlen_thres = 125; 408 | uvc1_qual_t microadjust_BAQ_per_base_x1024 = 1024; 409 | 410 | uvc1_readpos_t lib_wgs_min_avg_fraglen = 300; 411 | double lib_nonwgs_ad_pseudocount = 0.1; 412 | uvc1_readpos_t lib_nonwgs_clip_penal_min_indelsize = 8; 413 | double lib_nonwgs_normal_full_self_rescue_fa = 0.1; 414 | double lib_nonwgs_normal_min_self_rescue_fa_ratio = 0.2; 415 | double lib_nonwgs_normal_add_mul_ad = 1.0; 416 | uvc1_qual_t lib_nonwgs_normal_max_rescued_MQ = 30; 417 | uvc1_qual_t lib_wgs_normal_max_rescued_MQ = 0; 418 | 419 | // *** 14. parameters related to debugging in vcf 420 | uvc1_flag_t debug_note_flag = 0x0; 421 | uvc1_readpos_t debug_warn_min_read_end_ins_cigar_oplen = 16; 422 | uvc1_refgpos_t debug_tid = -1; 423 | uvc1_refgpos_t debug_pos = -1; 424 | 425 | // *** extra useful info 426 | // https://www.biostars.org/p/254467/#254868 : Question: Are these false somatic variants? Visual inspection with IGV 427 | // How to tell the difference between HDR and kataegis? 
428 | // *** end 429 | 430 | bool inferred_is_fastq_generated = false; 431 | bool inferred_is_vcf_generated = true; 432 | 433 | int 434 | initFromArgCV(int & parsing_result_flag, int argc, const char *const* argv); 435 | 436 | SequencingPlatform 437 | selfUpdateByPlatform(void); 438 | }; 439 | 440 | #endif 441 | -------------------------------------------------------------------------------- /main_conversion.hpp: -------------------------------------------------------------------------------- 1 | #ifndef conversion_hpp_INCLUDED 2 | #define conversion_hpp_INCLUDED 3 | 4 | #include "common.hpp" 5 | 6 | #include "htslib/sam.h" 7 | #include "htslib/vcf.h" 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define SQR_QUAL_DIV 32 21 | 22 | // math functions 23 | 24 | template 25 | auto MEDIAN(const T & v) { 26 | assertUVC(v.size() > 0); 27 | return (v[(v.size() - 1) / 2] + v[(v.size()) / 2]) / 2; 28 | } 29 | 30 | template 31 | auto FIRST(const T & v) { 32 | assertUVC(v.size() > 0); 33 | return v[0]; 34 | } 35 | 36 | template 37 | auto LAST(const T & v) { 38 | assertUVC(v.size() > 0); 39 | return v[(v.size()-1)]; 40 | } 41 | 42 | template 43 | int64_t 44 | int64mul(const T1 a, const T2 b) { 45 | return ((int64_t)a) * ((int64_t)b); 46 | } 47 | 48 | template 49 | uint64_t 50 | uint64mul(const T1 a, const T2 b) { 51 | return ((uint64_t)a) * ((uint64_t)b); 52 | } 53 | 54 | template 55 | auto 56 | CENTER(T1 a, T2 b, int center = 0) { 57 | return ((abs(a - center) < abs(b - center)) ? 
// Returns whichever of a and b lies closer to `center`; b wins ties.
// `abs`, `MIN`, `MAX` and `assertUVC` used in this group are expected to be
// provided by the headers included at the top of this file.
template <class T1, class T2>
auto
CENTER(T1 a, T2 b, int center = 0) {
    return ((abs(a - center) < abs(b - center)) ? a : b);
}

// Minimum of three values.
template <class T1, class T2, class T3>
auto
MIN3(T1 a, T2 b, T3 c) {
    return MIN(a, MIN(b, c));
}

// Minimum of four values.
template <class T1, class T2, class T3, class T4>
auto
MIN4(T1 a, T2 b, T3 c, T4 d) {
    return MIN(a, MIN3(b, c, d));
}

// Minimum of five values.
template <class T1, class T2, class T3, class T4, class T5>
auto
MIN5(T1 a, T2 b, T3 c, T4 d, T5 e) {
    return MIN(a, MIN4(b, c, d, e));
}

// Smallest element of a non-empty indexable container.
// The container is taken by const reference so that it is not copied.
template <class T>
auto
MINVEC(const T & v) {
    assertUVC(v.size() > 0);
    auto ret = v[0];
    for (const auto & e : v) {
        ret = MIN(ret, e);
    }
    return ret;
}

// Maximum of three values.
template <class T1, class T2, class T3>
auto
MAX3(T1 a, T2 b, T3 c) {
    return MAX(a, MAX(b, c));
}

// Maximum of four values.
template <class T1, class T2, class T3, class T4>
auto
MAX4(T1 a, T2 b, T3 c, T4 d) {
    return MAX(a, MAX3(b, c, d));
}

// Maximum of five values.
template <class T1, class T2, class T3, class T4, class T5>
auto
MAX5(T1 a, T2 b, T3 c, T4 d, T5 e) {
    return MAX(a, MAX4(b, c, d, e));
}

// Maximum of six values.
template <class T1, class T2, class T3, class T4, class T5, class T6>
auto
MAX6(T1 a, T2 b, T3 c, T4 d, T5 e, T6 f) {
    return MAX(a, MAX5(b, c, d, e, f));
}

// Largest element of a non-empty indexable container.
// The container is taken by const reference so that it is not copied.
template <class T>
auto
MAXVEC(const T & v) {
    assertUVC(v.size() > 0);
    auto ret = v[0];
    for (const auto & e : v) {
        ret = MAX(ret, e);
    }
    return ret;
}

// Clamps v into the closed interval [a, b].
template <class T1, class T2, class T3>
auto
BETWEEN(T1 v, T2 a, T3 b) {
    return MIN(MAX(a, v), b);
}

// In-place a = MIN(a, b).
template <class T1, class T2>
void
UPDATE_MIN(T1 & a, const T2 & b) {
    a = MIN(a, b);
}

// In-place a = MAX(a, b).
template <class T1, class T2>
void
UPDATE_MAX(T1 & a, const T2 & b) {
    a = MAX(a, b);
}

// Sum of all elements of an indexable container; 0 for an empty container.
// The accumulator has the type of (element + 0): it is still int for int and
// narrower integer elements, but is no longer truncated to int for
// floating-point elements and no longer overflows int for 64-bit elements.
template <class T>
auto
SUMVEC(const T & vec) {
    decltype(vec[0] + 0) total = 0;
    for (size_t i = 0; i < vec.size(); i++) {
        total += vec[i];
    }
    return total;
}

const size_t left2right = 0;
const size_t right2left = 1;

// Sum of the two elements of a size-2 container.
template <class T>
auto
SUMPAIR(const T & array) {
    // STATIC_ASSERT_WITH_DEFAULT_MSG(array.size() == 2);
    assertUVC(array.size() == 2);
    return (array[0] + array[1]);
}

// Smooth non-negative version of v: below `thres` it returns
// log_base(1 + base^v), which is always positive; at or above `thres`
// it returns v unchanged.
template <class T>
T
calc_non_negative(const T v, T base = pow(10.0, 0.1), T thres = 10.0) {
    if (v < thres) {
        return log1p(pow(base, v)) / log(base);
    } else {
        return v;
    }
}
base = pow(10.0, 0.1), T thres = 10.0) { 166 | if (v < thres) { 167 | return log1p(pow(base, v)) / log(base); 168 | } else { 169 | return v; 170 | } 171 | } 172 | 173 | template 174 | T 175 | calc_score_with_penal_at_low_val(const T v, T penal_mult, T thres = 60.0) { 176 | return v * penal_mult; 177 | // enable the following if it makes sense. 178 | if (v <= thres) { 179 | return v * penal_mult; 180 | } else { 181 | return thres * penal_mult + (v - thres); 182 | } 183 | } 184 | 185 | template 186 | auto 187 | mathcube(T x) { 188 | return x * x * x; 189 | } 190 | 191 | constexpr 192 | double 193 | prob2odds(double p) { 194 | assertUVC((0.0 < p && p < 1.0) || !fprintf(stderr, "%f is not between 0 and 1!", p)); 195 | return p / (1.0 - p); 196 | } 197 | 198 | constexpr 199 | double 200 | odds2prob(double odds) { 201 | assertUVC((0.0 < odds) || !fprintf(stderr, "%f is not greater than zero!", odds)); 202 | return odds / (odds + 1.0); 203 | } 204 | 205 | static_assert(prob2odds(odds2prob(1)) > 0.99); 206 | static_assert(prob2odds(odds2prob(1)) < 1.01); 207 | 208 | static_assert(odds2prob(prob2odds(0.66)) > 0.65); 209 | static_assert(odds2prob(prob2odds(0.66)) < 0.67); 210 | 211 | double 212 | logit(double p) { 213 | return log(prob2odds(p)); 214 | } 215 | 216 | double 217 | logit2(double a, double b) { 218 | return logit((a + DBL_EPSILON)/(a+b + 2.0*DBL_EPSILON)); 219 | } 220 | 221 | // always at least zero 222 | template 223 | constexpr double 224 | calc_binom_10log10_likeratio(double prob, double a, double b) { 225 | if (TSetMaxProbToOne) { prob = MIN(1.0, prob); } 226 | prob = (prob + DBL_EPSILON) / (1.0 + (2.0 * DBL_EPSILON)); 227 | assertUVC((prob > 0 && prob < 1) || !fprintf(stderr, "The assertUVCion 0 < %f < 1 failed!\n", prob)); 228 | a += DBL_EPSILON; 229 | b += DBL_EPSILON; 230 | double A = ( prob) * (a + b); 231 | double B = (1.0 - prob) * (a + b); 232 | if (TIsBiDirectional || a > A) { 233 | return 10.0 / log(10.0) * (a * log(a / A) + b * log(b / B)); 234 | } 
else { 235 | return 0.0; 236 | } 237 | } 238 | 239 | #ifdef TEST_calc_binom_10log10_likeratio 240 | int 241 | main(int argc, char **argv) { 242 | double prob = atof(argv[1]); 243 | double a = atof(argv[2]); 244 | double b = atof(argv[3]); 245 | double ret1 = calc_binom_10log10_likeratio(prob, a, b); 246 | double ret2 = calc_binom_10log10_likeratio(prob, a, b); 247 | printf("calc_binom_10log10_likeratio(%f, %f, %f) = <%f, %f>\n", prob, a, b, ret1, ret2); 248 | } 249 | #endif 250 | 251 | STATIC_ASSERT_WITH_DEFAULT_MSG(abs(calc_binom_10log10_likeratio(0.1, 10, 90)) < 1e-4); 252 | STATIC_ASSERT_WITH_DEFAULT_MSG(calc_binom_10log10_likeratio(0.1, 90, 10) > 763); // 10/log(10) * (90*log(9)+10*log(1/9)) 253 | STATIC_ASSERT_WITH_DEFAULT_MSG(calc_binom_10log10_likeratio(0.1, 90, 10) < 764); // 10/log(10) * (90*log(9)+10*log(1/9)) 254 | STATIC_ASSERT_WITH_DEFAULT_MSG(abs(calc_binom_10log10_likeratio(0.1, 1, 99)) < 1e-4); // 10/log(10) * (90*log(9)+10*log(1/9)) 255 | 256 | template 257 | T 258 | collectget(const T1 & collection, size_t idx, T defaultval = 0) { 259 | return (idx < collection.size() ? 
// Returns collection[idx] when idx is in range, otherwise defaultval.
// NOTE(review): the template parameter list was lost in the extracted text;
// the order <T1, T> is assumed here and must be confirmed against callers
// that pass explicit template arguments.
template <class T1, class T>
T
collectget(const T1 & collection, size_t idx, T defaultval = 0) {
    return (idx < collection.size() ? collection[idx] : defaultval);
}

// Appends v to collection; when idx is 0 the collection is emptied first.
template <class T1, class T2>
void
clear_push(T1 & collection, T2 v, size_t idx = 0) {
    if (0 == idx) {
        collection.clear();
    }
    collection.push_back(v);
}

// Exchanges the values of a and b through a temporary copy.
template <class T>
void
autoswap ( T& a, T& b ) {
    T tmp(a);
    a = b;
    b = tmp;
}

// Joins string-like elements with sep between consecutive elements.
// An empty container yields an empty string, and sep may have any length
// (the previous implementation called pop_back() on an empty string and
// removed only one character of the trailing separator).
template <class T>
std::string
string_join(const T & container, std::string sep = std::string(",")) {
    std::string ret;
    bool is_first = true;
    for (const auto & e : container) {
        if (!is_first) { ret += sep; }
        ret += e;
        is_first = false;
    }
    return ret;
}

// Joins elements converted with std::to_string, with sep between
// consecutive elements. sep may have any length.
template <class T>
std::string
other_join(const T & container, std::string sep = std::string(",")) {
    std::string ret;
    bool is_first = true;
    for (const auto & e : container) {
        if (!is_first) { ret += sep; }
        ret += std::to_string(e);
        is_first = false;
    }
    return ret;
}

// Same as other_join, except that INT32_MIN is written as "." instead of
// a number. The separator after "." now follows sep instead of being
// hard-coded to ",".
template <class T>
std::string
int32t_join(const T & container, std::string sep = std::string(",")) {
    std::string ret;
    bool is_first = true;
    for (const auto & e : container) {
        if (!is_first) { ret += sep; }
        if (e == INT32_MIN) {
            ret += ".";
        } else {
            ret += std::to_string(e);
        }
        is_first = false;
    }
    return ret;
}

// variant-call data structures and functions

enum AlignmentSymbol {
    BASE_A,  // = 0,
    BASE_C,  // = 1,
    BASE_G,  // = 2,
    BASE_T,  // = 3,
    BASE_N,  // = 4, // ambiguous in the original sequencing data
    BASE_NN, // = 5, // NA not available
    LINK_M,  // = 6, // absence of any gap
    LINK_D3P,// = 7, // deletion of length 3 or plus
    LINK_D2, // = 8, // deletion of length 2
    LINK_D1, // = 9,
    LINK_I3P,// = 10, // insertion of length 3 or plus (original note: where the inserted sequence is not a repeat)
    LINK_I2, // = 11,
    LINK_I1, // = 12,
    LINK_NN, // = 13, // padded InDel-noInDel symbol in deleted sequence
    END_ALIGNMENT_SYMBOLS,
    MGVCF_SYMBOL,
    ADDITIONAL_INDEL_CANDIDATE_SYMBOL,
};
ADDITIONAL_INDEL_CANDIDATE_SYMBOL, 334 | }; 335 | 336 | const char* SYMBOL_TO_DESC_ARR[] = { 337 | [BASE_A] = "A", [BASE_C] = "C", [BASE_G] = "G", [BASE_T] = "T", [BASE_N] = "N", 338 | [BASE_NN] = "*", 339 | [LINK_M] = "", 340 | [LINK_D3P] = "", [LINK_D2] = "", [LINK_D1] = "", 341 | [LINK_I3P] = "", [LINK_I2] = "", [LINK_I1] = "", 342 | [LINK_NN] = "*", 343 | [END_ALIGNMENT_SYMBOLS] = "", 344 | [MGVCF_SYMBOL] = "", 345 | [ADDITIONAL_INDEL_CANDIDATE_SYMBOL] = "", 346 | }; 347 | 348 | #define NUM_ALIGNMENT_SYMBOLS 14 349 | STATIC_ASSERT_WITH_DEFAULT_MSG(NUM_ALIGNMENT_SYMBOLS == END_ALIGNMENT_SYMBOLS); 350 | 351 | // Please note that there are left-to-right clips and right-to-left clips, but I cannot know in advance if the direction matters in consensus. 352 | // My intuition tells me that it matters but only for some extremely rare situations, so I did not divide soft-clips according to their strands. 353 | // #define NUM_CLIP_SYMBOLS 2 354 | // const std::array CLIP_SYMBOLS = {{CLIP_LEFT_TO_RIGHT, CLIP_RIGHT_TO_LEFT}}; 355 | 356 | #define NUM_INS_SYMBOLS 3 357 | const std::array INS_SYMBOLS = {{LINK_I1, LINK_I2, LINK_I3P}}; 358 | 359 | #define NUM_DEL_SYMBOLS 3 360 | const std::array DEL_SYMBOLS = {{LINK_D1, LINK_D2, LINK_D3P}}; 361 | 362 | const std::array INDEL_SYMBOLS = {{LINK_I1, LINK_I2, LINK_I3P, LINK_D1, LINK_D2, LINK_D3P}}; 363 | 364 | constexpr bool 365 | areSymbolsMutated(AlignmentSymbol ref, AlignmentSymbol alt) { 366 | if (alt <= BASE_NN) { 367 | return ref != alt && ref < BASE_N && alt < BASE_N; 368 | } else { 369 | return alt != LINK_M && alt != LINK_NN; 370 | } 371 | }; 372 | 373 | 374 | 375 | enum SymbolType { 376 | BASE_SYMBOL, 377 | LINK_SYMBOL, 378 | NUM_SYMBOL_TYPES, 379 | }; 380 | 381 | enum LinkType { 382 | MAT_LINK, 383 | INS_LINK, 384 | DEL_LINK, 385 | NUM_LINK_TYPES, 386 | }; 387 | 388 | const AlignmentSymbol SYMBOL_TYPE_TO_INCLU_BEG[NUM_SYMBOL_TYPES] = { 389 | [BASE_SYMBOL] = BASE_A, 390 | [LINK_SYMBOL] = LINK_M, 391 | }; 392 | 393 | const 
std::array SYMBOL_TYPE_ARR = { 394 | BASE_SYMBOL, LINK_SYMBOL 395 | }; 396 | 397 | const std::array, NUM_SYMBOL_TYPES> SYMBOL_TYPE_TO_SYMBOLS = {{ 398 | [BASE_SYMBOL] = std::vector{{BASE_A, BASE_C, BASE_G, BASE_T, BASE_N, BASE_NN}}, 399 | [LINK_SYMBOL] = std::vector{{LINK_M, LINK_I1, LINK_I2, LINK_I3P, LINK_D1, LINK_D2, LINK_D3P, LINK_NN}} 400 | }}; 401 | 402 | const std::array, NUM_SYMBOL_TYPES> SYMBOL_TYPE_TO_NON_NN_SYMBOLS = {{ 403 | [BASE_SYMBOL] = std::vector{{ BASE_A, BASE_C, BASE_G, BASE_T, BASE_N }}, 404 | [LINK_SYMBOL] = std::vector{{ LINK_M, LINK_I1,LINK_I2, LINK_I3P, LINK_D1, LINK_D2, LINK_D3P }} 405 | }}; 406 | 407 | const AlignmentSymbol SYMBOL_TYPE_TO_INCLU_END[NUM_SYMBOL_TYPES] = { 408 | [BASE_SYMBOL] = BASE_NN, 409 | [LINK_SYMBOL] = LINK_NN, 410 | }; 411 | 412 | const AlignmentSymbol SYMBOL_TYPE_TO_AMBIG[NUM_SYMBOL_TYPES] = { 413 | [BASE_SYMBOL] = BASE_NN, 414 | [LINK_SYMBOL] = LINK_NN, 415 | }; 416 | 417 | constexpr bool 418 | isSymbolIns(const AlignmentSymbol symbol) { 419 | if (LINK_I3P == symbol || LINK_I2 == symbol || LINK_I1 == symbol) { 420 | return true; 421 | } else { 422 | return false; 423 | } 424 | } 425 | 426 | constexpr bool 427 | isSymbolDel(const AlignmentSymbol symbol) { 428 | if (LINK_D3P == symbol || LINK_D2 == symbol || LINK_D1 == symbol) { 429 | return true; 430 | } else { 431 | return false; 432 | } 433 | } 434 | 435 | constexpr AlignmentSymbol 436 | insLenToSymbol(uvc1_readpos_t len, const bam1_t *b) { 437 | assertUVC(len >= 0 || !fprintf(stderr, "Error: the bam record with qname %s at tid %d pos %ld has insertion of length %d !\n", 438 | bam_get_qname(b), b->core.tid, b->core.pos, len)); 439 | return (1 == len ? LINK_I1 : ((2 == len) ? 
LINK_I2 : LINK_I3P)); 440 | } 441 | 442 | constexpr AlignmentSymbol 443 | delLenToSymbol(uvc1_readpos_t len, const bam1_t *b) { 444 | assertUVC(len >= 0 || !fprintf(stderr, "Error: the bam record with qname %s at tid %d pos %ld has deletion of length %d !\n", 445 | bam_get_qname(b), b->core.tid, b->core.pos, len)); 446 | return (1 == len ? LINK_D1 : ((2 == len) ? LINK_D2 : LINK_D3P)); 447 | } 448 | 449 | 450 | 451 | int 452 | insSymbolToInsIdx(AlignmentSymbol s) { 453 | return (LINK_I1 == s ? 0 : ((LINK_I2 == s) ? 1: 2)); 454 | } 455 | 456 | int 457 | delSymbolToDelIdx(AlignmentSymbol s) { 458 | return (LINK_D1 == s ? 0 : ((LINK_D2 == s) ? 1: 2)); 459 | } 460 | 461 | const std::array SYMBOL_TYPES_IN_VCF_ORDER = {{LINK_SYMBOL, BASE_SYMBOL}}; 462 | 463 | bool 464 | isSymbolSubstitution(AlignmentSymbol symbol) { 465 | return (SYMBOL_TYPE_TO_INCLU_BEG[BASE_SYMBOL] <= symbol && symbol <= SYMBOL_TYPE_TO_INCLU_END[BASE_SYMBOL]); 466 | } 467 | 468 | 469 | 470 | 471 | 472 | 473 | struct _CharToSymbol { 474 | std::array data; 475 | _CharToSymbol() { 476 | for (size_t i = 0; i < 128; i++) { 477 | data[i] = BASE_N; 478 | } 479 | data['A'] = data['a'] = BASE_A; 480 | data['C'] = data['c'] = BASE_C; 481 | data['G'] = data['g'] = BASE_G; 482 | data['T'] = data['t'] = BASE_T; 483 | data['I'] = data['i'] = LINK_M; 484 | data['-'] = data['_'] = LINK_D1; 485 | } 486 | }; 487 | 488 | const _CharToSymbol CHAR_TO_SYMBOL; 489 | 490 | struct TumorKeyInfo { 491 | std::string ref_alt; 492 | int32_t VTI = -1; 493 | bool enable_tier2_consensus_format_tags = false; 494 | uvc1_refgpos_t pos = 0; 495 | 496 | uvc1_readnum_t BDP = 0; 497 | uvc1_readnum_t bDP = 0; 498 | 499 | uvc1_readnum100x_t CDP1x = 0; 500 | uvc1_readnum100x_t cDP1x = 0; 501 | uvc1_qual_t cVQ1 = 0; 502 | uvc1_qual_t cPCQ1 = 0; 503 | 504 | uvc1_readnum100x_t CDP2x = 0; 505 | uvc1_readnum100x_t cDP2x = 0; 506 | uvc1_qual_t cVQ2 = 0; 507 | uvc1_qual_t cPCQ2 = 0; 508 | 509 | uvc1_qual_t bNMQ = 0; 510 | uvc1_qual_t vHGQ = 0; 511 | 
bcf1_t *bcf1_record = NULL; 512 | 513 | uvc1_readnum_t tDP = 0; 514 | std::array tADR = {{ 0 }}; 515 | uvc1_readnum_t nDP = 0; 516 | std::array nADR = {{ 0 }}; 517 | uvc1_readnum_t tDPC = 0; 518 | std::array tADCR = {{ 0 }}; 519 | std::array nADCR = {{ 0 }}; 520 | 521 | /* 522 | ~TumorKeyInfo() { 523 | if (bcf1_record != NULL) { 524 | // this line must be elsewhere apparently due to the subtle differences between C and C++. 525 | // bcf_destroy(bcf1_record); 526 | } 527 | } 528 | */ 529 | }; 530 | 531 | std::basic_string 532 | string2symbolseq(const std::string & instring) { 533 | std::basic_string ret; 534 | ret.reserve(instring.size()); 535 | for (size_t i = 0; i < instring.size(); i++) { 536 | ret.push_back(CHAR_TO_SYMBOL.data[instring[i]]); 537 | } 538 | return ret; 539 | }; 540 | 541 | struct SegFormatPrepSet { 542 | uvc1_readnum_t segprep_a_dp; 543 | uvc1_readnum_t segprep_a_near_ins_dp; 544 | uvc1_readnum_t segprep_a_near_del_dp; 545 | uvc1_readnum_t segprep_a_near_RTR_ins_dp; 546 | uvc1_readnum_t segprep_a_near_RTR_del_dp; 547 | 548 | // std::array segprep_a_pcr_dps = {{ 0 }}; // depth of PCR amplicons on the R1R2 and R2R1 orientations 549 | uvc1_readnum_t segprep_a_pcr_dp; // depth of PCR-amplicon sequencing segments 550 | uvc1_readnum_t segprep_a_umi_dp; // depth of UMI-enabled sequencing segments 551 | uvc1_readnum_t segprep_a_snv_dp; 552 | uvc1_readnum_t segprep_a_dnv_dp; 553 | uvc1_readnum_t segprep_a_highBQ_dp; // depth of high-BQ bases 554 | 555 | uvc1_readnum_t segprep_a_near_pcr_clip_dp; 556 | uvc1_readnum_t segprep_a_near_long_clip_dp; 557 | 558 | uvc1_readnum_t segprep_a_at_ins_dp; 559 | uvc1_readnum_t segprep_a_at_del_dp; 560 | 561 | // not really 100x but at the same order of magnitude 562 | uvc1_readnum100x_t segprep_a_XM1500; // number of mismatches per 1500 bases 563 | uvc1_readnum100x_t segprep_a_GO1500; // number of gap openings per 1500 bases 564 | uvc1_readnum100x_t segprep_a_GAPLEN; 565 | uvc1_readnum100x_t segprep_a_qlen; 566 | 567 | 
uvc1_readpos_big_t segprep_a_near_ins_pow2len; 568 | uvc1_readpos_big_t segprep_a_near_del_pow2len; 569 | uvc1_readnum100x_t segprep_a_near_ins_inv100len; 570 | uvc1_readnum100x_t segprep_a_near_del_inv100len; 571 | 572 | uvc1_readpos_big_t segprep_a_near_ins_l_pow2len; 573 | uvc1_readpos_big_t segprep_a_near_ins_r_pow2len; 574 | uvc1_readpos_big_t segprep_a_near_del_l_pow2len; 575 | uvc1_readpos_big_t segprep_a_near_del_r_pow2len; 576 | 577 | uvc1_readpos_big_t segprep_a_LI; 578 | uvc1_readnum_t segprep_a_LIDP; 579 | uvc1_readpos_big_t segprep_a_RI; 580 | uvc1_readnum_t segprep_a_RIDP; 581 | 582 | uvc1_readpos_t segprep_a_l_dist_sum; 583 | uvc1_readpos_t segprep_a_r_dist_sum; 584 | uvc1_readpos_t segprep_a_inslen_sum; 585 | uvc1_readpos_t segprep_a_dellen_sum; 586 | 587 | uvc1_qual_big_t segprep_a_l_BAQ_sum; 588 | uvc1_qual_big_t segprep_a_r_BAQ_sum; 589 | uvc1_qual_big_t segprep_a_insBAQ_sum; 590 | uvc1_qual_big_t segprep_a_delBAQ_sum; 591 | 592 | #if COMPILATION_TRY_HIGH_DEPTH_POS_BIAS 593 | // data-driven border for position bias 594 | uvc1_readnum_t segprep_aa_l_ins_dist_x_wei; 595 | uvc1_readnum_t segprep_aa_l_ins_weight; 596 | uvc1_readnum_t segprep_aa_r_ins_dist_x_wei; 597 | uvc1_readnum_t segprep_aa_r_ins_weight; 598 | 599 | uvc1_readnum_t segprep_aa_l_del_dist_x_wei; 600 | uvc1_readnum_t segprep_aa_l_del_weight; 601 | uvc1_readnum_t segprep_aa_r_del_dist_x_wei; 602 | uvc1_readnum_t segprep_aa_r_del_weight; 603 | #endif 604 | 605 | }; 606 | #define NUM_SEG_FORMAT_PREP_SETS ((size_t)SEG_FORMAT_PREP_SET_END) 607 | 608 | template 609 | uvc1_readnum_big_t 610 | calc_indel_weight(const T1 indelsize, const T2 borderlen) { 611 | return (1024L * 1024L) * mathcube(indelsize) / mathcube(MAX(borderlen, 8)); 612 | } 613 | 614 | struct SegFormatThresSet { 615 | uvc1_readpos_t segthres_aLPxT; 616 | uvc1_readpos_t segthres_aRPxT; 617 | 618 | #if COMPILATION_ENABLE_XMGOT 619 | uvc1_base1500x_t segthres_aXM1T; // mismatch, higher means more bias 620 | uvc1_base1500x_t 
segthres_aXM2T; 621 | uvc1_base1500x_t segthres_aGO1T; // gap-open, higher means more bias 622 | uvc1_base1500x_t segthres_aGO2T; 623 | #endif 624 | 625 | uvc1_readpos_t segthres_aLI1T; // distance to left insert end, higher means more bias 626 | uvc1_readpos_t segthres_aLI2T; 627 | uvc1_readpos_t segthres_aRI1T; // distance to right insert end 628 | uvc1_readpos_t segthres_aRI2T; 629 | uvc1_readpos_t segthres_aLI1t; // distance to left insert end, lower means more bias 630 | uvc1_readpos_t segthres_aLI2t; 631 | uvc1_readpos_t segthres_aRI1t; // distance to right insert end, lower means more bias 632 | uvc1_readpos_t segthres_aRI2t; 633 | 634 | uvc1_readpos_t segthres_aLP1t; 635 | uvc1_readpos_t segthres_aLP2t; 636 | uvc1_readpos_t segthres_aRP1t; 637 | uvc1_readpos_t segthres_aRP2t; 638 | 639 | uvc1_qual_t segthres_aLB1t; 640 | uvc1_qual_t segthres_aLB2t; 641 | uvc1_qual_t segthres_aRB1t; 642 | uvc1_qual_t segthres_aRB2t; 643 | }; 644 | 645 | struct SegFormatInfoSet { 646 | // allele-specific 647 | uvc1_readnum100x_t seginfo_a2XM2; 648 | uvc1_readnum100x_t seginfo_a2BM2; 649 | 650 | uvc1_readnum100x_t seginfo_aPF1; // BQ without mismatches 651 | uvc1_readnum100x_t seginfo_aPF2; 652 | 653 | uvc1_readnum_t seginfo_aBQ2; // number passing BQ filter 654 | 655 | uvc1_qual_t seginfo_aMQs; 656 | uvc1_readnum_t seginfo_aP1; 657 | uvc1_readnum_t seginfo_aP2; 658 | uvc1_readnum_t seginfo_aP3; 659 | uvc1_readnum_t seginfo_aNC; 660 | 661 | uvc1_readnum_t seginfo_aDPff; 662 | uvc1_readnum_t seginfo_aDPfr; 663 | uvc1_readnum_t seginfo_aDPrf; 664 | uvc1_readnum_t seginfo_aDPrr; 665 | 666 | uvc1_readnum_t seginfo_aLP1; // left seg pos 667 | uvc1_readnum_t seginfo_aLP2; 668 | uvc1_readpos_t seginfo_aLPL; 669 | uvc1_readnum_t seginfo_aRP1; // right seg pos 670 | uvc1_readnum_t seginfo_aRP2; 671 | uvc1_readpos_t seginfo_aRPL; 672 | 673 | uvc1_readnum_t seginfo_aLB1; // left seg pos 674 | uvc1_readnum_t seginfo_aLB2; 675 | uvc1_readpos_big_t seginfo_aLBL; 676 | uvc1_readnum_t 
seginfo_aRB1; // right seg pos 677 | uvc1_readnum_t seginfo_aRB2; 678 | uvc1_readpos_big_t seginfo_aRBL; 679 | 680 | uvc1_readnum_t seginfo_aLI1; // left insert 681 | uvc1_readnum_t seginfo_aLI2; 682 | 683 | uvc1_readnum_t seginfo_aRI1; // right insert 684 | uvc1_readnum_t seginfo_aRI2; 685 | 686 | uvc1_readnum_t seginfo_aRIf; 687 | uvc1_readnum_t seginfo_aLIr; 688 | 689 | uvc1_readpos_big_t seginfo_aLIT; 690 | uvc1_readpos_big_t seginfo_aRIT; 691 | }; 692 | 693 | enum FragFormatDepthSet { 694 | FRAG_bDP, // raw 695 | FRAG_bTA, // FRAG_b10xSeqTlen, 696 | FRAG_bTB, // FRAG_b10xSeqTNevents, 697 | FRAG_FORMAT_DEPTH_SET_END 698 | }; 699 | #define NUM_FRAG_FORMAT_DEPTH_SETS ((size_t)FRAG_FORMAT_DEPTH_SET_END) 700 | 701 | struct FamFormatInfoSet { 702 | uvc1_readnum_t faminfo_c2LP1; // left tier-2-consensus family pos 703 | uvc1_readnum_t faminfo_c2LP2; 704 | uvc1_readpos_t faminfo_c2LPL; 705 | uvc1_readnum_t faminfo_c2RP1; // right tier-2-consensus family pos 706 | uvc1_readnum_t faminfo_c2RP2; 707 | uvc1_readpos_t faminfo_c2RPL; 708 | 709 | uvc1_readnum_t faminfo_c2LP0; 710 | uvc1_readnum_t faminfo_c2RP0; 711 | 712 | uvc1_readnum_t faminfo_c2LB1; // left tier-2-consensus family pos 713 | uvc1_readnum_t faminfo_c2LB2; 714 | uvc1_readpos_big_t faminfo_c2LBL; 715 | uvc1_readnum_t faminfo_c2RB1; // right tier-2-consensus family pos 716 | uvc1_readnum_t faminfo_c2RB2; 717 | uvc1_readpos_big_t faminfo_c2RBL; 718 | 719 | uvc1_readnum_t faminfo_c2BQ2; 720 | }; 721 | 722 | enum FamFormatDepthSet { 723 | FAM_cDP1, // raw 724 | FAM_cDP12,// filtered 725 | FAM_cDP2, // 2, 0.8, family-consensus 726 | FAM_cDP3, // 10, 0.8, family-consensus 727 | FAM_cDPM, // duped match 728 | FAM_cDPm, // duped mismatch 729 | FAM_cDP21, // singleton 730 | FAM_cDPD, // support of single-strand-consensus sequence which did not form a duplex consensus sequence 731 | 732 | FAM_FORMAT_DEPTH_SET_END 733 | }; 734 | #define NUM_FAM_FORMAT_DEPTH_SETS ((size_t)FAM_FORMAT_DEPTH_SET_END) 735 | 736 | enum 
DuplexFormatDepthSet { 737 | DUPLEX_dDP1, // raw 738 | DUPLEX_dDP2, // double-strand consensus 739 | DUPLEX_FORMAT_TAG_SET_END 740 | }; 741 | #define NUM_DUPLEX_FORMAT_DEPTH_SETS ((size_t)DUPLEX_FORMAT_TAG_SET_END) 742 | 743 | enum VQFormatTagSet { 744 | 745 | VQ_a1BQf, 746 | VQ_a1BQr, 747 | VQ_a2BQf, 748 | VQ_a2BQr, 749 | 750 | VQ_bMQ, 751 | 752 | VQ_bIAQb, // prefinal 753 | VQ_bIADb, 754 | VQ_bIDQb, 755 | 756 | VQ_cIAQf, 757 | VQ_cIADf, 758 | VQ_cIDQf, 759 | 760 | VQ_cIAQr, 761 | VQ_cIADr, 762 | VQ_cIDQr, 763 | 764 | // later computed 765 | VQ_aBQQ, // prefinal 766 | VQ_bIAQ, // prefinal 767 | VQ_cIAQ, // prefinal 768 | 769 | VQ_aPLQ, // preprefinal 770 | VQ_c1PLQ, // prefinal, deduped 771 | VQ_c2PLQ, // prefinal, consensus-applied 772 | VQ_dPLQ, // prefinal less priority 773 | 774 | VQ_C1DPv, // 100 times higher than expected 775 | VQ_c1DPv, // 100 times higher than expected 776 | VQ_c1VQ, // final VarQual 777 | VQ_C2DPv, // 100 times higher than expected 778 | VQ_c2DPv, // ... 779 | VQ_c2VQ, // ... 
780 | 781 | VQ_FORMAT_TAG_SET_END 782 | }; 783 | #define NUM_VQ_FORMAT_TAG_SETS ((size_t)VQ_FORMAT_TAG_SET_END) 784 | 785 | template 786 | uvc1_readnum_t 787 | seg_format_get_ad(const T & s) { 788 | return s.seginfo_aDPff + s.seginfo_aDPfr + s.seginfo_aDPrf + s.seginfo_aDPrr; 789 | }; 790 | 791 | template 792 | uvc1_qual_t 793 | get_avgBQ(const T1 & bg_seg_bqsum_conslogo, const T2 & symbol_to_seg_format_depth_sets, const uvc1_refgpos_t epos, const T3 s) { 794 | const auto denom = seg_format_get_ad(symbol_to_seg_format_depth_sets.getByPos(epos)[s]); 795 | return bg_seg_bqsum_conslogo.getByPos(epos).getSymbolCount(s) / MAX(1, denom); 796 | } 797 | 798 | template 799 | 800 | std::array 801 | dp4_to_pcFA(double overseq_frac, double aADpass, double aADfail, double aDPpass, double aDPfail, 802 | double pl_exponent = 3.0, double n_nats = log(500+1), 803 | double aADavgKeyVal = -1, double aDPavgKeyVal = -1, double priorAD = 0.5, double priorDP = 1.0) { 804 | assertUVC(aADpass >= 0.0 || !fprintf(stderr, "%f >= %f failed for pass!\n", aADpass, 0.0)); 805 | assertUVC(aADfail >= 0.0 || !fprintf(stderr, "%f >= %f failed for fail!\n", aADfail, 0.0)); 806 | assertUVC(aADpass <= aDPpass || !fprintf(stderr, "%f <= %f failed for pass!\n", aADpass, aDPpass)); 807 | assertUVC(aADfail <= aDPfail || !fprintf(stderr, "%f <= %f failed for fail!\n", aADfail, aDPfail)); 808 | if (!TIsOverseqFracDisabled) { 809 | aDPfail *= overseq_frac; 810 | aDPpass *= overseq_frac; 811 | aADfail *= overseq_frac; 812 | aADpass *= overseq_frac; 813 | } 814 | aDPfail += priorDP; 815 | aDPpass += priorDP; 816 | aADfail += priorAD; 817 | aADpass += priorAD; 818 | const double nobiasFA = (aADfail + aADpass) / (aDPfail + aDPpass); 819 | if ((aADpass / aDPpass) >= (aADfail / aDPfail)) { 820 | if (TBidirectional) { 821 | autoswap(aDPfail, aDPpass); 822 | autoswap(aADfail, aADpass); 823 | } else { 824 | return std::array {{ (aADpass / aDPpass), nobiasFA }}; 825 | } 826 | } 827 | auto aBDfail = aDPfail * 2 - aADfail 
* 1; 828 | auto aBDpass = aDPpass * 2 - aADpass * 1; 829 | assertUVC (aBDfail > 0); 830 | assertUVC (aBDpass > 0); 831 | double aADpassfrac = aADpass / (aADpass + aADfail); 832 | double aBDpassfrac = aBDpass / (aBDpass + aBDfail); 833 | if ((!TBidirectional) && (aADavgKeyVal >= 0) && (aDPavgKeyVal >= 0)) { 834 | aADpassfrac = aADavgKeyVal / (aADavgKeyVal + aDPavgKeyVal * 0.9); // interpolate 835 | aBDpassfrac = 1.0 - aADpassfrac; 836 | } 837 | double infogain = aADfail * log((1.0 - aADpassfrac) / (1.0 - aBDpassfrac)); 838 | if (TBidirectional) { 839 | infogain += aADpass * log(aADpassfrac / aBDpassfrac); 840 | } 841 | #ifdef TEST_dp4_to_pcFA 842 | printf("infogain = %f\n", infogain); 843 | #endif 844 | if (infogain <= n_nats) { 845 | return std::array {{ aADfail / aDPfail, nobiasFA }}; 846 | } else { 847 | return std::array {{ MAX(aADpass / aDPpass, (aADfail / aDPfail) * exp((n_nats - infogain) / pl_exponent)), nobiasFA }}; 848 | } 849 | } 850 | 851 | #ifdef TEST_dp4_to_pcFA 852 | int 853 | main(int argc, char **argv) { 854 | double adpass = atof(argv[1]); 855 | double adfail = atof(argv[2]); 856 | double dppass = atof(argv[3]); 857 | double dpfail = atof(argv[4]); 858 | double entropy = atof(argv[5]); 859 | double entrmax = atof(argv[6]); 860 | double ldist = atof(argv[7]); 861 | double rdist = atof(argv[8]); 862 | double pca = atof(argv[9]); 863 | double pcb = atof(argv[10]); 864 | 865 | const auto ret1 = dp4_to_pcFA(-1, adpass, adfail, dppass, dpfail, entropy, entrmax, ldist, rdist, pca, pcb); 866 | const auto ret2 = dp4_to_pcFA(-1, adpass, adfail, dppass, dpfail, entropy, entrmax, ldist, rdist, pca, pcb); 867 | printf("dp4_to_pcFA<(false AND true)>(%f, %f, %f, %f, %f, %f, %f, %f, %f, %f) = ({%f, %f} AND {%f, %f})\n", 868 | adpass, adfail, dppass, dpfail, entropy, entrmax, ldist, rdist, pca, pcb, ret1[0], ret1[1], ret2[0], ret2[1]); 869 | } 870 | 871 | #endif 872 | 873 | // conversion between different defintions in bioinformatics 874 | 875 | uvc1_qual_t 876 | 
char2phred(const char charvalue) { 877 | return charvalue - 33; 878 | } 879 | 880 | char 881 | phred2char(const uvc1_qual_t phredvalue) { 882 | return phredvalue + 33; 883 | } 884 | 885 | double 886 | phred2prob(const uvc1_qual_t phredvalue) { 887 | return pow(10, -((float)phredvalue) / 10); 888 | } 889 | 890 | uvc1_qual_t 891 | prob2phred(const double probvalue) { 892 | return floor(-10 * log(probvalue) / log(10)); 893 | } 894 | 895 | double 896 | prob2realphred(const double probvalue) { 897 | return -10 * log(probvalue) / log(10); 898 | } 899 | 900 | template 901 | void 902 | process_cigar(T1 & qpos, T2 & rpos, T3 cigar_op, T4 cigar_oplen) { 903 | if (cigar_op == BAM_CREF_SKIP) { 904 | rpos += cigar_oplen; 905 | } else if (cigar_op == BAM_CSOFT_CLIP) { 906 | qpos += cigar_oplen; 907 | } else if (cigar_op == BAM_CHARD_CLIP) { 908 | // pass 909 | } else if (cigar_op == BAM_CPAD) { 910 | // pass 911 | } else if (cigar_op == BAM_CBACK) { 912 | throw -1; 913 | } else { 914 | throw -2; 915 | } 916 | } 917 | 918 | // variant-call math functions 919 | 920 | #define NUM_BUCKETS 16 921 | 922 | template 923 | uvc1_qual_t 924 | proton_cigarlen2phred(T cigarlen) { 925 | uvc1_qual_t oplen2phred[12+1] = { 926 | 0, // generated by the python code: for i in range(1,12+1): print('(uvc_qual_t)round({}), //{}'.format(10/log(10)*log(i**3), i)) 927 | (uvc1_qual_t)round(0.0), //1 928 | (uvc1_qual_t)round(9.030899869919434), //2 929 | (uvc1_qual_t)round(14.313637641589871), //3 930 | (uvc1_qual_t)round(18.061799739838868), //4 931 | (uvc1_qual_t)round(20.969100130080562), //5 932 | (uvc1_qual_t)round(23.344537511509305), //6 933 | (uvc1_qual_t)round(25.352941200427697), //7 934 | (uvc1_qual_t)round(27.092699609758302), //8 935 | (uvc1_qual_t)round(28.627275283179742), //9 936 | (uvc1_qual_t)round(29.999999999999993), //10 937 | (uvc1_qual_t)round(31.241780554746747), //11 938 | (uvc1_qual_t)round(32.37543738142874), //12 939 | }; 940 | return oplen2phred[MIN(cigarlen, 12)]; 941 | } 942 
| 943 | template 944 | int 945 | infer_max_qual_assuming_independence( 946 | uvc1_qual_t & maxvqual, 947 | uvc1_readnum_t & argmaxAD, 948 | uvc1_qual_t & argmaxBQ, 949 | const uvc1_qual_t max_qual, 950 | const uvc1_qual_t dec_qual, 951 | const std::array & qual_distr, 952 | const uvc1_readnum_t totDP, 953 | const uvc1_hash_t specialflag IGNORE_UNUSED_PARAM) { 954 | 955 | uvc1_qual_t currvqual = 0; 956 | uvc1_readnum_t currAD = 0; 957 | maxvqual = 0; 958 | argmaxAD = 0; 959 | argmaxBQ = 0; 960 | for (uvc1_qual_t idx = 0; idx < MIN(NUM_BUCKETS, max_qual / dec_qual); idx++) { 961 | const auto currQD = qual_distr[idx]; 962 | if (0 == currQD) { continue; } 963 | currAD += currQD; 964 | auto currBQ = max_qual - (dec_qual * idx); 965 | double expBQ = 10.0 / log(10.0) * log(((double)totDP / (double)currAD) + DBL_EPSILON); 966 | currvqual = (uvc1_qual_t)(currAD * (currBQ - expBQ)); 967 | if (currvqual > maxvqual) { 968 | argmaxAD = currAD; 969 | argmaxBQ = currBQ; 970 | maxvqual = currvqual; 971 | } 972 | } 973 | return 0; 974 | } 975 | 976 | #endif 977 | 978 | -------------------------------------------------------------------------------- /grouping.cpp: -------------------------------------------------------------------------------- 1 | #include "grouping.hpp" 2 | #include "logging.hpp" 3 | #include "Hash.hpp" 4 | 5 | //#define MAX_NUM_REF_BASES (1000*1000) 6 | //#define MAX_NUM_READS (2000*1000) 7 | 8 | // at 150*16 average sequencing depth, the two below amount of bytes are approx equal to each other. 9 | #define NUM_BYTES_PER_REF_POS ((size_t)(1024*8)) // estimated 10 | #define NUM_BYTES_PER_READ ((size_t)(512)) // estimated 11 | 12 | #define UPDATE_MIN(a, b) ((a) = MIN((a), (b))); 13 | 14 | template 15 | inline 16 | size_t 17 | mathsquare_big(T x) { 18 | return ((size_t)x) * ((size_t)x); 19 | } 20 | 21 | // position of 5' is the starting position, but position of 3' is unreliable without mate info. 
22 | const uvc1_readpos_t ARRPOS_MARGIN = MAX_INSERT_SIZE; 23 | const uvc1_readpos_t ARRPOS_OUTER_RANGE = 10; 24 | const uvc1_readpos_t ARRPOS_INNER_RANGE = 3; 25 | 26 | // const RevComplement THE_REV_COMPLEMENT; 27 | 28 | bool 29 | check_if_is_over_mem_lim( 30 | const uvc1_readnum_big_t total_n_reads, 31 | const uvc1_readnum_big_t total_n_reads_x_reads, 32 | const uvc1_refgpos_big_t total_n_rposs, 33 | const uvc1_refgpos_big_t total_n_rposs_x_rposs, 34 | // const uvc1_refgpos_big_t total_n_regions, 35 | const size_t nthreads, 36 | const size_t mem_per_thread, 37 | const bool is_fastq_gen) { 38 | 39 | const size_t tmp_n_bytes_used_by_reads = INT64MUL(MIN(total_n_reads_x_reads / MAX(1, total_n_reads) * nthreads, (size_t)total_n_reads), NUM_BYTES_PER_READ); 40 | const size_t tmp_n_bytes_used_by_rposs = INT64MUL(MIN(total_n_rposs_x_rposs / MAX(1, total_n_rposs) * nthreads, (size_t)total_n_rposs) + (2 * MAX_STR_N_BASES * nthreads), NUM_BYTES_PER_REF_POS); 41 | const size_t vcf_n_bytes_used_by_rposs = INT64MUL(total_n_rposs, 1024); // estimate from the htslib specs of VCF 42 | const size_t fqs_n_bytes_used_by_reads = (is_fastq_gen ? 
(INT64MUL(total_n_reads, NUM_BYTES_PER_READ) / 4) : 0); // consensus and compression 43 | 44 | const size_t tot_n_bytes_used = tmp_n_bytes_used_by_reads + tmp_n_bytes_used_by_rposs + vcf_n_bytes_used_by_rposs + fqs_n_bytes_used_by_reads; 45 | return (tot_n_bytes_used > ((1024UL*1024UL) * mem_per_thread * nthreads)); 46 | } 47 | 48 | bool 49 | check_if_sub_is_over_mem_lim( 50 | const uvc1_readnum_big_t region_n_reads, 51 | // const uvc1_readnum_big_t total_n_reads_x_reads, 52 | const uvc1_refgpos_big_t region_n_rposs, 53 | // const uvc1_readnum_big_t total_n_rposs_x_rposs, 54 | size_t mem_per_thread, 55 | size_t curr_beg, 56 | size_t block_running_end) { 57 | 58 | const size_t tmp_n_bytes_used_by_reads = INT64MUL(region_n_reads, NUM_BYTES_PER_READ); 59 | const size_t tmp_n_bytes_used_by_rposs = INT64MUL(region_n_rposs, NUM_BYTES_PER_REF_POS + 1024); 60 | 61 | const size_t memfree = ((1024UL*1024UL) / NUM_WORKING_UNITS_PER_THREAD) * mem_per_thread; 62 | // more overlap -> more mem -> less likely to return true 63 | const size_t mem_by_read_overlap = memfree * MIN(non_neg_minus(block_running_end, curr_beg), 150) / (150); 64 | 65 | const size_t tot_n_bytes_used = tmp_n_bytes_used_by_reads + tmp_n_bytes_used_by_rposs; 66 | return (tot_n_bytes_used > memfree + mem_by_read_overlap); 67 | } 68 | 69 | int 70 | SamIter::target_region_to_contigs( 71 | std::vector & bedlines, 72 | const std::string & tier1_target_region, 73 | const bam_hdr_t *bam_hdr) { 74 | std::map tname_to_tid; 75 | for (uvc1_refgpos_t i = 0; i < bam_hdr->n_targets; i++) { 76 | tname_to_tid[bam_hdr->target_name[i]] = i; 77 | } 78 | std::string region; 79 | std::istringstream regionstream(tier1_target_region); 80 | while (getline(regionstream, region, ',')) { 81 | char *tname = (char*)malloc(region.size() + 1); 82 | uint64_t tbeg1, tend1; 83 | int n_tokens = sscanf(region.c_str(), "%[^:]:%lu-%lu", tname, &tbeg1, &tend1); 84 | if (n_tokens < 3) { 85 | n_tokens = sscanf(region.c_str(), "%[^:]:%lu", tname, 
&tbeg1); 86 | tend1 = tbeg1 + 1; 87 | } 88 | if (n_tokens < 2) { 89 | LOG(logERROR) << "The region " << region << " is neither in the format TEMPLATE:START-END nor in the format TEMPLATE:POS " 90 | << "(template usually denotes chromosome). "; 91 | exit(16); 92 | } else { 93 | uvc1_refgpos_t tbeg = (uvc1_refgpos_t)tbeg1; 94 | uvc1_refgpos_t tend = (uvc1_refgpos_t)tend1; 95 | uvc1_flag_t bedline_flag = 0x0; 96 | uvc1_readnum_big_t nreads = ((-1 == bed_in_avg_sequencing_DP) ? 0 : (bed_in_avg_sequencing_DP * (tend - tbeg) + 1)); 97 | if (tname_to_tid.find(tname) == tname_to_tid.end()) { 98 | LOG(logERROR) << "The template name " << region << " is not found in the input BAM header (template usually denotes chromosome). "; 99 | exit(17); 100 | } else { 101 | bedlines.push_back(BedLine(tname_to_tid[tname], tbeg, tend, bedline_flag, nreads)); 102 | } 103 | } 104 | free(tname); 105 | } 106 | return 0; 107 | } 108 | 109 | int 110 | SamIter::bed_fname_to_contigs( 111 | std::vector & bedlines, 112 | const std::string & bed_fname, 113 | const bam_hdr_t *bam_hdr) { 114 | 115 | std::map tname_to_tid; 116 | for (uvc1_refgpos_t i = 0; i < bam_hdr->n_targets; i++) { 117 | tname_to_tid[bam_hdr->target_name[i]] = i; 118 | } 119 | std::ifstream bedfile(bed_fname); 120 | while (bedfile.good()) { 121 | std::string line; 122 | getline(bedfile, line); 123 | if (line.empty() || line[0] == '#') { 124 | continue; 125 | } 126 | std::istringstream linestream(line); 127 | std::string tname; 128 | uvc1_refgpos_t tbeg; 129 | uvc1_refgpos_t tend; 130 | linestream >> tname; 131 | linestream >> tbeg; 132 | linestream >> tend; 133 | if (!(tbeg < tend)) { 134 | std::cerr << "The bedfile " << bed_fname << " does not have its end after its start at: " << tname << "\t" << tbeg << "\t" << tend; 135 | exit (16); 136 | } 137 | if (tname_to_tid.find(tname) == tname_to_tid.end()) { 138 | std::cerr << "The reference template name " << tname << " from the bedfile " << bed_fname << " is not in the input sam 
file"; 139 | exit (17); 140 | } 141 | uvc1_flag_t bedline_flag = 0x0; 142 | std::string token; 143 | uvc1_readnum_t nreads = ((-1 == bed_in_avg_sequencing_DP) ? 0 : (bed_in_avg_sequencing_DP * (tend - tbeg) + 1)); 144 | while (linestream.good()) { 145 | linestream >> token; 146 | if (token == ("BedLineFlag")) { 147 | linestream >> bedline_flag; 148 | } else if (token == "NumberOfReadsInThisInterval") { 149 | linestream >> nreads; 150 | } 151 | } 152 | bedlines.push_back(BedLine(tname_to_tid[tname], tbeg, tend, bedline_flag, nreads)); 153 | } 154 | return 0; 155 | } 156 | 157 | int64_t 158 | SamIter::iternext( 159 | uvc1_flag_t & iter_ret_flag, 160 | std::vector & bedlines, 161 | const uvc1_flag_t specialflag IGNORE_UNUSED_PARAM) { 162 | iter_ret_flag = 0; 163 | // uvc1_readnum_t total_n_regions = 0; // may be useful for some purposes? 164 | uvc1_readnum_big_t total_n_reads = 0; 165 | uvc1_refgpos_big_t total_n_rposs = 0; 166 | uvc1_readnum_big_t total_n_reads_x_reads = 0; 167 | uvc1_refgpos_big_t total_n_rposs_x_rposs = 0; 168 | if (this->_bedlines.size() > 0) { 169 | for (; this->_bedregion_idx < this->_bedlines.size(); this->_bedregion_idx++) { 170 | const auto & bedline = (this->_bedlines[this->_bedregion_idx]); 171 | bedlines.push_back(bedline); 172 | const auto bed_tid = bedline.tid; // std::get<0>(bedreg); 173 | const auto bed_beg = bedline.beg_pos; 174 | const auto bed_end = bedline.end_pos; 175 | int64_t region_n_reads = INT64MUL(bed_in_avg_sequencing_DP, (bed_end - bed_beg)); // Please note that left-over reads from the previoous iteration are ignored 176 | if (bed_in_avg_sequencing_DP_n_from_t) { 177 | region_n_reads = bedline.n_reads; // Let normal_use_tumor_num_of_reads 178 | } else if (-1 == bed_in_avg_sequencing_DP) { 179 | hts_itr_t *hts_itr = sam_itr_queryi(this->sam_idx, bed_tid, bed_beg, bed_end); 180 | if (NULL == hts_itr) { 181 | LOG(logERROR) << "Error when fetching region tid=" << bed_tid << ":" << bed_beg << "-" << bed_end << ", aborting now. 
"; 182 | exit(18); 183 | } 184 | region_n_reads = 0; 185 | while ( (NULL == sam_idx && (sam_read1(this->sam_infile, this->samheader, alnrecord) >= 0)) 186 | || (NULL != sam_idx && (sam_itr_next(this->sam_infile, hts_itr, alnrecord) >= 0))) { 187 | if ((bed_tid == alnrecord->core.tid) && 188 | ARE_INTERVALS_OVERLAPPING(bed_beg, bed_end, alnrecord->core.pos, bam_endpos(alnrecord))) { 189 | region_n_reads++; 190 | } else if ((bed_tid < alnrecord->core.tid) || ((bed_tid == alnrecord->core.tid) && (bed_end <= alnrecord->core.pos))) { 191 | break; 192 | } 193 | } 194 | sam_itr_destroy(hts_itr); 195 | } 196 | uvc1_refgpos_big_t region_n_rposs = bed_end - bed_beg; // region-n-ref-positions 197 | // total_n_regions++; 198 | total_n_reads += region_n_reads; 199 | total_n_rposs += region_n_rposs; 200 | total_n_reads_x_reads += mathsquare_big(region_n_reads); 201 | total_n_rposs_x_rposs += mathsquare_big(region_n_rposs); 202 | const bool is_over_mem_lim = check_if_is_over_mem_lim( 203 | total_n_reads, total_n_reads_x_reads, 204 | total_n_rposs, total_n_rposs_x_rposs, 205 | // total_n_regions, 206 | this->nthreads, this->mem_per_thread, 207 | this->is_fastq_gen); 208 | if (is_over_mem_lim) { 209 | this->_bedregion_idx++; 210 | return total_n_reads; 211 | } 212 | } 213 | } else { 214 | 215 | uvc1_refgpos_t block_tid = this->last_it_tid; 216 | uvc1_refgpos_t block_beg = this->last_it_beg; 217 | uvc1_refgpos_t block_running_end = this->last_it_end; 218 | 219 | uvc1_readnum_big_t region_n_reads = 0; 220 | uvc1_refgpos_big_t region_n_ref_positions = 0; 221 | uvc1_refgpos_big_t region_n_ref_positions_add = 0; 222 | 223 | int sam_read_ret = -1; 224 | do { 225 | sam_read_ret = ((NULL != sam_idx) ? 
(sam_itr_next(this->sam_infile, this->sam_itr, alnrecord)) 226 | : (sam_read1(this->sam_infile, this->samheader, alnrecord))); 227 | if ((sam_read_ret < -1)) { 228 | LOG(logWARNING) << "Encountered error while iterating over the first BAM record in the file " << this->input_bam_fname << " error code is " << sam_read_ret; 229 | break; 230 | } 231 | if (BAM_FUNMAP & alnrecord->core.flag) { continue; } 232 | NORM_INSERT_SIZE(alnrecord); 233 | const auto curr_tid = alnrecord->core.tid; 234 | const auto curr_beg = alnrecord->core.pos; 235 | const auto curr_end = bam_endpos(alnrecord); 236 | 237 | const bool is_sub_mem_over_lim = check_if_sub_is_over_mem_lim( 238 | region_n_reads, // region_n_reads_x_reads, 239 | region_n_ref_positions + region_n_ref_positions_add, // region_n_rposs_x_rposs, 240 | this->mem_per_thread, curr_beg, block_running_end); 241 | const bool is_template_changed = (curr_tid != block_tid); 242 | // is_very_far_jumped results in a lot of wasted mem-alloc and computation, so it is not used 243 | //const bool is_very_far_jumped = ((curr_tid == block_tid) && (block_running_end + MAX_INSERT_SIZE < curr_beg)); 244 | const bool is_far_jumped = ((curr_tid == block_tid) && (block_running_end + (MAX_STR_N_BASES * 2) < curr_beg)); 245 | 246 | if (0 == (total_n_reads % (1024*1024))) { 247 | LOG(logDEBUG4) << "ReadName=" << bam_get_qname(alnrecord) 248 | << " TID=" << (alnrecord->core.tid) 249 | << " POS=" << (alnrecord->core.pos) 250 | << " is_template_changed=" << is_template_changed 251 | << " is_far_jumped=" << is_far_jumped 252 | << " is_sub_mem_over_lim=" << is_sub_mem_over_lim 253 | << " sam_read_ret=" << sam_read_ret 254 | << " total_n_reads=" << total_n_reads 255 | << " approx total_n_ref_bases=" << (block_running_end - block_beg); 256 | } 257 | uvc1_flag_t region_flag = (!!is_template_changed) * 16 + (!!is_far_jumped) * 8 + (!!is_sub_mem_over_lim) * 4 + (!!(-1 == sam_read_ret)) * 2; // 0x1 bit is reserved for END_TO_END 258 | if (region_flag) { 259 | 
// flush to output due to ref-genome segmentation 260 | const bool is_1st_read = (-1 == block_tid); 261 | const int64_t div = 1; // Please note that MGVCF_REGION_MAX_SIZE will be used later instead of here, so div is set to one here. 262 | int64_t block_norm_end = MIN((((block_running_end + div - 1) / div) * div), (uvc1_refgpos_t)(is_1st_read ? INT_MAX : this->samheader->target_len[block_tid])); 263 | 264 | const bool is_block_zero_sized = (block_beg >= block_norm_end); 265 | if ((!is_1st_read) && (!is_block_zero_sized)) { 266 | 267 | bedlines.push_back(BedLine(block_tid, block_beg, block_norm_end, region_flag, region_n_reads)); 268 | LOG(logDEBUG4) << "The BED line tid=" << block_tid << ":" << block_beg << "-" << block_norm_end 269 | << " flag=" << region_flag << " num_reads=" << (int)region_n_reads << " is STORED, reason=" 270 | << is_1st_read << is_block_zero_sized; 271 | uvc1_refgpos_big_t region_s_rposs = region_n_ref_positions + region_n_ref_positions_add; 272 | // total_n_regions++; 273 | total_n_reads += region_n_reads; 274 | total_n_rposs += region_s_rposs; 275 | total_n_reads_x_reads += mathsquare_big(region_n_reads); 276 | total_n_rposs_x_rposs += mathsquare_big(region_s_rposs); 277 | region_n_ref_positions = 0; 278 | region_n_ref_positions_add = 0; 279 | region_n_reads = 0; 280 | } else { 281 | LOG(logDEBUG4) << "The BED line tid=" << block_tid << ":" << block_beg << "-" << block_norm_end 282 | << " flag=" << region_flag << " num_reads=" << (int)region_n_reads << " is NOT-STORED, reason=" 283 | << is_1st_read << is_block_zero_sized; 284 | } 285 | block_tid = curr_tid; 286 | const auto new_block_beg = MAX(block_beg, (curr_beg / div) * div); // skip over non-covered bases 287 | block_beg = (is_template_changed ? 
curr_beg : MAX(new_block_beg, block_norm_end)); 288 | const bool is_over_mem_lim = check_if_is_over_mem_lim( 289 | total_n_reads, total_n_reads_x_reads, 290 | total_n_rposs, total_n_rposs_x_rposs, 291 | // total_n_regions, 292 | this->nthreads, this->mem_per_thread, 293 | this->is_fastq_gen); 294 | if (is_over_mem_lim) { 295 | this->last_it_tid = block_tid; 296 | this->last_it_beg = block_beg; 297 | this->last_it_end = MAX(block_beg, block_norm_end); 298 | return (total_n_reads); 299 | } 300 | } 301 | if (is_template_changed) { 302 | block_beg = curr_beg; // only rarely needed in some situations? 303 | block_running_end = curr_end; 304 | region_n_ref_positions_add += region_n_ref_positions; 305 | } else { 306 | block_running_end = MAX(block_running_end, curr_end); 307 | } 308 | region_n_reads++; 309 | region_n_ref_positions = block_running_end - block_beg; 310 | } while (sam_read_ret >= 0); 311 | } 312 | iter_ret_flag |= 0x1; 313 | return total_n_reads; 314 | } 315 | 316 | int 317 | samfname_to_tid_to_tname_tseq_tup_vec( 318 | std::vector> & tid_to_tname_tseqlen_tuple_vec, 319 | const std::string & bam_input_fname) { 320 | 321 | tid_to_tname_tseqlen_tuple_vec.clear(); 322 | samFile *sam_infile = sam_open(bam_input_fname.c_str(), "r"); 323 | bam_hdr_t * samheader = sam_hdr_read(sam_infile); 324 | tid_to_tname_tseqlen_tuple_vec.reserve(samheader->n_targets); 325 | for (uvc1_refgpos_t tid = 0; tid < UNSIGN2SIGN(samheader->n_targets); tid++) { 326 | tid_to_tname_tseqlen_tuple_vec.push_back(std::make_tuple(std::string(samheader->target_name[tid]), samheader->target_len[tid])); 327 | } 328 | bam_hdr_destroy(samheader); 329 | sam_close(sam_infile); 330 | return 0; 331 | } 332 | 333 | enum FilterReason { 334 | NOT_FILTERED, 335 | NOT_MAPPED, 336 | NOT_PRIMARY_ALN, 337 | LOW_MAPQ, 338 | LOW_ALN_LEN, 339 | LOW_ISIZE, 340 | HIGH_ISIZE, 341 | ZERO_ISIZE, 342 | OUT_OF_RANGE, 343 | NOT_END_TO_END, 344 | NUM_FILTER_REASONS 345 | }; 346 | 347 | template 348 | enum FilterReason 349 
| fill_isrc_isr2_beg_end_with_aln(bool & isrc, bool & isr2, uvc1_refgpos_t & tBeg, uvc1_refgpos_t & tEnd, T &num_seqs, 350 | const bam1_t *aln, const uvc1_refgpos_t fetch_tbeg, const uvc1_refgpos_t fetch_tend, 351 | const uvc1_qual_t min_mapqual, 352 | const uvc1_readpos_t min_aln_len, 353 | const uvc1_readpos_t min_isize, 354 | const uvc1_readpos_t max_isize, 355 | const bool is_zero_isize_discarded, 356 | const uvc1_flag_t region_flag, const bool is_pair_end_merge_enabled) { 357 | num_seqs = 0; 358 | if (aln->core.flag & 0x4) { 359 | return NOT_MAPPED; 360 | } 361 | if ((aln->core.flag & 0x900) != 0) { 362 | return NOT_PRIMARY_ALN; 363 | } 364 | 365 | if (aln->core.qual < min_mapqual) { 366 | return LOW_MAPQ; 367 | } 368 | if (SIGN2UNSIGN(bam_endpos(aln) - aln->core.pos) < min_aln_len) { 369 | return LOW_ALN_LEN; 370 | } 371 | if (0 == (aln->core.isize)) { 372 | if (is_zero_isize_discarded) { 373 | return ZERO_ISIZE; 374 | } 375 | } else { 376 | if (abs(aln->core.isize) < min_isize) { 377 | return LOW_ISIZE; 378 | } 379 | if (abs(aln->core.isize) > max_isize) { 380 | return HIGH_ISIZE; 381 | } 382 | } 383 | 384 | isrc = ((aln->core.flag & 0x10) == 0x10); 385 | isr2 = ((aln->core.flag & 0x80) == 0x80 && (aln->core.flag & 0x1) == 0x1); 386 | if (!is_pair_end_merge_enabled) { isr2 = false; } 387 | const auto begpos = aln->core.pos; 388 | const auto endpos = bam_endpos(aln) - 1; 389 | if ((!is_pair_end_merge_enabled) 390 | || ((aln->core.flag & 0x1) == 0) 391 | // || ((aln->core.flag & 0x2) == 0) // having this line causes problems to SRR2556939_chr3_178936090_178936092 392 | || (aln->core.flag & 0x8) 393 | || (0 == aln->core.isize) 394 | || ((abs(aln->core.isize)) >= (ARRPOS_MARGIN))) { 395 | tBeg = (isrc ? endpos : begpos); 396 | tEnd = (isrc ? 
begpos : endpos); 397 | num_seqs = 1; 398 | } else { 399 | auto tBegP1 = MIN(begpos, SIGN2UNSIGN(aln->core.mpos)); 400 | auto tEndP1 = tBegP1 + abs(aln->core.isize) - 1; 401 | bool strand = bam_get_strand(aln); // (isrc ^ isr2); 402 | tBeg = (strand ? tEndP1 : tBegP1); 403 | tEnd = (strand ? tBegP1 : tEndP1); 404 | num_seqs = 2; 405 | } 406 | auto tOrdBeg = MIN(tBeg, tEnd); 407 | auto tOrdEnd = MAX(tBeg, tEnd); 408 | if (tOrdBeg + (ARRPOS_MARGIN - ARRPOS_OUTER_RANGE) <= fetch_tbeg || fetch_tend - 1 + (ARRPOS_MARGIN - ARRPOS_OUTER_RANGE) <= tOrdEnd) { 409 | return OUT_OF_RANGE; 410 | } 411 | if ((region_flag & BED_END_TO_END_BIT) && !(tOrdBeg <= fetch_tbeg && tOrdEnd >= fetch_tend)) { 412 | return NOT_END_TO_END; 413 | } 414 | return NOT_FILTERED; 415 | } 416 | 417 | uvc1_unsigned_int_t 418 | unsigned_diff(uvc1_unsigned_int_t a, uvc1_unsigned_int_t b) { 419 | return (a > b ? a - b : b - a); 420 | } 421 | 422 | int 423 | poscounter_to_pos2pcenter( 424 | std::vector & pos_to_center_pos, 425 | const std::vector & pos_to_count, 426 | const double dedup_center_mult) { 427 | 428 | for (uvc1_refgpos_t locov_pos = ARRPOS_INNER_RANGE; locov_pos < UNSIGN2SIGN(pos_to_count.size()) - ARRPOS_INNER_RANGE; locov_pos++) { 429 | auto locov_count = pos_to_count[locov_pos]; 430 | pos_to_center_pos[locov_pos] = locov_pos; 431 | auto max_count = locov_count; 432 | // check if inner_pos is attracted by outer position 433 | for (auto hicov_pos = locov_pos - ARRPOS_INNER_RANGE; hicov_pos < locov_pos + ARRPOS_INNER_RANGE + 1; hicov_pos++) { 434 | auto hicov_count = pos_to_count[hicov_pos]; 435 | if ((hicov_count > max_count) && ((hicov_count + 1) > (locov_count + 1) * pow(dedup_center_mult, unsigned_diff(locov_pos, hicov_pos)))) { 436 | pos_to_center_pos[locov_pos] = hicov_pos; 437 | max_count = hicov_count; 438 | } 439 | } 440 | } 441 | return 0; 442 | } 443 | 444 | int 445 | clean_fill_strand_umi_readset( 446 | std::vector>, 2>> &umi_strand_readset) { 447 | for (auto & strand_readset : 
umi_strand_readset) { 448 | for (int strand = 0; strand < 2; strand++) { 449 | for (auto & read : strand_readset[strand]) { 450 | for (bam1_t *aln : read) { 451 | bam_destroy1(aln); 452 | } 453 | } 454 | } 455 | } 456 | return 0; 457 | } 458 | 459 | int 460 | apply_bq_err_correction3(bam1_t *aln, const uvc1_qual_t assay_sequencing_BQ_max, const uvc1_qual_t assay_sequencing_BQ_inc) { 461 | if ((0 == aln->core.l_qseq) || (aln->core.flag & 0x4)) { return -1; } 462 | 463 | for (uvc1_readpos_t i = 0; i < aln->core.l_qseq; i++) { 464 | uvc1_qual_t bq = bam_get_qual(aln)[i]; 465 | bam_get_qual(aln)[i] = MIN(bq + assay_sequencing_BQ_inc, assay_sequencing_BQ_max); 466 | } 467 | 468 | const auto cigar = bam_get_cigar(aln); 469 | const int isrc = ((aln->core.flag & 0x10) ? 1 : 0); 470 | uvc1_readpos_t inclu_beg_poss[2] = {0, aln->core.l_qseq - 1}; 471 | uvc1_readpos_t exclu_end_poss[2] = {aln->core.l_qseq, 0 - 1}; 472 | uvc1_readpos_t end_clip_len = 0; 473 | if (aln->core.n_cigar > 0) { 474 | auto cigar_1elem = cigar[0]; 475 | if (bam_cigar_op(cigar_1elem) == BAM_CSOFT_CLIP) { 476 | if (0 == isrc) { 477 | inclu_beg_poss[0] += bam_cigar_oplen(cigar_1elem); 478 | } else { 479 | exclu_end_poss[1] += bam_cigar_oplen(cigar_1elem); 480 | end_clip_len = bam_cigar_oplen(cigar_1elem); 481 | } 482 | } 483 | cigar_1elem = cigar[aln->core.n_cigar-1]; 484 | if (bam_cigar_op(cigar_1elem) == BAM_CSOFT_CLIP) { 485 | if (1 == isrc) { 486 | inclu_beg_poss[1] -= bam_cigar_oplen(cigar_1elem); 487 | } else { 488 | exclu_end_poss[0] -= bam_cigar_oplen(cigar_1elem); 489 | end_clip_len = bam_cigar_oplen(cigar_1elem); 490 | } 491 | } 492 | } 493 | 494 | const uvc1_refgpos_t pos_incs[2] = {1, -1}; 495 | { 496 | uint8_t prev_b = 0; 497 | uvc1_unsigned_int_t distinct_cnt = 0; 498 | int termpos = exclu_end_poss[isrc] - pos_incs[isrc]; 499 | for (; termpos != inclu_beg_poss[isrc] - pos_incs[isrc]; termpos -= pos_incs[isrc]) { 500 | uint8_t b = bam_seqi(bam_get_seq(aln), termpos); 501 | auto q = 
bam_get_qual(aln)[termpos]; 502 | if (b != prev_b && q >= 20) { 503 | prev_b = b; 504 | distinct_cnt += 1; 505 | if (2 == distinct_cnt) { break; } 506 | } 507 | } 508 | uvc1_readpos_t homopol_tracklen = abs(termpos - (exclu_end_poss[isrc] - pos_incs[isrc])); 509 | uvc1_qual_t tail_penal = (end_clip_len >= 20 ? 1 : 0) 510 | + (homopol_tracklen >= 15 ? 2 : (homopol_tracklen >= 10 ? 1 : 0)); 511 | if (tail_penal > 0) { 512 | const bool is_in_log_reg = (aln->core.tid == 0 && aln->core.pos < 9509431 && aln->core.pos > 9509400); 513 | if (is_in_log_reg) { 514 | LOG(logINFO) << "tail_penal = " << tail_penal << " for read " << bam_get_qname(aln); 515 | } 516 | for (uvc1_refgpos_t pos = exclu_end_poss[isrc] - pos_incs[isrc]; pos != (inclu_beg_poss[isrc] - pos_incs[isrc]) && pos != termpos; pos -= pos_incs[isrc]) { 517 | const uvc1_qual_t q = bam_get_qual(aln)[pos]; 518 | bam_get_qual(aln)[pos] = MAX(bam_get_qual(aln)[pos], tail_penal + 1) - tail_penal; 519 | if (is_in_log_reg) { 520 | LOG(logINFO) << "\tQuality adjustment at pos " << pos << " : " << q << " -> " << (int)bam_get_qual(aln)[pos]; 521 | } 522 | } 523 | } 524 | } 525 | { 526 | uvc1_refgpos_t homopol_len = 0; 527 | uint8_t prev_b = 0; 528 | // https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-451 529 | for (uvc1_refgpos_t pos = inclu_beg_poss[isrc]; pos != exclu_end_poss[isrc]; pos += pos_incs[isrc]) { 530 | const uint8_t b = bam_seqi(bam_get_seq(aln), pos); 531 | if (b == prev_b) { 532 | homopol_len++; 533 | if (homopol_len >= 4 && b == seq_nt16_table['G']) { 534 | bam_get_qual(aln)[pos] = MAX(bam_get_qual(aln)[pos], 1 + 1) - 1; 535 | } 536 | } else { 537 | prev_b = b; 538 | homopol_len = 1; 539 | } 540 | } 541 | } 542 | return 0; 543 | } 544 | 545 | int 546 | fill_strand_umi_readset_with_strand_to_umi_to_reads( 547 | std::vector>, 2>, MolecularBarcode>> &umi_strand_readset, 548 | std::map>, 2>, MolecularBarcode>> &umi_to_strand_to_reads, 549 | const CommandLineArgs & paramset, 550 | const 
uvc1_flag_t specialflag IGNORE_UNUSED_PARAM) { 551 | for (auto & umi_to_strand_to_reads_element : umi_to_strand_to_reads) { 552 | const auto strand_to_reads = umi_to_strand_to_reads_element.second.first; 553 | const auto dflag = umi_to_strand_to_reads_element.second.second; 554 | umi_strand_readset.push_back(std::make_pair(std::array>, 2>(), dflag)); 555 | for (int strand = 0; strand < 2; strand++) { 556 | for (auto read : strand_to_reads[strand]) { 557 | const std::vector alns = read.second; 558 | umi_strand_readset.back().first[strand].push_back(std::vector()); 559 | for (auto aln : alns) { 560 | apply_bq_err_correction3(aln, paramset.assay_sequencing_BQ_max, paramset.assay_sequencing_BQ_inc); 561 | umi_strand_readset.back().first[strand].back().push_back(aln); 562 | } 563 | } 564 | } 565 | } 566 | return 0; 567 | }; 568 | 569 | template 570 | uvc1_hash_t 571 | bam2umihash(int & is_umi_found, const bam1_t *aln, const std::vector & UMI_STRUCT, const int max_begin_diff_umi2read = 5) { 572 | LOG(logDEBUGx1) << "Going over " << UMI_STRUCT.size() << " bases in the pattern"; 573 | 574 | auto *bamseq = bam_get_seq(aln); 575 | 576 | for (int i = 0; i < max_begin_diff_umi2read; i++) { 577 | size_t patpos = 0; 578 | uvc1_hash_t umihash = 0; 579 | for (int j = i; j < aln->core.l_qseq && patpos < UMI_STRUCT.size(); j++) { 580 | char int4base; 581 | if (is_rc) { 582 | char int4base2 = bam_seqi(bamseq, aln->core.l_qseq - 1 - j); 583 | int4base = STATIC_REV_COMPLEMENT.table16[(int8_t)int4base2]; 584 | } else { 585 | int4base = bam_seqi(bamseq, j); 586 | } 587 | if (UMI_STRUCT[patpos] == int4base || 15 == UMI_STRUCT[patpos]) { 588 | if (0xF == UMI_STRUCT[patpos]) { 589 | umihash = umihash * 16 + int4base; 590 | } 591 | patpos++; 592 | } else { 593 | LOG(logDEBUGx1) << "Misma at query position " << j << " (" << (int)int4base << ") and pattern position " << patpos << " (" << (int)UMI_STRUCT[patpos] << ") for read " << bam_get_qname(aln); 594 | break; 595 | } 596 | } 597 | if 
(UMI_STRUCT.size() == patpos) { 598 | is_umi_found++; 599 | LOG(logDEBUGx1) << "UMI-is-found: " << patpos << " / " << UMI_STRUCT.size() << " with flag " << is_umi_found << " and hash value " << umihash; 600 | return umihash; 601 | } else { 602 | LOG(logDEBUGx1) << "Fraction of bases in UMI that are found: " << patpos << " / " << UMI_STRUCT.size() << " "; 603 | } 604 | } 605 | return 0; 606 | }; 607 | 608 | std::array 609 | bamfname_to_strand_to_familyuid_to_reads( 610 | std::map>, 2>, MolecularBarcode>> &umi_to_strand_to_reads, 611 | uvc1_refgpos_t & extended_inclu_beg_pos, 612 | uvc1_refgpos_t & extended_exclu_end_pos, 613 | uvc1_refgpos_t tid, 614 | uvc1_refgpos_t fetch_tbeg, 615 | uvc1_refgpos_t fetch_tend, 616 | bool end2end, 617 | size_t regionbatch_ordinal, 618 | size_t regionbatch_tot_num, 619 | const std::string UMI_STRUCT_STRING, 620 | samFile *sam_infile, 621 | const hts_idx_t * hts_idx, 622 | size_t thread_id, 623 | const CommandLineArgs & paramset, 624 | const uvc1_flag_t specialflag IGNORE_UNUSED_PARAM) { 625 | assertUVC (fetch_tend > fetch_tbeg); 626 | 627 | const bool is_pair_end_merge_enabled = (PAIR_END_MERGE_NO != paramset.pair_end_merge); 628 | 629 | const bool should_log = (ispowerof2(regionbatch_ordinal+1) || ispowerof2(regionbatch_tot_num - regionbatch_ordinal)); 630 | std::vector umi_struct_string16; 631 | for (auto ch : UMI_STRUCT_STRING) { 632 | umi_struct_string16.push_back(seq_nt16_table[(int8_t)ch]); 633 | } 634 | for (auto base : umi_struct_string16) { 635 | LOG(logDEBUGx1) << "Base " << (int)base; 636 | } 637 | extended_inclu_beg_pos = INT32_MAX; 638 | extended_exclu_end_pos = 0; 639 | 640 | uvc1_readnum_big_t pcrpassed, umi_pcrpassed; 641 | pcrpassed = umi_pcrpassed = 0; 642 | 643 | // samFile *sam_infile = sam_open(paramset.bam_input_fname.c_str(), "r"); 644 | if (should_log) { 645 | LOG(logINFO) << "Thread " << thread_id << " started dedupping the chunk tid" << tid << ":" << fetch_tbeg << "-" << fetch_tend 646 | << " (region no " << 
regionbatch_ordinal << "/" << regionbatch_tot_num << " in this batch)"; 647 | } 648 | uvc1_refgpos_t fetch_size = fetch_tend - fetch_tbeg + (ARRPOS_MARGIN + ARRPOS_OUTER_RANGE) * 2; 649 | 650 | std::vector inicount(fetch_size, 0); 651 | std::array, 4> isrc_isr2_to_beg_count = {{ inicount, inicount, inicount, inicount }}; 652 | std::array, 4> isrc_isr2_to_end_count = {{ inicount, inicount, inicount, inicount }}; 653 | std::vector inicount64(fetch_size + 1, 0); 654 | std::array, 4> isrc_isr2_to_border_count_prefixsum = {{ inicount64, inicount64, inicount64, inicount64 }};; 655 | 656 | hts_itr_t * hts_itr; 657 | bam1_t *aln = bam_init1(); 658 | 659 | std::set visited_qnames; 660 | uvc1_readnum_big_t num_iter1_passed_alns = 0; 661 | // Although the following line can speed up things, it may result in different output depending on tid:fetch_tbeg-fetch_tend 662 | // hts_itr = sam_itr_queryi(hts_idx, tid, fetch_tbeg, fetch_tend); 663 | // Hence, the following line is used instead 664 | hts_itr = sam_itr_queryi(hts_idx, tid, non_neg_minus(fetch_tbeg, MAX_INSERT_SIZE), (fetch_tend + MAX_INSERT_SIZE)); 665 | 666 | while (sam_itr_next(sam_infile, hts_itr, aln) >= 0) { 667 | //for (const bam1_t *aln : bam_list){ 668 | bool isrc = false; 669 | bool isr2 = false; 670 | uvc1_refgpos_t tBeg = 0; 671 | uvc1_refgpos_t tEnd = 0; 672 | uvc1_unsigned_int_t num_seqs = 0; 673 | NORM_INSERT_SIZE(aln); // may be too early here? 
674 | FilterReason filterReason = fill_isrc_isr2_beg_end_with_aln(isrc, isr2, tBeg, tEnd, num_seqs, 675 | aln, fetch_tbeg, fetch_tend, 676 | paramset.kept_aln_min_aln_len, 677 | paramset.kept_aln_min_mapqual, 678 | paramset.kept_aln_min_isize, 679 | paramset.kept_aln_max_isize, 680 | paramset.kept_aln_is_zero_isize_discarded, 681 | end2end, is_pair_end_merge_enabled); 682 | if (!is_pair_end_merge_enabled) { assertUVC(!isr2); } 683 | 684 | if (NOT_FILTERED == filterReason) { 685 | uvc1_refgpos_t begidx = tBeg + ARRPOS_MARGIN - fetch_tbeg; 686 | uvc1_refgpos_t endidx = tEnd + ARRPOS_MARGIN - fetch_tbeg; 687 | if (begidx >= 0 && ((size_t)begidx) < isrc_isr2_to_beg_count[isrc * 2 + isr2].size()) { isrc_isr2_to_beg_count[isrc * 2 + isr2][begidx] += 1; } 688 | if (endidx >= 0 && ((size_t)endidx) < isrc_isr2_to_end_count[isrc * 2 + isr2].size()) { isrc_isr2_to_end_count[isrc * 2 + isr2][endidx] += 1; } 689 | 690 | if (ARE_INTERVALS_OVERLAPPING(MIN(tBeg, tEnd), MAX(tBeg, tEnd) + 2, fetch_tbeg, fetch_tend)) { 691 | visited_qnames.insert(bam_get_qname(aln)); 692 | } 693 | num_iter1_passed_alns++; 694 | } 695 | } 696 | sam_itr_destroy(hts_itr); 697 | 698 | for (size_t isrc_isr2 = 0; isrc_isr2 < 4; isrc_isr2++) { 699 | uvc1_readnum_big_t beg_prefixsum = 0; 700 | uvc1_readnum_big_t end_prefixsum = 0; 701 | isrc_isr2_to_border_count_prefixsum[isrc_isr2][0] = (beg_prefixsum + end_prefixsum); 702 | for (size_t i = 0; i < isrc_isr2_to_border_count_prefixsum[isrc_isr2].size() - 1; i++) { 703 | beg_prefixsum += isrc_isr2_to_beg_count[isrc_isr2][i]; 704 | end_prefixsum += isrc_isr2_to_end_count[isrc_isr2][i]; 705 | isrc_isr2_to_border_count_prefixsum[isrc_isr2][i+1] = (beg_prefixsum + end_prefixsum); 706 | } 707 | } 708 | std::array, 4> isrc_isr2_to_beg2bcenter = {{ inicount, inicount, inicount, inicount }}; 709 | for (size_t isrc_isr2 = 0; isrc_isr2 < 4; isrc_isr2++) { 710 | auto beg_to_count = isrc_isr2_to_beg_count[isrc_isr2]; 711 | 
poscounter_to_pos2pcenter(isrc_isr2_to_beg2bcenter[isrc_isr2], beg_to_count, paramset.dedup_center_mult); 712 | } 713 | std::array, 4> isrc_isr2_to_end2ecenter = {{ inicount, inicount, inicount, inicount }}; 714 | for (size_t isrc_isr2 = 0; isrc_isr2 < 4; isrc_isr2++) { 715 | auto end_to_count = isrc_isr2_to_end_count[isrc_isr2]; 716 | poscounter_to_pos2pcenter(isrc_isr2_to_end2ecenter[isrc_isr2], end_to_count, paramset.dedup_center_mult); 717 | } 718 | 719 | uvc1_readnum_t beg_peak_max = 0; 720 | for (auto beg_count : isrc_isr2_to_beg_count) { 721 | for (auto countval : beg_count) { 722 | beg_peak_max = MAX(beg_peak_max, countval); 723 | } 724 | } 725 | 726 | std::array fillcode_to_num_alns; 727 | 728 | uvc1_readnum_big_t alnidx = 0; 729 | // hts_itr = sam_itr_queryi(hts_idx, tid, fetch_tbeg, fetch_tend); 730 | hts_itr = sam_itr_queryi(hts_idx, tid, non_neg_minus(fetch_tbeg, MAX_INSERT_SIZE), (fetch_tend + MAX_INSERT_SIZE)); 731 | while (sam_itr_next(sam_infile, hts_itr, aln) >= 0) { 732 | if (aln->core.pos < non_neg_minus(fetch_tbeg, MAX_INSERT_SIZE + 1) || bam_endpos(aln) > (fetch_tend + MAX_INSERT_SIZE + 1)) { 733 | continue; 734 | } 735 | if (visited_qnames.find(bam_get_qname(aln)) == visited_qnames.end()) { 736 | continue; 737 | } 738 | 739 | bool isrc = false; 740 | bool isr2 = false; 741 | uvc1_refgpos_t tBeg = 0; 742 | uvc1_refgpos_t tEnd = 0; 743 | uvc1_unsigned_int_t num_seqs = 0; 744 | NORM_INSERT_SIZE(aln); // may be too early here? 
745 | FilterReason filterReason = fill_isrc_isr2_beg_end_with_aln(isrc, isr2, tBeg, tEnd, num_seqs, 746 | aln, fetch_tbeg, fetch_tend, 747 | paramset.kept_aln_min_aln_len, 748 | paramset.kept_aln_min_mapqual, 749 | paramset.kept_aln_min_isize, 750 | paramset.kept_aln_max_isize, 751 | paramset.kept_aln_is_zero_isize_discarded, 752 | end2end, is_pair_end_merge_enabled); 753 | if (!is_pair_end_merge_enabled) { assertUVC(!isr2); } 754 | 755 | fillcode_to_num_alns[filterReason]++; 756 | 757 | if (NOT_FILTERED != filterReason) { 758 | continue; 759 | } 760 | 761 | extended_inclu_beg_pos = MIN(extended_inclu_beg_pos, SIGN2UNSIGN(aln->core.pos)); 762 | extended_exclu_end_pos = MAX(extended_exclu_end_pos, SIGN2UNSIGN(bam_endpos(aln))); 763 | 764 | const char *qname = bam_get_qname(aln); 765 | const uvc1_hash_t qname_hash = strhash(qname, 31UL); 766 | const uvc1_hash_t qname_hash2 = strhash(qname, 17UL); 767 | const size_t qname_len = strlen(qname); 768 | const char *umi_beg1 = strchr(qname, '#'); 769 | const char *umi_beg = ((NULL != umi_beg1) ? (umi_beg1 + 1) : (qname + qname_len)); 770 | const char *umi_end1 = strchr(umi_beg, '#'); 771 | const char *umi_end = ((NULL != umi_end1) ? (umi_end1 ) : (qname + qname_len)); 772 | 773 | int is_umi_found = ((umi_beg + 1 < umi_end) && (MOLECULE_TAG_NONE != paramset.molecule_tag)); // UMI has at least one letter 774 | int is_duplex_found = 0; 775 | uvc1_hash_t umihash = 0; 776 | const size_t umi_len = umi_end - umi_beg; 777 | if (is_umi_found) { 778 | size_t umi_half = (umi_end - umi_beg - 1) / 2; 779 | if ((umi_len % 2 == 1 ) && ( '+' == umi_beg[umi_half]) && (!paramset.disable_duplex)) { 780 | uvc1_hash_t umihash_part1 = strnhash(umi_beg , umi_half); // alpha 781 | uvc1_hash_t umihash_part2 = strnhash(umi_beg + umi_half + 1, umi_half); // beta 782 | umihash = ((isrc ^ isr2) ? 
hash2hash(umihash_part1, umihash_part2) : hash2hash(umihash_part2, umihash_part1)); 783 | is_duplex_found++; 784 | } else { 785 | umihash = strnhash(umi_beg, umi_end-umi_beg); 786 | } 787 | } else if ((aln->core.flag & 0x1) == 0 && umi_struct_string16.size() > 0) { // should be proton 788 | umihash = bam2umihash(is_umi_found, aln, umi_struct_string16); 789 | if (!is_umi_found) { 790 | umihash = bam2umihash(is_umi_found, aln, umi_struct_string16); 791 | } 792 | } 793 | size_t isrc_isr2 = isrc * 2 + isr2; 794 | uvc1_refgpos_t beg1 = tBeg + ARRPOS_MARGIN - fetch_tbeg; 795 | uvc1_refgpos_t end1 = tEnd + ARRPOS_MARGIN - fetch_tbeg; 796 | uvc1_refgpos_t beg2 = (isrc_isr2_to_beg2bcenter[isrc_isr2][beg1]); 797 | uvc1_refgpos_t end2 = (isrc_isr2_to_end2ecenter[isrc_isr2][end1]); 798 | uvc1_readnum_big_t beg2count = isrc_isr2_to_beg_count[isrc_isr2][beg2]; 799 | uvc1_readnum_big_t end2count = isrc_isr2_to_end_count[isrc_isr2][end2]; 800 | const auto insert2posL = MIN(beg2 + 6, end2); 801 | const auto insert2posR = MAX(beg2, non_neg_minus(end2, 6)); 802 | const auto insert_cov_totDP = isrc_isr2_to_border_count_prefixsum[isrc_isr2][insert2posR] - isrc_isr2_to_border_count_prefixsum[isrc_isr2][insert2posL]; 803 | 804 | /* 805 | uvc1_readnum_t beg2surrcount = 0; 806 | for (auto i = -ARRPOS_OUTER_RANGE; i < ARRPOS_OUTER_RANGE + 1; i++) { 807 | if (i > ARRPOS_INNER_RANGE || i < -ARRPOS_INNER_RANGE) { 808 | assertUVC(i+beg2 < fetch_size || !fprintf(stderr, "beg2 index %d + %d = %d is too big!", i, beg2, i+beg2)); 809 | uvc1_readnum_t beg_count = isrc_isr2_to_beg_count.at(isrc_isr2).at(i + beg2); 810 | beg2surrcount = MAX(beg2surrcount, beg_count); 811 | } 812 | } 813 | uvc1_readnum_t end2surrcount = 0; 814 | for (auto i = -ARRPOS_OUTER_RANGE; i < ARRPOS_OUTER_RANGE + 1; i++) { 815 | if (i > ARRPOS_INNER_RANGE && i < -ARRPOS_INNER_RANGE) { 816 | assertUVC(i + end2 < fetch_size || !fprintf(stderr, "end2 index %d + %d = %d is too big!", i, end2, i + end2)); 817 | uvc1_readnum_t 
end_count = isrc_isr2_to_end_count.at(isrc_isr2).at(i + end2); 818 | end2surrcount = MAX(end2surrcount, end_count); 819 | } 820 | } 821 | */ 822 | const uvc1_readnum_big_t tot_ins_cov_border_DP = 823 | isrc_isr2_to_border_count_prefixsum[isrc_isr2][insert2posR] 824 | - isrc_isr2_to_border_count_prefixsum[isrc_isr2][insert2posL]; 825 | // in the denominator we have a) -2 to take out two positions at beg2 and end2 and b) +2 to add pseudocount, and these two +-2 cancel out each other. 826 | double begratio = (double)(beg2count * (insert2posR - insert2posL) + 1) / (double)(tot_ins_cov_border_DP + (insert2posR - insert2posL) + 1); 827 | double endratio = (double)(end2count * (insert2posR - insert2posL) + 1) / (double)(tot_ins_cov_border_DP + (insert2posR - insert2posL) + 1); 828 | const bool is_beg_amplicon = (begratio > paramset.dedup_amplicon_border_to_insert_cov_weak_avgDP_ratio 829 | && (beg2count >= paramset.dedup_amplicon_border_weak_minDP) && (beg2count >= tot_ins_cov_border_DP * paramset.dedup_amplicon_border_to_insert_cov_weak_totDP_ratio)); 830 | const bool is_end_amplicon = (endratio > paramset.dedup_amplicon_border_to_insert_cov_weak_avgDP_ratio 831 | && (end2count >= paramset.dedup_amplicon_border_weak_minDP) && (end2count >= tot_ins_cov_border_DP * paramset.dedup_amplicon_border_to_insert_cov_weak_totDP_ratio)); 832 | const bool is_beg_strong_amplicon = (begratio > paramset.dedup_amplicon_border_to_insert_cov_strong_avgDP_ratio 833 | && (beg2count >= paramset.dedup_amplicon_border_strong_minDP) && (beg2count >= tot_ins_cov_border_DP * paramset.dedup_amplicon_border_to_insert_cov_strong_totDP_ratio)); 834 | const bool is_end_strong_amplicon = (endratio > paramset.dedup_amplicon_border_to_insert_cov_strong_avgDP_ratio 835 | && (end2count >= paramset.dedup_amplicon_border_strong_minDP) && (end2count >= tot_ins_cov_border_DP * paramset.dedup_amplicon_border_to_insert_cov_strong_totDP_ratio)); 836 | 837 | /* 838 | double begfrac = (double)(beg2count + 1) / 
(double)(beg2surrcount + 2); 839 | double endfrac = (double)(end2count + 1) / (double)(end2surrcount + 2); 840 | const bool is_beg_amplicon = (begfrac > paramset.dedup_amplicon_count_to_surrcount_ratio_twosided); 841 | const bool is_end_amplicon = (endfrac > paramset.dedup_amplicon_count_to_surrcount_ratio_twosided); 842 | const bool is_beg_strong_amplicon = (begfrac > paramset.dedup_amplicon_count_to_surrcount_ratio); 843 | const bool is_end_strong_amplicon = (endfrac > paramset.dedup_amplicon_count_to_surrcount_ratio); 844 | 845 | double is_insert_amplicon_1 = (MIN(begratio, endratio) > paramset.dedup_amplicon_border_to_insert_cov_avgDP_ratio_of_min); 846 | double is_insert_amplicon_2 = (MAX(begratio, endratio) > paramset.dedup_amplicon_border_to_insert_cov_avgDP_ratio_of_max); 847 | 848 | */ 849 | const bool is_assay_amplicon = (is_beg_strong_amplicon || is_end_strong_amplicon 850 | || (is_beg_amplicon && is_end_amplicon)); 851 | pcrpassed += is_assay_amplicon; 852 | 853 | // beg end qname UMI = 1 2 4 8 854 | // IonTorrent amplicon without UMI: beg + end + qname 855 | // IonTorrent capture without UMI: beg + end 856 | // IonTorrent amplicon with UMI: beg + UMI 857 | // IonTorrent capture with UMI: beg + UMI 858 | // Illumina amplicon without UMI: beg + end + qname 859 | // Illumina capture without UMI: beg + end 860 | // Illumina amplicon with UMI: beg + end + UMI 861 | // Illumina capture with UMI: beg + end + UMI 862 | // 863 | // For Illumina amplicon with UMI: 864 | // if beg * frac > end, then: beg + UMI 865 | // if end * frac > beg, then: end + UMI 866 | 867 | uvc1_flag_t dedup_idflag = 0x0; 868 | if (paramset.dedup_flag != 0) { 869 | dedup_idflag = paramset.dedup_flag; 870 | } else if ((SEQUENCING_PLATFORM_IONTORRENT == paramset.inferred_sequencing_platform)) { // is_proton 871 | if (is_umi_found) { 872 | dedup_idflag = 0x9; 873 | } else if (is_assay_amplicon) { 874 | // we have no way to remove duplicates in PCR amplicons if no UMI is given 875 | 
dedup_idflag = 0x7; 876 | } else { 877 | dedup_idflag = 0x3; 878 | } 879 | } else { 880 | if (is_umi_found) { 881 | if (is_beg_strong_amplicon && is_end_amplicon 882 | && beg2count > end2count * paramset.dedup_amplicon_end2end_ratio) { 883 | dedup_idflag = 0x9; 884 | } else if (is_end_strong_amplicon && is_beg_amplicon 885 | && end2count > beg2count * paramset.dedup_amplicon_end2end_ratio) { 886 | dedup_idflag = 0xA; 887 | } else { 888 | dedup_idflag = 0xB; 889 | } 890 | } else if (is_assay_amplicon) { 891 | dedup_idflag = 0x7; 892 | } else { 893 | dedup_idflag = 0x3; 894 | } 895 | } 896 | const auto alnflag = (aln->core.flag); 897 | const bool are_borders_preserved = ((alnflag & 0x1) && (!(alnflag & 0x4)) && (!(alnflag & 0x8)) 898 | && (abs(aln->core.isize) >= (MAX_INSERT_SIZE * 3 / 4) || aln->core.isize == 0)); 899 | uvc1_refgpos_t begtid = ((!(aln->core.flag & 0x4)) ? aln->core.tid : (INT32_MAX-1)); 900 | uvc1_refgpos_t endtid = (((aln->core.flag & 0x1) && !(aln->core.flag & 0x8)) ? aln->core.mtid : (INT32_MAX-1)); 901 | uvc1_refgpos_t beg3 = (are_borders_preserved ? (aln->core.pos) : (beg2 - ARRPOS_MARGIN + fetch_tbeg)); 902 | uvc1_refgpos_t end3 = (are_borders_preserved ? (aln->core.mpos) : (end2 - ARRPOS_MARGIN + fetch_tbeg)); 903 | std::pair begpair = std::make_pair(begtid, beg3); 904 | std::pair endpair = std::make_pair(endtid, end3); 905 | 906 | /* 907 | uvc1_hash_t molecule_hash = (are_borders_preserved ? 
1 : 0); 908 | if (0x3 == (0x3 & dedup_idflag)) { 909 | auto min2 = MIN(begpair, endpair); 910 | auto max2 = MAX(begpair, endpair); 911 | molecule_hash = hash2hash(molecule_hash + 6, hash2hash(hash2hash(min2.first, min2.second), hash2hash(max2.first, max2.second))); 912 | } else if (0x1 & dedup_idflag) { 913 | molecule_hash = hash2hash(molecule_hash + 2, hash2hash(begpair.first, begpair.second)); 914 | } else if (0x2 & dedup_idflag) { 915 | molecule_hash = hash2hash(molecule_hash + 4, hash2hash(endpair.first, endpair.second)); 916 | } 917 | if (0x4 & dedup_idflag) { 918 | molecule_hash = hash2hash(molecule_hash, qname_hash); 919 | } 920 | if (0x8 & dedup_idflag) { 921 | molecule_hash = hash2hash(molecule_hash, umihash); 922 | } 923 | */ 924 | 925 | int strand = bam_get_strand(aln); // (isrc ^ isr2); 926 | 927 | MolecularBarcode mb; 928 | mb.beg_tidpos_pair = begpair; 929 | mb.end_tidpos_pair = endpair; 930 | mb.qnamestring = bam_get_qname(aln); 931 | mb.umistring = (is_umi_found ? std::string(umi_beg, umi_len) : ""); 932 | mb.duplexflag = (is_umi_found ? 0x1 : 0) + (is_duplex_found ? 0x2 : 0) + (is_assay_amplicon ? 0x4 : 0) + (are_borders_preserved ? 
0x8 : 0); 933 | mb.dedup_idflag = dedup_idflag; 934 | 935 | // mb.hashvalue = molecule_hash; 936 | 937 | MolecularBarcode mbkey = mb.createKey(); 938 | mb.hashvalue = mbkey.hashvalue = mbkey.calcHash(); 939 | umi_to_strand_to_reads.insert(std::make_pair(mbkey, std::make_pair(std::array>, 2>(), mb))); 940 | umi_to_strand_to_reads[mbkey].first[strand].insert(std::make_pair(qname_hash2, std::vector())); 941 | 942 | umi_to_strand_to_reads[mbkey].first[strand][qname_hash2].push_back(bam_dup1(aln)); 943 | // umi_to_strand_to_reads[molecule_hash].first[strand][qname_hash2].push_back((mut_aln)); 944 | 945 | const bool should_log_read = (ispowerof2(alnidx + 1)); 946 | if (!is_pair_end_merge_enabled) { assertUVC(!isr2); } 947 | if ((should_log_read && (beg_peak_max >= 2000 || should_log)) || paramset.always_log) { 948 | LOG(logINFO) << "thread_id = " << thread_id << " ; " 949 | << "readname = " << qname << " ; " 950 | << "alnidx = " << alnidx << " ; " 951 | << "num_iter1_passed_alns = " << num_iter1_passed_alns << " ; " 952 | << "isrc = " << isrc << " ; " 953 | << "isr2 = " << isr2 << " ; " 954 | << "strand = " << strand << " ; " 955 | << "num_seqs = " << num_seqs << " ; " 956 | << "dedup_idflag = " << dedup_idflag << " ; " 957 | << "is_assay_amplicon = " << is_assay_amplicon << " ; " 958 | << "tid = " << aln->core.tid << " ; " 959 | << "fastaseq_range = " << tBeg << "," << tEnd << " ; " 960 | << "original_range = " << beg1 << "," << end1 << " ; " 961 | << "adjusted_rdiff = " << (beg2 - beg1) << "," << (end2 - end1) << " ; " 962 | << "adjusted_count = " << beg2count << "," << end2count << " ; " 963 | //<< "adjusted_surrounding_counts = " << beg2surrcount << "," << end2surrcount << " ; " 964 | << "insert_cov_totDP = " << insert_cov_totDP << " from-" << beg2 << "-to-" << end2 << " ; " 965 | << "beg_tid_pos = " << begpair.first << "," << begpair.second << " ; " 966 | << "end_tid_pos = " << endpair.first << "," << endpair.second << " ; " 967 | << "barcode_umihash = " << 
(is_umi_found ? umihash : 0) << " ; " 968 | << "molecule_hash = " << anyuint2hexstring(mbkey.hashvalue) << " ; " 969 | << "qname_hash = " << anyuint2hexstring(qname_hash) << " ; " 970 | << "qname_hash2 = " << anyuint2hexstring(qname_hash2) << " ; " 971 | << "dflag = " << mb.duplexflag << " ; " 972 | << "UMIstring = " << umi_beg << " ; " 973 | << "UMIsize = " << umi_len << " ; " 974 | << "num_qname_from_molecule_so_far = " << umi_to_strand_to_reads[mbkey].first[strand].size() << " ; "; 975 | } 976 | alnidx += 1; 977 | } 978 | sam_itr_destroy(hts_itr); 979 | 980 | bam_destroy1(aln); 981 | // sam_close(sam_infile); 982 | 983 | const bool is_min_DP_failed_1 = ( 984 | ISNT_PROVIDED(paramset.vcf_tumor_fname) 985 | && (UNSIGN2SIGN(visited_qnames.size()) < paramset.min_altdp_thres) 986 | && (!paramset.should_output_all)); 987 | // IMPORTANT_NOTE: if singleton should be generated too, then the following variable should always be set to true 988 | const bool is_min_DP_failed_2 = ((paramset.fam_consensus_out_fastq.size() > 0) && (UNSIGN2SIGN(visited_qnames.size()) < paramset.fam_thres_dup2add)); 989 | const bool is_min_DP_failed = (is_min_DP_failed_1 && is_min_DP_failed_2); 990 | if (is_min_DP_failed){ 991 | if (should_log) { LOG(logINFO) << "Thread " << thread_id << " skipped dedupping."; } 992 | return std::array({ -1, -1, -1}); 993 | } else { 994 | if (should_log) { LOG(logINFO) << "Thread " << thread_id << " finished dedupping."; } 995 | return std::array({alnidx, pcrpassed, umi_pcrpassed}); 996 | } 997 | } 998 | 999 | --------------------------------------------------------------------------------