├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CODECsuite_TPC_Notice_Final.txt ├── LICENSE.txt ├── README.md ├── accuracy.cpp ├── bbcpputil ├── .gitmodules ├── cpp │ ├── AlignmentConsensus.cpp │ └── BamRecordExt.cpp ├── cxxopts │ └── include │ │ └── cxxopts.hpp └── include │ ├── Algo.h │ ├── Alignment.h │ ├── AlignmentConsensus.h │ ├── BamRecordExt.h │ ├── DNAUtils.h │ ├── FastxIO.h │ ├── FastxRecord.h │ ├── Files.h │ ├── GenomicRegionCollectionExt.h │ ├── Gotoh.h │ ├── Insert.h │ ├── InsertSeqFactory.h │ ├── MAF.h │ ├── MutCounter.h │ ├── ReadVCF.h │ ├── Stats.h │ ├── StringUtils.h │ ├── TargetLayout.h │ ├── Variant.h │ └── pileup.h ├── codec.cpp ├── demux.cpp ├── duplex_filter.cpp ├── include ├── Adapter.h ├── BamIO.h └── Index.h ├── msi ├── Snakefile ├── msi.R └── msi.cpp ├── obsolete ├── bamtofastq.cpp ├── concat_umi_to_fastq.cpp └── print_qual.cpp ├── snakemake ├── AdapV2 │ ├── Snakefile │ ├── capture_wf_1 │ │ └── Snakefile │ └── wgs │ │ └── Snakefile ├── README.md ├── jobscript.sh ├── pipeline_input_examples │ ├── caputure │ │ ├── config.yaml │ │ ├── input.tsv │ │ └── runSnakemake.sh │ └── wgs │ │ ├── config.yaml │ │ ├── input.tsv │ │ └── runSnakemake.sh ├── qsub_wrapper.py └── script │ ├── agg_log.py │ ├── cds_summarize.py │ ├── codec2maf │ ├── collect_duplex_metrics.py │ ├── cov_sum.py │ ├── create_maf_from_probe_rg.py │ ├── downsample_read_families.py │ ├── dpx │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── bam_iterator.cpython-36.pyc │ │ ├── bam_iterator.cpython-38.pyc │ │ ├── collect_duplex_metrics.cpython-36.pyc │ │ ├── downsampler.cpython-36.pyc │ │ ├── downsampler.cpython-38.pyc │ │ ├── intervals.cpython-36.pyc │ │ └── intervals.cpython-38.pyc │ ├── downsampler.py │ ├── get_mutant_metrics.py │ └── intervals.py │ ├── error_rate_by_family_size.py │ ├── extract_false_positive_reads.py │ ├── familysize_dist.py │ ├── fastqsplit.pl │ ├── generate_reads.py │ ├── get_midpoint_from_interval.py │ ├── maf2vcf.py │ ├── msisensor_combine_result.py │ ├── print_mut_cpgstatus.py │ ├── print_snv_roc.py │ ├── rev_qualscore.py │ ├── vcf_update_genotype.py │ └── vcf_validate_against_maf.py └── trimadapter.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | cmake-build*/ 2 | .DS_Store 3 | .idea/ 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bbcpputil/third_party/SeqLib"] 2 | path = bbcpputil/third_party/SeqLib 3 | url = https://github.com/ruolin/SeqLib.git 4 | [submodule "bbcpputil/third_party/backward-cpp"] 5 | path = bbcpputil/third_party/backward-cpp 6 | url = https://github.com/bombela/backward-cpp 7 | [submodule "bbcpputil/third_party/seqan"] 8 | path = bbcpputil/third_party/seqan 9 | url = https://github.com/ruolin/seqan.git 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | set( CMAKE_CXX_STANDARD 14 ) 3 | project (codecsuite LANGUAGES CXX) 4 | set(CMAKE_BUILD_TYPE Release) 5 | #set(CMAKE_BUILD_TYPE Debug) 6 | #for gprof 7 | #set (CMAKE_CXX_FLAGS "${CAMKE_CXX_FLAGS} -g -pg") 8 | #for valgrind 9 | #set (CMAKE_CXX_FLAGS "${CAMKE_CXX_FLAGS} -g -O1") 10 | set( CMAKE_EXE_LINKER_FLAGS " ${CMAKE_EXE_LINKER_FLAGS} -pthread") 11 | set( CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -Wall -Wno-deprecated 
-Wno-unused-variable " ) 12 | 13 | 14 | set (CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -DSEQAN_ENABLE_TESTING=0 -DSEQAN_HAS_ZLIB=1") 15 | find_package(OpenMP REQUIRED) 16 | 17 | include_directories(include) 18 | include_directories(bbcpputil/third_party/seqan/include/) 19 | 20 | set( CMAKE_EXE_LINKER_FLAGS " ${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -L. ") 21 | set( CPP_LIBS ${CPP_LIBS} seqlib bwa fml hts bz2) 22 | set( CPP_LIBS ${CPP_LIBS} dl ) 23 | set( CPP_LIBS ${CPP_LIBS} z lzma) 24 | set( CPP_LIBS ${CPP_LIBS} OpenMP::OpenMP_CXX) 25 | 26 | include_directories(bbcpputil/include) 27 | 28 | set(SEQLIB_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/bbcpputil/third_party/SeqLib) 29 | find_library(SEQLIB_LIBRARY 30 | NAMES seqlib 31 | PATHS ${SEQLIB_ROOT} 32 | PATH_SUFFIXES lib 33 | NO_DEFAULT_PATH 34 | ) 35 | include(ExternalProject) 36 | if(NOT SEQLIB_LIBRARY) 37 | message("SeqLib not found") 38 | ExternalProject_Add(SeqLib 39 | SOURCE_DIR "${SEQLIB_ROOT}" 40 | INSTALL_DIR "${SEQLIB_ROOT}" 41 | BUILD_IN_SOURCE 1 42 | CONFIGURE_COMMAND chmod +x ./configure COMMAND ./configure prefix=${SEQLIB_ROOT} 43 | BUILD_COMMAND make CXXFLAGS='-std=c++11' 44 | INSTALL_COMMAND make install 45 | ) 46 | else() 47 | message("SeqLib found") 48 | add_custom_target(SeqLib) 49 | endif() 50 | 51 | include_directories(${SEQLIB_ROOT}) 52 | include_directories(${SEQLIB_ROOT}/htslib) 53 | link_directories(${SEQLIB_ROOT}/lib/) 54 | 55 | ## not currently used. But reserve for denovo consensus 56 | 57 | set(deps bbcpputil/cpp/AlignmentConsensus.cpp bbcpputil/cpp/BamRecordExt.cpp) 58 | set(subcommand demux.cpp trimadapter.cpp duplex_filter.cpp accuracy.cpp) 59 | if (CMAKE_BUILD_TYPE MATCHES Debug) 60 | message(" ") 61 | message("CMAKE IN DEBUG MODE") 62 | message(" ") 63 | link_directories($ENV{ELFUTILS_ROOT}/lib/) 64 | set(LIBDW_LIBRARY "$ENV{ELFUTILS_ROOT}/lib/libdw.a") 65 | set(LIBDW_INCLUDE_DIR "$ENV{ELFUTILS_ROOT}/include/") 66 | set( CPP_LIBS ${CPP_LIBS} dw elf) # backward_cpp 67 | add_subdirectory(bbcpputil/third_party/backward-cpp) 68 | elseif(CMAKE_BUILD_TYPE MATCHES Release) 69 | message(" ") 70 | message("CMAKE IN RELEASE MODE") 71 | message(" ") 72 | endif () 73 | 74 | add_executable(codec codec.cpp ${subcommand} ${deps}) 75 | add_dependencies(codec SeqLib) 76 | 77 | if (CMAKE_BUILD_TYPE MATCHES Debug) 78 | #add_executable(codec codec.cpp ${subcommand} ${deps} ${BACKWARD_ENABLE}) 79 | add_backward(codec) 80 | endif() 81 | target_link_libraries(codec ${CPP_LIBS}) 82 | 83 | ######## uncomment if you need to build CODEC_MSI 84 | #add_executable(msi msi/msi.cpp ${deps}) 85 | #target_link_libraries(msi z ${CPP_LIBS}) 86 | -------------------------------------------------------------------------------- /CODECsuite_TPC_Notice_Final.txt: -------------------------------------------------------------------------------- 1 | Notice of Third Party Code Dependencies 2 | 3 | CODECsuite is distributed, in part, under and subject to the provisions of respective licenses for the following dependencies: 4 | 5 | 1. SeqLib 6 | Copyright 2016 Jeremiah A. Wala. All rights reserved. 7 | https://github.com/walaj/SeqLib/blob/master/LICENSE 8 | 9 | 2. backward-cpp 10 | Copyright (c) 2013 Google Inc. All rights reserved. 11 | https://github.com/bombela/backward-cpp/blob/master/LICENSE.txt 12 | 13 | 3. seqan 14 | Copyright (c) 2006-2018 Knut Reinert, FU Berlin. All rights reserved. 15 | https://github.com/seqan/seqan/blob/master/LICENSE 16 | 17 | 4. spoa 18 | Copyright (c) 2016 Robert Vaser. All rights reserved. 
https://github.com/rvaser/spoa/blob/master/LICENSE
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
CODECsuite is released under the following BSD 3-Clause License:

Copyright (c) 2021 The Broad Institute, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
UPDATES
* (05/19/25) Version 1.1.5 introduces new scripts for converting CODEC single fragment duplex variant output to MAF or VCF format.
[See details here](#reformatting-codec-sfc-variant-output-into-maf-or-vcf-format)

* (01/08/25) Version 1.1.4 introduces a new subcommand `codec filter` designed to filter consensus BAM files. It retains only the reads and bases relevant for variant calling. Fragments (read-pairs) that do not pass fragment-level filtering are excluded from the output BAM. Bases that fail the filters are assigned a minimum base quality score (Q2), ensuring they are ignored by most coverage analysis and variant calling tools.
It can be run as follows:

``` codec filter -b mol_consensus.sortbyname.bam -o duplex_only.bam -r reference.fa -q 30 -m 60 -Q 0.7 -B 0.5 -N 0.05 ...```
> [!NOTE]
> The input BAM file must be sorted by read name, and the output BAM will also be query-name sorted. For consistent filtering results, it is recommended to use the same parameters as those in `codec call`.


# CODECsuite
The CODEC analysis pipeline, CODECsuite, comprises five key steps: demultiplexing, adapter trimming, alignment, duplicate collapsing, and single-fragment mutation calling. Duplicate collapsing and alignment are performed using the third-party tools Fgbio and BWA, respectively. After removing byproducts and applying fragment-level filtering, mutations are identified exclusively from duplexes in the overlap regions, where bases from the two reads of a pair align and match. Bases within these regions undergo stringent filtering based on criteria such as base quality, proximity to fragment ends, overlap with germline mutations, and other factors. Notably, a single read pair is sufficient to form a duplex, as each read represents one strand. Refer to the [paper](https://www.nature.com/articles/s41588-023-01376-0) for more details.

## Installation
Tested on Red Hat 7 and Ubuntu 18.04.

Prerequisites for the C++ programs (for the snakemake workflow, check out [here](./snakemake)):
1. git
2. gcc with C++14 support (tested with gcc 5.2 and 7.3)
3. cmake 3.18.3 or above

First, recursively clone the repo and create a build directory, which will hold the installation files and the final executables.

`git clone --recursive git@github.com:broadinstitute/CODECsuite.git && cd CODECsuite && mkdir build`

Next, build the program with cmake.

`cd build && cmake .. && make`

After this, you should see an executable named `codec` in the build folder you just created.

## Demultiplexing
CODECsuite is expected to work with raw lane-level fastq.gz files. These can be obtained with Illumina [bcl2fastq](https://support.illumina.com/downloads/bcl2fastq-conversion-software-v2-20.html).
The first step is demultiplexing, and it requires a sample sheet in CSV format for each lane, which looks like the following.
Currently, we have used 12 barcodes. For good cluster generation, we recommend having at least 4 sample barcodes per
sequencing lane.

| SampleName | IndexBarcode1 | IndexBarcode2 |
|------------|---------------|---------------|
|Sample01|CTTGAACGGACTGTCCAC|CACCGAGCGTTAGACTAC|
|Sample02|GAGCCTACTCAGTCAACG|GTGTCGAACACTTGACGG|
|Sample03|AGCTTGTAAGGCAGGTTA|ACTGATCTTCAGCTGACT|
|Sample04|TCAAGCGTCTTACATGGT|TGAATCTGAGGCACTGTA|
|Sample05|CTGGTCCAAGAACGTCTG|CTCTGAACGATCGAGCTC|
|Sample06|GATCCAGTTCTGTCGAGC|GAGGTGCATGCACCTTAG|
|Sample07|ACCTATAGGTGCAACGAA|ACTAACTTCCATTGCACT|
|Sample08|TGAAGGTCCACTGTATCT|TGACCTGGATGGATAGGA|
|Sample09|CACTGCTTCGAGACGAAG|CTCCAGTTACTGAGACGG|
|Sample10|GTGATACCTCGATGCTCC|GAGGTCCAGTCTCTGTCC|
|Sample11|ACTCAGAGAACTCATGGA|ACTACAGGTGGATCCAAT|
|Sample12|TGAGCTGAGTTCGTACTT|TGATGTACCAACGATGTA|

`codec demux -1 reads.r1.fastq.gz -2 reads.r2.fastq.gz -p sample_sheet.csv -o demux_outprefix`

Given a toy sample_sheet.csv containing samples sample_A and sample_B, this command will generate
```
demux_outprefix.sample_A.1.fastq.gz, demux_outprefix.sample_A.2.fastq.gz
demux_outprefix.sample_B.1.fastq.gz, demux_outprefix.sample_B.2.fastq.gz
```
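
Before demultiplexing, it can be useful to verify that no two index barcodes in the sample sheet are near-duplicates. Below is a minimal sketch, not part of CODECsuite; it assumes the CSV carries the header row shown above, and the cutoff of 3 mismatches is an arbitrary safety margin, not the demultiplexer's actual mismatch tolerance:

```
import csv
import itertools

def hamming(a, b):
    # Number of mismatched positions between two equal-length barcodes.
    return sum(x != y for x, y in zip(a, b))

with open("sample_sheet.csv") as fh:
    rows = list(csv.DictReader(fh))

# Compare every pair of barcodes within each index read.
for key in ("IndexBarcode1", "IndexBarcode2"):
    for r1, r2 in itertools.combinations(rows, 2):
        d = hamming(r1[key], r2[key])
        if d < 3:
            print(f"{key}: {r1['SampleName']} vs {r2['SampleName']} differ at only {d} position(s)")
```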
## Adapter trimming
After demultiplexing, CODEC reads still contain the in-situ sample barcode and adapter sequences. The next step is to trim
these out, since they could interfere with alignment.

`codec trim -1 demux_outprefix.sample_A.1.fastq.gz -2 demux_outprefix.sample_A.2.fastq.gz -o trim_outprefix -u 3 -U 3 -f 2 -t 2 -s sample_A`

This tells CODECsuite that the first 3bp of each read is the UMI and to trim off the next 2bp.
The output files of the adapter trimming step look like
```
trim_outprefix.sample_A.trim.bam
trim_outprefix.sample_A.trim.log
```
By default, single-end byproducts are also output to the `trim.bam`. To split the output, use `-S/--split_bam_output`.

The bam file is a standard uBam (unmapped bam) with additional tags
```
RX: UMI sequence from R1 and R2, concatenated by a hyphen
QX: UMI quality scores
bc: Index barcode sequence
s5: 5' adapter sequence (same as Index barcode)
q5: 5' adapter quality scores
s3: 3' adapter sequence (same as Index barcode of the mate)
q3: 3' adapter quality scores
sl: the rest of the 3' adapter sequence
ql: the rest of the 3' adapter quality scores
```
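
These tags can be spot-checked with pysam. Below is a minimal sketch, not part of CODECsuite; it assumes pysam is installed, and `check_sq=False` is required because a uBam carries no reference sequence dictionary:

```
import pysam

with pysam.AlignmentFile("trim_outprefix.sample_A.trim.bam", "rb", check_sq=False) as ubam:
    for i, read in enumerate(ubam):
        # Tag names follow the table above.
        umi = read.get_tag("RX") if read.has_tag("RX") else "NA"
        barcode = read.get_tag("bc") if read.has_tag("bc") else "NA"
        print(read.query_name, umi, barcode)
        if i == 4:  # the first few records are enough for a sanity check
            break
```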

After adapter trimming, the CODEC reads can be mapped by standard NGS tools such as BWA. For our end-to-end pipeline,
please see [snakemake](./snakemake).

> [!NOTE]
> We recommend using the SMaHT duplex reference genome, which is basically HG38 without decoy sequences. See the reasons here: https://smaht-dac.github.io/pipelines-docs/DOCS/REFERENCE_FILES/Genome_Builds/1_Build_GRCh38.html

## Single fragment caller (SFC) and mutation rate computation

Run the GATK best practices (alignment, MarkDuplicates, indel realignment), for example. Of note, BQSR should NOT be run on
CODEC data, since CODEC has a different quality score distribution. I do not recommend BQSR in general, since modern
Illumina sequencers' quality scores have improved and BQSR almost doubles the bam size.

Now we can run the SFC to call mutations. The SFC is designed to call somatic mutations. For the best results, we need
a bed file which contains the high-confidence regions (e.g., [this](https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/GRCh38@all/LowComplexity/GRCh38_notinAllTandemRepeatsandHomopolymers_slop5.bed.gz)) of the reference genome, and a germline bam for masking the germline
variants. If there is no germline bam, it is recommended to have a germline vcf file. A population-based vcf (e.g., the [gnomad vcf](https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz)) is
strongly recommended in either case, since it can account for contamination and low sequencing depth of the germline bam. However, to avoid over-filtering true somatic mutations, a minimum allele frequency threshold is recommended for the population vcf (e.g., 0.01%).
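
As a sketch of such pre-filtering (not part of CODECsuite; it assumes the vcf carries a per-allele INFO/AF field, as the gnomAD af-only vcf does, and `bcftools view -i 'INFO/AF>=0.0001'` would achieve much the same):

```
import pysam

MIN_AF = 1e-4  # 0.01%

vcf_in = pysam.VariantFile("af-only-gnomad.hg38.vcf.gz")
vcf_out = pysam.VariantFile("gnomad.af0001.vcf", "w", header=vcf_in.header)
for rec in vcf_in:
    af = rec.info.get("AF", None)
    if af is None:
        continue
    # AF may be a single float or one value per alternate allele.
    afs = af if isinstance(af, tuple) else (af,)
    if max(afs) >= MIN_AF:
        vcf_out.write(rec)
vcf_out.close()
```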
There is a set of
fragment-level and base-level filters to improve the precision of the mutation calls, at the cost of data loss and the potential
loss of real mutations. Depending on the application, we have presets of parameters: `-p/--preset`
```
stringent: setting where high-precision calling is needed, e.g., calling the background mutation rate in white
blood cells.

lenient: setting where some sensitivity is preferred, e.g., calling cancer mutations in tumor biopsies or high
tumor fraction ctDNA samples.

null: as little filtering as possible, for advanced users who want to do the filtering on their own.
```

It is highly recommended that users experiment with different parameters and figure out the ones that work best for
their data. Trimming the fragment ends (`-d 12`) and using a Q30 cutoff (`-q 30`) are always recommended. The others are ad hoc. However,
the most effective parameter is probably `-Q/--min_passQ_frac`: the fraction of Q30+Q30 bases in the overlap region. This fraction essentially
measures the cluster quality, which is important for single-fragment calling. An example of running the SFC:
```
codec call -b input.mark_duplicated.bam -L highconfidentregions.bed -r hg38.fa -n germline.bam -p lenient -o output

```

The outputs of the SFC are
```
output.mutation_metrics.txt: includes SNV_rate, INDEL_rate, etc.
output.variants_called.txt: mutations from single fragments
output.context_count.txt: trinucleotide context and dinucleotide context counts
```

> [!NOTE]
> All CODEC-related resources can be found at https://console.cloud.google.com/storage/browser/codec_cloud_resources, including the population-based vcf: https://storage.googleapis.com/codec_cloud_resources/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz

## Reformatting CODEC SFC variant output into MAF or VCF format
The scripts `codec2maf` and `maf2vcf.py` can be found in the folder `snakemake/script/`. `maf2vcf.py` depends on the `maf2vcf.pl` script from the perl package [mskcc/vcf2maf](https://github.com/mskcc/vcf2maf/tree/main).
1. CODEC txt file to MAF: `codec2maf -i output.variants_called.txt -o output.variants_called.maf`
2. MAF to VCF: `maf2vcf.py output.variants_called.maf -r hg38.fa -o outdir -p /usr/bin/maf2vcf.pl`


## Other notes
1. For CODEC-MSI, please refer to [msi](./msi). By default, CMake will not build CODEC-MSI; uncomment the last two
lines of CMakeLists.txt if you do want to build it.

2. The Snakemake pipeline is hard-coded to demultiplex 4 lanes simultaneously (e.g., for NovaSeq 6000). If you need to demultiplex
fewer lanes (e.g., for NovaSeq SP), comment out the entire DemuxL3 and DemuxL4 rules. If you have more than 4 lanes (e.g., HiSeq X),
either demultiplex 4 lanes at a time or add more rules yourself.

3. The Snakemake pipeline setup file `qsub_wrapper.py` is specific to [UGE](https://en.wikipedia.org/wiki/Univa_Grid_Engine).
You may need to change the settings for your computing environment.
--------------------------------------------------------------------------------
/bbcpputil/.gitmodules:
--------------------------------------------------------------------------------
[submodule "third_party/backward-cpp"]
	path = third_party/backward-cpp
	url = https://github.com/bombela/backward-cpp
[submodule "third_party/spoa"]
	path = third_party/spoa
	url = git@github.com:rvaser/spoa.git
[submodule "third_party/SeqLib"]
	path = third_party/SeqLib
	url = git@github.com:ruolin/SeqLib.git
[submodule "third_party/seqan"]
	path = third_party/seqan
	url = git@github.com:ruolin/seqan.git
--------------------------------------------------------------------------------
/bbcpputil/include/Algo.h:
--------------------------------------------------------------------------------
//
// Created by Ruolin Liu on 10/31/21.
//

#ifndef CODECSUITE_BBCPPUTIL_INCLUDE_ALGO_H_
#define CODECSUITE_BBCPPUTIL_INCLUDE_ALGO_H_
#include <string>
#include <queue>
#include <unordered_set>
#include <vector>
#include <algorithm>
#include <cstdint>

namespace cpputil {

class UniqueQueue {
  /*
   * Pairs a queue with a set so that the queue holds only unique elements.
   */
 public:
  UniqueQueue(size_t cap) : capacity_(cap), n_added_(0) {};
  UniqueQueue() : UniqueQueue(0) {};

  bool exist(const std::string& in) const {
    //auto h = hash_string(in.c_str());
    if (s_.find(in) == s_.end()) return false;
    else return true;
  }

  void add(const std::string& in) {
    //auto h = hash_string(in.c_str());
    if (s_.find(in) == s_.end()) {
      if (q_.size() < capacity_) {
        q_.push(in);
        s_.insert(in);
      } else {
        // Queue is at capacity: evict the oldest element before inserting.
        auto key = q_.front();
        q_.pop();
        s_.erase(key);
        q_.push(in);
        s_.insert(in);
      }
      ++n_added_;
    }
  }

  void clearQueue() {
    std::queue<std::string> empty;
    std::swap(q_, empty);
    s_.clear();
  }

  uint64_t NumAdded() const {
    return n_added_;
  }

 private:
  // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
  inline uint64_t hash_string(const char* __s) const
  {
    uint64_t hash = 0xcbf29ce484222325ull;
    for ( ; *__s; ++__s)
    {
      hash *= 1099511628211ull;
      hash ^= *__s;
    }
    return hash;
  }

  std::unordered_set<std::string> s_;
  std::queue<std::string> q_;
  const size_t capacity_;
  uint64_t n_added_;
};

// Finds the largest group of positions that fall within `window` of each
// other; `beg` and `end` are set to the bounds of that group.
inline int largest_cluster(std::vector<int> sortedpos, int window, int &beg, int &end) {
  std::sort(sortedpos.begin(), sortedpos.end());
  if (sortedpos.size() == 0) return 0;
  if (sortedpos.size() == 1) {
    beg = sortedpos[0];
    end = sortedpos[0];
    return 1;
  }
  int max_size = 1;
  for (unsigned i = 0; i < sortedpos.size() - 1; ++i) {
    unsigned j = i + 1;
    int s = 1;
    for (; j < sortedpos.size();) {
      if (sortedpos[j] - sortedpos[i] < window) {
        ++s;
        ++j;
      } else {
        if (s > max_size) {
          beg = sortedpos[i];
          end = sortedpos[j - 1];
          max_size = s;
        }
        break;
      }
      if (j == sortedpos.size() && s > max_size) {
        beg = sortedpos[i];
        end = sortedpos[j - 1];
        max_size = s;
      }
    }
  }
  return max_size;
}

}

#endif //CODECSUITE_BBCPPUTIL_INCLUDE_ALGO_H_
--------------------------------------------------------------------------------
/bbcpputil/include/Alignment.h:
--------------------------------------------------------------------------------
//
// Created by Ruolin Liu on 2/26/20.
// This file holds old stuff for finding multiple segments (e.g. oligos) in
// reads.
//

#ifndef CPPUTIL_INCLUDE_ALIGNMENT_H_
#define CPPUTIL_INCLUDE_ALIGNMENT_H_

#include "BamRecordExt.h"

namespace cpputil {

typedef std::vector<SeqLib::BamRecord> Segments; // sequenced parts of a paired-end read for example.
15 | // or group of duplicated reads 16 | 17 | // check all BamReocrd has same start and stop 18 | inline bool AreSegsCompleteOverlap(const Segments &segs) { 19 | if (segs.empty()) return false; 20 | int32_t s = segs.front().PositionWithSClips(); 21 | int32_t e = segs.front().PositionEndWithSClips(); 22 | for (auto const & seg : segs) { 23 | if (seg.PositionWithSClips() != s) return false; 24 | if (seg.PositionEndWithSClips() != e) return false; 25 | } 26 | return true; 27 | } 28 | 29 | inline std::string GetUid(const Segments &segs, const std::string& umi_tag) { 30 | std::string uid; 31 | bool status; 32 | if (umi_tag.empty()) status = false; 33 | else { 34 | status = segs.front().GetZTag(umi_tag, uid); 35 | if (!status and segs.size() == 2) { 36 | status = segs.back().GetZTag(umi_tag, uid); 37 | } 38 | } 39 | if(!status) { // molecular identifier not exist 40 | std::string rxtag; 41 | int32_t s, e; 42 | if (segs.size() == 1) { 43 | s = segs.front().PositionWithSClips(); 44 | e = segs.front().PositionEndWithSClips(); 45 | } else if(segs.size() == 2){ 46 | if (segs.front().ReverseFlag()) { 47 | s = segs.front().PositionEndWithSClips(); 48 | e = segs.back().PositionWithSClips(); 49 | } else { 50 | s = segs.front().PositionWithSClips(); 51 | e = segs.back().PositionEndWithSClips(); 52 | } 53 | } 54 | bool rxtag_status = segs.front().GetZTag("RX", rxtag); 55 | if (rxtag_status) { 56 | uid = std::to_string(s) + "," + std::to_string(e) + ":" + rxtag; 57 | } else { 58 | } 59 | } 60 | return uid; 61 | } 62 | 63 | inline bool AreSegsCompleteOverlapExcludingSclip(const Segments &segs) { 64 | if (segs.empty()) return false; 65 | int32_t s = segs.front().Position(); 66 | int32_t e = segs.front().PositionEnd(); 67 | for (auto const & seg : segs) { 68 | if (seg.Position() != s) return false; 69 | if (seg.PositionEnd() != e) return false; 70 | } 71 | return true; 72 | } 73 | 74 | inline int GetNumOverlapBasesPEAlignment(const Segments & segs, bool FR_only = true) { 75 | assert(segs.size() == 2); 76 | assert(segs.front().Qname() == segs.back().Qname()); 77 | if (segs.front().Interchromosomal()) return 0; 78 | if (segs.front().PairOrientation() != 0 && FR_only) return 0; 79 | int left = std::max(segs.front().Position(), segs.back().Position()); 80 | int right = std::min(segs.front().PositionEnd(), segs.back().PositionEnd()); 81 | int ol = right - left; 82 | if (ol < 0) { 83 | ol = 0; 84 | } 85 | return ol; 86 | } 87 | 88 | inline bool ArePEAlignmentOverlapAtLeastK(const Segments & segs, int k) { 89 | if (k == -1) return AreSegsCompleteOverlapExcludingSclip(segs); 90 | int ol = GetNumOverlapBasesPEAlignment(segs); 91 | if (ol < k) return false; 92 | // if (segs.front().Position() < segs.back().PositionEnd() && segs.back().Position() < segs.front().PositionEnd()) { 93 | // return true; 94 | // } 95 | return true; 96 | } 97 | 98 | inline bool SegmentNotEmpty(const Segments &seg, int dummy) { 99 | return !seg.empty(); 100 | } 101 | 102 | } 103 | #endif //CPPUTIL_INCLUDE_ALIGNMENT_H_ 104 | -------------------------------------------------------------------------------- /bbcpputil/include/AlignmentConsensus.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/19/20. 
3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_ALIGNMENTCONSENSUS_H_ 6 | #define CPPUTIL_INCLUDE_ALIGNMENTCONSENSUS_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "BamRecordExt.h" 13 | #include "Alignment.h" 14 | 15 | namespace cpputil { 16 | 17 | inline void find_insert_(const SeqLib::Cigar &cigar, int left_cursor, std::map &ins_len) { 18 | for (auto it = cigar.begin(); it != cigar.end(); ++it) { 19 | if (it->Type() == 'H') { 20 | continue; 21 | } else if (it->Type() == 'I') { 22 | ins_len[left_cursor] = std::max(ins_len[left_cursor], (int) it->Length()); 23 | } else { 24 | left_cursor += it->Length(); 25 | } 26 | } 27 | } 28 | 29 | 30 | std::string GetConsensusTemplate(const Segments& segs, int32_t& ref_most_left); 31 | 32 | std::pair 33 | GetGappedSeqAndQual(const SeqLib::BamRecord &r, const int start, const std::string consensus_template); 34 | 35 | std::string MergePairSeq(const Segments &segs, const std::vector& seqs, bool trim_overhang); 36 | std::string MergePair(const Segments &segs, bool trim_overhang); 37 | std::pair, std::vector> GetPairPileup(const Segments &segs); 38 | 39 | std::pair PairConsensus(const Segments &segs, const std::vector& seqs, 40 | bool trim_overhang, int qcutoff, std::vector& out_quals); 41 | 42 | std::pair PairSeqConsensus(const Segments &seg, bool trim_overhang, int qcutoff); 43 | } 44 | 45 | #endif //CPPUTIL_INCLUDE_ALIGNMENTCONSENSUS_H_ 46 | -------------------------------------------------------------------------------- /bbcpputil/include/BamRecordExt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/10/21. 3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_BAMRECORDEXT_CPP_H_ 6 | #define CPPUTIL_INCLUDE_BAMRECORDEXT_CPP_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "SeqLib/RefGenome.h" 12 | namespace cpputil { 13 | 14 | class BamPileup { 15 | public: 16 | SeqLib::BamRecord bam; 17 | int32_t qpos; 18 | int indel, level; 19 | uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; 20 | BamPileup(const bam_pileup1_t *pi): bam(pi->b) { 21 | qpos = pi->qpos; 22 | indel = pi->indel; 23 | level = pi->level; 24 | is_del = pi->is_del; 25 | is_head = pi->is_head; 26 | is_refskip = pi->is_refskip; 27 | aux = pi->aux; 28 | } 29 | }; 30 | 31 | bool ProperPair(const SeqLib::BamRecord& bam); 32 | 33 | std::pair MatePositionAndPositionEndWithSoftClip(const SeqLib::BamRecord & bam); 34 | 35 | int32_t GetUnclippedFramgentLength(const SeqLib::BamRecord &b); 36 | 37 | std::pair MatePositionAndPositionEnd(const SeqLib::BamRecord & bam); 38 | 39 | //overlap len of paired end reads in the reference coordinate, excluding soft clipping 40 | int32_t InsertSize(const SeqLib::BamRecord & read1, const SeqLib::BamRecord& read2); 41 | 42 | std::pair ProperPairFramgentEndsWithSclip(const SeqLib::BamRecord &b); 43 | 44 | void PrintQual(const SeqLib::BamRecord &b); 45 | 46 | int32_t CountNOrLowQInMatchedBases(const SeqLib::BamRecord &b, const int qcutoff); 47 | 48 | void AddMatchedBasesToCycleCount( const SeqLib::BamRecord& b, 49 | std::vector& q0_cycle_count, 50 | std::vector& q30_cycle_count, 51 | int start = -1, 52 | int end = std::numeric_limits::max()); 53 | 54 | int32_t CountNBasesInAlignment(const SeqLib::BamRecord &b); 55 | 56 | int32_t NumSoftClip5End(const SeqLib::BamRecord &b); 57 | 58 | uint32_t GetNumNonIndelAlignedBases(const SeqLib::BamRecord &bam); 59 | int32_t GetNM(const SeqLib::BamRecord &bam); 60 | int32_t GetNMismatch(const SeqLib::BamRecord &bam, bool NisMM = 
false); 61 | bool HasClusteredMuts(const SeqLib::BamRecord &bam, const SeqLib::BamHeader& header, 62 | const SeqLib::RefGenome& refgenome, const int cutoff); 63 | 64 | bool GetBTag(const SeqLib::BamRecord&, const std::string&, std::vector&); 65 | 66 | int32_t GetNMismatchX(const SeqLib::BamRecord &bam); 67 | int GetFamilySize(const SeqLib::BamRecord &bam); 68 | int32_t IndelLen(const SeqLib::BamRecord &bam); 69 | 70 | int32_t GetTotalIndelLen(const SeqLib::BamRecord &bam); 71 | 72 | int SoftClip3end(SeqLib::BamRecord &bam); 73 | 74 | bool SoftClipBamRecord(SeqLib::BamRecord &bam); 75 | 76 | void MaskBaseBelowMinBq(SeqLib::BamRecord &bam, int32_t mbp); 77 | 78 | void TrimBamFromFragEnd(SeqLib::BamRecord &bam, int32_t mp, int32_t mate_position_end_with_sclip, int32_t end5, int32_t end3); 79 | 80 | void TrimPairFromFragEnd(SeqLib::BamRecord &left, SeqLib::BamRecord&right, int32_t n_trim); 81 | 82 | void TrimSingleFromFragEnd(SeqLib::BamRecord &bam, int32_t n_trim); 83 | 84 | int RefPosToQueryPos(const SeqLib::BamRecord &bam, const int refpos); 85 | 86 | bool IsPairOverlap(const SeqLib::BamRecord& one, const SeqLib::BamRecord& two); 87 | 88 | std::pair GetPairOverlapRStartAndRStop(const SeqLib::BamRecord& fwd, const SeqLib::BamRecord& rev); 89 | int EffFragLen(const std::vector&, int count_overhang); 90 | 91 | std::pair,std::pair> 92 | GetPairOverlapQStartAndQStop(const SeqLib::BamRecord& fwd, const SeqLib::BamRecord& rev); 93 | 94 | 95 | std::pair 96 | GetBamOverlapQStartAndQStop(const SeqLib::BamRecord& record, const SeqLib::GenomicRegion& gr); 97 | 98 | 99 | } //end namespace 100 | 101 | 102 | #endif //CPPUTIL_INCLUDE_BAMRECORDEXT_H_ 103 | -------------------------------------------------------------------------------- /bbcpputil/include/DNAUtils.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 12/20/19. 3 | // 4 | 5 | #ifndef REALIGN_INCLUDE_DNAUTILS_H_ 6 | #define REALIGN_INCLUDE_DNAUTILS_H_ 7 | #include 8 | #include 9 | #include 10 | 11 | namespace cpputil { 12 | 13 | inline char complement(char n) { 14 | switch (n) { 15 | case 'A':return 'T'; 16 | case 'T':return 'A'; 17 | case 'G':return 'C'; 18 | case 'C':return 'G'; 19 | case 'N':return 'N'; 20 | case 'n':return 'n'; 21 | case 'a':return 't'; 22 | case 't':return 'a'; 23 | case 'c':return 'g'; 24 | case 'g':return 'c'; 25 | } 26 | assert(false); 27 | return ' '; 28 | } 29 | 30 | inline std::string complementString(std::string x) { 31 | std::transform(std::begin(x), std::end(x), std::begin(x), complement); 32 | return x; 33 | } 34 | 35 | inline void reverse_complement(std::string &seq) { 36 | std::transform(std::begin(seq), std::end(seq), std::begin(seq), complement); 37 | for (int i = 0, j = seq.size() - 1; i < j; i++, j--) { 38 | std::swap(seq[i], seq[j]); 39 | } 40 | } 41 | 42 | inline void reverse(std::string &seq) { 43 | for (int i = 0, j = seq.size() - 1; i < j; i++, j--) { 44 | std::swap(seq[i], seq[j]); 45 | } 46 | } 47 | 48 | inline void PrintQualString(const std::string& qual, int min_bq = 20, int offset = 33) { 49 | std::string line1; 50 | std::string line2; 51 | std::string stat; 52 | for (unsigned i = 0; i < qual.size(); ++i) { 53 | int q = (int) qual[i] - offset; 54 | int div = q / 10; 55 | int reminder = q % 10; 56 | line1 += std::to_string(div); 57 | line2 += std::to_string(reminder); 58 | stat += q >= min_bq ? 
"*" : " "; 59 | } 60 | std::cout << line1 << std::endl; 61 | std::cout << line2 << std::endl; 62 | std::cout << stat << std::endl; 63 | } 64 | 65 | inline int TrimLowBQfromBack(const std::string& qual, char bq) { 66 | int i = qual.size(); 67 | while (i > 0 && qual[i-1] < bq) {i--;} 68 | return i; 69 | } 70 | 71 | inline int LastNfromBack(const std::string& seq) { 72 | //one pass the first not N from the back 73 | int i = seq.size(); 74 | while (i > 0 && seq[i-1] == 'N') {i--;} 75 | return i; 76 | } 77 | 78 | inline int FirstNotNfromFront(const std::string& seq) { 79 | int i = 0; 80 | while (i < (int) seq.size() && seq[i] == 'N') {i++;} 81 | return i; 82 | } 83 | 84 | } 85 | #endif //REALIGN_INCLUDE_DNAUTILS_H_ 86 | -------------------------------------------------------------------------------- /bbcpputil/include/FastxIO.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTA_IO 2 | #define FASTA_IO 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "FastxRecord.h" 11 | 12 | namespace cpputil { 13 | 14 | inline bool endswith(std::string const &value, std::string const &ending) { 15 | if (ending.size() > value.size()) return false; 16 | return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); 17 | } 18 | 19 | class FastqWriter { 20 | std::unique_ptr ofstream_; 21 | std::unique_ptr seqStream_; 22 | //std::mutex mtx_; 23 | public: 24 | FastqWriter() = default; 25 | FastqWriter(std::ofstream& f) : seqStream_(std::make_unique(f)) {} 26 | FastqWriter(const std::string& file) { 27 | open(file); 28 | } 29 | void open(std::ofstream& f) { 30 | seqStream_ = std::make_unique(f); 31 | } 32 | void open(const std::string& file) { 33 | ofstream_ = std::make_unique(file); 34 | seqStream_ = std::make_unique(*ofstream_); 35 | } 36 | void Write(const std::string &id, const std::string &seq, const std::string &qual) { 37 | seqan::CharString rid = id; 38 | seqan::CharString rseq = seq; 39 | seqan::CharString rqual = qual; 40 | //mtx_.lock(); 41 | seqan::writeRecord(*seqStream_, rid, rseq, rqual, seqan::Fastq()); 42 | //mtx_.unlock(); 43 | } 44 | 45 | void Write(const std::string &id, const std::string &seq) { 46 | std::string qual = std::string(seq.size(), 'I'); 47 | Write(id, seq, qual); 48 | } 49 | 50 | void Write(const FastxRecord &fxr) { 51 | if (fxr.qual.empty()) { 52 | Write(fxr.id, fxr.seq); 53 | } else { 54 | Write(fxr.id, fxr.seq, fxr.qual); 55 | } 56 | 57 | } 58 | }; 59 | 60 | class FastxReader { 61 | seqan::SeqFileIn seqfilein_; 62 | int ftype_; //0 for fasta, 1 for fastq, 2 for unknown 63 | public: 64 | FastxReader(std::string fastx) : seqfilein_(fastx.c_str()) { 65 | if (endswith(fastx, ".fq") or endswith(fastx, ".fastq") or 66 | endswith(fastx, ".fq.gz") or endswith(fastx, ".fastq.gz")) { 67 | ftype_ = 1; 68 | } else if (endswith(fastx, ".fa") or endswith(fastx, ".fasta") or 69 | endswith(fastx, ".fa.gz") or endswith(fastx, ".fasta.gz")) { 70 | ftype_ = 0; 71 | } else { 72 | ftype_ = 2; 73 | throw std::runtime_error("unknown file format " + fastx); 74 | } 75 | } 76 | 77 | bool yield(FastxRecord &record) { 78 | record.cleanup(); 79 | seqan::CharString id; 80 | seqan::CharString seq; 81 | seqan::CharString qual; 82 | 83 | if (seqan::atEnd(seqfilein_)) { 84 | return false; 85 | } 86 | if (ftype_ == 0) { 87 | seqan::readRecord(id, seq, seqfilein_); 88 | } else { 89 | seqan::readRecord(id, seq, qual, seqfilein_); 90 | } 91 | record = FastxRecord(seqan::toCString(id), seqan::toCString(seq), 
seqan::toCString(qual)); 92 | return true; 93 | } 94 | }; 95 | 96 | }//end namepsace 97 | #endif 98 | -------------------------------------------------------------------------------- /bbcpputil/include/FastxRecord.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 10/12/20. 3 | // 4 | 5 | #ifndef ADAPTERTRIM_CPPUTIL_INCLUDE_FASTXRECORD_H_ 6 | #define ADAPTERTRIM_CPPUTIL_INCLUDE_FASTXRECORD_H_ 7 | 8 | #include 9 | #include "StringUtils.h" 10 | #include "DNAUtils.h" 11 | 12 | namespace cpputil { 13 | 14 | inline std::string broad_name(const std::string &name) { 15 | auto fields = split(name, ":"); 16 | std::string res = fields[1]; 17 | for (unsigned i = 3; i < fields.size(); ++i) { 18 | res += ":" + fields[i]; 19 | } 20 | return res; 21 | } 22 | 23 | inline std::pair split_instrument_id_from_broad_name(const std::string &broad_name) { 24 | auto fields = split(broad_name, ":"); 25 | auto instrument_id = fields[0]; 26 | std::string the_rest = ""; 27 | for (unsigned i = 1; i < fields.size(); ++i) { 28 | the_rest += fields[i]; 29 | } 30 | return std::make_pair(instrument_id, std::stoull(the_rest)); 31 | } 32 | 33 | struct FastxRecord { 34 | std::string id; // full ID 35 | std::string seq; 36 | std::string qual; 37 | size_t name_idx = std::string::npos; 38 | FastxRecord() = default; 39 | FastxRecord(std::string i, std::string s, std::string q) : id(i), seq(s), qual(q) { 40 | name_idx = id.find(' '); 41 | if (name_idx == std::string::npos) { 42 | name_idx = id.find('/'); 43 | } 44 | }; 45 | 46 | FastxRecord(const SeqLib::BamRecord &br, bool duplex_umi = false) { 47 | id = br.Qname(); 48 | seq = br.Sequence(); 49 | qual = br.Qualities(); 50 | if (br.ReverseFlag()) { 51 | reverse_complement(seq); 52 | std::reverse(qual.begin(), qual.end()); 53 | } 54 | std::string umiseq; 55 | std::string umiqual; 56 | if (br.GetZTag("RX", umiseq)) { 57 | bool status = br.GetZTag("QX", umiqual); 58 | if (!status) { 59 | umiqual = std::string('I', umiseq.size()); 60 | } 61 | if (duplex_umi) { 62 | auto umiseqs = cpputil::split(umiseq, "-"); 63 | auto umi1_qual = umiqual.substr(0, umiseqs[0].size()); 64 | auto umi2_qual = umiqual.substr(umiseqs[0].size()+1, umiseqs[1].size()); 65 | if (br.FirstFlag()) { 66 | seq = umiseqs[0] + seq; 67 | qual = umi1_qual + qual; 68 | } else { 69 | seq = umiseqs[1] + seq; 70 | qual = umi2_qual + qual; 71 | } 72 | } else { 73 | seq = umiseq + seq; 74 | qual = umiqual + qual; 75 | } 76 | } 77 | } 78 | 79 | void update_id_with_umi(const std::string &umi) { 80 | auto name = id.substr(0, name_idx); 81 | auto suffix = id.substr(name_idx); 82 | id = name + "_" + umi + suffix; 83 | name_idx += umi.size() + 1; 84 | } 85 | 86 | virtual void cleanup() { 87 | id.clear(); 88 | seq.clear(); 89 | qual.clear(); 90 | name_idx = std::string::npos; 91 | } 92 | 93 | bool is_filtered() { 94 | std::size_t found = id.find_first_of(':', name_idx); 95 | if (found < id.size() - 1) { 96 | if (id[found+1] == 'Y') return true; 97 | } 98 | return false; 99 | } 100 | 101 | std::string index_barcode() const { 102 | std::size_t found = id.find_last_of(':'); 103 | if (found != std::string::npos) { 104 | return id.substr(found + 1); 105 | } else { 106 | return ""; 107 | } 108 | } 109 | 110 | std::string name() const { 111 | return (name_idx != std::string::npos ? 
id.substr(0, name_idx) : id); 112 | } 113 | 114 | //broad cannonicalized name,e.g., 115 | //"D00203:HCY5YBCX3200606:HCY5YBCX3:1:1105:4656:14095" 116 | //"HCY5YBCX3200606:1:1105:4656:14095" 117 | // Warning, should only work for illumina fastq read convention 118 | 119 | std::string broad_id() const { 120 | if (name_idx != std::string::npos) { 121 | return broad_name(this->name()) + " " + id.substr(name_idx); 122 | } else { 123 | return broad_name(this->name()); 124 | } 125 | } 126 | }; 127 | 128 | class AnnotatedSeq { 129 | std::string seq_; 130 | std::string qual_; 131 | public: 132 | AnnotatedSeq() = default; 133 | AnnotatedSeq(std::string s, std::string q) : seq_(s), qual_(q) { 134 | assert(s.size() == q.size()); 135 | }; 136 | bool empty() const { 137 | return seq_.size() == 0; 138 | } 139 | decltype(auto) qual() const { 140 | return (qual_); 141 | } 142 | decltype(auto) seq() const { 143 | return (seq_); 144 | } 145 | void cleanup() { 146 | seq_.clear(); 147 | qual_.clear(); 148 | } 149 | }; 150 | 151 | struct ExtFastxRecord : public FastxRecord { 152 | AnnotatedSeq adap5; 153 | AnnotatedSeq adap3; 154 | AnnotatedSeq umi; 155 | AnnotatedSeq trim3; 156 | std::string barcode; 157 | int tm = 255; //unsigned 158 | int rc_adpt = 0; 159 | 160 | void cleanup() override { 161 | tm = 255; 162 | rc_adpt = 0; 163 | FastxRecord::cleanup(); 164 | adap3.cleanup(); 165 | adap5.cleanup(); 166 | umi.cleanup(); 167 | trim3.cleanup(); 168 | barcode.clear(); 169 | } 170 | }; 171 | 172 | template 173 | Stream &operator<<(Stream &os, const FastxRecord &fxr) { 174 | if (fxr.qual.empty()) { 175 | os << ">" + fxr.id << "\n" << fxr.seq << "\n"; 176 | } else { 177 | os << "@" + fxr.id << "\n" << fxr.seq << "\n" << "+\n" << fxr.qual << "\n"; 178 | } 179 | return os; 180 | } 181 | 182 | } 183 | #endif //ADAPTERTRIM_CPPUTIL_INCLUDE_FASTXRECORD_H_ 184 | -------------------------------------------------------------------------------- /bbcpputil/include/Files.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/11/21. 3 | // 4 | #ifndef CPPUTIL_INCLUDE_FILES_H_ 5 | #define CPPUTIL_INCLUDE_FILES_H_ 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace cpputil { 12 | 13 | inline bool FileExist(const std::string &name) { 14 | std::ifstream f(name.c_str()); 15 | return f.good(); 16 | } 17 | } 18 | 19 | #endif //CPPUTIL_INCLUDE_FILES_H_ 20 | -------------------------------------------------------------------------------- /bbcpputil/include/GenomicRegionCollectionExt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 8/26/20. 3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_GENOMICREGIONCOLLECTIONEXT_H_ 6 | #define CPPUTIL_INCLUDE_GENOMICREGIONCOLLECTIONEXT_H_ 7 | #include 8 | #include 9 | #include 10 | #include "Seqlib/GenomicRegionCollection.h" 11 | namespace cpputil { 12 | 13 | } 14 | 15 | 16 | #endif //CPPUTIL_INCLUDE_GENOMICREGIONCOLLECTIONEXT_H_ 17 | -------------------------------------------------------------------------------- /bbcpputil/include/Gotoh.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 9/27/20. 3 | // 4 | 5 | #ifndef ADAPTERTRIM_CPPUTIL_INCLUDE_GOTOH_H_ 6 | #define ADAPTERTRIM_CPPUTIL_INCLUDE_GOTOH_H_ 7 | 8 | /* 9 | This code snippet finds all optimal paths using affine gap penalty. 
10 | So many public codes and textbooks implement or teach solutions leading to suboptimal solutions. 11 | Inspired by paper https://www.biorxiv.org/content/10.1101/031500v1.full.pdf, 12 | I decided to implement the correct algorithm which is documented at 13 | https://www.researchgate.net/publication/19580571_Optimal_sequence_alignment_using_affine_gap_costs 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | class Alignment { 25 | std::vector display; 26 | int nm = 0; 27 | public: 28 | struct Node { 29 | int i; 30 | int j; 31 | Node(int ii, int jj) : i(ii), j(jj) {} 32 | Node() = default; 33 | }; 34 | 35 | Alignment(const std::string& row, const std::string col, const std::vector& path) { 36 | std::string rowgap; 37 | std::string colgap; 38 | std::string visual; 39 | for (unsigned ii = 0; ii < path.size() - 1; ++ii) { 40 | const Node & cur = path[ii]; 41 | const Node & next = path[ii + 1]; 42 | if (cur.i == next.i + 1 && cur.j == next.j + 1) { 43 | rowgap += row[next.i]; 44 | colgap += col[next.j]; 45 | if (row[next.i] != col[next.j]) { 46 | nm++; 47 | visual += ' '; 48 | } else { 49 | visual += '|'; 50 | } 51 | } else if (cur.i == next.i && cur.j == next.j + 1) { 52 | nm ++; 53 | visual += ' '; 54 | rowgap += '-'; 55 | colgap += col[next.j]; 56 | } else if (cur.i == next.i + 1 && cur.j == next.j) { 57 | nm ++; 58 | visual += ' '; 59 | colgap += '-'; 60 | rowgap += row[next.j]; 61 | } 62 | } 63 | std::reverse(colgap.begin(), colgap.end()); 64 | std::reverse(visual.begin(), visual.end()); 65 | std::reverse(rowgap.begin(), rowgap.end()); 66 | display.push_back(colgap); 67 | display.push_back(visual); 68 | display.push_back(rowgap); 69 | } 70 | 71 | int NM() const { 72 | return nm; 73 | } 74 | friend std::ostream& operator<<(std::ostream&, const Alignment&); 75 | }; 76 | 77 | std::ostream& operator<<(std::ostream& os, const Alignment& align) { 78 | os << align.nm << '\n'; 79 | for (const std::string& s : align.display) { 80 | os << s << '\n'; 81 | } 82 | return os; 83 | } 84 | 85 | using std::vector; 86 | class AffineGap { 87 | using Node = Alignment::Node; 88 | /* 89 | * matrix as Reference at row top Query at column left 90 | */ 91 | std::string rowstring_; 92 | std::string colstring_; 93 | vector align_path_; 94 | vector> paths_; 95 | int nrow_; 96 | int ncol_; 97 | vector> R_; // match matrix 98 | vector> P_; // vertical insertion matrix 99 | vector> Q_; // horizontal deletion matrix 100 | vector> vert_whole_; // a 101 | vector> hori_whole_; // b 102 | vector> diag_whole_; // c 103 | vector> vert_top_half_; // d 104 | vector> vert_bottom_half_; // e 105 | vector> hori_left_half_; // f 106 | vector> hori_right_half_; // g 107 | 108 | //gap score = gap_open + gap_ext * gap_len 109 | const static int gap_open_ = -1; 110 | const static int gap_ext_ = -1; 111 | static int DiagScore_(const char& a, const char& b) { 112 | return a == b? 
0: -1; 113 | } 114 | 115 | public: 116 | AffineGap(const std::string& query, const std::string& ref): rowstring_(query), colstring_(ref), //align_path_(colstring_, rowstring_), 117 | nrow_((int) rowstring_.length() + 1), 118 | ncol_((int) colstring_.length() + 1), 119 | R_(vector>(nrow_, vector(ncol_))), 120 | P_(vector>(nrow_, vector(ncol_))), 121 | Q_(vector>(nrow_, vector(ncol_))), 122 | vert_whole_(vector>(nrow_ + 1, vector(ncol_ + 1))), 123 | hori_whole_(vector>(nrow_ + 1, vector(ncol_ + 1))), 124 | diag_whole_(vector>(nrow_ + 1, vector(ncol_ + 1))), 125 | vert_top_half_(vector>(nrow_ + 1, vector(ncol_ + 1))), 126 | vert_bottom_half_(vector>(nrow_ + 1, vector(ncol_ + 1))), 127 | hori_left_half_(vector>(nrow_ + 1, vector(ncol_ + 1))), 128 | hori_right_half_(vector>(nrow_ + 1, vector(ncol_ + 1))) 129 | { 130 | // init 131 | for (int j = 0; j < ncol_; ++j) { 132 | P_[0][j] = 2 * gap_open_ + std::max(ncol_, nrow_) * gap_ext_ - 1; // ensure a large number 133 | R_[0][j] = gap_open_ + j * gap_ext_; 134 | } 135 | for (int i = 0; i < nrow_; ++i) { 136 | Q_[i][0] = 2 * gap_open_ + std::max(ncol_, nrow_) * gap_ext_ - 1; // ensure a large number 137 | R_[i][0] = gap_open_ + i * gap_ext_; 138 | } 139 | R_[0][0] = 0; 140 | diag_whole_[nrow_][ncol_] = 1; 141 | 142 | //cost assignment 143 | for (int i = 0; i < nrow_; ++i) { 144 | for (int j = 0; j < ncol_; ++j) { 145 | if (i != 0) { 146 | P_[i][j] = gap_ext_ + std::max(P_[i-1][j], R_[i-1][j] + gap_open_); 147 | if (P_[i][j] == gap_ext_ + P_[i-1][j]) vert_top_half_[i-1][j] = 1; 148 | if (P_[i][j] == gap_ext_ + gap_open_ + R_[i-1][j]) vert_bottom_half_[i-1][j] = 1; 149 | } 150 | if (j != 0) { 151 | Q_[i][j] = gap_ext_ + std::max(Q_[i][j-1], R_[i][j-1] + gap_open_); 152 | if (Q_[i][j] == gap_ext_ + Q_[i][j-1]) hori_left_half_[i][j-1] = 1; 153 | if (Q_[i][j] == gap_ext_ + gap_open_ + R_[i][j-1]) hori_right_half_[i][j-1] = 1; 154 | } 155 | if (i != 0 && j != 0 ) { 156 | R_[i][j] = std::max(R_[i-1][j-1] + DiagScore_(colstring_[j-1], rowstring_[i-1]), std::max(Q_[i][j], P_[i][j])); 157 | if (R_[i][j] == R_[i-1][j-1] + DiagScore_(colstring_[j-1], rowstring_[i-1])) diag_whole_[i][j] = 1; 158 | } 159 | if (R_[i][j] == P_[i][j]) vert_whole_[i][j] = 1; 160 | if (R_[i][j] == Q_[i][j]) hori_whole_[i][j] = 1; 161 | } 162 | } 163 | // std::cout<< "cost assignment\t"; 164 | // std::cout << diag_whole_[nrow_-1][ncol_-1] << "\t" << vert_whole_[nrow_-1][ncol_-1] << "\t" << hori_whole_[nrow_-1][ncol_-1] << "\n"; 165 | 166 | //edge assignment 167 | for (int i = nrow_ - 1; i >= 0; --i) { 168 | for (int j = ncol_ - 1; j >= 0; --j) { 169 | if ((vert_whole_[i+1][j] == 0 || vert_bottom_half_[i][j] == 0) && 170 | (hori_whole_[i][j+1] == 0 || hori_right_half_[i][j] == 0) && 171 | diag_whole_[i+1][j+1] == 0) { 172 | // if (vert_bottom_half_[i][j] == 0 && vert_whole_[i+1][j] != 0) { 173 | // std::cerr << "vi: " << i << " vj: " << j << std::endl; 174 | // } 175 | // if (hori_whole_[i][j+1] != 0 && hori_right_half_[i][j] == 1) { 176 | // std::cerr << "hi: " << i << " hj: " << j << std::endl; 177 | // } 178 | vert_whole_[i][j] = 0; 179 | hori_whole_[i][j] = 0; 180 | diag_whole_[i][j] = 0; 181 | } 182 | if (vert_whole_[i+1][j] == 0 && 183 | hori_whole_[i][j+1] == 0 && 184 | diag_whole_[i+1][j+1] == 0) { 185 | continue; 186 | } else { 187 | if ( vert_whole_[i+1][j] == 1 && vert_top_half_[i][j] == 1) { 188 | vert_top_half_[i+1][j] = 1 - vert_bottom_half_[i][j]; 189 | vert_bottom_half_[i][j] = 1 - vert_whole_[i][j]; 190 | vert_whole_[i][j] = 1; 191 | } else { 192 | 
vert_top_half_[i+1][j] = 0; 193 | vert_bottom_half_[i][j] = 0; 194 | } 195 | 196 | if ( hori_whole_[i][j + 1] == 1 && hori_left_half_[i][j] == 1) { 197 | hori_left_half_[i][j+1] = 1 - hori_right_half_[i][j]; 198 | hori_right_half_[i][j] = 1 - hori_whole_[i][j]; 199 | hori_whole_[i][j] = 1; 200 | } else { 201 | hori_left_half_[i][j+1] = 0; 202 | hori_right_half_[i][j] = 0; 203 | } 204 | } 205 | } 206 | } 207 | // backtrack by bit array matrics 208 | DFS(Node(nrow_ -1, ncol_ -1), 0); 209 | } 210 | 211 | void DFS(const Node& cn, const int must_go_dir) { 212 | // must_go_dir, 0: not required, 1: must go left, 2: must go above 213 | auto prev = align_path_.empty() ? Node(0,0) : align_path_.back(); 214 | align_path_.push_back(cn); 215 | if (cn.i == 0 && cn.j == 0) { 216 | paths_.push_back(align_path_); 217 | } 218 | else { 219 | if (must_go_dir == 1) { 220 | int next_must_go = hori_whole_[cn.i][cn.j] && hori_left_half_[cn.i][cn.j] ? 1 : 0; 221 | DFS(Node(cn.i, cn.j-1), next_must_go); 222 | } 223 | else if (must_go_dir == 2){ 224 | int next_must_go = vert_whole_[cn.i][cn.j] && vert_top_half_[cn.i][cn.j] ? 2 : 0; 225 | DFS(Node(cn.i-1, cn.j), next_must_go); 226 | } 227 | else { 228 | if (diag_whole_[cn.i][cn.j]) { 229 | DFS(Node(cn.i - 1, cn.j - 1), 0); 230 | } 231 | if (vert_whole_[cn.i][cn.j]) { 232 | if (vert_bottom_half_[cn.i][cn.j]) { 233 | if (cn.i + 1 != prev.i || cn.j != prev.j) return; 234 | } 235 | int next_must_go = vert_top_half_[cn.i][cn.j] ? 2 : 0; 236 | DFS(Node(cn.i - 1, cn.j), next_must_go); 237 | } 238 | if (hori_whole_[cn.i][cn.j]) { 239 | if (hori_right_half_[cn.i][cn.j]) { 240 | if (cn.i != prev.i || cn.j + 1 != prev.j) return; 241 | } 242 | int next_must_go = hori_left_half_[cn.i][cn.j] ? 1 : 0; 243 | DFS(Node(cn.i, cn.j - 1), next_must_go); 244 | } 245 | } 246 | } 247 | align_path_.pop_back(); 248 | } 249 | 250 | decltype(auto) Paths() const { 251 | return (paths_); 252 | } 253 | 254 | decltype(auto) Path() const { 255 | return paths_.front(); 256 | } 257 | 258 | void PrintAllPaths() const { 259 | // int i = 2, j=4; 260 | // std::cout<< "edge assignment\n"; 261 | // std::cout << vert_whole_[i][j] << "\t" << hori_whole_[i][j] << "\t" << diag_whole_[i][j] << "\t"; 262 | // std::cout << vert_top_half_[i][j] << "\t" << vert_bottom_half_[i][j] << "\t" << hori_left_half_[i][j] << "\t" << hori_right_half_[i][j]; 263 | // std::cout << "\n"; 264 | for (auto& p : paths_) { 265 | Alignment a(rowstring_, colstring_, p); 266 | std::cerr << a; 267 | std::cerr << '\n'; 268 | } 269 | }; 270 | 271 | // void Print() const { 272 | // std::cerr << "R\n"; 273 | // std::cerr << R_; 274 | // std::cerr << "P\n"; 275 | // std::cerr << P_; 276 | // std::cerr << "Q\n"; 277 | // std::cerr << Q_; 278 | // std::cerr << "a\n"; 279 | // std::cerr << vert_whole_; // a 280 | // std::cerr << "b\n"; 281 | // std::cerr << hori_whole_; // b 282 | // std::cerr << "c\n"; 283 | // std::cerr << diag_whole_; // c 284 | // std::cerr << "d\n"; 285 | // std::cerr << vert_top_half_; // d 286 | // std::cerr << "e\n"; 287 | // std::cerr << vert_bottom_half_; // e 288 | // std::cerr << "f\n"; 289 | // std::cerr << hori_left_half_; // f 290 | // std::cerr << "g\n"; 291 | // std::cerr << hori_right_half_; // g 292 | // } 293 | 294 | }; 295 | 296 | #endif //ADAPTERTRIM_CPPUTIL_INCLUDE_GOTOH_H_ 297 | -------------------------------------------------------------------------------- /bbcpputil/include/Insert.h: -------------------------------------------------------------------------------- 1 | #ifndef INSERT_SEQ_H 2 | #define 
INSERT_SEQ_H 3 | #include 4 | //#include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "AlignmentConsensus.h" 14 | 15 | 16 | using std::list; 17 | using std::vector; 18 | using std::pair; 19 | using std::string; 20 | 21 | namespace cpputil { 22 | 23 | class InsertSeq { 24 | 25 | using BamPointer = std::shared_ptr; 26 | using BamRecord = SeqLib::BamRecord; 27 | //Switch to SeqLib::BamRecord 28 | typedef Segments::iterator iterator; 29 | typedef Segments::const_iterator const_iterator; 30 | 31 | Segments inprogress_, invalid_, ambiguous_; 32 | vector paired_; // segment size is two 33 | string seqid_; 34 | 35 | struct SeqQual { 36 | string seq; 37 | string qual; 38 | SeqQual(string s, string q) : seq(s), qual(q) {} 39 | }; 40 | 41 | bool IsMate_(const bam1_t *bam, const bam1_t *mate) const { 42 | /* Modified based on Rsamtool src/Template.h 43 | //https://github.com/Bioconductor/Rsamtools 44 | 45 | // is_mate checks the following bit flags: 46 | // 1. Bit 0x40 and 0x80: Segments are a pair of first/last OR 47 | // neither segment is marked first/last 48 | // 2. Bit 0x100: Both segments are secondary OR both not secondary 49 | // 3. Bit 0x10 and 0x20: Strand flag 0x20 of one mate must match strand 50 | // flag 0x10 of the other mate and vice versa 51 | // 4. Bit 0x2: Both proper OR both not proper 52 | // 5. mpos match: 53 | // bit 0x10 of rec1 == bit 0x20 of rec2 AND 54 | // bit 0x10 or rec2 == bit 0x20 of rec1 55 | // segment2 mpos matches segment1 pos 56 | // 6. Both Mapped; 57 | */ 58 | const bool bam_read1 = bam->core.flag & BAM_FREAD1; 59 | const bool mate_read1 = mate->core.flag & BAM_FREAD1; 60 | const bool bam_read2 = bam->core.flag & BAM_FREAD2; 61 | const bool mate_read2 = mate->core.flag & BAM_FREAD2; 62 | const bool bam_secondary = bam->core.flag & BAM_FSECONDARY; 63 | const bool mate_secondary = mate->core.flag & BAM_FSECONDARY; 64 | const bool bam_proper = bam->core.flag & BAM_FPROPER_PAIR; 65 | const bool mate_proper = mate->core.flag & BAM_FPROPER_PAIR; 66 | const bool bam_rev = bam->core.flag & BAM_FREVERSE; 67 | const bool mate_rev = mate->core.flag & BAM_FREVERSE; 68 | const bool bam_mrev = bam->core.flag & BAM_FMREVERSE; 69 | const bool mate_mrev = mate->core.flag & BAM_FMREVERSE; 70 | const bool bam_unmap = bam->core.flag & BAM_FUNMAP; 71 | const bool mate_unmap = bam->core.flag & BAM_FMUNMAP; 72 | const uint32_t 73 | pos = bam->core.pos, 74 | mpos = bam->core.mpos, 75 | mate_pos = mate->core.pos, 76 | mate_mpos = mate->core.mpos; 77 | return 78 | ((bam_read1 ^ bam_read2) && (mate_read1 ^ mate_read2)) && 79 | (bam_read1 != mate_read1) && 80 | (bam_secondary == mate_secondary) && 81 | (((bam_rev != mate_mrev) && (bam_mrev != mate_rev)) || 82 | ((bam_rev == mate_mrev) && (bam_mrev == mate_rev))) && 83 | (bam_proper == mate_proper) && 84 | !bam_unmap && 85 | !mate_unmap && 86 | (pos == mate_mpos) && (mpos == mate_pos) && 87 | (bam->core.mtid == mate->core.tid); 88 | } 89 | 90 | void add_to_paired(BamRecord bam, BamRecord mate) { 91 | // keep the order of R1 then R2 92 | if (bam.FirstFlag()) { 93 | Segments tmp{bam, mate}; 94 | paired_.emplace_back(tmp); 95 | } else { 96 | Segments tmp{mate, bam}; 97 | paired_.emplace_back(tmp); 98 | } 99 | } 100 | 101 | static int32_t GetBreakPointCorrection_(const int32_t stop, const SeqLib::Cigar &cigar) { 102 | int32_t num_refbase_consumed = 0; 103 | int32_t correction = 0; 104 | assert(std::distance(cigar.begin(), cigar.end()) > 1); 105 | for (auto it = 
cigar.begin(); it != cigar.end(); ++it) { 106 | if (num_refbase_consumed > stop) break; 107 | if (it->Type() == 'I') { 108 | correction += it->Length(); 109 | } else if (it->Type() == 'D') { 110 | num_refbase_consumed += it->Length(); 111 | correction -= it->Length(); 112 | } else if (it->Type() == 'H') { 113 | continue; 114 | } else { 115 | num_refbase_consumed += it->Length(); 116 | } 117 | } 118 | return correction; 119 | } 120 | 121 | static pair GetSegmentOverhang(const Segments &seg) { 122 | int32_t left_front = seg.front().PositionWithSClips(); 123 | int32_t left_end = seg.front().PositionEndWithSClips(); 124 | int32_t right_front = seg.back().PositionWithSClips(); 125 | int32_t right_end = seg.back().PositionEndWithSClips(); 126 | // if (left_front > right_front) { 127 | // DEBUG(seg.front()) 128 | // DEBUG(seg.back()); 129 | // } 130 | assert(left_front <= right_front); 131 | assert(left_end <= right_end); 132 | 133 | int32_t left_break = right_front - left_front; 134 | int32_t right_break = left_end - right_front + 1; 135 | left_break += GetBreakPointCorrection_(left_break, seg.front().GetCigar()); 136 | right_break += GetBreakPointCorrection_(right_break, seg.back().GetCigar()); 137 | SeqQual left_oh(seg.front().Sequence().substr(0, left_break), seg.front().Qualities().substr(0, left_break)); 138 | SeqQual right_oh(seg.back().Sequence().substr(right_break - 1, seg.back().Sequence().size() - right_break + 1), 139 | seg.back().Qualities().substr(right_break - 1, seg.back().Qualities().size() - right_break + 1)); 140 | 141 | return std::make_pair(left_oh, right_oh); 142 | } 143 | 144 | // static bool IsSorted(const Segments &seg) { 145 | // return seg.front().PositionWithSClips() <= seg.back().PositionWithSClips(); 146 | // } 147 | 148 | 149 | // static string GetConsensusTemplate(const Segments &seg) { 150 | // /* 151 | // * '~' : uninitialized 152 | // * '+' : insertion 153 | // * '-' : deletion 154 | // */ 155 | // assert(IsSorted(seg)); 156 | // const SeqLib::Cigar left_cigar = seg.front().GetCigar(); 157 | // const SeqLib::Cigar right_cigar = seg.back().GetCigar(); 158 | // int ref_span = std::max(seg.back().PositionEndWithSClips(), seg.front().PositionEndWithSClips()) 159 | // - seg.front().PositionWithSClips(); 160 | // std::map ins_len; 161 | // find_insert_(left_cigar, 0, ins_len); 162 | // find_insert_(right_cigar, seg.back().PositionWithSClips() - seg.front().PositionWithSClips(), ins_len); 163 | // string consens; 164 | // int b = 0; 165 | // for (const auto pos_len : ins_len) { 166 | // consens += string(pos_len.first - b, '~'); 167 | // consens += string(pos_len.second, '+'); 168 | // b = pos_len.first; 169 | // } 170 | // consens += string(ref_span - b, '~'); 171 | // return consens; 172 | // } 173 | 174 | 175 | public: 176 | InsertSeq() = default; 177 | InsertSeq(BamRecord br) { 178 | add(br); 179 | } 180 | 181 | void add(BamRecord br) { 182 | inprogress_.push_back(br); 183 | } 184 | 185 | decltype(auto) paired() const { 186 | return (paired_); 187 | } 188 | 189 | decltype(auto) inprogress() const { 190 | return (inprogress_); 191 | } 192 | 193 | decltype(auto) forward_segments() const { 194 | Segments forward; 195 | for (const auto &read : inprogress_) { 196 | if (read.ReverseFlag()) { 197 | continue; 198 | } else { 199 | forward.push_back(read); 200 | } 201 | } 202 | return forward; 203 | } 204 | 205 | decltype(auto) reverse_segments() const { 206 | Segments reverse; 207 | for (const auto &read : inprogress_) { 208 | if (read.ReverseFlag()) { 209 | 
reverse.push_back(read); 210 | } 211 | } 212 | return reverse; 213 | } 214 | 215 | bool empty() const { 216 | return inprogress_.empty() && invalid_.empty() && ambiguous_.empty() && paired_.empty(); 217 | } 218 | 219 | void Mate() { 220 | /* Adapted from Rsamtool src/Template.h 221 | //https://github.com/Bioconductor/Rsamtools 222 | */ 223 | // This is O(n^2) where n is the number of reads. This does not work for large n. 224 | 225 | // Mate paired bam records to segments. Segments are non-overlap intervals on genome. 226 | const int unmated = -1, multiple = -2, processed = -3; 227 | vector > 228 | status(inprogress_.size(), 229 | pair(unmated, BamRecord())); 230 | Segments::iterator it0; 231 | 232 | // identify unambiguous and ambiguous mates 233 | it0 = inprogress_.begin(); 234 | for (unsigned int i = 0; i < inprogress_.size(); ++i) { 235 | status[i].second = *it0; 236 | Segments::iterator it1 = it0; 237 | for (unsigned int j = i + 1; j < inprogress_.size(); ++j) { 238 | ++it1; 239 | if (IsMate_(it0->raw(), it1->raw())) { 240 | status[i].first = status[i].first == unmated ? j : multiple; 241 | status[j].first = status[j].first == unmated ? i : multiple; 242 | } 243 | } 244 | ++it0; 245 | } 246 | 247 | // process unambiguous and ambiguous mates 248 | for (unsigned int i = 0; i < status.size(); ++i) { 249 | if (status[i].first == unmated) 250 | continue; 251 | if (status[i].first >= 0 && status[status[i].first].first >= 0) { 252 | // unambiguous mates 253 | add_to_paired(status[i].second, status[status[i].first].second); 254 | status[status[i].first].first = processed; 255 | status[i].first = processed; 256 | } else if (status[i].first != processed) { 257 | // ambiguous mates, added to 'ambiguous' queue 258 | ambiguous_.push_back(status[i].second); 259 | status[i].first = processed; 260 | } 261 | ++it0; 262 | } 263 | 264 | // remove segments that have been assigned to paired or 265 | // ambiguous queue 266 | it0 = inprogress_.begin(); 267 | for (unsigned int i = 0; i != status.size(); ++i) { 268 | if (status[i].first == processed) { 269 | it0 = inprogress_.erase(it0); 270 | } else { 271 | ++it0; 272 | } 273 | } 274 | } 275 | }; 276 | } 277 | #endif 278 | -------------------------------------------------------------------------------- /bbcpputil/include/InsertSeqFactory.h: -------------------------------------------------------------------------------- 1 | #ifndef INSERT_SEQ_FACTORY_H 2 | #define INSERT_SEQ_FACTORY_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "DNAUtils.h" 9 | 10 | #include "TargetLayout.h" 11 | #include "Alignment.h" 12 | #include "Insert.h" 13 | #include "BamRecordExt.h" 14 | 15 | #include "FastxIO.h" 16 | 17 | #ifndef NDEBUG 18 | #include 19 | #endif 20 | 21 | #ifndef NDEBUG 22 | # define DEBUG(x) do {std::cerr << x << std::endl;} while(0); 23 | #else 24 | # define DEBUG(x) do {} while (0) 25 | #endif 26 | 27 | using std::map; 28 | using std::string; 29 | 30 | namespace cpputil { 31 | class InsertSeqFactory { 32 | 33 | SeqLib::BamReader bam_reader_; 34 | map id_inserts_; 35 | map::iterator it_; 36 | int min_mapq_; 37 | bool load_supp_; 38 | bool load_secondary_; 39 | bool load_duplicate_; 40 | bool load_proper_pair_only_; 41 | bool clip3_ = false; 42 | int last_chr_; 43 | bool finished_ = false; 44 | int paired_end_library_ = 1; // 0 single end, 1 paired end 45 | 46 | bool passfilter(const SeqLib::BamRecord& b) { 47 | if (!b.MappedFlag()) return false; 48 | if (!load_supp_ && b.SupplementaryFlag()) return false; 49 | if (!load_secondary_ && 
b.SecondaryFlag()) return false; 50 | if (!load_duplicate_ && b.DuplicateFlag()) return false; 51 | if (load_proper_pair_only_ && !b.ProperPair()) return false; 52 | if (b.MapQuality() < min_mapq_) return false; 53 | return true; 54 | } 55 | 56 | std::pair loadrecord(int32_t stop_at = INT32_MAX) { 57 | //load a single record 58 | SeqLib::BamRecord b; 59 | string qname; 60 | int chrid; 61 | while (true) { 62 | auto ret = bam_reader_.GetNextRecord(b); 63 | if (!ret) { 64 | return std::make_pair(string(), -1); 65 | } 66 | if (!b.MappedFlag()) continue; 67 | if (b.Position() > stop_at) { 68 | //Stop and this read is not added 69 | return std::make_pair(b.Qname(), -1); 70 | } 71 | if (!passfilter(b)) continue; 72 | qname = b.Qname(); 73 | chrid = b.ChrID(); 74 | if (clip3_) { 75 | cpputil::SoftClip3end(b); 76 | } 77 | 78 | auto itfind = id_inserts_.find(b.Qname()); 79 | if (itfind == id_inserts_.end()) { 80 | id_inserts_.emplace(b.Qname(), b); 81 | } else { 82 | itfind->second.add(b); 83 | } 84 | break; 85 | } 86 | return std::make_pair(qname, chrid); 87 | } 88 | 89 | bool yieldfrags(bool (*Selector)(const Segments &, int param), 90 | const bool load_unpair, 91 | const int pair_min_ol, 92 | const std::string uid_tag_name, 93 | std::vector>& ret) { 94 | if (id_inserts_.empty()) return false; 95 | it_ = id_inserts_.begin(); 96 | std::map> uid_to_segs; 97 | int dummy = 1; // psudo uid 98 | while (true) {// Yield a group of read(s), either paired, single, or family until no more reads. 99 | std::vector recs; 100 | if (paired_end_library_) { 101 | if (load_unpair) { 102 | recs = YieldPairAndUnpair(); 103 | } else { 104 | recs = YieldPair(Selector, pair_min_ol); 105 | } 106 | } else { 107 | recs = YieldSingle(); 108 | } 109 | if (recs.empty()) break; 110 | if (not load_duplicate_) { 111 | for (auto& rec : recs) { 112 | uid_to_segs[std::to_string(dummy++)].push_back(rec); 113 | } 114 | } else { 115 | for (auto& rec : recs) { 116 | std::string uid = GetUid(rec, uid_tag_name); 117 | uid_to_segs[uid].push_back(rec); 118 | } 119 | } 120 | } 121 | ret.reserve(uid_to_segs.size()); 122 | for (auto& it: uid_to_segs) { 123 | ret.push_back(it.second); 124 | } 125 | id_inserts_.clear(); 126 | return true; 127 | } 128 | 129 | public: 130 | //InsertSeqFactory() = delete; 131 | InsertSeqFactory() = default; 132 | 133 | InsertSeqFactory(const std::string &bam, int mapq, bool load_supp, bool load_sec, bool load_duplicate, bool load_proper_pair_only, bool clip3): 134 | min_mapq_(mapq), 135 | load_supp_(load_supp), 136 | load_secondary_(load_sec), 137 | load_duplicate_(load_duplicate), 138 | load_proper_pair_only_(load_proper_pair_only), 139 | clip3_(clip3), 140 | last_chr_(-1) { 141 | bam_reader_.Open(bam); 142 | SeqLib::BamRecord b; 143 | }; 144 | 145 | bool IsPairEndLib() const { 146 | return (paired_end_library_); 147 | } 148 | 149 | bool ReadByRegion(bool (*Selector)(const Segments &, int param), // selector 150 | const SeqLib::GenomicRegion& gr, 151 | std::vector>& ret, 152 | int pair_min_ol, // mim overlap between read1 and read2 153 | const std::string uid_tag_name, 154 | bool load_unpair) { 155 | //clearance 156 | if (not ret.empty()) ret.clear(); 157 | if (not id_inserts_.empty()) id_inserts_.clear(); 158 | bool stat = bam_reader_.SetRegion(gr); 159 | if (not stat) { 160 | std::cerr << gr << " not found" << std::endl; 161 | return false; 162 | } 163 | std::string readid; 164 | int chrid; 165 | while(true) { 166 | std::tie(readid, chrid) = loadrecord(gr.pos2); 167 | if (chrid == -1) break; 168 | } 169 | bool 
status = yieldfrags(Selector, load_unpair, pair_min_ol, uid_tag_name, ret); 170 | return status; 171 | } 172 | 173 | std::vector FetchReadNameSorted(bool load_unpair = false) { 174 | std::vector ret; 175 | SeqLib::BamRecord b; 176 | while(true) { 177 | bool has_read = bam_reader_.GetNextRecord(b); 178 | if (!has_read) { 179 | break; 180 | } 181 | if (!passfilter(b)) continue; 182 | if (clip3_) { 183 | cpputil::SoftClip3end(b); 184 | } 185 | auto itfind = id_inserts_.find(b.Qname()); 186 | if (itfind == id_inserts_.end()) { 187 | for(auto it : id_inserts_) { 188 | it.second.Mate(); 189 | for (auto seg: it.second.paired()) { 190 | ret.push_back(seg); 191 | } 192 | if (load_unpair) { 193 | for (auto bam: it.second.inprogress()) { 194 | Segments tmp(1, bam); 195 | ret.push_back(tmp); 196 | } 197 | } 198 | } 199 | id_inserts_.clear(); 200 | id_inserts_.emplace(b.Qname(), b); 201 | return ret; 202 | } else { 203 | itfind->second.add(b); 204 | } 205 | } 206 | if (!id_inserts_.empty()) { 207 | for(auto it : id_inserts_) { 208 | it.second.Mate(); 209 | for (auto seg: it.second.paired()) { 210 | ret.push_back(seg); 211 | } 212 | if (load_unpair) { 213 | for (auto bam: it.second.inprogress()) { 214 | Segments tmp(1, bam); 215 | ret.push_back(tmp); 216 | } 217 | } 218 | } 219 | id_inserts_.clear(); 220 | } else { 221 | finished_ = true; 222 | } 223 | return ret; 224 | } 225 | 226 | 227 | std::vector> ReadByChrom(bool (*Selector)(const Segments &, int param), // selector 228 | int pair_min_ol, // mim overlap between read1 and read2 229 | const std::string uid_tag_name = "", 230 | bool load_unpair = false) { 231 | std::vector> ret; 232 | if (finished()) return ret; 233 | while (true) { 234 | std::string readid; 235 | int chrid; 236 | // Load reads in to id_inserts_; 237 | std::tie(readid, chrid) = loadrecord(); 238 | if (last_chr_ != -1 && chrid != last_chr_) { // first read in a different chromosome (2nd and above) 239 | InsertSeq save; 240 | if (!readid.empty()) { // save this read from processing and put it back to id_inserts_ after processing 241 | save = id_inserts_[readid]; 242 | id_inserts_.erase(readid); 243 | } 244 | it_ = id_inserts_.begin(); 245 | 246 | yieldfrags(Selector, load_unpair, pair_min_ol, uid_tag_name, ret); 247 | 248 | if (!readid.empty()) { // Put back the saved read 249 | id_inserts_.emplace(readid, save); 250 | } 251 | last_chr_ = chrid; 252 | break; 253 | } 254 | if (chrid == -1) { 255 | finished_ = true; 256 | break; 257 | } 258 | last_chr_ = chrid; 259 | } 260 | return ret; 261 | } 262 | 263 | bool finished() { 264 | return finished_; 265 | } 266 | 267 | decltype(auto) bamheader() const { 268 | return (bam_reader_.Header()); 269 | } 270 | 271 | //Iterator 272 | std::vector YieldSingle() { 273 | std::vector res; 274 | if (it_ != id_inserts_.end()) { 275 | res.push_back(it_->second.inprogress()); 276 | ++it_; 277 | } 278 | return res; 279 | } 280 | 281 | //Iterator 282 | std::vector YieldPairAndUnpair() { 283 | std::vector res; 284 | for(;it_ != id_inserts_.end();) { 285 | it_->second.Mate(); 286 | for (auto seg: it_->second.paired()) { 287 | 288 | res.push_back(seg); 289 | } 290 | for (auto bam: it_->second.inprogress()) { 291 | Segments tmp(1, bam); 292 | res.push_back(tmp); 293 | } 294 | ++it_; 295 | if (!res.empty()) break; 296 | } 297 | return res; 298 | } 299 | 300 | //Iterator 301 | std::vector YieldPair(bool (*Selector)(const Segments &, int k), int k) { 302 | /* 303 | * Always yield R1 and R2 in order 304 | */ 305 | std::vector res; 306 | for (; it_ != 
id_inserts_.end();) { 307 | it_->second.Mate(); 308 | bool found = false; 309 | for (auto seg: it_->second.paired()) { 310 | if (Selector(seg, k)) { 311 | res.push_back(seg); 312 | found = true; 313 | } 314 | } 315 | ++it_; 316 | if (found) { 317 | break; 318 | } 319 | } 320 | return res; 321 | } 322 | 323 | // Only if both reads of FR pair pass the filter 324 | // std::vector YieldFamily(bool (*Selector)(const Segments &)) { 325 | // std::vector forward_reverse_bams; 326 | // for (; family_it_ != start_end_2_rnames_.end();) { 327 | // Segments forwards; 328 | // Segments reverses; 329 | // for (auto id : family_it_->second) { 330 | // auto f = id_inserts_[id].forward_segments(); 331 | // auto r = id_inserts_[id].reverse_segments(); 332 | // if (!f.empty() && !r.empty()) { 333 | // forwards.insert(forwards.end(), f.begin(), f.end()); 334 | // reverses.insert(reverses.end(), r.begin(), r.end()); 335 | // } 336 | // } 337 | // ++family_it_; 338 | // if (Selector(forwards) && Selector(reverses)) { 339 | // forward_reverse_bams.push_back(forwards); 340 | // forward_reverse_bams.push_back(reverses); 341 | // break; 342 | // } 343 | // } 344 | // return forward_reverse_bams; 345 | // } 346 | 347 | }; 348 | } 349 | 350 | #endif 351 | -------------------------------------------------------------------------------- /bbcpputil/include/MAF.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/26/20. 3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_MAF_H_ 6 | #define CPPUTIL_INCLUDE_MAF_H_ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "StringUtils.h" 13 | 14 | namespace cpputil { 15 | class MAFReader { 16 | std::ifstream in_; 17 | std::map >> records_; 18 | bool isopen_ = false; 19 | 20 | public: 21 | MAFReader() = default; 22 | 23 | MAFReader(const std::string& maf) { 24 | Open(maf); 25 | } 26 | 27 | bool IsOpen() const { 28 | return isopen_; 29 | } 30 | 31 | bool Open(const std::string& maf) { 32 | isopen_ = true; 33 | in_ = std::ifstream(maf); 34 | string line; 35 | //get header 36 | getline(in_, line, '\n'); 37 | if (in_.eof()) { 38 | return false; 39 | } 40 | std::vector fields; 41 | split_by_char(line, '\t', fields); 42 | for(auto &s : fields) { 43 | std::transform(s.begin(), s.end(), s.begin(), 44 | [](unsigned char c) -> unsigned char { return std::toupper(c); }); 45 | } 46 | int chr_idx = std::distance(fields.begin(), std::find(fields.begin(), fields.end(), "CHROMOSOME")); 47 | int start_idx = std::distance(fields.begin(), std::find(fields.begin(), fields.end(), "START_POSITION")); 48 | int alt_idx = std::distance(fields.begin(), std::find(fields.begin(), fields.end(), "TUMOR_SEQ_ALLELE2")); 49 | 50 | while (true) { 51 | line.clear(); 52 | fields.clear(); 53 | getline(in_, line, '\n'); 54 | if (in_.eof()) { 55 | break; 56 | } 57 | split_by_char(line, '\t', fields); 58 | auto it = records_.find(fields[chr_idx]); 59 | if ( it == records_.end()) { 60 | std::map> key = {{std::stoi(fields[start_idx]), std::vector(1, fields[alt_idx])}}; 61 | records_[fields[chr_idx]] = key; 62 | } else { 63 | auto it2 = it->second.find(std::stoi(fields[start_idx])); 64 | if (it2 == it->second.end()) { 65 | it->second[std::stoi(fields[start_idx])] = std::vector(1, fields[alt_idx]); 66 | } else { 67 | it2->second.push_back(fields[alt_idx]); 68 | } 69 | } 70 | } 71 | return true; 72 | } 73 | 74 | bool var_exist(const std::string& contig, const int32_t pos, std::string alt="") const { 75 | auto it = records_.find(contig); 76 
| if (it != records_.end()) { 77 | auto it2 = it->second.find(pos + 1); 78 | if (it2 != it->second.end()) { 79 | if (alt.empty()) return true; 80 | if (std::find(it2->second.begin(), it2->second.end(), alt) != it2->second.end()) { 81 | return true; 82 | } 83 | } 84 | } 85 | return false; 86 | } 87 | 88 | void Print() const { 89 | for (auto const& rec : records_) { 90 | for (auto const& pos_alts : rec.second) { 91 | for (auto const& alt : pos_alts.second) { 92 | std::cerr << rec.first << "\t" << pos_alts.first << "\t" << alt << std::endl; 93 | } 94 | } 95 | } 96 | } 97 | }; 98 | } 99 | 100 | #endif //CPPUTIL_INCLUDE_MAF_H_ 101 | -------------------------------------------------------------------------------- /bbcpputil/include/Stats.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 7/1/21. 3 | // 4 | 5 | #ifndef BBCPPUTIL_INCLUDE_STATS_H_ 6 | #define BBCPPUTIL_INCLUDE_STATS_H_ 7 | #include 8 | #include 9 | namespace cpputil { 10 | 11 | inline int GetMode(const std::vector& array) { 12 | /* 13 | * Return the mode of an integer vector. Intended for short or medium sized arrays 14 | * whose value range (max - min) is not too large, since one histogram bin is allocated 15 | * per value in that range. 16 | */ 17 | assert(not array.empty()); 18 | const auto ret = std::minmax_element(begin(array), end(array)); 19 | std::vector hist(*ret.second - *ret.first + 1); 20 | for (const auto& ii: array) { 21 | ++hist[ii - *ret.first]; 22 | } 23 | return std::max_element(hist.begin(), hist.end()) - hist.begin() + *ret.first; 24 | } 25 | 26 | 27 | } 28 | #endif //BBCPPUTIL_INCLUDE_STATS_H_ 29 | -------------------------------------------------------------------------------- /bbcpputil/include/StringUtils.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/26/20. 
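
The string helpers in this header back the CSV/TSV parsing used throughout the suite (demux's library file, MAFReader's columns). As a quick illustration of their semantics, here is a minimal standalone sketch (split_like and its main are illustrative names, not part of the original sources); note that, like cpputil::split, it skips empty tokens, whereas split_by_char keeps them:

#include <iostream>
#include <string>
#include <vector>

// Mimics cpputil::split: tokens between any of the delimiter chars; empty tokens are skipped.
static std::vector<std::string> split_like(const std::string& s, const std::string& delims) {
  std::vector<std::string> out;
  std::string::size_type last = s.find_first_not_of(delims, 0);
  std::string::size_type pos = s.find_first_of(delims, last);
  while (pos != std::string::npos || last != std::string::npos) {
    out.push_back(s.substr(last, pos - last));
    last = s.find_first_not_of(delims, pos);
    pos = s.find_first_of(delims, last);
  }
  return out;
}

int main() {
  // "SampleName,IndexBarcode1,IndexBarcode2" -> three fields, as demux.cpp expects.
  for (const auto& f : split_like("SampleName,IndexBarcode1,IndexBarcode2", ","))
    std::cout << f << '\n';
}
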
3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_STRINTUTILS_H_ 6 | #define CPPUTIL_INCLUDE_STRINTUTILS_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace cpputil { 16 | inline void split_by_char(const std::string &s, char c, 17 | std::vector &v) { 18 | int i = 0; 19 | int j = s.find(c); 20 | 21 | while (j >= 0) { 22 | v.push_back(s.substr(i, j - i)); 23 | i = ++j; 24 | j = s.find(c, j); 25 | 26 | if (j < 0) { 27 | v.push_back(s.substr(i, s.length())); 28 | } 29 | } 30 | } 31 | 32 | inline std::vector split(const std::string& s, const std::string& delims) 33 | { 34 | std::vector result; 35 | std::string::size_type lastPos = s.find_first_not_of(delims, 0); 36 | std::string::size_type pos = s.find_first_of(delims, lastPos); 37 | while (std::string::npos != pos || std::string::npos != lastPos) { 38 | result.push_back(s.substr(lastPos, pos - lastPos)); 39 | lastPos = s.find_first_not_of(delims, pos); 40 | pos = s.find_first_of(delims, lastPos); 41 | } 42 | return result; 43 | } 44 | 45 | /* 46 | * https://stackoverflow.com/questions/9277906/stdvector-to-string-with-custom-delimiter 47 | * By Shadow2531 48 | */ 49 | template 50 | std::string join(const T& v, const std::string& delim) { 51 | std::ostringstream s; 52 | for (const auto& i : v) { 53 | if (&i != &v[0]) { 54 | s << delim; 55 | } 56 | s << i; 57 | } 58 | return s.str(); 59 | } 60 | 61 | inline double entropy(const std::string& dna_seq) { 62 | std::map cnt; 63 | for (const char& d : dna_seq) { 64 | cnt[d]++; 65 | } 66 | std::vector vec; 67 | std::vector p; 68 | int tot = 0; 69 | for (auto it : cnt) { 70 | tot += it.second; 71 | vec.push_back(it.second); 72 | } 73 | p.resize(vec.size()); 74 | std::transform(vec.begin(), vec.end(), p.begin(), [&tot](double x) {return x / tot;}); 75 | std::transform(p.begin(), p.end(), p.begin(), [](double x) {return x * log2(x);}); 76 | return -std::accumulate(p.begin(), p.end(), 0.0); 77 | } 78 | 79 | } 80 | 81 | #endif //CPPUTIL_INCLUDE_STRINTUTILS_H_ 82 | -------------------------------------------------------------------------------- /bbcpputil/include/TargetLayout.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/9/20. 
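
The entropy() helper at the end of StringUtils.h above computes the Shannon entropy of the base composition, -sum(p * log2 p), a standard low-complexity measure for DNA sequence. A worked check of the formula (a standalone sketch with illustrative names, not from the original sources): two bases at equal frequency give 1 bit, four give 2 bits, and a homopolymer gives 0.

#include <cmath>
#include <iostream>
#include <map>
#include <string>

// Same formula as cpputil::entropy, written long-hand for clarity.
static double shannon_entropy(const std::string& dna) {
  std::map<char, int> cnt;
  for (char c : dna) ++cnt[c];
  double h = 0.0;
  for (const auto& kv : cnt) {
    const double p = static_cast<double>(kv.second) / dna.size();
    h -= p * std::log2(p);
  }
  return h + 0.0;  // normalize -0.0 from the homopolymer case
}

int main() {
  std::cout << shannon_entropy("AACC") << '\n';  // 1
  std::cout << shannon_entropy("ACGT") << '\n';  // 2
  std::cout << shannon_entropy("AAAA") << '\n';  // 0
}
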
3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_TARGETLAYOUT_H_ 6 | #define CPPUTIL_INCLUDE_TARGETLAYOUT_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace cpputil{ 17 | 18 | class TargetLayout { 19 | size_t idx_ = 0; 20 | SeqLib::GenomicRegionVector ginvs_; 21 | 22 | bool _Load(const SeqLib::BamHeader& header, const std::string& bed_path) { 23 | SeqLib::GRC grc; 24 | bool ret = grc.ReadBED(bed_path, header); 25 | if (!ret) { 26 | throw std::runtime_error(bed_path + " cannot be read!"); 27 | } 28 | ginvs_ = grc.AsGenomicRegionVector(); 29 | std::cerr << "read " << ginvs_.size() << " regions\n"; 30 | return true; 31 | } 32 | 33 | public: 34 | TargetLayout() = default; 35 | TargetLayout(const SeqLib::BamHeader& header, const std::string& bed_path) { 36 | _Load(header, bed_path); 37 | } 38 | 39 | size_t NumRegion() const { 40 | return ginvs_.size(); 41 | } 42 | 43 | decltype(auto) operator[] (int i) const{ 44 | return (ginvs_.at(i)); 45 | } 46 | 47 | bool NextRegion(SeqLib::GenomicRegion & gr) { 48 | if (idx_ < ginvs_.size()) { 49 | gr = ginvs_[idx_]; 50 | ++idx_; 51 | return true; 52 | } else { 53 | return false; 54 | } 55 | } 56 | 57 | 58 | }; 59 | 60 | } 61 | #endif //CPPUTIL_INCLUDE_TARGETLAYOUT_H_ 62 | -------------------------------------------------------------------------------- /codec.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/10/21. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #ifndef PACKAGE_VERSION 11 | #define PACKAGE_VERSION "1.1.5" 12 | #endif 13 | 14 | int codec_demux(int argc, char **argv); 15 | int codec_trim(int argc, char **argv); 16 | int codec_filter(int argc, char **argv); 17 | int codec_accuracy(int argc, char **argv); 18 | //int codec_filter(int argc, char ** argv); 19 | 20 | 21 | int print_help() 22 | { 23 | std::cout<< "---------------------------------------------------\n"; 24 | std::cout<< "Program: codec (concatenating original duplex for error correction analysis suite)\n"; 25 | std::cout<< "Version: " << PACKAGE_VERSION << std::endl; 26 | std::cout<< "Usage: codec [options]\n"; 27 | std::cout<< "Common command: demux de-multiplexing.\n"; 28 | std::cout<< " trim trim CODEC adapter sequence.\n"; 29 | std::cout<< " call single fragment mutation caller.\n"; 30 | std::cout<< "---------------------------------------------------\n"; 31 | std::cout<< "Optional command:\n"; 32 | std::cout<< " filter Filter duplex reads on base and fragment levels.\n"; 33 | std::cout<< "---------------------------------------------------\n"; 34 | std::cout<< "Contact: ruolin@broadinstitute.org. " 35 | "Copyright: bloodbiopsy@broadinstitute.org 2020-2021. 
\n"; 36 | return 1; 37 | } 38 | 39 | int main(int argc, char *argv[]) { 40 | int ret; 41 | if (argc < 2) return print_help(); 42 | else if (strcmp(argv[1], "demux") == 0) ret = codec_demux(argc-1, argv+1); 43 | else if (strcmp(argv[1], "trim") == 0) ret = codec_trim(argc-1, argv+1); 44 | else if (strcmp(argv[1], "filter") == 0) ret = codec_filter(argc-1, argv+1); 45 | else if (strcmp(argv[1], "call") == 0) ret = codec_accuracy(argc-1, argv+1); 46 | // else if (strcmp(argv[1], "filter") == 0) ret = codec_filter(argc-1, argv+1); 47 | else { 48 | std::cerr << "[codec] unrecongnized command " << argv[1] << std::endl; 49 | print_help(); 50 | ret = 1; 51 | } 52 | return ret; 53 | } 54 | -------------------------------------------------------------------------------- /demux.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/13/20. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Index.h" 10 | #include "Files.h" 11 | 12 | using std::string; 13 | struct DemuxOptions { 14 | string library_file; 15 | string fastq1; 16 | string fastq2; 17 | string outprefix = "./test"; 18 | string reference; 19 | int index_begin = 3; 20 | int index_len = 18; 21 | int max_ed = 2; 22 | int min_readlen = 30; 23 | bool include_non_pf = false; 24 | bool verbose = false; 25 | bool out_unmatch = false; 26 | bool out_hopped = false; 27 | bool count_PF = false; 28 | }; 29 | 30 | 31 | static struct option demux_long_options[] = { 32 | {"library_param", required_argument, 0, 'p'}, 33 | {"q1", required_argument, 0, '1'}, 34 | {"q2", required_argument, 0, '2'}, 35 | {"outprefix", required_argument , 0, 'o'}, 36 | {"ref", required_argument , 0, 'r'}, 37 | {"index_begin", required_argument, 0, 'b'}, 38 | {"index_len", required_argument, 0, 'l'}, 39 | {"min_read_len", required_argument, 0, 'm'}, 40 | {"max_ed", required_argument, 0, 'e'}, 41 | {"verbose", no_argument , 0, 'v'}, 42 | {"out_unmatch", no_argument , 0, 'u'}, 43 | {"out_hopped", no_argument , 0, 'h'}, 44 | {"include_non_pf", no_argument , 0, 'i'}, 45 | {"count_pf", no_argument , 0, 'c'}, 46 | {0,0,0,0} 47 | }; 48 | 49 | const char* demux_short_options = "p:1:2:o:r:vib:l:e:cuh:m:"; 50 | 51 | void codec_demux_usage() 52 | { 53 | std::cerr<< "---------------------------------------------------\n"; 54 | std::cerr<< "Usage: codec demux [options]\n"; 55 | std::cerr<< "General Options:\n"; 56 | std::cerr<< "-p/--library_param, Sample, barcode mapping in CSV format. Header must be \"SampleName,IndexBarcode1,IndexBarcode2\"\n"; 57 | std::cerr<< "-1/--q1, Input read1\n"; 58 | std::cerr<< "-2/--q2, Input read2\n"; 59 | std::cerr<< "-b/--index_begin, The read position where the index begins (Default: 3) \n"; 60 | std::cerr<< "-l/--index_len, Index length (Default: 18)\n"; 61 | std::cerr<< "-m/--min_read_len, Minimum read length (Default: 30)\n"; 62 | std::cerr<< "-e/--max_ed, Maximum edit distance allowed as a match (Default: 2)\n"; 63 | std::cerr<< "-o/--outprefix, Output path, e.g., /tmp/test\n"; 64 | std::cerr<< "-r/--ref, Reference genome fasta file, for judging index hopping\n"; 65 | std::cerr<< "-i/--include_non_pf, Include non-pass filter reads\n"; 66 | std::cerr<< "-v/--verbose, Print verbose information\n"; 67 | std::cerr<< "-c/--count_pf, Just count number of pass filter pairs. 
Do not do anything else\n"; 68 | std::cerr<< "-u/--out_unmatch, Output reads having no matching barcodes\n"; 69 | std::cerr<< "-h/--out_hopped, Output hopped reads\n"; 70 | } 71 | 72 | int demux_parse_options(int argc, char* argv[], DemuxOptions& opt) { 73 | int option_index; 74 | int next_option = 0; 75 | do { 76 | next_option = getopt_long(argc, argv, demux_short_options, demux_long_options, &option_index); 77 | switch (next_option) { 78 | case -1:break; 79 | case 'p': 80 | opt.library_file = optarg; 81 | break; 82 | case '1': 83 | opt.fastq1 = optarg; 84 | break; 85 | case '2': 86 | opt.fastq2 = optarg; 87 | break; 88 | case 'b': 89 | opt.index_begin = atoi(optarg); 90 | break; 91 | case 'l': 92 | opt.index_len = atoi(optarg); 93 | break; 94 | case 'm': 95 | opt.min_readlen = atoi(optarg); 96 | break; 97 | case 'e': 98 | opt.max_ed = atoi(optarg); 99 | break; 100 | case 'o': 101 | opt.outprefix = optarg; 102 | break; 103 | case 'r': 104 | opt.reference = optarg; 105 | break; 106 | case 'i': 107 | opt.include_non_pf = true; 108 | break; 109 | case 'v': 110 | opt.verbose = true; 111 | break; 112 | case 'c': 113 | opt.count_PF = true; 114 | break; 115 | case 'h': 116 | opt.out_hopped = true; 117 | break; 118 | case 'u': 119 | opt.out_unmatch = true; 120 | break; 121 | default:codec_demux_usage(); 122 | return 1; 123 | } 124 | } while (next_option != -1); 125 | 126 | return 0; 127 | } 128 | 129 | int codec_demux(int argc, char ** argv) { 130 | DemuxOptions opt; 131 | int parse_ret = demux_parse_options(argc, argv, opt); 132 | if (parse_ret) return 1; 133 | if (argc == 1) { 134 | codec_demux_usage(); 135 | return 1; 136 | } 137 | //AffineGap ag("CACTGATCGTCAGCTGAC", "TGAATCTGAGGCACTGTA"); 138 | // AffineGap ag("AGT", "TGAGTT"); 139 | // ag.PrintAllPaths(); 140 | // exit(0); 141 | 142 | if (not cpputil::FileExist(opt.library_file)) { 143 | std::cerr << opt.library_file << " does not exist\n"; 144 | return 1; 145 | } 146 | CDS::IndexBarcode ibmatcher(opt.library_file, opt.outprefix, opt.max_ed, opt.out_unmatch, opt.out_hopped, opt.verbose); 147 | if (!opt.reference.empty()) { 148 | ibmatcher.LoadBwa(opt.reference); 149 | } 150 | cpputil::FastxReader R1_reader(opt.fastq1); 151 | cpputil::FastxReader R2_reader(opt.fastq2); 152 | cpputil::FastxRecord read1; 153 | cpputil::FastxRecord read2; 154 | uint64_t total_pf_reads = 0; 155 | uint64_t total_reads = 0; 156 | while (R1_reader.yield(read1)) { 157 | R2_reader.yield(read2); 158 | ++total_reads; 159 | if (not opt.include_non_pf and (read1.is_filtered() or read2.is_filtered())) continue; 160 | if (read1.seq.length() < opt.min_readlen or read2.seq.length() < opt.min_readlen) continue; 161 | ++total_pf_reads; 162 | //if (opt.count_PF) continue; 163 | assert(read1.name() == read2.name()); 164 | ibmatcher.DecodePair(read1, read2, opt.index_begin, opt.index_len); 165 | } 166 | // if (opt.count_PF) { 167 | uint64_t total_matched = ibmatcher.total_matched(); 168 | for (unsigned i = 0; i < ibmatcher.samples().size(); ++i) { 169 | string s = ibmatcher.samples()[i]; 170 | uint64_t n = ibmatcher.nmatched()[i]; 171 | std::cout << "#sample, matched, matched%: " << s << ", " << n << ", " << (double) n / total_matched << std::endl; 172 | } 173 | std::cout << "#total, #PF, #matched, matched%: " << total_reads << ", " << total_pf_reads << ", " << total_matched << ", " << (double) total_matched / total_pf_reads << std::endl; 174 | // } 175 | return 0; 176 | } 177 | -------------------------------------------------------------------------------- /include/BamIO.h: 
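
BamIO.h below builds unmapped BAM records by hand, which means packing each base into htslib's 4-bit code (A=1, C=2, G=4, T=8, anything else 15), two bases per byte with the first base in the high nibble. The following standalone sketch (illustrative only; the real CreateUBamRecord additionally sets qname, flags, qualities, and tags) shows the same bit manipulation and verifies it by unpacking:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static uint8_t base_code(char b) {
  switch (b) {
    case 'A': return 1;
    case 'C': return 2;
    case 'G': return 4;
    case 'T': return 8;
    default:  return 15;  // N and anything else
  }
}

int main() {
  const std::string seq = "ACGTN";
  std::vector<uint8_t> packed((seq.size() + 1) / 2, 0);
  for (size_t i = 0; i < seq.size(); ++i) {
    const unsigned shift = (~i & 1) << 2;  // 4 for even i (high nibble), 0 for odd i
    packed[i >> 1] &= static_cast<uint8_t>(~(0xF << shift));           // clear the target nibble
    packed[i >> 1] |= static_cast<uint8_t>(base_code(seq[i]) << shift);
  }
  const char* nt16 = "=ACMGRSVTWYHKDBN";  // standard BAM nibble-to-base table
  for (size_t i = 0; i < seq.size(); ++i)
    std::cout << nt16[(packed[i >> 1] >> ((~i & 1) << 2)) & 0xF];
  std::cout << '\n';  // prints ACGTN
}
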
-------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 7/13/20. 3 | // 4 | 5 | #ifndef ADAPTERTRIM_INCLUDE_BAMIO_H_ 6 | #define ADAPTERTRIM_INCLUDE_BAMIO_H_ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "BamRecordExt.h" 12 | #include "DNAUtils.h" 13 | #include "FastxRecord.h" 14 | 15 | namespace cpputil { 16 | 17 | class UnMappedBamWriter { 18 | SeqLib::BamWriter bam_writer_; 19 | std::string sample_; 20 | std::string rgid_; 21 | 22 | const SeqLib::BamRecord CreateUBamRecord(const ExtFastxRecord& fxr, bool first_read) { 23 | SeqLib::BamRecord out; 24 | out.init(); 25 | bam1_t* b = out.raw(); 26 | b->core.tid = -1; 27 | b->core.pos = -1; 28 | b->core.qual = 0; 29 | b->core.flag = first_read ? 77: 141; 30 | 31 | // set dumy mate 32 | b->core.mtid = -1; 33 | b->core.mpos = -1; 34 | b->core.isize = 0; 35 | 36 | // allocate all the data 37 | b->core.l_qname = fxr.name().length() + 1; 38 | b->core.l_qseq = fxr.seq.length(); //(seq.length()>>1) + seq.length() % 2; // 4-bit encoding 39 | b->l_data = b->core.l_qname + ((b->core.l_qseq+1)>>1) + (b->core.l_qseq); 40 | b->data = (uint8_t*)malloc(b->l_data); 41 | 42 | // allocate the qname 43 | memcpy(b->data, fxr.name().c_str(), fxr.name().length() + 1); 44 | 45 | // allocate the sequence 46 | uint8_t* m_bases = b->data + b->core.l_qname; 47 | 48 | // TODO move this out of bigger loop 49 | int slen = fxr.seq.length(); 50 | for (int i = 0; i < slen; ++i) { 51 | // bad idea but works for now 52 | uint8_t base = 15; 53 | if (fxr.seq.at(i) == 'A') 54 | base = 1; 55 | else if (fxr.seq.at(i) == 'C') 56 | base = 2; 57 | else if (fxr.seq.at(i) == 'G') 58 | base = 4; 59 | else if (fxr.seq.at(i) == 'T') 60 | base = 8; 61 | 62 | m_bases[i >> 1] &= ~(0xF << ((~i & 1) << 2)); ///< zero out previous 4-bit base encoding 63 | m_bases[i >> 1] |= base << ((~i & 1) << 2); ///< insert new 4-bit base encoding 64 | 65 | } 66 | if (!fxr.qual.empty() && fxr.qual.length() != (unsigned) b->core.l_qseq) 67 | throw std::invalid_argument("New quality score should be same as seq length"); 68 | 69 | // length of qual is always same as seq. 
If empty qual, just set first bit of qual to 0 70 | if (not fxr.qual.empty()) { 71 | char * q = strdup(fxr.qual.data()); 72 | for (size_t i = 0; i < fxr.qual.length(); ++i) 73 | q[i] -= 33; 74 | memcpy(bam_get_qual(b), q, fxr.qual.length()); // dont copy /0 terminator 75 | free(q); 76 | } 77 | 78 | out.AddIntTag("td", fxr.rc_adpt); 79 | if (not fxr.umi.empty()) { 80 | out.AddZTag("RX", fxr.umi.seq()); 81 | out.AddZTag("QX", fxr.umi.qual()); 82 | } 83 | if (not fxr.adap5.empty()) { 84 | out.AddZTag("s5", fxr.adap5.seq()); 85 | out.AddZTag("q5", fxr.adap5.qual()); 86 | } 87 | if (not fxr.adap3.empty()) { 88 | out.AddZTag("s3", fxr.adap3.seq()); 89 | out.AddZTag("q3", fxr.adap3.qual()); 90 | } 91 | if (not fxr.trim3.empty()) { 92 | out.AddZTag("sl", fxr.trim3.seq()); 93 | out.AddZTag("ql", fxr.trim3.qual()); 94 | } 95 | if (not rgid_.empty()) { 96 | out.AddZTag("RG", rgid_); 97 | } 98 | if (not fxr.barcode.empty()) { 99 | out.AddZTag("bc", fxr.barcode); 100 | } 101 | if (fxr.tm != 255) { 102 | out.AddIntTag("tm", fxr.tm); 103 | } 104 | return out; 105 | } 106 | 107 | const SeqLib::BamRecord CreateUBamRecord(const SeqLib::BamRecord &bam_template, std::string seq, std::string qual, bool single_end) { 108 | // seq and qual are assumed to be PLUS strand 109 | // if (not ProperPair(bam_template)) { 110 | // throw std::runtime_error("not a proper pair"); 111 | // } 112 | SeqLib::BamRecord out; 113 | out.init(); 114 | SeqLib::Cigar c; 115 | out.SetCigar(c); 116 | if (bam_template.ReverseFlag()) { 117 | reverse_complement(seq); 118 | reverse(qual); 119 | } 120 | out.SetQname(bam_template.Qname()); 121 | if (single_end) { 122 | out.raw()->core.flag = 4; 123 | } 124 | else { 125 | out.raw()->core.flag = bam_template.FirstFlag() ? 77: 141; 126 | } 127 | out.SetSequence(seq); 128 | out.SetQualities(qual, 33); 129 | out.SetChrID(-1); 130 | out.SetChrIDMate(-1); 131 | out.SetPosition(-1); 132 | out.SetPositionMate(-1); 133 | int32_t cD; 134 | std::string rg; 135 | int32_t cM; 136 | std::string mi; 137 | std::string rx; 138 | if (bam_template.GetIntTag("cD", cD)) { 139 | out.AddIntTag("cD", cD); 140 | } 141 | if (bam_template.GetZTag("RG", rg)) { 142 | out.AddZTag("RG", rg); 143 | } 144 | if (bam_template.GetIntTag("cM", cM)) { 145 | out.AddIntTag("cM", cM); 146 | } 147 | if (bam_template.GetZTag("MI", mi)) { 148 | out.AddZTag("MI", mi); 149 | } 150 | if (bam_template.GetZTag("RX", rx)) { 151 | out.AddZTag("RX", rx); 152 | } 153 | return out; 154 | } 155 | 156 | public: 157 | UnMappedBamWriter() = default; 158 | UnMappedBamWriter(std::string path, std::string rgid, std::string sample) : sample_(sample), rgid_(rgid) { 159 | std::string header_str = "@HD\tVN:1.5\tGO:none\n"; 160 | header_str += "@RG\tID:" + rgid_ + "\tSM:" + sample_ + "\n"; 161 | SeqLib::BamHeader bh(header_str); 162 | bam_writer_.Open(path); 163 | bam_writer_.SetHeader(bh); 164 | bam_writer_.WriteHeader(); 165 | } 166 | 167 | UnMappedBamWriter(std::string path, const SeqLib::BamHeader& tpl) { 168 | std::istringstream iss(tpl.AsString()); 169 | std::string line; 170 | std::string newhdr; 171 | while (std::getline(iss, line, '\n')) { 172 | if (line.length() == 0 || line.at(0) != '@') break; 173 | std::string t = line.substr(0, 3); 174 | if ( t == "@HD" || t == "@RG") { 175 | newhdr += line +"\n"; 176 | } 177 | } 178 | SeqLib::BamHeader bh(newhdr); 179 | bam_writer_.Open(path); 180 | bam_writer_.SetHeader(bh); 181 | bam_writer_.WriteHeader(); 182 | } 183 | 184 | void Open(std::string path, std::string rgid, std::string sample) { 185 | 
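// Note: Open() repeats the minimal-header construction from the two-argument
// constructor above: a bare @HD line plus a single @RG line carrying the
// read-group id and sample name, which is all the downstream pipeline needs
// from an unmapped BAM.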
std::string header_str = "@HD\tVN:1.5\tGO:none\n"; 186 | header_str += "@RG\tID:" + rgid + "\tSM:" + sample + "\n"; 187 | SeqLib::BamHeader bh(header_str); 188 | bam_writer_.Open(path); 189 | bam_writer_.SetHeader(bh); 190 | bam_writer_.WriteHeader(); 191 | } 192 | 193 | // void Init(std::string path, std::string readgroup, std::string sample) { 194 | // header_str += "@RG\tID:" 195 | // header_str += readgroup; 196 | // header_str += "\tSM:"; 197 | // header_str += sample; 198 | // header_str += "\n"; 199 | // 200 | // } 201 | 202 | ~UnMappedBamWriter() { 203 | bam_writer_.Close(); 204 | } 205 | bool IsOpen() { 206 | return bam_writer_.IsOpen(); 207 | } 208 | 209 | 210 | void WriteRecord(const SeqLib::BamRecord & R1, const SeqLib::BamRecord& R2, std::string seq1, std::string seq2, std::string qual1, std::string qual2) { 211 | // Write paired end records 212 | // R1, R2 must be strictly First in pair and, Second in pair 213 | if (not cpputil::ProperPair(R1) || not cpputil::ProperPair(R2)) { 214 | throw std::runtime_error("not a proper pair"); 215 | } 216 | auto r1 = CreateUBamRecord(R1, seq1, qual1, false); 217 | auto r2 = CreateUBamRecord(R2, seq2, qual2, false); 218 | //std::cout << out << std::endl; 219 | bool status1 = bam_writer_.WriteRecord(r1); 220 | bool status2 = bam_writer_.WriteRecord(r2); 221 | if (not status1 or not status2) { 222 | std::cerr << "cannot write bam record " << R1.Qname() << std::endl; 223 | } 224 | } 225 | 226 | //simply strip off mapping information and output ubam 227 | //this is used for intermolecular bams 228 | void WriteRecord(const SeqLib::BamRecord & R1, const SeqLib::BamRecord& R2) { 229 | auto r1 = CreateUBamRecord(R1, R1.Sequence(), R1.QualitySequence(), false); 230 | auto r2 = CreateUBamRecord(R2, R2.Sequence(), R2.QualitySequence(), false); 231 | bool status1 = bam_writer_.WriteRecord(r1); 232 | bool status2 = bam_writer_.WriteRecord(r2); 233 | if (not status1 or not status2) { 234 | std::cerr << "cannot write bam record " << R1.Qname() << std::endl; 235 | } 236 | } 237 | 238 | void WriteRecord(const SeqLib::BamRecord & R1, std::string seq, std::string qual) { 239 | // Write Single end record 240 | // R1, R2 must be strictly First in pair and, Second in pair 241 | if (not cpputil::ProperPair(R1)){ 242 | throw std::runtime_error("not a proper pair"); 243 | } 244 | auto out = CreateUBamRecord(R1, seq, qual, true); 245 | bool status = bam_writer_.WriteRecord(out); 246 | if (not status) { 247 | std::cerr << "cannot write bam record " << R1.Qname() << std::endl; 248 | } 249 | } 250 | 251 | void WriteRecord(ExtFastxRecord& fxr, bool first_read) { 252 | auto out = CreateUBamRecord(fxr, first_read); 253 | bool status = bam_writer_.WriteRecord(out); 254 | if (not status) { 255 | std::cerr << "cannot write bam record " << fxr.broad_id() << std::endl; 256 | } 257 | } 258 | }; 259 | 260 | } 261 | #endif //ADAPTERTRIM_INCLUDE_BAMIO_H_ 262 | -------------------------------------------------------------------------------- /include/Index.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 9/18/20. 
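
The demultiplexer in this header matches an observed index against every expected barcode while tracking the best and second-best edit distances; a sample is assigned when the best distance is within max_ed, or, as a rescue, when it is exactly max_ed+1 while the runner-up is more than max_ed+3 away (i.e., the call is still unambiguous). A standalone sketch of that accept/reject rule, using plain Hamming distance in place of the SSW alignment (hamming and match_index are illustrative names, not from the original sources):

#include <iostream>
#include <limits>
#include <string>
#include <vector>

static int hamming(const std::string& a, const std::string& b) {
  int d = 0;
  for (size_t i = 0; i < a.size() && i < b.size(); ++i) d += (a[i] != b[i]);
  return d;
}

// Mirrors IndexBarcode::MatchIndex's best/second-best logic; returns index or -1.
static int match_index(const std::string& obs, const std::vector<std::string>& barcodes, int max_ed) {
  int best = std::numeric_limits<int>::max(), second = best, best_i = -1;
  for (size_t i = 0; i < barcodes.size(); ++i) {
    const int d = hamming(obs, barcodes[i]);
    if (d < best) { second = best; best = d; best_i = static_cast<int>(i); }
    else if (d < second) { second = d; }
  }
  if (best <= max_ed) return best_i;
  if (best == max_ed + 1 && second > max_ed + 3) return best_i;  // unambiguous rescue
  return -1;
}

int main() {
  std::vector<std::string> bc = {"AACCGGTT", "TTGGCCAA"};
  std::cout << match_index("AACCGGTA", bc, 2) << '\n';  // 0 (one mismatch)
  std::cout << match_index("ACGTACGT", bc, 2) << '\n';  // -1 (too far from both)
}
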
3 | // 4 | 5 | #ifndef ADAPTERTRIM_INCLUDE_INDEX_H_ 6 | #define ADAPTERTRIM_INCLUDE_INDEX_H_ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "BamRecordExt.h" 18 | #include "StringUtils.h" 19 | #include "FastxIO.h" 20 | #include "Gotoh.h" 21 | #include "omp.h" 22 | 23 | namespace CDS { 24 | static void PrintAlignment(const StripedSmithWaterman::Alignment& alignment){ 25 | std::cout << "===== SSW result =====" << std::endl; 26 | std::cout << "Best Smith-Waterman score:\t" << alignment.sw_score << std::endl 27 | << "Next-best Smith-Waterman score:\t" << alignment.sw_score_next_best << std::endl 28 | << "Reference start:\t" << alignment.ref_begin << std::endl 29 | << "Reference end:\t" << alignment.ref_end << std::endl 30 | << "Query start:\t" << alignment.query_begin << std::endl 31 | << "Query end:\t" << alignment.query_end << std::endl 32 | << "Next-best reference end:\t" << alignment.ref_end_next_best << std::endl 33 | << "Number of mismatches:\t" << alignment.mismatches << std::endl 34 | << "Cigar: " << alignment.cigar_string << std::endl; 35 | std::cout << "======================" << std::endl; 36 | } 37 | int SSW(const std::string &ref, const std::string &query, bool verbose=false) { 38 | StripedSmithWaterman::Aligner aligner(2,0,2,1); 39 | //StripedSmithWaterman::Aligner aligner; 40 | 41 | StripedSmithWaterman::Filter filter; 42 | StripedSmithWaterman::Alignment alignment; 43 | aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment, 15); 44 | int ed = alignment.mismatches + alignment.query_begin + query.length() - alignment.query_end - 1; 45 | if (verbose) { 46 | PrintAlignment(alignment); 47 | std::cout << "edit: " << ed << std::endl; 48 | } 49 | return ed; 50 | } 51 | 52 | 53 | bool IsIntermol(const SeqLib::BWAWrapper& bwa, 54 | const std::string& name1, const std::string& seq1, const std::string& name2, const std::string& seq2) { 55 | SeqLib::BamRecordVector read1_bam; 56 | SeqLib::BamRecordVector read2_bam; 57 | bwa.AlignSequence(seq1, name1, read1_bam, false, -1, 0); 58 | bwa.AlignSequence(seq2, name2, read2_bam, false, -1, 0); 59 | if (read1_bam.empty() and read2_bam.empty()) { 60 | return false; 61 | } 62 | if (read1_bam.empty() or read2_bam.empty()) { 63 | return true; 64 | } 65 | // if (verbose) { 66 | // std::cerr << read1_bam[0] << std::endl; 67 | // std::cerr << read2_bam[0] << std::endl; 68 | // } 69 | auto is = cpputil::InsertSize(read1_bam[0], read2_bam[0]); 70 | if (is > 1000 || is == 0) return true; 71 | else return false; 72 | } 73 | 74 | class IndexBarcode { 75 | std::vector index1s_; 76 | std::vector index2s_; 77 | std::vector snames_; 78 | std::vector nmatched_; 79 | std::vector fq1_writers_; 80 | std::vector fq2_writers_; 81 | cpputil::FastqWriter unkfq1_writer_; 82 | cpputil::FastqWriter unkfq2_writer_; 83 | cpputil::FastqWriter hopfq1_writer_; 84 | cpputil::FastqWriter hopfq2_writer_; 85 | std::ifstream file_; 86 | SeqLib::BWAWrapper bwa_; 87 | int max_ed_; 88 | bool out_unmatched_; 89 | bool out_hopped_; 90 | bool verbose_; 91 | 92 | // static const int INDEX_START = 3; 93 | // static const int INDEX_LEN = 18; 94 | 95 | int MatchIndex (const std::string& seq, const std::string& qual, const std::vector& indexes, int& nm) { 96 | int lowest_nm = std::numeric_limits::max(); 97 | int second_lowest_nm = std::numeric_limits::max(); 98 | int best_idx = 0; 99 | for (unsigned i = 0; i < indexes.size(); ++i) { 100 | int s = SSW(seq, indexes[i]); 101 | 
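// The score s is an effective semi-global edit distance: SSW() above adds the
// alignment's mismatches to the lengths of the unaligned query prefix and
// suffix, so barcode bases clipped off the local alignment still count as errors.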
//AffineGap ag(seq, indexes[i]); 102 | //Alignment align(seq, indexes[i], ag.Path()); 103 | //int s = align.NM(); 104 | if (s < lowest_nm) { 105 | second_lowest_nm = lowest_nm; 106 | lowest_nm = s; 107 | best_idx = i; 108 | } else if (s < second_lowest_nm) { 109 | second_lowest_nm = s; 110 | } 111 | } 112 | nm = lowest_nm; 113 | if (lowest_nm <= max_ed_) { 114 | return best_idx; 115 | } 116 | if (lowest_nm == max_ed_ + 1 && second_lowest_nm > max_ed_ + 3) { 117 | return best_idx; 118 | } 119 | return -1; 120 | } 121 | 122 | public: 123 | IndexBarcode (const std::string& index_file, const std::string& outprefix, const int max_ed, bool out_unmatched, bool out_hopped, bool v): 124 | file_(index_file), max_ed_(max_ed), out_unmatched_(out_unmatched), out_hopped_(out_hopped), verbose_(v) 125 | { 126 | std::string header; 127 | std::string line; 128 | std::getline(file_, header); 129 | auto colnames = cpputil::split(header, ","); 130 | if (colnames.size() != 3 || colnames[0] != "SampleName" || colnames[1] != "IndexBarcode1" || colnames[2] != "IndexBarcode2") { 131 | throw std::runtime_error("Invalid index file\n Format required as three comma-delimited columns with header SampleName,IndexBarcode1,IndexBarcode2"); 132 | } 133 | if (out_unmatched) { 134 | unkfq1_writer_.open(outprefix + ".unmatched.1.fastq.gz"); 135 | unkfq2_writer_.open(outprefix + ".unmatched.2.fastq.gz"); 136 | } 137 | if (out_unmatched) { 138 | hopfq1_writer_.open(outprefix + ".hopped.1.fastq.gz"); 139 | hopfq2_writer_.open(outprefix + ".hopped.2.fastq.gz"); 140 | } 141 | std::set unique_sids; 142 | while(std::getline(file_, line)) { 143 | std::cerr << line << std::endl; 144 | auto fields = cpputil::split(line, ","); 145 | if (unique_sids.find(fields[0]) == unique_sids.end()) { 146 | unique_sids.insert(fields[0]); 147 | snames_.push_back(fields[0]); 148 | index1s_.push_back(fields[1]); 149 | index2s_.push_back(fields[2]); 150 | fq1_writers_.emplace_back(outprefix + "." + fields[0] + ".1.fastq.gz"); 151 | fq2_writers_.emplace_back(outprefix + "." + fields[0] + ".2.fastq.gz"); 152 | } else { 153 | std::cerr << "Warning: duplicated sample name in library_params. 
Ignore \"" << line << "\"\n"; 154 | } 155 | } 156 | nmatched_.resize(snames_.size(), 0); 157 | // Print output header 158 | if (verbose_) { 159 | std::cout << "id\t" 160 | "observed_1\t" 161 | "barcode_1\t" 162 | "nm1\t" 163 | "observed_2\t" 164 | "barcode_2\t" 165 | "nm2\t" 166 | "sample_1\t" 167 | "sample_2\t" 168 | "matched\t" 169 | "conflicted\t" 170 | "hopped" << std::endl; 171 | } 172 | } 173 | 174 | void LoadBwa(const std::string &refgenome) { 175 | std::cerr << "loading index " << refgenome << std::endl; 176 | bwa_.LoadIndex(refgenome); 177 | std::cerr << "finished load index " << refgenome << std::endl; 178 | } 179 | 180 | std::pair ExtractIndex(const cpputil::FastxRecord& read, int index_begin, int index_len) { 181 | std::string s1 = read.seq.substr(index_begin, index_len); 182 | std::string q1 = read.qual.substr(index_begin, index_len); 183 | return std::make_pair(s1, q1); 184 | } 185 | 186 | void DecodePair(const cpputil::FastxRecord& r1, const cpputil::FastxRecord& r2, int index_begin, int index_len) { 187 | std::string ob1, qual1, ob2, qual2; 188 | std::tie(ob1, qual1) = ExtractIndex(r1, std::max(index_begin-2, 0), index_len+2); 189 | std::tie(ob2, qual2) = ExtractIndex(r2, std::max(index_begin-2, 0), index_len+2); 190 | int nm1, nm2; 191 | int idx1 = MatchIndex(ob1, qual1, index1s_, nm1); 192 | int idx2 = MatchIndex(ob2, qual2, index2s_, nm2); 193 | std::string r1b = idx1 == -1 ? "" : index1s_[idx1]; 194 | std::string r2b = idx2 == -1 ? "" : index2s_[idx2]; 195 | std::string r1s = idx1 == -1 ? "" : snames_[idx1]; 196 | std::string r2s = idx2 == -1 ? "" : snames_[idx2]; 197 | std::string match; 198 | std::string conflict = "0"; 199 | if (idx1 == -1 || idx2 == -1 || idx1 != idx2 ) { 200 | if (out_unmatched_) { 201 | unkfq1_writer_.Write(r1.id, r1.seq, r1.qual); 202 | unkfq2_writer_.Write(r2.id, r2.seq, r2.qual); 203 | } 204 | match = "0"; 205 | if (idx1 != -1 && idx2 != -1 && idx1 != idx2) conflict = "1"; 206 | } else { 207 | size_t stop = r1.id.find_last_of(':'); 208 | fq1_writers_[idx1].Write(r1.id.substr(0, stop) + ":" + r1b, r1.seq, r1.qual); 209 | fq2_writers_[idx2].Write(r2.id.substr(0, stop) + ":" + r2b, r2.seq, r2.qual); 210 | match = "1"; 211 | ++nmatched_[idx1]; 212 | } 213 | std::string hopped = "0"; 214 | if (conflict == "1" and !bwa_.IsEmpty()) { 215 | hopped = IsIntermol(bwa_, r1.name(), r1.seq.substr(index_begin + index_len + 1), 216 | r2.name(), r2.seq.substr(index_begin + index_len + 1)) ? 
"0" : "1"; 217 | if (out_hopped_ && hopped == "1") { 218 | hopfq1_writer_.Write(r1.id, r1.seq, r1.qual); 219 | hopfq2_writer_.Write(r2.id, r2.seq, r2.qual); 220 | } 221 | } 222 | if (verbose_) { 223 | std::cout << r1.name() << "\t" << ob1 << "\t" << r1b << "\t" << nm1 << "\t" << ob2 << "\t" << \ 224 | r2b << "\t" << nm2 << "\t" << r1s << "\t" << r2s << "\t" 225 | << match << "\t" << conflict << "\t" << hopped << "\n"; 226 | } 227 | } 228 | uint64_t total_matched() const {return std::accumulate(nmatched_.begin(), nmatched_.end(), (uint64_t) 0);} 229 | decltype(auto) samples() const {return snames_;} 230 | decltype(auto) nmatched() const {return nmatched_;} 231 | }; 232 | 233 | } 234 | 235 | 236 | #endif //ADAPTERTRIM_INCLUDE_INDEX_H_ 237 | -------------------------------------------------------------------------------- /msi/Snakefile: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | metadata_file = config["input_meta"] 3 | metadata = pd.read_csv(metadata_file, sep="\t").set_index("pair") 4 | nparallel = config['nparallel'] if 'nparallel' in config else 40 5 | bed="" 6 | hg19="" 7 | gnomad_vcf="" 8 | workdir: config["cwd"] 9 | bedtools = "bedtools" 10 | 11 | rule all: 12 | input: 13 | expand("result/{pair}.{type}.filtered", pair = metadata.index, type=["all", "msi"]) 14 | 15 | rule SplitBed: 16 | input: 17 | bed 18 | params: 19 | nsplit = nparallel, 20 | prefix = "tmp/split_region_" 21 | output: 22 | splitbed = temp(expand("tmp/split_region_{id}.bed", id = [str(x).zfill(3) for x in range(nparallel)])) 23 | shell: 24 | """ 25 | split {input} -n l/{params.nsplit} -a 3 -d {params.prefix} --additional-suffix .bed 26 | """ 27 | 28 | rule MsiDetect: 29 | input: 30 | normal = lambda wildcards: metadata.loc[wildcards.pair][config['normal_col']], 31 | tumor = lambda wildcards: metadata.loc[wildcards.pair]['tumor'], 32 | bed = "tmp/split_region_{id}.bed" 33 | params: 34 | mapq = 50, 35 | outprefix = "tmp/{pair}_region_{id}", 36 | population_vcf = gnomad_vcf 37 | output: 38 | temp("tmp/{pair}_region_{id}.msi"), 39 | temp("tmp/{pair}_region_{id}.all") 40 | resources: 41 | runtime = 2 42 | shell: 43 | """ 44 | msi -t {input.tumor} -n {input.normal} -L {input.bed} -m {params.mapq} -r {hg19} -o {params.outprefix} -V {params.population_vcf} -D -U DI -x 2 45 | """ 46 | 47 | rule AggMsi: 48 | input: 49 | expand("tmp/{{pair}}_region_{id}.{{type}}", id = [str(x).zfill(3) for x in range(nparallel)]) 50 | output: 51 | "result/{pair}.{type}" 52 | wildcard_constraints: 53 | type = "[0-9a-zA-Z_]+" 54 | shell: 55 | """ 56 | cat {input} > {output} 57 | """ 58 | 59 | rule FilterGerm: 60 | input: 61 | msi = "result/{pair}.{type}", 62 | germ_vcf = lambda wildcards: metadata.loc[wildcards.pair]['germ_vcf'], 63 | output: 64 | "result/{pair}.{type}.filtered" 65 | wildcard_constraints: 66 | type = "[0-9a-zA-Z_]+" 67 | shell: 68 | """ 69 | awk "{{OFS=\\"\\t\\"}};{{if (\$7 < 0.7 && \$8 < 0.7 ) {{print \$1,\$2-5,\$2+\$4+5,\$0}} }}" {input.msi} | {bedtools} intersect -a - -b {input.germ_vcf} -c | cut -f 4- | awk "\$NF == 0 && length(\$3) == 1 {{print \$0}}" > {output} 70 | """ 71 | -------------------------------------------------------------------------------- /obsolete/bamtofastq.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 10/6/20. 
3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #ifdef BGZF_MAX_BLOCK_SIZE 16 | #pragma push_macro("BGZF_MAX_BLOCK_SIZE") 17 | #undef BGZF_MAX_BLOCK_SIZE 18 | #define BGZF_MAX_BLOCK_SIZE_BAK 19 | #endif 20 | 21 | #ifdef BGZF_BLOCK_SIZE 22 | #pragma push_macro("BGZF_BLOCK_SIZE") 23 | #undef BGZF_BLOCK_SIZE 24 | #define BGZF_BLOCK_SIZE_BAK 25 | #endif 26 | #include "FastxIO.h" 27 | 28 | using std::string; 29 | struct Options { 30 | string bam; 31 | string fastq1; 32 | string fastq2; 33 | string tmpdir = "/tmp"; 34 | int thread = 1; 35 | }; 36 | 37 | 38 | static struct option long_options[] = { 39 | {"input1", required_argument, 0, '1'}, 40 | {"input2", required_argument, 0, '2'}, 41 | {"bam", required_argument , 0, 'b'}, 42 | {"tmpdir", required_argument , 0, 't'}, 43 | {"thread", required_argument, 0, 'p'}, 44 | {0,0,0,0} 45 | }; 46 | 47 | const char*short_options = "1:2:b:t:p:"; 48 | 49 | void print_help() 50 | { 51 | std::cerr<< "---------------------------------------------------\n"; 52 | std::cerr<< "Usage: consensus [options]\n"; 53 | std::cerr<< "General Options:\n"; 54 | std::cerr<< "-b/--bam, Bam input\n"; 55 | std::cerr<< "-1/--fastq1, Output Fastq1\n"; 56 | std::cerr<< "-2/--fastq2, Output Fastq2\n"; 57 | std::cerr<< "-t/--tmpdir, Temporary dir for sorted bam [/tmp]\n"; 58 | std::cerr<< "-p/--thread, Number of threads for sort [1]\n"; 59 | } 60 | 61 | int parse_options(int argc, char* argv[], Options& opt) { 62 | int option_index; 63 | int next_option = 0; 64 | do { 65 | next_option = getopt_long(argc, argv, short_options, long_options, &option_index); 66 | switch (next_option) { 67 | case -1:break; 68 | case '1': 69 | opt.fastq1 = optarg; 70 | break; 71 | case '2': 72 | opt.fastq2 = optarg; 73 | break; 74 | case 'b': 75 | opt.bam = optarg; 76 | break; 77 | case 't': 78 | opt.tmpdir = optarg; 79 | break; 80 | case 'p': 81 | opt.thread = atoi(optarg); 82 | break; 83 | default:print_help(); 84 | return 1; 85 | } 86 | } while (next_option != -1); 87 | 88 | return 0; 89 | } 90 | 91 | int main(int argc, char ** argv) { 92 | Options opt; 93 | int parse_ret = parse_options(argc, argv, opt); 94 | if (parse_ret) return 1; 95 | if (argc == 1) { 96 | print_help(); 97 | exit(0); 98 | } 99 | char temp[100]; 100 | strcpy(temp, opt.tmpdir.c_str()); 101 | strcat(temp, "/tempsort.XXXXXX"); 102 | int fd = mkstemp(temp); 103 | if (fd == -1) { 104 | std::cerr << "unable to create temp file for sorting bam in queryname order\n"; 105 | return 1; 106 | } 107 | string samsort = "samtools sort -n " + opt.bam + " -o " + string(temp) + " -@ " + std::to_string(opt.thread); 108 | std::cout << samsort << std::endl; 109 | std::system(samsort.c_str()); 110 | SeqLib::BamRecord read; 111 | SeqLib::BamReader input; 112 | input.Open(temp); 113 | cpputil::FastqWriter R1(opt.fastq1); 114 | cpputil::FastqWriter R2(opt.fastq2); 115 | std::vector pair(2); 116 | // After sorted by name, using yield like approach 117 | while(input.GetNextRecord(read)) { 118 | if (read.FirstFlag() && !read.SupplementaryFlag() && !read.SecondaryFlag()) { 119 | if (pair[0].isEmpty()) { 120 | pair[0] = read; 121 | if (!pair[1].isEmpty()) { 122 | if (pair[1].Qname() != pair[0].Qname()) { 123 | throw std::runtime_error("Bam file must be query name sorted! 
Exit at read " + read.Qname()); 124 | } else { 125 | cpputil::FastxRecord fx1(pair[0], true); 126 | R1.Write(fx1); 127 | cpputil::FastxRecord fx2(pair[1], true); 128 | R2.Write(fx2); 129 | pair.clear(); 130 | pair.resize(2); 131 | } 132 | } 133 | } else { 134 | throw std::runtime_error("Duplicated read name, " + read.Qname()); 135 | } 136 | } 137 | 138 | if (!read.FirstFlag() && !read.SupplementaryFlag() && !read.SecondaryFlag()) { 139 | if (pair[1].isEmpty()) { 140 | pair[1] = read; 141 | if (!pair[0].isEmpty()) { 142 | if (pair[1].Qname() != pair[0].Qname()) { 143 | throw std::runtime_error("Bam file must be query name sorted! Exit at read " + read.Qname()); 144 | } else { 145 | cpputil::FastxRecord fx1(pair[0], true); 146 | R1.Write(fx1); 147 | cpputil::FastxRecord fx2(pair[1], true); 148 | R2.Write(fx2); 149 | pair.clear(); 150 | pair.resize(2); 151 | } 152 | } 153 | } else { 154 | throw std::runtime_error("Duplicated read name, " + read.Qname()); 155 | } 156 | } 157 | } 158 | close(fd); 159 | unlink(temp); 160 | return 0; 161 | } 162 | -------------------------------------------------------------------------------- /obsolete/concat_umi_to_fastq.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 10/6/20. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::string; 11 | struct Options { 12 | string fastq1; 13 | string fastq2; 14 | string index1; 15 | string index2; 16 | string out1; 17 | string out2; 18 | }; 19 | 20 | 21 | static struct option long_options[] = { 22 | {"input1", required_argument, 0, '1'}, 23 | {"input2", required_argument, 0, '2'}, 24 | {"index1", required_argument , 0, 'i'}, 25 | {"index2", required_argument , 0, 'I'}, 26 | {"out1", required_argument , 0, 'o'}, 27 | {"out2", required_argument , 0, 'O'}, 28 | {0,0,0,0} 29 | }; 30 | 31 | const char*short_options = "1:2:o:O:i:I:"; 32 | 33 | void print_help() 34 | { 35 | std::cerr<< "---------------------------------------------------\n"; 36 | std::cerr<< "Usage: consensus [options]\n"; 37 | std::cerr<< "General Options:\n"; 38 | std::cerr<< "-1/--input1, Input Read 1\n"; 39 | std::cerr<< "-2/--iunput2, Input Read 2\n"; 40 | std::cerr<< "-i/--index1, index/UMI file for Read 1\n"; 41 | std::cerr<< "-I/--index2, index/UMI file for Read 2\n"; 42 | std::cerr<< "-o/--output1, Output Read 1\n"; 43 | std::cerr<< "-o/--output2, Output Read 2\n"; 44 | } 45 | 46 | int parse_options(int argc, char* argv[], Options& opt) { 47 | int option_index; 48 | int next_option = 0; 49 | do { 50 | next_option = getopt_long(argc, argv, short_options, long_options, &option_index); 51 | switch (next_option) { 52 | case -1:break; 53 | case 'i': 54 | opt.index1 = optarg; 55 | break; 56 | case 'I': 57 | opt.index2 = optarg; 58 | break; 59 | case '1': 60 | opt.fastq1 = optarg; 61 | break; 62 | case '2': 63 | opt.fastq2 = optarg; 64 | break; 65 | case 'o': 66 | opt.out1 = optarg; 67 | break; 68 | case 'O': 69 | opt.out2= optarg; 70 | break; 71 | default:print_help(); 72 | return 1; 73 | } 74 | } while (next_option != -1); 75 | 76 | return 0; 77 | } 78 | 79 | int main(int argc, char ** argv) { 80 | Options opt; 81 | int parse_ret = parse_options(argc, argv, opt); 82 | if (parse_ret) return 1; 83 | if (argc == 1) { 84 | print_help(); 85 | exit(0); 86 | } 87 | 88 | cpputil::FastxReader R1_reader(opt.fastq1); 89 | cpputil::FastxReader R2_reader(opt.fastq2); 90 | cpputil::FastxReader I1_reader(opt.index1); 91 | cpputil::FastxReader I2_reader(opt.index2); 92 
| cpputil::FastqWriter O1_writer(opt.out1); 93 | cpputil::FastqWriter O2_writer(opt.out2); 94 | cpputil::FastxRecord read1; 95 | cpputil::FastxRecord read2; 96 | cpputil::FastxRecord index1; 97 | cpputil::FastxRecord index2; 98 | cpputil::FastxRecord outread1; 99 | cpputil::FastxRecord outread2; 100 | while (R1_reader.yield(read1)) { 101 | R2_reader.yield(read2); 102 | I1_reader.yield(index1); 103 | I2_reader.yield(index2); 104 | assert(read1.name() == read2.name()); 105 | assert(index1.name() == index2.name()); 106 | assert(index1.name() == read1.name()); 107 | outread1.id = read1.id; 108 | outread1.seq = index1.seq + read1.seq; 109 | outread1.qual = index1.qual + read1.qual; 110 | outread2.id = read2.id; 111 | outread2.seq = index2.seq + read2.seq; 112 | outread2.qual = index2.qual + read2.qual; 113 | O1_writer.Write(outread1); 114 | O2_writer.Write(outread2); 115 | } 116 | return 0; 117 | } 118 | -------------------------------------------------------------------------------- /obsolete/print_qual.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 7/12/20. 3 | // 4 | 5 | #include <string> // inferred: the original header name was lost to angle-bracket stripping during extraction 6 | #include "DNAUtils.h" 7 | 8 | int main(int argc, char** argv) { 9 | std::string qual = argv[1]; 10 | int bq = std::stoi(argv[2]); 11 | cpputil::PrintQualString(qual, bq); 12 | return 0; 13 | } -------------------------------------------------------------------------------- /snakemake/AdapV2/capture_wf_1/Snakefile: -------------------------------------------------------------------------------- 1 | include: "../Snakefile" 2 | GROUP_BY_UMI_DIR="groupbyumi" 3 | DETECT_DIR="detect" 4 | CONSENSUS_OUT="consensus_out" 5 | RECOVERY = config['duplex_recovery_script'] 6 | COLLECT_DUPMET = f"python {RECOVERY}" 7 | 8 | rule all: 9 | input: 10 | expand(Metrics_OUT + "/byproduct/{batch_id}.{index}.byproduct.txt", zip, batch_id = metadata.reset_index()['batch'], index = metadata.reset_index()['sample']), 11 | expand(Metrics_OUT + "/{index}.raw.umiHistogram.txt",index = metadata.reset_index()['sample']), 12 | expand(Metrics_OUT + "/{index}.duplex_yield_metrics.txt",index=sample_names), 13 | expand("sfc/{index}.error_metrics.txt", index=sample_names), 14 | expand(Metrics_OUT + "/{index}.raw.hs_metrics.txt", index=sample_names), 15 | 16 | 17 | sample_to_maf = metadata.reset_index().groupby('sample').agg({'fingerprint_maf': set}) 18 | sample_to_bait = metadata.reset_index().groupby('sample').agg({'bait_intervals': set}) 19 | sample_to_bed = metadata.reset_index().groupby('sample').agg({'bait_bed': set}) 20 | 21 | rule CollectRawHsMetrics: 22 | input: 23 | bam = "tmp/{index}.raw.replacerg.markdup.bam", 24 | output: 25 | metrics = Metrics_OUT + "/{index}.raw.hs_metrics.txt", 26 | per_target_cov = Metrics_OUT + "/{index}.raw.per_target_cov.txt" 27 | params: 28 | ref = REF, 29 | bait = lambda wildcards: sample_to_bait.loc[wildcards.index]['bait_intervals'], 30 | resources: 31 | mem = 16, 32 | runtime = 12 33 | shell: 34 | """ 35 | {PICARD} CollectHsMetrics COVERAGE_CAP=20000 I={input.bam} O={output.metrics} R={params.ref} BAIT_INTERVALS={params.bait} TARGET_INTERVALS={params.bait} PER_TARGET_COVERAGE={output.per_target_cov} 36 | """ 37 | 38 | ##CODEC specific filters 39 | rule FilterMolecularConsensusReads: 40 | input: 41 | bam = "filtered/{index}.mol_consensus.aligned.bam" 42 | output: 43 | bam = "filtered/{index}.mol_consensus.filtered.bam", 44 | bai = "filtered/{index}.mol_consensus.filtered.bam.bai" 45 | resources: 46 | mem = 8, 47 | runtime = 24 48 | 
shell: 49 | """ 50 | {FILTER} -b {input.bam} -f 2 | samtools sort - -o {output.bam} && samtools index {output.bam} 51 | """ 52 | 53 | rule CODEC_SFC: 54 | input: 55 | bam = "filtered/{index}.mol_consensus.filtered.bam" 56 | output: 57 | accu = "sfc/{index}.mutant_metrics.txt", 58 | call = "sfc/{index}.variants_called.txt", 59 | context = "sfc/{index}.context_count.txt", 60 | params: 61 | ref = REF, 62 | high_conf_region = lambda wildcards : sample_to_bed.loc[wildcards.index]['bait_bed'], 63 | germ_vcf = lambda wildcards : sample_to_vcf.loc[wildcards.index]['germline_vcf'], 64 | germ_bam = lambda wildcards : sample_to_germbam.loc[wildcards.index]['germline_bam'], 65 | mut_maf = lambda wildcards: sample_to_maf.loc[wildcards.index]['fingerprint_maf'], 66 | resources: 67 | mem = 8, 68 | runtime = 96 69 | shell: 70 | """ 71 | {CALL_BIN} -b {input.bam} \ 72 | -L {params.high_conf_region} \ 73 | -r {params.ref} \ 74 | -n {params.germ_bam} \ 75 | -m 60 \ 76 | -q 30 \ 77 | -d 12 \ 78 | -V {params.germ_vcf} \ 79 | -M {params.mut_maf} \ 80 | -x 2 \ 81 | -5 \ 82 | -g 30 \ 83 | -G 250 \ 84 | -Q 0.6 \ 85 | -N 0.03 \ 86 | -B 0.5 \ 87 | -Y 0 \ 88 | -a {output.accu} \ 89 | -e {output.call} \ 90 | -C {output.context} 91 | """ 92 | 93 | 94 | rule CollectRawInsertSizeMetrics: 95 | input: 96 | bam = "tmp/{batch_id}.{index}.raw.aligned.bam", 97 | output: 98 | txt = Metrics_OUT + "/{batch_id}.{index}.raw.insert_size_metrics.txt", 99 | hist = Metrics_OUT + "/{batch_id}.{index}.raw.insert_size_histogram.pdf" 100 | params: 101 | ref = REF 102 | shell: 103 | """ 104 | {PICARD} CollectInsertSizeMetrics I={input.bam} O={output.txt} H={output.hist} M=0.5 W=900 DEVIATIONS=100 105 | """ 106 | 107 | rule SortGBUbam: 108 | input: 109 | GROUP_BY_UMI_DIR + "/{index}.raw.GroupedByUmi.bam", 110 | output: 111 | bam = GROUP_BY_UMI_DIR + "/{index}.sorted.GroupedByUmi.bam", 112 | bai = GROUP_BY_UMI_DIR + "/{index}.sorted.GroupedByUmi.bai", 113 | resources: 114 | mem = 16, 115 | runtime = 48 116 | shell: 117 | """ 118 | {PICARD} SortSam I={input} O={output.bam} SO=coordinate CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 119 | """ 120 | 121 | 122 | rule DuplexRecoveryByTarget: 123 | input: 124 | GROUP_BY_UMI_DIR + "/{index}.raw.GroupedByUmi.bam", 125 | output: 126 | Metrics_OUT + "/{index}.duplex_yield_metrics.txt", 127 | resources: 128 | mem = 8, 129 | runtime = 12 130 | params: 131 | interval = lambda wildcards: sample_to_bait.loc[wildcards.index]['bait_intervals'], 132 | shell: 133 | """ 134 | {COLLECT_DUPMET} --bam_file {input} \ 135 | -l {params.interval} \ 136 | --min_reads 1 \ 137 | -c \ 138 | -p \ 139 | -r \ 140 | -o {output} 141 | """ -------------------------------------------------------------------------------- /snakemake/AdapV2/wgs/Snakefile: -------------------------------------------------------------------------------- 1 | include: "../Snakefile" 2 | EVAL_REGION_BED= config['region_bed'] 3 | EVAL_REGION_IL= config['region_interval_list'] 4 | DBSNP= config['dbsnp'] 5 | 6 | rule all: 7 | input: 8 | expand(Metrics_OUT + "/{index}.mol_consensus.wgs_metrics.txt",index = sample_names), 9 | expand(ACCU_OUT + "/{index}.mutant_metrics.txt", index = sample_names), 10 | expand("raw_sfc/{index}.mutant_metrics.txt", index = sample_names), 11 | expand(Metrics_OUT + "/{index}.raw.insert_size_metrics.txt",index = sample_names), 12 | expand(Metrics_OUT + "/byproduct/{batch_id}.{index}.byproduct.txt", zip, batch_id = metadata.reset_index()['batch'], index = metadata.reset_index()['sample']), 13 | expand(Metrics_OUT + 
"/{index}.raw.wgs_metrics.txt",index = sample_names) 14 | 15 | rule CollectRawWgsMetrics: 16 | input: 17 | bam = "tmp/{index}.raw.replacerg.markdup.bam", 18 | output: 19 | metrics = Metrics_OUT + "/{index}.raw.wgs_metrics.txt", 20 | params: 21 | ref = REF, 22 | itl = EVAL_REGION_IL 23 | resources: 24 | mem = 16, 25 | runtime = 96 26 | shell: 27 | """ 28 | {PICARD} CollectWgsMetrics I={input.bam} O={output.metrics} R={params.ref} INTERVALS={params.itl} \ 29 | COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 30 | """ 31 | 32 | rule CollectWgsMetrics: 33 | input: 34 | bam = "consensus/{index}.replacerg.markdup.bam", 35 | output: 36 | metrics = Metrics_OUT + "/{index}.mol_consensus.wgs_metrics.txt", 37 | params: 38 | ref = REF, 39 | itl = EVAL_REGION_IL 40 | resources: 41 | mem = 16, 42 | runtime = 96 43 | shell: 44 | """ 45 | {PICARD} CollectWgsMetrics I={input.bam} O={output.metrics} R={params.ref} INTERVALS={params.itl} \ 46 | INCLUDE_BQ_HISTOGRAM=true 47 | """ 48 | 49 | rule CollectFinalWgsMetrics: 50 | input: 51 | bam = "consensus/{index}.mol_consensus.aligned.bam", 52 | output: 53 | metrics = Metrics_OUT + "/{index}.mol_consensus.wgs_metrics.txt", 54 | params: 55 | ref = REF, 56 | itl = EVAL_REGION_IL 57 | shell: 58 | """ 59 | {PICARD} CollectWgsMetrics I={input.bam} O={output.metrics} R={params.ref} INTERVALS={params.itl} INCLUDE_BQ_HISTOGRAM=true MINIMUM_BASE_QUALITY=30 60 | """ 61 | # 62 | rule CSS_SFC_ErrorMetrics: 63 | input: 64 | bam = "consensus/{index}.mol_consensus.aligned.bam", 65 | output: 66 | accu = ACCU_OUT + "/{index}.mutant_metrics.txt", 67 | called = ACCU_OUT + "/{index}.variants_called.txt", 68 | context = ACCU_OUT + "/{index}.context_count.txt", 69 | params: 70 | ref = REF, 71 | high_conf_region = EVAL_REGION_BED, 72 | dbsnp = DBSNP, 73 | germ_bam = lambda wildcards : sample_to_germbam.loc[wildcards.index]['germline_bam'], 74 | #germ_vcf = lambda wildcards : sample_to_germvcf.loc[wildcards.index]['germline_vcf'], 75 | resources: 76 | mem = 16, 77 | runtime = 96 78 | shell: 79 | """ 80 | {CALL_BIN} -b {input.bam} \ 81 | -L {params.high_conf_region} \ 82 | -r {params.ref} \ 83 | -m 60 \ 84 | -q 30 \ 85 | -d 12 \ 86 | -n {params.germ_bam} \ 87 | -V {params.dbsnp} \ 88 | -x 6 \ 89 | -c 4 \ 90 | -5 \ 91 | -g 30 \ 92 | -G 250 \ 93 | -Q 0.7 \ 94 | -B 0.6 \ 95 | -N 0.05 \ 96 | -Y 5 \ 97 | -W 1 \ 98 | -a {output.accu} \ 99 | -e {output.called} \ 100 | -C {output.context} 101 | """ 102 | 103 | rule RAW_SFC_ErrorMetrics: 104 | input: 105 | bam = "tmp/{index}.raw.replacerg.markdup.bam" 106 | output: 107 | accu = "raw_sfc/{index}.mutant_metrics.txt", 108 | called = "raw_sfc/{index}.variants_called.txt", 109 | context = "raw_sfc/{index}.context_count.txt", 110 | params: 111 | ref = REF, 112 | high_conf_region = EVAL_REGION_BED, 113 | dbsnp = DBSNP, 114 | germ_bam = lambda wildcards : sample_to_germbam.loc[wildcards.index]['germline_bam'], 115 | #germ_vcf = lambda wildcards : sample_to_germvcf.loc[wildcards.index]['germline_vcf'], 116 | resources: 117 | mem = 16, 118 | runtime = 96 119 | shell: 120 | """ 121 | {CALL_BIN} -b {input.bam} \ 122 | -L {params.high_conf_region} \ 123 | -r {params.ref} \ 124 | -m 60 \ 125 | -n {params.germ_bam} \ 126 | -q 30 \ 127 | -d 12 \ 128 | -V {params.dbsnp} \ 129 | -x 6 \ 130 | -c 4 \ 131 | -5 \ 132 | -g 30 \ 133 | -G 250 \ 134 | -Q 0.6 \ 135 | -B 0.6 \ 136 | -N 0.1 \ 137 | -Y 5 \ 138 | -W 1 \ 139 | -a {output.accu} \ 140 | -e {output.called} \ 141 | -C {output.context} 142 | """ 
-------------------------------------------------------------------------------- /snakemake/README.md: -------------------------------------------------------------------------------- 1 | ## Setup input for workflow 2 | * sample_sheet.csv is used for `codec demux`; it requires three columns `SampleName,IndexBarcode1,IndexBarcode2`, and each row must have a unique SampleName 3 | * input.tsv stores paths for fastq files and other pipeline inputs. The `sample` column in input.tsv must match the `SampleName` column in sample_sheet.csv (a validation sketch follows the pipeline input examples below) 4 | -------------------------------------------------------------------------------- /snakemake/jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | which python; 4 | {exec_job} 5 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/caputure/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | input_meta: input.tsv 3 | cwd: codec 4 | ncores: 4 5 | nparallel: 40 6 | tmpdir: tmp 7 | ref: $HG19 8 | codec_root: 9 | dict: dummy 10 | gatk3: 11 | gatk4: gatk 12 | mutect: 13 | fgbio: 14 | bwa: bwa 15 | region_bed: 16 | region_interval_list: 17 | sampleid: 18 | dbsnp: 19 | duplex_recovery_script: 20 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/caputure/input.tsv: -------------------------------------------------------------------------------- 1 | batch sample fastq1 fastq2 germline_vcf germline_bam sample_sheet fingerprint_maf bait_intervals bait_bed 2 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/caputure/runSnakemake.sh: -------------------------------------------------------------------------------- 1 | root=$PWD/../../../codecsuite/snakemake && snakemake --cluster-sync $root/qsub_wrapper.py --jobscript $root/jobscript.sh --snakefile $root/AdapV2/capture_wf_1/Snakefile --configfile config.yaml --latency-wait 30 -j 2000 --restart-times 3 -p --rerun-incomplete --reason -n 2 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/wgs/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | input_meta: pipeline_input.tsv 3 | cwd: output 4 | ncores: 4 5 | nparallel: 40 6 | tmpdir: tmp 7 | ref: 8 | codec_root: 9 | dict: dummy 10 | gatk3: 11 | gatk4: gatk 12 | bwa: 13 | mutect: 14 | fgbio: 15 | region_bed: 16 | region_interval_list: 17 | sampleid: wgs1 18 | dbsnp: 19 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/wgs/input.tsv: -------------------------------------------------------------------------------- 1 | batch sample fastq1 fastq2 germline_bam sample_sheet 2 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/wgs/runSnakemake.sh: -------------------------------------------------------------------------------- 1 | root=$PWD/../../../codecsuite/snakemake && snakemake --cluster-sync $root/qsub_wrapper.py --jobscript $root/jobscript.sh --snakefile $root/AdapV2/wgs/Snakefile --configfile config.yaml --latency-wait 30 -j 2000 --restart-times 3 -p --rerun-incomplete --reason -n 2 | --------------------------------------------------------------------------------
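A minimal sketch of how to check the input contract described in snakemake/README.md above, assuming pandas is available; the file paths are hypothetical and this snippet is not part of the repository:

import pandas as pd

# sample_sheet.csv must carry SampleName,IndexBarcode1,IndexBarcode2 (per the README)
sheet = pd.read_csv("sample_sheet.csv")
# input.tsv is tab-separated and carries a 'sample' column plus per-sample paths
meta = pd.read_csv("input.tsv", sep="\t")

# each sample_sheet.csv row must have a unique SampleName
assert sheet["SampleName"].is_unique, "duplicate SampleName in sample_sheet.csv"
# every sample in input.tsv must match a SampleName in sample_sheet.csv
assert set(meta["sample"]) <= set(sheet["SampleName"]), "input.tsv sample not in sample_sheet.csv"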
/snakemake/qsub_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | from snakemake.utils import read_job_properties 7 | 8 | jobscript = sys.argv[1] 9 | job_properties = read_job_properties(jobscript) 10 | 11 | defaults = {'mem': 8, 'runtime': 24, 'ncores': 1} 12 | for k, v in defaults.items(): 13 | if k not in job_properties['resources']: 14 | job_properties['resources'][k] = v 15 | 16 | params = job_properties['resources'] 17 | 18 | qsub_cmd = (f'qsub -l h_vmem={params["mem"]}G ' 19 | f'-pe smp {params["ncores"]} -binding linear:{params["ncores"]} ' 20 | f'-l h_rt={params["runtime"]}:00:00 ' 21 | f'-o logs/{job_properties["rule"]}/ ' 22 | f'-e logs/{job_properties["rule"]}/ ' 23 | f'-N {job_properties["rule"]} ' 24 | f'-cwd -V -j y -sync y ' 25 | f'{jobscript}') 26 | 27 | qsub_cmd += ' | tail -2 | cut -d " " -f 3' 28 | os.system(qsub_cmd) 29 | -------------------------------------------------------------------------------- /snakemake/script/agg_log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import pandas as pd 5 | import sys 6 | 7 | logger = logging.getLogger("{}".format(__file__)) 8 | 9 | def get_arguments(): 10 | 11 | parser = argparse.ArgumentParser(prog="aggregate miredas result(s)", formatter_class=argparse.RawDescriptionHelpFormatter) 12 | parser.add_argument("log", type=str, nargs="+", help="trim adapter logs") 13 | parser.add_argument("out", type=str, help="output name") 14 | args = parser.parse_args() 15 | return args 16 | 17 | def process(opts): 18 | tot = pd.DataFrame() 19 | for f in opts.log: 20 | df = pd.read_csv(f, sep=":", names=['cat', "count"]) 21 | if tot.empty: 22 | tot = df 23 | else: 24 | tot['count'] = tot['count'].add(df['count']) 25 | tot['cat'] = tot['cat'] + ":" 26 | tot.to_csv(opts.out, sep=" ", index=False, header=False) 27 | 28 | if __name__ == '__main__': 29 | sys.exit(process(get_arguments())) 30 | -------------------------------------------------------------------------------- /snakemake/script/cds_summarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import sys 5 | import json 6 | import os 7 | import pandas as pd 8 | import numpy as np 9 | import pysam 10 | from collections import defaultdict 11 | #from Bio import pairwise2 12 | 13 | logger = logging.getLogger("{}".format(__file__)) 14 | 15 | #def check_tandem_adpt(seq): 16 | #linker = "AGATCGGAAGAGCTTCATCATTAGATCCATTAATGTTACACTTCAACTCTTCACCCACATCAGATTAGTACCAGCTTCGAGGATCAACACGTCAGAGTCTAGCTGGTGATAGGAAGTGTAGGTAACATAGACGAAGTTATCAACAATGTGTAACTGACTTAACGCTCTTCCGATCT" 17 | #res = pairwise2.align.localms(seq, linker, 1, -4, -6,-2) 18 | 19 | def read_pair_generator(bam, region_string=None): 20 | """ 21 | Generate read pairs in a BAM file or within a region string. 22 | Reads are added to read_dict until a pair is found.
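Secondary and supplementary alignments are skipped; once both mates of a pair have been seen, they are yielded together as a (read1, read2) tuple.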
23 | """ 24 | read_dict = defaultdict(lambda: [None, None]) 25 | for read in bam.fetch(until_eof=True): 26 | if read.is_secondary or read.is_supplementary: 27 | continue 28 | qname = read.query_name 29 | if qname not in read_dict: 30 | if read.is_read1: 31 | read_dict[qname][0] = read 32 | else: 33 | read_dict[qname][1] = read 34 | else: 35 | if read.is_read1: 36 | yield read, read_dict[qname][1] 37 | else: 38 | yield read_dict[qname][0], read 39 | del read_dict[qname] 40 | 41 | def is_overlapped(read1, read2): 42 | if read1.is_unmapped or read2.is_unmapped: 43 | return False 44 | if read1.reference_name != read2.reference_name: 45 | return False 46 | if read1.reference_start < read2.reference_end and read2.reference_start < read1.reference_end: 47 | return True 48 | 49 | def is_complete_overlapped_excluding_sclips(read1, read2): 50 | if not is_overlapped(read1, read2): 51 | return False 52 | if read1.reference_start != read2.reference_start: 53 | return False 54 | if read1.reference_end != read2.reference_end: 55 | return False 56 | return True 57 | 58 | def overlap_len(read1, read2): 59 | if not is_overlapped(read1,read2): 60 | return 0 61 | else: 62 | return min(read1.reference_end, read2.reference_end) - max(read1.reference_start, read2.reference_start) 63 | 64 | def overlap_span_ratio(read1, read2): 65 | ol = overlap_len(read1, read2) 66 | if ol == 0: 67 | return 0; 68 | else: 69 | span = max(read1.reference_end, read2.reference_end) - min(read1.reference_start, read2.reference_start) 70 | return ol/span 71 | 72 | def get_arguments(): 73 | 74 | parser = argparse.ArgumentParser(prog="CODEC byproducts summary", formatter_class=argparse.RawDescriptionHelpFormatter) 75 | parser.add_argument("--highconf_bam", type=str, default = "", help="high confident CDS reads", required=True) 76 | parser.add_argument("--sample_id", type=str, help="sample id", required=True) 77 | parser.add_argument("--trim_log", type=str, default = "", help="trim linker log file", required=False) 78 | parser.add_argument("--fastp", type=str, help="json file output by fastp", required=False) 79 | parser.add_argument("--lowconf_bam", type=str, default = "", help="low confident CDS reads", required=False) 80 | parser.add_argument("--si_hiconf_bam", type=str, default = "", help="single insert highconf bam", required=False) 81 | parser.add_argument("--si_lowconf_bam", type=str, default = "", help="single insert lowconf bam", required=False) 82 | parser.add_argument("--trim_one_bam", type=str, default = "", help="Where linker has been trimmed from only one end", required=False) 83 | parser.add_argument("--untrim_both_bam", type=str, default = "", help="Where linker has not been trimmed from both ends", required=False) 84 | parser.add_argument("--hs_metrics", type=str, help="hs metrics output", default="") 85 | parser.add_argument("--cds_intermol_bamout", type=str, help="output bam for cds intermolcular reads", default = "", required=False) 86 | parser.add_argument("--demux_log", type=str, help="demux log file", default = "", required=False) 87 | #parser.add_argument("--si_intermol_bamout", type=str, help="output bam for singleinsert intermolcular reads", required=True) 88 | args = parser.parse_args() 89 | return args 90 | 91 | class CdsMetrics: 92 | """ 93 | ## uf = unfiltered 94 | ## pf = passed filter 95 | """ 96 | def __init__(self, sid): 97 | self.sample_id = sid 98 | self.n_raw_frag = 0 99 | self.n_unmapped = 0 100 | self.pct_raw_uf_q20 = 0 101 | self.pct_raw_uf_q30 = 0 102 | self.n_raw_pf_frag = 0 103 | 
self.pct_aligned_frag = 0 104 | self.on_target_rate = 0 105 | self.mean_bait_cov = 0 106 | self.mean_target_cov = 0 107 | 108 | #CDS specific 109 | self.n_high_conf = 0 110 | self.n_adp_dimer_frag = 0 111 | self.n_double_ligation = 0 112 | self.n_low_conf = 0 113 | self.n_intermol = 0 # intermolecular 114 | self.n_single_hiconf = 0 115 | self.n_single_lowconf = 0 116 | self.n_insuf_trim = 0 117 | self.n_close_proxim = 0 118 | # self.read_len_diff = [] 119 | # self.aln_len_diff = [] 120 | # self.ol_ratios = defaultdict(list) 121 | 122 | def n_raw_uf_frag(self): 123 | return self.n_raw_frag 124 | def n_categorized(self): 125 | return self.n_high_conf + self.n_low_conf + self.n_adp_dimer_frag + self.n_double_ligation + \ 126 | self.n_intermol + self.n_single_hiconf + self.n_single_lowconf + self.n_insuf_trim + self.n_unmapped + self.n_close_proxim 127 | def __str__(self): 128 | header = ["sample_id", 129 | ### CDS specific 130 | "pct_correct", 131 | "pct_double_ligation", 132 | "pct_adp_dimer", 133 | "pct_intermol", 134 | "pct_unmapped", 135 | "pct_close_proxim", 136 | "pct_categorized", 137 | "n_correct", 138 | "n_double_ligation", 139 | "n_adp_dimer", 140 | "n_intermol", 141 | "n_unmapped", 142 | "n_close_proxim", 143 | "n_categorized", 144 | "n_total", 145 | ] 146 | 147 | header_str = "\t".join(header) 148 | return f"{header_str}\n" \ 149 | f"{self.sample_id}\t" \ 150 | f"{self.n_high_conf / self.n_raw_uf_frag()}\t" \ 151 | f"{self.n_double_ligation / self.n_raw_uf_frag()}\t" \ 152 | f"{self.n_adp_dimer_frag / self.n_raw_uf_frag()}\t" \ 153 | f"{self.n_intermol / self.n_raw_uf_frag()}\t" \ 154 | f"{self.n_unmapped / self.n_raw_uf_frag()}\t" \ 155 | f"{self.n_close_proxim / self.n_raw_uf_frag()}\t" \ 156 | f"{self.n_categorized() / self.n_raw_uf_frag()}\t" \ 157 | f"{self.n_high_conf}\t" \ 158 | f"{self.n_double_ligation}\t" \ 159 | f"{self.n_adp_dimer_frag}\t" \ 160 | f"{self.n_intermol}\t" \ 161 | f"{self.n_unmapped}\t" \ 162 | f"{self.n_close_proxim}\t" \ 163 | f"{self.n_categorized()}\t" \ 164 | f"{self.n_raw_frag}" 165 | 166 | def parse_linker_trim_log(log_file, cdsm, adap_v2): 167 | with open(log_file, 'r') as f: 168 | for line in f: 169 | k, v = line.split(":") 170 | if k == "TOTAL" or k == "TOTOL": 171 | cdsm.n_raw_frag = int(v) 172 | if k == "LOST_BOTH": 173 | cdsm.n_adp_dimer_frag = int(v) 174 | elif not adap_v2 and k == "DOUBLE_LIGATION": 175 | cdsm.n_double_ligation += int(v) 176 | elif adap_v2 and k== "LOST_READ1": 177 | cdsm.n_double_ligation += int(v) 178 | elif adap_v2 and k== "LOST_READ2": 179 | cdsm.n_double_ligation += int(v) 180 | 181 | 182 | def alignment_analysis(bam, cdsm, trim_type, intermol_bam = None, im_dist_cutoff = 5_000, adap_v2 = False): 183 | assert(trim_type in ['HighConf', "LowConf", "TrimOne", "UntrimBoth", "SingleHiconf", "SingleLowconf"]) 184 | samfile = pysam.AlignmentFile(bam, "rb") 185 | total_frag = 0 186 | for read1, read2 in read_pair_generator(samfile): 187 | total_frag += 1 188 | if read1.is_unmapped or read2.is_unmapped: 189 | if read1.infer_query_length() and read1.infer_query_length() > 15 and \ 190 | read2.infer_query_length() and read2.infer_query_length() > 15: 191 | cdsm.n_unmapped += 1 192 | continue 193 | 194 | if read1.reference_name != read2.reference_name or abs(read1.tlen) > im_dist_cutoff: 195 | cdsm.n_intermol += 1 196 | if intermol_bam: 197 | intermol_bam.write(read1) 198 | intermol_bam.write(read2) 199 | continue 200 | 201 | if trim_type == "HighConf": 202 | if adap_v2: 203 | if is_overlapped(read1, read2): 204 | cdsm.n_high_conf 
+= 1 205 | else: 206 | cdsm.n_close_proxim += 1 207 | else: 208 | if is_complete_overlapped_excluding_sclips(read1, read2): 209 | cdsm.n_high_conf += 1 210 | # elif trim_type == "LowConf": 211 | # if is_complete_overlapped_excluding_sclips(read1, read2): 212 | # cdsm.n_low_conf += 1 213 | # elif trim_type == "SingleHiconf": 214 | # if (not read1.has_tag('tm') or read1.get_tag('tm') != 4 ) and \ 215 | # (not read2.has_tag('tm') or read2.get_tag('tm') != 4 ) and \ 216 | # is_complete_overlapped_excluding_sclips(read1, read2): 217 | # cdsm.n_single_hiconf += 1 218 | # elif trim_type == "SingleLowconf": 219 | # if is_complete_overlapped_excluding_sclips(read1, read2): 220 | # cdsm.n_single_lowconf += 1 221 | # elif trim_type == "TrimOne" or trim_type == "UntrimBoth": 222 | # if not adap_v2 and is_overlapped(read1, read2): 223 | # cdsm.n_insuf_trim += 1 224 | 225 | return total_frag 226 | 227 | def process(opts): 228 | cdsm = CdsMetrics(opts.sample_id) 229 | adap_v2 = False 230 | if opts.fastp: 231 | with open(opts.fastp, 'r') as f: 232 | sample_dict = json.load(f) 233 | cdsm.n_raw_frag = int(sample_dict['summary']["before_filtering"]["total_reads"]) // 2 234 | cdsm.n_raw_pf_frag = int(sample_dict['summary']['after_filtering']['total_reads']) // 2 235 | cdsm.pct_raw_uf_q20 = sample_dict['summary']['before_filtering']['q20_rate'] 236 | cdsm.pct_raw_uf_q30 = sample_dict['summary']['before_filtering']['q30_rate'] 237 | 238 | if opts.cds_intermol_bamout: 239 | bam = pysam.AlignmentFile(opts.highconf_bam, "rb") 240 | cds_intermol_writer = pysam.AlignmentFile(opts.cds_intermol_bamout + ".tmp.bam", "wb", template=bam) 241 | bam.close() 242 | 243 | if opts.lowconf_bam: 244 | num_frag_processed = alignment_analysis(opts.lowconf_bam, cdsm, trim_type = 'LowConf') 245 | adap_v2 = num_frag_processed == 0 246 | else: 247 | adap_v2 = True 248 | 249 | if opts.trim_log: 250 | parse_linker_trim_log(opts.trim_log, cdsm, adap_v2) 251 | 252 | if opts.highconf_bam: 253 | if opts.cds_intermol_bamout: 254 | nread_processed = alignment_analysis(opts.highconf_bam, cdsm, trim_type = 'HighConf', intermol_bam=cds_intermol_writer, adap_v2=adap_v2) 255 | else: 256 | nread_processed = alignment_analysis(opts.highconf_bam, cdsm, trim_type='HighConf', adap_v2=adap_v2) 257 | if cdsm.n_raw_frag == 0: 258 | cdsm.n_raw_frag = nread_processed 259 | if opts.trim_one_bam: 260 | alignment_analysis(opts.trim_one_bam, cdsm, trim_type = 'TrimOne') 261 | if opts.untrim_both_bam: 262 | alignment_analysis(opts.untrim_both_bam, cdsm, trim_type = 'UntrimBoth') 263 | if opts.si_hiconf_bam: 264 | alignment_analysis(opts.si_hiconf_bam, cdsm, trim_type = 'SingleHiconf') 265 | if opts.si_lowconf_bam: 266 | alignment_analysis(opts.si_lowconf_bam, cdsm, trim_type = 'SingleLowconf') 267 | 268 | 269 | if opts.hs_metrics: 270 | hs_metrics_df = pd.read_csv(opts.hs_metrics, skiprows=6, nrows=1, sep='\t', low_memory=False) 271 | cdsm.pct_aligned_frag = hs_metrics_df['PCT_PF_UQ_READS_ALIGNED'][0] 272 | cdsm.on_target_rate = hs_metrics_df['PCT_SELECTED_BASES'][0] 273 | cdsm.mean_bait_cov = hs_metrics_df['MEAN_BAIT_COVERAGE'][0] 274 | cdsm.mean_target_cov = hs_metrics_df['MEAN_TARGET_COVERAGE'][0] 275 | print(cdsm) 276 | if opts.cds_intermol_bamout: 277 | cds_intermol_writer.close() 278 | os.system(f"samtools sort -n {opts.cds_intermol_bamout}.tmp.bam -o {opts.cds_intermol_bamout} && rm {opts.cds_intermol_bamout}.tmp.bam") 279 | 280 | 281 | if __name__ == '__main__': 282 | sys.exit(process(get_arguments())) 283 | 
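# Illustrative invocation of cds_summarize.py (hypothetical file names; the flags
# are the script's own argparse options):
#   python cds_summarize.py --sample_id S1 --highconf_bam S1.highconf.bam \
#       --fastp S1.fastp.json --trim_log S1.trim.log \
#       --hs_metrics S1.hs_metrics.txt > S1.byproduct.txt
# The script prints one header line plus one row (CdsMetrics.__str__) giving
# per-category fragment counts and their fractions of total raw fragments.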
-------------------------------------------------------------------------------- /snakemake/script/codec2maf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(optparse) 3 | 4 | option_list = list( 5 | make_option(c("-i", "--inputmaf"), type="character", default=NULL, 6 | help="input data"), 7 | make_option(c("-o", "--outputmaf"), type="character", default=NULL, 8 | help="output file name"), 9 | make_option(c("-Q", "--q60frac"), type="double", default=0.0, 10 | help="min q60frac [default= %default]"), 11 | make_option(c("-N", "--Nfrac"), type="double", default=1.0, 12 | help="max N frac [default= %default]"), 13 | make_option(c("-L", "--minIndelFragLen"), type="integer", default=0, 14 | help="minimum fraglen when containing indels [default= %default]"), 15 | make_option(c("-l", "--maxIndelLen"), type="integer", default=50, 16 | help="maximum small indel len [default= %default]"), 17 | make_option(c("-D", "--minIndelDist2End"), type="integer", default=0, 18 | help="minimum distance of an indel to the fragend [default= %default]"), 19 | make_option(c("-b", "--breakmnv"), action="store_true", default=FALSE, 20 | help="break the MNV to SNV [default %default]"), 21 | make_option(c("-p", "--pairedOnly"), action="store_true", default=FALSE, 22 | help="only include paired reads"), 23 | make_option(c("-f", "--countByFragId"), action="store_true", default=FALSE, 24 | help="count by fragment id: (frag_len, dist_to_fragend). This is to avoid artifact fragments due to UMI; it won't work for ddBTP data") 25 | 26 | ); 27 | 28 | opt_parser = OptionParser(option_list=option_list); 29 | opt = parse_args(opt_parser); 30 | 31 | if (is.null(opt$inputmaf)){ 32 | print_help(opt_parser) 33 | stop("At least one argument must be supplied (input file).\n", call.=FALSE) 34 | } 35 | 36 | library(data.table) 37 | library(tidyverse) 38 | 39 | break_mnv <- function(mut_fam_df, snv_only=FALSE) { 40 | indel = mut_fam_df %>% filter(type != "SNV") 41 | mut_fam_df = mut_fam_df %>% filter(type == "SNV") 42 | snv = mut_fam_df %>% filter(nchar(ref) == 1) 43 | mnv = mut_fam_df %>% filter(nchar(ref) > 1) 44 | mnv2snv = data.frame() 45 | if (nrow(mnv) > 0) { 46 | for (i in 1:nrow(mnv)) { 47 | for (j in 1:nchar(mnv[i,]$ref)) { 48 | tmp = mnv[i,] 49 | tmp$ref = substr(mnv[i,]$ref, j, j) 50 | tmp$alt = substr(mnv[i,]$alt, j, j) 51 | tmp$ref_pos = mnv[i,]$ref_pos + j - 1 52 | mnv2snv = rbind(mnv2snv, tmp) 53 | } 54 | } 55 | } 56 | if (snv_only) { 57 | rbind(snv, mnv2snv) 58 | } 59 | else { 60 | rbind(snv, mnv2snv, indel) 61 | } 62 | } 63 | 64 | add_varid <- function(df) { 65 | if ("Chromosome" %in% colnames(df)) { 66 | df = df %>% mutate(id = paste(Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep="_")) 67 | } else if ("chrom" %in% colnames(df)) { 68 | df = df %>% mutate(id = paste(chrom, ref_pos, ref, alt, sep="_")) 69 | } 70 | df 71 | } 72 | 73 | convert_chr_to_numeric <- function(chr) { 74 | chr <- gsub("chr", "", chr) # Remove the "chr" prefix 75 | 76 | # Convert special cases 77 | if (chr == "X") return(23) 78 | if (chr == "Y") return(24) 79 | if (chr == "M") return(25) 80 | 81 | return(as.numeric(chr)) # For numbered chromosomes, convert to numeric 82 | } 83 | 84 | codec2maf <- function(infile, outfile, q60rate, Nrate, min_frag_indel, min_dist_indel, max_indel_len, breakmnv, pairedOnly, countByFragId) { 85 | codec = fread(infile) 86 | print(paste(nrow(codec %>% filter(type != "SNV")), "fragments contain an INDEL")) 87 | print(paste(nrow(codec %>% 
filter(type == "SNV")), "frag contains SNV")) 88 | #codec = codec %>% filter(numQ60/olen >= q60rate & numN/flen <= Nrate) 89 | if (pairedOnly) { 90 | codec = codec %>% filter(flen != 0) 91 | } 92 | codec = codec %>% filter(numQpass/clen >= q60rate & (flen == 0 | numN/flen <= Nrate)) 93 | codec = codec %>% filter(type == "SNV" | (flen >= as.integer(min_frag_indel))) 94 | codec = codec %>% filter(type == "SNV" | (dist_to_fragend >= as.integer(min_dist_indel))) 95 | codec = codec %>% mutate(fragid = paste0(flen, "_", dist_to_fragend)) 96 | codec = codec %>% filter(nchar(ref) <= max_indel_len & nchar(alt) <= max_indel_len) 97 | print(paste(nrow(codec %>% filter(type != "SNV")), "frag contains INDEL after filtering")) 98 | print(paste(nrow(codec %>% filter(type == "SNV")), "frag contains SNV after filtering")) 99 | if (breakmnv) { 100 | codec = break_mnv(codec) 101 | } 102 | codec = add_varid(codec) 103 | print(head(codec)) 104 | if (countByFragId) { 105 | codec = codec %>% group_by(id) %>% summarise(chrom = unique(chrom), 106 | ref_pos = unique(ref_pos), 107 | ref = unique(ref), 108 | alt = unique(alt), 109 | type = unique(type), 110 | t_alt_count=length(unique(fragid)), 111 | t_ref_count=unique(site_depth)) 112 | } else { 113 | codec = codec %>% group_by(id) %>% summarise(chrom = unique(chrom), 114 | ref_pos = unique(ref_pos), 115 | ref = unique(ref), 116 | alt = unique(alt), 117 | type = unique(type), 118 | t_alt_count=length(unique(read_name)), 119 | t_ref_count=unique(site_depth)) 120 | } 121 | codec = codec[, 2:ncol(codec)] 122 | codec$chrid = sapply(codec$chrom, convert_chr_to_numeric) 123 | codec = arrange(codec, chrid, ref_pos) 124 | codec = codec[, 1:(ncol(codec) - 1)] 125 | colnames(codec)[1:5] = c("Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Type") 126 | codec = codec %>% mutate(Variant_Type = ifelse(Variant_Type=="SNV", "SNP", Variant_Type)) 127 | codec$Hugo_Symbol = "Unknown" 128 | codec$Tumor_Sample_Barcode = "TUMOR" 129 | #codec$tumor_f=0.5 130 | codec = codec %>% mutate(t_ref_count = t_ref_count - t_alt_count) 131 | codec = codec %>% select("Hugo_Symbol", "Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Type", "Tumor_Sample_Barcode", "t_alt_count", "t_ref_count") 132 | write_tsv(codec, outfile) 133 | print(paste(nrow(codec %>% filter(Variant_Type == "SNP")), "snvs write to", outfile)) 134 | print(paste(nrow(codec %>% filter(Variant_Type != "SNP")), "indels write to", outfile)) 135 | } 136 | 137 | codec2maf(opt$inputmaf, opt$outputmaf, opt$q60frac, opt$Nfrac, opt$minIndelFragLen, opt$minIndelDist2End, opt$maxIndelLen, opt$breakmnv, opt$pairedOnly, opt$countByFragId) 138 | -------------------------------------------------------------------------------- /snakemake/script/cov_sum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | 8 | logger = logging.getLogger("{}".format(__file__)) 9 | 10 | def process(): 11 | cov_count = {} 12 | for line in sys.stdin: 13 | if line.startswith('REF'): 14 | continue 15 | cols = line.strip().split("\t") 16 | cov = int(cols[2]) 17 | if cov not in cov_count: 18 | cov_count[cov] = 1 19 | else: 20 | cov_count[cov] += 1 21 | 22 | for k in sorted(cov_count.keys()): 23 | print(k, cov_count[k], sep='\t') 24 | 25 | if __name__ == '__main__': 26 | sys.exit(process()) 27 | 28 | -------------------------------------------------------------------------------- 
/snakemake/script/create_maf_from_probe_rg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import sys 5 | import pysam 6 | 7 | logger = logging.getLogger("{}".format(__file__)) 8 | def read_bed(bedfile): 9 | """ Creates generator from bed file or interval_list """ 10 | logger.info("Reading region file...") 11 | interval_list = bedfile.endswith("interval_list") 12 | with open(bedfile, "r") as bed: 13 | for line in bed: 14 | if line.startswith("@"): 15 | continue 16 | line = line.strip() 17 | chrom, start, stop = line.split()[0:3] 18 | start, stop = int(start), int(stop) 19 | if interval_list: 20 | start -= 1 21 | yield chrom, start, stop 22 | 23 | def get_arguments(): 24 | 25 | parser = argparse.ArgumentParser(prog="Create a MAF of probe-midpoint sites", formatter_class=argparse.RawDescriptionHelpFormatter) 26 | parser.add_argument("--bed", type=str, help="bed or interval_list file of probe regions", required=True) 27 | parser.add_argument("--ref", type=str, help="indexed reference fasta", required=True) 28 | args = parser.parse_args() 29 | return args 30 | 31 | def process(opts): 32 | header= ['Hugo_Symbol', 'Chromosome', 'Start_position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'probed'] 33 | reffa = pysam.FastaFile(opts.ref) 34 | print('\t'.join(header)) 35 | for chrom, s, e in read_bed(opts.bed): 36 | halfw = (int(e) - int(s)) // 2 # half the interval width, so start lands on the probe midpoint 37 | start = int(s) + halfw 38 | REF = reffa.fetch(chrom, start, start + 1) 39 | ALT = REF 40 | line = ['NA', chrom, str(start + 1), REF, ALT, '1'] 41 | print('\t'.join(line)) 42 | 43 | if __name__ == '__main__': 44 | sys.exit(process(get_arguments())) 45 | -------------------------------------------------------------------------------- /snakemake/script/downsample_read_families.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import pysam 7 | import random 8 | from random import sample 9 | 10 | def parse_cl_args(): 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("--in_bam", help="input BAM", required=True) 14 | parser.add_argument("--sample_id", help="sample_id for series of bams", required=True) 15 | parser.add_argument("--outdir", help="output directory", default="./") 16 | parser.add_argument("--min_family_size", type = int, help="minimum family size (A+B) for downsampling", default=20) 17 | parser.add_argument("--max_family_size", type = int, help="max family size for outputting a bam", default=20) 18 | parser.add_argument("--min_strand_specific_family_size", type = int, default = 10) 19 | parser.add_argument("--seed", help="seed for random sampling", default=7) 20 | parser.add_argument("--is_codec", help="if is CODEC library", default=False, action='store_true') 21 | 22 | return parser.parse_args() 23 | 24 | class PairEnd: 25 | def __init__(self, aln, is_cds): 26 | self.name = aln.query_name 27 | self.reads = [] 28 | self.fid = None 29 | self.strand = None 30 | self.push(aln, is_cds) 31 | 32 | def push(self, aln, is_cds): 33 | assert(aln.query_name == self.name) 34 | 35 | try: 36 | mitag = aln.get_tag("MI") 37 | except KeyError: 38 | sys.stderr.write(aln.query_name + " has no MI tag\n"); raise # re-raise: the family id below requires the MI tag 39 | if is_cds: 40 | fid = mitag 41 | strand = None 42 | else: 43 | if "/" in mitag: 44 | fid, strand = mitag.split("/") 45 | else: 46 | fid = mitag 47 | strand = None 48 | if self.fid: 49 | assert(self.fid == fid) 50 | else: 51 | self.fid = fid 52 | if strand 
and self.strand: 53 | assert(strand == self.strand) 54 | elif strand: 55 | self.strand = strand 56 | 57 | self.reads.append(aln) 58 | 59 | 60 | class Duplex: 61 | def __init__(self, pairend, is_cds): 62 | self.A_reads = [] 63 | self.B_reads = [] 64 | self.cds_reads = [] 65 | self.fid = None 66 | self.push(pairend, is_cds) 67 | self.is_cds = is_cds 68 | 69 | def sizeA(self): 70 | if self.is_cds: 71 | return len(self.cds_reads) 72 | else: 73 | return len(self.A_reads) 74 | 75 | def sizeB(self): 76 | if self.is_cds: 77 | return len(self.cds_reads) 78 | else: 79 | return len(self.B_reads) 80 | 81 | def size(self): 82 | if self.is_cds: 83 | return len(self.cds_reads) 84 | else: 85 | return self.sizeA() + self.sizeB() 86 | 87 | def push(self, pairend, is_cds): 88 | if self.fid: 89 | assert(self.fid == pairend.fid) 90 | else: 91 | self.fid = pairend.fid 92 | if is_cds: 93 | self.cds_reads.append(pairend) 94 | else: 95 | if pairend.strand == 'A': 96 | self.A_reads.append(pairend) 97 | elif pairend.strand == 'B': 98 | self.B_reads.append(pairend) 99 | else: 100 | raise ValueError(pairend.name + " MI tag malformed\n") 101 | 102 | 103 | def sub_sample(duplex, target_size, is_cds): 104 | assert(duplex.size() >= target_size) 105 | 106 | ret = [] 107 | if is_cds: 108 | draw = sample(list(range(duplex.size())), target_size) 109 | ret = [duplex.cds_reads[ii] for ii in draw] 110 | return ret 111 | 112 | if target_size == 1: 113 | draw = sample(list(range(duplex.size())), target_size) 114 | ret.append(duplex.A_reads[draw[0]] if draw[0] < duplex.sizeA() else duplex.B_reads[draw[0] - duplex.sizeA()]) 115 | return ret 116 | 117 | if duplex.sizeA() < duplex.sizeB(): 118 | if duplex.sizeA() < target_size / 2: 119 | ret = duplex.A_reads 120 | else: 121 | idx = sample(list(range(duplex.sizeA())), int(target_size / 2)) 122 | ret = [duplex.A_reads[ii] for ii in idx] 123 | rest_idx = sample(list(range(duplex.sizeB())), target_size - len(ret)) 124 | ret = ret + [duplex.B_reads[ii] for ii in rest_idx] 125 | else: 126 | if duplex.sizeB() < target_size / 2: 127 | ret = duplex.B_reads 128 | else: 129 | idx = sample(list(range(duplex.sizeB())), int(target_size / 2)) 130 | ret = [duplex.B_reads[ii] for ii in idx] 131 | rest_idx = sample(list(range(duplex.sizeA())), target_size - len(ret)) 132 | ret = ret + [duplex.A_reads[idx] for idx in rest_idx] 133 | 134 | return ret 135 | 136 | 137 | class DuplexFamilyBamsWriter: 138 | def __init__(self, in_bam, max_fs, min_fs, min_sp_fs, outdir, sid, is_codec): 139 | self.max_family_size = max_fs 140 | self.min_family_size_to_downsample = min_fs 141 | self.min_strand_specific_size_to_downsample = min_sp_fs 142 | self.sample_id = sid 143 | self.out_bams = [] 144 | self.is_codec = is_codec 145 | for ii in range(max_fs): 146 | fname = sid + "_" + str(ii + 1) + ".bam" 147 | fpath = os.path.join(outdir, fname) 148 | self.out_bams.append(pysam.AlignmentFile(fpath, "wb", template=in_bam)) 149 | 150 | def write_duplex(self, duplex): 151 | if duplex.size() >= self.min_family_size_to_downsample and \ 152 | duplex.sizeA() >= self.min_strand_specific_size_to_downsample and \ 153 | duplex.sizeB() >= self.min_strand_specific_size_to_downsample: 154 | for fs in range(self.max_family_size): 155 | if duplex.size() > fs: 156 | fragments = sub_sample(duplex, fs + 1, self.is_codec) 157 | for frag in fragments: 158 | for record in frag.reads: 159 | self.out_bams[fs].write(record) 160 | 161 | def close_all(self): 162 | for bam in self.out_bams: 163 | bam.close() 164 | 165 | 166 | def process(opts): 167 | 
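# Overview of the streaming pass below: reads of a pair and pairs of an MI family
# are assumed adjacent in the input (e.g. fgbio GroupReadsByUmi output); fstack
# buffers alignments of the current read pair, dstack buffers read pairs of the
# current family, and each completed duplex is written via DuplexFamilyBamsWriter.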
168 | random.seed(opts.seed) 169 | in_bam = pysam.AlignmentFile(opts.in_bam, "rb") 170 | dstack = [] #duplex stack 171 | fstack = [] #fragment stack 172 | writer = DuplexFamilyBamsWriter(in_bam, opts.max_family_size, opts.min_family_size, opts.min_strand_specific_family_size, 173 | opts.outdir, opts.sample_id, opts.is_codec) 174 | 175 | for aln in in_bam.fetch(until_eof=True): 176 | if fstack: 177 | if fstack[-1].name == aln.query_name: 178 | fstack[-1].push(aln, is_cds=opts.is_codec) 179 | continue 180 | else: 181 | pe = fstack.pop() 182 | # do something 183 | if dstack: 184 | if dstack[-1].fid == pe.fid: 185 | dstack[-1].push(pe, is_cds=opts.is_codec) 186 | else: 187 | dpx = dstack.pop() 188 | writer.write_duplex(dpx) 189 | dpx = Duplex(pe, is_cds=opts.is_codec) 190 | dstack.append(dpx) 191 | else: 192 | dpx = Duplex(pe, is_cds=opts.is_codec) 193 | dstack.append(dpx) 194 | 195 | pe = PairEnd(aln, opts.is_codec) 196 | fstack.append(pe) 197 | 198 | # process the last read 199 | if fstack: 200 | pe = fstack.pop() 201 | # do something 202 | if dstack: 203 | if dstack[-1].fid == pe.fid: 204 | dstack[-1].push(pe, is_cds = opts.is_codec) 205 | else: 206 | dpx = dstack.pop() 207 | writer.write_duplex(dpx) 208 | dpx = Duplex(pe, is_cds=opts.is_codec) 209 | dstack.append(dpx) 210 | 211 | if dstack: 212 | dpx = dstack.pop() 213 | writer.write_duplex(dpx) 214 | 215 | in_bam.close() 216 | writer.close_all() 217 | 218 | 219 | if __name__ == "__main__": 220 | process(parse_cl_args()) 221 | -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/bam_iterator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/bam_iterator.cpython-36.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/bam_iterator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/bam_iterator.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/collect_duplex_metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/collect_duplex_metrics.cpython-36.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/downsampler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/downsampler.cpython-36.pyc -------------------------------------------------------------------------------- 
/snakemake/script/dpx/__pycache__/downsampler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/downsampler.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/intervals.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/intervals.cpython-36.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/intervals.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/intervals.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/downsampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collections import Counter 3 | 4 | import numpy as np 5 | import sys 6 | 7 | 8 | np.random.seed(10) 9 | 10 | def get_overlap(read, interval_dict): 11 | rchrom = str(read.reference_name) 12 | rstart = read.reference_start 13 | rend = read.reference_end 14 | if rchrom in interval_dict: 15 | olap = interval_dict[rchrom].search(rstart, rend) 16 | if olap: 17 | tstart, tend = olap[0].start, olap[0].end 18 | overlap = min(rend, tend) - max(rstart, tstart) 19 | target = f"{rchrom}:{tstart}-{tend}" 20 | return target, overlap 21 | return None, 0 22 | 23 | class Downsampler: 24 | """ Class to perform downsampling on a list of family IDs 25 | from Fgbio GroupReadsByUmi """ 26 | 27 | def __init__(self, probabilities, min_min_strand_reads, min_max_strand_reads, per_target, interval_dict, is_cds): 28 | self.probabilities = probabilities 29 | self.min_min_strand_reads = min_min_strand_reads 30 | self.min_max_strand_reads = min_max_strand_reads 31 | self.kept_families = defaultdict(list) 32 | self.counts = ( 33 | defaultdict(Counter) 34 | if not per_target 35 | else defaultdict(lambda: defaultdict(Counter)) 36 | ) 37 | self.per_target = per_target 38 | self.interval_dict = interval_dict 39 | self.is_cds = is_cds 40 | 41 | def downsample(self, read_pairs, probability): 42 | """ Downsamples list of read pairs at a given probability """ 43 | duplexes = defaultdict(lambda: defaultdict(lambda: 0)) 44 | summary_counts = defaultdict(lambda: 0) 45 | min_min_strand_reads = self.min_min_strand_reads 46 | min_max_strand_reads = self.min_max_strand_reads 47 | kept_reads = [] 48 | previous_coordinate = set() 49 | for read_pair in read_pairs: 50 | strand = read_pair.strand 51 | family = read_pair.family 52 | if not family: 53 | continue 54 | coordinate_id = read_pair.coordinate_id 55 | if np.random.random() <= probability: 56 | summary_counts["read_pairs"] += 1 57 | if coordinate_id not in previous_coordinate: 58 | summary_counts["cs_families"] += 1 59 | previous_coordinate.add(coordinate_id) 60 | duplexes[family][strand] += 1 61 | if self.is_cds and read_pair.are_ends_overlapped(): 62 | if strand == "A": 63 | duplexes[family]["B"] += 1 64 | elif strand == "B": 65 | duplexes[family]["A"] += 1 66 | #if self.per_target: 67 | 
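# For CODEC (is_cds) pairs whose two ends overlap, the opposite strand was also
# credited above: a single read pair carries evidence for both strands.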
kept_reads.append(read_pair) 68 | for family, count in duplexes.items(): 69 | ss_families = (count["A"] > 0) + (count["B"] > 0) 70 | ds_families = int(ss_families > 0) 71 | summary_counts["ss_families"] += ss_families 72 | summary_counts["ds_families"] += ds_families 73 | if min(count["A"], count["B"]) >= min_min_strand_reads and \ 74 | max(count["A"], count["B"]) >= min_max_strand_reads: 75 | summary_counts["ds_duplexes"] += 1 76 | return summary_counts, kept_reads 77 | 78 | def run_downsamplings(self, reads, serial_sampling=True): 79 | """ When serial sampling is true, we use reads from the sampling of the 80 | next highest probability.""" 81 | probs = np.sort(self.probabilities)[::-1] 82 | adj_probs = probs 83 | if serial_sampling: 84 | # If serial sampling, we need to adjust probability as kept_reads 85 | # is smaller after each sampling. 86 | adj_probs = adj_probs / np.insert(adj_probs, 0, 1)[:-1] 87 | 88 | for actual, prob in zip(probs, adj_probs): 89 | summary_counts, kept_reads = self.downsample(reads, prob) 90 | summary_counts = Counter(summary_counts) 91 | if serial_sampling: 92 | reads = kept_reads 93 | if self.per_target: 94 | if kept_reads: 95 | if kept_reads[0].read1 and kept_reads[0].read2: 96 | target, _ = kept_reads[0].get_overlap(self.interval_dict) 97 | elif kept_reads[0].read1: 98 | target, _ = get_overlap(kept_reads[0].read1, self.interval_dict) 99 | elif kept_reads[0].read2: 100 | target, _ = get_overlap(kept_reads[0].read2, self.interval_dict) 101 | if not target: 102 | print(kept_reads[0].read1, "\n", kept_reads[0].read2) 103 | assert(target) 104 | self.counts[target][actual] = ( 105 | self.counts[target][actual] + summary_counts 106 | ) 107 | else: 108 | self.counts[actual] = self.counts[actual] + summary_counts 109 | -------------------------------------------------------------------------------- /snakemake/script/dpx/get_mutant_metrics.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import click 3 | import pandas as pd 4 | import pysam 5 | 6 | from bam_iterator import iterate_pileup_reads, get_unclipped_read_ends, get_unclipped_fragment_ends, keep_read, is_proper_orientation 7 | 8 | 9 | class TargetMolecule: 10 | def __init__( 11 | self, pileupread, target_pos, target_ref_base, target_alt_base 12 | ): 13 | self.pileupread = pileupread 14 | self.target_pos = target_pos 15 | self.target_ref_base = target_ref_base 16 | self.target_alt_base = target_alt_base 17 | self.read = pileupread.alignment 18 | self.chrom = self.read.reference_name 19 | self.read_start, self.read_end = get_unclipped_read_ends( 20 | start = self.read.reference_start, cigar = self.read.cigarstring 21 | ) 22 | self.read_sequence = self.read.query_sequence 23 | self.base = self.read.query_sequence[pileupread.query_position] 24 | self.start, self.end = get_unclipped_fragment_ends( 25 | read1_chrom = self.chrom, 26 | read1_start = self.read.reference_start, 27 | read1_cigar = self.read.cigarstring, 28 | read1_reverse = self.read.is_reverse, 29 | read2_chrom = self.read.next_reference_name, 30 | read2_start = self.read.next_reference_start, 31 | read2_cigar = self.get_read_tag("MC"), 32 | read2_reverse = self.read.mate_is_reverse 33 | ) 34 | self.mapping_quality = self.read.mapping_quality 35 | self.base_quality = self.read.query_qualities[pileupread.query_position] 36 | self.family_id = self.get_read_tag("MI") 37 | self.family_size = self.get_read_tag("cD") 38 | self.umi = self.get_read_tag("RX") 39 | self.overlap = False 
40 | self.discordant = False 41 | self.mismatch_count = self.get_read_tag( 42 | "NM" 43 | ) - pileupread.alignment.query_alignment_sequence.count( 44 | "N" 45 | ) # added by Chris 46 | self.N_count = pileupread.alignment.query_alignment_sequence.count( 47 | "N" 48 | ) # added by Chris 49 | 50 | def __str__(self): 51 | args = [ 52 | f'{k}="{v}"' 53 | for k, v in self.__dict__.items() 54 | if k not in ["pileupread", "read"] 55 | ] 56 | args = ", ".join(args) 57 | return f"{self.__class__.__name__}({args})" 58 | 59 | def get_read_tag(self, tag, default="NA"): 60 | try: 61 | return self.read.get_tag(tag) 62 | except KeyError: 63 | return default 64 | 65 | def get_mapping_qualities(self): 66 | """ Returns tuple: (read1.mapping_quality, read2.mapping_quality) """ 67 | res = (self.read.mapping_quality, self.get_read_tag("MQ")) 68 | if not self.read.is_read1: 69 | res = res[::-1] 70 | return res 71 | 72 | @property 73 | def aD_bD(self): 74 | """ aD and bD tags (sorted so max is first) """ 75 | ad, bd = self.get_read_tag("aD"), self.get_read_tag("bD") 76 | ad, bd = sorted([int(ad), int(bd)], reverse=True) 77 | return f"{ad}/{bd}" 78 | 79 | @property 80 | def insert_size(self): 81 | """ Return insert size with unclipped ends taken into account """ 82 | return self.end - self.start 83 | 84 | @property 85 | def read_position(self): 86 | """ Returns 5' position of base in read """ 87 | if self.read.is_reverse: 88 | pos = self.read.infer_read_length() - self.pileupread.query_position 89 | else: 90 | pos = self.pileupread.query_position + 1 91 | return pos 92 | 93 | @property 94 | def distance_from_5_prime(self): 95 | """ Returns distance from 5' end of fragment. In cases 96 | where reads overlap, 5' distance from read may not equal 5' 97 | distance from end of fragment """ 98 | return min( 99 | [self.read_position, abs(self.insert_size) - self.read_position + 1] 100 | ) 101 | 102 | @property 103 | def target_site(self): 104 | """ Returns string representing target site """ 105 | return f"{self.chrom}:{self.target_pos}" 106 | 107 | @property 108 | def fragment_id(self): 109 | """ Returns string representing fragment_id """ 110 | return f"{self.chrom}:{self.start + 1}-{self.end}" 111 | 112 | @property 113 | def molecule_class(self): 114 | """ Return string representing molecule class """ 115 | if self.base == self.target_ref_base: 116 | return "REF" 117 | elif self.base == self.target_alt_base: 118 | return "ALT" 119 | else: 120 | return "OTHER" 121 | 122 | 123 | def parse_maf(maf_file): 124 | """ Reads in MAF file and yields a Variant namedtuple """ 125 | infile = pd.read_table(maf_file) 126 | headers = infile.columns 127 | Variant = namedtuple("VariantSite", headers) 128 | for idx, row in infile.iterrows(): 129 | yield Variant(*row) 130 | 131 | 132 | def pass_filter(pileupread): 133 | """ Make sure read can be used """ 134 | if ( 135 | not pileupread.is_del 136 | and not pileupread.is_refskip 137 | and keep_read(pileupread.alignment) 138 | and is_proper_orientation( 139 | read1_chrom = pileupread.alignment.reference_name, 140 | read1_start = pileupread.alignment.reference_start, 141 | read1_reverse = pileupread.alignment.is_reverse, 142 | read2_chrom = pileupread.alignment.next_reference_name, 143 | read2_start = pileupread.alignment.next_reference_start, 144 | read2_reverse = pileupread.alignment.mate_is_reverse 145 | ) 146 | ): 147 | return True 148 | return False 149 | 150 | 151 | def check_discordance(molecule, other_molecule): 152 | """ Check discordance between two molecules and update """ 153 | 
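# Both mates are flagged as overlapping; if their bases disagree at the target
# position, both are marked discordant and set to "N" (classed as OTHER).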
molecule.overlap = True 154 | other_molecule.overlap = True 155 | if molecule.base != other_molecule.base: 156 | molecule.discordant = True 157 | molecule.base = "N" 158 | other_molecule.discordant = True 159 | other_molecule.base = "N" 160 | 161 | 162 | def target_molecule_generator( 163 | variant_site, bam, mapping_quality, base_quality, output_both=False 164 | ): 165 | """ For a given variant site, a pileup will be created from the BAM. 166 | Each read overlapping the variant site will be yielded if it 167 | passes filters 168 | Output both: output both reads if both overlap base """ 169 | chrom = str(variant_site.Chromosome) 170 | start = int(variant_site.Start_position) 171 | end = start 172 | ref_base = variant_site.Reference_Allele 173 | alt_base = variant_site.Tumor_Seq_Allele2 174 | molecule_dict = {} 175 | for pileupread in iterate_pileup_reads( 176 | bam, 177 | chrom, 178 | start - 1, 179 | end, 180 | stepper="nofilter", 181 | min_mapping_quality=mapping_quality, 182 | min_base_qual=base_quality, 183 | truncate=True, 184 | max_depth=1000000, 185 | noun="pileup reads", 186 | verb="processed", 187 | log_every=100000, 188 | ): 189 | read_name = pileupread.alignment.query_name 190 | if pass_filter(pileupread): 191 | molecule = TargetMolecule(pileupread, end, ref_base, alt_base) 192 | if read_name not in molecule_dict: 193 | molecule_dict[read_name] = [molecule] 194 | else: 195 | other_molecule = molecule_dict[read_name][0] 196 | check_discordance(molecule, other_molecule) 197 | molecule_dict[read_name].append(molecule) 198 | 199 | for _, molecules in molecule_dict.items(): 200 | if output_both: 201 | for molecule in molecules: 202 | yield molecule 203 | else: 204 | yield molecules[0] 205 | 206 | 207 | @click.command(context_settings=dict(help_option_names=["-h", "--help"])) 208 | @click.option( 209 | "-m", 210 | "--maf_file", 211 | type=click.Path(exists=True), 212 | help="MAF describing variants", 213 | required=True, 214 | ) 215 | @click.option( 216 | "-b", 217 | "--bam_file", 218 | type=click.Path(exists=True), 219 | help="Input DSC/SSC consensus BAM file", 220 | ) 221 | @click.option( 222 | "-f", 223 | "--file_list", 224 | type=click.Path(exists=True), 225 | help="Run on list of BAM files", 226 | ) 227 | @click.option( 228 | "-o", 229 | "--output", 230 | type=click.Path(writable=True), 231 | help="Output file [default: BAM prefix]", 232 | default=None, 233 | ) 234 | @click.option( 235 | "--mapq", 236 | type=int, 237 | help="Minimum mapping quality", 238 | default=60, 239 | show_default=True, 240 | ) 241 | @click.option( 242 | "--baseq", 243 | type=int, 244 | help="Minimum base quality", 245 | default=89, 246 | show_default=True, 247 | ) 248 | @click.option( 249 | "--mutant_only", is_flag=True, help="Only output mutant fragments" 250 | ) 251 | def create_summary_file( 252 | bam_file, maf_file, output, mapq, baseq, file_list, mutant_only 253 | ): 254 | """ Summarize consensus molecules at variant sites. 
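Accepts one BAM via --bam_file or several via --file_list; each BAM produces a <prefix>_pileup_summary.txt with one row per passing molecule at each MAF site.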
""" 255 | if file_list: 256 | bam_files = [x.strip() for x in open(file_list, "r").readlines()] 257 | outputs = [ 258 | bam.split("/")[-1].replace(".bam", "_pileup_summary.txt") 259 | for bam in bam_files 260 | ] 261 | else: 262 | bam_files = [bam_file] 263 | outputs = [ 264 | output 265 | if output 266 | else bam_file.split("/")[-1].replace(".bam", "_pileup_summary.txt") 267 | ] 268 | 269 | for bam_file, output in zip(bam_files, outputs): 270 | print(f"Working on: {bam_file}") 271 | outfile = open(output, "w") 272 | headers = [ 273 | "family_id", 274 | "family_size", 275 | "umi", 276 | "fragment_id", 277 | "target_site", 278 | "molecule_class", 279 | "base", 280 | "read_position", 281 | "distance_from_5_prime", 282 | "discordant", 283 | "mismatch_count", 284 | "N_count", 285 | ] 286 | outfile.write("\t".join(headers) + "\n") 287 | 288 | bam = pysam.AlignmentFile(bam_file) 289 | for variant_site in parse_maf(maf_file): 290 | for target_molecule in target_molecule_generator( 291 | variant_site, bam, mapq, baseq 292 | ): 293 | if mutant_only: 294 | if target_molecule.molecule_class != "ALT": 295 | continue 296 | outfile.write( 297 | "\t".join( 298 | str(target_molecule.__getattribute__(x)) 299 | for x in headers 300 | ) 301 | + "\n" 302 | ) 303 | total_reads = iterate_pileup_reads.counts 304 | print(f"Processed {total_reads} consensus reads") 305 | bam.close() 306 | 307 | 308 | if __name__ == "__main__": 309 | create_summary_file() 310 | -------------------------------------------------------------------------------- /snakemake/script/dpx/intervals.py: -------------------------------------------------------------------------------- 1 | from quicksect import IntervalTree 2 | from collections import defaultdict 3 | from math import ceil 4 | def read_bed(bedfile): 5 | """ Creates generator from bed file or interval_list """ 6 | interval_list = bedfile.endswith("interval_list") 7 | with open(bedfile, "r") as bed: 8 | for line in bed: 9 | if line.startswith("@"): 10 | continue 11 | line = line.strip() 12 | chrom, start, stop = line.split()[0:3] 13 | start, stop = int(start), int(stop) 14 | if interval_list: 15 | start -= 1 16 | yield chrom, start, stop 17 | 18 | 19 | def create_interval_dict_from_bed(bedfile, midpoint=False): 20 | """ 21 | Used for marking on/off target fragments by creating interval 22 | trees for each chromosome in bedfile. 
23 | """ 24 | interval_dict = defaultdict(IntervalTree) 25 | if bedfile: 26 | for chrom, start, stop in read_bed(bedfile): 27 | if midpoint: 28 | mid = ceil((start+stop)/2) 29 | start = mid - 1 30 | stop = mid 31 | interval_dict[str(chrom)].add(start, stop) 32 | return interval_dict 33 | -------------------------------------------------------------------------------- /snakemake/script/error_rate_by_family_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.ticker as mtick 10 | from matplotlib.ticker import MaxNLocator 11 | import seaborn as sns 12 | sns.set(font_scale=2) 13 | logger = logging.getLogger("{}".format(__file__)) 14 | 15 | def get_arguments(): 16 | 17 | parser = argparse.ArgumentParser(prog="plot error rates stratified by family size", formatter_class=argparse.RawDescriptionHelpFormatter) 18 | parser.add_argument("accu", type=str, help="input accu file") 19 | args = parser.parse_args() 20 | return args 21 | 22 | def absolute_path(path): 23 | """convert relative path to absolute path""" 24 | if os.path.isabs(path): 25 | return path 26 | else: 27 | return os.path.join(os.getcwd(), path) 28 | 29 | def process(options): 30 | fsize_accu = {} 31 | with open(options.accu, 'r') as fh: 32 | for line in fh: 33 | cols = line.strip().split('\t') 34 | if cols[1] == "chrY" or cols[1] == "chrX": 35 | continue 36 | fields = cols[0].split('_') 37 | fsize = int(fields[-1]) 38 | if fsize not in fsize_accu: 39 | fsize_accu[fsize] = [int(cols[3]), int(cols[4]), 1] # count the first family of this size 40 | else: 41 | fsize_accu[fsize][0] += int(cols[3]) 42 | fsize_accu[fsize][1] += int(cols[4]) 43 | fsize_accu[fsize][2] += 1 44 | 45 | accus = [0] * 10 46 | counts = [0] * 10 47 | for k,v in fsize_accu.items(): 48 | if k <= 10: 49 | print(k, v[0], v[1], v[1]/v[0], v[2]) 50 | accus[k-1] = v[1]/v[0] * 1e5 51 | counts[k-1] = v[2] * 1e-6 52 | fig, ax = plt.subplots(figsize=(11.7, 8.27)) 53 | ax.bar(np.arange(10) + 1, accus) 54 | ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 55 | plt.xlabel("Family size") 56 | plt.ylabel("Error per 100k") 57 | fig.savefig('error_rate_by_family_size.png') 58 | 59 | fig1, ax1 = plt.subplots(figsize=(11.7, 8.27)) 60 | ax1.bar(np.arange(10) + 1, counts) 61 | ax1.xaxis.set_major_locator(MaxNLocator(integer=True)) 62 | #ax1.ticklabel_format(axis='y', useMathText=True) 63 | plt.xlabel("Family size") 64 | plt.ylabel("Million fragments") 65 | fig1.savefig('num_frag_family_size.png') 66 | 67 | if __name__ == '__main__': 68 | sys.exit(process(get_arguments())) 69 | 70 | -------------------------------------------------------------------------------- /snakemake/script/extract_false_positive_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import sys 5 | import pysam 6 | import subprocess 7 | import argparse 8 | import os 9 | 10 | def get_arguments(): 11 | 12 | parser = argparse.ArgumentParser(prog="grep false positive reads and print from gbu bam. 
Additionally can print fastq files", 13 | formatter_class=argparse.RawDescriptionHelpFormatter) 14 | parser.add_argument("--fq1", type=str, help="raw fastq1 file", default="") 15 | parser.add_argument("--fq2", type=str, help="raw fastq2 file", default="") 16 | parser.add_argument("bam", type=str, help="groupby_umi_bam") 17 | parser.add_argument('df', type=str, help="error family file") 18 | parser.add_argument('prefix', type=str, help="prefix for output") 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | def process(options): 24 | prefix_id = options.prefix 25 | 26 | df = pd.read_csv(options.df, sep="\t", low_memory=False) 27 | new = df['family_id'].str.split('/', n = 1, expand = True) 28 | df['family_num'] = new[0] 29 | df['strand'] = new[1] 30 | df = df.astype({'family_num' : 'int64'}) 31 | df = df.sort_values(by=['family_num', 'strand']) 32 | df = df[df['pass_filter'] == 1] 33 | gbubam = pysam.AlignmentFile(options.bam, "rb") 34 | tmpbam = pysam.AlignmentFile(prefix_id + ".tmp.bam", "wb", template=gbubam) 35 | f = open(prefix_id + ".run", 'w') 36 | rowit = df.iterrows() 37 | cur_row = next(rowit)[1] 38 | in_match = False 39 | counter = 0 40 | for aln in gbubam.fetch(until_eof=True): 41 | mitag = aln.get_tag("MI") 42 | if mitag == cur_row['family_id']: 43 | if not in_match: 44 | print(cur_row['family_id']) 45 | in_match = True 46 | counter = 0 47 | if options.fq1: 48 | if aln.is_read1: 49 | counter += 1 50 | cmd = ["zegrep", "-A 3", "-m 1", aln.query_name, options.fq1, ">>", prefix_id + ".1.fastq #" + mitag] 51 | print(" ".join(cmd), file=f) 52 | if options.fq2: 53 | if aln.is_read2: 54 | cmd = ["zegrep", "-A 3", "-m 1", aln.query_name, options.fq2, ">>", prefix_id + ".2.fastq #" + mitag] 55 | print(" ".join(cmd), file=f) 56 | tmpbam.write(aln) 57 | if in_match and mitag != cur_row['family_id']: 58 | print("matched", counter, "times") 59 | in_match = False 60 | try: 61 | cur_row = next(rowit)[1] 62 | except StopIteration: 63 | break 64 | tmpbam.close() 65 | cmd = "samtools sort {0} -o {1} && samtools index {1} && rm {0}".format(prefix_id + ".tmp.bam", prefix_id + ".bam") 66 | subprocess.check_call(cmd, shell=True) 67 | if __name__ == '__main__': 68 | sys.exit(process(get_arguments())) 69 | -------------------------------------------------------------------------------- /snakemake/script/familysize_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pysam 8 | import pandas as pd 9 | from collections import defaultdict 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import matplotlib.ticker as plticker 13 | import seaborn as sns 14 | from scipy import stats 15 | 16 | logger = logging.getLogger("{}".format(__file__)) 17 | 18 | def read_pair_generator(bam, region_string=None): 19 | """ 20 | Generate read pairs in a BAM file or within a region string. 21 | Reads are added to read_dict until a pair is found. 
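Usage sketch (the BAM path is hypothetical; the generator itself skips secondary and supplementary alignments, and overlap_len is defined later in this module):

    bam = pysam.AlignmentFile("sample.bam", "rb")
    for read1, read2 in read_pair_generator(bam):
        print(read1.query_name, overlap_len(read1, read2))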
22 | """ 23 | read_dict = defaultdict(lambda: [None, None]) 24 | for read in bam.fetch(until_eof=True): 25 | if read.is_secondary or read.is_supplementary: 26 | continue 27 | qname = read.query_name 28 | if qname not in read_dict: 29 | if read.is_read1: 30 | read_dict[qname][0] = read 31 | else: 32 | read_dict[qname][1] = read 33 | else: 34 | if read.is_read1: 35 | yield read, read_dict[qname][1] 36 | else: 37 | yield read_dict[qname][0], read 38 | del read_dict[qname] 39 | 40 | def read_bed(bedfile): 41 | """ Creates generator from bed file or interval_list """ 42 | logger.info("Reading region file...") 43 | interval_list = bedfile.endswith("interval_list") 44 | with open(bedfile, "r") as bed: 45 | for line in bed: 46 | if line.startswith("@"): 47 | continue 48 | line = line.strip() 49 | chrom, start, stop = line.split()[0:3] 50 | start, stop = int(start), int(stop) 51 | if interval_list: 52 | start -= 1 53 | yield chrom, start, stop 54 | 55 | def is_overlap(read1, read2): 56 | if read1.is_unmapped or read2.is_unmapped: 57 | return False 58 | if read1.reference_name != read2.reference_name: 59 | return False 60 | if read1.reference_start < read2.reference_end and read2.reference_start < read1.reference_end: 61 | return True 62 | return False 63 | def overlap_len(read1, read2): 64 | if not is_overlap(read1, read2): 65 | return 0 66 | else: 67 | return min(read1.reference_end, read2.reference_end) - max(read1.reference_start, read2.reference_start) 68 | 69 | def overlap_span_ratio(read1, read2): 70 | ol = overlap_len(read1, read2) 71 | if ol == 0: 72 | return 0 73 | else: 74 | span = max(read1.reference_end, read2.reference_end) - min(read1.reference_start, read2.reference_start) 75 | return ol/span 76 | 77 | def get_arguments(): 78 | 79 | parser = argparse.ArgumentParser(prog="familysize_dist", formatter_class=argparse.RawDescriptionHelpFormatter) 80 | parser.add_argument("bam", type=str, help="input bam file") 81 | parser.add_argument("bed", type=str, help="target bed") 82 | parser.add_argument("--im_dist_cutoff", default=5000, help="min distance for reads to be considered intermolecular", type=int, required=False) 83 | args = parser.parse_args() 84 | return args 85 | 86 | def absolute_path(path): 87 | """convert relative path to absolute path""" 88 | if os.path.isabs(path): 89 | return path 90 | else: 91 | return os.path.join(os.getcwd(), path) 92 | 93 | def bin_counts(vec, cut): 94 | cat = pd.cut(np.array(vec), cut) 95 | print(cat.value_counts()) 96 | 97 | def cdf_plot(vec, bins, figure, cumulative, xlab): 98 | #values, base = np.histogram(vec, bins=bins) 99 | #cumulative = np.cumsum(values) 100 | #plt.plot(base[:-1], cumulative, c='blue') 101 | fig, ax = plt.subplots() 102 | ax.hist(vec, bins, density=True, cumulative=cumulative, histtype='step') 103 | ax.set_axisbelow(True) 104 | ax.yaxis.grid(color='gray', linestyle='dashed') 105 | ax.xaxis.grid(color='gray', linestyle='dashed') 106 | #loc = plticker.MultipleLocator(base=(max(vec) - min(vec)) / 10) # this locator puts ticks at regular intervals 107 | #ax.xaxis.set_major_locator(loc) 108 | #ax.yaxis.set_major_locator(loc) 109 | ax.set_xlabel(xlab) 110 | dir = " < " if cumulative == 1 else " > " # cumulative=1 accumulates the fraction <= x; cumulative=-1 reverses the direction 111 | ax.set_ylabel("cumulative fraction of reads" + dir + xlab) 112 | ax.set_title("number of reads " + str(len(vec))) 113 | fig.savefig(figure) 114 | plt.close() 115 | 116 | 117 | class CdsStat: 118 | def __init__(self, sample_id): 119 | self.sample_id = sample_id 120 | self.total_frag = 0 121 | self.num_intermol = 0 # intermolecular 122 | self.num_overlap = 0 123 | 
self.read_len_diff = [] 124 | self.aln_len_diff = [] 125 | self.ol_ratios = [] 126 | 127 | def plot_read_len_diff(self): 128 | cdf_plot(self.read_len_diff, 100, "read_len_diff.png", 1, "read len diff") 129 | 130 | def plot_ol_ratios(self): 131 | cdf_plot(self.ol_ratios, 100, "ol_ratios.png", -1, "overlap ratio") 132 | 133 | def __str__(self): 134 | return f"{self.sample_id}\t{self.total_frag}\t{self.num_intermol}\t{self.num_overlap}\t{self.num_intermol/self.total_frag}\t{self.num_overlap/self.total_frag}" 135 | 136 | 137 | 138 | def process(options): 139 | samfile = pysam.AlignmentFile(options.bam, "rb") 140 | fid_size_dict = {} 141 | for chrom, start, end in read_bed(options.bed): 142 | for read in samfile.fetch(chrom, start, end): 143 | if read.is_unmapped or read.mate_is_unmapped: 144 | continue 145 | if read.is_read1 and read.has_tag('UG'): 146 | famid = read.get_tag('UG') 147 | intermol = False 148 | if not read.is_proper_pair or read.template_length == 0 or read.template_length > options.im_dist_cutoff: 149 | intermol = True 150 | if famid not in fid_size_dict: 151 | fid_size_dict[famid] = [intermol, 1] 152 | else: 153 | assert(intermol == fid_size_dict[famid][0]) 154 | fid_size_dict[famid][1] += 1 155 | is_intermol, sizes = zip(*fid_size_dict.values()) 156 | d = {'family_id' : list(fid_size_dict.keys()), 'is_intermol' : list(is_intermol), 'size': list(sizes)} 157 | 158 | 159 | if __name__ == '__main__': 160 | sys.exit(process(get_arguments())) 161 | 162 | -------------------------------------------------------------------------------- /snakemake/script/fastqsplit.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # @author Luis M. Rodriguez-R 4 | # @license artistic license 2.0 5 | # @update Jul-05-2015 6 | # Round robin split 7 | 8 | ### modified by Ruolin Liu to read the file from stdin 9 | use warnings; 10 | use strict; 11 | use Symbol; 12 | 13 | #my ($file, $base, $outN) = @ARGV; 14 | my ($base, $outN) = @ARGV; 15 | 16 | $outN ||= 2; 17 | #($file and $base) or die " 18 | ($base) or die " 19 | Usage 20 | $0 out_base [no_files] < in_file.fq 21 | 22 | in_file.fq Input file in FastQ format, read from STDIN. 23 | out_base Prefix for the name of the output files. It will 24 | be appended with .<i>.fastq, where <i> is a consecutive 25 | number starting in 1. 26 | no_files Number of files to generate. By default: 2. 27 | "; 28 | 29 | 30 | my @outSym = (); 31 | for my $i (1 .. 
$outN){ 32 | $outSym[$i-1] = gensym; 33 | open $outSym[$i-1], ">", "$base.$i.fastq" or die "I can not create the file: $base.$i.fastq: $!\n"; 34 | } 35 | 36 | 37 | my($i, $seq) = (-1, ''); 38 | #open FILE, "<", $file or die "I can not read the file: $file: $!\n"; 39 | while(my $ln=<STDIN>){ 40 | if($.%4 == 1){ 41 | print { $outSym[$i % $outN] } $seq if $seq; 42 | $i++; 43 | $seq = ''; 44 | } 45 | $seq.=$ln; 46 | } 47 | print { $outSym[$i % $outN] } $seq if $seq; 48 | #close FILE; 49 | 50 | for(my $j=0; $j<$outN; $j++){ 51 | close $outSym[$j]; 52 | } 53 | 54 | print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n"; 55 | -------------------------------------------------------------------------------- /snakemake/script/generate_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pandas as pd 8 | from Bio.Seq import reverse_complement 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | """ 13 | Simulate CDS reads from the xlsx file provided by Jin 14 | """ 15 | def get_arguments(): 16 | 17 | parser = argparse.ArgumentParser(prog="generate_reads", formatter_class=argparse.RawDescriptionHelpFormatter) 18 | parser.add_argument("xlsx", type=str, help="input xlsx file") 19 | parser.add_argument("--readlen", type=int, default=300, help="read len") 20 | args = parser.parse_args() 21 | return args 22 | 23 | def absolute_path(path): 24 | """convert relative path to absolute path""" 25 | if os.path.isabs(path): 26 | return path 27 | else: 28 | return os.path.join(os.getcwd(), path) 29 | 30 | def print_fastq(name, seq, filename): 31 | filename.write("@"+name) 32 | filename.write("\n") 33 | filename.write(seq) 34 | filename.write("\n") 35 | filename.write("+\n") 36 | filename.write("I" * len(seq)) 37 | filename.write("\n") 38 | 39 | def generate_read(template, readlen): 40 | read1 = None 41 | read2 = None 42 | if len(template) > readlen: 43 | ss = len(template) - readlen 44 | read1 = template[:readlen] 45 | read2 = reverse_complement(template[ss:]) 46 | else: 47 | read1 = template 48 | read2 = reverse_complement(template) 49 | return (read1,read2) 50 | 51 | def process(options): 52 | xl = pd.ExcelFile(options.xlsx) 53 | fastq1 = open("read1.fastq", "w") 54 | fastq2 = open("read2.fastq", "w") 55 | rl = options.readlen 56 | 57 | for sid, sheetname in enumerate(xl.sheet_names): 58 | sheet = xl.parse(sheetname) 59 | for cid, col in enumerate(sheet.columns): 60 | for rid, row in enumerate(sheet[col].tolist()): 61 | if rid != 0: continue 62 | row = row.strip(" '") 63 | n1 = "test_" + str(sid) + "_" + str(cid) + "_" + str(rid) 64 | n2 = "test_" + str(sid) + "_" + str(cid) + "_" + str(rid) 65 | read1, read2 = generate_read(row, rl) 66 | print_fastq(n1, read1, fastq1) 67 | print_fastq(n2, read2, fastq2) 68 | 69 | #row_rc = reverse_complement(row) 70 | #n1 = "test_rc" + str(sid) + "_" + str(cid) + "_" + str(rid) 71 | #n2 = "test_rc" + str(sid) + "_" + str(cid) + "_" + str(rid) 72 | #read1, read2 = generate_read(row_rc, rl) 73 | #print_fastq(n1, read1, fastq1) 74 | #print_fastq(n2, read2, fastq2) 75 | 76 | if __name__ == '__main__': 77 | sys.exit(process(get_arguments())) 78 | -------------------------------------------------------------------------------- /snakemake/script/get_midpoint_from_interval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from math import ceil 4 | 5 | def read_bed(bedfile): 6 
| """ Creates generator from bed file or interval_list """ 7 | interval_list = bedfile.endswith("interval_list") 8 | with open(bedfile, "r") as bed: 9 | for line in bed: 10 | if line.startswith("@"): 11 | continue 12 | line = line.strip() 13 | chrom, start, stop = line.split()[0:3] 14 | start, stop = int(start), int(stop) 15 | if interval_list: 16 | start -= 1 17 | yield chrom, start, stop 18 | 19 | def get_arguments(): 20 | 21 | parser = argparse.ArgumentParser(prog="get midpoint from interval and print like a bed file", formatter_class=argparse.RawDescriptionHelpFormatter) 22 | parser.add_argument("bed", type=str, help="interval list or bed file") 23 | parser.add_argument("out", type=str, help="output for bed file") 24 | args = parser.parse_args() 25 | return args 26 | 27 | def process(options): 28 | outh = open(options.out, 'w') 29 | for chrom, start, stop in read_bed(options.bed): 30 | mid = ceil((start+stop)/2) 31 | print(chrom, mid-1, mid, sep='\t', file=outh) 32 | 33 | if __name__ == '__main__': 34 | sys.exit(process(get_arguments())) 35 | -------------------------------------------------------------------------------- /snakemake/script/maf2vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pandas as pd 8 | import subprocess as sp 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | def get_arguments(): 13 | 14 | parser = argparse.ArgumentParser(prog="convert maf to vcf file", formatter_class=argparse.RawDescriptionHelpFormatter) 15 | parser.add_argument("maf", type=str, help="input maf file") 16 | parser.add_argument("-r", "--ref_fasta", required=True, help="reference fasta") 17 | parser.add_argument("-p", "--path_to_maf2vcf", default = "/usr/bin/maf2vcf.pl" , type=str, help="path to the conversion script") 18 | parser.add_argument("-n", "--t_alt", type=int, default=1, help="min t_alt (1)") 19 | parser.add_argument("-O", "--out_type", choices=['vcf', 'maf'], default="vcf", help="output type") 20 | parser.add_argument("-o", "--out_dir", default="./", help="output dir") 21 | parser.add_argument("-s", "--sample_name", default="", help="sample name") 22 | parser.add_argument("-d", "--max_del_len", type=int, default=1e9, help="maximum deletion length allowed") 23 | parser.add_argument("-i", "--min_indel_len", type=int, default=1, help="min indel length allowed") 24 | args = parser.parse_args() 25 | return args 26 | 27 | 28 | def process(options): 29 | maf = pd.read_csv(options.maf, comment="#", sep='\t', low_memory=False) 30 | print("input variant count:", maf.shape[0]) 31 | if 't_alt_count' in maf.columns: 32 | maf = maf[maf['t_alt_count'] >= options.t_alt] 33 | if options.max_del_len < 1e9: 34 | maf = maf[maf['Reference_Allele'].str.len() <= options.max_del_len + 1] 35 | if options.min_indel_len > 1: 36 | maf = maf[abs(maf['Reference_Allele'].str.len() - maf['Tumor_Seq_Allele2'].str.len()) >= options.min_indel_len] 37 | print("output variant count:", maf.shape[0]) 38 | if not options.sample_name: 39 | sname = os.path.basename(options.maf) 40 | sname = sname[:-4] 41 | else: 42 | sname = options.sample_name 43 | outmaf = os.path.join(options.out_dir, sname+".filtered.maf") 44 | outtsv = os.path.join(options.out_dir, sname+".filtered.pairs.tsv") 45 | #maf = maf[["Chromosome", "Start_Position", "End_Position", 46 | # "Variant_Type", "Tumor_Sample_Barcode", "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count", "t_ref_count", 
"n_alt_count", "n_ref_count"]] 47 | maf = maf[["Chromosome", "Start_Position", "Variant_Type", "Tumor_Sample_Barcode", "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count"]] 48 | maf.to_csv(outmaf, sep='\t', index=False) 49 | if options.out_type == "vcf": 50 | cmd=f"perl {options.path_to_maf2vcf} --input-maf {outmaf} --output-dir {options.out_dir} --ref-fasta {options.ref_fasta}" 51 | print(f"converting to vcf: {cmd}") 52 | sp.check_output(cmd, shell=True) 53 | sp.check_output(f"rm {outmaf}", shell=True) 54 | sp.check_output(f"rm {outtsv}", shell=True) 55 | 56 | 57 | if __name__ == '__main__': 58 | sys.exit(process(get_arguments())) 59 | -------------------------------------------------------------------------------- /snakemake/script/msisensor_combine_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import sys 5 | import numpy as np 6 | import os 7 | import pprint 8 | from collections import defaultdict 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | def get_arguments(): 13 | 14 | parser = argparse.ArgumentParser(prog="combine msisensor results", formatter_class=argparse.RawDescriptionHelpFormatter) 15 | parser.add_argument("table", type=str, help="msisensor all table") 16 | parser.add_argument("dist", type=str, help="msisensor dist file") 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def process(opts): 22 | collector = defaultdict(lambda : defaultdict( lambda : defaultdict(int))) 23 | pp = pprint.PrettyPrinter(indent=4) 24 | hp_agg = defaultdict(lambda : defaultdict(int)) 25 | with open(opts.dist, 'r') as fh: 26 | for nrow, line in enumerate(fh): 27 | if nrow % 2 == 0: 28 | cols = line.split() 29 | chrom = cols[0] 30 | pos = int(cols[1]) 31 | else: 32 | duo = line.split(": ") 33 | vec = [int(x) for x in duo[1].split()] 34 | nonzeroind = np.nonzero(vec)[0] 35 | for ind in nonzeroind: 36 | collector[chrom][pos][ind + 1] = vec[ind] 37 | with open(opts.table, 'r') as fh: 38 | for line in fh: 39 | if line.startswith("chromosome"): 40 | continue 41 | cols = line.strip().split('\t') 42 | # no germline variants 43 | if cols[-1] == '0': 44 | repeat_times = cols[3] 45 | chrom = cols[0] 46 | pos = int(cols[1]) 47 | repeat = cols[4] 48 | unit_cnt = collector[chrom][pos] 49 | for k,v in unit_cnt.items(): 50 | key = repeat + "," + repeat_times 51 | hp_agg[key][k] += v 52 | if repeat == 'A' and repeat_times == '10': 53 | if k == 5: 54 | print(line) 55 | 56 | #pp.pprint(hp_agg) 57 | 58 | 59 | 60 | if __name__ == '__main__': 61 | sys.exit(process(get_arguments())) 62 | -------------------------------------------------------------------------------- /snakemake/script/print_mut_cpgstatus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pysam 8 | import pandas as pd 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | def get_arguments(): 13 | 14 | parser = argparse.ArgumentParser(prog="print whether mutant families are in a CpG context or not", formatter_class=argparse.RawDescriptionHelpFormatter) 15 | parser.add_argument("mutfam", type=str, help="mutant families") 16 | parser.add_argument("ref", type=str, help="reference file") 17 | args = parser.parse_args() 18 | return args 19 | 20 | def absolute_path(path): 21 | """convert relative path to absolute path""" 22 | if os.path.isabs(path): 23 | return 
path 24 | else: 25 | return os.path.join(os.getcwd(), path) 26 | 27 | def process(opt): 28 | mf = pd.read_csv(opt.mutfam, sep='\t', low_memory=False) 29 | mf = mf[mf['pass_filter'] == 1].reset_index() 30 | ncol = mf.shape[1] 31 | mf['CpG'] = 'NA' 32 | fasta = pysam.FastaFile(opt.ref) 33 | for idx, row in mf.iterrows(): 34 | if row['ref_allele'] == 'C': 35 | nextb = fasta.fetch(str(row['contig']), row['position'], row['position'] + 1) 36 | if nextb == 'G' or nextb == 'g': 37 | mf.iloc[idx, ncol] = '1' 38 | else: 39 | mf.iloc[idx, ncol] = '0' 40 | if row['ref_allele'] == 'G': 41 | nextb = fasta.fetch(str(row['contig']), row['position']-2, row['position'] - 1) 42 | if nextb == 'C' or nextb == 'c': 43 | mf.iloc[idx, ncol] = '1' 44 | else: 45 | mf.iloc[idx, ncol] = '0' 46 | print(mf) 47 | 48 | if __name__ == '__main__': 49 | sys.exit(process(get_arguments())) 50 | 51 | -------------------------------------------------------------------------------- /snakemake/script/print_snv_roc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | import glob 7 | import os 8 | import pandas as pd 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | def get_arguments(): 12 | 13 | parser = argparse.ArgumentParser(prog="print an SNV ROC summary from vcfeval output", formatter_class=argparse.RawDescriptionHelpFormatter) 14 | parser.add_argument("vcfeval_folder", type=str, help="vcfeval output folder") 15 | args = parser.parse_args() 16 | return args 17 | 18 | def process(opts): 19 | outs = glob.glob(os.path.join(opts.vcfeval_folder, "*/*/snp_roc.tsv.gz")) 20 | total_df = pd.DataFrame() 21 | for o in outs: 22 | fields = o.split('/') 23 | df = pd.read_csv(o, sep='\t', skiprows=6) 24 | df['cutoff'] = fields[-2] 25 | df['vaf'] = fields[-3][2:] 26 | if total_df.empty: 27 | total_df = df 28 | else: 29 | total_df = pd.concat([total_df, df]) 30 | total_df.sort_values(by=['vaf', 'cutoff'], inplace=True) 31 | total_df = total_df[['vaf','cutoff', 'precision', 'sensitivity', 'false_positives', 'false_negatives', 'true_positives_baseline', 'f_measure']] 32 | print(total_df) 33 | total_df.to_csv("summary.tsv", sep='\t', index=False) 34 | 35 | 36 | if __name__ == '__main__': 37 | sys.exit(process(get_arguments())) 38 | 39 | -------------------------------------------------------------------------------- /snakemake/script/rev_qualscore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pysam 8 | 9 | def get_arguments(): 10 | 11 | parser = argparse.ArgumentParser(prog="rev_qualscore", formatter_class=argparse.RawDescriptionHelpFormatter) 12 | parser.add_argument("bam", type=str, help="input bam file") 13 | parser.add_argument("out", type=str, help="output bam file") 14 | args = parser.parse_args() 15 | return args 16 | 17 | def process(options): 18 | inbam = pysam.AlignmentFile(options.bam, "rb") 19 | outbam = pysam.AlignmentFile(options.out, "wb", template=inbam) 20 | for read in inbam: 21 | if read.is_reverse: 22 | read.query_qualities = read.query_qualities[::-1] 23 | outbam.write(read) 24 | 25 | 26 | 27 | if __name__ == '__main__': 28 | sys.exit(process(get_arguments())) 29 | 30 | -------------------------------------------------------------------------------- /snakemake/script/vcf_update_genotype.py: 
--------------------------------------------------------------------------------  1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | from cyvcf2 import VCF, Writer 7 | 8 | logger = logging.getLogger("{}".format(__file__)) 9 | def get_arguments(): 10 | 11 | parser = argparse.ArgumentParser(prog="change the variant genotype", formatter_class=argparse.RawDescriptionHelpFormatter) 12 | parser.add_argument("vcf", type=str, help="vcf input") 13 | parser.add_argument("out", type=str, help="vcf output") 14 | args = parser.parse_args() 15 | return args 16 | 17 | def process(opts): 18 | inputvcf = VCF(opts.vcf) 19 | writer = Writer(opts.out, inputvcf, "wz") 20 | for record in inputvcf: 21 | assert(len(record.gt_types) == 1) 22 | if record.gt_types[0] == 0: 23 | record.genotypes = [[0,1,False]] 24 | writer.write_record(record) 25 | else: 26 | writer.write_record(record) 27 | writer.close() 28 | 29 | if __name__ == '__main__': 30 | sys.exit(process(get_arguments())) 31 | 32 | -------------------------------------------------------------------------------- /snakemake/script/vcf_validate_against_maf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | from cyvcf2 import VCF, Writer 7 | import pandas as pd 8 | 9 | logger = logging.getLogger("{}".format(__file__)) 10 | def get_arguments(): 11 | 12 | parser = argparse.ArgumentParser(prog="validate vcf variants against a truth maf", formatter_class=argparse.RawDescriptionHelpFormatter) 13 | parser.add_argument("vcf", type=str, help="vcf input") 14 | parser.add_argument("out", type=str, help="vcf output") 15 | parser.add_argument("maf", type=str, help="maf file for truth") 16 | parser.add_argument("--whitelist", type=str, action='append', help="whitelist variants in chr:pos format") 17 | args = parser.parse_args() 18 | return args 19 | 20 | def process(opts): 21 | inputvcf = VCF(opts.vcf) 22 | writer = Writer(opts.out, inputvcf, "wz") 23 | maf = pd.read_csv(opts.maf, sep='\t', low_memory=False) 24 | maf = maf.astype({'Chromosome' : 'str'}) 25 | for record in inputvcf: 26 | assert(len(record.gt_types) == 1) 27 | if not maf[(maf['Chromosome'] == record.CHROM) & (maf['Start_position'] == record.start + 1) & (maf['Tumor_Seq_Allele2'].isin(record.ALT))].empty: 28 | writer.write_record(record) 29 | else: 30 | found = False 31 | if opts.whitelist: 32 | for var in opts.whitelist: 33 | chr, pos = var.split(':') 34 | if record.CHROM == chr and record.start + 1 == int(pos): 35 | found = True 36 | break 37 | if found: 38 | writer.write_record(record) 39 | else: 40 | print(record.CHROM, record.start, record.start + 1, sep='\t') 41 | else: 42 | print(record.CHROM, record.start, record.start + 1, sep='\t') 43 | 44 | 45 | if __name__ == '__main__': 46 | sys.exit(process(get_arguments())) 47 | 48 | --------------------------------------------------------------------------------
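Usage sketch for the two cyvcf2 utilities above (file names are hypothetical; --whitelist may be repeated, matching its action='append' declaration):

    python vcf_update_genotype.py calls.vcf.gz calls.het.vcf.gz
    python vcf_validate_against_maf.py calls.het.vcf.gz validated.vcf.gz truth.maf --whitelist chr1:12345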