├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CODECsuite_TPC_Notice_Final.txt ├── LICENSE.txt ├── README.md ├── accuracy.cpp ├── bbcpputil ├── .gitmodules ├── cpp │ ├── AlignmentConsensus.cpp │ └── BamRecordExt.cpp ├── cxxopts │ └── include │ │ └── cxxopts.hpp └── include │ ├── Algo.h │ ├── Alignment.h │ ├── AlignmentConsensus.h │ ├── BamRecordExt.h │ ├── DNAUtils.h │ ├── FastxIO.h │ ├── FastxRecord.h │ ├── Files.h │ ├── GenomicRegionCollectionExt.h │ ├── Gotoh.h │ ├── Insert.h │ ├── InsertSeqFactory.h │ ├── MAF.h │ ├── MutCounter.h │ ├── ReadVCF.h │ ├── Stats.h │ ├── StringUtils.h │ ├── TargetLayout.h │ ├── Variant.h │ └── pileup.h ├── codec.cpp ├── demux.cpp ├── duplex_filter.cpp ├── include ├── Adapter.h ├── BamIO.h └── Index.h ├── msi ├── Snakefile ├── msi.R └── msi.cpp ├── obsolete ├── bamtofastq.cpp ├── concat_umi_to_fastq.cpp └── print_qual.cpp ├── snakemake ├── AdapV2 │ ├── Snakefile │ ├── capture_wf_1 │ │ └── Snakefile │ └── wgs │ │ └── Snakefile ├── README.md ├── jobscript.sh ├── pipeline_input_examples │ ├── caputure │ │ ├── config.yaml │ │ ├── input.tsv │ │ └── runSnakemake.sh │ └── wgs │ │ ├── config.yaml │ │ ├── input.tsv │ │ └── runSnakemake.sh ├── qsub_wrapper.py └── script │ ├── agg_log.py │ ├── cds_summarize.py │ ├── codec2maf │ ├── collect_duplex_metrics.py │ ├── cov_sum.py │ ├── create_maf_from_probe_rg.py │ ├── downsample_read_families.py │ ├── dpx │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── bam_iterator.cpython-36.pyc │ │ ├── bam_iterator.cpython-38.pyc │ │ ├── collect_duplex_metrics.cpython-36.pyc │ │ ├── downsampler.cpython-36.pyc │ │ ├── downsampler.cpython-38.pyc │ │ ├── intervals.cpython-36.pyc │ │ └── intervals.cpython-38.pyc │ ├── downsampler.py │ ├── get_mutant_metrics.py │ └── intervals.py │ ├── error_rate_by_family_size.py │ ├── extract_false_positive_reads.py │ ├── familysize_dist.py │ ├── fastqsplit.pl │ ├── generate_reads.py │ ├── get_midpoint_from_interval.py │ ├── maf2vcf.py │ ├── msisensor_combine_result.py │ ├── print_mut_cpgstatus.py │ ├── print_snv_roc.py │ ├── rev_qualscore.py │ ├── vcf_update_genotype.py │ └── vcf_validate_against_maf.py └── trimadapter.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | cmake-build*/ 2 | .DS_Store 3 | .idea/ 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bbcpputil/third_party/SeqLib"] 2 | path = bbcpputil/third_party/SeqLib 3 | url = https://github.com/ruolin/SeqLib.git 4 | [submodule "bbcpputil/third_party/backward-cpp"] 5 | path = bbcpputil/third_party/backward-cpp 6 | url = https://github.com/bombela/backward-cpp 7 | [submodule "bbcpputil/third_party/seqan"] 8 | path = bbcpputil/third_party/seqan 9 | url = https://github.com/ruolin/seqan.git 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | set( CMAKE_CXX_STANDARD 14 ) 3 | project (codecsuite LANGUAGES CXX) 4 | set(CMAKE_BUILD_TYPE Release) 5 | #set(CMAKE_BUILD_TYPE Debug) 6 | #for gprof 7 | #set (CMAKE_CXX_FLAGS "${CAMKE_CXX_FLAGS} -g -pg") 8 | #for valgrind 9 | #set (CMAKE_CXX_FLAGS "${CAMKE_CXX_FLAGS} -g -O1") 10 | set( CMAKE_EXE_LINKER_FLAGS " ${CMAKE_EXE_LINKER_FLAGS} -pthread") 11 | set( CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -Wall -Wno-deprecated 
-Wno-unused-variable " ) 12 | 13 | 14 | set (CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -DSEQAN_ENABLE_TESTING=0 -DSEQAN_HAS_ZLIB=1") 15 | find_package(OpenMP REQUIRED) 16 | 17 | include_directories(include) 18 | include_directories(bbcpputil/third_party/seqan/include/) 19 | 20 | set( CMAKE_EXE_LINKER_FLAGS " ${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++ -L. ") 21 | set( CPP_LIBS ${CPP_LIBS} seqlib bwa fml hts bz2) 22 | set( CPP_LIBS ${CPP_LIBS} dl ) 23 | set( CPP_LIBS ${CPP_LIBS} z lzma) 24 | set( CPP_LIBS ${CPP_LIBS} OpenMP::OpenMP_CXX) 25 | 26 | include_directories(bbcpputil/include) 27 | 28 | set(SEQLIB_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/bbcpputil/third_party/SeqLib) 29 | find_library(SEQLIB_LIBRARY 30 | NAMES seqlib 31 | PATHS ${SEQLIB_ROOT} 32 | PATH_SUFFIXES lib 33 | NO_DEFAULT_PATH 34 | ) 35 | include(ExternalProject) 36 | if(NOT SEQLIB_LIBRARY) 37 | message("SeqLib not found") 38 | ExternalProject_Add(SeqLib 39 | SOURCE_DIR "${SEQLIB_ROOT}" 40 | INSTALL_DIR "${SEQLIB_ROOT}" 41 | BUILD_IN_SOURCE 1 42 | CONFIGURE_COMMAND chmod +x ./configure COMMAND ./configure prefix=${SEQLIB_ROOT} 43 | BUILD_COMMAND make CXXFLAGS='-std=c++11' 44 | INSTALL_COMMAND make install 45 | ) 46 | else() 47 | message("SeqLib found") 48 | add_custom_target(SeqLib) 49 | endif() 50 | 51 | include_directories(${SEQLIB_ROOT}) 52 | include_directories(${SEQLIB_ROOT}/htslib) 53 | link_directories(${SEQLIB_ROOT}/lib/) 54 | 55 | ## not currently used. But reserve for denovo consensus 56 | 57 | set(deps bbcpputil/cpp/AlignmentConsensus.cpp bbcpputil/cpp/BamRecordExt.cpp) 58 | set(subcommand demux.cpp trimadapter.cpp duplex_filter.cpp accuracy.cpp) 59 | if (CMAKE_BUILD_TYPE MATCHES Debug) 60 | message(" ") 61 | message("CMAKE IN DEBUG MODE") 62 | message(" ") 63 | link_directories($ENV{ELFUTILS_ROOT}/lib/) 64 | set(LIBDW_LIBRARY "$ENV{ELFUTILS_ROOT}/lib/libdw.a") 65 | set(LIBDW_INCLUDE_DIR "$ENV{ELFUTILS_ROOT}/include/") 66 | set( CPP_LIBS ${CPP_LIBS} dw elf) # backward_cpp 67 | add_subdirectory(bbcpputil/third_party/backward-cpp) 68 | elseif(CMAKE_BUILD_TYPE MATCHES Release) 69 | message(" ") 70 | message("CMAKE IN RELEASE MODE") 71 | message(" ") 72 | endif () 73 | 74 | add_executable(codec codec.cpp ${subcommand} ${deps}) 75 | add_dependencies(codec SeqLib) 76 | 77 | if (CMAKE_BUILD_TYPE MATCHES Debug) 78 | #add_executable(codec codec.cpp ${subcommand} ${deps} ${BACKWARD_ENABLE}) 79 | add_backward(codec) 80 | endif() 81 | target_link_libraries(codec ${CPP_LIBS}) 82 | 83 | ######## uncomment if you need to build CODEC_MSI 84 | #add_executable(msi msi/msi.cpp ${deps}) 85 | #target_link_libraries(msi z ${CPP_LIBS}) 86 | -------------------------------------------------------------------------------- /CODECsuite_TPC_Notice_Final.txt: -------------------------------------------------------------------------------- 1 | Notice of Third Party Code Dependencies 2 | 3 | CODECsuite is distributed, in part, under and subject to the provisions of respective licenses for the following dependencies: 4 | 5 | 1. SeqLib 6 | Copyright 2016 Jeremiah A. Wala. All rights reserved. 7 | https://github.com/walaj/SeqLib/blob/master/LICENSE 8 | 9 | 2. backward-cpp 10 | Copyright (c) 2013 Google Inc. All rights reserved. 11 | https://github.com/bombela/backward-cpp/blob/master/LICENSE.txt 12 | 13 | 3. seqan 14 | Copyright (c) 2006-2018 Knut Reinert, FU Berlin. All rights reserved. 15 | https://github.com/seqan/seqan/blob/master/LICENSE 16 | 17 | 4. spoa 18 | Copyright (c) 2016 Robert Vaser. All rights reserved. 
https://github.com/rvaser/spoa/blob/master/LICENSE
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
CODECsuite is released under the following BSD 3-Clause License:

Copyright (c) 2021 The Broad Institute, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
UPDATES
* (05/19/25) Version 1.1.5 introduces new scripts for converting CODEC single fragment duplex variant output to MAF or VCF format.
[See details here](#reformatting-codec-sfc-variant-output-into-maf-or-vcf-format)

* (01/08/25) Version 1.1.4 introduces a new subcommand `codec filter` designed to filter consensus BAM files. It retains only the reads and bases relevant for variant calling. Fragments (read-pairs) that do not pass fragment-level filtering are excluded from the output BAM. Bases that fail the filters are assigned a minimum base quality score (Q2), ensuring they are ignored by most coverage analysis and variant calling tools.
It can be run as follows:

``` codec filter -b mol_consensus.sortbyname.bam -o duplex_only.bam -r reference.fa -q 30 -m 60 -Q 0.7 -B 0.5 -N 0.05 ...```
> [!NOTE]
> The input BAM file must be sorted by read name, and the output BAM will also be query-name sorted. For consistent filtering results, it is recommended to use the same parameters as those in `codec call`.


# CODECsuite
The CODEC analysis pipeline, CODECsuite, comprises five key steps: demultiplexing, adapter trimming, alignment, duplicate collapsing, and single-fragment mutation calling. Duplicate collapsing and alignment are performed using the third-party tools Fgbio and BWA, respectively. After removing byproducts and applying fragment-level filtering, mutations are identified exclusively from duplexes in the overlap regions, where bases from the two reads of a pair align and match. Bases within these regions undergo stringent filtering based on criteria such as base quality, proximity to fragment ends, overlap with germline mutations, and other factors. Notably, a single read pair is sufficient to form a duplex, as each read represents one strand. Refer to the [paper](https://www.nature.com/articles/s41588-023-01376-0) for more details.

## Installation
Tested on Red Hat 7 and Ubuntu 18.04.

Prerequisites for the C++ programs (for the snakemake workflow, check out [here](./snakemake)):
1. git
2. gcc with C++14 support (tested with gcc 5.2 and 7.3)
3. cmake 3.18.3 or above

First, recursively clone the repo and create a build directory, which will hold the installation files and the final executables.

`git clone --recursive git@github.com:broadinstitute/CODECsuite.git && cd CODECsuite && mkdir build`

Next, build the program with cmake.

`cd build && cmake .. && make`

After this, you should see an executable named `codec` in the build folder you just created.

## Demultiplexing
CODECsuite is expected to work with raw lane-level fastq.gz files. These can be obtained with Illumina [bcl2fastq](https://support.illumina.com/downloads/bcl2fastq-conversion-software-v2-20.html).
The first step is demultiplexing, and it requires a sample sheet in CSV format for each lane, which looks like the following.
Currently, we have used 12 barcodes. For good cluster generation, we recommend having at least 4 sample barcodes per
sequencing lane.

| SampleName | IndexBarcode1 | IndexBarcode2 |
|------------|---------------|---------------|
|Sample01|CTTGAACGGACTGTCCAC|CACCGAGCGTTAGACTAC|
|Sample02|GAGCCTACTCAGTCAACG|GTGTCGAACACTTGACGG|
|Sample03|AGCTTGTAAGGCAGGTTA|ACTGATCTTCAGCTGACT|
|Sample04|TCAAGCGTCTTACATGGT|TGAATCTGAGGCACTGTA|
|Sample05|CTGGTCCAAGAACGTCTG|CTCTGAACGATCGAGCTC|
|Sample06|GATCCAGTTCTGTCGAGC|GAGGTGCATGCACCTTAG|
|Sample07|ACCTATAGGTGCAACGAA|ACTAACTTCCATTGCACT|
|Sample08|TGAAGGTCCACTGTATCT|TGACCTGGATGGATAGGA|
|Sample09|CACTGCTTCGAGACGAAG|CTCCAGTTACTGAGACGG|
|Sample10|GTGATACCTCGATGCTCC|GAGGTCCAGTCTCTGTCC|
|Sample11|ACTCAGAGAACTCATGGA|ACTACAGGTGGATCCAAT|
|Sample12|TGAGCTGAGTTCGTACTT|TGATGTACCAACGATGTA|

`codec demux -1 reads.r1.fastq.gz -2 reads.r2.fastq.gz -p sample_sheet.csv -o demux_outprefix`

Given a toy sample_sheet.csv containing samples sample_A and sample_B, this command will generate
```
demux_outprefix.sample_A.1.fastq.gz, demux_outprefix.sample_A.2.fastq.gz
demux_outprefix.sample_B.1.fastq.gz, demux_outprefix.sample_B.2.fastq.gz
```
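
Before demultiplexing, it can be useful to verify that no two index barcodes in the sample sheet are near-duplicates. Below is a minimal sketch, not part of CODECsuite; it assumes the CSV carries the header row shown above, and the cutoff of 3 mismatches is an arbitrary safety margin, not the demultiplexer's actual mismatch tolerance:

```
import csv
import itertools

def hamming(a, b):
    # Number of mismatched positions between two equal-length barcodes.
    return sum(x != y for x, y in zip(a, b))

with open("sample_sheet.csv") as fh:
    rows = list(csv.DictReader(fh))

# Compare every pair of barcodes within each index read.
for key in ("IndexBarcode1", "IndexBarcode2"):
    for r1, r2 in itertools.combinations(rows, 2):
        d = hamming(r1[key], r2[key])
        if d < 3:
            print(f"{key}: {r1['SampleName']} vs {r2['SampleName']} differ at only {d} position(s)")
```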
## Adapter trimming
After demultiplexing, CODEC reads still contain the in-situ sample barcode and adapter sequences. The next step is to trim
these out, since they could interfere with alignment.

`codec trim -1 demux_outprefix.sample_A.1.fastq.gz -2 demux_outprefix.sample_A.2.fastq.gz -o trim_outprefix -u 3 -U 3 -f 2 -t 2 -s sample_A`

This tells CODECsuite that the first 3bp of each read is the UMI and to trim off the next 2bp.
The output files of the adapter trimming step look like
```
trim_outprefix.sample_A.trim.bam
trim_outprefix.sample_A.trim.log
```
By default, single-end byproducts are also output to the `trim.bam`. To split the output, use `-S/--split_bam_output`.

The bam file is a standard uBam (unmapped bam) with additional tags
```
RX: UMI sequence from R1 and R2, concatenated by a hyphen
QX: UMI quality scores
bc: Index barcode sequence
s5: 5' adapter sequence (same as Index barcode)
q5: 5' adapter quality scores
s3: 3' adapter sequence (same as Index barcode of the mate)
q3: 3' adapter quality scores
sl: the rest of the 3' adapter sequence
ql: the rest of the 3' adapter quality scores
```
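
These tags can be spot-checked with pysam. Below is a minimal sketch, not part of CODECsuite; it assumes pysam is installed, and `check_sq=False` is required because a uBam carries no reference sequence dictionary:

```
import pysam

with pysam.AlignmentFile("trim_outprefix.sample_A.trim.bam", "rb", check_sq=False) as ubam:
    for i, read in enumerate(ubam):
        # Tag names follow the table above.
        umi = read.get_tag("RX") if read.has_tag("RX") else "NA"
        barcode = read.get_tag("bc") if read.has_tag("bc") else "NA"
        print(read.query_name, umi, barcode)
        if i == 4:  # the first few records are enough for a sanity check
            break
```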

After adapter trimming, the CODEC reads can be mapped by standard NGS tools such as BWA. For our end-to-end pipeline,
please see [snakemake](./snakemake).

> [!NOTE]
> We recommend using the SMaHT duplex reference genome, which is basically HG38 without decoy sequences. See the reasons here: https://smaht-dac.github.io/pipelines-docs/DOCS/REFERENCE_FILES/Genome_Builds/1_Build_GRCh38.html

## Single fragment caller (SFC) and mutation rate computation

Run the GATK best practices (alignment, MarkDuplicates, indel realignment), for example. Of note, BQSR should NOT be run on
CODEC data, since CODEC has a different quality score distribution. I do not recommend BQSR in general, since modern
Illumina sequencers' quality scores have improved and BQSR almost doubles the bam size.

Now we can run the SFC to call mutations. The SFC is designed to call somatic mutations. For the best results, we need
a bed file which contains the high-confidence regions (e.g., [this](https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.3/GRCh38@all/LowComplexity/GRCh38_notinAllTandemRepeatsandHomopolymers_slop5.bed.gz)) of the reference genome, and a germline bam for masking the germline
variants. If there is no germline bam, it is recommended to have a germline vcf file. A population-based vcf (e.g., the [gnomad vcf](https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz)) is
strongly recommended in either case, since it can account for contamination and low sequencing depth of the germline bam. However, to avoid over-filtering true somatic mutations, a minimum allele frequency threshold is recommended for the population vcf (e.g., 0.01%).
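
As a sketch of such pre-filtering (not part of CODECsuite; it assumes the vcf carries a per-allele INFO/AF field, as the gnomAD af-only vcf does, and `bcftools view -i 'INFO/AF>=0.0001'` would achieve much the same):

```
import pysam

MIN_AF = 1e-4  # 0.01%

vcf_in = pysam.VariantFile("af-only-gnomad.hg38.vcf.gz")
vcf_out = pysam.VariantFile("gnomad.af0001.vcf", "w", header=vcf_in.header)
for rec in vcf_in:
    af = rec.info.get("AF", None)
    if af is None:
        continue
    # AF may be a single float or one value per alternate allele.
    afs = af if isinstance(af, tuple) else (af,)
    if max(afs) >= MIN_AF:
        vcf_out.write(rec)
vcf_out.close()
```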
There is a set of
fragment-level and base-level filters to improve the precision of the mutation calls, at the cost of data loss and the potential
loss of real mutations. Depending on the application, we have presets of parameters: `-p/--preset`
```
stringent: setting where high-precision calling is needed, e.g., calling the background mutation rate in white
blood cells.

lenient: setting where some sensitivity is preferred, e.g., calling cancer mutations in tumor biopsies or high
tumor fraction ctDNA samples.

null: as little filtering as possible, for advanced users who want to do the filtering on their own.
```

It is highly recommended that users experiment with different parameters and figure out the ones that work best for
their data. Trimming the fragment ends (`-d 12`) and using a Q30 cutoff (`-q 30`) are always recommended. The others are ad hoc. However,
the most effective parameter is probably `-Q/--min_passQ_frac`: the fraction of Q30+Q30 bases in the overlap region. This fraction essentially
measures the cluster quality, which is important for single-fragment calling. An example of running the SFC:
```
codec call -b input.mark_duplicated.bam -L highconfidentregions.bed -r hg38.fa -n germline.bam -p lenient -o output

```

The outputs of the SFC are
```
output.mutation_metrics.txt: includes SNV_rate, INDEL_rate, etc.
output.variants_called.txt: mutations from single fragments
output.context_count.txt: trinucleotide context and dinucleotide context counts
```

> [!NOTE]
> All CODEC-related resources can be found at https://console.cloud.google.com/storage/browser/codec_cloud_resources, including the population-based vcf: https://storage.googleapis.com/codec_cloud_resources/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz

## Reformatting CODEC SFC variant output into MAF or VCF format
The scripts `codec2maf` and `maf2vcf.py` can be found in the folder `snakemake/script/`. `maf2vcf.py` depends on the `maf2vcf.pl` script from the perl package [mskcc/vcf2maf](https://github.com/mskcc/vcf2maf/tree/main).
1. CODEC txt file to MAF: `codec2maf -i output.variants_called.txt -o output.variants_called.maf`
2. MAF to VCF: `maf2vcf.py output.variants_called.maf -r hg38.fa -o outdir -p /usr/bin/maf2vcf.pl`


## Other notes
1. For CODEC-MSI, please refer to [msi](./msi). By default, CMake will not build CODEC-MSI; uncomment the last two
lines of CMakeLists.txt if you do want to build it.

2. The Snakemake pipeline is hard-coded to demultiplex 4 lanes simultaneously (e.g., for NovaSeq 6000). If you need to demultiplex
fewer lanes (e.g., for NovaSeq SP), comment out the entire DemuxL3 and DemuxL4 rules. If you have more than 4 lanes (e.g., HiSeq X),
either demultiplex 4 lanes at a time or add more rules yourself.

3. The Snakemake pipeline setup file `qsub_wrapper.py` is specific to [UGE](https://en.wikipedia.org/wiki/Univa_Grid_Engine).
You may need to change the settings for your computing environment.
--------------------------------------------------------------------------------
/bbcpputil/.gitmodules:
--------------------------------------------------------------------------------
[submodule "third_party/backward-cpp"]
	path = third_party/backward-cpp
	url = https://github.com/bombela/backward-cpp
[submodule "third_party/spoa"]
	path = third_party/spoa
	url = git@github.com:rvaser/spoa.git
[submodule "third_party/SeqLib"]
	path = third_party/SeqLib
	url = git@github.com:ruolin/SeqLib.git
[submodule "third_party/seqan"]
	path = third_party/seqan
	url = git@github.com:ruolin/seqan.git
--------------------------------------------------------------------------------
/bbcpputil/include/Algo.h:
--------------------------------------------------------------------------------
//
// Created by Ruolin Liu on 10/31/21.
//

#ifndef CODECSUITE_BBCPPUTIL_INCLUDE_ALGO_H_
#define CODECSUITE_BBCPPUTIL_INCLUDE_ALGO_H_
#include <string>
#include <queue>
#include <unordered_set>
#include <vector>
#include <algorithm>
#include <cstdint>

namespace cpputil {

class UniqueQueue {
  /*
   * Pairs a queue with a set so that the queue holds only unique elements.
   */
 public:
  UniqueQueue(size_t cap) : capacity_(cap), n_added_(0) {};
  UniqueQueue() : UniqueQueue(0) {};

  bool exist(const std::string& in) const {
    //auto h = hash_string(in.c_str());
    if (s_.find(in) == s_.end()) return false;
    else return true;
  }

  void add(const std::string& in) {
    //auto h = hash_string(in.c_str());
    if (s_.find(in) == s_.end()) {
      if (q_.size() < capacity_) {
        q_.push(in);
        s_.insert(in);
      } else {
        // Queue is at capacity: evict the oldest element before inserting.
        auto key = q_.front();
        q_.pop();
        s_.erase(key);
        q_.push(in);
        s_.insert(in);
      }
      ++n_added_;
    }
  }

  void clearQueue() {
    std::queue<std::string> empty;
    std::swap(q_, empty);
    s_.clear();
  }

  uint64_t NumAdded() const {
    return n_added_;
  }

 private:
  // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
  inline uint64_t hash_string(const char* __s) const
  {
    uint64_t hash = 0xcbf29ce484222325ull;
    for ( ; *__s; ++__s)
    {
      hash *= 1099511628211ull;
      hash ^= *__s;
    }
    return hash;
  }

  std::unordered_set<std::string> s_;
  std::queue<std::string> q_;
  const size_t capacity_;
  uint64_t n_added_;
};

// Finds the largest group of positions that fall within `window` of each
// other; `beg` and `end` are set to the bounds of that group.
inline int largest_cluster(std::vector<int> sortedpos, int window, int &beg, int &end) {
  std::sort(sortedpos.begin(), sortedpos.end());
  if (sortedpos.size() == 0) return 0;
  if (sortedpos.size() == 1) {
    beg = sortedpos[0];
    end = sortedpos[0];
    return 1;
  }
  int max_size = 1;
  for (unsigned i = 0; i < sortedpos.size() - 1; ++i) {
    unsigned j = i + 1;
    int s = 1;
    for (; j < sortedpos.size();) {
      if (sortedpos[j] - sortedpos[i] < window) {
        ++s;
        ++j;
      } else {
        if (s > max_size) {
          beg = sortedpos[i];
          end = sortedpos[j - 1];
          max_size = s;
        }
        break;
      }
      if (j == sortedpos.size() && s > max_size) {
        beg = sortedpos[i];
        end = sortedpos[j - 1];
        max_size = s;
      }
    }
  }
  return max_size;
}

}

#endif //CODECSUITE_BBCPPUTIL_INCLUDE_ALGO_H_
--------------------------------------------------------------------------------
/bbcpputil/include/Alignment.h:
--------------------------------------------------------------------------------
//
// Created by Ruolin Liu on 2/26/20.
// This file holds old stuff for finding multiple segments (e.g. oligos) in
// reads.
//

#ifndef CPPUTIL_INCLUDE_ALIGNMENT_H_
#define CPPUTIL_INCLUDE_ALIGNMENT_H_

#include "BamRecordExt.h"

namespace cpputil {

typedef std::vector<SeqLib::BamRecord> Segments; // sequenced parts of a paired-end read for example.
15 | // or group of duplicated reads 16 | 17 | // check all BamReocrd has same start and stop 18 | inline bool AreSegsCompleteOverlap(const Segments &segs) { 19 | if (segs.empty()) return false; 20 | int32_t s = segs.front().PositionWithSClips(); 21 | int32_t e = segs.front().PositionEndWithSClips(); 22 | for (auto const & seg : segs) { 23 | if (seg.PositionWithSClips() != s) return false; 24 | if (seg.PositionEndWithSClips() != e) return false; 25 | } 26 | return true; 27 | } 28 | 29 | inline std::string GetUid(const Segments &segs, const std::string& umi_tag) { 30 | std::string uid; 31 | bool status; 32 | if (umi_tag.empty()) status = false; 33 | else { 34 | status = segs.front().GetZTag(umi_tag, uid); 35 | if (!status and segs.size() == 2) { 36 | status = segs.back().GetZTag(umi_tag, uid); 37 | } 38 | } 39 | if(!status) { // molecular identifier not exist 40 | std::string rxtag; 41 | int32_t s, e; 42 | if (segs.size() == 1) { 43 | s = segs.front().PositionWithSClips(); 44 | e = segs.front().PositionEndWithSClips(); 45 | } else if(segs.size() == 2){ 46 | if (segs.front().ReverseFlag()) { 47 | s = segs.front().PositionEndWithSClips(); 48 | e = segs.back().PositionWithSClips(); 49 | } else { 50 | s = segs.front().PositionWithSClips(); 51 | e = segs.back().PositionEndWithSClips(); 52 | } 53 | } 54 | bool rxtag_status = segs.front().GetZTag("RX", rxtag); 55 | if (rxtag_status) { 56 | uid = std::to_string(s) + "," + std::to_string(e) + ":" + rxtag; 57 | } else { 58 | } 59 | } 60 | return uid; 61 | } 62 | 63 | inline bool AreSegsCompleteOverlapExcludingSclip(const Segments &segs) { 64 | if (segs.empty()) return false; 65 | int32_t s = segs.front().Position(); 66 | int32_t e = segs.front().PositionEnd(); 67 | for (auto const & seg : segs) { 68 | if (seg.Position() != s) return false; 69 | if (seg.PositionEnd() != e) return false; 70 | } 71 | return true; 72 | } 73 | 74 | inline int GetNumOverlapBasesPEAlignment(const Segments & segs, bool FR_only = true) { 75 | assert(segs.size() == 2); 76 | assert(segs.front().Qname() == segs.back().Qname()); 77 | if (segs.front().Interchromosomal()) return 0; 78 | if (segs.front().PairOrientation() != 0 && FR_only) return 0; 79 | int left = std::max(segs.front().Position(), segs.back().Position()); 80 | int right = std::min(segs.front().PositionEnd(), segs.back().PositionEnd()); 81 | int ol = right - left; 82 | if (ol < 0) { 83 | ol = 0; 84 | } 85 | return ol; 86 | } 87 | 88 | inline bool ArePEAlignmentOverlapAtLeastK(const Segments & segs, int k) { 89 | if (k == -1) return AreSegsCompleteOverlapExcludingSclip(segs); 90 | int ol = GetNumOverlapBasesPEAlignment(segs); 91 | if (ol < k) return false; 92 | // if (segs.front().Position() < segs.back().PositionEnd() && segs.back().Position() < segs.front().PositionEnd()) { 93 | // return true; 94 | // } 95 | return true; 96 | } 97 | 98 | inline bool SegmentNotEmpty(const Segments &seg, int dummy) { 99 | return !seg.empty(); 100 | } 101 | 102 | } 103 | #endif //CPPUTIL_INCLUDE_ALIGNMENT_H_ 104 | -------------------------------------------------------------------------------- /bbcpputil/include/AlignmentConsensus.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/19/20. 
3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_ALIGNMENTCONSENSUS_H_ 6 | #define CPPUTIL_INCLUDE_ALIGNMENTCONSENSUS_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "BamRecordExt.h" 13 | #include "Alignment.h" 14 | 15 | namespace cpputil { 16 | 17 | inline void find_insert_(const SeqLib::Cigar &cigar, int left_cursor, std::map &ins_len) { 18 | for (auto it = cigar.begin(); it != cigar.end(); ++it) { 19 | if (it->Type() == 'H') { 20 | continue; 21 | } else if (it->Type() == 'I') { 22 | ins_len[left_cursor] = std::max(ins_len[left_cursor], (int) it->Length()); 23 | } else { 24 | left_cursor += it->Length(); 25 | } 26 | } 27 | } 28 | 29 | 30 | std::string GetConsensusTemplate(const Segments& segs, int32_t& ref_most_left); 31 | 32 | std::pair 33 | GetGappedSeqAndQual(const SeqLib::BamRecord &r, const int start, const std::string consensus_template); 34 | 35 | std::string MergePairSeq(const Segments &segs, const std::vector& seqs, bool trim_overhang); 36 | std::string MergePair(const Segments &segs, bool trim_overhang); 37 | std::pair, std::vector> GetPairPileup(const Segments &segs); 38 | 39 | std::pair PairConsensus(const Segments &segs, const std::vector& seqs, 40 | bool trim_overhang, int qcutoff, std::vector& out_quals); 41 | 42 | std::pair PairSeqConsensus(const Segments &seg, bool trim_overhang, int qcutoff); 43 | } 44 | 45 | #endif //CPPUTIL_INCLUDE_ALIGNMENTCONSENSUS_H_ 46 | -------------------------------------------------------------------------------- /bbcpputil/include/BamRecordExt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/10/21. 3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_BAMRECORDEXT_CPP_H_ 6 | #define CPPUTIL_INCLUDE_BAMRECORDEXT_CPP_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "SeqLib/RefGenome.h" 12 | namespace cpputil { 13 | 14 | class BamPileup { 15 | public: 16 | SeqLib::BamRecord bam; 17 | int32_t qpos; 18 | int indel, level; 19 | uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; 20 | BamPileup(const bam_pileup1_t *pi): bam(pi->b) { 21 | qpos = pi->qpos; 22 | indel = pi->indel; 23 | level = pi->level; 24 | is_del = pi->is_del; 25 | is_head = pi->is_head; 26 | is_refskip = pi->is_refskip; 27 | aux = pi->aux; 28 | } 29 | }; 30 | 31 | bool ProperPair(const SeqLib::BamRecord& bam); 32 | 33 | std::pair MatePositionAndPositionEndWithSoftClip(const SeqLib::BamRecord & bam); 34 | 35 | int32_t GetUnclippedFramgentLength(const SeqLib::BamRecord &b); 36 | 37 | std::pair MatePositionAndPositionEnd(const SeqLib::BamRecord & bam); 38 | 39 | //overlap len of paired end reads in the reference coordinate, excluding soft clipping 40 | int32_t InsertSize(const SeqLib::BamRecord & read1, const SeqLib::BamRecord& read2); 41 | 42 | std::pair ProperPairFramgentEndsWithSclip(const SeqLib::BamRecord &b); 43 | 44 | void PrintQual(const SeqLib::BamRecord &b); 45 | 46 | int32_t CountNOrLowQInMatchedBases(const SeqLib::BamRecord &b, const int qcutoff); 47 | 48 | void AddMatchedBasesToCycleCount( const SeqLib::BamRecord& b, 49 | std::vector& q0_cycle_count, 50 | std::vector& q30_cycle_count, 51 | int start = -1, 52 | int end = std::numeric_limits::max()); 53 | 54 | int32_t CountNBasesInAlignment(const SeqLib::BamRecord &b); 55 | 56 | int32_t NumSoftClip5End(const SeqLib::BamRecord &b); 57 | 58 | uint32_t GetNumNonIndelAlignedBases(const SeqLib::BamRecord &bam); 59 | int32_t GetNM(const SeqLib::BamRecord &bam); 60 | int32_t GetNMismatch(const SeqLib::BamRecord &bam, bool NisMM = 
false); 61 | bool HasClusteredMuts(const SeqLib::BamRecord &bam, const SeqLib::BamHeader& header, 62 | const SeqLib::RefGenome& refgenome, const int cutoff); 63 | 64 | bool GetBTag(const SeqLib::BamRecord&, const std::string&, std::vector&); 65 | 66 | int32_t GetNMismatchX(const SeqLib::BamRecord &bam); 67 | int GetFamilySize(const SeqLib::BamRecord &bam); 68 | int32_t IndelLen(const SeqLib::BamRecord &bam); 69 | 70 | int32_t GetTotalIndelLen(const SeqLib::BamRecord &bam); 71 | 72 | int SoftClip3end(SeqLib::BamRecord &bam); 73 | 74 | bool SoftClipBamRecord(SeqLib::BamRecord &bam); 75 | 76 | void MaskBaseBelowMinBq(SeqLib::BamRecord &bam, int32_t mbp); 77 | 78 | void TrimBamFromFragEnd(SeqLib::BamRecord &bam, int32_t mp, int32_t mate_position_end_with_sclip, int32_t end5, int32_t end3); 79 | 80 | void TrimPairFromFragEnd(SeqLib::BamRecord &left, SeqLib::BamRecord&right, int32_t n_trim); 81 | 82 | void TrimSingleFromFragEnd(SeqLib::BamRecord &bam, int32_t n_trim); 83 | 84 | int RefPosToQueryPos(const SeqLib::BamRecord &bam, const int refpos); 85 | 86 | bool IsPairOverlap(const SeqLib::BamRecord& one, const SeqLib::BamRecord& two); 87 | 88 | std::pair GetPairOverlapRStartAndRStop(const SeqLib::BamRecord& fwd, const SeqLib::BamRecord& rev); 89 | int EffFragLen(const std::vector&, int count_overhang); 90 | 91 | std::pair,std::pair> 92 | GetPairOverlapQStartAndQStop(const SeqLib::BamRecord& fwd, const SeqLib::BamRecord& rev); 93 | 94 | 95 | std::pair 96 | GetBamOverlapQStartAndQStop(const SeqLib::BamRecord& record, const SeqLib::GenomicRegion& gr); 97 | 98 | 99 | } //end namespace 100 | 101 | 102 | #endif //CPPUTIL_INCLUDE_BAMRECORDEXT_H_ 103 | -------------------------------------------------------------------------------- /bbcpputil/include/DNAUtils.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 12/20/19. 3 | // 4 | 5 | #ifndef REALIGN_INCLUDE_DNAUTILS_H_ 6 | #define REALIGN_INCLUDE_DNAUTILS_H_ 7 | #include 8 | #include 9 | #include 10 | 11 | namespace cpputil { 12 | 13 | inline char complement(char n) { 14 | switch (n) { 15 | case 'A':return 'T'; 16 | case 'T':return 'A'; 17 | case 'G':return 'C'; 18 | case 'C':return 'G'; 19 | case 'N':return 'N'; 20 | case 'n':return 'n'; 21 | case 'a':return 't'; 22 | case 't':return 'a'; 23 | case 'c':return 'g'; 24 | case 'g':return 'c'; 25 | } 26 | assert(false); 27 | return ' '; 28 | } 29 | 30 | inline std::string complementString(std::string x) { 31 | std::transform(std::begin(x), std::end(x), std::begin(x), complement); 32 | return x; 33 | } 34 | 35 | inline void reverse_complement(std::string &seq) { 36 | std::transform(std::begin(seq), std::end(seq), std::begin(seq), complement); 37 | for (int i = 0, j = seq.size() - 1; i < j; i++, j--) { 38 | std::swap(seq[i], seq[j]); 39 | } 40 | } 41 | 42 | inline void reverse(std::string &seq) { 43 | for (int i = 0, j = seq.size() - 1; i < j; i++, j--) { 44 | std::swap(seq[i], seq[j]); 45 | } 46 | } 47 | 48 | inline void PrintQualString(const std::string& qual, int min_bq = 20, int offset = 33) { 49 | std::string line1; 50 | std::string line2; 51 | std::string stat; 52 | for (unsigned i = 0; i < qual.size(); ++i) { 53 | int q = (int) qual[i] - offset; 54 | int div = q / 10; 55 | int reminder = q % 10; 56 | line1 += std::to_string(div); 57 | line2 += std::to_string(reminder); 58 | stat += q >= min_bq ? 
"*" : " "; 59 | } 60 | std::cout << line1 << std::endl; 61 | std::cout << line2 << std::endl; 62 | std::cout << stat << std::endl; 63 | } 64 | 65 | inline int TrimLowBQfromBack(const std::string& qual, char bq) { 66 | int i = qual.size(); 67 | while (i > 0 && qual[i-1] < bq) {i--;} 68 | return i; 69 | } 70 | 71 | inline int LastNfromBack(const std::string& seq) { 72 | //one pass the first not N from the back 73 | int i = seq.size(); 74 | while (i > 0 && seq[i-1] == 'N') {i--;} 75 | return i; 76 | } 77 | 78 | inline int FirstNotNfromFront(const std::string& seq) { 79 | int i = 0; 80 | while (i < (int) seq.size() && seq[i] == 'N') {i++;} 81 | return i; 82 | } 83 | 84 | } 85 | #endif //REALIGN_INCLUDE_DNAUTILS_H_ 86 | -------------------------------------------------------------------------------- /bbcpputil/include/FastxIO.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTA_IO 2 | #define FASTA_IO 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "FastxRecord.h" 11 | 12 | namespace cpputil { 13 | 14 | inline bool endswith(std::string const &value, std::string const &ending) { 15 | if (ending.size() > value.size()) return false; 16 | return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); 17 | } 18 | 19 | class FastqWriter { 20 | std::unique_ptr ofstream_; 21 | std::unique_ptr seqStream_; 22 | //std::mutex mtx_; 23 | public: 24 | FastqWriter() = default; 25 | FastqWriter(std::ofstream& f) : seqStream_(std::make_unique(f)) {} 26 | FastqWriter(const std::string& file) { 27 | open(file); 28 | } 29 | void open(std::ofstream& f) { 30 | seqStream_ = std::make_unique(f); 31 | } 32 | void open(const std::string& file) { 33 | ofstream_ = std::make_unique(file); 34 | seqStream_ = std::make_unique(*ofstream_); 35 | } 36 | void Write(const std::string &id, const std::string &seq, const std::string &qual) { 37 | seqan::CharString rid = id; 38 | seqan::CharString rseq = seq; 39 | seqan::CharString rqual = qual; 40 | //mtx_.lock(); 41 | seqan::writeRecord(*seqStream_, rid, rseq, rqual, seqan::Fastq()); 42 | //mtx_.unlock(); 43 | } 44 | 45 | void Write(const std::string &id, const std::string &seq) { 46 | std::string qual = std::string(seq.size(), 'I'); 47 | Write(id, seq, qual); 48 | } 49 | 50 | void Write(const FastxRecord &fxr) { 51 | if (fxr.qual.empty()) { 52 | Write(fxr.id, fxr.seq); 53 | } else { 54 | Write(fxr.id, fxr.seq, fxr.qual); 55 | } 56 | 57 | } 58 | }; 59 | 60 | class FastxReader { 61 | seqan::SeqFileIn seqfilein_; 62 | int ftype_; //0 for fasta, 1 for fastq, 2 for unknown 63 | public: 64 | FastxReader(std::string fastx) : seqfilein_(fastx.c_str()) { 65 | if (endswith(fastx, ".fq") or endswith(fastx, ".fastq") or 66 | endswith(fastx, ".fq.gz") or endswith(fastx, ".fastq.gz")) { 67 | ftype_ = 1; 68 | } else if (endswith(fastx, ".fa") or endswith(fastx, ".fasta") or 69 | endswith(fastx, ".fa.gz") or endswith(fastx, ".fasta.gz")) { 70 | ftype_ = 0; 71 | } else { 72 | ftype_ = 2; 73 | throw std::runtime_error("unknown file format " + fastx); 74 | } 75 | } 76 | 77 | bool yield(FastxRecord &record) { 78 | record.cleanup(); 79 | seqan::CharString id; 80 | seqan::CharString seq; 81 | seqan::CharString qual; 82 | 83 | if (seqan::atEnd(seqfilein_)) { 84 | return false; 85 | } 86 | if (ftype_ == 0) { 87 | seqan::readRecord(id, seq, seqfilein_); 88 | } else { 89 | seqan::readRecord(id, seq, qual, seqfilein_); 90 | } 91 | record = FastxRecord(seqan::toCString(id), seqan::toCString(seq), 
seqan::toCString(qual)); 92 | return true; 93 | } 94 | }; 95 | 96 | }//end namepsace 97 | #endif 98 | -------------------------------------------------------------------------------- /bbcpputil/include/FastxRecord.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 10/12/20. 3 | // 4 | 5 | #ifndef ADAPTERTRIM_CPPUTIL_INCLUDE_FASTXRECORD_H_ 6 | #define ADAPTERTRIM_CPPUTIL_INCLUDE_FASTXRECORD_H_ 7 | 8 | #include 9 | #include "StringUtils.h" 10 | #include "DNAUtils.h" 11 | 12 | namespace cpputil { 13 | 14 | inline std::string broad_name(const std::string &name) { 15 | auto fields = split(name, ":"); 16 | std::string res = fields[1]; 17 | for (unsigned i = 3; i < fields.size(); ++i) { 18 | res += ":" + fields[i]; 19 | } 20 | return res; 21 | } 22 | 23 | inline std::pair split_instrument_id_from_broad_name(const std::string &broad_name) { 24 | auto fields = split(broad_name, ":"); 25 | auto instrument_id = fields[0]; 26 | std::string the_rest = ""; 27 | for (unsigned i = 1; i < fields.size(); ++i) { 28 | the_rest += fields[i]; 29 | } 30 | return std::make_pair(instrument_id, std::stoull(the_rest)); 31 | } 32 | 33 | struct FastxRecord { 34 | std::string id; // full ID 35 | std::string seq; 36 | std::string qual; 37 | size_t name_idx = std::string::npos; 38 | FastxRecord() = default; 39 | FastxRecord(std::string i, std::string s, std::string q) : id(i), seq(s), qual(q) { 40 | name_idx = id.find(' '); 41 | if (name_idx == std::string::npos) { 42 | name_idx = id.find('/'); 43 | } 44 | }; 45 | 46 | FastxRecord(const SeqLib::BamRecord &br, bool duplex_umi = false) { 47 | id = br.Qname(); 48 | seq = br.Sequence(); 49 | qual = br.Qualities(); 50 | if (br.ReverseFlag()) { 51 | reverse_complement(seq); 52 | std::reverse(qual.begin(), qual.end()); 53 | } 54 | std::string umiseq; 55 | std::string umiqual; 56 | if (br.GetZTag("RX", umiseq)) { 57 | bool status = br.GetZTag("QX", umiqual); 58 | if (!status) { 59 | umiqual = std::string('I', umiseq.size()); 60 | } 61 | if (duplex_umi) { 62 | auto umiseqs = cpputil::split(umiseq, "-"); 63 | auto umi1_qual = umiqual.substr(0, umiseqs[0].size()); 64 | auto umi2_qual = umiqual.substr(umiseqs[0].size()+1, umiseqs[1].size()); 65 | if (br.FirstFlag()) { 66 | seq = umiseqs[0] + seq; 67 | qual = umi1_qual + qual; 68 | } else { 69 | seq = umiseqs[1] + seq; 70 | qual = umi2_qual + qual; 71 | } 72 | } else { 73 | seq = umiseq + seq; 74 | qual = umiqual + qual; 75 | } 76 | } 77 | } 78 | 79 | void update_id_with_umi(const std::string &umi) { 80 | auto name = id.substr(0, name_idx); 81 | auto suffix = id.substr(name_idx); 82 | id = name + "_" + umi + suffix; 83 | name_idx += umi.size() + 1; 84 | } 85 | 86 | virtual void cleanup() { 87 | id.clear(); 88 | seq.clear(); 89 | qual.clear(); 90 | name_idx = std::string::npos; 91 | } 92 | 93 | bool is_filtered() { 94 | std::size_t found = id.find_first_of(':', name_idx); 95 | if (found < id.size() - 1) { 96 | if (id[found+1] == 'Y') return true; 97 | } 98 | return false; 99 | } 100 | 101 | std::string index_barcode() const { 102 | std::size_t found = id.find_last_of(':'); 103 | if (found != std::string::npos) { 104 | return id.substr(found + 1); 105 | } else { 106 | return ""; 107 | } 108 | } 109 | 110 | std::string name() const { 111 | return (name_idx != std::string::npos ? 
id.substr(0, name_idx) : id); 112 | } 113 | 114 | //broad cannonicalized name,e.g., 115 | //"D00203:HCY5YBCX3200606:HCY5YBCX3:1:1105:4656:14095" 116 | //"HCY5YBCX3200606:1:1105:4656:14095" 117 | // Warning, should only work for illumina fastq read convention 118 | 119 | std::string broad_id() const { 120 | if (name_idx != std::string::npos) { 121 | return broad_name(this->name()) + " " + id.substr(name_idx); 122 | } else { 123 | return broad_name(this->name()); 124 | } 125 | } 126 | }; 127 | 128 | class AnnotatedSeq { 129 | std::string seq_; 130 | std::string qual_; 131 | public: 132 | AnnotatedSeq() = default; 133 | AnnotatedSeq(std::string s, std::string q) : seq_(s), qual_(q) { 134 | assert(s.size() == q.size()); 135 | }; 136 | bool empty() const { 137 | return seq_.size() == 0; 138 | } 139 | decltype(auto) qual() const { 140 | return (qual_); 141 | } 142 | decltype(auto) seq() const { 143 | return (seq_); 144 | } 145 | void cleanup() { 146 | seq_.clear(); 147 | qual_.clear(); 148 | } 149 | }; 150 | 151 | struct ExtFastxRecord : public FastxRecord { 152 | AnnotatedSeq adap5; 153 | AnnotatedSeq adap3; 154 | AnnotatedSeq umi; 155 | AnnotatedSeq trim3; 156 | std::string barcode; 157 | int tm = 255; //unsigned 158 | int rc_adpt = 0; 159 | 160 | void cleanup() override { 161 | tm = 255; 162 | rc_adpt = 0; 163 | FastxRecord::cleanup(); 164 | adap3.cleanup(); 165 | adap5.cleanup(); 166 | umi.cleanup(); 167 | trim3.cleanup(); 168 | barcode.clear(); 169 | } 170 | }; 171 | 172 | template 173 | Stream &operator<<(Stream &os, const FastxRecord &fxr) { 174 | if (fxr.qual.empty()) { 175 | os << ">" + fxr.id << "\n" << fxr.seq << "\n"; 176 | } else { 177 | os << "@" + fxr.id << "\n" << fxr.seq << "\n" << "+\n" << fxr.qual << "\n"; 178 | } 179 | return os; 180 | } 181 | 182 | } 183 | #endif //ADAPTERTRIM_CPPUTIL_INCLUDE_FASTXRECORD_H_ 184 | -------------------------------------------------------------------------------- /bbcpputil/include/Files.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/11/21. 3 | // 4 | #ifndef CPPUTIL_INCLUDE_FILES_H_ 5 | #define CPPUTIL_INCLUDE_FILES_H_ 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace cpputil { 12 | 13 | inline bool FileExist(const std::string &name) { 14 | std::ifstream f(name.c_str()); 15 | return f.good(); 16 | } 17 | } 18 | 19 | #endif //CPPUTIL_INCLUDE_FILES_H_ 20 | -------------------------------------------------------------------------------- /bbcpputil/include/GenomicRegionCollectionExt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 8/26/20. 3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_GENOMICREGIONCOLLECTIONEXT_H_ 6 | #define CPPUTIL_INCLUDE_GENOMICREGIONCOLLECTIONEXT_H_ 7 | #include 8 | #include 9 | #include 10 | #include "Seqlib/GenomicRegionCollection.h" 11 | namespace cpputil { 12 | 13 | } 14 | 15 | 16 | #endif //CPPUTIL_INCLUDE_GENOMICREGIONCOLLECTIONEXT_H_ 17 | -------------------------------------------------------------------------------- /bbcpputil/include/Gotoh.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 9/27/20. 3 | // 4 | 5 | #ifndef ADAPTERTRIM_CPPUTIL_INCLUDE_GOTOH_H_ 6 | #define ADAPTERTRIM_CPPUTIL_INCLUDE_GOTOH_H_ 7 | 8 | /* 9 | This code snippet finds all optimal paths using affine gap penalty. 
10 | So many public codes and textbooks implement or teach solutions leading to suboptimal solutions. 11 | Inspired by paper https://www.biorxiv.org/content/10.1101/031500v1.full.pdf, 12 | I decided to implement the correct algorithm which is documented at 13 | https://www.researchgate.net/publication/19580571_Optimal_sequence_alignment_using_affine_gap_costs 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | class Alignment { 25 | std::vector display; 26 | int nm = 0; 27 | public: 28 | struct Node { 29 | int i; 30 | int j; 31 | Node(int ii, int jj) : i(ii), j(jj) {} 32 | Node() = default; 33 | }; 34 | 35 | Alignment(const std::string& row, const std::string col, const std::vector& path) { 36 | std::string rowgap; 37 | std::string colgap; 38 | std::string visual; 39 | for (unsigned ii = 0; ii < path.size() - 1; ++ii) { 40 | const Node & cur = path[ii]; 41 | const Node & next = path[ii + 1]; 42 | if (cur.i == next.i + 1 && cur.j == next.j + 1) { 43 | rowgap += row[next.i]; 44 | colgap += col[next.j]; 45 | if (row[next.i] != col[next.j]) { 46 | nm++; 47 | visual += ' '; 48 | } else { 49 | visual += '|'; 50 | } 51 | } else if (cur.i == next.i && cur.j == next.j + 1) { 52 | nm ++; 53 | visual += ' '; 54 | rowgap += '-'; 55 | colgap += col[next.j]; 56 | } else if (cur.i == next.i + 1 && cur.j == next.j) { 57 | nm ++; 58 | visual += ' '; 59 | colgap += '-'; 60 | rowgap += row[next.j]; 61 | } 62 | } 63 | std::reverse(colgap.begin(), colgap.end()); 64 | std::reverse(visual.begin(), visual.end()); 65 | std::reverse(rowgap.begin(), rowgap.end()); 66 | display.push_back(colgap); 67 | display.push_back(visual); 68 | display.push_back(rowgap); 69 | } 70 | 71 | int NM() const { 72 | return nm; 73 | } 74 | friend std::ostream& operator<<(std::ostream&, const Alignment&); 75 | }; 76 | 77 | std::ostream& operator<<(std::ostream& os, const Alignment& align) { 78 | os << align.nm << '\n'; 79 | for (const std::string& s : align.display) { 80 | os << s << '\n'; 81 | } 82 | return os; 83 | } 84 | 85 | using std::vector; 86 | class AffineGap { 87 | using Node = Alignment::Node; 88 | /* 89 | * matrix as Reference at row top Query at column left 90 | */ 91 | std::string rowstring_; 92 | std::string colstring_; 93 | vector align_path_; 94 | vector> paths_; 95 | int nrow_; 96 | int ncol_; 97 | vector> R_; // match matrix 98 | vector> P_; // vertical insertion matrix 99 | vector> Q_; // horizontal deletion matrix 100 | vector> vert_whole_; // a 101 | vector> hori_whole_; // b 102 | vector> diag_whole_; // c 103 | vector> vert_top_half_; // d 104 | vector> vert_bottom_half_; // e 105 | vector> hori_left_half_; // f 106 | vector> hori_right_half_; // g 107 | 108 | //gap score = gap_open + gap_ext * gap_len 109 | const static int gap_open_ = -1; 110 | const static int gap_ext_ = -1; 111 | static int DiagScore_(const char& a, const char& b) { 112 | return a == b? 
0: -1; 113 | } 114 | 115 | public: 116 | AffineGap(const std::string& query, const std::string& ref): rowstring_(query), colstring_(ref), //align_path_(colstring_, rowstring_), 117 | nrow_((int) rowstring_.length() + 1), 118 | ncol_((int) colstring_.length() + 1), 119 | R_(vector>(nrow_, vector(ncol_))), 120 | P_(vector>(nrow_, vector(ncol_))), 121 | Q_(vector>(nrow_, vector(ncol_))), 122 | vert_whole_(vector>(nrow_ + 1, vector(ncol_ + 1))), 123 | hori_whole_(vector>(nrow_ + 1, vector(ncol_ + 1))), 124 | diag_whole_(vector>(nrow_ + 1, vector(ncol_ + 1))), 125 | vert_top_half_(vector>(nrow_ + 1, vector(ncol_ + 1))), 126 | vert_bottom_half_(vector>(nrow_ + 1, vector(ncol_ + 1))), 127 | hori_left_half_(vector>(nrow_ + 1, vector(ncol_ + 1))), 128 | hori_right_half_(vector>(nrow_ + 1, vector(ncol_ + 1))) 129 | { 130 | // init 131 | for (int j = 0; j < ncol_; ++j) { 132 | P_[0][j] = 2 * gap_open_ + std::max(ncol_, nrow_) * gap_ext_ - 1; // ensure a large number 133 | R_[0][j] = gap_open_ + j * gap_ext_; 134 | } 135 | for (int i = 0; i < nrow_; ++i) { 136 | Q_[i][0] = 2 * gap_open_ + std::max(ncol_, nrow_) * gap_ext_ - 1; // ensure a large number 137 | R_[i][0] = gap_open_ + i * gap_ext_; 138 | } 139 | R_[0][0] = 0; 140 | diag_whole_[nrow_][ncol_] = 1; 141 | 142 | //cost assignment 143 | for (int i = 0; i < nrow_; ++i) { 144 | for (int j = 0; j < ncol_; ++j) { 145 | if (i != 0) { 146 | P_[i][j] = gap_ext_ + std::max(P_[i-1][j], R_[i-1][j] + gap_open_); 147 | if (P_[i][j] == gap_ext_ + P_[i-1][j]) vert_top_half_[i-1][j] = 1; 148 | if (P_[i][j] == gap_ext_ + gap_open_ + R_[i-1][j]) vert_bottom_half_[i-1][j] = 1; 149 | } 150 | if (j != 0) { 151 | Q_[i][j] = gap_ext_ + std::max(Q_[i][j-1], R_[i][j-1] + gap_open_); 152 | if (Q_[i][j] == gap_ext_ + Q_[i][j-1]) hori_left_half_[i][j-1] = 1; 153 | if (Q_[i][j] == gap_ext_ + gap_open_ + R_[i][j-1]) hori_right_half_[i][j-1] = 1; 154 | } 155 | if (i != 0 && j != 0 ) { 156 | R_[i][j] = std::max(R_[i-1][j-1] + DiagScore_(colstring_[j-1], rowstring_[i-1]), std::max(Q_[i][j], P_[i][j])); 157 | if (R_[i][j] == R_[i-1][j-1] + DiagScore_(colstring_[j-1], rowstring_[i-1])) diag_whole_[i][j] = 1; 158 | } 159 | if (R_[i][j] == P_[i][j]) vert_whole_[i][j] = 1; 160 | if (R_[i][j] == Q_[i][j]) hori_whole_[i][j] = 1; 161 | } 162 | } 163 | // std::cout<< "cost assignment\t"; 164 | // std::cout << diag_whole_[nrow_-1][ncol_-1] << "\t" << vert_whole_[nrow_-1][ncol_-1] << "\t" << hori_whole_[nrow_-1][ncol_-1] << "\n"; 165 | 166 | //edge assignment 167 | for (int i = nrow_ - 1; i >= 0; --i) { 168 | for (int j = ncol_ - 1; j >= 0; --j) { 169 | if ((vert_whole_[i+1][j] == 0 || vert_bottom_half_[i][j] == 0) && 170 | (hori_whole_[i][j+1] == 0 || hori_right_half_[i][j] == 0) && 171 | diag_whole_[i+1][j+1] == 0) { 172 | // if (vert_bottom_half_[i][j] == 0 && vert_whole_[i+1][j] != 0) { 173 | // std::cerr << "vi: " << i << " vj: " << j << std::endl; 174 | // } 175 | // if (hori_whole_[i][j+1] != 0 && hori_right_half_[i][j] == 1) { 176 | // std::cerr << "hi: " << i << " hj: " << j << std::endl; 177 | // } 178 | vert_whole_[i][j] = 0; 179 | hori_whole_[i][j] = 0; 180 | diag_whole_[i][j] = 0; 181 | } 182 | if (vert_whole_[i+1][j] == 0 && 183 | hori_whole_[i][j+1] == 0 && 184 | diag_whole_[i+1][j+1] == 0) { 185 | continue; 186 | } else { 187 | if ( vert_whole_[i+1][j] == 1 && vert_top_half_[i][j] == 1) { 188 | vert_top_half_[i+1][j] = 1 - vert_bottom_half_[i][j]; 189 | vert_bottom_half_[i][j] = 1 - vert_whole_[i][j]; 190 | vert_whole_[i][j] = 1; 191 | } else { 192 | 
vert_top_half_[i+1][j] = 0; 193 | vert_bottom_half_[i][j] = 0; 194 | } 195 | 196 | if ( hori_whole_[i][j + 1] == 1 && hori_left_half_[i][j] == 1) { 197 | hori_left_half_[i][j+1] = 1 - hori_right_half_[i][j]; 198 | hori_right_half_[i][j] = 1 - hori_whole_[i][j]; 199 | hori_whole_[i][j] = 1; 200 | } else { 201 | hori_left_half_[i][j+1] = 0; 202 | hori_right_half_[i][j] = 0; 203 | } 204 | } 205 | } 206 | } 207 | // backtrack by bit array matrics 208 | DFS(Node(nrow_ -1, ncol_ -1), 0); 209 | } 210 | 211 | void DFS(const Node& cn, const int must_go_dir) { 212 | // must_go_dir, 0: not required, 1: must go left, 2: must go above 213 | auto prev = align_path_.empty() ? Node(0,0) : align_path_.back(); 214 | align_path_.push_back(cn); 215 | if (cn.i == 0 && cn.j == 0) { 216 | paths_.push_back(align_path_); 217 | } 218 | else { 219 | if (must_go_dir == 1) { 220 | int next_must_go = hori_whole_[cn.i][cn.j] && hori_left_half_[cn.i][cn.j] ? 1 : 0; 221 | DFS(Node(cn.i, cn.j-1), next_must_go); 222 | } 223 | else if (must_go_dir == 2){ 224 | int next_must_go = vert_whole_[cn.i][cn.j] && vert_top_half_[cn.i][cn.j] ? 2 : 0; 225 | DFS(Node(cn.i-1, cn.j), next_must_go); 226 | } 227 | else { 228 | if (diag_whole_[cn.i][cn.j]) { 229 | DFS(Node(cn.i - 1, cn.j - 1), 0); 230 | } 231 | if (vert_whole_[cn.i][cn.j]) { 232 | if (vert_bottom_half_[cn.i][cn.j]) { 233 | if (cn.i + 1 != prev.i || cn.j != prev.j) return; 234 | } 235 | int next_must_go = vert_top_half_[cn.i][cn.j] ? 2 : 0; 236 | DFS(Node(cn.i - 1, cn.j), next_must_go); 237 | } 238 | if (hori_whole_[cn.i][cn.j]) { 239 | if (hori_right_half_[cn.i][cn.j]) { 240 | if (cn.i != prev.i || cn.j + 1 != prev.j) return; 241 | } 242 | int next_must_go = hori_left_half_[cn.i][cn.j] ? 1 : 0; 243 | DFS(Node(cn.i, cn.j - 1), next_must_go); 244 | } 245 | } 246 | } 247 | align_path_.pop_back(); 248 | } 249 | 250 | decltype(auto) Paths() const { 251 | return (paths_); 252 | } 253 | 254 | decltype(auto) Path() const { 255 | return paths_.front(); 256 | } 257 | 258 | void PrintAllPaths() const { 259 | // int i = 2, j=4; 260 | // std::cout<< "edge assignment\n"; 261 | // std::cout << vert_whole_[i][j] << "\t" << hori_whole_[i][j] << "\t" << diag_whole_[i][j] << "\t"; 262 | // std::cout << vert_top_half_[i][j] << "\t" << vert_bottom_half_[i][j] << "\t" << hori_left_half_[i][j] << "\t" << hori_right_half_[i][j]; 263 | // std::cout << "\n"; 264 | for (auto& p : paths_) { 265 | Alignment a(rowstring_, colstring_, p); 266 | std::cerr << a; 267 | std::cerr << '\n'; 268 | } 269 | }; 270 | 271 | // void Print() const { 272 | // std::cerr << "R\n"; 273 | // std::cerr << R_; 274 | // std::cerr << "P\n"; 275 | // std::cerr << P_; 276 | // std::cerr << "Q\n"; 277 | // std::cerr << Q_; 278 | // std::cerr << "a\n"; 279 | // std::cerr << vert_whole_; // a 280 | // std::cerr << "b\n"; 281 | // std::cerr << hori_whole_; // b 282 | // std::cerr << "c\n"; 283 | // std::cerr << diag_whole_; // c 284 | // std::cerr << "d\n"; 285 | // std::cerr << vert_top_half_; // d 286 | // std::cerr << "e\n"; 287 | // std::cerr << vert_bottom_half_; // e 288 | // std::cerr << "f\n"; 289 | // std::cerr << hori_left_half_; // f 290 | // std::cerr << "g\n"; 291 | // std::cerr << hori_right_half_; // g 292 | // } 293 | 294 | }; 295 | 296 | #endif //ADAPTERTRIM_CPPUTIL_INCLUDE_GOTOH_H_ 297 | -------------------------------------------------------------------------------- /bbcpputil/include/Insert.h: -------------------------------------------------------------------------------- 1 | #ifndef INSERT_SEQ_H 2 | #define 
INSERT_SEQ_H 3 | #include 4 | //#include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "AlignmentConsensus.h" 14 | 15 | 16 | using std::list; 17 | using std::vector; 18 | using std::pair; 19 | using std::string; 20 | 21 | namespace cpputil { 22 | 23 | class InsertSeq { 24 | 25 | using BamPointer = std::shared_ptr; 26 | using BamRecord = SeqLib::BamRecord; 27 | //Switch to SeqLib::BamRecord 28 | typedef Segments::iterator iterator; 29 | typedef Segments::const_iterator const_iterator; 30 | 31 | Segments inprogress_, invalid_, ambiguous_; 32 | vector paired_; // segment size is two 33 | string seqid_; 34 | 35 | struct SeqQual { 36 | string seq; 37 | string qual; 38 | SeqQual(string s, string q) : seq(s), qual(q) {} 39 | }; 40 | 41 | bool IsMate_(const bam1_t *bam, const bam1_t *mate) const { 42 | /* Modified based on Rsamtool src/Template.h 43 | //https://github.com/Bioconductor/Rsamtools 44 | 45 | // is_mate checks the following bit flags: 46 | // 1. Bit 0x40 and 0x80: Segments are a pair of first/last OR 47 | // neither segment is marked first/last 48 | // 2. Bit 0x100: Both segments are secondary OR both not secondary 49 | // 3. Bit 0x10 and 0x20: Strand flag 0x20 of one mate must match strand 50 | // flag 0x10 of the other mate and vice versa 51 | // 4. Bit 0x2: Both proper OR both not proper 52 | // 5. mpos match: 53 | // bit 0x10 of rec1 == bit 0x20 of rec2 AND 54 | // bit 0x10 or rec2 == bit 0x20 of rec1 55 | // segment2 mpos matches segment1 pos 56 | // 6. Both Mapped; 57 | */ 58 | const bool bam_read1 = bam->core.flag & BAM_FREAD1; 59 | const bool mate_read1 = mate->core.flag & BAM_FREAD1; 60 | const bool bam_read2 = bam->core.flag & BAM_FREAD2; 61 | const bool mate_read2 = mate->core.flag & BAM_FREAD2; 62 | const bool bam_secondary = bam->core.flag & BAM_FSECONDARY; 63 | const bool mate_secondary = mate->core.flag & BAM_FSECONDARY; 64 | const bool bam_proper = bam->core.flag & BAM_FPROPER_PAIR; 65 | const bool mate_proper = mate->core.flag & BAM_FPROPER_PAIR; 66 | const bool bam_rev = bam->core.flag & BAM_FREVERSE; 67 | const bool mate_rev = mate->core.flag & BAM_FREVERSE; 68 | const bool bam_mrev = bam->core.flag & BAM_FMREVERSE; 69 | const bool mate_mrev = mate->core.flag & BAM_FMREVERSE; 70 | const bool bam_unmap = bam->core.flag & BAM_FUNMAP; 71 | const bool mate_unmap = bam->core.flag & BAM_FMUNMAP; 72 | const uint32_t 73 | pos = bam->core.pos, 74 | mpos = bam->core.mpos, 75 | mate_pos = mate->core.pos, 76 | mate_mpos = mate->core.mpos; 77 | return 78 | ((bam_read1 ^ bam_read2) && (mate_read1 ^ mate_read2)) && 79 | (bam_read1 != mate_read1) && 80 | (bam_secondary == mate_secondary) && 81 | (((bam_rev != mate_mrev) && (bam_mrev != mate_rev)) || 82 | ((bam_rev == mate_mrev) && (bam_mrev == mate_rev))) && 83 | (bam_proper == mate_proper) && 84 | !bam_unmap && 85 | !mate_unmap && 86 | (pos == mate_mpos) && (mpos == mate_pos) && 87 | (bam->core.mtid == mate->core.tid); 88 | } 89 | 90 | void add_to_paired(BamRecord bam, BamRecord mate) { 91 | // keep the order of R1 then R2 92 | if (bam.FirstFlag()) { 93 | Segments tmp{bam, mate}; 94 | paired_.emplace_back(tmp); 95 | } else { 96 | Segments tmp{mate, bam}; 97 | paired_.emplace_back(tmp); 98 | } 99 | } 100 | 101 | static int32_t GetBreakPointCorrection_(const int32_t stop, const SeqLib::Cigar &cigar) { 102 | int32_t num_refbase_consumed = 0; 103 | int32_t correction = 0; 104 | assert(std::distance(cigar.begin(), cigar.end()) > 1); 105 | for (auto it = 
cigar.begin(); it != cigar.end(); ++it) { 106 | if (num_refbase_consumed > stop) break; 107 | if (it->Type() == 'I') { 108 | correction += it->Length(); 109 | } else if (it->Type() == 'D') { 110 | num_refbase_consumed += it->Length(); 111 | correction -= it->Length(); 112 | } else if (it->Type() == 'H') { 113 | continue; 114 | } else { 115 | num_refbase_consumed += it->Length(); 116 | } 117 | } 118 | return correction; 119 | } 120 | 121 | static pair GetSegmentOverhang(const Segments &seg) { 122 | int32_t left_front = seg.front().PositionWithSClips(); 123 | int32_t left_end = seg.front().PositionEndWithSClips(); 124 | int32_t right_front = seg.back().PositionWithSClips(); 125 | int32_t right_end = seg.back().PositionEndWithSClips(); 126 | // if (left_front > right_front) { 127 | // DEBUG(seg.front()) 128 | // DEBUG(seg.back()); 129 | // } 130 | assert(left_front <= right_front); 131 | assert(left_end <= right_end); 132 | 133 | int32_t left_break = right_front - left_front; 134 | int32_t right_break = left_end - right_front + 1; 135 | left_break += GetBreakPointCorrection_(left_break, seg.front().GetCigar()); 136 | right_break += GetBreakPointCorrection_(right_break, seg.back().GetCigar()); 137 | SeqQual left_oh(seg.front().Sequence().substr(0, left_break), seg.front().Qualities().substr(0, left_break)); 138 | SeqQual right_oh(seg.back().Sequence().substr(right_break - 1, seg.back().Sequence().size() - right_break + 1), 139 | seg.back().Qualities().substr(right_break - 1, seg.back().Qualities().size() - right_break + 1)); 140 | 141 | return std::make_pair(left_oh, right_oh); 142 | } 143 | 144 | // static bool IsSorted(const Segments &seg) { 145 | // return seg.front().PositionWithSClips() <= seg.back().PositionWithSClips(); 146 | // } 147 | 148 | 149 | // static string GetConsensusTemplate(const Segments &seg) { 150 | // /* 151 | // * '~' : uninitialized 152 | // * '+' : insertion 153 | // * '-' : deletion 154 | // */ 155 | // assert(IsSorted(seg)); 156 | // const SeqLib::Cigar left_cigar = seg.front().GetCigar(); 157 | // const SeqLib::Cigar right_cigar = seg.back().GetCigar(); 158 | // int ref_span = std::max(seg.back().PositionEndWithSClips(), seg.front().PositionEndWithSClips()) 159 | // - seg.front().PositionWithSClips(); 160 | // std::map ins_len; 161 | // find_insert_(left_cigar, 0, ins_len); 162 | // find_insert_(right_cigar, seg.back().PositionWithSClips() - seg.front().PositionWithSClips(), ins_len); 163 | // string consens; 164 | // int b = 0; 165 | // for (const auto pos_len : ins_len) { 166 | // consens += string(pos_len.first - b, '~'); 167 | // consens += string(pos_len.second, '+'); 168 | // b = pos_len.first; 169 | // } 170 | // consens += string(ref_span - b, '~'); 171 | // return consens; 172 | // } 173 | 174 | 175 | public: 176 | InsertSeq() = default; 177 | InsertSeq(BamRecord br) { 178 | add(br); 179 | } 180 | 181 | void add(BamRecord br) { 182 | inprogress_.push_back(br); 183 | } 184 | 185 | decltype(auto) paired() const { 186 | return (paired_); 187 | } 188 | 189 | decltype(auto) inprogress() const { 190 | return (inprogress_); 191 | } 192 | 193 | decltype(auto) forward_segments() const { 194 | Segments forward; 195 | for (const auto &read : inprogress_) { 196 | if (read.ReverseFlag()) { 197 | continue; 198 | } else { 199 | forward.push_back(read); 200 | } 201 | } 202 | return forward; 203 | } 204 | 205 | decltype(auto) reverse_segments() const { 206 | Segments reverse; 207 | for (const auto &read : inprogress_) { 208 | if (read.ReverseFlag()) { 209 | 
reverse.push_back(read); 210 | } 211 | } 212 | return reverse; 213 | } 214 | 215 | bool empty() const { 216 | return inprogress_.empty() && invalid_.empty() && ambiguous_.empty() && paired_.empty(); 217 | } 218 | 219 | void Mate() { 220 | /* Adapted from Rsamtool src/Template.h 221 | //https://github.com/Bioconductor/Rsamtools 222 | */ 223 | // This is O(n^2) where n is the number of reads. This does not work for large n. 224 | 225 | // Mate paired bam records to segments. Segments are non-overlap intervals on genome. 226 | const int unmated = -1, multiple = -2, processed = -3; 227 | vector > 228 | status(inprogress_.size(), 229 | pair(unmated, BamRecord())); 230 | Segments::iterator it0; 231 | 232 | // identify unambiguous and ambiguous mates 233 | it0 = inprogress_.begin(); 234 | for (unsigned int i = 0; i < inprogress_.size(); ++i) { 235 | status[i].second = *it0; 236 | Segments::iterator it1 = it0; 237 | for (unsigned int j = i + 1; j < inprogress_.size(); ++j) { 238 | ++it1; 239 | if (IsMate_(it0->raw(), it1->raw())) { 240 | status[i].first = status[i].first == unmated ? j : multiple; 241 | status[j].first = status[j].first == unmated ? i : multiple; 242 | } 243 | } 244 | ++it0; 245 | } 246 | 247 | // process unambiguous and ambiguous mates 248 | for (unsigned int i = 0; i < status.size(); ++i) { 249 | if (status[i].first == unmated) 250 | continue; 251 | if (status[i].first >= 0 && status[status[i].first].first >= 0) { 252 | // unambiguous mates 253 | add_to_paired(status[i].second, status[status[i].first].second); 254 | status[status[i].first].first = processed; 255 | status[i].first = processed; 256 | } else if (status[i].first != processed) { 257 | // ambiguous mates, added to 'ambiguous' queue 258 | ambiguous_.push_back(status[i].second); 259 | status[i].first = processed; 260 | } 261 | ++it0; 262 | } 263 | 264 | // remove segments that have been assigned to paired or 265 | // ambiguous queue 266 | it0 = inprogress_.begin(); 267 | for (unsigned int i = 0; i != status.size(); ++i) { 268 | if (status[i].first == processed) { 269 | it0 = inprogress_.erase(it0); 270 | } else { 271 | ++it0; 272 | } 273 | } 274 | } 275 | }; 276 | } 277 | #endif 278 | -------------------------------------------------------------------------------- /bbcpputil/include/InsertSeqFactory.h: -------------------------------------------------------------------------------- 1 | #ifndef INSERT_SEQ_FACTORY_H 2 | #define INSERT_SEQ_FACTORY_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "DNAUtils.h" 9 | 10 | #include "TargetLayout.h" 11 | #include "Alignment.h" 12 | #include "Insert.h" 13 | #include "BamRecordExt.h" 14 | 15 | #include "FastxIO.h" 16 | 17 | #ifndef NDEBUG 18 | #include 19 | #endif 20 | 21 | #ifndef NDEBUG 22 | # define DEBUG(x) do {std::cerr << x << std::endl;} while(0); 23 | #else 24 | # define DEBUG(x) do {} while (0) 25 | #endif 26 | 27 | using std::map; 28 | using std::string; 29 | 30 | namespace cpputil { 31 | class InsertSeqFactory { 32 | 33 | SeqLib::BamReader bam_reader_; 34 | map id_inserts_; 35 | map::iterator it_; 36 | int min_mapq_; 37 | bool load_supp_; 38 | bool load_secondary_; 39 | bool load_duplicate_; 40 | bool load_proper_pair_only_; 41 | bool clip3_ = false; 42 | int last_chr_; 43 | bool finished_ = false; 44 | int paired_end_library_ = 1; // 0 single end, 1 paired end 45 | 46 | bool passfilter(const SeqLib::BamRecord& b) { 47 | if (!b.MappedFlag()) return false; 48 | if (!load_supp_ && b.SupplementaryFlag()) return false; 49 | if (!load_secondary_ && 
b.SecondaryFlag()) return false; 50 | if (!load_duplicate_ && b.DuplicateFlag()) return false; 51 | if (load_proper_pair_only_ && !b.ProperPair()) return false; 52 | if (b.MapQuality() < min_mapq_) return false; 53 | return true; 54 | } 55 | 56 | std::pair loadrecord(int32_t stop_at = INT32_MAX) { 57 | //load a single record 58 | SeqLib::BamRecord b; 59 | string qname; 60 | int chrid; 61 | while (true) { 62 | auto ret = bam_reader_.GetNextRecord(b); 63 | if (!ret) { 64 | return std::make_pair(string(), -1); 65 | } 66 | if (!b.MappedFlag()) continue; 67 | if (b.Position() > stop_at) { 68 | //Stop and this read is not added 69 | return std::make_pair(b.Qname(), -1); 70 | } 71 | if (!passfilter(b)) continue; 72 | qname = b.Qname(); 73 | chrid = b.ChrID(); 74 | if (clip3_) { 75 | cpputil::SoftClip3end(b); 76 | } 77 | 78 | auto itfind = id_inserts_.find(b.Qname()); 79 | if (itfind == id_inserts_.end()) { 80 | id_inserts_.emplace(b.Qname(), b); 81 | } else { 82 | itfind->second.add(b); 83 | } 84 | break; 85 | } 86 | return std::make_pair(qname, chrid); 87 | } 88 | 89 | bool yieldfrags(bool (*Selector)(const Segments &, int param), 90 | const bool load_unpair, 91 | const int pair_min_ol, 92 | const std::string uid_tag_name, 93 | std::vector>& ret) { 94 | if (id_inserts_.empty()) return false; 95 | it_ = id_inserts_.begin(); 96 | std::map> uid_to_segs; 97 | int dummy = 1; // psudo uid 98 | while (true) {// Yield a group of read(s), either paired, single, or family until no more reads. 99 | std::vector recs; 100 | if (paired_end_library_) { 101 | if (load_unpair) { 102 | recs = YieldPairAndUnpair(); 103 | } else { 104 | recs = YieldPair(Selector, pair_min_ol); 105 | } 106 | } else { 107 | recs = YieldSingle(); 108 | } 109 | if (recs.empty()) break; 110 | if (not load_duplicate_) { 111 | for (auto& rec : recs) { 112 | uid_to_segs[std::to_string(dummy++)].push_back(rec); 113 | } 114 | } else { 115 | for (auto& rec : recs) { 116 | std::string uid = GetUid(rec, uid_tag_name); 117 | uid_to_segs[uid].push_back(rec); 118 | } 119 | } 120 | } 121 | ret.reserve(uid_to_segs.size()); 122 | for (auto& it: uid_to_segs) { 123 | ret.push_back(it.second); 124 | } 125 | id_inserts_.clear(); 126 | return true; 127 | } 128 | 129 | public: 130 | //InsertSeqFactory() = delete; 131 | InsertSeqFactory() = default; 132 | 133 | InsertSeqFactory(const std::string &bam, int mapq, bool load_supp, bool load_sec, bool load_duplicate, bool load_proper_pair_only, bool clip3): 134 | min_mapq_(mapq), 135 | load_supp_(load_supp), 136 | load_secondary_(load_sec), 137 | load_duplicate_(load_duplicate), 138 | load_proper_pair_only_(load_proper_pair_only), 139 | clip3_(clip3), 140 | last_chr_(-1) { 141 | bam_reader_.Open(bam); 142 | SeqLib::BamRecord b; 143 | }; 144 | 145 | bool IsPairEndLib() const { 146 | return (paired_end_library_); 147 | } 148 | 149 | bool ReadByRegion(bool (*Selector)(const Segments &, int param), // selector 150 | const SeqLib::GenomicRegion& gr, 151 | std::vector>& ret, 152 | int pair_min_ol, // mim overlap between read1 and read2 153 | const std::string uid_tag_name, 154 | bool load_unpair) { 155 | //clearance 156 | if (not ret.empty()) ret.clear(); 157 | if (not id_inserts_.empty()) id_inserts_.clear(); 158 | bool stat = bam_reader_.SetRegion(gr); 159 | if (not stat) { 160 | std::cerr << gr << " not found" << std::endl; 161 | return false; 162 | } 163 | std::string readid; 164 | int chrid; 165 | while(true) { 166 | std::tie(readid, chrid) = loadrecord(gr.pos2); 167 | if (chrid == -1) break; 168 | } 169 | bool 
status = yieldfrags(Selector, load_unpair, pair_min_ol, uid_tag_name, ret); 170 | return status; 171 | } 172 | 173 | std::vector FetchReadNameSorted(bool load_unpair = false) { 174 | std::vector ret; 175 | SeqLib::BamRecord b; 176 | while(true) { 177 | bool has_read = bam_reader_.GetNextRecord(b); 178 | if (!has_read) { 179 | break; 180 | } 181 | if (!passfilter(b)) continue; 182 | if (clip3_) { 183 | cpputil::SoftClip3end(b); 184 | } 185 | auto itfind = id_inserts_.find(b.Qname()); 186 | if (itfind == id_inserts_.end()) { 187 | for(auto it : id_inserts_) { 188 | it.second.Mate(); 189 | for (auto seg: it.second.paired()) { 190 | ret.push_back(seg); 191 | } 192 | if (load_unpair) { 193 | for (auto bam: it.second.inprogress()) { 194 | Segments tmp(1, bam); 195 | ret.push_back(tmp); 196 | } 197 | } 198 | } 199 | id_inserts_.clear(); 200 | id_inserts_.emplace(b.Qname(), b); 201 | return ret; 202 | } else { 203 | itfind->second.add(b); 204 | } 205 | } 206 | if (!id_inserts_.empty()) { 207 | for(auto it : id_inserts_) { 208 | it.second.Mate(); 209 | for (auto seg: it.second.paired()) { 210 | ret.push_back(seg); 211 | } 212 | if (load_unpair) { 213 | for (auto bam: it.second.inprogress()) { 214 | Segments tmp(1, bam); 215 | ret.push_back(tmp); 216 | } 217 | } 218 | } 219 | id_inserts_.clear(); 220 | } else { 221 | finished_ = true; 222 | } 223 | return ret; 224 | } 225 | 226 | 227 | std::vector> ReadByChrom(bool (*Selector)(const Segments &, int param), // selector 228 | int pair_min_ol, // mim overlap between read1 and read2 229 | const std::string uid_tag_name = "", 230 | bool load_unpair = false) { 231 | std::vector> ret; 232 | if (finished()) return ret; 233 | while (true) { 234 | std::string readid; 235 | int chrid; 236 | // Load reads in to id_inserts_; 237 | std::tie(readid, chrid) = loadrecord(); 238 | if (last_chr_ != -1 && chrid != last_chr_) { // first read in a different chromosome (2nd and above) 239 | InsertSeq save; 240 | if (!readid.empty()) { // save this read from processing and put it back to id_inserts_ after processing 241 | save = id_inserts_[readid]; 242 | id_inserts_.erase(readid); 243 | } 244 | it_ = id_inserts_.begin(); 245 | 246 | yieldfrags(Selector, load_unpair, pair_min_ol, uid_tag_name, ret); 247 | 248 | if (!readid.empty()) { // Put back the saved read 249 | id_inserts_.emplace(readid, save); 250 | } 251 | last_chr_ = chrid; 252 | break; 253 | } 254 | if (chrid == -1) { 255 | finished_ = true; 256 | break; 257 | } 258 | last_chr_ = chrid; 259 | } 260 | return ret; 261 | } 262 | 263 | bool finished() { 264 | return finished_; 265 | } 266 | 267 | decltype(auto) bamheader() const { 268 | return (bam_reader_.Header()); 269 | } 270 | 271 | //Iterator 272 | std::vector YieldSingle() { 273 | std::vector res; 274 | if (it_ != id_inserts_.end()) { 275 | res.push_back(it_->second.inprogress()); 276 | ++it_; 277 | } 278 | return res; 279 | } 280 | 281 | //Iterator 282 | std::vector YieldPairAndUnpair() { 283 | std::vector res; 284 | for(;it_ != id_inserts_.end();) { 285 | it_->second.Mate(); 286 | for (auto seg: it_->second.paired()) { 287 | 288 | res.push_back(seg); 289 | } 290 | for (auto bam: it_->second.inprogress()) { 291 | Segments tmp(1, bam); 292 | res.push_back(tmp); 293 | } 294 | ++it_; 295 | if (!res.empty()) break; 296 | } 297 | return res; 298 | } 299 | 300 | //Iterator 301 | std::vector YieldPair(bool (*Selector)(const Segments &, int k), int k) { 302 | /* 303 | * Always yield R1 and R2 in order 304 | */ 305 | std::vector res; 306 | for (; it_ != 
id_inserts_.end();) { 307 | it_->second.Mate(); 308 | bool found = false; 309 | for (auto seg: it_->second.paired()) { 310 | if (Selector(seg, k)) { 311 | res.push_back(seg); 312 | found = true; 313 | } 314 | } 315 | ++it_; 316 | if (found) { 317 | break; 318 | } 319 | } 320 | return res; 321 | } 322 | 323 | // Only if both reads of FR pair pass the filter 324 | // std::vector YieldFamily(bool (*Selector)(const Segments &)) { 325 | // std::vector forward_reverse_bams; 326 | // for (; family_it_ != start_end_2_rnames_.end();) { 327 | // Segments forwards; 328 | // Segments reverses; 329 | // for (auto id : family_it_->second) { 330 | // auto f = id_inserts_[id].forward_segments(); 331 | // auto r = id_inserts_[id].reverse_segments(); 332 | // if (!f.empty() && !r.empty()) { 333 | // forwards.insert(forwards.end(), f.begin(), f.end()); 334 | // reverses.insert(reverses.end(), r.begin(), r.end()); 335 | // } 336 | // } 337 | // ++family_it_; 338 | // if (Selector(forwards) && Selector(reverses)) { 339 | // forward_reverse_bams.push_back(forwards); 340 | // forward_reverse_bams.push_back(reverses); 341 | // break; 342 | // } 343 | // } 344 | // return forward_reverse_bams; 345 | // } 346 | 347 | }; 348 | } 349 | 350 | #endif 351 | -------------------------------------------------------------------------------- /bbcpputil/include/MAF.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/26/20. 3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_MAF_H_ 6 | #define CPPUTIL_INCLUDE_MAF_H_ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "StringUtils.h" 13 | 14 | namespace cpputil { 15 | class MAFReader { 16 | std::ifstream in_; 17 | std::map >> records_; 18 | bool isopen_ = false; 19 | 20 | public: 21 | MAFReader() = default; 22 | 23 | MAFReader(const std::string& maf) { 24 | Open(maf); 25 | } 26 | 27 | bool IsOpen() const { 28 | return isopen_; 29 | } 30 | 31 | bool Open(const std::string& maf) { 32 | isopen_ = true; 33 | in_ = std::ifstream(maf); 34 | string line; 35 | //get header 36 | getline(in_, line, '\n'); 37 | if (in_.eof()) { 38 | return false; 39 | } 40 | std::vector fields; 41 | split_by_char(line, '\t', fields); 42 | for(auto &s : fields) { 43 | std::transform(s.begin(), s.end(), s.begin(), 44 | [](unsigned char c) -> unsigned char { return std::toupper(c); }); 45 | } 46 | int chr_idx = std::distance(fields.begin(), std::find(fields.begin(), fields.end(), "CHROMOSOME")); 47 | int start_idx = std::distance(fields.begin(), std::find(fields.begin(), fields.end(), "START_POSITION")); 48 | int alt_idx = std::distance(fields.begin(), std::find(fields.begin(), fields.end(), "TUMOR_SEQ_ALLELE2")); 49 | 50 | while (true) { 51 | line.clear(); 52 | fields.clear(); 53 | getline(in_, line, '\n'); 54 | if (in_.eof()) { 55 | break; 56 | } 57 | split_by_char(line, '\t', fields); 58 | auto it = records_.find(fields[chr_idx]); 59 | if ( it == records_.end()) { 60 | std::map> key = {{std::stoi(fields[start_idx]), std::vector(1, fields[alt_idx])}}; 61 | records_[fields[chr_idx]] = key; 62 | } else { 63 | auto it2 = it->second.find(std::stoi(fields[start_idx])); 64 | if (it2 == it->second.end()) { 65 | it->second[std::stoi(fields[start_idx])] = std::vector(1, fields[alt_idx]); 66 | } else { 67 | it2->second.push_back(fields[alt_idx]); 68 | } 69 | } 70 | } 71 | return true; 72 | } 73 | 74 | bool var_exist(const std::string& contig, const int32_t pos, std::string alt="") const { 75 | auto it = records_.find(contig); 76 
| if (it != records_.end()) { 77 | auto it2 = it->second.find(pos + 1); 78 | if (it2 != it->second.end()) { 79 | if (alt.empty()) return true; 80 | if (std::find(it2->second.begin(), it2->second.end(), alt) != it2->second.end()) { 81 | return true; 82 | } 83 | } 84 | } 85 | return false; 86 | } 87 | 88 | void Print() const { 89 | for (auto const& rec : records_) { 90 | for (auto const& pos_alts : rec.second) { 91 | for (auto const& alt : pos_alts.second) { 92 | std::cerr << rec.first << "\t" << pos_alts.first << "\t" << alt << std::endl; 93 | } 94 | } 95 | } 96 | } 97 | }; 98 | } 99 | 100 | #endif //CPPUTIL_INCLUDE_MAF_H_ 101 | -------------------------------------------------------------------------------- /bbcpputil/include/Stats.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 7/1/21. 3 | // 4 | 5 | #ifndef BBCPPUTIL_INCLUDE_STATS_H_ 6 | #define BBCPPUTIL_INCLUDE_STATS_H_ 7 | #include 8 | #include 9 | namespace cpputil { 10 | 11 | inline int GetMode(const std::vector& array) { 12 | /* 13 | * Return the mode of an integer vector. Intended for short or medium sized arrays 14 | * whose value range (max - min) is not too large, since one histogram bin is allocated 15 | * per value in that range. 16 | */ 17 | assert(not array.empty()); 18 | const auto ret = std::minmax_element(begin(array), end(array)); 19 | std::vector hist(*ret.second - *ret.first + 1); 20 | for (const auto& ii: array) { 21 | ++hist[ii - *ret.first]; 22 | } 23 | return std::max_element(hist.begin(), hist.end()) - hist.begin() + *ret.first; 24 | } 25 | 26 | 27 | } 28 | #endif //BBCPPUTIL_INCLUDE_STATS_H_ 29 | -------------------------------------------------------------------------------- /bbcpputil/include/StringUtils.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/26/20. 
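
The string helpers in this header back the CSV/TSV parsing used throughout the suite (demux's library file, MAFReader's columns). As a quick illustration of their semantics, here is a minimal standalone sketch (split_like and its main are illustrative names, not part of the original sources); note that, like cpputil::split, it skips empty tokens, whereas split_by_char keeps them:

#include <iostream>
#include <string>
#include <vector>

// Mimics cpputil::split: tokens between any of the delimiter chars; empty tokens are skipped.
static std::vector<std::string> split_like(const std::string& s, const std::string& delims) {
  std::vector<std::string> out;
  std::string::size_type last = s.find_first_not_of(delims, 0);
  std::string::size_type pos = s.find_first_of(delims, last);
  while (pos != std::string::npos || last != std::string::npos) {
    out.push_back(s.substr(last, pos - last));
    last = s.find_first_not_of(delims, pos);
    pos = s.find_first_of(delims, last);
  }
  return out;
}

int main() {
  // "SampleName,IndexBarcode1,IndexBarcode2" -> three fields, as demux.cpp expects.
  for (const auto& f : split_like("SampleName,IndexBarcode1,IndexBarcode2", ","))
    std::cout << f << '\n';
}
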
3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_STRINTUTILS_H_ 6 | #define CPPUTIL_INCLUDE_STRINTUTILS_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace cpputil { 16 | inline void split_by_char(const std::string &s, char c, 17 | std::vector &v) { 18 | int i = 0; 19 | int j = s.find(c); 20 | 21 | while (j >= 0) { 22 | v.push_back(s.substr(i, j - i)); 23 | i = ++j; 24 | j = s.find(c, j); 25 | 26 | if (j < 0) { 27 | v.push_back(s.substr(i, s.length())); 28 | } 29 | } 30 | } 31 | 32 | inline std::vector split(const std::string& s, const std::string& delims) 33 | { 34 | std::vector result; 35 | std::string::size_type lastPos = s.find_first_not_of(delims, 0); 36 | std::string::size_type pos = s.find_first_of(delims, lastPos); 37 | while (std::string::npos != pos || std::string::npos != lastPos) { 38 | result.push_back(s.substr(lastPos, pos - lastPos)); 39 | lastPos = s.find_first_not_of(delims, pos); 40 | pos = s.find_first_of(delims, lastPos); 41 | } 42 | return result; 43 | } 44 | 45 | /* 46 | * https://stackoverflow.com/questions/9277906/stdvector-to-string-with-custom-delimiter 47 | * By Shadow2531 48 | */ 49 | template 50 | std::string join(const T& v, const std::string& delim) { 51 | std::ostringstream s; 52 | for (const auto& i : v) { 53 | if (&i != &v[0]) { 54 | s << delim; 55 | } 56 | s << i; 57 | } 58 | return s.str(); 59 | } 60 | 61 | inline double entropy(const std::string& dna_seq) { 62 | std::map cnt; 63 | for (const char& d : dna_seq) { 64 | cnt[d]++; 65 | } 66 | std::vector vec; 67 | std::vector p; 68 | int tot = 0; 69 | for (auto it : cnt) { 70 | tot += it.second; 71 | vec.push_back(it.second); 72 | } 73 | p.resize(vec.size()); 74 | std::transform(vec.begin(), vec.end(), p.begin(), [&tot](double x) {return x / tot;}); 75 | std::transform(p.begin(), p.end(), p.begin(), [](double x) {return x * log2(x);}); 76 | return -std::accumulate(p.begin(), p.end(), 0.0); 77 | } 78 | 79 | } 80 | 81 | #endif //CPPUTIL_INCLUDE_STRINTUTILS_H_ 82 | -------------------------------------------------------------------------------- /bbcpputil/include/TargetLayout.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/9/20. 
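
The entropy() helper at the end of StringUtils.h above computes the Shannon entropy of the base composition, -sum(p * log2 p), a standard low-complexity measure for DNA sequence. A worked check of the formula (a standalone sketch with illustrative names, not from the original sources): two bases at equal frequency give 1 bit, four give 2 bits, and a homopolymer gives 0.

#include <cmath>
#include <iostream>
#include <map>
#include <string>

// Same formula as cpputil::entropy, written long-hand for clarity.
static double shannon_entropy(const std::string& dna) {
  std::map<char, int> cnt;
  for (char c : dna) ++cnt[c];
  double h = 0.0;
  for (const auto& kv : cnt) {
    const double p = static_cast<double>(kv.second) / dna.size();
    h -= p * std::log2(p);
  }
  return h + 0.0;  // normalize -0.0 from the homopolymer case
}

int main() {
  std::cout << shannon_entropy("AACC") << '\n';  // 1
  std::cout << shannon_entropy("ACGT") << '\n';  // 2
  std::cout << shannon_entropy("AAAA") << '\n';  // 0
}
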
3 | // 4 | 5 | #ifndef CPPUTIL_INCLUDE_TARGETLAYOUT_H_ 6 | #define CPPUTIL_INCLUDE_TARGETLAYOUT_H_ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace cpputil{ 17 | 18 | class TargetLayout { 19 | size_t idx_ = 0; 20 | SeqLib::GenomicRegionVector ginvs_; 21 | 22 | bool _Load(const SeqLib::BamHeader& header, const std::string& bed_path) { 23 | SeqLib::GRC grc; 24 | bool ret = grc.ReadBED(bed_path, header); 25 | if (!ret) { 26 | throw std::runtime_error(bed_path + " cannot be read!"); 27 | } 28 | ginvs_ = grc.AsGenomicRegionVector(); 29 | std::cerr << "read " << ginvs_.size() << " regions\n"; 30 | return true; 31 | } 32 | 33 | public: 34 | TargetLayout() = default; 35 | TargetLayout(const SeqLib::BamHeader& header, const std::string& bed_path) { 36 | _Load(header, bed_path); 37 | } 38 | 39 | size_t NumRegion() const { 40 | return ginvs_.size(); 41 | } 42 | 43 | decltype(auto) operator[] (int i) const{ 44 | return (ginvs_.at(i)); 45 | } 46 | 47 | bool NextRegion(SeqLib::GenomicRegion & gr) { 48 | if (idx_ < ginvs_.size()) { 49 | gr = ginvs_[idx_]; 50 | ++idx_; 51 | return true; 52 | } else { 53 | return false; 54 | } 55 | } 56 | 57 | 58 | }; 59 | 60 | } 61 | #endif //CPPUTIL_INCLUDE_TARGETLAYOUT_H_ 62 | -------------------------------------------------------------------------------- /codec.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 5/10/21. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #ifndef PACKAGE_VERSION 11 | #define PACKAGE_VERSION "1.1.5" 12 | #endif 13 | 14 | int codec_demux(int argc, char **argv); 15 | int codec_trim(int argc, char **argv); 16 | int codec_filter(int argc, char **argv); 17 | int codec_accuracy(int argc, char **argv); 18 | //int codec_filter(int argc, char ** argv); 19 | 20 | 21 | int print_help() 22 | { 23 | std::cout<< "---------------------------------------------------\n"; 24 | std::cout<< "Program: codec (concatenating original duplex for error correction analysis suite)\n"; 25 | std::cout<< "Version: " << PACKAGE_VERSION << std::endl; 26 | std::cout<< "Usage: codec [options]\n"; 27 | std::cout<< "Common command: demux de-multiplexing.\n"; 28 | std::cout<< " trim trim CODEC adapter sequence.\n"; 29 | std::cout<< " call single fragment mutation caller.\n"; 30 | std::cout<< "---------------------------------------------------\n"; 31 | std::cout<< "Optional command:\n"; 32 | std::cout<< " filter Filter duplex reads on base and fragment levels.\n"; 33 | std::cout<< "---------------------------------------------------\n"; 34 | std::cout<< "Contact: ruolin@broadinstitute.org. " 35 | "Copyright: bloodbiopsy@broadinstitute.org 2020-2021. 
\n"; 36 | return 1; 37 | } 38 | 39 | int main(int argc, char *argv[]) { 40 | int ret; 41 | if (argc < 2) return print_help(); 42 | else if (strcmp(argv[1], "demux") == 0) ret = codec_demux(argc-1, argv+1); 43 | else if (strcmp(argv[1], "trim") == 0) ret = codec_trim(argc-1, argv+1); 44 | else if (strcmp(argv[1], "filter") == 0) ret = codec_filter(argc-1, argv+1); 45 | else if (strcmp(argv[1], "call") == 0) ret = codec_accuracy(argc-1, argv+1); 46 | // else if (strcmp(argv[1], "filter") == 0) ret = codec_filter(argc-1, argv+1); 47 | else { 48 | std::cerr << "[codec] unrecongnized command " << argv[1] << std::endl; 49 | print_help(); 50 | ret = 1; 51 | } 52 | return ret; 53 | } 54 | -------------------------------------------------------------------------------- /demux.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 3/13/20. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Index.h" 10 | #include "Files.h" 11 | 12 | using std::string; 13 | struct DemuxOptions { 14 | string library_file; 15 | string fastq1; 16 | string fastq2; 17 | string outprefix = "./test"; 18 | string reference; 19 | int index_begin = 3; 20 | int index_len = 18; 21 | int max_ed = 2; 22 | int min_readlen = 30; 23 | bool include_non_pf = false; 24 | bool verbose = false; 25 | bool out_unmatch = false; 26 | bool out_hopped = false; 27 | bool count_PF = false; 28 | }; 29 | 30 | 31 | static struct option demux_long_options[] = { 32 | {"library_param", required_argument, 0, 'p'}, 33 | {"q1", required_argument, 0, '1'}, 34 | {"q2", required_argument, 0, '2'}, 35 | {"outprefix", required_argument , 0, 'o'}, 36 | {"ref", required_argument , 0, 'r'}, 37 | {"index_begin", required_argument, 0, 'b'}, 38 | {"index_len", required_argument, 0, 'l'}, 39 | {"min_read_len", required_argument, 0, 'm'}, 40 | {"max_ed", required_argument, 0, 'e'}, 41 | {"verbose", no_argument , 0, 'v'}, 42 | {"out_unmatch", no_argument , 0, 'u'}, 43 | {"out_hopped", no_argument , 0, 'h'}, 44 | {"include_non_pf", no_argument , 0, 'i'}, 45 | {"count_pf", no_argument , 0, 'c'}, 46 | {0,0,0,0} 47 | }; 48 | 49 | const char* demux_short_options = "p:1:2:o:r:vib:l:e:cuh:m:"; 50 | 51 | void codec_demux_usage() 52 | { 53 | std::cerr<< "---------------------------------------------------\n"; 54 | std::cerr<< "Usage: codec demux [options]\n"; 55 | std::cerr<< "General Options:\n"; 56 | std::cerr<< "-p/--library_param, Sample, barcode mapping in CSV format. Header must be \"SampleName,IndexBarcode1,IndexBarcode2\"\n"; 57 | std::cerr<< "-1/--q1, Input read1\n"; 58 | std::cerr<< "-2/--q2, Input read2\n"; 59 | std::cerr<< "-b/--index_begin, The read position where the index begins (Default: 3) \n"; 60 | std::cerr<< "-l/--index_len, Index length (Default: 18)\n"; 61 | std::cerr<< "-m/--min_read_len, Minimum read length (Default: 30)\n"; 62 | std::cerr<< "-e/--max_ed, Maximum edit distance allowed as a match (Default: 2)\n"; 63 | std::cerr<< "-o/--outprefix, Output path, e.g., /tmp/test\n"; 64 | std::cerr<< "-r/--ref, Reference genome fasta file, for judging index hopping\n"; 65 | std::cerr<< "-i/--include_non_pf, Include non-pass filter reads\n"; 66 | std::cerr<< "-v/--verbose, Print verbose information\n"; 67 | std::cerr<< "-c/--count_pf, Just count number of pass filter pairs. 
Do not do anything else\n"; 68 | std::cerr<< "-u/--out_unmatch, Output reads having no matching barcodes\n"; 69 | std::cerr<< "-h/--out_hopped, Output hopped reads\n"; 70 | } 71 | 72 | int demux_parse_options(int argc, char* argv[], DemuxOptions& opt) { 73 | int option_index; 74 | int next_option = 0; 75 | do { 76 | next_option = getopt_long(argc, argv, demux_short_options, demux_long_options, &option_index); 77 | switch (next_option) { 78 | case -1:break; 79 | case 'p': 80 | opt.library_file = optarg; 81 | break; 82 | case '1': 83 | opt.fastq1 = optarg; 84 | break; 85 | case '2': 86 | opt.fastq2 = optarg; 87 | break; 88 | case 'b': 89 | opt.index_begin = atoi(optarg); 90 | break; 91 | case 'l': 92 | opt.index_len = atoi(optarg); 93 | break; 94 | case 'm': 95 | opt.min_readlen = atoi(optarg); 96 | break; 97 | case 'e': 98 | opt.max_ed = atoi(optarg); 99 | break; 100 | case 'o': 101 | opt.outprefix = optarg; 102 | break; 103 | case 'r': 104 | opt.reference = optarg; 105 | break; 106 | case 'i': 107 | opt.include_non_pf = true; 108 | break; 109 | case 'v': 110 | opt.verbose = true; 111 | break; 112 | case 'c': 113 | opt.count_PF = true; 114 | break; 115 | case 'h': 116 | opt.out_hopped = true; 117 | break; 118 | case 'u': 119 | opt.out_unmatch = true; 120 | break; 121 | default:codec_demux_usage(); 122 | return 1; 123 | } 124 | } while (next_option != -1); 125 | 126 | return 0; 127 | } 128 | 129 | int codec_demux(int argc, char ** argv) { 130 | DemuxOptions opt; 131 | int parse_ret = demux_parse_options(argc, argv, opt); 132 | if (parse_ret) return 1; 133 | if (argc == 1) { 134 | codec_demux_usage(); 135 | return 1; 136 | } 137 | //AffineGap ag("CACTGATCGTCAGCTGAC", "TGAATCTGAGGCACTGTA"); 138 | // AffineGap ag("AGT", "TGAGTT"); 139 | // ag.PrintAllPaths(); 140 | // exit(0); 141 | 142 | if (not cpputil::FileExist(opt.library_file)) { 143 | std::cerr << opt.library_file << " does not exist\n"; 144 | return 1; 145 | } 146 | CDS::IndexBarcode ibmatcher(opt.library_file, opt.outprefix, opt.max_ed, opt.out_unmatch, opt.out_hopped, opt.verbose); 147 | if (!opt.reference.empty()) { 148 | ibmatcher.LoadBwa(opt.reference); 149 | } 150 | cpputil::FastxReader R1_reader(opt.fastq1); 151 | cpputil::FastxReader R2_reader(opt.fastq2); 152 | cpputil::FastxRecord read1; 153 | cpputil::FastxRecord read2; 154 | uint64_t total_pf_reads = 0; 155 | uint64_t total_reads = 0; 156 | while (R1_reader.yield(read1)) { 157 | R2_reader.yield(read2); 158 | ++total_reads; 159 | if (not opt.include_non_pf and (read1.is_filtered() or read2.is_filtered())) continue; 160 | if (read1.seq.length() < opt.min_readlen or read2.seq.length() < opt.min_readlen) continue; 161 | ++total_pf_reads; 162 | //if (opt.count_PF) continue; 163 | assert(read1.name() == read2.name()); 164 | ibmatcher.DecodePair(read1, read2, opt.index_begin, opt.index_len); 165 | } 166 | // if (opt.count_PF) { 167 | uint64_t total_matched = ibmatcher.total_matched(); 168 | for (unsigned i = 0; i < ibmatcher.samples().size(); ++i) { 169 | string s = ibmatcher.samples()[i]; 170 | uint64_t n = ibmatcher.nmatched()[i]; 171 | std::cout << "#sample, matched, matched%: " << s << ", " << n << ", " << (double) n / total_matched << std::endl; 172 | } 173 | std::cout << "#total, #PF, #matched, matched%: " << total_reads << ", " << total_pf_reads << ", " << total_matched << ", " << (double) total_matched / total_pf_reads << std::endl; 174 | // } 175 | return 0; 176 | } 177 | -------------------------------------------------------------------------------- /include/BamIO.h: 
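
BamIO.h below builds unmapped BAM records by hand, which means packing each base into htslib's 4-bit code (A=1, C=2, G=4, T=8, anything else 15), two bases per byte with the first base in the high nibble. The following standalone sketch (illustrative only; the real CreateUBamRecord additionally sets qname, flags, qualities, and tags) shows the same bit manipulation and verifies it by unpacking:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static uint8_t base_code(char b) {
  switch (b) {
    case 'A': return 1;
    case 'C': return 2;
    case 'G': return 4;
    case 'T': return 8;
    default:  return 15;  // N and anything else
  }
}

int main() {
  const std::string seq = "ACGTN";
  std::vector<uint8_t> packed((seq.size() + 1) / 2, 0);
  for (size_t i = 0; i < seq.size(); ++i) {
    const unsigned shift = (~i & 1) << 2;  // 4 for even i (high nibble), 0 for odd i
    packed[i >> 1] &= static_cast<uint8_t>(~(0xF << shift));           // clear the target nibble
    packed[i >> 1] |= static_cast<uint8_t>(base_code(seq[i]) << shift);
  }
  const char* nt16 = "=ACMGRSVTWYHKDBN";  // standard BAM nibble-to-base table
  for (size_t i = 0; i < seq.size(); ++i)
    std::cout << nt16[(packed[i >> 1] >> ((~i & 1) << 2)) & 0xF];
  std::cout << '\n';  // prints ACGTN
}
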
-------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 7/13/20. 3 | // 4 | 5 | #ifndef ADAPTERTRIM_INCLUDE_BAMIO_H_ 6 | #define ADAPTERTRIM_INCLUDE_BAMIO_H_ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "BamRecordExt.h" 12 | #include "DNAUtils.h" 13 | #include "FastxRecord.h" 14 | 15 | namespace cpputil { 16 | 17 | class UnMappedBamWriter { 18 | SeqLib::BamWriter bam_writer_; 19 | std::string sample_; 20 | std::string rgid_; 21 | 22 | const SeqLib::BamRecord CreateUBamRecord(const ExtFastxRecord& fxr, bool first_read) { 23 | SeqLib::BamRecord out; 24 | out.init(); 25 | bam1_t* b = out.raw(); 26 | b->core.tid = -1; 27 | b->core.pos = -1; 28 | b->core.qual = 0; 29 | b->core.flag = first_read ? 77: 141; 30 | 31 | // set dumy mate 32 | b->core.mtid = -1; 33 | b->core.mpos = -1; 34 | b->core.isize = 0; 35 | 36 | // allocate all the data 37 | b->core.l_qname = fxr.name().length() + 1; 38 | b->core.l_qseq = fxr.seq.length(); //(seq.length()>>1) + seq.length() % 2; // 4-bit encoding 39 | b->l_data = b->core.l_qname + ((b->core.l_qseq+1)>>1) + (b->core.l_qseq); 40 | b->data = (uint8_t*)malloc(b->l_data); 41 | 42 | // allocate the qname 43 | memcpy(b->data, fxr.name().c_str(), fxr.name().length() + 1); 44 | 45 | // allocate the sequence 46 | uint8_t* m_bases = b->data + b->core.l_qname; 47 | 48 | // TODO move this out of bigger loop 49 | int slen = fxr.seq.length(); 50 | for (int i = 0; i < slen; ++i) { 51 | // bad idea but works for now 52 | uint8_t base = 15; 53 | if (fxr.seq.at(i) == 'A') 54 | base = 1; 55 | else if (fxr.seq.at(i) == 'C') 56 | base = 2; 57 | else if (fxr.seq.at(i) == 'G') 58 | base = 4; 59 | else if (fxr.seq.at(i) == 'T') 60 | base = 8; 61 | 62 | m_bases[i >> 1] &= ~(0xF << ((~i & 1) << 2)); ///< zero out previous 4-bit base encoding 63 | m_bases[i >> 1] |= base << ((~i & 1) << 2); ///< insert new 4-bit base encoding 64 | 65 | } 66 | if (!fxr.qual.empty() && fxr.qual.length() != (unsigned) b->core.l_qseq) 67 | throw std::invalid_argument("New quality score should be same as seq length"); 68 | 69 | // length of qual is always same as seq. 
If empty qual, just set first bit of qual to 0 70 | if (not fxr.qual.empty()) { 71 | char * q = strdup(fxr.qual.data()); 72 | for (size_t i = 0; i < fxr.qual.length(); ++i) 73 | q[i] -= 33; 74 | memcpy(bam_get_qual(b), q, fxr.qual.length()); // dont copy /0 terminator 75 | free(q); 76 | } 77 | 78 | out.AddIntTag("td", fxr.rc_adpt); 79 | if (not fxr.umi.empty()) { 80 | out.AddZTag("RX", fxr.umi.seq()); 81 | out.AddZTag("QX", fxr.umi.qual()); 82 | } 83 | if (not fxr.adap5.empty()) { 84 | out.AddZTag("s5", fxr.adap5.seq()); 85 | out.AddZTag("q5", fxr.adap5.qual()); 86 | } 87 | if (not fxr.adap3.empty()) { 88 | out.AddZTag("s3", fxr.adap3.seq()); 89 | out.AddZTag("q3", fxr.adap3.qual()); 90 | } 91 | if (not fxr.trim3.empty()) { 92 | out.AddZTag("sl", fxr.trim3.seq()); 93 | out.AddZTag("ql", fxr.trim3.qual()); 94 | } 95 | if (not rgid_.empty()) { 96 | out.AddZTag("RG", rgid_); 97 | } 98 | if (not fxr.barcode.empty()) { 99 | out.AddZTag("bc", fxr.barcode); 100 | } 101 | if (fxr.tm != 255) { 102 | out.AddIntTag("tm", fxr.tm); 103 | } 104 | return out; 105 | } 106 | 107 | const SeqLib::BamRecord CreateUBamRecord(const SeqLib::BamRecord &bam_template, std::string seq, std::string qual, bool single_end) { 108 | // seq and qual are assumed to be PLUS strand 109 | // if (not ProperPair(bam_template)) { 110 | // throw std::runtime_error("not a proper pair"); 111 | // } 112 | SeqLib::BamRecord out; 113 | out.init(); 114 | SeqLib::Cigar c; 115 | out.SetCigar(c); 116 | if (bam_template.ReverseFlag()) { 117 | reverse_complement(seq); 118 | reverse(qual); 119 | } 120 | out.SetQname(bam_template.Qname()); 121 | if (single_end) { 122 | out.raw()->core.flag = 4; 123 | } 124 | else { 125 | out.raw()->core.flag = bam_template.FirstFlag() ? 77: 141; 126 | } 127 | out.SetSequence(seq); 128 | out.SetQualities(qual, 33); 129 | out.SetChrID(-1); 130 | out.SetChrIDMate(-1); 131 | out.SetPosition(-1); 132 | out.SetPositionMate(-1); 133 | int32_t cD; 134 | std::string rg; 135 | int32_t cM; 136 | std::string mi; 137 | std::string rx; 138 | if (bam_template.GetIntTag("cD", cD)) { 139 | out.AddIntTag("cD", cD); 140 | } 141 | if (bam_template.GetZTag("RG", rg)) { 142 | out.AddZTag("RG", rg); 143 | } 144 | if (bam_template.GetIntTag("cM", cM)) { 145 | out.AddIntTag("cM", cM); 146 | } 147 | if (bam_template.GetZTag("MI", mi)) { 148 | out.AddZTag("MI", mi); 149 | } 150 | if (bam_template.GetZTag("RX", rx)) { 151 | out.AddZTag("RX", rx); 152 | } 153 | return out; 154 | } 155 | 156 | public: 157 | UnMappedBamWriter() = default; 158 | UnMappedBamWriter(std::string path, std::string rgid, std::string sample) : sample_(sample), rgid_(rgid) { 159 | std::string header_str = "@HD\tVN:1.5\tGO:none\n"; 160 | header_str += "@RG\tID:" + rgid_ + "\tSM:" + sample_ + "\n"; 161 | SeqLib::BamHeader bh(header_str); 162 | bam_writer_.Open(path); 163 | bam_writer_.SetHeader(bh); 164 | bam_writer_.WriteHeader(); 165 | } 166 | 167 | UnMappedBamWriter(std::string path, const SeqLib::BamHeader& tpl) { 168 | std::istringstream iss(tpl.AsString()); 169 | std::string line; 170 | std::string newhdr; 171 | while (std::getline(iss, line, '\n')) { 172 | if (line.length() == 0 || line.at(0) != '@') break; 173 | std::string t = line.substr(0, 3); 174 | if ( t == "@HD" || t == "@RG") { 175 | newhdr += line +"\n"; 176 | } 177 | } 178 | SeqLib::BamHeader bh(newhdr); 179 | bam_writer_.Open(path); 180 | bam_writer_.SetHeader(bh); 181 | bam_writer_.WriteHeader(); 182 | } 183 | 184 | void Open(std::string path, std::string rgid, std::string sample) { 185 | 
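// Note: Open() repeats the minimal-header construction from the two-argument
// constructor above: a bare @HD line plus a single @RG line carrying the
// read-group id and sample name, which is all the downstream pipeline needs
// from an unmapped BAM.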
std::string header_str = "@HD\tVN:1.5\tGO:none\n"; 186 | header_str += "@RG\tID:" + rgid + "\tSM:" + sample + "\n"; 187 | SeqLib::BamHeader bh(header_str); 188 | bam_writer_.Open(path); 189 | bam_writer_.SetHeader(bh); 190 | bam_writer_.WriteHeader(); 191 | } 192 | 193 | // void Init(std::string path, std::string readgroup, std::string sample) { 194 | // header_str += "@RG\tID:" 195 | // header_str += readgroup; 196 | // header_str += "\tSM:"; 197 | // header_str += sample; 198 | // header_str += "\n"; 199 | // 200 | // } 201 | 202 | ~UnMappedBamWriter() { 203 | bam_writer_.Close(); 204 | } 205 | bool IsOpen() { 206 | return bam_writer_.IsOpen(); 207 | } 208 | 209 | 210 | void WriteRecord(const SeqLib::BamRecord & R1, const SeqLib::BamRecord& R2, std::string seq1, std::string seq2, std::string qual1, std::string qual2) { 211 | // Write paired end records 212 | // R1, R2 must be strictly First in pair and, Second in pair 213 | if (not cpputil::ProperPair(R1) || not cpputil::ProperPair(R2)) { 214 | throw std::runtime_error("not a proper pair"); 215 | } 216 | auto r1 = CreateUBamRecord(R1, seq1, qual1, false); 217 | auto r2 = CreateUBamRecord(R2, seq2, qual2, false); 218 | //std::cout << out << std::endl; 219 | bool status1 = bam_writer_.WriteRecord(r1); 220 | bool status2 = bam_writer_.WriteRecord(r2); 221 | if (not status1 or not status2) { 222 | std::cerr << "cannot write bam record " << R1.Qname() << std::endl; 223 | } 224 | } 225 | 226 | //simply strip off mapping information and output ubam 227 | //this is used for intermolecular bams 228 | void WriteRecord(const SeqLib::BamRecord & R1, const SeqLib::BamRecord& R2) { 229 | auto r1 = CreateUBamRecord(R1, R1.Sequence(), R1.QualitySequence(), false); 230 | auto r2 = CreateUBamRecord(R2, R2.Sequence(), R2.QualitySequence(), false); 231 | bool status1 = bam_writer_.WriteRecord(r1); 232 | bool status2 = bam_writer_.WriteRecord(r2); 233 | if (not status1 or not status2) { 234 | std::cerr << "cannot write bam record " << R1.Qname() << std::endl; 235 | } 236 | } 237 | 238 | void WriteRecord(const SeqLib::BamRecord & R1, std::string seq, std::string qual) { 239 | // Write Single end record 240 | // R1, R2 must be strictly First in pair and, Second in pair 241 | if (not cpputil::ProperPair(R1)){ 242 | throw std::runtime_error("not a proper pair"); 243 | } 244 | auto out = CreateUBamRecord(R1, seq, qual, true); 245 | bool status = bam_writer_.WriteRecord(out); 246 | if (not status) { 247 | std::cerr << "cannot write bam record " << R1.Qname() << std::endl; 248 | } 249 | } 250 | 251 | void WriteRecord(ExtFastxRecord& fxr, bool first_read) { 252 | auto out = CreateUBamRecord(fxr, first_read); 253 | bool status = bam_writer_.WriteRecord(out); 254 | if (not status) { 255 | std::cerr << "cannot write bam record " << fxr.broad_id() << std::endl; 256 | } 257 | } 258 | }; 259 | 260 | } 261 | #endif //ADAPTERTRIM_INCLUDE_BAMIO_H_ 262 | -------------------------------------------------------------------------------- /include/Index.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 9/18/20. 
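
The demultiplexer in this header matches an observed index against every expected barcode while tracking the best and second-best edit distances; a sample is assigned when the best distance is within max_ed, or, as a rescue, when it is exactly max_ed+1 while the runner-up is more than max_ed+3 away (i.e., the call is still unambiguous). A standalone sketch of that accept/reject rule, using plain Hamming distance in place of the SSW alignment (hamming and match_index are illustrative names, not from the original sources):

#include <iostream>
#include <limits>
#include <string>
#include <vector>

static int hamming(const std::string& a, const std::string& b) {
  int d = 0;
  for (size_t i = 0; i < a.size() && i < b.size(); ++i) d += (a[i] != b[i]);
  return d;
}

// Mirrors IndexBarcode::MatchIndex's best/second-best logic; returns index or -1.
static int match_index(const std::string& obs, const std::vector<std::string>& barcodes, int max_ed) {
  int best = std::numeric_limits<int>::max(), second = best, best_i = -1;
  for (size_t i = 0; i < barcodes.size(); ++i) {
    const int d = hamming(obs, barcodes[i]);
    if (d < best) { second = best; best = d; best_i = static_cast<int>(i); }
    else if (d < second) { second = d; }
  }
  if (best <= max_ed) return best_i;
  if (best == max_ed + 1 && second > max_ed + 3) return best_i;  // unambiguous rescue
  return -1;
}

int main() {
  std::vector<std::string> bc = {"AACCGGTT", "TTGGCCAA"};
  std::cout << match_index("AACCGGTA", bc, 2) << '\n';  // 0 (one mismatch)
  std::cout << match_index("ACGTACGT", bc, 2) << '\n';  // -1 (too far from both)
}
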
3 | // 4 | 5 | #ifndef ADAPTERTRIM_INCLUDE_INDEX_H_ 6 | #define ADAPTERTRIM_INCLUDE_INDEX_H_ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "BamRecordExt.h" 18 | #include "StringUtils.h" 19 | #include "FastxIO.h" 20 | #include "Gotoh.h" 21 | #include "omp.h" 22 | 23 | namespace CDS { 24 | static void PrintAlignment(const StripedSmithWaterman::Alignment& alignment){ 25 | std::cout << "===== SSW result =====" << std::endl; 26 | std::cout << "Best Smith-Waterman score:\t" << alignment.sw_score << std::endl 27 | << "Next-best Smith-Waterman score:\t" << alignment.sw_score_next_best << std::endl 28 | << "Reference start:\t" << alignment.ref_begin << std::endl 29 | << "Reference end:\t" << alignment.ref_end << std::endl 30 | << "Query start:\t" << alignment.query_begin << std::endl 31 | << "Query end:\t" << alignment.query_end << std::endl 32 | << "Next-best reference end:\t" << alignment.ref_end_next_best << std::endl 33 | << "Number of mismatches:\t" << alignment.mismatches << std::endl 34 | << "Cigar: " << alignment.cigar_string << std::endl; 35 | std::cout << "======================" << std::endl; 36 | } 37 | int SSW(const std::string &ref, const std::string &query, bool verbose=false) { 38 | StripedSmithWaterman::Aligner aligner(2,0,2,1); 39 | //StripedSmithWaterman::Aligner aligner; 40 | 41 | StripedSmithWaterman::Filter filter; 42 | StripedSmithWaterman::Alignment alignment; 43 | aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment, 15); 44 | int ed = alignment.mismatches + alignment.query_begin + query.length() - alignment.query_end - 1; 45 | if (verbose) { 46 | PrintAlignment(alignment); 47 | std::cout << "edit: " << ed << std::endl; 48 | } 49 | return ed; 50 | } 51 | 52 | 53 | bool IsIntermol(const SeqLib::BWAWrapper& bwa, 54 | const std::string& name1, const std::string& seq1, const std::string& name2, const std::string& seq2) { 55 | SeqLib::BamRecordVector read1_bam; 56 | SeqLib::BamRecordVector read2_bam; 57 | bwa.AlignSequence(seq1, name1, read1_bam, false, -1, 0); 58 | bwa.AlignSequence(seq2, name2, read2_bam, false, -1, 0); 59 | if (read1_bam.empty() and read2_bam.empty()) { 60 | return false; 61 | } 62 | if (read1_bam.empty() or read2_bam.empty()) { 63 | return true; 64 | } 65 | // if (verbose) { 66 | // std::cerr << read1_bam[0] << std::endl; 67 | // std::cerr << read2_bam[0] << std::endl; 68 | // } 69 | auto is = cpputil::InsertSize(read1_bam[0], read2_bam[0]); 70 | if (is > 1000 || is == 0) return true; 71 | else return false; 72 | } 73 | 74 | class IndexBarcode { 75 | std::vector index1s_; 76 | std::vector index2s_; 77 | std::vector snames_; 78 | std::vector nmatched_; 79 | std::vector fq1_writers_; 80 | std::vector fq2_writers_; 81 | cpputil::FastqWriter unkfq1_writer_; 82 | cpputil::FastqWriter unkfq2_writer_; 83 | cpputil::FastqWriter hopfq1_writer_; 84 | cpputil::FastqWriter hopfq2_writer_; 85 | std::ifstream file_; 86 | SeqLib::BWAWrapper bwa_; 87 | int max_ed_; 88 | bool out_unmatched_; 89 | bool out_hopped_; 90 | bool verbose_; 91 | 92 | // static const int INDEX_START = 3; 93 | // static const int INDEX_LEN = 18; 94 | 95 | int MatchIndex (const std::string& seq, const std::string& qual, const std::vector& indexes, int& nm) { 96 | int lowest_nm = std::numeric_limits::max(); 97 | int second_lowest_nm = std::numeric_limits::max(); 98 | int best_idx = 0; 99 | for (unsigned i = 0; i < indexes.size(); ++i) { 100 | int s = SSW(seq, indexes[i]); 101 | 
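// The score s is an effective semi-global edit distance: SSW() above adds the
// alignment's mismatches to the lengths of the unaligned query prefix and
// suffix, so barcode bases clipped off the local alignment still count as errors.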
//AffineGap ag(seq, indexes[i]); 102 | //Alignment align(seq, indexes[i], ag.Path()); 103 | //int s = align.NM(); 104 | if (s < lowest_nm) { 105 | second_lowest_nm = lowest_nm; 106 | lowest_nm = s; 107 | best_idx = i; 108 | } else if (s < second_lowest_nm) { 109 | second_lowest_nm = s; 110 | } 111 | } 112 | nm = lowest_nm; 113 | if (lowest_nm <= max_ed_) { 114 | return best_idx; 115 | } 116 | if (lowest_nm == max_ed_ + 1 && second_lowest_nm > max_ed_ + 3) { 117 | return best_idx; 118 | } 119 | return -1; 120 | } 121 | 122 | public: 123 | IndexBarcode (const std::string& index_file, const std::string& outprefix, const int max_ed, bool out_unmatched, bool out_hopped, bool v): 124 | file_(index_file), max_ed_(max_ed), out_unmatched_(out_unmatched), out_hopped_(out_hopped), verbose_(v) 125 | { 126 | std::string header; 127 | std::string line; 128 | std::getline(file_, header); 129 | auto colnames = cpputil::split(header, ","); 130 | if (colnames.size() != 3 || colnames[0] != "SampleName" || colnames[1] != "IndexBarcode1" || colnames[2] != "IndexBarcode2") { 131 | throw std::runtime_error("Invalid index file\n Format required as three comma-delimited columns with header SampleName,IndexBarcode1,IndexBarcode2"); 132 | } 133 | if (out_unmatched) { 134 | unkfq1_writer_.open(outprefix + ".unmatched.1.fastq.gz"); 135 | unkfq2_writer_.open(outprefix + ".unmatched.2.fastq.gz"); 136 | } 137 | if (out_unmatched) { 138 | hopfq1_writer_.open(outprefix + ".hopped.1.fastq.gz"); 139 | hopfq2_writer_.open(outprefix + ".hopped.2.fastq.gz"); 140 | } 141 | std::set unique_sids; 142 | while(std::getline(file_, line)) { 143 | std::cerr << line << std::endl; 144 | auto fields = cpputil::split(line, ","); 145 | if (unique_sids.find(fields[0]) == unique_sids.end()) { 146 | unique_sids.insert(fields[0]); 147 | snames_.push_back(fields[0]); 148 | index1s_.push_back(fields[1]); 149 | index2s_.push_back(fields[2]); 150 | fq1_writers_.emplace_back(outprefix + "." + fields[0] + ".1.fastq.gz"); 151 | fq2_writers_.emplace_back(outprefix + "." + fields[0] + ".2.fastq.gz"); 152 | } else { 153 | std::cerr << "Warning: duplicated sample name in library_params. 
Ignore \"" << line << "\"\n"; 154 | } 155 | } 156 | nmatched_.resize(snames_.size(), 0); 157 | // Print output header 158 | if (verbose_) { 159 | std::cout << "id\t" 160 | "observed_1\t" 161 | "barcode_1\t" 162 | "nm1\t" 163 | "observed_2\t" 164 | "barcode_2\t" 165 | "nm2\t" 166 | "sample_1\t" 167 | "sample_2\t" 168 | "matched\t" 169 | "conflicted\t" 170 | "hopped" << std::endl; 171 | } 172 | } 173 | 174 | void LoadBwa(const std::string &refgenome) { 175 | std::cerr << "loading index " << refgenome << std::endl; 176 | bwa_.LoadIndex(refgenome); 177 | std::cerr << "finished load index " << refgenome << std::endl; 178 | } 179 | 180 | std::pair ExtractIndex(const cpputil::FastxRecord& read, int index_begin, int index_len) { 181 | std::string s1 = read.seq.substr(index_begin, index_len); 182 | std::string q1 = read.qual.substr(index_begin, index_len); 183 | return std::make_pair(s1, q1); 184 | } 185 | 186 | void DecodePair(const cpputil::FastxRecord& r1, const cpputil::FastxRecord& r2, int index_begin, int index_len) { 187 | std::string ob1, qual1, ob2, qual2; 188 | std::tie(ob1, qual1) = ExtractIndex(r1, std::max(index_begin-2, 0), index_len+2); 189 | std::tie(ob2, qual2) = ExtractIndex(r2, std::max(index_begin-2, 0), index_len+2); 190 | int nm1, nm2; 191 | int idx1 = MatchIndex(ob1, qual1, index1s_, nm1); 192 | int idx2 = MatchIndex(ob2, qual2, index2s_, nm2); 193 | std::string r1b = idx1 == -1 ? "" : index1s_[idx1]; 194 | std::string r2b = idx2 == -1 ? "" : index2s_[idx2]; 195 | std::string r1s = idx1 == -1 ? "" : snames_[idx1]; 196 | std::string r2s = idx2 == -1 ? "" : snames_[idx2]; 197 | std::string match; 198 | std::string conflict = "0"; 199 | if (idx1 == -1 || idx2 == -1 || idx1 != idx2 ) { 200 | if (out_unmatched_) { 201 | unkfq1_writer_.Write(r1.id, r1.seq, r1.qual); 202 | unkfq2_writer_.Write(r2.id, r2.seq, r2.qual); 203 | } 204 | match = "0"; 205 | if (idx1 != -1 && idx2 != -1 && idx1 != idx2) conflict = "1"; 206 | } else { 207 | size_t stop = r1.id.find_last_of(':'); 208 | fq1_writers_[idx1].Write(r1.id.substr(0, stop) + ":" + r1b, r1.seq, r1.qual); 209 | fq2_writers_[idx2].Write(r2.id.substr(0, stop) + ":" + r2b, r2.seq, r2.qual); 210 | match = "1"; 211 | ++nmatched_[idx1]; 212 | } 213 | std::string hopped = "0"; 214 | if (conflict == "1" and !bwa_.IsEmpty()) { 215 | hopped = IsIntermol(bwa_, r1.name(), r1.seq.substr(index_begin + index_len + 1), 216 | r2.name(), r2.seq.substr(index_begin + index_len + 1)) ? 
"0" : "1"; 217 | if (out_hopped_ && hopped == "1") { 218 | hopfq1_writer_.Write(r1.id, r1.seq, r1.qual); 219 | hopfq2_writer_.Write(r2.id, r2.seq, r2.qual); 220 | } 221 | } 222 | if (verbose_) { 223 | std::cout << r1.name() << "\t" << ob1 << "\t" << r1b << "\t" << nm1 << "\t" << ob2 << "\t" << \ 224 | r2b << "\t" << nm2 << "\t" << r1s << "\t" << r2s << "\t" 225 | << match << "\t" << conflict << "\t" << hopped << "\n"; 226 | } 227 | } 228 | uint64_t total_matched() const {return std::accumulate(nmatched_.begin(), nmatched_.end(), (uint64_t) 0);} 229 | decltype(auto) samples() const {return snames_;} 230 | decltype(auto) nmatched() const {return nmatched_;} 231 | }; 232 | 233 | } 234 | 235 | 236 | #endif //ADAPTERTRIM_INCLUDE_INDEX_H_ 237 | -------------------------------------------------------------------------------- /msi/Snakefile: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | metadata_file = config["input_meta"] 3 | metadata = pd.read_csv(metadata_file, sep="\t").set_index("pair") 4 | nparallel = config['nparallel'] if 'nparallel' in config else 40 5 | bed="" 6 | hg19="" 7 | gnomad_vcf="" 8 | workdir: config["cwd"] 9 | bedtools = "bedtools" 10 | 11 | rule all: 12 | input: 13 | expand("result/{pair}.{type}.filtered", pair = metadata.index, type=["all", "msi"]) 14 | 15 | rule SplitBed: 16 | input: 17 | bed 18 | params: 19 | nsplit = nparallel, 20 | prefix = "tmp/split_region_" 21 | output: 22 | splitbed = temp(expand("tmp/split_region_{id}.bed", id = [str(x).zfill(3) for x in range(nparallel)])) 23 | shell: 24 | """ 25 | split {input} -n l/{params.nsplit} -a 3 -d {params.prefix} --additional-suffix .bed 26 | """ 27 | 28 | rule MsiDetect: 29 | input: 30 | normal = lambda wildcards: metadata.loc[wildcards.pair][config['normal_col']], 31 | tumor = lambda wildcards: metadata.loc[wildcards.pair]['tumor'], 32 | bed = "tmp/split_region_{id}.bed" 33 | params: 34 | mapq = 50, 35 | outprefix = "tmp/{pair}_region_{id}", 36 | population_vcf = gnomad_vcf 37 | output: 38 | temp("tmp/{pair}_region_{id}.msi"), 39 | temp("tmp/{pair}_region_{id}.all") 40 | resources: 41 | runtime = 2 42 | shell: 43 | """ 44 | msi -t {input.tumor} -n {input.normal} -L {input.bed} -m {params.mapq} -r {hg19} -o {params.outprefix} -V {params.population_vcf} -D -U DI -x 2 45 | """ 46 | 47 | rule AggMsi: 48 | input: 49 | expand("tmp/{{pair}}_region_{id}.{{type}}", id = [str(x).zfill(3) for x in range(nparallel)]) 50 | output: 51 | "result/{pair}.{type}" 52 | wildcard_constraints: 53 | type = "[0-9a-zA-Z_]+" 54 | shell: 55 | """ 56 | cat {input} > {output} 57 | """ 58 | 59 | rule FilterGerm: 60 | input: 61 | msi = "result/{pair}.{type}", 62 | germ_vcf = lambda wildcards: metadata.loc[wildcards.pair]['germ_vcf'], 63 | output: 64 | "result/{pair}.{type}.filtered" 65 | wildcard_constraints: 66 | type = "[0-9a-zA-Z_]+" 67 | shell: 68 | """ 69 | awk "{{OFS=\\"\\t\\"}};{{if (\$7 < 0.7 && \$8 < 0.7 ) {{print \$1,\$2-5,\$2+\$4+5,\$0}} }}" {input.msi} | {bedtools} intersect -a - -b {input.germ_vcf} -c | cut -f 4- | awk "\$NF == 0 && length(\$3) == 1 {{print \$0}}" > {output} 70 | """ 71 | -------------------------------------------------------------------------------- /obsolete/bamtofastq.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 10/6/20. 
3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #ifdef BGZF_MAX_BLOCK_SIZE 16 | #pragma push_macro("BGZF_MAX_BLOCK_SIZE") 17 | #undef BGZF_MAX_BLOCK_SIZE 18 | #define BGZF_MAX_BLOCK_SIZE_BAK 19 | #endif 20 | 21 | #ifdef BGZF_BLOCK_SIZE 22 | #pragma push_macro("BGZF_BLOCK_SIZE") 23 | #undef BGZF_BLOCK_SIZE 24 | #define BGZF_BLOCK_SIZE_BAK 25 | #endif 26 | #include "FastxIO.h" 27 | 28 | using std::string; 29 | struct Options { 30 | string bam; 31 | string fastq1; 32 | string fastq2; 33 | string tmpdir = "/tmp"; 34 | int thread = 1; 35 | }; 36 | 37 | 38 | static struct option long_options[] = { 39 | {"input1", required_argument, 0, '1'}, 40 | {"input2", required_argument, 0, '2'}, 41 | {"bam", required_argument , 0, 'b'}, 42 | {"tmpdir", required_argument , 0, 't'}, 43 | {"thread", required_argument, 0, 'p'}, 44 | {0,0,0,0} 45 | }; 46 | 47 | const char*short_options = "1:2:b:t:p:"; 48 | 49 | void print_help() 50 | { 51 | std::cerr<< "---------------------------------------------------\n"; 52 | std::cerr<< "Usage: consensus [options]\n"; 53 | std::cerr<< "General Options:\n"; 54 | std::cerr<< "-b/--bam, Bam input\n"; 55 | std::cerr<< "-1/--fastq1, Output Fastq1\n"; 56 | std::cerr<< "-2/--fastq2, Output Fastq2\n"; 57 | std::cerr<< "-t/--tmpdir, Temporary dir for sorted bam [/tmp]\n"; 58 | std::cerr<< "-p/--thread, Number of threads for sort [1]\n"; 59 | } 60 | 61 | int parse_options(int argc, char* argv[], Options& opt) { 62 | int option_index; 63 | int next_option = 0; 64 | do { 65 | next_option = getopt_long(argc, argv, short_options, long_options, &option_index); 66 | switch (next_option) { 67 | case -1:break; 68 | case '1': 69 | opt.fastq1 = optarg; 70 | break; 71 | case '2': 72 | opt.fastq2 = optarg; 73 | break; 74 | case 'b': 75 | opt.bam = optarg; 76 | break; 77 | case 't': 78 | opt.tmpdir = optarg; 79 | break; 80 | case 'p': 81 | opt.thread = atoi(optarg); 82 | break; 83 | default:print_help(); 84 | return 1; 85 | } 86 | } while (next_option != -1); 87 | 88 | return 0; 89 | } 90 | 91 | int main(int argc, char ** argv) { 92 | Options opt; 93 | int parse_ret = parse_options(argc, argv, opt); 94 | if (parse_ret) return 1; 95 | if (argc == 1) { 96 | print_help(); 97 | exit(0); 98 | } 99 | char temp[100]; 100 | strcpy(temp, opt.tmpdir.c_str()); 101 | strcat(temp, "/tempsort.XXXXXX"); 102 | int fd = mkstemp(temp); 103 | if (fd == -1) { 104 | std::cerr << "unable to create temp file for sorting bam in queryname order\n"; 105 | return 1; 106 | } 107 | string samsort = "samtools sort -n " + opt.bam + " -o " + string(temp) + " -@ " + std::to_string(opt.thread); 108 | std::cout << samsort << std::endl; 109 | std::system(samsort.c_str()); 110 | SeqLib::BamRecord read; 111 | SeqLib::BamReader input; 112 | input.Open(temp); 113 | cpputil::FastqWriter R1(opt.fastq1); 114 | cpputil::FastqWriter R2(opt.fastq2); 115 | std::vector pair(2); 116 | // After sorted by name, using yield like approach 117 | while(input.GetNextRecord(read)) { 118 | if (read.FirstFlag() && !read.SupplementaryFlag() && !read.SecondaryFlag()) { 119 | if (pair[0].isEmpty()) { 120 | pair[0] = read; 121 | if (!pair[1].isEmpty()) { 122 | if (pair[1].Qname() != pair[0].Qname()) { 123 | throw std::runtime_error("Bam file must be query name sorted! 
Exit at read " + read.Qname()); 124 | } else { 125 | cpputil::FastxRecord fx1(pair[0], true); 126 | R1.Write(fx1); 127 | cpputil::FastxRecord fx2(pair[1], true); 128 | R2.Write(fx2); 129 | pair.clear(); 130 | pair.resize(2); 131 | } 132 | } 133 | } else { 134 | throw std::runtime_error("Duplicated read name, " + read.Qname()); 135 | } 136 | } 137 | 138 | if (!read.FirstFlag() && !read.SupplementaryFlag() && !read.SecondaryFlag()) { 139 | if (pair[1].isEmpty()) { 140 | pair[1] = read; 141 | if (!pair[0].isEmpty()) { 142 | if (pair[1].Qname() != pair[0].Qname()) { 143 | throw std::runtime_error("Bam file must be query name sorted! Exit at read " + read.Qname()); 144 | } else { 145 | cpputil::FastxRecord fx1(pair[0], true); 146 | R1.Write(fx1); 147 | cpputil::FastxRecord fx2(pair[1], true); 148 | R2.Write(fx2); 149 | pair.clear(); 150 | pair.resize(2); 151 | } 152 | } 153 | } else { 154 | throw std::runtime_error("Duplicated read name, " + read.Qname()); 155 | } 156 | } 157 | } 158 | close(fd); 159 | unlink(temp); 160 | return 0; 161 | } 162 | -------------------------------------------------------------------------------- /obsolete/concat_umi_to_fastq.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 10/6/20. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::string; 11 | struct Options { 12 | string fastq1; 13 | string fastq2; 14 | string index1; 15 | string index2; 16 | string out1; 17 | string out2; 18 | }; 19 | 20 | 21 | static struct option long_options[] = { 22 | {"input1", required_argument, 0, '1'}, 23 | {"input2", required_argument, 0, '2'}, 24 | {"index1", required_argument , 0, 'i'}, 25 | {"index2", required_argument , 0, 'I'}, 26 | {"out1", required_argument , 0, 'o'}, 27 | {"out2", required_argument , 0, 'O'}, 28 | {0,0,0,0} 29 | }; 30 | 31 | const char*short_options = "1:2:o:O:i:I:"; 32 | 33 | void print_help() 34 | { 35 | std::cerr<< "---------------------------------------------------\n"; 36 | std::cerr<< "Usage: consensus [options]\n"; 37 | std::cerr<< "General Options:\n"; 38 | std::cerr<< "-1/--input1, Input Read 1\n"; 39 | std::cerr<< "-2/--iunput2, Input Read 2\n"; 40 | std::cerr<< "-i/--index1, index/UMI file for Read 1\n"; 41 | std::cerr<< "-I/--index2, index/UMI file for Read 2\n"; 42 | std::cerr<< "-o/--output1, Output Read 1\n"; 43 | std::cerr<< "-o/--output2, Output Read 2\n"; 44 | } 45 | 46 | int parse_options(int argc, char* argv[], Options& opt) { 47 | int option_index; 48 | int next_option = 0; 49 | do { 50 | next_option = getopt_long(argc, argv, short_options, long_options, &option_index); 51 | switch (next_option) { 52 | case -1:break; 53 | case 'i': 54 | opt.index1 = optarg; 55 | break; 56 | case 'I': 57 | opt.index2 = optarg; 58 | break; 59 | case '1': 60 | opt.fastq1 = optarg; 61 | break; 62 | case '2': 63 | opt.fastq2 = optarg; 64 | break; 65 | case 'o': 66 | opt.out1 = optarg; 67 | break; 68 | case 'O': 69 | opt.out2= optarg; 70 | break; 71 | default:print_help(); 72 | return 1; 73 | } 74 | } while (next_option != -1); 75 | 76 | return 0; 77 | } 78 | 79 | int main(int argc, char ** argv) { 80 | Options opt; 81 | int parse_ret = parse_options(argc, argv, opt); 82 | if (parse_ret) return 1; 83 | if (argc == 1) { 84 | print_help(); 85 | exit(0); 86 | } 87 | 88 | cpputil::FastxReader R1_reader(opt.fastq1); 89 | cpputil::FastxReader R2_reader(opt.fastq2); 90 | cpputil::FastxReader I1_reader(opt.index1); 91 | cpputil::FastxReader I2_reader(opt.index2); 92 
| cpputil::FastqWriter O1_writer(opt.out1); 93 | cpputil::FastqWriter O2_writer(opt.out2); 94 | cpputil::FastxRecord read1; 95 | cpputil::FastxRecord read2; 96 | cpputil::FastxRecord index1; 97 | cpputil::FastxRecord index2; 98 | cpputil::FastxRecord outread1; 99 | cpputil::FastxRecord outread2; 100 | while (R1_reader.yield(read1)) { 101 | R2_reader.yield(read2); 102 | I1_reader.yield(index1); 103 | I2_reader.yield(index2); 104 | assert(read1.name() == read2.name()); 105 | assert(index1.name() == index2.name()); 106 | assert(index1.name() == read1.name()); 107 | outread1.id = read1.id; 108 | outread1.seq = index1.seq + read1.seq; 109 | outread1.qual = index1.qual + read1.qual; 110 | outread2.id = read2.id; 111 | outread2.seq = index2.seq + read2.seq; 112 | outread2.qual = index2.qual + read2.qual; 113 | O1_writer.Write(outread1); 114 | O2_writer.Write(outread2); 115 | } 116 | return 0; 117 | } 118 | -------------------------------------------------------------------------------- /obsolete/print_qual.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Ruolin Liu on 7/12/20. 3 | // 4 | 5 | #include <string> // inferred: the original header name was lost to angle-bracket stripping during extraction 6 | #include "DNAUtils.h" 7 | 8 | int main(int argc, char** argv) { 9 | std::string qual = argv[1]; 10 | int bq = std::stoi(argv[2]); 11 | cpputil::PrintQualString(qual, bq); 12 | return 0; 13 | } -------------------------------------------------------------------------------- /snakemake/AdapV2/capture_wf_1/Snakefile: -------------------------------------------------------------------------------- 1 | include: "../Snakefile" 2 | GROUP_BY_UMI_DIR="groupbyumi" 3 | DETECT_DIR="detect" 4 | CONSENSUS_OUT="consensus_out" 5 | RECOVERY = config['duplex_recovery_script'] 6 | COLLECT_DUPMET = f"python {RECOVERY}" 7 | 8 | rule all: 9 | input: 10 | expand(Metrics_OUT + "/byproduct/{batch_id}.{index}.byproduct.txt", zip, batch_id = metadata.reset_index()['batch'], index = metadata.reset_index()['sample']), 11 | expand(Metrics_OUT + "/{index}.raw.umiHistogram.txt",index = metadata.reset_index()['sample']), 12 | expand(Metrics_OUT + "/{index}.duplex_yield_metrics.txt",index=sample_names), 13 | expand("sfc/{index}.error_metrics.txt", index=sample_names), 14 | expand(Metrics_OUT + "/{index}.raw.hs_metrics.txt", index=sample_names), 15 | 16 | 17 | sample_to_maf = metadata.reset_index().groupby('sample').agg({'fingerprint_maf': set}) 18 | sample_to_bait = metadata.reset_index().groupby('sample').agg({'bait_intervals': set}) 19 | sample_to_bed = metadata.reset_index().groupby('sample').agg({'bait_bed': set}) 20 | 21 | rule CollectRawHsMetrics: 22 | input: 23 | bam = "tmp/{index}.raw.replacerg.markdup.bam", 24 | output: 25 | metrics = Metrics_OUT + "/{index}.raw.hs_metrics.txt", 26 | per_target_cov = Metrics_OUT + "/{index}.raw.per_target_cov.txt" 27 | params: 28 | ref = REF, 29 | bait = lambda wildcards: sample_to_bait.loc[wildcards.index]['bait_intervals'], 30 | resources: 31 | mem = 16, 32 | runtime = 12 33 | shell: 34 | """ 35 | {PICARD} CollectHsMetrics COVERAGE_CAP=20000 I={input.bam} O={output.metrics} R={params.ref} BAIT_INTERVALS={params.bait} TARGET_INTERVALS={params.bait} PER_TARGET_COVERAGE={output.per_target_cov} 36 | """ 37 | 38 | ##CODEC specific filters 39 | rule FilterMolecularConsensusReads: 40 | input: 41 | bam = "filtered/{index}.mol_consensus.aligned.bam" 42 | output: 43 | bam = "filtered/{index}.mol_consensus.filtered.bam", 44 | bai = "filtered/{index}.mol_consensus.filtered.bam.bai" 45 | resources: 46 | mem = 8, 47 | runtime = 24 48 | 
shell: 49 | """ 50 | {FILTER} -b {input.bam} -f 2 | samtools sort - -o {output.bam} && samtools index {output.bam} 51 | """ 52 | 53 | rule CODEC_SFC: 54 | input: 55 | bam = "filtered/{index}.mol_consensus.filtered.bam" 56 | output: 57 | accu = "sfc/{index}.mutant_metrics.txt", 58 | call = "sfc/{index}.variants_called.txt", 59 | context = "sfc/{index}.context_count.txt", 60 | params: 61 | ref = REF, 62 | high_conf_region = lambda wildcards : sample_to_bed.loc[wildcards.index]['bait_bed'], 63 | germ_vcf = lambda wildcards : sample_to_vcf.loc[wildcards.index]['germline_vcf'], 64 | germ_bam = lambda wildcards : sample_to_germbam.loc[wildcards.index]['germline_bam'], 65 | mut_maf = lambda wildcards: sample_to_maf.loc[wildcards.index]['fingerprint_maf'], 66 | resources: 67 | mem = 8, 68 | runtime = 96 69 | shell: 70 | """ 71 | {CALL_BIN} -b {input.bam} \ 72 | -L {params.high_conf_region} \ 73 | -r {params.ref} \ 74 | -n {params.germ_bam} \ 75 | -m 60 \ 76 | -q 30 \ 77 | -d 12 \ 78 | -V {params.germ_vcf} \ 79 | -M {params.mut_maf} \ 80 | -x 2 \ 81 | -5 \ 82 | -g 30 \ 83 | -G 250 \ 84 | -Q 0.6 \ 85 | -N 0.03 \ 86 | -B 0.5 \ 87 | -Y 0 \ 88 | -a {output.accu} \ 89 | -e {output.call} \ 90 | -C {output.context} 91 | """ 92 | 93 | 94 | rule CollectRawInsertSizeMetrics: 95 | input: 96 | bam = "tmp/{batch_id}.{index}.raw.aligned.bam", 97 | output: 98 | txt = Metrics_OUT + "/{batch_id}.{index}.raw.insert_size_metrics.txt", 99 | hist = Metrics_OUT + "/{batch_id}.{index}.raw.insert_size_histogram.pdf" 100 | params: 101 | ref = REF 102 | shell: 103 | """ 104 | {PICARD} CollectInsertSizeMetrics I={input.bam} O={output.txt} H={output.hist} M=0.5 W=900 DEVIATIONS=100 105 | """ 106 | 107 | rule SortGBUbam: 108 | input: 109 | GROUP_BY_UMI_DIR + "/{index}.raw.GroupedByUmi.bam", 110 | output: 111 | bam = GROUP_BY_UMI_DIR + "/{index}.sorted.GroupedByUmi.bam", 112 | bai = GROUP_BY_UMI_DIR + "/{index}.sorted.GroupedByUmi.bai", 113 | resources: 114 | mem = 16, 115 | runtime = 48 116 | shell: 117 | """ 118 | {PICARD} SortSam I={input} O={output.bam} SO=coordinate CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 119 | """ 120 | 121 | 122 | rule DuplexRecoveryByTarget: 123 | input: 124 | GROUP_BY_UMI_DIR + "/{index}.raw.GroupedByUmi.bam", 125 | output: 126 | Metrics_OUT + "/{index}.duplex_yield_metrics.txt", 127 | resources: 128 | mem = 8, 129 | runtime = 12 130 | params: 131 | interval = lambda wildcards: sample_to_bait.loc[wildcards.index]['bait_intervals'], 132 | shell: 133 | """ 134 | {COLLECT_DUPMET} --bam_file {input} \ 135 | -l {params.interval} \ 136 | --min_reads 1 \ 137 | -c \ 138 | -p \ 139 | -r \ 140 | -o {output} 141 | """ -------------------------------------------------------------------------------- /snakemake/AdapV2/wgs/Snakefile: -------------------------------------------------------------------------------- 1 | include: "../Snakefile" 2 | EVAL_REGION_BED= config['region_bed'] 3 | EVAL_REGION_IL= config['region_interval_list'] 4 | DBSNP= config['dbsnp'] 5 | 6 | rule all: 7 | input: 8 | expand(Metrics_OUT + "/{index}.mol_consensus.wgs_metrics.txt",index = sample_names), 9 | expand(ACCU_OUT + "/{index}.mutant_metrics.txt", index = sample_names), 10 | expand("raw_sfc/{index}.mutant_metrics.txt", index = sample_names), 11 | expand(Metrics_OUT + "/{index}.raw.insert_size_metrics.txt",index = sample_names), 12 | expand(Metrics_OUT + "/byproduct/{batch_id}.{index}.byproduct.txt", zip, batch_id = metadata.reset_index()['batch'], index = metadata.reset_index()['sample']), 13 | expand(Metrics_OUT + 
"/{index}.raw.wgs_metrics.txt",index = sample_names) 14 | 15 | rule CollectRawWgsMetrics: 16 | input: 17 | bam = "tmp/{index}.raw.replacerg.markdup.bam", 18 | output: 19 | metrics = Metrics_OUT + "/{index}.raw.wgs_metrics.txt", 20 | params: 21 | ref = REF, 22 | itl = EVAL_REGION_IL 23 | resources: 24 | mem = 16, 25 | runtime = 96 26 | shell: 27 | """ 28 | {PICARD} CollectWgsMetrics I={input.bam} O={output.metrics} R={params.ref} INTERVALS={params.itl} \ 29 | COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 30 | """ 31 | 32 | rule CollectWgsMetrics: 33 | input: 34 | bam = "consensus/{index}.replacerg.markdup.bam", 35 | output: 36 | metrics = Metrics_OUT + "/{index}.mol_consensus.wgs_metrics.txt", 37 | params: 38 | ref = REF, 39 | itl = EVAL_REGION_IL 40 | resources: 41 | mem = 16, 42 | runtime = 96 43 | shell: 44 | """ 45 | {PICARD} CollectWgsMetrics I={input.bam} O={output.metrics} R={params.ref} INTERVALS={params.itl} \ 46 | INCLUDE_BQ_HISTOGRAM=true 47 | """ 48 | 49 | rule CollectFinalWgsMetrics: 50 | input: 51 | bam = "consensus/{index}.mol_consensus.aligned.bam", 52 | output: 53 | metrics = Metrics_OUT + "/{index}.mol_consensus.wgs_metrics.txt", 54 | params: 55 | ref = REF, 56 | itl = EVAL_REGION_IL 57 | shell: 58 | """ 59 | {PICARD} CollectWgsMetrics I={input.bam} O={output.metrics} R={params.ref} INTERVALS={params.itl} INCLUDE_BQ_HISTOGRAM=true MINIMUM_BASE_QUALITY=30 60 | """ 61 | # 62 | rule CSS_SFC_ErrorMetrics: 63 | input: 64 | bam = "consensus/{index}.mol_consensus.aligned.bam", 65 | output: 66 | accu = ACCU_OUT + "/{index}.mutant_metrics.txt", 67 | called = ACCU_OUT + "/{index}.variants_called.txt", 68 | context = ACCU_OUT + "/{index}.context_count.txt", 69 | params: 70 | ref = REF, 71 | high_conf_region = EVAL_REGION_BED, 72 | dbsnp = DBSNP, 73 | germ_bam = lambda wildcards : sample_to_germbam.loc[wildcards.index]['germline_bam'], 74 | #germ_vcf = lambda wildcards : sample_to_germvcf.loc[wildcards.index]['germline_vcf'], 75 | resources: 76 | mem = 16, 77 | runtime = 96 78 | shell: 79 | """ 80 | {CALL_BIN} -b {input.bam} \ 81 | -L {params.high_conf_region} \ 82 | -r {params.ref} \ 83 | -m 60 \ 84 | -q 30 \ 85 | -d 12 \ 86 | -n {params.germ_bam} \ 87 | -V {params.dbsnp} \ 88 | -x 6 \ 89 | -c 4 \ 90 | -5 \ 91 | -g 30 \ 92 | -G 250 \ 93 | -Q 0.7 \ 94 | -B 0.6 \ 95 | -N 0.05 \ 96 | -Y 5 \ 97 | -W 1 \ 98 | -a {output.accu} \ 99 | -e {output.called} \ 100 | -C {output.context} 101 | """ 102 | 103 | rule RAW_SFC_ErrorMetrics: 104 | input: 105 | bam = "tmp/{index}.raw.replacerg.markdup.bam" 106 | output: 107 | accu = "raw_sfc/{index}.mutant_metrics.txt", 108 | called = "raw_sfc/{index}.variants_called.txt", 109 | context = "raw_sfc/{index}.context_count.txt", 110 | params: 111 | ref = REF, 112 | high_conf_region = EVAL_REGION_BED, 113 | dbsnp = DBSNP, 114 | germ_bam = lambda wildcards : sample_to_germbam.loc[wildcards.index]['germline_bam'], 115 | #germ_vcf = lambda wildcards : sample_to_germvcf.loc[wildcards.index]['germline_vcf'], 116 | resources: 117 | mem = 16, 118 | runtime = 96 119 | shell: 120 | """ 121 | {CALL_BIN} -b {input.bam} \ 122 | -L {params.high_conf_region} \ 123 | -r {params.ref} \ 124 | -m 60 \ 125 | -n {params.germ_bam} \ 126 | -q 30 \ 127 | -d 12 \ 128 | -V {params.dbsnp} \ 129 | -x 6 \ 130 | -c 4 \ 131 | -5 \ 132 | -g 30 \ 133 | -G 250 \ 134 | -Q 0.6 \ 135 | -B 0.6 \ 136 | -N 0.1 \ 137 | -Y 5 \ 138 | -W 1 \ 139 | -a {output.accu} \ 140 | -e {output.called} \ 141 | -C {output.context} 142 | """ 
-------------------------------------------------------------------------------- /snakemake/README.md: -------------------------------------------------------------------------------- 1 | ## Setup input for workflow 2 | * sample_sheet.csv is used for `codec demux`; it requires three columns `SampleName,IndexBarcode1,IndexBarcode2`, and each row must have a unique SampleName 3 | * input.tsv stores paths for fastq files and other pipeline inputs. The `sample` column in input.tsv must match the `SampleName` column in sample_sheet.csv (a validation sketch follows the pipeline input examples below) 4 | -------------------------------------------------------------------------------- /snakemake/jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | which python; 4 | {exec_job} 5 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/caputure/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | input_meta: input.tsv 3 | cwd: codec 4 | ncores: 4 5 | nparallel: 40 6 | tmpdir: tmp 7 | ref: $HG19 8 | codec_root: 9 | dict: dummy 10 | gatk3: 11 | gatk4: gatk 12 | mutect: 13 | fgbio: 14 | bwa: bwa 15 | region_bed: 16 | region_interval_list: 17 | sampleid: 18 | dbsnp: 19 | duplex_recovery_script: 20 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/caputure/input.tsv: -------------------------------------------------------------------------------- 1 | batch sample fastq1 fastq2 germline_vcf germline_bam sample_sheet fingerprint_maf bait_intervals bait_bed 2 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/caputure/runSnakemake.sh: -------------------------------------------------------------------------------- 1 | root=$PWD/../../../codecsuite/snakemake && snakemake --cluster-sync $root/qsub_wrapper.py --jobscript $root/jobscript.sh --snakefile $root/AdapV2/capture_wf_1/Snakefile --configfile config.yaml --latency-wait 30 -j 2000 --restart-times 3 -p --rerun-incomplete --reason -n 2 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/wgs/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | input_meta: pipeline_input.tsv 3 | cwd: output 4 | ncores: 4 5 | nparallel: 40 6 | tmpdir: tmp 7 | ref: 8 | codec_root: 9 | dict: dummy 10 | gatk3: 11 | gatk4: gatk 12 | bwa: 13 | mutect: 14 | fgbio: 15 | region_bed: 16 | region_interval_list: 17 | sampleid: wgs1 18 | dbsnp: 19 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/wgs/input.tsv: -------------------------------------------------------------------------------- 1 | batch sample fastq1 fastq2 germline_bam sample_sheet 2 | -------------------------------------------------------------------------------- /snakemake/pipeline_input_examples/wgs/runSnakemake.sh: -------------------------------------------------------------------------------- 1 | root=$PWD/../../../codecsuite/snakemake && snakemake --cluster-sync $root/qsub_wrapper.py --jobscript $root/jobscript.sh --snakefile $root/AdapV2/wgs/Snakefile --configfile config.yaml --latency-wait 30 -j 2000 --restart-times 3 -p --rerun-incomplete --reason -n 2 | --------------------------------------------------------------------------------
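A minimal sketch of how to check the input contract described in snakemake/README.md above, assuming pandas is available; the file paths are hypothetical and this snippet is not part of the repository:

import pandas as pd

# sample_sheet.csv must carry SampleName,IndexBarcode1,IndexBarcode2 (per the README)
sheet = pd.read_csv("sample_sheet.csv")
# input.tsv is tab-separated and carries a 'sample' column plus per-sample paths
meta = pd.read_csv("input.tsv", sep="\t")

# each sample_sheet.csv row must have a unique SampleName
assert sheet["SampleName"].is_unique, "duplicate SampleName in sample_sheet.csv"
# every sample in input.tsv must match a SampleName in sample_sheet.csv
assert set(meta["sample"]) <= set(sheet["SampleName"]), "input.tsv sample not in sample_sheet.csv"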
/snakemake/qsub_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | from snakemake.utils import read_job_properties 7 | 8 | jobscript = sys.argv[1] 9 | job_properties = read_job_properties(jobscript) 10 | 11 | defaults = {'mem': 8, 'runtime': 24, 'ncores': 1} 12 | for k, v in defaults.items(): 13 | if k not in job_properties['resources']: 14 | job_properties['resources'][k] = v 15 | 16 | params = job_properties['resources'] 17 | 18 | qsub_cmd = (f'qsub -l h_vmem={params["mem"]}G ' 19 | f'-pe smp {params["ncores"]} -binding linear:{params["ncores"]} ' 20 | f'-l h_rt={params["runtime"]}:00:00 ' 21 | f'-o logs/{job_properties["rule"]}/ ' 22 | f'-e logs/{job_properties["rule"]}/ ' 23 | f'-N {job_properties["rule"]} ' 24 | f'-cwd -V -j y -sync y ' 25 | f'{jobscript}') 26 | 27 | qsub_cmd += ' | tail -2 | cut -d " " -f 3' 28 | os.system(qsub_cmd) 29 | -------------------------------------------------------------------------------- /snakemake/script/agg_log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import pandas as pd 5 | import sys 6 | 7 | logger = logging.getLogger("{}".format(__file__)) 8 | 9 | def get_arguments(): 10 | 11 | parser = argparse.ArgumentParser(prog="aggregate miredas result(s)", formatter_class=argparse.RawDescriptionHelpFormatter) 12 | parser.add_argument("log", type=str, nargs="+", help="trim adapter logs") 13 | parser.add_argument("out", type=str, help="output name") 14 | args = parser.parse_args() 15 | return args 16 | 17 | def process(opts): 18 | tot = pd.DataFrame() 19 | for f in opts.log: 20 | df = pd.read_csv(f, sep=":", names=['cat', "count"]) 21 | if tot.empty: 22 | tot = df 23 | else: 24 | tot['count'] = tot['count'].add(df['count']) 25 | tot['cat'] = tot['cat'] + ":" 26 | tot.to_csv(opts.out, sep=" ", index=False, header=False) 27 | 28 | if __name__ == '__main__': 29 | sys.exit(process(get_arguments())) 30 | -------------------------------------------------------------------------------- /snakemake/script/cds_summarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import sys 5 | import json 6 | import os 7 | import pandas as pd 8 | import numpy as np 9 | import pysam 10 | from collections import defaultdict 11 | #from Bio import pairwise2 12 | 13 | logger = logging.getLogger("{}".format(__file__)) 14 | 15 | #def check_tandem_adpt(seq): 16 | #linker = "AGATCGGAAGAGCTTCATCATTAGATCCATTAATGTTACACTTCAACTCTTCACCCACATCAGATTAGTACCAGCTTCGAGGATCAACACGTCAGAGTCTAGCTGGTGATAGGAAGTGTAGGTAACATAGACGAAGTTATCAACAATGTGTAACTGACTTAACGCTCTTCCGATCT" 17 | #res = pairwise2.align.localms(seq, linker, 1, -4, -6,-2) 18 | 19 | def read_pair_generator(bam, region_string=None): 20 | """ 21 | Generate read pairs in a BAM file or within a region string. 22 | Reads are added to read_dict until a pair is found.
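Secondary and supplementary alignments are skipped; once both mates of a pair have been seen, they are yielded together as a (read1, read2) tuple.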
23 | """ 24 | read_dict = defaultdict(lambda: [None, None]) 25 | for read in bam.fetch(until_eof=True): 26 | if read.is_secondary or read.is_supplementary: 27 | continue 28 | qname = read.query_name 29 | if qname not in read_dict: 30 | if read.is_read1: 31 | read_dict[qname][0] = read 32 | else: 33 | read_dict[qname][1] = read 34 | else: 35 | if read.is_read1: 36 | yield read, read_dict[qname][1] 37 | else: 38 | yield read_dict[qname][0], read 39 | del read_dict[qname] 40 | 41 | def is_overlapped(read1, read2): 42 | if read1.is_unmapped or read2.is_unmapped: 43 | return False 44 | if read1.reference_name != read2.reference_name: 45 | return False 46 | if read1.reference_start < read2.reference_end and read2.reference_start < read1.reference_end: 47 | return True 48 | 49 | def is_complete_overlapped_excluding_sclips(read1, read2): 50 | if not is_overlapped(read1, read2): 51 | return False 52 | if read1.reference_start != read2.reference_start: 53 | return False 54 | if read1.reference_end != read2.reference_end: 55 | return False 56 | return True 57 | 58 | def overlap_len(read1, read2): 59 | if not is_overlapped(read1,read2): 60 | return 0 61 | else: 62 | return min(read1.reference_end, read2.reference_end) - max(read1.reference_start, read2.reference_start) 63 | 64 | def overlap_span_ratio(read1, read2): 65 | ol = overlap_len(read1, read2) 66 | if ol == 0: 67 | return 0; 68 | else: 69 | span = max(read1.reference_end, read2.reference_end) - min(read1.reference_start, read2.reference_start) 70 | return ol/span 71 | 72 | def get_arguments(): 73 | 74 | parser = argparse.ArgumentParser(prog="CODEC byproducts summary", formatter_class=argparse.RawDescriptionHelpFormatter) 75 | parser.add_argument("--highconf_bam", type=str, default = "", help="high confident CDS reads", required=True) 76 | parser.add_argument("--sample_id", type=str, help="sample id", required=True) 77 | parser.add_argument("--trim_log", type=str, default = "", help="trim linker log file", required=False) 78 | parser.add_argument("--fastp", type=str, help="json file output by fastp", required=False) 79 | parser.add_argument("--lowconf_bam", type=str, default = "", help="low confident CDS reads", required=False) 80 | parser.add_argument("--si_hiconf_bam", type=str, default = "", help="single insert highconf bam", required=False) 81 | parser.add_argument("--si_lowconf_bam", type=str, default = "", help="single insert lowconf bam", required=False) 82 | parser.add_argument("--trim_one_bam", type=str, default = "", help="Where linker has been trimmed from only one end", required=False) 83 | parser.add_argument("--untrim_both_bam", type=str, default = "", help="Where linker has not been trimmed from both ends", required=False) 84 | parser.add_argument("--hs_metrics", type=str, help="hs metrics output", default="") 85 | parser.add_argument("--cds_intermol_bamout", type=str, help="output bam for cds intermolcular reads", default = "", required=False) 86 | parser.add_argument("--demux_log", type=str, help="demux log file", default = "", required=False) 87 | #parser.add_argument("--si_intermol_bamout", type=str, help="output bam for singleinsert intermolcular reads", required=True) 88 | args = parser.parse_args() 89 | return args 90 | 91 | class CdsMetrics: 92 | """ 93 | ## uf = unfiltered 94 | ## pf = passed filter 95 | """ 96 | def __init__(self, sid): 97 | self.sample_id = sid 98 | self.n_raw_frag = 0 99 | self.n_unmapped = 0 100 | self.pct_raw_uf_q20 = 0 101 | self.pct_raw_uf_q30 = 0 102 | self.n_raw_pf_frag = 0 103 | 
self.pct_aligned_frag = 0 104 | self.on_target_rate = 0 105 | self.mean_bait_cov = 0 106 | self.mean_target_cov = 0 107 | 108 | #CDS specific 109 | self.n_high_conf = 0 110 | self.n_adp_dimer_frag = 0 111 | self.n_double_ligation = 0 112 | self.n_low_conf = 0 113 | self.n_intermol = 0 # intermolecular 114 | self.n_single_hiconf = 0 115 | self.n_single_lowconf = 0 116 | self.n_insuf_trim = 0 117 | self.n_close_proxim = 0 118 | # self.read_len_diff = [] 119 | # self.aln_len_diff = [] 120 | # self.ol_ratios = defaultdict(list) 121 | 122 | def n_raw_uf_frag(self): 123 | return self.n_raw_frag 124 | def n_categorized(self): 125 | return self.n_high_conf + self.n_low_conf + self.n_adp_dimer_frag + self.n_double_ligation + \ 126 | self.n_intermol + self.n_single_hiconf + self.n_single_lowconf + self.n_insuf_trim + self.n_unmapped + self.n_close_proxim 127 | def __str__(self): 128 | header = ["sample_id", 129 | ### CDS specific 130 | "pct_correct", 131 | "pct_double_ligation", 132 | "pct_adp_dimer", 133 | "pct_intermol", 134 | "pct_unmapped", 135 | "pct_close_proxim", 136 | "pct_categorized", 137 | "n_correct", 138 | "n_double_ligation", 139 | "n_adp_dimer", 140 | "n_intermol", 141 | "n_unmapped", 142 | "n_close_proxim", 143 | "n_categorized", 144 | "n_total", 145 | ] 146 | 147 | header_str = "\t".join(header) 148 | return f"{header_str}\n" \ 149 | f"{self.sample_id}\t" \ 150 | f"{self.n_high_conf / self.n_raw_uf_frag()}\t" \ 151 | f"{self.n_double_ligation / self.n_raw_uf_frag()}\t" \ 152 | f"{self.n_adp_dimer_frag / self.n_raw_uf_frag()}\t" \ 153 | f"{self.n_intermol / self.n_raw_uf_frag()}\t" \ 154 | f"{self.n_unmapped / self.n_raw_uf_frag()}\t" \ 155 | f"{self.n_close_proxim / self.n_raw_uf_frag()}\t" \ 156 | f"{self.n_categorized() / self.n_raw_uf_frag()}\t" \ 157 | f"{self.n_high_conf}\t" \ 158 | f"{self.n_double_ligation}\t" \ 159 | f"{self.n_adp_dimer_frag}\t" \ 160 | f"{self.n_intermol}\t" \ 161 | f"{self.n_unmapped}\t" \ 162 | f"{self.n_close_proxim}\t" \ 163 | f"{self.n_categorized()}\t" \ 164 | f"{self.n_raw_frag}" 165 | 166 | def parse_linker_trim_log(log_file, cdsm, adap_v2): 167 | with open(log_file, 'r') as f: 168 | for line in f: 169 | k, v = line.split(":") 170 | if k == "TOTAL" or k == "TOTOL": 171 | cdsm.n_raw_frag = int(v) 172 | if k == "LOST_BOTH": 173 | cdsm.n_adp_dimer_frag = int(v) 174 | elif not adap_v2 and k == "DOUBLE_LIGATION": 175 | cdsm.n_double_ligation += int(v) 176 | elif adap_v2 and k== "LOST_READ1": 177 | cdsm.n_double_ligation += int(v) 178 | elif adap_v2 and k== "LOST_READ2": 179 | cdsm.n_double_ligation += int(v) 180 | 181 | 182 | def alignment_analysis(bam, cdsm, trim_type, intermol_bam = None, im_dist_cutoff = 5_000, adap_v2 = False): 183 | assert(trim_type in ['HighConf', "LowConf", "TrimOne", "UntrimBoth", "SingleHiconf", "SingleLowconf"]) 184 | samfile = pysam.AlignmentFile(bam, "rb") 185 | total_frag = 0 186 | for read1, read2 in read_pair_generator(samfile): 187 | total_frag += 1 188 | if read1.is_unmapped or read2.is_unmapped: 189 | if read1.infer_query_length() and read1.infer_query_length() > 15 and \ 190 | read2.infer_query_length() and read2.infer_query_length() > 15: 191 | cdsm.n_unmapped += 1 192 | continue 193 | 194 | if read1.reference_name != read2.reference_name or abs(read1.tlen) > im_dist_cutoff: 195 | cdsm.n_intermol += 1 196 | if intermol_bam: 197 | intermol_bam.write(read1) 198 | intermol_bam.write(read2) 199 | continue 200 | 201 | if trim_type == "HighConf": 202 | if adap_v2: 203 | if is_overlapped(read1, read2): 204 | cdsm.n_high_conf 
+= 1 205 | else: 206 | cdsm.n_close_proxim += 1 207 | else: 208 | if is_complete_overlapped_excluding_sclips(read1, read2): 209 | cdsm.n_high_conf += 1 210 | # elif trim_type == "LowConf": 211 | # if is_complete_overlapped_excluding_sclips(read1, read2): 212 | # cdsm.n_low_conf += 1 213 | # elif trim_type == "SingleHiconf": 214 | # if (not read1.has_tag('tm') or read1.get_tag('tm') != 4 ) and \ 215 | # (not read2.has_tag('tm') or read2.get_tag('tm') != 4 ) and \ 216 | # is_complete_overlapped_excluding_sclips(read1, read2): 217 | # cdsm.n_single_hiconf += 1 218 | # elif trim_type == "SingleLowconf": 219 | # if is_complete_overlapped_excluding_sclips(read1, read2): 220 | # cdsm.n_single_lowconf += 1 221 | # elif trim_type == "TrimOne" or trim_type == "UntrimBoth": 222 | # if not adap_v2 and is_overlapped(read1, read2): 223 | # cdsm.n_insuf_trim += 1 224 | 225 | return total_frag 226 | 227 | def process(opts): 228 | cdsm = CdsMetrics(opts.sample_id) 229 | adap_v2 = False 230 | if opts.fastp: 231 | with open(opts.fastp, 'r') as f: 232 | sample_dict = json.load(f) 233 | cdsm.n_raw_frag = int(sample_dict['summary']["before_filtering"]["total_reads"]) // 2 234 | cdsm.n_raw_pf_frag = int(sample_dict['summary']['after_filtering']['total_reads']) // 2 235 | cdsm.pct_raw_uf_q20 = sample_dict['summary']['before_filtering']['q20_rate'] 236 | cdsm.pct_raw_uf_q30 = sample_dict['summary']['before_filtering']['q30_rate'] 237 | 238 | if opts.cds_intermol_bamout: 239 | bam = pysam.AlignmentFile(opts.highconf_bam, "rb") 240 | cds_intermol_writer = pysam.AlignmentFile(opts.cds_intermol_bamout + ".tmp.bam", "wb", template=bam) 241 | bam.close() 242 | 243 | if opts.lowconf_bam: 244 | num_frag_processed = alignment_analysis(opts.lowconf_bam, cdsm, trim_type = 'LowConf') 245 | adap_v2 = num_frag_processed == 0 246 | else: 247 | adap_v2 = True 248 | 249 | if opts.trim_log: 250 | parse_linker_trim_log(opts.trim_log, cdsm, adap_v2) 251 | 252 | if opts.highconf_bam: 253 | if opts.cds_intermol_bamout: 254 | nread_processed = alignment_analysis(opts.highconf_bam, cdsm, trim_type = 'HighConf', intermol_bam=cds_intermol_writer, adap_v2=adap_v2) 255 | else: 256 | nread_processed = alignment_analysis(opts.highconf_bam, cdsm, trim_type='HighConf', adap_v2=adap_v2) 257 | if cdsm.n_raw_frag == 0: 258 | cdsm.n_raw_frag = nread_processed 259 | if opts.trim_one_bam: 260 | alignment_analysis(opts.trim_one_bam, cdsm, trim_type = 'TrimOne') 261 | if opts.untrim_both_bam: 262 | alignment_analysis(opts.untrim_both_bam, cdsm, trim_type = 'UntrimBoth') 263 | if opts.si_hiconf_bam: 264 | alignment_analysis(opts.si_hiconf_bam, cdsm, trim_type = 'SingleHiconf') 265 | if opts.si_lowconf_bam: 266 | alignment_analysis(opts.si_lowconf_bam, cdsm, trim_type = 'SingleLowconf') 267 | 268 | 269 | if opts.hs_metrics: 270 | hs_metrics_df = pd.read_csv(opts.hs_metrics, skiprows=6, nrows=1, sep='\t', low_memory=False) 271 | cdsm.pct_aligned_frag = hs_metrics_df['PCT_PF_UQ_READS_ALIGNED'][0] 272 | cdsm.on_target_rate = hs_metrics_df['PCT_SELECTED_BASES'][0] 273 | cdsm.mean_bait_cov = hs_metrics_df['MEAN_BAIT_COVERAGE'][0] 274 | cdsm.mean_target_cov = hs_metrics_df['MEAN_TARGET_COVERAGE'][0] 275 | print(cdsm) 276 | if opts.cds_intermol_bamout: 277 | cds_intermol_writer.close() 278 | os.system(f"samtools sort -n {opts.cds_intermol_bamout}.tmp.bam -o {opts.cds_intermol_bamout} && rm {opts.cds_intermol_bamout}.tmp.bam") 279 | 280 | 281 | if __name__ == '__main__': 282 | sys.exit(process(get_arguments())) 283 | 
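# Illustrative invocation of cds_summarize.py (hypothetical file names; the flags
# are the script's own argparse options):
#   python cds_summarize.py --sample_id S1 --highconf_bam S1.highconf.bam \
#       --fastp S1.fastp.json --trim_log S1.trim.log \
#       --hs_metrics S1.hs_metrics.txt > S1.byproduct.txt
# The script prints one header line plus one row (CdsMetrics.__str__) giving
# per-category fragment counts and their fractions of total raw fragments.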
-------------------------------------------------------------------------------- /snakemake/script/codec2maf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(optparse) 3 | 4 | option_list = list( 5 | make_option(c("-i", "--inputmaf"), type="character", default=NULL, 6 | help="input data"), 7 | make_option(c("-o", "--outputmaf"), type="character", default=NULL, 8 | help="output file name"), 9 | make_option(c("-Q", "--q60frac"), type="double", default=0.0, 10 | help="min q60frac [default= %default]"), 11 | make_option(c("-N", "--Nfrac"), type="double", default=1.0, 12 | help="max N frac [default= %default]"), 13 | make_option(c("-L", "--minIndelFragLen"), type="integer", default=0, 14 | help="minimum fraglen when containing indels [default= %default]"), 15 | make_option(c("-l", "--maxIndelLen"), type="integer", default=50, 16 | help="maximum small indel len [default= %default]"), 17 | make_option(c("-D", "--minIndelDist2End"), type="integer", default=0, 18 | help="minimum distance of an indel to the fragend [default= %default]"), 19 | make_option(c("-b", "--breakmnv"), action="store_true", default=FALSE, 20 | help="break the MNV to SNV [default %default]"), 21 | make_option(c("-p", "--pairedOnly"), action="store_true", default=FALSE, 22 | help="only include paired reads"), 23 | make_option(c("-f", "--countByFragId"), action="store_true", default=FALSE, 24 | help="count by fragment id: (frag_len, dist_to_fragend). This is to avoid artifact fragments due to UMI; it won't work for ddBTP data") 25 | 26 | ); 27 | 28 | opt_parser = OptionParser(option_list=option_list); 29 | opt = parse_args(opt_parser); 30 | 31 | if (is.null(opt$inputmaf)){ 32 | print_help(opt_parser) 33 | stop("At least one argument must be supplied (input file).\n", call.=FALSE) 34 | } 35 | 36 | library(data.table) 37 | library(tidyverse) 38 | 39 | break_mnv <- function(mut_fam_df, snv_only=FALSE) { 40 | indel = mut_fam_df %>% filter(type != "SNV") 41 | mut_fam_df = mut_fam_df %>% filter(type == "SNV") 42 | snv = mut_fam_df %>% filter(nchar(ref) == 1) 43 | mnv = mut_fam_df %>% filter(nchar(ref) > 1) 44 | mnv2snv = data.frame() 45 | if (nrow(mnv) > 0) { 46 | for (i in 1:nrow(mnv)) { 47 | for (j in 1:nchar(mnv[i,]$ref)) { 48 | tmp = mnv[i,] 49 | tmp$ref = substr(mnv[i,]$ref, j, j) 50 | tmp$alt = substr(mnv[i,]$alt, j, j) 51 | tmp$ref_pos = mnv[i,]$ref_pos + j - 1 52 | mnv2snv = rbind(mnv2snv, tmp) 53 | } 54 | } 55 | } 56 | if (snv_only) { 57 | rbind(snv, mnv2snv) 58 | } 59 | else { 60 | rbind(snv, mnv2snv, indel) 61 | } 62 | } 63 | 64 | add_varid <- function(df) { 65 | if ("Chromosome" %in% colnames(df)) { 66 | df = df %>% mutate(id = paste(Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep="_")) 67 | } else if ("chrom" %in% colnames(df)) { 68 | df = df %>% mutate(id = paste(chrom, ref_pos, ref, alt, sep="_")) 69 | } 70 | df 71 | } 72 | 73 | convert_chr_to_numeric <- function(chr) { 74 | chr <- gsub("chr", "", chr) # Remove the "chr" prefix 75 | 76 | # Convert special cases 77 | if (chr == "X") return(23) 78 | if (chr == "Y") return(24) 79 | if (chr == "M") return(25) 80 | 81 | return(as.numeric(chr)) # For numbered chromosomes, convert to numeric 82 | } 83 | 84 | codec2maf <- function(infile, outfile, q60rate, Nrate, min_frag_indel, min_dist_indel, max_indel_len, breakmnv, pairedOnly, countByFragId) { 85 | codec = fread(infile) 86 | print(paste(nrow(codec %>% filter(type != "SNV")), "fragments contain an INDEL")) 87 | print(paste(nrow(codec %>% 
filter(type == "SNV")), "frag contains SNV")) 88 | #codec = codec %>% filter(numQ60/olen >= q60rate & numN/flen <= Nrate) 89 | if (pairedOnly) { 90 | codec = codec %>% filter(flen != 0) 91 | } 92 | codec = codec %>% filter(numQpass/clen >= q60rate & (flen == 0 | numN/flen <= Nrate)) 93 | codec = codec %>% filter(type == "SNV" | (flen >= as.integer(min_frag_indel))) 94 | codec = codec %>% filter(type == "SNV" | (dist_to_fragend >= as.integer(min_dist_indel))) 95 | codec = codec %>% mutate(fragid = paste0(flen, "_", dist_to_fragend)) 96 | codec = codec %>% filter(nchar(ref) <= max_indel_len & nchar(alt) <= max_indel_len) 97 | print(paste(nrow(codec %>% filter(type != "SNV")), "frag contains INDEL after filtering")) 98 | print(paste(nrow(codec %>% filter(type == "SNV")), "frag contains SNV after filtering")) 99 | if (breakmnv) { 100 | codec = break_mnv(codec) 101 | } 102 | codec = add_varid(codec) 103 | print(head(codec)) 104 | if (countByFragId) { 105 | codec = codec %>% group_by(id) %>% summarise(chrom = unique(chrom), 106 | ref_pos = unique(ref_pos), 107 | ref = unique(ref), 108 | alt = unique(alt), 109 | type = unique(type), 110 | t_alt_count=length(unique(fragid)), 111 | t_ref_count=unique(site_depth)) 112 | } else { 113 | codec = codec %>% group_by(id) %>% summarise(chrom = unique(chrom), 114 | ref_pos = unique(ref_pos), 115 | ref = unique(ref), 116 | alt = unique(alt), 117 | type = unique(type), 118 | t_alt_count=length(unique(read_name)), 119 | t_ref_count=unique(site_depth)) 120 | } 121 | codec = codec[, 2:ncol(codec)] 122 | codec$chrid = sapply(codec$chrom, convert_chr_to_numeric) 123 | codec = arrange(codec, chrid, ref_pos) 124 | codec = codec[, 1:(ncol(codec) - 1)] 125 | colnames(codec)[1:5] = c("Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Type") 126 | codec = codec %>% mutate(Variant_Type = ifelse(Variant_Type=="SNV", "SNP", Variant_Type)) 127 | codec$Hugo_Symbol = "Unknown" 128 | codec$Tumor_Sample_Barcode = "TUMOR" 129 | #codec$tumor_f=0.5 130 | codec = codec %>% mutate(t_ref_count = t_ref_count - t_alt_count) 131 | codec = codec %>% select("Hugo_Symbol", "Chromosome", "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Type", "Tumor_Sample_Barcode", "t_alt_count", "t_ref_count") 132 | write_tsv(codec, outfile) 133 | print(paste(nrow(codec %>% filter(Variant_Type == "SNP")), "snvs write to", outfile)) 134 | print(paste(nrow(codec %>% filter(Variant_Type != "SNP")), "indels write to", outfile)) 135 | } 136 | 137 | codec2maf(opt$inputmaf, opt$outputmaf, opt$q60frac, opt$Nfrac, opt$minIndelFragLen, opt$minIndelDist2End, opt$maxIndelLen, opt$breakmnv, opt$pairedOnly, opt$countByFragId) 138 | -------------------------------------------------------------------------------- /snakemake/script/cov_sum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | 8 | logger = logging.getLogger("{}".format(__file__)) 9 | 10 | def process(): 11 | cov_count = {} 12 | for line in sys.stdin: 13 | if line.startswith('REF'): 14 | continue 15 | cols = line.strip().split("\t") 16 | cov = int(cols[2]) 17 | if cov not in cov_count: 18 | cov_count[cov] = 1 19 | else: 20 | cov_count[cov] += 1 21 | 22 | for k in sorted(cov_count.keys()): 23 | print(k, cov_count[k], sep='\t') 24 | 25 | if __name__ == '__main__': 26 | sys.exit(process()) 27 | 28 | -------------------------------------------------------------------------------- 
/snakemake/script/create_maf_from_probe_rg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import sys 5 | import pysam 6 | 7 | logger = logging.getLogger("{}".format(__file__)) 8 | def read_bed(bedfile): 9 | """ Creates generator from bed file or interval_list """ 10 | logger.info("Reading region file...") 11 | interval_list = bedfile.endswith("interval_list") 12 | with open(bedfile, "r") as bed: 13 | for line in bed: 14 | if line.startswith("@"): 15 | continue 16 | line = line.strip() 17 | chrom, start, stop = line.split()[0:3] 18 | start, stop = int(start), int(stop) 19 | if interval_list: 20 | start -= 1 21 | yield chrom, start, stop 22 | 23 | def get_arguments(): 24 | 25 | parser = argparse.ArgumentParser(prog="Create a MAF of probe-midpoint sites", formatter_class=argparse.RawDescriptionHelpFormatter) 26 | parser.add_argument("--bed", type=str, help="bed or interval_list file of probe regions", required=True) 27 | parser.add_argument("--ref", type=str, help="indexed reference fasta", required=True) 28 | args = parser.parse_args() 29 | return args 30 | 31 | def process(opts): 32 | header= ['Hugo_Symbol', 'Chromosome', 'Start_position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'probed'] 33 | reffa = pysam.FastaFile(opts.ref) 34 | print('\t'.join(header)) 35 | for chrom, s, e in read_bed(opts.bed): 36 | halfw = (int(e) - int(s)) // 2 # half the interval width, so start lands on the probe midpoint 37 | start = int(s) + halfw 38 | REF = reffa.fetch(chrom, start, start + 1) 39 | ALT = REF 40 | line = ['NA', chrom, str(start + 1), REF, ALT, '1'] 41 | print('\t'.join(line)) 42 | 43 | if __name__ == '__main__': 44 | sys.exit(process(get_arguments())) 45 | -------------------------------------------------------------------------------- /snakemake/script/downsample_read_families.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import pysam 7 | import random 8 | from random import sample 9 | 10 | def parse_cl_args(): 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("--in_bam", help="input BAM", required=True) 14 | parser.add_argument("--sample_id", help="sample_id for series of bams", required=True) 15 | parser.add_argument("--outdir", help="output directory", default="./") 16 | parser.add_argument("--min_family_size", type = int, help="minimum family size (A+B) for downsampling", default=20) 17 | parser.add_argument("--max_family_size", type = int, help="max family size for outputting a bam", default=20) 18 | parser.add_argument("--min_strand_specific_family_size", type = int, default = 10) 19 | parser.add_argument("--seed", help="seed for random sampling", default=7) 20 | parser.add_argument("--is_codec", help="if is CODEC library", default=False, action='store_true') 21 | 22 | return parser.parse_args() 23 | 24 | class PairEnd: 25 | def __init__(self, aln, is_cds): 26 | self.name = aln.query_name 27 | self.reads = [] 28 | self.fid = None 29 | self.strand = None 30 | self.push(aln, is_cds) 31 | 32 | def push(self, aln, is_cds): 33 | assert(aln.query_name == self.name) 34 | 35 | try: 36 | mitag = aln.get_tag("MI") 37 | except KeyError: 38 | sys.stderr.write(aln.query_name + " has no MI tag\n"); raise # re-raise: the family id below requires the MI tag 39 | if is_cds: 40 | fid = mitag 41 | strand = None 42 | else: 43 | if "/" in mitag: 44 | fid, strand = mitag.split("/") 45 | else: 46 | fid = mitag 47 | strand = None 48 | if self.fid: 49 | assert(self.fid == fid) 50 | else: 51 | self.fid = fid 52 | if strand 
and self.strand: 53 | assert(strand == self.strand) 54 | elif strand: 55 | self.strand = strand 56 | 57 | self.reads.append(aln) 58 | 59 | 60 | class Duplex: 61 | def __init__(self, pairend, is_cds): 62 | self.A_reads = [] 63 | self.B_reads = [] 64 | self.cds_reads = [] 65 | self.fid = None 66 | self.push(pairend, is_cds) 67 | self.is_cds = is_cds 68 | 69 | def sizeA(self): 70 | if self.is_cds: 71 | return len(self.cds_reads) 72 | else: 73 | return len(self.A_reads) 74 | 75 | def sizeB(self): 76 | if self.is_cds: 77 | return len(self.cds_reads) 78 | else: 79 | return len(self.B_reads) 80 | 81 | def size(self): 82 | if self.is_cds: 83 | return len(self.cds_reads) 84 | else: 85 | return self.sizeA() + self.sizeB() 86 | 87 | def push(self, pairend, is_cds): 88 | if self.fid: 89 | assert(self.fid == pairend.fid) 90 | else: 91 | self.fid = pairend.fid 92 | if is_cds: 93 | self.cds_reads.append(pairend) 94 | else: 95 | if pairend.strand == 'A': 96 | self.A_reads.append(pairend) 97 | elif pairend.strand == 'B': 98 | self.B_reads.append(pairend) 99 | else: 100 | raise ValueError(pairend.name + " MI tag malformed\n") 101 | 102 | 103 | def sub_sample(duplex, target_size, is_cds): 104 | assert(duplex.size() >= target_size) 105 | 106 | ret = [] 107 | if is_cds: 108 | draw = sample(list(range(duplex.size())), target_size) 109 | ret = [duplex.cds_reads[ii] for ii in draw] 110 | return ret 111 | 112 | if target_size == 1: 113 | draw = sample(list(range(duplex.size())), target_size) 114 | ret.append(duplex.A_reads[draw[0]] if draw[0] < duplex.sizeA() else duplex.B_reads[draw[0] - duplex.sizeA()]) 115 | return ret 116 | 117 | if duplex.sizeA() < duplex.sizeB(): 118 | if duplex.sizeA() < target_size / 2: 119 | ret = duplex.A_reads 120 | else: 121 | idx = sample(list(range(duplex.sizeA())), int(target_size / 2)) 122 | ret = [duplex.A_reads[ii] for ii in idx] 123 | rest_idx = sample(list(range(duplex.sizeB())), target_size - len(ret)) 124 | ret = ret + [duplex.B_reads[ii] for ii in rest_idx] 125 | else: 126 | if duplex.sizeB() < target_size / 2: 127 | ret = duplex.B_reads 128 | else: 129 | idx = sample(list(range(duplex.sizeB())), int(target_size / 2)) 130 | ret = [duplex.B_reads[ii] for ii in idx] 131 | rest_idx = sample(list(range(duplex.sizeA())), target_size - len(ret)) 132 | ret = ret + [duplex.A_reads[idx] for idx in rest_idx] 133 | 134 | return ret 135 | 136 | 137 | class DuplexFamilyBamsWriter: 138 | def __init__(self, in_bam, max_fs, min_fs, min_sp_fs, outdir, sid, is_codec): 139 | self.max_family_size = max_fs 140 | self.min_family_size_to_downsample = min_fs 141 | self.min_strand_specific_size_to_downsample = min_sp_fs 142 | self.sample_id = sid 143 | self.out_bams = [] 144 | self.is_codec = is_codec 145 | for ii in range(max_fs): 146 | fname = sid + "_" + str(ii + 1) + ".bam" 147 | fpath = os.path.join(outdir, fname) 148 | self.out_bams.append(pysam.AlignmentFile(fpath, "wb", template=in_bam)) 149 | 150 | def write_duplex(self, duplex): 151 | if duplex.size() >= self.min_family_size_to_downsample and \ 152 | duplex.sizeA() >= self.min_strand_specific_size_to_downsample and \ 153 | duplex.sizeB() >= self.min_strand_specific_size_to_downsample: 154 | for fs in range(self.max_family_size): 155 | if duplex.size() > fs: 156 | fragments = sub_sample(duplex, fs + 1, self.is_codec) 157 | for frag in fragments: 158 | for record in frag.reads: 159 | self.out_bams[fs].write(record) 160 | 161 | def close_all(self): 162 | for bam in self.out_bams: 163 | bam.close() 164 | 165 | 166 | def process(opts): 167 | 
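# Overview of the streaming pass below: reads of a pair and pairs of an MI family
# are assumed adjacent in the input (e.g. fgbio GroupReadsByUmi output); fstack
# buffers alignments of the current read pair, dstack buffers read pairs of the
# current family, and each completed duplex is written via DuplexFamilyBamsWriter.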
168 | random.seed(opts.seed) 169 | in_bam = pysam.AlignmentFile(opts.in_bam, "rb") 170 | dstack = [] #duplex stack 171 | fstack = [] #fragment stack 172 | writer = DuplexFamilyBamsWriter(in_bam, opts.max_family_size, opts.min_family_size, opts.min_strand_specific_family_size, 173 | opts.outdir, opts.sample_id, opts.is_codec) 174 | 175 | for aln in in_bam.fetch(until_eof=True): 176 | if fstack: 177 | if fstack[-1].name == aln.query_name: 178 | fstack[-1].push(aln, is_cds=opts.is_codec) 179 | continue 180 | else: 181 | pe = fstack.pop() 182 | # do something 183 | if dstack: 184 | if dstack[-1].fid == pe.fid: 185 | dstack[-1].push(pe, is_cds=opts.is_codec) 186 | else: 187 | dpx = dstack.pop() 188 | writer.write_duplex(dpx) 189 | dpx = Duplex(pe, is_cds=opts.is_codec) 190 | dstack.append(dpx) 191 | else: 192 | dpx = Duplex(pe, is_cds=opts.is_codec) 193 | dstack.append(dpx) 194 | 195 | pe = PairEnd(aln, opts.is_codec) 196 | fstack.append(pe) 197 | 198 | # process the last read 199 | if fstack: 200 | pe = fstack.pop() 201 | # do something 202 | if dstack: 203 | if dstack[-1].fid == pe.fid: 204 | dstack[-1].push(pe, is_cds = opts.is_codec) 205 | else: 206 | dpx = dstack.pop() 207 | writer.write_duplex(dpx) 208 | dpx = Duplex(pe, is_cds=opts.is_codec) 209 | dstack.append(dpx) 210 | 211 | if dstack: 212 | dpx = dstack.pop() 213 | writer.write_duplex(dpx) 214 | 215 | in_bam.close() 216 | writer.close_all() 217 | 218 | 219 | if __name__ == "__main__": 220 | process(parse_cl_args()) 221 | -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/bam_iterator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/bam_iterator.cpython-36.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/bam_iterator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/bam_iterator.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/collect_duplex_metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/collect_duplex_metrics.cpython-36.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/downsampler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/downsampler.cpython-36.pyc -------------------------------------------------------------------------------- 
/snakemake/script/dpx/__pycache__/downsampler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/downsampler.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/intervals.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/intervals.cpython-36.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/__pycache__/intervals.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/CODECsuite/90eb447ecfa2e11cb029c609a48076aa77fc1064/snakemake/script/dpx/__pycache__/intervals.cpython-38.pyc -------------------------------------------------------------------------------- /snakemake/script/dpx/downsampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collections import Counter 3 | 4 | import numpy as np 5 | import sys 6 | 7 | 8 | np.random.seed(10) 9 | 10 | def get_overlap(read, interval_dict): 11 | rchrom = str(read.reference_name) 12 | rstart = read.reference_start 13 | rend = read.reference_end 14 | if rchrom in interval_dict: 15 | olap = interval_dict[rchrom].search(rstart, rend) 16 | if olap: 17 | tstart, tend = olap[0].start, olap[0].end 18 | overlap = min(rend, tend) - max(rstart, tstart) 19 | target = f"{rchrom}:{tstart}-{tend}" 20 | return target, overlap 21 | return None, 0 22 | 23 | class Downsampler: 24 | """ Class to perform downsampling on a list of family IDs 25 | from Fgbio GroupReadsByUmi """ 26 | 27 | def __init__(self, probabilities, min_min_strand_reads, min_max_strand_reads, per_target, interval_dict, is_cds): 28 | self.probabilities = probabilities 29 | self.min_min_strand_reads = min_min_strand_reads 30 | self.min_max_strand_reads = min_max_strand_reads 31 | self.kept_families = defaultdict(list) 32 | self.counts = ( 33 | defaultdict(Counter) 34 | if not per_target 35 | else defaultdict(lambda: defaultdict(Counter)) 36 | ) 37 | self.per_target = per_target 38 | self.interval_dict = interval_dict 39 | self.is_cds = is_cds 40 | 41 | def downsample(self, read_pairs, probability): 42 | """ Downsamples list of read pairs at a given probability """ 43 | duplexes = defaultdict(lambda: defaultdict(lambda: 0)) 44 | summary_counts = defaultdict(lambda: 0) 45 | min_min_strand_reads = self.min_min_strand_reads 46 | min_max_strand_reads = self.min_max_strand_reads 47 | kept_reads = [] 48 | previous_coordinate = set() 49 | for read_pair in read_pairs: 50 | strand = read_pair.strand 51 | family = read_pair.family 52 | if not family: 53 | continue 54 | coordinate_id = read_pair.coordinate_id 55 | if np.random.random() <= probability: 56 | summary_counts["read_pairs"] += 1 57 | if coordinate_id not in previous_coordinate: 58 | summary_counts["cs_families"] += 1 59 | previous_coordinate.add(coordinate_id) 60 | duplexes[family][strand] += 1 61 | if self.is_cds and read_pair.are_ends_overlapped(): 62 | if strand == "A": 63 | duplexes[family]["B"] += 1 64 | elif strand == "B": 65 | duplexes[family]["A"] += 1 66 | #if self.per_target: 67 | 
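# For CODEC (is_cds) pairs whose two ends overlap, the opposite strand was also
# credited above: a single read pair carries evidence for both strands.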
kept_reads.append(read_pair) 68 | for family, count in duplexes.items(): 69 | ss_families = (count["A"] > 0) + (count["B"] > 0) 70 | ds_families = int(ss_families > 0) 71 | summary_counts["ss_families"] += ss_families 72 | summary_counts["ds_families"] += ds_families 73 | if min(count["A"], count["B"]) >= min_min_strand_reads and \ 74 | max(count["A"], count["B"]) >= min_max_strand_reads: 75 | summary_counts["ds_duplexes"] += 1 76 | return summary_counts, kept_reads 77 | 78 | def run_downsamplings(self, reads, serial_sampling=True): 79 | """ When serial sampling is true, we use reads from the sampling of the 80 | next highest probability.""" 81 | probs = np.sort(self.probabilities)[::-1] 82 | adj_probs = probs 83 | if serial_sampling: 84 | # If serial sampling, we need to adjust probability as kept_reads 85 | # is smaller after each sampling. 86 | adj_probs = adj_probs / np.insert(adj_probs, 0, 1)[:-1] 87 | 88 | for actual, prob in zip(probs, adj_probs): 89 | summary_counts, kept_reads = self.downsample(reads, prob) 90 | summary_counts = Counter(summary_counts) 91 | if serial_sampling: 92 | reads = kept_reads 93 | if self.per_target: 94 | if kept_reads: 95 | if kept_reads[0].read1 and kept_reads[0].read2: 96 | target, _ = kept_reads[0].get_overlap(self.interval_dict) 97 | elif kept_reads[0].read1: 98 | target, _ = get_overlap(kept_reads[0].read1, self.interval_dict) 99 | elif kept_reads[0].read2: 100 | target, _ = get_overlap(kept_reads[0].read2, self.interval_dict) 101 | if not target: 102 | print(kept_reads[0].read1, "\n", kept_reads[0].read2) 103 | assert(target) 104 | self.counts[target][actual] = ( 105 | self.counts[target][actual] + summary_counts 106 | ) 107 | else: 108 | self.counts[actual] = self.counts[actual] + summary_counts 109 | -------------------------------------------------------------------------------- /snakemake/script/dpx/get_mutant_metrics.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import click 3 | import pandas as pd 4 | import pysam 5 | 6 | from bam_iterator import iterate_pileup_reads, get_unclipped_read_ends, get_unclipped_fragment_ends, keep_read, is_proper_orientation 7 | 8 | 9 | class TargetMolecule: 10 | def __init__( 11 | self, pileupread, target_pos, target_ref_base, target_alt_base 12 | ): 13 | self.pileupread = pileupread 14 | self.target_pos = target_pos 15 | self.target_ref_base = target_ref_base 16 | self.target_alt_base = target_alt_base 17 | self.read = pileupread.alignment 18 | self.chrom = self.read.reference_name 19 | self.read_start, self.read_end = get_unclipped_read_ends( 20 | start = self.read.reference_start, cigar = self.read.cigarstring 21 | ) 22 | self.read_sequence = self.read.query_sequence 23 | self.base = self.read.query_sequence[pileupread.query_position] 24 | self.start, self.end = get_unclipped_fragment_ends( 25 | read1_chrom = self.chrom, 26 | read1_start = self.read.reference_start, 27 | read1_cigar = self.read.cigarstring, 28 | read1_reverse = self.read.is_reverse, 29 | read2_chrom = self.read.next_reference_name, 30 | read2_start = self.read.next_reference_start, 31 | read2_cigar = self.get_read_tag("MC"), 32 | read2_reverse = self.read.mate_is_reverse 33 | ) 34 | self.mapping_quality = self.read.mapping_quality 35 | self.base_quality = self.read.query_qualities[pileupread.query_position] 36 | self.family_id = self.get_read_tag("MI") 37 | self.family_size = self.get_read_tag("cD") 38 | self.umi = self.get_read_tag("RX") 39 | self.overlap = False 
40 | self.discordant = False 41 | self.mismatch_count = self.get_read_tag( 42 | "NM" 43 | ) - pileupread.alignment.query_alignment_sequence.count( 44 | "N" 45 | ) # added by Chris 46 | self.N_count = pileupread.alignment.query_alignment_sequence.count( 47 | "N" 48 | ) # added by Chris 49 | 50 | def __str__(self): 51 | args = [ 52 | f'{k}="{v}"' 53 | for k, v in self.__dict__.items() 54 | if k not in ["pileupread", "read"] 55 | ] 56 | args = ", ".join(args) 57 | return f"{self.__class__.__name__}({args})" 58 | 59 | def get_read_tag(self, tag, default="NA"): 60 | try: 61 | return self.read.get_tag(tag) 62 | except KeyError: 63 | return default 64 | 65 | def get_mapping_qualities(self): 66 | """ Returns tuple: (read1.mapping_quality, read2.mapping_quality) """ 67 | res = (self.read.mapping_quality, self.get_read_tag("MQ")) 68 | if not self.read.is_read1: 69 | res = res[::-1] 70 | return res 71 | 72 | @property 73 | def aD_bD(self): 74 | """ aD and bD tags (sorted so max is first) """ 75 | ad, bd = self.get_read_tag("aD"), self.get_read_tag("bD") 76 | ad, bd = sorted([int(ad), int(bd)], reverse=True) 77 | return f"{ad}/{bd}" 78 | 79 | @property 80 | def insert_size(self): 81 | """ Return insert size with unclipped ends taken into account """ 82 | return self.end - self.start 83 | 84 | @property 85 | def read_position(self): 86 | """ Returns 5' position of base in read """ 87 | if self.read.is_reverse: 88 | pos = self.read.infer_read_length() - self.pileupread.query_position 89 | else: 90 | pos = self.pileupread.query_position + 1 91 | return pos 92 | 93 | @property 94 | def distance_from_5_prime(self): 95 | """ Returns distance from 5' end of fragment. In cases 96 | where reads overlap, 5' distance from read may not equal 5' 97 | distance from end of fragment """ 98 | return min( 99 | [self.read_position, abs(self.insert_size) - self.read_position + 1] 100 | ) 101 | 102 | @property 103 | def target_site(self): 104 | """ Returns string representing target site """ 105 | return f"{self.chrom}:{self.target_pos}" 106 | 107 | @property 108 | def fragment_id(self): 109 | """ Returns string representing fragment_id """ 110 | return f"{self.chrom}:{self.start + 1}-{self.end}" 111 | 112 | @property 113 | def molecule_class(self): 114 | """ Return string representing molecule class """ 115 | if self.base == self.target_ref_base: 116 | return "REF" 117 | elif self.base == self.target_alt_base: 118 | return "ALT" 119 | else: 120 | return "OTHER" 121 | 122 | 123 | def parse_maf(maf_file): 124 | """ Reads in MAF file and yields a Variant namedtuple """ 125 | infile = pd.read_table(maf_file) 126 | headers = infile.columns 127 | Variant = namedtuple("VariantSite", headers) 128 | for idx, row in infile.iterrows(): 129 | yield Variant(*row) 130 | 131 | 132 | def pass_filter(pileupread): 133 | """ Make sure read can be used """ 134 | if ( 135 | not pileupread.is_del 136 | and not pileupread.is_refskip 137 | and keep_read(pileupread.alignment) 138 | and is_proper_orientation( 139 | read1_chrom = pileupread.alignment.reference_name, 140 | read1_start = pileupread.alignment.reference_start, 141 | read1_reverse = pileupread.alignment.is_reverse, 142 | read2_chrom = pileupread.alignment.next_reference_name, 143 | read2_start = pileupread.alignment.next_reference_start, 144 | read2_reverse = pileupread.alignment.mate_is_reverse 145 | ) 146 | ): 147 | return True 148 | return False 149 | 150 | 151 | def check_discordance(molecule, other_molecule): 152 | """ Check discordance between two molecules and update """ 153 | 
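# Both mates are flagged as overlapping; if their bases disagree at the target
# position, both are marked discordant and set to "N" (classed as OTHER).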
molecule.overlap = True 154 | other_molecule.overlap = True 155 | if molecule.base != other_molecule.base: 156 | molecule.discordant = True 157 | molecule.base = "N" 158 | other_molecule.discordant = True 159 | other_molecule.base = "N" 160 | 161 | 162 | def target_molecule_generator( 163 | variant_site, bam, mapping_quality, base_quality, output_both=False 164 | ): 165 | """ For a given variant site, a pileup will be created from the BAM. 166 | Each read overlapping the variant site will be yielded if it 167 | passes filters 168 | Output both: output both reads if both overlap base """ 169 | chrom = str(variant_site.Chromosome) 170 | start = int(variant_site.Start_position) 171 | end = start 172 | ref_base = variant_site.Reference_Allele 173 | alt_base = variant_site.Tumor_Seq_Allele2 174 | molecule_dict = {} 175 | for pileupread in iterate_pileup_reads( 176 | bam, 177 | chrom, 178 | start - 1, 179 | end, 180 | stepper="nofilter", 181 | min_mapping_quality=mapping_quality, 182 | min_base_qual=base_quality, 183 | truncate=True, 184 | max_depth=1000000, 185 | noun="pileup reads", 186 | verb="processed", 187 | log_every=100000, 188 | ): 189 | read_name = pileupread.alignment.query_name 190 | if pass_filter(pileupread): 191 | molecule = TargetMolecule(pileupread, end, ref_base, alt_base) 192 | if read_name not in molecule_dict: 193 | molecule_dict[read_name] = [molecule] 194 | else: 195 | other_molecule = molecule_dict[read_name][0] 196 | check_discordance(molecule, other_molecule) 197 | molecule_dict[read_name].append(molecule) 198 | 199 | for _, molecules in molecule_dict.items(): 200 | if output_both: 201 | for molecule in molecules: 202 | yield molecule 203 | else: 204 | yield molecules[0] 205 | 206 | 207 | @click.command(context_settings=dict(help_option_names=["-h", "--help"])) 208 | @click.option( 209 | "-m", 210 | "--maf_file", 211 | type=click.Path(exists=True), 212 | help="MAF describing variants", 213 | required=True, 214 | ) 215 | @click.option( 216 | "-b", 217 | "--bam_file", 218 | type=click.Path(exists=True), 219 | help="Input DSC/SSC consensus BAM file", 220 | ) 221 | @click.option( 222 | "-f", 223 | "--file_list", 224 | type=click.Path(exists=True), 225 | help="Run on list of BAM files", 226 | ) 227 | @click.option( 228 | "-o", 229 | "--output", 230 | type=click.Path(writable=True), 231 | help="Output file [default: BAM prefix]", 232 | default=None, 233 | ) 234 | @click.option( 235 | "--mapq", 236 | type=int, 237 | help="Minimum mapping quality", 238 | default=60, 239 | show_default=True, 240 | ) 241 | @click.option( 242 | "--baseq", 243 | type=int, 244 | help="Minimum base quality", 245 | default=89, 246 | show_default=True, 247 | ) 248 | @click.option( 249 | "--mutant_only", is_flag=True, help="Only output mutant fragments" 250 | ) 251 | def create_summary_file( 252 | bam_file, maf_file, output, mapq, baseq, file_list, mutant_only 253 | ): 254 | """ Summarize consensus molecules at variant sites. 
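Accepts one BAM via --bam_file or several via --file_list; each BAM produces a <prefix>_pileup_summary.txt with one row per passing molecule at each MAF site.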
""" 255 | if file_list: 256 | bam_files = [x.strip() for x in open(file_list, "r").readlines()] 257 | outputs = [ 258 | bam.split("/")[-1].replace(".bam", "_pileup_summary.txt") 259 | for bam in bam_files 260 | ] 261 | else: 262 | bam_files = [bam_file] 263 | outputs = [ 264 | output 265 | if output 266 | else bam_file.split("/")[-1].replace(".bam", "_pileup_summary.txt") 267 | ] 268 | 269 | for bam_file, output in zip(bam_files, outputs): 270 | print(f"Working on: {bam_file}") 271 | outfile = open(output, "w") 272 | headers = [ 273 | "family_id", 274 | "family_size", 275 | "umi", 276 | "fragment_id", 277 | "target_site", 278 | "molecule_class", 279 | "base", 280 | "read_position", 281 | "distance_from_5_prime", 282 | "discordant", 283 | "mismatch_count", 284 | "N_count", 285 | ] 286 | outfile.write("\t".join(headers) + "\n") 287 | 288 | bam = pysam.AlignmentFile(bam_file) 289 | for variant_site in parse_maf(maf_file): 290 | for target_molecule in target_molecule_generator( 291 | variant_site, bam, mapq, baseq 292 | ): 293 | if mutant_only: 294 | if target_molecule.molecule_class != "ALT": 295 | continue 296 | outfile.write( 297 | "\t".join( 298 | str(target_molecule.__getattribute__(x)) 299 | for x in headers 300 | ) 301 | + "\n" 302 | ) 303 | total_reads = iterate_pileup_reads.counts 304 | print(f"Processed {total_reads} consensus reads") 305 | bam.close() 306 | 307 | 308 | if __name__ == "__main__": 309 | create_summary_file() 310 | -------------------------------------------------------------------------------- /snakemake/script/dpx/intervals.py: -------------------------------------------------------------------------------- 1 | from quicksect import IntervalTree 2 | from collections import defaultdict 3 | from math import ceil 4 | def read_bed(bedfile): 5 | """ Creates generator from bed file or interval_list """ 6 | interval_list = bedfile.endswith("interval_list") 7 | with open(bedfile, "r") as bed: 8 | for line in bed: 9 | if line.startswith("@"): 10 | continue 11 | line = line.strip() 12 | chrom, start, stop = line.split()[0:3] 13 | start, stop = int(start), int(stop) 14 | if interval_list: 15 | start -= 1 16 | yield chrom, start, stop 17 | 18 | 19 | def create_interval_dict_from_bed(bedfile, midpoint=False): 20 | """ 21 | Used for marking on/off target fragments by creating interval 22 | trees for each chromosome in bedfile. 
23 | """ 24 | interval_dict = defaultdict(IntervalTree) 25 | if bedfile: 26 | for chrom, start, stop in read_bed(bedfile): 27 | if midpoint: 28 | mid = ceil((start+stop)/2) 29 | start = mid - 1 30 | stop = mid 31 | interval_dict[str(chrom)].add(start, stop) 32 | return interval_dict 33 | -------------------------------------------------------------------------------- /snakemake/script/error_rate_by_family_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import matplotlib.ticker as mtick 10 | from matplotlib.ticker import MaxNLocator 11 | import seaborn as sns 12 | sns.set(font_scale=2) 13 | logger = logging.getLogger("{}".format(__file__)) 14 | 15 | def get_arguments(): 16 | 17 | parser = argparse.ArgumentParser(prog="plot error rates stratified by family size", formatter_class=argparse.RawDescriptionHelpFormatter) 18 | parser.add_argument("accu", type=str, help="input accu file") 19 | args = parser.parse_args() 20 | return args 21 | 22 | def absolute_path(path): 23 | """convert relative path to absolute path""" 24 | if os.path.isabs(path): 25 | return path 26 | else: 27 | return os.path.join(os.getcwd(), path) 28 | 29 | def process(options): 30 | fsize_accu = {} 31 | with open(options.accu, 'r') as fh: 32 | for line in fh: 33 | cols = line.strip().split('\t') 34 | if cols[1] == "chrY" or cols[1] == "chrX": 35 | continue 36 | fields = cols[0].split('_') 37 | fsize = int(fields[-1]) 38 | if fsize not in fsize_accu: 39 | fsize_accu[fsize] = [int(cols[3]), int(cols[4]), 1] # count the first family of this size 40 | else: 41 | fsize_accu[fsize][0] += int(cols[3]) 42 | fsize_accu[fsize][1] += int(cols[4]) 43 | fsize_accu[fsize][2] += 1 44 | 45 | accus = [0] * 10 46 | counts = [0] * 10 47 | for k,v in fsize_accu.items(): 48 | if k <= 10: 49 | print(k, v[0], v[1], v[1]/v[0], v[2]) 50 | accus[k-1] = v[1]/v[0] * 1e5 51 | counts[k-1] = v[2] * 1e-6 52 | fig, ax = plt.subplots(figsize=(11.7, 8.27)) 53 | ax.bar(np.arange(10) + 1, accus) 54 | ax.xaxis.set_major_locator(MaxNLocator(integer=True)) 55 | plt.xlabel("Family size") 56 | plt.ylabel("Error per 100k") 57 | fig.savefig('error_rate_by_family_size.png') 58 | 59 | fig1, ax1 = plt.subplots(figsize=(11.7, 8.27)) 60 | ax1.bar(np.arange(10) + 1, counts) 61 | ax1.xaxis.set_major_locator(MaxNLocator(integer=True)) 62 | #ax1.ticklabel_format(axis='y', useMathText=True) 63 | plt.xlabel("Family size") 64 | plt.ylabel("Million fragments") 65 | fig1.savefig('num_frag_family_size.png') 66 | 67 | if __name__ == '__main__': 68 | sys.exit(process(get_arguments())) 69 | 70 | -------------------------------------------------------------------------------- /snakemake/script/extract_false_positive_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import sys 5 | import pysam 6 | import subprocess 7 | import argparse 8 | import os 9 | 10 | def get_arguments(): 11 | 12 | parser = argparse.ArgumentParser(prog="grep false positive reads and print from gbu bam. 
Additionally can print fastq files", 13 | formatter_class=argparse.RawDescriptionHelpFormatter) 14 | parser.add_argument("--fq1", type=str, help="raw fastq1 file", default="") 15 | parser.add_argument("--fq2", type=str, help="raw fastq2 file", default="") 16 | parser.add_argument("bam", type=str, help="groupby_umi_bam") 17 | parser.add_argument('df', type=str, help="error family file") 18 | parser.add_argument('prefix', type=str, help="prefix for output") 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | def process(options): 24 | prefix_id = options.prefix 25 | 26 | df = pd.read_csv(options.df, sep="\t", low_memory=False) 27 | new = df['family_id'].str.split('/', n = 1, expand = True) 28 | df['family_num'] = new[0] 29 | df['strand'] = new[1] 30 | df = df.astype({'family_num' : 'int64'}) 31 | df = df.sort_values(by=['family_num', 'strand']) 32 | df = df[df['pass_filter'] == 1] 33 | gbubam = pysam.AlignmentFile(options.bam, "rb") 34 | tmpbam = pysam.AlignmentFile(prefix_id + ".tmp.bam", "wb", template=gbubam) 35 | f = open(prefix_id + ".run", 'w') 36 | rowit = df.iterrows() 37 | cur_row = next(rowit)[1] 38 | in_match = False 39 | counter = 0 40 | for aln in gbubam.fetch(until_eof=True): 41 | mitag = aln.get_tag("MI") 42 | if mitag == cur_row['family_id']: 43 | if not in_match: 44 | print(cur_row['family_id']) 45 | in_match = True 46 | counter = 0 47 | if options.fq1: 48 | if aln.is_read1: 49 | counter += 1 50 | cmd = ["zegrep", "-A 3", "-m 1", aln.query_name, options.fq1, ">>", prefix_id + ".1.fastq #" + mitag] 51 | print(" ".join(cmd), file=f) 52 | if options.fq2: 53 | if aln.is_read2: 54 | cmd = ["zegrep", "-A 3", "-m 1", aln.query_name, options.fq2, ">>", prefix_id + ".2.fastq #" + mitag] 55 | print(" ".join(cmd), file=f) 56 | tmpbam.write(aln) 57 | if in_match and mitag != cur_row['family_id']: 58 | print("matched", counter, "times") 59 | in_match = False 60 | try: 61 | cur_row = next(rowit)[1] 62 | except StopIteration: 63 | break 64 | tmpbam.close() 65 | cmd = "samtools sort {0} -o {1} && samtools index {1} && rm {0}".format(prefix_id + ".tmp.bam", prefix_id + ".bam") 66 | subprocess.check_call(cmd, shell=True) 67 | if __name__ == '__main__': 68 | sys.exit(process(get_arguments())) 69 | -------------------------------------------------------------------------------- /snakemake/script/familysize_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pysam 8 | import pandas as pd 9 | from collections import defaultdict 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import matplotlib.ticker as plticker 13 | import seaborn as sns 14 | from scipy import stats 15 | 16 | logger = logging.getLogger("{}".format(__file__)) 17 | 18 | def read_pair_generator(bam, region_string=None): 19 | """ 20 | Generate read pairs in a BAM file or within a region string. 21 | Reads are added to read_dict until a pair is found. 
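Usage sketch (the BAM path is hypothetical; the generator itself skips secondary and supplementary alignments, and overlap_len is defined later in this module):

    bam = pysam.AlignmentFile("sample.bam", "rb")
    for read1, read2 in read_pair_generator(bam):
        print(read1.query_name, overlap_len(read1, read2))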
22 | """ 23 | read_dict = defaultdict(lambda: [None, None]) 24 | for read in bam.fetch(until_eof=True): 25 | if read.is_secondary or read.is_supplementary: 26 | continue 27 | qname = read.query_name 28 | if qname not in read_dict: 29 | if read.is_read1: 30 | read_dict[qname][0] = read 31 | else: 32 | read_dict[qname][1] = read 33 | else: 34 | if read.is_read1: 35 | yield read, read_dict[qname][1] 36 | else: 37 | yield read_dict[qname][0], read 38 | del read_dict[qname] 39 | 40 | def read_bed(bedfile): 41 | """ Creates generator from bed file or interval_list """ 42 | logger.info("Reading region file...") 43 | interval_list = bedfile.endswith("interval_list") 44 | with open(bedfile, "r") as bed: 45 | for line in bed: 46 | if line.startswith("@"): 47 | continue 48 | line = line.strip() 49 | chrom, start, stop = line.split()[0:3] 50 | start, stop = int(start), int(stop) 51 | if interval_list: 52 | start -= 1 53 | yield chrom, start, stop 54 | 55 | def is_overlap(read1, read2): 56 | if read1.is_unmapped or read2.is_unmapped: 57 | return False 58 | if read1.reference_name != read2.reference_name: 59 | return False 60 | if read1.reference_start < read2.reference_end and read2.reference_start < read1.reference_end: 61 | return True 62 | return False 63 | def overlap_len(read1, read2): 64 | if not is_overlap(read1, read2): 65 | return 0 66 | else: 67 | return min(read1.reference_end, read2.reference_end) - max(read1.reference_start, read2.reference_start) 68 | 69 | def overlap_span_ratio(read1, read2): 70 | ol = overlap_len(read1, read2) 71 | if ol == 0: 72 | return 0 73 | else: 74 | span = max(read1.reference_end, read2.reference_end) - min(read1.reference_start, read2.reference_start) 75 | return ol/span 76 | 77 | def get_arguments(): 78 | 79 | parser = argparse.ArgumentParser(prog="familysize_dist", formatter_class=argparse.RawDescriptionHelpFormatter) 80 | parser.add_argument("bam", type=str, help="input bam file") 81 | parser.add_argument("bed", type=str, help="target bed") 82 | parser.add_argument("--im_dist_cutoff", default=5000, help="min distance for reads to be considered intermolecular", type=int, required=False) 83 | args = parser.parse_args() 84 | return args 85 | 86 | def absolute_path(path): 87 | """convert relative path to absolute path""" 88 | if os.path.isabs(path): 89 | return path 90 | else: 91 | return os.path.join(os.getcwd(), path) 92 | 93 | def bin_counts(vec, cut): 94 | cat = pd.cut(np.array(vec), cut) 95 | print(cat.value_counts()) 96 | 97 | def cdf_plot(vec, bins, figure, cumulative, xlab): 98 | #values, base = np.histogram(vec, bins=bins) 99 | #cumulative = np.cumsum(values) 100 | #plt.plot(base[:-1], cumulative, c='blue') 101 | fig, ax = plt.subplots() 102 | ax.hist(vec, bins, density=True, cumulative=cumulative, histtype='step') 103 | ax.set_axisbelow(True) 104 | ax.yaxis.grid(color='gray', linestyle='dashed') 105 | ax.xaxis.grid(color='gray', linestyle='dashed') 106 | #loc = plticker.MultipleLocator(base=(max(vec) - min(vec)) / 10) # this locator puts ticks at regular intervals 107 | #ax.xaxis.set_major_locator(loc) 108 | #ax.yaxis.set_major_locator(loc) 109 | ax.set_xlabel(xlab) 110 | dir = " < " if cumulative == 1 else " > " # cumulative=1 accumulates the fraction <= x; cumulative=-1 reverses the direction 111 | ax.set_ylabel("cumulative fraction of reads" + dir + xlab) 112 | ax.set_title("number of reads " + str(len(vec))) 113 | fig.savefig(figure) 114 | plt.close() 115 | 116 | 117 | class CdsStat: 118 | def __init__(self, sample_id): 119 | self.sample_id = sample_id 120 | self.total_frag = 0 121 | self.num_intermol = 0 # intermolecular 122 | self.num_overlap = 0 123 | 
self.read_len_diff = [] 124 | self.aln_len_diff = [] 125 | self.ol_ratios = [] 126 | 127 | def plot_read_len_diff(self): 128 | cdf_plot(self.read_len_diff, 100, "read_len_diff.png", 1, "read len diff") 129 | 130 | def plot_ol_ratios(self): 131 | cdf_plot(self.ol_ratios, 100, "ol_ratios.png", -1, "overlap ratio") 132 | 133 | def __str__(self): 134 | return f"{self.sample_id}\t{self.total_frag}\t{self.num_intermol}\t{self.num_overlap}\t{self.num_intermol/self.total_frag}\t{self.num_overlap/self.total_frag}" 135 | 136 | 137 | 138 | def process(options): 139 | samfile = pysam.AlignmentFile(options.bam, "rb") 140 | fid_size_dict = {} 141 | for chrom, start, end in read_bed(options.bed): 142 | for read in samfile.fetch(chrom, start, end): 143 | if read.is_unmapped or read.mate_is_unmapped: 144 | continue 145 | if read.is_read1 and read.has_tag('UG'): 146 | famid = read.get_tag('UG') 147 | intermol = False 148 | if not read.is_proper_pair or read.template_length == 0 or read.template_length > options.im_dist_cutoff: 149 | intermol = True 150 | if famid not in fid_size_dict: 151 | fid_size_dict[famid] = [intermol, 1] 152 | else: 153 | assert(intermol == fid_size_dict[famid][0]) 154 | fid_size_dict[famid][1] += 1 155 | is_intermol, sizes = zip(*fid_size_dict.values()) 156 | d = {'family_id' : list(fid_size_dict.keys()), 'is_intermol' : list(is_intermol), 'size': list(sizes)} 157 | 158 | 159 | if __name__ == '__main__': 160 | sys.exit(process(get_arguments())) 161 | 162 | -------------------------------------------------------------------------------- /snakemake/script/fastqsplit.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # @author Luis M. Rodriguez-R 4 | # @license artistic license 2.0 5 | # @update Jul-05-2015 6 | # Round robin split 7 | 8 | ### modified by Ruolin Liu to read the file from stdin 9 | use warnings; 10 | use strict; 11 | use Symbol; 12 | 13 | #my ($file, $base, $outN) = @ARGV; 14 | my ($base, $outN) = @ARGV; 15 | 16 | $outN ||= 2; 17 | #($file and $base) or die " 18 | ($base) or die " 19 | Usage 20 | $0 out_base [no_files] < in_file.fq 21 | 22 | in_file.fq Input file in FastQ format, read from STDIN. 23 | out_base Prefix for the name of the output files. It will 24 | be appended with .<i>.fastq, where <i> is a consecutive 25 | number starting in 1. 26 | no_files Number of files to generate. By default: 2. 27 | "; 28 | 29 | 30 | my @outSym = (); 31 | for my $i (1 .. 
$outN){ 32 | $outSym[$i-1] = gensym; 33 | open $outSym[$i-1], ">", "$base.$i.fastq" or die "I can not create the file: $base.$i.fastq: $!\n"; 34 | } 35 | 36 | 37 | my($i, $seq) = (-1, ''); 38 | #open FILE, "<", $file or die "I can not read the file: $file: $!\n"; 39 | while(my $ln=<STDIN>){ 40 | if($.%4 == 1){ 41 | print { $outSym[$i % $outN] } $seq if $seq; 42 | $i++; 43 | $seq = ''; 44 | } 45 | $seq.=$ln; 46 | } 47 | print { $outSym[$i % $outN] } $seq if $seq; 48 | #close FILE; 49 | 50 | for(my $j=0; $j<$outN; $j++){ 51 | close $outSym[$j]; 52 | } 53 | 54 | print STDERR "Sequences: ".($i+1)."\nFiles: $outN\n"; 55 | -------------------------------------------------------------------------------- /snakemake/script/generate_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pandas as pd 8 | from Bio.Seq import reverse_complement 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | """ 13 | Simulate CDS reads from the xlsx file provided by Jin 14 | """ 15 | def get_arguments(): 16 | 17 | parser = argparse.ArgumentParser(prog="generate_reads", formatter_class=argparse.RawDescriptionHelpFormatter) 18 | parser.add_argument("xlsx", type=str, help="input xlsx file") 19 | parser.add_argument("--readlen", type=int, default=300, help="read len") 20 | args = parser.parse_args() 21 | return args 22 | 23 | def absolute_path(path): 24 | """convert relative path to absolute path""" 25 | if os.path.isabs(path): 26 | return path 27 | else: 28 | return os.path.join(os.getcwd(), path) 29 | 30 | def print_fastq(name, seq, filename): 31 | filename.write("@"+name) 32 | filename.write("\n") 33 | filename.write(seq) 34 | filename.write("\n") 35 | filename.write("+\n") 36 | filename.write("I" * len(seq)) 37 | filename.write("\n") 38 | 39 | def generate_read(template, readlen): 40 | read1 = None 41 | read2 = None 42 | if len(template) > readlen: 43 | ss = len(template) - readlen 44 | read1 = template[:readlen] 45 | read2 = reverse_complement(template[ss:]) 46 | else: 47 | read1 = template 48 | read2 = reverse_complement(template) 49 | return (read1,read2) 50 | 51 | def process(options): 52 | xl = pd.ExcelFile(options.xlsx) 53 | fastq1 = open("read1.fastq", "w") 54 | fastq2 = open("read2.fastq", "w") 55 | rl = options.readlen 56 | 57 | for sid, sheetname in enumerate(xl.sheet_names): 58 | sheet = xl.parse(sheetname) 59 | for cid, col in enumerate(sheet.columns): 60 | for rid, row in enumerate(sheet[col].tolist()): 61 | if rid != 0: continue 62 | row = row.strip(" '") 63 | n1 = "test_" + str(sid) + "_" + str(cid) + "_" + str(rid) 64 | n2 = "test_" + str(sid) + "_" + str(cid) + "_" + str(rid) 65 | read1, read2 = generate_read(row, rl) 66 | print_fastq(n1, read1, fastq1) 67 | print_fastq(n2, read2, fastq2) 68 | 69 | #row_rc = reverse_complement(row) 70 | #n1 = "test_rc" + str(sid) + "_" + str(cid) + "_" + str(rid) 71 | #n2 = "test_rc" + str(sid) + "_" + str(cid) + "_" + str(rid) 72 | #read1, read2 = generate_read(row_rc, rl) 73 | #print_fastq(n1, read1, fastq1) 74 | #print_fastq(n2, read2, fastq2) 75 | 76 | if __name__ == '__main__': 77 | sys.exit(process(get_arguments())) 78 | -------------------------------------------------------------------------------- /snakemake/script/get_midpoint_from_interval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from math import ceil 4 | 5 | def read_bed(bedfile): 6 
| """ Creates generator from bed file or interval_list """ 7 | interval_list = bedfile.endswith("interval_list") 8 | with open(bedfile, "r") as bed: 9 | for line in bed: 10 | if line.startswith("@"): 11 | continue 12 | line = line.strip() 13 | chrom, start, stop = line.split()[0:3] 14 | start, stop = int(start), int(stop) 15 | if interval_list: 16 | start -= 1 17 | yield chrom, start, stop 18 | 19 | def get_arguments(): 20 | 21 | parser = argparse.ArgumentParser(prog="get midpoint from interval and print like a bed file", formatter_class=argparse.RawDescriptionHelpFormatter) 22 | parser.add_argument("bed", type=str, help="interval list or bed file") 23 | parser.add_argument("out", type=str, help="output for bed file") 24 | args = parser.parse_args() 25 | return args 26 | 27 | def process(options): 28 | outh = open(options.out, 'w') 29 | for chrom, start, stop in read_bed(options.bed): 30 | mid = ceil((start+stop)/2) 31 | print(chrom, mid-1, mid, sep='\t', file=outh) 32 | 33 | if __name__ == '__main__': 34 | sys.exit(process(get_arguments())) 35 | -------------------------------------------------------------------------------- /snakemake/script/maf2vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pandas as pd 8 | import subprocess as sp 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | def get_arguments(): 13 | 14 | parser = argparse.ArgumentParser(prog="convert maf to vcf file", formatter_class=argparse.RawDescriptionHelpFormatter) 15 | parser.add_argument("maf", type=str, help="input maf file") 16 | parser.add_argument("-r", "--ref_fasta", required=True, help="reference fasta") 17 | parser.add_argument("-p", "--path_to_maf2vcf", default = "/usr/bin/maf2vcf.pl" , type=str, help="path to the conversion script") 18 | parser.add_argument("-n", "--t_alt", type=int, default=1, help="min t_alt (1)") 19 | parser.add_argument("-O", "--out_type", choices=['vcf', 'maf'], default="vcf", help="output type") 20 | parser.add_argument("-o", "--out_dir", default="./", help="output dir") 21 | parser.add_argument("-s", "--sample_name", default="", help="sample name") 22 | parser.add_argument("-d", "--max_del_len", type=int, default=1e9, help="maximum deletion length allowed") 23 | parser.add_argument("-i", "--min_indel_len", type=int, default=1, help="min indel length allowed") 24 | args = parser.parse_args() 25 | return args 26 | 27 | 28 | def process(options): 29 | maf = pd.read_csv(options.maf, comment="#", sep='\t', low_memory=False) 30 | print("input variant count:", maf.shape[0]) 31 | if 't_alt_count' in maf.columns: 32 | maf = maf[maf['t_alt_count'] >= options.t_alt] 33 | if options.max_del_len < 1e9: 34 | maf = maf[maf['Reference_Allele'].str.len() <= options.max_del_len + 1] 35 | if options.min_indel_len > 1: 36 | maf = maf[abs(maf['Reference_Allele'].str.len() - maf['Tumor_Seq_Allele2'].str.len()) >= options.min_indel_len] 37 | print("output variant count:", maf.shape[0]) 38 | if not options.sample_name: 39 | sname = os.path.basename(options.maf) 40 | sname = sname[:-4] 41 | else: 42 | sname = options.sample_name 43 | outmaf = os.path.join(options.out_dir, sname+".filtered.maf") 44 | outtsv = os.path.join(options.out_dir, sname+".filtered.pairs.tsv") 45 | #maf = maf[["Chromosome", "Start_Position", "End_Position", 46 | # "Variant_Type", "Tumor_Sample_Barcode", "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count", "t_ref_count", 
"n_alt_count", "n_ref_count"]] 47 | maf = maf[["Chromosome", "Start_Position", "Variant_Type", "Tumor_Sample_Barcode", "Reference_Allele", "Tumor_Seq_Allele2", "t_alt_count"]] 48 | maf.to_csv(outmaf, sep='\t', index=False) 49 | if options.out_type == "vcf": 50 | cmd=f"perl {options.path_to_maf2vcf} --input-maf {outmaf} --output-dir {options.out_dir} --ref-fasta {options.ref_fasta}" 51 | print(f"converting to vcf: {cmd}") 52 | sp.check_output(cmd, shell=True) 53 | sp.check_output(f"rm {outmaf}", shell=True) 54 | sp.check_output(f"rm {outtsv}", shell=True) 55 | 56 | 57 | if __name__ == '__main__': 58 | sys.exit(process(get_arguments())) 59 | -------------------------------------------------------------------------------- /snakemake/script/msisensor_combine_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import sys 5 | import numpy as np 6 | import os 7 | import pprint 8 | from collections import defaultdict 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | def get_arguments(): 13 | 14 | parser = argparse.ArgumentParser(prog="combine msisensor results", formatter_class=argparse.RawDescriptionHelpFormatter) 15 | parser.add_argument("table", type=str, help="msisensor all table") 16 | parser.add_argument("dist", type=str, help="msisensor dist file") 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def process(opts): 22 | collector = defaultdict(lambda : defaultdict( lambda : defaultdict(int))) 23 | pp = pprint.PrettyPrinter(indent=4) 24 | hp_agg = defaultdict(lambda : defaultdict(int)) 25 | with open(opts.dist, 'r') as fh: 26 | for nrow, line in enumerate(fh): 27 | if nrow % 2 == 0: 28 | cols = line.split() 29 | chrom = cols[0] 30 | pos = int(cols[1]) 31 | else: 32 | duo = line.split(": ") 33 | vec = [int(x) for x in duo[1].split()] 34 | nonzeroind = np.nonzero(vec)[0] 35 | for ind in nonzeroind: 36 | collector[chrom][pos][ind + 1] = vec[ind] 37 | with open(opts.table, 'r') as fh: 38 | for line in fh: 39 | if line.startswith("chromosome"): 40 | continue 41 | cols = line.strip().split('\t') 42 | # no germline variants 43 | if cols[-1] == '0': 44 | repeat_times = cols[3] 45 | chrom = cols[0] 46 | pos = int(cols[1]) 47 | repeat = cols[4] 48 | unit_cnt = collector[chrom][pos] 49 | for k,v in unit_cnt.items(): 50 | key = repeat + "," + repeat_times 51 | hp_agg[key][k] += v 52 | if repeat == 'A' and repeat_times == '10': 53 | if k == 5: 54 | print(line) 55 | 56 | #pp.pprint(hp_agg) 57 | 58 | 59 | 60 | if __name__ == '__main__': 61 | sys.exit(process(get_arguments())) 62 | -------------------------------------------------------------------------------- /snakemake/script/print_mut_cpgstatus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pysam 8 | import pandas as pd 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | 12 | def get_arguments(): 13 | 14 | parser = argparse.ArgumentParser(prog="print whether mutant families are in a CpG context or not", formatter_class=argparse.RawDescriptionHelpFormatter) 15 | parser.add_argument("mutfam", type=str, help="mutant families") 16 | parser.add_argument("ref", type=str, help="reference file") 17 | args = parser.parse_args() 18 | return args 19 | 20 | def absolute_path(path): 21 | """convert relative path to absolute path""" 22 | if os.path.isabs(path): 23 | return 
path 24 | else: 25 | return os.path.join(os.getcwd(), path) 26 | 27 | def process(opt): 28 | mf = pd.read_csv(opt.mutfam, sep='\t', low_memory=False) 29 | mf = mf[mf['pass_filter'] == 1].reset_index() 30 | ncol = mf.shape[1] 31 | mf['CpG'] = 'NA' 32 | fasta = pysam.FastaFile(opt.ref) 33 | for idx, row in mf.iterrows(): 34 | if row['ref_allele'] == 'C': 35 | nextb = fasta.fetch(str(row['contig']), row['position'], row['position'] + 1) 36 | if nextb == 'G' or nextb == 'g': 37 | mf.iloc[idx, ncol] = '1' 38 | else: 39 | mf.iloc[idx, ncol] = '0' 40 | if row['ref_allele'] == 'G': 41 | nextb = fasta.fetch(str(row['contig']), row['position']-2, row['position'] - 1) 42 | if nextb == 'C' or nextb == 'c': 43 | mf.iloc[idx, ncol] = '1' 44 | else: 45 | mf.iloc[idx, ncol] = '0' 46 | print(mf) 47 | 48 | if __name__ == '__main__': 49 | sys.exit(process(get_arguments())) 50 | 51 | -------------------------------------------------------------------------------- /snakemake/script/print_snv_roc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | import glob 7 | import os 8 | import pandas as pd 9 | 10 | logger = logging.getLogger("{}".format(__file__)) 11 | def get_arguments(): 12 | 13 | parser = argparse.ArgumentParser(prog="print an SNV ROC summary from vcfeval output", formatter_class=argparse.RawDescriptionHelpFormatter) 14 | parser.add_argument("vcfeval_folder", type=str, help="vcfeval output folder") 15 | args = parser.parse_args() 16 | return args 17 | 18 | def process(opts): 19 | outs = glob.glob(os.path.join(opts.vcfeval_folder, "*/*/snp_roc.tsv.gz")) 20 | total_df = pd.DataFrame() 21 | for o in outs: 22 | fields = o.split('/') 23 | df = pd.read_csv(o, sep='\t', skiprows=6) 24 | df['cutoff'] = fields[-2] 25 | df['vaf'] = fields[-3][2:] 26 | if total_df.empty: 27 | total_df = df 28 | else: 29 | total_df = pd.concat([total_df, df]) 30 | total_df.sort_values(by=['vaf', 'cutoff'], inplace=True) 31 | total_df = total_df[['vaf','cutoff', 'precision', 'sensitivity', 'false_positives', 'false_negatives', 'true_positives_baseline', 'f_measure']] 32 | print(total_df) 33 | total_df.to_csv("summary.tsv", sep='\t', index=False) 34 | 35 | 36 | if __name__ == '__main__': 37 | sys.exit(process(get_arguments())) 38 | 39 | -------------------------------------------------------------------------------- /snakemake/script/rev_qualscore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sys 7 | import pysam 8 | 9 | def get_arguments(): 10 | 11 | parser = argparse.ArgumentParser(prog="rev_qualscore", formatter_class=argparse.RawDescriptionHelpFormatter) 12 | parser.add_argument("bam", type=str, help="input bam file") 13 | parser.add_argument("out", type=str, help="output bam file") 14 | args = parser.parse_args() 15 | return args 16 | 17 | def process(options): 18 | inbam = pysam.AlignmentFile(options.bam, "rb") 19 | outbam = pysam.AlignmentFile(options.out, "wb", template=inbam) 20 | for read in inbam: 21 | if read.is_reverse: 22 | read.query_qualities = read.query_qualities[::-1] 23 | outbam.write(read) 24 | 25 | 26 | 27 | if __name__ == '__main__': 28 | sys.exit(process(get_arguments())) 29 | 30 | -------------------------------------------------------------------------------- /snakemake/script/vcf_update_genotype.py: 
--------------------------------------------------------------------------------  1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | from cyvcf2 import VCF, Writer 7 | 8 | logger = logging.getLogger("{}".format(__file__)) 9 | def get_arguments(): 10 | 11 | parser = argparse.ArgumentParser(prog="change the variant genotype", formatter_class=argparse.RawDescriptionHelpFormatter) 12 | parser.add_argument("vcf", type=str, help="vcf input") 13 | parser.add_argument("out", type=str, help="vcf output") 14 | args = parser.parse_args() 15 | return args 16 | 17 | def process(opts): 18 | inputvcf = VCF(opts.vcf) 19 | writer = Writer(opts.out, inputvcf, "wz") 20 | for record in inputvcf: 21 | assert(len(record.gt_types) == 1) 22 | if record.gt_types[0] == 0: 23 | record.genotypes = [[0,1,False]] 24 | writer.write_record(record) 25 | else: 26 | writer.write_record(record) 27 | writer.close() 28 | 29 | if __name__ == '__main__': 30 | sys.exit(process(get_arguments())) 31 | 32 | -------------------------------------------------------------------------------- /snakemake/script/vcf_validate_against_maf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import sys 6 | from cyvcf2 import VCF, Writer 7 | import pandas as pd 8 | 9 | logger = logging.getLogger("{}".format(__file__)) 10 | def get_arguments(): 11 | 12 | parser = argparse.ArgumentParser(prog="validate vcf variants against a truth maf", formatter_class=argparse.RawDescriptionHelpFormatter) 13 | parser.add_argument("vcf", type=str, help="vcf input") 14 | parser.add_argument("out", type=str, help="vcf output") 15 | parser.add_argument("maf", type=str, help="maf file for truth") 16 | parser.add_argument("--whitelist", type=str, action='append', help="whitelist variants in chr:pos format") 17 | args = parser.parse_args() 18 | return args 19 | 20 | def process(opts): 21 | inputvcf = VCF(opts.vcf) 22 | writer = Writer(opts.out, inputvcf, "wz") 23 | maf = pd.read_csv(opts.maf, sep='\t', low_memory=False) 24 | maf = maf.astype({'Chromosome' : 'str'}) 25 | for record in inputvcf: 26 | assert(len(record.gt_types) == 1) 27 | if not maf[(maf['Chromosome'] == record.CHROM) & (maf['Start_position'] == record.start + 1) & (maf['Tumor_Seq_Allele2'].isin(record.ALT))].empty: 28 | writer.write_record(record) 29 | else: 30 | found = False 31 | if opts.whitelist: 32 | for var in opts.whitelist: 33 | chr, pos = var.split(':') 34 | if record.CHROM == chr and record.start + 1 == int(pos): 35 | found = True 36 | break 37 | if found: 38 | writer.write_record(record) 39 | else: 40 | print(record.CHROM, record.start, record.start + 1, sep='\t') 41 | else: 42 | print(record.CHROM, record.start, record.start + 1, sep='\t') 43 | 44 | 45 | if __name__ == '__main__': 46 | sys.exit(process(get_arguments())) 47 | 48 | --------------------------------------------------------------------------------
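Usage sketch for the two cyvcf2 utilities above (file names are hypothetical; --whitelist may be repeated, matching its action='append' declaration):

    python vcf_update_genotype.py calls.vcf.gz calls.het.vcf.gz
    python vcf_validate_against_maf.py calls.het.vcf.gz validated.vcf.gz truth.maf --whitelist chr1:12345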