├── .github
├── ISSUE_TEMPLATE
│ └── bug_report.md
└── workflows
│ └── ci.yml
├── LICENSE
├── Makefile
├── README.md
├── chromap.1
├── docs
├── _config.yml
├── chromap.html
└── index.md
├── src
├── alignment.cc
├── alignment.h
├── barcode_translator.h
├── bed_mapping.h
├── candidate.h
├── candidate_position_generating_config.h
├── candidate_processor.cc
├── candidate_processor.h
├── chromap.cc
├── chromap.h
├── chromap_driver.cc
├── chromap_driver.h
├── cxxopts.hpp
├── draft_mapping.h
├── draft_mapping_generator.cc
├── draft_mapping_generator.h
├── feature_barcode_matrix.cc
├── feature_barcode_matrix.h
├── feature_barcode_matrix_writer.h
├── hit_utils.h
├── index.cc
├── index.h
├── index_parameters.h
├── index_utils.h
├── khash.h
├── kseq.h
├── ksw.cc
├── ksw.h
├── mapping.h
├── mapping_generator.cc
├── mapping_generator.h
├── mapping_in_memory.h
├── mapping_metadata.h
├── mapping_parameters.h
├── mapping_processor.h
├── mapping_writer.cc
├── mapping_writer.h
├── minimizer.h
├── minimizer_generator.cc
├── minimizer_generator.h
├── mmcache.hpp
├── paf_mapping.h
├── paired_end_mapping_metadata.h
├── pairs_mapping.h
├── sam_mapping.h
├── sequence_batch.cc
├── sequence_batch.h
├── sequence_effective_range.h
├── strand.h
├── summary_metadata.h
├── temp_mapping.h
└── utils.h
└── test
├── read1.fq
├── read2.fq
└── ref.fa
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG] XXX"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Describe the data you are using and provide a sample of your data if possible. For example, the paired-end reads are generated by 10x scATAC-seq. The read length is 50bp and the barcode length is 16bp.
16 | 2. Get the Chromap version by running ```chromap -v``` and post it here.
17 | 3. Provide the full command line you used to run Chromap.
18 | 4. Provide the log output by Chromap and highlight the error message.
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Environment (please complete the following information):**
27 | - OS: [e.g. Ubuntu 22.10]
28 | - How you installed Chromap [e.g. Bioconda, downloaded binary, built from source]
29 | - If you compiled Chromap from source yourself, please provide the compiler version [e.g. GCC 7.4.0]
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
# Continuous integration: build chromap and run a smoke test (`chromap -h`)
# on Ubuntu (g++ and clang++) and on macOS (clang++ with a locally built,
# statically linked OpenMP runtime).
name: CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

env:
  DEVELOPER_DIR: /Applications/Xcode.app/Contents/Developer

jobs:
  ubuntu:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        compiler: [g++, clang++]
    steps:
    # checkout@v2 is a deprecated major version; use the maintained one.
    - uses: actions/checkout@v4
    - name: install-deps
      run:
        sudo apt-get update; sudo apt-get install -y clang libomp5 libomp-dev
    - name: build-chromap
      run:
        make CXX=${{ matrix.compiler }}
    - name: test-chromap
      run:
        ./chromap -h

  macos:
    runs-on: macos-latest
    strategy:
      matrix:
        compiler: [clang++]
    steps:
    - uses: actions/checkout@v4
    # The OpenMP build below is cached so it only runs on a cache miss.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v3
      with:
        path: openmp-install
        key: openmp-macos-install
    - name: build-openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/openmp-14.0.0.src.tar.xz
        tar -xf openmp-14.0.0.src.tar.xz
        cd openmp-14.0.0.src
        # Patch the assembly source before building a static libomp.
        sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S
        sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S
        mkdir -p build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \
            -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..
        cmake --build . -j 3
        cmake --build . --target install
        # -p: do not fail if the directory already exists.
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -r install/* $GITHUB_WORKSPACE/openmp-install
    # Copy the OpenMP headers and static library into the macOS SDK so the
    # compiler and linker find them through -isysroot / -L below.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib
    - name: build-chromap
      run:
        make CXX=${{ matrix.compiler }} CXXFLAGS="-arch x86_64 -isysroot $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk -std=c++11 -Wall -O3 -Xclang -fopenmp -msse4.1" LDFLAGS="-L$DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib -rpath $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib -lm -lz -lomp"
    - name: test-chromap
      run:
        ./chromap -h
68 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Haowen Zhang, Li Song, X. Shirley Liu, Heng Li
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Build configuration. CXX, CXXFLAGS and LDFLAGS may be overridden on the
# command line, e.g. `make CXX=clang++`.
CXX=g++
CXXFLAGS=-std=c++11 -Wall -O3 -fopenmp -msse4.1
LDFLAGS=-lm -lz

cpp_source=sequence_batch.cc index.cc minimizer_generator.cc candidate_processor.cc alignment.cc feature_barcode_matrix.cc ksw.cc draft_mapping_generator.cc mapping_generator.cc mapping_writer.cc chromap.cc chromap_driver.cc
src_dir=src
objs_dir=objs
objs+=$(patsubst %.cc,$(objs_dir)/%.o,$(cpp_source))

exec=chromap

# `make asan=1` builds with AddressSanitizer instrumentation.
ifneq ($(asan),)
  CXXFLAGS+=-fsanitize=address -g
  LDFLAGS+=-fsanitize=address -ldl -g
endif

.PHONY: all dir clean

all: dir $(exec)

# Kept as an alias so that `make dir` keeps working.
dir: $(objs_dir)

$(objs_dir):
	mkdir -p $@

$(exec): $(objs)
	$(CXX) $(CXXFLAGS) $(objs) -o $(exec) $(LDFLAGS)

# The objects directory is an order-only prerequisite: it must exist before
# any object is compiled (also under `make -j` or `make chromap`), but its
# timestamp never forces a recompile.
$(objs_dir)/%.o: $(src_dir)/%.cc | $(objs_dir)
	$(CXX) $(CXXFLAGS) -c $< -o $@

clean:
	-rm -rf $(exec) $(objs_dir)
31 |
--------------------------------------------------------------------------------
/chromap.1:
--------------------------------------------------------------------------------
1 | .TH chromap 1 "25 Jan 2024" "chromap-0.2.6 (r490)" "Bioinformatics tools"
2 | .SH NAME
3 | .PP
4 | chromap - fast alignment and preprocessing of chromatin profiles
5 | .SH SYNOPSIS
6 | * Indexing the reference genome:
7 | .RS 4
8 | chromap
9 | .B -i
10 | .RB [ -k
11 | .IR kmer ]
12 | .RB [ -w
13 | .IR miniWinSize ]
14 | .B -r
15 | .I ref.fa
16 | .B -o
17 | .I ref.index
18 | .RE
19 |
20 | * Mapping (sc)ATAC-seq reads:
21 | .RS 4
22 | chromap
23 | .B --preset
24 | .I atac
25 | .B -r
26 | .I ref.fa
27 | .B -x
28 | .I ref.index
29 | .B -1
30 | .I read1.fq
31 | .B -2
32 | .I read2.fq
33 | .B -o
34 | .I aln.bed
35 | .RB [ -b
36 | .IR barcode.fq.gz ]
37 | .RB [ --barcode-whitelist
38 | .IR whitelist.txt ]
39 | .RE
40 |
41 | * Mapping ChIP-seq reads:
42 | .RS 4
43 | chromap
44 | .B --preset
45 | .I chip
46 | .B -r
47 | .I ref.fa
48 | .B -x
49 | .I ref.index
50 | .B -1
51 | .I read1.fq
52 | .B -2
53 | .I read2.fq
54 | .B -o
55 | .I aln.bed
56 | .RE
57 |
58 | * Mapping Hi-C reads:
59 | .RS 4
60 | chromap
61 | .B --preset
62 | .I hic
63 | .B -r
64 | .I ref.fa
65 | .B -x
66 | .I ref.index
67 | .B -1
68 | .I read1.fq
69 | .B -2
70 | .I read2.fq
71 | .B -o
72 | .I aln.pairs
73 | .br
74 | chromap
75 | .B --preset
76 | .I hic
77 | .B -r
78 | .I ref.fa
79 | .B -x
80 | .I ref.index
81 | .B -1
82 | .I read1.fq
83 | .B -2
84 | .I read2.fq
85 | .B --SAM
86 | .B -o
87 | .I aln.sam
88 | .RE
89 |
90 | .SH DESCRIPTION
91 | .PP
92 | Chromap is an ultrafast method for aligning and preprocessing high throughput
93 | chromatin profiles. Typical use cases include: (1) trimming sequencing adapters,
94 | mapping bulk ATAC-seq or ChIP-seq genomic reads to the human genome and removing
95 | duplicates; (2) trimming sequencing adapters, mapping single cell ATAC-seq
96 | genomic reads to the human genome, correcting barcodes, removing duplicates and
97 | performing Tn5 shift; (3) split alignment of Hi-C reads against a reference
98 | genome. In all these three cases, Chromap is 10-20 times faster while being
99 | accurate.
100 | .SH OPTIONS
101 | .SS Indexing options
102 | .TP 10
103 | .BI -k \ INT
104 | Minimizer k-mer length [17].
105 | .TP
106 | .BI -w \ INT
107 | Minimizer window size [7]. A minimizer is the smallest k-mer
108 | in a window of w consecutive k-mers.
109 | .TP
110 | .B --min-frag-length
111 | Min fragment length for choosing k and w automatically [30]. Users can increase
112 | this value when the min length of the fragments of interest is long, which can
113 | increase the mapping speed. Note that the default value 30 is the min fragment
114 | length that chromap can map.
115 |
116 | .SS Mapping options
117 | .TP 10
118 | .BI --split-alignment
119 | Allow split alignments. This option should be set only when mapping Hi-C reads.
120 | .TP
121 | .BI -e \ INT
122 | Max edit distance allowed to map a read [8].
123 | .TP
124 | .BI -s \ INT
125 | Min number of minimizers required to map a read [2].
126 | .TP
127 | .BI -f \ INT1 [, INT2 ]
128 | Ignore minimizers occurring more than
129 | .I INT1
130 | [500] times.
131 | .I INT2
132 | [1000] is the threshold for a second round of seeding.
133 | .TP
134 | .BI -l \ INT
135 | Max insert size, only for paired-end read mapping [1000].
136 | .TP
137 | .BI -q \ INT
138 | Min MAPQ in range [0, 60] for mappings to be output [30].
139 | .TP
140 | .BI --min-read-length \ INT
141 | Skip mapping the reads of length less than
142 | .I INT
143 | [30]. Note that this is different from the index option
144 | .BR --min-frag-length ,
145 | which sets
146 | .BR -k
147 | and
148 | .BR -w
149 | for indexing the genome.
150 | .TP
151 | .BI --trim-adapters
152 | Try to trim adapters on 3'. This only works for paired-end reads. When the
153 | fragment length indicated by the read pair is less than the length of the reads,
154 | the two mates are overlapped with each other. Then the regions outside the
155 | overlap are regarded as adapters and trimmed.
156 | .TP
157 | .BI --remove-pcr-duplicates
158 | Remove PCR duplicates.
159 | .TP
160 | .BI --remove-pcr-duplicates-at-bulk-level
161 | Remove PCR duplicates at bulk level for single cell data.
162 | .TP
163 | .BI --remove-pcr-duplicates-at-cell-level
164 | Remove PCR duplicates at cell level for single cell data.
165 | .TP
166 | .BI --Tn5-shift
167 | Perform Tn5 shift. When this option is turned on, the forward mapping start
168 | positions are increased by 4bp and the reverse mapping end positions are
169 | decreased by 5bp. Note that this works only when
170 | .BR --SAM
171 | is NOT set.
172 | .TP
173 | .BI --low-mem
174 | Use low memory mode. When this option is set, multiple temporary intermediate
175 | mapping files might be generated on disk and they are merged at the end of
176 | processing to reduce memory usage. When this is NOT set, all the mapping results
177 | are kept in the memory before they are saved on disk, which works more
178 | efficiently for datasets that are not too large.
179 | .TP
180 | .BI --bc-error-threshold \ INT
181 | Max Hamming distance allowed to correct a barcode [1]. Note that the max
182 | supported threshold is 2.
183 | .TP
184 | .BI --bc-probability-threshold \ FLT
185 | Min probability to correct a barcode [0.9]. When there are multiple whitelisted
186 | barcodes with the same Hamming distance to the barcode to correct, chromap will
187 | process the base quality of the mismatched bases, and compute a probability that
188 | the correction is right.
189 | .TP
190 | .BI -t \ INT
191 | The number of threads for mapping [1].
192 |
193 | .SS Input options
194 | .TP 10
195 | .BI -r \ FILE
196 | Reference file.
197 | .TP
198 | .BI -x \ FILE
199 | Index file.
200 | .TP
201 | .BI -1 \ FILE
202 | Single-end read files or paired-end read files 1. Chromap supports multiple
203 | input files concatenated by ",". For example, setting this option to
204 | "Library1_R1.fastq.gz,Library2_R1.fastq.gz,Library3_R1.fastq.gz" will use
205 | all three files as input and map them in this order. Similarly,
206 | .BR -2
207 | and
208 | .BR -b
209 | also support multiple input files. And the ordering of the input files for all
210 | the three options should match.
211 | .TP
212 | .BI -2 \ FILE
213 | Paired-end read files 2.
214 | .TP
215 | .BI -b \ FILE
216 | Cell barcode files.
217 | .TP
218 | .BI --barcode-whitelist \ FILE
219 | Cell barcode whitelist file. This is supposed to be a txt file where each line
220 | is a whitelisted barcode.
221 | .TP
222 | .BI --read-format \ STR
223 | Format for read files and barcode files ["r1:0:-1,bc:0:-1"] as 10x Genomics
224 | single-end format.
225 |
226 | .SS Output options
227 | .TP 10
228 | .BI -o \ FILE
229 | Output file.
230 | .TP
231 | .BR --output-mappings-not-in-whitelist
232 | Output mappings with barcode not in the whitelist.
233 | .TP
234 | .BI --chr-order \ FILE
235 | Custom chromosome order file. If not specified, the order of reference sequences will be used.
236 | .TP
237 | .BR --BED
238 | Output mappings in BED/BEDPE format. Note that only one of the formats should be
239 | set.
240 | .TP
241 | .BR --TagAlign
242 | Output mappings in TagAlign/PairedTagAlign format.
243 | .TP
244 | .BR --SAM
245 | Output mappings in SAM format.
246 | .TP
247 | .BR --pairs
248 | Output mappings in pairs format (defined by 4DN for HiC data).
249 | .TP
250 | .BI --pairs-natural-chr-order \ FILE
251 | Custom chromosome order file for pairs flipping. If not specified, the custom chromosome order will be used.
252 | .TP
253 | .BI --barcode-translate \ FILE
254 | Convert input barcodes to another set of barcodes in the output.
255 | .TP
256 | .BI --summary \ FILE
257 | Summarize the mapping statistics at bulk or barcode level.
258 | .TP
259 | .B -v
260 | Print version number to stdout.
261 |
262 | .SS Preset options
263 | .TP 10
264 | .BI --preset \ STR
265 | Preset []. This option applies multiple options at the same time. It should be
266 | applied before other options because options applied later will overwrite the
267 | values set by
268 | .BR --preset .
269 | Available
270 | .I STR
271 | are:
272 | .RS
273 | .TP 10
274 | .B chip
275 | Mapping ChIP-seq reads
276 | .RB ( -l
277 | .I 2000
278 | .B --remove-pcr-duplicates --low-mem
279 | .BR --BED ).
280 | .TP
281 | .B atac
282 | Mapping ATAC-seq/scATAC-seq reads
283 | .RB ( -l
284 | .I 2000
285 | .B --remove-pcr-duplicates --low-mem --trim-adapters --Tn5-shift
286 | .B --remove-pcr-duplicates-at-cell-level
287 | .BR --BED ).
288 | .TP
289 | .B hic
290 | Mapping Hi-C reads
291 | .RB ( -e
292 | .I 4
293 | .B -q
294 | .I 1
295 | .B --low-mem --split-alignment
296 | .BR --pairs ).
297 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-modernist
--------------------------------------------------------------------------------
/docs/chromap.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 |
7 |
8 |
9 |
10 |
16 | chromap
17 |
18 |
19 |
20 |
21 | chromap
22 |
23 | NAME
24 | SYNOPSIS
25 | DESCRIPTION
26 | OPTIONS
27 |
28 |
29 |
30 |
31 | NAME
32 |
33 |
34 |
35 |
36 | chromap - fast
37 | alignment and preprocessing of chromatin profiles
38 |
39 | SYNOPSIS
40 |
41 |
42 |
43 |
44 | * Indexing the
45 | reference genome:
46 |
47 | chromap -i [-k
48 | kmer] [-w miniWinSize] -r
49 | ref.fa -o ref.index
50 |
51 | * Mapping
52 | (sc)ATAC-seq reads:
53 |
54 | chromap --preset
55 | atac -r ref.fa -x
56 | ref.index -1 read1.fq -2
57 | read2.fq -o aln.bed [-b
58 | barcode.fq.gz] [--barcode-whitelist
59 | whitelist.txt]
60 |
61 | * Mapping
62 | ChIP-seq reads:
63 |
64 | chromap --preset
65 | chip -r ref.fa -x
66 | ref.index -1 read1.fq -2
67 | read2.fq -o aln.bed
68 |
69 | * Mapping Hi-C
70 | reads:
71 |
72 | chromap --preset
73 | hic -r ref.fa -x
74 | ref.index -1 read1.fq -2
75 | read2.fq -o aln.pairs
76 | chromap --preset hic -r ref.fa
77 | -x ref.index -1 read1.fq
78 | -2 read2.fq --SAM -o aln.sam
79 |
80 | DESCRIPTION
81 |
82 |
83 |
84 |
85 | Chromap is an
86 | ultrafast method for aligning and preprocessing high
87 | throughput chromatin profiles. Typical use cases include:
88 | (1) trimming sequencing adapters, mapping bulk ATAC-seq or
89 | ChIP-seq genomic reads to the human genome and removing
90 | duplicates; (2) trimming sequencing adapters, mapping single
91 | cell ATAC-seq genomic reads to the human genome, correcting
92 | barcodes, removing duplicates and performing Tn5 shift; (3)
93 | split alignment of Hi-C reads against a reference genome. In
94 | all these three cases, Chromap is 10-20 times faster while
95 | being accurate.
96 |
97 | OPTIONS
98 |
99 |
100 |
101 |
102 | Indexing
103 | options
104 |
105 |
107 |
108 | |
109 |
110 |
111 |
112 | -k INT |
113 | |
114 |
115 |
116 |
117 | Minimizer k-mer length [17]. |
118 |
119 | |
120 |
121 |
122 |
123 | -w INT |
124 | |
125 |
126 |
127 |
128 | Minimizer window size [7]. A minimizer is the smallest
129 | k-mer in a window of w consecutive k-mers. |
130 |
131 |
132 | --min-frag-length
133 |
134 | Min fragment length for
135 | choosing k and w automatically [30]. Users can increase this
136 | value when the min length of the fragments of interest is
137 | long, which can increase the mapping speed. Note that the
138 | default value 30 is the min fragment length that chromap can
139 | map.
140 |
141 | Mapping
142 | options
143 | --split-alignment
144 |
145 | Allow split alignments. This
146 | option should be set only when mapping Hi-C reads.
147 |
148 |
150 |
151 | |
152 |
153 |
154 |
155 | -e INT |
156 | |
157 |
158 |
159 |
160 | Max edit distance allowed to map a read [8]. |
161 |
162 | |
163 |
164 |
165 |
166 | -s INT |
167 | |
168 |
169 |
170 |
171 | Min number of minimizers required to map a read [2]. |
172 |
173 |
174 |
175 | -f INT1[,INT2]
176 |
177 | Ignore minimizers occurring more
178 | than INT1 [500] times. INT2 [1000] is the
179 | threshold for a second round of seeding.
180 |
181 |
183 |
184 | |
185 |
186 |
187 |
188 | -l INT |
189 | |
190 |
191 |
192 |
193 | Max insert size, only for paired-end read mapping
194 | [1000]. |
195 |
196 | |
197 |
198 |
199 |
200 | -q INT |
201 | |
202 |
203 |
204 |
205 | Min MAPQ in range [0, 60] for mappings to be output
206 | [30]. |
207 |
208 |
209 |
210 | --min-read-length INT
211 |
212 | Skip mapping the reads of
213 | length less than INT [30]. Note that this is
214 | different from the index option --min-frag-length,
215 | which sets -k and -w for indexing the
216 | genome.
217 |
218 | --trim-adapters
219 |
220 | Try to trim adapters on
221 | 3’. This only works for paired-end reads. When the
222 | fragment length indicated by the read pair is less than the
223 | length of the reads, the two mates are overlapped with each
224 | other. Then the regions outside the overlap are regarded as
225 | adapters and trimmed.
226 |
227 |
228 | --remove-pcr-duplicates
229 |
230 | Remove PCR duplicates.
231 |
232 |
233 | --remove-pcr-duplicates-at-bulk-level
234 |
235 | Remove PCR duplicates at bulk
236 | level for single cell data.
237 |
238 |
239 | --remove-pcr-duplicates-at-cell-level
240 |
241 | Remove PCR duplicates at cell
242 | level for single cell data.
243 |
244 | --Tn5-shift
245 |
246 | Perform Tn5 shift. When this
247 | option is turned on, the forward mapping start positions are
248 | increased by 4bp and the reverse mapping end positions are
249 | decreased by 5bp. Note that this works only when
250 | --SAM is NOT set.
251 |
252 |
254 |
255 | |
256 |
257 |
258 |
259 | --low-mem |
260 | |
261 |
262 |
263 |
264 | Use low memory mode. When this option is set, multiple
265 | temporary intermediate mapping files might be generated on
266 | disk and they are merged at the end of processing to reduce
267 | memory usage. When this is NOT set, all the mapping results
268 | are kept in the memory before they are saved on disk, which
269 | works more efficiently for datasets that are not too
270 | large. |
271 |
272 |
273 |
274 | --bc-error-threshold INT
275 |
276 | Max Hamming distance allowed to
277 | correct a barcode [1]. Note that the max supported threshold
278 | is 2.
279 |
280 |
281 | --bc-probability-threshold FLT
282 |
283 | Min probability to correct a
284 | barcode [0.9]. When there are multiple whitelisted barcodes
285 | with the same Hamming distance to the barcode to correct,
286 | chromap will process the base quality of the mismatched
287 | bases, and compute a probability that the correction is
288 | right.
289 |
290 |
292 |
293 | |
294 |
295 |
296 |
297 | -t INT |
298 | |
299 |
300 |
301 |
302 | The number of threads for mapping [1]. |
303 |
304 | |
305 |
306 |
307 | Input
308 | options
309 |
310 |
312 |
313 | |
314 |
315 |
316 |
317 | -r FILE |
318 | |
319 |
320 |
321 |
322 | Reference file. |
323 |
324 | |
325 |
326 |
327 |
328 | -x FILE |
329 | |
330 |
331 |
332 |
333 | Index file. |
334 |
335 | |
336 |
337 |
338 |
339 | -1 FILE |
340 | |
341 |
342 |
343 |
344 | Single-end read files or paired-end read files 1.
345 | Chromap supports multiple input files concatenated by
346 | ",". For example, setting this option to
347 | "read11.fq,read12.fq,read13.fq" will make all
348 | three files as input and map them in this order. Similarly,
349 | -2 and -b also support multiple input files.
350 | And the ordering of the input files for all the three
351 | options should match. |
352 |
353 | |
354 |
355 |
356 |
357 | -2 FILE |
358 | |
359 |
360 |
361 |
362 | Paired-end read files 2. |
363 |
364 | |
365 |
366 |
367 |
368 | -b FILE |
369 | |
370 |
371 |
372 |
373 | Cell barcode files. |
374 |
375 |
376 |
377 | --barcode-whitelist FILE
378 |
379 | Cell barcode whitelist file.
380 | This is supposed to be a txt file where each line is a
381 | whitelisted barcode.
382 |
383 |
384 | --read-format STR
385 |
386 | Format for read files and
387 | barcode files ["r1:0:-1,bc:0:-1"] as 10x Genomics
388 | single-end format.
389 |
390 | Output
391 | options
392 |
393 |
395 |
396 | |
397 |
398 |
399 |
400 | -o FILE |
401 | |
402 |
403 |
404 |
405 | Output file. |
406 |
407 | |
408 |
409 |
410 |
411 | --output-mappings-not-in-whitelist
412 |
413 | Output mappings with barcode
414 | not in the whitelist.
415 |
416 |
417 | --chr-order FILE
418 |
419 | Customized chromosome order.
420 |
421 |
423 |
424 | |
425 |
426 |
427 |
428 | --BED |
429 | |
430 |
431 |
432 |
433 | Output mappings in BED/BEDPE format. Note that only one
434 | of the formats should be set. |
435 |
436 |
437 | --TagAlign
438 |
439 | Output mappings in
440 | TagAlign/PairedTagAlign format.
441 |
442 |
444 |
445 | |
446 |
447 |
448 |
449 | --SAM |
450 | |
451 |
452 |
453 |
454 | Output mappings in SAM format. |
455 |
456 | |
457 |
458 |
459 |
460 | --pairs |
461 | |
462 |
463 |
464 |
465 | Output mappings in pairs format (defined by 4DN for HiC
466 | data). |
467 |
468 |
469 |
470 | --pairs-natural-chr-order FILE
471 |
472 | Natural chromosome order for
473 | pairs flipping.
474 |
475 |
477 |
478 | |
479 |
480 |
481 |
482 | -v |
483 | |
484 |
485 |
486 |
487 | Print version number to stdout. |
488 |
489 | |
490 |
491 |
492 | Preset
493 | options
494 | --preset STR
495 |
496 | Preset []. This option applies
497 | multiple options at the same time. It should be applied
498 | before other options because options applied later will
499 | overwrite the values set by --preset. Available
500 | STR are:
501 |
502 |
504 |
505 | |
506 |
507 |
508 |
509 | chip |
510 | |
511 |
512 |
513 |
514 | Mapping ChIP-seq reads (-l 2000
515 | --remove-pcr-duplicates --low-mem --BED). |
516 |
517 | |
518 |
519 |
520 |
521 | atac |
522 | |
523 |
524 |
525 |
526 | Mapping ATAC-seq/scATAC-seq reads (-l 2000
527 | --remove-pcr-duplicates --low-mem --trim-adapters
528 | --Tn5-shift --remove-pcr-duplicates-at-cell-level
529 | --BED). |
530 |
531 | |
532 |
533 |
534 |
535 | hic |
536 | |
537 |
538 |
539 |
540 | Mapping Hi-C reads (-e 4 -q
541 | 1 --low-mem --split-alignment --pairs). |
542 |
543 |
544 |
545 |
546 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ## Getting help
2 |
3 | * [README][doc]: general documentation
4 | * [Manpage](chromap.html): explanation of command-line options
5 | * [Preprint][biorxiv]: free of charge preprint that describes the method
6 | * [GitHub Issues page][issue]: report bugs, request features and ask questions
7 |
8 | ## Acquiring Chromap
9 |
10 | * `git clone https://github.com/haowenz/chromap.git`
11 | * [GitHub Release page][release]: versioned packages
12 | * Also [available from BioConda][bioconda]
13 |
14 | [doc]: https://github.com/haowenz/chromap/blob/master/README.md
15 | [biorxiv]: https://www.biorxiv.org/content/10.1101/2021.06.18.448995v1
16 | [bioconda]: https://anaconda.org/bioconda/chromap
17 | [release]: https://github.com/haowenz/chromap/releases
18 | [issue]: https://github.com/haowenz/chromap/issues
19 |
--------------------------------------------------------------------------------
/src/alignment.h:
--------------------------------------------------------------------------------
1 | #ifndef ALIGNMENT_H_
2 | #define ALIGNMENT_H_
3 |
4 | #include "mapping_in_memory.h"
5 | #include "sam_mapping.h"
6 | #include "sequence_batch.h"
7 | #include "utils.h"
8 |
9 | namespace chromap {
10 |
11 | int GetLongestMatchLength(const char *pattern, const char *text,
12 | const int read_length);
13 |
14 | // Return newly adjusted reference start/end position for kPositive/kNegative
15 | // mappings.
16 | int AdjustGapBeginning(const Strand mapping_strand, const char *ref,
17 | const char *read, int *gap_beginning, int read_end,
18 | int ref_start_position, int ref_end_position,
19 | int *n_cigar, uint32_t **cigar);
20 |
21 | // Reference (pattern) mapping start position and cigar must be computed before
22 | // calling this function. Read (text) must be already at the start position.
23 | void GenerateNMAndMDTag(const char *pattern, const char *text,
24 | int mapping_start_position,
25 | MappingInMemory &mapping_in_memory);
26 |
27 | int BandedAlignPatternToText(int error_threshold, const char *pattern,
28 | const char *text, const int read_length,
29 | int *mapping_end_position);
30 |
31 | // Return a negative number if the termination is deemed to be at the
32 | // beginning of the read. mapping_end_position is relative to pattern
33 | // (reference); read_mapping_length is for text (read).
34 | int BandedAlignPatternToTextWithDropOff(int error_threshold,
35 | const char *pattern, const char *text,
36 | const int read_length,
37 | int *mapping_end_position,
38 | int *read_mapping_length);
39 |
40 | int BandedAlignPatternToTextWithDropOffFrom3End(
41 | int error_threshold, const char *pattern, const char *text,
42 | const int read_length, int *mapping_end_position, int *read_mapping_length);
43 |
44 | void BandedAlign4PatternsToText(int error_threshold, const char **patterns,
45 | const char *text, int read_length,
46 | int32_t *mapping_edit_distances,
47 | int32_t *mapping_end_positions);
48 |
49 | void BandedAlign8PatternsToText(int error_threshold, const char **patterns,
50 | const char *text, int read_length,
51 | int16_t *mapping_edit_distances,
52 | int16_t *mapping_end_positions);
53 |
54 | void BandedTraceback(int error_threshold, int min_num_errors,
55 | const char *pattern, const char *text,
56 | const int read_length, int *mapping_start_position);
57 |
58 | void BandedTracebackToEnd(int error_threshold, int min_num_errors,
59 | const char *pattern, const char *text,
60 | const int read_length, int *mapping_end_position);
61 |
62 | } // namespace chromap
63 |
64 | #endif // ALIGNMENT_H_
65 |
--------------------------------------------------------------------------------
/src/barcode_translator.h:
--------------------------------------------------------------------------------
1 | #ifndef BARCODETRANSLATOR_H_
2 | #define BARCODETRANSLATOR_H_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 |
14 | #include "khash.h"
15 | #include "utils.h"
16 |
17 | namespace chromap {
18 |
19 | KHASH_INIT(k64_str, uint64_t, char *, 1, kh_int64_hash_func,
20 | kh_int64_hash_equal);
21 |
22 | // The class for handling barcode convertion.
23 | class BarcodeTranslator {
24 | public:
25 | BarcodeTranslator() {
26 | barcode_translate_table_ = NULL;
27 | from_bc_length_ = -1;
28 | }
29 |
30 | ~BarcodeTranslator() {
31 | if (barcode_translate_table_ != NULL) {
32 | khiter_t k;
33 | for (k = kh_begin(barcode_translate_table_);
34 | k != kh_end(barcode_translate_table_); ++k) {
35 | if (kh_exist(barcode_translate_table_, k))
36 | free(kh_value(barcode_translate_table_, k));
37 | }
38 | kh_destroy(k64_str, barcode_translate_table_);
39 | }
40 | }
41 |
42 | void SetTranslateTable(const std::string &file) {
43 | barcode_translate_table_ = kh_init(k64_str);
44 |
45 | if (1) {
46 | gzFile barcode_translate_file = gzopen(file.c_str(), "r");
47 | const uint32_t line_buffer_size = 512;
48 | char file_line[line_buffer_size];
49 | while (gzgets(barcode_translate_file, file_line, line_buffer_size) != NULL) {
50 | int line_len = strlen(file_line);
51 | if (file_line[line_len - 1] == '\n') {
52 | file_line[line_len - 1] = '\0';
53 | }
54 | std::string tmp_string(file_line);
55 | ProcessTranslateFileLine(tmp_string);
56 | }
57 | } else {
58 | // Old implementation, which does not support gzipped input.
59 | std::ifstream file_stream(file);
60 | std::string file_line;
61 | while (getline(file_stream, file_line)) {
62 | ProcessTranslateFileLine(file_line);
63 | }
64 | }
65 |
66 | mask_ = (1ull << (2 * from_bc_length_)) - 1;
67 | /*for (int i = 0; i < from_bc_length_; ++i)
68 | {
69 | mask_ |= (3ull << (2*i));
70 | }*/
71 | }
72 |
73 | std::string Translate(uint64_t bc, uint32_t bc_length) {
74 | if (barcode_translate_table_ == NULL) {
75 | return Seed2Sequence(bc, bc_length);
76 | }
77 |
78 | std::string ret;
79 | uint64_t i;
80 | for (i = 0; i < bc_length / from_bc_length_; ++i) {
81 | uint64_t seed = (bc << (2 * i * from_bc_length_)) >>
82 | (2 * (bc_length / from_bc_length_ - 1) * from_bc_length_);
83 | seed &= mask_;
84 | khiter_t barcode_translate_table_iter =
85 | kh_get(k64_str, barcode_translate_table_, seed);
86 | if (barcode_translate_table_iter == kh_end(barcode_translate_table_)) {
87 | std::cerr << "Barcode does not exist in the translation table."
88 | << std::endl;
89 | exit(-1);
90 | }
91 | std::string bc_to(
92 | kh_value(barcode_translate_table_, barcode_translate_table_iter));
93 | if (i == 0) {
94 | ret = bc_to;
95 | } else {
96 | ret += "-" + bc_to;
97 | }
98 | }
99 | return ret;
100 | }
101 |
102 | private:
103 | khash_t(k64_str) * barcode_translate_table_;
104 | int from_bc_length_;
105 | uint64_t mask_;
106 |
107 | std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
108 | std::string sequence;
109 | sequence.reserve(seed_length);
110 | uint64_t mask_ = 3;
111 | for (uint32_t i = 0; i < seed_length; ++i) {
112 | sequence.push_back(
113 | Uint8ToChar((seed >> ((seed_length - 1 - i) * 2)) & mask_));
114 | }
115 | return sequence;
116 | }
117 |
118 | void ProcessTranslateFileLine(std::string &line) {
119 | int i;
120 | int len = line.length();
121 | std::string to;
122 | for (i = 0; i < len; ++i) {
123 | if (line[i] == ',' || line[i] == '\t') break;
124 | }
125 |
126 | to = line.substr(0, i);
127 | // from = line.substr(i + 1, len - i - 1);
128 | from_bc_length_ = len - i - 1;
129 | uint64_t from_seed =
130 | GenerateSeedFromSequence(line.c_str(), len, i + 1, from_bc_length_);
131 |
132 | int khash_return_code;
133 | khiter_t barcode_translate_table_iter = kh_put(
134 | k64_str, barcode_translate_table_, from_seed, &khash_return_code);
135 | kh_value(barcode_translate_table_, barcode_translate_table_iter) =
136 | strdup(to.c_str());
137 | }
138 | };
139 |
140 | } // namespace chromap
141 | #endif
142 |
--------------------------------------------------------------------------------
/src/bed_mapping.h:
--------------------------------------------------------------------------------
1 | #ifndef BEDMAPPING_H_
2 | #define BEDMAPPING_H_
3 |
4 | #include
5 |
6 | #include "mapping.h"
7 |
8 | namespace chromap {
9 |
10 | class MappingWithBarcode : public Mapping {
11 | public:
12 | uint32_t read_id_;
13 | uint64_t cell_barcode_;
14 | uint32_t fragment_start_position_;
15 | uint16_t fragment_length_;
16 | uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
17 | uint8_t num_dups_;
18 | // uint8_t mapq;
19 | MappingWithBarcode() : num_dups_(0) {}
20 | MappingWithBarcode(uint32_t read_id, uint64_t cell_barcode,
21 | uint32_t fragment_start_position, uint16_t fragment_length,
22 | uint8_t mapq, uint8_t direction, uint8_t is_unique,
23 | uint8_t num_dups)
24 | : read_id_(read_id),
25 | cell_barcode_(cell_barcode),
26 | fragment_start_position_(fragment_start_position),
27 | fragment_length_(fragment_length),
28 | mapq_(mapq),
29 | direction_(direction),
30 | is_unique_(is_unique),
31 | num_dups_(num_dups) {}
32 | bool operator<(const MappingWithBarcode &m) const {
33 | return std::tie(fragment_start_position_, fragment_length_, cell_barcode_,
34 | mapq_, direction_, is_unique_, read_id_) <
35 | std::tie(m.fragment_start_position_, m.fragment_length_,
36 | m.cell_barcode_, m.mapq_, m.direction_, m.is_unique_,
37 | m.read_id_);
38 | }
39 | bool operator==(const MappingWithBarcode &m) const {
40 | return std::tie(cell_barcode_, fragment_start_position_) ==
41 | std::tie(m.cell_barcode_, m.fragment_start_position_);
42 | }
43 | bool IsSamePosition(const MappingWithBarcode &m) const {
44 | return std::tie(fragment_start_position_) ==
45 | std::tie(m.fragment_start_position_);
46 | }
47 | uint64_t GetBarcode() const { return cell_barcode_; }
48 | void Tn5Shift() {
49 | if (direction_ == 1) {
50 | fragment_start_position_ += 4;
51 | } else {
52 | fragment_length_ -= 5;
53 | }
54 | }
55 | bool IsPositiveStrand() const { return direction_ > 0 ? true : false; }
56 | uint32_t GetStartPosition() const { // inclusive
57 | return fragment_start_position_;
58 | }
59 | uint32_t GetEndPosition() const { // exclusive
60 | return fragment_start_position_ + fragment_length_;
61 | }
62 | };
63 |
64 | class MappingWithoutBarcode : public Mapping {
65 | public:
66 | uint32_t read_id_;
67 | uint32_t fragment_start_position_;
68 | uint16_t fragment_length_;
69 | // uint8_t mapq;
70 | uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
71 | uint16_t num_dups_; // Need higher limit in bulk setting
72 |
73 | MappingWithoutBarcode() : num_dups_(0) {}
74 | MappingWithoutBarcode(uint32_t read_id, uint32_t fragment_start_position,
75 | uint16_t fragment_length, uint16_t mapq,
76 | uint8_t direction, uint8_t is_unique, uint8_t num_dups)
77 | : read_id_(read_id),
78 | fragment_start_position_(fragment_start_position),
79 | fragment_length_(fragment_length),
80 | mapq_(mapq),
81 | direction_(direction),
82 | is_unique_(is_unique),
83 | num_dups_(num_dups) {}
84 |
85 | bool operator<(const MappingWithoutBarcode &m) const {
86 | return std::tie(fragment_start_position_, fragment_length_, mapq_,
87 | direction_, is_unique_, read_id_) <
88 | std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq_,
89 | m.direction_, m.is_unique_, m.read_id_);
90 | }
91 | bool operator==(const MappingWithoutBarcode &m) const {
92 | return std::tie(fragment_start_position_) ==
93 | std::tie(m.fragment_start_position_);
94 | }
95 | bool IsSamePosition(const MappingWithoutBarcode &m) const {
96 | return std::tie(fragment_start_position_) ==
97 | std::tie(m.fragment_start_position_);
98 | }
99 | uint64_t GetBarcode() const { return 0; }
100 | void Tn5Shift() {
101 | if (direction_ == 1) {
102 | fragment_start_position_ += 4;
103 | } else {
104 | fragment_length_ -= 5;
105 | }
106 | }
107 | bool IsPositiveStrand() const { return direction_ > 0 ? true : false; }
108 | uint32_t GetStartPosition() const { // inclusive
109 | return fragment_start_position_;
110 | }
111 | uint32_t GetEndPosition() const { // exclusive
112 | return fragment_start_position_ + fragment_length_;
113 | }
114 | };
115 |
116 | class PairedEndMappingWithBarcode : public Mapping {
117 | public:
118 | uint32_t read_id_;
119 | uint64_t cell_barcode_;
120 | uint32_t fragment_start_position_;
121 | uint16_t fragment_length_;
122 | uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
123 | uint8_t num_dups_;
124 | // uint8_t mapq;
125 | uint16_t positive_alignment_length_;
126 | uint16_t negative_alignment_length_;
127 | PairedEndMappingWithBarcode() : num_dups_(0) {}
128 | PairedEndMappingWithBarcode(uint32_t read_id, uint64_t cell_barcode,
129 | uint32_t fragment_start_position,
130 | uint16_t fragment_length, uint8_t mapq,
131 | uint8_t direction, uint8_t is_unique,
132 | uint8_t num_dups,
133 | uint16_t positive_alignment_length,
134 | uint16_t negative_alignment_length)
135 | : read_id_(read_id),
136 | cell_barcode_(cell_barcode),
137 | fragment_start_position_(fragment_start_position),
138 | fragment_length_(fragment_length),
139 | mapq_(mapq),
140 | direction_(direction),
141 | is_unique_(is_unique),
142 | num_dups_(num_dups),
143 | positive_alignment_length_(positive_alignment_length),
144 | negative_alignment_length_(negative_alignment_length) {}
145 | bool operator<(const PairedEndMappingWithBarcode &m) const {
146 | return std::tie(fragment_start_position_, fragment_length_, cell_barcode_,
147 | mapq_, direction_, is_unique_, read_id_,
148 | positive_alignment_length_, negative_alignment_length_) <
149 | std::tie(m.fragment_start_position_, m.fragment_length_,
150 | m.cell_barcode_, m.mapq_, m.direction_, m.is_unique_,
151 | m.read_id_, m.positive_alignment_length_,
152 | m.negative_alignment_length_);
153 | }
154 | bool operator==(const PairedEndMappingWithBarcode &m) const {
155 | return std::tie(cell_barcode_, fragment_start_position_,
156 | fragment_length_) == std::tie(m.cell_barcode_,
157 | m.fragment_start_position_,
158 | m.fragment_length_);
159 | }
160 | bool IsSamePosition(const PairedEndMappingWithBarcode &m) const {
161 | return std::tie(fragment_start_position_, fragment_length_) ==
162 | std::tie(m.fragment_start_position_, m.fragment_length_);
163 | }
164 | uint64_t GetBarcode() const { return cell_barcode_; }
165 | void Tn5Shift() {
166 | fragment_start_position_ += 4;
167 | positive_alignment_length_ -= 4;
168 | fragment_length_ -= 9;
169 | negative_alignment_length_ -= 5;
170 | }
171 | bool IsPositiveStrand() const { return direction_ > 0 ? true : false; }
172 | uint32_t GetStartPosition() const { // inclusive
173 | return fragment_start_position_;
174 | }
175 | uint32_t GetEndPosition() const { // exclusive
176 | return fragment_start_position_ + fragment_length_;
177 | }
178 | };
179 |
180 | class PairedEndMappingWithoutBarcode : public Mapping {
181 | public:
182 | uint32_t read_id_;
183 | uint32_t fragment_start_position_;
184 | uint16_t fragment_length_;
185 | uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;
186 | uint8_t num_dups_;
187 | // uint8_t mapq;
188 | uint16_t positive_alignment_length_;
189 | uint16_t negative_alignment_length_;
190 | PairedEndMappingWithoutBarcode() : num_dups_(0) {}
191 | PairedEndMappingWithoutBarcode(uint32_t read_id,
192 | uint32_t fragment_start_position,
193 | uint16_t fragment_length, uint8_t mapq,
194 | uint8_t direction, uint8_t is_unique,
195 | uint16_t num_dups,
196 | uint16_t positive_alignment_length,
197 | uint16_t negative_alignment_length)
198 | : read_id_(read_id),
199 | fragment_start_position_(fragment_start_position),
200 | fragment_length_(fragment_length),
201 | mapq_(mapq),
202 | direction_(direction),
203 | is_unique_(is_unique),
204 | num_dups_(num_dups),
205 | positive_alignment_length_(positive_alignment_length),
206 | negative_alignment_length_(negative_alignment_length) {}
207 |
208 | bool operator<(const PairedEndMappingWithoutBarcode &m) const {
209 | return std::tie(fragment_start_position_, fragment_length_, mapq_,
210 | direction_, is_unique_, read_id_,
211 | positive_alignment_length_, negative_alignment_length_) <
212 | std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq_,
213 | m.direction_, m.is_unique_, m.read_id_,
214 | m.positive_alignment_length_, m.negative_alignment_length_);
215 | }
216 | bool operator==(const PairedEndMappingWithoutBarcode &m) const {
217 | return std::tie(fragment_start_position_, fragment_length_) ==
218 | std::tie(m.fragment_start_position_, m.fragment_length_);
219 | }
220 | bool IsSamePosition(const PairedEndMappingWithoutBarcode &m) const {
221 | return std::tie(fragment_start_position_, fragment_length_) ==
222 | std::tie(m.fragment_start_position_, m.fragment_length_);
223 | }
224 | uint64_t GetBarcode() const { return 0; }
225 | void Tn5Shift() {
226 | fragment_start_position_ += 4;
227 | positive_alignment_length_ -= 4;
228 | fragment_length_ -= 9;
229 | negative_alignment_length_ -= 5;
230 | }
231 | bool IsPositiveStrand() const { return direction_ > 0 ? true : false; }
232 | uint32_t GetStartPosition() const { // inclusive
233 | return fragment_start_position_;
234 | }
235 | uint32_t GetEndPosition() const { // exclusive
236 | return fragment_start_position_ + fragment_length_;
237 | }
238 | };
239 |
240 | } // namespace chromap
241 |
242 | #endif // BEDMAPPING_H_
243 |
--------------------------------------------------------------------------------
/src/candidate.h:
--------------------------------------------------------------------------------
1 | #ifndef CANDIDATE_H_
2 | #define CANDIDATE_H_
3 |
4 | #include
5 |
6 | namespace chromap {
7 |
// A candidate mapping position together with its minimizer support.
struct Candidate {
  // The high 32 bits save the reference sequence index in the reference
  // sequence batch. The low 32 bits save the reference position on that
  // sequence.
  uint64_t position = 0;

  // The number of minimizers supporting the position.
  uint8_t count = 0;

  inline uint32_t GetReferenceSequenceIndex() const { return (position >> 32); }

  inline uint32_t GetReferenceSequencePosition() const { return position; }

  // Const like the other getters, so it can be called through const
  // references and const containers.
  inline uint8_t GetCount() const { return count; }

  // Orders candidates by descending count, then by ascending position.
  inline bool operator<(const Candidate &c) const {
    if (count != c.count) {
      return count > c.count;
    }
    return position < c.position;
  }
};
35 |
36 | } // namespace chromap
37 |
38 | #endif // CANDIDATE_H_
39 |
--------------------------------------------------------------------------------
/src/candidate_position_generating_config.h:
--------------------------------------------------------------------------------
1 | #ifndef CANDIDATE_POSITION_GENERATING_CONFIG_H_
2 | #define CANDIDATE_POSITION_GENERATING_CONFIG_H_
3 |
4 | #include
5 |
6 | namespace chromap {
7 |
// Immutable bundle of the thresholds used while generating candidate
// positions. It answers whether a seed counts as frequent or repetitive and
// whether heap merge should be used.
class CandidatePositionGeneratingConfig {
 public:
  // A configuration always needs explicit thresholds.
  CandidatePositionGeneratingConfig() = delete;

  CandidatePositionGeneratingConfig(uint32_t max_seed_frequency,
                                    uint32_t repetitive_seed_frequency,
                                    bool use_heap_merge)
      : max_seed_frequency_(max_seed_frequency),
        repetitive_seed_frequency_(repetitive_seed_frequency),
        use_heap_merge_(use_heap_merge) {}

  ~CandidatePositionGeneratingConfig() = default;

  // True when `seed_frequency` has reached the maximum seed frequency.
  bool IsFrequentSeed(uint32_t seed_frequency) const {
    return !(seed_frequency < max_seed_frequency_);
  }

  // True when `seed_frequency` has reached the repetitive-seed threshold.
  bool IsRepetitiveSeed(uint32_t seed_frequency) const {
    return !(seed_frequency < repetitive_seed_frequency_);
  }

  bool UseHeapMerge() const { return use_heap_merge_; }

  uint32_t GetMaxSeedFrequency() const { return max_seed_frequency_; }

 private:
  // Seeds are used only while their frequency stays below this value.
  const uint32_t max_seed_frequency_;

  // Seeds whose frequency is at least this value are treated as repetitive.
  const uint32_t repetitive_seed_frequency_;

  // Merge sorted candidate lists with a heap; meant for the case where the
  // number of candidate positions is very large.
  const bool use_heap_merge_;
};
47 |
48 | } // namespace chromap
49 |
50 | #endif // CANDIDATE_POSITION_GENERATING_CONFIG_H_
51 |
--------------------------------------------------------------------------------
/src/candidate_processor.h:
--------------------------------------------------------------------------------
1 | #ifndef CANDIDATE_PROCESSOR_H_
2 | #define CANDIDATE_PROCESSOR_H_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "candidate.h"
12 | #include "index.h"
13 | #include "mapping_metadata.h"
14 | #include "paired_end_mapping_metadata.h"
15 | #include "sequence_batch.h"
16 | #include "utils.h"
17 |
18 | namespace chromap {
19 |
// Declares the candidate-processing interface. The public methods take an
// Index and the per-read (MappingMetadata) or per-pair
// (PairedEndMappingMetadata) state; all of them are const and their
// definitions live outside this header.
// NOTE(review): the element types of every std::vector in this copy are
// missing (the template arguments look stripped); restore them from the
// original header before compiling.
20 | class CandidateProcessor {
21 | public:
// Both settings are required, so default construction is disabled.
22 | CandidateProcessor() = delete;
23 |
// Stores the two settings as-is; the vector is taken by value and copied into
// the const member.
24 | CandidateProcessor(int min_num_seeds_required_for_mapping,
25 | const std::vector max_seed_frequencies)
26 | : min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping),
27 | max_seed_frequencies_(max_seed_frequencies) {}
28 |
29 | ~CandidateProcessor() = default;
30 |
// Presumably fills the candidates held in `mapping_metadata` from its hits
// using `index` — confirm against the definition.
31 | void GenerateCandidates(int error_threshold, const Index &index,
32 | MappingMetadata &mapping_metadata) const;
33 |
// Returns an int whose meaning is not visible from this header — TODO
// confirm against the definition.
34 | int SupplementCandidates(
35 | int error_threshold, uint32_t search_range, const Index &index,
36 | PairedEndMappingMetadata &paired_end_mapping_metadata) const;
37 |
38 | void ReduceCandidatesForPairedEndRead(
39 | uint32_t mapping_positions_distance,
40 | PairedEndMappingMetadata &paired_end_mapping_metadata) const;
41 |
42 | private:
// Per-strand helpers used by the public methods above.
43 | void GenerateCandidatesOnOneStrand(int error_threshold,
44 | int num_seeds_required,
45 | uint32_t num_minimizers,
46 | std::vector &hits,
47 | std::vector &candidates) const;
48 |
49 | int GenerateCandidatesFromRepetitiveReadWithMateInfoOnOneStrand(
50 | const Strand strand, uint32_t search_range, int error_threshold,
51 | const Index &index, const std::vector &minimizers,
52 | const std::vector &mate_candidates,
53 | uint32_t &repetitive_seed_length, std::vector &hits,
54 | std::vector &candidates) const;
55 |
56 | void MergeCandidates(int error_threshold, std::vector &c1,
57 | std::vector &c2,
58 | std::vector &buffer) const;
59 |
60 | void ReduceCandidatesForPairedEndReadOnOneDirection(
61 | uint32_t mapping_positions_distance,
62 | const std::vector &candidates1,
63 | const std::vector &candidates2,
64 | std::vector &filtered_candidates1,
65 | std::vector &filtered_candidates2) const;
66 |
67 | const int min_num_seeds_required_for_mapping_;
68 | // Vector of size 2. The first element is the frequency threshold, and the
69 | // second element is the frequency threshold to run rescue. The second element
70 | // should always be larger than the first one.
71 | // TODO(Haowen): add an error check.
72 | const std::vector max_seed_frequencies_;
73 | };
74 |
75 | } // namespace chromap
76 |
77 | #endif // CANDIDATE_PROCESSOR_H_
78 |
--------------------------------------------------------------------------------
/src/chromap_driver.h:
--------------------------------------------------------------------------------
1 | #ifndef CHROMAP_DRIVER_H_
2 | #define CHROMAP_DRIVER_H_
3 |
4 | namespace chromap {
5 |
// Stateless driver object exposing a single entry point that receives the
// program's command-line arguments.
class ChromapDriver {
 public:
  ChromapDriver() = default;
  ~ChromapDriver() = default;

  // Takes the argument count and argument vector as handed to main();
  // defined outside this header.
  void ParseArgsAndRun(int argc, char **argv);
};
12 |
13 | } // namespace chromap
14 |
15 | #endif // CHROMAP_DRIVER_H_
16 |
--------------------------------------------------------------------------------
/src/draft_mapping.h:
--------------------------------------------------------------------------------
1 | #ifndef DRAFT_MAPPING_H_
2 | #define DRAFT_MAPPING_H_
3 |
4 | #include
5 |
6 | namespace chromap {
7 |
// A preliminary mapping: an error count plus a packed reference location.
struct DraftMapping {
  int num_errors = 0;

  // Packed location: reference sequence index (within the reference sequence
  // batch) in the upper 32 bits, mapping end position on that sequence in the
  // lower 32 bits.
  uint64_t position = 0;

  DraftMapping(int num_errors, uint64_t position) {
    this->num_errors = num_errors;
    this->position = position;
  }

  int GetNumErrors() const { return num_errors; }

  // Upper half of the packed location.
  uint32_t GetReferenceSequenceIndex() const {
    return static_cast<uint32_t>(position >> 32);
  }

  // Lower half of the packed location.
  uint32_t GetReferenceSequencePosition() const {
    return static_cast<uint32_t>(position);
  }
};
25 |
26 | } // namespace chromap
27 |
28 | #endif // DRAFT_MAPPING_H_
29 |
--------------------------------------------------------------------------------
/src/draft_mapping_generator.h:
--------------------------------------------------------------------------------
1 | #ifndef DRAFT_MAPPING_GENERATOR_H_
2 | #define DRAFT_MAPPING_GENERATOR_H_
3 |
4 | #include
5 |
6 | #include "draft_mapping.h"
7 | #include "mapping_metadata.h"
8 | #include "mapping_parameters.h"
9 | #include "sequence_batch.h"
10 | #include "utils.h"
11 |
12 | namespace chromap {
13 |
14 | class DraftMappingGenerator {
15 | public:
16 | DraftMappingGenerator() = delete;
17 |
18 | DraftMappingGenerator(const MappingParameters &mapping_parameters)
19 | : error_threshold_(mapping_parameters.error_threshold),
20 | split_alignment_(mapping_parameters.split_alignment),
21 | num_vpu_lanes_(mapping_parameters.GetNumVPULanes()),
22 | mapping_output_format_(mapping_parameters.mapping_output_format) {}
23 |
24 | ~DraftMappingGenerator() = default;
25 |
26 | void GenerateDraftMappings(const SequenceBatch &read_batch,
27 | uint32_t read_index,
28 | const SequenceBatch &reference,
29 | MappingMetadata &mapping_metadata);
30 |
31 | private:
32 | // Return true if the candidate position is valid on the reference with rid.
33 | // Note only the position is checked and the input rid is not checked in this
34 | // function. So the input rid must be valid.
35 | bool IsValidCandidate(uint32_t rid, uint32_t position, uint32_t read_length,
36 | const SequenceBatch &reference);
37 |
38 | // Return true when there is one non-split mapping generated and the mapping
39 | // is supported by all the minimizers.
40 | bool GenerateNonSplitDraftMappingSupportedByAllMinimizers(
41 | const SequenceBatch &read_batch, uint32_t read_index,
42 | const SequenceBatch &reference, MappingMetadata &mapping_metadata);
43 |
44 | void GenerateDraftMappingsOnOneStrandUsingSIMD(
45 | const Strand candidate_strand, uint32_t read_index,
46 | const SequenceBatch &read_batch, const SequenceBatch &reference,
47 | MappingMetadata &mapping_metadata);
48 |
49 | void GenerateDraftMappingsOnOneStrand(const Strand candidate_strand,
50 | uint32_t read_index,
51 | const SequenceBatch &read_batch,
52 | const SequenceBatch &reference,
53 | MappingMetadata &mapping_metadata);
54 |
55 | const int error_threshold_;
56 | const bool split_alignment_;
57 | const int num_vpu_lanes_;
58 | const MappingOutputFormat mapping_output_format_;
59 | };
60 |
61 | } // namespace chromap
62 |
63 | #endif // DRAFT_MAPPING_GENERATOR_H_
64 |
--------------------------------------------------------------------------------
/src/feature_barcode_matrix.cc:
--------------------------------------------------------------------------------
1 | #include "feature_barcode_matrix.h"
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | namespace chromap {
11 |
// Builds the augmented (max-end) values for the peaks of reference `ref_id`.
// The peaks in peaks_on_diff_ref_seqs_[ref_id] are treated as an implicit
// binary tree laid out in array order; tree_extras_on_diff_ref_seqs_[ref_id]
// receives, for every node, the largest end position (start_position +
// length) among the node and its children. Finally the pair
// (max_level, number of peaks) is appended to tree_info_on_diff_ref_seqs_.
// NOTE(review): the element types of the two std::vector references are
// missing in this copy (template arguments look stripped).
// NOTE(review): the tree info is appended rather than stored at index ref_id,
// so this presumably relies on being called once per reference in increasing
// ref_id order — confirm against the callers.
12 | void FeatureBarcodeMatrix::BuildAugmentedTreeForPeaks(uint32_t ref_id) {
13 | // std::sort(mappings.begin(), mappings.end(), IntervalLess());
14 | int max_level = 0;
15 | size_t i, last_i = 0; // last_i points to the rightmost node in the tree
16 | uint32_t last = 0; // last is the max value at node last_i
17 | int k;
18 | std::vector &peaks = peaks_on_diff_ref_seqs_[ref_id];
19 | std::vector &extras = tree_extras_on_diff_ref_seqs_[ref_id];
// NOTE(review): this -1 is overwritten by `max_level = k - 1` below, which
// yields 0 for an empty peak list because k stays at 1.
20 | if (peaks.size() == 0) {
21 | max_level = -1;
22 | }
23 |
// Level 0: the even-indexed entries are the leaves; their max is their own end.
24 | for (i = 0; i < peaks.size(); i += 2) {
25 | last_i = i;
26 | // last = mappings[i].max = mappings[i].en; // leaves (i.e. at level 0)
27 | last = extras[i] =
28 | peaks[i].start_position + peaks[i].length; // leaves (i.e. at level 0)
29 | }
30 |
31 | for (k = 1; 1LL << k <= (int64_t)peaks.size();
32 | ++k) { // process internal nodes in the bottom-up order
33 | size_t x = 1LL << (k - 1);
34 | size_t i0 = (x << 1) - 1;
35 | size_t step = x << 2; // i0 is the first node
36 | for (i = i0; i < peaks.size();
37 | i += step) { // traverse all nodes at level k
38 | uint32_t el = extras[i - x]; // max value of the left child
// When the right child index is past the end of the array, `last` stands in
// for it.
39 | uint32_t er =
40 | i + x < peaks.size() ? extras[i + x] : last; // of the right child
41 | uint32_t e = peaks[i].start_position + peaks[i].length;
42 | e = e > el ? e : el;
43 | e = e > er ? e : er;
44 | extras[i] = e; // set the max value for node i
45 | }
46 | last_i =
47 | last_i >> k & 1
48 | ? last_i - x
49 | : last_i +
50 | x; // last_i now points to the parent of the original last_i
51 | if (last_i < peaks.size() &&
52 | extras[last_i] > last) // update last accordingly
53 | last = extras[last_i];
54 | }
55 |
// k is the first level whose node count exceeds the array, so the root sits
// at level k - 1.
56 | max_level = k - 1;
57 | tree_info_on_diff_ref_seqs_.emplace_back(max_level, peaks.size());
58 | }
59 |
// Calls peaks from `mappings` and returns how many were found.
// Step 1 builds a per-base pileup for each of the `num_reference_sequences`
// references by incrementing every base covered by every mapping.
// Step 2 scans each pileup for runs of bases whose coverage is at least
// `coverage_threshold`; each run that is closed by a lower-coverage base is
// stored as a Peak, written through
// feature_barcode_matrix_writer_.OutputPeaks, and given a slot in the tree
// extras. After each reference is scanned, BuildAugmentedTreeForPeaks(ri) is
// called. Timing for both steps is reported on stderr.
// NOTE(review): the element types of the std::vector objects are missing in
// this copy (template arguments look stripped).
60 | uint32_t FeatureBarcodeMatrix::CallPeaks(
61 | uint16_t coverage_threshold, uint32_t num_reference_sequences,
62 | const SequenceBatch &reference,
63 | const std::vector> &mappings) {
64 | double real_start_time = GetRealTime();
65 | // std::vector> &mappings =
66 | // allocate_multi_mappings_
67 | // ? allocated_mappings_on_diff_ref_seqs_
68 | // : (remove_pcr_duplicates_ ? deduped_mappings_on_diff_ref_seqs_
69 | // : mappings_on_diff_ref_seqs_);
70 | // Build pileup.
// NOTE(review): pileup_on_diff_ref_seqs_ is appended to without being cleared
// first (unlike the tree_* vectors below), so indexing by `ri` presumably
// assumes this function runs once per object — confirm.
71 | for (uint32_t ri = 0; ri < num_reference_sequences; ++ri) {
72 | pileup_on_diff_ref_seqs_.emplace_back(std::vector());
73 | pileup_on_diff_ref_seqs_[ri].assign(reference.GetSequenceLengthAt(ri), 0);
74 | for (size_t mi = 0; mi < mappings[ri].size(); ++mi) {
75 | for (uint16_t pi = 0; pi < mappings[ri][mi].fragment_length_; ++pi) {
76 | ++pileup_on_diff_ref_seqs_[ri]
77 | [mappings[ri][mi].GetStartPosition() + pi];
78 | }
79 | }
80 | }
81 | std::cerr << "Built pileup in " << GetRealTime() - real_start_time << "s.\n";
82 |
83 | real_start_time = GetRealTime();
84 | // Call and save peaks.
85 | tree_extras_on_diff_ref_seqs_.clear();
86 | tree_info_on_diff_ref_seqs_.clear();
87 | tree_extras_on_diff_ref_seqs_.reserve(num_reference_sequences);
88 | tree_info_on_diff_ref_seqs_.reserve(num_reference_sequences);
// peak_count doubles as the running, file-wide peak index stored in each Peak.
89 | uint32_t peak_count = 0;
90 | for (uint32_t ri = 0; ri < num_reference_sequences; ++ri) {
91 | tree_extras_on_diff_ref_seqs_.emplace_back(std::vector());
92 | tree_extras_on_diff_ref_seqs_[ri].reserve(
93 | reference.GetSequenceLengthAt(ri) / 100);
94 | peaks_on_diff_ref_seqs_.emplace_back(std::vector());
95 | uint32_t peak_start_position = 0;
// NOTE(review): peak_length is 16 bits wide, so a run longer than 65535 bases
// wraps around.
96 | uint16_t peak_length = 0;
97 | for (size_t pi = 0; pi < reference.GetSequenceLengthAt(ri); ++pi) {
98 | if (pileup_on_diff_ref_seqs_[ri][pi] >= coverage_threshold) {
99 | if (peak_length == 0) { // start a new peak
100 | peak_start_position = pi;
101 | }
102 | ++peak_length; // extend the peak
103 | } else if (peak_length > 0) { // save the previous peak
104 | // TODO(Haowen): improve peak calling
105 | peaks_on_diff_ref_seqs_[ri].emplace_back(
106 | Peak{peak_start_position, peak_length, peak_count});
107 | tree_extras_on_diff_ref_seqs_[ri].emplace_back(0);
108 | feature_barcode_matrix_writer_.OutputPeaks(peak_start_position,
109 | peak_length, ri, reference);
110 | ++peak_count;
111 | peak_length = 0;
112 | }
113 | }
// NOTE(review): a run that is still open when the last base of the reference
// is reached is never saved, because peaks are only emitted in the else
// branch above.
114 | BuildAugmentedTreeForPeaks(ri);
115 | }
116 | std::cerr << "Call peaks and built peak augmented tree in "
117 | << GetRealTime() - real_start_time << "s.\n";
118 | // Output feature matrix
119 | return peak_count;
120 | }
121 |
// Writes the feature-by-barcode count matrix under `matrix_output_prefix`.
// Features are fixed-size bins when cell_by_bin_ is set, otherwise peaks
// called by CallPeaks. The work is done in three stages, each timed on
// stderr: (1) assign a dense index to every distinct cell barcode in order of
// first appearance and write the barcode out, (2) count, for every mapping,
// the features it overlaps, keyed by (barcode index, feature index), and
// (3) write the header and the sorted entries.
// NOTE(review): the element types of the std::vector objects are missing in
// this copy (template arguments look stripped).
122 | void FeatureBarcodeMatrix::OutputFeatureMatrix(
123 | uint32_t num_sequences, const SequenceBatch &reference,
124 | const std::vector> &mappings,
125 | const std::string &matrix_output_prefix) {
126 | feature_barcode_matrix_writer_.InitializeMatrixOutput(matrix_output_prefix);
127 |
128 | uint32_t num_peaks = 0;
129 | if (cell_by_bin_) {
130 | feature_barcode_matrix_writer_.OutputPeaks(bin_size_, num_sequences,
131 | reference);
// Number of bins per reference is ceil(length / bin_size_).
132 | for (uint32_t i = 0; i < num_sequences; ++i) {
133 | uint32_t ref_seq_length = reference.GetSequenceLengthAt(i);
134 | num_peaks += ref_seq_length / bin_size_;
135 | if (ref_seq_length % bin_size_ != 0) {
136 | ++num_peaks;
137 | }
138 | }
139 | } else {
140 | num_peaks = CallPeaks(depth_cutoff_to_call_peak_, num_sequences, reference,
141 | mappings);
142 | }
143 |
144 | // std::vector> &mappings =
145 | // allocate_multi_mappings_
146 | // ? allocated_mappings_on_diff_ref_seqs_
147 | // : (remove_pcr_duplicates_ ? deduped_mappings_on_diff_ref_seqs_
148 | // : mappings_on_diff_ref_seqs_);
149 | double real_start_time = GetRealTime();
150 | // First pass to index barcodes
151 | uint32_t barcode_index = 0;
152 | for (uint32_t rid = 0; rid < num_sequences; ++rid) {
153 | for (uint32_t mi = 0; mi < mappings[rid].size(); ++mi) {
154 | uint64_t barcode_key = mappings[rid][mi].cell_barcode_;
155 | khiter_t barcode_index_table_iterator =
156 | kh_get(k64_seq, barcode_index_table_, barcode_key);
157 | if (barcode_index_table_iterator == kh_end(barcode_index_table_)) {
158 | int khash_return_code;
159 | barcode_index_table_iterator = kh_put(k64_seq, barcode_index_table_,
160 | barcode_key, &khash_return_code);
// The key was just looked up and not found, so kh_put must report a fresh
// insertion (neither failure nor "already present").
161 | assert(khash_return_code != -1 && khash_return_code != 0);
162 | kh_value(barcode_index_table_, barcode_index_table_iterator) =
163 | barcode_index;
164 | ++barcode_index;
165 | feature_barcode_matrix_writer_.AppendBarcodeOutput(barcode_key);
166 | }
167 | }
168 | }
169 | std::cerr << "Index and output barcodes in "
170 | << GetRealTime() - real_start_time << "s.\n";
171 |
172 | real_start_time = GetRealTime();
173 | // Second pass to generate matrix
174 | khash_t(kmatrix) *matrix = kh_init(kmatrix);
175 | std::vector overlapped_peak_indices;
176 | for (uint32_t rid = 0; rid < num_sequences; ++rid) {
177 | for (uint32_t mi = 0; mi < mappings[rid].size(); ++mi) {
178 | uint64_t barcode_key = mappings[rid][mi].cell_barcode_;
// Every barcode was inserted during the first pass, so the lookup result is
// used without a kh_end check.
179 | khiter_t barcode_index_table_iterator =
180 | kh_get(k64_seq, barcode_index_table_, barcode_key);
181 | uint64_t barcode_index =
182 | kh_value(barcode_index_table_, barcode_index_table_iterator);
183 | overlapped_peak_indices.clear();
184 | if (cell_by_bin_) {
185 | GetNumOverlappedBins(rid, mappings[rid][mi].GetStartPosition(),
186 | mappings[rid][mi].GetEndPosition() -
187 | mappings[rid][mi].GetStartPosition(),
188 | num_sequences, reference, overlapped_peak_indices);
189 | } else {
190 | GetNumOverlappedPeaks(rid, mappings[rid][mi], overlapped_peak_indices);
191 | }
192 | size_t num_overlapped_peaks = overlapped_peak_indices.size();
193 | for (size_t pi = 0; pi < num_overlapped_peaks; ++pi) {
194 | uint32_t peak_index = overlapped_peak_indices[pi];
// Matrix key: barcode index in the upper 32 bits, feature index in the lower
// 32 bits.
195 | uint64_t entry_index = (barcode_index << 32) | peak_index;
196 | khiter_t matrix_iterator = kh_get(kmatrix, matrix, entry_index);
197 | if (matrix_iterator == kh_end(matrix)) {
198 | int khash_return_code;
199 | matrix_iterator =
200 | kh_put(kmatrix, matrix, entry_index, &khash_return_code);
201 | assert(khash_return_code != -1 && khash_return_code != 0);
202 | kh_value(matrix, matrix_iterator) = 1;
203 | } else {
204 | kh_value(matrix, matrix_iterator) += 1;
205 | }
206 | }
207 | }
208 | }
209 | std::cerr << "Generate feature matrix in " << GetRealTime() - real_start_time
210 | << "s.\n";
211 | // Output matrix
212 | real_start_time = GetRealTime();
213 | feature_barcode_matrix_writer_.WriteMatrixOutputHead(
214 | num_peaks, kh_size(barcode_index_table_), kh_size(matrix));
215 | uint64_t key;
216 | uint32_t value;
217 | std::vector> feature_matrix;
218 | feature_matrix.reserve(kh_size(matrix));
219 | // kh_foreach(matrix, key, value,
220 | // output_tools_->AppendMatrixOutput((uint32_t)key, (uint32_t)(key >> 32),
221 | // value));
// Copy the hash table into a vector so the entries can be sorted; sorting by
// the packed key orders them by barcode index first, then feature index.
222 | kh_foreach(matrix, key, value, feature_matrix.emplace_back(key, value));
223 | kh_destroy(kmatrix, matrix);
224 | std::sort(feature_matrix.begin(), feature_matrix.end());
225 | for (size_t i = 0; i < feature_matrix.size(); ++i) {
// The low 32 bits (feature index) go first, the high 32 bits (barcode index)
// second.
226 | feature_barcode_matrix_writer_.AppendMatrixOutput(
227 | (uint32_t)feature_matrix[i].first,
228 | (uint32_t)(feature_matrix[i].first >> 32), feature_matrix[i].second);
229 | }
230 |
231 | feature_barcode_matrix_writer_.FinalizeMatrixOutput();
232 | std::cerr << "Output feature matrix in " << GetRealTime() - real_start_time
233 | << "s.\n";
234 | }
235 |
// Appends to `overlapped_peak_indices` the indices of the fixed-size bins
// overlapped by a mapping that starts at `start_position` on reference `rid`
// and spans `mapping_length` bases. Bin indices are global: the bins of all
// references before `rid` (ceil(length / bin_size_) each) are counted first.
// The output vector is appended to, not cleared.
// NOTE(review): `num_sequences` is not used in the body.
// NOTE(review): the element type of the std::vector parameter is missing in
// this copy (template arguments look stripped).
236 | void FeatureBarcodeMatrix::GetNumOverlappedBins(
237 | uint32_t rid, uint32_t start_position, uint16_t mapping_length,
238 | uint32_t num_sequences, const SequenceBatch &reference,
239 | std::vector &overlapped_peak_indices) {
240 | uint32_t bin_index = 0;
// Skip past the bins that belong to the references preceding `rid`.
241 | for (uint32_t i = 0; i < rid; ++i) {
242 | uint32_t ref_seq_length = reference.GetSequenceLengthAt(i);
243 | bin_index += ref_seq_length / bin_size_;
244 | if (ref_seq_length % bin_size_ != 0) {
245 | ++bin_index;
246 | }
247 | }
// The bin containing the first base of the mapping always overlaps.
248 | bin_index += start_position / bin_size_;
249 | overlapped_peak_indices.emplace_back(bin_index);
250 | uint32_t max_num_overlapped_bins = mapping_length / bin_size_ + 2;
// NOTE(review): the comparison below sets a coordinate local to reference
// `rid` (start_position + mapping_length - 1) against a boundary derived from
// the global bin index, which includes the bins of earlier references. For
// rid > 0 the two do not look comparable — confirm the intended behavior.
251 | for (uint32_t i = 0; i < max_num_overlapped_bins; ++i) {
252 | if (start_position + mapping_length - 1 >=
253 | (bin_index + 1 + i) * bin_size_) {
254 | overlapped_peak_indices.emplace_back(bin_index + 1 + i);
255 | }
256 | }
257 | }
258 |
// Finds the peaks on reference sequence `ref_id` that overlap `mapping` after
// the mapping interval has been widened by overlap_distance_ on both sides
// (the widened start is clamped at 0).
//
// `overlapped_peak_indices` is cleared first and then receives the `index`
// field of every overlapping peak. A peak overlaps when
//   peak.start_position < interval_end  and
//   interval_start < peak.start_position + peak.length.
// Returns the number of indices appended.
//
// The search is an iterative top-down traversal with an explicit stack of
// StackCell(k, x, w) entries: k is a subtree level, x a node position in
// `peaks`, and w marks whether the left child has already been handled.
// NOTE(review): extras[y] is used to prune left subtrees, so it presumably
// holds the maximum interval end within the subtree rooted at y -- confirm
// against BuildAugmentedTreeForPeaks.
// NOTE(review): the root is pushed with (1LL << max_level) - 1, which assumes
// max_level >= 0 for this ref_id -- confirm what is stored when a reference
// sequence has no peaks.
259 | uint32_t FeatureBarcodeMatrix::GetNumOverlappedPeaks(
260 | uint32_t ref_id, const PairedEndMappingWithBarcode &mapping,
261 | std::vector &overlapped_peak_indices) {
// t is the number of cells currently held on the traversal stack.
262 | int t = 0;
263 | StackCell stack[64];
264 | // out.clear();
265 | overlapped_peak_indices.clear();
266 | int num_overlapped_peaks = 0;
267 | int max_level = tree_info_on_diff_ref_seqs_[ref_id].first;
268 | uint32_t num_tree_nodes = tree_info_on_diff_ref_seqs_[ref_id].second;
269 | std::vector &peaks = peaks_on_diff_ref_seqs_[ref_id];
270 | std::vector &extras = tree_extras_on_diff_ref_seqs_[ref_id];
271 | // uint32_t interval_start = mapping.fragment_start_position;
// Widen the query on the left without letting the unsigned value underflow.
272 | uint32_t interval_start =
273 | mapping.GetStartPosition() > (uint32_t)overlap_distance_
274 | ? mapping.GetStartPosition() - overlap_distance_
275 | : 0;
276 | uint32_t interval_end =
277 | mapping.GetEndPosition() + (uint32_t)overlap_distance_;
278 | stack[t++] = StackCell(max_level, (1LL << max_level) - 1,
279 | 0); // push the root; this is a top down traversal
280 | while (
281 | t) { // the following guarantees that numbers in out[] are always sorted
282 | StackCell z = stack[--t];
283 | if (z.k <=
284 | 3) { // we are in a small subtree; traverse every node in this subtree
// [i0, i1) is the range of node positions covered by this small subtree,
// truncated to the number of nodes that really exist.
285 | size_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL << (z.k + 1)) - 1;
286 | if (i1 >= num_tree_nodes) {
287 | i1 = num_tree_nodes;
288 | }
289 | for (i = i0; i < i1 && peaks[i].start_position < interval_end; ++i) {
290 | if (interval_start <
291 | peaks[i].start_position +
292 | peaks[i].length) { // if overlap, append to out[]
293 | // out.push_back(i);
294 | overlapped_peak_indices.emplace_back(peaks[i].index);
295 | ++num_overlapped_peaks;
296 | }
297 | }
298 | } else if (z.w == 0) { // if left child not processed
299 | size_t y =
300 | z.x - (1LL << (z.k - 1)); // the left child of z.x; NB: y may be out
301 | // of range (i.e. y>=a.size())
302 | stack[t++] = StackCell(
303 | z.k, z.x,
304 | 1); // re-add node z.x, but mark the left child having been processed
305 | if (y >= num_tree_nodes ||
306 | extras[y] > interval_start) // push the left child if y is out of
307 | // range or may overlap with the query
308 | stack[t++] = StackCell(z.k - 1, y, 0);
309 | } else if (z.x < num_tree_nodes &&
310 | peaks[z.x].start_position <
311 | interval_end) { // need to push the right child
312 | if (interval_start < peaks[z.x].start_position + peaks[z.x].length) {
313 | // out.push_back(z.x); // test if z.x overlaps the query; if yes, append
314 | // to out[]
315 | overlapped_peak_indices.emplace_back(peaks[z.x].index);
316 | ++num_overlapped_peaks;
317 | }
318 | stack[t++] = StackCell(z.k - 1, z.x + (1LL << (z.k - 1)),
319 | 0); // push the right child
320 | }
321 | }
322 | return num_overlapped_peaks;
323 | }
324 |
325 | } // namespace chromap
326 |
--------------------------------------------------------------------------------
/src/feature_barcode_matrix.h:
--------------------------------------------------------------------------------
1 | #ifndef FEATURE_BARCODE_MATRIX_H_
2 | #define FEATURE_BARCODE_MATRIX_H_
3 |
4 | #include
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | #include "bed_mapping.h"
15 | #include "feature_barcode_matrix_writer.h"
16 | #include "khash.h"
17 | #include "utils.h"
18 |
19 | namespace chromap {
20 |
// One peak interval on a reference sequence, as stored in
// FeatureBarcodeMatrix::peaks_on_diff_ref_seqs_.
21 | struct Peak {
// First position of the peak on its reference sequence.
22 | uint32_t start_position;
// Peak length; being 16-bit, a single peak cannot be longer than 65535.
23 | uint16_t length;
// Value reported for this peak by the overlap query
// (GetNumOverlappedPeaks appends it to its output vector).
24 | uint32_t index;
25 | };
// Computes a feature-by-barcode matrix from paired-end mappings and writes it
// out through a FeatureBarcodeMatrixWriter.
//
// NOTE(review): the class owns barcode_index_table_ through a raw pointer and
// frees it in the destructor, but the copy constructor is not deleted; copying
// an instance would free the same table twice.
27 | class FeatureBarcodeMatrix {
28 | public:
// Stores the configuration and allocates an empty barcode index table.
// NOTE(review): cell_by_bin presumably selects fixed-width bins instead of
// called peaks as features -- confirm in OutputFeatureMatrix.
29 | FeatureBarcodeMatrix(bool cell_by_bin, int bin_size, int overlap_distance,
30 | uint16_t depth_cutoff_to_call_peak)
31 | : cell_by_bin_(cell_by_bin),
32 | bin_size_(bin_size),
33 | overlap_distance_(overlap_distance),
34 | depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak) {
35 | barcode_index_table_ = kh_init(k64_seq);
36 | }
37 |
// Releases the barcode index table.
38 | ~FeatureBarcodeMatrix() {
39 | if (barcode_index_table_ != NULL) {
40 | kh_destroy(k64_seq, barcode_index_table_);
41 | }
42 | }
43 |
// Entry point: writes the matrix for `mappings`; output file names are
// derived from `matrix_output_prefix`.
44 | void OutputFeatureMatrix(
45 | uint32_t num_sequences, const SequenceBatch &reference,
46 | const std::vector> &mappings,
47 | const std::string &matrix_output_prefix);
48 |
49 | private:
50 | void BuildAugmentedTreeForPeaks(uint32_t ref_id);
51 |
// Clears `overlapped_peak_indices`, fills it with the `index` of every peak
// on `ref_id` overlapping the mapping (widened by overlap_distance_), and
// returns how many were found.
52 | uint32_t GetNumOverlappedPeaks(
53 | uint32_t ref_id, const PairedEndMappingWithBarcode &mapping,
54 | std::vector &overlapped_peak_indices);
55 |
// Appends the indices of the bins of width bin_size_ that a mapping touches.
56 | void GetNumOverlappedBins(uint32_t rid, uint32_t start_position,
57 | uint16_t mapping_length, uint32_t num_sequences,
58 | const SequenceBatch &reference,
59 | std::vector &overlapped_peak_indices);
60 |
61 | uint32_t CallPeaks(
62 | uint16_t coverage_threshold, uint32_t num_reference_sequences,
63 | const SequenceBatch &reference,
64 | const std::vector> &mappings);
65 |
66 | const bool cell_by_bin_;
67 | const int bin_size_;
// Number of positions by which a mapping is widened on each side before it
// is tested against peaks.
68 | const int overlap_distance_;
69 | const uint16_t depth_cutoff_to_call_peak_;
70 |
71 | khash_t(k64_seq) * barcode_index_table_;
72 | // (max_level, # nodes)
73 | std::vector> tree_info_on_diff_ref_seqs_;
74 |
75 | // max
76 | std::vector> tree_extras_on_diff_ref_seqs_;
77 |
78 | // For peak calling.
79 | std::vector> pileup_on_diff_ref_seqs_;
80 | std::vector> peaks_on_diff_ref_seqs_;
81 |
82 | FeatureBarcodeMatrixWriter feature_barcode_matrix_writer_;
83 | };
84 |
85 | } // namespace chromap
86 |
87 | #endif // FEATURE_BARCODE_MATRIX_H_
88 |
--------------------------------------------------------------------------------
/src/feature_barcode_matrix_writer.h:
--------------------------------------------------------------------------------
1 | #ifndef FEATUREBARCODEMATRIXWRITER_H_
2 | #define FEATUREBARCODEMATRIXWRITER_H_
3 |
4 | #include
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #include "barcode_translator.h"
14 | #include "sequence_batch.h"
15 |
16 | namespace chromap {
17 |
18 | // The code here is not working properly since the barcode length is not set.
19 | // But this feature is not used in the realse for now so this is fine.
20 | class FeatureBarcodeMatrixWriter {
21 | public:
22 | FeatureBarcodeMatrixWriter() {}
23 | ~FeatureBarcodeMatrixWriter() {}
24 |
25 | inline void InitializeMatrixOutput(const std::string &matrix_output_prefix) {
26 | matrix_output_prefix_ = matrix_output_prefix;
27 | matrix_output_file_ =
28 | fopen((matrix_output_prefix_ + "_matrix.mtx").c_str(), "w");
29 | assert(matrix_output_file_ != nullptr);
30 | peak_output_file_ =
31 | fopen((matrix_output_prefix_ + "_peaks.bed").c_str(), "w");
32 | assert(peak_output_file_ != nullptr);
33 | barcode_output_file_ =
34 | fopen((matrix_output_prefix_ + "_barcode.tsv").c_str(), "w");
35 | assert(barcode_output_file_ != nullptr);
36 | }
37 |
38 | void OutputPeaks(uint32_t bin_size, uint32_t num_sequences,
39 | const SequenceBatch &reference) {
40 | for (uint32_t rid = 0; rid < num_sequences; ++rid) {
41 | uint32_t sequence_length = reference.GetSequenceLengthAt(rid);
42 | const char *sequence_name = reference.GetSequenceNameAt(rid);
43 | for (uint32_t position = 0; position < sequence_length;
44 | position += bin_size) {
45 | fprintf(peak_output_file_, "%s\t%u\t%u\n", sequence_name, position + 1,
46 | position + bin_size);
47 | }
48 | }
49 | }
50 |
51 | void OutputPeaks(uint32_t peak_start_position, uint16_t peak_length,
52 | uint32_t rid, const SequenceBatch &reference) {
53 | const char *sequence_name = reference.GetSequenceNameAt(rid);
54 | fprintf(peak_output_file_, "%s\t%u\t%u\n", sequence_name,
55 | peak_start_position + 1, peak_start_position + peak_length);
56 | }
57 |
58 | void AppendBarcodeOutput(uint64_t barcode_key) {
59 | fprintf(barcode_output_file_, "%s-1\n",
60 | barcode_translator_.Translate(barcode_key, cell_barcode_length_)
61 | .data());
62 | }
63 |
64 | void WriteMatrixOutputHead(uint64_t num_peaks, uint64_t num_barcodes,
65 | uint64_t num_lines) {
66 | fprintf(matrix_output_file_, "%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\n",
67 | num_peaks, num_barcodes, num_lines);
68 | }
69 |
70 | void AppendMatrixOutput(uint32_t peak_index, uint32_t barcode_index,
71 | uint32_t num_mappings) {
72 | fprintf(matrix_output_file_, "%u\t%u\t%u\n", peak_index, barcode_index,
73 | num_mappings);
74 | }
75 |
76 | inline void FinalizeMatrixOutput() {
77 | fclose(matrix_output_file_);
78 | fclose(peak_output_file_);
79 | fclose(barcode_output_file_);
80 | }
81 |
82 | inline void SetBarcodeTranslateTable(const std::string &file) {
83 | barcode_translator_.SetTranslateTable(file);
84 | }
85 |
86 | inline void SetBarcodeLength(uint32_t cell_barcode_length) {
87 | cell_barcode_length_ = cell_barcode_length;
88 | }
89 |
90 | protected:
91 | uint32_t cell_barcode_length_ = 16;
92 | std::string matrix_output_prefix_;
93 | FILE *peak_output_file_ = nullptr;
94 | FILE *barcode_output_file_ = nullptr;
95 | FILE *matrix_output_file_ = nullptr;
96 | BarcodeTranslator barcode_translator_;
97 | };
98 |
99 | } // namespace chromap
100 |
101 | #endif // FEATUREBARCODEMATRIXWRITER_H_
102 |
--------------------------------------------------------------------------------
/src/hit_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef HIT_UTILS_H_
2 | #define HIT_UTILS_H_
3 |
4 | #include "strand.h"
5 |
6 | namespace chromap {
7 |
// Returns the reference sequence index of a hit, which is stored in bits 33
// and above of the 64-bit hit value.
inline static uint32_t HitToSequenceIndex(uint64_t hit) {
  const uint64_t sequence_index = hit >> 33;
  return static_cast<uint32_t>(sequence_index);
}
9 |
// Returns the position part of a hit: the hit is shifted right by one bit (the
// lowest bit is dropped) and the result is truncated to its low 32 bits.
inline static uint32_t HitToSequencePosition(uint64_t hit) {
  const uint64_t without_lowest_bit = hit >> 1;
  return static_cast<uint32_t>(without_lowest_bit);
}
13 |
14 | inline static Strand HitToStrand(uint64_t hit) {
15 | if ((hit & 1) == 0) {
16 | return kPositive;
17 | }
18 | return kNegative;
19 | }
20 |
// Returns true when both hits carry the same value in their lowest bit, which
// is the bit HitToStrand() decodes.
inline static bool AreTwoHitsOnTheSameStrand(uint64_t hit1, uint64_t hit2) {
  // XOR leaves the lowest bit set only when the two lowest bits differ.
  const uint64_t differing_bits = hit1 ^ hit2;
  return (differing_bits & 1) == 0;
}
24 |
25 | } // namespace chromap
26 |
27 | #endif // HIT_UTILS_H_
28 |
--------------------------------------------------------------------------------
/src/index.h:
--------------------------------------------------------------------------------
1 | #ifndef INDEX_H_
2 | #define INDEX_H_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "candidate_position_generating_config.h"
10 | #include "index_parameters.h"
11 | #include "index_utils.h"
12 | #include "mapping_metadata.h"
13 | #include "minimizer.h"
14 | #include "sequence_batch.h"
15 | #include "utils.h"
16 |
17 | namespace chromap {
18 |
// Minimizer index: a hash table (lookup_table_) plus an occurrence table.
//
// NOTE(review): lookup_table_ is a raw owning pointer released in Destroy(),
// yet copy construction is not deleted; copying an Index would make two
// objects free the same table.
19 | class Index {
20 | public:
21 | Index() = delete;
22 |
23 | // For read mapping.
// Only the index file path is recorded; kmer_size_ and window_size_ keep their
// defaults of 0 here (presumably filled in by Load() -- confirm).
24 | Index(const std::string &index_file_path)
25 | : index_file_path_(index_file_path) {
26 | lookup_table_ = kh_init(k64);
27 | }
28 |
29 | // For index construction.
30 | Index(const IndexParameters &index_parameters)
31 | : kmer_size_(index_parameters.kmer_size),
32 | window_size_(index_parameters.window_size),
33 | num_threads_(index_parameters.num_threads),
34 | index_file_path_(index_parameters.index_output_file_path) {
35 | lookup_table_ = kh_init(k64);
36 | }
37 |
38 | ~Index() { Destroy(); }
39 |
// Frees the lookup table (resetting the pointer so a second call is harmless)
// and releases the storage of the occurrence table by swapping it with an
// empty vector.
40 | void Destroy() {
41 | if (lookup_table_ != nullptr) {
42 | kh_destroy(k64, lookup_table_);
43 | lookup_table_ = nullptr;
44 | }
45 |
46 | std::vector().swap(occurrence_table_);
47 | }
48 |
49 | void Construct(uint32_t num_sequences, const SequenceBatch &reference);
50 |
51 | void Save() const;
52 |
53 | void Load();
54 |
55 | // Output index stats.
56 | void Statistics(uint32_t num_sequences, const SequenceBatch &reference) const;
57 |
58 | // Check the index for some reference genome. Only for debug.
59 | void CheckIndex(uint32_t num_sequences, const SequenceBatch &reference) const;
60 |
61 | // Return the number of repetitive seeds.
62 | int GenerateCandidatePositions(
63 | const CandidatePositionGeneratingConfig &generating_config,
64 | MappingMetadata &mapping_metadata) const;
65 |
66 | // Input a search range, for each best mate candidate, search for candidate
67 | // positions for the read. Return the minimizer count of the best candidate if
68 | // it finishes normally. Or return a negative value if it stops early due to
69 | // too many candidates with low minimizer count.
70 | // 'strand' is the strand to generate (augment) candidates.
71 | int GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(
72 | const Strand strand, uint32_t search_range,
73 | int min_num_seeds_required_for_mapping, int max_seed_frequency0,
74 | int error_threshold, const std::vector &minimizers,
75 | const std::vector &mate_candidates,
76 | uint32_t &repetitive_seed_length,
77 | std::vector &candidate_positions) const;
78 |
79 | int GetKmerSize() const { return kmer_size_; }
80 |
81 | int GetWindowSize() const { return window_size_; }
82 |
// Number of entries in the lookup table. Must not be called after Destroy(),
// which leaves lookup_table_ null.
83 | uint32_t GetLookupTableSize() const { return kh_size(lookup_table_); }
84 |
85 | private:
86 | uint64_t GenerateCandidatePositionFromHits(uint64_t reference_hit,
87 | uint64_t read_hit) const;
88 |
89 | void UpdateRepetitiveSeedStats(uint32_t read_position,
90 | RepetitiveSeedStats &stats) const;
91 |
92 | int kmer_size_ = 0;
93 | int window_size_ = 0;
94 | // Number of threads to build the index, which is not used right now.
95 | int num_threads_ = 1;
96 | const std::string index_file_path_;
97 | khash_t(k64) *lookup_table_ = nullptr;
98 | std::vector occurrence_table_;
99 | };
100 |
101 | } // namespace chromap
102 |
103 | #endif // INDEX_H_
104 |
--------------------------------------------------------------------------------
/src/index_parameters.h:
--------------------------------------------------------------------------------
1 | #ifndef INDEX_PARAMETERS_H_
2 | #define INDEX_PARAMETERS_H_
3 |
4 | namespace chromap {
5 |
// Settings used when constructing an index. Index's construction constructor
// copies kmer_size, window_size, num_threads and index_output_file_path from
// this struct.
struct IndexParameters {
  int kmer_size{17};
  int window_size{7};
  int num_threads{1};
  std::string reference_file_path{};
  std::string index_output_file_path{};
};
13 |
14 | } // namespace chromap
15 |
16 | #endif // INDEX_PARAMETERS_H_
17 |
--------------------------------------------------------------------------------
/src/index_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef INDEX_UTILS_H_
2 | #define INDEX_UTILS_H_
3 |
4 | #include
5 |
6 | #include "khash.h"
7 |
8 | // Note that the max kmer size is 28 and its hash value is always saved in the
9 | // lowest 56 bits of an unsigned 64-bit integer. When an element is inserted
10 | // into the hash table, its hash value is left shifted by 1 bit and the lowest
11 | // bit of the key value is set to 1 when the minimizer only occurs once. So
12 | // right shift by one bit is lossless and safe.
// Hash of a key: the key with its lowest (flag) bit shifted out.
13 | #define KHashFunctionForIndex(a) ((a) >> 1)
// Two keys are equal when they agree in every bit except the lowest one, so a
// lookup finds an entry regardless of how its flag bit is set.
14 | #define KHashEqForIndex(a, b) ((a) >> 1 == (b) >> 1)
// Instantiates the khash map type `k64` (uint64_t key -> uint64_t value) with
// the hash and equality macros above.
15 | KHASH_INIT(/*name=*/k64, /*khkey_t=*/uint64_t, /*khval_t=*/uint64_t,
16 | /*kh_is_map=*/1, /*__hash_func=*/KHashFunctionForIndex,
17 | /*__hash_equal=*/KHashEqForIndex);
18 |
19 | namespace chromap {
20 |
// Running statistics about repetitive seeds. A fresh object starts with zero
// length, zero count, and the previous position set to the largest uint32_t
// value.
struct RepetitiveSeedStats {
  uint32_t repetitive_seed_length{0};
  uint32_t previous_repetitive_seed_position{
      std::numeric_limits<uint32_t>::max()};
  int repetitive_seed_count{0};
};
27 |
// Turns a minimizer hash into a lookup-table key by shifting it left one bit,
// which leaves the lowest bit clear.
inline static uint64_t GenerateHashInLookupTable(uint64_t minimizer_hash) {
  const uint64_t lookup_key = minimizer_hash << 1;
  return lookup_key;
}
31 |
// Packs a lookup-table value: the occurrence-table offset goes into the upper
// 32 bits and the number of occurrences into the lower 32 bits.
inline static uint64_t GenerateEntryValueInLookupTable(
    uint64_t occurrence_table_offset, uint32_t num_occurrences) {
  const uint64_t upper_half = occurrence_table_offset << 32;
  const uint64_t lower_half = num_occurrences;
  return upper_half | lower_half;
}
36 |
// Returns the upper 32 bits of a lookup-table value, i.e. the occurrence-table
// offset packed by GenerateEntryValueInLookupTable().
inline static uint32_t GenerateOffsetInOccurrenceTable(uint64_t lookup_value) {
  const uint64_t upper_half = lookup_value >> 32;
  return static_cast<uint32_t>(upper_half);
}
40 |
// Returns the lower 32 bits of a lookup-table value, i.e. the number of
// occurrences packed by GenerateEntryValueInLookupTable().
inline static uint32_t GenerateNumOccurrenceInOccurrenceTable(
    uint64_t lookup_table_entry_value) {
  const uint64_t lower_half = lookup_table_entry_value & 0xFFFFFFFFULL;
  return static_cast<uint32_t>(lower_half);
}
45 |
// Packs a candidate position: the sequence id occupies the upper 32 bits and
// the position within the sequence the lower 32 bits.
inline static uint64_t SequenceIndexAndPositionToCandidatePosition(
    uint64_t sequence_id, uint32_t sequence_position) {
  const uint64_t upper_half = sequence_id << 32;
  const uint64_t lower_half = sequence_position;
  return upper_half | lower_half;
}
50 |
// Converts an occurrence-table entry into a candidate position by dropping
// the entry's lowest bit.
inline static uint64_t GenerateCandidatePositionFromOccurrenceTableEntry(
    uint64_t entry) {
  const uint64_t candidate_position = entry >> 1;
  return candidate_position;
}
55 |
// Returns true when the lowest bit of a lookup-table key is set, which marks
// a minimizer that occurs only once.
inline static bool IsSingletonLookupKey(uint64_t lookup_key) {
  const uint64_t singleton_flag = lookup_key & 1;
  return singleton_flag != 0;
}
59 |
// A candidate position tagged with the index of the sorted list it was taken
// from. Only used in Index to merge sorted candidate position lists using a
// heap.
struct CandidatePositionWithListIndex {
  uint32_t list_index;
  uint64_t position;

  CandidatePositionWithListIndex(uint32_t list_index, uint64_t position)
      : list_index(list_index), position(position) {}

  // The comparison is deliberately inverted: std::priority_queue is a
  // max-heap, so "less" has to mean "larger position" to get a min-heap.
  bool operator<(const CandidatePositionWithListIndex &h) const {
    return position > h.position;
  }
};

// Merges several individually sorted (ascending) position lists into one
// ascending sequence and appends it to `candidate_positions`. Duplicates are
// kept; empty lists are ignored; existing content of the output is preserved.
//
// The input is taken by const reference: it is only read, and taking it by
// value deep-copied every list on each call.
//
// Runs in O(N log K) for N positions in K lists.
inline static void HeapMergeCandidatePositionLists(
    const std::vector<std::vector<uint64_t>> &sorted_candidate_position_lists,
    std::vector<uint64_t> &candidate_positions) {
  const uint32_t num_lists =
      static_cast<uint32_t>(sorted_candidate_position_lists.size());

  // next_index_in_list[li] is the index of the first element of list li that
  // has not been pushed onto the heap yet.
  std::vector<uint32_t> next_index_in_list(num_lists, 0);
  std::priority_queue<CandidatePositionWithListIndex> heap;

  // Seed the heap with the head of every non-empty list and size the output.
  size_t num_positions = 0;
  for (uint32_t li = 0; li < num_lists; ++li) {
    const std::vector<uint64_t> &list = sorted_candidate_position_lists[li];
    num_positions += list.size();
    if (list.empty()) {
      continue;
    }
    heap.emplace(li, list[0]);
    next_index_in_list[li] = 1;
  }
  candidate_positions.reserve(candidate_positions.size() + num_positions);

  // Repeatedly emit the smallest head and replace it by its successor.
  while (!heap.empty()) {
    const CandidatePositionWithListIndex smallest = heap.top();
    heap.pop();
    candidate_positions.push_back(smallest.position);

    const std::vector<uint64_t> &source_list =
        sorted_candidate_position_lists[smallest.list_index];
    uint32_t &next_index = next_index_in_list[smallest.list_index];
    if (next_index < source_list.size()) {
      heap.emplace(smallest.list_index, source_list[next_index]);
      ++next_index;
    }
  }
}
106 |
107 | } // namespace chromap
108 |
109 | #endif // INDEX_UTILS_H_
110 |
--------------------------------------------------------------------------------
/src/kseq.h:
--------------------------------------------------------------------------------
1 | /* The MIT License
2 |
3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | */
25 |
26 | /* Last Modified: 05MAR2012 */
27 |
28 | #ifndef AC_KSEQ_H
29 | #define AC_KSEQ_H
30 |
31 | #include
32 | #include
33 | #include
34 |
/* Delimiter selectors accepted by ks_getuntil()/ks_getuntil2(). Any value
   greater than KS_SEP_MAX is treated there as a literal delimiter character. */
35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
36 | #define KS_SEP_TAB 1 // isspace() && !' '
37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
38 | #define KS_SEP_MAX 2
39 |
/* Declares kstream_t, a buffered reader over a handle of type type_t:
   buf is the buffer, [begin, end) the unread part of it, is_eof is set once
   the read function reported end of input, and f is the underlying handle. */
40 | #define __KS_TYPE(type_t) \
41 | typedef struct __kstream_t { \
42 | unsigned char *buf; \
43 | int begin, end, is_eof; \
44 | type_t f; \
45 | } kstream_t;
46 |
/* ks_err: the last read failed (end was set to -1).
   ks_eof: end of input was seen and the buffer is fully consumed.
   ks_rewind: resets the buffer state; it does not reposition the handle f. */
47 | #define ks_err(ks) ((ks)->end == -1)
48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
50 |
/* Defines ks_init() and ks_destroy().
   ks_init allocates a zeroed kstream_t wrapping handle f together with a
   buffer of __bufsize bytes. ks_destroy frees both and accepts a null pointer;
   it does not close f.
   NOTE(review): the results of calloc/malloc are not checked, so ks_init
   dereferences a null pointer if the first allocation fails. */
51 | #define __KS_BASIC(type_t, __bufsize) \
52 | static inline kstream_t *ks_init(type_t f) \
53 | { \
54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
55 | ks->f = f; \
56 | ks->buf = (unsigned char*)malloc(__bufsize); \
57 | return ks; \
58 | } \
59 | static inline void ks_destroy(kstream_t *ks) \
60 | { \
61 | if (ks) { \
62 | free(ks->buf); \
63 | free(ks); \
64 | } \
65 | }
66 |
/* Defines ks_getc(), which returns the next byte of the stream as a
   non-negative int, refilling the buffer through __read when it is empty.
   Returns -1 at end of input and -3 when __read reported an error (returned
   -1), including on every later call after such an error. */
67 | #define __KS_GETC(__read, __bufsize) \
68 | static inline int ks_getc(kstream_t *ks) \
69 | { \
70 | if (ks_err(ks)) return -3; \
71 | if (ks->is_eof && ks->begin >= ks->end) return -1; \
72 | if (ks->begin >= ks->end) { \
73 | ks->begin = 0; \
74 | ks->end = __read(ks->f, ks->buf, __bufsize); \
75 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \
76 | if (ks->end == -1) { ks->is_eof = 1; return -3;}\
77 | } \
78 | return (int)ks->buf[ks->begin++]; \
79 | }
80 |
/* Growable string: l is the used length, m the allocated capacity and s the
   buffer. The guard lets another header provide the same type first. */
81 | #ifndef KSTRING_T
82 | #define KSTRING_T kstring_t
83 | typedef struct __kstring_t {
84 | size_t l, m;
85 | char *s;
86 | } kstring_t;
87 | #endif
88 |
/* Rounds x up, in place, to the next power of two by smearing the highest set
   bit downwards. The shifts only cover 32 bits, so values that need more than
   32 bits are not rounded to a power of two. */
89 | #ifndef kroundup32
90 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
91 | #endif
92 |
/* Defines ks_getuntil2() and its wrapper ks_getuntil() (append = 0).
   ks_getuntil2 copies bytes from the stream into str up to, but not
   including, the next delimiter and NUL-terminates the result.
     delimiter  one of KS_SEP_SPACE / KS_SEP_TAB / KS_SEP_LINE, or a literal
                character when greater than KS_SEP_MAX
     str        output string; emptied first unless append is non-zero
     dret       if non-null, receives the delimiter byte that ended the scan,
                or 0 when the input ended first
   Returns the resulting length of str, -1 when the stream was already at end
   of input and nothing was read, or -3 on a read error. For KS_SEP_LINE a
   trailing '\r' is removed when the string is longer than one character.
   NOTE(review): the result of realloc is not checked. */
93 | #define __KS_GETUNTIL(__read, __bufsize) \
94 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
95 | { \
96 | int gotany = 0; \
97 | if (dret) *dret = 0; \
98 | str->l = append? str->l : 0; \
99 | for (;;) { \
100 | int i; \
101 | if (ks_err(ks)) return -3; \
102 | if (ks->begin >= ks->end) { \
103 | if (!ks->is_eof) { \
104 | ks->begin = 0; \
105 | ks->end = __read(ks->f, ks->buf, __bufsize); \
106 | if (ks->end == 0) { ks->is_eof = 1; break; } \
107 | if (ks->end == -1) { ks->is_eof = 1; return -3; } \
108 | } else break; \
109 | } \
110 | if (delimiter == KS_SEP_LINE) { \
111 | for (i = ks->begin; i < ks->end; ++i) \
112 | if (ks->buf[i] == '\n') break; \
113 | } else if (delimiter > KS_SEP_MAX) { \
114 | for (i = ks->begin; i < ks->end; ++i) \
115 | if (ks->buf[i] == delimiter) break; \
116 | } else if (delimiter == KS_SEP_SPACE) { \
117 | for (i = ks->begin; i < ks->end; ++i) \
118 | if (isspace(ks->buf[i])) break; \
119 | } else if (delimiter == KS_SEP_TAB) { \
120 | for (i = ks->begin; i < ks->end; ++i) \
121 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
122 | } else i = 0; /* never come to here! */ \
123 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
124 | str->m = str->l + (i - ks->begin) + 1; \
125 | kroundup32(str->m); \
126 | str->s = (char*)realloc(str->s, str->m); \
127 | } \
128 | gotany = 1; \
129 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
130 | str->l = str->l + (i - ks->begin); \
131 | ks->begin = i + 1; \
132 | if (i < ks->end) { \
133 | if (dret) *dret = ks->buf[i]; \
134 | break; \
135 | } \
136 | } \
137 | if (!gotany && ks_eof(ks)) return -1; \
138 | if (str->s == 0) { \
139 | str->m = 1; \
140 | str->s = (char*)calloc(1, 1); \
141 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
142 | str->s[str->l] = '\0'; \
143 | return str->l; \
144 | } \
145 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
146 | { return ks_getuntil2(ks, delimiter, str, dret, 0); }
147 |
/* Instantiates the complete buffered-stream layer (type, init/destroy,
   ks_getc, ks_getuntil) for handle type type_t, read function __read and
   buffer size __bufsize. */
148 | #define KSTREAM_INIT(type_t, __read, __bufsize) \
149 | __KS_TYPE(type_t) \
150 | __KS_BASIC(type_t, __bufsize) \
151 | __KS_GETC(__read, __bufsize) \
152 | __KS_GETUNTIL(__read, __bufsize)
153 |
/* Resets a kseq_t's parser state and the buffer state of its stream. It does
   not reposition the underlying handle. */
154 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
155 |
/* Defines kseq_init() and kseq_destroy() with storage class SCOPE.
   kseq_init allocates a zeroed kseq_t and a stream over fd. kseq_destroy
   frees the four string buffers, the stream and the record itself, accepts a
   null pointer, and does not close fd.
   NOTE(review): the result of calloc is not checked before it is used. */
156 | #define __KSEQ_BASIC(SCOPE, type_t) \
157 | SCOPE kseq_t *kseq_init(type_t fd) \
158 | { \
159 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
160 | s->f = ks_init(fd); \
161 | return s; \
162 | } \
163 | SCOPE void kseq_destroy(kseq_t *ks) \
164 | { \
165 | if (!ks) return; \
166 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
167 | ks_destroy(ks->f); \
168 | free(ks); \
169 | }
170 |
/* Defines kseq_read(), which parses the next FASTA or FASTQ record from the
   stream into seq->name, seq->comment, seq->seq and (FASTQ only) seq->qual.

   Return value:
     >=0  length of the sequence (normal)
     -1   end-of-file
     -2   truncated quality string
     -3   error reading stream

   The quality loop parenthesizes the assignment to c. Without the
   parentheses, c received the boolean result of the whole `... >= 0 && ...`
   expression, so it could only be 0 or 1 and the `c == -3` check after the
   loop could never report a stream error.
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c,r; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
			if (c < 0) return c; /* end of file or error*/ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r;  /* normal exit: EOF or error */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		/* keep appending quality lines until enough characters were read or the input ends or fails */ \
		while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \
		if (c == -3) return -3; /* stream error */ \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
219 |
/* Declares kseq_t, one parsed record plus parser state: the four strings of
   the record, last_char (the header character '>' or '@' already consumed for
   the next record, or 0), the input stream f, and id.
   NOTE(review): id is not written anywhere in this header; it is presumably
   maintained by the caller. */
220 | #define __KSEQ_TYPE(type_t) \
221 | typedef struct { \
222 | kstring_t name, comment, seq, qual; \
223 | int last_char; \
224 | kstream_t *f; \
225 | uint32_t id; \
226 | } kseq_t;
227 |
/* Instantiates the whole parser for handle type type_t and read function
   __read, with a 16384-byte stream buffer. SCOPE is the storage class given
   to kseq_init, kseq_destroy and kseq_read. */
228 | #define KSEQ_INIT2(SCOPE, type_t, __read) \
229 | KSTREAM_INIT(type_t, __read, 16384) \
230 | __KSEQ_TYPE(type_t) \
231 | __KSEQ_BASIC(SCOPE, type_t) \
232 | __KSEQ_READ(SCOPE)
233 |
/* Same as KSEQ_INIT2 with file-local (static) functions. */
234 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
235 |
/* Declares the types and the three function prototypes without defining
   them, for translation units that use a parser instantiated elsewhere. */
236 | #define KSEQ_DECLARE(type_t) \
237 | __KS_TYPE(type_t) \
238 | __KSEQ_TYPE(type_t) \
239 | extern kseq_t *kseq_init(type_t fd); \
240 | void kseq_destroy(kseq_t *ks); \
241 | int kseq_read(kseq_t *seq);
242 |
243 | #endif
244 |
--------------------------------------------------------------------------------
/src/ksw.h:
--------------------------------------------------------------------------------
1 | #ifndef __AC_KSW_H
2 | #define __AC_KSW_H
3 |
4 | #include
5 |
6 | #define KSW_XBYTE 0x10000
7 | #define KSW_XSTOP 0x20000
8 | #define KSW_XSUBO 0x40000
9 | #define KSW_XSTART 0x80000
10 |
/* Opaque type: only forward-declared in this header; its definition lives
   elsewhere. */
11 | struct _kswq_t;
12 | typedef struct _kswq_t kswq_t;
13 |
/* Alignment result record (scores and end/start coordinates, see the
   per-field comments). */
14 | typedef struct {
15 | int score; // best score
16 | int te, qe; // target end and query end
17 | int score2, te2; // second best score and ending position on the target
18 | int tb, qb; // target start and query start
19 | } kswr_t;
20 |
21 | #ifdef __cplusplus
22 | extern "C" {
23 | #endif
24 |
25 | /**
26 | * Aligning two sequences
27 | *
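28 | * @param qlen length of the query sequence (typically ... [the rest of this comment and the declarations that followed it were lost in extraction; the text below documents ksw_extend()] ... If *gscore>=0, *gscore keeps the best score such that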
96 | * the entire query sequence is aligned; *gtle keeps the position on the
97 | * target where *gscore is achieved. Returning *gscore and *gtle helps the
98 | * caller to decide whether an end-to-end hit or a partial hit is preferred.
99 | *
100 | * The first 9 parameters are identical to those in ksw_global()
101 | *
102 | * @param h0 alignment score of upstream sequences
103 | * @param _qle (out) length of the query in the alignment
104 | * @param _tle (out) length of the target in the alignment
105 | * @param _gtle (out) length of the target if query is fully aligned
106 | * @param _gscore (out) score of the best end-to-end alignment; negative if not found
107 | *
108 | * @return best semi-local alignment score
109 | */
110 | int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
111 | int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
112 |
113 | #ifdef __cplusplus
114 | }
115 | #endif
116 |
117 | #endif
118 |
--------------------------------------------------------------------------------
/src/mapping.h:
--------------------------------------------------------------------------------
1 | #ifndef MAPPING_H_
2 | #define MAPPING_H_
3 |
4 | namespace chromap {
5 |
6 | // An interface for various mapping formats.
// Every member is pure virtual; the commented-out declarations below are kept
// as documentation of operations that are not part of the interface.
7 | class Mapping {
8 | public:
9 | virtual ~Mapping() = default;
10 | //// Defines the orders of mappings. Sort by mapping positions first, then
11 | //// sorted by barcode and other fields if available. Make sure to consider
12 | //// enough field so that the order is always deterministic.
13 | // virtual bool operator<(const Mapping &m) const = 0;
14 | //// Return true if two mappings are the same.
15 | // virtual bool operator==(const Mapping &m) const = 0;
16 | // Return true if two mappings are the same. For paired-end mappings, return
17 | // true if the mapping intervals are the same. For single-end mappings, return
18 | // true if the 5' mapping positions and strands are the same. This is
19 | // different from the previous function as this function does not require
20 | // barcodes to be the same.
21 | // virtual bool IsSamePosition(const Mapping &m) const = 0;
22 | // Return true if the mapping strand is positive. For paired-end reads, return
23 | // true if the mapping strand of the first read is positive.
24 | virtual bool IsPositiveStrand() const = 0;
25 | // Barcodes are encoded by 64-bit integers. This function will return the
26 | // encoded barcode. For mapping without barcode, this function will return 0;
27 | virtual uint64_t GetBarcode() const = 0;
28 | // Return inclusive mapping start position.
29 | virtual uint32_t GetStartPosition() const = 0;
30 | // Return exclusive mapping end position.
31 | virtual uint32_t GetEndPosition() const = 0;
32 | //// Return the total byte size of the mapping data field.
33 | // virtual uint16_t GetByteSize() const = 0;
34 | //// Write this mapping to a temp mapping output file in binary.
35 | // virtual size_t WriteToFile(FILE *temp_mapping_output_file) const = 0;
36 | //// Load this mapping from a temp mapping output file.
37 | // virtual size_t LoadFromFile(FILE *temp_mapping_output_file) = 0;
38 | // Perform Tn5 shift, which will change the start and end positions. Note that
39 | // currently this can only be done on the mappings that are represented by
40 | // intervals.
41 | virtual void Tn5Shift() = 0;
42 | };
43 |
44 | } // namespace chromap
45 |
46 | #endif // MAPPING_H_
47 |
--------------------------------------------------------------------------------
/src/mapping_generator.cc:
--------------------------------------------------------------------------------
1 | #include "mapping_generator.h"
2 |
3 | namespace chromap {
4 |
5 | // For strand, kPositive is 1, kNegative is 0 (the value GetStrand() returns).
6 | template <>  // Single-end record built without a barcode or read name.
7 | void MappingGenerator::EmplaceBackSingleEndMappingRecord(
8 | MappingInMemory &mapping_in_memory,
9 | std::vector>
10 | &mappings_on_diff_ref_seqs) {
11 | mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(  // Bucket chosen by reference id.
12 | mapping_in_memory.read_id, mapping_in_memory.GetFragmentStartPosition(),
13 | mapping_in_memory.GetFragmentLength(), mapping_in_memory.mapq,
14 | mapping_in_memory.GetStrand(), mapping_in_memory.is_unique,
15 | /*num_dups=*/1);
16 | }
17 |
18 | template <>  // Single-end record that additionally carries the barcode key.
19 | void MappingGenerator::EmplaceBackSingleEndMappingRecord(
20 | MappingInMemory &mapping_in_memory,
21 | std::vector> &mappings_on_diff_ref_seqs) {
22 | mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(  // Bucket chosen by reference id.
23 | mapping_in_memory.read_id, mapping_in_memory.barcode_key,
24 | mapping_in_memory.GetFragmentStartPosition(),
25 | mapping_in_memory.GetFragmentLength(), mapping_in_memory.mapq,
26 | mapping_in_memory.GetStrand(), mapping_in_memory.is_unique,
27 | /*num_dups=*/1);
28 | }
29 |
30 | template <>  // Single-end record that additionally carries the read name and read length.
31 | void MappingGenerator::EmplaceBackSingleEndMappingRecord(
32 | MappingInMemory &mapping_in_memory,
33 | std::vector> &mappings_on_diff_ref_seqs) {
34 | mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(  // Bucket chosen by reference id.
35 | mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),  // read_name is a non-owning const char *, so it is copied into a std::string.
36 | mapping_in_memory.read_length,
37 | mapping_in_memory.GetFragmentStartPosition(),
38 | mapping_in_memory.GetFragmentLength(), mapping_in_memory.mapq,
39 | mapping_in_memory.GetStrand(), mapping_in_memory.is_unique,
40 | /*num_dups=*/1);
41 | }
42 |
43 | template <>  // Single-end record with SAM fields (flag, NM, CIGAR, MD tag, sequence, quality).
44 | void MappingGenerator::EmplaceBackSingleEndMappingRecord(
45 | MappingInMemory &mapping_in_memory,
46 | std::vector> &mappings_on_diff_ref_seqs) {
47 | mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(  // Bucket chosen by reference id.
48 | mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
49 | mapping_in_memory.barcode_key, /*num_dups=*/1,
50 | mapping_in_memory.GetFragmentStartPosition(), mapping_in_memory.rid,
51 | /*mpos=*/0, /*mrid=*/-1, /*tlen=*/0,  // Mate fields are constants here: a single-end read has no mate.
52 | mapping_in_memory.SAM_flag, mapping_in_memory.GetStrand(),
53 | /*is_alt=*/0, mapping_in_memory.is_unique, mapping_in_memory.mapq,
54 | mapping_in_memory.NM, mapping_in_memory.n_cigar, mapping_in_memory.cigar,  // The raw cigar pointer is passed as-is.
55 | mapping_in_memory.MD_tag, std::string(mapping_in_memory.read_sequence),
56 | std::string(mapping_in_memory.qual_sequence));
57 | }
58 |
59 | template <>  // Deleted specialization: using the single-end emplace with this mapping type fails to compile.
60 | void MappingGenerator::
61 | EmplaceBackSingleEndMappingRecord(
62 | MappingInMemory &mapping_in_memory,
63 | std::vector>
64 | &mappings_on_diff_ref_seqs) = delete;
65 |
66 | template <>  // Deleted specialization (template argument not visible in this listing).
67 | void MappingGenerator::
68 | EmplaceBackSingleEndMappingRecord(
69 | MappingInMemory &mapping_in_memory,
70 | std::vector>
71 | &mappings_on_diff_ref_seqs) = delete;
72 |
73 | template <>  // Deleted specialization (template argument not visible in this listing).
74 | void MappingGenerator::EmplaceBackSingleEndMappingRecord(
75 | MappingInMemory &mapping_in_memory,
76 | std::vector> &mappings_on_diff_ref_seqs) =
77 | delete;
78 |
79 | template <>  // Deleted specialization (template argument not visible in this listing).
80 | void MappingGenerator::EmplaceBackSingleEndMappingRecord(
81 | MappingInMemory &mapping_in_memory,
82 | std::vector> &mappings_on_diff_ref_seqs) = delete;
83 |
84 | template <>  // Paired-end record with SAM fields: one record is appended per mate.
85 | void MappingGenerator::EmplaceBackPairedEndMappingRecord(
86 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
87 | std::vector> &mappings_on_diff_ref_seqs) {
88 | int tlen = (int)paired_end_mapping_in_memory.GetFragmentLength();  // Fragment length of the pair; its sign is chosen per mate below.
89 | for (int i = 0; i < 2; ++i) {  // i == 0: mate 1 is the record and mate 2 its mate; i == 1: the reverse.
90 | MappingInMemory &mapping_in_memory = (i == 0 ? paired_end_mapping_in_memory.mapping_in_memory1 :
91 | paired_end_mapping_in_memory.mapping_in_memory2);
92 | MappingInMemory &mate_mapping_in_memory = (i == 0 ? paired_end_mapping_in_memory.mapping_in_memory2 :
93 | paired_end_mapping_in_memory.mapping_in_memory1);
94 |
95 | mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(  // Each record goes to the bucket of its own reference id.
96 | mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
97 | mapping_in_memory.barcode_key, /*num_dups=*/1,
98 | mapping_in_memory.GetFragmentStartPosition(), mapping_in_memory.rid,
99 | /*mpos=*/mate_mapping_in_memory.GetFragmentStartPosition(),
100 | /*mrid=*/mate_mapping_in_memory.rid,
101 | /*tlen=*/mapping_in_memory.GetStrand() ? tlen : -tlen,  // Positive for a positive-strand read, negated otherwise.
102 | mapping_in_memory.SAM_flag, mapping_in_memory.GetStrand(),
103 | /*is_alt=*/0, mapping_in_memory.is_unique, mapping_in_memory.mapq,
104 | mapping_in_memory.NM, mapping_in_memory.n_cigar, mapping_in_memory.cigar,
105 | mapping_in_memory.MD_tag, std::string(mapping_in_memory.read_sequence),
106 | std::string(mapping_in_memory.qual_sequence));
107 | }
108 | }
109 |
110 | template <>  // Paired-end fragment record without a barcode.
111 | void MappingGenerator::
112 | EmplaceBackPairedEndMappingRecord(
113 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
114 | std::vector>
115 | &mappings_on_diff_ref_seqs) {
116 | mappings_on_diff_ref_seqs[paired_end_mapping_in_memory.mapping_in_memory1.rid]  // Bucket chosen by mate 1's reference id.
117 | .emplace_back(paired_end_mapping_in_memory.GetReadId(),
118 | paired_end_mapping_in_memory.GetFragmentStartPosition(),
119 | paired_end_mapping_in_memory.GetFragmentLength(),
120 | paired_end_mapping_in_memory.mapq,
121 | paired_end_mapping_in_memory.GetStrand(),
122 | paired_end_mapping_in_memory.is_unique, /*num_dups=*/1,
123 | paired_end_mapping_in_memory.GetPositiveAlignmentLength(),
124 | paired_end_mapping_in_memory.GetNegativeAlignmentLength());
125 | }
126 |
127 | template <>  // Paired-end fragment record that additionally carries the barcode.
128 | void MappingGenerator::
129 | EmplaceBackPairedEndMappingRecord(
130 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
131 | std::vector>
132 | &mappings_on_diff_ref_seqs) {
133 | mappings_on_diff_ref_seqs[paired_end_mapping_in_memory.mapping_in_memory1.rid]  // Bucket chosen by mate 1's reference id.
134 | .emplace_back(paired_end_mapping_in_memory.GetReadId(),
135 | paired_end_mapping_in_memory.GetBarcode(),  // Barcode key of mate 1.
136 | paired_end_mapping_in_memory.GetFragmentStartPosition(),
137 | paired_end_mapping_in_memory.GetFragmentLength(),
138 | paired_end_mapping_in_memory.mapq,
139 | paired_end_mapping_in_memory.GetStrand(),
140 | paired_end_mapping_in_memory.is_unique, /*num_dups=*/1,
141 | paired_end_mapping_in_memory.GetPositiveAlignmentLength(),
142 | paired_end_mapping_in_memory.GetNegativeAlignmentLength());
143 | }
144 |
145 | template <>  // Paired-end record carrying both read names, both read lengths and per-mate MAPQs.
146 | void MappingGenerator::EmplaceBackPairedEndMappingRecord(
147 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
148 | std::vector> &mappings_on_diff_ref_seqs) {
149 | mappings_on_diff_ref_seqs[paired_end_mapping_in_memory.mapping_in_memory1.rid]  // Bucket chosen by mate 1's reference id.
150 | .emplace_back(
151 | paired_end_mapping_in_memory.GetReadId(),
152 | std::string(
153 | paired_end_mapping_in_memory.mapping_in_memory1.read_name),
154 | std::string(
155 | paired_end_mapping_in_memory.mapping_in_memory2.read_name),
156 | paired_end_mapping_in_memory.mapping_in_memory1.read_length,
157 | paired_end_mapping_in_memory.mapping_in_memory2.read_length,
158 | paired_end_mapping_in_memory.GetFragmentStartPosition(),
159 | paired_end_mapping_in_memory.GetNegativeAlignmentLength(),  // NOTE(review): negative length is passed before the fragment and positive lengths; confirm against the record constructor.
160 | paired_end_mapping_in_memory.GetFragmentLength(),
161 | paired_end_mapping_in_memory.GetPositiveAlignmentLength(),
162 | paired_end_mapping_in_memory.mapq,  // Pair-level MAPQ, followed by the MAPQ of each mate.
163 | paired_end_mapping_in_memory.mapping_in_memory1.mapq,
164 | paired_end_mapping_in_memory.mapping_in_memory2.mapq,
165 | paired_end_mapping_in_memory.GetStrand(),
166 | paired_end_mapping_in_memory.is_unique, /*num_dups=*/1);
167 | }
168 |
169 | template <>  // Paired-end record storing one (rid, position, strand) triple per mate, ordered by reference rank.
170 | void MappingGenerator::EmplaceBackPairedEndMappingRecord(
171 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
172 | std::vector> &mappings_on_diff_ref_seqs) {
173 | uint8_t strand1 = paired_end_mapping_in_memory.mapping_in_memory1.GetStrand();
174 | uint8_t strand2 = paired_end_mapping_in_memory.mapping_in_memory2.GetStrand();
175 |
176 | int position1 =  // Starts as the start coordinate; replaced by the end coordinate below for a negative-strand mate.
177 | paired_end_mapping_in_memory.mapping_in_memory1.ref_start_position;
178 | int position2 =
179 | paired_end_mapping_in_memory.mapping_in_memory2.ref_start_position;
180 |
181 | if (paired_end_mapping_in_memory.mapping_in_memory1.strand == kNegative) {
182 | position1 =
183 | paired_end_mapping_in_memory.mapping_in_memory1.ref_end_position;
184 | }
185 |
186 | if (paired_end_mapping_in_memory.mapping_in_memory2.strand == kNegative) {
187 | position2 =
188 | paired_end_mapping_in_memory.mapping_in_memory2.ref_end_position;
189 | }
190 |
191 | int rid1 = paired_end_mapping_in_memory.mapping_in_memory1.rid;
192 | int rid2 = paired_end_mapping_in_memory.mapping_in_memory2.rid;
193 | const int rid1_rank = pairs_custom_rid_rank_[rid1];  // Rank lookup by reference id; the table is defined outside this listing.
194 | const int rid2_rank = pairs_custom_rid_rank_[rid2];
195 |
196 | const bool is_rid1_rank_smaller =  // True if mate 1's reference ranks lower, or both share a reference and mate 1's position is smaller.
197 | rid1_rank < rid2_rank || (rid1 == rid2 && position1 < position2);
198 | if (!is_rid1_rank_smaller) {  // Otherwise swap the two mates' fields (this also happens for equal positions on the same reference).
199 | std::swap(rid1, rid2);
200 | std::swap(position1, position2);
201 | std::swap(strand1, strand2);
202 | }
203 |
204 | mappings_on_diff_ref_seqs[rid1].emplace_back(  // Bucket chosen by the (possibly swapped) first reference id.
205 | paired_end_mapping_in_memory.GetReadId(),
206 | std::string(paired_end_mapping_in_memory.mapping_in_memory1.read_name),  // The read name always comes from mate 1, even after a swap.
207 | paired_end_mapping_in_memory.GetBarcode(), rid1, rid2, position1,
208 | position2, strand1, strand2, paired_end_mapping_in_memory.mapq,
209 | paired_end_mapping_in_memory.is_unique, /*num_dups=*/1);
210 | }
211 |
212 | template <>  // Deleted specialization: using the paired-end emplace with this mapping type fails to compile.
213 | void MappingGenerator::EmplaceBackPairedEndMappingRecord(
214 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
215 | std::vector> &mappings_on_diff_ref_seqs) =
216 | delete;
217 |
218 | template <>  // Deleted specialization (template argument not visible in this listing).
219 | void MappingGenerator::EmplaceBackPairedEndMappingRecord(
220 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
221 | std::vector>
222 | &mappings_on_diff_ref_seqs) = delete;
223 |
224 | template <>  // Deleted specialization (template argument not visible in this listing).
225 | void MappingGenerator::EmplaceBackPairedEndMappingRecord(
226 | PairedEndMappingInMemory &paired_end_mapping_in_memory,
227 | std::vector> &mappings_on_diff_ref_seqs) = delete;
228 |
229 | } // namespace chromap
230 |
--------------------------------------------------------------------------------
/src/mapping_in_memory.h:
--------------------------------------------------------------------------------
1 | #ifndef MAPPING_IN_MEMORY_H_
2 | #define MAPPING_IN_MEMORY_H_
3 |
4 | #include
5 |
6 | #include
7 |
8 | #include "utils.h"
9 |
10 | namespace chromap {
11 |
12 | // Regardless of mapping format, this struct can temporarily hold a mapping in
13 | // memory for easily passing it into the several functions before pushing it
14 | // to the result mapping vector. It never owns the read or the read qual. It
15 | // owns the cigar before pushing the mapping to the vector. (For now, the cigar
16 | // memory is released once a SAMMapping is created.) Since this struct is large,
17 | // we should never create a huge vector of this struct.
18 | struct MappingInMemory {
19 | uint32_t read_id = 0;
20 | int read_split_site = 0;
21 | int read_length = 0;
22 |
23 | uint32_t rid = 0;  // Reference sequence id; used as the bucket index when records are emplaced.
24 | uint32_t ref_start_position = 0;
25 | uint32_t ref_end_position = 0;
26 |
27 | uint64_t barcode_key = 0;
28 |
29 | Strand strand = kPositive;
30 | bool is_unique = true;
31 | uint8_t mapq = 0;
32 |
33 | // It does NOT own read or read qual.
34 | const char *read_name = nullptr;
35 | const char *read_sequence = nullptr;
36 | const char *qual_sequence = nullptr;
37 |
38 | // SAM fields or tags.
39 | uint16_t SAM_flag = 0;
40 |
41 | uint32_t *cigar = nullptr;
42 | int n_cigar = 0;
43 |
44 | int NM = 0;
45 |
46 | std::string MD_tag;
47 |
48 | inline uint8_t GetStrand() const { return (strand == kPositive ? 1 : 0); }  // 1 for kPositive, 0 otherwise.
49 |
50 | inline uint32_t GetFragmentStartPosition() const {
51 | return ref_start_position;
52 | }
53 |
54 | // TODO(Haowen): change this to alignment length.
55 | inline uint16_t GetFragmentLength() const {  // NOTE(review): the uint32_t result is narrowed to uint16_t; spans above 65535 would wrap. Confirm callers never exceed that.
56 | return ref_end_position - ref_start_position + 1;  // The +1 counts ref_end_position as an inclusive coordinate.
57 | }
58 |
59 | inline uint16_t GetAlignmentLength() const {  // Currently the same computation as GetFragmentLength().
60 | return ref_end_position - ref_start_position + 1;
61 | }
62 | };
63 |
64 | struct PairedEndMappingInMemory {  // Holds the two mates of a pair plus pair-level uniqueness and MAPQ.
65 | MappingInMemory mapping_in_memory1;
66 | MappingInMemory mapping_in_memory2;
67 | uint8_t is_unique;  // NOTE(review): no default initializer, unlike the MappingInMemory fields; must be set before use.
68 | uint8_t mapq;  // NOTE(review): no default initializer either.
69 |
70 | inline uint8_t GetStrand() const {  // Strand of mate 1: 1 for kPositive, 0 otherwise.
71 | return (mapping_in_memory1.strand == kPositive ? 1 : 0);
72 | }
73 |
74 | inline uint32_t GetReadId() const { return mapping_in_memory1.read_id; }  // Taken from mate 1.
75 |
76 | inline uint64_t GetBarcode() const { return mapping_in_memory1.barcode_key; }  // Taken from mate 1.
77 |
78 | inline uint32_t GetFragmentStartPosition() const {  // Start of mate 1 if it is kPositive, otherwise start of mate 2.
79 | if (mapping_in_memory1.strand == kPositive) {
80 | return mapping_in_memory1.GetFragmentStartPosition();
81 | }
82 |
83 | return mapping_in_memory2.GetFragmentStartPosition();
84 | }
85 |
86 | inline int GetFragmentLength() const {  // Mate 1 kPositive: mate 2 end - mate 1 start + 1; otherwise mate 1 end - mate 2 start + 1.
87 | if (mapping_in_memory1.strand == kPositive) {
88 | return mapping_in_memory2.ref_end_position -
89 | mapping_in_memory1.ref_start_position + 1;
90 | }
91 | return mapping_in_memory1.ref_end_position -
92 | mapping_in_memory2.ref_start_position + 1;
93 | }
94 |
95 | inline uint32_t GetPositiveAlignmentLength() const {  // Alignment length of mate 1 if it is kPositive, else of mate 2.
96 | if (mapping_in_memory1.strand == kPositive) {
97 | return mapping_in_memory1.GetAlignmentLength();
98 | }
99 | return mapping_in_memory2.GetAlignmentLength();
100 | }
101 |
102 | inline uint32_t GetNegativeAlignmentLength() const {  // Alignment length of mate 1 if it is kNegative, else of mate 2.
103 | if (mapping_in_memory1.strand == kNegative) {
104 | return mapping_in_memory1.GetAlignmentLength();
105 | }
106 | return mapping_in_memory2.GetAlignmentLength();
107 | }
108 | };
109 |
110 | } // namespace chromap
111 |
112 | #endif // MAPPING_IN_MEMORY_H_
113 |
--------------------------------------------------------------------------------
/src/mapping_metadata.h:
--------------------------------------------------------------------------------
1 | #ifndef MAPPING_METADATA_H_
2 | #define MAPPING_METADATA_H_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "minimizer.h"
10 | #include "candidate.h"
11 | #include "draft_mapping.h"
12 |
13 | namespace chromap {
14 |
15 | class mm_cache;
16 | class Index;
17 | class CandidateProcessor;
18 | class PairedEndMappingMetadata;
19 | class DraftMappingGenerator;
20 | template
21 | class MappingGenerator;
22 | class Chromap;
23 |
24 | class MappingMetadata {  // Per-read working state: minimizers, hits, candidates, draft mappings and split sites, kept per strand.
25 | public:
26 | inline void PrepareForMappingNextRead(int reserve_size) {  // Empties every container, reserves reserve_size in each, and resets repetitive_seed_length_.
27 | Clear();
28 | ReserveSpace(reserve_size);
29 | repetitive_seed_length_ = 0;
30 | }
31 |
32 | inline size_t GetNumMinimizers() const {
33 | return minimizers_.size();
34 | }
35 |
36 | inline size_t GetNumPositiveCandidates() const {
37 | return positive_candidates_.size();
38 | }
39 |
40 | inline size_t GetNumNegativeCandidates() const {
41 | return negative_candidates_.size();
42 | }
43 |
44 | inline size_t GetNumCandidates() const {  // Sum over both strands.
45 | return positive_candidates_.size() + negative_candidates_.size();
46 | }
47 |
48 | inline size_t GetNumDraftMappings() const {  // Sum over both strands.
49 | return positive_mappings_.size() + negative_mappings_.size();
50 | }
51 |
52 | inline void MoveCandidiatesToBuffer() {  // Afterwards the buffers hold the previous candidates and the candidate vectors are empty.
53 | positive_candidates_.swap(positive_candidates_buffer_);
54 | positive_candidates_.clear();  // Drops whatever the buffer held before the swap.
55 | negative_candidates_.swap(negative_candidates_buffer_);
56 | negative_candidates_.clear();
57 | }
58 |
59 | // Callback function to update all candidates: Update is applied to the positive list, then to the negative list.
60 | inline void UpdateCandidates(void (*Update)(std::vector &)) {
61 | Update(positive_candidates_);
62 | Update(negative_candidates_);
63 | }
64 |
65 | inline void SortCandidates() {  // Sorts each strand's list with the element type's operator<.
66 | std::sort(positive_candidates_.begin(), positive_candidates_.end());
67 | std::sort(negative_candidates_.begin(), negative_candidates_.end());
68 | }
69 |
70 | inline void SortMappingsByPositions() {  // Ascending by DraftMapping::position; each strand is sorted separately.
71 | auto compare_function = [](const DraftMapping &a, const DraftMapping &b) {
72 | return a.position < b.position;
73 | };
74 | std::sort(positive_mappings_.begin(), positive_mappings_.end(),
75 | compare_function);
76 | std::sort(negative_mappings_.begin(), negative_mappings_.end(),
77 | compare_function);
78 | }
79 |
80 | inline int GetMinNumErrors() const { return min_num_errors_; }
81 | inline int GetSecondMinNumErrors() const { return second_min_num_errors_; }
82 | inline int GetNumBestMappings() const { return num_best_mappings_; }
83 | inline int GetNumSecondBestMappings() const {
84 | return num_second_best_mappings_;
85 | }
86 |
87 | inline void SetMinNumErrors(int min_num_errors) {
88 | min_num_errors_ = min_num_errors;
89 | }
90 | inline void SetSecondMinNumErrors(int second_min_num_errors) {
91 | second_min_num_errors_ = second_min_num_errors;
92 | }
93 | inline void SetNumBestMappings(int num_best_mappings) {
94 | num_best_mappings_ = num_best_mappings;
95 | }
96 | inline void SetNumSecondBestMappings(int num_second_best_mappings) {
97 | num_second_best_mappings_ = num_second_best_mappings;
98 | }
99 |
100 | // For debug only. Per candidate, prints a strand sign, the index, position >> 32, position cast to int, and count.
101 | inline void PrintCandidates(FILE *fp) {
102 | uint32_t i;
103 | for (i = 0; i < positive_candidates_.size(); ++i)
104 | fprintf(fp, "+ %d %d %d %d\n", i,
105 | (int)(positive_candidates_[i].position >> 32),
106 | (int)(positive_candidates_[i].position),
107 | positive_candidates_[i].count);
108 | for (i = 0; i < negative_candidates_.size(); ++i)
109 | fprintf(fp, "- %d %d %d %d\n", i,
110 | (int)(negative_candidates_[i].position >> 32),
111 | (int)(negative_candidates_[i].position),
112 | negative_candidates_[i].count);
113 | }
114 |
115 | protected:
116 | inline void ReserveSpace(int reserve_size) {  // Reserves capacity in every container; sizes are unchanged.
117 | minimizers_.reserve(reserve_size);
118 | positive_hits_.reserve(reserve_size);
119 | negative_hits_.reserve(reserve_size);
120 | positive_candidates_.reserve(reserve_size);
121 | negative_candidates_.reserve(reserve_size);
122 | positive_candidates_buffer_.reserve(reserve_size);
123 | negative_candidates_buffer_.reserve(reserve_size);
124 | positive_mappings_.reserve(reserve_size);
125 | negative_mappings_.reserve(reserve_size);
126 | positive_split_sites_.reserve(reserve_size);
127 | negative_split_sites_.reserve(reserve_size);
128 | }
129 |
130 | inline void Clear() {  // Empties the containers only; the scalar members below are left untouched.
131 | minimizers_.clear();
132 | positive_hits_.clear();
133 | negative_hits_.clear();
134 | positive_candidates_.clear();
135 | negative_candidates_.clear();
136 | positive_candidates_buffer_.clear();
137 | negative_candidates_buffer_.clear();
138 | positive_mappings_.clear();
139 | negative_mappings_.clear();
140 | positive_split_sites_.clear();
141 | negative_split_sites_.clear();
142 | }
143 |
144 | int min_num_errors_, second_min_num_errors_;  // No default initializer; not reset by Clear() or PrepareForMappingNextRead().
145 | int num_best_mappings_, num_second_best_mappings_;  // Same: assigned only through the setters (or by friend classes).
146 |
147 | uint32_t repetitive_seed_length_;  // Reset to 0 in PrepareForMappingNextRead().
148 |
149 | std::vector minimizers_;
150 |
151 | std::vector positive_hits_;
152 | std::vector negative_hits_;
153 |
154 | std::vector positive_candidates_;
155 | std::vector negative_candidates_;
156 |
157 | std::vector positive_candidates_buffer_;  // Filled by MoveCandidiatesToBuffer().
158 | std::vector negative_candidates_buffer_;
159 |
160 | // The first element is ed, and the second element is position.
161 | std::vector positive_mappings_;
162 | std::vector negative_mappings_;
163 |
164 | std::vector positive_split_sites_;
165 | std::vector negative_split_sites_;
166 |
167 | friend class mm_cache;  // The friend classes below access the protected members directly.
168 | friend class Index;
169 | friend class CandidateProcessor;
170 | friend class PairedEndMappingMetadata;
171 | friend class DraftMappingGenerator;
172 | template
173 | friend class MappingGenerator;
174 | friend class Chromap;
175 | };
176 |
177 | } // namespace chromap
178 |
179 | #endif // MAPPING_METADATA_H_
180 |
--------------------------------------------------------------------------------
/src/mapping_parameters.h:
--------------------------------------------------------------------------------
1 | #ifndef MAPPING_PARAMETERS_H_
2 | #define MAPPING_PARAMETERS_H_
3 |
4 | #include
5 | #include
6 |
7 | namespace chromap {
8 |
9 | enum MappingOutputFormat {  // Selects the mapping output format; values are implicitly numbered from 0.
10 | MAPPINGFORMAT_UNKNOWN,
11 | MAPPINGFORMAT_BED,
12 | MAPPINGFORMAT_TAGALIGN,
13 | MAPPINGFORMAT_PAF,
14 | MAPPINGFORMAT_SAM,
15 | MAPPINGFORMAT_PAIRS
16 | };
17 |
18 | struct MappingParameters {  // Plain bundle of mapping options; each member's initializer is its default value.
19 | int error_threshold = 8;  // Also drives GetNumVPULanes() below.
20 | int match_score = 1;
21 | int mismatch_penalty = 4;
22 | std::vector gap_open_penalties = {6, 6};
23 | std::vector gap_extension_penalties = {1, 1};
24 | int min_num_seeds_required_for_mapping = 2;
25 | std::vector max_seed_frequencies = {500, 1000};
26 |
27 | double cache_update_param = 0.01;
28 | int cache_size = 4000003;
29 | bool debug_cache = false;
30 | std::string frip_est_params = "-1.0996;4.2391;3.0164e-05;-2.1087e-04;-5.5825e-05";  // Semicolon-separated list of five numbers.
31 | bool output_num_uniq_cache_slots = true;
32 | int k_for_minhash = 250;
33 |
34 | // Read with # best mappings greater than it will have this number of best
35 | // mappings reported.
36 | int max_num_best_mappings = 1;
37 | int max_insert_size = 1000;
38 | uint8_t mapq_threshold = 30;
39 | int num_threads = 1;
40 | int min_read_length = 30;
41 | int barcode_correction_error_threshold = 1;
42 | double barcode_correction_probability_threshold = 0.9;
43 | int multi_mapping_allocation_distance = 0;
44 | int multi_mapping_allocation_seed = 11;
45 | // Read with more than this number of mappings will be dropped.
46 | int drop_repetitive_reads = 500000;
47 | bool trim_adapters = false;
48 | bool remove_pcr_duplicates = false;
49 | bool remove_pcr_duplicates_at_bulk_level = true;
50 | bool is_bulk_data = true;
51 | bool allocate_multi_mappings = false;
52 | bool only_output_unique_mappings = true;
53 | bool output_mappings_not_in_whitelist = false;
54 | bool Tn5_shift = false;
55 | bool split_alignment = false;
56 | MappingOutputFormat mapping_output_format = MAPPINGFORMAT_BED;
57 | bool low_memory_mode = false;
58 | bool cell_by_bin = false;
59 | int bin_size = 5000;
60 | uint16_t depth_cutoff_to_call_peak = 3;
61 | int peak_min_length = 30;
62 | int peak_merge_max_length = 30;
63 | std::string reference_file_path;
64 | std::string index_file_path;
65 | std::vector read_file1_paths;
66 | std::vector read_file2_paths;
67 | std::vector barcode_file_paths;
68 | std::string barcode_whitelist_file_path;
69 | std::string read_format;
70 | std::string mapping_output_file_path;
71 | std::string matrix_output_prefix;
72 | // The order for general sorting.
73 | std::string custom_rid_order_file_path;
74 | // The order for pairs format flipping.
75 | std::string pairs_flipping_custom_rid_order_file_path;
76 | std::string barcode_translate_table_file_path;
77 | std::string summary_metadata_file_path;
78 | bool skip_barcode_check = false;
79 |
80 | int GetNumVPULanes() const {  // Returns 8 when error_threshold < 8, 4 when it is in [8, 16), and 0 otherwise.
81 | int NUM_VPU_LANES = 0;
82 | if (error_threshold < 8) {
83 | NUM_VPU_LANES = 8;
84 | } else if (error_threshold < 16) {
85 | NUM_VPU_LANES = 4;
86 | }
87 | return NUM_VPU_LANES;  // NOTE(review): 0 for error_threshold >= 16; confirm callers handle a zero lane count.
88 | }
89 | };
90 |
91 | } // namespace chromap
92 |
93 | #endif // MAPPING_PARAMETERS_H_
94 |
--------------------------------------------------------------------------------
/src/minimizer.h:
--------------------------------------------------------------------------------
1 | #ifndef MINIMIZER_H_
2 | #define MINIMIZER_H_
3 |
4 | #include
5 |
6 | #include "hit_utils.h"
7 | #include "strand.h"
8 |
9 | namespace chromap {
10 |
11 | class Minimizer {  // A (hash, hit) pair; ordered by hash first, then by hit.
12 | public:
13 | Minimizer() = delete;
14 |
15 | Minimizer(std::pair minimizer)  // first becomes hash_, second becomes hit_.
16 | : hash_(minimizer.first), hit_(minimizer.second) {}
17 |
18 | Minimizer(uint64_t hash, uint64_t hit) : hash_(hash), hit_(hit) {}
19 |
20 | ~Minimizer() = default;
21 |
22 | inline uint64_t GetHash() const { return hash_; }
23 |
24 | inline uint64_t GetHit() const { return hit_; }
25 |
26 | inline uint32_t GetSequenceIndex() const { return HitToSequenceIndex(hit_); }  // Decoding is delegated to the hit helper functions.
27 |
28 | inline uint32_t GetSequencePosition() const {
29 | return HitToSequencePosition(hit_);
30 | }
31 |
32 | inline Strand GetSequenceStrand() const { return HitToStrand(hit_); }
33 |
34 | inline bool operator<(const Minimizer &m) const {  // Lexicographic order on (hash_, hit_).
35 | if (hash_ < m.hash_) {
36 | return true;
37 | }
38 |
39 | if (hash_ == m.hash_ && hit_ < m.hit_) {
40 | return true;
41 | }
42 |
43 | return false;
44 | }
45 |
46 | private:
47 | // The hash of the kmer.
48 | uint64_t hash_ = 0;
49 |
50 | // The high 31 bits save the sequence index in the sequence batch. The
51 | // following 32 bits save the end position on that sequence. And the lowest
52 | // bit encodes the strand (0 for positive).
53 | uint64_t hit_ = 0;
54 | };
55 |
56 | } // namespace chromap
57 |
58 | #endif // MINIMIZER_H_
59 |
--------------------------------------------------------------------------------
/src/minimizer_generator.cc:
--------------------------------------------------------------------------------
1 | #include "minimizer_generator.h"
2 |
3 | #include "utils.h"
4 |
5 | namespace chromap {
6 |
7 | void MinimizerGenerator::GenerateMinimizers(  // Appends the minimizers of one sequence of the batch to `minimizers`.
8 | const SequenceBatch &sequence_batch, uint32_t sequence_index,
9 | std::vector &minimizers) const {
10 | const uint32_t sequence_length =
11 | sequence_batch.GetSequenceLengthAt(sequence_index);
12 | const char *sequence = sequence_batch.GetSequenceAt(sequence_index);
13 |
14 | const uint64_t num_shifted_bits = 2 * (kmer_size_ - 1);  // Bit offset of the most significant base in a 2-bit-per-base k-mer.
15 | const uint64_t mask = (((uint64_t)1) << (2 * kmer_size_)) - 1;  // Keeps the low 2 * kmer_size_ bits.
16 |
17 | uint64_t seeds_in_two_strands[2] = {0, 0};  // [0] is the forward k-mer, [1] the reverse-complement k-mer.
18 | std::pair buffer[256];  // Circular buffer; only the first window_size_ slots are used.
19 | std::pair min_seed = {UINT64_MAX, UINT64_MAX};  // UINT64_MAX in .first marks "no seed".
20 |
21 | // 2 uint64_t cost 16 bytes.
22 | memset(buffer, 0xff, window_size_ * 16);
23 |
24 | int unambiguous_length = 0;
25 | int position_in_buffer = 0;
26 | int min_position = 0;
27 |
28 | for (uint32_t position = 0; position < sequence_length; ++position) {
29 | const uint8_t current_base = CharToUint8(sequence[position]);
30 | std::pair current_seed = {UINT64_MAX, UINT64_MAX};
31 |
32 | if (current_base < 4) {
33 | // Not an ambiguous base.
34 | // Forward k-mer.
35 | seeds_in_two_strands[0] =
36 | ((seeds_in_two_strands[0] << 2) | current_base) & mask;
37 | // Reverse k-mer.
38 | seeds_in_two_strands[1] =
39 | (seeds_in_two_strands[1] >> 2) |
40 | (((uint64_t)(3 ^ current_base)) << num_shifted_bits);
41 |
42 | if (seeds_in_two_strands[0] == seeds_in_two_strands[1]) {
43 | // Skip "symmetric k-mers" as we don't know its strand.
44 | continue;  // Note: this also skips the buffer write and the position_in_buffer advance below.
45 | }
46 |
47 | uint64_t hash_keys_for_two_seeds[2] = {
48 | Hash64(seeds_in_two_strands[0], mask),
49 | Hash64(seeds_in_two_strands[1], mask)};
50 |
51 | uint64_t strand =  // 0 when the forward hash is smaller, 1 otherwise.
52 | hash_keys_for_two_seeds[0] < hash_keys_for_two_seeds[1] ? 0 : 1;
53 |
54 | ++unambiguous_length;
55 |
56 | if (unambiguous_length >= kmer_size_) {  // A full k-mer of unambiguous bases is available.
57 | current_seed.first = Hash64(hash_keys_for_two_seeds[strand], mask);  // The chosen hash is hashed once more.
58 | current_seed.second =
59 | ((((uint64_t)sequence_index) << 32 | (uint32_t)position) << 1) |  // Layout: sequence index, position, then the strand bit.
60 | strand;
61 | }
62 | } else {
63 | unambiguous_length = 0;
64 | }
65 |
66 | // Need to do this here as appropriate position_in_buffer and
67 | // buffer[position_in_buffer] are needed below.
68 | buffer[position_in_buffer] = current_seed;
69 | if (unambiguous_length == window_size_ + kmer_size_ - 1 &&
70 | min_seed.first != UINT64_MAX && min_seed.first < current_seed.first) {
71 | // Special case for the first window - because identical k-mers are not
72 | // stored yet.
73 | for (int j = position_in_buffer + 1; j < window_size_; ++j)
74 | if (min_seed.first == buffer[j].first &&
75 | buffer[j].second != min_seed.second)
76 | minimizers.emplace_back(buffer[j]);
77 | for (int j = 0; j < position_in_buffer; ++j)
78 | if (min_seed.first == buffer[j].first &&
79 | buffer[j].second != min_seed.second)
80 | minimizers.emplace_back(buffer[j]);
81 | }
82 |
83 | if (current_seed.first <= min_seed.first) {
84 | // A new minimum; then write the old min.
85 | if (unambiguous_length >= window_size_ + kmer_size_ &&
86 | min_seed.first != UINT64_MAX) {
87 | minimizers.emplace_back(min_seed);
88 | }
89 | min_seed = current_seed;
90 | min_position = position_in_buffer;
91 | } else if (position_in_buffer == min_position) {
92 | // Old min has moved outside the window.
93 | if (unambiguous_length >= window_size_ + kmer_size_ - 1 &&
94 | min_seed.first != UINT64_MAX) {
95 | minimizers.emplace_back(min_seed);
96 | }
97 |
98 | min_seed.first = UINT64_MAX;  // Reset before rescanning the whole window for a new minimum.
99 | for (int j = position_in_buffer + 1; j < window_size_; ++j) {
100 | // The two loops are necessary when there are identical k-mers.
101 | if (min_seed.first >= buffer[j].first) {
102 | // >= is important s.t. min is always the closest k-mer.
103 | min_seed = buffer[j];
104 | min_position = j;
105 | }
106 | }
107 |
108 | for (int j = 0; j <= position_in_buffer; ++j) {
109 | if (min_seed.first >= buffer[j].first) {
110 | min_seed = buffer[j];
111 | min_position = j;
112 | }
113 | }
114 |
115 | if (unambiguous_length >= window_size_ + kmer_size_ - 1 &&
116 | min_seed.first != UINT64_MAX) {
117 | // Write identical k-mers.
118 | // These two loops make sure the output is sorted.
119 | for (int j = position_in_buffer + 1; j < window_size_; ++j)
120 | if (min_seed.first == buffer[j].first &&
121 | min_seed.second != buffer[j].second)
122 | minimizers.emplace_back(buffer[j]);
123 | for (int j = 0; j <= position_in_buffer; ++j)
124 | if (min_seed.first == buffer[j].first &&
125 | min_seed.second != buffer[j].second)
126 | minimizers.emplace_back(buffer[j]);
127 | }
128 | }
129 |
130 | ++position_in_buffer;
131 | if (position_in_buffer == window_size_) {  // Wrap around the circular buffer.
132 | position_in_buffer = 0;
133 | }
134 | }
135 |
136 | if (min_seed.first != UINT64_MAX) {  // Emit the minimum that is still pending after the last base.
137 | minimizers.emplace_back(min_seed);
138 | }
139 | }
140 |
141 | } // namespace chromap
142 |
--------------------------------------------------------------------------------
/src/minimizer_generator.h:
--------------------------------------------------------------------------------
1 | #ifndef MINIMIZER_GENERATOR_H_
2 | #define MINIMIZER_GENERATOR_H_
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include "minimizer.h"
9 | #include "sequence_batch.h"
10 |
11 | namespace chromap {
12 |
13 | class MinimizerGenerator {  // Generates minimizers with a k-mer size and window size fixed at construction.
14 | public:
15 | MinimizerGenerator() = delete;
16 |
17 | MinimizerGenerator(int kmer_size, int window_size)
18 | : kmer_size_(kmer_size), window_size_(window_size) {
19 | // 56 bits for a k-mer. So the max kmer size is 28.
20 | assert(kmer_size_ > 0 && kmer_size_ <= 28);
21 | assert(window_size_ > 0 && window_size_ < 256);  // GenerateMinimizers uses a fixed 256-entry buffer. These checks vanish when NDEBUG is defined.
22 | }
23 |
24 | ~MinimizerGenerator() = default;
25 |
26 | void GenerateMinimizers(const SequenceBatch &sequence_batch,  // Appends the minimizers of sequence `sequence_index` to `minimizers`.
27 | uint32_t sequence_index,
28 | std::vector &minimizers) const;
29 |
30 | private:
31 | const int kmer_size_;
32 | const int window_size_;
33 | };
34 |
35 | } // namespace chromap
36 |
37 | #endif // MINIMIZER_GENERATOR_H_
38 |
--------------------------------------------------------------------------------
/src/mmcache.hpp:
--------------------------------------------------------------------------------
1 | #ifndef CHROMAP_CACHE_H_
2 | #define CHROMAP_CACHE_H_
3 |
4 | #include "index.h"
5 | #include "minimizer.h"
6 | #include
7 |
8 | #define FINGER_PRINT_SIZE 103
9 |
10 | #define HEAD_MM_ARRAY_SIZE 4194304 // 2^22
11 | #define HEAD_MM_ARRAY_MASK 0x3fffff // 22 positions
12 |
13 | namespace chromap {
14 | struct _mm_cache_entry {  // One cache slot. The mm_cache constructor zeroes weight, finger_print_cnt, finger_print_cnt_sum and activated.
15 | std::vector minimizers;  // Compared against Minimizer::GetHash() when matching a read against the slot.
16 | std::vector offsets; // the distance to the next minimizer
17 | std::vector strands;  // Compared against the lowest bit of each minimizer's hit when matching.
18 | std::vector positive_candidates;
19 | std::vector negative_candidates;
20 | int weight;
21 | unsigned short finger_print_cnt[FINGER_PRINT_SIZE];
22 | int finger_print_cnt_sum;
23 | uint32_t repetitive_seed_length;  // NOTE(review): not initialized by the mm_cache constructor shown in this file.
24 | int activated;
25 | };
26 |
27 | class mm_cache {
28 | private:
29 | int cache_size;
30 | struct _mm_cache_entry *cache;
31 | int num_locks_for_cache = 1000;
32 | omp_lock_t entry_locks_omp[1000];
33 | std::mutex print_lock;
34 | int kmer_length;
35 | int update_limit;
36 | int saturate_count;
37 | uint64_t *
38 | head_mm; // the first and last minimizer for each cached minimizer vector
39 |
40 | // Compares a read's minimizers with a cache entry. 0: not match. -1: opposite order. 1: same order
41 | int IsMinimizersMatchCache(const std::vector &minimizers,
42 | const struct _mm_cache_entry &cache) {
43 | if (cache.minimizers.size() != minimizers.size()) return 0;  // different counts can never match
44 | int size = minimizers.size();
45 | int i, j;
46 | int direction = 0;
47 | for (i = 0; i < size; ++i) {  // same order: every hash and every strand bit must be equal
48 | if (cache.minimizers[i] != minimizers[i].GetHash() ||
49 | (minimizers[i].GetHit() & 1) != cache.strands[i])
50 | break;
51 | }
52 | if (i >= size) {  // loop above ran to completion, now compare the spacing
53 | for (i = 0; i < size - 1; ++i) {
54 | if (cache.offsets[i] != ((int)minimizers[i + 1].GetSequencePosition() -
55 | (int)minimizers[i].GetSequencePosition()))
56 | break;
57 | }
58 | if (i >= size - 1) direction = 1;
59 | }
60 |
61 | if (direction == 1) return 1;
62 |
63 | for (i = 0, j = size - 1; i < size; ++i, --j) {  // opposite order: walk the read's minimizers backwards
64 | if (cache.minimizers[i] != minimizers[j].GetHash() ||
65 | (minimizers[j].GetHit() & 1) == cache.strands[i])  // here the strand bits must all differ
66 | break;
67 | }
68 | if (i >= size) {
69 | for (i = 0, j = size - 1; i < size - 1; ++i, --j) {  // spacing compared from the read's last minimizer
70 | if (cache.offsets[i] !=
71 | ((int)minimizers[j].GetSequencePosition()) -
72 | ((int)minimizers[j - 1].GetSequencePosition()))
73 | break;
74 | }
75 |
76 | if (i >= size - 1) {
77 | direction = -1;
78 | }
79 | }
80 | return direction;
81 | }
82 |
83 | public:
84 | mm_cache(int size) {  // Allocates `size` cache entries plus the head_mm bit array and zeroes their counters.
85 | cache = new struct _mm_cache_entry[size];
86 | head_mm = new uint64_t[HEAD_MM_ARRAY_SIZE];
87 | cache_size = size;
88 | // memset(cache, 0, sizeof(cache[0]) * size) ;
89 | for (int i = 0; i < size; ++i) {  // field-by-field reset; repetitive_seed_length is not set here
90 | cache[i].weight = 0;
91 | memset(cache[i].finger_print_cnt, 0,
92 | sizeof(unsigned short) * FINGER_PRINT_SIZE);
93 | cache[i].finger_print_cnt_sum = 0;
94 | cache[i].activated = 0;
95 | }
96 | memset(head_mm, 0, sizeof(uint64_t) * HEAD_MM_ARRAY_SIZE);
97 | update_limit = 10;  // NOTE(review): assigned but not read anywhere in the visible class
98 | saturate_count = 100;
99 |
100 | // initialize the array of OpenMP locks (kmer_length stays unset until SetKmerLength() is called)
101 | for (int i = 0; i < num_locks_for_cache; ++i) {
102 | omp_init_lock(&entry_locks_omp[i]);
103 | }
104 | }
105 |
106 | ~mm_cache() {  // Releases both arrays allocated by the constructor, then the locks.
107 | delete[] cache;
108 | delete[] head_mm;
109 |
110 | // destroy the OpenMP locks used for parallelizing cache update
111 | for (int i = 0; i < num_locks_for_cache; ++i) {
112 | omp_destroy_lock(&entry_locks_omp[i]);
113 | }
114 | }
115 |
116 | void SetKmerLength(int kl) { kmer_length = kl; }  // kmer_length is read by Query() and Update(); set it before using them
117 |
118 | // Look up the read's minimizers; on a hit, fill the metadata's candidates and return the entry index. -1 if failed.
119 | int Query(MappingMetadata &mapping_metadata, uint32_t read_len) {
120 | const std::vector &minimizers = mapping_metadata.minimizers_;
121 | std::vector &pos_candidates =
122 | mapping_metadata.positive_candidates_;
123 | std::vector &neg_candidates =
124 | mapping_metadata.negative_candidates_;
125 | uint32_t &repetitive_seed_length = mapping_metadata.repetitive_seed_length_;
126 |
127 | int i;
128 | int msize = minimizers.size();
129 | if (msize == 0) return -1;
130 | if ((head_mm[(minimizers[0].GetHash() >> 6) & HEAD_MM_ARRAY_MASK] &
131 | (1ull << (minimizers[0].GetHash() & 0x3f))) == 0)  // quick reject: first minimizer was never marked by Update()
132 | return -1;
133 | uint64_t h = 0;
134 | // for (i = 0 ; i < msize; ++i)
135 | // h += (minimizers[i].first);
136 | if (msize == 1) {
137 | h = (minimizers[0].GetHash());
138 | } else {
139 | h = minimizers[0].GetHash() + minimizers[msize - 1].GetHash();  // same slot hash as Update(): first + last
140 | }
141 |
142 | int hidx = h % cache_size;
143 | int direction = IsMinimizersMatchCache(minimizers, cache[hidx]);  // NOTE(review): entry is read without taking its lock
144 | if (direction == 1) {
145 | pos_candidates = cache[hidx].positive_candidates;
146 | neg_candidates = cache[hidx].negative_candidates;
147 | repetitive_seed_length = cache[hidx].repetitive_seed_length;
148 | int size = pos_candidates.size();
149 | int shift = (int)minimizers[0].GetSequencePosition();  // undo the shift that Update() applied when storing
150 | for (i = 0; i < size; ++i) {
151 | uint64_t rid = pos_candidates[i].position >> 32;  // high 32 bits are kept, low 32 bits are adjusted
152 | int rpos = (int)pos_candidates[i].position;
153 | pos_candidates[i].position = (rid << 32) + (uint32_t)(rpos - shift);
154 | }
155 | size = neg_candidates.size();
156 | for (i = 0; i < size; ++i) neg_candidates[i].position += shift;
157 | return hidx;
158 | } else if (direction == -1) { // The "read" is on the other direction of
159 | // the cached "read"
160 | int size = cache[hidx].negative_candidates.size();
161 | // Start position of the last minimizer should equal the first minimizer's
162 | // end position in rc "read".
163 | int shift = read_len -
164 | ((int)minimizers[msize - 1].GetSequencePosition()) - 1 +
165 | kmer_length - 1;
166 |
167 | pos_candidates = cache[hidx].negative_candidates;  // cached negative candidates become this read's positive ones
168 | for (i = 0; i < size; ++i) {
169 | uint64_t rid = cache[hidx].negative_candidates[i].position >> 32;
170 | int rpos = (int)cache[hidx].negative_candidates[i].position;
171 | pos_candidates[i].position =
172 | (rid << 32) + (uint32_t)(rpos + shift - read_len + 1);
173 | }
174 | size = cache[hidx].positive_candidates.size();
175 | neg_candidates = cache[hidx].positive_candidates;  // and the cached positive ones become negative
176 | for (i = 0; i < size; ++i)
177 | neg_candidates[i].position =
178 | cache[hidx].positive_candidates[i].position - shift + read_len - 1;
179 | repetitive_seed_length = cache[hidx].repetitive_seed_length;
180 |
181 | return hidx;
182 | } else {
183 | return -1;
184 | }
185 | }
186 |
187 | void Update(const std::vector &minimizers,  // Records one read in the slot statistics and,
188 | std::vector &pos_candidates,  // when the slot's current content keeps losing (weight < 0),
189 | std::vector &neg_candidates,  // replaces the slot with this read's minimizers and candidates.
190 | uint32_t repetitive_seed_length,
191 | bool debug=false) {  // debug=true prints the newly stored entry to std::cout
192 | int i;
193 | int msize = minimizers.size();
194 |
195 | uint64_t h = 0; // for hash
196 | uint64_t f = 0; // for finger printing
197 |
198 | if (msize == 0)
199 | return;
200 | else if (msize == 1) {
201 | h = f = (minimizers[0].GetHash());
202 | } else {
203 | h = minimizers[0].GetHash() + minimizers[msize - 1].GetHash();
204 | f = minimizers[0].GetHash() ^ minimizers[msize - 1].GetHash();
205 | }
206 | int hidx = h % cache_size;
207 | int finger_print = f % FINGER_PRINT_SIZE;
208 |
209 | // beginning of locking phase - make sure to release it wherever we exit
210 | int lock_index = hidx % num_locks_for_cache;  // several slots share one lock
211 | omp_set_lock(&entry_locks_omp[lock_index]);
212 |
213 | ++cache[hidx].finger_print_cnt[finger_print];  // counters advance even when the slot is saturated
214 | ++cache[hidx].finger_print_cnt_sum;
215 |
216 | // case 1: already saturated
217 | if (cache[hidx].finger_print_cnt_sum > saturate_count){
218 | omp_unset_lock(&entry_locks_omp[lock_index]);
219 | return;
220 | }
221 |
222 | // case 2: no heavy hitter or not enough yet
223 | if (cache[hidx].finger_print_cnt_sum < 10 ||  // NOTE(review): literal 10; the update_limit member (also 10) is not used
224 | (int)cache[hidx].finger_print_cnt[finger_print] * 5 <
225 | cache[hidx].finger_print_cnt_sum) {  // this finger print must hold at least 1/5 of the slot's hits
226 | omp_unset_lock(&entry_locks_omp[lock_index]);
227 | return;
228 | }
229 |
230 | int direction = IsMinimizersMatchCache(minimizers, cache[hidx]);
231 | if (direction != 0)
232 | ++cache[hidx].weight;
233 | else
234 | --cache[hidx].weight;
235 | cache[hidx].activated = 1;
236 |
237 | // Renew the cache
238 | if (cache[hidx].weight < 0) {
239 | cache[hidx].weight = 1;
240 | cache[hidx].minimizers.resize(msize);
241 |
242 | if (msize == 0) {  // NOTE(review): unreachable, msize == 0 already returned at the top
243 | cache[hidx].offsets.clear();
244 | cache[hidx].strands.clear();
245 | omp_unset_lock(&entry_locks_omp[lock_index]);
246 | return;
247 | }
248 |
249 | int size = pos_candidates.size();
250 | int shift = (int)minimizers[0].GetSequencePosition();
251 |
252 | // Do not cache if it is too near the start. On bail-out the match vectors are emptied but weight stays 1.
253 | for (i = 0; i < size; ++i)
254 | if ((int)pos_candidates[i].position < kmer_length + shift) {
255 | cache[hidx].offsets.clear();
256 | cache[hidx].strands.clear();
257 | cache[hidx].minimizers.clear();
258 |
259 | omp_unset_lock(&entry_locks_omp[lock_index]);
260 | return;
261 | }
262 |
263 | size = neg_candidates.size();
264 | for (i = 0; i < size; ++i)
265 | if ((int)neg_candidates[i].position -
266 | ((int)minimizers[msize - 1].GetSequencePosition()) <
267 | kmer_length + shift) {
268 | cache[hidx].offsets.clear();
269 | cache[hidx].strands.clear();
270 | cache[hidx].minimizers.clear();
271 |
272 | omp_unset_lock(&entry_locks_omp[lock_index]);
273 | return;
274 | }
275 | cache[hidx].offsets.resize(msize - 1);
276 | cache[hidx].strands.resize(msize);
277 | for (i = 0; i < msize; ++i) {
278 | cache[hidx].minimizers[i] = minimizers[i].GetHash();
279 | cache[hidx].strands[i] = (minimizers[i].GetHit() & 1);
280 | }
281 | for (i = 0; i < msize - 1; ++i) {
282 | cache[hidx].offsets[i] =
283 | ((int)minimizers[i + 1].GetSequencePosition()) -
284 | ((int)minimizers[i].GetSequencePosition());
285 | }
286 | std::vector().swap(cache[hidx].positive_candidates);  // swap with an empty temporary to drop old storage
287 | std::vector().swap(cache[hidx].negative_candidates);
288 | cache[hidx].positive_candidates = pos_candidates;
289 | cache[hidx].negative_candidates = neg_candidates;
290 | cache[hidx].repetitive_seed_length = repetitive_seed_length;
291 |
292 | // adjust the candidate position by the first minimizer's read position; Query() reverses this.
293 | size = cache[hidx].positive_candidates.size();
294 | for (i = 0; i < size; ++i)
295 | cache[hidx].positive_candidates[i].position += shift;
296 | size = cache[hidx].negative_candidates.size();
297 | for (i = 0; i < size; ++i)
298 | cache[hidx].negative_candidates[i].position -= shift;
299 |
300 | // Debugging output (candidate stored in cache)
301 | if (debug) {
302 | print_lock.lock();  // held in addition to the entry lock so lines from different threads do not interleave
303 | std::cout << "[DEBUG][CACHE][1] hidx = " << hidx << std::endl;
304 | std::cout << "[DEBUG][CACHE][2]" << " pos.size() = "
305 | << cache[hidx].positive_candidates.size()
306 | << " , " << "neg.size() = "
307 | << cache[hidx].negative_candidates.size()
308 | << " , msize = " << msize << std::endl;
309 | std::cout << "[DEBUG][CACHE][3] ";
310 | for (const auto &minimizer : minimizers) {
311 | std::cout << minimizer.GetHash() << " ";
312 | } std::cout << std::endl;
313 |
314 | for (size_t j = 0; j < cache[hidx].positive_candidates.size(); ++j) {
315 | std::cout << "[DEBUG][CACHE][+] "
316 | << "hidx = " << hidx
317 | << " , cand_ref_seq = " << cache[hidx].positive_candidates[j].GetReferenceSequenceIndex()
318 | << " , cand_ref_pos = " << cache[hidx].positive_candidates[j].GetReferenceSequencePosition()
319 | << " , support = " << unsigned(cache[hidx].positive_candidates[j].GetCount()) << std::endl;
320 | }
321 |
322 | for (size_t j = 0; j < cache[hidx].negative_candidates.size(); ++j) {
323 | std::cout << "[DEBUG][CACHE][-] "
324 | << "hidx = " << hidx
325 | << " , cand_ref_seq = " << cache[hidx].negative_candidates[j].GetReferenceSequenceIndex()
326 | << " , cand_ref_pos = " << cache[hidx].negative_candidates[j].GetReferenceSequencePosition()
327 | << " , support = " << unsigned(cache[hidx].negative_candidates[j].GetCount()) << std::endl;
328 | }
329 | print_lock.unlock();
330 | }
331 |
332 | // Update head mm array: mark the first and last minimizer hash so Query() can pre-filter. Bits are never cleared here.
333 | head_mm[(minimizers[0].GetHash() >> 6) & HEAD_MM_ARRAY_MASK] |=
334 | (1ull << (minimizers[0].GetHash() & 0x3f));
335 | head_mm[(minimizers[msize - 1].GetHash() >> 6) & HEAD_MM_ARRAY_MASK] |=
336 | (1ull << (minimizers[msize - 1].GetHash() & 0x3f));
337 | }
338 | omp_unset_lock(&entry_locks_omp[lock_index]);
339 | }
340 |
341 | void DirectUpdateWeight(int idx, int weight) { cache[idx].weight += weight; }  // adds `weight` to entry idx; no bounds check, no lock taken
342 |
343 | uint64_t GetMemoryBytes() {  // Bytes held by all cache entries: each struct plus the capacity of its vectors.
344 | uint64_t ret = 0;  // head_mm and the lock array are not included in the total
345 | for (int i = 0; i < cache_size; ++i) {
346 | ret += sizeof(cache[i]) +
347 | cache[i].minimizers.capacity() * sizeof(uint64_t) +
348 | cache[i].offsets.capacity() * sizeof(int) +
349 | cache[i].strands.capacity() * sizeof(cache[i].strands[0]) +  // previously left out of the total
350 | cache[i].positive_candidates.capacity() * sizeof(Candidate) +
351 | cache[i].negative_candidates.capacity() * sizeof(Candidate);
352 | }
353 | return ret;
354 | }
355 |
356 | // How many reads from a batch we want to use to update the cache.
357 | // Paired-end data has twice the amount of reads, so its block size is half and the threshold drops sooner.
358 | uint32_t GetUpdateThreshold(uint32_t num_loaded_reads,
359 | uint64_t num_reads,
360 | bool paired,
361 | double cache_update_param
362 | ) {
363 | const uint32_t block = paired ? 2500000 : 5000000;
364 |
365 | if (num_reads <= block)  // up to one block of reads: every loaded read may update the cache
366 | return num_loaded_reads;
367 | else
368 | return num_loaded_reads / (1 + (cache_update_param * (num_reads / block)));  // num_reads / block is integer division; the quotient is computed in double and converted to uint32_t
369 | }
370 |
371 | void PrintStats() {  // Prints one line per cache entry to stdout.
372 | for (int i = 0; i < cache_size; ++i) {
373 | printf("%d %d %d %d ", cache[i].weight, cache[i].finger_print_cnt_sum,  // weight, total hits, candidate count, activated
374 | int(cache[i].positive_candidates.size() +
375 | cache[i].negative_candidates.size()),
376 | cache[i].activated);
377 | int tmp = 0;  // largest single finger print counter of this entry
378 | for (int j = 0; j < FINGER_PRINT_SIZE; ++j)
379 | if (cache[i].finger_print_cnt[j] > tmp)
380 | tmp = cache[i].finger_print_cnt[j];
381 | printf("%d", tmp);
382 | for (int j = 0; j < FINGER_PRINT_SIZE; ++j)  // followed by every finger print counter
383 | printf(" %u", cache[i].finger_print_cnt[j]);
384 | printf("\n");
385 | }
386 | }
387 | };
388 | } // namespace chromap
389 |
390 | #endif
391 |
--------------------------------------------------------------------------------
/src/paf_mapping.h:
--------------------------------------------------------------------------------
1 | #ifndef PAFMAPPING_H_
2 | #define PAFMAPPING_H_
3 |
4 | #include
5 |
6 | #include "mapping.h"
7 |
8 | namespace chromap {
9 |
10 | // Single-end mapping record with its temp-file serialization. When direction = 1, strand is positive
11 | class PAFMapping : public Mapping {
12 | public:
13 | uint32_t read_id_;
14 | std::string read_name_;
15 | uint16_t read_length_;
16 | uint32_t fragment_start_position_;
17 | uint16_t fragment_length_;
18 | uint8_t mapq_ : 6, direction_ : 1, is_unique_ : 1;  // three bit-fields sharing one byte
19 | uint8_t num_dups_;
20 | PAFMapping() : num_dups_(0) {}  // only num_dups_ is initialized; the other fields are left unset
21 | PAFMapping(uint32_t read_id, const std::string &read_name,
22 | uint16_t read_length, uint32_t fragment_start_position,
23 | uint16_t fragment_length, uint8_t mapq, uint8_t direction,
24 | uint8_t is_unique, uint8_t num_dups)
25 | : read_id_(read_id),
26 | read_name_(read_name),
27 | read_length_(read_length),
28 | fragment_start_position_(fragment_start_position),
29 | fragment_length_(fragment_length),
30 | mapq_(mapq),
31 | direction_(direction),
32 | is_unique_(is_unique),
33 | num_dups_(num_dups){};
34 | bool operator<(const PAFMapping &m) const {  // lexicographic order, fragment start first
35 | return std::tie(fragment_start_position_, fragment_length_, mapq_,
36 | direction_, is_unique_, read_id_, read_length_) <
37 | std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq_,
38 | m.direction_, m.is_unique_, m.read_id_, m.read_length_);
39 | }
40 | bool operator==(const PAFMapping &m) const {  // equality looks at the start position only
41 | return std::tie(fragment_start_position_) ==
42 | std::tie(m.fragment_start_position_);
43 | }
44 | bool IsSamePosition(const PAFMapping &m) const {  // same comparison as operator==
45 | return std::tie(fragment_start_position_) ==
46 | std::tie(m.fragment_start_position_);
47 | }
48 | uint64_t GetBarcode() const { return 0; }  // this record type carries no barcode
49 | void Tn5Shift() {  // positive strand: start moves by +4; otherwise the length shrinks by 5
50 | if (direction_ == 1) {
51 | fragment_start_position_ += 4;
52 | } else {
53 | fragment_length_ -= 5;  // uint16_t: wraps around if the length is below 5
54 | }
55 | }
56 | bool IsPositiveStrand() const { return direction_ > 0 ? true : false; }
57 | uint32_t GetStartPosition() const { // inclusive
58 | return fragment_start_position_;
59 | }
60 | uint32_t GetEndPosition() const { // exclusive
61 | return fragment_start_position_ + fragment_length_;
62 | }
63 | uint16_t GetByteSize() const {  // fixed-size fields plus the name; the sum is narrowed to uint16_t
64 | return 2 * sizeof(uint32_t) + 2 * sizeof(uint16_t) + 2 * sizeof(uint8_t) +
65 | read_name_.length() * sizeof(char);
66 | }
67 | size_t WriteToFile(FILE *temp_mapping_output_file) const {  // field order here must mirror LoadFromFile
68 | size_t num_written_bytes = 0;  // accumulates fwrite() results, which are element counts rather than bytes
69 | num_written_bytes +=
70 | fwrite(&read_id_, sizeof(uint32_t), 1, temp_mapping_output_file);
71 | uint16_t read_name_length = read_name_.length();  // name length is stored in 16 bits
72 | num_written_bytes += fwrite(&read_name_length, sizeof(uint16_t), 1,
73 | temp_mapping_output_file);
74 | num_written_bytes += fwrite(read_name_.data(), sizeof(char),
75 | read_name_length, temp_mapping_output_file);
76 | num_written_bytes +=
77 | fwrite(&read_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
78 | num_written_bytes += fwrite(&fragment_start_position_, sizeof(uint32_t), 1,
79 | temp_mapping_output_file);
80 | num_written_bytes += fwrite(&fragment_length_, sizeof(uint16_t), 1,
81 | temp_mapping_output_file);
82 | uint8_t mapq_direction_is_unique =
83 | (mapq_ << 2) | (direction_ << 1) | is_unique_;  // bits 7..2 mapq, bit 1 direction, bit 0 is_unique
84 | num_written_bytes += fwrite(&mapq_direction_is_unique, sizeof(uint8_t), 1,
85 | temp_mapping_output_file);
86 | num_written_bytes +=
87 | fwrite(&num_dups_, sizeof(uint8_t), 1, temp_mapping_output_file);
88 | return num_written_bytes;
89 | }
90 | size_t LoadFromFile(FILE *temp_mapping_output_file) {  // reads back what WriteToFile wrote, in the same order
91 | size_t num_read_bytes = 0;  // accumulates fread() results (element counts); short reads are not reported otherwise
92 | num_read_bytes +=
93 | fread(&read_id_, sizeof(uint32_t), 1, temp_mapping_output_file);
94 | uint16_t read_name_length = 0;
95 | num_read_bytes +=
96 | fread(&read_name_length, sizeof(uint16_t), 1, temp_mapping_output_file);
97 | read_name_ = std::string(read_name_length, '\0');  // size the string first, then read into its buffer
98 | num_read_bytes += fread(&(read_name_[0]), sizeof(char), read_name_length,
99 | temp_mapping_output_file);
100 | num_read_bytes +=
101 | fread(&read_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
102 | num_read_bytes += fread(&fragment_start_position_, sizeof(uint32_t), 1,
103 | temp_mapping_output_file);
104 | num_read_bytes +=
105 | fread(&fragment_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
106 | uint8_t mapq_direction_is_unique = 0;
107 | num_read_bytes += fread(&mapq_direction_is_unique, sizeof(uint8_t), 1,
108 | temp_mapping_output_file);
109 | mapq_ = (mapq_direction_is_unique >> 2);  // unpack in the layout used by WriteToFile
110 | direction_ = (mapq_direction_is_unique >> 1) & 1;
111 | is_unique_ = mapq_direction_is_unique & 1;
112 | num_read_bytes +=
113 | fread(&num_dups_, sizeof(uint8_t), 1, temp_mapping_output_file);
114 | return num_read_bytes;
115 | }
116 | };
117 |
118 | class PairedPAFMapping : public Mapping {  // Paired-end mapping record with its temp-file serialization.
119 | public:
120 | uint32_t read_id_;
121 | std::string read1_name_;
122 | std::string read2_name_;
123 | uint16_t read1_length_;
124 | uint16_t read2_length_;
125 | uint32_t fragment_start_position_;
126 | uint16_t fragment_length_;
127 | uint16_t positive_alignment_length_;
128 | uint16_t negative_alignment_length_;
129 | uint8_t mapq_;
130 | uint16_t mapq1_ : 6, mapq2_ : 6, direction_ : 1, is_unique_ : 1,
131 | reserved_ : 2;  // reserved_ is never assigned nor serialized in this class
132 | uint8_t num_dups_;
133 | // uint8_t mapq; // least significant bit saves the direction of mapping
134 | PairedPAFMapping() : num_dups_(0) {}  // only num_dups_ is initialized
135 | PairedPAFMapping(uint32_t read_id, std::string read1_name,
136 | std::string read2_name, uint16_t read1_length,
137 | uint16_t read2_length, uint32_t fragment_start_position,
138 | uint16_t fragment_length, uint16_t positive_alignment_length,
139 | uint16_t negative_alignment_length, uint8_t mapq,
140 | uint16_t mapq1, uint16_t mapq2, uint16_t direction,
141 | uint16_t is_unique, uint8_t num_dups)
142 | : read_id_(read_id),
143 | read1_name_(read1_name),
144 | read2_name_(read2_name),
145 | read1_length_(read1_length),
146 | read2_length_(read2_length),
147 | fragment_start_position_(fragment_start_position),
148 | fragment_length_(fragment_length),
149 | positive_alignment_length_(positive_alignment_length),
150 | negative_alignment_length_(negative_alignment_length),
151 | mapq_(mapq),
152 | mapq1_(mapq1),
153 | mapq2_(mapq2),
154 | direction_(direction),
155 | is_unique_(is_unique),
156 | num_dups_(num_dups) {}
157 | bool operator<(const PairedPAFMapping &m) const {  // lexicographic order, fragment start first
158 | return std::tie(fragment_start_position_, fragment_length_, mapq1_, mapq2_,
159 | direction_, is_unique_, read_id_,
160 | positive_alignment_length_, negative_alignment_length_) <
161 | std::tie(m.fragment_start_position_, m.fragment_length_, m.mapq1_,
162 | m.mapq2_, m.direction_, m.is_unique_, m.read_id_,
163 | m.positive_alignment_length_, m.negative_alignment_length_);
164 | }
165 | bool operator==(const PairedPAFMapping &m) const {  // equality looks at fragment start and length only
166 | return std::tie(fragment_start_position_, fragment_length_) ==
167 | std::tie(m.fragment_start_position_, m.fragment_length_);
168 | }
169 | bool IsSamePosition(const PairedPAFMapping &m) const {  // same comparison as operator==
170 | return std::tie(fragment_start_position_, fragment_length_) ==
171 | std::tie(m.fragment_start_position_, m.fragment_length_);
172 | }
173 | uint64_t GetBarcode() const { return 0; }  // this record type carries no barcode
174 | void Tn5Shift() {  // start +4 and end -5, hence fragment length -9; the uint16_t lengths wrap if too small
175 | fragment_start_position_ += 4;
176 | positive_alignment_length_ -= 4;
177 | fragment_length_ -= 9;
178 | negative_alignment_length_ -= 5;
179 | }
180 | bool IsPositiveStrand() const { return direction_ > 0 ? true : false; }
181 | uint32_t GetStartPosition() const { // inclusive
182 | return fragment_start_position_;
183 | }
184 | uint32_t GetEndPosition() const { // exclusive
185 | return fragment_start_position_ + fragment_length_;
186 | }
187 | uint16_t GetByteSize() const {  // fixed-size fields plus both names; the sum is narrowed to uint16_t
188 | return 2 * sizeof(uint32_t) + 6 * sizeof(uint16_t) + 2 * sizeof(uint8_t) +
189 | (read1_name_.length() + read2_name_.length()) * sizeof(char);
190 | }
191 | size_t WriteToFile(FILE *temp_mapping_output_file) const {  // field order here must mirror LoadFromFile
192 | size_t num_written_bytes = 0;  // accumulates fwrite() results, which are element counts rather than bytes
193 | num_written_bytes +=
194 | fwrite(&read_id_, sizeof(uint32_t), 1, temp_mapping_output_file);
195 | uint16_t read1_name_length = read1_name_.length();  // name lengths are stored in 16 bits
196 | num_written_bytes += fwrite(&read1_name_length, sizeof(uint16_t), 1,
197 | temp_mapping_output_file);
198 | num_written_bytes += fwrite(read1_name_.data(), sizeof(char),
199 | read1_name_length, temp_mapping_output_file);
200 | uint16_t read2_name_length = read2_name_.length();
201 | num_written_bytes += fwrite(&read2_name_length, sizeof(uint16_t), 1,
202 | temp_mapping_output_file);
203 | num_written_bytes += fwrite(read2_name_.data(), sizeof(char),
204 | read2_name_length, temp_mapping_output_file);
205 | num_written_bytes +=
206 | fwrite(&read1_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
207 | num_written_bytes +=
208 | fwrite(&read2_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
209 | num_written_bytes += fwrite(&fragment_start_position_, sizeof(uint32_t), 1,
210 | temp_mapping_output_file);
211 | num_written_bytes += fwrite(&fragment_length_, sizeof(uint16_t), 1,
212 | temp_mapping_output_file);
213 | num_written_bytes += fwrite(&positive_alignment_length_, sizeof(uint16_t),
214 | 1, temp_mapping_output_file);
215 | num_written_bytes += fwrite(&negative_alignment_length_, sizeof(uint16_t),
216 | 1, temp_mapping_output_file);
217 | num_written_bytes +=
218 | fwrite(&mapq_, sizeof(uint8_t), 1, temp_mapping_output_file);
219 | uint16_t mapq1_mapq2_direction_is_unique =  // bits 15..10 mapq1, 9..4 mapq2, 3 direction, 2 is_unique
220 | (mapq1_ << 10) | (mapq2_ << 4) | (direction_ << 3) | (is_unique_ << 2);
221 | num_written_bytes += fwrite(&mapq1_mapq2_direction_is_unique,
222 | sizeof(uint16_t), 1, temp_mapping_output_file);
223 | num_written_bytes +=
224 | fwrite(&num_dups_, sizeof(uint8_t), 1, temp_mapping_output_file);
225 | return num_written_bytes;
226 | }
227 | size_t LoadFromFile(FILE *temp_mapping_output_file) {  // reads back what WriteToFile wrote, in the same order
228 | size_t num_read_bytes = 0;  // accumulates fread() results (element counts); short reads are not reported otherwise
229 | num_read_bytes +=
230 | fread(&read_id_, sizeof(uint32_t), 1, temp_mapping_output_file);
231 | uint16_t read1_name_length = 0;
232 | num_read_bytes += fread(&read1_name_length, sizeof(uint16_t), 1,
233 | temp_mapping_output_file);
234 | read1_name_ = std::string(read1_name_length, '\0');  // size the string first, then read into its buffer
235 | num_read_bytes += fread(&(read1_name_[0]), sizeof(char), read1_name_length,
236 | temp_mapping_output_file);
237 | uint16_t read2_name_length = 0;
238 | num_read_bytes += fread(&read2_name_length, sizeof(uint16_t), 1,
239 | temp_mapping_output_file);
240 | read2_name_ = std::string(read2_name_length, '\0');
241 | num_read_bytes += fread(&(read2_name_[0]), sizeof(char), read2_name_length,
242 | temp_mapping_output_file);
243 | num_read_bytes +=
244 | fread(&read1_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
245 | num_read_bytes +=
246 | fread(&read2_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
247 | num_read_bytes += fread(&fragment_start_position_, sizeof(uint32_t), 1,
248 | temp_mapping_output_file);
249 | num_read_bytes +=
250 | fread(&fragment_length_, sizeof(uint16_t), 1, temp_mapping_output_file);
251 | num_read_bytes += fread(&positive_alignment_length_, sizeof(uint16_t), 1,
252 | temp_mapping_output_file);
253 | num_read_bytes += fread(&negative_alignment_length_, sizeof(uint16_t), 1,
254 | temp_mapping_output_file);
255 | num_read_bytes +=
256 | fread(&mapq_, sizeof(uint8_t), 1, temp_mapping_output_file);
257 | uint16_t mapq1_mapq2_direction_is_unique = 0;
258 | num_read_bytes += fread(&mapq1_mapq2_direction_is_unique, sizeof(uint16_t),
259 | 1, temp_mapping_output_file);
260 | mapq1_ = (mapq1_mapq2_direction_is_unique >> 10);
261 | mapq2_ = (mapq1_mapq2_direction_is_unique >> 4) & 0x3f;  // explicit mask: a shift pair in promoted int keeps the mapq1 bits
262 | direction_ = (mapq1_mapq2_direction_is_unique >> 3) & 1;
263 | is_unique_ = (mapq1_mapq2_direction_is_unique >> 2) & 1;
264 | num_read_bytes +=
265 | fread(&num_dups_, sizeof(uint8_t), 1, temp_mapping_output_file);
266 | return num_read_bytes;
267 | }
268 | };
269 |
270 | } // namespace chromap
271 |
272 | #endif // PAFMAPPING_H_
273 |
--------------------------------------------------------------------------------
/src/paired_end_mapping_metadata.h:
--------------------------------------------------------------------------------
1 | #ifndef PAIRED_END_MAPPING_METADATA_H_
2 | #define PAIRED_END_MAPPING_METADATA_H_
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include "mapping_metadata.h"
9 |
10 | namespace chromap {
11 |
12 | class PairedEndMappingMetadata {  // Per-read-pair working state: one MappingMetadata per end plus four best-mapping lists.
13 | public:
14 | inline void PreparedForMappingNextReadPair(int reserve_size) {  // resets both ends and the four lists for a new pair
15 | mapping_metadata1_.PrepareForMappingNextRead(reserve_size);
16 | mapping_metadata2_.PrepareForMappingNextRead(reserve_size);
17 |
18 | F1R2_best_mappings_.clear();
19 | F2R1_best_mappings_.clear();
20 | F1F2_best_mappings_.clear();
21 | R1R2_best_mappings_.clear();
22 |
23 | F1R2_best_mappings_.reserve(reserve_size);
24 | F2R1_best_mappings_.reserve(reserve_size);
25 | F1F2_best_mappings_.reserve(reserve_size);
26 | R1R2_best_mappings_.reserve(reserve_size);
27 | }  // the error minima and mapping counts are not reset here
28 |
29 | inline void MoveCandidiatesToBuffer() {  // forwards to both ends
30 | mapping_metadata1_.MoveCandidiatesToBuffer();
31 | mapping_metadata2_.MoveCandidiatesToBuffer();
32 | }
33 |
34 | // Callback function to update all candidates; applied to end 1 first, then end 2.
35 | inline void UpdateCandidates(void (*Update)(std::vector &)) {
36 | mapping_metadata1_.UpdateCandidates(Update);
37 | mapping_metadata2_.UpdateCandidates(Update);
38 | }
39 |
40 | inline void SortMappingsByPositions() {  // forwards to both ends
41 | mapping_metadata1_.SortMappingsByPositions();
42 | mapping_metadata2_.SortMappingsByPositions();
43 | }
44 | // inline void ClearAndReserveMinimizers(int reserve_size) {
45 | // mapping_metadata1_.minimizers_.clear();
46 | // mapping_metadata2_.minimizers_.clear();
47 | // mapping_metadata1_.minimizers_.reserve(reserve_size);
48 | // mapping_metadata2_.minimizers_.reserve(reserve_size);
49 | //}
50 |
51 | inline bool BothEndsHaveMinimizers() const {  // true only when neither end's minimizer list is empty
52 | return !mapping_metadata1_.minimizers_.empty() &&
53 | !mapping_metadata2_.minimizers_.empty();
54 | }
55 |
56 | inline int GetMinSumErrors() const { return min_sum_errors_; }
57 | inline int GetSecondMinSumErrors() const { return second_min_sum_errors_; }
58 | inline int GetNumBestMappings() const { return num_best_mappings_; }
59 | inline int GetNumSecondBestMappings() const {
60 | return num_second_best_mappings_;
61 | }
62 |
63 | // TODO: think how to deal with the code copy. (const overload; the non-const one below has the same body)
64 | inline const std::vector> &GetBestMappings(
65 | const Strand first_mapping_strand,
66 | const Strand second_mapping_strand) const {
67 | if (first_mapping_strand == kPositive) {
68 | if (second_mapping_strand == kPositive) {
69 | return F1F2_best_mappings_;  // positive / positive
70 | }
71 | return F1R2_best_mappings_;  // positive / not positive
72 | } else {
73 | if (second_mapping_strand == kPositive) {
74 | return F2R1_best_mappings_;  // not positive / positive
75 | }
76 | return R1R2_best_mappings_;  // neither positive
77 | }
78 | }
79 |
80 | inline std::vector> &GetBestMappings(  // mutable overload, same strand-to-list selection as above
81 | const Strand first_mapping_strand, const Strand second_mapping_strand) {
82 | if (first_mapping_strand == kPositive) {
83 | if (second_mapping_strand == kPositive) {
84 | return F1F2_best_mappings_;
85 | }
86 | return F1R2_best_mappings_;
87 | } else {
88 | if (second_mapping_strand == kPositive) {
89 | return F2R1_best_mappings_;
90 | }
91 | return R1R2_best_mappings_;
92 | }
93 | }
94 |
95 | inline void SetMinSumErrors(int min_sum_errors) {
96 | min_sum_errors_ = min_sum_errors;
97 | }
98 | inline void SetSecondMinSumErrors(int second_min_sum_errors) {
99 | second_min_sum_errors_ = second_min_sum_errors;
100 | }
101 | inline void SetNumBestMappings(int num_best_mappings) {
102 | num_best_mappings_ = num_best_mappings;
103 | }
104 | inline void SetNumSecondBestMappings(int num_second_best_mappings) {
105 | num_second_best_mappings_ = num_second_best_mappings;
106 | }
107 |
108 | protected:
109 | MappingMetadata mapping_metadata1_;  // state of the first end
110 | MappingMetadata mapping_metadata2_;  // state of the second end
111 |
112 | int min_sum_errors_, second_min_sum_errors_;  // no in-class initializer; written through the setters above
113 | int num_best_mappings_, num_second_best_mappings_;
114 |
115 | std::vector> F1R2_best_mappings_;
116 | std::vector> F2R1_best_mappings_;
117 | std::vector> F1F2_best_mappings_;
118 | std::vector> R1R2_best_mappings_;
119 |
120 | friend class CandidateProcessor;
121 | template
122 | friend class MappingGenerator;
123 | friend class Chromap;
124 | };
125 |
126 | } // namespace chromap
127 |
128 | #endif // PAIRED_END_MAPPING_METADATA_H_
129 |
--------------------------------------------------------------------------------
/src/pairs_mapping.h:
--------------------------------------------------------------------------------
1 | #ifndef PAIRSMAPPING_H_
2 | #define PAIRSMAPPING_H_
3 |
4 | #include
5 |
6 | #include "mapping.h"
7 |
8 | namespace chromap {
9 |
10 | // Format for pairtools for HiC data.
11 | class PairsMapping : public Mapping {
12 | public:
13 | uint32_t read_id_;
14 | std::string read_name_;
15 | uint64_t cell_barcode_;
16 | int rid1_;
17 | int rid2_;
18 | uint32_t pos1_;
19 | uint32_t pos2_;
20 | int strand1_; // 1-positive. 0-negative
21 | int strand2_;
22 | uint16_t mapq_ : 8, is_unique_ : 1, num_dups_ : 7;
23 |
24 | PairsMapping() : num_dups_(0) {}  // only num_dups_ is initialized; the other fields are left unset
25 | PairsMapping(uint32_t read_id, std::string read_name, uint64_t cell_barcode,  // member-wise initializing constructor
26 | int rid1, int rid2, uint32_t pos1, uint32_t pos2, int strand1,
27 | int strand2, uint8_t mapq, uint8_t is_unique, uint8_t num_dups)
28 | : read_id_(read_id),
29 | read_name_(read_name),
30 | cell_barcode_(cell_barcode),
31 | rid1_(rid1),
32 | rid2_(rid2),
33 | pos1_(pos1),
34 | pos2_(pos2),
35 | strand1_(strand1),
36 | strand2_(strand2),
37 | mapq_(mapq),
38 | is_unique_(is_unique),
39 | num_dups_(num_dups) {}  // num_dups_ is a 7-bit field, so values above 127 are truncated
40 | bool operator<(const PairsMapping &m) const {  // lexicographic order: both reference ids, both positions, mapq, read id
41 | return std::tie(rid1_, rid2_, pos1_, pos2_, mapq_, read_id_) <
42 | std::tie(m.rid1_, m.rid2_, m.pos1_, m.pos2_, m.mapq_, m.read_id_);
43 | }
44 | bool operator==(const PairsMapping &m) const {  // equal when both ends have the same reference id and position; strands are ignored
45 | return std::tie(rid1_, pos1_, rid2_, pos2_) ==
46 | std::tie(m.rid1_, m.pos1_, m.rid2_, m.pos2_);
47 | // return std::tie(pos1, pos2, rid1, rid2, is_rev1, is_rev2) ==
48 | // std::tie(m.pos1, m.pos2, m.rid1, m.rid2, m.is_rev1, m.is_rev2);
49 | }
50 | bool IsSamePosition(const PairsMapping &m) const {  // same comparison as operator==
51 | return std::tie(rid1_, pos1_, rid2_, pos2_) ==
52 | std::tie(m.rid1_, m.pos1_, m.rid2_, m.pos2_);
53 | }
54 | uint64_t GetBarcode() const { return 0; }  // always 0; NOTE(review): cell_barcode_ is stored but not returned here, confirm intended
55 | void Tn5Shift() {
56 | // Intentionally a no-op: no Tn5 shift is applied to this mapping type.
57 | // NOTE(review): the original note referred to SAM format (fields that depend on mapping position); confirm for pairs.
58 | }
59 |
60 | int GetPosition(int idx) const {
61 | // Stored coordinates are reported shifted by +1. idx == 2 selects the
62 | // second end; every other idx selects the first end.
63 | const uint32_t stored = (idx == 2) ? pos2_ : pos1_;
64 | return stored + 1;
65 | }
66 |
67 | char GetStrand(int idx) const {
68 | // idx == 2 reports the second end's strand, any other idx the first
69 | // end's. A stored value greater than 0 is printed as '+'.
70 | const int strand = (idx == 2) ? strand2_ : strand1_;
71 | if (strand > 0) return '+';
72 | return '-';
73 | }
74 |
75 | bool IsPositiveStrand() const { return strand1_ > 0 ? true : false; }  // decided by the first end only
76 | uint32_t GetStartPosition() const { // inclusive; this is the first end's position
77 | return pos1_;
78 | }
79 | uint32_t GetEndPosition() const { // exclusive; this is the second end's position, returned unchanged
80 | return pos2_;
81 | }
82 | uint16_t GetByteSize() const {  // fixed-size fields plus the read name; the sum is narrowed to uint16_t
83 | return 5 * sizeof(uint32_t) + 1 * sizeof(uint16_t) + 4 * sizeof(int) +
84 | read_name_.length() * sizeof(char);
85 | }
86 | size_t WriteToFile(FILE *temp_mapping_output_file) const {  // serializes every field in a fixed order
87 | size_t num_written_bytes = 0;  // accumulates fwrite() results, which are element counts rather than bytes
88 | num_written_bytes +=
89 | fwrite(&read_id_, sizeof(uint32_t), 1, temp_mapping_output_file);
90 | uint16_t read_name_length = read_name_.length();  // name length is stored in 16 bits
91 | num_written_bytes += fwrite(&read_name_length, sizeof(uint16_t), 1,
92 | temp_mapping_output_file);
93 | num_written_bytes += fwrite(read_name_.data(), sizeof(char),
94 | read_name_length, temp_mapping_output_file);
95 | num_written_bytes +=
96 | fwrite(&cell_barcode_, sizeof(uint64_t), 1, temp_mapping_output_file);
97 | num_written_bytes +=
98 | fwrite(&rid1_, sizeof(int), 1, temp_mapping_output_file);
99 | num_written_bytes +=
100 | fwrite(&rid2_, sizeof(int), 1, temp_mapping_output_file);
101 | num_written_bytes +=
102 | fwrite(&pos1_, sizeof(uint32_t), 1, temp_mapping_output_file);
103 | num_written_bytes +=
104 | fwrite(&pos2_, sizeof(uint32_t), 1, temp_mapping_output_file);
105 | num_written_bytes +=
106 | fwrite(&strand1_, sizeof(int), 1, temp_mapping_output_file);
107 | num_written_bytes +=
108 | fwrite(&strand2_, sizeof(int), 1, temp_mapping_output_file);
109 | uint16_t mapq_unique_dups = (mapq_ << 8) | (is_unique_ << 7) | num_dups_;  // bits 15..8 mapq, bit 7 is_unique, bits 6..0 num_dups
110 | num_written_bytes += fwrite(&mapq_unique_dups, sizeof(uint16_t), 1,
111 | temp_mapping_output_file);
112 | return num_written_bytes;
113 | }
114 | size_t LoadFromFile(FILE *temp_mapping_output_file) {
115 | size_t num_read_bytes = 0;
116 | num_read_bytes +=
117 | fread(&read_id_, sizeof(uint32_t), 1, temp_mapping_output_file);
118 | uint16_t read_name_length = 0;
119 | num_read_bytes +=
120 | fread(&read_name_length, sizeof(uint16_t), 1, temp_mapping_output_file);
121 | read_name_ = std::string(read_name_length, '\0');
122 | num_read_bytes += fread(&(read_name_[0]), sizeof(char), read_name_length,
123 | temp_mapping_output_file);
124 | num_read_bytes +=
125 | fread(&cell_barcode_, sizeof(uint64_t), 1, temp_mapping_output_file);
126 | num_read_bytes += fread(&rid1_, sizeof(int), 1, temp_mapping_output_file);
127 | num_read_bytes += fread(&rid2_, sizeof(int), 1, temp_mapping_output_file);
128 | num_read_bytes +=
129 | fread(&pos1_, sizeof(uint32_t), 1, temp_mapping_output_file);
130 | num_read_bytes +=
131 | fread(&pos2_, sizeof(uint32_t), 1, temp_mapping_output_file);
132 | num_read_bytes +=
133 | fread(&strand1_, sizeof(int), 1, temp_mapping_output_file);
134 | num_read_bytes +=
135 | fread(&strand2_, sizeof(int), 1, temp_mapping_output_file);
136 | uint16_t mapq_unique_dups = 0;
137 | num_read_bytes +=
138 | fread(&mapq_unique_dups, sizeof(uint16_t), 1, temp_mapping_output_file);
139 | mapq_ = (mapq_unique_dups >> 8);
140 | is_unique_ = (mapq_unique_dups >> 7) & 1;
141 | num_dups_ = ((mapq_unique_dups << 9) >> 9);
142 | return num_read_bytes;
143 | }
144 | };
145 |
146 | } // namespace chromap
147 |
148 | #endif // PAIRSMAPPING_H_
149 |
--------------------------------------------------------------------------------
/src/sequence_batch.cc:
--------------------------------------------------------------------------------
1 | #include "sequence_batch.h"
2 |
3 | #include
4 |
5 | #include "utils.h"
6 |
7 | namespace chromap {
8 |
9 | void SequenceBatch::InitializeLoading(const std::string &sequence_file_path) {
10 | sequence_file_ = gzopen(sequence_file_path.c_str(), "r");
11 | if (sequence_file_ == NULL) {
12 | ExitWithMessage("Cannot find sequence file " + sequence_file_path);
13 | }
14 | sequence_kseq_ = kseq_init(sequence_file_);
15 | }
16 |
17 | void SequenceBatch::FinalizeLoading() {
18 | kseq_destroy(sequence_kseq_);
19 | gzclose(sequence_file_);
20 | }
21 |
22 | bool SequenceBatch::LoadOneSequenceAndSaveAt(uint32_t sequence_index) {
23 | if (sequence_index == 0) {
24 | num_loaded_sequences_ = 0;
25 | }
26 |
27 | int length = kseq_read(sequence_kseq_);
28 | while (length == 0) {
29 | length = kseq_read(sequence_kseq_);
30 | }
31 |
32 | if (length > 0) {
33 | kseq_t *sequence = sequence_batch_[sequence_index];
34 | std::swap(sequence_kseq_->seq, sequence->seq);
35 | ReplaceByEffectiveRange(sequence->seq, /*is_seq=*/true);
36 | std::swap(sequence_kseq_->name, sequence->name);
37 | std::swap(sequence_kseq_->comment, sequence->comment);
38 | sequence->id = total_num_loaded_sequences_;
39 | ++total_num_loaded_sequences_;
40 |
41 | if (sequence_index >= num_loaded_sequences_) {
42 | ++num_loaded_sequences_;
43 | } else if (sequence_index + 1 != num_loaded_sequences_) {
44 | std::cerr << sequence_index << " " << num_loaded_sequences_ << "\n";
45 | ExitWithMessage(
46 | "Shouldn't override other sequences rather than the last!");
47 | }
48 |
49 | if (sequence_kseq_->qual.l != 0) { // fastq file
50 | std::swap(sequence_kseq_->qual, sequence->qual);
51 | ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
52 | }
53 | return false;
54 | }
55 |
56 | // Make sure to reach the end of the file rather than meet an error.
57 | if (length != -1) {
58 | ExitWithMessage(
59 | "Didn't reach the end of sequence file, which might be corrupted!");
60 | }
61 | return true;
62 | }
63 |
64 | uint32_t SequenceBatch::LoadBatch() {
65 | double real_start_time = GetRealTime();
66 | num_loaded_sequences_ = 0;
67 | for (uint32_t sequence_index = 0; sequence_index < max_num_sequences_;
68 | ++sequence_index) {
69 | if (LoadOneSequenceAndSaveAt(sequence_index)) {
70 | break;
71 | }
72 | }
73 |
74 | if (num_loaded_sequences_ != 0) {
75 | std::cerr << "Loaded sequence batch successfully in "
76 | << GetRealTime() - real_start_time << "s, ";
77 | std::cerr << "number of sequences: " << num_loaded_sequences_ << ".\n";
78 | } else {
79 | std::cerr << "No more sequences.\n";
80 | }
81 | return num_loaded_sequences_;
82 | }
83 |
84 | void SequenceBatch::LoadAllSequences() {
85 | double real_start_time = GetRealTime();
86 | sequence_batch_.reserve(200);
87 | num_loaded_sequences_ = 0;
88 | num_bases_ = 0;
89 | int length = kseq_read(sequence_kseq_);
90 | while (length >= 0) {
91 | if (length > 0) {
92 | sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
93 | kseq_t *sequence = sequence_batch_.back();
94 | std::swap(sequence_kseq_->seq, sequence->seq);
95 | ReplaceByEffectiveRange(sequence->seq, /*is_seq=*/true);
96 | std::swap(sequence_kseq_->name, sequence->name);
97 | std::swap(sequence_kseq_->comment, sequence->comment);
98 | if (sequence_kseq_->qual.l != 0) { // fastq file
99 | std::swap(sequence_kseq_->qual, sequence->qual);
100 | ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
101 | }
102 | sequence->id = total_num_loaded_sequences_;
103 | ++total_num_loaded_sequences_;
104 | ++num_loaded_sequences_;
105 | num_bases_ += length;
106 | }
107 | length = kseq_read(sequence_kseq_);
108 | }
109 |
110 | // Make sure to reach the end of the file rather than meet an error.
111 | if (length != -1) {
112 | ExitWithMessage(
113 | "Didn't reach the end of sequence file, which might be corrupted!");
114 | }
115 |
116 | std::cerr << "Loaded all sequences successfully in "
117 | << GetRealTime() - real_start_time << "s, ";
118 | std::cerr << "number of sequences: " << num_loaded_sequences_ << ", ";
119 | std::cerr << "number of bases: " << num_bases_ << ".\n";
120 | }
121 |
122 | void SequenceBatch::ReplaceByEffectiveRange(kstring_t &seq, bool is_seq) {
123 | seq.l = effective_range_.Replace(seq.s, seq.l, is_seq);
124 | }
125 |
126 | } // namespace chromap
127 |
--------------------------------------------------------------------------------
/src/sequence_batch.h:
--------------------------------------------------------------------------------
1 | #ifndef SEQUENCEBATCH_H_
2 | #define SEQUENCEBATCH_H_
3 |
4 | #include
5 | #include
6 |
7 | #include
8 | #include
9 | #include
10 |
11 | #include "kseq.h"
12 | #include "sequence_effective_range.h"
13 | #include "utils.h"
14 |
15 | namespace chromap {
16 |
17 | class SequenceBatch {
18 | public:
19 | KSEQ_INIT(gzFile, gzread);
20 |
21 | // When 'max_num_sequences' is not specified. This batch can be used to load
22 | // any number of sequences with a positive full effective range.
23 | SequenceBatch() : effective_range_(SequenceEffectiveRange()) {}
24 |
25 | // Construct once and use update sequences when loading each batch.
26 | SequenceBatch(uint32_t max_num_sequences,
27 | const SequenceEffectiveRange &effective_range)
28 | : max_num_sequences_(max_num_sequences),
29 | effective_range_(effective_range) {
30 | sequence_batch_.reserve(max_num_sequences_);
31 | for (uint32_t i = 0; i < max_num_sequences_; ++i) {
32 | sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
33 | sequence_batch_.back()->f = NULL;
34 | }
35 | negative_sequence_batch_.assign(max_num_sequences_, "");
36 | }
37 |
38 | ~SequenceBatch() {
39 | if (sequence_batch_.size() > 0) {
40 | for (uint32_t i = 0; i < sequence_batch_.size(); ++i) {
41 | kseq_destroy(sequence_batch_[i]);
42 | }
43 | }
44 | }
45 |
46 | inline uint64_t GetNumSequences() const { return num_loaded_sequences_; }
47 |
48 | inline uint32_t GetMaxBatchSize() const { return max_num_sequences_; }
49 |
50 | inline uint64_t GetNumBases() const { return num_bases_; }
51 |
52 | inline std::vector &GetSequenceBatch() { return sequence_batch_; }
53 |
54 | inline std::vector &GetNegativeSequenceBatch() {
55 | return negative_sequence_batch_;
56 | }
57 |
58 | inline const char *GetSequenceAt(uint32_t sequence_index) const {
59 | return sequence_batch_[sequence_index]->seq.s;
60 | }
61 |
62 | inline uint32_t GetSequenceLengthAt(uint32_t sequence_index) const {
63 | return sequence_batch_[sequence_index]->seq.l;
64 | }
65 |
66 | inline const char *GetSequenceNameAt(uint32_t sequence_index) const {
67 | return sequence_batch_[sequence_index]->name.s;
68 | }
69 |
70 | inline uint32_t GetSequenceNameLengthAt(uint32_t sequence_index) const {
71 | return sequence_batch_[sequence_index]->name.l;
72 | }
73 |
74 | inline const char *GetSequenceQualAt(uint32_t sequence_index) const {
75 | return sequence_batch_[sequence_index]->qual.s;
76 | }
77 | inline uint32_t GetSequenceIdAt(uint32_t sequence_index) const {
78 | return sequence_batch_[sequence_index]->id;
79 | }
80 |
81 | inline const std::string &GetNegativeSequenceAt(
82 | uint32_t sequence_index) const {
83 | return negative_sequence_batch_[sequence_index];
84 | }
85 |
86 | // big_endian: N_pos is in the order of sequence
87 | // little_endian: N_pos is in the order from the sequence right side to left,
88 | // this is the order of the GenerateSeed
89 | // e.g: If the sequence is "ACN", big endian returns N at 2,
90 | // little endian returns N at 0.
91 | inline void GetSequenceNsAt(uint32_t sequence_index, bool little_endian,
92 | std::vector &N_pos) {
93 | const int l = sequence_batch_[sequence_index]->seq.l;
94 | const char *s = sequence_batch_[sequence_index]->seq.s;
95 | N_pos.clear();
96 | if (little_endian) {
97 | for (int i = l - 1; i >= 0; --i) {
98 | if (s[i] == 'N') N_pos.push_back(l - 1 - i);
99 | }
100 | } else {
101 | for (int i = 0; i < l; ++i) {
102 | if (s[i] == 'N') N_pos.push_back(i);
103 | }
104 | }
105 | }
106 |
107 | inline bool IsNInSequenceAt(uint32_t sequence_index) {
108 | const int l = sequence_batch_[sequence_index]->seq.l;
109 | const char *s = sequence_batch_[sequence_index]->seq.s;
110 | for (int i = 0 ; i < l ; ++i)
111 | if (s[i] == 'N')
112 | return true;
113 | return false;
114 | }
115 |
116 | // inline char GetReverseComplementBaseOfSequenceAt(uint32_t sequence_index,
117 | // uint32_t position) {
118 | // kseq_t *sequence = sequence_batch_[sequence_index];
119 | // return Uint8ToChar(((uint8_t)3) ^
120 | // (CharToUint8((sequence->seq.s)[sequence->seq.l - position - 1])));
121 | // }
122 |
123 | inline void PrepareNegativeSequenceAt(uint32_t sequence_index) {
124 | kseq_t *sequence = sequence_batch_[sequence_index];
125 | uint32_t sequence_length = sequence->seq.l;
126 | std::string &negative_sequence = negative_sequence_batch_[sequence_index];
127 | negative_sequence.clear();
128 | negative_sequence.reserve(sequence_length);
129 | for (uint32_t i = 0; i < sequence_length; ++i) {
130 | negative_sequence.push_back(Uint8ToChar(
131 | ((uint8_t)3) ^
132 | (CharToUint8((sequence->seq.s)[sequence_length - i - 1]))));
133 | }
134 | }
135 |
136 | inline void TrimSequenceAt(uint32_t sequence_index, int length_after_trim) {
137 | kseq_t *sequence = sequence_batch_[sequence_index];
138 | if (length_after_trim >= (int)sequence->seq.l) {
139 | return;
140 | }
141 |
142 | negative_sequence_batch_[sequence_index].erase(
143 | negative_sequence_batch_[sequence_index].begin(),
144 | negative_sequence_batch_[sequence_index].begin() + sequence->seq.l -
145 | length_after_trim);
146 |
147 | sequence->seq.l = length_after_trim;
148 | sequence->seq.s[sequence->seq.l] = '\0';
149 | sequence->qual.l = length_after_trim;
150 | sequence->qual.s[sequence->qual.l] = '\0';
151 | }
152 |
153 | inline void SwapSequenceBatch(SequenceBatch &batch) {
154 | sequence_batch_.swap(batch.GetSequenceBatch());
155 | negative_sequence_batch_.swap(batch.GetNegativeSequenceBatch());
156 | }
157 |
158 | void InitializeLoading(const std::string &sequence_file_path);
159 |
160 | void FinalizeLoading();
161 |
162 | // The func should never override other sequences rather than the last, which
163 | // means 'sequence_index' cannot be smaller than 'num_loaded_sequences_' - 1.
164 | // Return true when reaching the end of the file.
165 | bool LoadOneSequenceAndSaveAt(uint32_t sequence_index);
166 |
167 | // Return the number of sequences loaded into the batch and return 0 if there
168 | // is no more sequences. This func now is only used to load barcodes.
169 | uint32_t LoadBatch();
170 |
171 | // Load all sequences in a file. This function should only be used to load
172 | // reference. And once the reference is loaded, the batch should never be
173 | // updated. This func is slow when there are large number of sequences.
174 | void LoadAllSequences();
175 |
176 | inline void CorrectBaseAt(uint32_t sequence_index, uint32_t base_position,
177 | char correct_base) {
178 | kseq_t *sequence = sequence_batch_[sequence_index];
179 | sequence->seq.s[base_position] = correct_base;
180 | }
181 |
182 | inline uint64_t GenerateSeedFromSequenceAt(uint32_t sequence_index,
183 | uint32_t start_position,
184 | uint32_t seed_length) const {
185 | const char *sequence = GetSequenceAt(sequence_index);
186 | const uint32_t sequence_length = GetSequenceLengthAt(sequence_index);
187 | return GenerateSeedFromSequence(sequence, sequence_length, start_position,
188 | seed_length);
189 | }
190 |
191 | inline void ReorderSequences(const std::vector &rid_rank) {
192 | std::vector tmp_sequence_batch_ = sequence_batch_;
193 | std::vector tmp_negative_sequence_batch_ =
194 | negative_sequence_batch_;
195 | for (size_t i = 0; i < sequence_batch_.size(); ++i) {
196 | sequence_batch_[rid_rank[i]] = tmp_sequence_batch_[i];
197 | }
198 |
199 | if (negative_sequence_batch_.size() > 0) {
200 | for (size_t i = 0; i < sequence_batch_.size(); ++i) {
201 | negative_sequence_batch_[rid_rank[i]] = tmp_negative_sequence_batch_[i];
202 | }
203 | }
204 | }
205 |
206 | protected:
207 | // When 'is_seq' is set to true, this func will complement the base when
208 | // necessary. Otherwise, it will just reverse the sequence.
209 | void ReplaceByEffectiveRange(kstring_t &seq, bool is_seq);
210 |
211 | // This is the accumulated number of sequences that have ever been loaded into
212 | // the batch. It is useful for tracking read ids.
213 | uint32_t total_num_loaded_sequences_ = 0;
214 |
215 | // This is the number of sequences loaded into the current batch.
216 | uint32_t num_loaded_sequences_ = 0;
217 |
218 | // This is the number of bases loaded into the current batch. It is only
219 | // populated for the reference.
220 | uint64_t num_bases_ = 0;
221 |
222 | // This is the max number of sequences that can be loaded into the batch. It
223 | // is set to 0 when there is no such restriction.
224 | uint32_t max_num_sequences_ = 0;
225 |
226 | gzFile sequence_file_;
227 | kseq_t *sequence_kseq_ = nullptr;
228 | std::vector sequence_batch_;
229 |
230 | // TODO: avoid constructing the negative sequence batch.
231 | std::vector negative_sequence_batch_;
232 |
233 | // Actual range within each sequence.
234 | const SequenceEffectiveRange effective_range_;
235 | };
236 |
237 | } // namespace chromap
238 |
239 | #endif // SEQUENCEBATCH_H_
240 |
--------------------------------------------------------------------------------
/src/sequence_effective_range.h:
--------------------------------------------------------------------------------
1 | #ifndef SEQUENCE_EFFECTIVE_RANGE_H_
2 | #define SEQUENCE_EFFECTIVE_RANGE_H_
3 |
4 | #include
5 |
6 | #include
7 | #include
8 |
9 | #include "utils.h"
10 |
11 | namespace chromap {
12 |
13 | // The class handles the custom read format indicating the effective range on a
14 | // sequence. Default is the full range.
15 | class SequenceEffectiveRange {
16 | public:
17 | SequenceEffectiveRange() = default;
18 | ~SequenceEffectiveRange() = default;
19 |
20 | void InitializeParsing() {
21 | starts.clear();
22 | ends.clear();
23 | strand = '+';
24 | }
25 |
26 | void FinalizeParsing() {
27 | if (starts.empty() && ends.empty()) {
28 | starts.push_back(0);
29 | ends.push_back(-1);
30 | strand = '+';
31 | return;
32 | }
33 |
34 | /*std::sort(starts.begin(), starts.end());
35 | std::sort(ends.begin(), ends.end());
36 |
37 | if (ends[0] == -1) {
38 | ends.erase(ends.begin());
39 | ends.push_back(-1);
40 | }*/
41 | }
42 |
43 | // Return false if it fails to parse the format string.
44 | bool ParseFormatStringAndAppendEffectiveRange(const char *s, int len) {
45 | int i;
46 | int j = 0; // start, end, strand section
47 | char buffer[20];
48 | int blen = 0;
49 |
50 | for (i = 3; i <= len; ++i) {
51 | if (i == len || s[i] == ':') {
52 | buffer[blen] = '\0';
53 | if (j == 0) {
54 | starts.push_back(atoi(buffer));
55 | } else if (j == 1) {
56 | ends.push_back(atoi(buffer));
57 | } else {
58 | strand = buffer[0];
59 | }
60 |
61 | blen = 0;
62 | if (i < len && s[i] == ':') {
63 | ++j;
64 | }
65 | } else {
66 | buffer[blen] = s[i];
67 | ++blen;
68 | }
69 | }
70 |
71 | if (j >= 3 || starts.size() != ends.size()) {
72 | return false;
73 | }
74 |
75 | return true;
76 | }
77 |
78 | // Replace by the range specified in the starts, ends section, but does not
79 | // apply the strand operation. Return new length.
80 | int Replace(char *s, int len, bool need_complement) const {
81 | if (IsFullRangeAndPositiveStrand()) {
82 | return len;
83 | }
84 |
85 | int i, j, k;
86 | i = 0;
87 | const int num_ranges = starts.size();
88 | for (k = 0; k < num_ranges; ++k) {
89 | int start = starts[k];
90 | int end = ends[k];
91 |
92 | if (end == -1) {
93 | end = len - 1;
94 | }
95 |
96 | for (j = start; j <= end; ++i, ++j) {
97 | s[i] = s[j];
98 | }
99 | }
100 |
101 | s[i] = '\0';
102 | len = i;
103 |
104 | if (strand == '-') {
105 | if (need_complement) {
106 | for (i = 0; i < len; ++i) {
107 | s[i] = Uint8ToChar(((uint8_t)3) ^ (CharToUint8(s[i])));
108 | }
109 | }
110 |
111 | for (i = 0, j = len - 1; i < j; ++i, --j) {
112 | char tmp = s[i];
113 | s[i] = s[j];
114 | s[j] = tmp;
115 | }
116 | }
117 | return len;
118 | }
119 |
120 | private:
121 | bool IsFullRangeAndPositiveStrand() const {
122 | if (strand == '+' && starts[0] == 0 && ends[0] == -1) {
123 | return true;
124 | }
125 |
126 | return false;
127 | }
128 |
129 | std::vector starts = {0};
130 | std::vector ends = {-1};
131 | // Strand is either '+' or '-'. The barcode will be reverse-complemented after
132 | // extraction if strand is '-'.
133 | char strand = '+';
134 | };
135 |
136 | } // namespace chromap
137 |
138 | #endif
139 |
--------------------------------------------------------------------------------
/src/strand.h:
--------------------------------------------------------------------------------
1 | #ifndef STRAND_H_
2 | #define STRAND_H_
3 |
4 | namespace chromap {
5 |
// Strand of a sequence or mapping. The numeric values are spelled out
// because they are the values an unscoped enum assigns implicitly.
enum Strand {
  kPositive = 0,
  kNegative = 1,
};
10 |
11 | } // namespace chromap
12 |
13 | #endif // STRAND_H_
14 |
--------------------------------------------------------------------------------
/src/summary_metadata.h:
--------------------------------------------------------------------------------
1 | #ifndef SUMMARY_METADATA_H_
2 | #define SUMMARY_METADATA_H_
3 |
4 | #include
5 |
6 | #include
7 | #include
8 | #include
9 |
10 | #include "khash.h"
11 | #include "utils.h"
12 |
13 | // The class summarizes the overall mapping metadata
14 |
15 | namespace chromap {
16 |
// Index of each per-barcode counter. SUMMARY_METADATA_FIELDS is not a counter
// itself: it is the number of counters and sizes the counter arrays.
enum SummaryMetadataField {
  SUMMARY_METADATA_TOTAL = 0,
  SUMMARY_METADATA_DUP = 1,
  SUMMARY_METADATA_MAPPED = 2,
  SUMMARY_METADATA_LOWMAPQ = 3,
  SUMMARY_METADATA_CACHEHIT = 4,
  SUMMARY_METADATA_CARDINALITY = 5,
  SUMMARY_METADATA_FIELDS = 6
};
26 |
27 | struct _barcodeSummaryMetadata {
28 | int counts[SUMMARY_METADATA_FIELDS];
29 | _barcodeSummaryMetadata() {
30 | memset(counts, 0, sizeof(int) * SUMMARY_METADATA_FIELDS);
31 | }
32 | };
33 |
34 |
35 | KHASH_MAP_INIT_INT64(k64_barcode_metadata, struct _barcodeSummaryMetadata)
36 |
37 | class SummaryMetadata {
38 | public:
39 | SummaryMetadata() {
40 | barcode_metadata_ = kh_init(k64_barcode_metadata);
41 | barcode_length_ = 16;
42 | }
43 | ~SummaryMetadata() {
44 | kh_destroy(k64_barcode_metadata, barcode_metadata_);
45 | }
46 |
47 | inline double inverse_logit(double frip) {
48 | return (1.0/(1.0 + std::exp(-frip)));
49 | }
50 |
51 | inline void OutputCounts(const char *barcode, const int *counts, FILE *fp, std::vector frip_est_coeffs, bool output_num_cache_slots_info)
52 | {
53 | // define variables to store values
54 | size_t num_total = counts[SUMMARY_METADATA_TOTAL];
55 | size_t num_dup = counts[SUMMARY_METADATA_DUP];
56 |
57 | size_t num_mapped = counts[SUMMARY_METADATA_MAPPED];
58 | size_t num_unmapped = num_total - num_mapped;
59 |
60 | size_t num_lowmapq = counts[SUMMARY_METADATA_LOWMAPQ];
61 | size_t num_cachehit = counts[SUMMARY_METADATA_CACHEHIT];
62 | double fric = (num_mapped != 0) ? (double) num_cachehit / (double) num_mapped : 0.0;
63 |
64 | size_t num_cache_slots = counts[SUMMARY_METADATA_CARDINALITY];
65 |
66 | // compute the estimated frip
67 | double est_frip = (fric != 0.0) ? inverse_logit(frip_est_coeffs[0] + /* constant */
68 | (frip_est_coeffs[1] * fric) +
69 | (frip_est_coeffs[2] * num_dup) +
70 | (frip_est_coeffs[3] * num_unmapped) +
71 | (frip_est_coeffs[4] * num_lowmapq)) : 0.0;
72 |
73 | // print out data for current barcode
74 | if (!output_num_cache_slots_info) {
75 | fprintf(fp, "%s,%ld,%ld,%ld,%ld,%ld,%.5lf,%.5lf\n",
76 | barcode,
77 | num_total,
78 | num_dup,
79 | num_unmapped,
80 | num_lowmapq,
81 | num_cachehit,
82 | fric,
83 | est_frip);
84 | } else {
85 | fprintf(fp, "%s,%ld,%ld,%ld,%ld,%ld,%.5lf,%.5lf,%ld\n",
86 | barcode,
87 | num_total,
88 | num_dup,
89 | num_unmapped,
90 | num_lowmapq,
91 | num_cachehit,
92 | fric,
93 | est_frip,
94 | num_cache_slots);
95 | }
96 | }
97 |
98 | void Output(const char *filename, bool has_white_list, std::vector frip_est_coeffs, bool output_num_cache_slots_info) {
99 | FILE *fp = fopen(filename, "w");
100 |
101 | // Change summary file header depending on options
102 | if (!output_num_cache_slots_info)
103 | fprintf(fp, "barcode,total,duplicate,unmapped,lowmapq,cachehit,fric,estfrip\n");
104 | else
105 | fprintf(fp, "barcode,total,duplicate,unmapped,lowmapq,cachehit,fric,estfrip,numcacheslots\n");
106 |
107 | khiter_t k;
108 | for (k = kh_begin(barcode_metadata_); k != kh_end(barcode_metadata_); ++k)
109 | if (kh_exist(barcode_metadata_, k)) {
110 | OutputCounts(
111 | Seed2Sequence(kh_key(barcode_metadata_, k), barcode_length_).c_str(),
112 | kh_value(barcode_metadata_, k).counts,
113 | fp,
114 | frip_est_coeffs,
115 | output_num_cache_slots_info
116 | );
117 | }
118 | if (has_white_list) {
119 | OutputCounts(
120 | "non-whitelist",
121 | nonwhitelist_summary_.counts,
122 | fp,
123 | frip_est_coeffs,
124 | output_num_cache_slots_info
125 | ) ;
126 | }
127 | fclose(fp);
128 | }
129 |
130 | void UpdateCount(uint64_t barcode, int type, int change) {
131 | int khash_return_code;
132 | khiter_t barcode_metadata_iter = kh_put(k64_barcode_metadata, barcode_metadata_, barcode, &khash_return_code);
133 | if (khash_return_code) {
134 | struct _barcodeSummaryMetadata nb;
135 | kh_value(barcode_metadata_, barcode_metadata_iter) = nb;
136 | }
137 | kh_value(barcode_metadata_, barcode_metadata_iter).counts[type] += change;
138 | }
139 |
140 | void UpdateNonWhitelistCount(int type, int change) {
141 | nonwhitelist_summary_.counts[type] += change;
142 | }
143 |
144 | void SetBarcodeLength(int l) {
145 | barcode_length_ = l;
146 | }
147 |
148 | // In SAM format for paired-end data, some count will be counted twice
149 | void AdjustPairedEndOverCount() {
150 | khiter_t k;
151 | for (k = kh_begin(barcode_metadata_); k != kh_end(barcode_metadata_); ++k)
152 | if (kh_exist(barcode_metadata_, k)) {
153 | kh_value(barcode_metadata_, k).counts[SUMMARY_METADATA_DUP] /= 2 ;
154 | kh_value(barcode_metadata_, k).counts[SUMMARY_METADATA_LOWMAPQ] /= 2 ;
155 | kh_value(barcode_metadata_, k).counts[SUMMARY_METADATA_MAPPED] /= 2 ;
156 | }
157 | }
158 |
159 | private:
160 | khash_t(k64_barcode_metadata) *barcode_metadata_;
161 | struct _barcodeSummaryMetadata nonwhitelist_summary_; // summarize the fragments with no barcode information
162 | int barcode_length_;
163 |
164 | std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
165 | std::string sequence;
166 | sequence.reserve(seed_length);
167 | uint64_t mask_ = 3;
168 | for (uint32_t i = 0; i < seed_length; ++i) {
169 | sequence.push_back(
170 | Uint8ToChar((seed >> ((seed_length - 1 - i) * 2)) & mask_));
171 | }
172 | return sequence;
173 | }
174 | };
175 |
176 | } // namespace chromap
177 |
178 | #endif
179 |
--------------------------------------------------------------------------------
/src/temp_mapping.h:
--------------------------------------------------------------------------------
1 | #ifndef TEMPMAPPING_H_
2 | #define TEMPMAPPING_H_
3 |
4 | #include
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #include "bed_mapping.h"
14 | #include "mapping.h"
15 | #include "paf_mapping.h"
16 | #include "pairs_mapping.h"
17 | #include "sam_mapping.h"
18 |
19 | namespace chromap {
20 |
// Sequential reader over one temporary mapping file.
//
// As read by this struct, the file holds, for each reference sequence in
// order, a size_t count followed by that many mapping records. Mappings are
// loaded block by block, and a block never mixes mappings from different
// reference sequences.
template <typename MappingRecord>
struct TempMappingFileHandle {
  std::string file_path;
  FILE* file;
  // Number of valid mappings currently held in 'mappings'.
  uint32_t num_mappings;
  // Maximum number of mappings loaded at once.
  uint32_t block_size;
  uint32_t current_rid;
  uint32_t current_mapping_index;
  uint32_t num_mappings_on_current_rid;
  uint32_t num_loaded_mappings_on_current_rid;
  // This vector only keep mappings on the same ref seq.
  std::vector<MappingRecord> mappings;

  inline const MappingRecord& GetCurrentMapping() const {
    return mappings[current_mapping_index];
  }

  inline bool HasMappings() const { return num_mappings != 0; }

  // Opens the file and reads the mapping count of the first reference
  // sequence. No mapping is loaded yet; call LoadTempMappingBlock() for that.
  inline void InitializeTempMappingLoading(uint32_t temp_mapping_block_size) {
    file = fopen(file_path.c_str(), "rb");
    if (file == NULL) {
      std::cerr << "Cannot open temporary file " << file_path << ". This may be caused by creating too many temporary files, please consider using command like \"ulimit -n 32768 -u 32768\" to increase the limit.\n" ;
    }
    assert(file != NULL);
    num_mappings = 0;
    block_size = temp_mapping_block_size;
    current_rid = 0;
    current_mapping_index = 0;
    // The count is stored as size_t. Read it into a variable of that type:
    // reading sizeof(size_t) bytes straight into the 32-bit member would
    // write past it.
    size_t num_mappings_on_rid = 0;
    fread(&num_mappings_on_rid, sizeof(size_t), 1, file);
    num_mappings_on_current_rid = static_cast<uint32_t>(num_mappings_on_rid);
    num_loaded_mappings_on_current_rid = 0;
    mappings.resize(block_size);
  }

  inline void FinalizeTempMappingLoading() { fclose(file); }

  // Loads the next block of at most 'block_size' mappings, moving on to the
  // following reference sequences when the current one is exhausted. Leaves
  // 'num_mappings' at 0 when every reference sequence has been consumed.
  inline void LoadTempMappingBlock(uint32_t num_reference_sequences) {
    num_mappings = 0;
    while (num_mappings == 0) {
      // Only keep mappings on one ref seq, which means # mappings in buffer
      // can be less than block size. Two cases: current ref seq has
      // remainings or not.
      if (num_loaded_mappings_on_current_rid < num_mappings_on_current_rid) {
        // Check if # remains larger than block size
        uint32_t num_mappings_to_load_on_current_rid =
            num_mappings_on_current_rid - num_loaded_mappings_on_current_rid;
        if (num_mappings_to_load_on_current_rid > block_size) {
          num_mappings_to_load_on_current_rid = block_size;
        }
        fread(mappings.data(), sizeof(MappingRecord),
              num_mappings_to_load_on_current_rid, file);
        num_loaded_mappings_on_current_rid +=
            num_mappings_to_load_on_current_rid;
        num_mappings = num_mappings_to_load_on_current_rid;
      } else {
        // Move to next rid
        ++current_rid;
        if (current_rid >= num_reference_sequences) {
          break;
        }
        // See InitializeTempMappingLoading() for why a size_t local is used.
        size_t num_mappings_on_rid = 0;
        fread(&num_mappings_on_rid, sizeof(size_t), 1, file);
        num_mappings_on_current_rid =
            static_cast<uint32_t>(num_mappings_on_rid);
        num_loaded_mappings_on_current_rid = 0;
      }
    }

    current_mapping_index = 0;
  }

  // Advances to the next mapping, loading a new block when the current one
  // has been fully consumed.
  inline void Next(uint32_t num_reference_sequences) {
    ++current_mapping_index;
    if (current_mapping_index >= num_mappings) {
      LoadTempMappingBlock(num_reference_sequences);
    }
  }
};
106 |
107 | template <>
108 | inline void TempMappingFileHandle::LoadTempMappingBlock(
109 | uint32_t num_reference_sequences) {
110 | num_mappings = 0;
111 | while (num_mappings == 0) {
112 | // Only keep mappings on one ref seq, which means # mappings in buffer can
113 | // be less than block size Two cases: current ref seq has remainings or not
114 | if (num_loaded_mappings_on_current_rid < num_mappings_on_current_rid) {
115 | // Check if # remains larger than block size
116 | uint32_t num_mappings_to_load_on_current_rid =
117 | num_mappings_on_current_rid - num_loaded_mappings_on_current_rid;
118 | if (num_mappings_to_load_on_current_rid > block_size) {
119 | num_mappings_to_load_on_current_rid = block_size;
120 | }
121 | // std::cerr << num_mappings_to_load_on_current_rid << " " <<
122 | // num_loaded_mappings_on_current_rid << " " <<
123 | // num_mappings_on_current_rid
124 | // << "\n"; std::cerr << mappings.size() << "\n";
125 | for (size_t mi = 0; mi < num_mappings_to_load_on_current_rid; ++mi) {
126 | mappings[mi].LoadFromFile(file);
127 | }
128 | // fread(mappings.data(), sizeof(MappingRecord),
129 | // num_mappings_to_load_on_current_rid, file); std::cerr << "Load
130 | // mappings\n";
131 | num_loaded_mappings_on_current_rid += num_mappings_to_load_on_current_rid;
132 | num_mappings = num_mappings_to_load_on_current_rid;
133 | } else {
134 | // Move to next rid
135 | ++current_rid;
136 | if (current_rid < num_reference_sequences) {
137 | // std::cerr << "Load size\n";
138 | fread(&num_mappings_on_current_rid, sizeof(size_t), 1, file);
139 | // std::cerr << "Load size " << num_mappings_on_current_rid << "\n";
140 | num_loaded_mappings_on_current_rid = 0;
141 | } else {
142 | break;
143 | }
144 | }
145 | }
146 | current_mapping_index = 0;
147 | }
148 |
149 | template <>  // NOTE(review): no template arguments are visible here or after "TempMappingFileHandle" below; confirm the specialized type against the original header.
150 | inline void TempMappingFileHandle::LoadTempMappingBlock(  // Refills `mappings` from `file` with the next block of records, all belonging to one reference sequence.
151 | uint32_t num_reference_sequences) {  // Exclusive upper bound for current_rid; loading stops once current_rid reaches it.
152 | num_mappings = 0;  // Remains 0 on return only if the loop breaks because no reference sequence is left.
153 | while (num_mappings == 0) {  // Loop until a non-empty block is loaded; reference sequences whose stored count is 0 are stepped over.
154 | // A block never mixes reference sequences, so the buffer can end up holding
155 | // fewer than block_size mappings. Two cases: the current ref seq still has unloaded mappings, or it has none.
156 | if (num_loaded_mappings_on_current_rid < num_mappings_on_current_rid) {
157 | // Load whatever remains on this ref seq, capped at block_size.
158 | uint32_t num_mappings_to_load_on_current_rid =  // NOTE(review): 32-bit, while the per-sequence count is read below as sizeof(size_t) bytes; a larger difference would be truncated - confirm the member types.
159 | num_mappings_on_current_rid - num_loaded_mappings_on_current_rid;
160 | if (num_mappings_to_load_on_current_rid > block_size) {
161 | num_mappings_to_load_on_current_rid = block_size;
162 | }
163 | // std::cerr << num_mappings_to_load_on_current_rid << " " <<
164 | // num_loaded_mappings_on_current_rid << " " <<
165 | // num_mappings_on_current_rid
166 | // << "\n"; std::cerr << mappings.size() << "\n";
167 | for (size_t mi = 0; mi < num_mappings_to_load_on_current_rid; ++mi) {  // Records are read one at a time into slots 0..count-1 of `mappings`.
168 | mappings[mi].LoadFromFile(file);  // Presumably `mappings` already holds at least block_size elements - confirm where it is sized.
169 | }
170 | // fread(mappings.data(), sizeof(MappingRecord),
171 | // num_mappings_to_load_on_current_rid, file); std::cerr << "Load
172 | // mappings\n";
173 | num_loaded_mappings_on_current_rid += num_mappings_to_load_on_current_rid;
174 | num_mappings = num_mappings_to_load_on_current_rid;  // A non-zero value here ends the while loop.
175 | } else {
176 | // Current ref seq is exhausted: move to the next rid.
177 | ++current_rid;
178 | if (current_rid < num_reference_sequences) {
179 | // std::cerr << "Load size\n";
180 | fread(&num_mappings_on_current_rid, sizeof(size_t), 1, file);  // Reads the mapping count stored for the new rid. NOTE(review): the fread result is ignored, so a short read goes undetected.
181 | // std::cerr << "Load size " << num_mappings_on_current_rid << "\n";
182 | num_loaded_mappings_on_current_rid = 0;
183 | } else {
184 | break;  // All reference sequences consumed; num_mappings is still 0.
185 | }
186 | }
187 | }
188 | current_mapping_index = 0;  // Runs on every exit path, including the break above.
189 | }
190 |
191 | template <>  // NOTE(review): no template arguments are visible here or after "TempMappingFileHandle" below; confirm the specialized type against the original header.
192 | inline void TempMappingFileHandle::LoadTempMappingBlock(  // Refills `mappings` from `file` with the next block of records, all belonging to one reference sequence.
193 | uint32_t num_reference_sequences) {  // Exclusive upper bound for current_rid; loading stops once current_rid reaches it.
194 | num_mappings = 0;  // Remains 0 on return only if the loop breaks because no reference sequence is left.
195 | while (num_mappings == 0) {  // Loop until a non-empty block is loaded; reference sequences whose stored count is 0 are stepped over.
196 | // A block never mixes reference sequences, so the buffer can end up holding
197 | // fewer than block_size mappings. Two cases: the current ref seq still has unloaded mappings, or it has none.
198 | if (num_loaded_mappings_on_current_rid < num_mappings_on_current_rid) {
199 | // Load whatever remains on this ref seq, capped at block_size.
200 | uint32_t num_mappings_to_load_on_current_rid =  // NOTE(review): 32-bit, while the per-sequence count is read below as sizeof(size_t) bytes; a larger difference would be truncated - confirm the member types.
201 | num_mappings_on_current_rid - num_loaded_mappings_on_current_rid;
202 | if (num_mappings_to_load_on_current_rid > block_size) {
203 | num_mappings_to_load_on_current_rid = block_size;
204 | }
205 | // std::cerr << num_mappings_to_load_on_current_rid << " " <<
206 | // num_loaded_mappings_on_current_rid << " " <<
207 | // num_mappings_on_current_rid
208 | // << "\n"; std::cerr << mappings.size() << "\n";
209 | for (size_t mi = 0; mi < num_mappings_to_load_on_current_rid; ++mi) {  // Records are read one at a time into slots 0..count-1 of `mappings`.
210 | mappings[mi].LoadFromFile(file);  // Presumably `mappings` already holds at least block_size elements - confirm where it is sized.
211 | }
212 | // fread(mappings.data(), sizeof(MappingRecord),
213 | // num_mappings_to_load_on_current_rid, file); std::cerr << "Load
214 | // mappings\n";
215 | num_loaded_mappings_on_current_rid += num_mappings_to_load_on_current_rid;
216 | num_mappings = num_mappings_to_load_on_current_rid;  // A non-zero value here ends the while loop.
217 | } else {
218 | // Current ref seq is exhausted: move to the next rid.
219 | ++current_rid;
220 | if (current_rid < num_reference_sequences) {
221 | // std::cerr << "Load size\n";
222 | fread(&num_mappings_on_current_rid, sizeof(size_t), 1, file);  // Reads the mapping count stored for the new rid. NOTE(review): the fread result is ignored, so a short read goes undetected.
223 | // std::cerr << "Load size " << num_mappings_on_current_rid << "\n";
224 | num_loaded_mappings_on_current_rid = 0;
225 | } else {
226 | break;  // All reference sequences consumed; num_mappings is still 0.
227 | }
228 | }
229 | }
230 | current_mapping_index = 0;  // Runs on every exit path, including the break above.
231 | }
232 |
233 | template <>  // NOTE(review): no template arguments are visible here or after "TempMappingFileHandle" below; confirm the specialized type against the original header.
234 | inline void TempMappingFileHandle::LoadTempMappingBlock(  // Refills `mappings` from `file` with the next block of records, all belonging to one reference sequence.
235 | uint32_t num_reference_sequences) {  // Exclusive upper bound for current_rid; loading stops once current_rid reaches it.
236 | num_mappings = 0;  // Remains 0 on return only if the loop breaks because no reference sequence is left.
237 | while (num_mappings == 0) {  // Loop until a non-empty block is loaded; reference sequences whose stored count is 0 are stepped over.
238 | // A block never mixes reference sequences, so the buffer can end up holding
239 | // fewer than block_size mappings. Two cases: the current ref seq still has unloaded mappings, or it has none.
240 | if (num_loaded_mappings_on_current_rid < num_mappings_on_current_rid) {
241 | // Load whatever remains on this ref seq, capped at block_size.
242 | uint32_t num_mappings_to_load_on_current_rid =  // NOTE(review): 32-bit, while the per-sequence count is read below as sizeof(size_t) bytes; a larger difference would be truncated - confirm the member types.
243 | num_mappings_on_current_rid - num_loaded_mappings_on_current_rid;
244 | if (num_mappings_to_load_on_current_rid > block_size) {
245 | num_mappings_to_load_on_current_rid = block_size;
246 | }
247 | // std::cerr << num_mappings_to_load_on_current_rid << " " <<
248 | // num_loaded_mappings_on_current_rid << " " <<
249 | // num_mappings_on_current_rid
250 | // << "\n"; std::cerr << mappings.size() << "\n";
251 | for (size_t mi = 0; mi < num_mappings_to_load_on_current_rid; ++mi) {  // Records are read one at a time into slots 0..count-1 of `mappings`.
252 | mappings[mi].LoadFromFile(file);  // Presumably `mappings` already holds at least block_size elements - confirm where it is sized.
253 | }
254 | // fread(mappings.data(), sizeof(MappingRecord),
255 | // num_mappings_to_load_on_current_rid, file); std::cerr << "Load
256 | // mappings\n";
257 | num_loaded_mappings_on_current_rid += num_mappings_to_load_on_current_rid;
258 | num_mappings = num_mappings_to_load_on_current_rid;  // A non-zero value here ends the while loop.
259 | } else {
260 | // Current ref seq is exhausted: move to the next rid.
261 | ++current_rid;
262 | if (current_rid < num_reference_sequences) {
263 | // std::cerr << "Load size\n";
264 | fread(&num_mappings_on_current_rid, sizeof(size_t), 1, file);  // Reads the mapping count stored for the new rid. NOTE(review): the fread result is ignored, so a short read goes undetected.
265 | // std::cerr << "Load size " << num_mappings_on_current_rid << "\n";
266 | num_loaded_mappings_on_current_rid = 0;
267 | } else {
268 | break;  // All reference sequences consumed; num_mappings is still 0.
269 | }
270 | }
271 | }
272 | current_mapping_index = 0;  // Runs on every exit path, including the break above.
273 | }
274 |
275 | } // namespace chromap
276 |
277 | #endif // TEMPMAPPING_H_
278 |
--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H_
2 | #define UTILS_H_
3 |
4 | #include
5 | #include
6 |
7 | #include
8 | #include
9 | #include
10 |
11 | #include "candidate.h"
12 | #include "khash.h"
13 | #include "minimizer.h"
14 | #include "strand.h"
15 |
16 | namespace chromap {
17 |
18 | struct uint128_t {  // Plain pair of 64-bit words (no arithmetic operators are defined here); used as the value type of the k128 hash map declared in this header.
19 | uint64_t first;
20 | uint64_t second;
21 | };
22 |
23 | struct BarcodeWithQual {  // A scored record with two (base index, base character) pairs; presumably a candidate barcode correction - confirm field meanings with callers.
24 | uint32_t corrected_base_index1;
25 | char correct_base1;
26 | uint32_t corrected_base_index2;
27 | char correct_base2;
28 | double score;
29 | bool operator>(const BarcodeWithQual &b) const {  // Lexicographic comparison: score first, then the remaining fields in declaration order as tie-breakers.
30 | return std::tie(score, corrected_base_index1, correct_base1,
31 | corrected_base_index2, correct_base2) >
32 | std::tie(b.score, b.corrected_base_index1, b.correct_base1,
33 | b.corrected_base_index2, b.correct_base2);
34 | }
35 | };
36 |
37 | struct _mm_history {  // Bundles a timestamp, a minimizer list, two candidate lists and a repetitive seed length. NOTE(review): the std::vector members show no element type in this listing - confirm against the original header.
38 | unsigned int timestamp = 0;  // The only member with a default member initializer.
39 | std::vector minimizers;
40 | std::vector positive_candidates;
41 | std::vector negative_candidates;
42 | uint32_t repetitive_seed_length;  // Has no default member initializer.
43 | };
44 |
45 | KHASH_MAP_INIT_INT64(k128, uint128_t);  // khash map type "k128": 64-bit integer key -> uint128_t value.
46 | KHASH_MAP_INIT_INT64(k64_seq, uint64_t);  // khash map type "k64_seq": 64-bit integer key -> uint64_t value.
47 | KHASH_SET_INIT_INT(k32_set);  // khash set type "k32_set": 32-bit integer keys, no values.
48 | KHASH_MAP_INIT_INT64(kmatrix, uint32_t);  // khash map type "kmatrix": 64-bit integer key -> uint32_t value.
49 |
50 | struct StackCell {  // Entry for an explicit stack; the field comments suggest a binary-tree traversal - confirm with the code that uses it.
51 | size_t x; // node
52 | int k, w; // k: level; w: 0 if left child hasn't been processed
53 | StackCell(){};  // Default constructor leaves x, k and w uninitialized.
54 | StackCell(int k_, size_t x_, int w_) : x(x_), k(k_), w(w_){};  // Argument order (k, x, w) differs from member order (x, k, w).
55 | };
56 |
57 | inline static double GetRealTime() {  // Returns the gettimeofday() clock reading in seconds, with the microsecond part as the fraction.
58 | struct timeval tp;
59 | struct timezone tzp;  // Passed to the call but never read afterwards.
60 | gettimeofday(&tp, &tzp);  // The return value is not checked.
61 | return tp.tv_sec + tp.tv_usec * 1e-6;
62 | }
63 |
64 | inline static double GetCPUTime() {  // Returns user + system CPU time reported by getrusage(RUSAGE_SELF), in seconds.
65 | struct rusage r;
66 | getrusage(RUSAGE_SELF, &r);  // The return value is not checked.
67 | return r.ru_utime.tv_sec + r.ru_stime.tv_sec +  // Whole seconds of user and system time ...
68 | 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);  // ... plus their microsecond parts converted to seconds.
69 | }
70 |
71 | inline static void ExitWithMessage(const std::string &message) {  // Writes `message` and a newline to std::cerr (flushed by std::endl), then terminates the process; never returns.
72 | std::cerr << message << std::endl;
73 | exit(-1);  // NOTE(review): POSIX parents typically observe only the low 8 bits of the status (255) - confirm if callers rely on the exact code.
74 | }
75 |
76 | inline static uint64_t Hash64(uint64_t key, const uint64_t mask) {  // Scrambles `key` with shift/add/xor steps; `mask` is applied after every additive step, including the last, so the result has no bits outside `mask`.
77 | key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
78 | key = key ^ key >> 24;  // '>>' binds tighter than '^': key ^ (key >> 24).
79 | key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
80 | key = key ^ key >> 14;
81 | key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
82 | key = key ^ key >> 28;
83 | key = (key + (key << 31)) & mask;
84 | return key;
85 | }
86 |
87 | static constexpr uint8_t char_to_uint8_table_[256] = {  // Byte value -> base code: 'A'/'a' = 0, 'C'/'c' = 1, 'G'/'g' = 2, 'T'/'t' = 3, every other byte = 4.
88 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
89 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
90 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2,
91 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
92 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4,
93 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
94 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
95 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
96 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
97 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
98 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
99 | static constexpr char uint8_to_char_table_[8] = {'A', 'C', 'G', 'T',  // Base code -> character: codes 0-3 give A/C/G/T, codes 4-7 all give 'N'.
100 | 'N', 'N', 'N', 'N'};
101 |
102 | inline static uint8_t CharToUint8(const char c) {  // Looks up the base code for `c` in char_to_uint8_table_: 0-3 for A/C/G/T in either case, 4 for any other byte.
103 | return char_to_uint8_table_[(uint8_t)c];  // The cast keeps the index within 0..255 even where plain char is signed.
104 | }
105 |
106 | inline static char Uint8ToChar(const uint8_t i) {  // Looks up the character for base code `i` in uint8_to_char_table_: A/C/G/T for 0-3, 'N' for 4-7.
107 | return uint8_to_char_table_[i];  // NOTE(review): no bounds check; the table has 8 entries, so callers must pass i < 8.
108 | }
109 |
110 | // Packs seed_length bases starting at start_position into an integer, 2 bits per base, earlier bases in higher bits. Make sure the length is not greater than 32 before calling this function (beyond 32 bases the earliest ones are shifted out of the 64-bit result).
111 | inline static uint64_t GenerateSeedFromSequence(const char *sequence,
112 | uint32_t sequence_length,
113 | uint32_t start_position,
114 | uint32_t seed_length) {
115 | uint64_t seed = 0;
116 | for (uint32_t i = 0; i < seed_length; ++i) {
117 | if (start_position + i < sequence_length) {  // Positions at or past sequence_length are never read; they are padded in the else branch below.
118 | uint8_t current_base = CharToUint8(sequence[i + start_position]);
119 | if (current_base < 4) { // not an ambiguous base
120 | seed = (seed << 2) | current_base; // forward k-mer
121 | } else {
122 | seed = seed << 2; // N->A
123 | }
124 | } else {
125 | seed = seed << 2; // Pad A
126 | }
127 | }
128 | return seed;  // 0 when seed_length is 0.
129 | }
130 |
131 | inline static uint64_t GenerateMinimizer(uint32_t sequence_index,  // Packs (sequence_index, sequence_position, strand) into a single 64-bit word.
132 | uint32_t sequence_position,
133 | const Strand strand) {
134 | const uint64_t minimizer =
135 | (((uint64_t)sequence_index) << 32 | sequence_position) << 1;  // Index in the upper 32 bits, position in the lower 32, then everything shifted up one bit to free bit 0; the top bit of sequence_index is shifted out.
136 | return minimizer | (strand == kPositive ? 0 : 1);  // Bit 0 is 0 for kPositive and 1 for any other strand value.
137 | }
138 |
139 | } // namespace chromap
140 |
141 | #endif // UTILS_H_
142 |
--------------------------------------------------------------------------------
/test/read1.fq:
--------------------------------------------------------------------------------
1 | @simulated.1/1
2 | ACTCGCACGATAATCGTCATACATGGACGCGTCATTTGTTGAATACATGGTTTAATTATGAATTGCTATGGAAATCGAGAGGCCCGCTCGGTCCCCTTTA
3 | +
4 | IIHIIHHHIIHHIIIIGHIGGGIIEIIFEHFHIDGIDEIIIDEFGEICEBHIIIFIIHIIIBHIBE>ID>D@IICGFIBIIIB;>C7IEIIBI5GAI>IB
5 | @simulated.2/1
6 | CGAAACTCAAAGGATCTAGTATTGTCCATACGGCGCACACTCCGTCGGGACAACGGCATCGATGCTTACGTTAGCACCAGTTGAAGCGTGATATGTATAT
7 | +
8 | IIIHHIHIIHIHIIIIGIFIHFHIHIIIFHHIIIIECIEHGIHBIIBIHIIDII=GIICB>IFHIGI@FFIGHIDIFEG>:G5>A5DI<@IIIFEI@HII
9 | @simulated.3/1
10 | TTGCTCTCGGCAGTCTGTTTGTGGTACTATGTGCCTAGCTAATGACCTGAGAGGGTTAAGCCTTTGGATCAAGTAACGGACATACGGGGAAGATGTGACA
11 | +
12 | HIHHHIHIIIIHIGIIHIGIGEIGIIIIHFIG?FIGIBC@IIECHIEIIIIIII@ICFIIGCCHFFCIIIID=AEIIIIAIIE9IA=DIIIC7CIIIE7I
13 | @simulated.4/1
14 | ACCCACATACCGAGCTCAGGTGTGTGTGTGGAGCTACACGTAGCCAACCGTTGAGGTATACCAGGATTTCATTAATCCGAGTAATCTTGCAATAGCGCGT
15 | +
16 | IHIIHIHIIHHHHHGIIIFIHFIFIFIICIIIBIHIIBDAGEIIDIAGIEGHGFIIDHEIDIIIIIIIE@IDAII:BIGFIIIGC@I?IIII?IBAIIII
17 | @simulated.5/1
18 | CCCATGTATGTAGACGTACAAGAGAAGAGCCTGTCATTGATACCCTTGTAACTTATGCTTAAGGCCACAGATAGTGTCACGTCGACACTTCTCCTATTAC
19 | +
20 | HHIIHIIIHGGHIIIIHHGFHIIFFEDIIGFIIIIEAIGCIIEIIIIDEIIFCFIIIEEGIFIEIGIID@IAAIIIIIIIGI;IIAIBIIIIEFIGGI?IGCIII?H
33 | @simulated.9/1
34 | CGGTTGGCCCGTCGCTTACCCTTCGTAGCCAACCCGGTGGGGCCTCATCTGCGTCCTCTATTCCACCTTGGACTATAAATTTCCTTCCATGCTAGTCACT
35 | +
36 | IHHIIIIIHHHIIFGFIIIIHIIFGEBIIIFGIIBCIIIFAIIIIAGCAIHEIIIIFDIBI@BI@II?IF=DID@IIIIFICIIIIG@IIFEGIB>IIIIIFCIEIII>FAIBIA:EIDCIIF;II45I=FFIFEIIIIB1;II
5 | @simulated.2/2
6 | CTTGGTAATGTCTGTCTGTCCTGTTCGCTCCTTTTGTCCTCTGGAAGTGGTTACTATAGAAGAGGCACATATGTAGAAGTTCAGGTTGGCCGGCATGTCC
7 | +
8 | HIIIHHIHIGHFGGIHIIIFFHIGIIEIIHIHCFIGHGBIHIIIGIAFIIEFIDEHDGHGIIIDCEIIIIFIHHIIIIIIA=IHEIIBIIIIFII=III6
9 | @simulated.3/2
10 | CTGTTCAAGCAGAGTCCTACCTAAGCGAAAGGTTCAGAAATGGGTGCTTAGAGTGACATAACGACATAATGAGCGCCGCTGGTCGCCCAAGGTGACGACG
11 | +
12 | IHIIHHIIGIIIGHEHGHIGIIFEIIHIHHHCCDGCIIIGHFIIIHEICHHIC@IHBIIBFIAIIB?IA=IEI;CIIBBHG=IIEACIHI;II8@III8I
13 | @simulated.4/2
14 | TCTTATATGGAATCTGGGACGTGCTGGTGGGAGCGAATCGCCCTCCACTTCACTAGGAGCACATGTGACTCCCAAACAGGGAGCTGAAGCCCACCTATGA
15 | +
16 | HHIIHIIIIHIGGGIHIHHIIHCHIGIICIIIEHIIGGIHIIDICEIICIIIFIIGIHI?IGIIHFII?II=IIIDIIIGFIBIGIHIB?D;IE;B:EII
17 | @simulated.5/2
18 | CGAGGTGCTCGGTTTCCCCACTGTCGAAGGTCCATTTGTGAATTTTCCATGATGGTACGTGGTAAATTCACTTCTTACGATAATCACCGTATTCCGTAAG
19 | +
20 | IIHHHHHHHIGIIIHHIIIIIIGIIGFFIHHEGGIIGGIIGIIBIIIFEBEHDIIH?IGEICIFDDIIAEIIIGIIIBFIIC85IFI=DI;?IAI@IIII
21 | @simulated.6/2
22 | TAGCTATACAAGTCCTACAAGTTAATATCCAAAAGCTGACAAGATTTGACATGAGCGGTCACTCCACCGCCACAGTTTCAGACACCACTGTCTATGCACA
23 | +
24 | HIHHHHHIHHFHIIIIEIIIHGIIGIIIGIEIHIFHGIICIHIEIFIEIIFAIICI>@IFB>IHIIHIIIIGC;E>HI:III@EICDGIICIHEI8IIDI
25 | @simulated.7/2
26 | GTCTATACGGTCACCGTGGACCTGCGTGTGAACACACCCGCTACTCTACAAGCACTCAACGTTTGAACTAGGCCCCCAAACGGGCGAATCTTGCGCTTTG
27 | +
28 | HHHHIIIHIHHIHIGHIGIFIIG@IIHEGEIIHIIIDICII@EIIHBH?DGIIEFB?IIIIIEIGIII@DHI@IIIAIIAIII@6;III;III?H=IAHH
29 | @simulated.8/2
30 | CCCTACACACGGTTGCAAATTCCGGCCATCTGTCTTAATAGGAGCCGTGTAGACGCATTAGATCCGAACTTTATGCTGGGTTATTCACTGCCTTAGTAAG
31 | +
32 | IHIIIIHIIFHIIIHIIIIIGDIIFHIHFHIGIGIIHIEDIHCII>IIGIBIFII?IICADIIIHEGI5=CIIF=IIII@;IIIAH3I88IIIIAII:II
33 | @simulated.9/2
34 | CCCCAGGCATCTCACCTGCTCTTACCGGTGAAGATCCAGTTCTGATACATTGGTATATGTGCGAGGTGACGACTGGCGCGATCCGGATCTTCTCATGCCC
35 | +
36 | IHIIIHHHIIIHGHIHIIIIGIIHIGIIIIHEICFIHCG:ICIIIIHIIGIHIII?IF=IF8A