├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── genes ├── cancer.hg19.csv ├── cancer.hg38.csv ├── druggable.hg19.csv └── druggable.hg38.csv ├── scripts └── make_fusion_genes.py ├── src ├── builtinfusion.h ├── cmdline.h ├── common.cpp ├── common.h ├── editdistance.cpp ├── editdistance.h ├── fastareader.cpp ├── fastareader.h ├── fastqreader.cpp ├── fastqreader.h ├── fusion.cpp ├── fusion.h ├── fusionmapper.cpp ├── fusionmapper.h ├── fusionresult.cpp ├── fusionresult.h ├── fusionscan.cpp ├── fusionscan.h ├── gene.cpp ├── gene.h ├── globalsettings.cpp ├── globalsettings.h ├── htmlreporter.cpp ├── htmlreporter.h ├── indexer.cpp ├── indexer.h ├── jsonreporter.cpp ├── jsonreporter.h ├── main.cpp ├── match.cpp ├── match.h ├── matcher.cpp ├── matcher.h ├── overlap.cpp ├── overlap.h ├── pescanner.cpp ├── pescanner.h ├── read.cpp ├── read.h ├── sequence.cpp ├── sequence.h ├── sescanner.cpp ├── sescanner.h ├── unittest.cpp ├── unittest.h ├── util.h └── zlib │ ├── crc32.h │ ├── deflate.h │ ├── gzguts.h │ ├── inffast.h │ ├── inffixed.h │ ├── inflate.h │ ├── inftrees.h │ ├── trees.h │ ├── zconf.h │ ├── zlib.h │ └── zutil.h └── testdata ├── R1.fq ├── R2.fq ├── cancer.csv └── fusions.csv /.gitignore: -------------------------------------------------------------------------------- 1 | report.html 2 | runtest.sh 3 | genefusion 4 | .DS_Store 5 | 6 | # Compiled Object files 7 | *.slo 8 | *.lo 9 | *.o 10 | *.obj 11 | 12 | # Precompiled Headers 13 | *.gch 14 | *.pch 15 | 16 | # Compiled Dynamic libraries 17 | *.so 18 | *.dylib 19 | *.dll 20 | 21 | # Fortran module files 22 | *.mod 23 | *.smod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 OpenGene - Open Source Genetics 
# Makefile for genefuse (GNU Make).
#
# Typical use:
#   make                 build ./genefuse from src/*.cpp
#   sudo make install    copy the binary to $(BINDIR)
#   make clean           remove objects and the binary
#
# Every variable below can be overridden on the command line,
# e.g. `make CC=clang++ BINDIR=$$HOME/bin`.

DIR_INC = ./inc
DIR_SRC = ./src
DIR_OBJ = ./obj
BINDIR = /usr/local/bin
# Staging root for packagers; empty means install straight into BINDIR.
DESTDIR =

# Expanded once (:=) so the source tree is globbed a single time per run.
SRC := $(wildcard ${DIR_SRC}/*.cpp)
OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC}))

TARGET = genefuse

BIN_TARGET = ${TARGET}

# NOTE: CC (not CXX) is kept as the C++ compiler variable so existing
# `make CC=...` invocations keep working.
CC = g++
CFLAGS = -std=c++11 -g -I${DIR_INC}

# Names that are commands, not files. Without this a stray file called
# `clean` or `install` would silently disable the target.
.PHONY: clean install make_obj_dir

# First rule in the file, so it stays the default goal.
${BIN_TARGET}: ${OBJ}
	$(CC) $(OBJ) -lz -lpthread -o $@

# The object directory is an order-only prerequisite (after `|`): it must
# exist, but its timestamp never forces a recompile. The previous normal
# prerequisite on `make_obj_dir` (a target that never produced a file) made
# every object look out of date on every run.
${DIR_OBJ}/%.o: ${DIR_SRC}/%.cpp | ${DIR_OBJ}
	$(CC) $(CFLAGS) -O3 -c $< -o $@

${DIR_OBJ}:
	mkdir -p $@

# Kept as an alias for anyone who invoked the old helper target directly.
make_obj_dir: | ${DIR_OBJ}

# $(RM) is `rm -f`, so cleaning an already-clean tree succeeds.
clean:
	$(RM) $(DIR_OBJ)/*.o $(BIN_TARGET)

install:
	install $(TARGET) $(DESTDIR)$(BINDIR)/$(TARGET)
	@echo "Installed."
fusion setting file, specified by `-f` or `--fusion=` 42 | * the fastq file(s), specified by `-1` or `--read1=` for single-end data. If dealing with paired-end data, specify the read2 file by `-2` or `--read2=` 43 | * use `-h` or `--html=` to specify the file name of HTML report 44 | * use `-j` or `--json=` to specify the file name of JSON report 45 | * the plain text result is directly printed to STDOUT, you can pipe it to a file using a `>` 46 | 47 | ## Example 48 | ```shell 49 | genefuse -r hg19.fasta -f genes/druggable.hg19.csv -1 genefuse.R1.fq.gz -2 genefuse.R2.fq.gz -h report.html > result 50 | ``` 51 | 52 | ## Reference genome 53 | The reference genome should be a single whole FASTA file containing all chromosome data. This file shouldn't be compressed. For human data, typically the `hg19/GRch37` or `hg38/GRch38` assembly is used, which can be downloaded from the following sites: 54 | * `hg19/GRch37`: https://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz 55 | * `hg38/GRch38`: https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.fa.gz 56 | Remember to decompress hg19.fa.gz/hg38.fa.gz, since gzipped reference files are not currently supported. 57 | 58 | ## Fusion file 59 | The fusion file is a list of coordinated target genes together with their exons. A sample is: 60 | ```CSV 61 | >EML4_ENST00000318522.5,chr2:42396490-42559688 62 | 1,42396490,42396776 63 | 2,42472645,42472827 64 | 3,42483641,42483770 65 | 4,42488261,42488434 66 | 5,42490318,42490446 67 | ... 68 | 69 | >ALK_ENST00000389048.3,chr2:29415640-30144432 70 | 1,30142859,30144432 71 | 2,29940444,29940563 72 | 3,29917716,29917880 73 | 4,29754781,29754982 74 | 5,29606598,29606725 75 | ... 76 | ``` 77 | The coordinate system should be consistent with the reference genome. 78 | ### Fusion files provided in this package 79 | Four fusion files are provided with `genefuse`: 80 | 1. `genes/druggable.hg19.csv`: all druggable fusion genes based on `hg19/GRch37` reference assembly. 81 | 2. 
`genes/druggable.hg38.csv`: all druggable fusion genes based on `hg38/GRch38` reference assembly. 82 | 3. `genes/cancer.hg19.csv`: all COSMIC curated fusion genes (http://cancer.sanger.ac.uk/cosmic/fusion) based on `hg19/GRch37` reference assembly. 83 | 4. `genes/cancer.hg38.csv`: all COSMIC curated fusion genes (http://cancer.sanger.ac.uk/cosmic/fusion) based on `hg38/GRch38` reference assembly. 84 | 85 | Notes: 86 | * `genefuse` runs much faster with `druggable` genes than `cancer` genes, since `druggable` genes are only a small subset of `cancer` genes. Use this one if you only care about the fusion related personalized medicine for cancers. 87 | * The `cancer` genes should be enough for most cancer related studies, since all COSMIC curated fusion genes are included. 88 | * If you want to create a custom gene list, please follow the instructions given in the next section. 89 | ### Create a fusion file based on hg19 or hg38 90 | If you'd like to create a custom fusion file, you can use `scripts/make_fusion_genes.py` 91 | As the script uses a `refFlat.txt` file to determine genomic coordinates of exons, you need to download a `refFlat.txt` file from UCSC Genome Browser in advance. Of course, the choice of using either hg19 or hg38 is up to you. 92 | 93 | - For hg19: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refFlat.txt.gz 94 | - For hg38: http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refFlat.txt.gz 95 | 96 | Please make sure to unzip the file to plain-text format before you continue. 97 | 98 | As for the input gene list file, all genes should be listed in separate lines. By default, the longest transcript will be used. However, you can specify a different transcript by adding the transcript ID to the end of a gene. The gene and its transcript should be separated by a tab or a space. Please note that each gene should be the HGNC official gene symbol, and each transcript should be NCBI RefSeq transcript ID. 
99 | 100 | An example of gene list file: 101 | 102 | ``` 103 | BRCA2 NM_000059 104 | FAM155A 105 | IRS2 106 | ``` 107 | 108 | When both input gene list file (`gene_list.txt`) and `refFlat.txt` file are prepared, you can use following command to generate a user-defined fusion file (`fusion.csv`): 109 | 110 | ```shell 111 | python3 scripts/make_fusion_genes.py gene_list.txt -r /path/to/refflat -o fusion.csv 112 | ``` 113 | 114 | # HTML report 115 | GeneFuse can generate very informative and interactive HTML pages to visualize the fusions with following information: 116 | * the fusion genes, along with their transcripts. 117 | * the inferred break point with reference genome coordinations. 118 | * the inferred fusion protein, with all exons and the transcription direction. 119 | * the supporting reads, with all bases colorized according to their quality scores. 120 | * the number of supporting reads, and how many of them are unique (the rest may be duplications) 121 | ## A HTML report example 122 | ![image](http://www.opengene.org/GeneFuse/eml4alk.png) 123 | See the HTML page of this picture: http://opengene.org/GeneFuse/report.html 124 | 125 | # All options 126 | ``` 127 | options: 128 | -1, --read1 read1 file name (string) 129 | -2, --read2 read2 file name (string [=]) 130 | -f, --fusion fusion file name, in CSV format (string) 131 | -r, --ref reference fasta file name (string) 132 | -u, --unique specify the least supporting read number is required to report a fusion, default is 2 (int [=2]) 133 | -d, --deletion specify the least deletion length of a intra-gene deletion to report, default is 50 (int [=50]) 134 | -h, --html file name to store HTML report, default is genefuse.html (string [=genefuse.html]) 135 | -j, --json file name to store JSON report, default is genefuse.json (string [=genefuse.json]) 136 | -t, --thread worker thread number, default is 4 (int [=4]) 137 | -?, --help print this message 138 | ``` 139 | 140 | # Cite GeneFuse 141 | If you used GeneFuse 
in you work, you can cite it as: 142 | 143 | Shifu Chen, Ming Liu, Tanxiao Huang, Wenting Liao, Mingyan Xu and Jia Gu. GeneFuse: detection and visualization of target gene fusions from DNA sequencing data. International Journal of Biological Sciences, 2018; 14(8): 843-848. doi: 10.7150/ijbs.24626 144 | -------------------------------------------------------------------------------- /genes/druggable.hg19.csv: -------------------------------------------------------------------------------- 1 | >EML4_ENST00000318522.5,chr2:42396490-42559688 2 | 1,42396490,42396776 3 | 2,42472645,42472827 4 | 3,42483641,42483770 5 | 4,42488261,42488434 6 | 5,42490318,42490446 7 | 6,42491846,42491871 8 | 7,42507990,42508113 9 | 8,42509963,42510112 10 | 9,42511774,42511843 11 | 10,42513409,42513519 12 | 11,42515367,42515462 13 | 12,42522265,42522399 14 | 13,42522521,42522656 15 | 14,42528381,42528532 16 | 15,42530244,42530369 17 | 16,42530455,42530586 18 | 17,42531624,42531691 19 | 18,42543102,42543190 20 | 19,42544567,42544664 21 | 20,42552607,42552694 22 | 21,42553294,42553392 23 | 22,42556026,42556156 24 | 23,42556874,42559688 25 | 26 | >ALK_ENST00000389048.3,chr2:29415640-30144432 27 | 1,30142859,30144432 28 | 2,29940444,29940563 29 | 3,29917716,29917880 30 | 4,29754781,29754982 31 | 5,29606598,29606725 32 | 6,29551216,29551347 33 | 7,29543617,29543748 34 | 8,29541170,29541270 35 | 9,29519754,29519923 36 | 10,29498268,29498362 37 | 11,29497965,29498093 38 | 12,29473971,29474133 39 | 13,29462546,29462696 40 | 14,29456431,29456562 41 | 15,29455170,29455314 42 | 16,29451750,29451932 43 | 17,29450440,29450538 44 | 18,29449788,29449940 45 | 19,29448327,29448431 46 | 20,29446208,29446394 47 | 21,29445383,29445473 48 | 22,29445210,29445274 49 | 23,29443572,29443701 50 | 24,29436850,29436947 51 | 25,29432652,29432744 52 | 26,29430037,29430138 53 | 27,29420408,29420542 54 | 28,29419636,29419726 55 | 29,29415640,29416788 56 | 57 | >KIF5B_ENST00000302418.4,chr10:32297938-32345359 58 | 
1,32344776,32345359 59 | 2,32337392,32337479 60 | 3,32329312,32329385 61 | 4,32328255,32328359 62 | 5,32327706,32327754 63 | 6,32327091,32327146 64 | 7,32326448,32326535 65 | 8,32326182,32326306 66 | 9,32324818,32324922 67 | 10,32324450,32324595 68 | 11,32323618,32323766 69 | 12,32322773,32322966 70 | 13,32321634,32321702 71 | 14,32320001,32320207 72 | 15,32317356,32317499 73 | 16,32311776,32311964 74 | 17,32311068,32311185 75 | 18,32310154,32310215 76 | 19,32309950,32310059 77 | 20,32308786,32308887 78 | 21,32307430,32307490 79 | 22,32307244,32307315 80 | 23,32306980,32307084 81 | 24,32306071,32306287 82 | 25,32304437,32304587 83 | 26,32297938,32300444 84 | 85 | >NPM1_ENST00000296930.5,chr5:170814120-170838141 86 | 1,170814652,170815010 87 | 2,170817055,170817134 88 | 3,170818309,170818428 89 | 4,170818710,170818803 90 | 5,170819714,170819820 91 | 6,170819918,170819982 92 | 7,170827157,170827214 93 | 8,170827843,170827929 94 | 9,170832306,170832407 95 | 10,170834704,170834778 96 | 11,170837531,170838141 97 | 98 | >CARS_ENST00000397111.5,chr11:3022152-3078843 99 | 1,3078573,3078843 100 | 2,3063395,3063486 101 | 3,3062126,3062214 102 | 4,3061065,3061161 103 | 5,3060424,3060522 104 | 6,3059280,3059429 105 | 7,3050533,3050673 106 | 8,3050226,3050314 107 | 9,3047906,3048027 108 | 10,3041450,3041562 109 | 11,3040369,3040497 110 | 12,3039850,3039979 111 | 13,3039638,3039741 112 | 14,3039087,3039184 113 | 15,3038336,3038525 114 | 16,3037011,3037079 115 | 17,3033425,3033506 116 | 18,3028109,3028189 117 | 19,3026596,3026663 118 | 20,3023771,3023830 119 | 21,3023200,3023283 120 | 22,3022152,3022478 121 | 122 | >CLTC_ENST00000269122.3,chr17:57697219-57773671 123 | 1,57697219,57697534 124 | 2,57721637,57721844 125 | 3,57724759,57725027 126 | 4,57725601,57725762 127 | 5,57728564,57728677 128 | 6,57733215,57733388 129 | 7,57737752,57737949 130 | 8,57738804,57739004 131 | 9,57741203,57741355 132 | 10,57742148,57742270 133 | 11,57743464,57743601 134 | 12,57743841,57744005 135 | 
13,57744158,57744338 136 | 14,57746138,57746301 137 | 15,57751008,57751133 138 | 16,57752062,57752204 139 | 17,57754315,57754549 140 | 18,57756758,57756880 141 | 19,57758273,57758418 142 | 20,57758656,57758839 143 | 21,57759008,57759200 144 | 22,57759632,57759789 145 | 23,57759990,57760154 146 | 24,57760268,57760375 147 | 25,57760456,57760623 148 | 26,57760748,57760897 149 | 27,57760986,57761117 150 | 28,57761236,57761346 151 | 29,57762417,57762587 152 | 30,57762948,57763169 153 | 31,57767997,57768072 154 | 32,57771089,57773671 155 | 156 | >RANBP2_ENST00000283195.6,chr2:109335937-109402267 157 | 1,109335937,109336134 158 | 2,109345588,109345655 159 | 3,109347230,109347341 160 | 4,109347778,109347930 161 | 5,109351988,109352218 162 | 6,109352560,109352705 163 | 7,109356945,109357137 164 | 8,109363167,109363254 165 | 9,109365376,109365585 166 | 10,109367720,109367901 167 | 11,109367984,109368159 168 | 12,109368327,109368450 169 | 13,109369454,109369615 170 | 14,109369882,109370019 171 | 15,109370281,109370427 172 | 16,109371361,109371540 173 | 17,109371632,109371715 174 | 18,109374869,109375004 175 | 19,109378557,109378651 176 | 20,109379693,109384844 177 | 21,109388157,109388327 178 | 22,109388945,109389037 179 | 23,109389324,109389502 180 | 24,109392188,109392392 181 | 25,109393586,109393687 182 | 26,109397725,109397885 183 | 27,109398584,109398857 184 | 28,109398984,109399318 185 | 29,109400052,109402267 186 | 187 | >ATIC_ENST00000236959.9,chr2:216176540-216214487 188 | 1,216176540,216176884 189 | 2,216177221,216177347 190 | 3,216182880,216182956 191 | 4,216184388,216184454 192 | 5,216189964,216190052 193 | 6,216190710,216190861 194 | 7,216191545,216191701 195 | 8,216197105,216197230 196 | 9,216198073,216198180 197 | 10,216199642,216199727 198 | 11,216200758,216200847 199 | 12,216203502,216203630 200 | 13,216209502,216209594 201 | 14,216211482,216211664 202 | 15,216213817,216213972 203 | 16,216214259,216214486 204 | 205 | 
>STRN_ENST00000263918.4,chr2:37070783-37193615 206 | 1,37193373,37193615 207 | 2,37152248,37152351 208 | 3,37143221,37143294 209 | 4,37132683,37132761 210 | 5,37129727,37129894 211 | 6,37126666,37126801 212 | 7,37121041,37121176 213 | 8,37113859,37113969 214 | 9,37111075,37111218 215 | 10,37105034,37105170 216 | 11,37096697,37096872 217 | 12,37094957,37095004 218 | 13,37088275,37088396 219 | 14,37084999,37085166 220 | 15,37082355,37082495 221 | 16,37078143,37078250 222 | 17,37076857,37076943 223 | 18,37070783,37076768 224 | 225 | >TPM4_ENST00000344824.6,chr19:16177831-16213813 226 | 1,16178317,16178548 227 | 2,16186857,16186982 228 | 3,16192723,16192856 229 | 4,16197233,16197350 230 | 5,16198837,16198907 231 | 6,16199855,16199930 232 | 7,16204346,16204408 233 | 8,16204494,16204563 234 | 9,16212074,16212600 235 | 236 | >CD74_ENST00000009530.7,chr5:149781200-149792492 237 | 1,149792188,149792314 238 | 2,149786715,149786887 239 | 3,149786444,149786523 240 | 4,149785822,149785884 241 | 5,149784648,149784743 242 | 6,149784243,149784330 243 | 7,149782684,149782875 244 | 8,149782126,149782188 245 | 9,149781715,149781813 246 | 247 | >ROS1_ENST00000368508.3,chr6:117609463-117747018 248 | 1,117746697,117747018 249 | 2,117739625,117739669 250 | 3,117737421,117737480 251 | 4,117730745,117730805 252 | 5,117725443,117725591 253 | 6,117724302,117724440 254 | 7,117718078,117718279 255 | 8,117717351,117717427 256 | 9,117715779,117715901 257 | 10,117715325,117715509 258 | 11,117714387,117714484 259 | 12,117710513,117711009 260 | 13,117708943,117709197 261 | 14,117708052,117708162 262 | 15,117706846,117707024 263 | 16,117704480,117704671 264 | 17,117700222,117700322 265 | 18,117687239,117687453 266 | 19,117686744,117686904 267 | 20,117686223,117686367 268 | 21,117683766,117684028 269 | 22,117681505,117681568 270 | 23,117680972,117681174 271 | 24,117678967,117679172 272 | 25,117677792,117678078 273 | 26,117674153,117674332 274 | 27,117665223,117665425 275 | 28,117663563,117663707 276 
| 29,117662563,117662795 277 | 30,117662298,117662474 278 | 31,117658335,117658503 279 | 32,117650492,117650609 280 | 33,117647387,117647577 281 | 34,117645495,117645578 282 | 35,117642422,117642557 283 | 36,117641031,117641193 284 | 37,117639351,117639415 285 | 38,117638306,117638435 286 | 39,117632183,117632280 287 | 40,117631244,117631444 288 | 41,117629957,117630091 289 | 42,117622137,117622300 290 | 43,117609463,117609965 291 | 292 | >SLC34A2_ENST00000382051.3,chr4:25656923-25680370 293 | 1,25657466,25657512 294 | 2,25664120,25664234 295 | 3,25664327,25664464 296 | 4,25665824,25665952 297 | 5,25667750,25667893 298 | 6,25669502,25669613 299 | 7,25671269,25671464 300 | 8,25672360,25672455 301 | 9,25673223,25673343 302 | 10,25674709,25674876 303 | 11,25675918,25676034 304 | 12,25676127,25676251 305 | 13,25677757,25680370 306 | 307 | >EZR_ENST00000367075.3,chr6:159186773-159240444 308 | 1,159240349,159240444 309 | 2,159239114,159239198 310 | 3,159210320,159210403 311 | 4,159208140,159208235 312 | 5,159206341,159206615 313 | 6,159205676,159205759 314 | 7,159204552,159204698 315 | 8,159197440,159197536 316 | 9,159192276,159192439 317 | 10,159191796,159191926 318 | 11,159190816,159190976 319 | 12,159190358,159190450 320 | 13,159188293,159188544 321 | 14,159186780,159188110 322 | 323 | >GOPC_ENST00000052569.6,chr6:117639374-117923691 324 | 1,117923167,117923691 325 | 2,117900063,117900227 326 | 3,117896340,117896515 327 | 4,117894630,117894795 328 | 5,117892023,117892118 329 | 6,117890735,117890899 330 | 7,117888017,117888197 331 | 8,117881432,117884547 332 | 333 | >SDC4_ENST00000372733.3,chr20:43953928-43977064 334 | 1,43976965,43977064 335 | 2,43964422,43964560 336 | 3,43961663,43961709 337 | 4,43959006,43959204 338 | 5,43953928,43956055 339 | 340 | >CCDC6_ENST00000263102.6,chr10:61548521-61666414 341 | 1,61665880,61666414 342 | 2,61612311,61612460 343 | 3,61592283,61592411 344 | 4,61574410,61574513 345 | 5,61572393,61572553 346 | 6,61566680,61566836 347 | 
7,61564178,61564278 348 | 8,61554231,61554355 349 | 9,61548521,61552869 350 | 351 | >RET_ENST00000355710.3,chr10:43572475-43625799 352 | 1,43572475,43572779 353 | 2,43595907,43596170 354 | 3,43597790,43598077 355 | 4,43600400,43600641 356 | 5,43601824,43602019 357 | 6,43604479,43604678 358 | 7,43606655,43606913 359 | 8,43607547,43607672 360 | 9,43608301,43608411 361 | 10,43609004,43609123 362 | 11,43609928,43610184 363 | 12,43612032,43612179 364 | 13,43613821,43613928 365 | 14,43614979,43615193 366 | 15,43615529,43615651 367 | 16,43617394,43617464 368 | 17,43619119,43619256 369 | 18,43620331,43620430 370 | 19,43622023,43622170 371 | 20,43623560,43625799 372 | 373 | >NCOA4_ENST00000374087.4,chr10:51565108-51590734 374 | 1,51565227,51565296 375 | 2,51579128,51579282 376 | 3,51580556,51580696 377 | 4,51580880,51580968 378 | 5,51581270,51581378 379 | 6,51582183,51582272 380 | 7,51582796,51582939 381 | 8,51584616,51585599 382 | 9,51586271,51586411 383 | 10,51589225,51590732 384 | 385 | >PRKAR1A_ENST00000589228.1,chr17:66507921-66547460 386 | 1,66508568,66508689 387 | 2,66511535,66511717 388 | 3,66518897,66519067 389 | 4,66519866,66519957 390 | 5,66520157,66520218 391 | 6,66521053,66521099 392 | 7,66521895,66522053 393 | 8,66523981,66524041 394 | 9,66525011,66525132 395 | 10,66526061,66526142 396 | 11,66526418,66529572 397 | 398 | >BCR_ENST00000305877.8,chr22:23521891-23660224 399 | 1,23522397,23524426 400 | 2,23595986,23596167 401 | 3,23603137,23603241 402 | 4,23603542,23603727 403 | 5,23610595,23610702 404 | 6,23613719,23613779 405 | 7,23615268,23615320 406 | 8,23615821,23615961 407 | 9,23626164,23626285 408 | 10,23627220,23627388 409 | 11,23629346,23629465 410 | 12,23630284,23630359 411 | 13,23631704,23631808 412 | 14,23632526,23632600 413 | 15,23634728,23634825 414 | 16,23637211,23637342 415 | 17,23651611,23651670 416 | 18,23652511,23652620 417 | 19,23653884,23654023 418 | 20,23655074,23655208 419 | 21,23656155,23656260 420 | 22,23656739,23656901 421 | 
23,23657620,23660224 422 | 423 | >ABL1_ENST00000318560.5,chr9:133589333-133763062 424 | 1,133710453,133710912 425 | 2,133729451,133729624 426 | 3,133730188,133730483 427 | 4,133738150,133738422 428 | 5,133747516,133747600 429 | 6,133748247,133748424 430 | 7,133750255,133750439 431 | 8,133753802,133753954 432 | 9,133755455,133755544 433 | 10,133755887,133756051 434 | 11,133759356,133763062 435 | 436 | >ETV6_ENST00000396373.4,chr12:11802788-12048336 437 | 1,11802788,11803094 438 | 2,11905384,11905513 439 | 3,11992074,11992238 440 | 4,12006361,12006495 441 | 5,12022358,12022903 442 | 6,12037379,12037521 443 | 7,12038860,12038960 444 | 8,12043875,12048336 445 | 446 | >NTRK3_ENST00000394480.2,chr15:88418230-88799999 447 | 1,88799875,88799978 448 | 2,88799515,88799717 449 | 3,88799137,88799399 450 | 4,88727456,88727530 451 | 5,88726649,88726720 452 | 6,88690566,88690634 453 | 7,88680635,88680792 454 | 8,88679698,88679840 455 | 9,88679130,88679271 456 | 10,88678332,88678628 457 | 11,88671942,88671965 458 | 12,88670393,88670457 459 | 13,88669502,88669604 460 | 14,88576088,88576276 461 | 15,88483854,88483984 462 | 16,88476243,88476415 463 | 17,88472422,88472665 464 | 18,88423501,88423659 465 | 19,88418230,88420351 466 | 467 | >TPM3_ENST00000368530.2,chr1:154127784-154167124 468 | 1,154164378,154164687 469 | 2,154163662,154163787 470 | 3,154148591,154148724 471 | 4,154145560,154145677 472 | 5,154145384,154145454 473 | 6,154143889,154143964 474 | 7,154143125,154143187 475 | 8,154142876,154142945 476 | 9,154141781,154141859 477 | 10,154139941,154140416 478 | 479 | >NTRK1_ENST00000368196.3,chr1:156785432-156851642 480 | 1,156830607,156830938 481 | 2,156834146,156834220 482 | 3,156834520,156834591 483 | 4,156836702,156836770 484 | 5,156837896,156838041 485 | 6,156838297,156838439 486 | 7,156841415,156841547 487 | 8,156843425,156843751 488 | 9,156844363,156844418 489 | 10,156844698,156844800 490 | 11,156845312,156845458 491 | 12,156845872,156846002 492 | 13,156846192,156846364 
493 | 14,156848914,156849154 494 | 15,156849791,156849949 495 | 16,156851249,156851642 496 | 497 | >LMNA_ENST00000368300.4,chr1:156052364-156109880 498 | 1,156084498,156085065 499 | 2,156100408,156100564 500 | 3,156104194,156104319 501 | 4,156104596,156104766 502 | 5,156104978,156105103 503 | 6,156105692,156105912 504 | 7,156106005,156106227 505 | 8,156106712,156106819 506 | 9,156106904,156107023 507 | 10,156107445,156107534 508 | 11,156108279,156108548 509 | 12,156108871,156109880 510 | 511 | >TPR_ENST00000367478.4,chr1:186280784-186344825 512 | 1,186344010,186344457 513 | 2,186342491,186342595 514 | 3,186340102,186340175 515 | 4,186337018,186337114 516 | 5,186332474,186332577 517 | 6,186331969,186332133 518 | 7,186331420,186331512 519 | 8,186330921,186331001 520 | 9,186330754,186330841 521 | 10,186329897,186330037 522 | 11,186329405,186329496 523 | 12,186328931,186329128 524 | 13,186327675,186327782 525 | 14,186326529,186326755 526 | 15,186325418,186325581 527 | 16,186324767,186324900 528 | 17,186324542,186324690 529 | 18,186322820,186322982 530 | 19,186321108,186321242 531 | 20,186320462,186320602 532 | 21,186319355,186319520 533 | 22,186316424,186316590 534 | 23,186315267,186315419 535 | 24,186314712,186314828 536 | 25,186313507,186313710 537 | 26,186313038,186313222 538 | 27,186312458,186312605 539 | 28,186310384,186310521 540 | 29,186310160,186310291 541 | 30,186308774,186308904 542 | 31,186307165,186307375 543 | 32,186306145,186306288 544 | 33,186305628,186305826 545 | 34,186304470,186304675 546 | 35,186304200,186304261 547 | 36,186303457,186303665 548 | 37,186302254,186302526 549 | 38,186301327,186301475 550 | 39,186300630,186300713 551 | 40,186296592,186296792 552 | 41,186295236,186295367 553 | 42,186294896,186294986 554 | 43,186292818,186293002 555 | 44,186291650,186291718 556 | 45,186291450,186291544 557 | 46,186289444,186289550 558 | 47,186287865,186287960 559 | 48,186287579,186287735 560 | 49,186286614,186286732 561 | 50,186283761,186283856 562 | 
"""
Generate a user-defined fusion file

Input file should be a text file containing
1. gene name and
2. transcript name (optional)

For example:
gene1 transcript1
gene2
gene3 transcript3
"""
__author__ = "Kai"
__date__ = "20/05/2020"

import argparse


def read_genelist(inputf):
    """Yield one entry per non-blank line of the gene list file.

    Each entry is the whitespace-split line, i.e. ``[gene]`` or
    ``[gene, transcript]``. Blank lines are skipped so a trailing newline or
    spacer line in the list cannot crash the run later on.
    """
    with open(inputf, "r") as fh:
        for line in fh:
            fields = line.split()
            if fields:
                yield fields


def _iter_refflat(refflat):
    """Yield ``(gene, transcript, record)`` for every row of a refFlat file.

    ``record`` is ``(chrom, strand, start, end, exonstart, exonend)``, all as
    strings. A row that does not have exactly 11 tab-separated columns raises
    ValueError from the unpacking, same as before.
    """
    with open(refflat, "r") as fh:
        for line in fh:
            (cur_gene, transcript, chrom, strand, start, end,
             _, _, _, exonstart, exonend) = line.rstrip("\n").split("\t")
            yield cur_gene, transcript, (chrom, strand, start, end, exonstart, exonend)


def make_fusion_gene(gene, fw, refflat):
    """Write one gene block of the fusion file to ``fw``.

    Args:
        gene: ``[symbol]`` or ``[symbol, transcript_id]`` as produced by
            :func:`read_genelist`.
        fw: writable text file object receiving the output.
        refflat: path to an uncompressed refFlat.txt file.

    Raises:
        ValueError: if ``gene`` does not have one or two fields, or if the
            gene (or gene/transcript pair) is not present in ``refflat``.
    """
    if len(gene) not in (1, 2):
        raise ValueError(
            f'Each gene list line must be "GENE" or "GENE TRANSCRIPT", got: {" ".join(gene)!r}')

    # no transcript specified --> use the longest transcript
    if len(gene) == 1:
        transcripts = {}
        for cur_gene, transcript, record in _iter_refflat(refflat):
            if cur_gene == gene[0]:
                transcripts[transcript] = record
        if not transcripts:
            raise ValueError(f'This gene symbol cannot be found in refFlat.txt: {gene[0]}')
        # Measure the span on the very record that will be written out, so the
        # chosen transcript and the emitted coordinates always agree (a second
        # scan keyed only by transcript ID could match rows of another locus).
        transcript = max(
            transcripts,
            key=lambda name: int(transcripts[name][3]) - int(transcripts[name][2]))
        chrom, strand, start, end, exonstart, exonend = transcripts[transcript]

    # use user-specified transcript
    else:
        for cur_gene, transcript, record in _iter_refflat(refflat):
            if gene[0] == cur_gene and gene[1] == transcript:
                chrom, strand, start, end, exonstart, exonend = record
                break
        else:
            raise ValueError(f'Wrong gene symobol or transcript maybe provided: {gene[0]}, {gene[1]}')

    # write to a file
    header = f">{gene[0]}_{transcript},{chrom}:{start}-{end}\n"
    fw.write(header)
    # refFlat exon lists normally end with a comma, which leaves one empty
    # trailing pair after split(); drop empties instead of blindly slicing off
    # the last element so a list without the trailing comma keeps its last exon.
    exons = [(s, e) for s, e in zip(exonstart.split(","), exonend.split(",")) if s and e]
    if strand == "-":
        # exons are numbered in transcription order, so reverse on minus strand
        exons = exons[::-1]
    for index, each_exon in enumerate(exons, start=1):
        fw.write(f'{index},{each_exon[0]},{each_exon[1]}\n')
    fw.write("\n")


def get_longest_transcript(transcripts, refflat):
    """Return the transcript ID from ``transcripts`` with the largest span.

    Scans ``refflat`` and compares ``end - start`` (columns 6 and 5) of every
    row whose transcript ID (column 2) is in ``transcripts``. Returns ``""``
    when no matching row has a positive span. Kept for callers that import
    it; :func:`make_fusion_gene` now selects the transcript from the records
    it has already collected.
    """
    longest_length = 0
    longest_transcript = ""
    with open(refflat, "r") as fh:
        for line in fh:
            line = line.strip().split()
            if line[1] in transcripts:
                length = int(line[5]) - int(line[4])
                if length > longest_length:
                    longest_length = length
                    longest_transcript = line[1]
    return longest_transcript


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input", help="Input filename")
    parser.add_argument("-r", "--refflat", required=True, help="Path to the refFlat.txt file, need to be downloaded from UCSC in advance")
    parser.add_argument("-o", "--output", required=True, help="The output filename")
    args = parser.parse_args()

    with open(args.output, "w") as fw:
        for gene in read_genelist(args.input):
            make_fusion_gene(gene, fw, args.refflat)
// Location of a sequence on the reference: a contig index plus an offset.
// The surrounding #pragma pack(2) / #pragma pack() pair limits member
// alignment to 2 bytes for this struct only, so no padding is inserted
// between `contig` and `position` (presumably 6 bytes per entry on targets
// with 2-byte short and 4-byte int -- TODO confirm on the build platforms).
#pragma pack(2)
// if contig is -1, means this is a dupe entry, and position will be the position in the dupList
struct GenePos{
    short contig;   // contig index, or -1 to mark a dupe entry (see above)
    int position;   // offset within the contig; index into the dupList when contig == -1
};
#pragma pack()
6 | // 7 | // Copyright (c) 2013 Hiroyuki Tanaka 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | // 11 | // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
14 | 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "editdistance.h" 27 | 28 | using namespace std; 29 | 30 | template 31 | unsigned int edit_distance_bpv(T &cmap, char const *vec, size_t const &vecsize, unsigned int const &tmax, unsigned int const &tlen) { 32 | int D = tmax * 64 + tlen; 33 | TVALUE D0, HP, HN, VP, VN; 34 | uint64_t top = (1L << (tlen - 1)); // applies to the last vector 35 | uint64_t lmb = (1L << 63); 36 | 37 | for(size_t i = 0; i <= tmax; ++i) { 38 | VP[i] = 0; 39 | VN[i] = 0; 40 | } 41 | for(size_t i = 0; i < tmax; ++i) VP[i] = ~0; 42 | for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1L << i); 43 | for(size_t i = 0; i < vecsize; ++i) { 44 | TVALUE &PM = cmap[vec[i]]; 45 | for(int r = 0; r <= tmax; ++r) { 46 | uint64_t X = PM[r]; 47 | if(r > 0 && (HN[r - 1] & lmb)) X |= 1L; 48 | D0[r] = (((X & VP[r]) + VP[r]) ^ VP[r]) | X | VN[r]; 49 | HP[r] = VN[r] | ~(D0[r] | VP[r]); 50 | HN[r] = D0[r] & VP[r]; 51 | X = (HP[r] << 1L); 52 | if(r == 0 || HP[r - 1] & lmb) X |= 1L; 53 | VP[r] = (HN[r] << 1L) | ~(D0[r] | X); 54 | if(r > 0 && (HN[r - 1] & lmb)) VP[r] |= 1L; 55 | VN[r] = D0[r] & X; 56 | } 57 | if(HP[tmax] & top) ++D; 58 | else if(HN[tmax] & top) --D; 59 | } 60 | return D; 61 | } 62 | 63 | 64 | /// c.f. http://handasse.blogspot.com/2009/04/c_29.html 65 | template 66 | unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) { 67 | vector< vector > d(size1 + 1, vector(size2 + 1)); 68 | for (int i = 0; i < size1 + 1; i++) d[i][0] = i; 69 | for (int i = 0; i < size2 + 1; i++) d[0][i] = i; 70 | for (int i = 1; i < size1 + 1; i++) { 71 | for (int j = 1; j < size2 + 1; j++) { 72 | d[i][j] = min(min(d[i-1][j], d[i][j-1]) + 1, d[i-1][j-1] + (str1[i-1] == str2[j-1] ? 
0 : 1)); 73 | } 74 | } 75 | return d[size1][size2]; 76 | } 77 | 78 | template 79 | struct varr { 80 | uint64_t arr_[N]; 81 | uint64_t & operator[](size_t const &i) { 82 | return arr_[i]; 83 | } 84 | }; 85 | 86 | 87 | template 88 | unsigned int edit_distance_map_(char const *a, size_t const asize, char const *b, size_t const bsize) { 89 | typedef map > cmap_v; 90 | cmap_v cmap; 91 | unsigned int tmax = (asize - 1) >> 6; 92 | unsigned int tlen = asize - tmax * 64; 93 | for(size_t i = 0; i < tmax; ++i) { 94 | for(size_t j = 0; j < 64; ++j) cmap[a[i * 64 + j]][i] |= (1L << j); 95 | } 96 | for(size_t i = 0; i < tlen; ++i) cmap[a[tmax * 64 + i]][tmax] |= (1L << i); 97 | return edit_distance_bpv(cmap, b, bsize, tmax, tlen); 98 | } 99 | 100 | unsigned int edit_distance(const char *a, const unsigned int asize, const char *b, const unsigned int bsize) { 101 | if(asize == 0) return bsize; 102 | else if(bsize == 0) return asize; 103 | char const *ap, *bp; 104 | unsigned int const *asizep, *bsizep; 105 | if(asize < bsize) ap = b, bp = a, asizep = &bsize, bsizep = &asize; 106 | else ap = a, bp = b, asizep = &asize, bsizep = &bsize; 107 | size_t vsize = ((*asizep - 1) >> 6) + 1; 108 | if(vsize > 10) { 109 | char const *_ = ap; 110 | unsigned int const *__ = asizep; 111 | ap = bp, bp = _, asizep = bsizep, bsizep = __; 112 | vsize = ((*asizep - 1) >> 6) + 1; 113 | } 114 | 115 | if(vsize == 1) return edit_distance_map_<1>(ap, *asizep, bp, *bsizep); 116 | else if(vsize == 2) return edit_distance_map_<2>(ap, *asizep, bp, *bsizep); 117 | else if(vsize == 3) return edit_distance_map_<3>(ap, *asizep, bp, *bsizep); 118 | else if(vsize == 4) return edit_distance_map_<4>(ap, *asizep, bp, *bsizep); 119 | else if(vsize == 5) return edit_distance_map_<5>(ap, *asizep, bp, *bsizep); 120 | else if(vsize == 6) return edit_distance_map_<6>(ap, *asizep, bp, *bsizep); 121 | else if(vsize == 7) return edit_distance_map_<7>(ap, *asizep, bp, *bsizep); 122 | else if(vsize == 8) return 
edit_distance_map_<8>(ap, *asizep, bp, *bsizep); 123 | else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep); 124 | else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep); 125 | return edit_distance_dp(ap, *asizep, bp, *bsizep); 126 | } 127 | 128 | unsigned int edit_distance(string a, string b) { 129 | return edit_distance(a.c_str(), a.length(), b.c_str(), b.length()); 130 | } 131 | 132 | bool editdistance_test(){ 133 | const char* str1[3] = { 134 | "CCTATCAGGGAGCTGTGGGCCAGCCAGGAGGCAGCACATGCCCAATCCCAGGCCCCTCCCGTTGTAAGTTCCCGTTCTACCCGACAGGGACCTGCTGACAAAAGACAGGGCTGGAGAGCCAGCCTGAAGGCCCTGGGACCCTTCTATCCAC", 135 | "ACTTATGTTTTTAAATGAGGATTATTGATAGTACTCTTGGTTTTTATACCATTCAGATCACTGAATTTATAAAGTACCCATCTAGTACTTCAAAAAGTAAAGTGTTCTGCCAGATCTTAGGTATAGAGGACCCTAACACAGTAAGATCGGA", 136 | "TAGGGGTATGAGTAGAGCTGAGCTGGGGGAAAAGAGGGAAATTCCCAGGGGTGGAGGAAGAGTCAAGTCCCCCTCTACACCTAGAGGATGAACTTAAGGAAGGAGTGAAGGTCATATGTGTTGTTCCTGAGGAAAAGGCCGCTGTAGAAAA", 137 | }; 138 | const char* str2[3] = { 139 | "CCTATCAGGGAGCTGTGGGCCAGCCAGGAGGCAGCACATGCCCAATCCCAGGCCCCTCCCGTTGTAAGTTCCCGTTCTACCCGACAGGGACCTGCTGACAAAAGACAGGGCTGGAGAGCCAGCCTGAAGGCCCTGGGACCCTTCTATCCAC", 140 | "ACTTATGTTTTTAAATGAGGATTATTGATAGTACTCTTGGTTTTTATACCATTCAGATCACTGAATTTATAAAGTACCCATCTAGTACTTGAAAAAGTAAAGTGTTCTGCCAGATCTTAGGTATAGAGGACCCTAACACAGTAAGATCGGA", 141 | "CCTGGGCCTGGCCCTTGTCTAAAACTGACTCTTTTGAGGGTGATTTTGGATGTTCTTAGTAGAGTCTCTCACCTGTACTTTCCTTGCCTAAGGTGCTGTCTTCTCTTGCAGGTTGCCTACACGTTCCTCACATGCCCTAAGAACCATGGGA", 142 | }; 143 | int result[3] = { 144 | 0, 145 | 1, 146 | 90, 147 | }; 148 | 149 | for(int i=0;i<3;i++){ 150 | int ret = 0; 151 | clock_t t1 = clock(); 152 | for (int p=0;p<100000;p++){ 153 | ret = edit_distance(str1[i], strlen(str1[i]), str2[i], strlen(str2[i])); 154 | } 155 | clock_t t2 = clock(); 156 | printf("test 100000 edit_distance, takes %lu ms\n", (t2-t1)/1000); 157 | if(ret != result[i]){ 158 | printf("Fail: (edit_distance), expect %d, but got %d: \n%s\n%s\n", result[i], ret, str1[i], 
str2[i]); 159 | return false; 160 | } 161 | } 162 | return true; 163 | } 164 | -------------------------------------------------------------------------------- /src/editdistance.h: -------------------------------------------------------------------------------- 1 | #ifndef ___EDITDISTANCE__H__ 2 | #define ___EDITDISTANCE__H__ 3 | 4 | #include 5 | #include 6 | 7 | // struct PatternMap { 8 | // uint64_t p_[256][4]; 9 | // unsigned int tmax_; 10 | // unsigned int tlen_; 11 | // }; 12 | 13 | using namespace std; 14 | 15 | unsigned int edit_distance(const char *a, const unsigned int asize, const char *b, const unsigned int bsize); 16 | // void create_patternmap(struct PatternMap *pm, const int64_t *a, const unsigned int size); 17 | // unsigned int edit_distance_by_patternmap(struct PatternMap *mp, const int64_t *b, const unsigned int size); 18 | 19 | unsigned int edit_distance(string a, string b); 20 | 21 | bool editdistance_test(); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/fastareader.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "fastareader.h" 3 | #include "util.h" 4 | #include 5 | 6 | FastaReader::FastaReader(string faFile, bool forceUpperCase) 7 | { 8 | // Set locale and disable stdio synchronization to improve iostream performance 9 | // http://www.drdobbs.com/the-standard-librarian-iostreams-and-std/184401305 10 | // http://stackoverflow.com/questions/5166263/how-to-get-iostream-to-perform-better 11 | setlocale(LC_ALL,"C"); 12 | ios_base::sync_with_stdio(false); 13 | 14 | mFastaFile = faFile; 15 | mForceUpperCase = forceUpperCase; 16 | if (is_directory(mFastaFile)) { 17 | string error_msg = "There is a problem with the provided fasta file: \'"; 18 | error_msg.append(mFastaFile); 19 | error_msg.append("\' is a directory NOT a file...\n"); 20 | throw invalid_argument(error_msg); 21 | } 22 | mFastaFileStream.open( mFastaFile.c_str(),ios::in); 23 
| // verify that the file can be read 24 | if (!mFastaFileStream.is_open()) { 25 | string msg = "There is a problem with the provided fasta file: could NOT read "; 26 | msg.append(mFastaFile.c_str()); 27 | msg.append("...\n"); 28 | throw invalid_argument(msg); 29 | } 30 | 31 | char c; 32 | // seek to first contig 33 | while (mFastaFileStream.get(c) && c != '>') { 34 | if (mFastaFileStream.eof()) { 35 | break; 36 | } 37 | } 38 | } 39 | 40 | FastaReader::~FastaReader() 41 | { 42 | if (mFastaFileStream.is_open()) { 43 | mFastaFileStream.close(); 44 | } 45 | } 46 | 47 | void FastaReader::readNext() 48 | { 49 | mCurrentID = ""; 50 | mCurrentDescription = ""; 51 | mCurrentSequence = ""; 52 | bool foundHeader = false; 53 | 54 | char c; 55 | stringstream ssSeq; 56 | stringstream ssHeader; 57 | while(true){ 58 | mFastaFileStream.get(c); 59 | if(c == '>' || mFastaFileStream.eof()) 60 | break; 61 | else { 62 | if (foundHeader){ 63 | if(mForceUpperCase && c>='a' && c<='z') { 64 | c -= ('a' - 'A'); 65 | } 66 | ssSeq << c; 67 | } 68 | else 69 | ssHeader << c; 70 | } 71 | 72 | string line = ""; 73 | getline(mFastaFileStream,line,'\n'); 74 | 75 | 76 | if(foundHeader == false) { 77 | ssHeader << line; 78 | foundHeader = true; 79 | } 80 | else { 81 | str_keep_valid_sequence(line, mForceUpperCase); 82 | ssSeq << line; 83 | } 84 | } 85 | mCurrentSequence = ssSeq.str(); 86 | string header = ssHeader.str(); 87 | 88 | int space = header.find(" "); 89 | mCurrentID = header.substr(0, space); 90 | } 91 | 92 | bool FastaReader::hasNext() { 93 | return !mFastaFileStream.eof(); 94 | } 95 | 96 | void FastaReader::readAll() { 97 | while(!mFastaFileStream.eof()){ 98 | readNext(); 99 | mAllContigs[mCurrentID] = mCurrentSequence; 100 | } 101 | } 102 | 103 | bool FastaReader::test(){ 104 | FastaReader reader("testdata/tinyref.fa"); 105 | reader.readAll(); 106 | 107 | string contig1 = "GATCACAGGTCTATCACCCTATTAATTGGTATTTTCGTCTGGGGGGTGTGGAGCCGGAGCACCCTATGTCGCAGT"; 108 | string contig2 = 
"GTCTGCACAGCCGCTTTCCACACAGAACCCCCCCCTCCCCCCGCTTCTGGCAAACCCCAAAAACAAAGAACCCTA"; 109 | 110 | if(reader.mAllContigs.count("contig1") == 0 || reader.mAllContigs.count("contig2") == 0 ) 111 | return false; 112 | 113 | if(reader.mAllContigs["contig1"] != contig1 || reader.mAllContigs["contig2"] != contig2 ) 114 | return false; 115 | 116 | return true; 117 | 118 | } 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /src/fastareader.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTA_READER_H 2 | #define FASTA_READER_H 3 | 4 | // includes 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | class FastaReader 17 | { 18 | public: 19 | FastaReader(string fastaFile, bool forceUpperCase = true); 20 | ~FastaReader(); 21 | bool hasNext(); 22 | void readNext(); 23 | void readAll(); 24 | 25 | inline string currentID() 26 | { 27 | return mCurrentID; 28 | } 29 | 30 | inline string currentDescription() 31 | { 32 | return mCurrentDescription; 33 | } 34 | 35 | inline string currentSequence() 36 | { 37 | return mCurrentSequence; 38 | } 39 | 40 | inline map& contigs() { 41 | return mAllContigs; 42 | } 43 | 44 | static bool test(); 45 | 46 | 47 | public: 48 | string mCurrentSequence; 49 | string mCurrentID ; 50 | string mCurrentDescription; 51 | map mAllContigs; 52 | 53 | private: 54 | bool readLine(); 55 | bool endOfLine(char c); 56 | void setFastaSequenceIdDescription(); 57 | 58 | private: 59 | string mFastaFile; 60 | ifstream mFastaFileStream; 61 | bool mForceUpperCase; 62 | }; 63 | 64 | 65 | #endif 66 | 67 | -------------------------------------------------------------------------------- /src/fastqreader.cpp: -------------------------------------------------------------------------------- 1 | #include "fastqreader.h" 2 | #include "util.h" 3 | #include 4 | 5 | 
FastqReader::FastqReader(string filename, bool hasQuality){ 6 | mFilename = filename; 7 | mZipFile = NULL; 8 | mZipped = false; 9 | mHasQuality = hasQuality; 10 | init(); 11 | } 12 | 13 | FastqReader::~FastqReader(){ 14 | close(); 15 | } 16 | 17 | void FastqReader::init(){ 18 | if (isZipFastq(mFilename)){ 19 | mZipFile = gzopen(mFilename.c_str(), "r"); 20 | mZipped = true; 21 | } 22 | else if (isFastq(mFilename)){ 23 | mFile.open(mFilename.c_str(), ifstream::in); 24 | mZipped = false; 25 | } else { 26 | cerr << "ERROR: the input file should be fastq (.fq, .fastq) or gzipped fastq (.fq.gz, .fastq.gz)" << endl; 27 | exit(-1); 28 | } 29 | } 30 | 31 | bool FastqReader::getLine(char* line, int maxLine){ 32 | bool status = true; 33 | if(mZipped) 34 | status = gzgets(mZipFile, line, maxLine); 35 | else { 36 | mFile.getline(line, maxLine); 37 | status = !mFile.fail(); 38 | } 39 | 40 | // trim \n, \r or \r\n in the tail 41 | int readed = strlen(line); 42 | if(readed >=2 ){ 43 | if(line[readed-1] == '\n' || line[readed-1] == '\r'){ 44 | line[readed-1] = '\0'; 45 | if(line[readed-2] == '\r') 46 | line[readed-2] = '\0'; 47 | } 48 | } 49 | 50 | return status; 51 | } 52 | 53 | Read* FastqReader::read(){ 54 | const int maxLine = 1000; 55 | char line[maxLine]; 56 | if (mZipped){ 57 | if (mZipFile == NULL) 58 | return NULL; 59 | } 60 | 61 | if(!getLine(line, maxLine))return NULL; 62 | string name(line); 63 | 64 | if (!getLine(line, maxLine))return NULL; 65 | string sequence(line); 66 | 67 | if (!getLine(line, maxLine))return NULL; 68 | string strand(line); 69 | 70 | if (mHasQuality){ 71 | if (!getLine(line, maxLine))return NULL; 72 | string quality(line); 73 | Read* read = new Read(name, sequence, strand, quality); 74 | return read; 75 | } 76 | else { 77 | Read* read = new Read(name, sequence, strand); 78 | return read; 79 | } 80 | 81 | return NULL; 82 | } 83 | 84 | void FastqReader::close(){ 85 | if (mZipped){ 86 | if (mZipFile){ 87 | gzclose(mZipFile); 88 | mZipFile = NULL; 89 | 
} 90 | } 91 | else { 92 | if (mFile.is_open()){ 93 | mFile.close(); 94 | } 95 | } 96 | } 97 | 98 | bool FastqReader::isZipFastq(string filename) { 99 | if (ends_with(filename, ".fastq.gz")) 100 | return true; 101 | else if (ends_with(filename, ".fq.gz")) 102 | return true; 103 | else if (ends_with(filename, ".fasta.gz")) 104 | return true; 105 | else if (ends_with(filename, ".fa.gz")) 106 | return true; 107 | else 108 | return false; 109 | } 110 | 111 | bool FastqReader::isFastq(string filename) { 112 | if (ends_with(filename, ".fastq")) 113 | return true; 114 | else if (ends_with(filename, ".fq")) 115 | return true; 116 | else if (ends_with(filename, ".fasta")) 117 | return true; 118 | else if (ends_with(filename, ".fa")) 119 | return true; 120 | else 121 | return false; 122 | } 123 | 124 | bool FastqReader::isZipped(){ 125 | return mZipped; 126 | } 127 | 128 | bool FastqReader::test(){ 129 | FastqReader reader1("testdata/R1.fq"); 130 | FastqReader reader2("testdata/R1.fq.gz"); 131 | Read* r1 = NULL; 132 | Read* r2 = NULL; 133 | while(true){ 134 | r1=reader1.read(); 135 | r2=reader2.read(); 136 | if(r1 == NULL || r2 == NULL) 137 | break; 138 | if(r1->mSeq.mStr != r2->mSeq.mStr){ 139 | return false; 140 | } 141 | delete r1; 142 | delete r2; 143 | } 144 | return true; 145 | } 146 | 147 | FastqReaderPair::FastqReaderPair(FastqReader* left, FastqReader* right){ 148 | mLeft = left; 149 | mRight = right; 150 | } 151 | 152 | FastqReaderPair::FastqReaderPair(string leftName, string rightName){ 153 | mLeft = new FastqReader(leftName); 154 | mRight = new FastqReader(rightName); 155 | } 156 | 157 | FastqReaderPair::~FastqReaderPair(){ 158 | if(mLeft){ 159 | delete mLeft; 160 | mLeft = NULL; 161 | } 162 | if(mRight){ 163 | delete mRight; 164 | mRight = NULL; 165 | } 166 | } 167 | 168 | ReadPair* FastqReaderPair::read(){ 169 | Read* l = mLeft->read(); 170 | Read* r = mRight->read(); 171 | if(!l || !r){ 172 | return NULL; 173 | } else { 174 | return new ReadPair(l, r); 175 | } 
176 | } 177 | -------------------------------------------------------------------------------- /src/fastqreader.h: -------------------------------------------------------------------------------- 1 | #ifndef FASTQ_READER_H 2 | #define FASTQ_READER_H 3 | 4 | #include 5 | #include 6 | #include "read.h" 7 | #include "zlib/zlib.h" 8 | #include "common.h" 9 | #include 10 | #include 11 | 12 | class FastqReader{ 13 | public: 14 | FastqReader(string filename, bool hasQuality = true); 15 | ~FastqReader(); 16 | bool isZipped(); 17 | 18 | //this function is not thread-safe 19 | //do not call read() of a same FastqReader object from different threads concurrently 20 | Read* read(); 21 | 22 | public: 23 | static bool isZipFastq(string filename); 24 | static bool isFastq(string filename); 25 | static bool test(); 26 | 27 | private: 28 | void init(); 29 | void close(); 30 | bool getLine(char* line, int maxLine); 31 | 32 | private: 33 | string mFilename; 34 | gzFile mZipFile; 35 | ifstream mFile; 36 | bool mZipped; 37 | bool mHasQuality; 38 | 39 | }; 40 | 41 | class FastqReaderPair{ 42 | public: 43 | FastqReaderPair(FastqReader* left, FastqReader* right); 44 | FastqReaderPair(string leftName, string rightName); 45 | ~FastqReaderPair(); 46 | ReadPair* read(); 47 | public: 48 | FastqReader* mLeft; 49 | FastqReader* mRight; 50 | }; 51 | 52 | #endif -------------------------------------------------------------------------------- /src/fusion.cpp: -------------------------------------------------------------------------------- 1 | #include "fusion.h" 2 | #include "editdistance.h" 3 | #include 4 | #include 5 | #include "util.h" 6 | #include 7 | #include "builtinfusion.h" 8 | #include 9 | #include "globalsettings.h" 10 | 11 | Fusion::Fusion(Gene gene){ 12 | mGene = gene; 13 | } 14 | 15 | vector Fusion::parseCsv(string filename) { 16 | ifstream file; 17 | file.open(filename.c_str(), ifstream::in); 18 | const int maxLine = 4096; 19 | char line[maxLine]; 20 | vector fusions; 21 | Gene 
workingGene; 22 | while(file.getline(line, maxLine)){ 23 | // trim \n, \r or \r\n in the tail 24 | int readed = strlen(line); 25 | if(readed >=2 ){ 26 | if(line[readed-1] == '\n' || line[readed-1] == '\r'){ 27 | line[readed-1] = '\0'; 28 | if(line[readed-2] == '\r') 29 | line[readed-2] = '\0'; 30 | } 31 | } 32 | string linestr(line); 33 | linestr = trim(linestr); 34 | vector splitted; 35 | split(linestr, splitted, ","); 36 | // wrong line 37 | if(splitted.size()<2) 38 | continue; 39 | // comment line 40 | if(starts_with(splitted[0], "#")) 41 | continue; 42 | // gene line 43 | if(starts_with(splitted[0], ">")){ 44 | if(workingGene.valid()){ 45 | Fusion fusion(workingGene); 46 | fusions.push_back(fusion); 47 | } 48 | workingGene = Gene::parse(linestr); 49 | continue; 50 | } 51 | // position line require id, start, position 52 | if(splitted.size()<3) 53 | continue; 54 | 55 | int id = atoi(trim(splitted[0]).c_str()); 56 | int start = atoi(trim(splitted[1]).c_str()); 57 | int end = atoi(trim(splitted[2]).c_str()); 58 | workingGene.addExon(id, start, end); 59 | } 60 | // last one 61 | if(workingGene.valid()){ 62 | Fusion fusion(workingGene); 63 | fusions.push_back(fusion); 64 | } 65 | return fusions; 66 | } 67 | 68 | void Fusion::print(){ 69 | mGene.print(); 70 | } 71 | 72 | void Fusion::printHtml(ofstream& file){ 73 | } 74 | 75 | string Fusion::pos2str(int pos) { 76 | return mGene.pos2str(pos); 77 | } 78 | 79 | bool Fusion::test() { 80 | vector fusions = Fusion::parseCsv("testdata/fusions.csv"); 81 | for(int i=0;i 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include "fastareader.h" 12 | #include "gene.h" 13 | 14 | 15 | using namespace std; 16 | 17 | class Fusion{ 18 | public: 19 | Fusion(Gene gene); 20 | 21 | static vector parseCsv(string filename); 22 | 23 | void print(); 24 | void printHtml(ofstream& file); 25 | static bool test(); 26 | bool isReversed() {return mGene.isReversed();} 27 | string pos2str(int pos); 28 | 29 
| public: 30 | Gene mGene; 31 | }; 32 | 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/fusionmapper.cpp: -------------------------------------------------------------------------------- 1 | #include "fusionmapper.h" 2 | #include "editdistance.h" 3 | #include 4 | #include 5 | #include "util.h" 6 | #include 7 | #include 8 | #include "globalsettings.h" 9 | #include "matcher.h" 10 | #include 11 | 12 | FusionMapper::FusionMapper(string refFile, string fusionFile){ 13 | mRefFile = refFile; 14 | fusionList = Fusion::parseCsv(fusionFile); 15 | init(); 16 | } 17 | 18 | FusionMapper::~FusionMapper(){ 19 | if(mIndexer != NULL){ 20 | delete mIndexer; 21 | mIndexer = NULL; 22 | } 23 | if(fusionMatches!=NULL) { 24 | //delete fusionMatches; 25 | //fusionMatches = NULL; 26 | } 27 | } 28 | 29 | 30 | void FusionMapper::init(){ 31 | mIndexer = new Indexer(mRefFile, fusionList); 32 | mIndexer->makeIndex(); 33 | 34 | mFusionMatchSize = fusionList.size() * fusionList.size(); 35 | 36 | fusionMatches = new vector[mFusionMatchSize]; 37 | for(int i=0;i(); 39 | } 40 | } 41 | 42 | FastaReader* FusionMapper::getRef() { 43 | if(mIndexer == NULL) 44 | return NULL; 45 | else 46 | return mIndexer->getRef(); 47 | } 48 | 49 | Match* FusionMapper::mapRead(Read* r, bool& mapable, int distanceReq, int qualReq) { 50 | vector mapping = mIndexer->mapRead(r); 51 | 52 | //we only focus on the reads that can be mapped to two genome positions 53 | if(mapping.size() < 2){ 54 | mapable = false; 55 | return NULL; 56 | } 57 | 58 | mapable = true; 59 | 60 | //if the left part of mapping result is reverse, use its reverse complement alternative and skip this one 61 | if(!mIndexer->inRequiredDirection(mapping)) { 62 | return NULL; 63 | } 64 | 65 | /*cout<mName<mSeq.mStr<::iterator iter; 69 | for(iter = mapping.begin(); iter!=mapping.end(); iter++){ 70 | iter->print(); 71 | cout << endl; 72 | } 73 | cout << endl;*/ 74 | 75 | // TODO: set int readBreak, int 
leftContig, int leftPos, int rightContig, int rightPos 76 | Match* m = makeMatch(r, mapping); 77 | return m; 78 | } 79 | 80 | Match* FusionMapper::makeMatch(Read* r, vector& mapping) { 81 | if(mapping.size()!=2) 82 | return NULL; 83 | SeqMatch left = mapping[0]; 84 | SeqMatch right = mapping[1]; 85 | if(left.seqStart > right.seqStart) { 86 | left = mapping[1]; 87 | right = mapping[0]; 88 | } 89 | int readBreak = (left.seqEnd + right.seqStart)/2; 90 | GenePos leftGP = left.startGP; 91 | GenePos rightGP = right.startGP; 92 | leftGP.position += readBreak; 93 | rightGP.position += readBreak+1; 94 | int gap = right.seqStart - left.seqEnd - 1; 95 | Match* match = new Match(r, readBreak, leftGP, rightGP, gap); 96 | 97 | calcDistance(match); 98 | 99 | return match; 100 | } 101 | 102 | void FusionMapper::calcDistance(Match* match) { 103 | string seq = match->mRead->mSeq.mStr; 104 | 105 | int readBreak = match->mReadBreak; 106 | int leftLen = readBreak+1; 107 | int rightLen = seq.length() - (readBreak+1); 108 | 109 | string leftSeq = seq.substr(0, leftLen); 110 | string rightSeq = seq.substr(readBreak+1, rightLen); 111 | 112 | //Gene& leftGene = fusionList[match->mLeftGP.contig].mGene; 113 | //Gene& rightGene = fusionList[match->mRightGP.contig].mGene; 114 | 115 | match->mLeftDistance = calcED(leftSeq, match->mLeftGP.contig, match->mLeftGP.position - leftLen + 1, match->mLeftGP.position); 116 | match->mRightDistance = calcED(rightSeq, match->mRightGP.contig, match->mRightGP.position, match->mRightGP.position + rightLen - 1); 117 | } 118 | 119 | 120 | int FusionMapper::calcED(string seq, int contig, int start, int end) { 121 | // check start and end are in same strand 122 | if( (start>=0 && end<=0) || (start<=0 && end>=0) ) { 123 | return -1; 124 | } 125 | 126 | string& fusionSeq = mIndexer->mFusionSeq[contig]; 127 | 128 | // check the overflow 129 | if(abs(start)>=fusionSeq.length() || abs(end)>=fusionSeq.length()) 130 | return -2; 131 | 132 | string str = seq; 133 | 
if(start < 0) { 134 | Sequence s(seq); 135 | Sequence rc = ~s; 136 | str = rc.mStr; 137 | 138 | int tmp = start; 139 | start = -end; 140 | end = -tmp; 141 | } 142 | 143 | string refstr = fusionSeq.substr(start, end-start+1); 144 | 145 | return edit_distance(str.c_str(), str.length(), refstr.c_str(), refstr.length()); 146 | } 147 | 148 | void FusionMapper::addMatch(Match* m) { 149 | int leftContig = m->mLeftGP.contig; 150 | int rightContig = m->mRightGP.contig; 151 | int index = fusionList.size() * rightContig + leftContig; 152 | fusionMatches[index].push_back(m); 153 | } 154 | 155 | void FusionMapper::filterMatches() { 156 | // calc the sequence number before any filtering 157 | int total = 0; 158 | for(int i=0; i=0; m--) { 174 | string seq = fusionMatches[i][m]->mRead->mSeq.mStr; 175 | int readBreak = fusionMatches[i][m]->mReadBreak; 176 | if( isLowComplexity(seq.substr(0, readBreak+1)) 177 | || isLowComplexity(seq.substr(readBreak+1, seq.length() - (readBreak+1) )) ) { 178 | delete fusionMatches[i][m]; 179 | fusionMatches[i].erase(fusionMatches[i].begin() + m); 180 | removed++; 181 | } 182 | } 183 | } 184 | loginfo( string("removeByComplexity: ") + string( int2str(removed ))); 185 | } 186 | 187 | bool FusionMapper::isLowComplexity(string str) { 188 | if(str.length() < 20) 189 | return true; 190 | 191 | if(dis_connected_count(str) < 7) 192 | return true; 193 | 194 | return false; 195 | } 196 | 197 | void FusionMapper::removeByDistance() { 198 | // diff should be less than DIFF_THRESHOLD 199 | const int DIFF_THRESHOLD = 5; 200 | int removed = 0; 201 | for(int i=0; i=0; m--) { 203 | if(fusionMatches[i][m]->mLeftDistance + fusionMatches[i][m]->mRightDistance >= DIFF_THRESHOLD) { 204 | delete fusionMatches[i][m]; 205 | fusionMatches[i].erase(fusionMatches[i].begin() + m); 206 | removed++; 207 | } 208 | } 209 | } 210 | loginfo( string("removeByDistance: ") + string( int2str(removed ))); 211 | } 212 | 213 | void FusionMapper::removeIndels() { 214 | // diff should be 
greater than INDEL_THRESHOLD 215 | int INDEL_THRESHOLD = GlobalSettings::deletionThreshold; 216 | int removed = 0; 217 | for(int i=0; i=0; m--) { 219 | if(fusionMatches[i][m]->mLeftGP.contig == fusionMatches[i][m]->mRightGP.contig 220 | && abs(fusionMatches[i][m]->mLeftGP.position - fusionMatches[i][m]->mRightGP.position) < INDEL_THRESHOLD) { 221 | delete fusionMatches[i][m]; 222 | fusionMatches[i].erase(fusionMatches[i].begin() + m); 223 | removed++; 224 | } 225 | } 226 | } 227 | loginfo( string("removeIndels: ") + string( int2str(removed ))); 228 | } 229 | 230 | void FusionMapper::removeAlignables() { 231 | FastaReader* ref = getRef(); 232 | if(ref == NULL) 233 | return ; 234 | 235 | vector seqs; 236 | 237 | // first pass to gather all sequences 238 | for(int i=0; igetRead()->mSeq); 241 | } 242 | } 243 | 244 | Matcher matcher(ref, seqs); 245 | 246 | int removed = 0; 247 | // second pass to remove alignable sequences 248 | for(int i=0; i=0; m--) { 250 | MatchResult* mr = matcher.match(fusionMatches[i][m]->getRead()->mSeq); 251 | if(mr != NULL) { 252 | //fusionMatches[i][m]->getRead()->mSeq.print(); 253 | //cout<print(); 255 | delete fusionMatches[i][m]; 256 | fusionMatches[i].erase(fusionMatches[i].begin() + m); 257 | removed++; 258 | } 259 | } 260 | } 261 | loginfo( string("removeAlignables: ") + string( int2str(removed ))); 262 | } 263 | 264 | void FusionMapper::sortMatches() { 265 | // sort the matches to make the pileup more clear 266 | for(int i=0;i=0; m--) 275 | delete fusionMatches[i][m]; 276 | fusionMatches[i].clear(); 277 | } 278 | } 279 | 280 | void FusionMapper::clusterMatches() { 281 | for(int i=0;i frs; 283 | for(int m=0 ;mmFusionSeq[frs[f].mLeftGP.contig], mIndexer->mFusionSeq[frs[f].mRightGP.contig]); 302 | frs[f].adjustFusionBreak(); 303 | frs[f].calcUnique(); 304 | frs[f].updateInfo(fusionList); 305 | if(frs[f].isQualified()) { 306 | if(!GlobalSettings::outputDeletions && frs[f].isDeletion()) 307 | continue; 308 | if(frs[f].isLeftProteinForward() 
!= frs[f].isRightProteinForward()) { 309 | if(!GlobalSettings::outputUntranslated) 310 | continue; 311 | } 312 | frs[f].print(fusionList); 313 | mFusionResults.push_back(frs[f]); 314 | } 315 | } 316 | } 317 | sortFusionResults(); 318 | loginfo("found " + int2str(mFusionResults.size()) + " fusions"); 319 | } 320 | 321 | void FusionMapper::sortFusionResults() { 322 | sort(mFusionResults.begin(), mFusionResults.end(), moreReads); 323 | } 324 | -------------------------------------------------------------------------------- /src/fusionmapper.h: -------------------------------------------------------------------------------- 1 | #ifndef FUSIONMAPPER_H 2 | #define FUSIONMAPPER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include "indexer.h" 12 | #include "match.h" 13 | #include "fusionresult.h" 14 | 15 | 16 | using namespace std; 17 | 18 | class FusionMapper{ 19 | public: 20 | FusionMapper(string refFile, string fusionFile); 21 | ~FusionMapper(); 22 | 23 | Match* mapRead(Read* r, bool& mapable, int distanceReq = 2, int qualReq=20); 24 | FastaReader* getRef(); 25 | void filterMatches(); 26 | void sortMatches(); 27 | void freeMatches(); 28 | void clusterMatches(); 29 | void addMatch(Match* m); 30 | void sortFusionResults(); 31 | inline static bool moreReads(const FusionResult r1, const FusionResult r2) 32 | { 33 | return r1.mUnique > r2.mUnique || (r1.mUnique == r2.mUnique && r1.mMatches.size() > r2.mMatches.size()); 34 | } 35 | 36 | private: 37 | void init(); 38 | Match* makeMatch(Read* r, vector& mapping); 39 | void calcDistance(Match* match); 40 | int calcED(string seq, int contig, int start, int end); 41 | 42 | void removeAlignables(); 43 | void removeByDistance(); 44 | void removeIndels(); 45 | void removeByComplexity(); 46 | bool isLowComplexity(string str); 47 | 48 | public: 49 | string mRefFile; 50 | int mFusionMatchSize; 51 | Indexer* mIndexer; 52 | vector fusionList; 53 | vector *fusionMatches; 
54 | vector mFusionResults; 55 | }; 56 | 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/fusionresult.h: -------------------------------------------------------------------------------- 1 | #ifndef FUSIONRESULT_H 2 | #define FUSIONRESULT_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "match.h" 11 | #include "fusion.h" 12 | 13 | 14 | using namespace std; 15 | 16 | class FusionResult{ 17 | public: 18 | FusionResult(); 19 | ~FusionResult(); 20 | 21 | void print(vector& fusions); 22 | void calcFusionPoint(); 23 | void calcUnique(); 24 | void updateInfo(vector& fusions); 25 | void makeReference(string& refL, string& refR); 26 | void adjustFusionBreak(); 27 | void addMatch(Match* m); 28 | bool support(Match* m); 29 | bool isDeletion(); 30 | bool canBeMapped(); 31 | bool canBeMatched(string& s1, string& s2); 32 | bool isQualified(); 33 | static bool supportSame(Match* m1, Match* m2); 34 | void printFusionProteinHTML(ofstream& file); 35 | bool isLeftProteinForward(); 36 | bool isRightProteinForward(); 37 | 38 | private: 39 | string getRefSeq(string& ref, int start, int end); 40 | int calcED(Match* m, int shift, int& leftED, int& rightED); 41 | void calcLeftExonIntronNumber(); 42 | void calcRightExonIntronNumber(); 43 | void printLeftProteinHTML(ofstream& file); 44 | void printRightProteinHTML(ofstream& file); 45 | void printExonIntronTD(ofstream& file, bool isExon, bool forward, int number, float percent, string style); 46 | 47 | public: 48 | GenePos mLeftGP; 49 | GenePos mRightGP; 50 | vector mMatches; 51 | int mUnique; 52 | string mTitle; 53 | string mLeftRef; 54 | string mRightRef; 55 | string mLeftRefExt; 56 | string mRightRefExt; 57 | string mLeftPos; 58 | string mRightPos; 59 | Gene mLeftGene; 60 | Gene mRightGene; 61 | bool mLeftIsExon; 62 | bool mRightIsExon; 63 | int mLeftExonOrIntronID; 64 | int mRightExonOrIntronID; 65 | float mLeftExonNum; 66 | float 
mLeftIntronNum; 67 | float mRightExonNum; 68 | float mRightIntronNum; 69 | }; 70 | 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/fusionscan.cpp: -------------------------------------------------------------------------------- 1 | #include "fusionscan.h" 2 | #include "fastqreader.h" 3 | #include 4 | #include "htmlreporter.h" 5 | #include "sescanner.h" 6 | #include "pescanner.h" 7 | #include "util.h" 8 | 9 | /* Stores the input and output file names and the worker thread count for a later call to scan(). */ FusionScan::FusionScan(string fusionFile, string refFile, string read1File, string read2File, string html, string json, int threadNum){ 10 | mRead1File = read1File; 11 | mRead2File = read2File; 12 | mFusionFile = fusionFile; 13 | mRefFile = refFile; 14 | mHtmlFile = html; 15 | mJsonFile = json; 16 | mThreadNum = threadNum; 17 | } 18 | 19 | /* Runs a PairEndScanner when a read2 file name was given, otherwise a SingleEndScanner, and returns that scanner's scan() result. */ bool FusionScan::scan(){ 20 | /* NOTE(review): fusions is not referenced again in this function; the scanners below are handed mFusionFile itself - confirm whether this parse is still needed */ vector fusions = Fusion::parseCsv(mFusionFile); 21 | if(mRead2File != ""){ 22 | PairEndScanner pescanner( mFusionFile, mRefFile, mRead1File, mRead2File, mHtmlFile, mJsonFile, mThreadNum); 23 | return pescanner.scan(); 24 | } 25 | else{ 26 | SingleEndScanner sescanner( mFusionFile, mRefFile, mRead1File, mHtmlFile, mJsonFile, mThreadNum); 27 | return sescanner.scan(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/fusionscan.h: -------------------------------------------------------------------------------- 1 | #ifndef FUSION_SCAN_H 2 | #define FUSION_SCAN_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include "fusion.h" 9 | #include "indexer.h" 10 | 11 | using namespace std; 12 | 13 | class FusionScan{ 14 | public: 15 | FusionScan(string fusionFile, string refFile, string read1File, string read2File, string html, string json, int threadNum); 16 | bool scan(); 17 | 18 | private: 19 | string mFusionFile; 20 | string mRead1File; 21 | string mRead2File; 22 | string mHtmlFile; 23 | string mJsonFile; 24 | string mRefFile; 25 | int mThreadNum; 26 | }; 27 | 28 | 29 
| #endif -------------------------------------------------------------------------------- /src/gene.cpp: -------------------------------------------------------------------------------- 1 | #include "gene.h" 2 | #include "editdistance.h" 3 | #include 4 | #include 5 | #include "util.h" 6 | #include 7 | #include 8 | #include "globalsettings.h" 9 | 10 | Gene::Gene(string name, string chr, int start, int end){ 11 | mName = name; 12 | mChr = chr; 13 | mStart = start; 14 | mEnd = end; 15 | mReversed = false; 16 | } 17 | 18 | Gene::Gene(const Gene& other) { 19 | mName = other.mName; 20 | mChr = other.mChr; 21 | mStart = other.mStart; 22 | mEnd = other.mEnd; 23 | mExons = other.mExons; 24 | mReversed = other.mReversed; 25 | } 26 | 27 | Gene::Gene() { 28 | mName = "invalid"; 29 | mChr = "invalid"; 30 | mStart = 0; 31 | mEnd = 0; 32 | mReversed = false; 33 | } 34 | 35 | bool Gene::valid() { 36 | return mName != "invalid" && mStart != 0 && mEnd != 0; 37 | } 38 | 39 | void Gene::addExon(Exon exon) { 40 | mExons.push_back(exon); 41 | if(mExons.size()>1) { 42 | if(mExons[0].start > mExons[1].start) { 43 | mReversed = true; 44 | } 45 | } 46 | } 47 | 48 | void Gene::addExon(int id, int start, int end) { 49 | Exon exon; 50 | exon.id=id; 51 | exon.start=start; 52 | exon.end=end; 53 | addExon(exon); 54 | } 55 | 56 | void Gene::print() { 57 | cout< splitted; 66 | split(linestr, splitted, ","); 67 | if(splitted.size()<2) 68 | return Gene(); 69 | string name = trim(splitted[0].substr(1, splitted[0].length()-1)); 70 | 71 | vector chrPos; 72 | split(splitted[1], chrPos, ":"); 73 | if(chrPos.size()<2) 74 | return Gene(); 75 | string chr = trim(chrPos[0]); 76 | 77 | vector range; 78 | split(chrPos[1], range, "-"); 79 | if(range.size()<2) 80 | return Gene(); 81 | 82 | int start = atoi(trim(range[0]).c_str()); 83 | int end = atoi(trim(range[1]).c_str()); 84 | 85 | return Gene(name, chr, start, end); 86 | 87 | } 88 | 89 | int Gene::genePos2ChrPos(int genepos) { 90 | int chrpos = abs(genepos) + 
mStart; 91 | if(genepos <0) 92 | chrpos *= -1; 93 | return chrpos; 94 | } 95 | 96 | string Gene::pos2str(int pos) { 97 | int pp = abs(pos) + mStart; 98 | stringstream ss; 99 | ss<= mExons[i].start && pp <= mExons[i].end) { 102 | ss<<"exon:"<0) { 106 | if(mReversed) { 107 | if(mExons[i].end < pp && pp < mExons[i-1].start){ 108 | ss<<"intron:"<<(mExons[i].id-1)<<"|"; 109 | break; 110 | } 111 | } else { 112 | if(mExons[i-1].end < pp && pp < mExons[i].start){ 113 | ss<<"intron:"<<(mExons[i].id-1)<<"|"; 114 | break; 115 | } 116 | } 117 | } 118 | } 119 | if(pos>=0) 120 | ss<<"+"; 121 | else 122 | ss<<"-"; 123 | ss<= mExons[i].start && pp <= mExons[i].end) { 134 | isExon = true; 135 | number = mExons[i].id; 136 | break; 137 | } 138 | if(i>0) { 139 | if(mReversed) { 140 | if(mExons[i].end < pp && pp < mExons[i-1].start){ 141 | isExon = false; 142 | number = (mExons[i].id-1); 143 | break; 144 | } 145 | } else { 146 | if(mExons[i-1].end < pp && pp < mExons[i].start){ 147 | isExon = false; 148 | number = (mExons[i].id-1); 149 | break; 150 | } 151 | } 152 | } 153 | } 154 | } -------------------------------------------------------------------------------- /src/gene.h: -------------------------------------------------------------------------------- 1 | #ifndef GENE_H 2 | #define GENE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include "fastareader.h" 12 | 13 | using namespace std; 14 | 15 | struct Exon{ 16 | int id; 17 | int start; 18 | int end; 19 | }; 20 | 21 | class Gene{ 22 | public: 23 | Gene(string name, string chr, int start, int end); 24 | Gene(const Gene& other); 25 | // WAR to make a default constructor 26 | Gene(); 27 | bool valid(); 28 | bool isReversed() { return mReversed; } 29 | void addExon(Exon exon); 30 | void addExon(int id, int start, int end); 31 | void print(); 32 | static Gene parse(string linestr); 33 | string pos2str(int pos); 34 | int genePos2ChrPos(int genepos); 35 | void 
getExonIntron(int pos, bool& isExon, int& number); 36 | 37 | public: 38 | string mName; 39 | string mChr; 40 | int mStart; 41 | int mEnd; 42 | vector mExons; 43 | bool mReversed; 44 | }; 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/globalsettings.cpp: -------------------------------------------------------------------------------- 1 | #include "globalsettings.h" 2 | 3 | bool GlobalSettings::markedOnlyForVCF = false; 4 | int GlobalSettings::uniqueRequirement = 2; 5 | int GlobalSettings::deletionThreshold = 50; 6 | bool GlobalSettings::outputDeletions = false; 7 | bool GlobalSettings::outputUntranslated = false; 8 | int GlobalSettings::skipKeyDupThreshold = 5; 9 | int GlobalSettings::majorGeneKeyRequirement = 40; 10 | int GlobalSettings::minorGeneKeyRequirement = 20; 11 | int GlobalSettings::mismatchThreshold = 10; -------------------------------------------------------------------------------- /src/globalsettings.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBALSETTINGS_H 2 | #define GLOBALSETTINGS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | class GlobalSettings{ 11 | public: 12 | GlobalSettings(); 13 | 14 | public: 15 | inline static void setMarkedOnlyForVCF(bool flag){ 16 | markedOnlyForVCF = flag; 17 | } 18 | inline static void setUniqueRequirement(int val){ 19 | uniqueRequirement = val; 20 | } 21 | inline static void setDeletionThreshold(int val){ 22 | deletionThreshold = val; 23 | } 24 | inline static void setOutputDeletions(bool flag){ 25 | outputDeletions = flag; 26 | } 27 | inline static void setOutputUntranslated(bool flag){ 28 | outputUntranslated = flag; 29 | } 30 | 31 | public: 32 | static bool markedOnlyForVCF; 33 | static int uniqueRequirement; 34 | static int deletionThreshold; 35 | static bool outputDeletions; 36 | static bool outputUntranslated; 37 | static int skipKeyDupThreshold; 38 | static int 
majorGeneKeyRequirement; 39 | static int minorGeneKeyRequirement; 40 | static int mismatchThreshold; 41 | }; 42 | 43 | 44 | #endif -------------------------------------------------------------------------------- /src/htmlreporter.cpp: -------------------------------------------------------------------------------- 1 | #include "htmlreporter.h" 2 | #include "common.h" 3 | #include 4 | #include "globalsettings.h" 5 | 6 | /* Returns the current local time formatted as "YYYY-MM-DD HH:MM:SS". If localtime() fails the zero-initialised buffer is returned, i.e. an empty string. snprintf bounds the write to the size of the buffer. */ const std::string getCurrentSystemTime() 7 | { 8 | auto tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); 9 | struct tm* ptm = localtime(&tt); 10 | char date[60] = {0}; 11 | if(ptm) snprintf(date, sizeof(date), "%d-%02d-%02d %02d:%02d:%02d", 12 | (int)ptm->tm_year + 1900,(int)ptm->tm_mon + 1,(int)ptm->tm_mday, 13 | (int)ptm->tm_hour,(int)ptm->tm_min,(int)ptm->tm_sec); 14 | return std::string(date); 15 | } 16 | 17 | HtmlReporter::HtmlReporter(string filename, FusionMapper* mapper){ 18 | mFusionMapper = mapper; 19 | mFusionResults = mapper->mFusionResults; 20 | mFilename = filename; 21 | mFile.open(mFilename.c_str(), ifstream::out); 22 | } 23 | 24 | HtmlReporter::~HtmlReporter(){ 25 | mFile.close(); 26 | } 27 | 28 | void HtmlReporter::run() { 29 | printHeader(); 30 | printHelper(); 31 | printFusions(); 32 | printFooter(); 33 | } 34 | 35 | void HtmlReporter::printHelper() { 36 | mFile << "

Helpful tips:

    "; 37 | mFile << "
  • Base color indicates quality: extremely high (Q40+), high (Q30~Q39) , moderate (Q20~Q29), low (Q15~Q19), extremely low (0~Q14).
  • "; 38 | mFile << "
  • Move mouse over the base, it will show the quality value
  • "; 39 | mFile << "
  • Click on any row, the original read/pair will be displayed
  • "; 40 | mFile << "
  • For pair-end sequencing, GeneFuse tries to merge each pair, with overlapped assigned higher qualities
  • "; 41 | mFile << "

Columns:

    "; 42 | mFile << "
  • col1: is fusion mapped with original read? → means original read, ← means reverse complement
  • "; 43 | mFile << "
  • col2: edit distance (ed) between read and reference sequence (left_part_ed | right_part_ed)
  • "; 44 | mFile << "
  • col3: read's left part after fusion break
  • "; 45 | mFile << "
  • col4: read's right part after fusion break
  • "; 46 | mFile << "
"; 47 | } 48 | 49 | void HtmlReporter::printFusions() { 50 | // calculate the found fusion 51 | int found = mFusionResults.size(); 52 | // print menu 53 | mFile<<""; 64 | id=0; 65 | for(int i=0;i matches = fusion.mMatches; 80 | mFile << "
"; 81 | mFile << ""; 84 | mFile << "
Inferred protein"; 85 | if(fusion.isLeftProteinForward() != fusion.isRightProteinForward()) { 86 | mFile << " (transcription direction conflicts, this fusion may be not transcribed) "; 87 | } 88 | mFile << ":
"; 89 | fusion.printFusionProteinHTML(mFile); 90 | mFile << "
Supporting reads:
"; 91 | mFile << ""; 92 | mFile << ""; 93 | mFile << ""; 94 | mFile << ""; 95 | mFile << ""; 96 | mFile << ""; 97 | mFile << ""; 98 | mFile << ""; 99 | mFile << ""; 100 | for(int m=0; m"; 103 | mFile << ""; 115 | // print a hidden row containing the full read 116 | mFile << ""; 117 | mFile << ""; 120 | mFile << ""; 121 | } 122 | mFile << "
" << fusion.mLeftPos << " = = " << fusion.mRightPos << "
" << fusion.mLeftRef << "" << fusion.mRightRef << "
"; 104 | mFile<<""; 105 | // for display alignment 106 | if(m+1<10) 107 | mFile<<"0"; 108 | if(m+1<100) 109 | mFile<<"0"; 110 | if(m+1<1000) 111 | mFile<<"0"; 112 | mFile << m+1; 113 | matches[m]->printHtmlTD(mFile); 114 | mFile << "
"; 123 | } 124 | 125 | void HtmlReporter::printHeader(){ 126 | mFile << ""; 127 | mFile << "GeneFuse " << FUSIONSCAN_VER << ", at " << getCurrentSystemTime() << ""; 128 | printJS(); 129 | printCSS(); 130 | mFile << ""; 131 | mFile << "
"; 132 | mFile << "
GeneFuse " << FUSIONSCAN_VER << "
"; 133 | } 134 | 135 | void HtmlReporter::printCSS(){ 136 | mFile << ""; 165 | } 166 | 167 | void HtmlReporter::printJS(){ 168 | mFile << ""; 192 | } 193 | 194 | extern string command; 195 | 196 | void HtmlReporter::printFooter(){ 197 | mFile << ""; 201 | mFile << "
"; 202 | } 203 | 204 | void HtmlReporter::printScanTargets(){/* 205 | mFile << "
"; 206 | mFile << "

scanned " << mFusionResults.size() << " fusion spots...

"; 207 | mFile << "
"; 214 | */ 215 | } -------------------------------------------------------------------------------- /src/htmlreporter.h: -------------------------------------------------------------------------------- 1 | #ifndef HTML_REPORTER_H 2 | #define HTML_REPORTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include "fusion.h" 9 | #include "match.h" 10 | #include 11 | #include 12 | #include "fusionmapper.h" 13 | 14 | using namespace std; 15 | 16 | class HtmlReporter{ 17 | public: 18 | HtmlReporter(string filename, FusionMapper* mapper); 19 | ~HtmlReporter(); 20 | void run(); 21 | 22 | private: 23 | void printHeader(); 24 | void printCSS(); 25 | void printJS(); 26 | void printFooter(); 27 | void printHelper(); 28 | void printFusions(); 29 | void printFusion(int id, FusionResult& fusion); 30 | void printScanTargets(); 31 | 32 | private: 33 | string mFilename; 34 | FusionMapper* mFusionMapper; 35 | ofstream mFile; 36 | vector mFusionResults; 37 | }; 38 | 39 | 40 | #endif -------------------------------------------------------------------------------- /src/indexer.h: -------------------------------------------------------------------------------- 1 | #ifndef INDEXER_H 2 | #define INDEXER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include "fusion.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "common.h" 17 | 18 | 19 | using namespace std; 20 | const unsigned char MATCH_TOP = 3; 21 | const unsigned char MATCH_SECOND = 2; 22 | const unsigned char MATCH_NONE = 1; 23 | const unsigned char MATCH_UNKNOWN = 0; 24 | 25 | #pragma pack() 26 | 27 | struct SeqMatch{ 28 | int seqStart; 29 | int seqEnd; 30 | GenePos startGP; 31 | inline void print() { 32 | cout << seqStart << "-" << seqEnd << "|"; 33 | cout << startGP.contig << ":" << startGP.position; 34 | } 35 | }; 36 | 37 | class Indexer{ 38 | public: 39 | Indexer(string refFile, vector& fusions); 40 | ~Indexer(); 41 
| void makeIndex(); 42 | void indexContig(int ctg, string seq, int start); 43 | void printStat(); 44 | FastaReader* getRef(); 45 | 46 | static inline long makeKmer(string & seq, int pos, long lastKmer, int step = 1); 47 | static inline long gp2long(const GenePos& gp); 48 | static inline GenePos shift(const GenePos& gp, int i); 49 | static inline GenePos long2gp(const long val); 50 | 51 | // map the read onto the reference 52 | // return a map, with key is the first-base-mapped GenePos encoded in long, value is the count of this GenePos 53 | // GenePos encoded with 0 means not mapped 54 | vector mapRead(Read* r); 55 | // check a fusion of two mapping mates is: forward at left 56 | bool inRequiredDirection(vector& mapping); 57 | 58 | static bool test(); 59 | 60 | 61 | private: 62 | void makeMask(unsigned char* mask, unsigned char flag, int seqlen, int start, int kmerSize); 63 | vector segmentMask(unsigned char* mask, int seqlen, GenePos gp1, GenePos gp2); 64 | void fillBloomFilter(); 65 | 66 | public: 67 | unordered_map mKmerPos; 68 | unsigned char* mBloomFilter; 69 | vector> mDupeList; 70 | vector mFusionSeq; 71 | 72 | 73 | 74 | private: 75 | string mRefFile; 76 | FastaReader* mReference; 77 | vector mFusions; 78 | int mUniquePos; 79 | int mDupePos; 80 | }; 81 | 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/jsonreporter.cpp: -------------------------------------------------------------------------------- 1 | #include "jsonreporter.h" 2 | #include "common.h" 3 | #include 4 | #include "globalsettings.h" 5 | 6 | JsonReporter::JsonReporter(string filename, FusionMapper* mapper){ 7 | mFusionMapper = mapper; 8 | mFusionResults = mapper->mFusionResults; 9 | mFilename = filename; 10 | mFile.open(mFilename.c_str(), ifstream::out); 11 | } 12 | 13 | JsonReporter::~JsonReporter(){ 14 | mFile.close(); 15 | } 16 | 17 | extern string getCurrentSystemTime(); 18 | extern string command; 19 | 20 | void JsonReporter::run() 
{ 21 | mFile << "{" << endl; 22 | mFile << "\t\"command\":\"" << command << "\"," << endl; 23 | mFile << "\t\"version\":\"" << FUSIONSCAN_VER << "\"," << endl; 24 | mFile << "\t\"time\":\"" << getCurrentSystemTime() << "\"," << endl; 25 | mFile << "\t\"fusions\":{"; 26 | 27 | bool isFirstMut = true; 28 | for(int i=0;i matches = fusion.mMatches; 31 | if(!GlobalSettings::outputDeletions && fusion.isDeletion()) 32 | continue; 33 | if(fusion.isLeftProteinForward() != fusion.isRightProteinForward()) { 34 | if(!GlobalSettings::outputUntranslated) 35 | continue; 36 | } 37 | 38 | if(isFirstMut) { 39 | mFile << endl; 40 | isFirstMut = false; 41 | } 42 | else 43 | mFile << "," << endl; 44 | 45 | mFile << "\t\t\"" << fusion.mTitle << "\":{" << endl; 46 | mFile << "\t\t\t\"" << "left" << "\":{" << endl; 47 | mFile << "\t\t\t\t\"" << "gene_name" << "\":" << "\"" << fusion.mLeftGene.mName << "\"," << endl; 48 | mFile << "\t\t\t\t\"" << "gene_chr" << "\":" << "\"" << fusion.mLeftGene.mChr << "\"," << endl; 49 | mFile << "\t\t\t\t\"" << "position" << "\":" << fusion.mLeftGene.genePos2ChrPos(fusion.mLeftGP.position) << "," << endl; 50 | mFile << "\t\t\t\t\"" << "reference" << "\":" << "\"" << fusion.mLeftRef << "\"," << endl; 51 | mFile << "\t\t\t\t\"" << "ref_ext" << "\":" << "\"" << fusion.mLeftRefExt << "\"," << endl; 52 | mFile << "\t\t\t\t\"" << "pos_str" << "\":" << "\"" << fusion.mLeftPos << "\"," << endl; 53 | mFile << "\t\t\t\t\"" << "exon_or_intron" << "\":" << "\"" << (fusion.mLeftIsExon?"exon":"intron") << "\"," << endl; 54 | mFile << "\t\t\t\t\"" << "exon_or_intron_id" << "\":" << fusion.mLeftExonOrIntronID << "," << endl; 55 | mFile << "\t\t\t\t\"" << "strand" << "\":" << "\"" << (fusion.isLeftProteinForward()?"forward":"reversed") << "\"" << endl; 56 | mFile << "\t\t\t}, " << endl; 57 | mFile << "\t\t\t\"" << "right" << "\":{" << endl; 58 | mFile << "\t\t\t\t\"" << "gene_name" << "\":" << "\"" << fusion.mRightGene.mName << "\"," << endl; 59 | mFile << "\t\t\t\t\"" << 
"gene_chr" << "\":" << "\"" << fusion.mRightGene.mChr << "\"," << endl; 60 | mFile << "\t\t\t\t\"" << "position" << "\":" << fusion.mRightGene.genePos2ChrPos(fusion.mRightGP.position) << "," << endl; 61 | mFile << "\t\t\t\t\"" << "reference" << "\":" << "\"" << fusion.mRightRef << "\"," << endl; 62 | mFile << "\t\t\t\t\"" << "ref_ext" << "\":" << "\"" << fusion.mRightRefExt << "\"," << endl; 63 | mFile << "\t\t\t\t\"" << "pos_str" << "\":" << "\"" << fusion.mRightPos << "\"," << endl; 64 | mFile << "\t\t\t\t\"" << "exon_or_intron" << "\":" << "\"" << (fusion.mRightIsExon?"exon":"intron") << "\"," << endl; 65 | mFile << "\t\t\t\t\"" << "exon_or_intron_id" << "\":" << fusion.mRightExonOrIntronID << "," << endl; 66 | mFile << "\t\t\t\t\"" << "strand" << "\":" << "\"" << (fusion.isRightProteinForward()?"forward":"reversed") << "\"" << endl; 67 | mFile << "\t\t\t}, " << endl; 68 | 69 | mFile << "\t\t\t\"" << "unique" << "\":" << fusion.mUnique << "," << endl; 70 | mFile << "\t\t\t\"" << "reads" << "\":[" << endl; 71 | for(int m=0; mmReadBreak << "," << endl; 74 | mFile << "\t\t\t\t\t\"" << "strand" << "\":" << "\"" << (matches[m]->mReversed?"reversed":"forward") << "\"," << endl; 75 | matches[m]->printReadToJson(mFile, "\t\t\t\t\t"); 76 | mFile << "\t\t\t\t}"; 77 | if(m!=matches.size()-1) 78 | mFile << ","; 79 | mFile << endl; 80 | } 81 | mFile << "\t\t\t]" << endl; 82 | mFile << "\t\t}"; 83 | } 84 | 85 | mFile << endl; 86 | mFile << "\t}" << endl; 87 | mFile << "}" << endl; 88 | } -------------------------------------------------------------------------------- /src/jsonreporter.h: -------------------------------------------------------------------------------- 1 | #ifndef JSON_REPORTER_H 2 | #define JSON_REPORTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include "fusion.h" 9 | #include "match.h" 10 | #include 11 | #include 12 | #include "fusionmapper.h" 13 | 14 | using namespace std; 15 | 16 | class JsonReporter{ 17 | public: 18 | 
JsonReporter(string filename, FusionMapper* mapper); 19 | ~JsonReporter(); 20 | void run(); 21 | 22 | private: 23 | string mFilename; 24 | FusionMapper* mFusionMapper; 25 | ofstream mFile; 26 | vector mFusionResults; 27 | }; 28 | 29 | #endif -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "fastqreader.h" 3 | #include "unittest.h" 4 | #include "fusionscan.h" 5 | #include 6 | #include "cmdline.h" 7 | #include 8 | #include "util.h" 9 | #include "globalsettings.h" 10 | 11 | string command; 12 | 13 | int main(int argc, char* argv[]){ 14 | if (argc == 2 && strcmp(argv[1], "test")==0){ 15 | UnitTest tester; 16 | tester.run(); 17 | return 0; 18 | } 19 | cmdline::parser cmd; 20 | cmd.add("read1", '1', "read1 file name", true, ""); 21 | cmd.add("read2", '2', "read2 file name", false, ""); 22 | cmd.add("fusion", 'f', "fusion file name, in CSV format", true, ""); 23 | cmd.add("ref", 'r', "reference fasta file name", true, ""); 24 | cmd.add("unique", 'u', "specify the least supporting read number is required to report a fusion, default is 2", false, 2); 25 | cmd.add("html", 'h', "file name to store HTML report, default is genefuse.html", false, "genefuse.html"); 26 | cmd.add("json", 'j', "file name to store JSON report, default is genefuse.json", false, "genefuse.json"); 27 | cmd.add("thread", 't', "worker thread number, default is 4", false, 4); 28 | cmd.add("deletion", 'd', "specify the least deletion length of a intra-gene deletion to report, default is 50", false, 50); 29 | cmd.add("output_deletions", 'D', "long deletions are not output by default, enable this option to output them"); 30 | cmd.add("output_untranslated_fusions", 'U', "the fusions that cannot be transcribed or translated are not output by default, enable this option to output them"); 31 | cmd.parse_check(argc, argv); 32 | string r1file = cmd.get("read1"); 
33 | string r2file = cmd.get("read2"); 34 | string fusionFile = cmd.get("fusion"); 35 | string html = cmd.get("html"); 36 | string json = cmd.get("json"); 37 | string refFile = cmd.get("ref"); 38 | int threadNum = cmd.get("thread"); 39 | int unique = cmd.get("unique"); 40 | int deletion = cmd.get("deletion"); 41 | bool outputDeletion = cmd.exist("output_deletions"); 42 | bool outputUntranslated = cmd.exist("output_untranslated_fusions"); 43 | 44 | GlobalSettings::setUniqueRequirement(unique); 45 | GlobalSettings::setDeletionThreshold(deletion); 46 | GlobalSettings::setOutputDeletions(outputDeletion); 47 | GlobalSettings::setOutputUntranslated(outputUntranslated); 48 | 49 | 50 | if(ends_with(refFile, ".gz") || ends_with(refFile, ".gz")) { 51 | cout << "reference fasta file should not be compressed.\nplease unzip "< 3 | #include "util.h" 4 | 5 | Match::Match(Read* r, int readBreak, GenePos leftGP, GenePos rightGP, int gap, bool reversed){ 6 | mRead = new Read(*r); 7 | mGap = gap; 8 | mReadBreak = readBreak; 9 | mLeftGP = leftGP; 10 | mRightGP = rightGP; 11 | mLeftDistance = 0; 12 | mRightDistance = 0; 13 | mOverallDistance = 0; 14 | mReversed = reversed; 15 | } 16 | 17 | Match::~Match(){ 18 | delete mRead; 19 | mRead = NULL; 20 | for(int i=0;imLeft)); 32 | mOriginalReads.push_back(new Read(*pair->mRight)); 33 | } 34 | 35 | void Match::print(){ 36 | cout<<"break:"<mName.substr(1, mRead->mName.length()-1); 43 | cout << endl; 44 | cout << mRead->mSeq.mStr.substr(0, mReadBreak+1); 45 | cout << " "; 46 | cout << mRead->mSeq.mStr.substr(mReadBreak+1, mRead->length() - (mReadBreak+1)); 47 | cout << endl; 48 | } 49 | 50 | void Match::printHtmlTD(ofstream& file){ 51 | //file<<"d:" << mDistance; 52 | if(mReversed) 53 | file<<"←"; 54 | else 55 | file<<"→"; 56 | 57 | file<<""; 58 | 59 | file<<"" << int2str(mLeftDistance) << "|" << int2str(mRightDistance) << ""; 60 | 61 | vector breaks; 62 | breaks.push_back( mReadBreak+1 ); 63 | mRead->printHtmlTDWithBreaks(file, breaks); 64 | } 
65 | 66 | void Match::printReadsToFile(ofstream& file){ 67 | for(int i=0;iprintFile(file); 69 | } 70 | } 71 | 72 | void Match::setReversed(bool flag){ 73 | mReversed = flag; 74 | } 75 | 76 | int Match::countUnique(vector& matches) { 77 | if(matches.size()==0) 78 | return 0; 79 | int count = 1; 80 | Match* cur = matches[0]; 81 | for(int i=1;i *cur || *m < *cur) { 84 | cur = m; 85 | count++; 86 | } 87 | } 88 | return count; 89 | } 90 | 91 | void Match::printReadToJson(ofstream& file, string pad) { 92 | file << pad << "\"seq\":" << "\"" << mRead->mSeq.mStr << "\"," << endl; 93 | file << pad << "\"qual\":" << "\"" << mRead->mQuality << "\"" << endl; 94 | } 95 | -------------------------------------------------------------------------------- /src/match.h: -------------------------------------------------------------------------------- 1 | #ifndef MATCH_H 2 | #define MATCH_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "common.h" 14 | 15 | using namespace std; 16 | 17 | /* 18 | * 19 | * ref: 0... leftGP rightGP ... 20 | * read: ---------------- ------------------------- 21 | * 0... readBreak readBreak+1 ... 
readlen-1 22 | */ 23 | 24 | class Match{ 25 | public: 26 | Match(Read* r, int readBreak, GenePos leftGP, GenePos rightGP, int gap, bool reversed = false); 27 | ~Match(); 28 | void print(); 29 | void printReadToJson(ofstream& file, string pad); 30 | void printHtmlTD(ofstream& file); 31 | void printReadsToFile(ofstream& file); 32 | void setReversed(bool flag); 33 | void addOriginalRead(Read* r); 34 | void addOriginalPair(ReadPair* pair); 35 | Read* getRead() {return mRead;} 36 | 37 | inline bool operator <(const Match& other) const 38 | { 39 | return mReadBreak < other.mReadBreak || (mReadBreak == other.mReadBreak && mRead->length() > other.mRead->length()); 40 | } 41 | inline bool operator >(const Match& other) const 42 | { 43 | return mReadBreak > other.mReadBreak || (mReadBreak == other.mReadBreak && mRead->length() < other.mRead->length()); 44 | } 45 | inline static bool less(const Match* m1, const Match* m2) 46 | { 47 | return *m1 < *m2; 48 | } 49 | inline static bool greater(const Match* m1, const Match* m2) 50 | { 51 | return *m1 > *m2; 52 | } 53 | 54 | static int countUnique(vector& matches); 55 | 56 | public: 57 | Read* mRead; 58 | vector mOriginalReads; 59 | int mOverallDistance; 60 | int mLeftDistance; 61 | int mRightDistance; 62 | // the gap between left and right segment after segmentation 63 | int mGap; 64 | bool mReversed; 65 | int mReadBreak; 66 | GenePos mLeftGP; 67 | GenePos mRightGP; 68 | }; 69 | 70 | 71 | #endif -------------------------------------------------------------------------------- /src/matcher.cpp: -------------------------------------------------------------------------------- 1 | #include "matcher.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | 6 | // we use 512M memory 7 | const int BLOOM_FILTER_LENGTH = (1<<29); 8 | const int KMER = 16; 9 | 10 | Matcher::Matcher(FastaReader* ref, vector& seqs) { 11 | mReference = ref; 12 | mUniquePos = 0; 13 | mDupePos = 0; 14 | initBloomFilter(seqs); 15 | makeIndex(); 16 | } 17 | 18 | 
/* Releases the bloom filter buffer. It is allocated with new unsigned char[BLOOM_FILTER_LENGTH] in initBloomFilter(), so it has to be released with delete[]. */ Matcher::~Matcher() { 19 | delete[] mBloomFilterArray; 20 | mBloomFilterArray=NULL; 21 | } 22 | 23 | void Matcher::initBloomFilter(vector& seqs) { 24 | mBloomFilterArray = new unsigned char[BLOOM_FILTER_LENGTH]; 25 | memset(mBloomFilterArray, 0, BLOOM_FILTER_LENGTH); 26 | for(int s=0;s>3] |= (1<<(kmer & 0x07)); 44 | } 45 | } 46 | 47 | void Matcher::makeIndex() { 48 | if(mReference == NULL) 49 | return ; 50 | 51 | map& ref = mReference->mAllContigs; 52 | 53 | map::iterator iter; 54 | int ctg = 0; 55 | for(iter = ref.begin(); iter!=ref.end(); iter++){ 56 | string ctgName = iter->first; 57 | string s = iter->second; 58 | str2upper(s); 59 | mContigNames.push_back(ctgName); 60 | //index forward 61 | indexContig(ctg, s, 0); 62 | //index reverse complement 63 | //Sequence rseq = ~(Sequence(s)); 64 | //indexContig(ctg, rseq.mStr, -s.length()+1); 65 | ctg++; 66 | } 67 | loginfo("matcher indexing done"); 68 | } 69 | 70 | void Matcher::indexContig(int ctg, string seq, int start) { 71 | unsigned int kmer = 0; 72 | bool valid = false; 73 | for(int i=0; i>3] & (1<<(kmer & 0x07))) == 0) 91 | continue; 92 | 93 | GenePos site; 94 | site.contig = ctg; 95 | site.position = i+start; 96 | 97 | if(mKmerPositions.count(kmer) == 0) { 98 | mKmerPositions[kmer] = vector(); 99 | } 100 | 101 | mKmerPositions[kmer].push_back(site); 102 | 103 | } 104 | } 105 | 106 | MatchResult* Matcher::match(Sequence& sequence) { 107 | Sequence rcseq = ~sequence; 108 | 109 | MatchResult* mc = mapToIndex(sequence); 110 | if(mc!=NULL) 111 | mc->reversed=false; 112 | MatchResult* rcmc = mapToIndex(rcseq); 113 | if(rcmc!=NULL) 114 | rcmc->reversed=true; 115 | 116 | if(mc==NULL) 117 | return rcmc; 118 | else if(rcmc==NULL) 119 | return mc; 120 | else { 121 | if(mc->mismatches.size() <= rcmc->mismatches.size()){ 122 | delete rcmc; 123 | return mc; 124 | } else { 125 | delete mc; 126 | return rcmc; 127 | } 128 | } 129 | } 130 | 131 | MatchResult* Matcher::mapToIndex(Sequence& sequence) { 132 | unordered_map kmerStat; 
133 | kmerStat[0]=0; 134 | string seq = sequence.mStr; 135 | const int step = 1; 136 | const int skipThreshold = 50; 137 | int seqlen = seq.length(); 138 | /* allKmer, kmerValid and skipped are per-position scratch arrays allocated with new[]; every exit path below releases them with delete[] */ 139 | unsigned int* allKmer = new unsigned int[seqlen]; 140 | memset(allKmer, 0, sizeof(unsigned int) * seqlen); 141 | bool* kmerValid = new bool[seqlen]; 142 | memset(kmerValid, 0, sizeof(bool) * seqlen); 143 | bool* skipped = new bool[seqlen]; 144 | memset(skipped, 0, sizeof(bool) * seqlen); 145 | 146 | // first pass, we only want to find if this seq can be partially aligned to the target 147 | bool valid = false; 148 | for(int i=0; i< seqlen - KMER + 1; i += step) { 149 | unsigned int kmer = makeKmer(seq, i, valid); 150 | kmerValid[i] = valid; 151 | if(!valid) 152 | continue; 153 | 154 | allKmer[i] = kmer; 155 | // no match 156 | if(mKmerPositions.count(kmer) <=0 ){ 157 | kmerStat[0]++; 158 | continue; 159 | } 160 | 161 | if(mKmerPositions[kmer].size() > skipThreshold) { 162 | skipped[i] = true; 163 | continue; 164 | } 165 | 166 | for(int g=0; g::iterator iter; 181 | for(iter = kmerStat.begin(); iter!=kmerStat.end(); iter++){ 182 | long gp = iter->first; 183 | int count = iter->second; 184 | // no need to update the top N 185 | if(gp == 0 || count <= topcount[TOP-1]) 186 | continue; 187 | // update the last one first 188 | topgp[TOP-1]=gp; 189 | topcount[TOP-1]=count; 190 | // compare with the rest ones 191 | for(int t=TOP-2;t>=0;t--){ 192 | if(count > topcount[t]) { 193 | topcount[t+1] = topcount[t]; 194 | topgp[t+1] = topgp[t]; 195 | topcount[t] = count; 196 | topgp[t] = gp; 197 | } 198 | } 199 | } 200 | 201 | for(int t=0;t mismatches = vector(); 231 | for(int i=0;isequence = Sequence(sequence); 238 | mr->startGP = long2gp(topgp[t]); 239 | mr->mismatches = mismatches; 240 | /* NOTE(review): if mask is allocated with new[], this and the later delete of mask should be delete[] as well - confirm at its allocation */ delete mask; 241 | delete[] skipped; 242 | delete[] allKmer; 243 | delete[] kmerValid; 244 | return mr; 245 | } 246 | delete mask; 247 | mask = NULL; 248 | } 249 | 250 | delete[] skipped; 251 | delete[] allKmer; 252 | delete[] kmerValid; 253 | 
return NULL; 254 | } 255 | 256 | bool Matcher::isConsistent(long thisgp, unsigned int kmer, int seqpos, int threshold) { 257 | vector& gps = mKmerPositions[kmer]; 258 | // align by seqpos 259 | GenePos target = shift(long2gp(thisgp), -seqpos); 260 | int size = gps.size(); 261 | int left = 0; 262 | int right = size-1; 263 | while(left <= right) { 264 | int center = (left + right) /2; 265 | GenePos centerPos = gps[center]; 266 | 267 | if(centerPos.contig < target.contig) { 268 | // go right 269 | left = center+1; 270 | } else if(centerPos.contig > target.contig) { 271 | // go left 272 | right = center-1; 273 | } else { 274 | 275 | if(abs(centerPos.position - target.position) <= threshold) 276 | return true; 277 | 278 | if(centerPos.position < target.position) { 279 | // go right 280 | left = center+1; 281 | } else if(centerPos.position > target.position) { 282 | // go left 283 | right = center-1; 284 | } 285 | } 286 | } 287 | 288 | return false; 289 | } 290 | 291 | void Matcher::makeMask(unsigned char* mask, unsigned char flag, int seqlen, int start, int kmerSize) { 292 | for(int i=start;i> 32; 352 | return gp; 353 | } 354 | 355 | GenePos Matcher::shift(const GenePos& gp, int i){ 356 | GenePos gpNew; 357 | gpNew.contig = gp.contig; 358 | gpNew.position = gp.position - i; 359 | return gpNew; 360 | } 361 | 362 | void Matcher::printStat() { 363 | cout<<"mUniquePos:"< 5 | #include 6 | #include 7 | #include "read.h" 8 | #include 9 | #include 10 | #include 11 | #include "read.h" 12 | #include "fastareader.h" 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "common.h" 18 | 19 | using namespace std; 20 | 21 | struct MatchResult{ 22 | GenePos startGP; 23 | Sequence sequence; 24 | bool reversed; 25 | vector mismatches; 26 | inline void print() { 27 | cout<& seqs); 37 | ~Matcher(); 38 | void makeIndex(); 39 | void indexContig(int ctg, string seq, int start); 40 | void printStat(); 41 | 42 | MatchResult* mapToIndex(Sequence& seq); 43 | MatchResult* 
// Searches for the offset at which r1 and `~r2` overlap with a small edit
// distance, and returns it as Overlap(offset, overlapLen, distance).
// `~r2` is Sequence::operator~ (presumably the reverse complement -- its body
// is not visible here; confirm in sequence.cpp).
//
// Phase 1 slides `~r2` to the right along r1 (offset >= 0). Phase 2 runs only
// when phase 1 settled on offset 0 and slides it to the left (offset < 0).
// Returns Overlap(0,0,0) when no acceptable overlap is confirmed.
Overlap Overlap::fit(Sequence r1, Sequence r2){
    int len1 = r1.length();
    int len2 = r2.length();
    Sequence reverse_r2 = ~r2;

    bool overlapped = false;
    int overlap_len = 0;
    int offset = 0;
    int distance = 0;
    // an overlap shorter than 10 bases is considered unreliable, so the
    // search stops 10 bases before the end of r1
    while (offset < len1-10 && overlapped==false){
        // the overlap length of r1 and ~r2 when ~r2 is moved right by offset
        overlap_len = min(len1-offset, len2);

        distance = edit_distance(r1.mStr.substr(offset, overlap_len), reverse_r2.mStr.substr(0, overlap_len));
        // accept at most 3 edits, or 10% of the overlap for short overlaps
        float threshold = min(3.0, overlap_len/10.0);
        if (distance <= threshold){
            // we found a good candidate;
            // verify it by moving ~r2 one more base to see whether the distance grows:
            // if it does not shrink, the current offset is the best match
            while (offset < len1-10){
                int next_offset = offset + 1;
                int next_overlap_len = min(len1-next_offset, len2);
                int next_distance = edit_distance(r1.mStr.substr(next_offset, next_overlap_len), reverse_r2.mStr.substr(0, next_overlap_len));
                if (distance <= next_distance){
                    overlapped = true;
                    break;
                }
                else{
                    // the next offset is strictly better: adopt it and keep going
                    offset = next_offset;
                    distance = next_distance;
                    overlap_len = next_overlap_len;
                }
            }
            // leave the outer loop whether or not the candidate was confirmed;
            // if the inner loop ran out, overlapped is still false
            break;
        }
        else
            // skip ahead faster the further the distance is above the threshold
            offset += max(1, (distance - (int)ceil(threshold))/2 );
    }

    if (overlapped && offset == 0){
        // check whether the distance can get smaller if the offset goes negative;
        // this only happens when the insert DNA is shorter than the sequencing read length,
        // and some adapter/primer was sequenced but not trimmed cleanly.
        // we search in the reverse direction
        while (offset > -(len2-10)){
            // the overlap length of r1 and ~r2 when ~r2 is moved left by |offset|
            overlap_len = min(len1, len2- abs(offset));
            distance = edit_distance(r1.mStr.substr(0, overlap_len), reverse_r2.mStr.substr(-offset, overlap_len));
            float threshold = min(3.0, overlap_len/10.0);
            if (distance <= threshold){
                while(offset > -(len2-10)){
                    int next_offset = offset - 1;
                    int next_overlap_len = min(len1, len2- abs(next_offset));
                    int next_distance = edit_distance(r1.mStr.substr(0, next_overlap_len), reverse_r2.mStr.substr(-next_offset, next_overlap_len));
                    if (distance <= next_distance)
                        return Overlap(offset, overlap_len, distance);
                    else {
                        distance = next_distance;
                        overlap_len = next_overlap_len;
                        offset = next_offset;
                    }
                }
            }
            else
                offset -= max(1, (distance - (int)ceil(threshold))/2 );
        }
        // NOTE(review): if the leftward search runs out without confirming a
        // minimum, control falls through to the no-overlap return below even
        // though phase 1 had found an overlap at offset 0 -- confirm intended.
    }
    else if(overlapped) {
        return Overlap(offset, overlap_len, distance);
    }

    return Overlap(0,0,0);
}
105 | Overlap(0, 0, 0) 106 | }; 107 | for (int i=0;i<5;i++){ 108 | Overlap fit = Overlap::fit(r1[i], r2[i]); 109 | if (fit.mOffset!=overlap[i].mOffset || fit.mOverlapLen!=overlap[i].mOverlapLen || fit.mDistance!=overlap[i].mDistance){ 110 | cout<<"Fail in Overlap::fit() with sequence"< 5 | #include 6 | #include 7 | #include "sequence.h" 8 | 9 | using namespace std; 10 | 11 | class Overlap{ 12 | public: 13 | Overlap(int offset, int overlapLen, int distance); 14 | static Overlap fit(Sequence R1, Sequence R2); 15 | static bool test(); 16 | 17 | public: 18 | int mOffset; 19 | int mOverlapLen; 20 | int mDistance; 21 | bool mOverlapped; 22 | }; 23 | 24 | #endif -------------------------------------------------------------------------------- /src/pescanner.cpp: -------------------------------------------------------------------------------- 1 | #include "pescanner.h" 2 | #include "fastqreader.h" 3 | #include 4 | #include "htmlreporter.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "util.h" 10 | #include "jsonreporter.h" 11 | 12 | PairEndScanner::PairEndScanner(string fusionFile, string refFile, string read1File, string read2File, string html, string json, int threadNum){ 13 | mRead1File = read1File; 14 | mRead2File = read2File; 15 | mFusionFile = fusionFile; 16 | mRefFile = refFile; 17 | mHtmlFile = html; 18 | mJsonFile = json; 19 | mProduceFinished = false; 20 | mThreadNum = threadNum; 21 | mFusionMapper = NULL; 22 | } 23 | 24 | PairEndScanner::~PairEndScanner() { 25 | if(mFusionMapper != NULL) { 26 | delete mFusionMapper; 27 | mFusionMapper = NULL; 28 | } 29 | } 30 | 31 | bool PairEndScanner::scan(){ 32 | 33 | mFusionMapper = new FusionMapper(mRefFile, mFusionFile); 34 | 35 | initPackRepository(); 36 | std::thread producer(std::bind(&PairEndScanner::producerTask, this)); 37 | 38 | std::thread** threads = new thread*[mThreadNum]; 39 | for(int t=0; tjoin(); 46 | } 47 | 48 | for(int t=0; tfilterMatches(); 54 | mFusionMapper->sortMatches(); 55 | 
mFusionMapper->clusterMatches(); 56 | 57 | htmlReport(); 58 | jsonReport(); 59 | 60 | mFusionMapper->freeMatches(); 61 | return true; 62 | } 63 | 64 | void PairEndScanner::pushMatch(Match* m){ 65 | std::unique_lock lock(mFusionMtx); 66 | mFusionMapper->addMatch(m); 67 | lock.unlock(); 68 | } 69 | 70 | bool PairEndScanner::scanPairEnd(ReadPairPack* pack){ 71 | for(int p=0;pcount;p++){ 72 | ReadPair* pair = pack->data[p]; 73 | Read* r1 = pair->mLeft; 74 | Read* r2 = pair->mRight; 75 | Read* rcr1 = NULL; 76 | Read* rcr2 = NULL; 77 | Read* merged = pair->fastMerge(); 78 | Read* mergedRC = NULL; 79 | bool mapable = false; 80 | // if merged successfully, we only search the merged 81 | if(merged != NULL) { 82 | Match* matchMerged = mFusionMapper->mapRead(merged, mapable); 83 | if(matchMerged){ 84 | matchMerged->addOriginalPair(pair); 85 | pushMatch(matchMerged); 86 | } else if(mapable){ 87 | mergedRC = merged->reverseComplement(); 88 | Match* matchMergedRC = mFusionMapper->mapRead(mergedRC, mapable); 89 | if(matchMergedRC){ 90 | matchMergedRC->addOriginalPair(pair); 91 | pushMatch(matchMergedRC); 92 | } 93 | delete mergedRC; 94 | } 95 | 96 | delete pair; 97 | delete merged; 98 | continue; 99 | } 100 | // else still search R1 and R2 separatedly 101 | mapable = false; 102 | Match* matchR1 = mFusionMapper->mapRead(r1, mapable); 103 | if(matchR1){ 104 | matchR1->addOriginalPair(pair); 105 | pushMatch(matchR1); 106 | } else if(mapable){ 107 | rcr1 = r1->reverseComplement(); 108 | Match* matchRcr1 = mFusionMapper->mapRead(rcr1, mapable); 109 | if(matchRcr1){ 110 | matchRcr1->addOriginalPair(pair); 111 | matchRcr1->setReversed(true); 112 | pushMatch(matchRcr1); 113 | } 114 | delete rcr1; 115 | } 116 | mapable = false; 117 | Match* matchR2 = mFusionMapper->mapRead(r2, mapable); 118 | if(matchR2){ 119 | matchR2->addOriginalPair(pair); 120 | pushMatch(matchR2); 121 | } else if(mapable) { 122 | rcr2 = r2->reverseComplement(); 123 | Match* matchRcr2 = mFusionMapper->mapRead(rcr2, 
mapable); 124 | if(matchRcr2){ 125 | matchRcr2->addOriginalPair(pair); 126 | matchRcr2->setReversed(true); 127 | pushMatch(matchRcr2); 128 | } 129 | delete rcr2; 130 | } 131 | delete pair; 132 | } 133 | 134 | delete pack->data; 135 | delete pack; 136 | 137 | return true; 138 | } 139 | 140 | void PairEndScanner::initPackRepository() { 141 | mRepo.packBuffer = new ReadPairPack*[PACK_NUM_LIMIT]; 142 | memset(mRepo.packBuffer, 0, sizeof(ReadPairPack*)*PACK_NUM_LIMIT); 143 | mRepo.writePos = 0; 144 | mRepo.readPos = 0; 145 | mRepo.readCounter = 0; 146 | 147 | } 148 | 149 | void PairEndScanner::destroyPackRepository() { 150 | delete mRepo.packBuffer; 151 | mRepo.packBuffer = NULL; 152 | } 153 | 154 | void PairEndScanner::producePack(ReadPairPack* pack){ 155 | std::unique_lock lock(mRepo.mtx); 156 | while(((mRepo.writePos + 1) % PACK_NUM_LIMIT) 157 | == mRepo.readPos) { 158 | mRepo.repoNotFull.wait(lock); 159 | } 160 | 161 | mRepo.packBuffer[mRepo.writePos] = pack; 162 | mRepo.writePos++; 163 | 164 | if (mRepo.writePos == PACK_NUM_LIMIT) 165 | mRepo.writePos = 0; 166 | 167 | mRepo.repoNotEmpty.notify_all(); 168 | lock.unlock(); 169 | } 170 | 171 | void PairEndScanner::consumePack(){ 172 | ReadPairPack* data; 173 | std::unique_lock lock(mRepo.mtx); 174 | // read buffer is empty, just wait here. 
// Worker-thread loop: keeps calling consumePack() until the producer has
// finished and the ring buffer is drained (writePos == readPos).
void PairEndScanner::consumerTask()
{
    while(true) {
        // readCounterMtx is the mutex producerTask() holds while it sets
        // mProduceFinished
        std::unique_lock<std::mutex> lock(mRepo.readCounterMtx);
        // NOTE(review): writePos/readPos are modified under mRepo.mtx in
        // producePack()/consumePack() but read here under readCounterMtx only
        // -- confirm this is safe.
        if(mProduceFinished && mRepo.writePos == mRepo.readPos){
            lock.unlock();
            break;
        }
        if(mProduceFinished){
            // the producer is done: consume while still holding the lock, so
            // the remaining packs are handled by one consumer at a time
            consumePack();
            lock.unlock();
        } else {
            // the producer is still running: release the lock first so other
            // consumers can work in parallel
            lock.unlock();
            consumePack();
        }
    }
}
| void consumerTask(); 56 | void pushMatch(Match* m); 57 | 58 | private: 59 | string mFusionFile; 60 | string mRefFile; 61 | string mRead1File; 62 | string mRead2File; 63 | string mHtmlFile; 64 | string mJsonFile; 65 | ReadPairRepository mRepo; 66 | bool mProduceFinished; 67 | std::mutex mFusionMtx; 68 | int mThreadNum; 69 | FusionMapper* mFusionMapper; 70 | }; 71 | 72 | 73 | #endif -------------------------------------------------------------------------------- /src/read.cpp: -------------------------------------------------------------------------------- 1 | #include "read.h" 2 | #include 3 | 4 | Read::Read(string name, string seq, string strand, string quality){ 5 | mName = name; 6 | mSeq = Sequence(seq); 7 | mStrand = strand; 8 | mQuality = quality; 9 | mHasQuality = true; 10 | } 11 | 12 | Read::Read(string name, string seq, string strand){ 13 | mName = name; 14 | mSeq = Sequence(seq); 15 | mStrand = strand; 16 | mHasQuality = false; 17 | } 18 | 19 | Read::Read(string name, Sequence seq, string strand, string quality){ 20 | mName = name; 21 | mSeq = seq; 22 | mStrand = strand; 23 | mQuality = quality; 24 | mHasQuality = true; 25 | } 26 | 27 | Read::Read(string name, Sequence seq, string strand){ 28 | mName = name; 29 | mSeq = seq; 30 | mStrand = strand; 31 | mHasQuality = false; 32 | } 33 | 34 | Read::Read(Read &r) { 35 | mName = r.mName; 36 | mSeq = r.mSeq; 37 | mStrand = r.mStrand; 38 | mQuality = r.mQuality; 39 | mHasQuality = r.mHasQuality; 40 | } 41 | 42 | void Read::print(){ 43 | std::cout << mName << endl; 44 | std::cout << mSeq.mStr << endl; 45 | std::cout << mStrand << endl; 46 | if(mHasQuality) 47 | std::cout << mQuality << endl; 48 | } 49 | 50 | void Read::printFile(ofstream& file){ 51 | file << mName << endl; 52 | file << mSeq.mStr << endl; 53 | file << mStrand << endl; 54 | if(mHasQuality) 55 | file << mQuality << endl; 56 | } 57 | 58 | 59 | void Read::printWithBreaks(vector& breaks){ 60 | std::cout << mName << endl; 61 | std::cout << 
makeStringWithBreaks(mSeq.mStr, breaks)<< endl; 62 | std::cout << mStrand << endl; 63 | if(mHasQuality) 64 | std::cout << makeStringWithBreaks(mQuality, breaks) << endl; 65 | } 66 | 67 | string Read::makeStringWithBreaks(const string origin, vector& breaks) { 68 | string ret = origin.substr(0, breaks[0]); 69 | for(int i=0;i0) 73 | ret += " " + origin.substr(breaks[breaks.size()-1], origin.length() - breaks[breaks.size()-1]); 74 | return ret; 75 | } 76 | 77 | void Read::printHtmlTDWithBreaks(ofstream& file, vector& breaks) { 78 | file << "" << makeHtmlSeqWithQual(0, breaks[0]) << ""; 79 | for(int i=0;i" << makeHtmlSeqWithQual(breaks[i], breaks[i+1]-breaks[i]) << ""; 84 | } 85 | if(breaks[breaks.size()-1]>0) 86 | file << "" << makeHtmlSeqWithQual(breaks[breaks.size()-1], mSeq.mStr.length() - breaks[breaks.size()-1]) << ""; 87 | } 88 | 89 | string Read::makeHtmlSeqWithQual(int start, int length) { 90 | stringstream ss; 91 | for(int i=start;i"<< mSeq.mStr[i] << ""; 93 | } 94 | return ss.str(); 95 | } 96 | 97 | string Read::qualityColor(char qual) { 98 | if(qual >= 'I') // >= Q40, extremely high quality 99 | return "#78C6B9"; 100 | if(qual >= '?') // Q30 ~ Q39, high quality 101 | return "#33BBE2"; 102 | if(qual >= '5') // Q20 ~ Q29, moderate quality 103 | return "#666666"; 104 | if(qual >= '0') // Q15 ~ Q19, low quality 105 | return "#E99E5B"; 106 | else // <= Q14, extremely low quality 107 | return "#FF0000"; 108 | } 109 | 110 | Read* Read::reverseComplement(){ 111 | Sequence seq = ~mSeq; 112 | string qual; 113 | qual.assign(mQuality.rbegin(), mQuality.rend()); 114 | string strand = (mStrand=="+") ? 
"-" : "+"; 115 | return new Read(mName, seq, strand, qual); 116 | } 117 | 118 | string Read::lastIndex(){ 119 | int len = mName.length(); 120 | if(len<5) 121 | return ""; 122 | for(int i=len-5;i>=0;i--){ 123 | if(mName[i]==':' or mName[i]=='+'){ 124 | return mName.substr(i+1, len-i); 125 | } 126 | } 127 | return ""; 128 | } 129 | 130 | int Read::lowQualCount(int qual){ 131 | int count = 0; 132 | for(int q=0;qreverseComplement(); 170 | int len1 = mLeft->length(); 171 | int len2 = rcRight->length(); 172 | // use the pointer directly for speed 173 | const char* str1 = mLeft->mSeq.mStr.c_str(); 174 | const char* str2 = rcRight->mSeq.mStr.c_str(); 175 | const char* qual1 = mLeft->mQuality.c_str(); 176 | const char* qual2 = rcRight->mQuality.c_str(); 177 | 178 | // we require at least 30 bp overlapping to merge a pair 179 | const int MIN_OVERLAP = 30; 180 | bool overlapped = false; 181 | int olen = MIN_OVERLAP; 182 | int diff = 0; 183 | // the diff count for 1 high qual + 1 low qual 184 | int lowQualDiff = 0; 185 | 186 | while(olen <= min(len1, len2)){ 187 | diff = 0; 188 | lowQualDiff = 0; 189 | bool ok = true; 190 | int offset = len1 - olen; 191 | for(int i=0;i= Q30 and the other is <= Q15 195 | if((qual1[offset+i]>='?' 
&& qual2[i]<='0') || (qual1[offset+i]<='0' && qual2[i]>='?')){ 196 | lowQualDiff++; 197 | } 198 | // we disallow high quality diff, and only allow up to 3 low qual diff 199 | if(diff>lowQualDiff || lowQualDiff>=3){ 200 | ok = false; 201 | break; 202 | } 203 | } 204 | } 205 | if(ok){ 206 | overlapped = true; 207 | break; 208 | } 209 | olen++; 210 | } 211 | 212 | if(overlapped){ 213 | int offset = len1 - olen; 214 | stringstream ss; 215 | ss << mLeft->mName << " merged_diff_" << diff; 216 | string mergedName = ss.str(); 217 | string mergedSeq = mLeft->mSeq.mStr.substr(0, offset) + rcRight->mSeq.mStr; 218 | string mergedQual = mLeft->mQuality.substr(0, offset) + rcRight->mQuality; 219 | // quality adjuction and correction for low qual diff 220 | for(int i=0;i='?' && qual2[i]<='0'){ 223 | mergedSeq[offset+i] = str1[offset+i]; 224 | mergedQual[offset+i] = qual1[offset+i]; 225 | } else { 226 | mergedSeq[offset+i] = str2[i]; 227 | mergedQual[offset+i] = qual2[i]; 228 | } 229 | } else { 230 | // add the quality of the pair to make a high qual 231 | mergedQual[offset+i] = qual1[offset+i] + qual2[i] - 33; 232 | if(mergedQual[offset+i] >= 'Z') 233 | mergedQual[offset+i] = 'Z'; 234 | } 235 | } 236 | delete rcRight; 237 | return new Read(mergedName, mergedSeq, "+", mergedQual); 238 | } 239 | 240 | delete rcRight; 241 | return NULL; 242 | } 243 | 244 | bool ReadPair::test(){ 245 | Read* left = new Read("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA", 246 | "TTTTTTCTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAG", 247 | "+", 248 | "AAAAA6EEEEEEEEEEEEEEEEE#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE"); 249 | Read* right = new Read("@NS500713:64:HFKJJBGXY:1:11101:20469:1097 1:N:0:TATAGCCT+GGTCCCGA", 250 | "AAAAAACTACACCATAGAATGACTATGAGTCTCATAAGAATGCACTCAACTAGTCATCACTCCTGTGTTTTCATAAGAAAAAACAGTGTTAGAGTCCAAGAG", 251 | "+", 252 | 
"AAAAA6EEEEE/EEEEEEEEEEE#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE"); 253 | 254 | ReadPair pair(left, right); 255 | Read* merged = pair.fastMerge(); 256 | if(merged == NULL) 257 | return false; 258 | 259 | if(merged->mSeq.mStr != "TTTTTTCTCTTGGACTCTAACACTGTTTTTTCTTATGAAAACACAGGAGTGATGACTAGTTGAGTGCATTCTTATGAGACTCATAGTCATTCTATGATGTAGTTTTTT") 260 | return false; 261 | 262 | //merged->print(); 263 | return true; 264 | } 265 | -------------------------------------------------------------------------------- /src/read.h: -------------------------------------------------------------------------------- 1 | #ifndef READ_H 2 | #define READ_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "sequence.h" 10 | #include 11 | 12 | using namespace std; 13 | 14 | class Read{ 15 | public: 16 | Read(string name, string seq, string strand, string quality); 17 | Read(string name, Sequence seq, string strand, string quality); 18 | Read(string name, string seq, string strand); 19 | Read(string name, Sequence seq, string strand); 20 | Read(Read &r); 21 | void print(); 22 | void printFile(ofstream& file); 23 | Read* reverseComplement(); 24 | string firstIndex(); 25 | string lastIndex(); 26 | // default is Q20 27 | int lowQualCount(int qual=20); 28 | int length(); 29 | void printWithBreaks(vector& breaks); 30 | void printHtmlTDWithBreaks(ofstream& file, vector& breaks); 31 | 32 | public: 33 | static bool test(); 34 | 35 | private: 36 | string makeStringWithBreaks(const string origin, vector& breaks); 37 | string makeHtmlSeqWithQual(int start, int length); 38 | string qualityColor(char qual); 39 | 40 | public: 41 | string mName; 42 | Sequence mSeq; 43 | string mStrand; 44 | string mQuality; 45 | bool mHasQuality; 46 | }; 47 | 48 | class ReadPair{ 49 | public: 50 | ReadPair(Read* left, Read* right); 51 | ~ReadPair(); 52 | 53 | // merge a pair, without consideration of seq error caused false INDEL 54 | Read* fastMerge(); 55 | 
public: 56 | Read* mLeft; 57 | Read* mRight; 58 | 59 | public: 60 | static bool test(); 61 | }; 62 | 63 | #endif -------------------------------------------------------------------------------- /src/sequence.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence.h" 2 | 3 | Sequence::Sequence(){ 4 | } 5 | 6 | Sequence::Sequence(string seq){ 7 | mStr = seq; 8 | } 9 | 10 | void Sequence::print(){ 11 | std::cout << mStr; 12 | } 13 | 14 | int Sequence::length(){ 15 | return mStr.length(); 16 | } 17 | 18 | Sequence Sequence::reverseComplement(){ 19 | string str(mStr.length(), 0); 20 | for(int c=0;c 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | class Sequence{ 12 | public: 13 | Sequence(); 14 | Sequence(string seq); 15 | void print(); 16 | int length(); 17 | Sequence reverseComplement(); 18 | 19 | Sequence operator~(); 20 | 21 | static bool test(); 22 | 23 | public: 24 | string mStr; 25 | }; 26 | 27 | #endif -------------------------------------------------------------------------------- /src/sescanner.cpp: -------------------------------------------------------------------------------- 1 | #include "sescanner.h" 2 | #include "fastqreader.h" 3 | #include 4 | #include "htmlreporter.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "util.h" 10 | #include "jsonreporter.h" 11 | 12 | SingleEndScanner::SingleEndScanner(string fusionFile, string refFile, string read1File, string html, string json, int threadNum){ 13 | mRead1File = read1File; 14 | mFusionFile = fusionFile; 15 | mRefFile = refFile; 16 | mHtmlFile = html; 17 | mJsonFile = json; 18 | mProduceFinished = false; 19 | mThreadNum = threadNum; 20 | mFusionMapper = NULL; 21 | } 22 | 23 | SingleEndScanner::~SingleEndScanner() { 24 | if(mFusionMapper != NULL) { 25 | delete mFusionMapper; 26 | mFusionMapper = NULL; 27 | } 28 | } 29 | 30 | bool SingleEndScanner::scan(){ 31 | 32 | mFusionMapper = new FusionMapper(mRefFile, 
mFusionFile); 33 | 34 | initPackRepository(); 35 | std::thread producer(std::bind(&SingleEndScanner::producerTask, this)); 36 | 37 | std::thread** threads = new thread*[mThreadNum]; 38 | for(int t=0; tjoin(); 45 | } 46 | 47 | for(int t=0; tfilterMatches(); 53 | mFusionMapper->sortMatches(); 54 | mFusionMapper->clusterMatches(); 55 | 56 | htmlReport(); 57 | jsonReport(); 58 | 59 | mFusionMapper->freeMatches(); 60 | 61 | return true; 62 | } 63 | 64 | void SingleEndScanner::pushMatch(Match* m){ 65 | std::unique_lock lock(mFusionMtx); 66 | mFusionMapper->addMatch(m); 67 | lock.unlock(); 68 | } 69 | 70 | bool SingleEndScanner::scanSingleEnd(ReadPack* pack){ 71 | for(int p=0;pcount;p++){ 72 | Read* r1 = pack->data[p]; 73 | bool mapable = false; 74 | Match* matchR1 = mFusionMapper->mapRead(r1, mapable); 75 | if(matchR1){ 76 | matchR1->addOriginalRead(r1); 77 | pushMatch(matchR1); 78 | } else if(mapable){ 79 | Read* rcr1 = r1->reverseComplement(); 80 | Match* matchRcr1 = mFusionMapper->mapRead(rcr1, mapable); 81 | if(matchRcr1){ 82 | matchRcr1->addOriginalRead(r1); 83 | matchRcr1->setReversed(true); 84 | pushMatch(matchRcr1); 85 | } 86 | delete rcr1; 87 | } 88 | delete r1; 89 | } 90 | 91 | delete pack->data; 92 | delete pack; 93 | 94 | return true; 95 | } 96 | 97 | void SingleEndScanner::initPackRepository() { 98 | mRepo.packBuffer = new ReadPack*[PACK_NUM_LIMIT]; 99 | memset(mRepo.packBuffer, 0, sizeof(ReadPack*)*PACK_NUM_LIMIT); 100 | mRepo.writePos = 0; 101 | mRepo.readPos = 0; 102 | mRepo.readCounter = 0; 103 | 104 | } 105 | 106 | void SingleEndScanner::destroyPackRepository() { 107 | delete mRepo.packBuffer; 108 | mRepo.packBuffer = NULL; 109 | } 110 | 111 | void SingleEndScanner::producePack(ReadPack* pack){ 112 | std::unique_lock lock(mRepo.mtx); 113 | while(((mRepo.writePos + 1) % PACK_NUM_LIMIT) 114 | == mRepo.readPos) { 115 | mRepo.repoNotFull.wait(lock); 116 | } 117 | 118 | mRepo.packBuffer[mRepo.writePos] = pack; 119 | mRepo.writePos++; 120 | 121 | if 
// Takes one pack out of the ring buffer and scans it.
// Blocks while the buffer is empty; returns without doing anything when the
// buffer is empty and the producer has already finished.
void SingleEndScanner::consumePack(){
    ReadPack* data;
    std::unique_lock<std::mutex> lock(mRepo.mtx);
    // the read buffer is empty (read and write cursors coincide): wait here
    while(mRepo.writePos % PACK_NUM_LIMIT == mRepo.readPos % PACK_NUM_LIMIT) {
        if(mProduceFinished){
            lock.unlock();
            return;
        }
        // NOTE(review): producerTask() sets mProduceFinished under
        // readCounterMtx and the visible code does not notify repoNotEmpty
        // afterwards; a consumer that re-enters this wait after the last pack
        // was taken may never be woken -- confirm.
        mRepo.repoNotEmpty.wait(lock);
    }

    data = mRepo.packBuffer[mRepo.readPos];
    mRepo.readPos++;

    // wrap the read cursor around the ring buffer
    if (mRepo.readPos >= PACK_NUM_LIMIT)
        mRepo.readPos = 0;

    // release the buffer lock before the slow part, and wake a producer that
    // may be waiting for a free slot
    lock.unlock();
    mRepo.repoNotFull.notify_all();

    // scanSingleEnd() also frees the pack
    scanSingleEnd(data);
}
SingleEndScanner::consumerTask() 202 | { 203 | while(true) { 204 | std::unique_lock lock(mRepo.readCounterMtx); 205 | if(mProduceFinished && mRepo.writePos == mRepo.readPos){ 206 | lock.unlock(); 207 | break; 208 | } 209 | if(mProduceFinished){ 210 | consumePack(); 211 | lock.unlock(); 212 | } else { 213 | lock.unlock(); 214 | consumePack(); 215 | } 216 | } 217 | } 218 | 219 | void SingleEndScanner::textReport() { 220 | } 221 | 222 | void SingleEndScanner::htmlReport() { 223 | if(mHtmlFile == "") 224 | return; 225 | 226 | HtmlReporter reporter(mHtmlFile, mFusionMapper); 227 | reporter.run(); 228 | } 229 | 230 | void SingleEndScanner::jsonReport() { 231 | if(mJsonFile == "") 232 | return; 233 | 234 | JsonReporter reporter(mJsonFile, mFusionMapper); 235 | reporter.run(); 236 | } 237 | -------------------------------------------------------------------------------- /src/sescanner.h: -------------------------------------------------------------------------------- 1 | #ifndef SE_SCANNNER_H 2 | #define SE_SCANNNER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "read.h" 8 | #include "fusion.h" 9 | #include "match.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "fusionmapper.h" 15 | 16 | 17 | using namespace std; 18 | 19 | struct ReadPack { 20 | Read** data; 21 | int count; 22 | }; 23 | 24 | typedef struct ReadPack ReadPack; 25 | 26 | struct ReadRepository { 27 | ReadPack** packBuffer; 28 | size_t readPos; 29 | size_t writePos; 30 | size_t readCounter; 31 | std::mutex mtx; 32 | std::mutex readCounterMtx; 33 | std::condition_variable repoNotFull; 34 | std::condition_variable repoNotEmpty; 35 | }; 36 | 37 | typedef struct ReadRepository ReadRepository; 38 | 39 | class SingleEndScanner{ 40 | public: 41 | SingleEndScanner(string fusionFile, string refFile, string read1File, string html, string json, int threadnum); 42 | ~SingleEndScanner(); 43 | bool scan(); 44 | void textReport(); 45 | void htmlReport(); 46 | void jsonReport(); 47 | 48 | 
private: 49 | bool scanSingleEnd(ReadPack* pack); 50 | void initPackRepository(); 51 | void destroyPackRepository(); 52 | void producePack(ReadPack* pack); 53 | void consumePack(); 54 | void producerTask(); 55 | void consumerTask(); 56 | void pushMatch(Match* m); 57 | 58 | private: 59 | string mFusionFile; 60 | string mRefFile; 61 | string mRead1File; 62 | string mRead2File; 63 | string mHtmlFile; 64 | string mJsonFile; 65 | ReadRepository mRepo; 66 | bool mProduceFinished; 67 | std::mutex mFusionMtx; 68 | int mThreadNum; 69 | FusionMapper* mFusionMapper; 70 | }; 71 | 72 | 73 | #endif -------------------------------------------------------------------------------- /src/unittest.cpp: -------------------------------------------------------------------------------- 1 | #include "unittest.h" 2 | #include "editdistance.h" 3 | #include "sequence.h" 4 | #include "fastqreader.h" 5 | #include "fastareader.h" 6 | #include "overlap.h" 7 | #include "read.h" 8 | #include "fusion.h" 9 | #include "indexer.h" 10 | #include 11 | 12 | UnitTest::UnitTest(){ 13 | 14 | } 15 | 16 | void UnitTest::run(){ 17 | bool passed = true; 18 | passed &= report(editdistance_test(), "editdistance_test"); 19 | passed &= report(Sequence::test(), "Sequence::test"); 20 | passed &= report(Overlap::test(), "Overlap::test"); 21 | passed &= report(Fusion::test(), "Fusion::test"); 22 | passed &= report(Indexer::test(), "Indexer::test"); 23 | passed &= report(FastaReader::test(), "FastaReader::test"); 24 | printf("\n==========================\n"); 25 | printf("%s\n\n", passed?"ALL PASSED":"FAILED"); 26 | } 27 | 28 | bool UnitTest::report(bool result, string message) { 29 | printf("%s:%s\n\n", message.c_str(), result?" 
PASSED":" FAILED"); 30 | return result; 31 | } -------------------------------------------------------------------------------- /src/unittest.h: -------------------------------------------------------------------------------- 1 | #ifndef UNITDISTANCE_H 2 | #define UNITDISTANCE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | class UnitTest{ 11 | public: 12 | UnitTest(); 13 | void run(); 14 | bool report(bool result, string message); 15 | }; 16 | 17 | #endif -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "common.h" 12 | 13 | using namespace std; 14 | 15 | inline bool starts_with( string const & value, string const & starting) 16 | { 17 | if (starting.size() > value.size()) return false; 18 | return equal(starting.begin(), starting.end(), value.begin()); 19 | } 20 | 21 | inline bool ends_with( string const & value, string const & ending) 22 | { 23 | if (ending.size() > value.size()) return false; 24 | return equal(ending.rbegin(), ending.rend(), value.rbegin()); 25 | } 26 | 27 | inline string trim(const string& str) 28 | { 29 | string::size_type pos = str.find_first_not_of(' '); 30 | if (pos == string::npos) 31 | { 32 | return string(""); 33 | } 34 | string::size_type pos2 = str.find_last_not_of(' '); 35 | if (pos2 != string::npos) 36 | { 37 | return str.substr(pos, pos2 - pos + 1); 38 | } 39 | return str.substr(pos); 40 | } 41 | 42 | inline int split(const string& str, vector& ret_, string sep = ",") 43 | { 44 | if (str.empty()) 45 | { 46 | return 0; 47 | } 48 | 49 | string tmp; 50 | string::size_type pos_begin = str.find_first_not_of(sep); 51 | string::size_type comma_pos = 0; 52 | 53 | while (pos_begin != string::npos) 54 | { 55 | comma_pos = str.find(sep, 
pos_begin); 56 | if (comma_pos != string::npos) 57 | { 58 | tmp = str.substr(pos_begin, comma_pos - pos_begin); 59 | pos_begin = comma_pos + sep.length(); 60 | } 61 | else 62 | { 63 | tmp = str.substr(pos_begin); 64 | pos_begin = comma_pos; 65 | } 66 | 67 | ret_.push_back(tmp); 68 | tmp.clear(); 69 | } 70 | return 0; 71 | } 72 | 73 | inline string replace(const string& str, const string& src, const string& dest) 74 | { 75 | string ret; 76 | 77 | string::size_type pos_begin = 0; 78 | string::size_type pos = str.find(src); 79 | while (pos != string::npos) 80 | { 81 | ret.append(str.data() + pos_begin, pos - pos_begin); 82 | ret += dest; 83 | pos_begin = pos + 1; 84 | pos = str.find(src, pos_begin); 85 | } 86 | if (pos_begin < str.length()) 87 | { 88 | ret.append(str.begin() + pos_begin, str.end()); 89 | } 90 | return ret; 91 | } 92 | 93 | //Check if a string is a file or directory 94 | inline bool file_exists(const string& s) 95 | { 96 | bool exists = false; 97 | if(s.length() > 0) { 98 | struct stat status; 99 | int result = stat( s.c_str(), &status ); 100 | if(result == 0) { 101 | exists = true; 102 | } 103 | } 104 | return exists; 105 | } 106 | 107 | 108 | // check if a string is a directory 109 | inline bool is_directory(const string& path) 110 | { 111 | bool isdir = false; 112 | struct stat status; 113 | // visual studion use _S_IFDIR instead of S_IFDIR 114 | // http://msdn.microsoft.com/en-us/library/14h5k7ff.aspx 115 | #ifdef _MSC_VER 116 | #define S_IFDIR _S_IFDIR 117 | #endif 118 | stat( path.c_str(), &status ); 119 | if ( status.st_mode & S_IFDIR ) { 120 | isdir = true; 121 | } 122 | // #endif 123 | return isdir; 124 | } 125 | 126 | inline void check_file_valid(const string& s) { 127 | if(!file_exists(s)){ 128 | cout << "ERROR: file '" << s << "' doesn't exist, quit now" << endl; 129 | exit(-1); 130 | } 131 | if(is_directory(s)){ 132 | cout << "ERROR: '" << s << "' is a folder, not a file, quit now" << endl; 133 | exit(-1); 134 | } 135 | } 136 | 137 | // 
Remove non alphabetic characters from a string 138 | inline string str_keep_alpha(const string& s) 139 | { 140 | string new_str; 141 | for( size_t it =0; it < s.size(); it++) { 142 | if( isalpha(s[it]) ) { 143 | new_str += s[it]; 144 | } 145 | } 146 | return new_str; 147 | } 148 | 149 | 150 | // Remove invalid sequence characters from a string 151 | inline void str_keep_valid_sequence( string& s, bool forceUpperCase = false) 152 | { 153 | size_t total = 0; 154 | const char case_gap = 'a' - 'A'; 155 | for( size_t it =0; it < s.size(); it++) { 156 | char c = s[it]; 157 | if(forceUpperCase && c>='a' && c<='z') { 158 | c -= case_gap; 159 | } 160 | if( isalpha(c) || c == '-' || c == '*' ) { 161 | s[total] = c; 162 | total ++; 163 | } 164 | } 165 | 166 | s.resize(total); 167 | } 168 | 169 | inline int find_with_right_pos(const string& str, const string& pattern, int start=0) { 170 | int pos = str.find(pattern, start); 171 | if (pos < 0) 172 | return -1; 173 | else 174 | return pos + pattern.length(); 175 | } 176 | 177 | inline void str2upper(string& s){ 178 | transform(s.begin(), s.end(), s.begin(), (int (*)(int))toupper); 179 | } 180 | 181 | inline void str2lower(string& s){ 182 | transform(s.begin(), s.end(), s.begin(), (int (*)(int))tolower); 183 | } 184 | 185 | inline void loginfo(const string s){ 186 | time_t tt = time(NULL); 187 | tm* t= localtime(&tt); 188 | if(_DEBUG) 189 | cerr<tm_hour<<":"<tm_min<<":"<tm_sec<<" "<= hash_bits 149 | */ 150 | 151 | long block_start; 152 | /* Window position at the beginning of the current output block. Gets 153 | * negative when the window is moved backwards. 
154 | */ 155 | 156 | uInt match_length; /* length of best match */ 157 | IPos prev_match; /* previous match */ 158 | int match_available; /* set if previous match exists */ 159 | uInt strstart; /* start of string to insert */ 160 | uInt match_start; /* start of matching string */ 161 | uInt lookahead; /* number of valid bytes ahead in window */ 162 | 163 | uInt prev_length; 164 | /* Length of the best match at previous step. Matches not greater than this 165 | * are discarded. This is used in the lazy match evaluation. 166 | */ 167 | 168 | uInt max_chain_length; 169 | /* To speed up deflation, hash chains are never searched beyond this 170 | * length. A higher limit improves compression ratio but degrades the 171 | * speed. 172 | */ 173 | 174 | uInt max_lazy_match; 175 | /* Attempt to find a better match only when the current match is strictly 176 | * smaller than this value. This mechanism is used only for compression 177 | * levels >= 4. 178 | */ 179 | # define max_insert_length max_lazy_match 180 | /* Insert new strings in the hash table only if the match length is not 181 | * greater than this length. This saves time but degrades compression. 182 | * max_insert_length is used only for compression levels <= 3. 183 | */ 184 | 185 | int level; /* compression level (1..9) */ 186 | int strategy; /* favor or force Huffman coding*/ 187 | 188 | uInt good_match; 189 | /* Use a faster search when the previous match is longer than this */ 190 | 191 | int nice_match; /* Stop searching when current match exceeds this */ 192 | 193 | /* used by trees.c: */ 194 | /* Didn't use ct_data typedef below to suppress compiler warning */ 195 | struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */ 196 | struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */ 197 | struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */ 198 | 199 | struct tree_desc_s l_desc; /* desc. for literal tree */ 200 | struct tree_desc_s d_desc; /* desc. 
for distance tree */ 201 | struct tree_desc_s bl_desc; /* desc. for bit length tree */ 202 | 203 | ush bl_count[MAX_BITS+1]; 204 | /* number of codes at each bit length for an optimal tree */ 205 | 206 | int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */ 207 | int heap_len; /* number of elements in the heap */ 208 | int heap_max; /* element of largest frequency */ 209 | /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used. 210 | * The same heap array is used to build all trees. 211 | */ 212 | 213 | uch depth[2*L_CODES+1]; 214 | /* Depth of each subtree used as tie breaker for trees of equal frequency 215 | */ 216 | 217 | uchf *l_buf; /* buffer for literals or lengths */ 218 | 219 | uInt lit_bufsize; 220 | /* Size of match buffer for literals/lengths. There are 4 reasons for 221 | * limiting lit_bufsize to 64K: 222 | * - frequencies can be kept in 16 bit counters 223 | * - if compression is not successful for the first block, all input 224 | * data is still in the window so we can still emit a stored block even 225 | * when input comes from standard input. (This can also be done for 226 | * all blocks if lit_bufsize is not greater than 32K.) 227 | * - if compression is not successful for a file smaller than 64K, we can 228 | * even emit a stored file instead of a stored block (saving 5 bytes). 229 | * This is applicable only for zip (not gzip or zlib). 230 | * - creating new Huffman trees less frequently may not provide fast 231 | * adaptation to changes in the input data statistics. (Take for 232 | * example a binary file with poorly compressible code followed by 233 | * a highly compressible string table.) Smaller buffer sizes give 234 | * fast adaptation but have of course the overhead of transmitting 235 | * trees more frequently. 236 | * - I can't count above 4 237 | */ 238 | 239 | uInt last_lit; /* running index in l_buf */ 240 | 241 | ushf *d_buf; 242 | /* Buffer for distances. 
To simplify the code, d_buf and l_buf have 243 | * the same number of elements. To use different lengths, an extra flag 244 | * array would be necessary. 245 | */ 246 | 247 | ulg opt_len; /* bit length of current block with optimal trees */ 248 | ulg static_len; /* bit length of current block with static trees */ 249 | uInt matches; /* number of string matches in current block */ 250 | uInt insert; /* bytes at end of window left to insert */ 251 | 252 | #ifdef DEBUG 253 | ulg compressed_len; /* total bit length of compressed file mod 2^32 */ 254 | ulg bits_sent; /* bit length of compressed data sent mod 2^32 */ 255 | #endif 256 | 257 | ush bi_buf; 258 | /* Output buffer. bits are inserted starting at the bottom (least 259 | * significant bits). 260 | */ 261 | int bi_valid; 262 | /* Number of valid bits in bi_buf. All bits above the last valid bit 263 | * are always zero. 264 | */ 265 | 266 | ulg high_water; 267 | /* High water mark offset in window for initialized bytes -- bytes above 268 | * this are set to zero in order to avoid memory check warnings when 269 | * longest match routines access bytes past the input. This is then 270 | * updated to the new high water mark. 271 | */ 272 | 273 | } FAR deflate_state; 274 | 275 | /* Output a byte on the stream. 276 | * IN assertion: there is enough room in pending_buf. 277 | */ 278 | #define put_byte(s, c) {s->pending_buf[s->pending++] = (c);} 279 | 280 | 281 | #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) 282 | /* Minimum amount of lookahead, except at the end of the input file. 283 | * See deflate.c for comments about the MIN_MATCH+1. 284 | */ 285 | 286 | #define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD) 287 | /* In order to simplify the code, particularly on 16 bit machines, match 288 | * distances are limited to MAX_DIST instead of WSIZE. 
289 | */ 290 | 291 | #define WIN_INIT MAX_MATCH 292 | /* Number of bytes after end of data in window to initialize in order to avoid 293 | memory checker errors from longest match routines */ 294 | 295 | /* in trees.c */ 296 | void ZLIB_INTERNAL _tr_init OF((deflate_state *s)); 297 | int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc)); 298 | void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf, 299 | ulg stored_len, int last)); 300 | void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s)); 301 | void ZLIB_INTERNAL _tr_align OF((deflate_state *s)); 302 | void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf, 303 | ulg stored_len, int last)); 304 | 305 | #define d_code(dist) \ 306 | ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)]) 307 | /* Mapping from a distance to a distance code. dist is the distance - 1 and 308 | * must not have side effects. _dist_code[256] and _dist_code[257] are never 309 | * used. 310 | */ 311 | 312 | #ifndef DEBUG 313 | /* Inline versions of _tr_tally for speed: */ 314 | 315 | #if defined(GEN_TREES_H) || !defined(STDC) 316 | extern uch ZLIB_INTERNAL _length_code[]; 317 | extern uch ZLIB_INTERNAL _dist_code[]; 318 | #else 319 | extern const uch ZLIB_INTERNAL _length_code[]; 320 | extern const uch ZLIB_INTERNAL _dist_code[]; 321 | #endif 322 | 323 | # define _tr_tally_lit(s, c, flush) \ 324 | { uch cc = (c); \ 325 | s->d_buf[s->last_lit] = 0; \ 326 | s->l_buf[s->last_lit++] = cc; \ 327 | s->dyn_ltree[cc].Freq++; \ 328 | flush = (s->last_lit == s->lit_bufsize-1); \ 329 | } 330 | # define _tr_tally_dist(s, distance, length, flush) \ 331 | { uch len = (length); \ 332 | ush dist = (distance); \ 333 | s->d_buf[s->last_lit] = dist; \ 334 | s->l_buf[s->last_lit++] = len; \ 335 | dist--; \ 336 | s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \ 337 | s->dyn_dtree[d_code(dist)].Freq++; \ 338 | flush = (s->last_lit == s->lit_bufsize-1); \ 339 | } 340 | #else 341 | # define 
_tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c) 342 | # define _tr_tally_dist(s, distance, length, flush) \ 343 | flush = _tr_tally(s, distance, length) 344 | #endif 345 | 346 | #endif /* DEFLATE_H */ 347 | -------------------------------------------------------------------------------- /src/zlib/gzguts.h: -------------------------------------------------------------------------------- 1 | /* gzguts.h -- zlib internal header definitions for gz* operations 2 | * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | #ifdef _LARGEFILE64_SOURCE 7 | # ifndef _LARGEFILE_SOURCE 8 | # define _LARGEFILE_SOURCE 1 9 | # endif 10 | # ifdef _FILE_OFFSET_BITS 11 | # undef _FILE_OFFSET_BITS 12 | # endif 13 | #endif 14 | 15 | #ifdef HAVE_HIDDEN 16 | # define ZLIB_INTERNAL __attribute__((visibility ("hidden"))) 17 | #else 18 | # define ZLIB_INTERNAL 19 | #endif 20 | 21 | #include 22 | #include "zlib.h" 23 | #ifdef STDC 24 | # include 25 | # include 26 | # include 27 | #endif 28 | #include 29 | 30 | #ifdef _WIN32 31 | # include 32 | #endif 33 | 34 | #if defined(__TURBOC__) || defined(_MSC_VER) || defined(_WIN32) 35 | # include 36 | #endif 37 | 38 | #ifdef WINAPI_FAMILY 39 | # define open _open 40 | # define read _read 41 | # define write _write 42 | # define close _close 43 | #endif 44 | 45 | #ifdef NO_DEFLATE /* for compatibility with old definition */ 46 | # define NO_GZCOMPRESS 47 | #endif 48 | 49 | #if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550) 50 | # ifndef HAVE_VSNPRINTF 51 | # define HAVE_VSNPRINTF 52 | # endif 53 | #endif 54 | 55 | #if defined(__CYGWIN__) 56 | # ifndef HAVE_VSNPRINTF 57 | # define HAVE_VSNPRINTF 58 | # endif 59 | #endif 60 | 61 | #if defined(MSDOS) && defined(__BORLANDC__) && (BORLANDC > 0x410) 62 | # ifndef HAVE_VSNPRINTF 63 | # define HAVE_VSNPRINTF 64 | # endif 65 | #endif 66 | 67 | #ifndef HAVE_VSNPRINTF 68 | # ifdef MSDOS 69 | /* 
vsnprintf may exist on some MS-DOS compilers (DJGPP?), 70 | but for now we just assume it doesn't. */ 71 | # define NO_vsnprintf 72 | # endif 73 | # ifdef __TURBOC__ 74 | # define NO_vsnprintf 75 | # endif 76 | # ifdef WIN32 77 | /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */ 78 | # if !defined(vsnprintf) && !defined(NO_vsnprintf) 79 | # if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 ) 80 | # define vsnprintf _vsnprintf 81 | # endif 82 | # endif 83 | # endif 84 | # ifdef __SASC 85 | # define NO_vsnprintf 86 | # endif 87 | # ifdef VMS 88 | # define NO_vsnprintf 89 | # endif 90 | # ifdef __OS400__ 91 | # define NO_vsnprintf 92 | # endif 93 | # ifdef __MVS__ 94 | # define NO_vsnprintf 95 | # endif 96 | #endif 97 | 98 | /* unlike snprintf (which is required in C99, yet still not supported by 99 | Microsoft more than a decade later!), _snprintf does not guarantee null 100 | termination of the result -- however this is only used in gzlib.c where 101 | the result is assured to fit in the space provided */ 102 | #ifdef _MSC_VER 103 | # define snprintf _snprintf 104 | #endif 105 | 106 | #ifndef local 107 | # define local static 108 | #endif 109 | /* compile with -Dlocal if your debugger can't find static symbols */ 110 | 111 | /* gz* functions always use library allocation functions */ 112 | #ifndef STDC 113 | extern voidp malloc OF((uInt size)); 114 | extern void free OF((voidpf ptr)); 115 | #endif 116 | 117 | /* get errno and strerror definition */ 118 | #if defined UNDER_CE 119 | # include 120 | # define zstrerror() gz_strwinerror((DWORD)GetLastError()) 121 | #else 122 | # ifndef NO_STRERROR 123 | # include 124 | # define zstrerror() strerror(errno) 125 | # else 126 | # define zstrerror() "stdio error (consult errno)" 127 | # endif 128 | #endif 129 | 130 | /* provide prototypes for these when building zlib without LFS */ 131 | #if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0 132 | ZEXTERN gzFile ZEXPORT gzopen64 
OF((const char *, const char *)); 133 | ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int)); 134 | ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile)); 135 | ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile)); 136 | #endif 137 | 138 | /* default memLevel */ 139 | #if MAX_MEM_LEVEL >= 8 140 | # define DEF_MEM_LEVEL 8 141 | #else 142 | # define DEF_MEM_LEVEL MAX_MEM_LEVEL 143 | #endif 144 | 145 | /* default i/o buffer size -- double this for output when reading (this and 146 | twice this must be able to fit in an unsigned type) */ 147 | #define GZBUFSIZE 8192 148 | 149 | /* gzip modes, also provide a little integrity check on the passed structure */ 150 | #define GZ_NONE 0 151 | #define GZ_READ 7247 152 | #define GZ_WRITE 31153 153 | #define GZ_APPEND 1 /* mode set to GZ_WRITE after the file is opened */ 154 | 155 | /* values for gz_state how */ 156 | #define LOOK 0 /* look for a gzip header */ 157 | #define COPY 1 /* copy input directly */ 158 | #define GZIP 2 /* decompress a gzip stream */ 159 | 160 | /* internal gzip file state data structure */ 161 | typedef struct { 162 | /* exposed contents for gzgetc() macro */ 163 | struct gzFile_s x; /* "x" for exposed */ 164 | /* x.have: number of bytes available at x.next */ 165 | /* x.next: next output data to deliver or write */ 166 | /* x.pos: current position in uncompressed data */ 167 | /* used for both reading and writing */ 168 | int mode; /* see gzip modes above */ 169 | int fd; /* file descriptor */ 170 | char *path; /* path or fd for error messages */ 171 | unsigned size; /* buffer size, zero if not allocated yet */ 172 | unsigned want; /* requested buffer size, default is GZBUFSIZE */ 173 | unsigned char *in; /* input buffer */ 174 | unsigned char *out; /* output buffer (double-sized when reading) */ 175 | int direct; /* 0 if processing gzip, 1 if transparent */ 176 | /* just for reading */ 177 | int how; /* 0: get header, 1: copy, 2: decompress */ 178 | z_off64_t start; /* where the gzip data started, 
for rewinding */ 179 | int eof; /* true if end of input file reached */ 180 | int past; /* true if read requested past end */ 181 | /* just for writing */ 182 | int level; /* compression level */ 183 | int strategy; /* compression strategy */ 184 | /* seek request */ 185 | z_off64_t skip; /* amount to skip (already rewound if backwards) */ 186 | int seek; /* true if seek request pending */ 187 | /* error information */ 188 | int err; /* error code */ 189 | char *msg; /* error message */ 190 | /* zlib inflate or deflate stream */ 191 | z_stream strm; /* stream structure in-place (not a pointer) */ 192 | } gz_state; 193 | typedef gz_state FAR *gz_statep; 194 | 195 | /* shared functions */ 196 | void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *)); 197 | #if defined UNDER_CE 198 | char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error)); 199 | #endif 200 | 201 | /* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t 202 | value -- needed when comparing unsigned to z_off64_t, which is signed 203 | (possible z_off64_t types off_t, off64_t, and long are all signed) */ 204 | #ifdef INT_MAX 205 | # define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX) 206 | #else 207 | unsigned ZLIB_INTERNAL gz_intmax OF((void)); 208 | # define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax()) 209 | #endif 210 | -------------------------------------------------------------------------------- /src/zlib/inffast.h: -------------------------------------------------------------------------------- 1 | /* inffast.h -- header to use inffast.c 2 | * Copyright (C) 1995-2003, 2010 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 
9 | */ 10 | 11 | void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start)); 12 | -------------------------------------------------------------------------------- /src/zlib/inffixed.h: -------------------------------------------------------------------------------- 1 | /* inffixed.h -- table for decoding fixed codes 2 | * Generated automatically by makefixed(). 3 | */ 4 | 5 | /* WARNING: this file should *not* be used by applications. 6 | It is part of the implementation of this library and is 7 | subject to change. Applications should only use zlib.h. 8 | */ 9 | 10 | static const code lenfix[512] = { 11 | {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48}, 12 | {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128}, 13 | {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59}, 14 | {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176}, 15 | {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20}, 16 | {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100}, 17 | {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8}, 18 | {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216}, 19 | {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76}, 20 | {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114}, 21 | {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2}, 22 | {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148}, 23 | {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42}, 24 | {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86}, 25 | {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15}, 26 | {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236}, 27 | {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62}, 28 | {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142}, 29 | {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31}, 
30 | {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162}, 31 | {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25}, 32 | {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105}, 33 | {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4}, 34 | {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202}, 35 | {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69}, 36 | {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125}, 37 | {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13}, 38 | {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195}, 39 | {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35}, 40 | {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91}, 41 | {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19}, 42 | {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246}, 43 | {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55}, 44 | {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135}, 45 | {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99}, 46 | {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190}, 47 | {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16}, 48 | {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96}, 49 | {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6}, 50 | {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209}, 51 | {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72}, 52 | {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116}, 53 | {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4}, 54 | {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153}, 55 | {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44}, 56 | {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82}, 57 | 
{0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11}, 58 | {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229}, 59 | {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58}, 60 | {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138}, 61 | {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51}, 62 | {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173}, 63 | {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30}, 64 | {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110}, 65 | {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0}, 66 | {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195}, 67 | {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65}, 68 | {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121}, 69 | {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9}, 70 | {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258}, 71 | {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37}, 72 | {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93}, 73 | {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23}, 74 | {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251}, 75 | {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51}, 76 | {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131}, 77 | {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67}, 78 | {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183}, 79 | {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23}, 80 | {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103}, 81 | {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9}, 82 | {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223}, 83 | {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79}, 84 | {0,9,255} 85 | }; 86 | 87 | static 
const code distfix[32] = { 88 | {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025}, 89 | {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193}, 90 | {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385}, 91 | {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577}, 92 | {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073}, 93 | {22,5,193},{64,5,0} 94 | }; 95 | -------------------------------------------------------------------------------- /src/zlib/inflate.h: -------------------------------------------------------------------------------- 1 | /* inflate.h -- internal inflate state definition 2 | * Copyright (C) 1995-2009 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | /* define NO_GZIP when compiling if you want to disable gzip header and 12 | trailer decoding by inflate(). NO_GZIP would be used to avoid linking in 13 | the crc code when it is not needed. For shared libraries, gzip decoding 14 | should be left enabled. 
*/ 15 | #ifndef NO_GZIP 16 | # define GUNZIP 17 | #endif 18 | 19 | /* Possible inflate modes between inflate() calls */ 20 | typedef enum { 21 | HEAD, /* i: waiting for magic header */ 22 | FLAGS, /* i: waiting for method and flags (gzip) */ 23 | TIME, /* i: waiting for modification time (gzip) */ 24 | OS, /* i: waiting for extra flags and operating system (gzip) */ 25 | EXLEN, /* i: waiting for extra length (gzip) */ 26 | EXTRA, /* i: waiting for extra bytes (gzip) */ 27 | NAME, /* i: waiting for end of file name (gzip) */ 28 | COMMENT, /* i: waiting for end of comment (gzip) */ 29 | HCRC, /* i: waiting for header crc (gzip) */ 30 | DICTID, /* i: waiting for dictionary check value */ 31 | DICT, /* waiting for inflateSetDictionary() call */ 32 | TYPE, /* i: waiting for type bits, including last-flag bit */ 33 | TYPEDO, /* i: same, but skip check to exit inflate on new block */ 34 | STORED, /* i: waiting for stored size (length and complement) */ 35 | COPY_, /* i/o: same as COPY below, but only first time in */ 36 | COPY, /* i/o: waiting for input or output to copy stored block */ 37 | TABLE, /* i: waiting for dynamic block table lengths */ 38 | LENLENS, /* i: waiting for code length code lengths */ 39 | CODELENS, /* i: waiting for length/lit and distance code lengths */ 40 | LEN_, /* i: same as LEN below, but only first time in */ 41 | LEN, /* i: waiting for length/lit/eob code */ 42 | LENEXT, /* i: waiting for length extra bits */ 43 | DIST, /* i: waiting for distance code */ 44 | DISTEXT, /* i: waiting for distance extra bits */ 45 | MATCH, /* o: waiting for output space to copy string */ 46 | LIT, /* o: waiting for output space to write literal */ 47 | CHECK, /* i: waiting for 32-bit check value */ 48 | LENGTH, /* i: waiting for 32-bit length (gzip) */ 49 | DONE, /* finished check, done -- remain here until reset */ 50 | BAD, /* got a data error -- remain here until reset */ 51 | MEM, /* got an inflate() memory error -- remain here until reset */ 52 | SYNC /* 
looking for synchronization bytes to restart inflate() */ 53 | } inflate_mode; 54 | 55 | /* 56 | State transitions between above modes - 57 | 58 | (most modes can go to BAD or MEM on error -- not shown for clarity) 59 | 60 | Process header: 61 | HEAD -> (gzip) or (zlib) or (raw) 62 | (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME -> COMMENT -> 63 | HCRC -> TYPE 64 | (zlib) -> DICTID or TYPE 65 | DICTID -> DICT -> TYPE 66 | (raw) -> TYPEDO 67 | Read deflate blocks: 68 | TYPE -> TYPEDO -> STORED or TABLE or LEN_ or CHECK 69 | STORED -> COPY_ -> COPY -> TYPE 70 | TABLE -> LENLENS -> CODELENS -> LEN_ 71 | LEN_ -> LEN 72 | Read deflate codes in fixed or dynamic block: 73 | LEN -> LENEXT or LIT or TYPE 74 | LENEXT -> DIST -> DISTEXT -> MATCH -> LEN 75 | LIT -> LEN 76 | Process trailer: 77 | CHECK -> LENGTH -> DONE 78 | */ 79 | 80 | /* state maintained between inflate() calls. Approximately 10K bytes. */ 81 | struct inflate_state { 82 | inflate_mode mode; /* current inflate mode */ 83 | int last; /* true if processing last block */ 84 | int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ 85 | int havedict; /* true if dictionary provided */ 86 | int flags; /* gzip header method and flags (0 if zlib) */ 87 | unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */ 88 | unsigned long check; /* protected copy of check value */ 89 | unsigned long total; /* protected copy of output count */ 90 | gz_headerp head; /* where to save gzip header information */ 91 | /* sliding window */ 92 | unsigned wbits; /* log base 2 of requested window size */ 93 | unsigned wsize; /* window size or zero if not using window */ 94 | unsigned whave; /* valid bytes in the window */ 95 | unsigned wnext; /* window write index */ 96 | unsigned char FAR *window; /* allocated sliding window, if needed */ 97 | /* bit accumulator */ 98 | unsigned long hold; /* input bit accumulator */ 99 | unsigned bits; /* number of bits in "in" */ 100 | /* for string and stored block copying */ 101 | 
unsigned length; /* literal or length of data to copy */ 102 | unsigned offset; /* distance back to copy string from */ 103 | /* for table and code decoding */ 104 | unsigned extra; /* extra bits needed */ 105 | /* fixed and dynamic code tables */ 106 | code const FAR *lencode; /* starting table for length/literal codes */ 107 | code const FAR *distcode; /* starting table for distance codes */ 108 | unsigned lenbits; /* index bits for lencode */ 109 | unsigned distbits; /* index bits for distcode */ 110 | /* dynamic table building */ 111 | unsigned ncode; /* number of code length code lengths */ 112 | unsigned nlen; /* number of length code lengths */ 113 | unsigned ndist; /* number of distance code lengths */ 114 | unsigned have; /* number of code lengths in lens[] */ 115 | code FAR *next; /* next available space in codes[] */ 116 | unsigned short lens[320]; /* temporary storage for code lengths */ 117 | unsigned short work[288]; /* work area for code table building */ 118 | code codes[ENOUGH]; /* space for code tables */ 119 | int sane; /* if false, allow invalid distance too far */ 120 | int back; /* bits back of last unprocessed length/lit */ 121 | unsigned was; /* initial length of match */ 122 | }; 123 | -------------------------------------------------------------------------------- /src/zlib/inftrees.h: -------------------------------------------------------------------------------- 1 | /* inftrees.h -- header to use inftrees.c 2 | * Copyright (C) 1995-2005, 2010 Mark Adler 3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | /* Structure for decoding tables. 
Each entry provides either the 12 | information needed to do the operation requested by the code that 13 | indexed that table entry, or it provides a pointer to another 14 | table that indexes more bits of the code. op indicates whether 15 | the entry is a pointer to another table, a literal, a length or 16 | distance, an end-of-block, or an invalid code. For a table 17 | pointer, the low four bits of op is the number of index bits of 18 | that table. For a length or distance, the low four bits of op 19 | is the number of extra bits to get after the code. bits is 20 | the number of bits in this code or part of the code to drop off 21 | of the bit buffer. val is the actual byte to output in the case 22 | of a literal, the base length or distance, or the offset from 23 | the current table to the next table. Each entry is four bytes. */ 24 | typedef struct { 25 | unsigned char op; /* operation, extra bits, table bits */ 26 | unsigned char bits; /* bits in this part of the code */ 27 | unsigned short val; /* offset in table or code value */ 28 | } code; 29 | 30 | /* op values as set by inflate_table(): 31 | 00000000 - literal 32 | 0000tttt - table link, tttt != 0 is the number of table index bits 33 | 0001eeee - length or distance, eeee is the number of extra bits 34 | 01100000 - end of block 35 | 01000000 - invalid code 36 | */ 37 | 38 | /* Maximum size of the dynamic table. The maximum number of code structures is 39 | 1444, which is the sum of 852 for literal/length codes and 592 for distance 40 | codes. These values were found by exhaustive searches using the program 41 | examples/enough.c found in the zlib distribution. The arguments to that 42 | program are the number of symbols, the initial root table size, and the 43 | maximum bit length of a code. "enough 286 9 15" for literal/length codes 44 | returns 852, and "enough 30 6 15" for distance codes returns 592.
45 | The initial root table size (9 or 6) is found in the fifth argument of the 46 | inflate_table() calls in inflate.c and infback.c. If the root table size is 47 | changed, then these maximum sizes would be need to be recalculated and 48 | updated. */ 49 | #define ENOUGH_LENS 852 50 | #define ENOUGH_DISTS 592 51 | #define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS) 52 | 53 | /* Type of code to build for inflate_table() */ 54 | typedef enum { 55 | CODES, 56 | LENS, 57 | DISTS 58 | } codetype; 59 | 60 | int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens, 61 | unsigned codes, code FAR * FAR *table, 62 | unsigned FAR *bits, unsigned short FAR *work)); 63 | -------------------------------------------------------------------------------- /src/zlib/trees.h: -------------------------------------------------------------------------------- 1 | /* header created automatically with -DGEN_TREES_H */ 2 | 3 | local const ct_data static_ltree[L_CODES+2] = { 4 | {{ 12},{ 8}}, {{140},{ 8}}, {{ 76},{ 8}}, {{204},{ 8}}, {{ 44},{ 8}}, 5 | {{172},{ 8}}, {{108},{ 8}}, {{236},{ 8}}, {{ 28},{ 8}}, {{156},{ 8}}, 6 | {{ 92},{ 8}}, {{220},{ 8}}, {{ 60},{ 8}}, {{188},{ 8}}, {{124},{ 8}}, 7 | {{252},{ 8}}, {{ 2},{ 8}}, {{130},{ 8}}, {{ 66},{ 8}}, {{194},{ 8}}, 8 | {{ 34},{ 8}}, {{162},{ 8}}, {{ 98},{ 8}}, {{226},{ 8}}, {{ 18},{ 8}}, 9 | {{146},{ 8}}, {{ 82},{ 8}}, {{210},{ 8}}, {{ 50},{ 8}}, {{178},{ 8}}, 10 | {{114},{ 8}}, {{242},{ 8}}, {{ 10},{ 8}}, {{138},{ 8}}, {{ 74},{ 8}}, 11 | {{202},{ 8}}, {{ 42},{ 8}}, {{170},{ 8}}, {{106},{ 8}}, {{234},{ 8}}, 12 | {{ 26},{ 8}}, {{154},{ 8}}, {{ 90},{ 8}}, {{218},{ 8}}, {{ 58},{ 8}}, 13 | {{186},{ 8}}, {{122},{ 8}}, {{250},{ 8}}, {{ 6},{ 8}}, {{134},{ 8}}, 14 | {{ 70},{ 8}}, {{198},{ 8}}, {{ 38},{ 8}}, {{166},{ 8}}, {{102},{ 8}}, 15 | {{230},{ 8}}, {{ 22},{ 8}}, {{150},{ 8}}, {{ 86},{ 8}}, {{214},{ 8}}, 16 | {{ 54},{ 8}}, {{182},{ 8}}, {{118},{ 8}}, {{246},{ 8}}, {{ 14},{ 8}}, 17 | {{142},{ 8}}, {{ 78},{ 8}}, {{206},{ 8}}, {{ 46},{ 
8}}, {{174},{ 8}}, 18 | {{110},{ 8}}, {{238},{ 8}}, {{ 30},{ 8}}, {{158},{ 8}}, {{ 94},{ 8}}, 19 | {{222},{ 8}}, {{ 62},{ 8}}, {{190},{ 8}}, {{126},{ 8}}, {{254},{ 8}}, 20 | {{ 1},{ 8}}, {{129},{ 8}}, {{ 65},{ 8}}, {{193},{ 8}}, {{ 33},{ 8}}, 21 | {{161},{ 8}}, {{ 97},{ 8}}, {{225},{ 8}}, {{ 17},{ 8}}, {{145},{ 8}}, 22 | {{ 81},{ 8}}, {{209},{ 8}}, {{ 49},{ 8}}, {{177},{ 8}}, {{113},{ 8}}, 23 | {{241},{ 8}}, {{ 9},{ 8}}, {{137},{ 8}}, {{ 73},{ 8}}, {{201},{ 8}}, 24 | {{ 41},{ 8}}, {{169},{ 8}}, {{105},{ 8}}, {{233},{ 8}}, {{ 25},{ 8}}, 25 | {{153},{ 8}}, {{ 89},{ 8}}, {{217},{ 8}}, {{ 57},{ 8}}, {{185},{ 8}}, 26 | {{121},{ 8}}, {{249},{ 8}}, {{ 5},{ 8}}, {{133},{ 8}}, {{ 69},{ 8}}, 27 | {{197},{ 8}}, {{ 37},{ 8}}, {{165},{ 8}}, {{101},{ 8}}, {{229},{ 8}}, 28 | {{ 21},{ 8}}, {{149},{ 8}}, {{ 85},{ 8}}, {{213},{ 8}}, {{ 53},{ 8}}, 29 | {{181},{ 8}}, {{117},{ 8}}, {{245},{ 8}}, {{ 13},{ 8}}, {{141},{ 8}}, 30 | {{ 77},{ 8}}, {{205},{ 8}}, {{ 45},{ 8}}, {{173},{ 8}}, {{109},{ 8}}, 31 | {{237},{ 8}}, {{ 29},{ 8}}, {{157},{ 8}}, {{ 93},{ 8}}, {{221},{ 8}}, 32 | {{ 61},{ 8}}, {{189},{ 8}}, {{125},{ 8}}, {{253},{ 8}}, {{ 19},{ 9}}, 33 | {{275},{ 9}}, {{147},{ 9}}, {{403},{ 9}}, {{ 83},{ 9}}, {{339},{ 9}}, 34 | {{211},{ 9}}, {{467},{ 9}}, {{ 51},{ 9}}, {{307},{ 9}}, {{179},{ 9}}, 35 | {{435},{ 9}}, {{115},{ 9}}, {{371},{ 9}}, {{243},{ 9}}, {{499},{ 9}}, 36 | {{ 11},{ 9}}, {{267},{ 9}}, {{139},{ 9}}, {{395},{ 9}}, {{ 75},{ 9}}, 37 | {{331},{ 9}}, {{203},{ 9}}, {{459},{ 9}}, {{ 43},{ 9}}, {{299},{ 9}}, 38 | {{171},{ 9}}, {{427},{ 9}}, {{107},{ 9}}, {{363},{ 9}}, {{235},{ 9}}, 39 | {{491},{ 9}}, {{ 27},{ 9}}, {{283},{ 9}}, {{155},{ 9}}, {{411},{ 9}}, 40 | {{ 91},{ 9}}, {{347},{ 9}}, {{219},{ 9}}, {{475},{ 9}}, {{ 59},{ 9}}, 41 | {{315},{ 9}}, {{187},{ 9}}, {{443},{ 9}}, {{123},{ 9}}, {{379},{ 9}}, 42 | {{251},{ 9}}, {{507},{ 9}}, {{ 7},{ 9}}, {{263},{ 9}}, {{135},{ 9}}, 43 | {{391},{ 9}}, {{ 71},{ 9}}, {{327},{ 9}}, {{199},{ 9}}, {{455},{ 9}}, 44 | {{ 39},{ 9}}, {{295},{ 9}}, 
{{167},{ 9}}, {{423},{ 9}}, {{103},{ 9}}, 45 | {{359},{ 9}}, {{231},{ 9}}, {{487},{ 9}}, {{ 23},{ 9}}, {{279},{ 9}}, 46 | {{151},{ 9}}, {{407},{ 9}}, {{ 87},{ 9}}, {{343},{ 9}}, {{215},{ 9}}, 47 | {{471},{ 9}}, {{ 55},{ 9}}, {{311},{ 9}}, {{183},{ 9}}, {{439},{ 9}}, 48 | {{119},{ 9}}, {{375},{ 9}}, {{247},{ 9}}, {{503},{ 9}}, {{ 15},{ 9}}, 49 | {{271},{ 9}}, {{143},{ 9}}, {{399},{ 9}}, {{ 79},{ 9}}, {{335},{ 9}}, 50 | {{207},{ 9}}, {{463},{ 9}}, {{ 47},{ 9}}, {{303},{ 9}}, {{175},{ 9}}, 51 | {{431},{ 9}}, {{111},{ 9}}, {{367},{ 9}}, {{239},{ 9}}, {{495},{ 9}}, 52 | {{ 31},{ 9}}, {{287},{ 9}}, {{159},{ 9}}, {{415},{ 9}}, {{ 95},{ 9}}, 53 | {{351},{ 9}}, {{223},{ 9}}, {{479},{ 9}}, {{ 63},{ 9}}, {{319},{ 9}}, 54 | {{191},{ 9}}, {{447},{ 9}}, {{127},{ 9}}, {{383},{ 9}}, {{255},{ 9}}, 55 | {{511},{ 9}}, {{ 0},{ 7}}, {{ 64},{ 7}}, {{ 32},{ 7}}, {{ 96},{ 7}}, 56 | {{ 16},{ 7}}, {{ 80},{ 7}}, {{ 48},{ 7}}, {{112},{ 7}}, {{ 8},{ 7}}, 57 | {{ 72},{ 7}}, {{ 40},{ 7}}, {{104},{ 7}}, {{ 24},{ 7}}, {{ 88},{ 7}}, 58 | {{ 56},{ 7}}, {{120},{ 7}}, {{ 4},{ 7}}, {{ 68},{ 7}}, {{ 36},{ 7}}, 59 | {{100},{ 7}}, {{ 20},{ 7}}, {{ 84},{ 7}}, {{ 52},{ 7}}, {{116},{ 7}}, 60 | {{ 3},{ 8}}, {{131},{ 8}}, {{ 67},{ 8}}, {{195},{ 8}}, {{ 35},{ 8}}, 61 | {{163},{ 8}}, {{ 99},{ 8}}, {{227},{ 8}} 62 | }; 63 | 64 | local const ct_data static_dtree[D_CODES] = { 65 | {{ 0},{ 5}}, {{16},{ 5}}, {{ 8},{ 5}}, {{24},{ 5}}, {{ 4},{ 5}}, 66 | {{20},{ 5}}, {{12},{ 5}}, {{28},{ 5}}, {{ 2},{ 5}}, {{18},{ 5}}, 67 | {{10},{ 5}}, {{26},{ 5}}, {{ 6},{ 5}}, {{22},{ 5}}, {{14},{ 5}}, 68 | {{30},{ 5}}, {{ 1},{ 5}}, {{17},{ 5}}, {{ 9},{ 5}}, {{25},{ 5}}, 69 | {{ 5},{ 5}}, {{21},{ 5}}, {{13},{ 5}}, {{29},{ 5}}, {{ 3},{ 5}}, 70 | {{19},{ 5}}, {{11},{ 5}}, {{27},{ 5}}, {{ 7},{ 5}}, {{23},{ 5}} 71 | }; 72 | 73 | const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = { 74 | 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 75 | 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 76 | 10, 10, 10, 10, 
10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 77 | 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 78 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 79 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 80 | 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 81 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 82 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 83 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 84 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 85 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 86 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, 0, 16, 17, 87 | 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 88 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 89 | 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 90 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 91 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 92 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 93 | 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 94 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 95 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 96 | 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 97 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 98 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 99 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29 100 | }; 101 | 
102 | const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= { 103 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 104 | 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 105 | 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 106 | 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 107 | 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 108 | 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 109 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 110 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 111 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 112 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 113 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 114 | 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 115 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28 116 | }; 117 | 118 | local const int base_length[LENGTH_CODES] = { 119 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 120 | 64, 80, 96, 112, 128, 160, 192, 224, 0 121 | }; 122 | 123 | local const int base_dist[D_CODES] = { 124 | 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 125 | 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 126 | 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576 127 | }; 128 | 129 | -------------------------------------------------------------------------------- /src/zlib/zutil.h: -------------------------------------------------------------------------------- 1 | /* zutil.h -- internal interface and configuration of the compression library 2 | * Copyright (C) 1995-2013 Jean-loup Gailly. 
3 | * For conditions of distribution and use, see copyright notice in zlib.h 4 | */ 5 | 6 | /* WARNING: this file should *not* be used by applications. It is 7 | part of the implementation of the compression library and is 8 | subject to change. Applications should only use zlib.h. 9 | */ 10 | 11 | /* @(#) $Id$ */ 12 | 13 | #ifndef ZUTIL_H 14 | #define ZUTIL_H 15 | 16 | #ifdef HAVE_HIDDEN 17 | # define ZLIB_INTERNAL __attribute__((visibility ("hidden"))) 18 | #else 19 | # define ZLIB_INTERNAL 20 | #endif 21 | 22 | #include "zlib.h" 23 | 24 | #if defined(STDC) && !defined(Z_SOLO) 25 | # if !(defined(_WIN32_WCE) && defined(_MSC_VER)) 26 | # include <stddef.h> 27 | # endif 28 | # include <string.h> 29 | # include <stdlib.h> 30 | #endif 31 | 32 | #ifdef Z_SOLO 33 | typedef long ptrdiff_t; /* guess -- will be caught if guess is wrong */ 34 | #endif 35 | 36 | #ifndef local 37 | # define local static 38 | #endif 39 | /* compile with -Dlocal if your debugger can't find static symbols */ 40 | 41 | typedef unsigned char uch; 42 | typedef uch FAR uchf; 43 | typedef unsigned short ush; 44 | typedef ush FAR ushf; 45 | typedef unsigned long ulg; 46 | 47 | extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */ 48 | /* (size given to avoid silly warnings with Visual C++) */ 49 | 50 | #define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] 51 | 52 | #define ERR_RETURN(strm,err) \ 53 | return (strm->msg = ERR_MSG(err), (err)) 54 | /* To be used only when the state is known to be valid */ 55 | 56 | /* common constants */ 57 | 58 | #ifndef DEF_WBITS 59 | # define DEF_WBITS MAX_WBITS 60 | #endif 61 | /* default windowBits for decompression.
MAX_WBITS is for compression only */ 62 | 63 | #if MAX_MEM_LEVEL >= 8 64 | # define DEF_MEM_LEVEL 8 65 | #else 66 | # define DEF_MEM_LEVEL MAX_MEM_LEVEL 67 | #endif 68 | /* default memLevel */ 69 | 70 | #define STORED_BLOCK 0 71 | #define STATIC_TREES 1 72 | #define DYN_TREES 2 73 | /* The three kinds of block type */ 74 | 75 | #define MIN_MATCH 3 76 | #define MAX_MATCH 258 77 | /* The minimum and maximum match lengths */ 78 | 79 | #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ 80 | 81 | /* target dependencies */ 82 | 83 | #if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32)) 84 | # define OS_CODE 0x00 85 | # ifndef Z_SOLO 86 | # if defined(__TURBOC__) || defined(__BORLANDC__) 87 | # if (__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__)) 88 | /* Allow compilation with ANSI keywords only enabled */ 89 | void _Cdecl farfree( void *block ); 90 | void *_Cdecl farmalloc( unsigned long nbytes ); 91 | # else 92 | # include <alloc.h> 93 | # endif 94 | # else /* MSC or DJGPP */ 95 | # include <malloc.h> 96 | # endif 97 | # endif 98 | #endif 99 | 100 | #ifdef AMIGA 101 | # define OS_CODE 0x01 102 | #endif 103 | 104 | #if defined(VAXC) || defined(VMS) 105 | # define OS_CODE 0x02 106 | # define F_OPEN(name, mode) \ 107 | fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") 108 | #endif 109 | 110 | #if defined(ATARI) || defined(atarist) 111 | # define OS_CODE 0x05 112 | #endif 113 | 114 | #ifdef OS2 115 | # define OS_CODE 0x06 116 | # if defined(M_I86) && !defined(Z_SOLO) 117 | # include <malloc.h> 118 | # endif 119 | #endif 120 | 121 | #if defined(MACOS) || defined(TARGET_OS_MAC) 122 | # define OS_CODE 0x07 123 | # ifndef Z_SOLO 124 | # if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os 125 | # include <unix.h> /* for fdopen */ 126 | # else 127 | # ifndef fdopen 128 | # define fdopen(fd,mode) NULL /* No fdopen() */ 129 | # endif 130 | # endif 131 | # endif 132 | #endif 133 | 134 | #ifdef TOPS20 135 | # define OS_CODE 0x0a 136 | #endif 137 |
138 | #ifdef WIN32 139 | # ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */ 140 | # define OS_CODE 0x0b 141 | # endif 142 | #endif 143 | 144 | #ifdef __50SERIES /* Prime/PRIMOS */ 145 | # define OS_CODE 0x0f 146 | #endif 147 | 148 | #if defined(_BEOS_) || defined(RISCOS) 149 | # define fdopen(fd,mode) NULL /* No fdopen() */ 150 | #endif 151 | 152 | #if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX 153 | # if defined(_WIN32_WCE) 154 | # define fdopen(fd,mode) NULL /* No fdopen() */ 155 | # ifndef _PTRDIFF_T_DEFINED 156 | typedef int ptrdiff_t; 157 | # define _PTRDIFF_T_DEFINED 158 | # endif 159 | # else 160 | # define fdopen(fd,type) _fdopen(fd,type) 161 | # endif 162 | #endif 163 | 164 | #if defined(__BORLANDC__) && !defined(MSDOS) 165 | #pragma warn -8004 166 | #pragma warn -8008 167 | #pragma warn -8066 168 | #endif 169 | 170 | /* provide prototypes for these when building zlib without LFS */ 171 | #if !defined(_WIN32) && \ 172 | (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0) 173 | ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t)); 174 | ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t)); 175 | #endif 176 | 177 | /* common defaults */ 178 | 179 | #ifndef OS_CODE 180 | # define OS_CODE 0x03 /* assume Unix */ 181 | #endif 182 | 183 | #ifndef F_OPEN 184 | # define F_OPEN(name, mode) fopen((name), (mode)) 185 | #endif 186 | 187 | /* functions */ 188 | 189 | #if defined(pyr) || defined(Z_SOLO) 190 | # define NO_MEMCPY 191 | #endif 192 | #if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__) 193 | /* Use our own functions for small and medium model with MSC <= 5.0. 194 | * You may have to use the same strategy for Borland C (untested). 195 | * The __SC__ check is for Symantec. 
196 | */ 197 | # define NO_MEMCPY 198 | #endif 199 | #if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY) 200 | # define HAVE_MEMCPY 201 | #endif 202 | #ifdef HAVE_MEMCPY 203 | # ifdef SMALL_MEDIUM /* MSDOS small or medium model */ 204 | # define zmemcpy _fmemcpy 205 | # define zmemcmp _fmemcmp 206 | # define zmemzero(dest, len) _fmemset(dest, 0, len) 207 | # else 208 | # define zmemcpy memcpy 209 | # define zmemcmp memcmp 210 | # define zmemzero(dest, len) memset(dest, 0, len) 211 | # endif 212 | #else 213 | void ZLIB_INTERNAL zmemcpy OF((Bytef* dest, const Bytef* source, uInt len)); 214 | int ZLIB_INTERNAL zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len)); 215 | void ZLIB_INTERNAL zmemzero OF((Bytef* dest, uInt len)); 216 | #endif 217 | 218 | /* Diagnostic functions */ 219 | #ifdef DEBUG 220 | # include <stdio.h> 221 | extern int ZLIB_INTERNAL z_verbose; 222 | extern void ZLIB_INTERNAL z_error OF((char *m)); 223 | # define Assert(cond,msg) {if(!(cond)) z_error(msg);} 224 | # define Trace(x) {if (z_verbose>=0) fprintf x ;} 225 | # define Tracev(x) {if (z_verbose>0) fprintf x ;} 226 | # define Tracevv(x) {if (z_verbose>1) fprintf x ;} 227 | # define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} 228 | # define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} 229 | #else 230 | # define Assert(cond,msg) 231 | # define Trace(x) 232 | # define Tracev(x) 233 | # define Tracevv(x) 234 | # define Tracec(c,x) 235 | # define Tracecv(c,x) 236 | #endif 237 | 238 | #ifndef Z_SOLO 239 | voidpf ZLIB_INTERNAL zcalloc OF((voidpf opaque, unsigned items, 240 | unsigned size)); 241 | void ZLIB_INTERNAL zcfree OF((voidpf opaque, voidpf ptr)); 242 | #endif 243 | 244 | #define ZALLOC(strm, items, size) \ 245 | (*((strm)->zalloc))((strm)->opaque, (items), (size)) 246 | #define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) 247 | #define TRY_FREE(s, p) {if (p) ZFREE(s, p);} 248 | 249 | /* Reverse the bytes in a 32-bit value */ 250 | #define ZSWAP32(q)
((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \ 251 | (((q) & 0xff00) << 8) + (((q) & 0xff) << 24)) 252 | 253 | #endif /* ZUTIL_H */ 254 | -------------------------------------------------------------------------------- /testdata/R1.fq: -------------------------------------------------------------------------------- 1 | @NB551106:23:HVMTYBGX2:2:12302:19642:13894 1:N:0:GATCAG 2 | CATCACACACCTTGACTGGTCCCCAGACAACAAGTATATAATGTCTAACTCGGGAGACTATGAAATATTGTACTGTAAGTATGAATGATTTTATATATATATATATATGCTATGATTATATTTATATATATAATAATTATTTTCCATATAT 3 | + 4 | A/AA/EEEEEAEEAEEEEEEAEEEAEAEE/EEE/EEEEEEEEEEEEEEEEEEEAAEEEEEEEEEEE6EE/EEEEEE/ALK,chr2:29415640-30144432 2 | 1,30142859,30144432 3 | 2,29940444,29940563 4 | 3,29917716,29917880 5 | 4,29754781,29754982 6 | 5,29606598,29606725 7 | 6,29551216,29551347 8 | 7,29543617,29543748 9 | 8,29541170,29541270 10 | 9,29519754,29519923 11 | 10,29498268,29498362 12 | 11,29497965,29498093 13 | 12,29473971,29474133 14 | 13,29462546,29462696 15 | 14,29456431,29456562 16 | 15,29455170,29455314 17 | 16,29451750,29451932 18 | 17,29450440,29450538 19 | 18,29449788,29449940 20 | 19,29448327,29448431 21 | 20,29446208,29446394 22 | 21,29445383,29445473 23 | 22,29445210,29445274 24 | 23,29443572,29443701 25 | 24,29436850,29436947 26 | 25,29432652,29432744 27 | 26,29430037,29430138 28 | 27,29420408,29420542 29 | 28,29419636,29419726 30 | 29,29415640,29416788 31 | 32 | >ROS1,chr6:117609463-117747018 33 | 1,117746697,117747018 34 | 2,117739625,117739669 35 | 3,117737421,117737480 36 | 4,117730745,117730805 37 | 5,117725443,117725591 38 | 6,117724302,117724440 39 | 7,117718078,117718279 40 | 8,117717351,117717427 41 | 9,117715779,117715901 42 | 10,117715325,117715509 43 | 11,117714387,117714484 44 | 12,117710513,117711009 45 | 13,117708943,117709197 46 | 14,117708052,117708162 47 | 15,117706846,117707024 48 | 16,117704480,117704671 49 | 17,117700222,117700322 50 | 18,117687239,117687453 51 | 19,117686744,117686904 52 | 20,117686223,117686367 53 | 
21,117683766,117684028 54 | 22,117681505,117681568 55 | 23,117680972,117681174 56 | 24,117678967,117679172 57 | 25,117677792,117678078 58 | 26,117674153,117674332 59 | 27,117665223,117665425 60 | 28,117663563,117663707 61 | 29,117662563,117662795 62 | 30,117662298,117662474 63 | 31,117658335,117658503 64 | 32,117650492,117650609 65 | 33,117647387,117647577 66 | 34,117645495,117645578 67 | 35,117642422,117642557 68 | 36,117641031,117641193 69 | 37,117639351,117639415 70 | 38,117638306,117638435 71 | 39,117632183,117632280 72 | 40,117631244,117631444 73 | 41,117629957,117630091 74 | 42,117622137,117622300 75 | 43,117609463,117609965 76 | 77 | >RET,chr10:43572475-43625799 78 | 1,43572475,43572779 79 | 2,43595907,43596170 80 | 3,43597790,43598077 81 | 4,43600400,43600641 82 | 5,43601824,43602019 83 | 6,43604479,43604678 84 | 7,43606655,43606913 85 | 8,43607547,43607672 86 | 9,43608301,43608411 87 | 10,43609004,43609123 88 | 11,43609928,43610184 89 | 12,43612032,43612179 90 | 13,43613821,43613928 91 | 14,43614979,43615193 92 | 15,43615529,43615651 93 | 16,43617394,43617464 94 | 17,43619119,43619256 95 | 18,43620331,43620430 96 | 19,43622023,43622170 97 | 20,43623560,43625799 98 | 99 | >EML4,chr2:42396490-42559688 100 | 1,42396490,42396776 101 | 2,42472645,42472827 102 | 3,42483641,42483770 103 | 4,42488261,42488434 104 | 5,42490318,42490446 105 | 6,42491846,42491871 106 | 7,42507990,42508113 107 | 8,42509963,42510112 108 | 9,42511774,42511843 109 | 10,42513409,42513519 110 | 11,42515367,42515462 111 | 12,42522265,42522399 112 | 13,42522521,42522656 113 | 14,42528381,42528532 114 | 15,42530244,42530369 115 | 16,42530455,42530586 116 | 17,42531624,42531691 117 | 18,42543102,42543190 118 | 19,42544567,42544664 119 | 20,42552607,42552694 120 | 21,42553294,42553392 121 | 22,42556026,42556156 122 | 23,42556874,42559688 --------------------------------------------------------------------------------