├── .gitignore ├── LICENSE ├── CommandLineParser.h ├── Sort.h ├── RefGenome.h ├── SNPReader.h ├── Makefile ├── Output.h ├── Reads.h ├── HashTable.h ├── Changelog ├── MrsFAST.h ├── Common.h ├── SNPIndexer.c ├── Output.c ├── RefGenome.c ├── baseFAST.c ├── SNPReader.c ├── Common.c ├── HELP.man ├── Sort.c ├── README.md ├── CommandLineParser.c ├── HashTable.c └── Reads.c /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /data 3 | /data/* 4 | mrsfast 5 | snp_indexer 6 | gmon.out 7 | HELP 8 | *.o 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this list 8 | of conditions and the following disclaimer. 9 | - Redistributions in binary form must reproduce the above copyright notice, this 10 | list of conditions and the following disclaimer in the documentation and/or other 11 | materials provided with the distribution. 12 | - Neither the name of the nor the names of its contributors may be 13 | used to endorse or promote products derived from this software without specific 14 | prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | 29 | -------------------------------------------------------------------------------- /CommandLineParser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #ifndef __COMMAND_LINE_PARSER__ 37 | #define __COMMAND_LINE_PARSER__ 38 | 39 | int parseCommandLine (int argc, char *argv[]); 40 | void finalizeCommandParser(); 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /Sort.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | * Ermin Hodzic (ermin_hodzic AT sfu DOT ca) 35 | */ 36 | 37 | #ifndef __SORT__ 38 | #define __SORT__ 39 | #include "Common.h" 40 | #include "Reads.h" 41 | 42 | void introSortGI (GeneralIndex *, const int, const int); 43 | void introSortPair (Pair *, const int, const int); 44 | #endif 45 | -------------------------------------------------------------------------------- /RefGenome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 
10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | 37 | #ifndef _REF_GENOME_ 38 | #define _REF_GENOME_ 39 | 40 | int initLoadingRefGenome(char *fileName, char *genomeInfo, int *genomeInfoSize); 41 | void finalizeLoadingRefGenome(); 42 | int loadRefGenome(char **refGen, char **refGenName, int *refGenOff, int *refGenLen); 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /SNPReader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #ifndef __SNIP_READER__ 37 | #define __SNIP_READER__ 38 | 39 | CompressedSeq *loadSNPMap(char *chrName, int contigStartIndex, int contigLength, char *snpAlternatives); 40 | void initLoadingSNPs(char *fileName); 41 | void finalizeSNPs(); 42 | void rewindSNPIndex(); 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MRSFAST_VERSION := "3.4.2" 2 | BUILD_DATE := "$(shell date)" 3 | 4 | CC?= gcc 5 | all: OPTIMIZE_FLAGS build 6 | debug: DEBUG_FLAGS build 7 | profile: PROFILE_FLAGS build 8 | build: clean_executables SSE_FLAGS mrsfast snp_indexer clean_objects 9 | 10 | 11 | LIBS=-lz -lm -pthread -lpthread 12 | CFLAGS=-fno-pic -DMRSFAST_VERSION=\"$(MRSFAST_VERSION)\" -DBUILD_DATE=\"$(BUILD_DATE)\" 13 | 14 | objects=baseFAST.o Sort.o MrsFAST.o Common.o CommandLineParser.o RefGenome.o HashTable.o Reads.o Output.o SNPReader.o HELP.o 15 | 16 | mrsfast: clean_executables $(objects) 17 | ifeq ($(shell uname -s),Linux) 18 | $(CC) -w $(objects) -o $@ ${LDFLAGS} ${LIBS} 19 | else 20 | $(CC) -Wl,-no_pie -fno-pic -w $(objects) -o $@ ${LDFLAGS} ${LIBS} 21 | endif 22 | 23 | snp_indexer: clean_executables SNPIndexer.o 24 | $(CC) SNPIndexer.o -o $@ ${LDFLAGS} ${LIBS} 25 | 26 | clean_objects: mrsfast snp_indexer 27 | @rm -f $(objects) 28 | @rm -f SNPIndexer.o 29 | @rm -f HELPstub.c 30 | @rm -f HELPstub.o 31 | 32 | clean: 33 | @rm -f $(objects) 34 | @rm -f SNPIndexer.o 35 | @rm -f HELPstub.c 36 | @rm -f HELPstub.o 37 | 38 | clean_executables: 39 | @rm -f mrsfast 40 | @rm -f snp_indexer 41 | 42 | HELP.o: 43 | @groff -Tascii -man ./HELP.man > HELP 44 | ifeq ($(shell uname -s),Linux) 45 | @ld -r -b binary -o HELP.o HELP 46 | else 47 | @touch HELPstub.c 48 | $(CC) -o HELPstub.o -c HELPstub.c 49 | ld 
-r -o HELP.o -sectcreate binary HELP HELP HELPstub.o 50 | endif 51 | 52 | DEBUG_FLAGS: 53 | $(eval CFLAGS = $(CFLAGS) -ggdb) 54 | $(eval LIBS = $(LIBS) -ggdb) 55 | 56 | OPTIMIZE_FLAGS: 57 | $(eval CFLAGS = $(CFLAGS) -O2) 58 | 59 | PROFILE_FLAGS: 60 | $(eval CFLAGS = $(CFLAGS) -pg -g) 61 | $(eval LIBS = $(LIBS) -pg -g) 62 | 63 | SSE_FLAGS: 64 | ifeq ($(shell uname -s),Linux) 65 | ifeq ($(with-sse4),no) 66 | $(shell echo "-DSSE4=0") 67 | else 68 | $(eval CFLAGS = $(CFLAGS) \ 69 | $(shell gv=`$(CC) -dumpversion`; \ 70 | sc=`grep -c "sse4" /proc/cpuinfo`; \ 71 | echo $$sc.$$gv | awk -F. '{if($$1>0 && $$2>=4 && $$3>=4) print "-DSSE4=1 -msse4.2"; else print "-DSSE4=0"}')) 72 | endif 73 | else 74 | ifeq ($(with-sse4),no) 75 | $(shell echo "-DSSE4=0") 76 | else 77 | $(eval CFLAGS = $(CFLAGS) \ 78 | $(shell gv=`$(CC) -dumpversion`; \ 79 | sc=`sysctl -n machdep.cpu.features | grep -c "SSE4"` ;\ 80 | echo $$sc.$$gv | awk -F. '{if($$1>0 && $$2>=4 && $$3>=4) print "-DSSE4=1 -msse4.2"; else print "-DSSE4=0"}')) 81 | endif 82 | endif 83 | -------------------------------------------------------------------------------- /Output.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 
13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #ifndef __OUTPUT__ 37 | #define __OUTPUT__ 38 | 39 | #define FORWARD 0 40 | #define REVERSE 1 41 | /* One optional field attached to a SAM record: a tag string, a one-character type code, and one value slot per C type (char, int, float, string). NOTE(review): presumably `type` selects which of cVal/iVal/fVal/sVal is meaningful -- confirm in Output.c. */ 42 | typedef struct 43 | { 44 | char *tag; 45 | char type; 46 | char cVal; 47 | int iVal; 48 | float fVal; 49 | char *sVal; 50 | } OPT_FIELDS; 51 | /* One alignment record handed to the output callbacks. NOTE(review): the member names look like the mandatory SAM columns (QNAME ... QUAL) -- confirm against the SAM specification. optFields points at optSize OPT_FIELDS entries (presumably; the length relation is not visible in this header). */ 52 | typedef struct 53 | { 54 | char *QNAME; 55 | short FLAG; 56 | char *RNAME; 57 | int POS; 58 | unsigned char MAPQ; 59 | char *CIGAR; 60 | char *MRNAME; 61 | int MPOS; 62 | int ISIZE; 63 | char *SEQ; 64 | char *QUAL; 65 | 66 | int optSize; 67 | OPT_FIELDS *optFields; 68 | } SAM; 69 | 70 | int initOutput(char *fileName, int compressed); /* NOTE(review): the four function pointers below are declared at file scope WITHOUT `extern`, so every translation unit that includes this header gets its own tentative definition of each. If more than one .c file includes Output.h, linking fails with "multiple definition" errors on toolchains that default to -fno-common (GCC 10 and later). Suggested follow-up: mark them `extern` here and define each exactly once in Output.c. TODO confirm which .c files include this header. */ 71 | void (*finalizeOutput)(); 72 | void (*output)(SAM map); 73 | void (*outputMeta)(char*); 74 | void (*outputBuffer)(char *, int); 75 | 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /Reads.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #ifndef __READ__ 37 | #define __READ__ 38 | 39 | typedef struct 40 | { 41 | int32_t hv; 42 | CheckSumType checksum; 43 | int32_t seqInfo; 44 | 45 | } Pair; 46 | 47 | typedef struct 48 | { 49 | uint16_t *hits; 50 | char *seq; 51 | char *qual; 52 | char *rseq; 53 | CompressedSeq *cseq; 54 | CompressedSeq *crseq; 55 | char *name; 56 | unsigned char *alphCnt; 57 | } Read; 58 | 59 | int readChunk(Read **seqList, unsigned int *seqListSize); 60 | void finalizeReads(); 61 | void getSamplingLocsInfo(int **samplingLocs, int **samplingLocsSeg, int **samplingLocsOffset, int **samplingLocsLen, int **samplingLocsLenFull, int *samplingLocsSize); 62 | int initRead(char *seqFile1, char *seqFile2); 63 | void getReadIndex(ReadIndexTable ***rIndex, int **rIndexSize); 64 | void releaseChunk(); 65 | #endif 66 | -------------------------------------------------------------------------------- /HashTable.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 
<2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | 37 | #ifndef __HASH_TABLE__ 38 | #define __HASH_TABLE__ 39 | 40 | typedef struct HashTable 41 | { 42 | long long hv; 43 | int *locs; 44 | } HashTable; 45 | 46 | typedef struct 47 | { 48 | GeneralIndex *list; 49 | } IHashTable; 50 | 51 | char *getRefGenome(); 52 | char *getRefGenomeName(); 53 | int getRefGenomeOffset(); 54 | CompressedSeq *getCmpRefGenome(); 55 | CompressedSeq *getCmpRefGenOrigin(); 56 | int getRefGenLength(); 57 | int getCmpRefGenLength(); 58 | int initLoadingHashTable(char*); 59 | HashTable *getHashTable(); 60 | GeneralIndex *getCandidates(int hv); 61 | unsigned char *getAlphabetCount(); 62 | void rewindHashTable(); 63 | int getChrCnt(); 64 | char **getChrNames(); 65 | int getMaxChrLength(); 66 | int generateHashTable(char*, char*); 67 | int checkHashTable(char*); 68 | int loadHashTable(double*); 69 | void finalizeLoadingHashTable(); 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /Changelog: -------------------------------------------------------------------------------- 1 | 2014-03-31 Faraz Hach 2 | * Added with-sse4 option to Makefile 3 | * Bug reported by viktor in dc mode is fixed. 4 | 5 | 2014-01-14 Faraz Hach 6 | * Bug related to calculation of q-grams with respect to the last character 7 | of the reference genome is fixed. 8 | * Code is ported to OS X Mavericks. 9 | * Code is compatible with clang. 10 | * Makefile is modified to handle OS X help file generation. 11 | * New GUI is released for use of mrsfast-ultra. 12 | 13 | 2013-09-13 Faraz Hach 14 | * Bug fixes. 15 | 16 | 2013-07-23 Faraz Hach 17 | * Patch sent by J.K Teer is applied on Reads.c for gzip PE input. 18 | 19 | 2013-06-07 Faraz Hach 20 | * Bug fixes. 21 | 22 | 2013-01-16 Faraz Hach 23 | * First Release of mrsfast-ultra BETA. 
24 | * New High Performance mapper 25 | * SNP-aware mapping mode. 26 | 27 | 2012-11-14 Faraz Hach 28 | * Bug fix with respect to quality end null characters. 29 | 30 | 2012-08-18 Faraz Hach 31 | * DIVET output format is fixed. 32 | 33 | 2012-06-13 Faraz Hach 34 | * Phred offset is added. 35 | * DISCORDANT_CUT_OFF is added. 36 | * Bug related to MD is fixed. 37 | * Bug related to CIGAR is fixed. 38 | * Bug related to indexing is fixed. 39 | 40 | 2010-05-03 Faraz Hach 41 | * SAM field quality for fasta files is fixed. 42 | * Window size issues are fixed. 43 | * Path issues are fixed. 44 | * Comments in fastq/fasta files are ignored. 45 | 46 | 2010-02-08 Faraz Hach 47 | * SAM field flag is fixed. 48 | * Paths in discordant-vh mode are fixed. 49 | * --min and --max parameters now define the template length (the 50 | distance between outer edges of the mapping). 51 | 52 | 2009-12-08 Faraz Hach 53 | * Cache oblivious algorithm is implemented (2x speed up) 54 | * Bugs related to PE mode are fixed. 55 | * Bisulfite support is dropped. 56 | * VariationHunter mode is added. 57 | 58 | 2009-09-11 Faraz Hach 59 | * Bug related to reporting same location multiple times is fixed. 60 | 61 | 2009-07-31 Faraz Hach 62 | * Bug #2828635 is fixed. 63 | * Bug #2828636 is fixed. 64 | * Several bugs in PE mode are fixed. 65 | * SAM field TLEN is fixed. 66 | * --crop is added to command line options. 67 | 68 | 2009-06-17 Faraz Hach 69 | * Performance Improvement. 70 | 71 | 2009-06-04 Faraz Hach 72 | * SAM fields CIGAR and MD are implemented. 73 | * Screen log output is modified. 74 | * Same mate pair names without identifiers (/1,/2) in two files are supported. 75 | 76 | 2009-06-01 Faraz Hach 77 | * Command line options are simplified. 78 | * Support for reading gzip files is added. 79 | * Support for outputting in gzip format is added. 80 | * Support for SAM output is added. 81 | * Multiple bugs related to memory leakage during indexing are fixed. 
82 | * Memory Usage is reported in the output. 83 | -------------------------------------------------------------------------------- /MrsFAST.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #ifndef __MRS_FAST__ 37 | #define __MRS_FAST__ 38 | 39 | #include "Reads.h" 40 | 41 | #define MAP_CHUNKS 15 42 | 43 | enum BestMappingStatus { unset = 0, first_mate, second_mate, trans_loc, improper, proper }; 44 | 45 | typedef struct mn 46 | { 47 | int loc; 48 | char dir; 49 | char err; 50 | char mderr; 51 | float score; 52 | int hits; 53 | int secondBestHits; 54 | int secondBestErrors; 55 | char md[40]; 56 | char chr[40]; 57 | } FullMappingInfo; 58 | 59 | 60 | typedef struct mnp 61 | { 62 | int loc1; 63 | char dir1; 64 | char err1; 65 | char mderr1; 66 | char md1[40]; 67 | char chr1[40]; 68 | int loc2; 69 | char dir2; 70 | char err2; 71 | char mderr2; 72 | char md2[40]; 73 | char chr2[40]; 74 | enum BestMappingStatus status; 75 | } BestMappingInfoPE; 76 | 77 | 78 | 79 | typedef struct lc 80 | { 81 | int loc[MAP_CHUNKS]; 82 | char err[MAP_CHUNKS]; 83 | struct lc *next; 84 | } MappingLocations; 85 | 86 | typedef struct inf 87 | { 88 | int size; 89 | MappingLocations *next; 90 | } MappingInfo; 91 | 92 | typedef struct 93 | { 94 | FILE * fp; 95 | char name[400]; 96 | } FILE_STRUCT; 97 | 98 | extern long long verificationCnt; 99 | extern long long mappingCnt; 100 | extern long long mappedSeqCnt; 101 | extern long long completedSeqCnt; 102 | 103 | void initializeFAST(int seqListSize); 104 | void initFASTChunk(Read *seqList, int seqListSize); 105 | void initFASTContig(); 106 | void finalizeFAST(); 107 | 108 | void mapSeq(unsigned char contigFlag); 109 | #endif 110 | -------------------------------------------------------------------------------- /Common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #ifndef __COMMON__ 37 | #define __COMMON__ 38 | 39 | #if SSE4==1 40 | #define MRSFAST_SSE4 41 | #endif 42 | 43 | 44 | #include 45 | #include 46 | /* NOTE(review): both #include directives above have lost their header names in this copy. The declarations in this header use uint64_t, uint16_t, uint32_t, FILE and gzFile, so the headers that provide those types must be restored -- check the upstream file for the exact names. */ 47 | #define SEQ_MAX_LENGTH 500 // Seq Max Length 48 | #define CMP_SEQ_MAX_LENGTH 10 // Compressed Seq Max Length 49 | #define CONTIG_OVERLAP 1050 // No. of characters overlapped between contigs -- equals 50 blocks of length 21 50 | #define CONTIG_NAME_SIZE 200 // Contig name max size 51 | #define FILE_NAME_LENGTH 500 // Filename Max Length 52 | #define MAX_SNP_PER_CHR 100000000 53 | 54 | 55 | typedef uint64_t CompressedSeq; 56 | typedef uint16_t CheckSumType; 57 | 58 | extern unsigned int CONTIG_SIZE; 59 | extern unsigned int CONTIG_MAX_SIZE; 60 | extern unsigned int THREAD_COUNT; 61 | extern double MAX_MEMORY; 62 | extern int THREAD_ID[255]; 63 | 64 | extern unsigned char WINDOW_SIZE; // WINDOW SIZE for indexing/searching 65 | extern unsigned short SEQ_LENGTH; // Sequence(read) length 66 | extern unsigned short QUAL_LENGTH; 67 | extern unsigned short CMP_SEQ_LENGTH; 68 | extern unsigned short DISCORDANT_CUT_OFF; 69 | extern int SNP_QUAL_THRESHOLD; 70 | 71 | extern int indexingMode; 72 | extern int searchingMode; 73 | extern int pairedEndMode; 74 | extern int pairedEndDiscordantMode; 75 | extern int pairedEndProfilingMode; 76 | extern int bestMappingMode; 77 | extern int SNPMode; 78 | extern int seqCompressed; 79 | extern int outCompressed; 80 | extern int cropSize; 81 | extern int tailCropSize; 82 | extern int progressRep; 83 | extern int nohitDisabled; 84 | extern int noSamHeader; 85 | extern char *seqFile1; 86 | extern char *seqFile2; 87 | extern char *seqUnmapped; 88 | extern char *mappingOutput; 89 | extern char *mappingOutputPath; 90 | extern char *unmappedOutput; 91 | extern char *concordantStatOutput; 92 | extern unsigned char seqFastq; 93 | extern int
errThreshold; 94 | extern short maxHits; 95 | extern int minPairEndedDiscordantDistance; 96 | extern int maxPairEndedDiscordantDistance; 97 | extern int minPairEndedDistance; 98 | extern int maxPairEndedDistance; 99 | extern char fileName[3][FILE_NAME_LENGTH]; 100 | extern int fileCnt; 101 | extern long long memUsage; 102 | extern char *alphabet; 103 | extern char checkSumLength; 104 | 105 | #pragma pack(push, 1) 106 | typedef struct 107 | { 108 | CheckSumType checksum; 109 | uint32_t info; // ReadIndex => seqInfo | GenomeIndex ==> Loc 110 | } GeneralIndex; 111 | #pragma pack(pop) 112 | typedef struct 113 | { 114 | int hv; 115 | GeneralIndex *list; 116 | } ReadIndexTable; 117 | 118 | typedef struct 119 | { 120 | int loc; 121 | char alt; 122 | } SNPLoc; 123 | 124 | typedef struct 125 | { 126 | char *chrName; 127 | int locCnt; 128 | SNPLoc *snpLocs; 129 | } ChrSNPs; 130 | 131 | FILE * fileOpen(char *fileName, char *mode); 132 | gzFile fileOpenGZ(char *fileName, char *mode); 133 | double getTime(void); 134 | void reverseComplete (char *seq, char *rcSeq , int length); 135 | char reverseCompleteChar(char); 136 | void * getMem(size_t size); 137 | void freeMem(void * ptr, size_t size); 138 | double getMemUsage(); 139 | void reverse (char *seq, char *rcSeq , int length); 140 | void stripPath(char *full, char **path, char **fileName); 141 | void compressSequence(char *seq, int seqLen, CompressedSeq *cseq); 142 | int calculateCompressedLen(int normalLen); 143 | int hashVal(char *seq); 144 | int checkSumVal(char *seq); 145 | void initCommon(); 146 | #endif 147 | -------------------------------------------------------------------------------- /SNPIndexer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "Common.h" 8 | 9 | #define MAX_LINE_LENGTH 4000 10 | #define MAX_NUM_OF_CHRS 100000 11 | #define PROGRESS_METER_UNIT 4000000 12 | 13 | 
/**********************************************/ 14 | FILE *fileOpen(char *fileName, char *mode) 15 | { 16 | FILE *fp; 17 | fp = fopen (fileName, mode); 18 | if (fp == NULL) 19 | { 20 | fprintf(stdout, "Error: Cannot Open file \"%s\"\n", fileName); 21 | fflush(stdout); 22 | exit(EXIT_FAILURE); 23 | } 24 | return fp; 25 | } 26 | 27 | /**********************************************/ 28 | int cmp(const void *a, const void *b) 29 | { 30 | SNPLoc *x = (SNPLoc *) a; 31 | SNPLoc *y = (SNPLoc *) b; 32 | return (x->loc - y->loc); 33 | } 34 | 35 | /**********************************************/ 36 | int findChrIndex(char *chr, ChrSNPs *chrInfo, int chrCount) 37 | { 38 | int i; 39 | for (i = 0; i < chrCount; i++) 40 | { 41 | if (! strcmp(chr, chrInfo[i].chrName)) 42 | return i; 43 | } 44 | return -1; 45 | } 46 | /**********************************************/ 47 | void freeMems(ChrSNPs *chrInfo, int chrCount) 48 | { 49 | int i; 50 | for (i = 0; i < chrCount; i++) 51 | { 52 | free(chrInfo[i].chrName); 53 | free(chrInfo[i].snpLocs); 54 | } 55 | free(chrInfo); 56 | } 57 | /**********************************************/ 58 | void fixChromosomeName(char *cname) 59 | { 60 | // any other unifying standard for chromosome names can be added here 61 | // this might be required to make sure different names for the same chromosome (like M and MT, or chr1 and 1) are unified 62 | if (! strcmp(cname, "MT")) 63 | cname[1] = '\0'; 64 | } 65 | /**********************************************/ 66 | 67 | int main(int argc, char *argv[]) 68 | { 69 | if (argc < 3) 70 | { 71 | fprintf(stderr, "Too few input arguments\nInputs must be:\n\t1. Input vcf (v4.0) file name\n\t2. 
Output SNP index file name\n"); 72 | return 0; 73 | } 74 | FILE *inFile = 0, *outFile = 0; 75 | char *inFileName = argv[1]; 76 | char *outFileName = argv[2]; 77 | char line[MAX_LINE_LENGTH], chr[MAX_LINE_LENGTH], ref[MAX_LINE_LENGTH], alt[MAX_LINE_LENGTH], dummy[MAX_LINE_LENGTH]; 78 | int i, j, loc, chrIndex, nameLen; 79 | unsigned int snpCount = 0; 80 | int chrCount = 0; 81 | ChrSNPs *chrInfo = malloc(MAX_NUM_OF_CHRS * sizeof(ChrSNPs)); 82 | 83 | // read file, count number of chromosomes and their locations 84 | inFile = fileOpen(inFileName, "r"); 85 | fprintf(stdout, "Pre-processing VCF file ...\n"); 86 | 87 | while ( fgets(line, MAX_LINE_LENGTH, inFile) ) 88 | { 89 | if (line[0] == '#') // comment line 90 | continue; 91 | 92 | sscanf(line, "%s%d%s%s%s", chr, &loc, dummy, ref, alt); // read fields 93 | if (strlen(ref) != 1 || strlen(alt) != 1) // only one bp variants 94 | continue; 95 | fixChromosomeName(chr); 96 | 97 | snpCount ++; 98 | chrIndex = findChrIndex(chr, chrInfo, chrCount); 99 | if (chrIndex == -1) // new chr name 100 | { 101 | chrIndex = chrCount ++; 102 | // copy chr name 103 | nameLen = strlen(chr); 104 | chrInfo[chrIndex].chrName = malloc(nameLen + 1); 105 | strcpy(chrInfo[chrIndex].chrName, chr); 106 | // set number of locations 107 | chrInfo[chrIndex].locCnt = 1; 108 | chrInfo[chrIndex].snpLocs = NULL; 109 | } 110 | else 111 | { 112 | chrInfo[chrIndex].locCnt ++; 113 | } 114 | } 115 | 116 | fprintf(stdout, "Chromosomes: %d\n", chrCount); 117 | fprintf(stdout, "Valid SNP locations: %d\n", snpCount); 118 | 119 | // allocate SNPLocs for each chromosome 120 | for (i = 0; i < chrCount; i++) 121 | { 122 | chrInfo[i].snpLocs = malloc(chrInfo[i].locCnt * sizeof(SNPLoc)); 123 | chrInfo[i].locCnt = 0; 124 | //fprintf(stdout, "%s\n", chrInfo[i].chrName); 125 | } 126 | 127 | // read file again, fill locations 128 | rewind(inFile); 129 | i = 0; 130 | fprintf(stdout, "Reading SNP locations "); 131 | fflush(stdout); 132 | 133 | while ( fgets(line, 
MAX_LINE_LENGTH, inFile) ) 134 | { 135 | if (++i == PROGRESS_METER_UNIT) 136 | { 137 | fprintf(stdout, "."); 138 | fflush(stdout); 139 | i = 0; 140 | } 141 | if (line[0] == '#') // comment line 142 | continue; 143 | 144 | sscanf(line, "%s%d%s%s%s", chr, &loc, dummy, ref, alt); // read fields 145 | if (strlen(ref) != 1 || strlen(alt) != 1) // only one bp variants 146 | continue; 147 | fixChromosomeName(chr); 148 | 149 | chrIndex = findChrIndex(chr, chrInfo, chrCount); 150 | assert(chrIndex != -1); 151 | j = chrInfo[chrIndex].locCnt; 152 | chrInfo[chrIndex].snpLocs[j].loc = loc; 153 | chrInfo[chrIndex].snpLocs[j].alt = alt[0]; 154 | chrInfo[chrIndex].locCnt ++; 155 | } 156 | 157 | fclose(inFile); 158 | 159 | // sort locations for each chromosome 160 | fprintf(stdout, ".\nReformatting data ...\n"); 161 | 162 | for (i = 0; i < chrCount; i++) 163 | { 164 | if (chrInfo[i].locCnt > 0) 165 | qsort(chrInfo[i].snpLocs, chrInfo[i].locCnt, sizeof(SNPLoc), cmp); 166 | } 167 | 168 | // write to output file 169 | fprintf(stdout, "Creating output in %s\n", outFileName); 170 | 171 | outFile = fileOpen(outFileName, "w"); 172 | fwrite(&chrCount, sizeof(int), 1, outFile); 173 | 174 | for (i = 0; i < chrCount; i++) 175 | { 176 | nameLen = strlen(chrInfo[i].chrName); 177 | fwrite(&nameLen, sizeof(int), 1, outFile); // chr name length 178 | fwrite(chrInfo[i].chrName, sizeof(char), nameLen, outFile); // chr name 179 | fwrite(&chrInfo[i].locCnt, sizeof(int), 1, outFile); // num of locs 180 | fwrite(chrInfo[i].snpLocs, sizeof(SNPLoc), chrInfo[i].locCnt, outFile); // all snp locations 181 | } 182 | 183 | fclose(outFile); 184 | fprintf(stdout, "%u SNP locations registered successfully\n", snpCount); 185 | 186 | freeMems(chrInfo, chrCount); 187 | return 0; 188 | } 189 | -------------------------------------------------------------------------------- /Output.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University 
of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include "Common.h" 41 | #include "Output.h" 42 | 43 | FILE *_out_fp; 44 | gzFile _out_gzfp; 45 | 46 | 47 | 48 | void finalizeGZOutput() 49 | { 50 | gzclose(_out_gzfp); 51 | } 52 | 53 | void finalizeTXOutput() 54 | { 55 | fclose(_out_fp); 56 | } 57 | 58 | 59 | void gzOutputQ(SAM map) 60 | { 61 | gzprintf(_out_gzfp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", 62 | map.QNAME, 63 | map.FLAG, 64 | map.RNAME, 65 | map.POS, 66 | map.MAPQ, 67 | map.CIGAR, 68 | map.MRNAME, 69 | map.MPOS, 70 | map.ISIZE, 71 | map.SEQ, 72 | map.QUAL); 73 | 74 | int i; 75 | 76 | for ( i = 0; i < map.optSize; i++) 77 | { 78 | switch (map.optFields[i].type) 79 | { 80 | case 'A': 81 | gzprintf(_out_gzfp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); 82 | break; 83 | case 'i': 84 | gzprintf(_out_gzfp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); 85 | break; 86 | case 'f': 87 | gzprintf(_out_gzfp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); 88 | break; 89 | case 'Z': 90 | case 'H': 91 | gzprintf(_out_gzfp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); 92 | break; 93 | } 94 | } 95 | gzprintf(_out_gzfp, "\n"); 96 | } 97 | 98 | void outputQ(SAM map) 99 | { 100 | 101 | fprintf(_out_fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", 102 | map.QNAME, 103 | map.FLAG, 104 | map.RNAME, 105 | map.POS, 106 | map.MAPQ, 107 | map.CIGAR, 108 | map.MRNAME, 109 | map.MPOS, 110 | map.ISIZE, 111 | map.SEQ, 112 | map.QUAL); 113 | 114 | 115 | int i; 116 | for ( i = 0; i < map.optSize; i++) 117 | { 118 | switch (map.optFields[i].type) 119 | { 120 | case 'A': 121 | fprintf(_out_fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); 
122 | break; 123 | case 'i': 124 | fprintf(_out_fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); 125 | break; 126 | case 'f': 127 | fprintf(_out_fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); 128 | break; 129 | case 'Z': 130 | case 'H': 131 | fprintf(_out_fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); 132 | break; 133 | } 134 | } 135 | 136 | fprintf(_out_fp, "\n"); 137 | } 138 | 139 | void outputBufferTxT(char *str, int size) 140 | { 141 | fwrite(str, 1, size, _out_fp); 142 | } 143 | 144 | void outputBufferGZ(char *str, int size) 145 | { 146 | gzwrite(_out_gzfp, str, size); 147 | } 148 | 149 | void outputMetaQ(char* str) 150 | { 151 | fprintf(_out_fp, "%s\n", str); 152 | } 153 | 154 | void gzOutputMetaQ(char* str) 155 | { 156 | gzprintf(_out_gzfp, "%s\n", str); 157 | } 158 | 159 | void noMetaOutput(char *str) {} 160 | 161 | int initOutput ( char *fileName, int compressed) 162 | { 163 | if (compressed) 164 | { 165 | char newFileName[strlen(mappingOutputPath)+strlen(fileName)+4]; 166 | sprintf(newFileName, "%s%s.sam.gz", mappingOutputPath, fileName); 167 | _out_gzfp = fileOpenGZ(newFileName, "w1f"); 168 | if (_out_gzfp == Z_NULL) 169 | { 170 | return 0; 171 | } 172 | 173 | finalizeOutput = &finalizeGZOutput; 174 | 175 | output = &gzOutputQ; 176 | outputMeta =&gzOutputMetaQ; 177 | outputBuffer = &outputBufferGZ; 178 | } 179 | else 180 | { 181 | 182 | char newFileName[strlen(mappingOutputPath)+strlen(fileName)+strlen(".sam")+1]; 183 | if ( !strcmp(mappingOutputPath, "/dev/") && !strcmp(fileName, "null") ) 184 | { 185 | sprintf(newFileName, "%s%s", mappingOutputPath, fileName); 186 | nohitDisabled = 1; 187 | } 188 | else 189 | { 190 | //sprintf(newFileName, "%s%s.sam", mappingOutputPath, fileName); 191 | sprintf(newFileName, "%s%s", mappingOutputPath, fileName); 192 | } 193 | 194 | _out_fp = fileOpen(newFileName, "w"); 195 | if (_out_fp == NULL) 196 | { 
197 | return 0; 198 | } 199 | 200 | finalizeOutput = &finalizeTXOutput; 201 | output = &outputQ; 202 | outputMeta = &outputMetaQ; 203 | outputBuffer = &outputBufferTxT; 204 | } 205 | 206 | if (noSamHeader) 207 | outputMeta = &noMetaOutput; 208 | 209 | outputMeta("@HD\tVN:1.4\tSO:unsorted"); 210 | 211 | return 1; 212 | } 213 | 214 | 215 | -------------------------------------------------------------------------------- /RefGenome.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "Common.h" 42 | #include "RefGenome.h" 43 | 44 | FILE *_rg_fp; 45 | char *_rg_gen; 46 | char *_rg_name; 47 | int _rg_offset; 48 | int _rg_contGen; // false if this segment is the first contig 49 | 50 | int getGenomeMetaInfo(char*, char*, int*); 51 | 52 | /**********************************************/ 53 | int initLoadingRefGenome(char *fileName, char *genomeMetaInfo, int *genomeMetaInfoLength) 54 | { 55 | _rg_fp = fileOpen (fileName, "r"); 56 | 57 | if (!getGenomeMetaInfo(fileName, genomeMetaInfo, genomeMetaInfoLength)) 58 | return 0; 59 | 60 | char ch; 61 | fscanf(_rg_fp, "%c", &ch); // '>' 62 | _rg_contGen = 0; 63 | _rg_offset = 0; 64 | _rg_gen = getMem(CONTIG_MAX_SIZE + 21); 65 | _rg_name = getMem(CONTIG_NAME_SIZE); 66 | return 1; 67 | } 68 | /**********************************************/ 69 | void finalizeLoadingRefGenome() 70 | { 71 | freeMem(_rg_gen, CONTIG_MAX_SIZE + 21); 72 | freeMem(_rg_name, CONTIG_NAME_SIZE); 73 | fclose(_rg_fp); 74 | } 75 | /**********************************************/ 76 | int loadRefGenome(char **refGen, char **refGenName, int *refGenOff, int *refGenLen) 77 | { 78 | char ch; 79 | int i; 80 | int returnVal = 0; 81 | int actualSize=0; 82 | int size; 83 | char 
*tmp; 84 | 85 | // New Conting 86 | if (!_rg_contGen) 87 | { 88 | size = 0; 89 | tmp = fgets(_rg_name, SEQ_MAX_LENGTH, _rg_fp); 90 | int k; 91 | for (k=0; _rg_name[k] != '\0';k++) 92 | { 93 | if (_rg_name[k] == ' ' || _rg_name[k] == '\t') 94 | { 95 | _rg_name[k]='\0'; 96 | break; 97 | } 98 | } 99 | 100 | } 101 | else 102 | { 103 | size=strlen(_rg_gen); 104 | for( i = 0 ; i < CONTIG_OVERLAP ; i++ ) 105 | { 106 | _rg_gen[i] = _rg_gen[size-CONTIG_OVERLAP+i]; 107 | if (_rg_gen[i] != 'N') 108 | actualSize++; 109 | } 110 | size = CONTIG_OVERLAP; 111 | } 112 | 113 | while( fscanf(_rg_fp, "%c", &ch) > 0 ) 114 | { 115 | if (ch == '>') 116 | { 117 | _rg_contGen = 0; 118 | returnVal = 2; 119 | break; 120 | } 121 | else if (!isspace(ch)) 122 | { 123 | ch = toupper(ch); 124 | _rg_gen[size++] = ch; 125 | if (ch != 'N') 126 | { 127 | actualSize++; 128 | } 129 | if ((actualSize > CONTIG_SIZE || size >= CONTIG_MAX_SIZE) && size%21 == 0) 130 | { 131 | _rg_contGen = 1; 132 | returnVal=1; 133 | break; 134 | } 135 | } 136 | 137 | } 138 | 139 | _rg_gen[size] = '\0'; 140 | for (i=strlen(_rg_name)-1; i >= 0; i--) 141 | if (!isspace(_rg_name[i])) 142 | break; 143 | _rg_name[i+1] = '\0'; 144 | 145 | *refGenOff = _rg_offset; 146 | *refGenName = _rg_name; 147 | *refGen = _rg_gen; 148 | 149 | if (_rg_contGen == 1) 150 | { 151 | _rg_offset += size-CONTIG_OVERLAP; 152 | } 153 | else 154 | { 155 | _rg_offset = 0; 156 | } 157 | 158 | *refGenLen = size; 159 | return returnVal; 160 | } 161 | /**********************************************/ 162 | int getGenomeMetaInfo(char *fileName, char *genomeMetaInfo, int *genomeMetaInfoLength) 163 | { 164 | // genomeMetaInfo structure: 165 | // 4 bytes (numOfChrs): number of chromosomes in file 166 | // for each chromosome we have the following 167 | // 4 bytes (nameLen): length of the chromosome name 168 | // n bytes (name): chromosome name 169 | // 4 bytes (genSize): length of the chromosome in characters 170 | 171 | char ch, *tmp; 172 | int *nameLen, 
*genSize, *numOfChrs = (int *)genomeMetaInfo; 173 | *numOfChrs = 0; 174 | int i = sizeof(int); 175 | 176 | if ( fscanf(_rg_fp, "%c", &ch) > 0 ) 177 | { 178 | if (ch != '>') 179 | { 180 | fprintf(stdout, "Error: Wrong fasta format file\n"); 181 | return 0; 182 | } 183 | } 184 | else 185 | return 0; 186 | 187 | rewind(_rg_fp); 188 | 189 | fprintf(stdout, "Scanning the fasta file: "); 190 | while( fscanf(_rg_fp, "%c", &ch) > 0 ) 191 | { 192 | if (!isspace(ch)) 193 | { 194 | if (ch == '>') 195 | { 196 | (*numOfChrs)++; 197 | nameLen = (int *)(genomeMetaInfo + i); 198 | *nameLen = 0; 199 | i += sizeof(int); 200 | tmp = fgets(genomeMetaInfo + i, SEQ_MAX_LENGTH, _rg_fp); 201 | while(!isspace(*(genomeMetaInfo+i))) 202 | { 203 | i++; 204 | (*nameLen)++; 205 | } 206 | genSize = (int *)(genomeMetaInfo + i); 207 | i += sizeof(int); 208 | *genSize = 0; 209 | fprintf(stdout, "."); 210 | fflush(stdout); 211 | } 212 | else 213 | { 214 | (*genSize)++; 215 | } 216 | } 217 | } 218 | fprintf(stdout, "\n"); 219 | *genomeMetaInfoLength = i; 220 | 221 | rewind(_rg_fp); 222 | return 1; 223 | } 224 | -------------------------------------------------------------------------------- /baseFAST.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 
13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include "Common.h" 41 | #include "CommandLineParser.h" 42 | #include "Reads.h" 43 | #include "Output.h" 44 | #include "HashTable.h" 45 | #include "MrsFAST.h" 46 | #include "SNPReader.h" 47 | 48 | unsigned char seqFastq; 49 | void printStat(); 50 | 51 | int main(int argc, char *argv[]) 52 | { 53 | if (!parseCommandLine(argc, argv)) 54 | return 1; 55 | 56 | /**************************************************** 57 | * INDEXING 58 | ***************************************************/ 59 | if (indexingMode) 60 | { 61 | if (!generateHashTable(fileName[0], fileName[1])) 62 | return 1; 63 | } 64 | /**************************************************** 65 | * SEARCHING 66 | ***************************************************/ 67 | else 68 | { 69 | Read *seqList; 70 | unsigned int seqListSize; 71 | int totalNumOfReads = 0; 72 | double totalLoadingTime = 0; 73 | double totalMappingTime = 0; 74 | double startTime; 75 | double loadingTime; 76 | double mappingTime; 77 | double lstartTime; 78 | double tmpTime; 79 | double maxMem=0; 80 | int flag; 81 | 82 | // Loading Sequences & Sampling Locations 83 | startTime = getTime(); 84 | if (!checkHashTable(fileName[1])) 85 | return 1; 86 | 87 | if (!initRead(seqFile1, seqFile2)) 88 | return 1; 89 | 90 | totalLoadingTime += getTime()-startTime; 91 | 92 | // Preparing output 93 | initOutput(mappingOutput, outCompressed); 94 | 95 | if (!initLoadingHashTable(fileName[1])) 96 | return 1; 97 | 98 | if (SNPMode) 99 | initLoadingSNPs(fileName[2]); 100 | 101 | fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); 102 | fprintf(stdout, "| %15s | %15s | %15s | %15s | %15s %15s |\n","Genome Name","Loading Time", "Mapping Time", "Memory Usage(M)","Total 
Mappings","Mapped reads"); 103 | fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); 104 | 105 | mappingTime = 0; 106 | loadingTime = 0; 107 | flag = 1; 108 | 109 | 110 | tmpTime = getTime(); 111 | while (readChunk(&seqList, &seqListSize) || seqListSize > 0) 112 | { 113 | totalNumOfReads += seqListSize; 114 | rewindHashTable(); 115 | totalLoadingTime += (getTime() - tmpTime); // readAllReads + initLoadingHashTable 116 | 117 | initializeFAST(seqListSize); 118 | initFASTChunk(seqList, seqListSize); 119 | 120 | do 121 | { 122 | flag = loadHashTable ( &tmpTime ); // Reading a fragment 123 | loadingTime += tmpTime; 124 | 125 | lstartTime = getTime(); 126 | initFASTContig(); 127 | mapSeq(flag); 128 | mappingTime += getTime() - lstartTime; 129 | 130 | if (maxMem < getMemUsage()) 131 | maxMem = getMemUsage(); 132 | 133 | if (flag == 0 || flag == 2) 134 | { 135 | totalMappingTime += mappingTime; 136 | totalLoadingTime += loadingTime; 137 | 138 | 139 | fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", 140 | getRefGenomeName(),loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); 141 | fflush(stdout); 142 | 143 | loadingTime = 0; 144 | mappingTime = 0; 145 | maxMem = 0; 146 | } 147 | else if (progressRep) 148 | { 149 | fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", 150 | getRefGenomeName(),loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); 151 | fflush(stdout); 152 | } 153 | } while (flag); 154 | 155 | releaseChunk(); 156 | tmpTime = getTime(); 157 | } 158 | totalLoadingTime += (getTime() - tmpTime); // for the last readAllReads call 159 | 160 | finalizeFAST(); 161 | finalizeLoadingHashTable(); 162 | finalizeReads(); 163 | finalizeOutput(); 164 | finalizeCommandParser(); 165 | if (SNPMode) 166 | finalizeSNPs(); 167 | 168 | fprintf(stdout, 
"----------------------------------------------------------------------------------------------------------\n"); 169 | 170 | fprintf(stdout, "%19s%16.2f%18.2f\n\n", "Total:",totalLoadingTime, totalMappingTime); 171 | fprintf(stdout, "%-30s%10.2f\n","Total Time:", totalMappingTime+totalLoadingTime); 172 | fprintf(stdout, "%-30s%10d\n","Total No. of Reads:", totalNumOfReads); 173 | fprintf(stdout, "%-30s%10lld\n","Total No. of Mappings:", mappingCnt); 174 | //fprintf(stdout, "%-30s%10.0f\n","Avg No. of locations verified:", ceil((float)verificationCnt/totalNumOfReads)); 175 | if (memUsage > 0) 176 | fprintf(stdout, "Memory Leak: %lld Bytes\n", memUsage); 177 | } 178 | 179 | return 0; 180 | } 181 | 182 | -------------------------------------------------------------------------------- /SNPReader.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include "Common.h" 41 | #include "HashTable.h" 42 | #include "SNPReader.h" 43 | 44 | CompressedSeq *_snp_SNPMap = NULL; 45 | int _snp_SNPMapLength = 0; 46 | ChrSNPs *_snp_chrSNPs = NULL; 47 | int _snp_chrCnt = 0; 48 | char **_snp_chrNames = NULL; 49 | int _snp_currentChr = 0; 50 | int _snp_currentLoc = 0; 51 | 52 | /**********************************************/ 53 | int findChrIndex(char *chrName) 54 | { 55 | int i; 56 | char cname[CONTIG_NAME_SIZE]; // chr name in FASTA file 57 | 58 | for (i = 0; i < _snp_chrCnt; i++) 59 | { 60 | strcpy(cname, _snp_chrNames[i]); 61 | if (strlen(cname) > 3 && cname[0] == 'c' && cname[1] == 'h' && cname[2] == 'r') 62 | strcpy(cname, _snp_chrNames[i] + 3); // get rid of the potential "chr" at the beginning 63 | if (! strcmp(cname, "MT")) // change "MT" to "M" for consistency with dbSNP naming 64 | cname[1] = '\0'; 65 | 66 | if (! 
strcmp(chrName, cname)) 67 | return i; 68 | } 69 | return -1; 70 | } 71 | /**********************************************/ 72 | void initLoadingSNPs(char *fileName) 73 | { 74 | int i, loc, t, locCnt, chrIndex, nameLen; 75 | char cname[CONTIG_NAME_SIZE]; // chromosome name from dbSNP 76 | int ccnt; // number of chromosomes in dbSNP 77 | //int chrNameOffset = 0; // used to trim the "chr" at the beginning of chromosome names 78 | SNPLoc *dummy = getMem(MAX_SNP_PER_CHR * sizeof(SNPLoc)); 79 | 80 | _snp_chrCnt = getChrCnt(); 81 | _snp_chrNames = getChrNames(); 82 | _snp_chrSNPs = getMem(_snp_chrCnt * sizeof(ChrSNPs)); 83 | 84 | for (i = 0; i < _snp_chrCnt; i++) // FASTA chromosomes 85 | { 86 | //chrNameOffset = (strlen(_snp_chrNames[i]) > 3 && chrNames[i][0] == 'c' && chrNames[i][1] == 'h' && chrNames[i][2] == 'r') ?3 :0; 87 | _snp_chrSNPs[i].chrName = _snp_chrNames[i];// + chrNameOffset; 88 | 89 | _snp_chrSNPs[i].locCnt = 0; 90 | _snp_chrSNPs[i].snpLocs = NULL; //getMem(MAX_SNP_PER_CHR * sizeof(SNPLoc)); 91 | } 92 | 93 | _snp_SNPMapLength = (calculateCompressedLen(CONTIG_MAX_SIZE)+1) * sizeof(CompressedSeq); 94 | _snp_SNPMap = getMem(_snp_SNPMapLength); 95 | 96 | FILE *fp = fopen(fileName, "rt"); 97 | t = fread(&ccnt, sizeof(int), 1, fp); // ccnt = number of chromosomes in dbSNP 98 | 99 | // look for each dbSNP chromosome in the reference 100 | for (i = 0; i < ccnt; i++) 101 | { 102 | t = fread(&nameLen, sizeof(int), 1, fp); 103 | t = fread(cname, sizeof(char), nameLen, fp); 104 | t = fread(&locCnt, sizeof(int), 1, fp); 105 | 106 | cname[nameLen] = '\0'; 107 | chrIndex = findChrIndex(cname); 108 | 109 | if (chrIndex != -1) // found in FASTA chromosomes 110 | { 111 | _snp_chrSNPs[chrIndex].locCnt = locCnt; 112 | _snp_chrSNPs[chrIndex].snpLocs = getMem(locCnt * sizeof(SNPLoc)); 113 | t = fread(_snp_chrSNPs[chrIndex].snpLocs, sizeof(SNPLoc), locCnt, fp); 114 | } 115 | else // not found 116 | { 117 | t = fread(dummy, sizeof(SNPLoc), locCnt, fp); // read dummy 118 | 
fprintf(stdout, "Warning: chromosome %s is present in the SNP database but not found in the reference genome\n", cname); 119 | } 120 | } 121 | 122 | fclose(fp); 123 | freeMem(dummy, MAX_SNP_PER_CHR * sizeof(SNPLoc)); 124 | } 125 | /**********************************************/ 126 | void finalizeSNPs() 127 | { 128 | int i; 129 | for (i = 0; i < _snp_chrCnt; i++) 130 | freeMem(_snp_chrSNPs[i].snpLocs, _snp_chrSNPs[i].locCnt * sizeof(SNPLoc)); 131 | freeMem(_snp_chrSNPs, _snp_chrCnt * sizeof(ChrSNPs)); 132 | freeMem(_snp_SNPMap, _snp_SNPMapLength); 133 | } 134 | /**********************************************/ 135 | CompressedSeq *loadSNPMap(char *chrName, int contigStartIndex, int contigLength, char *alt) 136 | { 137 | //memset(_snp_SNPMap, -1, calculateCompressedLen(contigLength) * sizeof(CompressedSeq)); 138 | memset(_snp_SNPMap, -1, _snp_SNPMapLength); 139 | int contigEnd = contigStartIndex + contigLength; 140 | int loc, offset; 141 | CompressedSeq *snp, mask; 142 | 143 | if ( strcmp(chrName, _snp_chrSNPs[_snp_currentChr].chrName) ) // new chr 144 | { 145 | _snp_currentChr++; 146 | _snp_currentLoc = 0; 147 | } 148 | 149 | if (_snp_chrSNPs[_snp_currentChr].locCnt) 150 | { 151 | int i = _snp_currentChr; // just to make the code more readable 152 | int pos = _snp_currentLoc; 153 | 154 | while ( pos < _snp_chrSNPs[i].locCnt && _snp_chrSNPs[i].snpLocs[pos].loc < contigStartIndex ) // this should never happen! 
155 | pos ++; 156 | 157 | while ( pos < _snp_chrSNPs[i].locCnt && _snp_chrSNPs[i].snpLocs[pos].loc < contigEnd ) 158 | { 159 | loc = _snp_chrSNPs[i].snpLocs[pos].loc - contigStartIndex - 1; 160 | alt[loc] = _snp_chrSNPs[i].snpLocs[pos].alt; 161 | offset = loc % 21; 162 | mask = 0x7000000000000000; 163 | mask = ~(mask >> offset*3); 164 | 165 | snp = _snp_SNPMap + (loc/21); 166 | *snp &= mask; 167 | 168 | pos ++; 169 | } 170 | 171 | _snp_currentLoc = pos; 172 | } 173 | return _snp_SNPMap; 174 | } 175 | /**********************************************/ 176 | void rewindSNPIndex() 177 | { 178 | _snp_currentChr = 0; 179 | _snp_currentLoc = 0; 180 | } 181 | -------------------------------------------------------------------------------- /Common.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include "Common.h" 44 | 45 | 46 | unsigned short SEQ_LENGTH = 0; 47 | unsigned short QUAL_LENGTH = 0; 48 | unsigned short CMP_SEQ_LENGTH = 0; 49 | long long memUsage = 0; 50 | char *alphabet = "ACGTN"; 51 | char nVal[128]; 52 | char nRev[128]; 53 | char nHVal[128]; 54 | pthread_mutex_t _common_lock; 55 | 56 | 57 | /**********************************************/ 58 | FILE *fileOpen(char *fileName, char *mode) 59 | { 60 | FILE *fp; 61 | fp = fopen (fileName, mode); 62 | if (fp == NULL) 63 | { 64 | fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); 65 | fflush(stdout); 66 | exit(EXIT_FAILURE); 67 | } 68 | return fp; 69 | } 70 | /**********************************************/ 71 | gzFile fileOpenGZ(char *fileName, char *mode) 72 | { 73 | gzFile gzfp; 74 | gzfp = gzopen (fileName, mode); 75 | if (gzfp == Z_NULL) 76 | { 77 | fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); 78 | fflush(stdout); 79 | exit(EXIT_FAILURE); 80 | } 81 | return gzfp; 82 | } 83 | /**********************************************/ 84 | double getTime(void) 85 | { 86 | struct timeval t; 87 | gettimeofday(&t, NULL); 88 | return t.tv_sec+t.tv_usec/1000000.0; 89 | } 90 | 91 | 
/**********************************************/ 92 | inline char reverseCompleteChar(char c) 93 | { 94 | return nRev[c]; 95 | } 96 | /**********************************************/ 97 | inline void reverseComplete (char *seq, char *rcSeq , int length) // TODO: efficiency check 98 | { 99 | int i; 100 | seq+=length-1; 101 | for (i=0; i=0; i--) 147 | { 148 | if (full[i]=='/') 149 | { 150 | pos = i; 151 | break; 152 | } 153 | 154 | } 155 | 156 | if (pos != -1) 157 | { 158 | sprintf(*fileName, "%s%c", (full+pos+1), '\0'); 159 | full[pos+1]='\0'; 160 | sprintf(*path,"%s%c", full, '\0'); 161 | } 162 | else 163 | { 164 | sprintf(*fileName, "%s%c", full, '\0'); 165 | sprintf(*path,"%c", '\0'); 166 | } 167 | } 168 | /**********************************************/ 169 | inline int calculateCompressedLen(int normalLen) 170 | { 171 | return (normalLen / 21) + ((normalLen%21)?1:0); 172 | } 173 | /**********************************************/ 174 | void compressSequence(char *seq, int seqLen, CompressedSeq *cseq) 175 | { 176 | CompressedSeq val = 0; 177 | int i = 0, pos = 0; 178 | 179 | *cseq = 0; 180 | while (pos < seqLen) 181 | { 182 | *cseq = ((*cseq) << 3) | nVal[seq[pos++]]; 183 | 184 | if (++i == 21) 185 | { 186 | i = 0; 187 | cseq++; 188 | if (pos < seqLen) // not to write the adjacent memory in case seqLen % 21 == 0 189 | *cseq = 0; 190 | } 191 | } 192 | if (i > 0) 193 | { 194 | *cseq <<= (3*(21-i)); 195 | } 196 | } 197 | 198 | /**********************************************/ 199 | int hashVal(char *seq) 200 | { 201 | int i=0; 202 | int val=0, numericVal=0; 203 | 204 | while(i, Simon Fraser University All rights reserved. 238 | 239 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 240 | 241 | .IP 1 242 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
243 | .IP 2 244 | Redistributions in binary form must reproduce the above copyright notice, thislist of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 245 | .IP 3 246 | Neither the name of the Simon Fraser University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 247 | 248 | .P 249 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 250 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 251 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 252 | 253 | -------------------------------------------------------------------------------- /Sort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 
10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/**********************************************/
/*
 * Integer base-2 logarithm: floor( log2( x ) ) for x >= 1.
 * Every x <= 1 (zero and negative values included) yields 0.
 */
static inline int lg( int x ) {
	int bits;
	for ( bits = 0; x > 1; x >>= 1 )
		bits++;
	return bits;
}
} 97 | } 98 | /**********************************************/ 99 | void insertionSortGI( GeneralIndex * A, const int left, const int right ) { 100 | // left, right - starting and ending index of the interval to be sorted 101 | int i, j; 102 | for ( i = left + 1; i <= right; i++ ) { 103 | j = i; 104 | GeneralIndex temp = A[ i ]; 105 | while ( j > left && A[ j - 1 ].checksum > temp.checksum ) { 106 | A[ j ] = A[ j - 1 ]; 107 | j--; 108 | } 109 | A[ j ] = temp; 110 | } 111 | } 112 | /**********************************************/ 113 | void quickSortGI( GeneralIndex * A, const int left, const int right, int depth ) { 114 | // left, right - starting and ending index of the interval to be sorted 115 | // depth - max recursion depth allowed 116 | if ( right - left <= 16 ) return; // leave it for insertionSort 117 | 118 | if ( depth == 0 ) heapSortGI( A + left, right - left + 1 ); // too many recursive calls, switch to heapsort to ensure O(nlogn) time 119 | else { 120 | int mid = ( left + right ) / 2; 121 | int small = left - 1; 122 | int i; 123 | GeneralIndex temp; 124 | // median of A[ left ], A[ mid ], A[ right ], to be stored in A[ right ] 125 | if ( A[ mid ].checksum < A[ right ].checksum ) { 126 | temp = A[ right ]; 127 | A[ right ] = A[ mid ]; 128 | A[ mid ] = temp; 129 | } 130 | if ( A[ left ].checksum < A[ right ].checksum ) { 131 | temp = A[ right ]; 132 | A[ right ] = A[ left ]; 133 | A[ left ] = temp; 134 | } 135 | if ( A[ left ].checksum < A[ mid ].checksum ) { 136 | temp = A[ right ]; 137 | A[ right ] = A[ left ]; 138 | A[ left ] = temp; 139 | } 140 | else { 141 | temp = A[ right ]; 142 | A[ right ] = A[ mid ]; 143 | A[ mid ] = temp; 144 | } 145 | // partitioning the array with respect to A[ right ] 146 | for ( i = left; i < right; i++ ) { 147 | if ( A[ i ].checksum < A[ right ].checksum ) { 148 | temp = A[ ++small ]; 149 | A[ small ] = A[ i ]; 150 | A[ i ] = temp; 151 | } 152 | } 153 | temp = A[ small + 1 ]; 154 | A[ small + 1 ] = A[ right ]; 155 | A[ 
right ] = temp; 156 | // recursion 157 | quickSortGI( A, left, small, depth - 1 ); 158 | quickSortGI( A, small + 2, right, depth - 1 ); 159 | } 160 | } 161 | /**********************************************/ 162 | void introSortGI( GeneralIndex * A, const int left, const int right ) { 163 | // main sort, limiting recursion depth to 2 times the logarithm of its length 164 | quickSortGI( A, left, right, 2 * lg( right - left + 1 ) ); 165 | // finish-up small unsorted pieces 166 | insertionSortGI( A, left, right ); 167 | } 168 | /**********************************************/ 169 | void heapSortPair( Pair * A, int N ) { 170 | // N - size of the array to be sorted 171 | if ( N <= 1 ) return; 172 | int i, j, maxi; 173 | // We make the array a max heap using "bubble-down" operation 174 | for ( i = N/2 - 1; i >= 0; i-- ) { 175 | j = i; 176 | // bubble-down 177 | while ( 2 * j + 2 < N ) { 178 | if ( A[ 2 * j + 1 ].hv > A[ 2 * j + 2 ].hv || (A[ 2 * j + 1 ].hv == A[ 2 * j + 2 ].hv && A[ 2 * j + 1 ].checksum > A[ 2 * j + 2 ].checksum) ) maxi = 2 * j + 1; 179 | else maxi = 2 * j + 2; 180 | if ( A[ maxi ].hv > A[ j ].hv || (A[ maxi ].hv == A[ j ].hv && A[ maxi ].checksum > A[ j ].checksum) ) { 181 | Pair temp = A[ j ]; 182 | A[ j ] = A[ maxi ]; 183 | A[ maxi ] = temp; 184 | j = maxi; 185 | } 186 | else break; 187 | } 188 | if ( 2 * j + 1 < N && ( A[ 2 * j + 1 ].hv > A[ j ].hv || (A[ 2 * j + 1 ].hv == A[ j ].hv && A[ 2 * j + 1 ].checksum > A[ j ].checksum) ) ) { 189 | Pair temp = A[ j ]; 190 | A[ j ] = A[ 2 * j + 1 ]; 191 | A[ 2 * j + 1 ] = temp; 192 | } 193 | } 194 | // Extracting maximum and moving it to the end of the array as long as size is greater than 1 195 | while ( --N ) { 196 | Pair temp = A[ 0 ]; 197 | A[ 0 ] = A[ N ]; 198 | A[ N ] = temp; 199 | j = 0; 200 | // bubble-down 201 | while ( 2 * j + 2 < N ) { 202 | if ( A[ 2 * j + 1 ].hv > A[ 2 * j + 2 ].hv || (A[ 2 * j + 1 ].hv == A[ 2 * j + 2 ].hv && A[ 2 * j + 1 ].checksum > A[ 2 * j + 2 ].checksum) ) maxi = 2 * j + 1; 
203 | else maxi = 2 * j + 2; 204 | if ( A[ maxi ].hv > A[ j ].hv || (A[ maxi ].hv == A[ j ].hv && A[ maxi ].checksum > A[ j ].checksum) ) { 205 | Pair temp = A[ j ]; 206 | A[ j ] = A[ maxi ]; 207 | A[ maxi ] = temp; 208 | j = maxi; 209 | } 210 | else break; 211 | } 212 | if ( 2 * j + 1 < N && ( A[ 2 * j + 1 ].hv > A[ j ].hv || (A[ 2 * j + 1 ].hv == A[ j ].hv && A[ 2 * j + 1 ].checksum > A[ j ].checksum) ) ) { 213 | Pair temp = A[ j ]; 214 | A[ j ] = A[ 2 * j + 1 ]; 215 | A[ 2 * j + 1 ] = temp; 216 | } 217 | } 218 | } 219 | /**********************************************/ 220 | void insertionSortPair( Pair * A, const int left, const int right ) { 221 | // left, right - starting and ending index of the interval to be sorted 222 | int i, j; 223 | for ( i = left + 1; i <= right; i++ ) { 224 | j = i; 225 | Pair temp = A[ i ]; 226 | while ( j > left && ( A[ j - 1 ].hv > temp.hv || (A[ j - 1 ].hv == temp.hv && A[ j - 1 ].checksum > temp.checksum) ) ) { 227 | A[ j ] = A[ j - 1 ]; 228 | j--; 229 | } 230 | A[ j ] = temp; 231 | } 232 | } 233 | /**********************************************/ 234 | void quickSortPair( Pair * A, const int left, const int right, int depth ) { 235 | // left, right - starting and ending index of the interval to be sorted 236 | // depth - max recursion depth allowed 237 | if ( right - left <= 16 ) return; // leave it for insertionSort 238 | 239 | if ( depth == 0 ) heapSortPair( A + left, right - left + 1 ); // too many recursive calls, switch to heapsort to ensure O(nlogn) time 240 | else { 241 | int mid = ( left + right ) / 2; 242 | int small = left - 1; 243 | int i; 244 | Pair temp; 245 | // median of A[ left ], A[ mid ], A[ right ], to be stored in A[ right ] 246 | if ( A[ mid ].hv < A[ right ].hv || (A[ mid ].hv == A[ right ].hv && A[ mid ].checksum < A[ right ].checksum) ) { 247 | temp = A[ right ]; 248 | A[ right ] = A[ mid ]; 249 | A[ mid ] = temp; 250 | } 251 | if ( A[ left ].hv < A[ right ].hv || (A[ left ].hv == A[ right ].hv && A[ left 
].checksum < A[ right ].checksum) ) { 252 | temp = A[ right ]; 253 | A[ right ] = A[ left ]; 254 | A[ left ] = temp; 255 | } 256 | if ( A[ left ].hv < A[ mid ].hv || (A[ left ].hv == A[ mid ].hv && A[ left ].checksum < A[ mid ].checksum) ) { 257 | temp = A[ right ]; 258 | A[ right ] = A[ left ]; 259 | A[ left ] = temp; 260 | } 261 | else { 262 | temp = A[ right ]; 263 | A[ right ] = A[ mid ]; 264 | A[ mid ] = temp; 265 | } 266 | // partitioning the array with respect to A[ right ] 267 | for ( i = left; i < right; i++ ) { 268 | if ( A[ i ].hv < A[ right ].hv || (A[ i ].hv == A[ right ].hv && A[ i ].checksum < A[ right ].checksum) ) { 269 | temp = A[ ++small ]; 270 | A[ small ] = A[ i ]; 271 | A[ i ] = temp; 272 | } 273 | } 274 | temp = A[ small + 1 ]; 275 | A[ small + 1 ] = A[ right ]; 276 | A[ right ] = temp; 277 | // recursion 278 | quickSortPair( A, left, small, depth - 1 ); 279 | quickSortPair( A, small + 2, right, depth - 1 ); 280 | } 281 | } 282 | /**********************************************/ 283 | void introSortPair( Pair * A, const int left, const int right ) { 284 | // main sort, limiting recursion depth to 2 times the logarithm of its length 285 | quickSortPair( A, left, right, 2 * lg( right - left + 1 ) ); 286 | // finish-up small unsorted pieces 287 | insertionSortPair( A, left, right ); 288 | } 289 | 290 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 0. Installation 2 | 3 | ### From source 4 | To install mrsFAST-Ultra, first you should fetch it from our git repository, or download one of the corresponding compressed zip/tar.gz packages. After downloading, change the current directory to the source directory `mrsfast`, and run `make` in the terminal. The `mrsfast` and `snp_indexer` binaries will be created, which are ready to use. 
5 | 6 | ```bash 7 | git clone https://github.com/sfu-compbio/mrsfast 8 | cd mrsfast 9 | make 10 | ``` 11 | 12 | If you are interested in a particular version, after downloading the git repo, check out that version and run `make`. 13 | 14 | ```bash 15 | git clone https://github.com/sfu-compbio/mrsfast 16 | cd mrsfast 17 | git checkout v3.3.0 18 | make 19 | ``` 20 | ### From release 21 | Alternatively, you can go to the [releases page](https://github.com/sfu-compbio/mrsfast/releases), click on the desired version, download the zip or tar file, switch to the extracted directory and run `make`. 22 | 23 | To grab sample data and test `mrsfast`, please download it from our git repository or the compressed package. 24 | 25 | ```bash 26 | git clone https://github.com/sfu-compbio/mrsfast mrsfast/sample-data -b sample-data 27 | ``` 28 | Note that the current version of mrsFAST-Ultra requires a 64-bit architecture and does not support 32-bit machines. 29 | 30 | ### From `bioconda` 31 | Install [conda](https://conda.io) and run: 32 | 33 | ```bash 34 | conda install -c bioconda mrsfast 35 | ``` 36 | 37 | ---- 38 | 39 | ## 1. Indexing Reference Genome 40 | 41 | In order to map read sequences to a reference genome, mrsFAST-Ultra first needs to create an index from the genome fasta file. This command will create the file `genome.fa.index`. 42 | 43 | ```bash 44 | $ ./mrsfast --index genome.fa 45 | ``` 46 | 47 | 48 | By default, the indexing window size (length of hash values stored in the index hash table) is 12. This value can also be set with the `--ws` option. A maximum value of 14 would be logical for window size. Larger values could lead to excessive memory usage in the mapping stage. 49 | 50 | ```bash 51 | $ ./mrsfast --index genome.fa --ws 14 52 | ``` 53 | 54 | ---- 55 | 56 | ## 2. Mapping Options 57 | 58 | To perform read mapping, mrsFAST-Ultra should be executed with the `--search` option. By default, mrsFAST-Ultra is an all-mapper tool. 
This means that it finds and reports all the mappings for each input read. If no option is provided, mrsFAST-Ultra performs single-end mapping. This is an example of running mrsFAST-Ultra for mapping read sequences in the sample `reads.fq` file. 59 | 60 | ```bash 61 | $ ./mrsfast --search genome.fa --seq reads.fq 62 | ``` 63 | 64 | By default, mrsFAST-Ultra sets the maximum error threshold to 6% so that 94% similarity between reads and genome is guaranteed. The error threshold could also be set using the `-e` option. Setting this value to zero means that only exact matches are desired. 65 | 66 | ``` 67 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 68 | ``` 69 | 70 | To trim input reads to a shorter length, the `--crop` option can be used. For example, if the input reads are of length 100 and `--crop 40` is used, only the first 40 base pairs of each read would be read and used for mapping. 71 | 72 | ``` 73 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --crop 40 74 | ``` 75 | 76 | Alternatively, you may crop from the tail of the reads by using `--tail-crop`. 77 | 78 | ``` 79 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --tail-crop 40 80 | ``` 81 | 82 | mrsFAST-Ultra is able to perform mapping on multiple CPU cores in a parallel manner. The number of threads is determined using the `--threads` option. The default value is 1 which runs mrsFAST-Ultra on a single thread. If `--threads` is set to 0, mrsFAST-Ultra will use all the available cores in the system. 83 | 84 | ``` 85 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --threads 4 86 | ``` 87 | 88 | When huge volumes of input reads are required to be mapped on a machine with limited memory resources, mrsFAST-Ultra is capable of adjusting itself to the specified memory limits. The total memory (in GB) available for running mrsFAST-Ultra should be specified with the `--mem` option. 
In this mode, mrsFAST-Ultra might perform mapping in several iterations and each time it only loads as many reads as allowed by the memory limit. 89 | 90 | ``` 91 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --mem 8 92 | ``` 93 | 94 | In the limited mapping mode, mrsFAST-Ultra reports only up to a specified number of mappings for each read. The `-n` option sets the maximum number of mappings per read. Reads with more mappings than this value will not be printed in the output. This option is valid in both single-end and paired-end modes, but cannot be used together with best mapping (`--best`) or paired-end discordant mode (`--discordant-vh`). 95 | 96 | ``` 97 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 -n 100 98 | ``` 99 | 100 | In best mapping mode, for each read mrsFAST-Ultra reports a single best location which has the smallest Hamming distance among all of its possible mappings. In case of a tie, one of the mapping locations with the smallest Hamming distance is reported at random. This option cannot be used in paired-end discordant mode (`--discordant-vh`). 101 | 102 | ``` 103 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --best 104 | ``` 105 | 106 | Except for the cases pointed out above, any combination of the above options can be used together in any of the single-end and paired-end modes. 107 | 108 | ``` 109 | $ ./mrsfast --search genome.fa --seq reads.fq --crop 60 -e 2 --mem 6 --threads 4 -n 100 -o mappings.sam --disable-nohits 110 | ``` 111 | 112 | ---- 113 | 114 | ## 3. Paired-end Mapping 115 | 116 | As mentioned above, by default mrsFAST-Ultra will run in single-end mode. Paired-end mapping is invoked using the `--pe` option. If the reads are in two different files, `--seq1` and `--seq2` should be used to indicate the input files. If the reads are interleaved in a single file, `--seq` is used to indicate the file. 
117 | 118 | ``` 119 | $ ./mrsfast --search genome.fa --seq interleaved-reads.fq --pe -e 4 120 | $ ./mrsfast --search genome.fa --seq1 mates1.fq --seq2 mates2.fq --pe -e 4 121 | ``` 122 | 123 | If the distance range between concordant pairs is not specified, as in the commands above, mrsFAST-Ultra automatically determines this range according to the mean and standard deviation of distances observed among the mates. The distance allowed between the paired-end reads can be specified with `--min` and `--max`. These values specify the minimum and maximum of the template length (the distance between outer edges of the mapping mates). 124 | 125 | ``` 126 | $ ./mrsfast --search genome.fa --seq1 mates1.fq --seq2 mates2.fq --pe -e 4 --min 100 --max 500 127 | ``` 128 | 129 | Again, any combination of the introduced mapping options could be used in the paired-end mode. 130 | 131 | ``` 132 | $ ./mrsfast --search genome.fa --seq1 mates1.fq --seq2 mates2.fq --pe -e 4 --min 100 --max 500 --threads 4 --best -o mappings.sam 133 | ``` 134 | mrsFAST-Ultra can report discordant paired-end mappings for structural variation detection using [Variation Hunter](http://variationhunter.sf.net). In this mode the `--min` and `--max` options will define the minimum and maximum inferred size for concordant mappings. 135 | 136 | ``` 137 | $ ./mrsfast --search genome.fa --seq1 mates1.fq --seq2 mates2.fq --discordant-vh -e 4 --min 100 --max 500 138 | ``` 139 | 140 | In paired-end discordant mode, an upper bound can be defined for the maximum number of discordant mappings per read. This value is set by the `--max-discordant-cutoff` option. This option is only applicable in paired-end discordant mode. 141 | 142 | ``` 143 | $ ./mrsfast --search genome.fa --seq1 mates1.fq --seq2 mates2.fq --discordant-vh -e 4 \ 144 | --min 100 --max 500 --max-discordant-cutoff 300 145 | ``` 146 | 147 | ---- 148 | 149 | ## 4. SNP-aware mode 150 | mrsFAST-Ultra is able to do sequence mapping in SNP-aware mode. 
In this mode mrsFAST-Ultra tolerates the mismatches in known SNP locations provided by dbSNP database (see sample file `dbSNP.vcf`). To run in this mode, first, the snp_indexer binary should be used to create an index from the input dbSNP (vcf) file. The following command reads the sample `dbSNP.vcf` file and creates `snp.index` which is only readable by mrsFAST-Ultra. The current vcf format that is accepted by mrsFAST-Ultra is vcf version 4. 151 | 152 | ``` 153 | $ ./snp_indexer dbSNP.vcf snp.index 154 | ``` 155 | 156 | In the next step, using `--snp` option mrsFAST-Ultra accepts the snp.index file as an input, and therefore ignores all mismatches occurring in the known SNP locations. The following command line executes mrsFAST-Ultra in SNP-aware mode using the index file created in last step. 157 | 158 | ``` 159 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --snp snp.index 160 | ``` 161 | 162 | To be able to distinguish mismatches occurring as a result of sequencing errors, and those caused by SNPs, mrsFAST-Ultra considers a quality threshold such that a mismatch at a reported SNP location will be ignored only if the corresponding read location has a quality higher than or equal to that quality threshold; otherwise the mismatch would affect the mapping as it is more probably caused by sequencing errors. The value of this threshold is set by the `--snp-qual` option. The default value is 53. 163 | 164 | ``` 165 | $ ./mrsfast --search genome.fa --seq reads.fq -e 4 --snp snp.index --snp-qual 60 166 | ``` 167 | 168 | The SNP-aware mode could be run together with any other combination of options both in single-end and paired-end modes (except `--discordant-vh`). 169 | 170 | ---- 171 | 172 | ## 5. Input and Output Options 173 | 174 | By default, mrsfast-Ultra outputs the mapping results in `output` which is written in standard SAM format. Also in single-end mode, the set of unmapped reads are printed in `output.nohit` file. 
The name of sam and nohit files can be set by the `-o` option. 175 | 176 | ``` 177 | $ ./mrsfast --search genome.fa --seq reads.fq -o mappings.sam 178 | ``` 179 | 180 | The name of the nohit file can be determined by the `-u` option. 181 | 182 | ``` 183 | $ ./mrsfast --search genome.fa --seq reads.fq -o mappings.sam -u unmapped 184 | ``` 185 | 186 | If the nohit file is not desired as output, it could be omitted by adding `--disable-nohits`. 187 | 188 | ``` 189 | $ ./mrsfast --search genome.fa --seq reads.fq -o mappings.sam --disable-nohits 190 | ``` 191 | 192 | The `--outcomp` option can be used to compress the mrsFAST-Ultra output file in gzip format. 193 | 194 | ``` 195 | $ ./mrsfast --search genome.fa --seq reads.fq -o mappings.sam --outcomp 196 | ``` 197 | 198 | mrsFAST-Ultra can process the input (FASTA/FASTQ) read files in compressed .gz format, using the `--seqcomp` option. 199 | 200 | ``` 201 | $ ./mrsfast --search genome.fa --seqcomp --seq reads.fq.gz -o mappings.sam 202 | ``` 203 | 204 | By default, mrsFAST-Ultra includes a SAM header in the output file. To make sure the SAM header does not appear in the output, `--disable-sam-header` can be used. 205 | 206 | ``` 207 | $ ./mrsfast --search genome.fa --seq reads.fq -o mappings.sam --disable-sam-header 208 | ``` 209 | 210 | ---- 211 | ## 6. mrsFAST-Ultra man page 212 | 213 | To view the full list of mrsFAST-Ultra options and their descriptions, please run the following. 214 | 215 | ``` 216 | $ ./mrsfast -h 217 | ``` 218 | 219 | ## 7. Citation 220 | If you use mrsfast, please cite its [paper](https://www.nature.com/articles/nmeth0810-576): 221 | 222 | > Hach, F. et al. mrsFAST: a cache-oblivious algorithm for short-read mapping. 
Nature Methods 7, 576–577 (2010) 223 | -------------------------------------------------------------------------------- /CommandLineParser.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include "Common.h" 43 | #include "CommandLineParser.h" 44 | 45 | int uniqueMode=1; 46 | int indexingMode; 47 | int searchingMode; 48 | int pairedEndMode; 49 | int pairedEndDiscordantMode; 50 | int pairedEndProfilingMode = 0; 51 | int bestMappingMode = 0; 52 | int SNPMode = 0; 53 | int seqCompressed; 54 | int outCompressed; 55 | int cropSize = 0; 56 | int tailCropSize = 0; 57 | int progressRep = 0; 58 | int nohitDisabled = 0; 59 | int noSamHeader = 0; 60 | int minPairEndedDistance=-1; 61 | int maxPairEndedDistance=-1; 62 | int minPairEndedDiscordantDistance=-1; 63 | int maxPairEndedDiscordantDistance=-1; 64 | int errThreshold = -1; 65 | char *seqFile1; 66 | char *seqFile2; 67 | char fileName[3][FILE_NAME_LENGTH]; 68 | char *unmappedOutput; 69 | char *mappingOutputPath; 70 | char *concordantStatOutput; 71 | short maxHits = 0; 72 | unsigned char WINDOW_SIZE = 12; 73 | unsigned int CONTIG_SIZE; 74 | unsigned int CONTIG_MAX_SIZE; 75 | unsigned int THREAD_COUNT = 1; 76 | unsigned short DISCORDANT_CUT_OFF = 300; 77 | double MAX_MEMORY = 4;// GB 78 | int THREAD_ID[255]; 79 | int SNP_QUAL_THRESHOLD = 53; 80 | 81 | 82 | 83 | #if (defined(__MACH__) && defined(__APPLE__)) 84 | #include 85 | #else 86 | extern char _binary_HELP_start; 87 | extern char _binary_HELP_end; 88 | #endif 89 | 90 | 91 | void printHelp() 92 | { 93 | #if (defined(__MACH__) && defined(__APPLE__)) 94 | size_t i, sz = getsectbyname("binary", "HELP")->size; 95 | const uint8_t *c = (const uint8_t*) getsectbyname("binary", "HELP")->addr; 96 | for (i = 0; i < sz; i++) 97 | putchar(c[i]); 98 | #else 99 | char *c; 100 | for (c = &_binary_HELP_start; c != &_binary_HELP_end; c++) 101 | putchar(*c); 102 | #endif 103 | exit(EXIT_SUCCESS); 104 | } 105 | 106 | int parseCommandLine (int 
argc, char *argv[]) 107 | { 108 | int index, len, o; 109 | char *fastaFile = NULL; 110 | char *fastaOutputFile = NULL; 111 | char *indexFile = NULL; 112 | char *SNPFile = NULL; 113 | 114 | mappingOutput = getMem(FILE_NAME_LENGTH); 115 | mappingOutputPath = getMem(FILE_NAME_LENGTH); 116 | unmappedOutput = getMem(FILE_NAME_LENGTH); 117 | concordantStatOutput = getMem(FILE_NAME_LENGTH); 118 | strcpy(mappingOutput, "output"); 119 | strcpy(unmappedOutput, "output.nohit"); 120 | strcpy(concordantStatOutput, "concordant.statistic"); 121 | mappingOutputPath[0] = '\0'; 122 | 123 | static struct option longOptions[] = 124 | { 125 | {"pe", no_argument, &pairedEndMode, 1}, 126 | {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1}, 127 | {"seqcomp", no_argument, &seqCompressed, 1}, 128 | {"outcomp", no_argument, &outCompressed, 1}, 129 | {"progress", no_argument, &progressRep, 1}, 130 | {"best", no_argument, &bestMappingMode, 1}, 131 | {"disable-nohits", no_argument, &nohitDisabled, 1}, 132 | {"disable-sam-header", no_argument, &noSamHeader, 1}, 133 | {"index", required_argument, 0, 'i'}, 134 | {"search", required_argument, 0, 's'}, 135 | {"help", no_argument, 0, 'h'}, 136 | {"version", no_argument, 0, 'v'}, 137 | {"seq", required_argument, 0, 'x'}, 138 | {"seq1", required_argument, 0, 'x'}, 139 | {"seq2", required_argument, 0, 'y'}, 140 | {"ws", required_argument, 0, 'w'}, 141 | {"min", required_argument, 0, 'l'}, 142 | {"max", required_argument, 0, 'm'}, 143 | {"crop", required_argument, 0, 'c'}, 144 | {"tail-crop", required_argument, 0, 'f'}, 145 | {"threads", required_argument, 0, 't'}, 146 | {"mem", required_argument, 0, 'z'}, 147 | {"snp", required_argument, 0, 'p'}, 148 | {"max-discordant-cutoff", required_argument, 0, 'd'}, 149 | {"snp-qual", required_argument, 0, 'q'}, 150 | {0,0,0,0} 151 | }; 152 | 153 | 154 | 155 | while ( (o = getopt_long ( argc, argv, "f:i:u:o:s:e:n:bhv", longOptions, &index))!= -1 ) 156 | { 157 | switch (o) 158 | { 159 | case 'i': 160 | 
indexingMode = 1; 161 | fastaFile = optarg; 162 | break; 163 | case 's': 164 | searchingMode = 1; 165 | fastaFile = optarg; 166 | break; 167 | case 'c': 168 | cropSize = atoi(optarg); 169 | break; 170 | case 'f': 171 | tailCropSize = atoi(optarg); 172 | break; 173 | case 'w': 174 | if (searchingMode == 1) 175 | { 176 | fprintf(stderr, "Error: Window size can only be set in indexing mode.\n"); 177 | return 0; 178 | } 179 | WINDOW_SIZE = atoi(optarg); 180 | break; 181 | case 'x': 182 | seqFile1 = optarg; 183 | break; 184 | case 'y': 185 | seqFile2 = optarg; 186 | break; 187 | case 'u': 188 | strcpy(unmappedOutput, optarg); 189 | break; 190 | case 'o': 191 | stripPath (optarg, &mappingOutputPath, &mappingOutput); 192 | sprintf(unmappedOutput, "%s%s.nohit", mappingOutputPath, mappingOutput ); 193 | break; 194 | case 'n': 195 | maxHits = atoi(optarg); 196 | break; 197 | case 'e': 198 | errThreshold = atoi(optarg); 199 | break; 200 | case 'q': 201 | SNP_QUAL_THRESHOLD = atoi(optarg); 202 | break; 203 | case 'l': 204 | minPairEndedDistance = atoi(optarg); 205 | break; 206 | case 'm': 207 | maxPairEndedDistance = atoi(optarg); 208 | break; 209 | case 'h': 210 | printHelp(); 211 | return 0; 212 | break; 213 | case 'v': 214 | fprintf(stdout, "Version: %s\nBuild Date: %s\n", MRSFAST_VERSION, BUILD_DATE); 215 | return 0; 216 | break; 217 | case 't': 218 | THREAD_COUNT = atoi(optarg); 219 | if (THREAD_COUNT == 0 || THREAD_COUNT > sysconf( _SC_NPROCESSORS_ONLN )) 220 | THREAD_COUNT = sysconf( _SC_NPROCESSORS_ONLN ); 221 | break; 222 | case 'z': 223 | MAX_MEMORY = atoi(optarg); 224 | break; 225 | case 'd': 226 | DISCORDANT_CUT_OFF = atoi(optarg); 227 | break; 228 | case 'p': 229 | SNPMode = 1; 230 | SNPFile = optarg; 231 | break; 232 | } 233 | 234 | } 235 | 236 | #ifndef MRSFAST_SSE4 237 | if (searchingMode) 238 | fprintf(stdout, "==> This version is compiled without any SSE4 optimization <==\n"); 239 | #endif 240 | if (bestMappingMode) 241 | { 242 | nohitDisabled = 1; 243 | } 
244 | 245 | if (indexingMode + searchingMode != 1) 246 | { 247 | fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n"); 248 | return 0; 249 | } 250 | 251 | if (WINDOW_SIZE > 14 || WINDOW_SIZE < 8) 252 | { 253 | fprintf(stdout, "ERROR: Window size should be in [8..14]\n"); 254 | return 0; 255 | } 256 | 257 | if (MAX_MEMORY < 2) 258 | fprintf(stdout, "ERROR: At least 2 GB of memory is required for running mrsFAST\n"); 259 | 260 | 261 | if ( indexingMode ) 262 | { 263 | CONTIG_SIZE = 80000000; 264 | CONTIG_MAX_SIZE = 120000000; 265 | 266 | if (fastaFile == NULL) 267 | { 268 | fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n"); 269 | return 0; 270 | } 271 | } 272 | 273 | if (maxHits) 274 | { 275 | if (maxHits < 0) 276 | { 277 | fprintf(stdout, "ERROR: Number of maximum hits must be greater than 0\n"); 278 | return 0; 279 | } 280 | 281 | if (bestMappingMode) 282 | { 283 | fprintf(stdout, "ERROR: Maximum number of mappings could not be set in best mapping mode. 
Maximum mappings input ignored\n"); 284 | maxHits = 0; 285 | } 286 | } 287 | 288 | if ( searchingMode ) 289 | { 290 | CONTIG_SIZE = 300000000; 291 | CONTIG_MAX_SIZE = 300000000; 292 | 293 | if ( cropSize && tailCropSize) 294 | { 295 | fprintf(stdout, "ERROR: Sequences can be cropped from only one side\n"); 296 | return 0; 297 | } 298 | 299 | if (pairedEndDiscordantMode) 300 | { 301 | pairedEndDiscordantMode = pairedEndMode = 1; 302 | } 303 | 304 | if (fastaFile == NULL) 305 | { 306 | fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n"); 307 | return 0; 308 | } 309 | 310 | if (seqFile1 == NULL && seqFile2 == NULL) 311 | { 312 | fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n"); 313 | return 0; 314 | } 315 | 316 | if (!pairedEndMode && seqFile2 != NULL) 317 | { 318 | fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n"); 319 | return 0; 320 | } 321 | 322 | if (pairedEndMode) 323 | { 324 | if (minPairEndedDistance < 0 && maxPairEndedDistance < 0) 325 | { 326 | pairedEndProfilingMode = 1; 327 | } 328 | else if ( minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance ) 329 | { 330 | fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n"); 331 | return 0; 332 | } 333 | 334 | if (seqFile1 == NULL) 335 | { 336 | fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n"); 337 | return 0; 338 | } 339 | } 340 | } 341 | 342 | int i = 0; 343 | 344 | sprintf(fileName[0], "%s", fastaFile); 345 | sprintf(fileName[1], "%s.index", fileName[0]); 346 | if (SNPMode) 347 | sprintf(fileName[2], "%s", SNPFile); 348 | 349 | if (!indexingMode) 350 | { 351 | fprintf(stdout, "# Threads: %d\n", THREAD_COUNT); 352 | for (i = 0; i < 255; i++) 353 | THREAD_ID[i] = i; 354 | } 355 | 356 | char fname1[FILE_NAME_LENGTH]; 357 | char fname2[FILE_NAME_LENGTH]; 358 | char fname3[FILE_NAME_LENGTH]; 359 | char fname4[FILE_NAME_LENGTH]; 360 | char 
fname5[FILE_NAME_LENGTH]; 361 | 362 | // Why is this one here? 363 | if (pairedEndMode) 364 | { 365 | sprintf(fname1, "%s__%s__1", mappingOutputPath, mappingOutput); 366 | sprintf(fname2, "%s__%s__2", mappingOutputPath, mappingOutput); 367 | sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); 368 | sprintf(fname4, "%s__%s__oea1", mappingOutputPath, mappingOutput); 369 | sprintf(fname5, "%s__%s__oea2", mappingOutputPath, mappingOutput); 370 | unlink(fname1); 371 | unlink(fname2); 372 | unlink(fname3); 373 | unlink(fname4); 374 | unlink(fname5); 375 | } 376 | initCommon(); 377 | return 1; 378 | } 379 | /**********************************************/ 380 | void finalizeCommandParser() 381 | { 382 | freeMem(mappingOutput, FILE_NAME_LENGTH); 383 | freeMem(unmappedOutput, FILE_NAME_LENGTH); 384 | freeMem(mappingOutputPath, FILE_NAME_LENGTH); 385 | freeMem(concordantStatOutput, FILE_NAME_LENGTH); 386 | } 387 | -------------------------------------------------------------------------------- /HashTable.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "Common.h" 42 | #include "RefGenome.h" 43 | #include "HashTable.h" 44 | #include "Output.h" 45 | #include "Sort.h" 46 | 47 | /**********************************************/ 48 | FILE *_ih_fp = NULL; 49 | IHashTable *_ih_hashTable = NULL; 50 | int _ih_maxHashTableSize = 0; 51 | unsigned int _ih_hashTableMemSize = 0; 52 | GeneralIndex *_ih_hashTableMem = NULL; 53 | int _ih_refGenLen = 0; 54 | CompressedSeq *_ih_crefGen = NULL; 55 | int _ih_crefGenLen = 0; 56 | char *_ih_refGenName = NULL; 57 | long long _ih_memUsage = 0; 58 | int _ih_refGenOff = 0; 59 | unsigned char *_ih_IOBuffer = NULL; 60 | unsigned int _ih_IOBufferSize = (1 << 24); 61 | int MAX_GENOME_INFO_SIZE = 10000000; 62 | int _ih_maxChrLength = 0; 63 | CompressedSeq *_ih_crefGenOrigin = NULL; // only used in pairedEndMode 64 | unsigned char *_ih_alphCnt = NULL; 65 | long int _ih_contigStartPos = 0; 66 | int _ih_chrCnt = 0; 67 | char **_ih_chrNames = 0; 68 
| pthread_t *_ih_threads = NULL; 69 | pthread_mutex_t _ih_writeLock; 70 | 71 | 72 | 73 | /**********************************************/ 74 | static inline int encodeVariableByte(unsigned char *buffer, unsigned int value) // returns number of bytes written to buffer 75 | { 76 | int t = 0; 77 | do { 78 | buffer[t++] = (unsigned char) (value & 127); 79 | value /= 128; 80 | } while (value != 0); 81 | buffer[t-1] |= 128; 82 | return t; 83 | } 84 | /**********************************************/ 85 | static inline unsigned int decodeVariableByte(unsigned char *buffer, unsigned int *result) // returns number of bytes read from the buffer 86 | { 87 | int i = 0; 88 | char t; 89 | *result = 0; 90 | do { 91 | t = buffer[i]; 92 | *result |= ((t&127) <<(7*i)); 93 | i++; 94 | } while ((t & 128) == 0); 95 | return i; 96 | } 97 | /**********************************************/ 98 | int compareCheckSumHT (const void *a, const void *b) 99 | { 100 | return ((GeneralIndex *)a)->checksum - ((GeneralIndex *)b)->checksum; 101 | } 102 | /**********************************************/ 103 | void initSavingIHashTable(char *fileName, char *genomeMetaInfo, int genomeMetaInfoLength) 104 | { 105 | // file header: 106 | // 1 byte (magicNumber): Magic number of HashTable (0: v3) 107 | // 1 byte (WINDOW_SIZE): Windows Size of indexing 108 | // 4 bytes (_ih_hsahTableMemSize): HashTbleMemSize: maximum number of elements that can be saved. 109 | // 4 bytes (_ih_IOBufferSize): memory required for reading hash table. In case the value is changed for loading. 110 | // 4 bytes (CONTIG_MAX_SIZE): maximum number of characters that can be in a contig. 
In case the value is changed for loading 111 | // n bytes (genomeMetaInfo): number of chromosomes, their names and lengths 112 | 113 | _ih_fp = fileOpen(fileName, "w"); 114 | unsigned char magicNumber = 2; 115 | 116 | int tmp; 117 | tmp = fwrite(&magicNumber, sizeof(magicNumber), 1, _ih_fp); 118 | tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); 119 | tmp = fwrite(&_ih_hashTableMemSize, sizeof(_ih_hashTableMemSize), 1, _ih_fp); 120 | tmp = fwrite(&_ih_IOBufferSize, sizeof(_ih_IOBufferSize), 1, _ih_fp); 121 | tmp = fwrite(&CONTIG_MAX_SIZE, sizeof(CONTIG_MAX_SIZE), 1, _ih_fp); 122 | tmp = fwrite(genomeMetaInfo, sizeof(char), genomeMetaInfoLength, _ih_fp); 123 | 124 | _ih_IOBuffer = getMem(_ih_IOBufferSize); 125 | } 126 | /**********************************************/ 127 | void finalizeSavingIHashTable() 128 | { 129 | // seeking back to hashTableMemSize to update the value 130 | fseek(_ih_fp, 2, SEEK_SET); 131 | fwrite(&_ih_hashTableMemSize, sizeof(_ih_hashTableMemSize), 1, _ih_fp); 132 | 133 | freeMem(_ih_IOBuffer,_ih_IOBufferSize); 134 | fclose(_ih_fp); 135 | } 136 | /**********************************************/ 137 | void saveHashTable(unsigned int *hashTable, unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset, unsigned char lastContig) 138 | { 139 | // 1 byte (extraInfo): Reserved; in case the contig has extra information 140 | // 2 bytes (len): Length of the reference genome name 141 | // n bytes (refGenName): Reference genome name 142 | // 4 bytes (refGenOfsset): Offset of the contig from the beginning of the chromosome 143 | // 4 bytes (refGenLength): Length of reference genome 144 | // n bytes (crefGen): compressed reference genome 145 | // 4 bytes (size): number of hashValues in hashTable with more than 0 locations 146 | // n bytes (bufferSize and buffer): array of bufferSize/buffer which includes encoded values of hashValue, count of locations 147 | 148 | int tmp, i; 149 | 150 | unsigned char extraInfo 
= lastContig; 151 | tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp); 152 | 153 | short len = strlen(refGenName); 154 | tmp = fwrite(&len, sizeof(len), 1, _ih_fp); 155 | tmp = fwrite(refGenName, sizeof(char), len, _ih_fp); 156 | 157 | tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp); 158 | 159 | unsigned int refGenLength = strlen(refGen); 160 | tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp); 161 | 162 | unsigned int crefGenLength = calculateCompressedLen(refGenLength); 163 | CompressedSeq *crefGen = getMem(crefGenLength * sizeof(CompressedSeq)); 164 | compressSequence(refGen, refGenLength, crefGen); 165 | tmp = fwrite(crefGen, sizeof(CompressedSeq), crefGenLength, _ih_fp); 166 | freeMem(crefGen, crefGenLength * sizeof(CompressedSeq)); 167 | 168 | unsigned int memSize = 0; 169 | for (i=0; i _ih_IOBufferSize - 10 ) 190 | { 191 | fwrite(&k, sizeof(int), 1, _ih_fp); 192 | fwrite(_ih_IOBuffer, sizeof(unsigned char), k, _ih_fp); 193 | k = 0; 194 | } 195 | } 196 | } 197 | if (k) 198 | { 199 | fwrite(&k, sizeof(int), 1, _ih_fp); 200 | fwrite(_ih_IOBuffer, sizeof(unsigned char), k, _ih_fp); 201 | } 202 | } 203 | /**********************************************/ 204 | int generateHashTable(char *fileName, char *indexName) 205 | { 206 | double startTime = getTime(); 207 | unsigned int hashTableSize = 0; 208 | int refGenOff = 0; 209 | unsigned int hashTableMaxSize = (1 << 2*WINDOW_SIZE); // 4^WINDOW_SIZE 210 | unsigned int *hashTable = getMem(hashTableMaxSize * sizeof(unsigned int)); 211 | char *genomeMetaInfo = getMem(MAX_GENOME_INFO_SIZE); 212 | int genomeMetaInfoLength; 213 | char *refGenName = NULL; 214 | char *refGen = NULL; 215 | char *c = NULL; 216 | char *prev = getMem (CONTIG_NAME_SIZE); 217 | int i, hv, l, flag, stack , val, loc; 218 | char lookup[128]; 219 | unsigned int windowMask = 0xffffffff >> (sizeof(unsigned int)*8 - WINDOW_SIZE*2); 220 | 221 | memset(lookup, 4, 128); 222 | lookup['A'] = 0; 223 | lookup['C'] = 1; 224 | 
lookup['G'] = 2; 225 | lookup['T'] = 3; 226 | lookup['N'] = 4; 227 | 228 | 229 | //Loading Fasta File 230 | prev[0]='\0'; 231 | 232 | if (!initLoadingRefGenome(fileName, genomeMetaInfo, &genomeMetaInfoLength)) 233 | return 0; 234 | initSavingIHashTable(indexName, genomeMetaInfo, genomeMetaInfoLength); 235 | fprintf(stdout, "Generating Index from %s", fileName); 236 | fflush(stdout); 237 | 238 | do 239 | { 240 | flag = loadRefGenome (&refGen, &refGenName, &refGenOff, &_ih_refGenLen); 241 | 242 | memset(hashTable, 0, hashTableMaxSize * sizeof(unsigned int)); 243 | hashTableSize = 0; 244 | 245 | if ( strcmp(prev, refGenName) != 0) 246 | { 247 | fprintf(stdout, "\n - %s ", refGenName); 248 | fflush(stdout); 249 | sprintf(prev, "%s", refGenName); 250 | } 251 | else 252 | { 253 | fprintf(stdout, "."); 254 | fflush(stdout); 255 | } 256 | 257 | c = refGen; 258 | i = hv = val = 0; 259 | stack = 1; 260 | loc = -WINDOW_SIZE+1; 261 | 262 | while (i++ < _ih_refGenLen) 263 | { 264 | loc++; 265 | val = lookup[*(c++)]; 266 | 267 | if (val != 4 && stack == WINDOW_SIZE) 268 | { 269 | hv = ((hv << 2)|val)&windowMask; 270 | if (hashTable[hv]++ == 0) 271 | hashTableSize++; 272 | } 273 | else 274 | { 275 | if (val == 4) 276 | { 277 | stack = 1; 278 | hv = 0; 279 | } 280 | else 281 | { 282 | stack ++; 283 | hv = (hv <<2)|val; 284 | } 285 | 286 | } 287 | } 288 | 289 | saveHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff, flag); 290 | } while (flag); 291 | 292 | freeMem(prev, CONTIG_NAME_SIZE); 293 | freeMem(hashTable, sizeof(unsigned int)*hashTableMaxSize); 294 | freeMem(genomeMetaInfo, MAX_GENOME_INFO_SIZE); 295 | 296 | finalizeLoadingRefGenome(); 297 | finalizeSavingIHashTable(); 298 | 299 | fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime)); 300 | return 1; 301 | } 302 | /**********************************************/ 303 | void rewindHashTable() 304 | { 305 | fseek(_ih_fp, _ih_contigStartPos, SEEK_SET); 306 | } 307 | 
/**********************************************/ 308 | int checkHashTable(char *fileName) 309 | { 310 | _ih_fp = fileOpen(fileName, "r"); 311 | 312 | unsigned char magicNumber; 313 | int tmp; 314 | 315 | tmp = fread(&magicNumber, sizeof(magicNumber), 1, _ih_fp); 316 | if (magicNumber == 1) 317 | { 318 | fprintf(stdout, "Error: Please use version 1.2.6.4 in bisulfite mode.\n"); 319 | return 0; 320 | } 321 | else if (magicNumber == 0) 322 | { 323 | fprintf(stdout, "Error: Please use version 2.x.x.x or upgrade your index.\n"); 324 | return 0; 325 | } 326 | 327 | tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); 328 | tmp = fread(&_ih_hashTableMemSize, sizeof(_ih_hashTableMemSize), 1, _ih_fp); 329 | tmp = fread(&_ih_IOBufferSize, sizeof(_ih_IOBufferSize), 1, _ih_fp); 330 | tmp = fread(&CONTIG_MAX_SIZE, sizeof(CONTIG_MAX_SIZE), 1, _ih_fp); 331 | fclose(_ih_fp); 332 | _ih_fp = NULL; 333 | return 1; 334 | } 335 | 336 | /**********************************************/ 337 | int initLoadingHashTable(char *fileName) 338 | { 339 | // file header: 340 | // 1 byte (magicNumber): Magic number of HashTable (0: v3) 341 | // 1 byte (WINDOW_SIZE): Windows Size of indexing 342 | // 4 bytes (_ih_hsahTableMemSize): HashTbleMemSize: maximum number of elements that can be saved. 343 | // 4 bytes (_ih_IOBufferSize): memory required for reading hash table. In case the value is changed for loading. 344 | // 4 bytes (CONTIG_MAX_SIZE): maximum number of characters that can be in a contig. 
In case the value is changed for loading 345 | // n bytes (genomeMetaInfo): number of chromosomes, their names and lengths 346 | 347 | if (_ih_fp == NULL) // first time 348 | _ih_fp = fileOpen(fileName, "r"); 349 | else 350 | rewind(_ih_fp); // rewind for mapping the next chunk of reads 351 | 352 | int i, numOfChrs, nameLen; 353 | unsigned char magicNumber; 354 | int tmp; 355 | 356 | _ih_threads = getMem(sizeof(pthread_t) * THREAD_COUNT); 357 | 358 | tmp = fread(&magicNumber, sizeof(magicNumber), 1, _ih_fp); 359 | tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); 360 | tmp = fread(&_ih_hashTableMemSize, sizeof(_ih_hashTableMemSize), 1, _ih_fp); 361 | 362 | _ih_hashTableMem = getMem(_ih_hashTableMemSize*sizeof(GeneralIndex)); 363 | 364 | tmp = fread(&_ih_IOBufferSize, sizeof(_ih_IOBufferSize), 1, _ih_fp); 365 | _ih_IOBuffer = getMem(_ih_IOBufferSize); 366 | 367 | tmp = fread(&CONTIG_MAX_SIZE, sizeof(CONTIG_MAX_SIZE), 1, _ih_fp); 368 | 369 | // Reading Meta 370 | char *strtmp = getMem(2*CONTIG_NAME_SIZE); 371 | 372 | tmp = fread(&_ih_chrCnt, sizeof(int), 1, _ih_fp); 373 | 374 | _ih_chrNames = getMem(_ih_chrCnt * sizeof(char *)); 375 | for (i = 0; i < _ih_chrCnt; i++) 376 | { 377 | _ih_chrNames[i] = getMem(CONTIG_NAME_SIZE); 378 | tmp = fread(&nameLen, sizeof(int), 1, _ih_fp); 379 | tmp = fread(_ih_chrNames[i], sizeof(char), nameLen, _ih_fp); 380 | _ih_chrNames[i][nameLen] = '\0'; 381 | tmp = fread(&_ih_refGenLen, sizeof(int), 1, _ih_fp); 382 | 383 | sprintf(strtmp,"@SQ\tSN:%s\tLN:%d%c", _ih_chrNames[i], _ih_refGenLen, '\0'); 384 | outputMeta(strtmp); 385 | 386 | if (_ih_refGenLen > _ih_maxChrLength) 387 | _ih_maxChrLength = _ih_refGenLen; 388 | } 389 | freeMem(strtmp, 2*CONTIG_NAME_SIZE); 390 | // Reading Meta End 391 | 392 | if (pairedEndMode) 393 | { 394 | _ih_crefGenOrigin = getMem((calculateCompressedLen(_ih_maxChrLength)+1) * sizeof(CompressedSeq)); 395 | _ih_crefGen = _ih_crefGenOrigin; 396 | } 397 | else 398 | { 399 | _ih_crefGen = 
getMem((calculateCompressedLen(CONTIG_MAX_SIZE)+1) * sizeof(CompressedSeq)); 400 | } 401 | 402 | _ih_maxHashTableSize = pow(4, WINDOW_SIZE); 403 | 404 | _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize); 405 | memset(_ih_hashTable, 0, _ih_maxHashTableSize * sizeof(IHashTable)); 406 | _ih_refGenName = getMem(CONTIG_NAME_SIZE); 407 | _ih_refGenName[0] = '\0'; 408 | if (!SNPMode) 409 | _ih_alphCnt = getMem(CONTIG_MAX_SIZE * 4); 410 | 411 | _ih_contigStartPos = ftell(_ih_fp); 412 | 413 | return 1; 414 | } 415 | /**********************************************/ 416 | void finalizeLoadingHashTable() 417 | { 418 | int i; 419 | freeMem(_ih_threads, sizeof(pthread_t) * THREAD_COUNT); 420 | 421 | freeMem(_ih_hashTableMem, _ih_hashTableMemSize * sizeof(GeneralIndex)); 422 | freeMem(_ih_IOBuffer, _ih_IOBufferSize); 423 | if (pairedEndMode) 424 | freeMem(_ih_crefGenOrigin, (calculateCompressedLen(_ih_maxChrLength)+1) * sizeof(CompressedSeq)); 425 | else 426 | freeMem(_ih_crefGen, (calculateCompressedLen(CONTIG_MAX_SIZE)+1) * sizeof(CompressedSeq)); 427 | freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); 428 | freeMem(_ih_refGenName, CONTIG_NAME_SIZE); 429 | if (!SNPMode) 430 | freeMem(_ih_alphCnt, CONTIG_MAX_SIZE * 4); 431 | for (i = 0; i < _ih_chrCnt; i++) 432 | freeMem(_ih_chrNames[i], CONTIG_NAME_SIZE); 433 | freeMem(_ih_chrNames, _ih_chrCnt * sizeof(char *)); 434 | fclose(_ih_fp); 435 | } 436 | /**********************************************/ 437 | void *calculateHashTableOnFly(int *idp) 438 | { 439 | int id = *idp; 440 | 441 | int windowMaskSize = WINDOW_SIZE + checkSumLength; 442 | unsigned long long windowMask = 0xffffffffffffffff >> (sizeof(unsigned long long)*8 - windowMaskSize*2); 443 | unsigned long long checkSumMask = 0xffffffffffffffff >> (sizeof(unsigned long long)*8 - (checkSumLength)*2); 444 | if (checkSumLength == 0) 445 | checkSumMask = 0; 446 | 447 | CompressedSeq *cnext = (_ih_crefGen); 448 | CompressedSeq cdata = *(cnext++); 
449 | 450 | int i = 0; 451 | unsigned long long hv = 0; 452 | unsigned long long hvtemp; 453 | int pos, val, t = 0, stack = 1; 454 | int loc = -WINDOW_SIZE - checkSumLength + 1 ; 455 | int x; 456 | // calculate refGen hashValues 457 | while (i++ < _ih_refGenLen ) // BORDER LINE CHECK 458 | { 459 | loc++; 460 | val = (cdata >> 60) & 7; 461 | if (++t == 21) 462 | { 463 | t = 0; 464 | cdata = *(cnext++); 465 | } 466 | else 467 | { 468 | cdata <<= 3; 469 | } 470 | 471 | if (val != 4 && stack == windowMaskSize) 472 | { 473 | hv = ((hv << 2)|val)&windowMask; 474 | hvtemp = hv >> (checkSumLength<<1); 475 | 476 | if (hvtemp % THREAD_COUNT == id) 477 | { 478 | ++_ih_hashTable[hvtemp].list; 479 | _ih_hashTable[hvtemp].list->info= loc; 480 | _ih_hashTable[hvtemp].list->checksum= hv & checkSumMask; 481 | } 482 | } 483 | else 484 | { 485 | if (val == 4) // N 486 | { 487 | stack = 1; 488 | hv = 0; 489 | } 490 | else 491 | { 492 | stack ++; 493 | hv = (hv <<2)|val; 494 | } 495 | 496 | } 497 | } 498 | return NULL; 499 | } 500 | /**********************************************/ 501 | /* Thread worker. For each bucket i = *id, *id+THREAD_COUNT, ... whose list is non-NULL: steps the list pointer backwards until it reaches the entry whose info equals _ih_refGenLen+1 (the marker loadHashTable writes at the head of every bucket), stores the number of steps in list[0].info, and sorts entries 1..cnt with introSortGI. Always returns NULL. */ void *sortHashTable(int *id) 502 | { 503 | int cnt; 504 | int i; 505 | for (i=*id; i<_ih_maxHashTableSize;i+=THREAD_COUNT) 506 | { 507 | if (_ih_hashTable[i].list == NULL) continue; 508 | cnt = 0; 509 | while (_ih_hashTable[i].list->info != _ih_refGenLen+1) 510 | { 511 | _ih_hashTable[i].list--; 512 | cnt++; 513 | } 514 | _ih_hashTable[i].list[0].info=cnt; 515 | if (cnt) 516 | introSortGI(_ih_hashTable[i].list, 1 , _ih_hashTable[i].list[0].info); 517 | } 518 | return NULL; 519 | } 520 | /**********************************************/ 521 | /* Thread worker. Fills the slice of _ih_alphCnt owned by thread *idp with one 4-byte record per reference position; byte k (k = 0..3) of a record counts how many of the SEQ_LENGTH most recently decoded bases had code k, and code 4 is not counted. The thread starts at compressed word id*rgBlockSize, where rgBlockSize = _ih_crefGenLen / THREAD_COUNT and each word yields 21 three-bit codes; for the last thread the iteration bound is the remaining reference length instead. Always returns NULL. NOTE(review): each record is copied through an unsigned int pointer cast from unsigned char storage, which assumes a 4-byte unsigned int and suitable alignment of _ih_alphCnt - confirm. */ void *countQGrams(int *idp) 522 | { 523 | int id = *idp; 524 | 525 | CompressedSeq *cnext, cdata; 526 | int i, t, val; 527 | 528 | int rgBlockSize = _ih_crefGenLen / THREAD_COUNT; 529 | int rgBlockStart = (rgBlockSize * id * 21); 530 | int rgBlockLen = rgBlockSize * 21; 531 | int rgBlockIt = rgBlockLen + SEQ_LENGTH - 1; 532 | if (id == THREAD_COUNT - 1) 533 | { 534 | 
rgBlockLen = _ih_refGenLen - id*rgBlockSize*21; 535 | rgBlockIt = rgBlockLen; 536 | } 537 | 538 | cnext = _ih_crefGen+(id*rgBlockSize); 539 | cdata = *(cnext++); 540 | t = 0; 541 | char outgoingChar[SEQ_LENGTH]; 542 | unsigned int *copy = (unsigned int *)(_ih_alphCnt+4*rgBlockStart); 543 | unsigned char *cur = (unsigned char *)copy; // current loc 544 | *copy = 0; 545 | 546 | for (i = 0; i < SEQ_LENGTH; i++) 547 | { 548 | val = (cdata >> 60) & 7; 549 | outgoingChar[i] = val; 550 | 551 | if (++t == 21) 552 | { 553 | t = 0; 554 | cdata = *(cnext++); 555 | } 556 | else 557 | { 558 | cdata <<= 3; 559 | } 560 | if (val != 4) 561 | (*(cur+val)) ++; 562 | } 563 | 564 | int o = 0; 565 | 566 | while (i++ < rgBlockIt) // BORDER LINE CHECK 567 | { 568 | cur = (unsigned char *)++copy; 569 | val = (cdata >> 60) & 7; 570 | if (++t == 21) 571 | { 572 | t = 0; 573 | cdata = *(cnext++); 574 | } 575 | else 576 | { 577 | cdata <<= 3; 578 | } 579 | 580 | *copy = *(copy-1); // copies all 4 bytes at once 581 | if (val != 4) 582 | (*(cur + val)) ++; 583 | if (outgoingChar[o]!= 4) 584 | (*(cur + outgoingChar[o])) --; 585 | outgoingChar[o] = val; 586 | o = (++o == SEQ_LENGTH) ?0 :o; 587 | } 588 | return NULL; 589 | } 590 | /**********************************************/ 591 | /* Loads the next index record from _ih_fp (layout listed at the top of the body), assigns each non-empty bucket a slice of _ih_hashTableMem headed by a marker entry with info = _ih_refGenLen+1, then runs calculateHashTableOnFly, sortHashTable and (unless SNPMode) countQGrams on THREAD_COUNT threads each. Writes the elapsed time to *loadTime. Returns 0 when the first fread does not return sizeof(extraInfo), otherwise extraInfo. NOTE(review): that first check compares the item count returned by fread with sizeof(extraInfo); the two agree only because extraInfo is a single byte. The other fread results are stored in tmp or dropped without being checked, and len is used to index _ih_refGenName without validation. */ int loadHashTable(double *loadTime) 592 | { 593 | // 1 byte (extraInfo): Reserved; in case the contig has extra information 594 | // 2 bytes (len): Length of the reference genome name 595 | // n bytes (refGenName): Reference genome name 596 | // 4 bytes (refGenOffset): Offset of the contig from the beginning of the chromosome 597 | // 4 bytes (refGenLength): Length of reference genome 598 | // n bytes (crefGen): compressed reference genome 599 | // 4 bytes (size): number of hashValues in hashTable with more than 0 locations 600 | // n bytes (bufferSize and buffer): array of bufferSize/buffer which includes encoded values of hashValue, count of locations 601 | 602 | int tmp; 603 | double startTime = getTime(); 604 | 
unsigned char extraInfo = 0; 605 | short len; 606 | unsigned int hashTableSize; 607 | unsigned int tmpSize; 608 | int i = 0, j; 609 | 610 | if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) ) 611 | { 612 | return 0; 613 | } 614 | 615 | memset(_ih_hashTable, 0, _ih_maxHashTableSize * sizeof(IHashTable)); 616 | 617 | // Reading Chr Name 618 | tmp = fread(&len, sizeof(len), 1, _ih_fp); 619 | tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp); 620 | _ih_refGenName [len] ='\0'; 621 | 622 | tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp); 623 | 624 | // Reading Size and Content of Ref Genome 625 | tmp = fread(&_ih_refGenLen, sizeof(_ih_refGenLen), 1, _ih_fp); 626 | 627 | _ih_crefGenLen = calculateCompressedLen(_ih_refGenLen); 628 | if (pairedEndMode) 629 | { 630 | _ih_crefGen = _ih_crefGenOrigin + _ih_refGenOff/21; 631 | } 632 | tmp = fread(_ih_crefGen, sizeof(CompressedSeq), _ih_crefGenLen, _ih_fp); 633 | 634 | 635 | //Reading Hashtable Size and Content 636 | GeneralIndex *mem =_ih_hashTableMem; 637 | 638 | tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp); 639 | 640 | int index = 0, bytesToRead; 641 | unsigned int diff; 642 | unsigned long long hv=0; 643 | i = 0; 644 | while (i < hashTableSize) 645 | { 646 | fread(&bytesToRead, sizeof(int), 1, _ih_fp); 647 | fread(_ih_IOBuffer, sizeof(unsigned char), bytesToRead, _ih_fp); 648 | index = 0; 649 | while (index < bytesToRead) 650 | { 651 | index += decodeVariableByte(_ih_IOBuffer + index, &diff); 652 | index += decodeVariableByte(_ih_IOBuffer + index, &tmpSize); 653 | hv += diff; 654 | _ih_hashTable[hv].list = mem; 655 | mem->info = _ih_refGenLen+1; 656 | /* head marker that sortHashTable walks back to; the bucket gets tmpSize further entries after it */ mem += (tmpSize + 1); 657 | i++; 658 | } 659 | } 660 | 661 | // // creating hash table 662 | for (i = 0; i < THREAD_COUNT; i++) 663 | pthread_create(_ih_threads + i, NULL, (void*)calculateHashTableOnFly, THREAD_ID + i); 664 | for (i = 0; i < THREAD_COUNT; i++) 665 | pthread_join(_ih_threads[i], NULL); 666 | 667 | // 
sorting based on checksum 668 | for (i = 0; i < THREAD_COUNT; i++) 669 | pthread_create(_ih_threads + i, NULL, (void*)sortHashTable, THREAD_ID + i); 670 | for (i = 0; i < THREAD_COUNT; i++) 671 | pthread_join(_ih_threads[i], NULL); 672 | 673 | // calculate alphabet count for each location in genome 674 | if (!SNPMode) 675 | { 676 | for (i = 0; i < THREAD_COUNT; i++) 677 | pthread_create(_ih_threads + i, NULL, (void*)countQGrams, THREAD_ID + i); 678 | for (i = 0; i < THREAD_COUNT; i++) 679 | pthread_join(_ih_threads[i], NULL); 680 | } 681 | 682 | *loadTime = getTime()-startTime; 683 | return extraInfo; 684 | } 685 | /**********************************************/ 686 | /* Returns the bucket list for hash value hv, or NULL when hv is -1, the bucket has no list, or list[0].info is 0. */ GeneralIndex *getCandidates(int hv) 687 | { 688 | if ( hv != -1 && _ih_hashTable[hv].list != NULL && _ih_hashTable[hv].list[0].info != 0) 689 | return _ih_hashTable[hv].list; 690 | else 691 | return NULL; 692 | } 693 | /**********************************************/ 694 | /* Returns the name buffer filled by loadHashTable (_ih_refGenName). */ char *getRefGenomeName() 695 | { 696 | return _ih_refGenName; 697 | } 698 | /**********************************************/ 699 | /* Returns the offset value read by loadHashTable (_ih_refGenOff). */ int getRefGenomeOffset() 700 | { 701 | return _ih_refGenOff; 702 | } 703 | /**********************************************/ 704 | /* Always returns NULL. */ HashTable *getHashTable() 705 | { 706 | return NULL; 707 | } 708 | /**********************************************/ 709 | /* Returns the compressed reference pointer (_ih_crefGen). */ CompressedSeq *getCmpRefGenome() 710 | { 711 | return _ih_crefGen; 712 | } 713 | /**********************************************/ 714 | /* Returns the reference length read by loadHashTable (_ih_refGenLen). */ int getRefGenLength() 715 | { 716 | return _ih_refGenLen; 717 | } 718 | /**********************************************/ 719 | /* Returns _ih_crefGenLen, which loadHashTable sets to calculateCompressedLen(_ih_refGenLen). */ int getCmpRefGenLength() 720 | { 721 | return _ih_crefGenLen; 722 | } 723 | /**********************************************/ 724 | /* Returns the array that countQGrams fills (_ih_alphCnt). */ unsigned char *getAlphabetCount() 725 | { 726 | return _ih_alphCnt; 727 | } 728 | /**********************************************/ 729 | /* Returns _ih_crefGenOrigin, the base pointer loadHashTable offsets from in paired-end mode. */ CompressedSeq *getCmpRefGenOrigin() 730 | { 731 | return _ih_crefGenOrigin; 732 | } 733 | 
/**********************************************/ 734 | /* Returns _ih_chrCnt. */ int getChrCnt() 735 | { 736 | return _ih_chrCnt; 737 | } 738 | /**********************************************/ 739 | /* Returns _ih_chrNames. */ char **getChrNames() 740 | { 741 | return _ih_chrNames; 742 | } 743 | /**********************************************/ 744 | /* Returns _ih_maxChrLength. */ int getMaxChrLength() 745 | { 746 | return _ih_maxChrLength; 747 | } 748 | -------------------------------------------------------------------------------- /Reads.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) <2008 - 2020>, University of Washington, Simon Fraser University 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * Redistributions of source code must retain the above copyright notice, this list 9 | * of conditions and the following disclaimer. 10 | * - Redistributions in binary form must reproduce the above copyright notice, this 11 | * list of conditions and the following disclaimer in the documentation and/or other 12 | * materials provided with the distribution. 13 | * - Neither the name of the nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without specific 15 | * prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | /* 31 | * Author: 32 | * Faraz Hach (fhach AT cs DOT sfu DOT ca) 33 | * Iman Sarrafi (isarrafi AT cs DOT sfu DOT ca) 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include "Common.h" 43 | #include "Reads.h" 44 | #include "MrsFAST.h" 45 | #include "Sort.h" 46 | 47 | FILE *_r_fp1; 48 | FILE *_r_fp2; 49 | FILE *_r_umfp; 50 | gzFile _r_gzfp1; 51 | gzFile _r_gzfp2; 52 | Read *_r_seq; 53 | int _r_uncroppedSeqLength; 54 | int _r_seqCnt; 55 | int _r_samplingLocsSize; 56 | int *_r_samplingLocs; 57 | int *_r_samplingLocsSeg; 58 | int *_r_samplingLocsOffset; 59 | int *_r_samplingLocsLen; 60 | int *_r_samplingLocsLenFull; 61 | int *_r_indexSize; 62 | pthread_t *_r_threads; 63 | unsigned char _r_fastq; 64 | ReadIndexTable **_r_readIndex; 65 | int *_r_readIndexSize; 66 | int _r_maxSeqCnt; 67 | int _r_firstIteration = 1; 68 | long long _r_readMemUsage = 0; 69 | char *_r_alphIndex = NULL; 70 | char checkSumLength = 0; 71 | /* Input buffers together with their fill size and read position. The second-input refill functions treat _r_buf1 == _r_buf2 as meaning that both inputs share the first buffer. */ char *_r_buf1; 72 | char *_r_buf2; 73 | int *_r_buf1_size; 74 | int *_r_buf2_size; 75 | int *_r_buf1_pos; 76 | int *_r_buf2_pos; 77 | /**********************************************/ 78 | /* Refill hooks invoked by readFirstSeq and readSecondSeq when a buffer has been fully consumed. */ void (*readBuffer1)(); 79 | void (*readBuffer2)(); 80 | /**********************************************/ 81 | /* Refills _r_buf1 from _r_gzfp1 with a gzread of up to 10000000 bytes and resets the read position to 0. NOTE(review): zlib documents gzread as returning -1 on error, while the readers only compare the stored size with 0 - confirm how read errors should be handled. */ void readBufferGZ1() 82 | { 83 | (*_r_buf1_size) = gzread(_r_gzfp1, _r_buf1, 10000000); 84 | (*_r_buf1_pos) = 0; 85 | } 86 | 
/**********************************************/ 87 | /* Second-input counterpart of readBufferGZ1: when both buffers are the same object it delegates to readBuffer1, otherwise it refills _r_buf2 from _r_gzfp2 and resets its read position. */ void readBufferGZ2() 88 | { 89 | if (_r_buf1 == _r_buf2) 90 | readBuffer1(); 91 | else 92 | { 93 | (*_r_buf2_size) = gzread(_r_gzfp2, _r_buf2, 10000000); 94 | (*_r_buf2_pos) = 0; 95 | } 96 | } 97 | /**********************************************/ 98 | /* Refills _r_buf1 from _r_fp1 with an fread of up to 10000000 bytes and resets the read position to 0. */ void readBufferTxT1() 99 | { 100 | (*_r_buf1_size) = fread(_r_buf1, 1, 10000000, _r_fp1); 101 | (*_r_buf1_pos) = 0; 102 | } 103 | /**********************************************/ 104 | /* Second-input counterpart of readBufferTxT1: delegates to readBuffer1 when both buffers are the same object, otherwise refills _r_buf2 from _r_fp2 and resets its read position. */ void readBufferTxT2() 105 | { 106 | if (_r_buf1 == _r_buf2) 107 | readBuffer1(); 108 | else 109 | { 110 | (*_r_buf2_size) = fread(_r_buf2, 1, 10000000, _r_fp2); 111 | (*_r_buf2_pos) = 0; 112 | } 113 | } 114 | /**********************************************/ 115 | /* Copies the next line of the first input into seq, NUL-terminates it and returns its length; returns 0 when a refill produces no data (an empty line also yields 0). If a space or tab was seen at an index greater than 0, the result is cut at the first such index. For even values of line, characters are no longer stored once i equals cropSize (when cropSize > 0), and the first _r_uncroppedSeqLength - tailCropSize characters are skipped (when tailCropSize > 0). NOTE(review): writes to seq are not bounds-checked, and the refill size is only compared with 0. */ int readFirstSeq(char *seq , int line) 116 | { 117 | int i=0, l=0, j = 0; 118 | char cur; 119 | 120 | while (1) 121 | { 122 | if (*_r_buf1_pos ==*_r_buf1_size) 123 | { 124 | readBuffer1(); 125 | if (*_r_buf1_size == 0) 126 | return 0; 127 | } 128 | 129 | cur = _r_buf1[*_r_buf1_pos]; 130 | (*_r_buf1_pos)++; 131 | j++; 132 | 133 | if ( cur == '\n') 134 | { 135 | if (l>0) i=l; 136 | seq[i]='\0'; 137 | return i; 138 | } 139 | 140 | if (l==0 && (cur == ' ' || cur == '\t')) 141 | { 142 | l = i; 143 | } 144 | 145 | if (cropSize>0 && line%2==0 && i==cropSize) 146 | continue; 147 | if (tailCropSize>0 && line%2==0 && j <= _r_uncroppedSeqLength - tailCropSize) 148 | continue; 149 | 150 | seq[i++]=cur; 151 | 152 | } 153 | } 154 | /**********************************************/ 155 | /* Same line reader as readFirstSeq, but it consumes _r_buf2 and refills through readBuffer2. */ int readSecondSeq( char *seq, int line ) 156 | { 157 | int i=0, l=0, j = 0;; 158 | char cur; 159 | while (1) 160 | { 161 | if (*_r_buf2_pos ==*_r_buf2_size) 162 | { 163 | readBuffer2(); 164 | if (*_r_buf2_size == 0) 165 | return 0; 166 | } 167 | 168 | cur = _r_buf2[*_r_buf2_pos]; 169 | (*_r_buf2_pos)++; 170 | j++; 171 | if ( cur == '\n') 172 | { 173 | if (l>0) i=l; 174 | seq[i]='\0'; 175 | return i; 176 | } 177 | 178 | if ((cur == ' ' || cur == '\t')&& 
l==0) 179 | { 180 | l = i; 181 | } 182 | 183 | if (cropSize>0 && line%2==0 && i==cropSize) 184 | continue; 185 | if (tailCropSize>0 && line%2==0 && j <= _r_uncroppedSeqLength - tailCropSize) 186 | continue; 187 | 188 | seq[i++]=cur; 189 | 190 | } 191 | } 192 | /**********************************************/ 193 | /* Ordering callback for Pair records: by hv first, then by checksum. NOTE(review): the result is a plain subtraction, so it relies on the differences fitting in int - confirm against the field types of Pair. */ int compare (const void *a, const void *b) 194 | { 195 | Pair *x=(Pair *)a; 196 | Pair *y=(Pair *)b; 197 | 198 | if (x->hv == y->hv) 199 | return x->checksum - y->checksum; 200 | else 201 | return x->hv - y->hv; 202 | } 203 | /**********************************************/ 204 | /* Hands the caller the read index tables and the array of their sizes (_r_readIndex, _r_readIndexSize). */ void getReadIndex(ReadIndexTable ***rIndex, int **rIndexSize) 205 | { 206 | *rIndex = _r_readIndex; 207 | *rIndexSize = _r_readIndexSize; 208 | } 209 | 210 | /**********************************************/ 211 | /* Thread worker for a slice of div reads starting at id*div, where div is _r_seqCnt / THREAD_COUNT rounded up. For each read it stores the four alphabet counts, compresses seq and rseq, and emits 2*_r_samplingLocsSize Pair records: hash value and checksum at every sampling location of seq, then of rseq, with hv = -1 and checksum = 0 when either value is -1 or when the read was marked. The records are then sorted with introSortPair. NOTE(review): parts of this function are missing from this copy of the file (text between some less-than and greater-than characters was lost), so the loop header and everything after the sort are not described here. */ void *preProcessReads(int *idp) 212 | { 213 | int id = *idp; 214 | int i=0, j=0, pos=0, tmpSize=0; 215 | 216 | int32_t hvtmp, cstmp; 217 | 218 | int div = _r_seqCnt / THREAD_COUNT; 219 | div += (_r_seqCnt % THREAD_COUNT)?1:0; 220 | Pair *tmp = getMem(sizeof(Pair)*(div * _r_samplingLocsSize*2)); 221 | char alphCnt[5]; 222 | char *a, *b; 223 | 224 | for (i=id*div; ierrThreshold) 241 | _r_seq[i].hits[0]=1; 242 | 243 | _r_seq[i].alphCnt[0] = alphCnt[0]; 244 | _r_seq[i].alphCnt[1] = alphCnt[1]; 245 | _r_seq[i].alphCnt[2] = alphCnt[2]; 246 | _r_seq[i].alphCnt[3] = alphCnt[3]; 247 | 248 | compressSequence(_r_seq[i].seq, SEQ_LENGTH, _r_seq[i].cseq); 249 | compressSequence(_r_seq[i].rseq, SEQ_LENGTH, _r_seq[i].crseq); 250 | 251 | if (_r_seq[i].hits[0] == 1) // marked reads are not indexed 252 | { 253 | _r_seq[i].hits[0] = 0; 254 | for (j=0; j< 2*_r_samplingLocsSize; j++) 255 | { 256 | tmp[pos].hv = -1; 257 | tmp[pos].checksum = 0; 258 | tmp[pos].seqInfo = pos +(div*id*2*_r_samplingLocsSize); 259 | pos++; 260 | } 261 | } 262 | else 263 | { 264 | for (j=0; j< _r_samplingLocsSize; j++) 265 | { 266 | hvtmp = hashVal(_r_seq[i].seq+_r_samplingLocs[j]); 267 | cstmp = 
checkSumVal(_r_seq[i].seq+_r_samplingLocs[j]+WINDOW_SIZE); 268 | if (hvtmp == -1 || cstmp == -1) 269 | { 270 | tmp[pos].hv = -1; 271 | tmp[pos].checksum = 0; 272 | } 273 | else 274 | { 275 | tmp[pos].hv = hvtmp; 276 | tmp[pos].checksum = cstmp; 277 | } 278 | tmp[pos].seqInfo = pos +(div*id*2*_r_samplingLocsSize); 279 | pos++; 280 | } 281 | 282 | for (j=0; j<_r_samplingLocsSize; j++) 283 | { 284 | hvtmp = hashVal(_r_seq[i].rseq+_r_samplingLocs[j]); 285 | cstmp = checkSumVal(_r_seq[i].rseq+_r_samplingLocs[j]+WINDOW_SIZE); 286 | 287 | if (hvtmp == -1 || cstmp == -1) 288 | { 289 | tmp[pos].hv = -1; 290 | tmp[pos].checksum = 0; 291 | } 292 | else 293 | { 294 | tmp[pos].hv = hvtmp; 295 | tmp[pos].checksum = cstmp; 296 | } 297 | tmp[pos].seqInfo = pos+(div*id*2*_r_samplingLocsSize); 298 | pos++; 299 | } 300 | 301 | } 302 | tmpSize+=2*_r_samplingLocsSize; 303 | } 304 | 305 | introSortPair( tmp, 0, tmpSize-1); 306 | 307 | int uniq = 0; 308 | int prev = -2; 309 | int beg = -1; 310 | int end = -1; 311 | 312 | for (i=0; i SEQ_LENGTH) 377 | _r_samplingLocs[i] = SEQ_LENGTH - WINDOW_SIZE; 378 | } 379 | _r_samplingLocs[_r_samplingLocsSize]=SEQ_LENGTH; 380 | 381 | int size = sizeof(int)*_r_samplingLocsSize; 382 | _r_samplingLocsSeg = getMem(size); 383 | _r_samplingLocsOffset = getMem(size); 384 | _r_samplingLocsLen = getMem(size); 385 | _r_samplingLocsLenFull = getMem(size); 386 | for (i=0; i<_r_samplingLocsSize; i++) 387 | { 388 | _r_samplingLocsSeg[i] = _r_samplingLocs[i] / (sizeof(CompressedSeq)*8/3); 389 | _r_samplingLocsOffset[i] = _r_samplingLocs[i] % (sizeof(CompressedSeq)*8/3); 390 | _r_samplingLocsLen[i] = _r_samplingLocs[i+1] - _r_samplingLocs[i]; 391 | _r_samplingLocsLenFull[i] = SEQ_LENGTH - _r_samplingLocs[i]; 392 | } 393 | 394 | 395 | // Outputing the sampling locations 396 | /*int j; 397 | for (i=0; i') 505 | _r_fastq = 0; 506 | else 507 | _r_fastq = 1; 508 | 509 | readFirstSeq(dummy,1); 510 | int nameLen = strlen(dummy); 511 | readFirstSeq(dummy,2); 512 | 
*_r_buf1_pos = 0; 513 | int seqLen = strlen(dummy); 514 | SEQ_LENGTH = 0; 515 | i = 0; 516 | while (i 0) 525 | SEQ_LENGTH = cropSize; 526 | if (tailCropSize > 0) 527 | SEQ_LENGTH = tailCropSize; 528 | 529 | if ( SEQ_LENGTH >= SEQ_MAX_LENGTH ) 530 | { 531 | fprintf(stdout, "ERR: Read Length is greater than the MAX length we can process (Current Max: %d).\n", SEQ_MAX_LENGTH); 532 | exit(EXIT_FAILURE); 533 | } 534 | 535 | if (_r_fastq) 536 | { 537 | QUAL_LENGTH = SEQ_LENGTH; 538 | } 539 | else 540 | { 541 | QUAL_LENGTH = 1; 542 | } 543 | 544 | CMP_SEQ_LENGTH = calculateCompressedLen(SEQ_LENGTH); 545 | 546 | //TODO MEMORY CALCULATION FIX 547 | double readMem = sizeof(Read) + (2 + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH * 2 * 8) + (nameLen+10) + 4); 548 | readMem += ((bestMappingMode) ?(sizeof(FullMappingInfo)) :0); 549 | if (pairedEndMode) 550 | readMem += sizeof(MappingInfo) + sizeof(MappingLocations); 551 | 552 | _r_maxSeqCnt = (int)(((MAX_MEMORY-1.2) * (1 << 30))/readMem); 553 | if ( pairedEndMode && _r_maxSeqCnt % 2 ) 554 | _r_maxSeqCnt ++; 555 | _r_maxSeqCnt -= _r_maxSeqCnt % THREAD_COUNT; 556 | 557 | //_r_maxSeqCnt = 500000; 558 | 559 | _r_seq = getMem(sizeof(Read)*_r_maxSeqCnt); 560 | 561 | int maxErrThreshold = (SEQ_LENGTH/WINDOW_SIZE) - 1; 562 | if (errThreshold == -1) 563 | { 564 | errThreshold = SEQ_LENGTH*6/100; 565 | fprintf(stdout, "# Errors: %d\n", errThreshold); 566 | } 567 | if (errThreshold > maxErrThreshold && SEQ_LENGTH>0) 568 | { 569 | errThreshold = maxErrThreshold; 570 | fprintf(stdout, "# Error: %d (full sensitivity)\n", errThreshold); 571 | } 572 | 573 | 574 | checkSumLength = (SEQ_LENGTH / (errThreshold+1)) - WINDOW_SIZE; 575 | if (checkSumLength > sizeof(CheckSumType)*4) 576 | checkSumLength = sizeof(CheckSumType)*4; 577 | 578 | calculateSamplingLocations(); 579 | 580 | 581 | if (!nohitDisabled) 582 | { 583 | _r_umfp = fileOpen(unmappedOutput, "w"); 584 | } 585 | 586 | _r_alphIndex = getMem(128); // used in readChunk() 587 | 
_r_alphIndex['A'] = 0; 588 | _r_alphIndex['C'] = 1; 589 | _r_alphIndex['G'] = 2; 590 | _r_alphIndex['T'] = 3; 591 | _r_alphIndex['N'] = 4; 592 | 593 | return 1; 594 | } 595 | 596 | /**********************************************/ 597 | int readChunk(Read **seqList, unsigned int *seqListSize) 598 | { 599 | double startTime=getTime(); 600 | 601 | char seq1[SEQ_MAX_LENGTH]; 602 | char name1[SEQ_MAX_LENGTH]; 603 | char qual1[SEQ_MAX_LENGTH]; 604 | 605 | char seq2[SEQ_MAX_LENGTH]; 606 | char name2[SEQ_MAX_LENGTH]; 607 | char qual2[SEQ_MAX_LENGTH]; 608 | 609 | 610 | char dummy[SEQ_MAX_LENGTH]; 611 | int size; 612 | 613 | int maxCnt = 0; 614 | _r_seqCnt = 0; 615 | _r_readMemUsage = 0; 616 | 617 | int i;//, len; 618 | 619 | int namelen; 620 | while( (namelen = readFirstSeq(name1,1)) ) 621 | { 622 | 623 | if (pairedEndMode) 624 | { 625 | if (name1[namelen-2]=='/' && name1[namelen-1]=='1') 626 | { 627 | namelen -= 2; 628 | name1[namelen]='\0'; 629 | } 630 | } 631 | size = sizeof(uint16_t) + (SEQ_LENGTH * 2) + QUAL_LENGTH + 3 + (CMP_SEQ_LENGTH << 4) + namelen +/* 1 +*/ 4; 632 | _r_seq[_r_seqCnt].hits = getMem(size); 633 | _r_readMemUsage += size; 634 | _r_seq[_r_seqCnt].seq = (char *)(_r_seq[_r_seqCnt].hits + 1); 635 | _r_seq[_r_seqCnt].rseq = (char *)(_r_seq[_r_seqCnt].seq + SEQ_LENGTH + 1); 636 | _r_seq[_r_seqCnt].qual = (char *)(_r_seq[_r_seqCnt].rseq + SEQ_LENGTH + 1); 637 | _r_seq[_r_seqCnt].cseq = (CompressedSeq *)(_r_seq[_r_seqCnt].qual + QUAL_LENGTH + 1); 638 | _r_seq[_r_seqCnt].crseq = (CompressedSeq *)(_r_seq[_r_seqCnt].cseq + CMP_SEQ_LENGTH); 639 | _r_seq[_r_seqCnt].name = (char *)(_r_seq[_r_seqCnt].crseq + CMP_SEQ_LENGTH); 640 | _r_seq[_r_seqCnt].alphCnt = (unsigned char *)(_r_seq[_r_seqCnt].name + namelen);// + 1); 641 | /* hits, seq, rseq, qual, cseq, crseq, name and alphCnt all point into the single getMem(size) block allocated above, laid out in that order */ _r_seq[_r_seqCnt].hits[0] = 0; 642 | 643 | for (i=1; i= _r_maxSeqCnt) 701 | break; 702 | } 703 | *seqList = _r_seq; 704 | *seqListSize = _r_seqCnt; 705 | 706 | if (_r_seqCnt > 0) 707 | { 708 | preProcessReadsMT(); 709 | fprintf(stdout, "| 
*Reading Input* | %15.2f | XXXXXXXXXXXXXXX | %15.2f | XXXXXXXXXXXXXXX %15d |\n", (getTime()-startTime), getMemUsage(), _r_seqCnt ); 710 | _r_firstIteration = 0; 711 | } 712 | else if (_r_firstIteration) 713 | { 714 | fprintf(stdout, "ERR: No reads for mapping\n"); 715 | exit(EXIT_FAILURE); 716 | } 717 | 718 | if (_r_seqCnt < _r_maxSeqCnt) // reached end of file 719 | /* readChunk result: 0 when fewer than _r_maxSeqCnt reads were loaded into this chunk, 1 when the chunk is full. NOTE(review): in paired-end mode name1[namelen-2] is read without checking that namelen is at least 2. */ return 0; 720 | else 721 | return 1; 722 | } 723 | /**********************************************/ 724 | /* Writes reads whose hits[0] is 0 to _r_umfp, as FASTQ records when _r_fastq is set and as FASTA records otherwise; does nothing when nohitDisabled is set. In paired-end mode reads are handled as consecutive pairs (2*i and 2*i+1), only hits[0] of the first mate is tested, and both mates are written under the name of the first mate with /1 and /2 suffixes. _r_seqCnt is halved for the loop and restored afterwards. */ void outputUnmapped() 725 | { 726 | if (nohitDisabled) 727 | return; 728 | 729 | if (pairedEndMode) 730 | _r_seqCnt /=2; 731 | 732 | int i = 0; 733 | for (i = 0; i < _r_seqCnt; i++) 734 | { 735 | if (pairedEndMode) 736 | { 737 | if (_r_seq[2*i].hits[0] == 0 && _r_fastq) 738 | { 739 | fprintf(_r_umfp,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); 740 | } 741 | else if (_r_seq[2*i].hits[0] == 0) 742 | { 743 | fprintf(_r_umfp, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); 744 | } 745 | } 746 | else 747 | { 748 | if (_r_seq[i].hits[0] == 0 && _r_fastq) 749 | { 750 | fprintf(_r_umfp,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); 751 | } 752 | else if (_r_seq[i].hits[0] == 0) 753 | { 754 | fprintf(_r_umfp,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); 755 | } 756 | } 757 | } 758 | 759 | if (pairedEndMode) 760 | _r_seqCnt *= 2; 761 | } 762 | /**********************************************/ 763 | /* Calls outputUnmapped, then frees every per-read block through its hits pointer (the base of each allocation made in readChunk), subtracts _r_readMemUsage from memUsage, and frees the read index lists, the per-thread tables and the size array. */ void releaseChunk() 764 | { 765 | outputUnmapped(); 766 | 767 | int i, j; 768 | for (i = 0; i < _r_seqCnt; i++) 769 | freeMem(_r_seq[i].hits, 0); 770 | memUsage -= _r_readMemUsage; 771 | _r_readMemUsage = 0; 772 | 773 | for (i = 0; i < THREAD_COUNT; i++) 774 | { 775 | for (j = 0; j < _r_readIndexSize[i]; j++) 776 | freeMem(_r_readIndex[i][j].list, (_r_readIndex[i][j].list[0].info+1)*sizeof(GeneralIndex)); 777 | freeMem(_r_readIndex[i], 
sizeof(ReadIndexTable)*_r_readIndexSize[i]); 778 | } 779 | freeMem(_r_readIndex, sizeof(ReadIndexTable*)*THREAD_COUNT); 780 | freeMem(_r_readIndexSize, sizeof(int)*THREAD_COUNT); 781 | } 782 | /**********************************************/ 783 | /* Copies the sampling-location array pointers and their element count into the caller-supplied out-parameters. */ void getSamplingLocsInfo(int **samplingLocs, int **samplingLocsSeg, int **samplingLocsOffset, int **samplingLocsLen, int **samplingLocsLenFull, int *samplingLocsSize) 784 | { 785 | *samplingLocs = _r_samplingLocs; 786 | *samplingLocsSeg = _r_samplingLocsSeg; 787 | *samplingLocsOffset = _r_samplingLocsOffset; 788 | *samplingLocsLen = _r_samplingLocsLen; 789 | *samplingLocsLenFull = _r_samplingLocsLenFull; 790 | *samplingLocsSize = _r_samplingLocsSize; 791 | } 792 | 793 | /**********************************************/ 794 | /* Closes the input streams (FILE handles when seqCompressed is 0, gz handles otherwise; the second one only in paired-end mode and only when it differs from the first), frees the read array, the sampling-location arrays, _r_alphIndex and the input buffers with their size and position cells, and closes _r_umfp unless nohitDisabled is set. */ void finalizeReads() 795 | { 796 | if (!seqCompressed) 797 | { 798 | fclose(_r_fp1); 799 | if ( pairedEndMode && _r_fp2 != _r_fp1 ) 800 | { 801 | fclose(_r_fp2); 802 | } 803 | } 804 | else 805 | { 806 | gzclose(_r_gzfp1); 807 | if ( pairedEndMode && _r_gzfp2 != _r_gzfp1) 808 | { 809 | gzclose(_r_gzfp2); 810 | } 811 | } 812 | freeMem(_r_seq, sizeof(Read)*_r_maxSeqCnt); 813 | freeMem(_r_samplingLocs, sizeof(int)*(_r_samplingLocsSize+1)); 814 | int size = sizeof(int)*_r_samplingLocsSize; 815 | freeMem(_r_samplingLocsSeg, size); 816 | freeMem(_r_samplingLocsOffset, size); 817 | freeMem(_r_samplingLocsLen, size); 818 | freeMem(_r_samplingLocsLenFull, size); 819 | freeMem(_r_alphIndex, 128); 820 | 821 | if (pairedEndMode && _r_buf1 != _r_buf2) 822 | { 823 | freeMem(_r_buf2, 10000000); 824 | freeMem(_r_buf2_pos, sizeof(int)); 825 | freeMem(_r_buf2_size, sizeof(int)); 826 | } 827 | freeMem(_r_buf1, 10000000); 828 | freeMem(_r_buf1_pos, sizeof(int)); 829 | freeMem(_r_buf1_size, sizeof(int)); 830 | 831 | if (!nohitDisabled) 832 | { 833 | fclose(_r_umfp); 834 | } 835 | } 836 | 837 | /**********************************************/ 838 | /*int checkAllReads() 839 | { 840 | char seq[SEQ_MAX_LENGTH]; 841 | char 
name[SEQ_MAX_LENGTH]; 842 | FILE *fp = _r_fp1; 843 | int flag, firstIteration = 1; 844 | int i = 0, seqCnt[2]; 845 | seqCnt[0] = seqCnt[1] = 0; 846 | 847 | do { 848 | while (readSeq(name, fp)) // name 849 | { 850 | readSeq(seq, fp); // seq 851 | seqCnt[i]++; 852 | 853 | if ( strlen(seq)-1 != SEQ_LENGTH ) 854 | { 855 | rewind(fp); 856 | fprintf(stderr, "ERR: Inconsistent read length %s", name); 857 | return 0; 858 | } 859 | if (_r_fastq) 860 | { 861 | readSeq(seq, fp); // 3rd line 862 | readSeq(seq, fp); // qual 863 | } 864 | } 865 | 866 | if (firstIteration && pairedEndMode && _r_fp2 != _r_fp1) 867 | { 868 | flag = 1; 869 | rewind(_r_fp1); 870 | fp = _r_fp2; 871 | firstIteration = 0; 872 | i++; 873 | } 874 | else 875 | { 876 | flag = 0; 877 | } 878 | 879 | } while (flag); 880 | 881 | rewind(fp); 882 | 883 | if (pairedEndMode) 884 | { 885 | if (_r_fp1 == _r_fp2) 886 | { 887 | if (seqCnt[0] & 1) 888 | { 889 | fprintf(stderr, "ERR: In paired-end mode, number of reads must be divisible by 2\n"); 890 | return 0; 891 | } 892 | } 893 | else 894 | { 895 | if (seqCnt[0] != seqCnt[1]) 896 | { 897 | fprintf(stderr, "ERR: Number of reads must be equal in the input files\n"); 898 | return 0; 899 | } 900 | } 901 | } 902 | 903 | fprintf(stdout, "Input check: OK\n"); 904 | return 1; 905 | } 906 | */ 907 | --------------------------------------------------------------------------------