├── utils ├── debloom ├── README ├── Bloom ├── DoubleKmer ├── tests │ ├── RunTests │ ├── RunTests.o │ ├── BloomTests.o │ ├── JCheckTests.o │ ├── KmerTests.o │ ├── TestUtils.o │ ├── JunctionTests.o │ ├── JunctionMapTests.o │ ├── RollingHashTests.o │ ├── JunctionMapTests.cpp │ ├── JunctionTests.h │ ├── JunctionMapTests.h │ ├── BloomTests.h │ ├── KmerTests.h │ ├── JCheckTests.h │ ├── JunctionTests.cpp │ ├── RollingHashTests.h │ ├── RunTests.h │ ├── RunTests.cpp │ ├── makefile │ ├── TestUtils.h │ ├── TestUtils.cpp │ ├── JCheckTests.cpp │ ├── BloomTests.cpp │ ├── KmerTests.cpp │ └── RollingHashTests.cpp ├── manual │ ├── manual.pdf │ └── manual.tex ├── JuncPairs.cpp ├── Cap.h ├── simple_test.sh ├── DoubleKmer.h ├── Cap.cpp ├── JChecker.h ├── DoubleKmer.cpp ├── LargeInt.h ├── JuncPairs.h ├── Junction.h ├── ReadKmer.h ├── ContigJuncList.h ├── Junction.cpp ├── JChecker.cpp ├── lut.h ├── rvalues.h ├── ReadKmer.cpp ├── Kmer.h ├── JunctionMap.h ├── ttmath │ ├── ttmathmisc.h │ └── ttmaththreads.h ├── ContigJuncList.cpp ├── LargeInt.cpp └── Bloom.h ├── src ├── tests │ ├── RunTests │ ├── RunTests.o │ ├── TestUtils.o │ ├── FullTest.h │ ├── olderTests │ │ ├── TraverseReadsTests.o │ │ ├── FindNextJunctionTests.o │ │ ├── GetReadJunctionsTests.o │ │ ├── TraverseReadsTests.h │ │ ├── FindNextJunctionTests.h │ │ ├── GetReadJunctionsTests.h │ │ ├── GetReadJunctionsTests.cpp │ │ ├── FindNextJunctionTests.cpp │ │ └── TraverseReadsTests.cpp │ ├── RunTests.h │ ├── FullTeset.cpp │ ├── RunTests.cpp │ ├── makefile │ ├── TestUtils.h │ └── TestUtils.cpp ├── wget_urls ├── BfSearchResult.h ├── ContigIterator.h ├── stream_data_from_urls_list.sh ├── disk_mem_used ├── Faucet.h ├── ContigIterator.cpp ├── Contig.h ├── newTests │ ├── ContigTest.cpp │ ├── JunctionMapTest.cpp │ └── ReadscanTest.cpp ├── ContigNode.h ├── ReadScanner.h ├── ContigGraph.h ├── ContigNode.cpp └── Contig.cpp ├── LICENSE └── README.md /utils/debloom: -------------------------------------------------------------------------------- 1 
| -------------------------------------------------------------------------------- /utils/README: -------------------------------------------------------------------------------- 1 | Utils for Mink and Minia. 2 | -------------------------------------------------------------------------------- /utils/Bloom: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/Bloom -------------------------------------------------------------------------------- /utils/DoubleKmer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/DoubleKmer -------------------------------------------------------------------------------- /src/tests/RunTests: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/RunTests -------------------------------------------------------------------------------- /src/tests/RunTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/RunTests.o -------------------------------------------------------------------------------- /utils/tests/RunTests: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/RunTests -------------------------------------------------------------------------------- /src/tests/TestUtils.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/TestUtils.o -------------------------------------------------------------------------------- /utils/tests/RunTests.o: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/RunTests.o -------------------------------------------------------------------------------- /utils/manual/manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/manual/manual.pdf -------------------------------------------------------------------------------- /utils/tests/BloomTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/BloomTests.o -------------------------------------------------------------------------------- /utils/tests/JCheckTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/JCheckTests.o -------------------------------------------------------------------------------- /utils/tests/KmerTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/KmerTests.o -------------------------------------------------------------------------------- /utils/tests/TestUtils.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/TestUtils.o -------------------------------------------------------------------------------- /utils/tests/JunctionTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/JunctionTests.o -------------------------------------------------------------------------------- /utils/tests/JunctionMapTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/JunctionMapTests.o 
-------------------------------------------------------------------------------- /utils/tests/RollingHashTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/RollingHashTests.o -------------------------------------------------------------------------------- /src/tests/FullTest.h: -------------------------------------------------------------------------------- 1 | #ifndef FULL_TEST 2 | #define FULL_TEST 3 | 4 | #include "TestUtils.h" 5 | 6 | void runFullTest(); 7 | 8 | #endif -------------------------------------------------------------------------------- /src/tests/olderTests/TraverseReadsTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/olderTests/TraverseReadsTests.o -------------------------------------------------------------------------------- /src/tests/olderTests/FindNextJunctionTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/olderTests/FindNextJunctionTests.o -------------------------------------------------------------------------------- /src/tests/olderTests/GetReadJunctionsTests.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/olderTests/GetReadJunctionsTests.o -------------------------------------------------------------------------------- /utils/tests/JunctionMapTests.cpp: -------------------------------------------------------------------------------- 1 | #include "JunctionMapTests.h" 2 | 3 | void runJunctionMapTests(){ 4 | printf("No junction map tests yet.\n"); 5 | } -------------------------------------------------------------------------------- /src/tests/RunTests.h: 
-------------------------------------------------------------------------------- 1 | #ifndef RUN_TESTS 2 | #define RUN_TESTS 3 | 4 | #include "FindNextJunctionTests.h" 5 | #include "GetReadJunctionsTests.h" 6 | 7 | #endif -------------------------------------------------------------------------------- /utils/tests/JunctionTests.h: -------------------------------------------------------------------------------- 1 | #ifndef JUNCTION_TESTS 2 | #define JUNCTION_TESTS 3 | 4 | #include "../Junction.h" 5 | 6 | void runJunctionTests(); 7 | 8 | #endif -------------------------------------------------------------------------------- /src/wget_urls: -------------------------------------------------------------------------------- 1 | ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR034/SRR034939/SRR034939_1.fastq.gz 2 | ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR034/SRR034939/SRR034939_2.fastq.gz 3 | -------------------------------------------------------------------------------- /utils/tests/JunctionMapTests.h: -------------------------------------------------------------------------------- 1 | #ifndef JUNCTIONMAP_TESTS 2 | #define JUNCTIONMAP_TESTS 3 | 4 | #include "../JunctionMap.h" 5 | 6 | void runJunctionMapTests(); 7 | 8 | #endif -------------------------------------------------------------------------------- /src/tests/FullTeset.cpp: -------------------------------------------------------------------------------- 1 | #include "FullTest.h" 2 | 3 | string read1 = "ACGTTCG"; 4 | string read2 = "ACGTACGTTTT"; 5 | string read3 = "TTGCG"; 6 | 7 | void runFullTest(){ 8 | readScanner 9 | } -------------------------------------------------------------------------------- /utils/tests/BloomTests.h: -------------------------------------------------------------------------------- 1 | #ifndef BLOOM_TESTS 2 | #define BLOOM_TESTS 3 | 4 | #include "../Bloom.h" 5 | 6 | namespace bloomTests{ 7 | 8 | void runBloomTests(); 9 | 10 | } 11 | 12 | #endif 
-------------------------------------------------------------------------------- /utils/tests/KmerTests.h: -------------------------------------------------------------------------------- 1 | #ifndef KMER_TESTS 2 | #define KMER_TESTS 3 | 4 | #include "TestUtils.h" 5 | #include "../Kmer.h" 6 | 7 | namespace kmerTests{ 8 | 9 | void runKmerTests(); 10 | 11 | } 12 | 13 | #endif -------------------------------------------------------------------------------- /utils/JuncPairs.cpp: -------------------------------------------------------------------------------- 1 | #include "JuncPairs.h" 2 | 3 | bool operator<(JuncResult a, JuncResult b){ 4 | return a.distance < b.distance; 5 | } 6 | 7 | bool operator>(JuncResult a, JuncResult b){ 8 | return a.distance > b.distance; 9 | } -------------------------------------------------------------------------------- /utils/tests/JCheckTests.h: -------------------------------------------------------------------------------- 1 | #ifndef JCHECK_TESTS 2 | #define JCHECK_TESTS 3 | 4 | #include "TestUtils.h" 5 | #include "../Bloom.h" 6 | #include "../JChecker.h" 7 | 8 | namespace jCheckTests { 9 | 10 | void runJCheckTests(); 11 | 12 | } 13 | 14 | #endif -------------------------------------------------------------------------------- /utils/tests/JunctionTests.cpp: -------------------------------------------------------------------------------- 1 | #include "JunctionTests.h" 2 | 3 | void createJunction_testOnePath(){ 4 | 5 | } 6 | 7 | void updateJunction_testNewPath(){ 8 | 9 | } 10 | 11 | 12 | void runJunctionTests(){ 13 | printf("No junction tests yet.\n"); 14 | } -------------------------------------------------------------------------------- /utils/tests/RollingHashTests.h: -------------------------------------------------------------------------------- 1 | #ifndef INCREMENTAL_HASH_TESTS 2 | #define INCREMENTAL_HASH_TESTS 3 | 4 | #include "TestUtils.h" 5 | #include "../Bloom.h" 6 | 7 | namespace rollingHashTests{ 8 | 9 | void 
runRollingHashTests(); 10 | 11 | } 12 | 13 | #endif -------------------------------------------------------------------------------- /utils/tests/RunTests.h: -------------------------------------------------------------------------------- 1 | #ifndef RUN_TESTS 2 | #define RUN_TESTS 3 | 4 | #include "KmerTests.h" 5 | #include "JCheckTests.h" 6 | #include "RollingHashTests.h" 7 | #include "BloomTests.h" 8 | #include "JunctionTests.h" 9 | #include "JunctionMapTests.h" 10 | 11 | #endif -------------------------------------------------------------------------------- /src/tests/olderTests/TraverseReadsTests.h: -------------------------------------------------------------------------------- 1 | #ifndef TRAVERSE_READS_TESTS 2 | #define TRAVERSE_READS_TESTS 3 | 4 | #include "TestUtils.h" 5 | #include "../ReadScanner.h" 6 | 7 | namespace traverseReadsTests{ 8 | 9 | void runTraverseReadsTests(); 10 | 11 | } 12 | 13 | #endif -------------------------------------------------------------------------------- /src/tests/olderTests/FindNextJunctionTests.h: -------------------------------------------------------------------------------- 1 | #ifndef FIND_JUNCTION_TESTS 2 | #define FIND_JUNCTION_TESTS 3 | 4 | #include "TestUtils.h" 5 | #include "../ReadScanner.h" 6 | 7 | namespace findNextJunctionTests{ 8 | 9 | void runFindNextJunctionTests(); 10 | 11 | } 12 | 13 | #endif -------------------------------------------------------------------------------- /src/tests/olderTests/GetReadJunctionsTests.h: -------------------------------------------------------------------------------- 1 | #ifndef GET_READ_JUNCTIONS_TESTS 2 | #define GET_READ_JUNCTIONS_TESTS 3 | 4 | #include "TestUtils.h" 5 | #include "../ReadScanner.h" 6 | 7 | namespace findReadJunctionsTests{ 8 | 9 | void runFindReadJunctionsTests(); 10 | 11 | } 12 | 13 | #endif -------------------------------------------------------------------------------- /src/tests/RunTests.cpp: 
-------------------------------------------------------------------------------- 1 | #include "RunTests.h" 2 | 3 | //g++ ../Bloom.cpp ../Kmer.cpp ../Debloom.cpp JCheckTests.cpp KmerTests.cpp TestUtils.cpp RunTests.cpp -o RunTests 4 | 5 | int main(int argc, char *argv[]){ 6 | 7 | findNextJunctionTests::runFindNextJunctionTests(); 8 | findReadJunctionsTests::runFindReadJunctionsTests(); 9 | return 0; 10 | } -------------------------------------------------------------------------------- /utils/Cap.h: -------------------------------------------------------------------------------- 1 | #ifndef CAP 2 | #define CAP 3 | 4 | #include <fstream> 5 | #include "Kmer.h" 6 | using std::ofstream; 7 | //A distance paired with the ID of a junction (the two public fields below) 8 | class Cap{ 9 | public: 10 | int dist; 11 | kmer_type lastJunc; 12 | //Writes dist and lastJunc to *jFile as labelled text 13 | void writeToFile(ofstream* jFile); 14 | //Returns a new Cap whose distance is this cap's dist plus the given distance, with juncID as its junction 15 | Cap extend(int dist, kmer_type juncID); 16 | 17 | Cap(int distance,kmer_type juncID); 18 | }; 19 | 20 | #endif -------------------------------------------------------------------------------- /utils/simple_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #simple assembly test of a synthetic 10 K genome, to verify not completely broken 3 | 4 | rm -f t10.contigs.fa 5 | 6 | ./minia test/read50x_ref10K_e001.fasta 25 3 10000 t10 &> /dev/null 7 | 8 | 9 | diff t10.contigs.fa test/result10K.fasta > /dev/null 10 | 11 | var=$? 
12 | 13 | if [ $var -eq 0 ] 14 | then 15 | echo Test PASSED 16 | exit 0 17 | else 18 | echo Test FAILED 19 | exit 1 20 | fi 21 | -------------------------------------------------------------------------------- /utils/tests/RunTests.cpp: -------------------------------------------------------------------------------- 1 | #include "RunTests.h" 2 | 3 | //g++ ../Bloom.cpp ../Kmer.cpp ../Debloom.cpp JCheckTests.cpp KmerTests.cpp TestUtils.cpp RunTests.cpp -o RunTests 4 | 5 | int main(int argc, char *argv[]){ 6 | 7 | kmerTests::runKmerTests(); 8 | rollingHashTests::runRollingHashTests(); 9 | jCheckTests::runJCheckTests(); 10 | runJunctionTests(); 11 | runJunctionMapTests(); 12 | bloomTests::runBloomTests(); 13 | 14 | return 0; 15 | } -------------------------------------------------------------------------------- /src/tests/makefile: -------------------------------------------------------------------------------- 1 | CFLAGS+= -O4 -D_FILE_OFFSET_BITS=64 # needed to handle files > 2 GB on 32 bits systems 2 | SRC= PairFinderTest.cpp ../Contig.cpp 3 | 4 | EXEC=minkTests 5 | OBJ= $(SRC:.cpp=.o) 6 | 7 | all: 8 | $(MAKE) $(EXEC) 9 | 10 | minkTests: $(OBJ) PairFinderTest.cpp 11 | cd .. 
&& $(MAKE) 12 | g++ --std=c++0x $(SRC) -o minkTests $(CFLAGS) 13 | 14 | %.o: %.cpp %.h 15 | g++ --std=c++0x -o $@ -c $< $(CFLAGS) 16 | 17 | install: 18 | cp minkTests /usr/local/bin 19 | -------------------------------------------------------------------------------- /utils/tests/makefile: -------------------------------------------------------------------------------- 1 | CFLAGS+= -O4 -D_FILE_OFFSET_BITS=64 # needed to handle files > 2 GB on 32 bits systems 2 | SRC=../JChecker.cpp ../Bloom.cpp ../Kmer.cpp TestUtils.cpp JunctionTests.cpp JunctionMapTests.cpp JCheckTests.cpp KmerTests.cpp RollingHashTests.cpp BloomTests.cpp RunTests.cpp 3 | EXEC=miniaTests 4 | OBJ= $(SRC:.cpp=.o) 5 | 6 | all: 7 | $(MAKE) $(EXEC) 8 | 9 | miniaTests: $(OBJ) RunTests.cpp 10 | g++ --std=c++0x $(SRC) -o RunTests 11 | 12 | %.o: %.cpp %.h 13 | g++ --std=c++0x -o $@ -c $< $(CFLAGS) 14 | 15 | install: 16 | cp miniaTests /usr/local/bin 17 | -------------------------------------------------------------------------------- /utils/tests/TestUtils.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #ifndef TEST_UTILS 4 | #define TEST_UTILS 5 | 6 | #include "../Kmer.h" 7 | #include "../Bloom.h" 8 | 9 | using std::string; 10 | extern kmer_type test_kmer; 11 | 12 | kmer_type getKmerFromString(std::string kmerString); 13 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i); 14 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2); 15 | 16 | Bloom* loadBloom(string list[], int numKmers, int k); 17 | 18 | void fail(char* testName, char* errorMessage); 19 | void fail(char* testName); 20 | 21 | void succeed(char* testName); 22 | 23 | #endif -------------------------------------------------------------------------------- /src/tests/TestUtils.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #ifndef TEST_UTILS 4 | #define TEST_UTILS 5 | 6 | #include "../../utils/Kmer.h" 
7 | #include "../../utils/Bloom.h" 8 | 9 | using std::string; 10 | extern kmer_type test_kmer; 11 | 12 | kmer_type getKmerFromString(std::string kmerString); 13 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i); 14 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2); 15 | 16 | Bloom* loadBloom(string list[], int numKmers, int k); 17 | 18 | void fail(char* testName, char* errorMessage); 19 | void fail(char* testName); 20 | 21 | void succeed(char* testName); 22 | 23 | #endif -------------------------------------------------------------------------------- /utils/DoubleKmer.h: -------------------------------------------------------------------------------- 1 | #ifndef DOUBLE_KMER 2 | #define DOUBLE_KMER 3 | 4 | #include "Kmer.h" 5 | 6 | class DoubleKmer{ 7 | 8 | public: 9 | kmer_type kmer; 10 | kmer_type revcompKmer; 11 | 12 | void forward(int nuc); 13 | 14 | //Takes as input the nucleotide extension and the direction 15 | //If the direction is BACKWARD, the nucleotide should be given as seen in the reverse direction 16 | //It will not be complemented within the function. 17 | kmer_type getExtension(int nuc, bool dir); 18 | 19 | kmer_type getCanon(); 20 | 21 | void reverse(); 22 | 23 | DoubleKmer(kmer_type forwardKmer); 24 | }; 25 | #endif -------------------------------------------------------------------------------- /utils/Cap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Cap.h" 3 | using std::ofstream; 4 | using std::max; 5 | 6 | void writeToFile(ofstream* jFile); 7 | 8 | //Kmer, then "ext,ext,ext,ext" then "cov,cov,cov,cov" for each of A,C,T,G in order. 
9 | void Cap::writeToFile(ofstream*jFile){ 10 | *jFile <<"Distance: " << dist << " "; 11 | *jFile <<"Last ID: " << (long long) lastJunc << " "; 12 | } 13 | //Returns a cap that is extraDistance further along and ends at juncID; this cap is left unchanged 14 | Cap Cap::extend(int extraDistance, kmer_type juncID){ 15 | return Cap(extraDistance+dist, juncID); //returned by value: the former *(new Cap(...)) copied out of a heap object that was never freed 16 | } 17 | 18 | //Builds a cap from a distance and the ID of its last junction 19 | Cap::Cap(int distance, kmer_type juncID){ 20 | dist=distance; 21 | lastJunc = juncID; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/BfSearchResult.h: -------------------------------------------------------------------------------- 1 | #ifndef BF_SEARCH_RESULT 2 | #define BF_SEARCH_RESULT 3 | 4 | //Stores all the info involved in moving from one node to another in the graph by doing a bloom scan 5 | //Also used for scanning junction to junction 6 | struct BfSearchResult{ 7 | BfSearchResult() : kmer(-1), isNode(false), index(-1), distance(0){} //kmer == -1 tells whether a BfSearchResult has been set or was just declared; the other fields get fixed defaults so an unset result never carries indeterminate values 8 | BfSearchResult(kmer_type km, bool node, int i, int d, string cont) : kmer(km), isNode(node), index(i), distance(d), contig(cont){} 9 | kmer_type kmer; 10 | bool isNode; //either node/junction or sink 11 | int index; //index from it that points back to the start point 12 | int distance; //how far away it was 13 | string contig; //the contig! 
14 | }; 15 | 16 | 17 | #endif -------------------------------------------------------------------------------- /utils/JChecker.h: -------------------------------------------------------------------------------- 1 | #ifndef JCHECKER 2 | #define JCHECKER 3 | 4 | #include "Bloom.h" 5 | #include "Kmer.h" 6 | 7 | class JChecker 8 | { 9 | private: 10 | Bloom* bloom; 11 | 12 | //for incremental hashing 13 | uint64_t ** lastHashes; 14 | uint64_t ** nextHashes; 15 | uint64_t ** tempor; 16 | uint64_t nextHash0, nextHash1; 17 | 18 | //to store the kmers in the BFS 19 | kmer_type* lastKmers; 20 | kmer_type* nextKmers; 21 | kmer_type* temp; 22 | 23 | public: 24 | int j; //value of j! 25 | bool jcheck(char* kmerSeq, uint64_t nextH0, uint64_t nextH1);//incremental version 26 | bool jcheck(kmer_type kmer);//normal version 27 | JChecker(int jVal, Bloom* bloo); 28 | }; 29 | #endif -------------------------------------------------------------------------------- /utils/DoubleKmer.cpp: -------------------------------------------------------------------------------- 1 | #include "DoubleKmer.h" 2 | 3 | #include 4 | 5 | void DoubleKmer::forward(int nuc){ 6 | kmer = next_kmer(kmer, nuc, FORWARD); 7 | revcompKmer = next_kmer(revcompKmer, revcomp_int(nuc), BACKWARD); 8 | } 9 | 10 | kmer_type DoubleKmer::getExtension(int nuc, bool direction){ 11 | if(direction == FORWARD){ 12 | return next_kmer(kmer, nuc, FORWARD); 13 | } 14 | else{ 15 | return next_kmer(revcompKmer, nuc, FORWARD); 16 | } 17 | } 18 | 19 | kmer_type DoubleKmer::getCanon(){ 20 | return std::min(kmer, revcompKmer); 21 | } 22 | 23 | void DoubleKmer::reverse(){ 24 | kmer_type temp = kmer; 25 | kmer = revcompKmer; 26 | revcompKmer = temp; 27 | } 28 | 29 | DoubleKmer::DoubleKmer(kmer_type forwardKmer){ 30 | kmer = forwardKmer; 31 | revcompKmer = revcomp(kmer); 32 | } 33 | -------------------------------------------------------------------------------- /src/ContigIterator.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CONTIG_ITERATOR 2 | #define CONTIG_ITERATOR 3 | 4 | #include "ContigGraph.h" 5 | #include "Contig.h" 6 | #include "ContigNode.h" 7 | #include "../utils/Kmer.h" 8 | #include 9 | #include "../utils/sparsepp.h" 10 | using spp::sparse_hash_map; 11 | 12 | 13 | class ContigIterator{ 14 | private: 15 | ContigGraph* graph; 16 | std::unordered_map::iterator nodeIt; 17 | // sparse_hash_map::iterator nodeIt; 18 | int index; 19 | Contig* findNextContig(); //gets the contig but doesn't increment it and index 20 | void increment(); //increments index and nodeIt to point to next possible contig 21 | public: 22 | ContigIterator(ContigGraph* graph); 23 | Contig* getContig(); //gets contig, moves pointer to next contig or end 24 | bool hasNextContig(); 25 | }; 26 | 27 | #endif -------------------------------------------------------------------------------- /src/stream_data_from_urls_list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Runs Faucet 3 | #Takes 4 parameter- 4 | #1) the output file prefix and a 5 | #2) file listing the input file URLs 6 | #3) estimated kmers 7 | #4) singletons 8 | 9 | URL_FILE=$2 10 | READ_COMMAND=wget\ --read-timeout=5\ --timeout=15\ -t\ 0\ -qO-\ -i\ $URL_FILE\ \|\ gzip\ -d\ -c\ -q 11 | 12 | eval "./faucet -read_load_file <($READ_COMMAND) -read_scan_file <($READ_COMMAND) -size_kmer 31 -max_read_length 130 -estimated_kmers $3 -singletons $4 -file_prefix $1 --fastq --high_cov" 13 | 14 | # eval "./faucet -size_kmer 27 \ 15 | # -max_read_length 130 \ 16 | # -estimated_kmers 3000000000 \ 17 | # -read_load_file <($READ_COMMAND) \ 18 | # -file_prefix $1 \ 19 | # --two_hash \ 20 | # --just_load_bloom \ 21 | # --fastq " 22 | 23 | # eval "./faucet -size_kmer 27 \ 24 | # -max_read_length 130 \ 25 | # -estimated_kmers 3000000000 \ 26 | # -read_scan_file <($READ_COMMAND) \ 27 | # -bloom_file $1.bloom \ 28 | # 
-file_prefix $1 \ 29 | # --two_hash \ 30 | # --fastq \ 31 | # --no_cleaning" -------------------------------------------------------------------------------- /src/disk_mem_used: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Roye Rozov, 3 | # based on script by Marc Garcia-Garcera and Rayan Chikhi 4 | # usage: diskused [program arg1 arg2 ...] 5 | 6 | "$@" & 7 | pid=$! 8 | status=$(ps -o rss -o vsz -o pid | grep $pid) 9 | maxdisk=0 10 | deleted=0 11 | maxmem=0 12 | while [ "${#status}" -gt "0" ]; 13 | do 14 | sleep 1 15 | delta=false 16 | disk=$(cat /proc/$pid/io | grep -P '^write_bytes:' | awk '{print $2}') 17 | disk=$((disk/1024)) 18 | if [ "0$disk" -gt "0$maxdisk" ] 2>/dev/null; then 19 | maxdisk=$disk 20 | delta=true 21 | fi 22 | mem=$(ps -o rss -o vsz -o pid | grep $pid | awk '{print $1}') 23 | # echo mem: $mem 24 | mem=$((mem/1024)) 25 | if [ "0$mem" -gt "0$maxmem" ] 2>/dev/null; then 26 | maxmem=$mem 27 | delta=true 28 | fi 29 | # only print if at least one value changed 30 | if $delta; then 31 | (>&2 echo disk: $disk) 32 | (>&2 echo mem: $mem) 33 | fi 34 | status=$(ps -o rss -o vsz -o pid | grep $pid) 35 | done 36 | wait $pid 37 | ret=$? 
38 | 39 | (>&2 echo "maximal disk used: $maxdisk KB") 40 | (>&2 echo "maximal memory used: $maxmem MB") 41 | 42 | 43 | exit $ret 44 | -------------------------------------------------------------------------------- /src/Faucet.h: -------------------------------------------------------------------------------- 1 | #ifndef MINK_MAIN 2 | #define MINK_MAIN 3 | 4 | #include "../utils/Bloom.h" 5 | #include "../utils/Kmer.h" 6 | #include "../utils/Junction.h" 7 | #include "../utils/JChecker.h" 8 | #include "ReadScanner.h" 9 | #include "ContigNode.h" 10 | #include "Contig.h" 11 | #include "ReadScanner.h" 12 | #include "ContigGraph.h" 13 | //NOTE(review): the variables below are defined (not merely declared) in this header, so including it from more than one translation unit would produce duplicate-symbol link errors - confirm it is only ever included by a single source file 14 | float fpRate = .04; 15 | int j = 1; 16 | 17 | string read_load_file; 18 | string read_scan_file; 19 | string bloom_input_file; 20 | string junctions_input_file; 21 | string short_pair_filter_file; 22 | string long_pair_filter_file; 23 | 24 | int read_length; 25 | uint64_t estimated_kmers; 26 | uint64_t singletons; 27 | 28 | 29 | // required arguments: 30 | bool load_file_flag = false; 31 | bool scan_file_flag = false; 32 | bool k_val_flag = false; 33 | bool max_len_flag = false; 34 | bool est_kmers_flag = false; 35 | bool est_sing_flag = false; 36 | bool pref_flag = false; 37 | 38 | // optional arguments: 39 | bool two_hash = false; 40 | bool from_bloom = false; 41 | bool from_junctions = false; 42 | bool just_load = false; 43 | bool fastq = false; 44 | bool mercy = false; 45 | bool node_graph = false; 46 | bool paired_ends = false; 47 | bool no_cleaning = false; 48 | int maxSpacerDist = 100; //max is 128, smaller --> more frequent spacers, bigger --> less frequent. 
Measured in base pairs 49 | int64_t nb_reads; 50 | bool high_cov = false; 51 | 52 | set all_kmers; 53 | string file_prefix; 54 | 55 | #endif -------------------------------------------------------------------------------- /utils/LargeInt.h: -------------------------------------------------------------------------------- 1 | /* 2 | * arbitrary-precision integer library 3 | * very limited: only does what minia needs (but not what minia deserves) 4 | */ 5 | 6 | #ifndef LargeInt_h 7 | #define LargeInt_h 8 | 9 | #include 10 | #include 11 | 12 | template 13 | class LargeInt 14 | { 15 | 16 | public: 17 | uint64_t array[precision]; 18 | LargeInt(const uint64_t &); 19 | LargeInt(); 20 | 21 | // overloading 22 | LargeInt operator+(const LargeInt &) const; 23 | LargeInt operator-(const LargeInt &) const; 24 | LargeInt operator*(const int &) const; 25 | LargeInt operator/(const uint32_t &) const; 26 | uint32_t operator%(const uint32_t &) const; 27 | LargeInt operator^(const LargeInt &) const; 28 | LargeInt operator&(const LargeInt &) const; 29 | LargeInt operator~() const; 30 | LargeInt operator<<(const int &) const; 31 | LargeInt operator>>(const int &) const; 32 | bool operator!=(const LargeInt &) const; 33 | bool operator==(const LargeInt &) const; 34 | bool operator<(const LargeInt &) const; 35 | bool operator<=(const LargeInt &) const; 36 | 37 | // custom 38 | uint64_t toInt() const; 39 | #ifdef _LP64 40 | __uint128_t toInt128() const; 41 | #endif 42 | 43 | 44 | // c++ fun fact: 45 | // "const" will ban the function from being anything which can attempt to alter any member variables in the object. 
46 | }; 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /utils/JuncPairs.h: -------------------------------------------------------------------------------- 1 | #ifndef JUNC_PAIR_SEARCH 2 | #define JUNC_PAIR_SEARCH 3 | 4 | #include "Kmer.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | using std::stringstream; 10 | 11 | struct JuncPair{ 12 | JuncPair(kmer_type km1, kmer_type km2): kmer1(km1), kmer2(km2){} 13 | kmer_type kmer1; 14 | kmer_type kmer2; 15 | friend bool operator==(JuncPair a, JuncPair b) { 16 | return a.kmer1 == b.kmer1 && a.kmer2 == b.kmer2; 17 | }; 18 | 19 | }; 20 | 21 | namespace std { 22 | template <> struct hash 23 | { 24 | size_t operator()(const JuncPair & x) const 25 | { 26 | return (std::hash()(x.kmer1) ^ (std::hash()(x.kmer2) << 1) >> 1); 27 | } 28 | }; 29 | } 30 | 31 | 32 | //Stores all the info involved in finding a junction candidate by searching from a node 33 | class JuncResult{ 34 | public: 35 | JuncResult(kmer_type km, int dist, int cov): kmer(km), distance(dist), coverage(cov){} 36 | kmer_type kmer; 37 | int distance; 38 | int coverage; 39 | friend bool operator<(JuncResult a, JuncResult b); 40 | friend bool operator>(JuncResult a, JuncResult b); 41 | }; 42 | 43 | //Stores all the info involved in finding a junction candidate by searching from a node 44 | struct JuncPairResult{ 45 | JuncPairResult(JuncPair p, int dist, int cov): pair(p), distance(dist), coverage(cov){} 46 | JuncPair pair; 47 | int distance; 48 | int coverage; 49 | }; 50 | #endif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Tel Aviv University 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the Tel Aviv University nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /src/ContigIterator.cpp: -------------------------------------------------------------------------------- 1 | #include "ContigIterator.h" 2 | 3 | ContigIterator::ContigIterator(ContigGraph* theGraph){ 4 | graph = theGraph; 5 | nodeIt = graph->getNodeMap()->begin(); 6 | index = 0; 7 | findNextContig(); 8 | } 9 | 10 | Contig* ContigIterator::getContig(){ 11 | if(nodeIt != graph->getNodeMap() -> end()){ 12 | Contig* result = nodeIt->second.contigs[index]; 13 | increment(); 14 | findNextContig(); 15 | return result; 16 | } 17 | } 18 | 19 | void ContigIterator::increment(){ 20 | if(index < 4){ 21 | index++; 22 | } 23 | else{ 24 | if(nodeIt != graph->getNodeMap()->end()){ 25 | index = 0; 26 | nodeIt++; 27 | } 28 | } 29 | } 30 | 31 | Contig* ContigIterator::findNextContig(){ 32 | for( ; nodeIt != graph->getNodeMap()->end(); nodeIt++){ 33 | ContigNode* node = &nodeIt->second; 34 | for(index %= 5; index < 5; index++){//if index is 5, reset to 0. 
Otherwise use it as is 35 | if(node->contigs[index]){ 36 | Contig* contig = node->contigs[index]; 37 | if(!contig->node1_p || !contig->node2_p){ //if one side is a sink, always return 38 | return contig; 39 | } 40 | else if(contig->node1_p == contig->node2_p){ //if the contig attaches to the same node twice, print when you see lower index 41 | if(index == contig->getMinIndex()){ 42 | return contig; 43 | } 44 | } 45 | else if(contig->getSide(node,index) == 1){ //If it attaches to two distinct nodes, print when you're on side 1 46 | return contig; 47 | } 48 | } 49 | } 50 | } 51 | return NULL; 52 | } 53 | 54 | bool ContigIterator::hasNextContig(){ 55 | return nodeIt != graph->getNodeMap()->end(); 56 | } 57 | -------------------------------------------------------------------------------- /src/tests/TestUtils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include // for max/min 10 | #include // for sorting_kmers 11 | #include 12 | #include 13 | #include "TestUtils.h" 14 | using std::string; 15 | 16 | using namespace std; 17 | int64_t nb_reads; 18 | kmer_type test_kmer; 19 | //Note: nuc order is ACTG = 0123 20 | 21 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i){ 22 | char* kmerSeq = new char[sizeKmer]; 23 | code2seq(kmer, kmerSeq); 24 | for(int pos = 0; pos < sizeKmer; pos++){ 25 | if(read[i+pos] != kmerSeq[pos]) return false; 26 | } 27 | return true; 28 | } 29 | 30 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2){ 31 | char* kmerSeq1 = new char[sizeKmer]; 32 | code2seq(kmer1, kmerSeq1); 33 | char* kmerSeq2 = new char[sizeKmer]; 34 | code2seq(kmer2, kmerSeq2); 35 | int length = min(sizeKmer - i1, sizeKmer - i2); 36 | for(int pos = 0; pos < length; pos++){ 37 | if(kmerSeq1[i1+pos] != kmerSeq2[i2+pos]) return false; 38 | } 39 | return true; 40 | } 41 | 42 | kmer_type 
getKmerFromString(string kmerString){ 43 | kmer_type kmer; 44 | getFirstKmerFromRead(&kmer, &(kmerString[0])); 45 | return kmer; 46 | } 47 | 48 | Bloom* loadBloom(string list[], int numKmers, int k){ 49 | Bloom* fakeBloom = new Bloom((uint64_t)10000, k); 50 | 51 | std::set valids; 52 | 53 | kmer_type kmer; 54 | for(int i = 0; i < numKmers; i++){ 55 | valids.insert(getKmerFromString(list[i])); 56 | } 57 | fakeBloom->fakify(valids); 58 | return fakeBloom; 59 | } 60 | 61 | void fail(char* testName, char* errorMessage){ 62 | printf("%s: %s \n", testName, errorMessage); 63 | } 64 | 65 | 66 | void fail(char* testName){ 67 | printf("%s: fail. \n", testName); 68 | } 69 | 70 | void succeed(char* testName){ 71 | printf("%s: success! \n", testName); 72 | } 73 | -------------------------------------------------------------------------------- /utils/tests/TestUtils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include // for max/min 10 | #include // for sorting_kmers 11 | #include 12 | #include 13 | #include "TestUtils.h" 14 | using std::string; 15 | 16 | using namespace std; 17 | int64_t nb_reads; 18 | kmer_type test_kmer; 19 | //Note: nuc order is ACTG = 0123 20 | 21 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i){ 22 | char* kmerSeq = new char[sizeKmer]; 23 | code2seq(kmer, kmerSeq); 24 | for(int pos = 0; pos < sizeKmer; pos++){ 25 | if(read[i+pos] != kmerSeq[pos]) return false; 26 | } 27 | return true; 28 | } 29 | 30 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2){ 31 | char* kmerSeq1 = new char[sizeKmer]; 32 | code2seq(kmer1, kmerSeq1); 33 | char* kmerSeq2 = new char[sizeKmer]; 34 | code2seq(kmer2, kmerSeq2); 35 | int length = min(sizeKmer - i1, sizeKmer - i2); 36 | for(int pos = 0; pos < length; pos++){ 37 | if(kmerSeq1[i1+pos] != kmerSeq2[i2+pos]) return false; 38 | } 39 | return 
true; 40 | } 41 | 42 | kmer_type getKmerFromString(string kmerString){ 43 | kmer_type kmer; 44 | getFirstKmerFromRead(&kmer, &(kmerString[0])); 45 | return kmer; 46 | } 47 | 48 | Bloom* loadBloom(string list[], int numKmers, int k){ 49 | Bloom* fakeBloom = new Bloom((uint64_t)10000, k); 50 | 51 | std::set valids; 52 | 53 | kmer_type kmer; 54 | for(int i = 0; i < numKmers; i++){ 55 | valids.insert(getKmerFromString(list[i])); 56 | } 57 | fakeBloom->fakify(valids); 58 | return fakeBloom; 59 | } 60 | 61 | void fail(char* testName, char* errorMessage){ 62 | printf("%s: %s \n", testName, errorMessage); 63 | } 64 | 65 | 66 | void fail(char* testName){ 67 | printf("%s: fail. \n", testName); 68 | } 69 | 70 | void succeed(char* testName){ 71 | printf("%s: success! \n", testName); 72 | } 73 | -------------------------------------------------------------------------------- /utils/Junction.h: -------------------------------------------------------------------------------- 1 | #ifndef JUNCTION 2 | #define JUNCTION 3 | 4 | #include 5 | #include 6 | #include "Kmer.h" 7 | using std::string; 8 | using std::ofstream; 9 | 10 | class Junction{ 11 | private: 12 | unsigned char cov[4]; //the number of reads along this extension 13 | 14 | public: 15 | //The following three fields are indexed by a value from 0-4. If the value is from 0-3, it indicates the forward extension from adding 16 | //A, C, T, or G, respectively. If the index is 4, it refers to the backwards direction. 
17 | unsigned char dist[5]; //the distance to the next adjacent junction, or the farthest scanned as of yet without hitting another junction 18 | bool linked[5]; //whether or not we found another junction along this extension 19 | 20 | //Returns an index that points to a valid path in the direction opposite the direction of the given index 21 | //If input is 4, it returns the valid path of 0-4 22 | //If input is not 0,1,2,3, returns 3 23 | int getOppositeIndex(int index); 24 | int numPathsOut(); //Returns the number of forward paths out of the junction with positive coverage 25 | bool isSolid(int threshold); //Returns true if at least 2 paths out of the junction have at least a threshold coverage. 26 | 27 | 28 | //"dist dist dist dist dist cov cov cov cov cov linked linked linked linked linked " for each of A,C,T,G,Back, in order. 29 | string toString(); 30 | 31 | //Updates the junction to point to given distance, if it's greater than the current distance stored. 32 | void update(int nucExt, unsigned char length); 33 | 34 | 35 | //Returns coverage along given extnsion 36 | //If nucExt == 4, returns the sum of the four coverage fields 37 | int getCoverage(int nucExt); 38 | 39 | void setCoverage(int nucExt, int coverage); 40 | 41 | 42 | //Increments the coverage along the given extension by 1. 43 | void addCoverage(int nucExt); 44 | 45 | //Sets linked to true along the given extension. 46 | void link(int nucExt); 47 | 48 | //Initializes with 0 coverage, 0 distance, and linked false. 
49 | Junction(); 50 | 51 | //Get junction from string printout 52 | Junction(string juncString); 53 | }; 54 | 55 | #endif -------------------------------------------------------------------------------- /utils/ReadKmer.h: -------------------------------------------------------------------------------- 1 | #ifndef READ_KMER 2 | #define READ_KMER 3 | 4 | #include 5 | 6 | #include "Kmer.h" 7 | #include "DoubleKmer.h" 8 | 9 | using std::string; 10 | 11 | //Used to represent a kmer with relation to a read. 12 | //Stores the read, the kmer and revcomp as a DoubleKmer and the position on the read (left end of the kmer). 13 | class ReadKmer{ 14 | public: 15 | //basic fields 16 | string* read; 17 | DoubleKmer doubleKmer; 18 | int pos; 19 | bool direction; 20 | 21 | //The real extension of the ReadKmer definitely j-checks to at least the return value, based on its position on the read. 22 | int getMaxGuaranteedJ(bool dir); 23 | 24 | int getDistToEnd(); //returns dist to end 25 | int getTotalPos(); //returns dist to start 26 | 27 | //returns true if the kmer is at a valid position on the read. Does not include the first backward kmer or the last forward kmer. 28 | bool onRead(); 29 | 30 | kmer_type getKmer(); 31 | kmer_type getRevCompKmer(); 32 | 33 | //moves one position forward. If facing BACKWARD, simply changes the kmer to face FORWARD. 34 | //This may entail going from facing BACKWARD to FORWARD 35 | void forward(); 36 | void advanceDist(int dist); //calls forward repeatedly 37 | 38 | //Gets the index on a corresponding junction which corresponds to going along the read in the given direction 39 | //e.g. if the ReadKmer faces forward and direction is BACKWARD, returns 4. 
40 | //If the ReadKmer faces forward and the direction is FORWARD and the next real nucleotide on the read is 'T', returns 2 41 | int getExtensionIndex(bool direction); 42 | kmer_type getExtension(int newNuc);// Gets the next extension in the direction its facing, for the given nucleotide extension 43 | kmer_type getRealExtension();// gets the next real kmer in the direction the ReadKmer is pointing 44 | int getRealExtensionNuc(); //gets the next real nucleotide in the direction the ReadKmer is pointing 45 | kmer_type getCanon(); 46 | int offset(); //gets the contribution of the direction to the distance- 0 if BACKWARD, 1 if FORWARD 47 | 48 | char* directionAsString(); //for printing 49 | 50 | ReadKmer(string* theRead); //initializes the DoubleKmer to refer to the first kmer in the read 51 | ReadKmer(string* theRead, int index, bool dir);//Creates a double kmer corresponding to the given read, the index into the read, and the direction 52 | ReadKmer(ReadKmer* toCopy); //copy construct 53 | 54 | }; 55 | 56 | 57 | 58 | #endif -------------------------------------------------------------------------------- /utils/tests/JCheckTests.cpp: -------------------------------------------------------------------------------- 1 | #include "JCheckTests.h" 2 | #include 3 | #include 4 | using std::string; 5 | 6 | namespace jCheckTests{ 7 | 8 | string fake_read1 = "ACGGGCGAACTTTCATAGGA"; 9 | string fake_read2 = "GGCGAACTAGTCCAT"; 10 | string fake_read3 = "AACTTTCATACGATT"; 11 | Bloom* bloom; 12 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT" 13 | ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG" 14 | , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT", "CGATT"}; 15 | JChecker* jchecker; 16 | 17 | bool jcheck(string kmer, int j){ 18 | jchecker = new JChecker(j, bloom); 19 | uint64_t hash0 = bloom->get_rolling_hash(getKmerFromString(kmer),0); 20 | uint64_t hash1 = 
bloom->get_rolling_hash(getKmerFromString(kmer),1); 21 | return jchecker->jcheck(&kmer[0],hash0,hash1); 22 | } 23 | 24 | void jcheck_testForwardJ1NoExtension(){ 25 | char* testName = (char*)"jcheck_testForwardJ1WithNoExtension"; 26 | 27 | bool jchecked = jcheck("TCCAT", 1); 28 | 29 | if(jchecked){ 30 | fail(testName,(char*)"It thought there was an extension."); 31 | return; 32 | } 33 | succeed(testName); 34 | } 35 | 36 | void jcheck_testForwardJ1WithExtension(){ 37 | char* testName = (char*)"jcheck_testForwardJ1WithExtension"; 38 | 39 | bool jchecked = jcheck("GTCCA",1); 40 | 41 | if(!jchecked){ 42 | fail(testName, (char*)"It thought there was no extension."); 43 | return; 44 | } 45 | succeed(testName); 46 | } 47 | 48 | void jcheck_testForwardJ2WithExtension(){ 49 | char* testName = (char*)"jcheck_testForwardJ2WithExtension"; 50 | 51 | bool jchecked = jcheck("GAACT",2); 52 | 53 | if(!jchecked){ 54 | fail(testName, (char*)"It thought there was no extension."); 55 | return; 56 | } 57 | succeed(testName); 58 | } 59 | 60 | void jcheck_testForwardJ2WithNoExtension(){ 61 | char* testName = (char*)"jcheck_testForwardJ2WithNoExtension"; 62 | 63 | bool jchecked = jcheck("ACGAT", 2); 64 | 65 | if(jchecked){ 66 | fail(testName, (char*)"It thought there was an extension."); 67 | return; 68 | } 69 | succeed(testName); 70 | } 71 | 72 | void runJCheckTests(){ 73 | setSizeKmer(5); 74 | bloom = loadBloom(valid_5mers, 28,5); 75 | 76 | jcheck_testForwardJ1WithExtension(); 77 | jcheck_testForwardJ1NoExtension(); 78 | jcheck_testForwardJ2WithExtension(); 79 | jcheck_testForwardJ2WithNoExtension(); 80 | } 81 | 82 | } -------------------------------------------------------------------------------- /utils/ContigJuncList.h: -------------------------------------------------------------------------------- 1 | #ifndef CONTIG_JUNC_LIST 2 | #define CONTIG_JUNC_LIST 3 | 4 | class JuncResult; 5 | 6 | #include 7 | #include "JuncPairs.h" 8 | 9 | //Stores info about all interior junctions 10 | 
//List of incremental distances and coverages 11 | //Coverages include coverage at each end 12 | class ContigJuncList{ 13 | 14 | public: 15 | 16 | typedef std::vector junc_list; 17 | typedef junc_list::const_iterator const_iterator; 18 | 19 | 20 | ContigJuncList(std::string seq, junc_list dist, junc_list cov); 21 | ContigJuncList(); 22 | 23 | const_iterator begin_distances() const{ return distances.begin();} 24 | const_iterator begin_coverages() const{ return coverages.begin();} 25 | const_iterator end_distances() const{ return distances.end();} 26 | const_iterator end_coverages() const{ return coverages.end();} 27 | void setSeq(std::string cont){seq = cont;} 28 | std::string getSeq(){ return seq;} 29 | bool isValidKmerPosition(int pos);//true iff getKmer makes sense on this 30 | kmer_type getKmer(int pos);//0 = first backward, 1 = first forward, 2 = second backward, etc. 31 | 32 | //Gets a list of JuncResults, specifying distance, coverage, and kmer for juncs, with reference to specified side 33 | std::list getJuncResults(bool startForward, int startDist, int maxDist); 34 | void printJuncResults(int side, int startDist, int maxDist); 35 | void printJuncResults(std::list results); 36 | void printJuncValues(); 37 | int size(); 38 | 39 | //Used for reversing a contig. 
Simply reverses both lists 40 | void reverse(); 41 | 42 | int length(); //returns length of sequence 43 | 44 | //Sums all distance values 45 | int getTotalDistance(); 46 | 47 | //Concatenates this list of juncs with another 48 | //Removes overlap of middle coverage and middle distance 49 | ContigJuncList concatenate(ContigJuncList otherList); 50 | 51 | //Averages all coverage values in list 52 | double getAvgCoverage(); 53 | double getAvgCoverage(std::list results); 54 | 55 | double getCoverageSampleVariance(); 56 | double getCoverageSampleVariance(std::list results); 57 | ContigJuncList getScaledContigJuncs(double scale_factor); 58 | ContigJuncList getShiftedCoverageContigJuncs(double shift); 59 | ContigJuncList getShiftedCoverageContigJuncsRange(double shift, int maxDist, int side); 60 | 61 | 62 | //Prints distances then coverages to a string 63 | std::string getStringRep(); 64 | 65 | private: 66 | junc_list distances; 67 | junc_list coverages; 68 | std::string seq; //string sequence, represented forward from side 1 to side 2 69 | }; 70 | 71 | 72 | 73 | #endif -------------------------------------------------------------------------------- /utils/Junction.cpp: -------------------------------------------------------------------------------- 1 | #include "Junction.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | using std::ofstream; 7 | using std::max; 8 | using std::istringstream; 9 | using std::stringstream; 10 | 11 | int Junction::getOppositeIndex(int index){ 12 | if(index < 4){ 13 | return 4; 14 | } 15 | else{ 16 | for(int i = 0; i < 4; i++){ 17 | if(cov[i] > 0){ 18 | return i; 19 | } 20 | } 21 | } 22 | } 23 | 24 | int Junction::numPathsOut(){ 25 | int numPaths = 0; 26 | for(int i = 0; i < 4; i++){ 27 | if(cov[i] > 0){ 28 | numPaths++; 29 | } 30 | } 31 | return numPaths; 32 | } 33 | 34 | void Junction::link(int nucExt){ 35 | linked[nucExt] = true; 36 | } 37 | 38 | bool Junction::isSolid(int threshold){ 39 | int pathsOut = 0; 40 | for(int i = 0; i < 4; 
i++){ 41 | if(cov[i] >= threshold){ 42 | pathsOut++; 43 | } 44 | } 45 | return pathsOut > 1; 46 | } 47 | 48 | int Junction::getCoverage(int nucExt){ 49 | if(nucExt < 4){ 50 | return (int)cov[nucExt]; 51 | } 52 | return (int)cov[0] + (int)cov[1] + (int)cov[2] + (int)cov[3]; 53 | } 54 | 55 | void Junction::setCoverage(int nucExt, int coverage){ 56 | cov[nucExt] = coverage; 57 | } 58 | 59 | void Junction::addCoverage(int nucExt){ 60 | cov[nucExt] = cov[nucExt] + 1; 61 | 62 | //handle overflow 63 | if(cov[nucExt] == 0){ 64 | cov[nucExt] = UCHAR_MAX; 65 | } 66 | } 67 | 68 | //Updates the junc info based on finding a path of length length from the extension nucExt 69 | void Junction::update(int nucExt, unsigned char lengthFor){ 70 | dist[nucExt] = max(dist[nucExt], lengthFor); 71 | } 72 | 73 | //"dist dist dist dist dist cov cov cov cov cov linked linked linked linked linked " for each of A,C,T,G,Back, in order. 74 | string Junction::toString(){ 75 | stringstream stream; 76 | for(int i = 0; i < 5; i++){ 77 | stream << (int)dist[i] << " " ; 78 | } 79 | stream << " "; 80 | for(int i = 0; i < 5; i++){ 81 | stream << getCoverage(i) << " " ; 82 | } 83 | stream << " "; 84 | for(int i = 0; i < 5; i++){ 85 | stream << linked[i] << " " ; 86 | } 87 | return stream.str(); 88 | } 89 | 90 | //explicitly set if it's a spacer or not 91 | Junction::Junction(){ 92 | for(int i = 0; i < 4; i++){ 93 | dist[i] = 0; 94 | cov[i] = 0; 95 | linked[i] = false; 96 | } 97 | dist[4] = 0; 98 | linked[4] = false; 99 | } 100 | 101 | //Get junction from string printout 102 | Junction::Junction(string juncString){ 103 | istringstream iss(juncString); 104 | string val; 105 | for(int i = 0; i < 5; i++){ 106 | iss >> val; 107 | dist[i] = stoi(val); 108 | } 109 | for(int i = 0; i < 4; i++){ 110 | iss >> val; 111 | cov[i] = stoi(val); 112 | } 113 | iss >> val; 114 | for(int i = 0; i < 5; i++){ 115 | iss >> val; 116 | linked[i] = stoi(val); 117 | } 118 | } 119 | 120 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | You can download Faucet [here](https://github.com/Shamir-Lab/Faucet/releases/download/v0.5/Faucet-v0.5.zip) or clone it via the link below. In case you download the zip, unzip the file before following the instructions below (ignoring the 'git clone' line) 2 | 3 | # Getting Faucet 4 | git clone https://github.com/rozovr/Faucet.git 5 | cd Faucet/src 6 | make depend 7 | make 8 | 9 | # Running Faucet (locally) 10 | Example usage: 11 | 12 | ```bash 13 | ./faucet -read_load_file interlaced_reads.fq \ 14 | -read_scan_file interlaced_reads.fq \ 15 | -size_kmer 31 \ 16 | -max_read_length 100 \ 17 | -estimated_kmers 1000000000 \ 18 | -singletons 200000000 \ 19 | -file_prefix faucet_outputs \ 20 | --fastq \ 21 | --paired_ends 22 | ``` 23 | 24 | The above command takes as input the file interlaced_reads.fq (where entries alternate between mates 1 and 2 of a paired end library), and the input format is fastq. Faucet does not accept separate mate files, but can accept fasta format and files composed of read sequences alone. 25 | 26 | # Streaming from a remote source 27 | A demonstration streaming reads from a remote server is provided in the script src/stream_data_from_urls_list.sh 28 | 29 | You can run it with: 30 | ```bash 31 | ./stream_data_from_urls_list.sh out wget_urls 1596741569 12045222 32 | ``` 33 | where `wget_urls` is a file with URLs downloaded from ENA, 34 | `1596741569` is the estimated number of unique kmers (F0) and `12045222` is the estimated number of singleton kmers (f1). 35 | 36 | # Requirements 37 | Faucet was implemented in C++ 11, so requires a compiler that is not too ancient to support it, and has been tested only on Linux so far. 
38 | 39 | # Detailed usage 40 | 41 | Usage: 42 | ./faucet -read_load_file -read_scan_file -size_kmer -max_read_length -estimated_kmers -singletons -file_prefix 43 | Optional arguments: --fastq -max_spacer_dist -fp rate -j --two_hash -bloom_file -junctions_file --paired_ends --no_cleaning 44 | 45 | ### Required arguments: 46 | 47 | -read_load_file , a file name string 48 | -read_scan_file , a file name string 49 | -size_kmer , an odd integer <= 31 50 | -max_read_length , the longest read length in the data (e.g., if the reads were trimmed to different sizes) 51 | -estimated_kmers 52 | -singletons 53 | -file_prefix , the desired prefix string or directory path for output files 54 | 55 | We recommend using ntCard to estimate the number of unique k-mers (F0) and singleton k-mers (f1) in the dataset. 56 | 57 | 58 | License 59 | ======= 60 | 61 | 62 | * Low level code for dealing with binary encoded k-mers and strings, and for Bloom filters is derived from the original minia implementation, http://minia.genouest.org/; these components, mostly unmodified, are distributed under a GPL 3.0 license 63 | 64 | * Original code is distributed under the BSD 3 clause license. 
65 | -------------------------------------------------------------------------------- /src/tests/olderTests/GetReadJunctionsTests.cpp: -------------------------------------------------------------------------------- 1 | #include "GetReadJunctionsTests.h" 2 | #include 3 | #include 4 | using std::map; 5 | 6 | namespace findReadJunctionsTests { 7 | 8 | string fake_read1 = "ACGGGCGAACTTTCATAGGA"; 9 | string fake_read2 = "GGCGAACTAGTCCAT"; 10 | string fake_read3 = "AACTTTCATACGATT"; 11 | string fake_read4 = "CGCGCGCGCGC"; 12 | Bloom* bloom; 13 | ReadScanner* scanner; 14 | 15 | //this is all the kmers from the reads plus two error kmers that cause a 16 | //TACGA --> ACGATT, ACGAAA branch (fake of length 2) 17 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT" 18 | ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG" 19 | , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT","CGATT", "ACGAC", "CGACA" 20 | , "CGCGC", "GCGCG"}; 21 | 22 | Junction* getReadJunction(int pos, int dir, Junction** readJunctions){ 23 | return readJunctions[2*pos + dir]; 24 | } 25 | 26 | void testFindReadJunctions_NoJunctions(){ 27 | char* testName = (char*)"testFindReadJunctions_NoJunctions"; 28 | 29 | scanner->find_read_junctions(fake_read4); 30 | 31 | Junction** readJunctions = scanner->get_read_junctions(); 32 | 33 | for(int i = 0; i < fake_read4.length(); i++){ 34 | if(getReadJunction(i,0,readJunctions) || getReadJunction(i,1,readJunctions) ){ 35 | fail(testName); 36 | return; 37 | } 38 | } 39 | succeed(testName); 40 | } 41 | 42 | void testFindReadJunctions_2Junctions(){ 43 | char* testName = (char*)"testFindReadJunctions_2Junctions"; 44 | 45 | scanner->find_read_junctions(fake_read1);//expect junction at 6 and 12 46 | 47 | Junction** readJunctions = scanner->get_read_junctions(); 48 | 49 | if(!getReadJunction(6,0, readJunctions)){ 50 | fail(testName, (char*)"Didn't find the forward junction at 6."); 
51 | return; 52 | } 53 | if(!getReadJunction(12,0, readJunctions)){ 54 | fail(testName, (char*)"Didn't find the forward junction at 12"); 55 | } 56 | succeed(testName); 57 | } 58 | 59 | void testFindReadJunctions_backwardJunction(){ 60 | char* testName = (char*)"testFindReadJunctions_BackwardJunction"; 61 | string backwardRead = fake_read1; 62 | revcomp_sequence(&backwardRead[0], backwardRead.length()); 63 | 64 | scanner->find_read_junctions(backwardRead);//expect junction at 6 and 12 65 | 66 | Junction** readJunctions = scanner->get_read_junctions(); 67 | 68 | if(!getReadJunction(3,1, readJunctions)){ 69 | fail(testName, (char*)"Didn't find the backward junction at 3."); 70 | return; 71 | } 72 | if(!getReadJunction(9,1, readJunctions)){ 73 | fail(testName, (char*)"Didn't find the backward junction at 9"); 74 | } 75 | succeed(testName); 76 | } 77 | 78 | void runFindReadJunctionsTests(){ 79 | setSizeKmer(5); 80 | bloom = loadBloom(valid_5mers,31,5); 81 | scanner = new ReadScanner("mockfile", bloom, new JChecker(0, bloom)); 82 | 83 | testFindReadJunctions_NoJunctions(); 84 | testFindReadJunctions_2Junctions(); 85 | testFindReadJunctions_backwardJunction(); 86 | } 87 | 88 | } -------------------------------------------------------------------------------- /src/tests/olderTests/FindNextJunctionTests.cpp: -------------------------------------------------------------------------------- 1 | #include "FindNextJunctionTests.h" 2 | #include 3 | 4 | namespace findNextJunctionTests 5 | { 6 | 7 | string fake_read1 = "ACGGGCGAACTTTCATAGGA"; 8 | string fake_read2 = "GGCGAACTAGTCCAT"; 9 | string fake_read3 = "AACTTTCATACGATT"; 10 | Bloom* bloom; 11 | ReadScanner* scanner; 12 | 13 | //this is all the kmers from the reads plus two error kmers that cause a 14 | //TACGA --> ACGATT, ACGAAA branch (fake of length 2) 15 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT" 16 | ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", 
"AACTA","ACTAG" 17 | , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT","CGATT", "ACGAC", "CGACA"}; 18 | 19 | void findNextJunction_J1_testFromStart(){ 20 | //int* pos, kmer_type * kmer, string read, int j, Bloom* bloo1 21 | char* testName = (char*)"findNextJunction_J1_testFromStart"; 22 | int pos = 0; 23 | kmer_type kmer = getKmerFromString("ACGGG"); 24 | scanner = new ReadScanner("mockFile", bloom, new JChecker(1, bloom)); 25 | scanner->resetHashes(kmer); 26 | 27 | Junction* junc = scanner->find_next_junction(&pos, &kmer, fake_read1); 28 | 29 | if(pos != 6){ 30 | fail(testName, (char*)"pos was incorrect."); 31 | return; 32 | } 33 | if(kmer != getKmerFromString("GAACT")){ 34 | fail(testName, (char*)"kmer was incorrect."); 35 | return; 36 | } 37 | succeed(testName); 38 | } 39 | 40 | void findNextJunction_J2_testOffEnd(){ 41 | //int* pos, kmer_type * kmer, string read, int j, Bloom* bloo1 42 | char* testName = (char*)"findNextJunction_J2_testOffEnd"; 43 | int pos = 13; 44 | kmer_type kmer = getKmerFromString("CATAG"); 45 | scanner = new ReadScanner("mockFile", bloom, new JChecker(2, bloom)); 46 | scanner->resetHashes(kmer); 47 | 48 | Junction* junc = scanner->find_next_junction(&pos, &kmer, fake_read1); 49 | 50 | if(junc){ 51 | fail(testName, (char*)"Returned a junction."); 52 | return; 53 | } 54 | if(pos != 15 ){ 55 | printf("Position %d\n", pos); 56 | fail(testName, (char*)"Incorrect position."); 57 | return; 58 | } 59 | succeed(testName); 60 | } 61 | void findNextJunction_J1_testAtJunction(){ 62 | //int* pos, kmer_type * kmer, string read, int j, Bloom* bloo1 63 | char* testName = (char*)"findNextJunction_J1_testAtJunction"; 64 | int pos = 13; 65 | kmer_type kmer = getKmerFromString("CATAG"); 66 | scanner = new ReadScanner("mockFile", bloom, new JChecker(2, bloom)); 67 | scanner->getJunctionMap()->createJunction(kmer); 68 | scanner->resetHashes(kmer); 69 | 70 | Junction* junc = scanner->find_next_junction(&pos, &kmer, fake_read1); 71 
| 72 | if(pos != 13 ){ 73 | fail(testName, (char*)"Did not return initial position."); 74 | return; 75 | } 76 | succeed(testName); 77 | } 78 | 79 | void runFindNextJunctionTests(){ 80 | setSizeKmer(5); 81 | bloom = loadBloom(valid_5mers,30,5); 82 | 83 | findNextJunction_J1_testFromStart(); 84 | findNextJunction_J2_testOffEnd(); 85 | findNextJunction_J1_testAtJunction(); 86 | } 87 | 88 | } -------------------------------------------------------------------------------- /src/tests/olderTests/TraverseReadsTests.cpp: -------------------------------------------------------------------------------- 1 | #include "TraverseReadsTests.h" 2 | #include 3 | #include 4 | using std::map; 5 | 6 | namespace traverseReadsTests { 7 | 8 | string fake_read1 = "ACGGGCGAACTTTCATAGGA"; 9 | string fake_read2 = "GGCGAACTAGTCCAT"; 10 | string fake_read3 = "AACTTTCATACGATT"; 11 | Bloom* bloom; 12 | 13 | //this is all the kmers from the reads plus two error kmers that cause a 14 | //TACGA --> ACGATT, ACGAAA branch (fake of length 2) 15 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT" 16 | ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG" 17 | , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT","CGATT", "ACGAC", "CGACA"}; 18 | 19 | 20 | void printJunctionMap(){ 21 | printf("Size: %d \n", scanner->getJunctionMap().size()); 22 | for (auto& kv : scanner->getJunctionMap()){ 23 | printf("%s \n", print_kmer(kv.first)); 24 | } 25 | } 26 | void traverseReads(int j){ 27 | scanner->setJ(j); 28 | scanner->smart_traverse_read(fake_read1); 29 | scanner->smart_traverse_read(fake_read2); 30 | scanner->smart_traverse_read(fake_read3); 31 | } 32 | 33 | void testTraverseReads_J0(){ 34 | char* testName = (char*)"testTraverseReads_J0"; 35 | scanner = new ReadScanner("mockfile", bloom); 36 | 37 | traverseReads(0); 38 | 39 | if(scanner->getJunctionMap().size() != 4){ 40 | fail(testName, (char*)"junction map size was 
wrong."); 41 | printJunctionMap(); 42 | return; 43 | } 44 | succeed(testName); 45 | } 46 | 47 | 48 | void testTraverseReads_J1(){ 49 | char* testName = (char*)"testTraverseReads_J1"; 50 | scanner = new ReadScanner("mockfile", bloom); 51 | 52 | traverseReads(1); 53 | 54 | if(scanner->getJunctionMap().size() != 4){ 55 | fail(testName, (char*)"junction map size was wrong."); 56 | printJunctionMap(); 57 | return; 58 | } 59 | succeed(testName); 60 | } 61 | 62 | 63 | void testTraverseReads_J2(){ 64 | char* testName = (char*)"testTraverseReads_J2"; 65 | scanner = new ReadScanner("mockfile", bloom); 66 | 67 | traverseReads(2); 68 | 69 | if(scanner->getJunctionMap().size() != 3){ 70 | fail(testName, (char*)"junction map size was wrong."); 71 | printJunctionMap(); 72 | return; 73 | } 74 | succeed(testName); 75 | } 76 | 77 | void testTraverseReadTwice_SameJuncs(){ 78 | char* testName = (char*)"testTraverseReadsTwice_SameJuncs"; 79 | scanner = new ReadScanner("mockfile", bloom); 80 | 81 | traverseReads(1); 82 | traverseReads(1); 83 | 84 | if(scanner->getJunctionMap().size() != 4){ 85 | fail(testName, (char*)"junction map size was wrong."); 86 | printJunctionMap(); 87 | return; 88 | } 89 | succeed(testName); 90 | } 91 | 92 | void runTraverseReadsTests(){ 93 | setSizeKmer(5); 94 | bloom = loadBloom(valid_5mers,30,5); 95 | 96 | testTraverseReads_J0(); 97 | testTraverseReads_J1(); 98 | testTraverseReads_J2(); 99 | 100 | testTraverseReadTwice_SameJuncs(); 101 | } 102 | 103 | } -------------------------------------------------------------------------------- /utils/JChecker.cpp: -------------------------------------------------------------------------------- 1 | #include "JChecker.h" 2 | #include 3 | 4 | //incremental version 5 | //j = 0 always returns true 6 | //j > 0 checks extensions up to j deep from kmer, and returns true if there is a sequence of j extensions 7 | //which returns all positive in the bloom filter 8 | bool JChecker::jcheck(char* kmerSeq, uint64_t nextH0, uint64_t 
nextH1){ 9 | if(j == 0){ 10 | return true; 11 | } 12 | uint64_t workingHash0, workingHash1; 13 | int lastCount, nextCount; 14 | lastCount = 1; 15 | lastHashes[0][0] = nextH0; 16 | lastHashes[0][1] = nextH1; 17 | 18 | for(int i = 0; i < j; i++){//for each level up to j 19 | nextCount = 0; //have found no extensions yet 20 | for(int k = 0; k < lastCount; k++){ //for each kmer in the last level 21 | workingHash0 = lastHashes[k][0]; 22 | workingHash1 = lastHashes[k][1]; 23 | for(int nt = 0; nt < 4; nt++){ //for each possible extension 24 | nextHash0 = bloom->roll_hash(workingHash0, NT2int(kmerSeq[i]), nt, 0); 25 | nextHash1 = bloom->roll_hash(workingHash1, NT2int(kmerSeq[i]), nt, 1); 26 | if(bloom->contains(nextHash0, nextHash1)){ //add to next level if it's in the bloom filter 27 | if(i == (j-1)){ 28 | return true;//if this is the last level return true after the first check 29 | } 30 | nextHashes[nextCount][0] = nextHash0; 31 | nextHashes[nextCount][1] = nextHash1; 32 | nextCount++; 33 | } 34 | } 35 | } 36 | 37 | if(nextCount == 0){ //if there are no kmers in the list now, return false 38 | return false; 39 | } 40 | //reset counts and lists for next level of th search 41 | lastCount = nextCount; 42 | 43 | tempor = lastHashes; 44 | lastHashes = nextHashes; 45 | nextHashes = tempor; 46 | } 47 | } 48 | 49 | //Normal version of jchecking, without rolling hash. 50 | //Old hash! use only for old hash! 
For kpomerscanner 51 | bool JChecker::jcheck(kmer_type kmer){ 52 | kmer_type this_kmer, nextKmer; 53 | int lastCount, nextCount; 54 | 55 | lastCount = 1; 56 | lastKmers[0] = kmer; 57 | 58 | for(int i = 0; i < j; i++){ //for up to j levels 59 | nextCount = 0; 60 | for(int k = 0; k < lastCount; k++){ //for every kmer in the last level 61 | this_kmer = lastKmers[k]; 62 | for(int nt = 0; nt < 4; nt++){ //for every possible extension 63 | nextKmer = next_kmer(this_kmer, nt, FORWARD); 64 | if(bloom->oldContains(get_canon(nextKmer))){//add any positive extensions to the next level 65 | nextKmers[nextCount] = nextKmer; 66 | nextCount++; 67 | } 68 | } 69 | } 70 | if(nextCount == 0){ 71 | return false; //if there are ever no valid kmers in the next level, the kmer does not jcheck 72 | } 73 | lastCount = nextCount; 74 | //switch the pointers to the "last" and "next" arrays so we can use the current "next" one as the next "last" one 75 | temp = lastKmers; 76 | lastKmers = nextKmers; 77 | nextKmers = temp; 78 | } 79 | return true; 80 | } 81 | 82 | JChecker::JChecker(int jVal, Bloom* bloo){ 83 | j = jVal; 84 | bloom = bloo; 85 | 86 | //all this is for rolling hash function.. 
not relevant now 87 | lastHashes = new uint64_t*[20000]; 88 | nextHashes = new uint64_t*[20000]; 89 | for(int i = 0; i < 20000; i++){ 90 | lastHashes[i] = new uint64_t[2]; 91 | nextHashes[i] = new uint64_t[2]; 92 | } 93 | lastKmers = new kmer_type[1000]; 94 | nextKmers = new kmer_type[1000]; 95 | } -------------------------------------------------------------------------------- /src/Contig.h: -------------------------------------------------------------------------------- 1 | #ifndef CONTIG 2 | #define CONTIG 3 | 4 | #include "../utils/Kmer.h" 5 | #include "../utils/JuncPairs.h" 6 | #include "../utils/Bloom.h" 7 | #include "../utils/JuncPairs.h" 8 | #include "../utils/ContigJuncList.h" 9 | #include "ContigNode.h" 10 | #include 11 | #include 12 | 13 | using std::ofstream; 14 | 15 | class ContigNode; // forward declaration 16 | 17 | class Contig{ 18 | private: 19 | //utility for linking if they're both facing forward 20 | //Glues end 2 of this contig to end 1 of the other 21 | //Doesn't change the value of either contig- just returns the concatenation. 
22 | Contig* concatenate(Contig* otherContig); 23 | 24 | std::vector > getNeighbors(bool forward); 25 | 26 | public: 27 | 28 | ~Contig(); 29 | std::string getFastGName(bool RC); 30 | std::string getFastGHeader(bool RC); 31 | 32 | // length can be obtained from sequence 33 | ContigNode * node1_p; //adjacent node on side 1 34 | ContigNode * node2_p; //adjacent node on side 2 35 | unsigned char ind1; //index on which it connects to the node, on side 1 36 | unsigned char ind2; //index on which it connects to the node, on side 2 37 | bool marked; 38 | 39 | //list of coverage and distance for interior junctions along this contig- since we can use for pair BF and coverage info 40 | ContigJuncList contigJuncs; 41 | 42 | bool checkValidity(); 43 | bool isDegenerateLoop();//returns true if both sides have same node and same index 44 | 45 | //Concatenates the two contigs, gluing together the specified sides 46 | Contig* concatenate(Contig* otherContig, int thisSide, int otherSide); 47 | std::pair getPairsMeanStd(Bloom* pair_filter); 48 | void printPairStatistics(Bloom* pair_filter); 49 | int length(); //returns length of sequence 50 | void reverse(); //reverses the contig orientation 51 | ContigNode* otherEndNode(ContigNode * oneEnd);//returns a pointer to the node at the other end 52 | void setEnds(ContigNode* n1, int i1, ContigNode* n2, int i2); 53 | void setIndices(int i1, int i2); 54 | void setSeq(std::string cont){contigJuncs.setSeq(cont);} 55 | std::string getSeq(){return contigJuncs.getSeq();} 56 | double getAvgCoverage(); 57 | double getAvgCoverage(std::list results); 58 | 59 | double getCoverageSampleVariance(); 60 | double getCoverageSampleVariance(std::list results); 61 | void setContigJuncs(ContigJuncList juncList){ contigJuncs = juncList;} 62 | std::list getJuncResults(int side, int startDist, int maxDist); 63 | 64 | //gets the node coverages on each end, returns minimum as base line for how much this should be covered. 
65 | //If the contig is isolated, returns 0 66 | int getTotalDistance(){ return contigJuncs.getTotalDistance(); } 67 | 68 | float getMass(); 69 | int getMinIndex(); 70 | kmer_type getNodeKmer(ContigNode * contigNode); //Assumes the given contig node points to one end of this contig 71 | kmer_type getSideKmer(int side); //either 1 or 2 72 | int getSide(ContigNode* node); 73 | int getSide(ContigNode* node, int index); 74 | void setMark(bool value); 75 | bool getMark(); 76 | ContigNode* getNode(int side); 77 | int getIndex(int side); 78 | bool isIsolated();//return true if both sides point to null 79 | void setSide(int side, ContigNode* node); 80 | std::string getStringRep(); 81 | Contig(); 82 | Contig( Contig * c); 83 | 84 | 85 | }; 86 | 87 | #endif -------------------------------------------------------------------------------- /utils/lut.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_H 2 | #define CODE_H 3 | 4 | 5 | //look up table conversion (with A,C,T,G <--> 0,1,2,3) 6 | 7 | //complement of one NT 8 | const unsigned char comp_NT[4] = { 9 | 2,3,0,1 10 | }; 11 | 12 | //reverse complement of 4NT, ie one byte 13 | const unsigned char revcomp_4NT[256] = { 14 | 0xaa, 15 | 0xea, 16 | 0x2a, 17 | 0x6a, 18 | 0xba, 19 | 0xfa, 20 | 0x3a, 21 | 0x7a, 22 | 0x8a, 23 | 0xca, 24 | 0xa, 25 | 0x4a, 26 | 0x9a, 27 | 0xda, 28 | 0x1a, 29 | 0x5a, 30 | 0xae, 31 | 0xee, 32 | 0x2e, 33 | 0x6e, 34 | 0xbe, 35 | 0xfe, 36 | 0x3e, 37 | 0x7e, 38 | 0x8e, 39 | 0xce, 40 | 0xe, 41 | 0x4e, 42 | 0x9e, 43 | 0xde, 44 | 0x1e, 45 | 0x5e, 46 | 0xa2, 47 | 0xe2, 48 | 0x22, 49 | 0x62, 50 | 0xb2, 51 | 0xf2, 52 | 0x32, 53 | 0x72, 54 | 0x82, 55 | 0xc2, 56 | 0x2, 57 | 0x42, 58 | 0x92, 59 | 0xd2, 60 | 0x12, 61 | 0x52, 62 | 0xa6, 63 | 0xe6, 64 | 0x26, 65 | 0x66, 66 | 0xb6, 67 | 0xf6, 68 | 0x36, 69 | 0x76, 70 | 0x86, 71 | 0xc6, 72 | 0x6, 73 | 0x46, 74 | 0x96, 75 | 0xd6, 76 | 0x16, 77 | 0x56, 78 | 0xab, 79 | 0xeb, 80 | 0x2b, 81 | 0x6b, 82 | 0xbb, 83 | 0xfb, 84 | 
0x3b, 85 | 0x7b, 86 | 0x8b, 87 | 0xcb, 88 | 0xb, 89 | 0x4b, 90 | 0x9b, 91 | 0xdb, 92 | 0x1b, 93 | 0x5b, 94 | 0xaf, 95 | 0xef, 96 | 0x2f, 97 | 0x6f, 98 | 0xbf, 99 | 0xff, 100 | 0x3f, 101 | 0x7f, 102 | 0x8f, 103 | 0xcf, 104 | 0xf, 105 | 0x4f, 106 | 0x9f, 107 | 0xdf, 108 | 0x1f, 109 | 0x5f, 110 | 0xa3, 111 | 0xe3, 112 | 0x23, 113 | 0x63, 114 | 0xb3, 115 | 0xf3, 116 | 0x33, 117 | 0x73, 118 | 0x83, 119 | 0xc3, 120 | 0x3, 121 | 0x43, 122 | 0x93, 123 | 0xd3, 124 | 0x13, 125 | 0x53, 126 | 0xa7, 127 | 0xe7, 128 | 0x27, 129 | 0x67, 130 | 0xb7, 131 | 0xf7, 132 | 0x37, 133 | 0x77, 134 | 0x87, 135 | 0xc7, 136 | 0x7, 137 | 0x47, 138 | 0x97, 139 | 0xd7, 140 | 0x17, 141 | 0x57, 142 | 0xa8, 143 | 0xe8, 144 | 0x28, 145 | 0x68, 146 | 0xb8, 147 | 0xf8, 148 | 0x38, 149 | 0x78, 150 | 0x88, 151 | 0xc8, 152 | 0x8, 153 | 0x48, 154 | 0x98, 155 | 0xd8, 156 | 0x18, 157 | 0x58, 158 | 0xac, 159 | 0xec, 160 | 0x2c, 161 | 0x6c, 162 | 0xbc, 163 | 0xfc, 164 | 0x3c, 165 | 0x7c, 166 | 0x8c, 167 | 0xcc, 168 | 0xc, 169 | 0x4c, 170 | 0x9c, 171 | 0xdc, 172 | 0x1c, 173 | 0x5c, 174 | 0xa0, 175 | 0xe0, 176 | 0x20, 177 | 0x60, 178 | 0xb0, 179 | 0xf0, 180 | 0x30, 181 | 0x70, 182 | 0x80, 183 | 0xc0, 184 | 0x0, 185 | 0x40, 186 | 0x90, 187 | 0xd0, 188 | 0x10, 189 | 0x50, 190 | 0xa4, 191 | 0xe4, 192 | 0x24, 193 | 0x64, 194 | 0xb4, 195 | 0xf4, 196 | 0x34, 197 | 0x74, 198 | 0x84, 199 | 0xc4, 200 | 0x4, 201 | 0x44, 202 | 0x94, 203 | 0xd4, 204 | 0x14, 205 | 0x54, 206 | 0xa9, 207 | 0xe9, 208 | 0x29, 209 | 0x69, 210 | 0xb9, 211 | 0xf9, 212 | 0x39, 213 | 0x79, 214 | 0x89, 215 | 0xc9, 216 | 0x9, 217 | 0x49, 218 | 0x99, 219 | 0xd9, 220 | 0x19, 221 | 0x59, 222 | 0xad, 223 | 0xed, 224 | 0x2d, 225 | 0x6d, 226 | 0xbd, 227 | 0xfd, 228 | 0x3d, 229 | 0x7d, 230 | 0x8d, 231 | 0xcd, 232 | 0xd, 233 | 0x4d, 234 | 0x9d, 235 | 0xdd, 236 | 0x1d, 237 | 0x5d, 238 | 0xa1, 239 | 0xe1, 240 | 0x21, 241 | 0x61, 242 | 0xb1, 243 | 0xf1, 244 | 0x31, 245 | 0x71, 246 | 0x81, 247 | 0xc1, 248 | 0x1, 249 | 0x41, 250 | 0x91, 251 | 0xd1, 252 | 0x11, 253 
| 0x51, 254 | 0xa5, 255 | 0xe5, 256 | 0x25, 257 | 0x65, 258 | 0xb5, 259 | 0xf5, 260 | 0x35, 261 | 0x75, 262 | 0x85, 263 | 0xc5, 264 | 0x5, 265 | 0x45, 266 | 0x95, 267 | 0xd5, 268 | 0x15, 269 | 0x55 270 | }; 271 | 272 | #endif 273 | -------------------------------------------------------------------------------- /src/newTests/ContigTest.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2005, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // A sample program demonstrating using Google C++ testing framework. 31 | // 32 | // Author: wan@google.com (Zhanyong Wan) 33 | 34 | 35 | // This sample shows how to write a simple unit test for a function, 36 | // using Google C++ testing framework. 37 | // 38 | // Writing a unit test using Google C++ testing framework is easy as 1-2-3: 39 | 40 | 41 | // Step 1. Include necessary header files such that the stuff your 42 | // test logic needs is declared. 43 | // 44 | // Don't forget gtest.h, which declares the testing framework. 45 | 46 | #include "../Contig.h" 47 | #include "gtest/gtest.h" 48 | 49 | 50 | // Step 2. Use the TEST macro to define your tests. 51 | // 52 | // TEST has two parameters: the test case name and the test name. 53 | // After using the macro, you should define your test logic between a 54 | // pair of braces. You can use a bunch of macros to indicate the 55 | // success or failure of a test. EXPECT_TRUE and EXPECT_EQ are 56 | // examples of such macros. For a complete list, see gtest.h. 57 | // 58 | // 59 | // 60 | // In Google Test, tests are grouped into test cases. This is how we 61 | // keep test code organized. You should put logically related tests 62 | // into the same test case. 63 | // 64 | // The test case name and the test name should both be valid C++ 65 | // identifiers. And you should not use underscore (_) in the names. 
66 | // 67 | // Google Test guarantees that each test you define is run exactly 68 | // once, but it makes no guarantee on the order the tests are 69 | // executed. Therefore, you should write your tests in such a way 70 | // that their results don't depend on their order. 71 | // 72 | // 73 | 74 | 75 | // TEST(ContigTest, Construct) { 76 | // Contig contig = Contig(); 77 | // EXPECT_EQ("", contig.getSeq()); 78 | // EXPECT_EQ(-1, contig.ind1); 79 | // EXPECT_EQ(-1, contig.ind2); 80 | // } 81 | 82 | 83 | // int main(int ac, char* av[]) 84 | // { 85 | // testing::InitGoogleTest(&ac, av); 86 | // return RUN_ALL_TESTS(); 87 | // } -------------------------------------------------------------------------------- /utils/rvalues.h: -------------------------------------------------------------------------------- 1 | static const double rvalues[129][2] = { 2 | { 0.00000, 5.29625 }, 3 | { 0.00000, 0.00000 }, 4 | { 0.00000, 0.00000 }, 5 | { 0.00000, 0.00000 }, 6 | { 0.00000, 0.00000 }, 7 | { 5.64856, 5.49117 }, 8 | { 5.85164, 5.52333 }, 9 | { 6.02772, 5.55385 }, 10 | { 6.18398, 5.58291 }, 11 | { 6.32492, 5.61065 }, 12 | { 6.45362, 5.63721 }, 13 | { 6.57225, 5.66268 }, 14 | { 6.68244, 5.68717 }, 15 | { 6.78544, 5.71075 }, 16 | { 6.88221, 5.73350 }, 17 | { 6.97353, 5.75548 }, 18 | { 7.06004, 5.77674 }, 19 | { 7.14227, 5.79733 }, 20 | { 7.22066, 5.81730 }, 21 | { 7.29558, 5.83669 }, 22 | { 7.36734, 5.85553 }, 23 | { 7.43623, 5.87386 }, 24 | { 7.50248, 5.89171 }, 25 | { 7.56630, 5.90910 }, 26 | { 7.62788, 5.92605 }, 27 | { 7.68737, 5.94260 }, 28 | { 7.74494, 5.95876 }, 29 | { 7.80069, 5.97455 }, 30 | { 7.85477, 5.98999 }, 31 | { 7.90726, 6.00510 }, 32 | { 7.95826, 6.01989 }, 33 | { 8.00786, 6.03437 }, 34 | { 8.05615, 6.04856 }, 35 | { 8.10319, 6.06247 }, 36 | { 8.14904, 6.07611 }, 37 | { 8.19378, 6.08950 }, 38 | { 8.23745, 6.10264 }, 39 | { 8.28012, 6.11555 }, 40 | { 8.32181, 6.12822 }, 41 | { 8.36260, 6.14068 }, 42 | { 8.40250, 6.15293 }, 43 | { 8.44157, 6.16497 }, 44 | { 
8.47983, 6.17682 }, 45 | { 8.51733, 6.18848 }, 46 | { 8.55409, 6.19995 }, 47 | { 8.59015, 6.21125 }, 48 | { 8.62553, 6.22238 }, 49 | { 8.66025, 6.23334 }, 50 | { 8.69435, 6.24413 }, 51 | { 8.72784, 6.25478 }, 52 | { 8.76075, 6.26527 }, 53 | { 8.79310, 6.27562 }, 54 | { 8.82490, 6.28582 }, 55 | { 8.85619, 6.29589 }, 56 | { 8.88697, 6.30582 }, 57 | { 8.91725, 6.31562 }, 58 | { 8.94707, 6.32530 }, 59 | { 8.97643, 6.33485 }, 60 | { 9.00534, 6.34428 }, 61 | { 9.03383, 6.35360 }, 62 | { 9.06189, 6.36280 }, 63 | { 9.08956, 6.37189 }, 64 | { 9.11683, 6.38088 }, 65 | { 9.14372, 6.38975 }, 66 | { 9.17024, 6.39853 }, 67 | { 9.19640, 6.40721 }, 68 | { 9.22221, 6.41578 }, 69 | { 9.24768, 6.42427 }, 70 | { 9.27282, 6.43265 }, 71 | { 9.29764, 6.44095 }, 72 | { 9.32214, 6.44916 }, 73 | { 9.34634, 6.45728 }, 74 | { 9.37024, 6.46532 }, 75 | { 9.39385, 6.47328 }, 76 | { 9.41717, 6.48115 }, 77 | { 9.44023, 6.48894 }, 78 | { 9.46301, 6.49666 }, 79 | { 9.48552, 6.50430 }, 80 | { 9.50778, 6.51186 }, 81 | { 9.52979, 6.51935 }, 82 | { 9.55156, 6.52677 }, 83 | { 9.57308, 6.53412 }, 84 | { 9.59437, 6.54140 }, 85 | { 9.61543, 6.54861 }, 86 | { 9.63627, 6.55576 }, 87 | { 9.65688, 6.56284 }, 88 | { 9.67729, 6.56986 }, 89 | { 9.69748, 6.57682 }, 90 | { 9.71747, 6.58371 }, 91 | { 9.73725, 6.59055 }, 92 | { 9.75684, 6.59732 }, 93 | { 9.77624, 6.60404 }, 94 | { 9.79544, 6.61070 }, 95 | { 9.81446, 6.61731 }, 96 | { 9.83330, 6.62386 }, 97 | { 9.85196, 6.63035 }, 98 | { 9.87045, 6.63680 }, 99 | { 9.88876, 6.64319 }, 100 | { 9.90691, 6.64953 }, 101 | { 9.92489, 6.65582 }, 102 | { 9.94271, 6.66206 }, 103 | { 9.96037, 6.66825 }, 104 | { 9.97787, 6.67439 }, 105 | { 9.99522, 6.68049 }, 106 | { 10.01241, 6.68654 }, 107 | { 10.02946, 6.69254 }, 108 | { 10.04637, 6.69850 }, 109 | { 10.06313, 6.70441 }, 110 | { 10.07975, 6.71029 }, 111 | { 10.09623, 6.71611 }, 112 | { 10.11258, 6.72190 }, 113 | { 10.12879, 6.72764 }, 114 | { 10.14488, 6.73335 }, 115 | { 10.16083, 6.73901 }, 116 | { 10.17665, 6.74463 }, 117 | { 
10.19235, 6.75022 }, 118 | { 10.20793, 6.75577 }, 119 | { 10.22339, 6.76127 }, 120 | { 10.23873, 6.76674 }, 121 | { 10.25394, 6.77218 }, 122 | { 10.26905, 6.77757 }, 123 | { 10.28404, 6.78293 }, 124 | { 10.29892, 6.78826 }, 125 | { 10.31369, 6.79355 }, 126 | { 10.32835, 6.79881 }, 127 | { 10.34290, 6.80403 }, 128 | { 10.35735, 6.80922 }, 129 | { 10.37169, 6.81437 }, 130 | { 10.38593, 6.81950 } 131 | }; 132 | -------------------------------------------------------------------------------- /src/ContigNode.h: -------------------------------------------------------------------------------- 1 | #ifndef CONTIGNODE 2 | #define CONTIGNODE 3 | 4 | class Contig; // forward declare to avoid circ. dependency 5 | // following http://www.cplusplus.com/forum/articles/10627/#msg49679 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "Contig.h" 14 | #include "../utils/Kmer.h" 15 | #include "../utils/JuncPairs.h" 16 | #include "../utils/Junction.h" 17 | using std::ofstream; 18 | // #include "../utils/sparsepp.h" 19 | // using spp::sparse_hash_map; 20 | 21 | 22 | class ContigNode{ 23 | 24 | public: 25 | unsigned char cov[4]; 26 | Contig * contigs[5]; 27 | 28 | ContigNode(Junction junction); 29 | ContigNode(); 30 | bool isInvertedRepeatNode(); 31 | 32 | 33 | bool checkValidity(); 34 | // returns 0 if target node not reached at up to max_dist, otherwise returns distance on branching path 35 | std::list doPathsConvergeNearby(int max_ind, int min_ind, int max_dist); 36 | 37 | 38 | //gets the neighbors of the specified contig- if contigIndex is 4, returns all forward neighbors 39 | //If contig index isn't 4, only returns back contig as a neighbor 40 | std::vector > getFastGNeighbors(int contigIndex); 41 | 42 | 43 | 44 | std::list getPairCandidates(int index, int maxDist); 45 | 46 | void replaceContig(Contig* oldContig, Contig* newContig); 47 | int numPathsOut(); 48 | int indexOf(Contig* contig); 49 | kmer_type getForwardExtension(int 
index); 50 | std::vector getIndicesOut(); 51 | int getCoverage(int nucExt); 52 | int getTotalCoverage();//returns getCoverage(4) 53 | void setCoverage(Junction junc); 54 | void setCoverage(int nucExt, int coverage); 55 | void update(int nucExt, Contig * contig); 56 | kmer_type getUniqueKmer(int index);//returns base kmer for backward, or extension for forward index 57 | 58 | //removes the given path out of this node. 59 | //Removes contig pointer, set coverage to 0 60 | void breakPath(int nucExt); 61 | void clearNode(); 62 | 63 | //for traversal 64 | bool hasNeighbor(int index); 65 | ContigNode* getNeighbor(int index); 66 | std::string getString(); 67 | kmer_type getKmer(); 68 | }; 69 | 70 | class NodeQueueEntry{ //contains all info for an entry in the queue for a node BFS 71 | public: 72 | ContigNode* node; 73 | int index; 74 | int startDist; 75 | 76 | NodeQueueEntry(ContigNode* n, int i, int s); 77 | NodeQueueEntry(); 78 | 79 | std::list getJuncResults(int m); //returns immediate junc results from contig along this index 80 | 81 | void addNeighbors(std::vector & queue); // , bool to_back); //searches forward one step, adds relevant nodes to the queue 82 | void recordParents(std::unordered_map& parents); 83 | // void recordParents(sparse_hash_map& parents); 84 | // std::list reconstructPathFromParents(std::unordered_map& parents); 85 | std::list reconstructPathFromParents(std::vector& parents); 86 | friend bool operator==(NodeQueueEntry a, NodeQueueEntry b) { 87 | return a.node == b.node && a.index == b.index && a.startDist == b.startDist; 88 | }; 89 | 90 | }; 91 | 92 | 93 | // struct MyHash { 94 | // size_t operator()(const NodeQueueEntry& x) const { return std::hash()(x.node->getUniqueKmer(x.index)); } 95 | // }; 96 | namespace std { 97 | template <> struct hash 98 | { 99 | size_t operator()(const NodeQueueEntry & x) const 100 | { 101 | return std::hash()(x.node->getUniqueKmer(x.index)); } 102 | }; 103 | } 104 | #endif 
-------------------------------------------------------------------------------- /src/ReadScanner.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include // for log2f 8 | #include // for max 9 | #include // for truncate 10 | #include 11 | // #include 12 | #include 13 | using std::string; 14 | // using std::unordered_map; 15 | using std::set; 16 | using std::ofstream; 17 | 18 | #ifndef READSCAN_H 19 | #define READSCAN_H 20 | 21 | #include "../utils/Bloom.h" 22 | #include "../utils/Kmer.h" 23 | #include "../utils/JChecker.h" 24 | #include "../utils/JunctionMap.h" 25 | #include "../utils/Junction.h" 26 | #include "../utils/ReadKmer.h" 27 | #include "../utils/Cap.h" 28 | #include "../utils/JuncPairs.h" 29 | 30 | #define DEBUGE(a) //printf a 31 | 32 | class ReadScanner{ 33 | 34 | private: 35 | int maxSpacerDist; // maximum distance in bases between two junctions (spacers added to bridge gaps) 36 | Bloom* bloom; 37 | Bloom* short_pair_filter; 38 | Bloom* long_pair_filter; 39 | set jcheckedSet; 40 | set nextRealSet; 41 | set > juncPairSet; 42 | set backwardSet; 43 | uint64_t hash0, hash1, 44 | nextHash0, nextHash1; 45 | string reads_file; 46 | 47 | uint64_t NbCandKmer, NbRawCandKmer, NbJCheckKmer, NbNoJuncs, 48 | NbSkipped, NbProcessed, readsProcessed, NbSolidKmer,readsNoErrors, 49 | NbJuncPairs, unambiguousReads; 50 | 51 | JChecker* jchecker; 52 | JunctionMap* junctionMap; 53 | 54 | //Should only be called on a read with no real junctions 55 | //Adds a fake junction in the middle and points it to the two ends. This ensures we have coverage of long linear regions, and that we capture 56 | //sinks at the end of such regions. 
57 | kmer_type add_fake_junction(string read); 58 | 59 | public: 60 | JunctionMap* getJunctionMap(); 61 | 62 | //Scans one input read; breaks into small segments and calls scan_forward 63 | //Returns back junctions along read from beginning to end 64 | std::list scanInputRead(string read, bool no_cleaning); 65 | 66 | void scanReads(bool fastq, bool paired_ends, bool no_cleaning); //scans all the reads. Fastq if fastq, otherwise fasta 67 | void printScanSummary(); //prints statistics from the readscan 68 | 69 | //Determines if the given ReadKmer is a junction. 70 | //If it's on the middle of the read, just verifies alternate extensions. 71 | //Special logic is needed to handle kmers that are near the ends, if j is not 0, to ensure that 72 | //the real extension seen on the read represents a valid, jcheckable option, and not a tip shorter than j. 73 | bool testForJunction(ReadKmer kmer); 74 | 75 | //Starting from the given kmer, scans forward until a junction is found or the end of the read is hit. 76 | //Returns true if a junction was found. The supplied ReadKmer is also adjusted to the position of the new junction,\ 77 | //or to the end of the read. 78 | bool find_next_junction(ReadKmer * kmer, int lastJuncPos); 79 | 80 | //Returns substrings of the read that are valid with BF and longer than sizeKmer 81 | std::list getValidReads(string read); 82 | 83 | //Scans a read. 84 | //Identifies all junctions on the read, and links adjacent junctions to each other. 85 | //Also updates the relevant distance field on the first junction to point to the start of the read, and on the last 86 | //Junction to point to the end of the read. 
87 | //If there are no junctions, add_fake_junction is called 88 | //Returns back junctions along the read from beginning to end 89 | std::list scan_forward(string read, bool no_cleaning); 90 | 91 | ReadScanner(JunctionMap* juncMap, string readFile, Bloom* bloom, Bloom* short_pair_filter, Bloom* long_pair_filter, JChecker* jchecker, int maxSpacerDist); 92 | }; 93 | #endif -------------------------------------------------------------------------------- /utils/ReadKmer.cpp: -------------------------------------------------------------------------------- 1 | #include "ReadKmer.h" 2 | #include 3 | #include 4 | #include 5 | 6 | using std::string; 7 | 8 | 9 | char* ReadKmer::directionAsString(){ 10 | if(direction == FORWARD){ 11 | return (char*)"forward"; 12 | } 13 | else{ 14 | return (char*)"backward"; 15 | } 16 | } 17 | 18 | int ReadKmer::getMaxGuaranteedJ(bool dir){ 19 | if(dir == FORWARD){ 20 | return getDistToEnd()/2-1; 21 | } 22 | else{ 23 | return getTotalPos()/2-1; 24 | } 25 | } 26 | 27 | //Returns number of forward operations needed to move to the last kmer on the read 28 | int ReadKmer::getDistToEnd(){ 29 | return read->length()*2- getTotalPos() - 2*sizeKmer+1; 30 | } 31 | 32 | //Returns the number of forward operations needed to go from the beginning of the read to this ReadKmer 33 | int ReadKmer::getTotalPos(){ 34 | return 2*pos + offset(); 35 | } 36 | 37 | bool ReadKmer::onRead(){ 38 | return (getTotalPos() >= 1) && (getDistToEnd()>=1); 39 | } 40 | 41 | kmer_type ReadKmer::getRevCompKmer(){ 42 | if(direction == FORWARD){ 43 | return doubleKmer.revcompKmer; 44 | } 45 | else{ 46 | return doubleKmer.kmer; 47 | } 48 | } 49 | 50 | kmer_type ReadKmer::getKmer(){ 51 | if(direction == FORWARD){ 52 | return doubleKmer.kmer; 53 | } 54 | else{ 55 | return doubleKmer.revcompKmer; 56 | } 57 | } 58 | 59 | //returns the offset of this kmer from the one at the same position facing backward. 60 | //Backward: 0 61 | //Forward: 1 62 | //used for calculating distances. 
63 | int ReadKmer::offset(){ 64 | if(direction == FORWARD){ 65 | return 1; 66 | } 67 | else{ 68 | return 0; 69 | } 70 | } 71 | 72 | void ReadKmer::forward(){ 73 | direction = !direction; 74 | if(direction == FORWARD){ 75 | return; //switching from facing backward to forward doesn't entail a shift 76 | } 77 | int newNuc = 0; 78 | if(pos + sizeKmer < read->length()){ 79 | newNuc = NT2int((*read)[pos + sizeKmer]); 80 | } 81 | doubleKmer.forward(newNuc); 82 | pos++; 83 | } 84 | 85 | void ReadKmer::advanceDist(int dist){ 86 | for(int i = 0; i < dist; i++){ 87 | forward(); 88 | } 89 | } 90 | 91 | kmer_type ReadKmer::getCanon(){ 92 | return doubleKmer.getCanon(); 93 | } 94 | 95 | int ReadKmer::getExtensionIndex(bool dir){ 96 | if(dir != direction){ 97 | return 4; //backward index 98 | } 99 | return getRealExtensionNuc(); 100 | } 101 | 102 | kmer_type ReadKmer::getExtension(int newNuc){ 103 | return doubleKmer.getExtension(newNuc, direction); 104 | } 105 | 106 | //can be used as an index for the junction 107 | int ReadKmer::getRealExtensionNuc(){ 108 | if(direction == FORWARD){ 109 | return NT2int((*read)[sizeKmer + pos]); 110 | } 111 | else{ 112 | return revcomp_int(NT2int((*read)[pos-1])); 113 | } 114 | } 115 | 116 | kmer_type ReadKmer::getRealExtension(){ 117 | return getExtension(getRealExtensionNuc()); 118 | } 119 | 120 | //Starts all the way at the front- facing off the read 121 | ReadKmer::ReadKmer(string* theRead): doubleKmer(0){ 122 | read = theRead; 123 | kmer_type kmer = 0; 124 | getFirstKmerFromRead(&kmer,&((*read)[0])); 125 | doubleKmer = DoubleKmer(kmer); 126 | pos = 0; 127 | direction = BACKWARD; 128 | } 129 | 130 | //Creates a double kmer corresponding to the given read, the index into the read, and the direction 131 | ReadKmer::ReadKmer(string* theRead, int index, bool dir): doubleKmer(0){ 132 | read = theRead; 133 | kmer_type kmer; 134 | getFirstKmerFromRead(&kmer,&((*read)[index])); 135 | doubleKmer = DoubleKmer(kmer); 136 | pos = index; 137 | direction 
= dir; 138 | } 139 | 140 | ReadKmer::ReadKmer(ReadKmer* toCopy): doubleKmer(toCopy->doubleKmer){ 141 | read = toCopy->read; 142 | doubleKmer = toCopy->doubleKmer; 143 | pos = toCopy->pos; 144 | direction = toCopy->direction; 145 | } 146 | 147 | -------------------------------------------------------------------------------- /utils/Kmer.h: -------------------------------------------------------------------------------- 1 | #ifndef Kmer64_h 2 | #define Kmer64_h 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _largeint 9 | #include "LargeInt.h" 10 | typedef LargeInt kmer_type; 11 | #else 12 | #ifdef _ttmath 13 | #include "ttmath/ttmath.h" 14 | typedef ttmath::UInt kmer_type; 15 | #else 16 | #if (! defined kmer_type) || (! defined _LP64) 17 | typedef uint64_t kmer_type; 18 | #endif 19 | #endif 20 | #endif 21 | 22 | extern int sizeKmer; 23 | extern kmer_type kmerMask; 24 | extern kmer_type kmerMaskm1; 25 | extern const bool FORWARD; 26 | extern const bool BACKWARD; 27 | extern uint64_t nsolids; 28 | 29 | bool isHomoPolymer(std::string str); 30 | std::list getUnambiguousReads(std::string read);//returns every string of valid nuc characters in the read- throws out all other characters 31 | void setSizeKmer(int k); 32 | char getNucChar(int nucIndex); 33 | bool isValidNuc(char nt); 34 | int NT2int(char nt); 35 | int revcomp_int(int nt_int); 36 | char revcomp_char(char c); 37 | kmer_type codeSeed(char *seq, int sizeKmer, kmer_type kmerMask); 38 | kmer_type codeSeed(char *seq); 39 | kmer_type codeSeedRight(char *seq, kmer_type val_seed, bool new_read); 40 | kmer_type codeSeedRight(char *seq, kmer_type val_seed, bool new_read, int sizeKmer, kmer_type kmerMask); 41 | kmer_type codeSeedRight_revcomp(char *seq, kmer_type val_seed, bool new_read); 42 | kmer_type codeSeedRight_revcomp(char *seq, kmer_type val_seed, bool new_read, int sizeKmer, kmer_type kmerMask); 43 | unsigned char code_n_NT(char *seq, int nb); 44 | unsigned char code4NT(char *seq); 45 | 46 | uint64_t 
revcomp(uint64_t x); 47 | uint64_t revcomp(uint64_t x, int size); 48 | 49 | #ifdef _largeint 50 | LargeInt revcomp(LargeInt x); 51 | LargeInt revcomp(LargeInt x, int size); 52 | #endif 53 | #ifdef _ttmath 54 | ttmath::UInt revcomp(ttmath::UInt x); 55 | ttmath::UInt revcomp(ttmath::UInt x, int size); 56 | #endif 57 | #ifdef _LP64 58 | __uint128_t revcomp(__uint128_t x); 59 | __uint128_t revcomp(__uint128_t x, int size); 60 | #endif 61 | 62 | int code2seq ( kmer_type code,char *seq); 63 | int code2seq ( kmer_type code,char *seq, int sizeKmer, kmer_type kmerMask); 64 | int code2nucleotide( kmer_type code, int which_nucleotide); 65 | int first_nucleotide(kmer_type kmer); 66 | 67 | kmer_type extractKmerFromRead(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp); 68 | kmer_type extractKmerFromRead(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp, bool sequential); 69 | kmer_type extractKmerFromRead(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp, bool sequential, int sizeKmer, kmer_type kmerMask); 70 | kmer_type maskKmer(kmer_type kmer); 71 | // compute the next kmer w.r.t forward or reverse strand, e.g. 
for ACTG (revcomp = CAGT) 72 | // it makes sure the result is the min(kmer,revcomp_kmer) 73 | // indicates if the result is the revcomp_kmer by setting *strand 74 | // examples: 75 | // next_kmer(ACTG,A,&0)=CTGA with strand = 0 (because revcomp=TCAG); 76 | // next_kmer(ACTG,A,&1)= (revcomp of ACTG + A = CAGT+A = ) AGTA with strand = 0 (because revcomp = TACT) 77 | kmer_type next_kmer(kmer_type graine, int added_nt, bool strand);//returns shifted in a new place 78 | kmer_type next_kmer(kmer_type graine, int added_nt, int* strand); 79 | void shift_kmer(kmer_type *graine, int added_nt, int strand); //shifts in place 80 | void getFirstKmerFromRead(kmer_type* kmer, char* read); 81 | kmer_type getKmerFromRead(std::string read, int index);//returns kmer starting at position index 82 | kmer_type next_kmer_in_read(kmer_type kmer, int index_in_read, char* read, bool direction); 83 | kmer_type advance_kmer(char* read, kmer_type* kmer, int startPos, int endPos); 84 | kmer_type rotate_right(kmer_type kmer, int dist); 85 | kmer_type rotate_left(kmer_type kmer, int dist); 86 | 87 | std::string canon_contig(std::string contig); 88 | std::string revcomp_string(std::string s); 89 | void revcomp_sequence(char* s, int len); 90 | 91 | kmer_type codeSeed_bin(char *seq); 92 | 93 | kmer_type codeSeedRight_bin(char *seq, kmer_type val_seed, bool new_read); 94 | 95 | kmer_type codeSeedRight_revcomp_bin(char *seq, kmer_type val_seed, bool new_read); 96 | 97 | kmer_type extractKmerFromRead_bin(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp, bool use_compressed); 98 | 99 | kmer_type get_canon(kmer_type a); 100 | char* print_kmer(kmer_type kmer); // debugging 101 | char* print_kmer(kmer_type kmer, int sizeKmer, kmer_type kmerMask); // debugging 102 | 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /utils/tests/BloomTests.cpp: -------------------------------------------------------------------------------- 1 | 
#include "BloomTests.h" 2 | #include 3 | #include 4 | #include 5 | 6 | namespace bloomTests{ 7 | 8 | Bloom* bloom; 9 | 10 | char get_random_nuc(){ 11 | return rand() % 4; 12 | } 13 | 14 | kmer_type get_random_kmer(){ 15 | uint64_t rand_kmer = 0; 16 | for(int i = 0; i < sizeKmer; i++){ 17 | rand_kmer ^= (uint64_t)get_random_nuc(); 18 | rand_kmer <<= 2; 19 | } 20 | return rand_kmer; 21 | } 22 | 23 | void test_false_positive_rate(uint64_t bloomSize, int sampleSize, float fpRate){ 24 | 25 | setSizeKmer(25); 26 | bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate); 27 | for(int i = 0; i < bloomSize; i++){ 28 | bloom->add(get_random_kmer()); 29 | } 30 | int fpCount = 0; 31 | for(int i = 0; i < sampleSize; i++){ 32 | if(bloom->contains(get_random_kmer())){ 33 | fpCount += 1; 34 | } 35 | } 36 | printf("Size %lli, desired rate %f: %f \n", 37 | bloomSize, fpRate, (float)fpCount/(float)sampleSize); 38 | } 39 | 40 | void test_speed_raw(uint64_t bloomSize, int sampleSize, float fpRate){ 41 | 42 | setSizeKmer(25); 43 | bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate); 44 | for(int i = 0; i < bloomSize; i++){ 45 | bloom->add(get_random_kmer()); 46 | } 47 | time_t start, stop; 48 | time(&start); 49 | int fpCount = 0; 50 | kmer_type kmer = (uint64_t)0; 51 | for(int i = 0; i < sampleSize; i++){ 52 | bloom->contains(kmer); 53 | kmer++; 54 | } 55 | time(&stop); 56 | printf("Raw queries per second: %f \n", sampleSize / difftime(stop,start)); 57 | } 58 | 59 | void test_speed_incremental(uint64_t bloomSize, int sampleSize, float fpRate){ 60 | 61 | setSizeKmer(25); 62 | bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate); 63 | kmer_type kmer = get_random_kmer(); 64 | uint64_t hash0 = bloom->get_rolling_hash(kmer,0); 65 | uint64_t hash1 = bloom->get_rolling_hash(kmer,1); 66 | for(int i = 0; i < bloomSize; i++){ 67 | bloom->add(get_random_kmer()); 68 | } 69 | time_t start, stop; 70 | time(&start); 71 | int fpCount = 0; 72 | int newNuc = 0; 73 | int oldNuc = 
3; 74 | for(int i = 0; i < sampleSize; i++){ 75 | bloom->contains(hash0, hash1); 76 | hash0 = bloom->roll_hash(hash0, oldNuc, newNuc, 0); 77 | hash1 = bloom->roll_hash(hash1, oldNuc, newNuc, 1); 78 | newNuc = (newNuc + 1) %4; 79 | oldNuc = (oldNuc +1)%4; 80 | } 81 | time(&stop); 82 | printf("Incremental Queries per second: %f \n", sampleSize / difftime(stop,start)); 83 | } 84 | 85 | 86 | void test_speed_readscan(uint64_t bloomSize, int sampleSize, float fpRate){ 87 | 88 | setSizeKmer(25); 89 | bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate); 90 | kmer_type kmer = (uint64_t)0; 91 | uint64_t hash0 = bloom->get_rolling_hash(kmer,0); 92 | uint64_t hash1 = bloom->get_rolling_hash(kmer,1); 93 | for(int i = 0; i < bloomSize; i++){ 94 | bloom->add(get_random_kmer()); 95 | } 96 | time_t start, stop; 97 | time(&start); 98 | int fpCount = 0; 99 | int newNuc = 0; 100 | int oldNuc = 3; 101 | for(int i = 0; i < sampleSize; i++){ 102 | if(i % 100 == 0){ 103 | bloom->contains(kmer); 104 | kmer++; 105 | } 106 | else{ 107 | bloom->contains(hash0, hash1); 108 | hash0 = bloom->roll_hash(hash0, oldNuc, newNuc, 0); 109 | hash1 = bloom->roll_hash(hash1, oldNuc, newNuc, 1); 110 | newNuc = (newNuc + 1) %4; 111 | oldNuc = (oldNuc +1)%4; 112 | } 113 | } 114 | time(&stop); 115 | printf("Read Queries per second: %f \n", sampleSize / difftime(stop,start)); 116 | } 117 | 118 | void test_speed_old(uint64_t bloomSize, int sampleSize, float fpRate){ 119 | 120 | setSizeKmer(25); 121 | bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate); 122 | for(int i = 0; i < bloomSize; i++){ 123 | bloom->add(get_random_kmer()); 124 | } 125 | time_t start, stop; 126 | time(&start); 127 | int fpCount = 0; 128 | kmer_type kmer = (uint64_t)0; 129 | for(int i = 0; i < sampleSize; i++){ 130 | bloom->oldContains(kmer); 131 | kmer++; 132 | } 133 | time(&stop); 134 | printf("Old queries per second: %f \n", sampleSize / difftime(stop,start)); 135 | } 136 | 137 | void runBloomTests(){ 138 | 
test_false_positive_rate(100000, 100000, .001); 139 | test_false_positive_rate(100000, 100000, .01); 140 | test_false_positive_rate(100000, 100000, .1); 141 | test_speed_raw(100000, 2000000, .01); 142 | test_speed_incremental(100000, 20000000, .1); 143 | test_speed_readscan(100000, 20000000, .1); 144 | test_speed_old(100000, 5000000, .1); 145 | } 146 | 147 | } -------------------------------------------------------------------------------- /utils/JunctionMap.h: -------------------------------------------------------------------------------- 1 | #ifndef JUNCTION_MAP 2 | #define JUNCTION_MAP 3 | 4 | // #include 5 | #include 6 | #include 7 | #include "Kmer.h" 8 | #include "Junction.h" 9 | #include "Cap.h" 10 | #include "ReadKmer.h" 11 | #include "Bloom.h" 12 | #include "JChecker.h" 13 | #include "Kmer.h" 14 | #include "JuncPairs.h" 15 | 16 | #include "../src/Contig.h" 17 | #include "../src/ContigNode.h" 18 | #include "../src/ContigGraph.h" 19 | #include 20 | #include "../src/BfSearchResult.h" 21 | using std::ofstream; 22 | // using std::unordered_map; 23 | using std::string; 24 | using std::unordered_set; 25 | // #include "sparsepp.h" 26 | // using spp::sparse_hash_map; 27 | 28 | 29 | class JunctionMap{ 30 | 31 | private: 32 | Bloom* bloom; 33 | JChecker* jchecker; 34 | int maxReadLength; //needed for finding sinks properly- tells you when to stop scanning 35 | 36 | 37 | public: 38 | void printDistAndExtension(int dist, int maxDist, int index, kmer_type kmer); 39 | 40 | void buildLinearRegions(ContigGraph* contigGraph); //Builds node graph for any connected component that has branching 41 | void buildBranchingPaths(ContigGraph* contigGraph); //For connected components that have no branching at all- builds contig graph 42 | void destroyComplexJunctions(); //destroys all complex junctions. 
used after building branching paths for contig graph 43 | void destroyJunctionSet(std::set dead_juncs); // periodically destroy complex junctions - aimed to reduce memory use 44 | 45 | 46 | //Builds a contig graph from this junction map, destroying the non-complex junctions as it goes 47 | ContigGraph* buildContigGraph(); 48 | 49 | //Gets the contig from this junction to the next complex junction or sink 50 | //Has all fields except the ContigNode pointers filled out- indices, juncDistances, and seq are all there 51 | Contig* getContig(Junction junc, kmer_type startKmer, int index); 52 | 53 | //Scans forward from junction junc at index i with bloom filter 54 | //If it hits another junction at or before the distance specified by the given junction, returns a "node" result with that junction 55 | //If it does not, it keeps scanning until it hits another junction or an actual sink 56 | //If it hits a sink, it returns it. If it hits a junction, it tests how far that junction points along the path. 57 | //Based on the indicated overlap, it either decides the entire intermediate sequence is real or the connection is a 58 | //false positive connection. Then returns either a sink or a node result. 59 | BfSearchResult findNeighbor(Junction junc, kmer_type startKmer, int index); 60 | 61 | std::unordered_map junctionMap; //stores the junctions themselves 62 | // sparse_hash_map junctionMap; //stores the junctions themselves 63 | 64 | //Returns true if multiple extensions of the given kmer jcheck 65 | //Assumes the given kmer is in the BF 66 | bool isBloomJunction(kmer_type kmer); 67 | 68 | //Gets the valid extension of the given kmer based on the bloom filter and cFPs. Uses JChecking! 
so this cuts off tips 69 | //Assume the given kmer is not a junction 70 | //Returns -1 if there is no valid extension 71 | //Returns -2 if there are multiple 72 | //ASSUMES NO CFP SET- since this is only done in findSinks, BEFORE the cFPs are found 73 | int getValidJExtension(DoubleKmer kmer); 74 | 75 | //File format: 76 | //One line for each junction. On each line, the kmer is printed as a string, then the junction is printed. 77 | //See Junction.h for junction print documentation. 78 | void writeToFile(string filename); 79 | 80 | void buildFromFile(string junction_file); 81 | 82 | //Finds the junction associated with the given kmer and returns how far we can skip in the given direction from that junction 83 | int getSkipDist(ReadKmer* readKmer, bool direction); 84 | 85 | //Directly links two adjacent junctions from the same read 86 | void directLinkJunctions(ReadKmer* kmer1, ReadKmer* kmer2, Junction* junc1, Junction* junc2); 87 | 88 | int getNumComplexJunctions(); //Gets the number of junctions with more than one valid extension 89 | int getNumSolidJunctions(int i); //Gets the number of solid complex junctions, multiple valid extensions of coverage at least i 90 | int getNumJunctions(); 91 | 92 | void createJunction(kmer_type kmer); 93 | void createJunction(ReadKmer* readKmer); 94 | bool isJunction(kmer_type kmer); //returns true if there is a junction at the given kmer 95 | bool isJunction(ReadKmer* readKmer); //same as above 96 | Junction* getJunction(ReadKmer kmer); //returns the junction located at the given kmer, or NULL if there is none 97 | Junction* getJunction(kmer_type kmer); //same as above 98 | void killJunction(kmer_type kmer); //removes the junction at the specified kmer, if there is one 99 | 100 | JunctionMap(Bloom* bloo, JChecker* jchecker, int maxReadLength); 101 | }; 102 | #endif -------------------------------------------------------------------------------- /src/ContigGraph.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CONTIG_GRAPH 2 | #define CONTIG_GRAPH 3 | 4 | class ContigGraph; //forward declare 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "../utils/Kmer.h" 13 | #include "../utils/Junction.h" 14 | #include "../utils/Bloom.h" 15 | #include "../utils/JunctionMap.h" 16 | #include "../utils/JChecker.h" 17 | #include "../utils/JuncPairs.h" 18 | #include "Contig.h" 19 | #include "ContigNode.h" 20 | #include "BfSearchResult.h" 21 | #include "ContigIterator.h" 22 | using std::unordered_set; 23 | using std::unordered_map; 24 | #include "../utils/sparsepp.h" 25 | using std::string; 26 | using spp::sparse_hash_map; 27 | 28 | 29 | class ContigGraph 30 | { 31 | std::vector nodeVector; 32 | std::vector isolated_contigs; 33 | unordered_map nodeMap; 34 | unordered_map::iterator it; 35 | 36 | 37 | int read_length; 38 | 39 | public: 40 | std::vector * getIsolatedContigs(); 41 | unordered_map * getNodeMap(); 42 | 43 | void setReadLength(int length); 44 | bool isCollapsible(ContigNode * node); 45 | 46 | //Gets number of supporting pairs given candidate list 47 | double getScore(std::list leftCand, std::list rightCand, Bloom* pair_filter, int insertSize); 48 | // same as getScore, but stops at first positive query 49 | bool areConnected(std::list leftCand, std::list rightCand, Bloom* pair_filter, int insertSize); 50 | 51 | std::pair getMinMaxForwardExtensions(ContigNode * node, std::string trait); 52 | 53 | 54 | //a,b are on backNode, c,d are on forwardNode 55 | //a pairs with c, b pairs with d 56 | //Does not go ahead with the operation if degeneracies are detected 57 | //Returns true if it goes ahead with disentanglement 58 | void disentanglePair(Contig* contig, ContigNode* backNode, ContigNode* forwardNode, int a, int b, int c, int d); 59 | void disentangleLoop(Contig* contig, ContigNode* backNode, ContigNode* forwardNode, int a, int b, int 
c, int d); 60 | 61 | void addIsolatedContig(Contig contig); 62 | bool isLowCovContig(Contig* contig); 63 | bool isLowMassContig(Contig* contig); 64 | bool isTip(ContigNode* node, int i); 65 | bool isBubbleNode(ContigNode* node); 66 | std::list getPathIfSimpleBulge(ContigNode* node, int max_dist); 67 | 68 | void deleteContig(Contig* contig); 69 | bool cleanGraph(Bloom* short_pair_filter, Bloom* long_pair_filter); //Cleans graph and returns true if any changes were made 70 | 71 | bool checkGraph(); 72 | void printContigFastG(std::ostream* fastgFile, Contig * contig); 73 | 74 | // calls different sub-functions below to traverse graph and output contigs 75 | void printContigs(string filename); 76 | int printAndMarkBubbleContigs(string fileName); 77 | int printUnmarkedUnitigs(string fileName, int numPrinted); 78 | 79 | 80 | void printGraph(string fileName); //prints graph : TBD print format- fastg? 81 | ContigGraph(); 82 | 83 | Contig* getLongestContig(); 84 | 85 | //Creates a contig node if it doesn't already exist 86 | //If it exists, does nothing and returns the existing one. 
87 | //Otherwise, returns the new one 88 | ContigNode * createContigNode(kmer_type kmer, Junction junction); 89 | int disentangleParallelPaths(Bloom* pair_filter, double insertSize, double std); 90 | int disentangleLoopPaths(Bloom* pair_filter, double insertSize, double std); 91 | int removeChimericExtensions(int insertSize); 92 | int validateNoneCollapsible(); 93 | int collapseBulges(int max_dist); 94 | bool deleteTipsAndClean(); 95 | bool removeChimerasAndClean(); 96 | bool collapseBulgesAndClean(); 97 | bool disentangleAndClean(Bloom* pair_filter, double insertSize, double std); 98 | bool areEquivalentContigCoverages(ContigJuncList A, ContigJuncList B, double frac); 99 | bool areDifferentialContigCoverages(ContigJuncList A, ContigJuncList B); 100 | Contig * getNewConcatenatedContig(Contig * back, Contig * contig, ContigNode * node); 101 | 102 | 103 | private: 104 | int deleteTips(); 105 | int deleteIsolatedContigs(); 106 | bool testAndCutIfDegenerate(ContigNode* node); 107 | int collapseDummyNodes(); //removes nodes with only one real extension, merges forward and back contigs 108 | int destroyDegenerateNodes();// Removes nodes with no back contig or no forward contigs 109 | // int cutIfDegenerate(ContigNode* node, kmer_type kmer, auto it); 110 | 111 | unordered_map contigNodeMap; // maps kmers to ContigNodes after contigs constructed 112 | // sparse_hash_map contigNodeMap; // maps kmers to ContigNodes after contigs constructed 113 | 114 | void collapseNode(ContigNode * node, kmer_type kmer); 115 | void cutPath(ContigNode* node, int index); //used on nodes with no backward contig 116 | }; 117 | 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /utils/ttmath/ttmathmisc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of TTMath Bignum Library 3 | * and is distributed under the (new) BSD licence. 
4 | * Author: Tomasz Sowa 5 | */ 6 | 7 | /* 8 | * Copyright (c) 2006-2010, Tomasz Sowa 9 | * All rights reserved. 10 | * 11 | * Redistribution and use in source and binary forms, with or without 12 | * modification, are permitted provided that the following conditions are met: 13 | * 14 | * * Redistributions of source code must retain the above copyright notice, 15 | * this list of conditions and the following disclaimer. 16 | * 17 | * * Redistributions in binary form must reproduce the above copyright 18 | * notice, this list of conditions and the following disclaimer in the 19 | * documentation and/or other materials provided with the distribution. 20 | * 21 | * * Neither the name Tomasz Sowa nor the names of contributors to this 22 | * project may be used to endorse or promote products derived 23 | * from this software without specific prior written permission. 24 | * 25 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 26 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 29 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 35 | * THE POSSIBILITY OF SUCH DAMAGE. 36 | */ 37 | 38 | #ifndef headerfilettmathmisc 39 | #define headerfilettmathmisc 40 | 41 | 42 | /*! 43 | \file ttmathmisc.h 44 | \brief some helpful functions 45 | */ 46 | 47 | 48 | #include 49 | 50 | 51 | namespace ttmath 52 | { 53 | 54 | /*! 
55 | some helpful functions 56 | */ 57 | class Misc 58 | { 59 | public: 60 | 61 | 62 | /* 63 | * 64 | * AssignString(result, str) 65 | * result = str 66 | * 67 | */ 68 | 69 | /*! 70 | result = str 71 | */ 72 | static void AssignString(std::string & result, const char * str) 73 | { 74 | result = str; 75 | } 76 | 77 | 78 | #ifndef TTMATH_DONT_USE_WCHAR 79 | 80 | /*! 81 | result = str 82 | */ 83 | static void AssignString(std::wstring & result, const char * str) 84 | { 85 | result.clear(); 86 | 87 | for( ; *str ; ++str ) 88 | result += *str; 89 | } 90 | 91 | 92 | /*! 93 | result = str 94 | */ 95 | static void AssignString(std::wstring & result, const std::string & str) 96 | { 97 | return AssignString(result, str.c_str()); 98 | } 99 | 100 | 101 | /*! 102 | result = str 103 | */ 104 | static void AssignString(std::string & result, const wchar_t * str) 105 | { 106 | result.clear(); 107 | 108 | for( ; *str ; ++str ) 109 | result += static_cast(*str); 110 | } 111 | 112 | 113 | /*! 114 | result = str 115 | */ 116 | static void AssignString(std::string & result, const std::wstring & str) 117 | { 118 | return AssignString(result, str.c_str()); 119 | } 120 | 121 | #endif 122 | 123 | 124 | /* 125 | * 126 | * AddString(result, str) 127 | * result += str 128 | * 129 | */ 130 | 131 | 132 | /*! 133 | result += str 134 | */ 135 | static void AddString(std::string & result, const char * str) 136 | { 137 | result += str; 138 | } 139 | 140 | 141 | #ifndef TTMATH_DONT_USE_WCHAR 142 | 143 | /*! 
144 | result += str 145 | */ 146 | static void AddString(std::wstring & result, const char * str) 147 | { 148 | for( ; *str ; ++str ) 149 | result += *str; 150 | } 151 | 152 | #endif 153 | 154 | 155 | /* 156 | this method omits any white characters from the string 157 | char_type is char or wchar_t 158 | */ 159 | template 160 | static void SkipWhiteCharacters(const char_type * & c) 161 | { 162 | // 13 is at the end in a DOS text file (\r\n) 163 | while( (*c==' ' ) || (*c=='\t') || (*c==13 ) || (*c=='\n') ) 164 | ++c; 165 | } 166 | 167 | 168 | 169 | 170 | /*! 171 | this static method converts one character into its value 172 | 173 | for example: 174 | 1 -> 1 175 | 8 -> 8 176 | A -> 10 177 | f -> 15 178 | 179 | this method don't check whether c is correct or not 180 | */ 181 | static uint CharToDigit(uint c) 182 | { 183 | if(c>='0' && c<='9') 184 | return c-'0'; 185 | 186 | if(c>='a' && c<='z') 187 | return c-'a'+10; 188 | 189 | return c-'A'+10; 190 | } 191 | 192 | 193 | /*! 194 | this method changes a character 'c' into its value 195 | (if there can't be a correct value it returns -1) 196 | 197 | for example: 198 | c=2, base=10 -> function returns 2 199 | c=A, base=10 -> function returns -1 200 | c=A, base=16 -> function returns 10 201 | */ 202 | static sint CharToDigit(uint c, uint base) 203 | { 204 | if( c>='0' && c<='9' ) 205 | c=c-'0'; 206 | else 207 | if( c>='a' && c<='z' ) 208 | c=c-'a'+10; 209 | else 210 | if( c>='A' && c<='Z' ) 211 | c=c-'A'+10; 212 | else 213 | return -1; 214 | 215 | 216 | if( c >= base ) 217 | return -1; 218 | 219 | 220 | return sint(c); 221 | } 222 | 223 | 224 | 225 | /*! 
226 | this method converts a digit into a char 227 | digit should be from <0,F> 228 | (we don't have to get a base) 229 | 230 | for example: 231 | 1 -> 1 232 | 8 -> 8 233 | 10 -> A 234 | 15 -> F 235 | */ 236 | static uint DigitToChar(uint digit) 237 | { 238 | if( digit < 10 ) 239 | return digit + '0'; 240 | 241 | return digit - 10 + 'A'; 242 | } 243 | 244 | 245 | }; // struct Misc 246 | 247 | } 248 | 249 | 250 | #endif 251 | -------------------------------------------------------------------------------- /utils/manual/manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{article} 2 | \usepackage{fancyvrb} 3 | \usepackage{pdfpages} 4 | \begin{document} 5 | 6 | \newcommand\vitem[1][]{\SaveVerb[% to use verb in description 7 | aftersave={\item[\textnormal{\UseVerb[#1]{vsave}}]}]{vsave}} 8 | 9 | \title{\Huge \texttt{Minia} --- Short manual} 10 | 11 | \author{R. Chikhi \& G. Rizk\\ 12 | {\small{rayan.chikhi@ens-cachan.org}}} 13 | \maketitle 14 | 15 | \begin{abstract} 16 | \noindent {\normalsize Minia is a software for ultra-low memory DNA sequence assembly. It takes as input a set of short genomic sequences (typically, data produced by the Illumina DNA sequencer). Its output is a set of contigs (assembled sequences), forming an approximation of the expected genome. Minia is based on a succinct representation of the de Bruijn graph. The computational resources required to run Minia are significantly lower than that of other assemblers.} 17 | \end{abstract} 18 | 19 | \tableofcontents 20 | 21 | \section{Installation} 22 | 23 | To install Minia, just type \verb+make+ in the Minia folder. 24 | Minia has been tested on Linux and MacOS systems. 25 | To run Minia, type \verb+./minia+. 
26 | 27 | \section{Parameters} 28 | 29 | The usage is:\\ 30 | 31 | 32 | \verb+./minia [input_file] [kmer_size] [min_abundance] [estimated_genome_size] [prefix]+\\ 33 | 34 | 35 | An example command line is:\\ 36 | 37 | 38 | \verb+./minia reads.fastq 31 3 100000000 minia_assembly_k31_m3+\\ 39 | 40 | All the parameters need to be specified, in the following order: 41 | 42 | \begin{enumerate} 43 | 44 | \item \verb+input_file+ -- the input file 45 | 46 | \item \verb+kmer_size+ -- k-mer length 47 | 48 | \item \verb+min_abundance+ -- filters out k-mers seen fewer than the specified number of times 49 | 50 | \item \verb+estimated_genome_size+ -- rough estimate of the size of the genome to assemble, in base pairs. 51 | 52 | \item \verb+prefix+ -- any prefix string to store unique temporary files for this assembly 53 | 54 | \end{enumerate} 55 | 56 | Minia now uses the Cascading Bloom filters improvement (http://arxiv.org/abs/1302.7278) by default, thanks to Gustavo Sacomoto for the implementation in Minia. Launch Minia with the \verb!--original! option to revert to the original data structure. 57 | 58 | 59 | \section{Explanation of parameters} 60 | \begin{description} 61 | 62 | \vitem+kmer_size+ 63 | The $k$-mer length is the length of the nodes in the de Bruijn graph. It strongly depends on the input dataset. A typical value to try for short Illumina reads (read length above $50$) is 27. For longer Illumina reads ($\approx 100$ bp) with sufficient coverage ($>$ 40x), we had good results with $k=43$. 64 | 65 | \vitem+min_abundance+ 66 | The \verb+min_abundance+ is used to remove erroneous, low-abundance $k$-mers. This parameter also strongly depends on the dataset. It corresponds to the smallest number of times a correct $k$-mer appears in the reads. A typical value is $3$. Setting it to $1$ is not recommended\footnote{as no erroneous $k$-mer will be discarded, which will likely result in very large memory usage}. If the dataset has high coverage, try larger values. 
67 | 68 | \vitem+estimated_genome_size+ 69 | 70 | The estimated genome size parameter (in base pairs) only controls the memory usage during the first phase of Minia (graph construction). \emph{It has no impact on the assembly}. 71 | 72 | \vitem+prefix+ 73 | The \verb+prefix+ parameter is any arbitrary file name prefix, for example, \verb+test_assembly+. 74 | 75 | \end{description} 76 | 77 | \section{Input} 78 | 79 | \begin{description} 80 | \item \emph{FASTA/FASTQ} 81 | 82 | Minia assembles any type of Illumina reads, given in the FASTA or FASTQ format. Paired or mate-pair reads are OK, but keep in mind that Minia discards pairing information. 83 | \item \emph{Multiple files} 84 | 85 | Minia can assemble multiple input files. Just create a text file containing the list of read files, one file name per line, and pass this list as the first parameter of Minia (instead of a FASTA/FASTQ file). Therefore the parameter \verb+input_file+ can be either (i) the read file itself (FASTA/FASTQ/compressed), or (ii) a file containing a list of file names. 86 | \item \emph{line format} 87 | 88 | In FASTA files, each read can be split into multiple lines, whereas in FASTQ, each read sequence must be in a single line. 89 | 90 | \item \emph{gzip compression} 91 | 92 | Minia can directly read files compressed with gzip. Compressed files should end with \verb+.gz+. Input files of different types can be mixed (i.e., gzipped or not, in FASTA or FASTQ). 93 | 94 | \end{description} 95 | 96 | \section{Output} 97 | 98 | The output of Minia is a set of contigs in the FASTA format, in the file \verb+[prefix].contigs.fa+. 99 | 100 | \section{Memory usage} 101 | 102 | We estimate that the memory usage of Minia is roughly $2$ GB of RAM per gigabase of the target genome. It is independent of the coverage of the input dataset, provided that the \verb!min_abundance! parameter is correctly set. For example, a human genome was assembled in $5.7$ GB of RAM. 
This was using the original data structure; the current implementation relies on Cascading Bloom filters and should use $\approx 1$--$2$ GB less memory. A more detailed estimate of the memory usage can be found in the appendixes. 103 | 104 | \section{Disk usage} 105 | 106 | Minia writes large temporary files during the k-mer counting phase. These files are written to the working directory from which Minia was launched. For better performance, run Minia on a local hard drive. 107 | 108 | \section{Larger $k$-mer lengths} 109 | 110 | Minia supports arbitrarily large $k$-mer lengths. To compile Minia for $k$-mer lengths up to, say, 100, type: 111 | \begin{verbatim} 112 | make clean && make k=100 113 | \end{verbatim} 114 | 115 | \section{Appendixes} 116 | 117 | The rest of this manual describes the data structure used by Minia. 118 | The first text is from an original research article published at WABI 2012. The second text is an improvement made and implemented in Minia by other authors, published at WABI 2013. 119 | 120 | \includepdf[pages=-]{../paper/wabi12.pdf} 121 | \includepdf[pages=-]{../paper/cascading-wabi13.pdf} 122 | 123 | \end{document} 124 | 125 | -------------------------------------------------------------------------------- /utils/ttmath/ttmaththreads.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of TTMath Bignum Library 3 | * and is distributed under the (new) BSD licence. 4 | * Author: Tomasz Sowa 5 | */ 6 | 7 | /* 8 | * Copyright (c) 2006-2009, Tomasz Sowa 9 | * All rights reserved. 10 | * 11 | * Redistribution and use in source and binary forms, with or without 12 | * modification, are permitted provided that the following conditions are met: 13 | * 14 | * * Redistributions of source code must retain the above copyright notice, 15 | * this list of conditions and the following disclaimer. 
16 | * 17 | * * Redistributions in binary form must reproduce the above copyright 18 | * notice, this list of conditions and the following disclaimer in the 19 | * documentation and/or other materials provided with the distribution. 20 | * 21 | * * Neither the name Tomasz Sowa nor the names of contributors to this 22 | * project may be used to endorse or promote products derived 23 | * from this software without specific prior written permission. 24 | * 25 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 26 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 29 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 35 | * THE POSSIBILITY OF SUCH DAMAGE. 36 | */ 37 | 38 | 39 | 40 | #ifndef headerfilettmaththreads 41 | #define headerfilettmaththreads 42 | 43 | #include "ttmathtypes.h" 44 | 45 | #ifdef TTMATH_WIN32_THREADS 46 | #include 47 | #include 48 | #endif 49 | 50 | #ifdef TTMATH_POSIX_THREADS 51 | #include 52 | #endif 53 | 54 | 55 | 56 | /*! 57 | \file ttmaththreads.h 58 | \brief Some objects used in multithreads environment 59 | */ 60 | 61 | 62 | /* 63 | this is a simple skeleton of a program in multithreads environment: 64 | 65 | #define TTMATH_MULTITHREADS 66 | #include 67 | 68 | TTMATH_MULTITHREADS_HELPER 69 | 70 | int main() 71 | { 72 | [...] 
73 | } 74 | 75 | make sure that macro TTMATH_MULTITHREADS is defined and (somewhere in *.cpp file) 76 | use TTMATH_MULTITHREADS_HELPER macro (outside of any classes/functions/namespaces scope) 77 | */ 78 | 79 | 80 | namespace ttmath 81 | { 82 | 83 | 84 | #ifdef TTMATH_WIN32_THREADS 85 | 86 | /* 87 | we use win32 threads 88 | */ 89 | 90 | 91 | /*! 92 | in multithreads environment you should use TTMATH_MULTITHREADS_HELPER macro 93 | somewhere in *.cpp file 94 | 95 | (at the moment in win32 this macro does nothing) 96 | */ 97 | #define TTMATH_MULTITHREADS_HELPER 98 | 99 | 100 | /*! 101 | objects of this class are used to synchronize 102 | */ 103 | class ThreadLock 104 | { 105 | HANDLE mutex_handle; 106 | 107 | 108 | void CreateName(char * buffer) const 109 | { 110 | #ifdef _MSC_VER 111 | #pragma warning (disable : 4996) 112 | // warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead. 113 | #endif 114 | 115 | sprintf(buffer, "TTMATH_LOCK_%ul", (unsigned long)GetCurrentProcessId()); 116 | 117 | #ifdef _MSC_VER 118 | #pragma warning (default : 4996) 119 | #endif 120 | } 121 | 122 | 123 | public: 124 | 125 | bool Lock() 126 | { 127 | char buffer[50]; 128 | 129 | CreateName(buffer); 130 | mutex_handle = CreateMutexA(0, false, buffer); 131 | 132 | if( mutex_handle == 0 ) 133 | return false; 134 | 135 | WaitForSingleObject(mutex_handle, INFINITE); 136 | 137 | return true; 138 | } 139 | 140 | 141 | ThreadLock() 142 | { 143 | mutex_handle = 0; 144 | } 145 | 146 | 147 | ~ThreadLock() 148 | { 149 | if( mutex_handle != 0 ) 150 | { 151 | ReleaseMutex(mutex_handle); 152 | CloseHandle(mutex_handle); 153 | } 154 | } 155 | }; 156 | 157 | #endif // #ifdef TTMATH_WIN32_THREADS 158 | 159 | 160 | 161 | 162 | 163 | #ifdef TTMATH_POSIX_THREADS 164 | 165 | /* 166 | we use posix threads 167 | */ 168 | 169 | 170 | /*! 
171 | in multithreads environment you should use TTMATH_MULTITHREADS_HELPER macro 172 | somewhere in *.cpp file 173 | (this macro defines a pthread_mutex_t object used by TTMath library) 174 | */ 175 | #define TTMATH_MULTITHREADS_HELPER \ 176 | namespace ttmath \ 177 | { \ 178 | pthread_mutex_t ttmath_mutex = PTHREAD_MUTEX_INITIALIZER; \ 179 | } 180 | 181 | 182 | /*! 183 | ttmath_mutex will be defined by TTMATH_MULTITHREADS_HELPER macro 184 | */ 185 | extern pthread_mutex_t ttmath_mutex; 186 | 187 | 188 | /*! 189 | objects of this class are used to synchronize 190 | */ 191 | class ThreadLock 192 | { 193 | public: 194 | 195 | bool Lock() 196 | { 197 | if( pthread_mutex_lock(&ttmath_mutex) != 0 ) 198 | return false; 199 | 200 | return true; 201 | } 202 | 203 | 204 | ~ThreadLock() 205 | { 206 | pthread_mutex_unlock(&ttmath_mutex); 207 | } 208 | }; 209 | 210 | #endif // #ifdef TTMATH_POSIX_THREADS 211 | 212 | 213 | 214 | 215 | #if !defined(TTMATH_POSIX_THREADS) && !defined(TTMATH_WIN32_THREADS) 216 | 217 | /*! 218 | we don't use win32 and pthreads 219 | */ 220 | 221 | /*! 222 | */ 223 | #define TTMATH_MULTITHREADS_HELPER 224 | 225 | 226 | /*! 
227 | objects of this class are used to synchronize 228 | actually we don't synchronize, the method Lock() returns always 'false' 229 | */ 230 | class ThreadLock 231 | { 232 | public: 233 | 234 | bool Lock() 235 | { 236 | return false; 237 | } 238 | }; 239 | 240 | 241 | #endif // #if !defined(TTMATH_POSIX_THREADS) && !defined(TTMATH_WIN32_THREADS) 242 | 243 | 244 | 245 | 246 | 247 | } // namespace 248 | 249 | #endif 250 | 251 | -------------------------------------------------------------------------------- /utils/tests/KmerTests.cpp: -------------------------------------------------------------------------------- 1 | #include "KmerTests.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include // for max/min 11 | #include // for sorting_kmers 12 | #include 13 | #include "TestUtils.h" 14 | 15 | using namespace std; 16 | 17 | namespace kmerTests{ 18 | 19 | void shift_kmer_forward_testOverlap(){ 20 | char* testName = (char*)"shift_kmer_forward_testOverlap"; 21 | kmer_type kmer = test_kmer; 22 | kmer_type originalKmer = kmer; 23 | 24 | shift_kmer(&kmer, 1,0); 25 | 26 | if(!kmer_matches_kmer(kmer, 0, originalKmer,1)){ 27 | fail(testName, (char*)"overlapping portion of strings don't match."); 28 | return; 29 | } 30 | succeed(testName); 31 | } 32 | 33 | void shift_kmer_forward_testLastChar(){ 34 | char* testName = (char*)"shift_kmer_forward_testLastChar"; 35 | kmer_type kmer = test_kmer; 36 | char* kmerSeq = new char[sizeKmer]; 37 | 38 | shift_kmer(&kmer, 1,0); 39 | code2seq(kmer,kmerSeq); 40 | 41 | if(kmerSeq[sizeKmer-1] != 'C'){ 42 | fail(testName); 43 | return; 44 | } 45 | succeed(testName); 46 | } 47 | 48 | void shift_kmer_backward_testOverlap(){ 49 | char* testName = (char*)"shift_kmer_forward_testOverlap"; 50 | kmer_type kmer = test_kmer; 51 | kmer_type originalKmer = kmer; 52 | 53 | shift_kmer(&kmer, 1,1); 54 | 55 | if(!kmer_matches_kmer(kmer, 1, originalKmer,0)){ 56 | fail(testName, 
(char*)"overlapping portion of strings don't match."); 57 | return; 58 | } 59 | succeed(testName); 60 | } 61 | 62 | void shift_kmer_backward_testFirstChar(){ 63 | char* testName = (char*)"shift_kmer_forward_testLastChar"; 64 | kmer_type kmer = test_kmer; 65 | char* kmerSeq = new char[sizeKmer]; 66 | shift_kmer(&kmer, 2,1); 67 | code2seq(kmer,kmerSeq); 68 | 69 | if(kmerSeq[0] != 'T'){ 70 | fail(testName); 71 | return; 72 | } 73 | succeed(testName); 74 | } 75 | 76 | void next_kmer_forward_testOverlap(){ 77 | char* testName = (char*)"next_kmer_forward_testOverlap"; 78 | kmer_type kmer = test_kmer; 79 | 80 | kmer_type nextKmer= next_kmer(kmer,0,0); 81 | 82 | if(!kmer_matches_kmer(kmer, 1, nextKmer,0)){ 83 | fail(testName, (char*)"overlapping portion of strings don't match."); 84 | return; 85 | } 86 | succeed(testName); 87 | } 88 | 89 | void next_kmer_forward_testLastChar(){ 90 | char* testName = (char*)"ext_kmer_forward_testLastChar"; 91 | kmer_type kmer = test_kmer; 92 | char* nextString = new char[sizeKmer]; 93 | 94 | code2seq(next_kmer(kmer,1,0), nextString); 95 | 96 | if(nextString[sizeKmer-1] != 'C'){ 97 | fail(testName); 98 | return; 99 | } 100 | succeed(testName); 101 | } 102 | 103 | void next_kmer_backward_testOverlap(){ 104 | char* testName = (char*)"next_kmer_forward_testOverlap"; 105 | kmer_type kmer = test_kmer; 106 | 107 | kmer_type nextKmer= next_kmer(kmer,0,1); 108 | 109 | if(!kmer_matches_kmer(kmer, 0, nextKmer,1)){ 110 | fail(testName, (char*)"overlapping portion of strings don't match."); 111 | return; 112 | } 113 | succeed(testName); 114 | } 115 | 116 | void next_kmer_backward_testFirstChar(){ 117 | char* testName = (char*)"ext_kmer_backward_testFirstChar"; 118 | kmer_type kmer = test_kmer; 119 | char* nextString = new char[sizeKmer]; 120 | 121 | code2seq(next_kmer(kmer,1,1), nextString); 122 | 123 | if(nextString[0] != 'C'){ 124 | fail(testName); 125 | return; 126 | } 127 | succeed(testName); 128 | } 129 | 130 | void getFirstKmerFromRead_test(){ 131 
| char* testName = (char*) "getFirstKmerFromRead_test"; 132 | char* read = (char*)"ACGGGGGTCAAAATCGGGAATCCGGGGGGAGGCCCTAGT"; 133 | kmer_type kmer; 134 | 135 | getFirstKmerFromRead(&kmer, read); 136 | 137 | if(!kmer_matches_readseq(read, kmer, 0)){ 138 | fail(testName); 139 | return; 140 | } 141 | succeed(testName); 142 | } 143 | 144 | void nextKmerInRead_test_forward(){ 145 | char* testName = (char*) "nextKmerInRead_test_forward"; 146 | kmer_type kmer = test_kmer; 147 | char* kmerSeq = new char [sizeKmer]; 148 | code2seq(kmer, kmerSeq); 149 | 150 | char* read = new char [100]; 151 | read[0] = 'C'; 152 | strcpy(&read[1], kmerSeq); 153 | read[1+sizeKmer] = 'T'; 154 | 155 | kmer_type next_kmer = next_kmer_in_read(kmer, 1, read, 0); 156 | 157 | if(!kmer_matches_readseq(read, next_kmer,2)){ 158 | fail(testName); 159 | return; 160 | } 161 | succeed(testName); 162 | } 163 | 164 | 165 | void nextKmerInRead_test_backward(){ 166 | char* testName = (char*) "nextKmerInRead_test_backward"; 167 | kmer_type kmer = test_kmer; 168 | char* kmerSeq = new char [sizeKmer]; 169 | code2seq(kmer, kmerSeq); 170 | 171 | char* read = new char [100]; 172 | read[0] = 'C'; 173 | strcpy(&read[1], kmerSeq); 174 | read[1+sizeKmer] = 'T'; 175 | 176 | kmer_type next_kmer = next_kmer_in_read(kmer, 1, read, 1); 177 | 178 | if(!kmer_matches_readseq(read, next_kmer,0)){ 179 | fail(testName); 180 | return; 181 | } 182 | 183 | succeed(testName); 184 | } 185 | 186 | void advanceKmer_test(){ 187 | char* testName = (char*) "advanceKmer_test"; 188 | kmer_type kmer = test_kmer; 189 | char* kmerSeq = new char [sizeKmer]; 190 | code2seq(kmer, kmerSeq); 191 | char* read = new char [100]; 192 | strcpy(&read[0], (char*) "CGGT"); 193 | strcpy(&read[4], kmerSeq); 194 | strcpy(&read[4+sizeKmer], (char*)"ACCCGTTTAAACGTTTAGCCTCTCTGAGAGAAAA"); 195 | 196 | advance_kmer(&read[0], &kmer, 4,15); 197 | 198 | if(!kmer_matches_readseq(read, kmer, 15)){ 199 | fail(testName); 200 | return; 201 | } 202 | succeed(testName); 203 | } 
204 | 205 | void runKmerTests(){ 206 | setSizeKmer(27); 207 | 208 | 209 | shift_kmer_forward_testOverlap(); 210 | shift_kmer_forward_testLastChar(); 211 | shift_kmer_backward_testOverlap(); 212 | shift_kmer_backward_testFirstChar(); 213 | 214 | next_kmer_forward_testOverlap(); 215 | next_kmer_forward_testLastChar(); 216 | next_kmer_backward_testOverlap(); 217 | next_kmer_backward_testFirstChar(); 218 | 219 | getFirstKmerFromRead_test(); 220 | 221 | nextKmerInRead_test_forward(); 222 | nextKmerInRead_test_backward(); 223 | 224 | advanceKmer_test(); 225 | } 226 | 227 | } -------------------------------------------------------------------------------- /utils/tests/RollingHashTests.cpp: -------------------------------------------------------------------------------- 1 | #include "RollingHashTests.h" 2 | #include 3 | 4 | namespace rollingHashTests 5 | { 6 | 7 | Bloom* bloom; 8 | int kVal = 27; 9 | 10 | void rotate_right_test_moveBitRight(){ 11 | char* testName = (char*)"rotate_right_test_move1BitRight"; 12 | bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10 13 | uint64_t hash = (uint64_t)(1 << 7); 14 | 15 | uint64_t rotated = bloom->rotate_right(hash, 7); 16 | 17 | if(rotated != 1){ 18 | fail(testName); 19 | } 20 | succeed(testName); 21 | } 22 | 23 | 24 | void rotate_right_test_wrapBit(){ 25 | char* testName = (char*)"rotate_right_test_wrapBit"; 26 | bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10 27 | uint64_t hash = (uint64_t)1; 28 | 29 | uint64_t rotated = bloom->rotate_right(hash, 7); 30 | 31 | if(rotated != (1 << 3)){ 32 | fail(testName); 33 | } 34 | succeed(testName); 35 | } 36 | 37 | 38 | 39 | void rotate_right_test_noOverFlow(){ 40 | char* testName = (char*)"rotate_right_test_noOverFlow"; 41 | bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10 42 | uint64_t hash = (uint64_t)1000; 43 | 44 | uint64_t rotated = bloom->rotate_right(hash, 7); 45 | 46 | if(rotated > 1024){ 47 | fail(testName); 48 | } 49 | 
succeed(testName); 50 | } 51 | 52 | 53 | void rotate_left_test_moveBitLeft(){ 54 | char* testName = (char*)"rotate_Left_test_move1BitLeft"; 55 | bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10 56 | uint64_t hash = (uint64_t)1; 57 | 58 | uint64_t rotated = bloom->rotate_left(hash, 7); 59 | 60 | if(rotated != (1 << 7)){ 61 | fail(testName); 62 | } 63 | succeed(testName); 64 | } 65 | 66 | void rotate_left_test_wrapBit(){ 67 | char* testName = (char*)"rotate_left_test_wrapBit"; 68 | bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10 69 | uint64_t hash = (uint64_t)(1 << 5); 70 | 71 | uint64_t rotated = bloom->rotate_left(hash, 7); 72 | 73 | if(rotated != (1 << 2)){ 74 | fail(testName); 75 | } 76 | succeed(testName); 77 | } 78 | 79 | 80 | void rotate_left_test_noOverFlow(){ 81 | char* testName = (char*)"rotate_left_test_noOverFlow"; 82 | bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10 83 | uint64_t hash = (uint64_t)1000; 84 | 85 | uint64_t rotated = bloom->rotate_left(hash, 7); 86 | 87 | if(rotated > 1024){ 88 | fail(testName); 89 | } 90 | succeed(testName); 91 | } 92 | 93 | void roll_hash_hash_func0_bigbloom_checkSame(){ 94 | bloom = new Bloom((uint64_t)1000,kVal); 95 | char* testName = (char*)"roll_hash_hash_func0_bigbloom_checkSame"; 96 | char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA"; 97 | kmer_type firstKmer, secondKmer, kmer; 98 | 99 | getFirstKmerFromRead(&firstKmer, &(kmerSeq[0])); 100 | getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer])); 101 | 102 | uint64_t rolledHash = bloom->get_rolling_hash(firstKmer, 0); 103 | uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 0); 104 | 105 | for(int i = 0; i < sizeKmer; i++){ 106 | rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),0); 107 | } 108 | 109 | if(rolledHash != calculatedHash){ 110 | fail(testName); 111 | return; 112 | } 113 | 
succeed(testName); 114 | } 115 | 116 | void roll_hash_hash_func1_bigbloom_checkSame(){ 117 | bloom = new Bloom((uint64_t)10000, kVal); 118 | char* testName = (char*)"roll_hash_hash_func1_bigbloom_checkSame"; 119 | char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA"; 120 | kmer_type firstKmer, secondKmer, kmer; 121 | 122 | getFirstKmerFromRead(&firstKmer, &(kmerSeq[0])); 123 | getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer])); 124 | 125 | uint64_t rolledHash = bloom->get_rolling_hash(firstKmer, 1); 126 | uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 1); 127 | 128 | 129 | for(int i = 0; i < sizeKmer; i++){ 130 | rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),1); 131 | } 132 | 133 | if(rolledHash != calculatedHash){ 134 | fail(testName); 135 | return; 136 | } 137 | succeed(testName); 138 | } 139 | 140 | void roll_hash_hash_func0_smallbloom_checkSame(){ 141 | bloom = new Bloom((uint64_t)100000, kVal); 142 | char* testName = (char*)"roll_hash_hash_func0_smallbloom_checkSame"; 143 | char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA"; 144 | kmer_type firstKmer, secondKmer, kmer; 145 | 146 | getFirstKmerFromRead(&firstKmer, &(kmerSeq[0])); 147 | getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer])); 148 | 149 | uint64_t rolledHash = bloom->get_rolling_hash(firstKmer, 0); 150 | uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 0); 151 | 152 | for(int i = 0; i < sizeKmer; i++){ 153 | rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),0); 154 | } 155 | 156 | if(rolledHash != calculatedHash){ 157 | fail(testName); 158 | return; 159 | } 160 | succeed(testName); 161 | } 162 | 163 | void roll_hash_hash_func1_smallbloom_checkSame(){ 164 | bloom = new Bloom((uint64_t)1000000, kVal); 165 | char* testName = 
(char*)"roll_hash_hash_func1_smallbloom_checkSame"; 166 | char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA"; 167 | kmer_type firstKmer, secondKmer, kmer; 168 | 169 | getFirstKmerFromRead(&firstKmer, &(kmerSeq[0])); 170 | getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer])); 171 | 172 | uint64_t rolledHash = bloom->get_rolling_hash(firstKmer, 1); 173 | uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 1); 174 | 175 | 176 | for(int i = 0; i < sizeKmer; i++){ 177 | rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),1); 178 | } 179 | 180 | if(rolledHash != calculatedHash){ 181 | fail(testName); 182 | return; 183 | } 184 | succeed(testName); 185 | } 186 | 187 | void advance_hash_test_checkSame(){ 188 | bloom = new Bloom((uint64_t)10000, kVal); 189 | char* testName = (char*) "advance_hash_test_checkSame"; 190 | char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA"; 191 | kmer_type firstKmer, secondKmer; 192 | 193 | getFirstKmerFromRead(&firstKmer, &(kmerSeq[0])); 194 | getFirstKmerFromRead(&secondKmer, &(kmerSeq[kVal])); 195 | 196 | uint64_t advancedHash0 = bloom->get_rolling_hash(firstKmer, 0); 197 | uint64_t advancedHash1 = bloom->get_rolling_hash(firstKmer, 1); 198 | uint64_t calculatedHash0 = bloom->get_rolling_hash(secondKmer, 0); 199 | uint64_t calculatedHash1 = bloom->get_rolling_hash(secondKmer, 1); 200 | 201 | bloom->advance_hash(&kmerSeq[0], &advancedHash0, &advancedHash1,0,kVal); 202 | 203 | if(advancedHash0 != calculatedHash0){ 204 | fail(testName, (char*)"hash 0 doesn't match."); 205 | return; 206 | } 207 | if(advancedHash1 != calculatedHash1){ 208 | fail(testName, (char*)"hash 1 doesn't match."); 209 | return; 210 | } 211 | succeed(testName); 212 | 213 | } 214 | 215 | 216 | void runRollingHashTests(){ 217 | setSizeKmer(kVal); 218 | 219 | 
rotate_right_test_moveBitRight(); 220 | rotate_right_test_wrapBit(); 221 | rotate_right_test_noOverFlow(); 222 | 223 | rotate_left_test_moveBitLeft(); 224 | rotate_left_test_wrapBit(); 225 | rotate_left_test_noOverFlow(); 226 | 227 | roll_hash_hash_func0_bigbloom_checkSame(); 228 | roll_hash_hash_func1_bigbloom_checkSame(); 229 | roll_hash_hash_func0_smallbloom_checkSame(); 230 | roll_hash_hash_func1_smallbloom_checkSame(); 231 | 232 | advance_hash_test_checkSame(); 233 | } 234 | 235 | } -------------------------------------------------------------------------------- /utils/ContigJuncList.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "ContigJuncList.h" 8 | 9 | using std::stringstream; 10 | 11 | ContigJuncList::ContigJuncList(std::string sequence, junc_list dist, junc_list cov){ 12 | seq = sequence; 13 | distances = dist; 14 | coverages = cov; 15 | } 16 | 17 | ContigJuncList::ContigJuncList(){ 18 | distances.clear(); 19 | coverages.clear(); 20 | seq = ""; 21 | } 22 | 23 | int ContigJuncList::size(){ 24 | return coverages.size(); 25 | } 26 | 27 | void ContigJuncList::printJuncValues(){ 28 | for(auto itC = coverages.begin(), itD = distances.begin(); itC != coverages.end(); itC++, itD++){ 29 | std::cout << (unsigned int)*itD << ", " << (unsigned int)*itC <<" ; "; 30 | } 31 | std::cout << "\n"; 32 | } 33 | 34 | void ContigJuncList::printJuncResults(std::list results){ 35 | for(auto it = results.begin(); it != results.end(); ++it){ 36 | std::cout << print_kmer(it->kmer) << " " << it->distance << " " 37 | << it->coverage << " , "; 38 | } 39 | std::cout << "\n"; 40 | } 41 | 42 | void ContigJuncList::printJuncResults(int side, int startDist, int maxDist){ 43 | printJuncResults(getJuncResults(side, startDist, maxDist)); 44 | } 45 | 46 | int ContigJuncList::length(){ 47 | return seq.length(); 48 | } 49 | 50 | bool ContigJuncList::isValidKmerPosition(int pos){ 51 | 
return pos >= 0 && pos <= (getSeq().length() - sizeKmer + 1)*2; 52 | } 53 | 54 | //0 is the first backward kmer, 1 is the first forward kmer, 55 | //2 is the second backward kmer, etc. 56 | kmer_type ContigJuncList::getKmer(int pos){ 57 | if(pos % 2 == 1){ 58 | return getKmerFromRead(getSeq(), pos/2); 59 | } 60 | else{ 61 | return revcomp(getKmerFromRead(getSeq(), pos/2)); 62 | } 63 | } 64 | 65 | //Gets a list of JuncResults, specifying distance, coverage, and kmer for juncs 66 | //If startForward, assumes first junc faces forward (in towards contig) 67 | //If !startForward, assumes first junc faces backward (away from contig) 68 | //Always assumes no reverse needed- contig can reverse it if necessary before calling 69 | std::list ContigJuncList::getJuncResults(bool startForward, int startDist, int maxDist){ 70 | std::list results = {}; 71 | int startPos = 0; 72 | if(startForward){ 73 | results.push_back(JuncResult(getKmer(3), 2+startDist,coverages.front())); 74 | startPos = 1; 75 | } 76 | int pos = startPos; 77 | for(auto itD = distances.begin(), itC = ++coverages.begin(); 78 | itD != distances.end(); 79 | itD++, itC++){ 80 | pos += *itD; 81 | //real extension is 2 in front if the junc faces forward, or 2 behind if it faces backward 82 | int offset = 2; 83 | if(pos % 2 == 0){ 84 | offset = -2; 85 | } 86 | if(pos + offset - startPos + startDist <= maxDist){ 87 | if(isValidKmerPosition(pos+offset)){ 88 | results.push_back(JuncResult(getKmer(pos + offset), pos + offset-startPos + startDist, *itC)); 89 | } 90 | } 91 | else { break; } 92 | 93 | } 94 | return results; 95 | } 96 | 97 | //Used for reversing a contig. 
Simply reverses both lists 98 | void ContigJuncList::reverse(){ 99 | std::reverse(coverages.begin(), coverages.end()); 100 | std::reverse(distances.begin(), distances.end()); 101 | // std::cout << seq << ", " << revcomp_string(seq) << std::endl; 102 | seq = revcomp_string(seq); 103 | } 104 | 105 | //Concatenates this list of juncs with another 106 | //Removes overlap of middle coverage and middle distance 107 | ContigJuncList ContigJuncList::concatenate(ContigJuncList otherList){ 108 | 109 | junc_list newCov(coverages); 110 | newCov.pop_back(); 111 | 112 | unsigned char lastCov = coverages.back(); 113 | unsigned char firstCov = otherList.coverages.front(); 114 | 115 | newCov.push_back((unsigned char) std::min((int)lastCov, (int)firstCov)); 116 | 117 | newCov.insert(newCov.end(), ++otherList.coverages.begin(), otherList.coverages.end()); 118 | 119 | junc_list newDist(distances); 120 | newDist.insert(newDist.end(), otherList.distances.begin(), otherList.distances.end()); 121 | std::string newSeq = getSeq().substr(0, getSeq().length()-sizeKmer) + otherList.getSeq(); 122 | return ContigJuncList(newSeq, newDist, newCov); 123 | } 124 | 125 | // shifts coverage value up only up to maxDist - the rest are uneffected 126 | ContigJuncList ContigJuncList::getShiftedCoverageContigJuncsRange(double shift, int maxDist, int side){ 127 | junc_list newCov; 128 | for(auto itD = distances.begin(), itC = coverages.begin(); 129 | itC != coverages.end(); ){ 130 | 131 | double val = (double) *itC+shift; 132 | if ((*itD < maxDist && side == 1) || (*itD > seq.length() - maxDist && side == 2 && seq.length() > maxDist)){ 133 | newCov.push_back((int) std::round((val > 255) ? 
255: val)); 134 | }else{ 135 | newCov.push_back(*itC); 136 | } 137 | ++itD; 138 | ++itC; 139 | } 140 | return ContigJuncList(seq,distances,newCov); 141 | } 142 | 143 | ContigJuncList ContigJuncList::getShiftedCoverageContigJuncs(double shift){ 144 | junc_list newCov(coverages); 145 | for (int i = 0; i < newCov.size(); i++){ 146 | double val = (double) newCov.at(i)+shift; 147 | newCov.at(i) = (int) std::round((val > 255) ? 255: val); 148 | } 149 | return ContigJuncList(seq,distances,newCov); 150 | } 151 | 152 | 153 | ContigJuncList ContigJuncList::getScaledContigJuncs(double scale_factor){ 154 | junc_list newCov(coverages); 155 | for (int i = 0; i < newCov.size(); i++){ 156 | newCov.at(i) = (int) std::round(newCov.at(i) * scale_factor); 157 | } 158 | return ContigJuncList(seq,distances,newCov); 159 | } 160 | 161 | //Averages all coverage values in list 162 | double ContigJuncList::getAvgCoverage(){ 163 | double covSum = 0; 164 | if(coverages.size()== 0){ 165 | printf("ERROR: empty junctions list\n"); 166 | return 0; 167 | } 168 | for(auto it = coverages.begin(); it != coverages.end(); ++it){ 169 | covSum += (double) *it; 170 | } 171 | return covSum / coverages.size(); 172 | } 173 | 174 | double ContigJuncList::getAvgCoverage(std::list results){ 175 | if (results.size()==0){return 0;} 176 | double covSum = 0; 177 | for(auto it = results.begin(); it != results.end(); ++it){ 178 | covSum += (double) it->coverage; 179 | } 180 | return covSum / results.size(); 181 | } 182 | 183 | double ContigJuncList::getCoverageSampleVariance(){ 184 | if(coverages.size() < 2){ 185 | // printf("ERROR: 1 or 0 values in junctions list\n"); 186 | return 0; 187 | } 188 | double mean = getAvgCoverage(); 189 | double sum_sqrs = 0; 190 | for(auto it = coverages.begin(); it != coverages.end(); ++it){ 191 | sum_sqrs += pow(mean - (double) *it, 2); 192 | } 193 | return pow(sum_sqrs / (coverages.size()-1), 0.5); 194 | } 195 | 196 | double ContigJuncList::getCoverageSampleVariance(std::list 
results){ 197 | if (results.size() < 2) {return 0;} 198 | double mean = getAvgCoverage(results); 199 | double sum_sqrs = 0; 200 | for(auto it = results.begin(); it != results.end(); ++it){ 201 | sum_sqrs += pow(mean - (double) it->coverage, 2); 202 | } 203 | return pow(sum_sqrs / (results.size()-1), 0.5); 204 | } 205 | 206 | 207 | //Sums all distance values 208 | int ContigJuncList::getTotalDistance(){ 209 | int totalDist = 0; 210 | for(auto it = distances.begin(); it != distances.end(); ++it){ 211 | totalDist += (int) *it; 212 | } 213 | return totalDist; 214 | } 215 | 216 | //Prints distances then coverages to a string 217 | std::string ContigJuncList::getStringRep(){ 218 | stringstream stream; 219 | stream << getSeq() << "\n"; 220 | stream << "Distances: "; 221 | for(auto it = distances.begin(); it != distances.end(); ++it){ 222 | stream << (int)*it << " "; 223 | } 224 | stream << ". Coverages: "; 225 | for(auto it = coverages.begin(); it != coverages.end(); ++it){ 226 | stream << (int)*it << " "; 227 | } 228 | return stream.str(); 229 | } 230 | -------------------------------------------------------------------------------- /src/newTests/JunctionMapTest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "gtest/gtest.h" 4 | #include "../ReadScanner.h" 5 | #include "../../utils/Bloom.h" 6 | #include "../../utils/JunctionMap.h" 7 | #include "../ContigGraph.h" 8 | using std::map; 9 | // #include "../../utils/sparsepp.h" 10 | // using spp::std::unordered_map; 11 | 12 | 13 | class juncMapData : public ::testing::Test { 14 | 15 | protected: 16 | std::vector reads; 17 | std::vector kmers; 18 | 19 | Bloom* bloom; 20 | ReadScanner* scanner; 21 | int j; 22 | int read_length; 23 | int estimated_kmers; 24 | int maxSpacerDist; 25 | double fpRate; 26 | 27 | JChecker* jchecker; 28 | JunctionMap* junctionMap; 29 | Bloom* short_pair_filter; 30 | Bloom* long_pair_filter; 31 | ContigGraph* contigGraph = new 
ContigGraph(); 32 | 33 | // Build a kmer out of a string input 34 | kmer_type getKmerFromString(string kmerString){ 35 | kmer_type kmer; 36 | getFirstKmerFromRead(&kmer, &(kmerString[0])); 37 | return kmer; 38 | } 39 | 40 | // Add the kmers from a vector of strings to a fake bloom filter 41 | std::set addKmers(Bloom* bloom, std::vector kmers) { 42 | std::set valids {}; 43 | for (string kmer : kmers) { 44 | valids.insert(get_canon(getKmerFromString(kmer))); 45 | } 46 | 47 | bloom->addFakeKmers(valids); 48 | valids.clear(); 49 | } 50 | 51 | // Create a bloom filter, but make it a fake one 52 | Bloom* createBloom(){ 53 | Bloom* fakeBloom = fakeBloom->create_bloom_filter_optimal(estimated_kmers, fpRate); 54 | fakeBloom->fakify(); 55 | return fakeBloom; 56 | } 57 | 58 | // This method should be used and modified to print whatever we want ot check about the resulting junction map 59 | void printJunctionMap(ReadScanner scanner) { 60 | auto map = scanner.getJunctionMap()->junctionMap; 61 | printf("Size: %d \n", map.size()); 62 | for (auto& kv : map){ 63 | printf("%s \n", print_kmer(kv.first)); 64 | printf("%d %d %d %d %d \n", 65 | kv.second.dist[0], kv.second.dist[1], kv.second.dist[2], kv.second.dist[3], kv.second.dist[4]); 66 | } 67 | } 68 | 69 | void printContigGraph(ContigGraph* graph){ 70 | ContigIterator* contigIt = new ContigIterator(graph); 71 | //prints contigs that are adjacent to nodes 72 | while(contigIt->hasNextContig()){ 73 | Contig* contig = contigIt->getContig(); 74 | graph->printContigFastG(&std::cout, contig); 75 | } 76 | //prints isolated contigs 77 | std::vector * isolated_contigs = graph->getIsolatedContigs(); 78 | for(auto it = isolated_contigs->begin(); it != isolated_contigs->end(); ++it){ 79 | Contig* contig = &*it; 80 | graph->printContigFastG(&std::cout, contig); 81 | } 82 | } 83 | 84 | // set up blooms, junction map, jchecker, readscanner for testing 85 | juncMapData() { 86 | read_length = 30; 87 | estimated_kmers = 35; 88 | maxSpacerDist = 8; 
89 | fpRate = .1; 90 | kmers = {}; 91 | reads = {}; 92 | 93 | bloom = createBloom(); 94 | jchecker = new JChecker(j, bloom); 95 | 96 | junctionMap = new JunctionMap(bloom, jchecker, read_length); 97 | string read_scan_file = "mock_file"; 98 | 99 | short_pair_filter = short_pair_filter->create_bloom_filter_optimal(estimated_kmers/9, fpRate); 100 | long_pair_filter = long_pair_filter->create_bloom_filter_optimal(estimated_kmers/6, fpRate); 101 | 102 | scanner = new ReadScanner(junctionMap, read_scan_file, bloom, short_pair_filter, long_pair_filter, jchecker, maxSpacerDist); 103 | printf("Done initializing!\n"); 104 | contigGraph = new ContigGraph(); 105 | } 106 | ~juncMapData(){ 107 | reads.clear(); 108 | kmers.clear(); 109 | delete jchecker; 110 | delete short_pair_filter; 111 | delete long_pair_filter; 112 | delete bloom; 113 | delete junctionMap; 114 | delete scanner; 115 | delete contigGraph; 116 | } 117 | }; 118 | 119 | 120 | // build junction map of three reads 121 | TEST_F(juncMapData, buildBranchingPaths) { 122 | setSizeKmer(5); 123 | j = 1; 124 | reads = {"ACGGGCGAACTTTCATAGGA", "GGCGAACTAGTCCAT", "AACTTTCATACGATT"}; 125 | kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT","ACTTT", 126 | "CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA","GGCGA", "GCGAA", "CGAAC", 127 | "GAACT", "AACTA","ACTAG", "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT","AACTT", 128 | "ACTTT", "CTTTC", "TTTCA", "TTCAT", "TCATA", "CATAC", "ATACG", "TACGA", "ACGAT","CGATT"}; 129 | addKmers(bloom, kmers); 130 | 131 | scanner->scanInputRead(reads[0], true); 132 | scanner->scanInputRead(reads[1], true); 133 | scanner->scanInputRead(reads[2], true); 134 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 135 | 136 | // Expected junctions & distances before changes 137 | // CTAGT 138 | // 0 8 3 0 3 139 | // TCATA 140 | // 0 10 0 6 12 141 | // GAACT 142 | // 3 0 12 0 13 143 | std::cout << "map before changes\n"; 144 | printJunctionMap(*scanner); 145 | 
junctionMap->buildBranchingPaths(contigGraph); 146 | std::cout << "built branching paths\n"; 147 | printContigGraph(contigGraph); 148 | 149 | printf("Destroying complex junctions.\n"); 150 | junctionMap->destroyComplexJunctions(); 151 | std::cout << "map before changes\n"; 152 | printJunctionMap(*scanner); 153 | 154 | printf("Building linear regions.\n"); 155 | junctionMap->buildLinearRegions(contigGraph); 156 | printContigGraph(contigGraph); 157 | 158 | printf("Checking graph.\n"); 159 | contigGraph->checkGraph(); 160 | } 161 | 162 | TEST_F(juncMapData, smallDoubleJuncMap) { 163 | setSizeKmer(7); 164 | j = 0; 165 | 166 | reads = {"AAAAACAGCGATTC", "AAAAAGAGCGATTTA"}; 167 | kmers = {"AAAAACA", "AAAAAGA", "AAAACAG", "AAAAGAG", "AAACAGC", "AAAGAGC", 168 | "AACAGCG", "AAGAGCG", "ACAGCGA","AGAGCGA","CAGCGAT", "GAGCGAT", "AGCGATT", 169 | "GCGATTT" ,"GCGATTC", "CGATTTA"}; 170 | addKmers(bloom, kmers); 171 | 172 | scanner->scanInputRead(reads[0], true); 173 | scanner->scanInputRead(reads[1], true); 174 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 175 | 176 | printJunctionMap(*scanner); 177 | 178 | std::cout << "map before changes\n"; 179 | printJunctionMap(*scanner); 180 | junctionMap->buildBranchingPaths(contigGraph); 181 | std::cout << "built branching paths\n"; 182 | printContigGraph(contigGraph); 183 | 184 | printf("Destroying complex junctions.\n"); 185 | junctionMap->destroyComplexJunctions(); 186 | std::cout << "map before changes\n"; 187 | printJunctionMap(*scanner); 188 | 189 | printf("Building linear regions.\n"); 190 | junctionMap->buildLinearRegions(contigGraph); 191 | printContigGraph(contigGraph); 192 | 193 | printf("Checking graph.\n"); 194 | contigGraph->checkGraph(); 195 | 196 | } 197 | 198 | TEST_F(juncMapData, endJuncMap) { 199 | setSizeKmer(7); 200 | j = 1; 201 | 202 | reads = {"AAAAAACAGCGATTC", "AAAAAACTAAAAAA"}; // single read, first kmer is junction, should poinnt back one 203 | kmers = {"AAAAAAC", "AAAAACA", "AAAAACT", 
"AAAACAG", "AAACAGC", "AACAGCG", "ACAGCGA","CAGCGAT", "AGCGATT", "GCGATTC", 204 | "AAAACTA", "AAACTAA", "AACTAAA", "ACTAAAA", "CTAAAAA", "CTAAAAAA"}; 205 | addKmers(bloom, kmers); 206 | 207 | scanner->scanInputRead(reads[0], true); 208 | scanner->scanInputRead(reads[1], true); 209 | 210 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 211 | 212 | printJunctionMap(*scanner); 213 | 214 | std::cout << "map before changes\n"; 215 | printJunctionMap(*scanner); 216 | junctionMap->buildBranchingPaths(contigGraph); 217 | std::cout << "built branching paths\n"; 218 | printContigGraph(contigGraph); 219 | 220 | printf("Destroying complex junctions.\n"); 221 | junctionMap->destroyComplexJunctions(); 222 | std::cout << "map before changes\n"; 223 | printJunctionMap(*scanner); 224 | 225 | printf("Building linear regions.\n"); 226 | junctionMap->buildLinearRegions(contigGraph); 227 | printContigGraph(contigGraph); 228 | 229 | printf("Checking graph.\n"); 230 | contigGraph->checkGraph(); 231 | 232 | } 233 | 234 | -------------------------------------------------------------------------------- /utils/LargeInt.cpp: -------------------------------------------------------------------------------- 1 | #ifndef ASSERTS 2 | #define NDEBUG // disable asserts; those asserts make sure that with PRECISION == [1 or 2], all is correct 3 | #endif 4 | 5 | // some 64-bit assert macros 6 | #if defined(_LP64) && defined(_largeint) 7 | #define assert128(x) assert(precision != 2 || (x)); 8 | #else 9 | #define assert128(x) ; 10 | #endif 11 | 12 | #include 13 | #include 14 | #include "LargeInt.h" 15 | 16 | using namespace std; 17 | 18 | template 19 | LargeInt::LargeInt() 20 | { 21 | } 22 | 23 | template 24 | LargeInt::LargeInt(const uint64_t &c) 25 | { 26 | array[0] = c; 27 | for (int i = 1; i < precision; i++) 28 | array[i] = 0; 29 | } 30 | 31 | 32 | template 33 | LargeInt LargeInt::operator+ (const LargeInt& other) const 34 | { 35 | LargeInt result; 36 | int carry = 0; 37 | for (int 
i = 0 ; i < precision ; i++) 38 | { 39 | result.array[i] = array[i] + other.array[i] + carry; 40 | carry = (result.array[i] < array[i]) ? 1 : 0; 41 | } 42 | 43 | assert(precision != 1 || (result == other.array[0] + array[0])); 44 | assert128(result.toInt128() == other.toInt128() + toInt128()); 45 | return result; 46 | } 47 | 48 | template 49 | LargeInt LargeInt::operator- (const LargeInt& other) const 50 | { 51 | LargeInt result; 52 | int carry = 0; 53 | for (int i = 0 ; i < precision ; i++) 54 | { 55 | result.array[i] = array[i] - other.array[i] - carry; 56 | carry = (result.array[i] > array[i]) ? 1 : 0; 57 | } 58 | 59 | assert(precision != 1 || (result == array[0] - other.array[0])); 60 | assert128(result.toInt128() == toInt128() - other.toInt128()); 61 | return result; 62 | } 63 | 64 | 65 | template 66 | LargeInt LargeInt::operator* (const int& coeff) const 67 | { 68 | LargeInt result (*this); 69 | // minia doesn't have that many multiplications cases 70 | 71 | if (coeff == 2 || coeff == 4) 72 | { 73 | result = result << (coeff / 2); 74 | } 75 | else 76 | { 77 | if (coeff == 21) 78 | { 79 | result = (result << 4) + (result << 2) + result; 80 | } 81 | else 82 | { 83 | printf("unsupported LargeInt multiplication: %d\n",coeff); 84 | exit(1); 85 | } 86 | } 87 | 88 | assert(precision != 1 || (result == array[0] * coeff)); 89 | assert128(result.toInt128() == toInt128() * coeff); 90 | return result; 91 | } 92 | 93 | 94 | template 95 | LargeInt LargeInt::operator/ (const uint32_t& divisor) const 96 | { 97 | LargeInt result; 98 | fill( result.array, result.array + precision, 0 ); 99 | 100 | // inspired by Divide32() from http://subversion.assembla.com/svn/pxcode/RakNet/Source/BigInt.cpp 101 | 102 | uint64_t r = 0; 103 | uint32_t mask32bits = ~0; 104 | for (int i = precision-1; i >= 0; --i) 105 | { 106 | for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i] 107 | { 108 | uint64_t n = (r << 32) | ((array[i] >> (32*j)) & mask32bits ); 109 | 
result.array[i] = result.array[i] | (((n / divisor) & mask32bits) << (32*j)); 110 | r = n % divisor; 111 | } 112 | } 113 | assert(precision != 1 || (result == array[0] / divisor)); 114 | assert128(result.toInt128() == toInt128() / divisor); 115 | return result; 116 | } 117 | 118 | 119 | template 120 | uint32_t LargeInt::operator% (const uint32_t& divisor) const 121 | { 122 | uint64_t r = 0; 123 | uint32_t mask32bits = ~0; 124 | for (int i = precision-1; i >= 0; --i) 125 | { 126 | for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i] 127 | { 128 | uint64_t n = (r << 32) | ((array[i] >> (32*j)) & mask32bits ); 129 | r = n % divisor; 130 | } 131 | } 132 | 133 | assert(precision != 1 || (r == array[0] % divisor)); 134 | assert128(r == toInt128() % divisor); 135 | return (uint32_t)r; 136 | } 137 | 138 | template 139 | LargeInt LargeInt::operator^ (const LargeInt& other) const 140 | { 141 | LargeInt result; 142 | for (int i=0 ; i < precision ; i++) 143 | result.array[i] = array[i] ^ other.array[i]; 144 | 145 | assert(precision != 1 || (result == (array[0] ^ other.array[0]))); 146 | assert128(result.toInt128() == (toInt128() ^ other.toInt128())); 147 | return result; 148 | } 149 | 150 | template 151 | LargeInt LargeInt::operator& (const LargeInt& other) const 152 | { 153 | LargeInt result; 154 | for (int i=0 ; i < precision ; i++) 155 | result.array[i] = array[i] & other.array[i]; 156 | 157 | assert(precision != 1 || (result == (array[0] & other.array[0]))); 158 | assert128(result.toInt128() == (toInt128() & other.toInt128())); 159 | return result; 160 | } 161 | 162 | 163 | template 164 | LargeInt LargeInt::operator~ () const 165 | { 166 | LargeInt result; 167 | for (int i=0 ; i < precision ; i++) 168 | result.array[i] = ~array[i]; 169 | 170 | assert(precision != 1 || (result == ~array[0])); 171 | assert128(result.toInt128() == ~toInt128()); 172 | return result; 173 | } 174 | 175 | template 176 | LargeInt LargeInt::operator<< (const int& coeff) 
const 177 | { 178 | LargeInt result (0); 179 | 180 | int large_shift = coeff / 64; 181 | int small_shift = coeff % 64; 182 | 183 | for (int i = large_shift ; i < precision-1; i++) 184 | { 185 | result.array[i] = result.array[i] | (array[i-large_shift] << small_shift); 186 | if (small_shift == 0) // gcc "bug".. uint64_t x; x>>64 == 1<<63, x<<64 == 1 187 | result.array[i+1] = 0; 188 | else 189 | result.array[i+1] = array[i-large_shift] >> (64 - small_shift); 190 | 191 | } 192 | result.array[precision-1] = result.array[precision-1] | (array[precision-1-large_shift] << small_shift); 193 | 194 | assert(precision != 1 || (result == (array[0] << coeff))); 195 | assert128(result.toInt128() == (toInt128() << coeff)); 196 | return result; 197 | } 198 | 199 | template 200 | LargeInt LargeInt::operator>> (const int& coeff) const 201 | { 202 | LargeInt result (0); 203 | 204 | int large_shift = coeff / 64; 205 | int small_shift = coeff % 64; 206 | 207 | result.array[0] = (array[large_shift] >> small_shift); 208 | 209 | for (int i = 1 ; i < precision - large_shift ; i++) 210 | { 211 | result.array[i] = (array[i+large_shift] >> small_shift); 212 | if (small_shift == 0 && large_shift > 0) // gcc "bug".. 
uint64_t x; x>>64 == 1<<63, x<<64 == 1 213 | { 214 | result.array[i-1] = result.array[i-1]; 215 | } 216 | else 217 | { 218 | result.array[i-1] = result.array[i-1] | (array[i+large_shift] << (64 - small_shift)); 219 | } 220 | } 221 | 222 | assert(precision != 1 || ( small_shift == 0 || (result == array[0] >> coeff))); 223 | assert128(small_shift == 0 || (result.toInt128() == (toInt128() >> coeff))); 224 | return result; 225 | } 226 | 227 | template 228 | bool LargeInt::operator!= (const LargeInt& c) const 229 | { 230 | for (int i = 0 ; i < precision ; i++) 231 | if( array[i] != c.array[i] ) 232 | return true; 233 | return false; 234 | } 235 | 236 | template 237 | bool LargeInt::operator== (const LargeInt& c) const 238 | { 239 | for (int i = 0 ; i < precision ; i++) 240 | if( array[i] != c.array[i] ) 241 | return false; 242 | return true; 243 | } 244 | 245 | template 246 | bool LargeInt::operator< (const LargeInt& c) const 247 | { 248 | for (int i = precision-1 ; i>=0 ; --i) 249 | if( array[i] != c.array[i] ) 250 | return array[i] < c.array[i]; 251 | 252 | return false; 253 | } 254 | 255 | template 256 | bool LargeInt::operator<=(const LargeInt& c) const 257 | { 258 | return operator==(c) || operator<(c); 259 | } 260 | 261 | template 262 | uint64_t LargeInt::toInt() const 263 | { 264 | return array[0]; 265 | } 266 | 267 | #ifdef _LP64 268 | template 269 | __uint128_t LargeInt::toInt128() const 270 | { 271 | return ((__uint128_t)array[0]) + (((__uint128_t)array[1]) << ((__uint128_t)64)); 272 | } 273 | #endif 274 | 275 | #ifdef KMER_PRECISION 276 | template class LargeInt; // since we didn't define the functions in a .h file, that trick removes linker errors, see http://www.parashift.com/c++-faq-lite/separate-template-class-defn-from-decl.html 277 | #endif 278 | -------------------------------------------------------------------------------- /utils/Bloom.h: -------------------------------------------------------------------------------- 1 | // 2 | // Bloom.h 3 | // 4 | 
// Created by Guillaume Rizk on 9/02/12. 5 | // 6 | // Modified by Gil Goldshlager 7/15 7 | 8 | #ifndef Bloom_h 9 | #define Bloom_h 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "Kmer.h" 16 | #include "ReadKmer.h" 17 | #include "JuncPairs.h" 18 | #include 19 | #include 20 | #include 21 | 22 | 23 | // not using kmer_type from Kmer.h because I don't want this class to depend on Kmer.h 24 | #ifdef _largeint 25 | #include "LargeInt.h" 26 | typedef LargeInt bloom_elem; 27 | #else 28 | #ifdef _ttmath 29 | #include "ttmath/ttmath.h" 30 | typedef ttmath::UInt bloom_elem; 31 | #else 32 | #if (! defined kmer_type) || (! defined _LP64) 33 | typedef uint64_t bloom_elem; 34 | #else 35 | typedef kmer_type bloom_elem; 36 | #endif 37 | #endif 38 | #endif 39 | 40 | #define NSEEDSBLOOM 10 41 | #define CUSTOMSIZE 1 42 | 43 | static const int bits_per_char = 0x08; // 8 bits in 1 char(unsigned) 44 | static const unsigned char bit_mask[bits_per_char] = { 45 | 0x01, //00000001 46 | 0x02, //00000010 47 | 0x04, //00000100 48 | 0x08, //00001000 49 | 0x10, //00010000 50 | 0x20, //00100000 51 | 0x40, //01000000 52 | 0x80 //10000000 53 | }; 54 | 55 | 56 | static const uint64_t rbase[NSEEDSBLOOM] = 57 | { 58 | 0xAAAAAAAA55555555ULL, 59 | 0x33333333CCCCCCCCULL, 60 | 0x6666666699999999ULL, 61 | 0xB5B5B5B54B4B4B4BULL, 62 | 0xAA55AA5555335533ULL, 63 | 0x33CC33CCCC66CC66ULL, 64 | 0x6699669999B599B5ULL, 65 | 0xB54BB54B4BAA4BAAULL, 66 | 0xAA33AA3355CC55CCULL, 67 | 0x33663366CC99CC99ULL 68 | }; 69 | 70 | 71 | class Bloom{ 72 | 73 | protected: 74 | 75 | #ifdef _largeint 76 | inline uint64_t hash_func(LargeInt elem, int num_hash); 77 | #endif 78 | #ifdef _ttmath 79 | inline uint64_t hash_func(ttmath::UInt elem, int num_hash); 80 | #endif 81 | #ifdef _LP64 82 | inline uint64_t hash_func(__uint128_t key, int num_hash); 83 | #endif 84 | inline uint64_t hash_func(uint64_t key, int num_hash); 85 | inline void generate_hash_seed(); 86 | uint64_t user_seed; 87 | uint64_t 
seed_tab[NSEEDSBLOOM]; 88 | uint64_t char_hash[2][4]; 89 | 90 | uint64_t getCharHash(int key, int num_hash); 91 | uint64_t getLastCharHash(uint64_t key, int num_hash); 92 | 93 | int n_hash_func; 94 | uint64_t nchar; 95 | int k; 96 | 97 | //only relevant for a fake bloom 98 | bool fake; 99 | 100 | int hashSize; 101 | uint64_t bloomMask; 102 | std::set valid_set; 103 | std::set valid_hash0; 104 | std::set valid_hash1; 105 | 106 | public: 107 | int getNumHash(); 108 | int getHashSize(); 109 | uint64_t getBloomMask(); 110 | 111 | unsigned char * blooma; 112 | 113 | 114 | /********************************************************************************** 115 | These are the important things that are currently being used. 116 | ***********************************************************************************/ 117 | 118 | float weight(); //returns the proportion of 1's in the filter. So should be between 0.0 and 1.0 119 | 120 | 121 | Bloom* create_bloom_filter_2_hash(uint64_t estimated_items, float fpRate); //creates for two hash functions and given fpRate 122 | 123 | Bloom* create_bloom_filter_optimal(uint64_t estimated_items, float fpRate); //creates for smallest size given the fpRate 124 | 125 | //loads all the kmers in the reads file into the bloom filter. 126 | //Input is assumed to be a raw string for each read, one per line. 127 | void load_from_reads(const char* reads_filename); 128 | 129 | //loads all the kmers in the kmers file into the bloom filter. 130 | //Input is assumed to be one kmer per line as a string. 131 | void load_from_kmers(const char* kmers_filename); 132 | 133 | //The hash function minia used and we are now using. 
134 | inline uint64_t oldHash(uint64_t key, int num_hash){ 135 | uint64_t hash = seed_tab[num_hash]; 136 | hash ^= (hash << 7) ^ key * (hash >> 3) ^ (~((hash << 11) + (key ^ (hash >> 5)))); 137 | hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1; 138 | hash = hash ^ (hash >> 24); 139 | hash = (hash + (hash << 3)) + (hash << 8); // hash * 265 140 | hash = hash ^ (hash >> 14); 141 | hash = (hash + (hash << 2)) + (hash << 4); // hash * 21 142 | hash = hash ^ (hash >> 28); 143 | hash = hash + (hash << 31); 144 | return hash &= bloomMask; 145 | } 146 | 147 | void addPair(JuncPair pair); 148 | int containsPair(JuncPair pair); 149 | 150 | //Add an element using the old hash function 151 | inline int oldAdd(bloom_elem elem) 152 | { 153 | uint64_t hA,hB; 154 | 155 | hA = oldHash(elem, 0); 156 | hB = oldHash(elem, 1); 157 | 158 | add(hA, hB); 159 | } 160 | 161 | //Check whether an element is contained using the old hash function 162 | inline int oldContains(bloom_elem elem) 163 | { 164 | if(fake){ 165 | return (valid_set.find(elem) != valid_set.end()); 166 | } 167 | uint64_t hA,hB; 168 | 169 | hA = oldHash(elem, 0); 170 | hB = oldHash(elem, 1); 171 | 172 | return contains(hA, hB); 173 | } 174 | 175 | 176 | /********************************************************************************** 177 | Most of the below is not currently used. Much of it is for incremental hashing. 178 | ***********************************************************************************/ 179 | 180 | //rotates hash to the right by dist. Assume 0 < dist < hashSize 181 | inline uint64_t rotate_right(uint64_t hash, int dist){ 182 | dist %= hashSize; 183 | return ((hash >> dist) | (hash << (hashSize - dist))) & bloomMask; 184 | } 185 | 186 | //rotates hash to the right by dist. 
Assume 0 < dist < hashSize 187 | inline uint64_t rotate_left(uint64_t hash, int dist){ 188 | dist %= hashSize; 189 | return ((hash << dist) | (hash >> (hashSize - dist))) & bloomMask; 190 | } 191 | 192 | //only for num_hash = 0 or 1 193 | uint64_t get_rolling_hash(uint64_t key, int num_hash); 194 | 195 | inline uint64_t roll_hash(uint64_t oldHash, int oldC, int newC, int num_hash){ 196 | return rotate_left(oldHash ^ getCharHash(oldC, num_hash), 1) ^ rotate_right(getLastCharHash(newC, num_hash), k-1); 197 | } 198 | 199 | inline void advance_hash(char* read, uint64_t * hash0, uint64_t * hash1, int startPos, int endPos){ 200 | for(int i = startPos; i < endPos; i++){ 201 | *hash0 = roll_hash(*hash0, NT2int(read[i]), NT2int(read[i+sizeKmer]), 0); 202 | *hash1 = roll_hash(*hash1, NT2int(read[i]), NT2int(read[i+sizeKmer]), 1); 203 | } 204 | } 205 | 206 | inline void add(bloom_elem elem) 207 | { 208 | uint64_t hA,hB; 209 | 210 | hA = get_rolling_hash(elem, 0); 211 | hB = get_rolling_hash(elem, 1); 212 | 213 | add(hA, hB); 214 | } 215 | 216 | 217 | inline void add(uint64_t h0, uint64_t h1) 218 | { 219 | uint64_t h = h0; 220 | for(int i=0; i> 3] |= bit_mask[h & 7]; 225 | } 226 | } 227 | 228 | 229 | inline int contains(bloom_elem elem) 230 | { 231 | if(fake){ 232 | return (valid_set.find(elem) != valid_set.end()); 233 | } 234 | uint64_t hA,hB; 235 | 236 | hA = get_rolling_hash(elem, 0); 237 | hB = get_rolling_hash(elem, 1); 238 | 239 | return contains(hA, hB); 240 | } 241 | 242 | inline int contains(uint64_t h0, uint64_t h1) 243 | { 244 | if(fake){ 245 | return (valid_hash0.find(h0) != valid_hash0.end()) 246 | && (valid_hash1.find(h1) != valid_hash1.end()); 247 | } 248 | uint64_t h = h0 % tai; 249 | for(int i=0; i> 3 ] & bit_mask[h & 7]) != bit_mask[h & 7]){ 253 | return 0; 254 | } 255 | } 256 | return 1; 257 | 258 | } 259 | 260 | 261 | /********************************************************************************** 262 | This is either very basic functions or for testing. 
263 | ***********************************************************************************/ 264 | 265 | //makes this a fake bloom filter that returns true only on specified kmers 266 | void fakify(); 267 | // Add a set of kmers to the fake bloom's list, so that they will return true 268 | void addFakeKmers(std::set valid_kmers); 269 | 270 | void setSeed(uint64_t seed) ; 271 | 272 | void set_number_of_hash_func(int i) ; 273 | 274 | /*void add(bloom_elem elem); 275 | int contains(bloom_elem elem); 276 | void add(uint64_t hash0, uint64_t hash1); 277 | int contains(uint64_t hash0, uint64_t hash1);*/ 278 | 279 | uint64_t tai; 280 | uint64_t nb_elem; 281 | 282 | void dump(char * filename); 283 | void load(char * filename); 284 | 285 | Bloom(uint64_t tai_bloom, int k); 286 | Bloom(int tai_bloom); 287 | Bloom(uint64_t tai_bloom); 288 | 289 | Bloom(); 290 | 291 | ~Bloom(); 292 | }; 293 | 294 | void load_two_filters(Bloom* bloo1, Bloom* bloo2, std::string reads_filename, bool fastq, bool mercy); //if fastq, use fastq. 
Else use fasta 295 | void load_single_filter(Bloom* bloo1, string reads_filename, bool fastq); 296 | double brents_fun(std::function f, double lower, double upper, double tol, unsigned int max_iter); 297 | bool isJunction(ReadKmer readKmer, Bloom* bloom, bool dir); 298 | 299 | #endif 300 | 301 | -------------------------------------------------------------------------------- /src/newTests/ReadscanTest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "gtest/gtest.h" 4 | #include "../ReadScanner.h" 5 | #include "../../utils/Bloom.h" 6 | #include "../../utils/JunctionMap.h" 7 | using std::unordered_map; 8 | // #include "../../utils/sparsepp.h" 9 | // using spp::std::unordered_map; 10 | 11 | 12 | class readScan : public ::testing::Test { 13 | 14 | protected: 15 | std::vector reads; 16 | std::vector kmers; 17 | 18 | Bloom* bloom; 19 | ReadScanner* scanner; 20 | int j; 21 | int read_length; 22 | int estimated_kmers; 23 | int maxSpacerDist; 24 | double fpRate; 25 | 26 | JChecker* jchecker; 27 | JunctionMap* junctionMap; 28 | Bloom* short_pair_filter; 29 | Bloom* long_pair_filter; 30 | 31 | // Build a kmer out of a string input 32 | kmer_type getKmerFromString(string kmerString){ 33 | kmer_type kmer; 34 | getFirstKmerFromRead(&kmer, &(kmerString[0])); 35 | return kmer; 36 | } 37 | 38 | // Add the kmers from a vector of strings to a fake bloom filter 39 | std::set addKmers(Bloom* bloom, std::vector kmers) { 40 | std::set valids {}; 41 | for (string kmer : kmers) { 42 | valids.insert(get_canon(getKmerFromString(kmer))); 43 | } 44 | 45 | bloom->addFakeKmers(valids); 46 | valids.clear(); 47 | } 48 | 49 | // Create a bloom filter, but make it a fake one 50 | Bloom* createBloom(){ 51 | Bloom* fakeBloom = fakeBloom->create_bloom_filter_optimal(estimated_kmers, fpRate); 52 | fakeBloom->fakify(); 53 | return fakeBloom; 54 | } 55 | 56 | // This method should be used and modified to print whatever we want ot 
check about the resulting junction map 57 | void printJunctionMap(ReadScanner scanner) { 58 | auto map = scanner.getJunctionMap()->junctionMap; 59 | printf("Size: %d \n", map.size()); 60 | for (auto& kv : map){ 61 | printf("%s \n", print_kmer(kv.first)); 62 | printf("%d %d %d %d %d \n", 63 | kv.second.dist[0], kv.second.dist[1], kv.second.dist[2], kv.second.dist[3], kv.second.dist[4]); 64 | } 65 | } 66 | 67 | // set up blooms, junction map, jchecker, readscanner for testing 68 | readScan() { 69 | j = 0; 70 | read_length = 30; 71 | estimated_kmers = 35; 72 | maxSpacerDist = 8; 73 | fpRate = .1; 74 | kmers = {}; 75 | reads = {}; 76 | 77 | bloom = createBloom(); 78 | jchecker = new JChecker(j, bloom); 79 | 80 | junctionMap = new JunctionMap(bloom, jchecker, read_length); 81 | string read_scan_file = "mock_file"; 82 | 83 | short_pair_filter = short_pair_filter->create_bloom_filter_optimal(estimated_kmers/9, fpRate); 84 | long_pair_filter = long_pair_filter->create_bloom_filter_optimal(estimated_kmers/6, fpRate); 85 | 86 | scanner = new ReadScanner(junctionMap, read_scan_file, bloom, short_pair_filter, long_pair_filter, jchecker, maxSpacerDist); 87 | printf("Done initializing!\n"); 88 | } 89 | ~readScan(){ 90 | reads.clear(); 91 | kmers.clear(); 92 | delete jchecker; 93 | delete short_pair_filter; 94 | delete long_pair_filter; 95 | delete bloom; 96 | delete junctionMap; 97 | delete scanner; 98 | } 99 | }; 100 | 101 | // This test adds one read, and adds the reads kmers to the bloom filter, scans and prints the junction map 102 | TEST_F(readScan, singleReadNoJunctions) { 103 | setSizeKmer(5); 104 | 105 | reads = {"ACGGGCGAACTTTCATAGGA"}; 106 | kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT", 107 | "ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA"}; 108 | 109 | addKmers(bloom, kmers); 110 | 111 | scanner->scanInputRead(reads[0], true); 112 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 113 | // Expected junctions 
& distances 114 | // TCCTA 115 | // 0 0 15 0 1 116 | // AACTT 117 | // 0 0 15 0 15 118 | //assert junction k-mers in map 119 | ASSERT_EQ(map.count(getKmerFromRead("TCCTA", 0)),1); 120 | ASSERT_EQ(map.count(getKmerFromRead("AACTT", 0)),1); 121 | // only these junction in map 122 | ASSERT_EQ(map.size(),2); 123 | 124 | for (auto& kv : map){ 125 | // assert distances are correct 126 | if (print_kmer(kv.first)=="TCCTA"){ 127 | ASSERT_EQ(kv.second.dist[2],15); 128 | ASSERT_EQ(kv.second.dist[4],1); 129 | } 130 | if (print_kmer(kv.first)=="AACTT"){ 131 | ASSERT_EQ(kv.second.dist[2],15); 132 | ASSERT_EQ(kv.second.dist[4],15); 133 | } 134 | } 135 | } 136 | 137 | TEST_F(readScan, singleReadOneFakeJunction) { 138 | setSizeKmer(5); 139 | 140 | // added k-mers in BF "AACTC", "ACTCC" create fake junction and branch of length 2 141 | reads = {"ACGGGCGAACTTTCATAGGA"}; 142 | kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT","AACTC","ACTCC", 143 | "ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA"}; 144 | 145 | addKmers(bloom, kmers); 146 | 147 | scanner->scanInputRead(reads[0], true); 148 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 149 | // Expected junctions & distances 150 | // CCTAT 151 | // 0 0 0 15 3 152 | // GAACT 153 | // 0 0 15 0 13 154 | //assert junction k-mers in map 155 | ASSERT_EQ(map.count(getKmerFromRead("CCTAT", 0)),1); 156 | ASSERT_EQ(map.count(getKmerFromRead("GAACT", 0)),1); 157 | // only these junction in map 158 | ASSERT_EQ(map.size(),2); 159 | for (auto& kv : map){ 160 | // assert distances are correct 161 | if (print_kmer(kv.first)=="CCTAT"){ 162 | ASSERT_EQ(kv.second.dist[3],15); 163 | ASSERT_EQ(kv.second.dist[4],3); 164 | } 165 | if (print_kmer(kv.first)=="GAACT"){ 166 | ASSERT_EQ(kv.second.dist[2],15); 167 | ASSERT_EQ(kv.second.dist[4],13); 168 | } 169 | } 170 | } 171 | 172 | // Long read, no junctions 173 | TEST_F(readScan, LongReadNoJunctions) { 174 | setSizeKmer(5); 175 | reads = 
{"ACGGGCGAACTTTCATAGGATCGCACTCAC"}; 176 | kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT", 177 | "ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", 178 | "AGGAT", "GGATC", "GATCG", "ATCGC", "TCGCA", "CGCAC", "GCACT", "GCACT", 179 | "CACTC", "ACTCA", "CTCAC"}; 180 | 181 | addKmers(bloom, kmers); 182 | 183 | scanner->scanInputRead(reads[0], true); 184 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 185 | // Expected junctions & distances 186 | // ATCGC 187 | // 1 0 0 0 3 188 | // TGCGA 189 | // 0 0 1 0 11 190 | // CGATC 191 | // 0 1 0 0 3 192 | // GGATC 193 | // 0 0 0 1 12 194 | // TTCAT 195 | // 12 0 0 0 15 196 | // TTCGC 197 | // 0 1 0 0 15 198 | // GGCGA 199 | // 1 0 0 0 7 200 | 201 | //assert some of junction k-mers in map 202 | ASSERT_EQ(map.count(getKmerFromRead("ATCGC", 0)),1); 203 | ASSERT_EQ(map.count(getKmerFromRead("GGATC", 0)),1); 204 | // only map is correct size 205 | ASSERT_EQ(map.size(),7); 206 | for (auto& kv : map){ 207 | // assert distances are correct 208 | if (print_kmer(kv.first)=="GGATC"){ 209 | ASSERT_EQ(kv.second.dist[3],1); 210 | ASSERT_EQ(kv.second.dist[4],12); 211 | } 212 | if (print_kmer(kv.first)=="TTCGC"){ 213 | ASSERT_EQ(kv.second.dist[1],1); 214 | ASSERT_EQ(kv.second.dist[4],15); 215 | } 216 | } 217 | } 218 | 219 | 220 | // additional tests wanted: 221 | // edge case: read that's a tandem repeat, no junctions 222 | // read with k-mer missing - see getValidReads mechanism works 223 | 224 | 225 | // Same thing but with three reads 226 | TEST_F(readScan, buildFullMap) { 227 | setSizeKmer(5); 228 | reads = {"ACGGGCGAACTTTCATAGGA", "GGCGAACTAGTCCAT", "AACTTTCATACGATT"}; 229 | kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT","ACTTT", 230 | "CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA","GGCGA", "GCGAA", "CGAAC", 231 | "GAACT", "AACTA","ACTAG", "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT","AACTT", 232 | "ACTTT", "CTTTC", "TTTCA", "TTCAT", "TCATA", 
"CATAC", "ATACG", "TACGA", "ACGAT","CGATT"}; 233 | addKmers(bloom, kmers); 234 | 235 | scanner->scanInputRead(reads[0], true); 236 | scanner->scanInputRead(reads[1], true); 237 | scanner->scanInputRead(reads[2], true); 238 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 239 | 240 | // Expected junctions & distances 241 | // CTAGT 242 | // 0 8 3 0 3 243 | // TCATA 244 | // 0 10 0 6 12 245 | // GAACT 246 | // 3 0 12 0 13 247 | //assert some of junction k-mers in map 248 | ASSERT_EQ(map.count(getKmerFromRead("CTAGT", 0)),1); 249 | ASSERT_EQ(map.count(getKmerFromRead("GAACT", 0)),1); 250 | // only map is correct size 251 | ASSERT_EQ(map.size(),3); 252 | for (auto& kv : map){ 253 | // assert distances are correct 254 | if (print_kmer(kv.first)=="GAACT"){ 255 | ASSERT_EQ(kv.second.dist[0],3); 256 | ASSERT_EQ(kv.second.dist[4],13); 257 | } 258 | if (print_kmer(kv.first)=="CTAGT"){ 259 | ASSERT_EQ(kv.second.dist[1],8); 260 | ASSERT_EQ(kv.second.dist[4],3); 261 | } 262 | } 263 | // printJunctionMap(*scanner); 264 | } 265 | 266 | TEST_F(readScan, smallDblJuncMap) { 267 | setSizeKmer(7); 268 | // j = 1; 269 | 270 | reads = {"AAAAACAGCGATTC", "AAAAAGAGCGATTTA"}; 271 | kmers = {"AAAAACA", "AAAAAGA", "AAAACAG", "AAAAGAG", "AAACAGC", "AAAGAGC", 272 | "AACAGCG", "AAGAGCG", "ACAGCGA","AGAGCGA","CAGCGAT", "GAGCGAT", "AGCGATT", 273 | "GCGATTT" ,"GCGATTC", "CGATTTA"}; 274 | addKmers(bloom, kmers); 275 | 276 | scanner->scanInputRead(reads[0], true); 277 | scanner->scanInputRead(reads[1], true); 278 | std::unordered_map map = scanner->getJunctionMap()->junctionMap; 279 | 280 | printJunctionMap(*scanner); 281 | } 282 | 283 | // add separate to JunctionMapTest 284 | // test building of map, then removal of complex junctions - closer to 285 | // then 286 | 287 | // int main(int ac, char* av[]) 288 | // { 289 | // testing::InitGoogleTest(&ac, av); 290 | // return RUN_ALL_TESTS(); 291 | // } 292 | 
-------------------------------------------------------------------------------- /src/ContigNode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ContigNode.h" 3 | #include 4 | using std::ofstream; 5 | using std::stringstream; 6 | #include //for std::stringstream 7 | #include //for std::string 8 | #include 9 | 10 | 11 | 12 | 13 | ContigNode::ContigNode(Junction junction){ 14 | for(int i = 0; i < 4; i++){ 15 | cov[i] = junction.getCoverage(i); 16 | contigs[i] = nullptr; 17 | } 18 | contigs[4] = nullptr; 19 | } 20 | 21 | ContigNode::ContigNode(){ 22 | for(int i = 0; i < 5; i++){ 23 | cov[i] = 0; 24 | contigs[i] = nullptr; 25 | } 26 | } 27 | 28 | bool ContigNode::isInvertedRepeatNode(){ 29 | std::vector inds = this->getIndicesOut(); 30 | std::unordered_set seenContigs = {}; 31 | for (auto i : inds){ 32 | if(seenContigs.find(contigs[i]) == seenContigs.end()){ 33 | seenContigs.insert(contigs[i]); 34 | } 35 | else{ 36 | return true; 37 | } 38 | } 39 | return false; 40 | } 41 | 42 | std::list ContigNode::getPairCandidates(int index, int maxDist) { 43 | // std::cout << "43\n"; 44 | 45 | std::unordered_set seenKmers = {}; 46 | std::vector queue(32); 47 | // queue.reserve(100); 48 | int pos = 0; 49 | // queue.push_back(NodeQueueEntry(this, index, 0)); 50 | queue.at(pos) = NodeQueueEntry(this, index, 0); 51 | std::list results = {}; 52 | 53 | while (queue.at(pos).node != nullptr){ 54 | // std::cout << "51, queue size is "<< queue.size() << ", pos is "<< pos <<"\n"; 55 | NodeQueueEntry entry = queue.at(pos); 56 | pos++; 57 | kmer_type unique_kmer; 58 | // std::cout << "57\n"; 59 | if (!entry.node->contigs[entry.index]){ 60 | // std::cout << "60\n"; 61 | continue; // don't advance if at dead end 62 | }else { 63 | // record unique kmer to avoid cycles 64 | unique_kmer = entry.node->getUniqueKmer(entry.index); 65 | // std::cout << "64\n"; 66 | } 67 | if(seenKmers.find(unique_kmer) == seenKmers.end()){ 68 | 
seenKmers.insert(unique_kmer); 69 | // std::cout << "68\n"; 70 | if(entry.startDist <= maxDist){ 71 | // std::cout << "70\n"; 72 | std::list newResults = entry.getJuncResults(maxDist); 73 | results.insert(results.end(), newResults.begin(), newResults.end()); 74 | // std::cout << "results size " << results.size()<<"\n"; 75 | entry.addNeighbors(queue); 76 | } 77 | } 78 | // std::cout << "77, queue size is "<< queue.size() << ", pos is "<< pos <<"\n"; 79 | if (pos > queue.size() - 1) break; 80 | } 81 | // std::cout << "final results size is " << results.size() << "\n"; 82 | results.sort(); 83 | 84 | return results; 85 | } 86 | 87 | 88 | 89 | std::list ContigNode::doPathsConvergeNearby(int max_ind, int min_ind, int max_dist){ 90 | /* 91 | BFSearches up to max_dist away from the node to verify extensions out of node 92 | converge to the same node. Returns list of Contig ptrs on Q when paths do converge, 93 | otherwise returns empty list. 94 | */ 95 | // std::cout << "94\n"; 96 | 97 | ContigNode* target = contigs[max_ind]->otherEndNode(this); 98 | std::unordered_set seenKmers = {}; 99 | std::list path; 100 | std::vector queue(32); 101 | // queue.reserve(100); 102 | // queue.push_back(NodeQueueEntry(this, min_ind, 0)); 103 | int pos = 0; 104 | queue.at(pos) = NodeQueueEntry(this, min_ind, 0); 105 | 106 | while (queue.at(pos).node != nullptr){ 107 | // std::cout << "105\n"; 108 | 109 | NodeQueueEntry entry = queue.at(pos); 110 | pos++; 111 | 112 | kmer_type unique_kmer; 113 | if (!entry.node->contigs[entry.index]){ 114 | // std::cout << "112\n"; 115 | 116 | continue; // don't advance if at dead end 117 | }else { 118 | // std::cout << "116\n"; 119 | // record unique kmer to avoid cycles 120 | unique_kmer = entry.node->getUniqueKmer(entry.index); 121 | } 122 | if(seenKmers.find(unique_kmer) == seenKmers.end()){ 123 | seenKmers.insert(unique_kmer); 124 | if (entry.startDist > max_dist){ 125 | // std::cout << "123\n"; 126 | if (pos > queue.size() - 1) break; 127 | else 
continue; 128 | } 129 | else if (entry.node->contigs[entry.index]->otherEndNode(entry.node)==target){ 130 | // reconstruct path from parents 131 | // std::cout << "128\n"; 132 | path = entry.reconstructPathFromParents(queue); 133 | return path; 134 | } 135 | else{ 136 | // std::cout << "133\n"; 137 | entry.addNeighbors(queue); 138 | } 139 | } 140 | // std::cout << "137, queue size is "<< queue.size() << ", pos is "<< pos <<"\n"; 141 | if (pos > queue.size() - 1) break; 142 | 143 | } 144 | // never reached target - return empty list 145 | return {}; 146 | } 147 | 148 | 149 | bool ContigNode::checkValidity(){ 150 | for(int i = 0; i < 5; i++){ 151 | if(contigs[i]){ 152 | Contig* contig = contigs[i]; 153 | int side = contig->getSide(this, i); 154 | if(side == 1){ 155 | if(contig->ind1 != i){ 156 | printf("GRAPHERROR: contig has wrong index.\n"); 157 | return false; 158 | } 159 | if(contig->node1_p != this){ 160 | printf("GRAPHERROR: contig points to wrong node.\n"); 161 | return false; 162 | } 163 | } 164 | if(side == 2){ 165 | if(contig->ind2 != i){ 166 | printf("GRAPHERROR: contig has wrong index.\n"); 167 | return false; 168 | } 169 | if(contig->node2_p != this){ 170 | printf("GRAPHERROR: contig points to wrong node.\n"); 171 | return false; 172 | } 173 | } 174 | } 175 | } 176 | return true; 177 | } 178 | 179 | std::vector> ContigNode::getFastGNeighbors(int contigIndex){ 180 | std::vector> result = {}; 181 | if(contigIndex == 4){ 182 | for(int i = 0; i < 4; i++){ 183 | if(contigs[i]){ 184 | bool RC = false; 185 | if(contigs[i]->getSide(this,i) == 2) { 186 | RC = true; 187 | } 188 | result.push_back(std::pair(contigs[i], RC)); 189 | } 190 | } 191 | } 192 | else{ 193 | if(contigs[4]){ 194 | bool RC = false; 195 | if(contigs[4]->getSide(this,4) == 2) { 196 | RC = true; 197 | } 198 | result.push_back(std::pair(contigs[4], RC)); 199 | } 200 | } 201 | return result; 202 | } 203 | 204 | kmer_type ContigNode::getForwardExtension(int index){ 205 | return next_kmer(getKmer(), 
index, FORWARD); 206 | } 207 | 208 | kmer_type ContigNode::getUniqueKmer(int index){ 209 | if(index != 4){ 210 | return getForwardExtension(index); 211 | } 212 | else{ 213 | return getKmer(); 214 | } 215 | } 216 | 217 | int ContigNode::numPathsOut(){ 218 | int numPaths = 0; 219 | for(int i = 0; i < 4; i++){ 220 | if(cov[i] > 0){ 221 | numPaths++; 222 | } 223 | } 224 | return numPaths; 225 | } 226 | 227 | std::vector ContigNode::getIndicesOut(){ 228 | std::vector paths = {}; 229 | for(int i = 0; i < 4; i++){ 230 | if(cov[i] > 0){ 231 | paths.push_back(i); 232 | } 233 | } 234 | return paths; 235 | } 236 | 237 | int ContigNode::getTotalCoverage(){ 238 | return getCoverage(4); 239 | } 240 | 241 | int ContigNode::getCoverage(int nucExt){ 242 | if(nucExt < 4){ 243 | return (int)cov[nucExt]; 244 | } 245 | return (int)cov[0] + (int)cov[1] + (int)cov[2] + (int)cov[3]; 246 | } 247 | 248 | void ContigNode::setCoverage(Junction junc){ 249 | for(int i = 0; i < 4; i++){ 250 | cov[i] = junc.getCoverage(i); 251 | } 252 | } 253 | 254 | void ContigNode::setCoverage(int nucExt, int coverage){ 255 | cov[nucExt] = coverage; 256 | } 257 | 258 | void ContigNode::replaceContig(Contig* oldContig, Contig* newContig){ 259 | for(int i = 0; i < 5; i++){ 260 | if(contigs[i] == oldContig){ 261 | contigs[i] = newContig; 262 | } 263 | } 264 | } 265 | 266 | int ContigNode::indexOf(Contig* contig){ 267 | for(int i = 0; i < 5; i++){ 268 | if(contigs[i] == contig){ 269 | return i; 270 | } 271 | } 272 | throw std::logic_error("ERROR: tried to find index of contig that's not present."); 273 | // return 5; 274 | } 275 | 276 | void ContigNode::update(int nucExt, Contig* contig){ 277 | contigs[nucExt] = contig; 278 | } 279 | 280 | void ContigNode::breakPath(int nucExt){ 281 | cov[nucExt] = 0; 282 | contigs[nucExt] = nullptr; 283 | } 284 | 285 | void ContigNode::clearNode(){ 286 | for (int i=0; i<5; i++){ 287 | this->breakPath(i); 288 | } 289 | } 290 | 291 | kmer_type ContigNode::getKmer(){ 292 | for(int i 
= 4; i >= 0; i--){ 293 | if(contigs[i]){ 294 | return contigs[i]->getNodeKmer(this); 295 | } 296 | } 297 | // intentionally don't return 0 here because that could be a valid kmer value 298 | throw std::logic_error("No valid contigs from which to getKmer()"); 299 | } 300 | 301 | ContigNode* ContigNode::getNeighbor(int index){ 302 | if(contigs[index]){ 303 | return contigs[index]->otherEndNode(this); 304 | } 305 | return nullptr; 306 | } 307 | 308 | std::string ContigNode::getString(){ 309 | std::stringstream result; 310 | for(int i = 0; i < 5; i++){ 311 | result << (int)getCoverage(i) << " "; 312 | result << contigs[i] << " "; 313 | } 314 | return result.str(); 315 | } 316 | 317 | 318 | NodeQueueEntry::NodeQueueEntry(ContigNode* n, int i, int s){ 319 | node = n; 320 | index = i; 321 | startDist = s; 322 | } 323 | 324 | NodeQueueEntry::NodeQueueEntry(){ 325 | node = nullptr; 326 | index = -1; 327 | startDist = -1; 328 | } 329 | 330 | std::list NodeQueueEntry::getJuncResults(int maxDist){ 331 | Contig* contig = node->contigs[index]; 332 | return contig->getJuncResults(contig->getSide(node, index),startDist, maxDist); 333 | } 334 | 335 | void NodeQueueEntry::addNeighbors(std::vector& queue){ 336 | Contig* contig = node->contigs[index]; 337 | // if (node->contigs[index]){ 338 | // printf("no contig at this index!\n"); 339 | // } 340 | int otherSide = 3 - contig->getSide(node,index); 341 | ContigNode* nextNode = contig->getNode(otherSide); 342 | int nextIndex = contig->getIndex(otherSide); 343 | 344 | // std::cout << "328\n"; 345 | int lastNonEmptyPos = 0; 346 | // std::cout << "329, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n"; 347 | while(queue.at(lastNonEmptyPos).node){ 348 | lastNonEmptyPos++; 349 | if (lastNonEmptyPos == queue.size()) break; 350 | } 351 | // std::cout << "331, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n"; 352 | 353 | if(nextNode){ 354 | if(nextIndex != 4){ 355 | 
if(nextNode->contigs[4]){ 356 | if (lastNonEmptyPos == queue.size()){ 357 | queue.push_back(NodeQueueEntry(nextNode, 4, startDist + contig->getTotalDistance())); 358 | // std::cout << "334, queue size is "<< queue.size() <<"\n"; 359 | } else { 360 | queue.at(lastNonEmptyPos) = NodeQueueEntry(nextNode, 4, startDist + contig->getTotalDistance()); 361 | // queue.push_pack(NodeQueueEntry(nextNode, 4, startDist + contig->getTotalDistance())); 362 | // std::cout << "338, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n"; 363 | } 364 | 365 | } 366 | } 367 | else{ 368 | for (int i = 0; i < 4; i++){ 369 | if(nextNode->contigs[i]){ 370 | if (lastNonEmptyPos == queue.size()){ 371 | queue.push_back(NodeQueueEntry(nextNode, i, startDist + contig->getTotalDistance())); 372 | // std::cout << "348, queue size is "<< queue.size() <<"\n"; 373 | } else{ 374 | queue.at(lastNonEmptyPos) = NodeQueueEntry(nextNode, i, startDist + contig->getTotalDistance()); 375 | // queue.push_back(NodeQueueEntry(nextNode, i, startDist + contig->getTotalDistance())); 376 | // std::cout << "351, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n"; 377 | } 378 | lastNonEmptyPos++; 379 | } 380 | } 381 | } 382 | } 383 | 384 | } 385 | 386 | // use stack of parents to reconstruct path: start from target, get other end node 387 | std::list NodeQueueEntry::reconstructPathFromParents(std::vector& parents){ 388 | std::list path = {}; 389 | path.push_front(node->contigs[index]); // this is the target 390 | NodeQueueEntry *currEntry = this; 391 | 392 | // move along parents vector in reverse order 393 | // query for other end node using entry's contig index 394 | // when other end node is current entry's node, 395 | // make its entry the current entry, add contig to front of path 396 | // std::cout << "in reconstructPathFromParents\n"; 397 | for (auto it = parents.rbegin(); it != parents.rend(); ++it){ 398 | if (!it->node) continue; 399 | if 
(it->node->contigs[it->index]->otherEndNode(it->node) == currEntry->node){ 400 | path.push_front(it->node->contigs[it->index]); 401 | currEntry = &(*it); 402 | } 403 | } 404 | return path; 405 | } 406 | 407 | 408 | 409 | -------------------------------------------------------------------------------- /src/Contig.cpp: -------------------------------------------------------------------------------- 1 | #include "Contig.h" 2 | #include 3 | #include 4 | #include // std::reverse 5 | #include // std::vector 6 | #include 7 | #include 8 | 9 | 10 | using std::stringstream; 11 | using std::ofstream; 12 | 13 | // we ignore effects due to Bloom filter FPs when querying for pairs 14 | std::pair Contig::getPairsMeanStd(Bloom* pair_filter){ 15 | std::list results = getJuncResults(1, 0, std::min(length(),2000)); 16 | 17 | int pairs_sum = 0; 18 | int pairs_count = 0; 19 | 20 | for(auto itL = results.begin(); itL != results.end(); itL++){ 21 | for(auto itR = itL; itR != results.end(); itR++){ 22 | if(pair_filter->containsPair(JuncPair(itL->kmer, itR->kmer))){ 23 | pairs_count++; 24 | pairs_sum += itR->distance - itL->distance; 25 | } 26 | } 27 | } 28 | if (pairs_sum==0 || pairs_count==0) return std::make_pair(0,0); 29 | double mean = pairs_sum/ (double) pairs_count; 30 | double sum_sqrs = 0; 31 | for(auto itL = results.begin(); itL != results.end(); itL++){ 32 | for(auto itR = itL; itR != results.end(); itR++){ 33 | if(pair_filter->containsPair(JuncPair(itL->kmer, itR->kmer))){ 34 | sum_sqrs += pow((itR->distance - itL->distance) - mean, 2); 35 | } 36 | } 37 | } 38 | double std = pow(sum_sqrs/ (pairs_count - 1.5), 0.5); 39 | std::cout << "pairs_sum is " << pairs_sum << ", pairs_count is " << pairs_count << std::endl; 40 | std::pair mean_std = std::make_pair(mean, std); 41 | return mean_std; 42 | } 43 | 44 | 45 | //Looks at all junction pairs on this contig, and prints a histogram of how many BF positives and BF negatives there are 46 | //for pairs at different distances. 
47 | void Contig::printPairStatistics(Bloom* pair_filter){ 48 | std::list results = getJuncResults(1, 0, 3*length()); 49 | std::cout << "Length " << length() << ", results " << results.size() << "\n"; 50 | const int maxDist = 2000; 51 | const int increment = 20; 52 | int posNegPairCounts [2][maxDist/increment] = {}; 53 | 54 | for(int i = 0; i < maxDist/increment; i++){ 55 | posNegPairCounts[0][i] = 0; 56 | posNegPairCounts[1][i] = 0; 57 | } 58 | 59 | for(auto itL = results.begin(); itL != results.end(); itL++){ 60 | for(auto itR = itL; itR != results.end(); itR++){ 61 | int index = (itR->distance - itL->distance)/increment; 62 | if(index < maxDist/increment && index >= 0){ 63 | if(pair_filter->containsPair(JuncPair(itL->kmer, itR->kmer))){ 64 | posNegPairCounts[0][index] += 1; 65 | } 66 | else{ 67 | posNegPairCounts[1][index] += 1; 68 | } 69 | } 70 | } 71 | } 72 | 73 | printf("Pair pos/neg char, aggregated over buckets of length %d:\n", increment); 74 | for(int i = 0; i < maxDist / increment; i++){ 75 | std::cout << "Distance " << i*increment << ": "; 76 | std::cout << posNegPairCounts[0][i] << ","; 77 | std::cout << posNegPairCounts[1][i] << "\n"; 78 | } 79 | } 80 | 81 | //Reverses if needed to get "canonical" concatenation of two in the same direction 82 | //Reverses again at the end to ensure no mutation of contigs 83 | Contig* Contig::concatenate(Contig* otherContig, int thisSide, int otherSide){ 84 | if(thisSide == 1){ 85 | reverse(); 86 | } 87 | if(otherSide == 2){ 88 | otherContig->reverse(); 89 | } 90 | Contig* concatenation = concatenate(otherContig); 91 | if(thisSide == 1){ 92 | reverse(); 93 | } 94 | if(otherSide == 2){ 95 | otherContig->reverse(); 96 | } 97 | return concatenation; 98 | } 99 | 100 | //utility for linking them if they're both facing "forward" 101 | Contig* Contig::concatenate(Contig* otherContig){ 102 | Contig* result = new Contig(); 103 | result->setEnds(node1_p, ind1, otherContig->node2_p, otherContig->ind2); 104 | if(getSeq().length() 
< sizeKmer){ 105 | printf("ERROR: seq less than k long in Contig::Concatenate.\n"); 106 | } 107 | result->setContigJuncs(contigJuncs.concatenate(otherContig->contigJuncs)); 108 | return result; 109 | } 110 | 111 | void Contig::reverse(){ 112 | {ContigNode * temp = node1_p; 113 | node1_p = node2_p; 114 | node2_p = temp;} 115 | 116 | {int temp = ind1; 117 | ind1 = ind2; 118 | ind2 = temp;} 119 | 120 | contigJuncs.reverse(); 121 | } 122 | 123 | void Contig::setEnds( ContigNode* n1, int i1, ContigNode* n2, int i2){ 124 | node1_p = n1; 125 | node2_p = n2; 126 | setIndices(i1, i2); 127 | if(node1_p){ 128 | node1_p->contigs[i1] = this; 129 | } 130 | if(node2_p){ 131 | node2_p->contigs[i2] = this; 132 | } 133 | } 134 | 135 | //Gets all of the interior junctions on this contig, as a list of JuncResult objects 136 | //Assumes this is startDist away from the real start, so increments all by startDist 137 | //Side refers to which side of the contig to start from 138 | std::list Contig::getJuncResults(int side, int startDist, int maxDist){ 139 | if(side == 2){ 140 | reverse(); 141 | } 142 | auto result = contigJuncs.getJuncResults(ind1 != 4, startDist, maxDist); //forward if ind1 != 4, backward if ind1 == 4 143 | if(side == 2){ 144 | reverse(); 145 | } 146 | return result; 147 | } 148 | 149 | int Contig::length(){ 150 | return contigJuncs.length(); 151 | } 152 | 153 | double Contig::getAvgCoverage(){ 154 | return contigJuncs.getAvgCoverage(); 155 | } 156 | 157 | double Contig::getAvgCoverage(std::list results){ 158 | return contigJuncs.getAvgCoverage(results); 159 | } 160 | 161 | double Contig::getCoverageSampleVariance(){ 162 | return contigJuncs.getCoverageSampleVariance(); 163 | } 164 | 165 | double Contig::getCoverageSampleVariance(std::list results){ 166 | return contigJuncs.getCoverageSampleVariance(results); 167 | } 168 | 169 | float Contig::getMass(){ 170 | return getAvgCoverage()*getSeq().length(); 171 | } 172 | 173 | void Contig::setIndices(int i1, int i2){ 174 | ind1 
= i1; 175 | ind2 = i2; 176 | } 177 | 178 | int Contig::getMinIndex(){ 179 | return std::min(ind1, ind2); 180 | } 181 | 182 | ContigNode* Contig::otherEndNode(ContigNode * oneEnd){ 183 | if(node1_p == oneEnd){ 184 | return node2_p; 185 | } 186 | if(node2_p == oneEnd){ 187 | return node1_p; 188 | } 189 | printf("ERROR: tried to get other end of a contig, but the given pointer didn't point to either end!.\n"); 190 | std::cout << "node1_p: " << node1_p << " node2_p: " << node2_p << " oneEnd: " << oneEnd << "\n"; 191 | std::cout << "This contig: " << this << "\n"; 192 | return nullptr; 193 | } 194 | 195 | //Assumes the given contig node points to one end of this contig 196 | kmer_type Contig::getNodeKmer(ContigNode * contigNode){ 197 | if(node1_p == contigNode){ 198 | return getSideKmer(1); 199 | } 200 | if(node2_p == contigNode){ 201 | return getSideKmer(2); 202 | } 203 | throw std::logic_error("Tried to get the kmer corresponding to a node not adjacent to this contig from this contig."); 204 | 205 | // printf("ERROR: tried to get the kmer corresponding to a node not adjacent to this contig from this contig.\n"); 206 | } 207 | 208 | ContigNode* Contig::getNode(int side){ 209 | if (side == 1){ 210 | return node1_p; 211 | } 212 | if(side == 2){ 213 | return node2_p; 214 | } 215 | throw std::logic_error("Called getNode on contignode with side other than 1,2"); 216 | } 217 | 218 | int Contig::getIndex(int side){ 219 | if (side == 1){ 220 | return ind1; 221 | } 222 | if(side == 2){ 223 | return ind2; 224 | } 225 | throw std::logic_error("Called getSide on contignode with side other than 1,2"); 226 | } 227 | 228 | //Gets kmer for node1_p if side == 1, node2_p if side == 2 229 | kmer_type Contig::getSideKmer(int side){ 230 | if(side == 1){ 231 | kmer_type kmer = getKmerFromRead(getSeq(), 0); 232 | if(ind1 == 4) return revcomp(kmer); 233 | return kmer; 234 | } 235 | if(side == 2){ 236 | kmer_type kmer = getKmerFromRead(getSeq(), getSeq().length()-sizeKmer); 237 | if(ind2 == 4) 
return kmer; 238 | return revcomp(kmer); 239 | } 240 | throw std::logic_error("Tried to get a kmer corresponding to a side other than one or two from a contig."); 241 | } 242 | 243 | int Contig::getSide(ContigNode* node){ 244 | if(node1_p == node){ 245 | return 1; 246 | } 247 | if(node2_p == node){ 248 | return 2; 249 | } 250 | printf("ERROR: tried to get the side of a contig node not adjacent to the contig.\n"); 251 | std::cout << "Node1: " << node1_p << ", Node2: " << node2_p << " Input: " << node << "\n"; 252 | return -1; 253 | } 254 | 255 | int Contig::getSide(ContigNode* node, int index){ 256 | if((node1_p == node) && (ind1 == index)){ 257 | return 1; 258 | } 259 | if((node2_p == node) && (ind2 == index)){ 260 | return 2; 261 | } 262 | printf("ERROR: tried to get the side of a contig node,index pair, but didn't find it on either side.\n"); 263 | std::cout << "Node1: " << node1_p << ", Node2: " << node2_p << " Input: " << node << "\n"; 264 | return -1; 265 | } 266 | 267 | void Contig::setSide(int side, ContigNode* node){ 268 | if(side == 1){ 269 | node1_p = node; 270 | } 271 | else if(side == 2){ 272 | node2_p = node; 273 | } 274 | else printf("ERROR: tried to set side for side other than 1,2.\n"); 275 | } 276 | 277 | void Contig::setMark(bool value){ 278 | marked = value; 279 | } 280 | 281 | bool Contig::getMark(){ 282 | return marked; 283 | } 284 | 285 | 286 | bool Contig::isIsolated(){ 287 | return ((node1_p == nullptr) && (node2_p == nullptr)); 288 | } 289 | 290 | std::vector> Contig::getNeighbors(bool RC){ 291 | if(!RC){ //forward node continuations 292 | if(node2_p){ //if node exists in forward direction 293 | return node2_p->getFastGNeighbors(ind2); 294 | } 295 | } 296 | else{ //backward node continuations 297 | if(node1_p){ //if node exists in backward direction 298 | return node1_p->getFastGNeighbors(ind1); 299 | } 300 | } 301 | return {}; 302 | } 303 | 304 | bool Contig::isDegenerateLoop(){ 305 | if (node1_p && node2_p){ 306 | return (node1_p == 
node2_p && ind1 == ind2); 307 | } 308 | return false; 309 | } 310 | 311 | bool Contig::checkValidity(){ 312 | // std::cout << "ind1 " << ind1 << ", ind2 " << ind2 << std::endl; 313 | if(node1_p){ 314 | // std::cout << "there is a node 1 ptr\n"; 315 | if(node1_p->contigs[ind1] != this ){ //&& 316 | // (other != print_kmer(revcomp(getKmerFromRead(node1_p->contigs[ind1]->getSeq(), node1_p->contigs[ind1]->getSeq().length()-sizeKmer) ) ) ) 317 | // ){ 318 | printf("CONTIG_ERROR: adjacent node 1 at specified index doesn't point back to this contig.\n"); 319 | // std::cout << "Expected at extension "<< ind1 << "\n"; 320 | // std::cout << "node1_p seq is " << print_kmer(node1_p->getKmer()) <<" , ind 1 is " << ind2 << std::endl; 321 | // std::cout << "node2_p seq is " << print_kmer(node2_p->getKmer()) <<" , ind 2 is " << ind2 << std::endl; 322 | 323 | // std::cout << "contig is\n"; 324 | // std::cout << this->getSeq() << std::endl; // << ", length is\n" << this->getSeq().length() 325 | // if (node1_p->contigs[4]){ 326 | // std::cout << "node1_p at " << 4 << " is\n"; 327 | // std::cout << node1_p->contigs[4]->getSeq() << std::endl; 328 | 329 | // } 330 | // if (node2_p->contigs[4]){ 331 | // std::cout << "node2_p at " << 4 << " is\n"; 332 | // std::cout << node2_p->contigs[4]->getSeq() << std::endl; 333 | 334 | // } 335 | return false; 336 | } 337 | if(getSide(node1_p, ind1) != 1 && !isDegenerateLoop()){ 338 | printf("CONTIG_ERROR: getSide incorrect on node1p, ind1.\n"); 339 | std::cout << "Node1: " << node1_p << ", Ind1: " << ind1 << ", Side: " << getSide(node1_p, ind1) << "\n"; 340 | std::cout << "Node2: " << node2_p << ", Ind2: " << ind2 << ", Side: " << getSide(node2_p, ind2) << "\n"; 341 | return false; 342 | } 343 | } 344 | if(node2_p){ 345 | // std::cout << "there is a node 2 ptr\n"; 346 | if(node2_p->contigs[ind2] != this ){//&& 347 | // (getSeq() != print_kmer(getKmerFromRead(node2_p->contigs[ind2]->getSeq(), node2_p->contigs[ind2]->getSeq().length()-sizeKmer) ) ) 
348 | // ){ 349 | printf("CONTIG_ERROR: adjacent node 2 at specified index doesn't point back to this contig.\n"); 350 | // std::cout << "Expected at extension "<< ind2 << "\n"; 351 | // std::cout << "node2_p seq is " << print_kmer(node2_p->getKmer()) <<" , ind 2 is " << ind1 << std::endl; 352 | // std::cout << "node1_p seq is " << print_kmer(node1_p->getKmer()) <<" , ind 1 is " << ind1 << std::endl; 353 | // std::cout << "contig is\n"; 354 | // std::cout << this->getSeq() << std::endl; // << ", length is\n" << this->getSeq().length() 355 | // if (node2_p->contigs[4]){ 356 | // std::cout << "node2_p at " << 4 << " is\n"; 357 | // std::cout << node2_p->contigs[4]->getSeq() << std::endl; 358 | // } 359 | // if (node1_p->contigs[4]){ 360 | // std::cout << "node1_p at " << 4 << " is\n"; 361 | // std::cout << node1_p->contigs[4]->getSeq() << std::endl; 362 | 363 | // } 364 | return false; 365 | } 366 | if(getSide(node2_p, ind2) != 2 && !isDegenerateLoop()){ 367 | printf("CONTIG_ERROR: getSide incorrect on node2p, ind2.\n"); 368 | std::cout << "Node1: " << node1_p << ", Ind1: " << ind1 << ", Side: " << getSide(node1_p, ind1) << "\n"; 369 | std::cout << "Node2: " << node2_p << ", Ind2: " << ind2 << ", Side: " << getSide(node2_p, ind2) << "\n"; 370 | return false; 371 | } 372 | } 373 | 374 | return true; 375 | 376 | } 377 | 378 | string Contig::getFastGName(bool RC){ 379 | stringstream stream; 380 | stream << "NODE_" << this << "_length_" << getSeq().length() << "_cov_" << getAvgCoverage(); 381 | if(RC){ 382 | stream << "'"; 383 | } 384 | return stream.str(); 385 | } 386 | 387 | string Contig::getFastGHeader(bool RC){ 388 | stringstream stream; 389 | stream << ">"; 390 | stream << getFastGName(RC); 391 | 392 | //get neighbors in direction corresponding to RC value 393 | std::vector> neighbors = getNeighbors(RC); 394 | 395 | //if empty return now 396 | if(neighbors.empty()){ 397 | stream << ";" ; 398 | return stream.str(); 399 | } 400 | 401 | //not empty, add neighbors to 
line 402 | stream << ":"; 403 | for(auto it = neighbors.begin(); it != neighbors.end(); ++it){ 404 | Contig* neighbor = it->first; 405 | bool RC = it->second; 406 | stream << neighbor->getFastGName(RC) << ","; 407 | } 408 | string result = stream.str(); 409 | result[result.length()-1] = ';'; 410 | return result; 411 | } 412 | 413 | string Contig::getStringRep(){ 414 | stringstream stream; 415 | stream << node1_p << "," << ind1 << " " << node2_p << "," << ind2 << "\n"; 416 | stream << contigJuncs.getStringRep(); 417 | stream << "\n"; 418 | return stream.str(); 419 | } 420 | 421 | Contig::Contig(){ 422 | setSeq(""); 423 | node1_p = nullptr; 424 | node2_p = nullptr; 425 | ind1 = 5; 426 | ind2 = 5; 427 | marked = false; 428 | contigJuncs = ContigJuncList(); 429 | } 430 | 431 | Contig::Contig( Contig * c){ 432 | setSeq(""); 433 | node1_p = c->node1_p; 434 | node2_p = c->node2_p; 435 | ind1 = c->ind1; 436 | ind2 = c->ind2; 437 | marked = c->marked; 438 | contigJuncs = c->contigJuncs; 439 | } 440 | 441 | Contig::~Contig(){ 442 | node1_p = nullptr; 443 | node2_p = nullptr; 444 | } 445 | --------------------------------------------------------------------------------