├── utils
    ├── debloom
    ├── README
    ├── Bloom
    ├── DoubleKmer
    ├── tests
    │   ├── RunTests
    │   ├── RunTests.o
    │   ├── BloomTests.o
    │   ├── JCheckTests.o
    │   ├── KmerTests.o
    │   ├── TestUtils.o
    │   ├── JunctionTests.o
    │   ├── JunctionMapTests.o
    │   ├── RollingHashTests.o
    │   ├── JunctionMapTests.cpp
    │   ├── JunctionTests.h
    │   ├── JunctionMapTests.h
    │   ├── BloomTests.h
    │   ├── KmerTests.h
    │   ├── JCheckTests.h
    │   ├── JunctionTests.cpp
    │   ├── RollingHashTests.h
    │   ├── RunTests.h
    │   ├── RunTests.cpp
    │   ├── makefile
    │   ├── TestUtils.h
    │   ├── TestUtils.cpp
    │   ├── JCheckTests.cpp
    │   ├── BloomTests.cpp
    │   ├── KmerTests.cpp
    │   └── RollingHashTests.cpp
    ├── manual
    │   ├── manual.pdf
    │   └── manual.tex
    ├── JuncPairs.cpp
    ├── Cap.h
    ├── simple_test.sh
    ├── DoubleKmer.h
    ├── Cap.cpp
    ├── JChecker.h
    ├── DoubleKmer.cpp
    ├── LargeInt.h
    ├── JuncPairs.h
    ├── Junction.h
    ├── ReadKmer.h
    ├── ContigJuncList.h
    ├── Junction.cpp
    ├── JChecker.cpp
    ├── lut.h
    ├── rvalues.h
    ├── ReadKmer.cpp
    ├── Kmer.h
    ├── JunctionMap.h
    ├── ttmath
    │   ├── ttmathmisc.h
    │   └── ttmaththreads.h
    ├── ContigJuncList.cpp
    ├── LargeInt.cpp
    └── Bloom.h
├── src
    ├── tests
    │   ├── RunTests
    │   ├── RunTests.o
    │   ├── TestUtils.o
    │   ├── FullTest.h
    │   ├── olderTests
    │   │   ├── TraverseReadsTests.o
    │   │   ├── FindNextJunctionTests.o
    │   │   ├── GetReadJunctionsTests.o
    │   │   ├── TraverseReadsTests.h
    │   │   ├── FindNextJunctionTests.h
    │   │   ├── GetReadJunctionsTests.h
    │   │   ├── GetReadJunctionsTests.cpp
    │   │   ├── FindNextJunctionTests.cpp
    │   │   └── TraverseReadsTests.cpp
    │   ├── RunTests.h
    │   ├── FullTeset.cpp
    │   ├── RunTests.cpp
    │   ├── makefile
    │   ├── TestUtils.h
    │   └── TestUtils.cpp
    ├── wget_urls
    ├── BfSearchResult.h
    ├── ContigIterator.h
    ├── stream_data_from_urls_list.sh
    ├── disk_mem_used
    ├── Faucet.h
    ├── ContigIterator.cpp
    ├── Contig.h
    ├── newTests
    │   ├── ContigTest.cpp
    │   ├── JunctionMapTest.cpp
    │   └── ReadscanTest.cpp
    ├── ContigNode.h
    ├── ReadScanner.h
    ├── ContigGraph.h
    ├── ContigNode.cpp
    └── Contig.cpp
├── LICENSE
└── README.md


/utils/debloom:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/utils/README:
--------------------------------------------------------------------------------
1 | Utils for Mink and Minia.
2 | 


--------------------------------------------------------------------------------
/utils/Bloom:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/Bloom


--------------------------------------------------------------------------------
/utils/DoubleKmer:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/DoubleKmer


--------------------------------------------------------------------------------
/src/tests/RunTests:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/RunTests


--------------------------------------------------------------------------------
/src/tests/RunTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/RunTests.o


--------------------------------------------------------------------------------
/utils/tests/RunTests:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/RunTests


--------------------------------------------------------------------------------
/src/tests/TestUtils.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/TestUtils.o


--------------------------------------------------------------------------------
/utils/tests/RunTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/RunTests.o


--------------------------------------------------------------------------------
/utils/manual/manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/manual/manual.pdf


--------------------------------------------------------------------------------
/utils/tests/BloomTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/BloomTests.o


--------------------------------------------------------------------------------
/utils/tests/JCheckTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/JCheckTests.o


--------------------------------------------------------------------------------
/utils/tests/KmerTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/KmerTests.o


--------------------------------------------------------------------------------
/utils/tests/TestUtils.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/TestUtils.o


--------------------------------------------------------------------------------
/utils/tests/JunctionTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/JunctionTests.o


--------------------------------------------------------------------------------
/utils/tests/JunctionMapTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/JunctionMapTests.o


--------------------------------------------------------------------------------
/utils/tests/RollingHashTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/utils/tests/RollingHashTests.o


--------------------------------------------------------------------------------
/src/tests/FullTest.h:
--------------------------------------------------------------------------------
1 | #ifndef FULL_TEST
2 | #define FULL_TEST
3 | 
4 | #include "TestUtils.h"
5 | 
6 | void runFullTest();
7 | 
8 | #endif


--------------------------------------------------------------------------------
/src/tests/olderTests/TraverseReadsTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/olderTests/TraverseReadsTests.o


--------------------------------------------------------------------------------
/src/tests/olderTests/FindNextJunctionTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/olderTests/FindNextJunctionTests.o


--------------------------------------------------------------------------------
/src/tests/olderTests/GetReadJunctionsTests.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamir-Lab/Faucet/HEAD/src/tests/olderTests/GetReadJunctionsTests.o


--------------------------------------------------------------------------------
/utils/tests/JunctionMapTests.cpp:
--------------------------------------------------------------------------------
1 | #include "JunctionMapTests.h"
2 | 
3 | void runJunctionMapTests(){
4 |     printf("No junction map tests yet.\n");
5 | }


--------------------------------------------------------------------------------
/src/tests/RunTests.h:
--------------------------------------------------------------------------------
1 | #ifndef RUN_TESTS
2 | #define RUN_TESTS
3 | 
4 | #include "FindNextJunctionTests.h"
5 | #include "GetReadJunctionsTests.h"
6 | 
7 | #endif


--------------------------------------------------------------------------------
/utils/tests/JunctionTests.h:
--------------------------------------------------------------------------------
1 | #ifndef JUNCTION_TESTS
2 | #define JUNCTION_TESTS
3 | 
4 | #include "../Junction.h"
5 | 
6 | void runJunctionTests();
7 | 
8 | #endif


--------------------------------------------------------------------------------
/src/wget_urls:
--------------------------------------------------------------------------------
1 | ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR034/SRR034939/SRR034939_1.fastq.gz
2 | ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR034/SRR034939/SRR034939_2.fastq.gz
3 | 


--------------------------------------------------------------------------------
/utils/tests/JunctionMapTests.h:
--------------------------------------------------------------------------------
1 | #ifndef JUNCTIONMAP_TESTS
2 | #define JUNCTIONMAP_TESTS
3 | 
4 | #include "../JunctionMap.h"
5 | 
6 | void runJunctionMapTests();
7 | 
8 | #endif


--------------------------------------------------------------------------------
/src/tests/FullTeset.cpp:
--------------------------------------------------------------------------------
1 | #include "FullTest.h"
2 | 
3 | string read1 = "ACGTTCG";
4 | string read2 = "ACGTACGTTTT";
5 | string read3 = "TTGCG";
6 | 
7 | void runFullTest(){ //placeholder entry point for the end-to-end test; currently does nothing
8 |     // TODO: drive the read scanner over read1, read2 and read3. The bare "readScanner" token that stood here was not a statement and did not compile.
9 | }


--------------------------------------------------------------------------------
/utils/tests/BloomTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef BLOOM_TESTS
 2 | #define BLOOM_TESTS
 3 | 
 4 | #include "../Bloom.h"
 5 | 
 6 | namespace bloomTests{
 7 | 
 8 | void runBloomTests();
 9 | 
10 | }
11 | 
12 | #endif


--------------------------------------------------------------------------------
/utils/tests/KmerTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef KMER_TESTS
 2 | #define KMER_TESTS
 3 | 
 4 | #include "TestUtils.h"
 5 | #include "../Kmer.h"
 6 | 
 7 | namespace kmerTests{ 
 8 | 
 9 | void runKmerTests();
10 | 
11 | }
12 | 
13 | #endif


--------------------------------------------------------------------------------
/utils/JuncPairs.cpp:
--------------------------------------------------------------------------------
1 | #include "JuncPairs.h"
2 | 
3 | bool operator<(JuncResult a, JuncResult b){ //orders JuncResults by distance alone; kmer and coverage are not compared
4 |     return a.distance < b.distance;
5 | }
6 | 
7 | bool operator>(JuncResult a, JuncResult b){ //mirror of operator<: compares distance only
8 |     return a.distance > b.distance;
9 | }


--------------------------------------------------------------------------------
/utils/tests/JCheckTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef JCHECK_TESTS
 2 | #define JCHECK_TESTS
 3 | 
 4 | #include "TestUtils.h"
 5 | #include "../Bloom.h"
 6 | #include "../JChecker.h"
 7 | 
 8 | namespace jCheckTests {
 9 | 
10 | void runJCheckTests();
11 | 
12 | }
13 | 
14 | #endif


--------------------------------------------------------------------------------
/utils/tests/JunctionTests.cpp:
--------------------------------------------------------------------------------
 1 | #include "JunctionTests.h"
 2 | 
 3 | void createJunction_testOnePath(){
 4 | 
 5 | }
 6 | 
 7 | void updateJunction_testNewPath(){
 8 | 
 9 | }
10 | 
11 | 
12 | void runJunctionTests(){
13 |     printf("No junction tests yet.\n");
14 | }


--------------------------------------------------------------------------------
/utils/tests/RollingHashTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef INCREMENTAL_HASH_TESTS
 2 | #define INCREMENTAL_HASH_TESTS
 3 | 
 4 | #include "TestUtils.h"
 5 | #include "../Bloom.h"
 6 | 
 7 | namespace rollingHashTests{
 8 | 
 9 | void runRollingHashTests();
10 | 
11 | }
12 | 
13 | #endif


--------------------------------------------------------------------------------
/utils/tests/RunTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef RUN_TESTS
 2 | #define RUN_TESTS
 3 | 
 4 | #include "KmerTests.h"
 5 | #include "JCheckTests.h"
 6 | #include "RollingHashTests.h"
 7 | #include "BloomTests.h"
 8 | #include "JunctionTests.h"
 9 | #include "JunctionMapTests.h"
10 | 
11 | #endif


--------------------------------------------------------------------------------
/src/tests/olderTests/TraverseReadsTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRAVERSE_READS_TESTS
 2 | #define TRAVERSE_READS_TESTS
 3 | 
 4 | #include "TestUtils.h"
 5 | #include "../ReadScanner.h"
 6 | 
 7 | namespace traverseReadsTests{
 8 | 
 9 | void runTraverseReadsTests();
10 | 
11 | }
12 | 
13 | #endif


--------------------------------------------------------------------------------
/src/tests/olderTests/FindNextJunctionTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIND_JUNCTION_TESTS
 2 | #define FIND_JUNCTION_TESTS
 3 | 
 4 | #include "TestUtils.h"
 5 | #include "../ReadScanner.h"
 6 | 
 7 | namespace findNextJunctionTests{
 8 | 
 9 | void runFindNextJunctionTests();
10 | 
11 | }
12 | 
13 | #endif


--------------------------------------------------------------------------------
/src/tests/olderTests/GetReadJunctionsTests.h:
--------------------------------------------------------------------------------
 1 | #ifndef GET_READ_JUNCTIONS_TESTS
 2 | #define GET_READ_JUNCTIONS_TESTS
 3 | 
 4 | #include "TestUtils.h"
 5 | #include "../ReadScanner.h"
 6 | 
 7 | namespace findReadJunctionsTests{
 8 | 
 9 | void runFindReadJunctionsTests();
10 | 
11 | }
12 | 
13 | #endif


--------------------------------------------------------------------------------
/src/tests/RunTests.cpp:
--------------------------------------------------------------------------------
 1 | #include "RunTests.h" 
 2 | 
 3 | //g++ ../Bloom.cpp ../Kmer.cpp ../Debloom.cpp JCheckTests.cpp KmerTests.cpp TestUtils.cpp RunTests.cpp -o RunTests
 4 | 
 5 | int main(int argc, char *argv[]){
 6 | 
 7 |     findNextJunctionTests::runFindNextJunctionTests();
 8 |     findReadJunctionsTests::runFindReadJunctionsTests();
 9 |     return 0;
10 | }


--------------------------------------------------------------------------------
/utils/Cap.h:
--------------------------------------------------------------------------------
 1 | #ifndef CAP
 2 | #define CAP
 3 | 
 4 | #include <iostream>
 5 | #include "Kmer.h"
 6 | using std::ofstream;
 7 | 
 8 | class Cap{
 9 | public:
10 |     int dist;
11 |     kmer_type lastJunc;
12 | 
13 |     void writeToFile(ofstream* jFile);
14 | 
15 |     Cap extend(int dist, kmer_type juncID);
16 | 
17 |     Cap(int distance,kmer_type juncID);
18 | };
19 | 
20 | #endif


--------------------------------------------------------------------------------
/utils/simple_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #simple assembly test of a synthetic 10 K genome, to verify not completely broken
 3 | 
 4 | rm -f t10.contigs.fa
 5 | 
 6 | ./minia test/read50x_ref10K_e001.fasta 25 3 10000 t10 &> /dev/null
 7 | 
 8 | 
 9 | diff t10.contigs.fa  test/result10K.fasta > /dev/null
10 | 
11 | var=$?
12 | 
13 | if [ $var -eq 0 ] 
14 | then
15 |     echo Test PASSED
16 |     exit 0
17 | else
18 |     echo Test FAILED
19 |     exit 1
20 | fi
21 | 


--------------------------------------------------------------------------------
/utils/tests/RunTests.cpp:
--------------------------------------------------------------------------------
 1 | #include "RunTests.h" 
 2 | 
 3 | //g++ ../Bloom.cpp ../Kmer.cpp ../Debloom.cpp JCheckTests.cpp KmerTests.cpp TestUtils.cpp RunTests.cpp -o RunTests
 4 | 
 5 | int main(int argc, char *argv[]){
 6 | 
 7 |     kmerTests::runKmerTests();
 8 |     rollingHashTests::runRollingHashTests();
 9 |     jCheckTests::runJCheckTests();
10 |     runJunctionTests();
11 |     runJunctionMapTests();
12 |     bloomTests::runBloomTests();
13 |     
14 |     return 0;
15 | }


--------------------------------------------------------------------------------
/src/tests/makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS+= -O4 -D_FILE_OFFSET_BITS=64 # needed to handle files > 2 GB on 32 bits systems
 2 | SRC= PairFinderTest.cpp ../Contig.cpp
 3 | 
 4 | EXEC=minkTests
 5 | OBJ= $(SRC:.cpp=.o)
 6 | 
 7 | all:
 8 | 	$(MAKE) $(EXEC)
 9 | 
10 | minkTests: $(OBJ) PairFinderTest.cpp
11 | 	cd .. && $(MAKE)
12 | 	g++ --std=c++0x $(SRC) -o minkTests $(CFLAGS) 
13 | 
14 | %.o: %.cpp %.h
15 | 	g++ --std=c++0x -o $@ -c $< $(CFLAGS)
16 | 
17 | install:
18 | 	cp minkTests /usr/local/bin
19 | 


--------------------------------------------------------------------------------
/utils/tests/makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS+= -O4 -D_FILE_OFFSET_BITS=64 # needed to handle files > 2 GB on 32 bits systems
 2 | SRC=../JChecker.cpp ../Bloom.cpp ../Kmer.cpp TestUtils.cpp JunctionTests.cpp JunctionMapTests.cpp JCheckTests.cpp KmerTests.cpp RollingHashTests.cpp BloomTests.cpp RunTests.cpp
 3 | EXEC=miniaTests
 4 | OBJ= $(SRC:.cpp=.o)
 5 | 
 6 | all:
 7 | 	$(MAKE) $(EXEC)
 8 | 
 9 | miniaTests: $(OBJ) RunTests.cpp
10 | 	g++ --std=c++0x $(SRC) -o RunTests 
11 | 
12 | %.o: %.cpp %.h
13 | 	g++ --std=c++0x -o $@ -c $< $(CFLAGS)
14 | 
15 | install: # the link rule above names its output RunTests, so that is the file copied; it is installed under the miniaTests name this rule has always used
16 | 	cp RunTests /usr/local/bin/miniaTests
17 | 


--------------------------------------------------------------------------------
/utils/tests/TestUtils.h:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | 
 3 | #ifndef TEST_UTILS
 4 | #define TEST_UTILS
 5 | 
 6 | #include "../Kmer.h"
 7 | #include "../Bloom.h"
 8 | 
 9 | using std::string;
10 | extern kmer_type test_kmer;
11 | 
12 | kmer_type getKmerFromString(std::string kmerString);
13 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i);
14 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2);
15 | 
16 | Bloom* loadBloom(string list[], int numKmers, int k);
17 | 
18 | void fail(char* testName, char* errorMessage);
19 | void fail(char* testName);
20 | 
21 | void succeed(char* testName);
22 | 
23 | #endif


--------------------------------------------------------------------------------
/src/tests/TestUtils.h:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | 
 3 | #ifndef TEST_UTILS
 4 | #define TEST_UTILS
 5 | 
 6 | #include "../../utils/Kmer.h"
 7 | #include "../../utils/Bloom.h"
 8 | 
 9 | using std::string;
10 | extern kmer_type test_kmer;
11 | 
12 | kmer_type getKmerFromString(std::string kmerString);
13 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i);
14 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2);
15 | 
16 | Bloom* loadBloom(string list[], int numKmers, int k);
17 | 
18 | void fail(char* testName, char* errorMessage);
19 | void fail(char* testName);
20 | 
21 | void succeed(char* testName);
22 | 
23 | #endif


--------------------------------------------------------------------------------
/utils/DoubleKmer.h:
--------------------------------------------------------------------------------
 1 | #ifndef DOUBLE_KMER
 2 | #define DOUBLE_KMER
 3 | 
 4 | #include "Kmer.h"
 5 | 
 6 | class DoubleKmer{
 7 | 
 8 | public:
 9 |     kmer_type kmer;
10 |     kmer_type revcompKmer;
11 |     
12 |     void forward(int nuc);
13 |     
14 |     //Takes as input the nucleotide extension and the direction
15 |     //If the direction is BACKWARD, the nucleotide should be given as seen in the reverse direction 
16 |     //It will not be complemented within the function.
17 |     kmer_type getExtension(int nuc, bool dir);
18 | 
19 |     kmer_type getCanon();
20 | 
21 |     void reverse();
22 |     
23 |     DoubleKmer(kmer_type forwardKmer);
24 | };
25 | #endif 


--------------------------------------------------------------------------------
/utils/Cap.cpp:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include "Cap.h"
 3 | using std::ofstream;
 4 | using std::max;
 5 | 
 6 | void writeToFile(ofstream* jFile);
 7 | 
 8 | //Writes "Distance: <dist> " followed by "Last ID: <lastJunc> " to the given file stream.
 9 | void Cap::writeToFile(ofstream*jFile){
10 |   *jFile <<"Distance: " << dist << " ";
11 |   *jFile <<"Last ID: " << (long long) lastJunc << " ";
12 | }
13 | 
14 | Cap Cap::extend(int extraDistance, kmer_type juncID){ //returns a new Cap whose distance is this cap's dist plus extraDistance and whose last junction is juncID
15 |     return Cap(extraDistance+dist, juncID); //built by value: the previous *(new Cap(...)) copied out of a heap object that was never freed
16 | }
17 | 
18 | //Builds a cap from a distance and the ID of the last junction
19 | Cap::Cap(int distance, kmer_type juncID){
20 |     dist=distance;
21 |     lastJunc = juncID;
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/src/BfSearchResult.h:
--------------------------------------------------------------------------------
 1 | #ifndef BF_SEARCH_RESULT
 2 | #define BF_SEARCH_RESULT
 3 | 
 4 | //Stores all the info involved in moving from one node to another in the graph by doing a bloom scan
 5 | //Also used for scanning junction to junction
 6 | struct BfSearchResult{
 7 |     BfSearchResult() : kmer(-1), isNode(false), index(0), distance(0){} //kmer == -1 tells whether a BfSearchResult has been set or was just declared; the other fields get defined placeholder values instead of being left uninitialized
 8 |     BfSearchResult(kmer_type km, bool node, int i, int d, string cont) : kmer(km), isNode(node), index(i), distance(d), contig(cont){} 
 9 |     kmer_type kmer;
10 |     bool isNode; //either node/junction or sink
11 |     int index; //index from it that points back to the start point
12 |     int distance; //how far away it was
13 |     string contig; //the contig!
14 | };
15 | 
16 | 
17 | #endif


--------------------------------------------------------------------------------
/utils/JChecker.h:
--------------------------------------------------------------------------------
 1 | #ifndef JCHECKER 
 2 | #define JCHECKER
 3 | 
 4 | #include "Bloom.h"
 5 | #include "Kmer.h"
 6 | 
 7 | class JChecker 
 8 | {
 9 |     private:
10 |         Bloom* bloom;
11 | 
12 |         //for incremental hashing
13 |         uint64_t ** lastHashes;
14 |         uint64_t ** nextHashes;
15 |         uint64_t ** tempor;
16 |         uint64_t nextHash0, nextHash1;
17 | 
18 |         //to store the kmers in the BFS
19 |         kmer_type* lastKmers;
20 |         kmer_type* nextKmers;
21 |         kmer_type* temp;
22 | 
23 |     public:
24 |         int j; //value of j!
25 |         bool jcheck(char* kmerSeq, uint64_t nextH0, uint64_t nextH1);//incremental version
26 |         bool jcheck(kmer_type kmer);//normal version
27 |         JChecker(int jVal, Bloom* bloo);
28 | };
29 | #endif


--------------------------------------------------------------------------------
/utils/DoubleKmer.cpp:
--------------------------------------------------------------------------------
 1 | #include "DoubleKmer.h"
 2 | 
 3 | #include <algorithm>
 4 | 
 5 | void DoubleKmer::forward(int nuc){ //advances both stored kmers by one nucleotide so they stay in step
 6 |     kmer = next_kmer(kmer, nuc, FORWARD);
 7 |     revcompKmer = next_kmer(revcompKmer, revcomp_int(nuc), BACKWARD); //the reverse-complement kmer is updated with revcomp_int(nuc) in the BACKWARD direction
 8 | }
 9 | 
10 | kmer_type DoubleKmer::getExtension(int nuc, bool direction){ //returns the kmer reached by extending with nuc in the given direction; neither member is assigned here
11 |     if(direction == FORWARD){
12 |         return  next_kmer(kmer, nuc, FORWARD);
13 |     }
14 |     else{
15 |         return next_kmer(revcompKmer, nuc, FORWARD); //any other direction: step the reverse-complement kmer forward with nuc exactly as passed (it is not complemented here)
16 |     }
17 | }
18 | 
19 | kmer_type DoubleKmer::getCanon(){
20 |     return std::min(kmer, revcompKmer);
21 | }
22 | 
23 | void DoubleKmer::reverse(){
24 |     kmer_type temp = kmer;
25 |     kmer = revcompKmer;
26 |     revcompKmer = temp;
27 | }
28 | 
29 | DoubleKmer::DoubleKmer(kmer_type forwardKmer){
30 |     kmer = forwardKmer;
31 |     revcompKmer = revcomp(kmer);
32 | }
33 | 


--------------------------------------------------------------------------------
/src/ContigIterator.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONTIG_ITERATOR
 2 | #define CONTIG_ITERATOR
 3 | 
 4 | #include "ContigGraph.h"
 5 | #include "Contig.h"
 6 | #include "ContigNode.h"
 7 | #include "../utils/Kmer.h"
 8 | #include <iterator>
 9 | #include "../utils/sparsepp.h"
10 | using spp::sparse_hash_map;
11 | 
12 | 
13 | class ContigIterator{ 
14 | private:
15 |     ContigGraph* graph;
16 |     std::unordered_map<kmer_type,ContigNode>::iterator nodeIt;
17 |     // sparse_hash_map<kmer_type,ContigNode>::iterator nodeIt;
18 |     int index;
19 |     Contig* findNextContig(); //gets the contig but doesn't increment it and index
20 |     void increment(); //increments index and nodeIt to point to next possible contig
21 | public:
22 |     ContigIterator(ContigGraph* graph);
23 |     Contig* getContig(); //gets contig, moves pointer to next contig or end
24 |     bool hasNextContig();
25 | };
26 | 
27 | #endif


--------------------------------------------------------------------------------
/src/stream_data_from_urls_list.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #Runs Faucet
 3 | #Takes 4 parameters:
 4 | #1) the output file prefix
 5 | #2) file listing the input file URLs
 6 | #3) estimated kmers
 7 | #4) singletons
 8 | 
 9 | URL_FILE=$2
10 | READ_COMMAND=wget\ --read-timeout=5\ --timeout=15\ -t\ 0\ -qO-\ -i\ $URL_FILE\ \|\ gzip\ -d\ -c\ -q
11 | 
12 | eval "./faucet -read_load_file <($READ_COMMAND) -read_scan_file <($READ_COMMAND) -size_kmer 31 -max_read_length 130 -estimated_kmers $3 -singletons $4 -file_prefix $1 --fastq --high_cov"
13 | 
14 | # eval "./faucet -size_kmer 27 \
15 | # -max_read_length 130 \
16 | # -estimated_kmers 3000000000 \
17 | # -read_load_file <($READ_COMMAND) \
18 | # -file_prefix $1 \
19 | # --two_hash \
20 | # --just_load_bloom \
21 | # --fastq "
22 | 
23 | # eval "./faucet -size_kmer 27 \
24 | # -max_read_length 130 \
25 | # -estimated_kmers 3000000000 \
26 | # -read_scan_file <($READ_COMMAND) \
27 | # -bloom_file $1.bloom \
28 | # -file_prefix $1 \
29 | # --two_hash \
30 | # --fastq \
31 | # --no_cleaning"


--------------------------------------------------------------------------------
/src/disk_mem_used:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Author: Roye Rozov, 
 3 | # based on script by Marc Garcia-Garcera and Rayan Chikhi
 4 | #  usage: disk_mem_used [program arg1 arg2 ...]
 5 | 
 6 | "$@" & # launch the monitored command in the background
 7 | pid=$!
 8 | status=$(ps -o rss= -o vsz= -o pid= -p "$pid") # ask ps for exactly this pid; grepping the whole listing also matched other pids or rss values containing the same digits
 9 | maxdisk=0
10 | deleted=0
11 | maxmem=0
12 | while [ "${#status}" -gt "0" ]; # status is empty once the process no longer exists
13 | do
14 |     sleep 1
15 |     delta=false
16 |     disk=$(cat /proc/$pid/io 2>/dev/null | grep -P '^write_bytes:' | awk '{print $2}') # the file vanishes when the process exits between polls
17 |     disk=$((disk/1024))
18 |     if [ "0$disk" -gt "0$maxdisk" ] 2>/dev/null; then
19 |         maxdisk=$disk
20 |         delta=true
21 |     fi
22 |     mem=$(ps -o rss= -o vsz= -o pid= -p "$pid" | awk '{print $1}')
23 |     # echo mem: $mem
24 |     mem=$((mem/1024))
25 |     if [ "0$mem" -gt "0$maxmem" ] 2>/dev/null; then
26 |         maxmem=$mem
27 |         delta=true
28 |     fi
29 |     # only print if at least one value changed
30 |     if $delta; then
31 |         (>&2 echo disk: $disk)
32 |         (>&2 echo mem: $mem)
33 |     fi
34 | status=$(ps -o rss= -o vsz= -o pid= -p "$pid")
35 | done
36 | wait $pid
37 | ret=$? # exit status of the monitored command, passed through below
38 | 
39 | (>&2 echo "maximal disk used: $maxdisk KB")
40 | (>&2 echo "maximal memory used: $maxmem MB")
41 | 
42 | 
43 | exit $ret
44 | 


--------------------------------------------------------------------------------
/src/Faucet.h:
--------------------------------------------------------------------------------
 1 | #ifndef MINK_MAIN
 2 | #define MINK_MAIN
 3 | 
 4 | #include "../utils/Bloom.h"
 5 | #include "../utils/Kmer.h"
 6 | #include "../utils/Junction.h"
 7 | #include "../utils/JChecker.h"
 8 | #include "ReadScanner.h"
 9 | #include "ContigNode.h"
10 | #include "Contig.h"
11 | #include "ReadScanner.h"
12 | #include "ContigGraph.h"
13 | 
14 | float fpRate = .04;
15 | int j = 1;
16 | 
17 | string read_load_file;
18 | string read_scan_file;
19 | string bloom_input_file;
20 | string junctions_input_file;
21 | string short_pair_filter_file;
22 | string long_pair_filter_file;
23 | 
24 | int read_length;
25 | uint64_t estimated_kmers;
26 | uint64_t singletons;
27 | 
28 | 
29 | // required arguments:
30 | bool load_file_flag = false;
31 | bool scan_file_flag = false;
32 | bool k_val_flag = false;
33 | bool max_len_flag = false;
34 | bool est_kmers_flag = false;
35 | bool est_sing_flag = false;
36 | bool pref_flag = false;
37 | 
38 | // optional arguments:
39 | bool two_hash = false;
40 | bool from_bloom = false;
41 | bool from_junctions = false;
42 | bool just_load = false;
43 | bool fastq = false;
44 | bool mercy = false;
45 | bool node_graph = false;
46 | bool paired_ends = false;
47 | bool no_cleaning = false;
48 | int maxSpacerDist = 100; //max is 128, smaller --> more frequent spacers, bigger --> less frequent.  Measured in base pairs
49 | int64_t nb_reads;
50 | bool high_cov = false;
51 | 
52 | set<kmer_type> all_kmers;
53 | string file_prefix;
54 | 
55 | #endif


--------------------------------------------------------------------------------
/utils/LargeInt.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * arbitrary-precision integer library
 3 |  * very limited: only does what minia needs (but not what minia deserves)
 4 |  */
 5 | 
 6 | #ifndef LargeInt_h
 7 | #define LargeInt_h
 8 | 
 9 | #include <stdint.h>
10 | #include <algorithm>
11 | 
12 | template<int precision>
13 | class LargeInt 
14 | {
15 | 
16 |     public:
17 |     uint64_t array[precision];
18 |     LargeInt(const uint64_t &);
19 |     LargeInt();
20 | 
21 |     // overloading
22 |     LargeInt operator+(const LargeInt &) const;
23 |     LargeInt operator-(const LargeInt &) const;
24 |     LargeInt operator*(const int &) const;
25 |     LargeInt operator/(const uint32_t &) const;
26 |     uint32_t operator%(const uint32_t &) const;
27 |     LargeInt operator^(const LargeInt &) const;
28 |     LargeInt operator&(const LargeInt &) const;
29 |     LargeInt operator~() const;
30 |     LargeInt operator<<(const int &) const;
31 |     LargeInt operator>>(const int &) const;
32 |     bool     operator!=(const LargeInt &) const;
33 |     bool     operator==(const LargeInt &) const;
34 |     bool     operator<(const LargeInt &) const;
35 |     bool     operator<=(const LargeInt &) const;
36 | 
37 |     // custom
38 |     uint64_t toInt() const;
39 |     #ifdef _LP64
40 |     __uint128_t toInt128() const;
41 |     #endif
42 | 
43 | 
44 | // c++ fun fact:
45 | // "const" will ban the function from being anything which can attempt to alter any member variables in the object.
46 | };
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/utils/JuncPairs.h:
--------------------------------------------------------------------------------
 1 | #ifndef JUNC_PAIR_SEARCH
 2 | #define JUNC_PAIR_SEARCH
 3 | 
 4 | #include "Kmer.h"
 5 | #include <sstream>
 6 | #include <deque>
 7 | #include <algorithm> 
 8 | #include <iostream>  
 9 | using std::stringstream;
10 | 
11 | struct JuncPair{
12 |     JuncPair(kmer_type km1, kmer_type km2): kmer1(km1), kmer2(km2){}
13 |     kmer_type kmer1;
14 |     kmer_type kmer2;
15 |     friend bool operator==(JuncPair a, JuncPair b) { 
16 |         return a.kmer1 == b.kmer1 && a.kmer2 == b.kmer2; 
17 |     };
18 | 
19 | };
20 | 
21 | namespace std {
22 |   template <> struct hash<JuncPair>
23 |   {
24 |     size_t operator()(const JuncPair & x) const
25 |     {
26 |         return (std::hash<uint64_t>()(x.kmer1) ^ (std::hash<uint64_t>()(x.kmer2) << 1) >> 1);
27 |     }
28 |   };
29 | }
30 | 
31 | 
//Stores all the info involved in finding a junction candidate by searching from a node:
//the kmer of the candidate, the distance at which it was found, and its coverage.
class JuncResult{
public:
    JuncResult(kmer_type km, int dist, int cov): kmer(km), distance(dist), coverage(cov){} 
    kmer_type kmer;
    int distance;
    int coverage;
    //Ordering operators; they are only declared here, so the sort key is
    //defined elsewhere (TODO confirm which field they compare).
    friend bool operator<(JuncResult a, JuncResult b);
    friend bool operator>(JuncResult a, JuncResult b);
};
42 | 
//Same idea as a single-junction result, but for a pair of kmers:
//stores a JuncPair together with a distance and a coverage value.
struct JuncPairResult{
    JuncPairResult(JuncPair p, int dist, int cov): pair(p), distance(dist), coverage(cov){} 
    JuncPair pair;
    int distance;
    int coverage;
};
50 | #endif


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017, Tel Aviv University
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 |     * Redistributions of source code must retain the above copyright
 7 |       notice, this list of conditions and the following disclaimer.
 8 |     * Redistributions in binary form must reproduce the above copyright
 9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |     * Neither the name of the Tel Aviv University nor the
12 |       names of its contributors may be used to endorse or promote products
13 |       derived from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 


--------------------------------------------------------------------------------
/src/ContigIterator.cpp:
--------------------------------------------------------------------------------
 1 | #include "ContigIterator.h" 
 2 | 
 3 | ContigIterator::ContigIterator(ContigGraph* theGraph){
 4 |     graph = theGraph;
 5 |     nodeIt = graph->getNodeMap()->begin();
 6 |     index = 0;
 7 |     findNextContig();
 8 | }
 9 | 
10 | Contig* ContigIterator::getContig(){
11 |     if(nodeIt != graph->getNodeMap() -> end()){
12 |         Contig* result = nodeIt->second.contigs[index];
13 |         increment();
14 |         findNextContig();
15 |         return result;
16 |     }
17 | }
18 | 
19 | void ContigIterator::increment(){
20 |     if(index < 4){
21 |         index++;
22 |     }
23 |     else{
24 |         if(nodeIt != graph->getNodeMap()->end()){
25 |             index = 0;
26 |             nodeIt++;
27 |         }
28 |     }
29 | }
30 |     
31 | Contig* ContigIterator::findNextContig(){    
32 |     for( ; nodeIt != graph->getNodeMap()->end(); nodeIt++){
33 |         ContigNode* node = &nodeIt->second;
34 |         for(index %= 5; index < 5; index++){//if index is 5, reset to 0. Otherwise use it as is
35 |             if(node->contigs[index]){
36 |                 Contig* contig = node->contigs[index];
37 |                 if(!contig->node1_p || !contig->node2_p){ //if one side is a sink, always return
38 |                     return contig;
39 |                 }
40 |                 else if(contig->node1_p == contig->node2_p){ //if the contig attaches to the same node twice, print when you see lower index
41 |                     if(index == contig->getMinIndex()){    
42 |                         return contig;
43 |                     }
44 |                 }
45 |                 else if(contig->getSide(node,index) == 1){ //If it attaches to two distinct nodes, print when you're on side 1
46 |                      return contig;  
47 |                 }
48 |             }       
49 |         }
50 |     }
51 |     return NULL;
52 | }
53 |     
54 | bool ContigIterator::hasNextContig(){
55 |     return nodeIt != graph->getNodeMap()->end();
56 | }
57 | 


--------------------------------------------------------------------------------
/src/tests/TestUtils.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include <string.h>
 5 | #include <unistd.h>
 6 | #include <sys/types.h>
 7 | #include <inttypes.h>
 8 | #include <stdint.h>
 9 | #include <algorithm> // for max/min
10 | #include <vector> // for sorting_kmers
11 | #include <sys/time.h>
12 | #include <string>
13 | #include "TestUtils.h"
14 | using std::string;
15 | 
16 | using namespace std;
17 | int64_t nb_reads;
18 | kmer_type test_kmer;
19 | //Note: nuc order is ACTG = 0123
20 | 
21 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i){
22 |     char* kmerSeq = new char[sizeKmer];
23 |     code2seq(kmer, kmerSeq);
24 |     for(int pos = 0; pos < sizeKmer; pos++){
25 |         if(read[i+pos] != kmerSeq[pos]) return false;
26 |     }
27 |     return true;
28 | }
29 | 
30 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2){
31 |     char* kmerSeq1 = new char[sizeKmer];
32 |     code2seq(kmer1, kmerSeq1);
33 |     char* kmerSeq2 = new char[sizeKmer];
34 |     code2seq(kmer2, kmerSeq2);
35 |     int length = min(sizeKmer - i1, sizeKmer - i2);
36 |     for(int pos = 0; pos < length; pos++){
37 |         if(kmerSeq1[i1+pos] != kmerSeq2[i2+pos]) return false;
38 |     }
39 |     return true;
40 | }
41 | 
42 | kmer_type getKmerFromString(string kmerString){
43 |     kmer_type kmer;
44 |     getFirstKmerFromRead(&kmer, &(kmerString[0]));
45 |     return kmer;
46 | }
47 | 
48 | Bloom* loadBloom(string list[], int numKmers, int k){
49 |     Bloom* fakeBloom = new Bloom((uint64_t)10000, k);
50 | 
51 |     std::set<kmer_type> valids;
52 | 
53 |     kmer_type kmer;
54 |     for(int i = 0; i < numKmers; i++){
55 |         valids.insert(getKmerFromString(list[i]));
56 |     }
57 |     fakeBloom->fakify(valids);
58 |     return fakeBloom;
59 | }
60 | 
//Reports a failed test on stdout as "<testName>: <errorMessage>".
void fail(char* testName, char* errorMessage){
    fprintf(stdout, "%s: %s \n", testName, errorMessage);
}
64 | 
65 | 
//Reports a failed test on stdout without a specific message.
void fail(char* testName){
    fprintf(stdout, "%s: fail. \n", testName);
}
69 | 
//Reports a passed test on stdout.
void succeed(char* testName){
    fprintf(stdout, "%s: success! \n", testName);
}
73 | 


--------------------------------------------------------------------------------
/utils/tests/TestUtils.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include <string.h>
 5 | #include <unistd.h>
 6 | #include <sys/types.h>
 7 | #include <inttypes.h>
 8 | #include <stdint.h>
 9 | #include <algorithm> // for max/min
10 | #include <vector> // for sorting_kmers
11 | #include <sys/time.h>
12 | #include <string>
13 | #include "TestUtils.h"
14 | using std::string;
15 | 
16 | using namespace std;
17 | int64_t nb_reads;
18 | kmer_type test_kmer;
19 | //Note: nuc order is ACTG = 0123
20 | 
21 | bool kmer_matches_readseq(char* read, kmer_type kmer, int i){
22 |     char* kmerSeq = new char[sizeKmer];
23 |     code2seq(kmer, kmerSeq);
24 |     for(int pos = 0; pos < sizeKmer; pos++){
25 |         if(read[i+pos] != kmerSeq[pos]) return false;
26 |     }
27 |     return true;
28 | }
29 | 
30 | bool kmer_matches_kmer(kmer_type kmer1, int i1, kmer_type kmer2, int i2){
31 |     char* kmerSeq1 = new char[sizeKmer];
32 |     code2seq(kmer1, kmerSeq1);
33 |     char* kmerSeq2 = new char[sizeKmer];
34 |     code2seq(kmer2, kmerSeq2);
35 |     int length = min(sizeKmer - i1, sizeKmer - i2);
36 |     for(int pos = 0; pos < length; pos++){
37 |         if(kmerSeq1[i1+pos] != kmerSeq2[i2+pos]) return false;
38 |     }
39 |     return true;
40 | }
41 | 
42 | kmer_type getKmerFromString(string kmerString){
43 |     kmer_type kmer;
44 |     getFirstKmerFromRead(&kmer, &(kmerString[0]));
45 |     return kmer;
46 | }
47 | 
48 | Bloom* loadBloom(string list[], int numKmers, int k){
49 |     Bloom* fakeBloom = new Bloom((uint64_t)10000, k);
50 | 
51 |     std::set<kmer_type> valids;
52 | 
53 |     kmer_type kmer;
54 |     for(int i = 0; i < numKmers; i++){
55 |         valids.insert(getKmerFromString(list[i]));
56 |     }
57 |     fakeBloom->fakify(valids);
58 |     return fakeBloom;
59 | }
60 | 
//Reports a failed test on stdout as "<testName>: <errorMessage>".
void fail(char* testName, char* errorMessage){
    fprintf(stdout, "%s: %s \n", testName, errorMessage);
}
64 | 
65 | 
//Reports a failed test on stdout without a specific message.
void fail(char* testName){
    fprintf(stdout, "%s: fail. \n", testName);
}
69 | 
//Reports a passed test on stdout.
void succeed(char* testName){
    fprintf(stdout, "%s: success! \n", testName);
}
73 | 


--------------------------------------------------------------------------------
/utils/Junction.h:
--------------------------------------------------------------------------------
 1 | #ifndef JUNCTION
 2 | #define JUNCTION
 3 | 
 4 | #include <iostream>
 5 | #include <string>
 6 | #include "Kmer.h"
 7 | using std::string;
 8 | using std::ofstream;
 9 | 
//Per-junction bookkeeping: coverage of each forward extension, plus the distance
//scanned and a "linked" flag for each of the five directions.
class Junction{
private:   
    unsigned char cov[4]; //the number of reads along this extension (forward extensions A, C, T, G only)

public:
    //The following two fields are indexed by a value from 0-4.  If the value is from 0-3, it indicates the forward extension from adding
    //A, C, T, or G, respectively. If the index is 4, it refers to the backwards direction.
    unsigned char dist[5]; //the distance to the next adjacent junction, or the farthest scanned as of yet without hitting another junction
    bool linked[5]; //whether or not we found another junction along this extension

    //Returns an index that points in the direction opposite the direction of the given index.
    //If the input is a forward index (less than 4), returns 4 (the backward direction).
    //Otherwise returns the lowest forward index (0-3) whose coverage is positive.
    //NOTE(review): confirm what is returned when no forward extension has coverage.
    int getOppositeIndex(int index);
    int numPathsOut(); //Returns the number of forward paths out of the junction with positive coverage
    bool isSolid(int threshold); //Returns true if at least 2 paths out of the junction have at least a threshold coverage.  
    
    
    //"dist dist dist dist dist  cov cov cov cov cov  linked linked linked linked linked " for each of A,C,T,G,Back, in order.
    //The fifth cov value is the sum of the four forward coverages.
    string toString();
 
    //Updates the junction to point to given distance, if it's greater than the current distance stored.
    void update(int nucExt, unsigned char length);


    //Returns coverage along given extnsion
    //If nucExt == 4, returns the sum of the four coverage fields
    int getCoverage(int nucExt);

    //Overwrites the coverage of the given forward extension (cov only has entries 0-3).
    void setCoverage(int nucExt, int coverage);


    //Increments the coverage along the given extension by 1.
    //The counter is an unsigned char and stays at its maximum instead of wrapping around.
    void addCoverage(int nucExt);

    //Sets linked to true along the given extension.
    void link(int nucExt);

    //Initializes with 0 coverage, 0 distance, and linked false.
    Junction();

    //Get junction from string printout
    Junction(string juncString);
};
54 | 
55 | #endif


--------------------------------------------------------------------------------
/utils/ReadKmer.h:
--------------------------------------------------------------------------------
 1 | #ifndef READ_KMER
 2 | #define READ_KMER
 3 | 
 4 | #include <string>
 5 | 
 6 | #include "Kmer.h"
 7 | #include "DoubleKmer.h"
 8 | 
 9 | using std::string;
10 | 
//Used to represent a kmer with relation to a read.
//Stores the read, the kmer and revcomp as a DoubleKmer and the position on the read (left end of the kmer). 
class ReadKmer{
public:
    //basic fields
    string* read; //the read this kmer sits on (pointer only; ownership is not visible from this header)
    DoubleKmer doubleKmer; //the kmer together with its reverse complement
    int pos; //position of the left end of the kmer on the read
    bool direction; //which way the kmer currently faces (FORWARD or BACKWARD, see the comments below)

    //The real extension of the ReadKmer definitely j-checks to at least the return value, based on its position on the read.
    int getMaxGuaranteedJ(bool dir);

    int getDistToEnd(); //returns dist to end
    int getTotalPos(); //returns dist to start 

    //returns true if the kmer is at a valid position on the read.  Does not include the first backward kmer or the last forward kmer.
    bool onRead();    

    kmer_type getKmer();
    kmer_type getRevCompKmer();

    //moves one position forward. If facing BACKWARD, simply changes the kmer to face  FORWARD.
    //This may entail going from facing BACKWARD to FORWARD
    void forward(); 
    void advanceDist(int dist); //calls forward repeatedly

    //Gets the index on a corresponding junction which corresponds to going along the read in the given direction
    //e.g. if the ReadKmer faces forward and direction is BACKWARD, returns 4.
    //If the ReadKmer faces forward and the direction is FORWARD and the next real nucleotide on the read is 'T', returns 2
    int getExtensionIndex(bool direction); 
    kmer_type getExtension(int newNuc);// Gets the next extension in the direction its facing, for the given nucleotide extension
    kmer_type getRealExtension();// gets the next real kmer in the direction the ReadKmer is pointing
    int getRealExtensionNuc(); //gets the next real nucleotide in the direction the ReadKmer is pointing
    kmer_type getCanon(); 
    int offset(); //gets the contribution of the direction to the distance- 0 if BACKWARD, 1 if FORWARD

    char* directionAsString(); //for printing

    ReadKmer(string* theRead); //initializes the DoubleKmer to refer to the first kmer in the read
    ReadKmer(string* theRead, int index, bool dir);//Creates a double kmer corresponding to the given read, the index into the read, and the direction
    ReadKmer(ReadKmer* toCopy); //copy construct (takes a pointer, so this is not the implicit C++ copy constructor)

};
55 | 
56 | 
57 | 
58 | #endif


--------------------------------------------------------------------------------
/utils/tests/JCheckTests.cpp:
--------------------------------------------------------------------------------
 1 | #include "JCheckTests.h"
 2 | #include <string>
 3 | #include <set>
 4 | using std::string;
 5 | 
 6 | namespace jCheckTests{
 7 | 
 8 | string fake_read1 = "ACGGGCGAACTTTCATAGGA";
 9 | string fake_read2 = "GGCGAACTAGTCCAT";
10 | string fake_read3  = "AACTTTCATACGATT";
11 | Bloom* bloom;
12 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT"
13 |     ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG"
14 |     , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT", "CGATT"};
15 | JChecker* jchecker;
16 | 
17 | bool jcheck(string kmer, int j){
18 |     jchecker = new JChecker(j, bloom);
19 |     uint64_t hash0 =  bloom->get_rolling_hash(getKmerFromString(kmer),0);
20 |     uint64_t hash1 =  bloom->get_rolling_hash(getKmerFromString(kmer),1);
21 |     return jchecker->jcheck(&kmer[0],hash0,hash1);
22 | }
23 | 
24 | void jcheck_testForwardJ1NoExtension(){
25 |     char* testName = (char*)"jcheck_testForwardJ1WithNoExtension";
26 | 
27 |     bool jchecked = jcheck("TCCAT", 1);
28 | 
29 |     if(jchecked){
30 |         fail(testName,(char*)"It thought there was an extension.");
31 |         return;
32 |     }
33 |     succeed(testName);
34 | }
35 | 
36 | void jcheck_testForwardJ1WithExtension(){
37 |     char* testName = (char*)"jcheck_testForwardJ1WithExtension";
38 | 
39 |     bool jchecked = jcheck("GTCCA",1);
40 | 
41 |     if(!jchecked){
42 |         fail(testName, (char*)"It thought there was no extension.");
43 |         return;
44 |     }
45 |     succeed(testName);
46 | }
47 | 
48 | void jcheck_testForwardJ2WithExtension(){
49 |     char* testName = (char*)"jcheck_testForwardJ2WithExtension";
50 | 
51 |     bool jchecked = jcheck("GAACT",2);
52 |     
53 |     if(!jchecked){
54 |         fail(testName, (char*)"It thought there was no extension.");
55 |         return;
56 |     }
57 |     succeed(testName);
58 | }
59 | 
60 | void jcheck_testForwardJ2WithNoExtension(){
61 |     char* testName = (char*)"jcheck_testForwardJ2WithNoExtension";
62 | 
63 |     bool jchecked = jcheck("ACGAT", 2);
64 | 
65 |     if(jchecked){
66 |         fail(testName, (char*)"It thought there was an extension.");
67 |         return;
68 |     }
69 |     succeed(testName);
70 | }
71 | 
72 | void runJCheckTests(){
73 |     setSizeKmer(5);
74 |     bloom = loadBloom(valid_5mers, 28,5);
75 | 
76 |     jcheck_testForwardJ1WithExtension();
77 |     jcheck_testForwardJ1NoExtension();
78 |     jcheck_testForwardJ2WithExtension();
79 |     jcheck_testForwardJ2WithNoExtension();
80 | }
81 | 
82 | }


--------------------------------------------------------------------------------
/utils/ContigJuncList.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONTIG_JUNC_LIST
 2 | #define CONTIG_JUNC_LIST
 3 | 
 4 | class JuncResult;
 5 | 
 6 | #include <deque>
 7 | #include "JuncPairs.h"
 8 | 
//Stores info about all interior junctions
//List of incremental distances and coverages
//Coverages include coverage at each end
class ContigJuncList{

public:

    //Distances and coverages are both kept as vectors of unsigned char,
    //so each stored value is limited to the range of a single byte.
    typedef std::vector<unsigned char> junc_list;
    typedef junc_list::const_iterator const_iterator;


    ContigJuncList(std::string seq, junc_list dist, junc_list cov);
    ContigJuncList();

    //Read-only iteration over the two underlying lists
    const_iterator begin_distances() const{ return distances.begin();}
    const_iterator begin_coverages() const{ return coverages.begin();}
    const_iterator end_distances() const{ return distances.end();}
    const_iterator end_coverages() const{ return coverages.end();}
    void setSeq(std::string cont){seq = cont;}
    std::string getSeq(){ return seq;}
    bool isValidKmerPosition(int pos);//true iff getKmer makes sense on this
    kmer_type getKmer(int pos);//0 = first backward, 1 = first forward, 2 = second backward, etc.

    //Gets a list of JuncResults, specifying distance, coverage, and kmer for juncs, with reference to specified side 
    std::list<JuncResult> getJuncResults(bool startForward, int startDist, int maxDist);
    void printJuncResults(int side, int startDist, int maxDist);
    void printJuncResults(std::list<JuncResult> results);
    void printJuncValues();
    int size();

    //Used for reversing a contig.  Simply reverses both lists
    void reverse();

    int length(); //returns length of sequence
    
    //Sums all distance values
    int getTotalDistance();

    //Concatenates this list of juncs with another 
    //Removes overlap of middle coverage and middle distance
    ContigJuncList concatenate(ContigJuncList otherList);

    //Averages all coverage values in list
    double getAvgCoverage();
    double getAvgCoverage(std::list<JuncResult> results);

    //Sample variance of the coverage values, for this list or for a given list of results
    double getCoverageSampleVariance();
    double getCoverageSampleVariance(std::list<JuncResult> results);
    //The three functions below return a new ContigJuncList rather than modifying this one
    //(exact scaling/shifting rules are defined in the implementation, not visible here).
    ContigJuncList getScaledContigJuncs(double scale_factor);
    ContigJuncList getShiftedCoverageContigJuncs(double shift);
    ContigJuncList getShiftedCoverageContigJuncsRange(double shift, int maxDist, int side);


    //Prints distances then coverages to a string
    std::string getStringRep();

private: 
    junc_list distances;
    junc_list coverages;
    std::string seq; //string sequence, represented forward from side 1 to side 2
};
70 | 
71 | 
72 | 
73 | #endif


--------------------------------------------------------------------------------
/utils/Junction.cpp:
--------------------------------------------------------------------------------
  1 | #include "Junction.h"
  2 | #include <fstream>
  3 | #include <limits.h>
  4 | #include <sstream>
  5 | #include <iostream>
  6 | using std::ofstream;
  7 | using std::max;
  8 | using std::istringstream;
  9 | using std::stringstream;
 10 | 
 11 | int Junction::getOppositeIndex(int index){
 12 |   if(index < 4){
 13 |     return 4;
 14 |   }
 15 |   else{
 16 |     for(int i = 0; i < 4; i++){
 17 |       if(cov[i] > 0){
 18 |         return i;
 19 |       }
 20 |     }
 21 |   }
 22 | }
 23 | 
 24 | int Junction::numPathsOut(){
 25 |   int numPaths = 0;
 26 |   for(int i = 0; i < 4; i++){
 27 |     if(cov[i] > 0){
 28 |       numPaths++;
 29 |     }
 30 |   }
 31 |   return numPaths;
 32 | }
 33 | 
 34 | void Junction::link(int nucExt){
 35 |   linked[nucExt] = true;
 36 | }
 37 | 
 38 | bool Junction::isSolid(int threshold){
 39 |   int pathsOut = 0;
 40 |   for(int i = 0; i < 4; i++){
 41 |     if(cov[i] >= threshold){
 42 |       pathsOut++;
 43 |     }
 44 |   }
 45 |   return pathsOut > 1;
 46 | }
 47 | 
 48 | int Junction::getCoverage(int nucExt){
 49 |   if(nucExt < 4){
 50 |     return (int)cov[nucExt];
 51 |   }
 52 |   return (int)cov[0] + (int)cov[1] + (int)cov[2] + (int)cov[3];
 53 | }
 54 | 
 55 | void Junction::setCoverage(int nucExt, int coverage){
 56 |   cov[nucExt] = coverage;
 57 | }
 58 | 
 59 | void Junction::addCoverage(int nucExt){
 60 |   cov[nucExt] = cov[nucExt] + 1;
 61 | 
 62 |   //handle overflow
 63 |   if(cov[nucExt] == 0){
 64 |     cov[nucExt] = UCHAR_MAX; 
 65 |   }
 66 | }
 67 | 
 68 | //Updates the junc info based on finding a path of length length from the extension nucExt
 69 | void Junction::update(int nucExt, unsigned char lengthFor){
 70 |       dist[nucExt] = max(dist[nucExt], lengthFor);
 71 | }
 72 | 
 73 | //"dist dist dist dist dist  cov cov cov cov cov  linked linked linked linked linked " for each of A,C,T,G,Back, in order.
 74 | string Junction::toString(){
 75 |   stringstream stream;
 76 |   for(int i = 0; i < 5; i++){
 77 |     stream << (int)dist[i] << " " ;
 78 |   }
 79 |   stream << " ";
 80 |   for(int i = 0; i < 5; i++){
 81 |     stream << getCoverage(i) << " " ;
 82 |   }
 83 |   stream << " ";
 84 |   for(int i = 0; i < 5; i++){
 85 |     stream << linked[i] << " " ;
 86 |   }
 87 |   return stream.str();
 88 | }
 89 | 
 90 | //explicitly set if it's a spacer or not
 91 | Junction::Junction(){
 92 |   for(int i  = 0; i < 4; i++){
 93 |     dist[i] = 0;
 94 |     cov[i] = 0;
 95 |     linked[i] = false;
 96 |   }
 97 |   dist[4] = 0;
 98 |   linked[4] = false;
 99 | }
100 | 
101 | //Get junction from string printout
102 | Junction::Junction(string juncString){
103 |   istringstream iss(juncString);
104 |   string val;
105 |   for(int i = 0; i < 5; i++){
106 |     iss >> val;
107 |     dist[i] = stoi(val);
108 |   }
109 |   for(int i = 0; i < 4; i++){
110 |     iss >> val;
111 |     cov[i] = stoi(val);
112 |   }
113 |   iss >> val;
114 |   for(int i = 0; i < 5; i++){
115 |     iss >> val;
116 |     linked[i] = stoi(val);
117 |   }
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | You can download Faucet [here](https://github.com/Shamir-Lab/Faucet/releases/download/v0.5/Faucet-v0.5.zip) or clone it via the link below. In case you download the zip, unzip the file before following the instructions below (ignoring the 'git clone' line)
 2 | 
 3 | # Getting Faucet
 4 |     git clone https://github.com/rozovr/Faucet.git
 5 |     cd Faucet/src
 6 | 	make depend
 7 | 	make    
 8 | 
 9 | # Running Faucet (locally)
10 | Example usage:
11 | 
12 | ```bash
13 | ./faucet -read_load_file interlaced_reads.fq \
14 |          -read_scan_file interlaced_reads.fq \
15 | 		 -size_kmer 31 \
16 | 		 -max_read_length 100 \
17 | 		 -estimated_kmers 1000000000 \
18 | 		 -singletons 200000000 \
19 | 		 -file_prefix faucet_outputs \
20 | 		 --fastq \
21 | 		 --paired_ends
22 | ```
23 | 
24 | The above command takes as input the file interlaced_reads.fq (where entries alternate between mates 1 and 2 of a paired end library), and the input format is fastq. Faucet does not accept separate mate files, but can accept fasta format and files composed of read sequences alone.
25 | 
26 | # Streaming from a remote source
27 | A demonstration streaming reads from a remote server is provided in the script src/stream_data_from_urls_list.sh
28 | 
29 | You can run it with:
30 | ```bash
31 | ./stream_data_from_urls_list.sh out wget_urls 1596741569 12045222
32 | ```
33 | where `wget_urls` is a file with URLs downloaded from ENA,
 34 | `1596741569` is the estimated number of unique kmers (F0) and `12045222` is the estimated number of singleton kmers (f1).
35 | 
36 | # Requirements
37 | Faucet was implemented in C++ 11, so requires a compiler that is not too ancient to support it, and has been tested only on Linux so far. 
38 | 
39 | # Detailed usage
40 | 
41 | Usage:
42 | ./faucet -read_load_file <filename> -read_scan_file <filename> -size_kmer <k> -max_read_length <length> -estimated_kmers <num_kmers> -singletons <num_kmers> -file_prefix <prefix>
43 | Optional arguments: --fastq -max_spacer_dist <dist> -fp rate <rate> -j <int> --two_hash -bloom_file <filename> -junctions_file <filename> --paired_ends --no_cleaning
44 | 
45 | ### required arguments:
46 |  
47 | 	-read_load_file <filename>, a file name string 
48 | 	-read_scan_file <filename> , a file name string
 49 | 	-size_kmer <k>, an odd integer <= 31
50 | 	-max_read_length <length>, the longest read length in the data (e.g., if the reads were trimmed to different sizes)
51 | 	-estimated_kmers <num_kmers> 
52 | 	-singletons <num_kmers> 
53 | 	-file_prefix <prefix>, the desired prefix string or directory path for output files 
54 |  
 55 | We recommend using <a href="https://github.com/bcgsc/ntCard">ntCard</a> to estimate the number of distinct k-mers (F0) and the number of singleton k-mers (f1) in the dataset.
56 | 
57 | 
58 | License
59 | =======
60 | 
61 | 
62 | * Low level code for dealing with binary encoded k-mers and strings, and for Bloom filters is derived from the original minia implementation, http://minia.genouest.org/; these components, mostly unmodified, are distributed under a GPL 3.0 license
63 | 
64 | * Original code is distributed under the BSD 3 clause license.
65 | 


--------------------------------------------------------------------------------
/src/tests/olderTests/GetReadJunctionsTests.cpp:
--------------------------------------------------------------------------------
 1 | #include "GetReadJunctionsTests.h"
 2 | #include <stdio.h>
 3 | #include <map>
 4 | using std::map;
 5 | 
 6 | namespace findReadJunctionsTests {
 7 | 
 8 | string fake_read1 = "ACGGGCGAACTTTCATAGGA";
 9 | string fake_read2 = "GGCGAACTAGTCCAT";
10 | string fake_read3  = "AACTTTCATACGATT";
11 | string fake_read4 = "CGCGCGCGCGC";
12 | Bloom* bloom;
13 | ReadScanner* scanner;
14 | 
15 | //this is all the kmers from the reads plus two error kmers that cause a 
16 | //TACGA --> ACGATT, ACGAAA branch (fake of length 2)
17 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT"
18 |     ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG"
19 |     , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT","CGATT", "ACGAC", "CGACA"
20 |     , "CGCGC", "GCGCG"};
21 | 
22 | Junction* getReadJunction(int pos, int dir,  Junction** readJunctions){
23 |     return readJunctions[2*pos + dir];
24 | }
25 | 
26 | void testFindReadJunctions_NoJunctions(){
27 |     char* testName = (char*)"testFindReadJunctions_NoJunctions";
28 | 
29 |      scanner->find_read_junctions(fake_read4);
30 | 
31 |      Junction** readJunctions = scanner->get_read_junctions();
32 | 
33 |     for(int i = 0; i < fake_read4.length(); i++){
34 |         if(getReadJunction(i,0,readJunctions) || getReadJunction(i,1,readJunctions) ){
35 |             fail(testName);
36 |             return;
37 |         }
38 |     }  
39 |      succeed(testName);
40 | }
41 | 
42 | void testFindReadJunctions_2Junctions(){
43 |     char* testName = (char*)"testFindReadJunctions_2Junctions";
44 | 
45 |      scanner->find_read_junctions(fake_read1);//expect junction at 6 and 12
46 | 
47 |      Junction** readJunctions = scanner->get_read_junctions();
48 | 
49 |      if(!getReadJunction(6,0, readJunctions)){
50 |         fail(testName, (char*)"Didn't find the forward junction at 6.");
51 |         return;
52 |      } 
53 |      if(!getReadJunction(12,0, readJunctions)){
54 |         fail(testName, (char*)"Didn't find the forward junction at 12");
55 |      }
56 |      succeed(testName);
57 | }
58 | 
59 | void testFindReadJunctions_backwardJunction(){
60 |     char* testName = (char*)"testFindReadJunctions_BackwardJunction";
61 |     string backwardRead = fake_read1;
62 |     revcomp_sequence(&backwardRead[0], backwardRead.length());
63 | 
64 |      scanner->find_read_junctions(backwardRead);//expect junction at 6 and 12
65 | 
66 |      Junction** readJunctions = scanner->get_read_junctions();
67 | 
68 |      if(!getReadJunction(3,1, readJunctions)){
69 |         fail(testName, (char*)"Didn't find the backward junction at 3.");
70 |         return;
71 |      }  
72 |      if(!getReadJunction(9,1, readJunctions)){
73 |         fail(testName, (char*)"Didn't find the backward junction at 9");
74 |      }
75 |      succeed(testName);
76 | }
77 | 
78 | void runFindReadJunctionsTests(){
79 |     setSizeKmer(5);
80 |     bloom = loadBloom(valid_5mers,31,5);
81 |     scanner = new ReadScanner("mockfile", bloom, new JChecker(0, bloom));
82 |     
83 |     testFindReadJunctions_NoJunctions();
84 |     testFindReadJunctions_2Junctions();
85 |     testFindReadJunctions_backwardJunction();
86 | }
87 | 
88 | }


--------------------------------------------------------------------------------
/src/tests/olderTests/FindNextJunctionTests.cpp:
--------------------------------------------------------------------------------
 1 | #include "FindNextJunctionTests.h"
 2 | #include <stdio.h>
 3 | 
 4 | namespace findNextJunctionTests
 5 | {
 6 | 
 7 | string fake_read1 = "ACGGGCGAACTTTCATAGGA";
 8 | string fake_read2 = "GGCGAACTAGTCCAT";
 9 | string fake_read3  = "AACTTTCATACGATT";
10 | Bloom* bloom;
11 | ReadScanner* scanner;
12 | 
 13 | //this is all the kmers from the reads plus two error kmers (ACGAC, CGACA) that cause a 
 14 | //TACGA --> ACGATT, ACGACA branch (fake of length 2)
15 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT"
16 |     ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG"
17 |     , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT","CGATT", "ACGAC", "CGACA"};
18 | 
19 | void findNextJunction_J1_testFromStart(){
20 |     //int* pos, kmer_type * kmer, string read, int j, Bloom* bloo1
21 |     char* testName = (char*)"findNextJunction_J1_testFromStart";
22 |     int pos = 0;
23 |     kmer_type kmer = getKmerFromString("ACGGG");
24 |      scanner = new ReadScanner("mockFile", bloom, new JChecker(1, bloom));
25 |     scanner->resetHashes(kmer);
26 | 
27 |     Junction* junc = scanner->find_next_junction(&pos, &kmer, fake_read1);
28 | 
29 |     if(pos != 6){
30 |         fail(testName, (char*)"pos was incorrect.");
31 |         return;
32 |     }    
33 |     if(kmer != getKmerFromString("GAACT")){
34 |         fail(testName, (char*)"kmer was incorrect.");
35 |         return;
36 |     }
37 |     succeed(testName);
38 | }
39 | 
40 | void findNextJunction_J2_testOffEnd(){
41 |     //int* pos, kmer_type * kmer, string read, int j, Bloom* bloo1
42 |     char* testName = (char*)"findNextJunction_J2_testOffEnd";
43 |     int pos = 13;
44 |     kmer_type kmer = getKmerFromString("CATAG");  
45 |     scanner = new ReadScanner("mockFile", bloom, new JChecker(2, bloom));
46 |     scanner->resetHashes(kmer);
47 |     
48 |     Junction* junc = scanner->find_next_junction(&pos, &kmer, fake_read1);
49 | 
50 |     if(junc){
51 |         fail(testName, (char*)"Returned a junction.");
52 |         return;
53 |     }
54 |     if(pos != 15 ){
55 |         printf("Position %d\n", pos);
56 |         fail(testName, (char*)"Incorrect position.");
57 |         return;
58 |     }
59 |     succeed(testName);
60 | }
61 | void findNextJunction_J1_testAtJunction(){
62 |     //int* pos, kmer_type * kmer, string read, int j, Bloom* bloo1
63 |     char* testName = (char*)"findNextJunction_J1_testAtJunction";
64 |     int pos = 13;
65 |     kmer_type kmer = getKmerFromString("CATAG");  
66 |     scanner = new ReadScanner("mockFile", bloom, new JChecker(2, bloom));
67 |     scanner->getJunctionMap()->createJunction(kmer);
68 |     scanner->resetHashes(kmer);
69 |     
70 |     Junction* junc = scanner->find_next_junction(&pos, &kmer, fake_read1);
71 | 
72 |     if(pos != 13 ){
73 |         fail(testName, (char*)"Did not return initial position.");
74 |         return;
75 |     }
76 |     succeed(testName);
77 | }
78 | 
79 | void runFindNextJunctionTests(){
80 |     setSizeKmer(5);
81 |     bloom = loadBloom(valid_5mers,30,5);
82 | 
83 |    findNextJunction_J1_testFromStart();
84 |    findNextJunction_J2_testOffEnd();
85 |    findNextJunction_J1_testAtJunction();
86 | }
87 | 
88 | }


--------------------------------------------------------------------------------
/src/tests/olderTests/TraverseReadsTests.cpp:
--------------------------------------------------------------------------------
  1 | #include "TraverseReadsTests.h"
  2 | #include <stdio.h>
  3 | #include <map>
  4 | using std::map;
  5 | 
  6 | namespace traverseReadsTests {
  7 | //Fixture reads: fake_read1 and fake_read2 agree up to the 5-mer GAACT and then differ; fake_read3 follows fake_read1 from AACTT through TCATA and then differs.
  8 | string fake_read1 = "ACGGGCGAACTTTCATAGGA";
  9 | string fake_read2 = "GGCGAACTAGTCCAT";
 10 | string fake_read3  = "AACTTTCATACGATT";
 11 | Bloom* bloom; //loaded with valid_5mers by runTraverseReadsTests
 12 | //NOTE(review): `scanner` is used below but is not declared in this file; presumably TraverseReadsTests.h provides it - confirm.
 13 | //this is all the kmers from the reads plus two error kmers (ACGAC, CGACA) that cause a 
 14 | //TACGA --> ACGAT (real), ACGAC (fake) branch, the fake side being 2 kmers long
 15 | string valid_5mers[] = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT"
 16 |     ,"AACTT","ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA", "AACTA","ACTAG"
 17 |     , "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT" ,"CATAC", "ATACG", "TACGA", "ACGAT","CGATT", "ACGAC", "CGACA"};
 18 | 
 19 | //Debug helper used on failure: prints the junction map's size, then one kmer per line.
 20 | void printJunctionMap(){
 21 |     printf("Size: %zu \n", static_cast<size_t>(scanner->getJunctionMap().size())); //cast + %zu: a container size must not be printed with %d
 22 |     for (auto& kv : scanner->getJunctionMap()){
 23 |         printf("%s \n", print_kmer(kv.first));
 24 |     }
 25 | }
 26 | void traverseReads(int j){ //sets the scanner's j, then feeds the three fixture reads through smart_traverse_read in order
 27 |     scanner->setJ(j);
 28 |     scanner->smart_traverse_read(fake_read1);
 29 |     scanner->smart_traverse_read(fake_read2);
 30 |     scanner->smart_traverse_read(fake_read3);
 31 | }
 32 | 
 33 | void testTraverseReads_J0(){
 34 |     char* testName = (char*)"testTraverseReads_J0";
 35 |     scanner = new ReadScanner("mockfile", bloom);
 36 | 
 37 |     traverseReads(0);
 38 | 
 39 |     if(scanner->getJunctionMap().size() != 4){ //j = 0: 4 junctions expected
 40 |         fail(testName, (char*)"junction map size was wrong.");
 41 |         printJunctionMap();
 42 |         return;
 43 |     }
 44 |     succeed(testName);
 45 | }
 46 | 
 47 | 
 48 | void testTraverseReads_J1(){
 49 |     char* testName = (char*)"testTraverseReads_J1";
 50 |     scanner = new ReadScanner("mockfile", bloom);
 51 | 
 52 |     traverseReads(1);
 53 | 
 54 |     if(scanner->getJunctionMap().size() != 4){ //j = 1: 4 junctions expected
 55 |         fail(testName, (char*)"junction map size was wrong.");
 56 |         printJunctionMap();
 57 |         return;
 58 |     }
 59 |     succeed(testName);
 60 | }
 61 | 
 62 | 
 63 | void testTraverseReads_J2(){
 64 |     char* testName = (char*)"testTraverseReads_J2";
 65 |     scanner = new ReadScanner("mockfile", bloom);
 66 | 
 67 |     traverseReads(2);
 68 | 
 69 |     if(scanner->getJunctionMap().size() != 3){ //j = 2: 3 junctions expected
 70 |         fail(testName, (char*)"junction map size was wrong.");
 71 |         printJunctionMap();
 72 |         return;
 73 |     }
 74 |     succeed(testName);
 75 | }
 76 | 
 77 | void testTraverseReadTwice_SameJuncs(){
 78 |     char* testName = (char*)"testTraverseReadsTwice_SameJuncs";
 79 |     scanner = new ReadScanner("mockfile", bloom);
 80 | 
 81 |     traverseReads(1);
 82 |     traverseReads(1);
 83 | 
 84 |     if(scanner->getJunctionMap().size() != 4){ //a second pass over the same reads must not change the count
 85 |         fail(testName, (char*)"junction map size was wrong.");
 86 |         printJunctionMap();
 87 |         return;
 88 |     }
 89 |     succeed(testName);
 90 | }
 91 | 
 92 | void runTraverseReadsTests(){ //suite entry point: sets k = 5, loads the 30 fixture kmers, runs the four cases
 93 |     setSizeKmer(5);
 94 |     bloom = loadBloom(valid_5mers,30,5);
 95 | 
 96 |     testTraverseReads_J0();
 97 |     testTraverseReads_J1();
 98 |     testTraverseReads_J2();
 99 | 
100 |     testTraverseReadTwice_SameJuncs();
101 | }
102 | 
103 | }


--------------------------------------------------------------------------------
/utils/JChecker.cpp:
--------------------------------------------------------------------------------
 1 | #include "JChecker.h"
 2 | #include <stdio.h>
 3 | 
 4 | //Incremental version: j-check driven by rolling hashes. nextH0/nextH1 are the two hashes that seed level 0;
 5 | //kmerSeq[i] is the nucleotide handed to roll_hash at level i, so kmerSeq must hold at least j characters.
 6 | //j <= 0 always returns true. j > 0 checks extensions up to j deep from kmer, and returns true as soon as there is
 7 | //a sequence of j extensions which are all positive in the bloom filter, false once a level has no positive extension.
 8 | bool JChecker::jcheck(char* kmerSeq, uint64_t nextH0, uint64_t nextH1){
 9 |   if(j <= 0){ //also covers a negative j, which used to skip the loop and run off the end of the function
10 |     return true;
11 |   }
12 |   uint64_t workingHash0, workingHash1;
13 |   int lastCount, nextCount;
14 |   lastCount = 1;
15 |   lastHashes[0][0] = nextH0;
16 |   lastHashes[0][1] = nextH1;
17 | 
18 |   for(int i = 0; i < j; i++){//for each level up to j
19 |     nextCount = 0; //have found no extensions yet
20 |     for(int k = 0; k < lastCount; k++){ //for each kmer in the last level
21 |       workingHash0 = lastHashes[k][0];
22 |       workingHash1 = lastHashes[k][1];
23 |       for(int nt = 0; nt < 4; nt++){ //for each possible extension
24 |         nextHash0 = bloom->roll_hash(workingHash0, NT2int(kmerSeq[i]), nt, 0);
25 |         nextHash1 = bloom->roll_hash(workingHash1, NT2int(kmerSeq[i]), nt, 1);
26 |         if(bloom->contains(nextHash0, nextHash1)){ //add to next level if it's in the bloom filter
27 |           if(i == (j-1)){
28 |             return true;//if this is the last level return true after the first check
29 |           }
30 |           nextHashes[nextCount][0] = nextHash0;
31 |           nextHashes[nextCount][1] = nextHash1;
32 |           nextCount++;
33 |         }
34 |       }
35 |     }
36 | 
37 |     if(nextCount == 0){ //if there are no kmers in the list now, return false
38 |       return false;
39 |     }
40 |     //reset counts and lists for next level of the search: this level's "next" buffer becomes the following level's "last"
41 |     lastCount = nextCount;
42 |     tempor = lastHashes;
43 |     lastHashes = nextHashes;
44 |     nextHashes = tempor;
45 |   }
46 |   return true; //not reached for j > 0 (the last level always returns above); gives every path a defined result
47 | }
48 |     
49 | //Normal version of jchecking, without rolling hash: walks explicit kmers level by level and returns true when some chain of j extensions is positive (always true for j <= 0).
50 | //Old hash! use only for old hash!  For kpomerscanner
51 | bool JChecker::jcheck(kmer_type kmer){
52 |   kmer_type this_kmer, nextKmer;
53 |   int lastCount, nextCount;
54 |   lastCount = 1;
55 |   lastKmers[0] = kmer;
56 | 
57 |   for(int i = 0; i < j; i++){ //for up to j levels
58 |     nextCount = 0;
59 |     for(int k = 0; k < lastCount; k++){ //for every kmer in the last level
60 |       this_kmer = lastKmers[k];
61 |       for(int nt = 0; nt < 4; nt++){ //for every possible extension
62 |         nextKmer = next_kmer(this_kmer, nt, FORWARD);
63 |         if(bloom->oldContains(get_canon(nextKmer))){ //positive extension
64 |           if(i == (j-1)) return true; //last level: the first hit settles it (as in the rolling-hash overload), so it is never stored
65 |           nextKmers[nextCount] = nextKmer; //otherwise add it to the next level
66 |           nextCount++;
67 |         }
68 |       }
69 |     }
70 |     if(nextCount == 0){
71 |       return false; //if there are ever no valid kmers in the next level, the kmer does not jcheck
72 |     }
73 |     lastCount = nextCount;
74 |     //switch the pointers to the "last" and "next" arrays so we can use the current "next" one as the next "last" one
75 |     temp = lastKmers;
76 |     lastKmers = nextKmers;
77 |     nextKmers = temp;
78 |   }
79 |   return true; //reached only when j <= 0, i.e. when the loop never runs
80 | }
81 | 
82 | JChecker::JChecker(int jVal, Bloom* bloo){
83 |     //Stores j and the bloom filter, then allocates the scratch buffers that the two jcheck overloads use.
84 |     const int hashSlots = 20000, kmerSlots = 1000; //capacities of the hash-pair and kmer level buffers
85 |     j = jVal;
86 |     bloom = bloo;
87 |     lastHashes = new uint64_t*[hashSlots]; //rolling-hash overload: each slot holds a {hash0, hash1} pair
88 |     nextHashes = new uint64_t*[hashSlots];
89 |     for(int slot = 0; slot < hashSlots; ++slot){
90 |         lastHashes[slot] = new uint64_t[2];
91 |         nextHashes[slot] = new uint64_t[2];
92 |     }
93 |     lastKmers = new kmer_type[kmerSlots]; //kmer-based overload
94 |     nextKmers = new kmer_type[kmerSlots];
95 | }


--------------------------------------------------------------------------------
/src/Contig.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONTIG
 2 | #define CONTIG
 3 | 
 4 | #include "../utils/Kmer.h"
 5 | #include "../utils/JuncPairs.h"
 6 | #include "../utils/Bloom.h"
 7 | #include "../utils/JuncPairs.h"
 8 | #include "../utils/ContigJuncList.h"
 9 | #include "ContigNode.h"
10 | #include <iostream>
11 | #include <string.h>
12 | 
13 | using std::ofstream;
14 | 
15 | class ContigNode; // forward declaration
16 | 
17 | class Contig{ //a contig: a sequence (stored in contigJuncs) with up to two adjacent ContigNodes, one per side (1 and 2)
18 | private:
19 |     //utility for linking if they're both facing forward
20 |     //Glues end 2 of this contig to end 1 of the other
21 |     //Doesn't change the value of either contig- just returns the concatenation.
22 |     Contig* concatenate(Contig* otherContig);
23 |     //NOTE(review): presumably the neighbouring contigs in the given direction, each paired with a flag - confirm meaning of the bool in Contig.cpp
24 |     std::vector<std::pair<Contig*, bool> > getNeighbors(bool forward);
25 | 
26 | public:
27 | 
28 |     ~Contig();
29 |     std::string getFastGName(bool RC);
30 |     std::string getFastGHeader(bool RC);
31 | 
32 |     // length can be obtained from sequence
33 |     ContigNode * node1_p;  //adjacent node on side 1
34 |     ContigNode * node2_p; //adjacent node on side 2
35 |     unsigned char ind1; //index on which it connects to the node, on side 1
36 |     unsigned char ind2; //index on which it connects to the node, on side 2
37 |     bool marked; //NOTE(review): presumably the flag behind setMark/getMark - confirm
38 | 
39 |     //list of coverage and distance for interior junctions along this contig- since we can use for pair BF and coverage info
40 |     ContigJuncList contigJuncs; //also holds the sequence (see setSeq/getSeq below)
41 | 
42 |     bool checkValidity();
43 |     bool isDegenerateLoop();//returns true if both sides have same node and same index
44 | 
45 |     //Concatenates the two contigs, gluing together the specified sides
46 |     Contig* concatenate(Contig* otherContig, int thisSide, int otherSide);
47 |     std::pair<double, double> getPairsMeanStd(Bloom* pair_filter);
48 |     void printPairStatistics(Bloom* pair_filter);
49 |     int length(); //returns length of sequence
50 |     void reverse(); //reverses the contig orientation
51 |     ContigNode* otherEndNode(ContigNode * oneEnd);//returns a pointer to the node at the other end
52 |     void setEnds(ContigNode* n1, int i1, ContigNode* n2, int i2);
53 |     void setIndices(int i1, int i2);
54 |     void setSeq(std::string cont){contigJuncs.setSeq(cont);} //forwards to contigJuncs
55 |     std::string getSeq(){return contigJuncs.getSeq();} //forwards to contigJuncs
56 |     double getAvgCoverage();
57 |     double getAvgCoverage(std::list<JuncResult> results);
58 | 
59 |     double getCoverageSampleVariance();
60 |     double getCoverageSampleVariance(std::list<JuncResult> results);
61 |     void setContigJuncs(ContigJuncList juncList){ contigJuncs = juncList;} //replaces the whole junction list with a copy of juncList
62 |     std::list<JuncResult> getJuncResults(int side, int startDist, int maxDist);
63 | 
64 |     //NOTE(review): the original text here read "gets the node coverages on each end, returns minimum as base line for how much this should be covered.
65 |     //If the contig is isolated, returns 0" - that does not describe getTotalDistance, which only forwards to contigJuncs; confirm which function it belonged to
66 |     int getTotalDistance(){ return contigJuncs.getTotalDistance(); }
67 | 
68 |     float getMass();
69 |     int getMinIndex();
70 |     kmer_type getNodeKmer(ContigNode * contigNode);    //Assumes the given contig node points to one end of this contig
71 |     kmer_type getSideKmer(int side);    //either 1 or 2
72 |     int getSide(ContigNode* node);
73 |     int getSide(ContigNode* node, int index);
74 |     void setMark(bool value);
75 |     bool getMark();
76 |     ContigNode* getNode(int side);
77 |     int getIndex(int side);
78 |     bool isIsolated();//return true if both sides point to null
79 |     void setSide(int side, ContigNode* node);
80 |     std::string getStringRep();
81 |     Contig();
82 |     Contig( Contig * c); //builds a Contig from an existing one, passed by pointer (this is not a copy constructor)
83 | 
84 | 
85 | };
86 | 
87 | #endif


--------------------------------------------------------------------------------
/utils/lut.h:
--------------------------------------------------------------------------------
  1 | #ifndef CODE_H
  2 | #define CODE_H
  3 | 
  4 | 
  5 | //look up table conversion   (with  A,C,T,G  <-->  0,1,2,3)
  6 | 
  7 | //complement of one NT, indexed by its code under the A,C,T,G <--> 0,1,2,3 encoding: A<->T (0<->2), C<->G (1<->3)
  8 | const unsigned char comp_NT[4] = {
  9 |   2,3,0,1
 10 | };
 11 | 
 12 | //reverse complement of 4NT, ie one byte (2 bits per NT): entry i holds the codes of i complemented as in comp_NT and in reverse order, e.g. 0x00 (AAAA) -> 0xaa (TTTT)
 13 | const unsigned char revcomp_4NT[256] = {
 14 |  0xaa,
 15 |  0xea,
 16 |  0x2a,
 17 |  0x6a,
 18 |  0xba,
 19 |  0xfa,
 20 |  0x3a,
 21 |  0x7a,
 22 |  0x8a,
 23 |  0xca,
 24 |  0xa,
 25 |  0x4a,
 26 |  0x9a,
 27 |  0xda,
 28 |  0x1a,
 29 |  0x5a,
 30 |  0xae,
 31 |  0xee,
 32 |  0x2e,
 33 |  0x6e,
 34 |  0xbe,
 35 |  0xfe,
 36 |  0x3e,
 37 |  0x7e,
 38 |  0x8e,
 39 |  0xce,
 40 |  0xe,
 41 |  0x4e,
 42 |  0x9e,
 43 |  0xde,
 44 |  0x1e,
 45 |  0x5e,
 46 |  0xa2,
 47 |  0xe2,
 48 |  0x22,
 49 |  0x62,
 50 |  0xb2,
 51 |  0xf2,
 52 |  0x32,
 53 |  0x72,
 54 |  0x82,
 55 |  0xc2,
 56 |  0x2,
 57 |  0x42,
 58 |  0x92,
 59 |  0xd2,
 60 |  0x12,
 61 |  0x52,
 62 |  0xa6,
 63 |  0xe6,
 64 |  0x26,
 65 |  0x66,
 66 |  0xb6,
 67 |  0xf6,
 68 |  0x36,
 69 |  0x76,
 70 |  0x86,
 71 |  0xc6,
 72 |  0x6,
 73 |  0x46,
 74 |  0x96,
 75 |  0xd6,
 76 |  0x16,
 77 |  0x56,
 78 |  0xab,
 79 |  0xeb,
 80 |  0x2b,
 81 |  0x6b,
 82 |  0xbb,
 83 |  0xfb,
 84 |  0x3b,
 85 |  0x7b,
 86 |  0x8b,
 87 |  0xcb,
 88 |  0xb,
 89 |  0x4b,
 90 |  0x9b,
 91 |  0xdb,
 92 |  0x1b,
 93 |  0x5b,
 94 |  0xaf,
 95 |  0xef,
 96 |  0x2f,
 97 |  0x6f,
 98 |  0xbf,
 99 |  0xff,
100 |  0x3f,
101 |  0x7f,
102 |  0x8f,
103 |  0xcf,
104 |  0xf,
105 |  0x4f,
106 |  0x9f,
107 |  0xdf,
108 |  0x1f,
109 |  0x5f,
110 |  0xa3,
111 |  0xe3,
112 |  0x23,
113 |  0x63,
114 |  0xb3,
115 |  0xf3,
116 |  0x33,
117 |  0x73,
118 |  0x83,
119 |  0xc3,
120 |  0x3,
121 |  0x43,
122 |  0x93,
123 |  0xd3,
124 |  0x13,
125 |  0x53,
126 |  0xa7,
127 |  0xe7,
128 |  0x27,
129 |  0x67,
130 |  0xb7,
131 |  0xf7,
132 |  0x37,
133 |  0x77,
134 |  0x87,
135 |  0xc7,
136 |  0x7,
137 |  0x47,
138 |  0x97,
139 |  0xd7,
140 |  0x17,
141 |  0x57,
142 |  0xa8,
143 |  0xe8,
144 |  0x28,
145 |  0x68,
146 |  0xb8,
147 |  0xf8,
148 |  0x38,
149 |  0x78,
150 |  0x88,
151 |  0xc8,
152 |  0x8,
153 |  0x48,
154 |  0x98,
155 |  0xd8,
156 |  0x18,
157 |  0x58,
158 |  0xac,
159 |  0xec,
160 |  0x2c,
161 |  0x6c,
162 |  0xbc,
163 |  0xfc,
164 |  0x3c,
165 |  0x7c,
166 |  0x8c,
167 |  0xcc,
168 |  0xc,
169 |  0x4c,
170 |  0x9c,
171 |  0xdc,
172 |  0x1c,
173 |  0x5c,
174 |  0xa0,
175 |  0xe0,
176 |  0x20,
177 |  0x60,
178 |  0xb0,
179 |  0xf0,
180 |  0x30,
181 |  0x70,
182 |  0x80,
183 |  0xc0,
184 |  0x0,
185 |  0x40,
186 |  0x90,
187 |  0xd0,
188 |  0x10,
189 |  0x50,
190 |  0xa4,
191 |  0xe4,
192 |  0x24,
193 |  0x64,
194 |  0xb4,
195 |  0xf4,
196 |  0x34,
197 |  0x74,
198 |  0x84,
199 |  0xc4,
200 |  0x4,
201 |  0x44,
202 |  0x94,
203 |  0xd4,
204 |  0x14,
205 |  0x54,
206 |  0xa9,
207 |  0xe9,
208 |  0x29,
209 |  0x69,
210 |  0xb9,
211 |  0xf9,
212 |  0x39,
213 |  0x79,
214 |  0x89,
215 |  0xc9,
216 |  0x9,
217 |  0x49,
218 |  0x99,
219 |  0xd9,
220 |  0x19,
221 |  0x59,
222 |  0xad,
223 |  0xed,
224 |  0x2d,
225 |  0x6d,
226 |  0xbd,
227 |  0xfd,
228 |  0x3d,
229 |  0x7d,
230 |  0x8d,
231 |  0xcd,
232 |  0xd,
233 |  0x4d,
234 |  0x9d,
235 |  0xdd,
236 |  0x1d,
237 |  0x5d,
238 |  0xa1,
239 |  0xe1,
240 |  0x21,
241 |  0x61,
242 |  0xb1,
243 |  0xf1,
244 |  0x31,
245 |  0x71,
246 |  0x81,
247 |  0xc1,
248 |  0x1,
249 |  0x41,
250 |  0x91,
251 |  0xd1,
252 |  0x11,
253 |  0x51,
254 |  0xa5,
255 |  0xe5,
256 |  0x25,
257 |  0x65,
258 |  0xb5,
259 |  0xf5,
260 |  0x35,
261 |  0x75,
262 |  0x85,
263 |  0xc5,
264 |  0x5,
265 |  0x45,
266 |  0x95,
267 |  0xd5,
268 |  0x15,
269 |  0x55
270 | };
271 | 
272 | #endif
273 | 


--------------------------------------------------------------------------------
/src/newTests/ContigTest.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright 2005, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // A sample program demonstrating using Google C++ testing framework.
31 | //
32 | // Author: wan@google.com (Zhanyong Wan)
33 | 
34 | 
35 | // This sample shows how to write a simple unit test for a function,
36 | // using Google C++ testing framework.
37 | //
38 | // Writing a unit test using Google C++ testing framework is easy as 1-2-3:
39 | 
40 | 
41 | // Step 1. Include necessary header files such that the stuff your
42 | // test logic needs is declared.
43 | //
44 | // Don't forget gtest.h, which declares the testing framework.
45 | 
46 | #include "../Contig.h"
47 | #include "gtest/gtest.h"
48 | 
49 | 
50 | // Step 2. Use the TEST macro to define your tests.
51 | //
52 | // TEST has two parameters: the test case name and the test name.
53 | // After using the macro, you should define your test logic between a
54 | // pair of braces.  You can use a bunch of macros to indicate the
55 | // success or failure of a test.  EXPECT_TRUE and EXPECT_EQ are
56 | // examples of such macros.  For a complete list, see gtest.h.
57 | //
58 | // <TechnicalDetails>
59 | //
60 | // In Google Test, tests are grouped into test cases.  This is how we
61 | // keep test code organized.  You should put logically related tests
62 | // into the same test case.
63 | //
64 | // The test case name and the test name should both be valid C++
65 | // identifiers.  And you should not use underscore (_) in the names.
66 | //
67 | // Google Test guarantees that each test you define is run exactly
68 | // once, but it makes no guarantee on the order the tests are
69 | // executed.  Therefore, you should write your tests in such a way
70 | // that their results don't depend on their order.
71 | //
72 | // </TechnicalDetails>
73 | 
74 | 
75 | // TEST(ContigTest, Construct) {
76 | //   Contig contig = Contig();
77 | //   EXPECT_EQ("", contig.getSeq());
78 | //   EXPECT_EQ(-1, contig.ind1);
79 | //   EXPECT_EQ(-1, contig.ind2);
80 | // }
81 | 
82 | 
83 | // int main(int ac, char* av[])
84 | // {
85 | //   testing::InitGoogleTest(&ac, av);
86 | //   return RUN_ALL_TESTS();
87 | // }


--------------------------------------------------------------------------------
/utils/rvalues.h:
--------------------------------------------------------------------------------
  1 | static const double rvalues[129][2] = { // 129 rows x 2 columns; rows 1-4 are all zero. NOTE(review): presumably indexed by k-mer size (0..128) - meaning of the two columns is not documented here, confirm against the code that reads it
  2 | 	{ 0.00000, 5.29625 },
  3 | 	{ 0.00000, 0.00000 },
  4 | 	{ 0.00000, 0.00000 },
  5 | 	{ 0.00000, 0.00000 },
  6 | 	{ 0.00000, 0.00000 },
  7 | 	{ 5.64856, 5.49117 },
  8 | 	{ 5.85164, 5.52333 },
  9 | 	{ 6.02772, 5.55385 },
 10 | 	{ 6.18398, 5.58291 },
 11 | 	{ 6.32492, 5.61065 },
 12 | 	{ 6.45362, 5.63721 },
 13 | 	{ 6.57225, 5.66268 },
 14 | 	{ 6.68244, 5.68717 },
 15 | 	{ 6.78544, 5.71075 },
 16 | 	{ 6.88221, 5.73350 },
 17 | 	{ 6.97353, 5.75548 },
 18 | 	{ 7.06004, 5.77674 },
 19 | 	{ 7.14227, 5.79733 },
 20 | 	{ 7.22066, 5.81730 },
 21 | 	{ 7.29558, 5.83669 },
 22 | 	{ 7.36734, 5.85553 },
 23 | 	{ 7.43623, 5.87386 },
 24 | 	{ 7.50248, 5.89171 },
 25 | 	{ 7.56630, 5.90910 },
 26 | 	{ 7.62788, 5.92605 },
 27 | 	{ 7.68737, 5.94260 },
 28 | 	{ 7.74494, 5.95876 },
 29 | 	{ 7.80069, 5.97455 },
 30 | 	{ 7.85477, 5.98999 },
 31 | 	{ 7.90726, 6.00510 },
 32 | 	{ 7.95826, 6.01989 },
 33 | 	{ 8.00786, 6.03437 },
 34 | 	{ 8.05615, 6.04856 },
 35 | 	{ 8.10319, 6.06247 },
 36 | 	{ 8.14904, 6.07611 },
 37 | 	{ 8.19378, 6.08950 },
 38 | 	{ 8.23745, 6.10264 },
 39 | 	{ 8.28012, 6.11555 },
 40 | 	{ 8.32181, 6.12822 },
 41 | 	{ 8.36260, 6.14068 },
 42 | 	{ 8.40250, 6.15293 },
 43 | 	{ 8.44157, 6.16497 },
 44 | 	{ 8.47983, 6.17682 },
 45 | 	{ 8.51733, 6.18848 },
 46 | 	{ 8.55409, 6.19995 },
 47 | 	{ 8.59015, 6.21125 },
 48 | 	{ 8.62553, 6.22238 },
 49 | 	{ 8.66025, 6.23334 },
 50 | 	{ 8.69435, 6.24413 },
 51 | 	{ 8.72784, 6.25478 },
 52 | 	{ 8.76075, 6.26527 },
 53 | 	{ 8.79310, 6.27562 },
 54 | 	{ 8.82490, 6.28582 },
 55 | 	{ 8.85619, 6.29589 },
 56 | 	{ 8.88697, 6.30582 },
 57 | 	{ 8.91725, 6.31562 },
 58 | 	{ 8.94707, 6.32530 },
 59 | 	{ 8.97643, 6.33485 },
 60 | 	{ 9.00534, 6.34428 },
 61 | 	{ 9.03383, 6.35360 },
 62 | 	{ 9.06189, 6.36280 },
 63 | 	{ 9.08956, 6.37189 },
 64 | 	{ 9.11683, 6.38088 },
 65 | 	{ 9.14372, 6.38975 },
 66 | 	{ 9.17024, 6.39853 },
 67 | 	{ 9.19640, 6.40721 },
 68 | 	{ 9.22221, 6.41578 },
 69 | 	{ 9.24768, 6.42427 },
 70 | 	{ 9.27282, 6.43265 },
 71 | 	{ 9.29764, 6.44095 },
 72 | 	{ 9.32214, 6.44916 },
 73 | 	{ 9.34634, 6.45728 },
 74 | 	{ 9.37024, 6.46532 },
 75 | 	{ 9.39385, 6.47328 },
 76 | 	{ 9.41717, 6.48115 },
 77 | 	{ 9.44023, 6.48894 },
 78 | 	{ 9.46301, 6.49666 },
 79 | 	{ 9.48552, 6.50430 },
 80 | 	{ 9.50778, 6.51186 },
 81 | 	{ 9.52979, 6.51935 },
 82 | 	{ 9.55156, 6.52677 },
 83 | 	{ 9.57308, 6.53412 },
 84 | 	{ 9.59437, 6.54140 },
 85 | 	{ 9.61543, 6.54861 },
 86 | 	{ 9.63627, 6.55576 },
 87 | 	{ 9.65688, 6.56284 },
 88 | 	{ 9.67729, 6.56986 },
 89 | 	{ 9.69748, 6.57682 },
 90 | 	{ 9.71747, 6.58371 },
 91 | 	{ 9.73725, 6.59055 },
 92 | 	{ 9.75684, 6.59732 },
 93 | 	{ 9.77624, 6.60404 },
 94 | 	{ 9.79544, 6.61070 },
 95 | 	{ 9.81446, 6.61731 },
 96 | 	{ 9.83330, 6.62386 },
 97 | 	{ 9.85196, 6.63035 },
 98 | 	{ 9.87045, 6.63680 },
 99 | 	{ 9.88876, 6.64319 },
100 | 	{ 9.90691, 6.64953 },
101 | 	{ 9.92489, 6.65582 },
102 | 	{ 9.94271, 6.66206 },
103 | 	{ 9.96037, 6.66825 },
104 | 	{ 9.97787, 6.67439 },
105 | 	{ 9.99522, 6.68049 },
106 | 	{ 10.01241, 6.68654 },
107 | 	{ 10.02946, 6.69254 },
108 | 	{ 10.04637, 6.69850 },
109 | 	{ 10.06313, 6.70441 },
110 | 	{ 10.07975, 6.71029 },
111 | 	{ 10.09623, 6.71611 },
112 | 	{ 10.11258, 6.72190 },
113 | 	{ 10.12879, 6.72764 },
114 | 	{ 10.14488, 6.73335 },
115 | 	{ 10.16083, 6.73901 },
116 | 	{ 10.17665, 6.74463 },
117 | 	{ 10.19235, 6.75022 },
118 | 	{ 10.20793, 6.75577 },
119 | 	{ 10.22339, 6.76127 },
120 | 	{ 10.23873, 6.76674 },
121 | 	{ 10.25394, 6.77218 },
122 | 	{ 10.26905, 6.77757 },
123 | 	{ 10.28404, 6.78293 },
124 | 	{ 10.29892, 6.78826 },
125 | 	{ 10.31369, 6.79355 },
126 | 	{ 10.32835, 6.79881 },
127 | 	{ 10.34290, 6.80403 },
128 | 	{ 10.35735, 6.80922 },
129 | 	{ 10.37169, 6.81437 },
130 | 	{ 10.38593, 6.81950 }
131 | };
132 | 


--------------------------------------------------------------------------------
/src/ContigNode.h:
--------------------------------------------------------------------------------
  1 | #ifndef CONTIGNODE
  2 | #define CONTIGNODE
  3 | 
  4 | class Contig; // forward declare to avoid circ. dependency
  5 | // following http://www.cplusplus.com/forum/articles/10627/#msg49679
  6 | 
  7 | #include <iostream>
  8 | #include <vector>
  9 | #include <set>
 10 | #include <queue>
 11 | #include <unordered_map>
 12 | #include <list>
 13 | #include "Contig.h"
 14 | #include "../utils/Kmer.h"
 15 | #include "../utils/JuncPairs.h"
 16 | #include "../utils/Junction.h" 
 17 | using std::ofstream;
 18 | // #include "../utils/sparsepp.h"
 19 | // using spp::sparse_hash_map;
 20 | 
 21 | 
 22 | class ContigNode{
 23 |     //A node that contigs attach to, with one coverage value per nucleotide extension. Index 4 is treated specially by the comments below (see getFastGNeighbors, getTotalCoverage).
 24 | public:
 25 |     unsigned char cov[4]; //NOTE(review): presumably the coverage of each of the 4 nucleotide extensions (see getCoverage/setCoverage) - confirm
 26 |     Contig * contigs[5]; //contig attached at each index 0..4
 27 | 
 28 |     ContigNode(Junction junction);
 29 |     ContigNode();
 30 |     bool isInvertedRepeatNode();
 31 | 
 32 | 
 33 |     bool checkValidity();
 34 |     // NOTE(review): the original text read "returns 0 if target node not reached at up to max_dist, otherwise returns distance on branching path", which contradicts the std::list<Contig*> return type - confirm the actual contract in ContigNode.cpp
 35 |     std::list<Contig*> doPathsConvergeNearby(int max_ind, int min_ind, int max_dist);
 36 | 
 37 | 
 38 |     //gets the neighbors of the specified contig- if contigIndex is 4, returns all forward neighbors
 39 |     //If contig index isn't 4, only returns back contig as a neighbor
 40 |     std::vector<std::pair<Contig*, bool> > getFastGNeighbors(int contigIndex);
 41 | 
 42 | 
 43 | 
 44 |     std::list<JuncResult> getPairCandidates(int index, int maxDist);
 45 |       
 46 |     void replaceContig(Contig* oldContig, Contig* newContig);
 47 |     int numPathsOut();
 48 |     int indexOf(Contig* contig);
 49 |     kmer_type getForwardExtension(int index);
 50 |     std::vector<int> getIndicesOut();
 51 |     int getCoverage(int nucExt);
 52 |     int getTotalCoverage();//returns getCoverage(4)
 53 |     void setCoverage(Junction junc);
 54 |     void setCoverage(int nucExt, int coverage);
 55 |     void update(int nucExt, Contig * contig);
 56 |     kmer_type getUniqueKmer(int index);//returns base kmer for backward, or extension for forward index
 57 | 
 58 |     //removes the given path out of this node.
 59 |     //Removes contig pointer, set coverage to 0
 60 |     void breakPath(int nucExt);
 61 |     void clearNode();
 62 | 
 63 |     //for traversal
 64 |     bool hasNeighbor(int index);
 65 |     ContigNode* getNeighbor(int index);
 66 |     std::string getString();
 67 |     kmer_type getKmer();
 68 | };
 69 | 
 70 | class NodeQueueEntry{ //contains all info for an entry in the queue for a node BFS
 71 | public:
 72 |     ContigNode* node; //node this entry refers to
 73 |     int index; //index on that node
 74 |     int startDist; //NOTE(review): presumably the distance accumulated before reaching this node - confirm
 75 | 
 76 |     NodeQueueEntry(ContigNode* n, int i, int s);
 77 |     NodeQueueEntry();
 78 | 
 79 |     std::list<JuncResult> getJuncResults(int m); //returns immediate junc results from contig along this index
 80 | 
 81 |     void addNeighbors(std::vector<NodeQueueEntry> & queue); // , bool to_back); //searches forward one step, adds relevant nodes to the queue
 82 |     void recordParents(std::unordered_map<NodeQueueEntry, NodeQueueEntry>& parents);
 83 |     // void recordParents(sparse_hash_map<NodeQueueEntry, NodeQueueEntry>& parents);
 84 |     // std::list<Contig*> reconstructPathFromParents(std::unordered_map<NodeQueueEntry, NodeQueueEntry>& parents);
 85 |     std::list<Contig*> reconstructPathFromParents(std::vector<NodeQueueEntry>& parents);
 86 |     friend bool operator==(NodeQueueEntry a, NodeQueueEntry b) { //entries are equal only when node, index and startDist all match
 87 |         return a.node == b.node && a.index == b.index && a.startDist == b.startDist; 
 88 |     };
 89 | 
 90 | };
 91 | 
 92 | 
 93 | // Hash support so that NodeQueueEntry can be used as a key in std::unordered_map.
 94 | // The hash is taken from the unique kmer of the entry's (node, index) pair; startDist plays no part,
 95 | // and entry.node is dereferenced, so it must not be null.
 96 | namespace std {
 97 |   template <> struct hash<NodeQueueEntry> {
 98 |     size_t operator()(const NodeQueueEntry & entry) const {
 99 |       std::hash<uint64_t> kmerHasher;
100 |       return kmerHasher(entry.node->getUniqueKmer(entry.index));
101 |     }
102 |   };
103 | }
104 | #endif


--------------------------------------------------------------------------------
/src/ReadScanner.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include <sys/time.h>
 5 | #include <string.h>
 6 | #include <inttypes.h>
 7 | #include <cmath> // for log2f
 8 | #include <algorithm> // for max
 9 | #include <unistd.h> // for truncate
10 | #include <string>
11 | // #include <unordered_map>
12 | #include <set>
13 | using std::string;
14 | // using std::unordered_map;
15 | using std::set;
16 | using std::ofstream;
17 | 
18 | #ifndef READSCAN_H
19 | #define READSCAN_H
20 | 
21 | #include "../utils/Bloom.h"
22 | #include "../utils/Kmer.h"
23 | #include "../utils/JChecker.h"
24 | #include "../utils/JunctionMap.h"
25 | #include "../utils/Junction.h"
26 | #include "../utils/ReadKmer.h"
27 | #include "../utils/Cap.h"
28 | #include "../utils/JuncPairs.h"
29 | 
30 | #define DEBUGE(a)  //printf a
31 | 
32 | class ReadScanner{
33 |     //Scans reads for junctions (see scan_forward); holds the bloom filters, the JChecker and the JunctionMap it works with.
34 | private:
35 |     int maxSpacerDist; // maximum distance in bases between two junctions (spacers added to bridge gaps)
36 |     Bloom* bloom;
37 |     Bloom* short_pair_filter;
38 |     Bloom* long_pair_filter;
39 |     set<kmer_type> jcheckedSet;
40 |     set<kmer_type> nextRealSet;
41 |     set<std::pair<kmer_type, kmer_type> > juncPairSet;
42 |     set<kmer_type> backwardSet;
43 |     uint64_t hash0, hash1,
44 |     nextHash0, nextHash1;
45 |     string reads_file;
46 |     //counters; NOTE(review): presumably the statistics reported by printScanSummary - confirm
47 |     uint64_t NbCandKmer, NbRawCandKmer, NbJCheckKmer, NbNoJuncs, 
48 |         NbSkipped, NbProcessed, readsProcessed, NbSolidKmer,readsNoErrors,
49 |          NbJuncPairs, unambiguousReads;
50 | 
51 |     JChecker* jchecker;
52 |     JunctionMap* junctionMap;
53 | 
54 |     //Should only be called on a read with no real junctions
55 |     //Adds a fake junction in the middle and points it to the two ends.  This ensures we have coverage of long linear regions, and that we capture
56 |     //sinks at the end of such regions.
57 |     kmer_type add_fake_junction(string read);
58 |     
59 | public:
60 |     JunctionMap* getJunctionMap();
61 | 
62 |     //Scans one input read; breaks into small segments and calls scan_forward
63 |     //Returns back junctions along read from beginning to end
64 |     std::list<kmer_type> scanInputRead(string read, bool no_cleaning);
65 | 
66 |     void scanReads(bool fastq, bool paired_ends, bool no_cleaning); //scans all the reads.  Fastq if fastq, otherwise fasta
67 |     void printScanSummary(); //prints statistics from the readscan
68 |     
69 |     //Determines if the given ReadKmer is a junction.
70 |     //If it's on the middle of the read, just verifies alternate extensions.
71 |     //Special logic is needed to handle kmers that are near the ends, if j is not 0, to ensure that 
72 |     //the real extension seen on the read represents a valid, jcheckable option, and not a tip shorter than j.
73 |     bool testForJunction(ReadKmer kmer);
74 | 
75 |     //Starting from the given kmer, scans forward until a junction is found or the end of the read is hit.
76 |     //Returns true if a junction was found.  The supplied ReadKmer is also adjusted to the position of the new junction,
77 |     //or to the end of the read.
78 |     bool find_next_junction(ReadKmer * kmer, int lastJuncPos);
79 | 
80 |     //Returns substrings of the read that are valid with BF and longer than sizeKmer
81 |     std::list<string> getValidReads(string read);
82 | 
83 |     //Scans a read. 
84 |     //Identifies all junctions on the read, and links adjacent junctions to each other.
85 |     //Also updates the relevant distance field on the first junction to point to the start of the read, and on the last
86 |     //Junction to point to the end of the read.
87 |     //If there are no junctions, add_fake_junction is called
88 |     //Returns back junctions along the read from beginning to end
89 |     std::list<kmer_type> scan_forward(string read, bool no_cleaning); 
90 |     //The collaborators (junction map, bloom filters, checker) are passed in as raw pointers; who frees them is not visible from this header.
91 |     ReadScanner(JunctionMap* juncMap, string readFile, Bloom* bloom, Bloom* short_pair_filter, Bloom* long_pair_filter, JChecker* jchecker, int maxSpacerDist);
92 | };
93 | #endif


--------------------------------------------------------------------------------
/utils/ReadKmer.cpp:
--------------------------------------------------------------------------------
  1 | #include "ReadKmer.h"
  2 | #include <iostream>
  3 | #include <algorithm>
  4 | #include <string>
  5 | 
  6 | using std::string;
  7 | 
  8 | 
  9 | char* ReadKmer::directionAsString(){
 10 |     //Printable name of the direction this kmer is facing: "forward" or "backward"
 11 |     const char* label = "backward";
 12 |     if(direction == FORWARD){
 13 |         label = "forward";
 14 |     }
 15 |     return (char*)label;
 16 | }
 17 | 
 18 | int ReadKmer::getMaxGuaranteedJ(bool dir){
 19 |     //Half the number of single steps remaining in direction dir, minus one.
 20 |     //FORWARD counts the steps to the last kmer of the read; otherwise the steps back to the read start.
 21 |     const int stepsAvailable = (dir == FORWARD)
 22 |         ? getDistToEnd()
 23 |         : getTotalPos();
 24 |     return stepsAvailable/2-1;
 25 | }
 26 | 
 27 | //Number of forward() steps needed to reach the last kmer of the read from this ReadKmer.
 28 | int ReadKmer::getDistToEnd(){
 29 |     return 2*read->length() - 2*sizeKmer - getTotalPos() + 1; //two steps per base, as in getTotalPos
 30 | }
 31 | 
 32 | //Number of forward() steps that separate the beginning of the read from this ReadKmer.
 33 | int ReadKmer::getTotalPos(){
 34 |     return offset() + pos*2; //two steps per base, plus one when facing forward
 35 | }
 36 | 
 37 | bool ReadKmer::onRead(){
 38 |     return !(getTotalPos() < 1 || getDistToEnd() < 1); //true only when at least one step remains toward both ends of the read
 39 | }
 40 | 
 41 | kmer_type ReadKmer::getRevCompKmer(){
 42 |     //The strand opposite to the one getKmer() returns
 43 |     if(direction != FORWARD){
 44 |         return doubleKmer.kmer;
 45 |     }
 46 |     //facing forward: hand back the stored revcompKmer
 47 |     return doubleKmer.revcompKmer;
 48 | }
 49 | 
 50 | kmer_type ReadKmer::getKmer(){
 51 |     //Kmer seen in the facing direction: the stored kmer when facing
 52 |     //forward, the stored revcompKmer when facing backward
 53 |     if(direction != FORWARD){
 54 |         return doubleKmer.revcompKmer;
 55 |     }
 56 |     return doubleKmer.kmer;
 57 | }
 58 | 
 59 | //Step offset of this kmer relative to the backward-facing kmer at the same position.
 60 | //  facing BACKWARD -> 0
 61 | //  facing FORWARD  -> 1
 62 | //Used when computing distances along the read (see getTotalPos).
 63 | int ReadKmer::offset(){
 64 |     int extraStep = 0;
 65 |     if(direction == FORWARD){
 66 |         extraStep = 1;
 67 |     }
 68 |     //any direction other than FORWARD contributes no extra step
 69 |     return extraStep;
 70 | }
 71 | 
 72 | void ReadKmer::forward(){
 73 |     //One step along the read: flip the facing direction, and slide the kmer window
 74 |     //by one base when the flip ends up facing backward (backward -> forward needs no shift).
 75 |     direction = !direction;
 76 |     if(direction != FORWARD){
 77 |         //code of the base entering the window; stays 0 once the window runs past the end of the read
 78 |         const bool hasNextBase = pos + sizeKmer <  read->length();
 79 |         const int incoming = hasNextBase ? NT2int((*read)[pos + sizeKmer]) : 0;
 80 |         doubleKmer.forward(incoming);
 81 |         pos++;
 82 |     }
 83 | }
 84 | 
 85 | void ReadKmer::advanceDist(int dist){
 86 |     //applies forward() dist times; a dist of zero or less leaves the kmer untouched
 87 |     for(int stepsLeft = dist; stepsLeft > 0; --stepsLeft)
 88 |         forward();
 89 | }
 90 | 
 91 | kmer_type ReadKmer::getCanon(){
 92 |     return doubleKmer.getCanon(); //delegates to the underlying DoubleKmer; presumably the canonical (strand-independent) form - TODO confirm in DoubleKmer
 93 | }
 94 | 
 95 | int ReadKmer::getExtensionIndex(bool dir){
 96 |     //Index of the extension in direction dir: the nucleotide actually seen on the read when
 97 |     //dir is the way this kmer faces, and 4 (the backward index) for the opposite direction.
 98 |     const bool facingDir = (dir == direction);
 99 |     return facingDir ? getRealExtensionNuc() : 4;
100 | }
101 | 
102 | kmer_type ReadKmer::getExtension(int newNuc){
103 |     return doubleKmer.getExtension(newNuc, direction); //asks the underlying DoubleKmer for its extension by newNuc in the direction this kmer faces
104 | }
105 | 
106 | //Nucleotide code of the base that follows this kmer on the read in the facing direction.
107 | //The result can be used as an index for the junction. No bounds check is done on the read.
108 | int ReadKmer::getRealExtensionNuc(){
109 |     const string& bases = *read;
110 |     if(direction != FORWARD){
111 |         return revcomp_int(NT2int(bases[pos-1])); //base just before the kmer, passed through revcomp_int
112 |     }
113 |     return NT2int(bases[sizeKmer + pos]); //base just after the kmer
114 | }
115 | 
116 | kmer_type ReadKmer::getRealExtension(){
117 |     return getExtension(getRealExtensionNuc());  //extension by the nucleotide actually present on the read
118 | }
119 | 
120 | //Builds the ReadKmer for the first kmer of theRead: position 0, facing BACKWARD (off the front of the read).
121 | ReadKmer::ReadKmer(string* theRead): doubleKmer(0){
122 |     read = theRead;
123 |     pos = 0;
124 |     direction = BACKWARD;
125 |     kmer_type firstKmer = 0;
126 |     getFirstKmerFromRead(&firstKmer, &((*theRead)[0])); //fills firstKmer from the start of the read
127 |     doubleKmer = DoubleKmer(firstKmer);
128 | }
129 | 
130 | //Builds the ReadKmer for the kmer starting at position index of theRead, facing direction dir. The read is not copied, only pointed to.
131 | ReadKmer::ReadKmer(string* theRead, int index, bool dir): doubleKmer(0){
132 |     read = theRead;
133 |     kmer_type kmer = 0; //zero-initialised, as in the single-argument constructor, so getFirstKmerFromRead never sees an indeterminate value
134 |     getFirstKmerFromRead(&kmer,&((*read)[index]));
135 |     doubleKmer = DoubleKmer(kmer);
136 |     pos = index;
137 |     direction = dir;
138 | }
139 | 
140 | ReadKmer::ReadKmer(ReadKmer* toCopy): doubleKmer(toCopy->doubleKmer){
141 |     //Field-by-field copy of *toCopy. doubleKmer is already copied by the initializer above; the read is shared, only the pointer is copied.
142 |     read = toCopy->read;
143 |     pos = toCopy->pos;
144 |     direction = toCopy->direction;
145 | }
146 | 
147 | 


--------------------------------------------------------------------------------
/utils/Kmer.h:
--------------------------------------------------------------------------------
  1 | #ifndef Kmer64_h
  2 | #define Kmer64_h
  3 | 
  4 | #include <stdint.h>
  5 | #include <string>
  6 | #include <list>
  7 | 
  8 | #ifdef _largeint
  9 | #include "LargeInt.h"
 10 | typedef LargeInt<KMER_PRECISION> kmer_type;
 11 | #else
 12 | #ifdef _ttmath
 13 | #include "ttmath/ttmath.h"
 14 | typedef ttmath::UInt<KMER_PRECISION> kmer_type;
 15 | #else
 16 | #if (! defined kmer_type) || (! defined _LP64)
 17 | typedef uint64_t kmer_type;
 18 | #endif
 19 | #endif
 20 | #endif
 21 | 
 22 | extern int sizeKmer; // current kmer length (presumably what setSizeKmer assigns - TODO confirm)
 23 | extern kmer_type kmerMask; // NOTE(review): presumably keeps the 2*sizeKmer low bits of an encoded kmer - confirm in Kmer.cpp
 24 | extern kmer_type kmerMaskm1;
 25 | extern const bool FORWARD; // direction flags (ReadKmer compares its direction against them); the values are defined outside this header
 26 | extern const bool BACKWARD;
 27 | extern uint64_t nsolids;
 28 | 
 29 | bool isHomoPolymer(std::string str);
 30 | std::list<std::string> getUnambiguousReads(std::string read);//returns every run of valid nucleotide characters in the read; all other characters are thrown out 
 31 | void setSizeKmer(int k);
 32 | char getNucChar(int nucIndex);
 33 | bool isValidNuc(char nt);
 34 | int NT2int(char nt);
 35 | int revcomp_int(int nt_int);
 36 | char revcomp_char(char c);
 37 | kmer_type  codeSeed(char *seq, int sizeKmer, kmer_type kmerMask);
 38 | kmer_type  codeSeed(char *seq);
 39 | kmer_type  codeSeedRight(char *seq, kmer_type  val_seed, bool new_read);
 40 | kmer_type  codeSeedRight(char *seq, kmer_type  val_seed, bool new_read, int sizeKmer, kmer_type kmerMask);
 41 | kmer_type  codeSeedRight_revcomp(char *seq, kmer_type  val_seed, bool new_read);
 42 | kmer_type  codeSeedRight_revcomp(char *seq, kmer_type  val_seed, bool new_read, int sizeKmer, kmer_type kmerMask);
 43 | unsigned char  code_n_NT(char *seq, int nb);
 44 | unsigned char  code4NT(char *seq);
 45 | 
 46 | uint64_t revcomp(uint64_t x);
 47 | uint64_t revcomp(uint64_t x, int size);
 48 | 
 49 | #ifdef _largeint
 50 | LargeInt<KMER_PRECISION> revcomp(LargeInt<KMER_PRECISION> x);
 51 | LargeInt<KMER_PRECISION> revcomp(LargeInt<KMER_PRECISION> x, int size);
 52 | #endif
 53 | #ifdef _ttmath
 54 | ttmath::UInt<KMER_PRECISION> revcomp(ttmath::UInt<KMER_PRECISION> x);
 55 | ttmath::UInt<KMER_PRECISION> revcomp(ttmath::UInt<KMER_PRECISION> x, int size);
 56 | #endif
 57 | #ifdef _LP64
 58 | __uint128_t revcomp(__uint128_t x);
 59 | __uint128_t revcomp(__uint128_t x, int size);
 60 | #endif
 61 | 
 62 | int code2seq ( kmer_type code,char *seq);
 63 | int code2seq ( kmer_type code,char *seq, int sizeKmer, kmer_type kmerMask);
 64 | int code2nucleotide( kmer_type code, int which_nucleotide);
 65 | int first_nucleotide(kmer_type kmer);
 66 | 
 67 | kmer_type extractKmerFromRead(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp);
 68 | kmer_type extractKmerFromRead(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp, bool sequential);
 69 | kmer_type extractKmerFromRead(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp, bool sequential, int sizeKmer, kmer_type kmerMask);
 70 | kmer_type maskKmer(kmer_type kmer);
 71 | // compute the next kmer w.r.t forward or reverse strand, e.g. for ACTG (revcomp = CAGT)
 72 | // it makes sure the result is the min(kmer,revcomp_kmer)
 73 | // indicates if the result is the revcomp_kmer by setting *strand 
 74 | // examples:
 75 | // next_kmer(ACTG,A,&0)=CTGA with strand = 0 (because revcomp=TCAG); 
 76 | // next_kmer(ACTG,A,&1)= (revcomp of ACTG + A = CAGT+A = ) AGTA with strand = 0 (because revcomp = TACT)
 77 | kmer_type next_kmer(kmer_type graine, int added_nt, bool strand);//returns the shifted kmer as a new value
 78 | kmer_type next_kmer(kmer_type graine, int added_nt, int* strand);
 79 | void shift_kmer(kmer_type *graine, int added_nt, int strand); //shifts in place
 80 | void getFirstKmerFromRead(kmer_type* kmer, char* read);
 81 | kmer_type getKmerFromRead(std::string read, int index);//returns kmer starting at position index
 82 | kmer_type next_kmer_in_read(kmer_type kmer, int index_in_read, char* read, bool direction);
 83 | kmer_type advance_kmer(char* read, kmer_type* kmer,  int startPos, int endPos);
 84 | kmer_type rotate_right(kmer_type kmer, int dist);
 85 | kmer_type rotate_left(kmer_type kmer, int dist);
 86 | 
 87 | std::string canon_contig(std::string contig);
 88 | std::string revcomp_string(std::string s);
 89 | void revcomp_sequence(char* s, int len);
 90 | 
 91 | kmer_type  codeSeed_bin(char *seq);
 92 | 
 93 | kmer_type  codeSeedRight_bin(char *seq, kmer_type  val_seed, bool new_read);
 94 | 
 95 | kmer_type  codeSeedRight_revcomp_bin(char *seq, kmer_type  val_seed, bool new_read);
 96 | 
 97 | kmer_type extractKmerFromRead_bin(char *readSeq, int position, kmer_type *graine, kmer_type *graine_revcomp, bool use_compressed);
 98 | 
 99 | kmer_type get_canon(kmer_type a);
100 | char* print_kmer(kmer_type kmer); // debugging
101 | char* print_kmer(kmer_type kmer, int sizeKmer, kmer_type kmerMask); // debugging
102 | 
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/utils/tests/BloomTests.cpp:
--------------------------------------------------------------------------------
  1 | #include "BloomTests.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <time.h>
  5 | 
  6 | namespace bloomTests{
  7 | 
  8 | Bloom* bloom;
  9 | 
 10 | char get_random_nuc(){
 11 |     return (char)(rand() & 3); //2-bit nucleotide code in [0,3], taken from the low bits of rand()
 12 | }
 13 | 
 14 | kmer_type get_random_kmer(){ //random kmer of sizeKmer nucleotides, packed two bits per nucleotide
 15 |     uint64_t rand_kmer = 0;
 16 |     for(int i = 0; i < sizeKmer; i++){
 17 |         rand_kmer <<= 2; //make room first, so the last nucleotide lands in bits 0-1 and nothing spills past 2*sizeKmer bits
 18 |         rand_kmer |= (uint64_t)get_random_nuc();
 19 |     }
 20 |     return rand_kmer;
 21 | }
 22 | 
 23 | void test_false_positive_rate(uint64_t bloomSize, int sampleSize, float fpRate){
 24 |     //Adds bloomSize random kmers to a filter built for fpRate, then prints the fraction of sampleSize random queries reported as present.
 25 |     setSizeKmer(25);
 26 |     bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate);
 27 |     for(uint64_t i = 0; i < bloomSize; i++){ //index has the type of bloomSize: an int would overflow before reaching a large size
 28 |         bloom->add(get_random_kmer());
 29 |     }
 30 |     int fpCount = 0;
 31 |     for(int i = 0; i < sampleSize; i++){
 32 |         if(bloom->contains(get_random_kmer())){
 33 |             fpCount += 1;
 34 |         }
 35 |     }
 36 |     printf("Size %llu, desired rate %f: %f \n", 
 37 |         (unsigned long long)bloomSize, fpRate, (float)fpCount/(float)sampleSize); //cast so the argument matches %llu on every platform
 38 | }
 39 | 
 40 | void test_speed_raw(uint64_t bloomSize, int sampleSize, float fpRate){
 41 |     //Times sampleSize direct contains(kmer) queries on a filter loaded with bloomSize random kmers and prints queries per second.
 42 |     setSizeKmer(25);
 43 |     bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate);
 44 |     for(uint64_t i = 0; i < bloomSize; i++){
 45 |         bloom->add(get_random_kmer());
 46 |     }
 47 |     //clock() rather than time(): time() only ticks once per second, so a short run
 48 |     //measured 0 elapsed seconds and the printed rate came out as inf
 49 |     clock_t start = clock();
 50 |     kmer_type kmer = (uint64_t)0;
 51 |     for(int i = 0; i < sampleSize; i++){
 52 |         bloom->contains(kmer);
 53 |         kmer++;
 54 |     }
 55 |     double seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
 56 |     printf("Raw queries per second: %f \n", sampleSize / seconds);
 57 | }
 58 | 
 59 | void test_speed_incremental(uint64_t bloomSize, int sampleSize, float fpRate){
 60 |     //Times sampleSize contains(hash0, hash1) queries whose two hashes are advanced with roll_hash, and prints queries per second.
 61 |     setSizeKmer(25);
 62 |     bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate);
 63 |     kmer_type kmer = get_random_kmer();
 64 |     uint64_t hash0 = bloom->get_rolling_hash(kmer,0);
 65 |     uint64_t hash1 = bloom->get_rolling_hash(kmer,1);
 66 |     for(uint64_t i = 0; i < bloomSize; i++){
 67 |         bloom->add(get_random_kmer());
 68 |     }
 69 |     //clock() rather than time(): time() only ticks once per second, so a short run measured 0 elapsed seconds and the rate came out as inf
 70 |     clock_t start = clock();
 71 |     //the nucleotides rolled in and out simply cycle through 0..3
 72 |     int newNuc = 0;
 73 |     int oldNuc = 3;
 74 |     for(int i = 0; i < sampleSize; i++){
 75 |         bloom->contains(hash0, hash1);
 76 |         hash0 = bloom->roll_hash(hash0, oldNuc, newNuc, 0);
 77 |         hash1 = bloom->roll_hash(hash1, oldNuc, newNuc, 1);
 78 |         newNuc = (newNuc + 1) %4;
 79 |         oldNuc = (oldNuc +1)%4;
 80 |     }
 81 |     double seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
 82 |     printf("Incremental Queries per second: %f \n", sampleSize / seconds);
 83 | }
 84 | 
 85 | 
 86 | void test_speed_readscan(uint64_t bloomSize, int sampleSize, float fpRate){
 87 |     //Times a mixed workload: one direct contains(kmer) lookup out of every 100 queries, rolling-hash lookups for the rest.
 88 |     setSizeKmer(25);
 89 |     bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate);
 90 |     kmer_type kmer = (uint64_t)0;
 91 |     uint64_t hash0 = bloom->get_rolling_hash(kmer,0);
 92 |     uint64_t hash1 = bloom->get_rolling_hash(kmer,1);
 93 |     for(uint64_t i = 0; i < bloomSize; i++){
 94 |         bloom->add(get_random_kmer());
 95 |     }
 96 |     //clock() rather than time(): time() only ticks once per second, so a short run
 97 |     //measured 0 elapsed seconds and the printed rate came out as inf
 98 |     clock_t start = clock();
 99 |     int newNuc = 0;
100 |     int oldNuc = 3;
101 |     for(int i = 0; i < sampleSize; i++){
102 |         if(i % 100 == 0){
103 |             bloom->contains(kmer);
104 |             kmer++;
105 |         }
106 |         else{
107 |             bloom->contains(hash0, hash1);
108 |             hash0 = bloom->roll_hash(hash0, oldNuc, newNuc, 0);
109 |             hash1 = bloom->roll_hash(hash1, oldNuc, newNuc, 1);
110 |             newNuc = (newNuc + 1) %4;
111 |             oldNuc = (oldNuc +1)%4;
112 |         }
113 |     }
114 |     double seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
115 |     printf("Read Queries per second: %f \n", sampleSize / seconds);
116 | }
117 | 
118 | void test_speed_old(uint64_t bloomSize, int sampleSize, float fpRate){
119 |     //Times sampleSize oldContains(kmer) queries on a filter loaded with bloomSize random kmers and prints queries per second.
120 |     setSizeKmer(25);
121 |     bloom = bloom->create_bloom_filter_optimal(bloomSize, fpRate);
122 |     for(uint64_t i = 0; i < bloomSize; i++){
123 |         bloom->add(get_random_kmer());
124 |     }
125 |     //clock() rather than time(): time() only ticks once per second, so a short run
126 |     //measured 0 elapsed seconds and the printed rate came out as inf
127 |     clock_t start = clock();
128 |     kmer_type kmer = (uint64_t)0;
129 |     for(int i = 0; i < sampleSize; i++){    
130 |         bloom->oldContains(kmer);
131 |         kmer++;
132 |     }
133 |     double seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
134 |     printf("Old queries per second: %f \n", sampleSize / seconds);
135 | }
136 | 
137 | void runBloomTests(){
138 |     //false-positive checks at three target rates, followed by the four throughput benchmarks
139 |     const float targetRates[] = {.001, .01, .1};
140 |     for(int r = 0; r < 3; r++) test_false_positive_rate(100000, 100000, targetRates[r]);
141 |     test_speed_raw(100000, 2000000, .01);
142 |     test_speed_incremental(100000, 20000000, .1);
143 |     test_speed_readscan(100000, 20000000, .1);
144 |     test_speed_old(100000, 5000000, .1);
145 | }
146 | 
147 | }


--------------------------------------------------------------------------------
/utils/JunctionMap.h:
--------------------------------------------------------------------------------
  1 | #ifndef JUNCTION_MAP
  2 | #define JUNCTION_MAP
  3 | 
  4 | // #include <unordered_map>
  5 | #include <unordered_set>
  6 | #include <string>
  7 | #include "Kmer.h"
  8 | #include "Junction.h"
  9 | #include "Cap.h"
 10 | #include "ReadKmer.h"
 11 | #include "Bloom.h"
 12 | #include "JChecker.h"
 13 | #include "Kmer.h"
 14 | #include "JuncPairs.h"
 15 | 
 16 | #include "../src/Contig.h"
 17 | #include "../src/ContigNode.h"
 18 | #include "../src/ContigGraph.h"
 19 | #include <fstream>
 20 | #include "../src/BfSearchResult.h"
 21 | using std::ofstream;
 22 | // using std::unordered_map;
 23 | using std::string;
 24 | using std::unordered_set;
 25 | // #include "sparsepp.h"
 26 | // using spp::sparse_hash_map;
 27 | 
 28 | 
 29 | class JunctionMap{
 30 | //Stores the junctions found so far, keyed by kmer, and offers the operations that build a ContigGraph from them.
 31 | private: 
 32 |     Bloom* bloom;
 33 |     JChecker* jchecker; 
 34 |     int maxReadLength; //needed for finding sinks properly- tells you when to stop scanning   
 35 | 
 36 |     
 37 | public:
 38 |     void printDistAndExtension(int dist, int maxDist, int index, kmer_type kmer);
 39 |  
 40 |     void buildLinearRegions(ContigGraph* contigGraph); //Builds node graph for any connected component that has branching. NOTE(review): this text and the one on buildBranchingPaths look swapped relative to the method names - confirm
 41 |     void buildBranchingPaths(ContigGraph* contigGraph); //For connected components that have no branching at all- builds contig graph
 42 |     void destroyComplexJunctions(); //destroys all complex junctions. used after building branching paths for contig graph
 43 |     void destroyJunctionSet(std::set<kmer_type> dead_juncs); // periodically destroy complex junctions - aimed to reduce memory use
 44 | 
 45 | 
 46 |     //Builds a contig graph from this junction map, destroying the non-complex junctions as it goes
 47 |     ContigGraph* buildContigGraph();
 48 | 
 49 |     //Gets the contig from this junction to the next complex junction or sink
 50 |     //Has all fields except the ContigNode pointers filled out- indices, juncDistances, and seq are all there
 51 |     Contig* getContig(Junction junc, kmer_type startKmer, int index);
 52 | 
 53 |     //Scans forward from junction junc at index i with bloom filter
 54 |     //If it hits another junction at or before the distance specified by the given junction, returns a "node" result with that junction
 55 |     //If it does not, it keeps scanning until it hits another junction or an actual sink
 56 |     //If it hits a sink, it returns it.  If it hits a junction, it tests how far that junction points along the path.
 57 |     //Based on the indicated overlap, it either decides the entire intermediate sequence is real or the connection is a 
 58 |     //false positive connection.  Then returns either a sink or a node result.
 59 |     BfSearchResult findNeighbor(Junction junc, kmer_type startKmer, int index);
 60 |     
 61 |     std::unordered_map<kmer_type,Junction> junctionMap;  //stores the junctions themselves. NOTE(review): this header's own #include <unordered_map> is commented out, so the type has to arrive through another include
 62 |     // sparse_hash_map<kmer_type,Junction> junctionMap;  //stores the junctions themselves
 63 | 
 64 |     //Returns true if multiple extensions of the given kmer jcheck
 65 |     //Assumes the given kmer is in the BF
 66 |     bool isBloomJunction(kmer_type kmer);
 67 | 
 68 |     //Gets the valid extension of the given kmer based on the bloom filter and cFPs.  Uses JChecking! so this cuts off tips
 69 |     //Assume the given kmer is not a junction
 70 |     //Returns -1 if there is no valid extension
 71 |     //Returns -2 if there are multiple
 72 |     //ASSUMES NO CFP SET- since this is only done in findSinks, BEFORE the cFPs are found
 73 |     int getValidJExtension(DoubleKmer kmer);
 74 | 
 75 |     //File format:
 76 |     //One line for each junction.  On each line, the kmer is printed as a string, then the junction is printed.  
 77 |     //See Junction.h for junction print documentation.
 78 |     void writeToFile(string filename); 
 79 | 
 80 |     void buildFromFile(string junction_file);
 81 |     
 82 |     //Finds the junction associated with the given kmer and returns how far we can skip in the given direction from that junction
 83 |     int getSkipDist(ReadKmer* readKmer, bool direction);
 84 | 
 85 |     //Directly links two adjacent junctions from the same read
 86 |     void directLinkJunctions(ReadKmer* kmer1, ReadKmer* kmer2, Junction* junc1, Junction* junc2);
 87 | 
 88 |     int getNumComplexJunctions(); //Gets the number of junctions with more than one valid extension
 89 |     int getNumSolidJunctions(int i); //Gets the number of solid complex junctions, multiple valid extensions of coverage at least i
 90 |     int getNumJunctions();
 91 | 
 92 |     void createJunction(kmer_type kmer);
 93 |     void createJunction(ReadKmer* readKmer);
 94 |     bool isJunction(kmer_type kmer); //returns true if there is a junction at the given kmer
 95 |     bool isJunction(ReadKmer* readKmer); //same as above
 96 |     Junction* getJunction(ReadKmer kmer); //returns the junction located at the given kmer, or NULL if there is none
 97 |     Junction* getJunction(kmer_type kmer); //same as above
 98 |     void killJunction(kmer_type kmer); //removes the junction at the specified kmer, if there is one
 99 |     //NOTE(review): the first constructor parameter is spelled "bloo", presumably a typo for "bloom"
100 |     JunctionMap(Bloom* bloo, JChecker* jchecker, int maxReadLength);
101 | };
102 | #endif


--------------------------------------------------------------------------------
/src/ContigGraph.h:
--------------------------------------------------------------------------------
  1 | #ifndef CONTIG_GRAPH
  2 | #define CONTIG_GRAPH
  3 | 
  4 | class ContigGraph; //forward declare
  5 | #include <ostream>
  6 | #include <fstream>
  7 | #include <iostream> 
  8 | #include <unordered_map>
  9 | #include <unordered_set>
 10 | #include <string>
 11 | #include <vector>
 12 | #include "../utils/Kmer.h"
 13 | #include "../utils/Junction.h"
 14 | #include "../utils/Bloom.h"
 15 | #include "../utils/JunctionMap.h"
 16 | #include "../utils/JChecker.h"
 17 | #include "../utils/JuncPairs.h"
 18 | #include "Contig.h"
 19 | #include "ContigNode.h"
 20 | #include "BfSearchResult.h"
 21 | #include "ContigIterator.h"
 22 | using std::unordered_set;
 23 | using std::unordered_map;
 24 | #include "../utils/sparsepp.h"
 25 | using std::string;
 26 | using spp::sparse_hash_map;
 27 | 
 28 | 
 29 | class ContigGraph 
 30 | {
 31 | std::vector<ContigNode> nodeVector;
 32 | std::vector<Contig> isolated_contigs;
 33 | unordered_map<kmer_type,ContigNode> nodeMap;
 34 | unordered_map<kmer_type,ContigNode>::iterator it;
 35 | //(the four members above are private: they precede the first access specifier of a class)
 36 | 
 37 | int read_length;
 38 | 
 39 | public: 
 40 |     std::vector<Contig> * getIsolatedContigs();
 41 |     unordered_map<kmer_type, ContigNode> * getNodeMap();
 42 | 
 43 |     void setReadLength(int length);
 44 |     bool isCollapsible(ContigNode * node);
 45 | 
 46 |     //Gets number of supporting pairs given candidate list
 47 |     double getScore(std::list<JuncResult> leftCand, std::list<JuncResult> rightCand, Bloom* pair_filter, int insertSize);
 48 |     // same as getScore, but stops at first positive query
 49 |     bool areConnected(std::list<JuncResult> leftCand, std::list<JuncResult> rightCand, Bloom* pair_filter, int insertSize);
 50 | 
 51 |     std::pair <Contig*,Contig*> getMinMaxForwardExtensions(ContigNode * node, std::string trait);
 52 | 
 53 | 
 54 |     //a,b are on backNode, c,d are on forwardNode
 55 |     //a pairs with c, b pairs with d
 56 |     //Does not go ahead with the operation if degeneracies are detected
 57 |     //NOTE(review): earlier text said "returns true if it goes ahead", but both functions are declared void - no success flag is returned
 58 |     void disentanglePair(Contig* contig, ContigNode* backNode, ContigNode* forwardNode, int a, int b, int c, int d);
 59 |     void disentangleLoop(Contig* contig, ContigNode* backNode, ContigNode* forwardNode, int a, int b, int c, int d);
 60 | 
 61 |     void addIsolatedContig(Contig contig);
 62 |     bool isLowCovContig(Contig* contig);
 63 |     bool isLowMassContig(Contig* contig);
 64 |     bool isTip(ContigNode* node, int i);
 65 |     bool isBubbleNode(ContigNode* node);
 66 |     std::list<Contig*> getPathIfSimpleBulge(ContigNode* node, int max_dist);
 67 | 
 68 |     void deleteContig(Contig* contig);
 69 |     bool cleanGraph(Bloom* short_pair_filter, Bloom* long_pair_filter); //Cleans graph and returns true if any changes were made
 70 | 
 71 |     bool checkGraph();
 72 |     void printContigFastG(std::ostream* fastgFile, Contig * contig);
 73 | 
 74 |     // calls different sub-functions below to traverse graph and output contigs 
 75 |     void printContigs(string filename); 
 76 |     int printAndMarkBubbleContigs(string fileName);
 77 |     int printUnmarkedUnitigs(string fileName, int numPrinted);
 78 | 
 79 | 
 80 |     void printGraph(string fileName); //prints graph : TBD print format- fastg?
 81 |     ContigGraph();
 82 | 
 83 |     Contig* getLongestContig();
 84 | 
 85 |     //Creates a contig node if it doesn't already exist
 86 |     //If it exists, does nothing and returns the existing one.
 87 |     //Otherwise, returns the new one
 88 |     ContigNode * createContigNode(kmer_type kmer, Junction junction);    
 89 |     int disentangleParallelPaths(Bloom* pair_filter, double insertSize, double std); //NOTE(review): "std" is presumably the standard deviation of the insert size - confirm
 90 |     int disentangleLoopPaths(Bloom* pair_filter, double insertSize, double std);
 91 |     int removeChimericExtensions(int insertSize);
 92 |     int validateNoneCollapsible();
 93 |     int collapseBulges(int max_dist);
 94 |     bool deleteTipsAndClean();
 95 |     bool removeChimerasAndClean();
 96 |     bool collapseBulgesAndClean();
 97 |     bool disentangleAndClean(Bloom* pair_filter, double insertSize, double std);
 98 |     bool areEquivalentContigCoverages(ContigJuncList A, ContigJuncList B, double frac);
 99 |     bool areDifferentialContigCoverages(ContigJuncList A, ContigJuncList B);
100 |     Contig * getNewConcatenatedContig(Contig * back, Contig * contig, ContigNode * node);
101 | 
102 | 
103 | private:
104 |     int deleteTips();
105 |     int deleteIsolatedContigs();
106 |     bool testAndCutIfDegenerate(ContigNode* node);
107 |     int collapseDummyNodes(); //removes nodes with only one real extension, merges forward and back contigs
108 |     int destroyDegenerateNodes();// Removes nodes with no back contig or no forward contigs
109 |     // int cutIfDegenerate(ContigNode* node, kmer_type kmer, auto it);
110 | 
111 |     unordered_map<kmer_type, ContigNode> contigNodeMap; // maps kmers to ContigNodes after contigs constructed
112 |     // sparse_hash_map<kmer_type, ContigNode> contigNodeMap; // maps kmers to ContigNodes after contigs constructed
113 | 
114 |     void collapseNode(ContigNode * node, kmer_type kmer);
115 |     void cutPath(ContigNode* node, int index); //used on nodes with no backward contig
116 | };
117 | 
118 | 
119 | #endif
120 | 


--------------------------------------------------------------------------------
/utils/ttmath/ttmathmisc.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is a part of TTMath Bignum Library
  3 |  * and is distributed under the (new) BSD licence.
  4 |  * Author: Tomasz Sowa <t.sowa@ttmath.org>
  5 |  */
  6 | 
  7 | /* 
  8 |  * Copyright (c) 2006-2010, Tomasz Sowa
  9 |  * All rights reserved.
 10 |  * 
 11 |  * Redistribution and use in source and binary forms, with or without
 12 |  * modification, are permitted provided that the following conditions are met:
 13 |  * 
 14 |  *  * Redistributions of source code must retain the above copyright notice,
 15 |  *    this list of conditions and the following disclaimer.
 16 |  *    
 17 |  *  * Redistributions in binary form must reproduce the above copyright
 18 |  *    notice, this list of conditions and the following disclaimer in the
 19 |  *    documentation and/or other materials provided with the distribution.
 20 |  *    
 21 |  *  * Neither the name Tomasz Sowa nor the names of contributors to this
 22 |  *    project may be used to endorse or promote products derived
 23 |  *    from this software without specific prior written permission.
 24 |  *
 25 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 26 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 27 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 28 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 29 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 30 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 31 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 32 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 33 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 34 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 35 |  * THE POSSIBILITY OF SUCH DAMAGE.
 36 |  */
 37 | 
 38 | #ifndef headerfilettmathmisc
 39 | #define headerfilettmathmisc
 40 | 
 41 | 
 42 | /*!
 43 | 	\file ttmathmisc.h
 44 |     \brief some helpful functions
 45 | */
 46 | 
 47 | 
 48 | #include <string>
 49 | 
 50 | 
 51 | namespace ttmath
 52 | {
 53 | 
 54 | /*!
 55 | 	some helpful functions
 56 | */
 57 | class Misc
 58 | {
 59 | public:
 60 | 
 61 | 
 62 | /*
 63 |  *
 64 |  *	AssignString(result, str)
 65 |  *	replaces the content of result with str
 66 |  *
 67 |  */
 68 | 
 69 | /*!
 70 | 	result = str
 71 | 	(both narrow: a plain std::string assignment)
 72 | */
 73 | static void AssignString(std::string & result, const char * str)
 74 | {
 75 | 	result.assign(str);
 76 | }
 77 | 
 78 | #ifndef TTMATH_DONT_USE_WCHAR
 79 | 
 80 | /*!
 81 | 	result = str
 82 | 	(the wide result is rebuilt from the narrow characters of str)
 83 | */
 84 | static void AssignString(std::wstring & result, const char * str)
 85 | {
 86 | 	result.clear();
 87 | 
 88 | 	while( *str )
 89 | 	{
 90 | 		result += *str;
 91 | 		++str;
 92 | 	}
 93 | }
 94 | 
 95 | /*!
 96 | 	result = str
 97 | 	(forwards to the 'const char *' overload)
 98 | */
 99 | static void AssignString(std::wstring & result, const std::string & str)
100 | {
101 | 	AssignString(result, str.c_str());
102 | }
103 | 
104 | /*!
105 | 	result = str
106 | 	(every wide character is cut down to a char with static_cast)
107 | */
108 | static void AssignString(std::string & result, const wchar_t * str)
109 | {
110 | 	result.clear();
111 | 
112 | 	while( *str )
113 | 	{
114 | 		result += static_cast<char>(*str);
115 | 		++str;
116 | 	}
117 | }
118 | 
119 | /*!
120 | 	result = str
121 | 	(forwards to the 'const wchar_t *' overload)
122 | */
123 | static void AssignString(std::string & result, const std::wstring & str)
124 | {
125 | 	AssignString(result, str.c_str());
126 | }
127 | 
128 | #endif
129 | 
130 | /*
131 |  *
132 |  *	AddString(result, str)
133 |  *	appends str to the end of result
134 |  *
135 |  */
136 | 
137 | /*!
138 | 	result += str
139 | */
140 | static void AddString(std::string & result, const char * str)
141 | {
142 | 	result.append(str);
143 | }
144 | 
145 | #ifndef TTMATH_DONT_USE_WCHAR
146 | 
147 | /*!
148 | 	result += str
149 | 	(the narrow characters are appended one by one)
150 | */
151 | static void AddString(std::wstring & result, const char * str)
152 | {
153 | 	while( *str )
154 | 	{
155 | 		result += *str;
156 | 		++str;
157 | 	}
158 | }
159 | 
160 | #endif
161 | 
162 | /*
163 | 	this method moves the pointer c past any white characters
164 | 	(space, tab, carriage return, new line)
165 | 	char_type is char or wchar_t
166 | */
167 | template<class char_type>
168 | static void SkipWhiteCharacters(const char_type * & c)
169 | {
170 | 	for( ;; ++c )
171 | 	{
172 | 		// 13 is at the end in a DOS text file (\r\n)
173 | 		if( (*c!=' ' ) && (*c!='\t') && (*c!=13 ) && (*c!='\n') )
174 | 			break;
175 | 	}
176 | }
177 | 
178 | /*!
179 | 	this static method converts one character into its value
180 | 
181 | 	for example:
182 | 		1 -> 1
183 | 		8 -> 8
184 | 		A -> 10
185 | 		f -> 15
186 | 
187 | 	this method doesn't check whether c is correct or not:
188 | 	anything that is not a digit or a small letter is treated as a capital letter
189 | */
190 | static uint CharToDigit(uint c)
191 | {
192 | 	const bool is_digit = ( c>='0' && c<='9' );
193 | 	const bool is_small = ( c>='a' && c<='z' );
194 | 
195 | 	if( is_digit )
196 | 		return c-'0';
197 | 
198 | return is_small ? c-'a'+10 : c-'A'+10;
199 | }
200 | 
201 | /*!
202 | 	this method changes a character 'c' into its value
203 | 	(if there can't be a correct value it returns -1)
204 | 
205 | 	for example:
206 | 	c=2, base=10 -> function returns 2
207 | 	c=A, base=10 -> function returns -1
208 | 	c=A, base=16 -> function returns 10
209 | */
210 | static sint CharToDigit(uint c, uint base)
211 | {
212 | 	uint value;
213 | 
214 | 	if( c>='0' && c<='9' )
215 | 		value = c-'0';
216 | 	else if( c>='a' && c<='z' )
217 | 		value = c-'a'+10;
218 | 	else if( c>='A' && c<='Z' )
219 | 		value = c-'A'+10;
220 | 	else
221 | 		return -1;
222 | 
223 | 	// a digit equal to or greater than the base does not exist in that base
224 | return ( value < base ) ? sint(value) : -1;
225 | }
226 | 
227 | 
228 | /*!
229 | 	this method converts a digit into a char
230 | 	digit should be from <0,F>
231 | 	(we don't have to get a base)
232 | 	
233 | 	for example:
234 | 		1  -> 1
235 | 		8  -> 8
236 | 		10 -> A
237 | 		15 -> F
238 | */
239 | static uint DigitToChar(uint digit)
240 | {
241 | return ( digit < 10 ) ? digit + '0' : digit - 10 + 'A';
242 | }
243 | 
244 | 
245 | }; // class Misc
246 | 
247 | }
248 | 
249 | 
250 | #endif
251 | 


--------------------------------------------------------------------------------
/utils/manual/manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[a4paper]{article}
  2 | \usepackage{fancyvrb}
  3 | \usepackage{pdfpages}
  4 | \begin{document}
  5 | 
  6 | \newcommand\vitem[1][]{\SaveVerb[% to use verb in description
  7 |     aftersave={\item[\textnormal{\UseVerb[#1]{vsave}}]}]{vsave}}
  8 | 
  9 | \title{\Huge \texttt{Minia} --- Short manual}
 10 | 
 11 | \author{R. Chikhi \& G. Rizk\\
 12 |         {\small{rayan.chikhi@ens-cachan.org}}}
 13 | \maketitle
 14 | 
 15 | \begin{abstract}
\noindent {\normalsize Minia is a software tool for ultra-low memory DNA sequence assembly. It takes as input a set of short genomic sequences (typically, data produced by the Illumina DNA sequencer). Its output is a set of contigs (assembled sequences), forming an approximation of the expected genome. Minia is based on a succinct representation of the de Bruijn graph. The computational resources required to run Minia are significantly lower than those of other assemblers.}
 17 | \end{abstract}
 18 | 
 19 | \tableofcontents
 20 | 
 21 | \section{Installation}
 22 | 
 23 | To install Minia, just type \verb+make+ in the Minia folder.
 24 | Minia has been tested on Linux and MacOS systems.
 25 | To run Minia, type \verb+./minia+.
 26 | 
 27 | \section{Parameters}
 28 | 
 29 | The usage is:\\
 30 | 
 31 | 
 32 | \verb+./minia [input_file] [kmer_size] [min_abundance] [estimated_genome_size] [prefix]+\\
 33 | 
 34 | 
 35 | An example command line is:\\
 36 | 
 37 | 
 38 | \verb+./minia reads.fastq 31 3 100000000 minia_assembly_k31_m3+\\
 39 | 
 40 | All the parameters need to be specified, in the following order:
 41 | 
 42 | \begin{enumerate}
 43 | 
 44 | \item \verb+input_file+ -- the input file
 45 | 
 46 | \item \verb+kmer_size+  -- k-mer length 
 47 | 
\item \verb+min_abundance+ -- filters out k-mers seen fewer than the specified number of times
 49 | 
 50 | \item \verb+estimated_genome_size+ -- rough estimation of the size of the genome to assemble, in base pairs.
 51 | 
 52 | \item \verb+prefix+ -- any prefix string to store unique temporary files for this assembly
 53 | 
 54 | \end{enumerate}
 55 | 
 56 | Minia now uses the Cascading Bloom filters improvement (http://arxiv.org/abs/1302.7278) by default, thanks to Gustavo Sacomoto for the implementation in Minia. Launch Minia with the \verb!--original! option to revert to the original data structure.
 57 | 
 58 | 
 59 | \section{Explanation of parameters}
 60 | \begin{description}
 61 | 
 62 | \vitem+kmer_size+
 63 | The $k$-mer length is the length of the nodes in the de Bruijn graph. It strongly depends on the input dataset. A typical value to try for short Illumina reads (read length above $50$) is 27. For longer Illumina reads ($\approx 100$ bp) with sufficient coverage ($>$ 40x), we had good results with $k=43$.
 64 | 
 65 | \vitem+min_abundance+
The \verb+min_abundance+ is used to remove erroneous, low-abundance $k$-mers. This parameter also strongly depends on the dataset. It corresponds to the smallest number of times a correct $k$-mer appears in the reads. A typical value is $3$. Setting it to $1$ is not recommended\footnote{as no erroneous $k$-mer will be discarded, which will likely result in a very large memory usage}. If the dataset has high coverage, try larger values.
 67 | 
 68 | \vitem+estimated_genome_size+
 69 | 
 70 | The estimated genome size parameter (in base pairs) only controls the memory usage during the first phase of Minia (graph construction). \emph{It has no impact on the assembly}.
 71 | 
 72 | \vitem+prefix+
 73 | The \verb+prefix+ parameter is any arbitrary file name prefix, for example, \verb+test_assembly+.
 74 | 
 75 | \end{description}
 76 | 
 77 | \section{Input}
 78 | 
 79 | \begin{description}
 80 | \item \emph{FASTA/FASTQ}
 81 | 
Minia assembles any type of Illumina reads, given in the FASTA or FASTQ format. Paired-end or mate-pair reads are OK, but keep in mind that Minia discards pairing information.
\item \emph{Multiple Files}
 84 | 
 85 |  Minia can assemble multiple input files. Just create a text file containing the list of read files, one file name per line, and pass this list as the first parameter of Minia (instead of a FASTA/FASTQ file). Therefore the parameter \verb+input_file+ can be either (i) the read file itself (FASTA/FASTQ/compressed), or (ii) a file containing a list of file names.
 86 | \item \emph{line format}
 87 | 
 88 |  In FASTA files, each read can be split into multiple lines, whereas in FASTQ, each read sequence must be in a single line.
 89 | 
 90 | \item \emph{gzip compression}
 91 | 
Minia can directly read files compressed with gzip. The names of compressed files should end with '.gz'. Input files of different types can be mixed (i.e. gzipped or not, in FASTA or FASTQ).
 93 | 
 94 | \end{description}
 95 | 
 96 | \section{Output}
 97 | 
 98 | The output of Minia is a set of contigs in the FASTA format, in the file \verb+[prefix].contigs.fa+. 
 99 | 
100 | \section{Memory usage}
101 | 
We estimate that the memory usage of Minia is roughly $2$ GB of RAM per gigabase of the target genome to assemble. It is independent of the coverage of the input dataset, provided that the \verb!min_abundance! parameter is correctly set. For example, a human genome was assembled in $5.7$ GB of RAM. This was using the original data structure; the current implementation relies on Cascading Bloom filters and should use $\approx 1-2$ GB less memory. A better estimation of the memory usage can be found in the Appendix.
103 | 
104 | \section{Disk usage}
105 | 
Minia writes large temporary files during the k-mer counting phase. These files are written in the working directory from which you launched Minia. For better performance, run Minia on a local hard drive.
107 | 
108 | \section{Larger $k$-mer lengths}
109 | 
Minia supports arbitrarily large $k$-mer lengths. To compile Minia for $k$-mer lengths up to, say, 100, type:
111 | \begin{verbatim}
112 | make clean && make k=100
113 | \end{verbatim}
114 | 
115 | \section{Appendixes}
116 | 
117 | The rest of this manual describes the data structure used by Minia.
118 | The first text is from an original research article published at WABI 2012. The second text is an improvement made and implemented in Minia by other authors, published at WABI 2013.
119 | 
120 | \includepdf[pages=-]{../paper/wabi12.pdf}
121 | \includepdf[pages=-]{../paper/cascading-wabi13.pdf}
122 | 
123 | \end{document}
124 | 
125 | 


--------------------------------------------------------------------------------
/utils/ttmath/ttmaththreads.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file is a part of TTMath Bignum Library
  3 |  * and is distributed under the (new) BSD licence.
  4 |  * Author: Tomasz Sowa <t.sowa@ttmath.org>
  5 |  */
  6 | 
  7 | /* 
  8 |  * Copyright (c) 2006-2009, Tomasz Sowa
  9 |  * All rights reserved.
 10 |  * 
 11 |  * Redistribution and use in source and binary forms, with or without
 12 |  * modification, are permitted provided that the following conditions are met:
 13 |  * 
 14 |  *  * Redistributions of source code must retain the above copyright notice,
 15 |  *    this list of conditions and the following disclaimer.
 16 |  *    
 17 |  *  * Redistributions in binary form must reproduce the above copyright
 18 |  *    notice, this list of conditions and the following disclaimer in the
 19 |  *    documentation and/or other materials provided with the distribution.
 20 |  *    
 21 |  *  * Neither the name Tomasz Sowa nor the names of contributors to this
 22 |  *    project may be used to endorse or promote products derived
 23 |  *    from this software without specific prior written permission.
 24 |  *
 25 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 26 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 27 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 28 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 29 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 30 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 31 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 32 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 33 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 34 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 35 |  * THE POSSIBILITY OF SUCH DAMAGE.
 36 |  */
 37 | 
 38 | 
 39 | 
 40 | #ifndef headerfilettmaththreads
 41 | #define headerfilettmaththreads
 42 | 
 43 | #include "ttmathtypes.h"
 44 | 
 45 | #ifdef TTMATH_WIN32_THREADS
 46 | #include <windows.h>
 47 | #include <cstdio>
 48 | #endif
 49 | 
 50 | #ifdef TTMATH_POSIX_THREADS
 51 | #include <pthread.h>
 52 | #endif
 53 | 
 54 | 
 55 | 
 56 | /*!
 57 | 	\file ttmaththreads.h
 58 |     \brief Some objects used in multithreads environment
 59 | */
 60 | 
 61 | 
 62 | /*
 63 | 	this is a simple skeleton of a program in multithreads environment:
 64 | 
 65 | 	#define TTMATH_MULTITHREADS
 66 | 	#include<ttmath/ttmath.h>
 67 | 	
 68 | 	TTMATH_MULTITHREADS_HELPER
 69 | 
 70 | 	int main()
 71 | 	{
 72 | 	[...]
 73 | 	}
 74 | 
 75 | 	make sure that macro TTMATH_MULTITHREADS is defined and (somewhere in *.cpp file)
 76 | 	use TTMATH_MULTITHREADS_HELPER macro (outside of any classes/functions/namespaces scope)
 77 | */
 78 | 
 79 | 
 80 | namespace ttmath
 81 | {
 82 | 
 83 | 
 84 | #ifdef TTMATH_WIN32_THREADS
 85 | 
 86 | 	/*
 87 | 		we use win32 threads
 88 | 	*/
 89 | 
 90 | 
 91 | 	/*!
 92 | 		in multithreads environment you should use TTMATH_MULTITHREADS_HELPER macro
 93 | 		somewhere in *.cpp file
 94 | 
 95 | 		(at the moment in win32 this macro does nothing)
 96 | 	*/
 97 | 	#define TTMATH_MULTITHREADS_HELPER
 98 | 
 99 | 
100 | 	/*!
101 | 		objects of this class are used to synchronize
102 | 	*/
103 | 	class ThreadLock
104 | 	{
105 | 		HANDLE mutex_handle;
106 | 
107 | 
108 | 		void CreateName(char * buffer) const
109 | 		{
110 | 			#ifdef _MSC_VER
111 | 			#pragma warning (disable : 4996)
112 | 			// warning C4996: 'sprintf': This function or variable may be unsafe. Consider using sprintf_s instead.
113 | 			#endif
114 | 
115 | 			sprintf(buffer, "TTMATH_LOCK_%ul", (unsigned long)GetCurrentProcessId());
116 | 
117 | 			#ifdef _MSC_VER
118 | 			#pragma warning (default : 4996)
119 | 			#endif
120 | 		}
121 | 
122 | 
123 | 	public:
124 | 
125 | 		bool Lock()
126 | 		{
127 | 		char buffer[50];
128 | 
129 | 			CreateName(buffer);
130 | 			mutex_handle = CreateMutexA(0, false, buffer);
131 | 
132 | 			if( mutex_handle == 0 )
133 | 				return false;
134 | 
135 | 			WaitForSingleObject(mutex_handle, INFINITE);
136 | 
137 | 		return true;
138 | 		}
139 | 
140 | 
141 | 		ThreadLock()
142 | 		{
143 | 			mutex_handle = 0;
144 | 		}
145 | 
146 | 
147 | 		~ThreadLock()
148 | 		{
149 | 			if( mutex_handle != 0 )
150 | 			{
151 | 				ReleaseMutex(mutex_handle);
152 | 				CloseHandle(mutex_handle);
153 | 			}
154 | 		}
155 | 	};
156 | 
157 | #endif  // #ifdef TTMATH_WIN32_THREADS
158 | 
159 | 
160 | 
161 | 
162 | 
163 | #ifdef TTMATH_POSIX_THREADS
164 | 
165 | 	/*
166 | 		we use posix threads
167 | 	*/
168 | 
169 | 
170 | 	/*!
171 | 		in multithreads environment you should use TTMATH_MULTITHREADS_HELPER macro
172 | 		somewhere in *.cpp file
173 | 		(this macro defines a pthread_mutex_t object used by TTMath library)
174 | 	*/
175 | 	#define TTMATH_MULTITHREADS_HELPER                          \
176 | 	namespace ttmath                                            \
177 | 	{                                                           \
178 | 	pthread_mutex_t ttmath_mutex = PTHREAD_MUTEX_INITIALIZER;   \
179 | 	}
180 | 
181 | 
182 | 	/*!
183 | 		ttmath_mutex will be defined by TTMATH_MULTITHREADS_HELPER macro 
184 | 	*/
185 | 	extern pthread_mutex_t ttmath_mutex;
186 | 
187 | 
188 | 	/*!
189 | 		objects of this class are used to synchronize
190 | 	*/
191 | 	class ThreadLock
192 | 	{
193 | 	public:
194 | 
195 | 		bool Lock()
196 | 		{
197 | 			if( pthread_mutex_lock(&ttmath_mutex) != 0 )
198 | 				return false;
199 | 
200 | 		return true;
201 | 		}
202 | 
203 | 
204 | 		~ThreadLock()
205 | 		{
206 | 			pthread_mutex_unlock(&ttmath_mutex);
207 | 		}
208 | 	};
209 | 
210 | #endif // #ifdef TTMATH_POSIX_THREADS
211 | 
212 | 
213 | 
214 | 
215 | #if !defined(TTMATH_POSIX_THREADS) && !defined(TTMATH_WIN32_THREADS)
216 | 
217 | 	/*!
218 | 		we don't use win32 and pthreads
219 | 	*/
220 | 
221 | 	/*!
222 | 	*/
223 | 	#define TTMATH_MULTITHREADS_HELPER
224 | 
225 | 
226 | 	/*!
227 | 		objects of this class are used to synchronize
228 | 		actually we don't synchronize, the method Lock() returns always 'false'
229 | 	*/
230 | 	class ThreadLock
231 | 	{
232 | 	public:
233 | 
234 | 		bool Lock()
235 | 		{
236 | 			return false;
237 | 		}
238 | 	};
239 | 
240 | 
241 | #endif // #if !defined(TTMATH_POSIX_THREADS) && !defined(TTMATH_WIN32_THREADS)
242 | 
243 | 
244 | 
245 | 
246 | 
247 | } // namespace
248 | 
249 | #endif
250 | 
251 | 


--------------------------------------------------------------------------------
/utils/tests/KmerTests.cpp:
--------------------------------------------------------------------------------
  1 | #include "KmerTests.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <time.h>
  5 | #include <string.h>
  6 | #include <unistd.h>
  7 | #include <sys/types.h>
  8 | #include <inttypes.h>
  9 | #include <stdint.h>
 10 | #include <algorithm> // for max/min
 11 | #include <vector> // for sorting_kmers
 12 | #include <sys/time.h>
 13 | #include "TestUtils.h"
 14 | 
 15 | using namespace std;
 16 | 
 17 | namespace kmerTests{
 18 | 
 19 | void shift_kmer_forward_testOverlap(){
 20 |     char* testName = (char*)"shift_kmer_forward_testOverlap";
 21 |     kmer_type kmer = test_kmer;
 22 |     kmer_type originalKmer = kmer;
 23 | 
 24 |     shift_kmer(&kmer, 1,0);
 25 |     
 26 |     if(!kmer_matches_kmer(kmer, 0, originalKmer,1)){
 27 |         fail(testName, (char*)"overlapping portion of strings don't match.");
 28 |         return;
 29 |     }
 30 |     succeed(testName);
 31 | }
 32 | 
 33 | void shift_kmer_forward_testLastChar(){
 34 |     char* testName = (char*)"shift_kmer_forward_testLastChar";
 35 |     kmer_type kmer = test_kmer;
 36 |     char* kmerSeq = new char[sizeKmer];
 37 | 
 38 |     shift_kmer(&kmer, 1,0);
 39 |     code2seq(kmer,kmerSeq);
 40 | 
 41 |     if(kmerSeq[sizeKmer-1] != 'C'){ 
 42 |         fail(testName);
 43 |         return;
 44 |     }
 45 |     succeed(testName);
 46 | }
 47 | 
 48 | void shift_kmer_backward_testOverlap(){
 49 |      char* testName = (char*)"shift_kmer_forward_testOverlap";
 50 |     kmer_type kmer = test_kmer;
 51 |     kmer_type originalKmer = kmer;
 52 | 
 53 |     shift_kmer(&kmer, 1,1);
 54 |     
 55 |     if(!kmer_matches_kmer(kmer, 1, originalKmer,0)){
 56 |         fail(testName, (char*)"overlapping portion of strings don't match.");
 57 |         return;
 58 |     }
 59 |     succeed(testName);
 60 | }
 61 | 
 62 | void shift_kmer_backward_testFirstChar(){
 63 |     char* testName = (char*)"shift_kmer_forward_testLastChar";
 64 |     kmer_type kmer = test_kmer;
 65 |     char* kmerSeq = new char[sizeKmer];
 66 |     shift_kmer(&kmer, 2,1);
 67 |     code2seq(kmer,kmerSeq);
 68 |     
 69 |     if(kmerSeq[0] != 'T'){  
 70 |         fail(testName);
 71 |         return;
 72 |     }
 73 |     succeed(testName);
 74 | }
 75 | 
 76 | void next_kmer_forward_testOverlap(){
 77 |     char* testName = (char*)"next_kmer_forward_testOverlap";
 78 |     kmer_type kmer = test_kmer;
 79 |     
 80 |     kmer_type nextKmer= next_kmer(kmer,0,0);
 81 |     
 82 |     if(!kmer_matches_kmer(kmer, 1, nextKmer,0)){
 83 |         fail(testName, (char*)"overlapping portion of strings don't match.");
 84 |         return;
 85 |     }
 86 |     succeed(testName);
 87 | }
 88 | 
 89 | void next_kmer_forward_testLastChar(){
 90 |     char* testName = (char*)"ext_kmer_forward_testLastChar";
 91 |     kmer_type kmer = test_kmer;
 92 |     char* nextString = new char[sizeKmer];
 93 |     
 94 |     code2seq(next_kmer(kmer,1,0), nextString);
 95 |     
 96 |     if(nextString[sizeKmer-1] != 'C'){  
 97 |         fail(testName);
 98 |         return;
 99 |     }
100 |     succeed(testName);
101 | }
102 | 
103 | void next_kmer_backward_testOverlap(){
104 |      char* testName = (char*)"next_kmer_forward_testOverlap";
105 |     kmer_type kmer = test_kmer;
106 |     
107 |     kmer_type nextKmer= next_kmer(kmer,0,1);
108 |     
109 |     if(!kmer_matches_kmer(kmer, 0, nextKmer,1)){
110 |         fail(testName, (char*)"overlapping portion of strings don't match.");
111 |         return;
112 |     }
113 |     succeed(testName);
114 | }
115 | 
116 | void next_kmer_backward_testFirstChar(){
117 |     char* testName = (char*)"ext_kmer_backward_testFirstChar";
118 |     kmer_type kmer = test_kmer;
119 |     char* nextString = new char[sizeKmer];
120 |     
121 |     code2seq(next_kmer(kmer,1,1), nextString);
122 | 
123 |     if(nextString[0] != 'C'){  
124 |         fail(testName);
125 |         return;
126 |     }
127 |     succeed(testName);
128 | }
129 | 
130 | void getFirstKmerFromRead_test(){
131 |     char* testName = (char*) "getFirstKmerFromRead_test";
132 |     char* read = (char*)"ACGGGGGTCAAAATCGGGAATCCGGGGGGAGGCCCTAGT";
133 |     kmer_type kmer;
134 |     
135 |     getFirstKmerFromRead(&kmer, read);
136 |     
137 |     if(!kmer_matches_readseq(read, kmer, 0)){
138 |         fail(testName);
139 |         return;
140 |     }
141 |     succeed(testName);
142 | }
143 | 
144 | void nextKmerInRead_test_forward(){
145 |     char* testName = (char*) "nextKmerInRead_test_forward";
146 |     kmer_type kmer = test_kmer;
147 |     char* kmerSeq = new char [sizeKmer];
148 |     code2seq(kmer, kmerSeq);
149 | 
150 |     char* read = new char [100];
151 |     read[0] = 'C';
152 |     strcpy(&read[1], kmerSeq);
153 |     read[1+sizeKmer] = 'T';
154 | 
155 |     kmer_type next_kmer = next_kmer_in_read(kmer, 1, read, 0);
156 | 
157 |     if(!kmer_matches_readseq(read, next_kmer,2)){
158 |         fail(testName);
159 |         return;
160 |     }
161 |     succeed(testName);
162 | }
163 | 
164 | 
165 | void nextKmerInRead_test_backward(){
166 |     char* testName = (char*) "nextKmerInRead_test_backward";
167 |     kmer_type kmer = test_kmer;
168 |     char* kmerSeq = new char [sizeKmer];
169 |     code2seq(kmer, kmerSeq);
170 | 
171 |     char* read = new char [100];
172 |     read[0] = 'C';
173 |     strcpy(&read[1], kmerSeq);
174 |     read[1+sizeKmer] = 'T';
175 | 
176 |     kmer_type next_kmer = next_kmer_in_read(kmer, 1, read, 1);
177 | 
178 |     if(!kmer_matches_readseq(read, next_kmer,0)){
179 |         fail(testName);
180 |         return;
181 |     }
182 | 
183 |     succeed(testName);
184 | }
185 | 
186 | void advanceKmer_test(){
187 |     char* testName = (char*) "advanceKmer_test";
188 |     kmer_type kmer = test_kmer;
189 |     char* kmerSeq = new char [sizeKmer];
190 |     code2seq(kmer, kmerSeq);
191 |     char* read = new char [100];
192 |     strcpy(&read[0], (char*) "CGGT"); 
193 |     strcpy(&read[4], kmerSeq);
194 |     strcpy(&read[4+sizeKmer], (char*)"ACCCGTTTAAACGTTTAGCCTCTCTGAGAGAAAA");
195 |     
196 |     advance_kmer(&read[0], &kmer, 4,15);
197 | 
198 |     if(!kmer_matches_readseq(read, kmer, 15)){
199 |         fail(testName);
200 |         return;
201 |     }
202 |     succeed(testName);
203 | }
204 | 
205 | void runKmerTests(){
206 |     setSizeKmer(27);
207 | 
208 | 
209 |     shift_kmer_forward_testOverlap();
210 |     shift_kmer_forward_testLastChar();
211 |     shift_kmer_backward_testOverlap();
212 |     shift_kmer_backward_testFirstChar();
213 | 
214 |     next_kmer_forward_testOverlap();
215 |     next_kmer_forward_testLastChar();
216 |     next_kmer_backward_testOverlap();
217 |     next_kmer_backward_testFirstChar();
218 | 
219 |     getFirstKmerFromRead_test();
220 |     
221 |     nextKmerInRead_test_forward();
222 |     nextKmerInRead_test_backward();
223 | 
224 |     advanceKmer_test();
225 | }
226 | 
227 | }


--------------------------------------------------------------------------------
/utils/tests/RollingHashTests.cpp:
--------------------------------------------------------------------------------
  1 | #include "RollingHashTests.h"
  2 | #include <stdio.h>
  3 | 
  4 | namespace rollingHashTests
  5 | {
  6 |     
// Bloom filter under test; every test case below assigns a newly allocated instance to it.
Bloom* bloom;
// k-mer size: passed to the Bloom constructor and to setSizeKmer() in runRollingHashTests().
int kVal = 27;
  9 | 
 10 | void rotate_right_test_moveBitRight(){
 11 |     char* testName = (char*)"rotate_right_test_move1BitRight";
 12 |     bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10
 13 |     uint64_t hash = (uint64_t)(1 << 7);
 14 | 
 15 |     uint64_t rotated = bloom->rotate_right(hash, 7);
 16 | 
 17 |     if(rotated != 1){
 18 |         fail(testName);
 19 |     }
 20 |     succeed(testName);
 21 | }
 22 | 
 23 | 
 24 | void rotate_right_test_wrapBit(){
 25 |     char* testName = (char*)"rotate_right_test_wrapBit";
 26 |     bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10
 27 |     uint64_t hash = (uint64_t)1;
 28 | 
 29 |     uint64_t rotated = bloom->rotate_right(hash, 7);
 30 | 
 31 |     if(rotated != (1 << 3)){
 32 |         fail(testName);
 33 |     }
 34 |     succeed(testName);
 35 | }
 36 | 
 37 | 
 38 | 
 39 | void rotate_right_test_noOverFlow(){
 40 |     char* testName = (char*)"rotate_right_test_noOverFlow";
 41 |     bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10
 42 |     uint64_t hash = (uint64_t)1000;
 43 | 
 44 |     uint64_t rotated = bloom->rotate_right(hash, 7);
 45 | 
 46 |     if(rotated > 1024){
 47 |         fail(testName);
 48 |     }
 49 |     succeed(testName);
 50 | }
 51 | 
 52 | 
 53 | void rotate_left_test_moveBitLeft(){
 54 |     char* testName = (char*)"rotate_Left_test_move1BitLeft";
 55 |     bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10
 56 |     uint64_t hash = (uint64_t)1;
 57 | 
 58 |     uint64_t rotated = bloom->rotate_left(hash, 7);
 59 | 
 60 |     if(rotated != (1 << 7)){
 61 |         fail(testName);
 62 |     }
 63 |     succeed(testName);
 64 | }
 65 | 
 66 | void rotate_left_test_wrapBit(){
 67 |     char* testName = (char*)"rotate_left_test_wrapBit";
 68 |     bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10
 69 |     uint64_t hash = (uint64_t)(1 << 5);
 70 | 
 71 |     uint64_t rotated = bloom->rotate_left(hash, 7);
 72 | 
 73 |     if(rotated != (1 << 2)){
 74 |         fail(testName);
 75 |     }
 76 |     succeed(testName);
 77 | }
 78 | 
 79 | 
 80 | void rotate_left_test_noOverFlow(){
 81 |     char* testName = (char*)"rotate_left_test_noOverFlow";
 82 |     bloom = new Bloom((uint64_t)1000, kVal); //hashSize should be 10
 83 |     uint64_t hash = (uint64_t)1000;
 84 | 
 85 |     uint64_t rotated = bloom->rotate_left(hash, 7);
 86 | 
 87 |     if(rotated > 1024){
 88 |         fail(testName);
 89 |     }
 90 |     succeed(testName);
 91 | }
 92 | 
 93 | void roll_hash_hash_func0_bigbloom_checkSame(){
 94 |     bloom = new Bloom((uint64_t)1000,kVal);
 95 |     char* testName = (char*)"roll_hash_hash_func0_bigbloom_checkSame";
 96 |     char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA";
 97 |     kmer_type firstKmer, secondKmer, kmer;
 98 | 
 99 |     getFirstKmerFromRead(&firstKmer, &(kmerSeq[0]));
100 |     getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer]));
101 | 
102 |     uint64_t rolledHash =  bloom->get_rolling_hash(firstKmer, 0);
103 |     uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 0);
104 | 
105 |     for(int i = 0; i < sizeKmer; i++){
106 |         rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),0);
107 |     }
108 | 
109 |     if(rolledHash != calculatedHash){
110 |         fail(testName);
111 |         return;
112 |     }
113 |     succeed(testName);
114 | }
115 | 
116 | void roll_hash_hash_func1_bigbloom_checkSame(){
117 |     bloom = new Bloom((uint64_t)10000, kVal);
118 |     char* testName = (char*)"roll_hash_hash_func1_bigbloom_checkSame";
119 |     char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA";
120 |     kmer_type firstKmer, secondKmer, kmer;
121 | 
122 |     getFirstKmerFromRead(&firstKmer, &(kmerSeq[0]));
123 |     getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer]));
124 | 
125 |     uint64_t rolledHash =  bloom->get_rolling_hash(firstKmer, 1);
126 |     uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 1);
127 | 
128 |    
129 |     for(int i = 0; i < sizeKmer; i++){
130 |          rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),1);
131 |     }
132 | 
133 |     if(rolledHash != calculatedHash){
134 |         fail(testName);
135 |         return;
136 |     }
137 |     succeed(testName);
138 | }
139 | 
140 | void roll_hash_hash_func0_smallbloom_checkSame(){
141 |     bloom = new Bloom((uint64_t)100000, kVal);
142 |     char* testName = (char*)"roll_hash_hash_func0_smallbloom_checkSame";
143 |     char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA";
144 |     kmer_type firstKmer, secondKmer, kmer;
145 | 
146 |     getFirstKmerFromRead(&firstKmer, &(kmerSeq[0]));
147 |     getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer]));
148 | 
149 |     uint64_t rolledHash =  bloom->get_rolling_hash(firstKmer, 0);
150 |     uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 0);
151 | 
152 |     for(int i = 0; i < sizeKmer; i++){
153 |          rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),0);
154 |     }
155 | 
156 |     if(rolledHash != calculatedHash){
157 |         fail(testName);
158 |         return;
159 |     }
160 |     succeed(testName);
161 | }
162 | 
163 | void roll_hash_hash_func1_smallbloom_checkSame(){
164 |     bloom = new Bloom((uint64_t)1000000, kVal);
165 |     char* testName = (char*)"roll_hash_hash_func1_smallbloom_checkSame";
166 |     char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA";
167 |     kmer_type firstKmer, secondKmer, kmer;
168 | 
169 |     getFirstKmerFromRead(&firstKmer, &(kmerSeq[0]));
170 |     getFirstKmerFromRead(&secondKmer, &(kmerSeq[sizeKmer]));
171 | 
172 |     uint64_t rolledHash =  bloom->get_rolling_hash(firstKmer, 1);
173 |     uint64_t calculatedHash = bloom->get_rolling_hash(secondKmer, 1);
174 | 
175 |    
176 |     for(int i = 0; i < sizeKmer; i++){
177 |          rolledHash = bloom->roll_hash(rolledHash, NT2int(kmerSeq[i]), NT2int(kmerSeq[i+sizeKmer]),1);
178 |     }
179 | 
180 |     if(rolledHash != calculatedHash){
181 |         fail(testName);
182 |         return;
183 |     }
184 |     succeed(testName);
185 | }
186 | 
187 | void advance_hash_test_checkSame(){
188 |     bloom = new Bloom((uint64_t)10000, kVal);
189 |     char* testName = (char*) "advance_hash_test_checkSame";
190 |     char* kmerSeq = (char*)"ACTTACTGGGCTCTATTGCGTATCGATCGATCGATGCATCTACCCCCATCTAATTAGAGTGAATAGATCGATCGATCGCATACTCAGCATAGCTATA";
191 |     kmer_type firstKmer, secondKmer;
192 | 
193 |     getFirstKmerFromRead(&firstKmer, &(kmerSeq[0]));
194 |     getFirstKmerFromRead(&secondKmer, &(kmerSeq[kVal]));
195 | 
196 |     uint64_t advancedHash0 =  bloom->get_rolling_hash(firstKmer, 0);
197 |     uint64_t advancedHash1 =  bloom->get_rolling_hash(firstKmer, 1);
198 |     uint64_t calculatedHash0 = bloom->get_rolling_hash(secondKmer, 0);
199 |     uint64_t calculatedHash1 = bloom->get_rolling_hash(secondKmer, 1);
200 | 
201 |     bloom->advance_hash(&kmerSeq[0], &advancedHash0, &advancedHash1,0,kVal);
202 | 
203 |     if(advancedHash0 != calculatedHash0){
204 |         fail(testName, (char*)"hash 0 doesn't match.");
205 |         return;
206 |     }
207 |     if(advancedHash1 != calculatedHash1){
208 |         fail(testName, (char*)"hash 1 doesn't match.");
209 |         return;
210 |     }
211 |     succeed(testName);
212 | 
213 | }
214 | 
215 | 
216 | void runRollingHashTests(){
217 |     setSizeKmer(kVal);
218 | 
219 |     rotate_right_test_moveBitRight();
220 |     rotate_right_test_wrapBit();
221 |     rotate_right_test_noOverFlow();
222 | 
223 |     rotate_left_test_moveBitLeft();
224 |     rotate_left_test_wrapBit();
225 |     rotate_left_test_noOverFlow();
226 | 
227 |     roll_hash_hash_func0_bigbloom_checkSame();
228 |      roll_hash_hash_func1_bigbloom_checkSame();
229 |      roll_hash_hash_func0_smallbloom_checkSame();
230 |      roll_hash_hash_func1_smallbloom_checkSame();
231 | 
232 |     advance_hash_test_checkSame();  
233 | }
234 | 
235 | }


--------------------------------------------------------------------------------
/utils/ContigJuncList.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <sstream>
  3 | #include <deque>
  4 | #include <algorithm>   
  5 | #include <fstream>
  6 | 
  7 | #include "ContigJuncList.h"
  8 | 
  9 | using std::stringstream;
 10 | 
 11 | ContigJuncList::ContigJuncList(std::string sequence, junc_list dist, junc_list cov){
 12 |   seq = sequence;
 13 |   distances = dist;
 14 |   coverages = cov;   
 15 | } 
 16 | 
 17 | ContigJuncList::ContigJuncList(){
 18 |   distances.clear();
 19 |   coverages.clear();   
 20 |   seq = "";
 21 | }
 22 | 
 23 | int ContigJuncList::size(){
 24 |     return coverages.size();
 25 | }
 26 | 
 27 | void ContigJuncList::printJuncValues(){
 28 |     for(auto itC = coverages.begin(), itD = distances.begin(); itC != coverages.end(); itC++, itD++){
 29 |         std::cout << (unsigned int)*itD << ", " << (unsigned int)*itC <<" ; "; 
 30 |     }
 31 |     std::cout << "\n";
 32 | }
 33 | 
 34 | void ContigJuncList::printJuncResults(std::list<JuncResult> results){
 35 |     for(auto it = results.begin(); it != results.end(); ++it){
 36 |         std::cout << print_kmer(it->kmer) << " " << it->distance << " " 
 37 |             << it->coverage << " , ";
 38 |     }
 39 |     std::cout << "\n";
 40 | }
 41 | 
 42 | void ContigJuncList::printJuncResults(int side, int startDist, int maxDist){
 43 |     printJuncResults(getJuncResults(side, startDist, maxDist));
 44 | }
 45 | 
 46 | int ContigJuncList::length(){
 47 |     return seq.length();
 48 | }
 49 | 
 50 | bool ContigJuncList::isValidKmerPosition(int pos){
 51 |     return pos >= 0 && pos <= (getSeq().length() - sizeKmer + 1)*2;
 52 | }
 53 | 
 54 | //0 is the first backward kmer, 1 is the first forward kmer, 
 55 | //2 is the second backward kmer, etc.
 56 | kmer_type ContigJuncList::getKmer(int pos){
 57 |     if(pos % 2 == 1){
 58 |         return getKmerFromRead(getSeq(), pos/2);
 59 |     }
 60 |     else{
 61 |         return revcomp(getKmerFromRead(getSeq(), pos/2));
 62 |     }
 63 | }
 64 | 
 65 | //Gets a list of JuncResults, specifying distance, coverage, and kmer for juncs
 66 | //If startForward, assumes first junc faces forward (in towards contig)
 67 | //If !startForward, assumes first junc faces backward (away from contig)
 68 | //Always assumes no reverse needed- contig can reverse it if necessary before calling 
 69 | std::list<JuncResult> ContigJuncList::getJuncResults(bool startForward, int startDist, int maxDist){
 70 |     std::list<JuncResult> results = {};
 71 |     int startPos = 0;
 72 |     if(startForward){
 73 |         results.push_back(JuncResult(getKmer(3), 2+startDist,coverages.front()));
 74 |         startPos = 1;
 75 |     }
 76 |     int pos = startPos;
 77 |     for(auto itD = distances.begin(), itC = ++coverages.begin();
 78 |         itD != distances.end(); 
 79 |         itD++, itC++){
 80 |         pos += *itD;
 81 |         //real extension is 2 in front if the junc faces forward, or 2 behind if it faces backward
 82 |         int offset = 2;
 83 |         if(pos % 2 == 0){
 84 |             offset = -2;
 85 |         }
 86 |         if(pos + offset - startPos + startDist <= maxDist){
 87 |             if(isValidKmerPosition(pos+offset)){
 88 |                 results.push_back(JuncResult(getKmer(pos + offset), pos + offset-startPos + startDist, *itC));
 89 |             }
 90 |         }
 91 |         else { break; }
 92 |         
 93 |     }
 94 |     return results;
 95 | }
 96 | 
 97 | //Used for reversing a contig.  Simply reverses both lists
 98 | void ContigJuncList::reverse(){
 99 |     std::reverse(coverages.begin(), coverages.end());
100 |     std::reverse(distances.begin(), distances.end());
101 |     // std::cout << seq << ", " << revcomp_string(seq) << std::endl;
102 |     seq = revcomp_string(seq);
103 | }
104 | 
105 | //Concatenates this list of juncs with another 
106 | //Removes overlap of middle coverage and middle distance
107 | ContigJuncList ContigJuncList::concatenate(ContigJuncList otherList){
108 | 
109 |     junc_list newCov(coverages);
110 |     newCov.pop_back();
111 | 
112 |     unsigned char lastCov = coverages.back();
113 |     unsigned char firstCov = otherList.coverages.front();
114 | 
115 |     newCov.push_back((unsigned char) std::min((int)lastCov, (int)firstCov));
116 | 
117 |     newCov.insert(newCov.end(), ++otherList.coverages.begin(), otherList.coverages.end());
118 | 
119 |     junc_list newDist(distances);
120 |     newDist.insert(newDist.end(), otherList.distances.begin(), otherList.distances.end());
121 |     std::string newSeq  = getSeq().substr(0, getSeq().length()-sizeKmer) + otherList.getSeq();
122 |     return ContigJuncList(newSeq, newDist, newCov);
123 | }
124 | 
125 | // shifts coverage value up only up to maxDist - the rest are uneffected
126 | ContigJuncList ContigJuncList::getShiftedCoverageContigJuncsRange(double shift, int maxDist, int side){
127 |     junc_list newCov;
128 |     for(auto itD = distances.begin(), itC = coverages.begin();
129 |         itC != coverages.end(); ){
130 | 
131 |         double val = (double) *itC+shift; 
132 |         if ((*itD < maxDist && side == 1) || (*itD > seq.length() - maxDist && side == 2 && seq.length() > maxDist)){ 
133 |             newCov.push_back((int) std::round((val > 255) ? 255: val));
134 |         }else{
135 |             newCov.push_back(*itC);
136 |         }
137 |         ++itD;
138 |         ++itC;
139 |     }
140 |     return ContigJuncList(seq,distances,newCov);
141 | }
142 | 
143 | ContigJuncList ContigJuncList::getShiftedCoverageContigJuncs(double shift){
144 |     junc_list newCov(coverages);
145 |     for (int i = 0; i < newCov.size(); i++){
146 |         double val = (double) newCov.at(i)+shift;
147 |         newCov.at(i) =  (int) std::round((val > 255) ? 255: val);
148 |     }   
149 |     return ContigJuncList(seq,distances,newCov);
150 | }
151 | 
152 | 
153 | ContigJuncList ContigJuncList::getScaledContigJuncs(double scale_factor){
154 |     junc_list newCov(coverages);
155 |     for (int i = 0; i < newCov.size(); i++){
156 |         newCov.at(i) =  (int) std::round(newCov.at(i) * scale_factor);
157 |     }
158 |     return ContigJuncList(seq,distances,newCov);
159 | }
160 | 
161 | //Averages all coverage values in list
162 | double ContigJuncList::getAvgCoverage(){
163 |     double covSum = 0;
164 |     if(coverages.size()== 0){
165 |         printf("ERROR: empty junctions list\n");
166 |         return 0;
167 |     }
168 |     for(auto it = coverages.begin(); it != coverages.end(); ++it){
169 |         covSum += (double) *it;
170 |     }
171 |     return covSum / coverages.size();
172 | }
173 | 
174 | double ContigJuncList::getAvgCoverage(std::list<JuncResult> results){
175 |     if (results.size()==0){return 0;}
176 |     double covSum = 0;
177 |     for(auto it = results.begin(); it != results.end(); ++it){
178 |         covSum += (double) it->coverage; 
179 |     }
180 |     return covSum / results.size();
181 | }
182 | 
183 | double ContigJuncList::getCoverageSampleVariance(){
184 |     if(coverages.size() < 2){
185 |         // printf("ERROR: 1 or 0 values in junctions list\n");
186 |         return 0;
187 |     }
188 |     double mean = getAvgCoverage();
189 |     double sum_sqrs = 0;
190 |     for(auto it = coverages.begin(); it != coverages.end(); ++it){
191 |         sum_sqrs += pow(mean - (double) *it, 2); 
192 |     }
193 |     return pow(sum_sqrs / (coverages.size()-1), 0.5);
194 | }
195 | 
196 | double ContigJuncList::getCoverageSampleVariance(std::list<JuncResult> results){
197 |     if (results.size() < 2) {return 0;}
198 |     double mean = getAvgCoverage(results);
199 |     double sum_sqrs = 0;
200 |     for(auto it = results.begin(); it != results.end(); ++it){
201 |         sum_sqrs += pow(mean - (double) it->coverage, 2); 
202 |     }
203 |     return pow(sum_sqrs / (results.size()-1), 0.5);
204 | }
205 | 
206 | 
207 | //Sums all distance values
208 | int ContigJuncList::getTotalDistance(){
209 |     int totalDist = 0;
210 |     for(auto it = distances.begin(); it != distances.end(); ++it){
211 |         totalDist += (int) *it;
212 |     }
213 |     return totalDist;
214 | }
215 | 
216 | //Prints distances then coverages to a string
217 | std::string ContigJuncList::getStringRep(){
218 |     stringstream stream;
219 |     stream << getSeq() << "\n";
220 |     stream << "Distances: ";
221 |     for(auto it = distances.begin(); it != distances.end(); ++it){
222 |         stream << (int)*it << " ";
223 |     }
224 |     stream << ". Coverages: ";
225 |     for(auto it = coverages.begin(); it != coverages.end(); ++it){
226 |         stream << (int)*it << " ";
227 |     }
228 |     return stream.str();
229 | }
230 | 


--------------------------------------------------------------------------------
/src/newTests/JunctionMapTest.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <map>
  3 | #include "gtest/gtest.h"
  4 | #include "../ReadScanner.h"
  5 | #include "../../utils/Bloom.h"
  6 | #include "../../utils/JunctionMap.h"
  7 | #include "../ContigGraph.h"
  8 | using std::map;
  9 | // #include "../../utils/sparsepp.h"
 10 | // using spp::std::unordered_map;
 11 | 
 12 | 
 13 | class juncMapData : public ::testing::Test {
 14 | 
 15 | protected:
 16 |     std::vector<string> reads;
 17 |     std::vector<string> kmers;
 18 | 
 19 |     Bloom* bloom;
 20 |     ReadScanner* scanner;
 21 |     int j;
 22 |     int read_length;
 23 |     int estimated_kmers;
 24 |     int maxSpacerDist;
 25 |     double fpRate;
 26 | 
 27 |     JChecker* jchecker;
 28 |     JunctionMap* junctionMap;
 29 |     Bloom* short_pair_filter;
 30 |     Bloom* long_pair_filter;
 31 |     ContigGraph* contigGraph = new ContigGraph();
 32 | 
 33 |     // Build a kmer out of a string input
 34 |     kmer_type getKmerFromString(string kmerString){
 35 |         kmer_type kmer;
 36 |         getFirstKmerFromRead(&kmer, &(kmerString[0]));
 37 |         return kmer;
 38 |     }
 39 | 
 40 |     // Add the kmers from a vector of strings to a fake bloom filter
 41 |     std::set<kmer_type> addKmers(Bloom* bloom, std::vector<string> kmers) {
 42 |         std::set<kmer_type> valids {};
 43 |         for (string kmer : kmers) {
 44 |             valids.insert(get_canon(getKmerFromString(kmer)));
 45 |         }
 46 | 
 47 |         bloom->addFakeKmers(valids);
 48 |         valids.clear();
 49 |     }
 50 | 
 51 |     // Create a bloom filter, but make it a fake one
 52 |     Bloom* createBloom(){
 53 |         Bloom* fakeBloom =  fakeBloom->create_bloom_filter_optimal(estimated_kmers, fpRate);
 54 |         fakeBloom->fakify();
 55 |         return fakeBloom;
 56 |     }
 57 | 
 58 |     // This method should be used and modified to print whatever we want ot check about the resulting junction map
 59 |     void printJunctionMap(ReadScanner scanner) {
 60 |         auto map = scanner.getJunctionMap()->junctionMap;
 61 |         printf("Size: %d \n", map.size());
 62 |         for (auto& kv : map){
 63 |             printf("%s \n", print_kmer(kv.first));
 64 |             printf("%d %d %d %d %d \n",
 65 |                 kv.second.dist[0], kv.second.dist[1], kv.second.dist[2], kv.second.dist[3], kv.second.dist[4]);
 66 |         }
 67 |     }
 68 | 
 69 |     void printContigGraph(ContigGraph* graph){
 70 |         ContigIterator* contigIt = new ContigIterator(graph);
 71 |         //prints contigs that are adjacent to nodes
 72 |         while(contigIt->hasNextContig()){
 73 |             Contig* contig = contigIt->getContig();
 74 |             graph->printContigFastG(&std::cout, contig);
 75 |         }
 76 |         //prints isolated contigs
 77 |         std::vector<Contig> * isolated_contigs = graph->getIsolatedContigs();
 78 |         for(auto it = isolated_contigs->begin(); it != isolated_contigs->end(); ++it){
 79 |             Contig* contig = &*it;       
 80 |             graph->printContigFastG(&std::cout, contig);
 81 |         }
 82 |     }
 83 | 
 84 |     // set up blooms, junction map, jchecker, readscanner for testing
 85 |     juncMapData() {
 86 |         read_length = 30;
 87 |         estimated_kmers = 35;
 88 |         maxSpacerDist = 8;
 89 |         fpRate = .1;
 90 |         kmers = {};
 91 |         reads = {};
 92 | 
 93 |         bloom = createBloom(); 
 94 |         jchecker = new JChecker(j, bloom);
 95 | 
 96 |         junctionMap = new JunctionMap(bloom, jchecker, read_length);
 97 |         string read_scan_file = "mock_file";
 98 | 
 99 |         short_pair_filter = short_pair_filter->create_bloom_filter_optimal(estimated_kmers/9, fpRate);
100 |         long_pair_filter = long_pair_filter->create_bloom_filter_optimal(estimated_kmers/6, fpRate);
101 | 
102 |         scanner = new ReadScanner(junctionMap, read_scan_file, bloom, short_pair_filter, long_pair_filter, jchecker, maxSpacerDist);
103 |         printf("Done initializing!\n");
104 |         contigGraph = new ContigGraph();
105 |     }
106 |     ~juncMapData(){
107 |         reads.clear();
108 |         kmers.clear();
109 |         delete jchecker;
110 |         delete short_pair_filter;
111 |         delete long_pair_filter;
112 |         delete bloom;
113 |         delete junctionMap;
114 |         delete scanner;
115 |         delete contigGraph;
116 |     }
117 | };
118 | 
119 | 
120 | // build junction map of three reads
121 | TEST_F(juncMapData, buildBranchingPaths) {
122 |     setSizeKmer(5);
123 |         j = 1;
124 |     reads = {"ACGGGCGAACTTTCATAGGA", "GGCGAACTAGTCCAT", "AACTTTCATACGATT"};
125 |     kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT","ACTTT",
126 |         "CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA","GGCGA", "GCGAA", "CGAAC", 
127 |         "GAACT", "AACTA","ACTAG", "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT","AACTT", 
128 |         "ACTTT", "CTTTC", "TTTCA", "TTCAT", "TCATA", "CATAC", "ATACG", "TACGA", "ACGAT","CGATT"};
129 |     addKmers(bloom, kmers);
130 | 
131 |     scanner->scanInputRead(reads[0], true);
132 |     scanner->scanInputRead(reads[1], true);
133 |     scanner->scanInputRead(reads[2], true);
134 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
135 | 
136 |     // Expected junctions & distances before changes
137 |     // CTAGT 
138 |     // 0 8 3 0 3 
139 |     // TCATA 
140 |     // 0 10 0 6 12 
141 |     // GAACT 
142 |     // 3 0 12 0 13 
143 |     std::cout << "map before changes\n";
144 |     printJunctionMap(*scanner);
145 |     junctionMap->buildBranchingPaths(contigGraph);
146 |     std::cout << "built branching paths\n";
147 |     printContigGraph(contigGraph);
148 | 
149 |     printf("Destroying complex junctions.\n");
150 |     junctionMap->destroyComplexJunctions();
151 |     std::cout << "map before changes\n";
152 |     printJunctionMap(*scanner);
153 |     
154 |     printf("Building linear regions.\n");
155 |     junctionMap->buildLinearRegions(contigGraph);
156 |     printContigGraph(contigGraph);
157 | 
158 |     printf("Checking graph.\n");
159 |     contigGraph->checkGraph();
160 | }
161 | 
162 | TEST_F(juncMapData, smallDoubleJuncMap) {
163 |     setSizeKmer(7);
164 |     j = 0;
165 | 
166 |     reads = {"AAAAACAGCGATTC", "AAAAAGAGCGATTTA"};
167 |     kmers = {"AAAAACA", "AAAAAGA", "AAAACAG", "AAAAGAG", "AAACAGC", "AAAGAGC",
168 |         "AACAGCG", "AAGAGCG", "ACAGCGA","AGAGCGA","CAGCGAT", "GAGCGAT", "AGCGATT", 
169 |         "GCGATTT" ,"GCGATTC", "CGATTTA"};
170 |     addKmers(bloom, kmers);
171 | 
172 |     scanner->scanInputRead(reads[0], true);
173 |     scanner->scanInputRead(reads[1], true);
174 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
175 | 
176 |     printJunctionMap(*scanner);
177 | 
178 |     std::cout << "map before changes\n";
179 |     printJunctionMap(*scanner);
180 |     junctionMap->buildBranchingPaths(contigGraph);
181 |     std::cout << "built branching paths\n";
182 |     printContigGraph(contigGraph);
183 | 
184 |     printf("Destroying complex junctions.\n");
185 |     junctionMap->destroyComplexJunctions();
186 |     std::cout << "map before changes\n";
187 |     printJunctionMap(*scanner);
188 |     
189 |     printf("Building linear regions.\n");
190 |     junctionMap->buildLinearRegions(contigGraph);
191 |     printContigGraph(contigGraph);
192 | 
193 |     printf("Checking graph.\n");
194 |     contigGraph->checkGraph();
195 | 
196 | }
197 | 
198 | TEST_F(juncMapData, endJuncMap) {
199 |     setSizeKmer(7);
200 |     j = 1;
201 | 
202 |     reads = {"AAAAAACAGCGATTC", "AAAAAACTAAAAAA"}; // single read, first kmer is junction, should poinnt back one
203 |     kmers = {"AAAAAAC", "AAAAACA", "AAAAACT", "AAAACAG", "AAACAGC", "AACAGCG", "ACAGCGA","CAGCGAT", "AGCGATT", "GCGATTC",
204 |         "AAAACTA", "AAACTAA", "AACTAAA", "ACTAAAA", "CTAAAAA", "CTAAAAAA"};
205 |     addKmers(bloom, kmers);
206 | 
207 |     scanner->scanInputRead(reads[0], true);
208 |     scanner->scanInputRead(reads[1], true);
209 | 
210 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
211 | 
212 |     printJunctionMap(*scanner);
213 | 
214 |     std::cout << "map before changes\n";
215 |     printJunctionMap(*scanner);
216 |     junctionMap->buildBranchingPaths(contigGraph);
217 |     std::cout << "built branching paths\n";
218 |     printContigGraph(contigGraph);
219 | 
220 |     printf("Destroying complex junctions.\n");
221 |     junctionMap->destroyComplexJunctions();
222 |     std::cout << "map before changes\n";
223 |     printJunctionMap(*scanner);
224 |     
225 |     printf("Building linear regions.\n");
226 |     junctionMap->buildLinearRegions(contigGraph);
227 |     printContigGraph(contigGraph);
228 | 
229 |     printf("Checking graph.\n");
230 |     contigGraph->checkGraph();
231 | 
232 | }
233 | 
234 | 


--------------------------------------------------------------------------------
/utils/LargeInt.cpp:
--------------------------------------------------------------------------------
// Asserts are compiled out unless the build defines ASSERTS. NDEBUG must be
// defined before <assert.h> is included further down for this to take effect.
#ifndef ASSERTS
#define NDEBUG // disable asserts; those asserts make sure that with PRECISION == [1 or 2], all is correct 
#endif

// some 64-bit assert macros
// assert128(x) evaluates x only when precision == 2, and only on builds that
// define both _LP64 and _largeint; otherwise it expands to an empty statement.
#if defined(_LP64) && defined(_largeint)
#define assert128(x) assert(precision != 2 || (x));
#else
#define assert128(x) ;
#endif
 11 | 
 12 | #include <assert.h>
 13 | #include <stdio.h>
 14 | #include "LargeInt.h"
 15 | 
 16 | using namespace std;
 17 | 
// Default constructor. The body is empty: nothing is assigned to `array` here.
template<int precision>
LargeInt<precision>::LargeInt()
{
}
 22 | 
 23 | template<int precision>
 24 | LargeInt<precision>::LargeInt(const uint64_t &c)
 25 | {
 26 |     array[0] = c;
 27 |     for (int i = 1; i < precision; i++)
 28 |         array[i] = 0;
 29 | }
 30 | 
 31 | 
 32 | template<int precision>
 33 | LargeInt<precision> LargeInt<precision>::operator+ (const LargeInt<precision>& other) const
 34 | {
 35 |     LargeInt<precision> result;
 36 |     int carry = 0;
 37 |     for (int i = 0 ; i < precision ; i++)
 38 |     {
 39 |             result.array[i] = array[i] + other.array[i] + carry;
 40 |             carry = (result.array[i] < array[i]) ? 1 : 0;
 41 |     }
 42 |     
 43 |     assert(precision != 1 || (result == other.array[0] + array[0]));
 44 |     assert128(result.toInt128() == other.toInt128() + toInt128());
 45 |     return result;
 46 | }
 47 | 
 48 | template<int precision>
 49 | LargeInt<precision> LargeInt<precision>::operator- (const LargeInt<precision>& other) const
 50 | {
 51 |     LargeInt<precision> result;
 52 |     int carry = 0;
 53 |     for (int i = 0 ; i < precision ; i++)
 54 |     {
 55 |             result.array[i] = array[i] - other.array[i] - carry;
 56 |             carry = (result.array[i] > array[i]) ? 1 : 0;
 57 |     }
 58 |     
 59 |     assert(precision != 1 || (result == array[0] - other.array[0]));
 60 |     assert128(result.toInt128() == toInt128() - other.toInt128());
 61 |     return result;
 62 | }
 63 | 
 64 | 
 65 | template<int precision>
 66 | LargeInt<precision> LargeInt<precision>::operator* (const int& coeff) const
 67 | {
 68 |     LargeInt<precision> result (*this);
 69 |     // minia doesn't have that many multiplications cases
 70 | 
 71 |     if (coeff == 2 || coeff == 4) 
 72 |     {
 73 |         result = result << (coeff / 2);
 74 |     }
 75 |     else
 76 |     {
 77 |         if (coeff == 21)
 78 |         {
 79 |             result = (result << 4) + (result << 2) + result;
 80 |         }
 81 |         else
 82 |         {
 83 |             printf("unsupported LargeInt multiplication: %d\n",coeff);
 84 |             exit(1);
 85 |         }
 86 |     }
 87 | 
 88 |     assert(precision != 1 || (result == array[0] * coeff));
 89 |     assert128(result.toInt128() == toInt128() * coeff);
 90 |     return result;
 91 | }
 92 | 
 93 | 
 94 | template<int precision>
 95 | LargeInt<precision> LargeInt<precision>::operator/ (const uint32_t& divisor) const
 96 | {
 97 |     LargeInt<precision> result;
 98 |     fill( result.array, result.array + precision, 0 );
 99 | 
100 |     // inspired by Divide32() from http://subversion.assembla.com/svn/pxcode/RakNet/Source/BigInt.cpp 
101 |     
102 |     uint64_t r = 0;
103 |     uint32_t mask32bits = ~0;
104 |     for (int i = precision-1; i >= 0; --i)
105 |     {
106 |         for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i]
107 |         {
108 |             uint64_t n = (r << 32) | ((array[i] >> (32*j)) & mask32bits );
109 |             result.array[i] = result.array[i] | (((n / divisor) & mask32bits) << (32*j));
110 |             r = n % divisor;
111 |         }
112 |     }
113 |     assert(precision != 1 || (result == array[0] / divisor));
114 |     assert128(result.toInt128() == toInt128() / divisor);
115 |     return result;
116 | }
117 | 
118 | 
119 | template<int precision>
120 | uint32_t LargeInt<precision>::operator% (const uint32_t& divisor) const
121 | {
122 |     uint64_t r = 0;
123 |     uint32_t mask32bits = ~0;
124 |     for (int i = precision-1; i >= 0; --i)
125 |     {
126 |         for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i]
127 |         {
128 |             uint64_t n = (r << 32) | ((array[i] >> (32*j)) & mask32bits );
129 |             r = n % divisor;
130 |         }
131 |     }
132 | 
133 |     assert(precision != 1 || (r == array[0] % divisor));
134 |     assert128(r == toInt128() % divisor);
135 |     return (uint32_t)r;
136 | }
137 | 
138 | template<int precision>
139 | LargeInt<precision> LargeInt<precision>::operator^ (const LargeInt& other) const
140 | {
141 |     LargeInt<precision> result;
142 |     for (int i=0 ; i < precision ; i++)
143 |             result.array[i] = array[i] ^ other.array[i];
144 |     
145 |     assert(precision != 1 || (result == (array[0] ^ other.array[0])));
146 |     assert128(result.toInt128() == (toInt128() ^ other.toInt128()));
147 |     return result;
148 | }
149 | 
150 | template<int precision>
151 | LargeInt<precision> LargeInt<precision>::operator& (const LargeInt& other) const
152 | {
153 |     LargeInt<precision> result;
154 |     for (int i=0 ; i < precision ; i++)
155 |             result.array[i] = array[i] & other.array[i];
156 |     
157 |     assert(precision != 1 || (result == (array[0] & other.array[0])));
158 |     assert128(result.toInt128() == (toInt128() & other.toInt128()));
159 |     return result;
160 | }
161 | 
162 | 
163 | template<int precision>
164 | LargeInt<precision> LargeInt<precision>::operator~ () const
165 | {
166 |     LargeInt<precision> result;
167 |     for (int i=0 ; i < precision ; i++)
168 |             result.array[i] = ~array[i];
169 |     
170 |     assert(precision != 1 || (result == ~array[0]));
171 |     assert128(result.toInt128() == ~toInt128());
172 |     return result;
173 | }
174 | 
175 | template<int precision>
176 | LargeInt<precision> LargeInt<precision>::operator<< (const int& coeff) const
177 | {
178 |     LargeInt<precision> result (0);
179 | 
180 |     int large_shift = coeff / 64;
181 |     int small_shift = coeff % 64;
182 |     
183 |     for (int i = large_shift ; i < precision-1; i++)
184 |     {
185 |        result.array[i] = result.array[i] | (array[i-large_shift] << small_shift);
186 |        if (small_shift == 0) // gcc "bug".. uint64_t x; x>>64 == 1<<63, x<<64 == 1
187 |            result.array[i+1] = 0;
188 |        else
189 |            result.array[i+1] = array[i-large_shift] >> (64 - small_shift);
190 | 
191 |     }
192 |     result.array[precision-1] = result.array[precision-1] | (array[precision-1-large_shift] << small_shift);
193 |     
194 |     assert(precision != 1 || (result == (array[0] << coeff)));
195 |     assert128(result.toInt128() == (toInt128() << coeff));
196 |     return result;
197 | }
198 | 
199 | template<int precision>
200 | LargeInt<precision> LargeInt<precision>::operator>> (const int& coeff) const
201 | {
202 |     LargeInt<precision> result (0);
203 | 
204 |     int large_shift = coeff / 64;
205 |     int small_shift = coeff % 64;
206 |     
207 |     result.array[0] = (array[large_shift] >> small_shift);
208 | 
209 |     for (int i = 1 ; i < precision - large_shift ; i++)
210 |     {
211 |        result.array[i] = (array[i+large_shift] >> small_shift);
212 |        if (small_shift == 0 && large_shift > 0) // gcc "bug".. uint64_t x; x>>64 == 1<<63, x<<64 == 1
213 |        {
214 |            result.array[i-1] =  result.array[i-1];
215 |        }
216 |        else
217 |        {
218 |            result.array[i-1] =  result.array[i-1] | (array[i+large_shift] << (64 - small_shift));
219 |        }
220 |     }
221 | 
222 |     assert(precision != 1 || ( small_shift == 0 || (result == array[0] >> coeff)));
223 |     assert128(small_shift == 0 || (result.toInt128() == (toInt128() >> coeff)));
224 |     return result;
225 | }
226 | 
227 | template<int precision>
228 | bool LargeInt<precision>::operator!= (const LargeInt& c) const
229 | {
230 |     for (int i = 0 ; i < precision ; i++)
231 |             if( array[i] != c.array[i] )
232 |                 return true;
233 |     return false;
234 | }
235 | 
236 | template<int precision>
237 | bool LargeInt<precision>::operator== (const LargeInt& c) const
238 | {
239 |     for (int i = 0 ; i < precision ; i++)
240 |             if( array[i] != c.array[i] )
241 |                 return false;
242 |     return true;
243 | }
244 | 
245 | template<int precision>
246 | bool LargeInt<precision>::operator< (const LargeInt& c) const
247 | {
248 |     for (int i = precision-1 ; i>=0 ; --i)
249 |             if( array[i] != c.array[i] )
250 |                 return array[i] < c.array[i]; 
251 |     
252 |     return false;
253 | }
254 |  
255 | template<int precision>
256 | bool LargeInt<precision>::operator<=(const LargeInt& c) const
257 | {
258 |     return operator==(c) || operator<(c);
259 | }   
260 | 
261 | template<int precision>
262 | uint64_t LargeInt<precision>::toInt() const
263 | { 
264 |     return array[0];
265 | }
266 | 
267 | #ifdef _LP64
268 | template<int precision>
269 | __uint128_t LargeInt<precision>::toInt128() const
270 | { 
271 |     return ((__uint128_t)array[0]) + (((__uint128_t)array[1]) << ((__uint128_t)64));
272 | }
273 | #endif
274 | 
// Explicit instantiation for the precision selected at build time. It is only
// emitted when KMER_PRECISION is defined.
#ifdef KMER_PRECISION
template class LargeInt<KMER_PRECISION>; // since we didn't define the functions in a .h file, that trick removes linker errors, see http://www.parashift.com/c++-faq-lite/separate-template-class-defn-from-decl.html
#endif
278 | 


--------------------------------------------------------------------------------
/utils/Bloom.h:
--------------------------------------------------------------------------------
  1 | //
  2 | //  Bloom.h
  3 | //
  4 | //  Created by Guillaume Rizk on 9/02/12.
  5 | //
  6 | // Modified by Gil Goldshlager 7/15
  7 | 
  8 | #ifndef Bloom_h
  9 | #define Bloom_h
 10 | #include <stdlib.h>
 11 | #include <inttypes.h>
 12 | #include <stdint.h>
 13 | #include <set>
 14 | #include <string>
 15 | #include "Kmer.h"
 16 | #include "ReadKmer.h"
 17 | #include "JuncPairs.h"
 18 | #include <cmath>
 19 | #include <functional>
 20 | #include <algorithm>
 21 | 
 22 | 
// not using kmer_type from Kmer.h because I don't want this class to depend on Kmer.h
// NOTE(review): this header does include "Kmer.h" above, so the remark may be stale.
//
// bloom_elem is the element type stored in the filter:
//   _largeint defined -> LargeInt<KMER_PRECISION>
//   _ttmath defined   -> ttmath::UInt<KMER_PRECISION>
//   otherwise         -> kmer_type if both kmer_type and _LP64 are defined as
//                        preprocessor macros, else uint64_t
#ifdef _largeint
#include "LargeInt.h"
typedef LargeInt<KMER_PRECISION> bloom_elem;
#else
#ifdef _ttmath
#include "ttmath/ttmath.h"
typedef ttmath::UInt<KMER_PRECISION> bloom_elem;
#else
// `defined kmer_type` tests for a preprocessor macro of that name
#if (! defined kmer_type) || (! defined _LP64)
typedef uint64_t bloom_elem;
#else
typedef kmer_type bloom_elem;
#endif
#endif
#endif
 39 | 
 40 | #define NSEEDSBLOOM 10
 41 | #define CUSTOMSIZE 1
 42 | 
 43 | static const int bits_per_char = 0x08;    // 8 bits in 1 char(unsigned)
 44 | static const unsigned char bit_mask[bits_per_char] = {
 45 |     0x01,  //00000001
 46 |     0x02,  //00000010
 47 |     0x04,  //00000100
 48 |     0x08,  //00001000
 49 |     0x10,  //00010000
 50 |     0x20,  //00100000
 51 |     0x40,  //01000000
 52 |     0x80   //10000000
 53 | };
 54 | 
 55 | 
// Table of NSEEDSBLOOM fixed 64-bit constants.
// NOTE(review): the code that reads this table is not in this part of the
// header; presumably it feeds the per-hash seeds (seed_tab) - confirm.
static const uint64_t rbase[NSEEDSBLOOM] =
{
    0xAAAAAAAA55555555ULL, 
    0x33333333CCCCCCCCULL,
    0x6666666699999999ULL,
    0xB5B5B5B54B4B4B4BULL,
    0xAA55AA5555335533ULL,
    0x33CC33CCCC66CC66ULL,
    0x6699669999B599B5ULL,
    0xB54BB54B4BAA4BAAULL,
    0xAA33AA3355CC55CCULL,
    0x33663366CC99CC99ULL
};
 69 | 
 70 | 
 71 | class Bloom{
 72 |     
 73 | protected:
 74 |     
 75 | #ifdef _largeint
 76 |     inline uint64_t hash_func(LargeInt<KMER_PRECISION> elem, int num_hash);
 77 | #endif
 78 | #ifdef _ttmath
 79 |     inline uint64_t hash_func(ttmath::UInt<KMER_PRECISION> elem, int num_hash);
 80 | #endif
 81 | #ifdef _LP64
 82 |     inline uint64_t hash_func(__uint128_t key, int num_hash);
 83 | #endif
 84 |     inline uint64_t hash_func(uint64_t key, int num_hash);
 85 |     inline void generate_hash_seed(); 
 86 |     uint64_t user_seed;
 87 |     uint64_t seed_tab[NSEEDSBLOOM];
 88 |     uint64_t char_hash[2][4];
 89 |   
 90 |     uint64_t getCharHash(int key, int num_hash);
 91 |     uint64_t getLastCharHash(uint64_t key, int num_hash);
 92 | 
 93 |     int n_hash_func;
 94 |     uint64_t nchar;
 95 |     int k;
 96 | 
 97 |     //only relevant for a fake bloom
 98 |     bool fake;
 99 | 
100 |     int hashSize;
101 |     uint64_t bloomMask;
102 |     std::set<bloom_elem> valid_set;
103 |     std::set<uint64_t> valid_hash0;
104 |     std::set<uint64_t> valid_hash1;
105 | 
106 | public:
107 |     int getNumHash();
108 |     int getHashSize();
109 |     uint64_t getBloomMask();
110 | 
111 |     unsigned char * blooma;
112 | 
113 | 
114 |     /**********************************************************************************
115 |     These are the important things that are currently being used.
116 |     ***********************************************************************************/
117 |     
118 |     float weight(); //returns the proportion of 1's in the filter.  So should be between 0.0 and 1.0
119 |     
120 | 
121 |     Bloom* create_bloom_filter_2_hash(uint64_t estimated_items, float fpRate); //creates for two hash functions and given fpRate
122 | 
123 |     Bloom* create_bloom_filter_optimal(uint64_t estimated_items, float fpRate); //creates for smallest size given the fpRate
124 | 
125 |     //loads all the kmers in the reads file into the bloom filter.
126 |     //Input is assumed to be a raw string for each read, one per line.
127 |     void load_from_reads(const char* reads_filename); 
128 | 
129 |     //loads all the kmers in the kmers file into the bloom filter.
130 |     //Input is assumed to be one kmer per line as a string.
131 |     void load_from_kmers(const char* kmers_filename); 
132 |     
133 |     //The hash function minia used and we are now using.
134 |     inline uint64_t oldHash(uint64_t key, int num_hash){
135 |       uint64_t hash = seed_tab[num_hash];
136 |       hash ^= (hash <<  7) ^  key * (hash >> 3) ^ (~((hash << 11) + (key ^ (hash >> 5))));
137 |       hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1;
138 |       hash = hash ^ (hash >> 24);
139 |       hash = (hash + (hash << 3)) + (hash << 8); // hash * 265
140 |       hash = hash ^ (hash >> 14);
141 |       hash = (hash + (hash << 2)) + (hash << 4); // hash * 21
142 |       hash = hash ^ (hash >> 28);
143 |       hash = hash + (hash << 31);
144 |       return hash &= bloomMask;
145 |     }
146 | 
147 |     void addPair(JuncPair pair);
148 |     int containsPair(JuncPair pair);
149 |     
150 |     //Add an element using the old hash function
151 |     inline int oldAdd(bloom_elem elem)
152 |     {
153 |         uint64_t hA,hB;
154 | 
155 |         hA = oldHash(elem, 0);
156 |         hB = oldHash(elem, 1);
157 | 
158 |         add(hA, hB);
159 |     }
160 | 
161 |     //Check whether an element is contained using the old hash function
162 |     inline int oldContains(bloom_elem elem)
163 |     {
164 |         if(fake){
165 |             return (valid_set.find(elem) != valid_set.end());
166 |         }
167 |         uint64_t hA,hB;
168 | 
169 |         hA = oldHash(elem, 0);
170 |         hB = oldHash(elem, 1);
171 | 
172 |         return contains(hA, hB);
173 |     }
174 | 
175 | 
176 |     /**********************************************************************************
177 |     Most of the below is not currently used.  Much of it is for incremental hashing.
178 |     ***********************************************************************************/
179 | 
180 |     //rotates hash to the right by dist. Assume 0 < dist < hashSize
181 |     inline uint64_t  rotate_right(uint64_t  hash, int dist){
182 |         dist %= hashSize;
183 |         return ((hash >> dist) | (hash << (hashSize - dist))) & bloomMask;
184 |     }
185 | 
186 |     //rotates hash to the right by dist. Assume 0 < dist < hashSize
187 |     inline uint64_t  rotate_left(uint64_t  hash, int dist){
188 |         dist %= hashSize;
189 |         return ((hash << dist) | (hash >> (hashSize - dist))) & bloomMask;
190 |     }
191 | 
192 |     //only for num_hash = 0 or 1
193 |     uint64_t get_rolling_hash(uint64_t key, int num_hash);
194 | 
195 |     inline uint64_t roll_hash(uint64_t oldHash, int oldC, int newC, int num_hash){
196 |       return rotate_left(oldHash ^ getCharHash(oldC, num_hash), 1) ^ rotate_right(getLastCharHash(newC, num_hash), k-1);
197 |     }
198 |     
199 |     inline void advance_hash(char* read, uint64_t * hash0, uint64_t * hash1, int startPos, int endPos){
200 |         for(int i = startPos; i < endPos; i++){
201 |             *hash0 = roll_hash(*hash0, NT2int(read[i]), NT2int(read[i+sizeKmer]), 0);
202 |             *hash1 = roll_hash(*hash1, NT2int(read[i]), NT2int(read[i+sizeKmer]), 1);
203 |         }
204 |     }
205 | 
206 |     inline void add(bloom_elem elem)
207 |     {
208 |         uint64_t hA,hB;
209 | 
210 |         hA = get_rolling_hash(elem, 0);
211 |         hB = get_rolling_hash(elem, 1);
212 | 
213 |         add(hA, hB);    
214 |     }
215 | 
216 | 
217 |     inline void add(uint64_t h0, uint64_t h1)
218 |     {
219 |         uint64_t h = h0;
220 |         for(int i=0; i<n_hash_func; i++, h += h1)
221 |         {
222 |             h %= tai;
223 |             //printf("Setting array position %lli\n", h);
224 |             blooma [h >> 3] |= bit_mask[h & 7];
225 |         }
226 |     }
227 | 
228 | 
229 |     inline int contains(bloom_elem elem)
230 |     {
231 |         if(fake){
232 |             return (valid_set.find(elem) != valid_set.end());
233 |         }
234 |         uint64_t hA,hB;
235 | 
236 |         hA = get_rolling_hash(elem, 0);
237 |         hB = get_rolling_hash(elem, 1);
238 | 
239 |         return contains(hA, hB);
240 |     }
241 | 
242 |     inline int contains(uint64_t h0, uint64_t h1)
243 |     { 
244 |       if(fake){
245 |         return (valid_hash0.find(h0) != valid_hash0.end()) 
246 |           && (valid_hash1.find(h1) != valid_hash1.end());
247 |       }
248 |         uint64_t h = h0 % tai;
249 |         for(int i=0; i<n_hash_func; i++, h = (h+h1)%tai)
250 |         {
251 |             //printf("Checking array position %lli\n", h);
252 |             if ((blooma[h >> 3 ] & bit_mask[h & 7]) != bit_mask[h & 7]){
253 |                 return 0;
254 |             }
255 |         }
256 |         return 1;
257 |         
258 |     }
259 | 
260 | 
261 |     /**********************************************************************************
262 |     This is either very basic functions or for testing.
263 |     ***********************************************************************************/
264 | 
265 |     //makes this a fake bloom filter that returns true only on specified kmers
266 |     void fakify();
267 |     // Add a set of kmers to the fake bloom's list, so that they will return true
268 |     void addFakeKmers(std::set<bloom_elem> valid_kmers);
269 | 
270 |     void setSeed(uint64_t seed) ;
271 | 
272 |     void set_number_of_hash_func(int i) ;
273 |     
274 |     /*void add(bloom_elem elem);
275 |     int  contains(bloom_elem elem);
276 |     void add(uint64_t hash0, uint64_t hash1);
277 |     int contains(uint64_t hash0, uint64_t hash1);*/
278 |     
279 |     uint64_t tai;
280 |     uint64_t nb_elem;
281 |     
282 |     void dump(char * filename);
283 |     void load(char * filename);
284 | 
285 |     Bloom(uint64_t tai_bloom, int k);
286 |     Bloom(int tai_bloom);
287 |     Bloom(uint64_t tai_bloom);
288 | 
289 |     Bloom();
290 |     
291 |     ~Bloom();
292 | };
293 | 
294 | void load_two_filters(Bloom* bloo1, Bloom* bloo2, std::string reads_filename, bool fastq, bool mercy); //if fastq, use fastq. Else use fasta
295 | void load_single_filter(Bloom* bloo1, string reads_filename, bool fastq);
296 | double brents_fun(std::function<double (double)> f, double lower, double upper, double tol, unsigned int max_iter);
297 | bool isJunction(ReadKmer readKmer, Bloom* bloom, bool dir);
298 | 
299 | #endif
300 | 
301 | 


--------------------------------------------------------------------------------
/src/newTests/ReadscanTest.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <map>
  3 | #include "gtest/gtest.h"
  4 | #include "../ReadScanner.h"
  5 | #include "../../utils/Bloom.h"
  6 | #include "../../utils/JunctionMap.h"
  7 | using std::unordered_map;
  8 | // #include "../../utils/sparsepp.h"
  9 | // using spp::std::unordered_map;
 10 | 
 11 | 
 12 | class readScan : public ::testing::Test {
 13 | 
 14 | protected:
 15 |     std::vector<string> reads;
 16 |     std::vector<string> kmers;
 17 | 
 18 |     Bloom* bloom;
 19 |     ReadScanner* scanner;
 20 |     int j;
 21 |     int read_length;
 22 |     int estimated_kmers;
 23 |     int maxSpacerDist;
 24 |     double fpRate;
 25 | 
 26 |     JChecker* jchecker;
 27 |     JunctionMap* junctionMap;
 28 |     Bloom* short_pair_filter;
 29 |     Bloom* long_pair_filter;
 30 | 
 31 |     // Build a kmer out of a string input
 32 |     kmer_type getKmerFromString(string kmerString){
 33 |         kmer_type kmer;
 34 |         getFirstKmerFromRead(&kmer, &(kmerString[0]));
 35 |         return kmer;
 36 |     }
 37 | 
 38 |     // Add the kmers from a vector of strings to a fake bloom filter
 39 |     std::set<kmer_type> addKmers(Bloom* bloom, std::vector<string> kmers) {
 40 |         std::set<kmer_type> valids {};
 41 |         for (string kmer : kmers) {
 42 |             valids.insert(get_canon(getKmerFromString(kmer)));
 43 |         }
 44 | 
 45 |         bloom->addFakeKmers(valids);
 46 |         valids.clear();
 47 |     }
 48 | 
 49 |     // Create a bloom filter, but make it a fake one
 50 |     Bloom* createBloom(){
 51 |         Bloom* fakeBloom =  fakeBloom->create_bloom_filter_optimal(estimated_kmers, fpRate);
 52 |         fakeBloom->fakify();
 53 |         return fakeBloom;
 54 |     }
 55 | 
 56 |     // This method should be used and modified to print whatever we want ot check about the resulting junction map
 57 |     void printJunctionMap(ReadScanner scanner) {
 58 |         auto map = scanner.getJunctionMap()->junctionMap;
 59 |         printf("Size: %d \n", map.size());
 60 |         for (auto& kv : map){
 61 |             printf("%s \n", print_kmer(kv.first));
 62 |             printf("%d %d %d %d %d \n",
 63 |                 kv.second.dist[0], kv.second.dist[1], kv.second.dist[2], kv.second.dist[3], kv.second.dist[4]);
 64 |         }
 65 |     }
 66 | 
 67 |     // set up blooms, junction map, jchecker, readscanner for testing
 68 |     readScan() {
 69 |         j = 0;
 70 |         read_length = 30;
 71 |         estimated_kmers = 35;
 72 |         maxSpacerDist = 8;
 73 |         fpRate = .1;
 74 |         kmers = {};
 75 |         reads = {};
 76 | 
 77 |         bloom = createBloom(); 
 78 |         jchecker = new JChecker(j, bloom);
 79 | 
 80 |         junctionMap = new JunctionMap(bloom, jchecker, read_length);
 81 |         string read_scan_file = "mock_file";
 82 | 
 83 |         short_pair_filter = short_pair_filter->create_bloom_filter_optimal(estimated_kmers/9, fpRate);
 84 |         long_pair_filter = long_pair_filter->create_bloom_filter_optimal(estimated_kmers/6, fpRate);
 85 | 
 86 |         scanner = new ReadScanner(junctionMap, read_scan_file, bloom, short_pair_filter, long_pair_filter, jchecker, maxSpacerDist);
 87 |         printf("Done initializing!\n");
 88 |     }
 89 |     ~readScan(){
 90 |         reads.clear();
 91 |         kmers.clear();
 92 |         delete jchecker;
 93 |         delete short_pair_filter;
 94 |         delete long_pair_filter;
 95 |         delete bloom;
 96 |         delete junctionMap;
 97 |         delete scanner;
 98 |     }
 99 | };
100 | 
101 | // This test adds one read, and adds the reads kmers to the bloom filter, scans and prints the junction map 
102 | TEST_F(readScan, singleReadNoJunctions) {
103 |     setSizeKmer(5);
104 | 
105 |     reads = {"ACGGGCGAACTTTCATAGGA"};
106 |     kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT",
107 |         "ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA"};
108 | 
109 |     addKmers(bloom, kmers);
110 | 
111 |     scanner->scanInputRead(reads[0], true);
112 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
113 |     // Expected junctions & distances
114 |     // TCCTA 
115 |     // 0 0 15 0 1 
116 |     // AACTT 
117 |     // 0 0 15 0 15 
118 |     //assert junction k-mers in map
119 |     ASSERT_EQ(map.count(getKmerFromRead("TCCTA", 0)),1);
120 |     ASSERT_EQ(map.count(getKmerFromRead("AACTT", 0)),1);
121 |     // only these junction in map
122 |     ASSERT_EQ(map.size(),2);
123 | 
124 |     for (auto& kv : map){
125 |         // assert distances are correct
126 |         if (print_kmer(kv.first)=="TCCTA"){
127 |             ASSERT_EQ(kv.second.dist[2],15);
128 |             ASSERT_EQ(kv.second.dist[4],1);            
129 |         }
130 |         if (print_kmer(kv.first)=="AACTT"){
131 |             ASSERT_EQ(kv.second.dist[2],15);
132 |             ASSERT_EQ(kv.second.dist[4],15);    
133 |         }
134 |     }
135 | }
136 | 
137 | TEST_F(readScan, singleReadOneFakeJunction) {
138 |     setSizeKmer(5);
139 | 
140 |     // added k-mers in BF "AACTC", "ACTCC" create fake junction and branch of length 2
141 |     reads = {"ACGGGCGAACTTTCATAGGA"};
142 |     kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT","AACTC","ACTCC",
143 |         "ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA"};
144 | 
145 |     addKmers(bloom, kmers);
146 | 
147 |     scanner->scanInputRead(reads[0], true);
148 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
149 |     // Expected junctions & distances
150 |     // CCTAT 
151 |     // 0 0 0 15 3 
152 |     // GAACT 
153 |     // 0 0 15 0 13 
154 |     //assert junction k-mers in map
155 |     ASSERT_EQ(map.count(getKmerFromRead("CCTAT", 0)),1);
156 |     ASSERT_EQ(map.count(getKmerFromRead("GAACT", 0)),1);
157 |     // only these junction in map
158 |     ASSERT_EQ(map.size(),2);
159 |     for (auto& kv : map){
160 |         // assert distances are correct
161 |         if (print_kmer(kv.first)=="CCTAT"){
162 |             ASSERT_EQ(kv.second.dist[3],15);
163 |             ASSERT_EQ(kv.second.dist[4],3);            
164 |         }
165 |         if (print_kmer(kv.first)=="GAACT"){
166 |             ASSERT_EQ(kv.second.dist[2],15);
167 |             ASSERT_EQ(kv.second.dist[4],13);    
168 |         }
169 |     }
170 | }
171 | 
172 | // Long read, no junctions
173 | TEST_F(readScan, LongReadNoJunctions) {
174 |     setSizeKmer(5);
175 |     reads = {"ACGGGCGAACTTTCATAGGATCGCACTCAC"};
176 |     kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT",
177 |         "ACTTT","CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA",
178 |         "AGGAT", "GGATC", "GATCG", "ATCGC", "TCGCA", "CGCAC", "GCACT", "GCACT", 
179 |         "CACTC", "ACTCA", "CTCAC"};
180 |         
181 |     addKmers(bloom, kmers);
182 | 
183 |     scanner->scanInputRead(reads[0], true);
184 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
185 |     // Expected junctions & distances
186 |     // ATCGC 
187 |     // 1 0 0 0 3 
188 |     // TGCGA 
189 |     // 0 0 1 0 11 
190 |     // CGATC 
191 |     // 0 1 0 0 3 
192 |     // GGATC 
193 |     // 0 0 0 1 12 
194 |     // TTCAT 
195 |     // 12 0 0 0 15 
196 |     // TTCGC 
197 |     // 0 1 0 0 15 
198 |     // GGCGA 
199 |     // 1 0 0 0 7 
200 | 
201 |     //assert some of junction k-mers in map
202 |     ASSERT_EQ(map.count(getKmerFromRead("ATCGC", 0)),1);
203 |     ASSERT_EQ(map.count(getKmerFromRead("GGATC", 0)),1);
204 |     // only map is correct size
205 |     ASSERT_EQ(map.size(),7);
206 |     for (auto& kv : map){
207 |         // assert distances are correct
208 |         if (print_kmer(kv.first)=="GGATC"){
209 |             ASSERT_EQ(kv.second.dist[3],1);
210 |             ASSERT_EQ(kv.second.dist[4],12);            
211 |         }
212 |         if (print_kmer(kv.first)=="TTCGC"){
213 |             ASSERT_EQ(kv.second.dist[1],1);
214 |             ASSERT_EQ(kv.second.dist[4],15);    
215 |         }
216 |     }
217 | }
218 | 
219 | 
220 | // additional tests wanted:
221 | // edge case: read that's a tandem repeat, no junctions
222 | // read with k-mer missing - see getValidReads mechanism works
223 | 
224 | 
225 | // Same thing but with three reads
226 | TEST_F(readScan, buildFullMap) {
227 |     setSizeKmer(5);
228 |     reads = {"ACGGGCGAACTTTCATAGGA", "GGCGAACTAGTCCAT", "AACTTTCATACGATT"};
229 |     kmers = {"ACGGG","CGGGC","GGGCG","GGCGA","GCGAA","CGAAC","GAACT","AACTT","ACTTT",
230 |         "CTTTC","TTTCA","TTCAT","TCATA","CATAG","ATAGG","TAGGA","GGCGA", "GCGAA", "CGAAC", 
231 |         "GAACT", "AACTA","ACTAG", "CTAGT", "TAGTC", "AGTCC","GTCCA", "TCCAT","AACTT", 
232 |         "ACTTT", "CTTTC", "TTTCA", "TTCAT", "TCATA", "CATAC", "ATACG", "TACGA", "ACGAT","CGATT"};
233 |     addKmers(bloom, kmers);
234 | 
235 |     scanner->scanInputRead(reads[0], true);
236 |     scanner->scanInputRead(reads[1], true);
237 |     scanner->scanInputRead(reads[2], true);
238 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
239 | 
240 |     // Expected junctions & distances
241 |     // CTAGT 
242 |     // 0 8 3 0 3 
243 |     // TCATA 
244 |     // 0 10 0 6 12 
245 |     // GAACT 
246 |     // 3 0 12 0 13 
247 |     //assert some of junction k-mers in map
248 |     ASSERT_EQ(map.count(getKmerFromRead("CTAGT", 0)),1);
249 |     ASSERT_EQ(map.count(getKmerFromRead("GAACT", 0)),1);
250 |     // only map is correct size
251 |     ASSERT_EQ(map.size(),3);
252 |     for (auto& kv : map){
253 |         // assert distances are correct
254 |         if (print_kmer(kv.first)=="GAACT"){
255 |             ASSERT_EQ(kv.second.dist[0],3);
256 |             ASSERT_EQ(kv.second.dist[4],13);            
257 |         }
258 |         if (print_kmer(kv.first)=="CTAGT"){
259 |             ASSERT_EQ(kv.second.dist[1],8);
260 |             ASSERT_EQ(kv.second.dist[4],3);    
261 |         }
262 |     }
263 |     // printJunctionMap(*scanner);
264 | }
265 | 
266 | TEST_F(readScan, smallDblJuncMap) {
267 |     setSizeKmer(7);
268 |     // j = 1;
269 | 
270 |     reads = {"AAAAACAGCGATTC", "AAAAAGAGCGATTTA"};
271 |     kmers = {"AAAAACA", "AAAAAGA", "AAAACAG", "AAAAGAG", "AAACAGC", "AAAGAGC",
272 |         "AACAGCG", "AAGAGCG", "ACAGCGA","AGAGCGA","CAGCGAT", "GAGCGAT", "AGCGATT", 
273 |         "GCGATTT" ,"GCGATTC", "CGATTTA"};
274 |     addKmers(bloom, kmers);
275 | 
276 |     scanner->scanInputRead(reads[0], true);
277 |     scanner->scanInputRead(reads[1], true);
278 |     std::unordered_map<kmer_type, Junction> map = scanner->getJunctionMap()->junctionMap;
279 | 
280 |     printJunctionMap(*scanner);
281 | }
282 | 
283 | // add separate to JunctionMapTest
284 | // test building of map, then removal of complex junctions - closer to 
285 | // then
286 | 
287 | // int main(int ac, char* av[])
288 | // {
289 | //   testing::InitGoogleTest(&ac, av);
290 | //   return RUN_ALL_TESTS();
291 | // }
292 | 


--------------------------------------------------------------------------------
/src/ContigNode.cpp:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include "ContigNode.h"
  3 | #include <time.h>
  4 | using std::ofstream;
  5 | using std::stringstream;
  6 | #include <sstream> //for std::stringstream 
  7 | #include <string>  //for std::string
  8 | #include <unordered_set>
  9 | 
 10 | 
 11 | 
 12 | 
 13 | ContigNode::ContigNode(Junction junction){
 14 |     for(int i  = 0; i < 4; i++){
 15 |         cov[i] = junction.getCoverage(i);
 16 |         contigs[i] = nullptr;
 17 |     }
 18 |     contigs[4] = nullptr;
 19 | }
 20 | 
 21 | ContigNode::ContigNode(){
 22 |     for(int i  = 0; i < 5; i++){
 23 |         cov[i] = 0;
 24 |         contigs[i] = nullptr;
 25 |     }   
 26 | }
 27 |     
 28 | bool ContigNode::isInvertedRepeatNode(){
 29 |     std::vector<int> inds = this->getIndicesOut();
 30 |     std::unordered_set<Contig *> seenContigs = {};
 31 |     for (auto i : inds){
 32 |         if(seenContigs.find(contigs[i]) == seenContigs.end()){
 33 |             seenContigs.insert(contigs[i]);
 34 |         }
 35 |         else{
 36 |             return true;
 37 |         }
 38 |     }
 39 |     return false;
 40 | }
 41 | 
 42 | std::list<JuncResult> ContigNode::getPairCandidates(int index, int maxDist) {
 43 |     // std::cout << "43\n";
 44 | 
 45 |     std::unordered_set<kmer_type> seenKmers = {};
 46 |     std::vector<NodeQueueEntry> queue(32);
 47 |     // queue.reserve(100);
 48 |     int pos = 0;
 49 |     // queue.push_back(NodeQueueEntry(this, index, 0));
 50 |     queue.at(pos) = NodeQueueEntry(this, index, 0);
 51 |     std::list<JuncResult> results = {};
 52 | 
 53 |     while (queue.at(pos).node != nullptr){
 54 |         // std::cout << "51, queue size is "<< queue.size() << ", pos is "<< pos <<"\n";
 55 |         NodeQueueEntry entry = queue.at(pos);
 56 |         pos++;    
 57 |         kmer_type unique_kmer;
 58 |         // std::cout << "57\n";
 59 |         if (!entry.node->contigs[entry.index]){
 60 |             // std::cout << "60\n";
 61 |             continue; // don't advance if at dead end
 62 |         }else {
 63 |             // record unique kmer to avoid cycles
 64 |             unique_kmer = entry.node->getUniqueKmer(entry.index);
 65 |             // std::cout << "64\n";
 66 |         }
 67 |         if(seenKmers.find(unique_kmer) == seenKmers.end()){
 68 |             seenKmers.insert(unique_kmer);
 69 |             // std::cout << "68\n";
 70 |             if(entry.startDist <= maxDist){
 71 |                 // std::cout << "70\n";
 72 |                 std::list<JuncResult> newResults = entry.getJuncResults(maxDist);
 73 |                 results.insert(results.end(), newResults.begin(), newResults.end());
 74 |                 // std::cout << "results size " << results.size()<<"\n";
 75 |                 entry.addNeighbors(queue);
 76 |             }
 77 |         }
 78 |         // std::cout << "77, queue size is "<< queue.size() << ", pos is "<< pos <<"\n";
 79 |         if (pos > queue.size() - 1) break;
 80 |     }
 81 |     // std::cout << "final results size is " << results.size() << "\n";
 82 |     results.sort();
 83 | 
 84 |     return results;
 85 | }
 86 | 
 87 | 
 88 | 
 89 | std::list<Contig*> ContigNode::doPathsConvergeNearby(int max_ind, int min_ind, int max_dist){
 90 |     /*
 91 |         BFSearches up to max_dist away from the node to verify extensions out of node
 92 |         converge to the same node. Returns list of Contig ptrs on Q when paths do converge,
 93 |         otherwise returns empty list. 
 94 |     */
 95 |     // std::cout << "94\n";
 96 | 
 97 |     ContigNode* target = contigs[max_ind]->otherEndNode(this);
 98 |     std::unordered_set<kmer_type> seenKmers = {};   
 99 |     std::list<Contig*> path;    
100 |     std::vector<NodeQueueEntry> queue(32);
101 |     // queue.reserve(100);
102 |     // queue.push_back(NodeQueueEntry(this, min_ind, 0));
103 |     int pos = 0;
104 |     queue.at(pos) = NodeQueueEntry(this, min_ind, 0);
105 | 
106 |     while (queue.at(pos).node != nullptr){
107 |         // std::cout << "105\n";
108 | 
109 |         NodeQueueEntry entry = queue.at(pos);
110 |         pos++;
111 | 
112 |         kmer_type unique_kmer;
113 |         if (!entry.node->contigs[entry.index]){
114 |             // std::cout << "112\n";
115 | 
116 |             continue; // don't advance if at dead end
117 |         }else {
118 |             // std::cout << "116\n";
119 |             // record unique kmer to avoid cycles
120 |             unique_kmer = entry.node->getUniqueKmer(entry.index);
121 |         }
122 |         if(seenKmers.find(unique_kmer) == seenKmers.end()){        
123 |             seenKmers.insert(unique_kmer);
124 |             if (entry.startDist > max_dist){
125 |                 // std::cout << "123\n";
126 |                 if (pos > queue.size() - 1) break;
127 |                 else continue;
128 |             }
129 |             else if (entry.node->contigs[entry.index]->otherEndNode(entry.node)==target){
130 |                 // reconstruct path from parents
131 |                 // std::cout << "128\n";
132 |                 path = entry.reconstructPathFromParents(queue);
133 |                 return path; 
134 |             }
135 |             else{
136 |                 // std::cout << "133\n";
137 |                 entry.addNeighbors(queue); 
138 |             }            
139 |         }
140 |         // std::cout << "137, queue size is "<< queue.size() << ", pos is "<< pos <<"\n";
141 |         if (pos > queue.size() - 1) break;
142 | 
143 |    }
144 |    // never reached target - return empty list
145 |    return {};
146 | }
147 | 
148 | 
149 | bool ContigNode::checkValidity(){
150 |     for(int i = 0; i < 5; i++){
151 |         if(contigs[i]){
152 |             Contig* contig = contigs[i];
153 |             int side = contig->getSide(this, i);
154 |             if(side == 1){
155 |                 if(contig->ind1 != i){
156 |                     printf("GRAPHERROR: contig has wrong index.\n");
157 |                     return false;
158 |                 }
159 |                 if(contig->node1_p != this){
160 |                     printf("GRAPHERROR: contig points to wrong node.\n");
161 |                     return false;
162 |                 }
163 |             }
164 |             if(side == 2){
165 |                 if(contig->ind2 != i){
166 |                     printf("GRAPHERROR: contig has wrong index.\n");
167 |                     return false;
168 |                 }
169 |                 if(contig->node2_p != this){
170 |                     printf("GRAPHERROR: contig points to wrong node.\n");
171 |                     return false;
172 |                 }
173 |             }
174 |         }
175 |     }
176 |     return true;
177 | }
178 | 
179 | std::vector<std::pair<Contig*, bool>> ContigNode::getFastGNeighbors(int contigIndex){
180 |     std::vector<std::pair<Contig*, bool>> result = {};
181 |     if(contigIndex == 4){
182 |         for(int i = 0; i < 4; i++){
183 |             if(contigs[i]){
184 |                 bool RC = false;
185 |                 if(contigs[i]->getSide(this,i) == 2) {
186 |                     RC = true;
187 |                 }
188 |                 result.push_back(std::pair<Contig*, bool>(contigs[i], RC));
189 |             }
190 |         }
191 |     }
192 |     else{
193 |         if(contigs[4]){
194 |             bool RC = false;
195 |             if(contigs[4]->getSide(this,4) == 2) {
196 |                 RC = true;
197 |             }
198 |             result.push_back(std::pair<Contig*, bool>(contigs[4], RC));
199 |         }
200 |     }
201 |     return result;
202 | }
203 | 
204 | kmer_type ContigNode::getForwardExtension(int index){
205 |     return next_kmer(getKmer(), index, FORWARD);
206 | }
207 | 
208 | kmer_type ContigNode::getUniqueKmer(int index){
209 |     if(index != 4){
210 |         return getForwardExtension(index);
211 |     }
212 |     else{
213 |         return getKmer();
214 |     }
215 | }
216 | 
217 | int ContigNode::numPathsOut(){
218 |     int numPaths = 0;
219 |     for(int i = 0; i < 4; i++){
220 |         if(cov[i] > 0){
221 |             numPaths++;
222 |         }
223 |     }
224 |     return numPaths;
225 | }
226 | 
227 | std::vector<int> ContigNode::getIndicesOut(){
228 |     std::vector<int> paths = {};
229 |     for(int i = 0; i < 4; i++){
230 |         if(cov[i] > 0){
231 |             paths.push_back(i);
232 |         }
233 |     }
234 |     return paths;
235 | }
236 | 
237 | int ContigNode::getTotalCoverage(){
238 |     return getCoverage(4);
239 | }
240 | 
241 | int ContigNode::getCoverage(int nucExt){
242 |     if(nucExt < 4){
243 |         return (int)cov[nucExt];
244 |     }
245 |     return (int)cov[0] + (int)cov[1] + (int)cov[2] + (int)cov[3];
246 | }
247 | 
248 | void ContigNode::setCoverage(Junction junc){
249 |     for(int i = 0; i < 4; i++){
250 |         cov[i] = junc.getCoverage(i);
251 |     }
252 | }
253 | 
254 | void ContigNode::setCoverage(int nucExt, int coverage){
255 |     cov[nucExt] = coverage;
256 | }
257 | 
258 | void ContigNode::replaceContig(Contig* oldContig, Contig* newContig){
259 |      for(int i = 0; i < 5; i++){
260 |         if(contigs[i] == oldContig){
261 |             contigs[i] = newContig;
262 |         }
263 |     }
264 | }
265 | 
266 | int ContigNode::indexOf(Contig* contig){
267 |     for(int i = 0; i < 5; i++){
268 |         if(contigs[i] == contig){
269 |             return i;
270 |         }
271 |     }
272 |     throw std::logic_error("ERROR: tried to find index of contig that's not present.");
273 |     // return 5;
274 | }
275 | 
276 | void ContigNode::update(int nucExt, Contig* contig){
277 |     contigs[nucExt] = contig;
278 | }
279 | 
280 | void ContigNode::breakPath(int nucExt){
281 |     cov[nucExt] = 0;
282 |     contigs[nucExt] = nullptr;
283 | }
284 | 
285 | void ContigNode::clearNode(){
286 |     for (int i=0; i<5; i++){
287 |         this->breakPath(i);
288 |     }
289 | }
290 | 
291 | kmer_type ContigNode::getKmer(){
292 |     for(int i = 4; i >= 0; i--){
293 |         if(contigs[i]){
294 |             return contigs[i]->getNodeKmer(this);
295 |         }
296 |     }
297 |    // intentionally don't return 0 here because that could be a valid kmer value
298 |     throw std::logic_error("No valid contigs from which to getKmer()");
299 | }
300 | 
301 | ContigNode* ContigNode::getNeighbor(int index){
302 |     if(contigs[index]){
303 |         return contigs[index]->otherEndNode(this);
304 |     }
305 |     return nullptr;
306 | }
307 | 
308 | std::string ContigNode::getString(){
309 |     std::stringstream result;
310 |     for(int i = 0; i < 5; i++){
311 |         result <<  (int)getCoverage(i) << " ";
312 |        result << contigs[i] << " ";
313 |     }
314 |     return result.str();
315 | }
316 | 
317 | 
318 | NodeQueueEntry::NodeQueueEntry(ContigNode* n, int i, int s){
319 |     node = n;
320 |     index = i;
321 |     startDist = s;
322 | }  
323 | 
324 | NodeQueueEntry::NodeQueueEntry(){
325 |     node = nullptr;
326 |     index = -1;
327 |     startDist = -1;
328 | }
329 | 
330 | std::list<JuncResult> NodeQueueEntry::getJuncResults(int maxDist){
331 |     Contig* contig = node->contigs[index];
332 |      return contig->getJuncResults(contig->getSide(node, index),startDist, maxDist);
333 | }
334 | 
335 | void NodeQueueEntry::addNeighbors(std::vector<NodeQueueEntry>& queue){
336 |     Contig* contig = node->contigs[index];
337 |     // if (node->contigs[index]){
338 |     //     printf("no contig at this index!\n");
339 |     // }
340 |     int otherSide = 3 - contig->getSide(node,index);    
341 |     ContigNode* nextNode = contig->getNode(otherSide);
342 |     int nextIndex = contig->getIndex(otherSide);
343 |     
344 |     // std::cout << "328\n";
345 |     int lastNonEmptyPos = 0;
346 |     // std::cout << "329, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n";
347 |     while(queue.at(lastNonEmptyPos).node){ 
348 |         lastNonEmptyPos++; 
349 |         if (lastNonEmptyPos == queue.size()) break;
350 |     }    
351 |     // std::cout << "331, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n";
352 | 
353 |     if(nextNode){
354 |         if(nextIndex != 4){
355 |             if(nextNode->contigs[4]){
356 |                 if (lastNonEmptyPos == queue.size()){
357 |                     queue.push_back(NodeQueueEntry(nextNode, 4, startDist + contig->getTotalDistance())); 
358 |                     // std::cout << "334, queue size is "<< queue.size() <<"\n";
359 |                 } else {
360 |                     queue.at(lastNonEmptyPos) = NodeQueueEntry(nextNode, 4, startDist + contig->getTotalDistance());
361 |                 //     queue.push_pack(NodeQueueEntry(nextNode, 4, startDist + contig->getTotalDistance()));
362 |                     // std::cout << "338, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n";
363 |                 }
364 | 
365 |             }
366 |         }
367 |         else{
368 |             for (int i = 0; i < 4; i++){
369 |                 if(nextNode->contigs[i]){
370 |                     if (lastNonEmptyPos == queue.size()){
371 |                         queue.push_back(NodeQueueEntry(nextNode, i, startDist + contig->getTotalDistance()));
372 |                         // std::cout << "348, queue size is "<< queue.size() <<"\n";
373 |                     } else{
374 |                         queue.at(lastNonEmptyPos) = NodeQueueEntry(nextNode, i, startDist + contig->getTotalDistance());
375 |                     //     queue.push_back(NodeQueueEntry(nextNode, i, startDist + contig->getTotalDistance()));                        
376 |                         // std::cout << "351, queue size is "<< queue.size() << ", lastNonEmptyPos is "<< lastNonEmptyPos <<"\n";
377 |                     }
378 |                     lastNonEmptyPos++;
379 |                 }
380 |             }
381 |         }
382 |     }
383 |     
384 | }
385 | 
386 | // use stack of parents to reconstruct path: start from target, get other end node
387 | std::list<Contig*> NodeQueueEntry::reconstructPathFromParents(std::vector<NodeQueueEntry>& parents){
388 |     std::list<Contig*> path = {};
389 |     path.push_front(node->contigs[index]); // this is the target
390 |     NodeQueueEntry *currEntry = this;
391 | 
392 |     // move along parents vector in reverse order
393 |     // query for other end node using entry's contig index
394 |     // when other end node is current entry's node, 
395 |     // make its entry the current entry, add contig to front of path
396 |     // std::cout << "in reconstructPathFromParents\n";
397 |     for (auto it = parents.rbegin(); it != parents.rend(); ++it){
398 |         if (!it->node) continue;
399 |         if (it->node->contigs[it->index]->otherEndNode(it->node) == currEntry->node){
400 |             path.push_front(it->node->contigs[it->index]);
401 |             currEntry = &(*it);
402 |         }
403 |     }
404 |     return path;
405 | }
406 | 
407 | 
408 | 
409 | 


--------------------------------------------------------------------------------
/src/Contig.cpp:
--------------------------------------------------------------------------------
  1 | #include "Contig.h"
  2 | #include <fstream>
  3 | #include <sstream>
  4 | #include <algorithm>    // std::reverse
  5 | #include <vector>       // std::vector
  6 | #include <assert.h>
  7 | #include <math.h>
  8 | 
  9 | 
 10 | using std::stringstream;
 11 | using std::ofstream;
 12 | 
 13 | // we ignore effects due to Bloom filter FPs when querying for pairs
 14 | std::pair<double, double> Contig::getPairsMeanStd(Bloom* pair_filter){
 15 | 	std::list<JuncResult> results = getJuncResults(1, 0, std::min(length(),2000));
 16 | 	
 17 | 	int pairs_sum = 0;
 18 | 	int pairs_count = 0;
 19 | 
 20 | 	for(auto itL = results.begin(); itL != results.end(); itL++){
 21 | 		for(auto itR = itL; itR != results.end(); itR++){
 22 | 			if(pair_filter->containsPair(JuncPair(itL->kmer, itR->kmer))){
 23 | 				pairs_count++;
 24 | 				pairs_sum += itR->distance - itL->distance;
 25 | 			}
 26 | 		}
 27 | 	}
 28 | 	if (pairs_sum==0 || pairs_count==0) return std::make_pair(0,0);
 29 | 	double mean = pairs_sum/ (double) pairs_count;
 30 | 	double sum_sqrs = 0;
 31 | 	for(auto itL = results.begin(); itL != results.end(); itL++){
 32 | 		for(auto itR = itL; itR != results.end(); itR++){
 33 | 			if(pair_filter->containsPair(JuncPair(itL->kmer, itR->kmer))){
 34 | 				sum_sqrs += pow((itR->distance - itL->distance) - mean, 2);
 35 | 			}
 36 | 		}
 37 | 	}
 38 | 	double std = pow(sum_sqrs/ (pairs_count - 1.5), 0.5);
 39 | 	std::cout << "pairs_sum is " << pairs_sum << ", pairs_count is " << pairs_count << std::endl;
 40 | 	std::pair <int, int> mean_std = std::make_pair(mean, std);
 41 | 	return mean_std;
 42 | }
 43 | 
 44 | 
 45 | //Looks at all junction pairs on this contig, and prints a histogram of how many BF positives and BF negatives there are
 46 | //for pairs at different distances.
 47 | void Contig::printPairStatistics(Bloom* pair_filter){
 48 | 	std::list<JuncResult> results = getJuncResults(1, 0, 3*length());
 49 | 	std::cout << "Length " << length() << ", results " << results.size() << "\n";
 50 | 	const int maxDist = 2000;
 51 | 	const int increment = 20;
 52 | 	int posNegPairCounts [2][maxDist/increment] = {};
 53 | 
 54 | 	for(int i = 0; i < maxDist/increment; i++){
 55 | 		posNegPairCounts[0][i] = 0;
 56 | 		posNegPairCounts[1][i] = 0;
 57 | 	}
 58 | 
 59 | 	for(auto itL = results.begin(); itL != results.end(); itL++){
 60 | 		for(auto itR = itL; itR != results.end(); itR++){
 61 | 			int index = (itR->distance - itL->distance)/increment;
 62 | 			if(index < maxDist/increment && index >= 0){
 63 | 				if(pair_filter->containsPair(JuncPair(itL->kmer, itR->kmer))){
 64 | 					posNegPairCounts[0][index] += 1;
 65 | 				}
 66 | 				else{
 67 | 					posNegPairCounts[1][index] += 1;
 68 | 				}
 69 | 			}
 70 | 		}
 71 | 	}
 72 | 
 73 | 	printf("Pair pos/neg char, aggregated over buckets of length %d:\n", increment);
 74 | 	for(int i = 0; i < maxDist / increment; i++){
 75 | 		std::cout << "Distance " << i*increment << ": ";
 76 | 		std::cout << posNegPairCounts[0][i] << ",";
 77 | 		std::cout << posNegPairCounts[1][i] << "\n";
 78 | 	}
 79 | }
 80 | 
 81 | //Reverses if needed to get "canonical" concatenation of two in the same direction
 82 | //Reverses again at the end to ensure no mutation of contigs
 83 | Contig* Contig::concatenate(Contig* otherContig, int thisSide, int otherSide){
 84 | 	if(thisSide == 1){
 85 | 		reverse();
 86 | 	}
 87 | 	if(otherSide == 2){
 88 | 		otherContig->reverse();
 89 | 	}
 90 | 	Contig* concatenation =  concatenate(otherContig);
 91 | 	if(thisSide == 1){
 92 | 		reverse();
 93 | 	}
 94 | 	if(otherSide == 2){
 95 | 		otherContig->reverse();
 96 | 	}
 97 | 	return concatenation;
 98 | }
 99 | 
100 | //utility for linking them if they're both facing "forward"
101 | Contig* Contig::concatenate(Contig* otherContig){
102 | 	Contig* result = new Contig();
103 | 	result->setEnds(node1_p, ind1, otherContig->node2_p, otherContig->ind2);
104 | 	if(getSeq().length() < sizeKmer){
105 | 		printf("ERROR: seq less than k long in Contig::Concatenate.\n");
106 | 	}
107 | 	result->setContigJuncs(contigJuncs.concatenate(otherContig->contigJuncs));
108 | 	return result;
109 | }
110 | 
111 | void Contig::reverse(){
112 | 	{ContigNode * temp = node1_p;
113 | 		node1_p = node2_p;
114 | 		node2_p = temp;}
115 | 
116 | 	{int temp = ind1;
117 | 		ind1 = ind2;
118 | 		ind2 = temp;}
119 | 
120 | 	contigJuncs.reverse();
121 | }
122 | 
123 | void Contig::setEnds( ContigNode* n1, int i1, ContigNode* n2, int i2){
124 | 	node1_p = n1;
125 | 	node2_p = n2;
126 | 	setIndices(i1, i2);
127 | 	if(node1_p){
128 | 		node1_p->contigs[i1] = this;
129 | 	}
130 | 	if(node2_p){
131 | 		node2_p->contigs[i2] = this;
132 | 	}
133 | }
134 | 
135 | //Gets all of the interior junctions on this contig, as a list of JuncResult objects
136 | //Assumes this is startDist away from the real start, so increments all by startDist
137 | //Side refers to which side of the contig to start from
138 | std::list<JuncResult> Contig::getJuncResults(int side, int startDist, int maxDist){
139 | 	if(side == 2){
140 | 		reverse();
141 | 	}
142 | 	auto result = contigJuncs.getJuncResults(ind1 != 4, startDist, maxDist); //forward if ind1 != 4, backward if ind1 == 4
143 | 	if(side == 2){
144 | 		reverse();
145 | 	}
146 | 	return result;
147 | }
148 | 
149 | int Contig::length(){
150 | 	return contigJuncs.length();
151 | }
152 | 
153 | double Contig::getAvgCoverage(){
154 | 	return contigJuncs.getAvgCoverage();
155 | }
156 | 
157 | double Contig::getAvgCoverage(std::list<JuncResult> results){
158 | 	return contigJuncs.getAvgCoverage(results);
159 | }
160 | 
161 | double Contig::getCoverageSampleVariance(){
162 | 	return contigJuncs.getCoverageSampleVariance();
163 | }
164 | 
165 | double Contig::getCoverageSampleVariance(std::list<JuncResult> results){
166 | 	return contigJuncs.getCoverageSampleVariance(results);
167 | }
168 | 
169 | float Contig::getMass(){
170 | 	return getAvgCoverage()*getSeq().length();
171 | }
172 | 
173 | void Contig::setIndices(int i1, int i2){
174 | 	ind1 = i1;
175 | 	ind2 = i2;
176 | }
177 | 
178 | int Contig::getMinIndex(){
179 | 	return std::min(ind1, ind2);
180 | }
181 | 
182 | ContigNode* Contig::otherEndNode(ContigNode * oneEnd){
183 | 	if(node1_p == oneEnd){
184 | 		return node2_p;
185 | 	}
186 | 	if(node2_p == oneEnd){
187 | 		return node1_p;
188 | 	}
189 | 	printf("ERROR: tried to get other end of a contig, but the given pointer didn't point to either end!.\n");
190 | 	std::cout << "node1_p: " << node1_p << " node2_p: " << node2_p << " oneEnd: " << oneEnd << "\n";
191 | 	std::cout << "This contig: " << this << "\n";
192 | 	return nullptr;
193 | }
194 | 
195 | //Assumes the given contig node points to one end of this contig
196 | kmer_type Contig::getNodeKmer(ContigNode * contigNode){
197 | 	if(node1_p == contigNode){
198 | 		return getSideKmer(1);
199 | 	}
200 | 	if(node2_p == contigNode){
201 | 		return getSideKmer(2);
202 | 	}
203 |     throw std::logic_error("Tried to get the kmer corresponding to a node not adjacent to this contig from this contig.");
204 | 
205 | 	// printf("ERROR: tried to get the kmer corresponding to a node not adjacent to this contig from this contig.\n");
206 | }
207 | 
208 | ContigNode* Contig::getNode(int side){
209 | 	if (side == 1){
210 | 		return node1_p;
211 | 	}
212 | 	if(side == 2){
213 | 		return node2_p;
214 | 	}
215 | 	throw std::logic_error("Called getNode on contignode with side other than 1,2");
216 | }
217 | 
218 | int Contig::getIndex(int side){
219 | 	if (side == 1){
220 | 		return ind1;
221 | 	}
222 | 	if(side == 2){
223 | 		return ind2;
224 | 	}
225 | 	throw std::logic_error("Called getSide on contignode with side other than 1,2");
226 | }
227 | 
228 | //Gets kmer for node1_p if side == 1, node2_p if side == 2
229 | kmer_type Contig::getSideKmer(int side){
230 | 	if(side == 1){
231 | 		kmer_type kmer = getKmerFromRead(getSeq(), 0);
232 | 		if(ind1 == 4) return revcomp(kmer);
233 | 		return kmer;
234 | 	}
235 | 	if(side == 2){
236 | 		kmer_type kmer = getKmerFromRead(getSeq(), getSeq().length()-sizeKmer);
237 | 		if(ind2 == 4) return kmer;
238 | 		return revcomp(kmer);
239 | 	}
240 | 	throw std::logic_error("Tried to get a kmer corresponding to a side other than one or two from a contig.");
241 | }
242 | 
243 | int Contig::getSide(ContigNode* node){
244 | 	if(node1_p == node){
245 | 		return 1;
246 | 	}
247 | 	if(node2_p == node){
248 | 		return 2;
249 | 	}
250 | 	printf("ERROR: tried to get the side of a contig node not adjacent to the contig.\n");
251 | 	std::cout << "Node1: " << node1_p << ", Node2: " << node2_p << " Input: " << node << "\n";
252 | 	return -1;
253 | }
254 | 
255 | int Contig::getSide(ContigNode* node, int index){
256 | 	if((node1_p == node) && (ind1 == index)){
257 | 		return 1;
258 | 	}
259 | 	if((node2_p == node) && (ind2 == index)){
260 | 		return 2;
261 | 	}
262 | 	printf("ERROR: tried to get the side of a contig node,index pair, but didn't find it on either side.\n");
263 | 	std::cout << "Node1: " << node1_p << ", Node2: " << node2_p << " Input: " << node << "\n";
264 | 	return -1;
265 | }
266 | 
267 | void Contig::setSide(int side, ContigNode* node){
268 | 	if(side == 1){
269 | 		node1_p = node;
270 | 	}
271 | 	else if(side == 2){
272 | 		node2_p = node;
273 | 	}
274 | 	else printf("ERROR: tried to set side for side other than 1,2.\n");	
275 | }
276 | 
277 | void Contig::setMark(bool value){
278 | 	marked = value;
279 | }
280 | 
281 | bool Contig::getMark(){
282 | 	return marked;
283 | }
284 | 
285 | 
286 | bool Contig::isIsolated(){
287 | 	return ((node1_p == nullptr) && (node2_p == nullptr));
288 | }
289 | 
290 | std::vector<std::pair<Contig*, bool>> Contig::getNeighbors(bool RC){
291 | 	if(!RC){ //forward node continuations 
292 | 	    if(node2_p){ //if node exists in forward direction 
293 | 	    	return node2_p->getFastGNeighbors(ind2);
294 | 		}
295 | 	}
296 | 	else{ //backward node continuations
297 | 		if(node1_p){ //if node exists in backward direction
298 | 			return node1_p->getFastGNeighbors(ind1);
299 | 		}
300 | 	}
301 | 	return {};
302 | }
303 | 
304 | bool Contig::isDegenerateLoop(){
305 | 	if (node1_p && node2_p){
306 | 		return (node1_p == node2_p && ind1 == ind2);
307 | 	}
308 | 	return false;
309 | }
310 | 
311 | bool Contig::checkValidity(){
312 | 	// std::cout << "ind1 " << ind1 << ", ind2 " << ind2 << std::endl;
313 | 	if(node1_p){
314 | 		// std::cout << "there is a node 1 ptr\n";
315 | 		if(node1_p->contigs[ind1] != this ){ //&& 
316 | 			// (other != print_kmer(revcomp(getKmerFromRead(node1_p->contigs[ind1]->getSeq(), node1_p->contigs[ind1]->getSeq().length()-sizeKmer) ) ) ) 
317 | 			// ){
318 | 			printf("CONTIG_ERROR: adjacent node 1 at specified index doesn't point back to this contig.\n");
319 | 			// std::cout << "Expected at extension "<< ind1 << "\n";	
320 | 			// std::cout << "node1_p seq is " << print_kmer(node1_p->getKmer()) <<" , ind 1 is " << ind2 <<  std::endl;
321 | 			// std::cout << "node2_p seq is " << print_kmer(node2_p->getKmer()) <<" , ind 2 is " << ind2 <<  std::endl;			
322 | 			
323 | 			// std::cout << "contig is\n"; 
324 | 			// std::cout << this->getSeq()  << std::endl; // << ", length is\n" << this->getSeq().length()
325 | 			// if (node1_p->contigs[4]){
326 | 			// 	std::cout << "node1_p at " << 4 << " is\n";
327 | 			// 	std::cout << node1_p->contigs[4]->getSeq() << std::endl;			
328 | 			
329 | 			// }
330 | 			// if (node2_p->contigs[4]){
331 | 			// 	std::cout << "node2_p at " << 4 << " is\n";
332 | 			// 	std::cout << node2_p->contigs[4]->getSeq() << std::endl;			
333 | 			
334 | 			// }	
335 | 			return false;
336 | 		}
337 | 		if(getSide(node1_p, ind1) != 1 && !isDegenerateLoop()){
338 | 			printf("CONTIG_ERROR: getSide incorrect on node1p, ind1.\n");
339 | 			std::cout << "Node1: " << node1_p << ", Ind1: " << ind1 << ", Side: " << getSide(node1_p, ind1) << "\n";
340 | 			std::cout << "Node2: " << node2_p << ", Ind2: " << ind2 << ", Side: " << getSide(node2_p, ind2) << "\n";
341 | 			return false;
342 | 		}
343 | 	}
344 | 	if(node2_p){
345 | 		// std::cout << "there is a node 2 ptr\n";
346 | 		if(node2_p->contigs[ind2] != this ){//&& 
347 | 			// (getSeq() != print_kmer(getKmerFromRead(node2_p->contigs[ind2]->getSeq(), node2_p->contigs[ind2]->getSeq().length()-sizeKmer) ) ) 
348 | 			// ){
349 | 			printf("CONTIG_ERROR: adjacent node 2 at specified index doesn't point back to this contig.\n");
350 | 			// std::cout << "Expected at extension "<< ind2 << "\n";	
351 | 			// std::cout << "node2_p seq is " << print_kmer(node2_p->getKmer()) <<" , ind 2 is " << ind1 <<  std::endl;			
352 | 			// std::cout << "node1_p seq is " << print_kmer(node1_p->getKmer()) <<" , ind 1 is " << ind1 <<  std::endl;			
353 | 			// std::cout << "contig is\n"; 
354 | 			// std::cout << this->getSeq()  << std::endl; // << ", length is\n" << this->getSeq().length()
355 | 			// if (node2_p->contigs[4]){
356 | 			// 	std::cout << "node2_p at " << 4 << " is\n";
357 | 			// 	std::cout << node2_p->contigs[4]->getSeq() << std::endl;			
358 | 			// }
359 | 			// if (node1_p->contigs[4]){
360 | 			// 	std::cout << "node1_p at " << 4 << " is\n";
361 | 			// 	std::cout << node1_p->contigs[4]->getSeq() << std::endl;			
362 | 			
363 | 			// }
364 | 			return false;
365 | 		}
366 | 		if(getSide(node2_p, ind2) != 2 && !isDegenerateLoop()){
367 | 			printf("CONTIG_ERROR: getSide incorrect on node2p, ind2.\n");
368 | 			std::cout << "Node1: " << node1_p << ", Ind1: " << ind1 << ", Side: " << getSide(node1_p, ind1) << "\n";
369 | 			std::cout << "Node2: " << node2_p << ", Ind2: " << ind2 << ", Side: " << getSide(node2_p, ind2) << "\n";
370 | 			return false;
371 | 		}
372 | 	}
373 | 	
374 | 	return true;
375 | 	
376 | }
377 | 
378 | string Contig::getFastGName(bool RC){
379 | 	stringstream stream;
380 |     stream << "NODE_" << this << "_length_" << getSeq().length() << "_cov_" << getAvgCoverage();
381 |     if(RC){
382 |     	stream << "'";
383 |     }
384 |     return stream.str();
385 | }
386 | 
387 | string Contig::getFastGHeader(bool RC){
388 | 	stringstream stream;
389 | 	stream << ">";
390 |     stream << getFastGName(RC);
391 | 
392 |     //get neighbors in direction corresponding to RC value
393 |     std::vector<std::pair<Contig*, bool>> neighbors = getNeighbors(RC);
394 | 
395 |     //if empty return now
396 |     if(neighbors.empty()){
397 |     	stream << ";" ;
398 |     	return stream.str();
399 |     }
400 | 
401 |     //not empty, add neighbors to line
402 |     stream << ":";
403 |     for(auto it = neighbors.begin(); it != neighbors.end(); ++it){
404 |     	Contig* neighbor = it->first;
405 |     	bool RC = it->second;
406 |     	stream << neighbor->getFastGName(RC) << ",";
407 |     }
408 |     string result = stream.str();
409 |     result[result.length()-1] = ';';
410 |     return result;
411 | }
412 | 
413 | string Contig::getStringRep(){
414 | 	stringstream stream;
415 |     stream << node1_p << "," << ind1 << " " << node2_p << "," << ind2 << "\n";
416 |     stream << contigJuncs.getStringRep();
417 |     stream << "\n";
418 |     return stream.str();
419 | }
420 | 
421 | Contig::Contig(){
422 | 	setSeq("");
423 | 	node1_p = nullptr;
424 | 	node2_p = nullptr;
425 | 	ind1 = 5;
426 | 	ind2 = 5;
427 | 	marked = false;
428 | 	contigJuncs = ContigJuncList();
429 | }
430 | 
431 | Contig::Contig( Contig * c){
432 | 	setSeq("");
433 | 	node1_p = c->node1_p;
434 | 	node2_p = c->node2_p;
435 | 	ind1 = c->ind1;
436 | 	ind2 = c->ind2;
437 | 	marked = c->marked;
438 | 	contigJuncs = c->contigJuncs;
439 | }   
440 | 
441 | Contig::~Contig(){
442 | 	node1_p = nullptr;
443 | 	node2_p = nullptr;
444 | }
445 | 


--------------------------------------------------------------------------------