├── Makefile ├── README ├── Tables ├── Align.csv ├── Align.org ├── Real.csv ├── Real.org ├── Synth.csv ├── Synth.org ├── Viral.csv └── Viral.org └── src ├── Makefile ├── RepeatsDetector.cpp ├── cluster ├── Makefile └── src │ ├── Center.h │ ├── ClusterFactory.cpp │ ├── ClusterFactory.h │ ├── DivergencePoint.cpp │ ├── DivergencePoint.h │ ├── Feature.cpp │ ├── Feature.h │ ├── GLM.cpp │ ├── GLM.h │ ├── Histogram.cpp │ ├── Histogram.h │ ├── LogTable.cpp │ ├── LogTable.h │ ├── Mat.h │ ├── Matrix.cpp │ ├── Matrix.h │ ├── NearestNeighbor.h │ ├── Point.h │ ├── Progress.cpp │ ├── Progress.h │ ├── Runner.cpp │ ├── Runner.h │ ├── SingleFeature.cpp │ ├── SingleFeature.h │ ├── Trainer.cpp │ ├── Trainer.h │ ├── bvec.cpp │ ├── bvec.h │ ├── bvec_iterator.cpp │ ├── bvec_iterator.h │ ├── main.cpp │ ├── needleman_wunsch.cpp │ └── needleman_wunsch.h ├── exception ├── FileDoesNotExistException.cpp ├── FileDoesNotExistException.h ├── InvalidInputException.cpp ├── InvalidInputException.h ├── InvalidOperationException.cpp ├── InvalidOperationException.h ├── InvalidOrderOfOperationsException.cpp ├── InvalidOrderOfOperationsException.h ├── InvalidScoreException.cpp ├── InvalidScoreException.h ├── InvalidStateException.cpp └── InvalidStateException.h ├── nonltr ├── ChromDetector.cpp ├── ChromDetector.h ├── ChromDetectorMaxima.cpp ├── ChromDetectorMaxima.h ├── ChromListMaker.cpp ├── ChromListMaker.h ├── Chromosome.cpp ├── Chromosome.h ├── ChromosomeOneDigit.cpp ├── ChromosomeOneDigit.h ├── ChromosomeRandom.cpp ├── ChromosomeRandom.h ├── DetectorMaxima.cpp ├── DetectorMaxima.h ├── EnrichmentMarkovView.cpp ├── EnrichmentMarkovView.h ├── HMM.cpp ├── HMM.h ├── IChromosome.h ├── ITableView.h ├── KmerHashTable.cpp ├── KmerHashTable.h ├── LocationList.cpp ├── LocationList.h ├── LocationListCollection.cpp ├── LocationListCollection.h ├── Scanner.cpp ├── Scanner.h ├── Scorer.cpp ├── Scorer.h ├── TableBuilder.cpp ├── TableBuilder.h ├── Trainer.cpp └── Trainer.h └── utility ├── AffineId.cpp ├── 
AffineId.h ├── EmptyLocation.cpp ├── EmptyLocation.h ├── GlobAlignE.cpp ├── GlobAlignE.h ├── ILocation.h ├── LCSLen.cpp ├── LCSLen.h ├── Location.cpp ├── Location.h ├── Util.cpp └── Util.h /Makefile: -------------------------------------------------------------------------------- 1 | all: bin/Red.o bin/meshclust 2 | 3 | bin/Red.o: 4 | mkdir -p bin 5 | mkdir -p bin/exception 6 | mkdir -p bin/nonltr 7 | mkdir -p bin/utility 8 | $(MAKE) -C src 9 | bin/meshclust: bin/Red.o 10 | $(MAKE) -C src/cluster 11 | cp src/cluster/meshclust bin 12 | 13 | clean: 14 | $(MAKE) clean -C src 15 | $(MAKE) clean -C src/cluster 16 | $(RM) -r bin 17 | 18 | rebuild: clean all 19 | .PHONY: all clean 20 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | MeShClust 2 | 3 | The newest version of MeShClust (v3.0) can be obtained from https://github.com/BioinformaticsToolsmith/Identity.git 4 | 5 | Release version 6 | 7 | Requirements: g++ 4.9.1 or later, requires Homebrew on Mac OS X 8 | 9 | Compilation using g++ (homebrew) and GNU Make on Mac OS X 10 | CXX=g++-7 make 11 | 12 | see: https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite 13 | 14 | 15 | Linux/Unix compilation: 16 | make 17 | 18 | If you find this tool helpful, please cite: 19 | 20 | James, Benjamin T. et al. (2018), MeShClust: an intelligent tool for clustering DNA sequences. Nucleic Acids Research, gky315. 21 | 22 | Usage: bin/meshclust *.fasta [--id 0.90] [--kmer 3] [--delta 5] [--output output.clstr] [--iterations 20] [--align] [--sample 3000] [--pivot 40] [--threads TMAX] 23 | 24 | The most important parameter, --id, controls the identity of the sequences. 25 | If the identity is below 60%, alignment is automatically used instead of k-mer measures. 26 | However, alignment can be forced with the --align parameter. 
27 | 28 | --kmer sets the size of the k-mers. By default it is chosen automatically from the average sequence length, 29 | but if it is provided, MeShClust runs slightly faster because it does not have to find the largest sequence length. 30 | Increasing the k-mer size can increase accuracy, but each increment increases memory consumption fourfold. 31 | 32 | --delta sets how many surrounding clusters are examined in the final clustering stage. 33 | Increasing it improves accuracy, but takes more time. 34 | 35 | --output specifies the output file, in CD-HIT's CLSTR format. 36 | 37 | --iterations specifies how many iterations of the final merging stage are run until convergence. 38 | 39 | --align forces alignment to be used, which can be much slower than k-mer features, but is 40 | more accurate than using k-mer features to guess alignment. 41 | 42 | --threads sets the number of threads to be used. By default OpenMP uses the number of available cores 43 | on your machine, but this parameter overrides that. 44 | 45 | --sample selects the total number of sample pairs of sequences used for both training and testing. 46 | 1500 is the default value. 47 | 48 | --pivot selects the maximum number of pairs selected from one pivot sequence. Increasing this means 49 | fewer pivots are available, but more pairs are selected for one sequence, which can lead to 50 | higher training accuracy. The default value is 40. 51 | 52 | Any argument not listed here is interpreted as an input file. 53 | 54 | 55 | License 56 | 57 | Academic use: The software is provided as-is under the GNU GPLv3. 58 | Any restrictions to use for-profit or non-academics: License needed. 
59 | -------------------------------------------------------------------------------- /Tables/Align.csv: -------------------------------------------------------------------------------- 1 | Name,Data,Id,Realtime,Utime,Mem(MB),1-1,Intra,Inter,Sil,Clusters >= 5,avg size >= 5,median size >= 5,pct of clusters >= 5,Purity,NMI 2 | MeShClust,15k,0.83,0:05.73,17.6,93.97265625,100%,96.403,39.606,0.933,200,74.965,74,99.502,1,1 3 | MeShClust with Alignment,15k,0.83,11:00.77,2381.7,190.7578125,100%,93.569,39.677,0.880,200,74.965,74,99.502,1,1 4 | MeShClust,15k,0.87,0:04.42,15.1,93.99609375,99.5%,96.400,39.602,0.918,201,74.582,74,99.015,1,1 5 | MeShClust with Alignment,15k,0.87,8:35.88,1905.8,173,99.5%,93.474,39.684,0.873,201,74.582,74,98.529,1,1 6 | MeShClust,15k,0.9,0:04.28,14.7,93.98046875,99.5%,96.413,39.602,0.918,201,74.572,74,99.015,1,1 7 | MeShClust with Alignment,15k,0.9,6:47.68,1526.8,159.7890625,99.5%,93.464,39.653,0.870,201,74.562,74,97.101,1,1 8 | MeShClust,15k,0.93,0:04.15,14.2,93.9453125,99.5%,96.404,39.600,0.918,201,74.592,74,99.505,1,1 9 | MeShClust with Alignment,15k,0.93,6:02.54,1373.4,159.4765625,97.5%,93.654,39.650,0.802,212,69.925,72,60.399,1,0.993 10 | MeShClust,15k,0.96,0:04.09,13.7,94.046875,98.5%,96.352,39.577,0.873,206,72.684,74,93.636,1,0.998 11 | MeShClust with Alignment,15k,0.96,23:08.02,5382.1,305.0234375,64.5%,95.735,39.599,0.902,202,58.183,59,5.919,1,0.905 12 | MeShClust,15k,0.97,0:05.56,15.1,96.609375,99%,96.483,39.576,0.874,205,70.927,71,31.202,1,0.984 13 | MeShClust with Alignment,15k,0.97,40:38.75,9510.7,441.0390625,0%,96.795,39.635,0.940,200,40.245,41,2.812,1,0.821 14 | -------------------------------------------------------------------------------- /Tables/Align.org: -------------------------------------------------------------------------------- 1 | | Name | Data | Id | Realtime | Utime | Mem(MB) | 1-1 | Intra | Inter | Sil | Clusters >= 5 | avg size >= 5 | median size >= 5 | pct of clusters >= 5 | Purity | NMI | 2 | 
|--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 3 | | MeShClust | 15k | 0.83 | 0:05.73 | 17.6 | 93.97265625 | 100% | 96.403 | 39.606 | 0.933 | 200 | 74.965 | 74 | 99.502 | 1 | 1 | 4 | | MeShClust with Alignment | 15k | 0.83 | 11:00.77 | 2381.7 | 190.7578125 | 100% | 93.569 | 39.677 | 0.880 | 200 | 74.965 | 74 | 99.502 | 1 | 1 | 5 | |--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 6 | | MeShClust | 15k | 0.87 | 0:04.42 | 15.1 | 93.99609375 | 99.5% | 96.400 | 39.602 | 0.918 | 201 | 74.582 | 74 | 99.015 | 1 | 1 | 7 | | MeShClust with Alignment | 15k | 0.87 | 8:35.88 | 1905.8 | 173 | 99.5% | 93.474 | 39.684 | 0.873 | 201 | 74.582 | 74 | 98.529 | 1 | 1 | 8 | |--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 9 | | MeShClust | 15k | 0.9 | 0:04.28 | 14.7 | 93.98046875 | 99.5% | 96.413 | 39.602 | 0.918 | 201 | 74.572 | 74 | 99.015 | 1 | 1 | 10 | | MeShClust with Alignment | 15k | 0.9 | 6:47.68 | 1526.8 | 159.7890625 | 99.5% | 93.464 | 39.653 | 0.870 | 201 | 74.562 | 74 | 97.101 | 1 | 1 | 11 | |--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 12 | | MeShClust | 15k | 0.93 | 0:04.15 | 14.2 | 93.9453125 | 99.5% | 96.404 | 39.600 | 0.918 | 201 | 74.592 | 74 | 99.505 | 1 | 1 | 13 | | MeShClust with Alignment | 15k | 0.93 | 6:02.54 | 1373.4 | 159.4765625 | 97.5% | 93.654 | 39.650 | 0.802 | 212 | 69.925 | 72 | 60.399 | 1 | 0.993 | 14 | 
|--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 15 | | MeShClust | 15k | 0.96 | 0:04.09 | 13.7 | 94.046875 | 98.5% | 96.352 | 39.577 | 0.873 | 206 | 72.684 | 74 | 93.636 | 1 | 0.998 | 16 | | MeShClust with Alignment | 15k | 0.96 | 23:08.02 | 5382.1 | 305.0234375 | 64.5% | 95.735 | 39.599 | 0.902 | 202 | 58.183 | 59 | 5.919 | 1 | 0.905 | 17 | |--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 18 | | MeShClust | 15k | 0.97 | 0:05.56 | 15.1 | 96.609375 | 99% | 96.483 | 39.576 | 0.874 | 205 | 70.927 | 71 | 31.202 | 1 | 0.984 | 19 | | MeShClust with Alignment | 15k | 0.97 | 40:38.75 | 9510.7 | 441.0390625 | 0% | 96.795 | 39.635 | 0.940 | 200 | 40.245 | 41 | 2.812 | 1 | 0.821 | 20 | |--------------------------+------+------+----------+--------+-------------+-------+--------+--------+-------+---------------+---------------+------------------+----------------------+--------+-------| 21 | -------------------------------------------------------------------------------- /Tables/Real.csv: -------------------------------------------------------------------------------- 1 | Name,Data,Id,Realtime,Utime,Mem(MB),1-1,Intra,Inter,Sil,Clusters >= 5,avg size >= 5,median size >= 5,pct of clusters >= 5,Purity,NMI 2 | CD-HIT,15k,0.83,0:01.94,1.9,41.05859375,100%,92.673,39.643,0.864,200,74.965,74,99.502,1,1 3 | DNACLUST,15k,0.83,0:21.58,78.3,401.48046875,92%,92.941,39.703,0.612,228,63.754,69,37.438,1,0.981 4 | MeShClust,15k,0.83,0:05.73,17.6,93.97265625,100%,96.403,39.606,0.933,200,74.965,74,99.502,1,1 5 | UCLUST,15k,0.83,0:01.48,1.4,10.05078125,90%,92.848,39.541,0.660,233,64.258,69,94.715,1,0.991 6 | CD-HIT,15k,0.87,0:01.91,1.9,41.109375,100%,92.673,39.643,0.864,200,74.965,74,99.502,1,1 7 
| DNACLUST,15k,0.87,1:06.79,231.1,401.484375,29%,93.593,39.559,0.480,323,36.576,37,10.720,1,0.893 8 | MeShClust,15k,0.87,0:04.42,15.1,93.99609375,99.5%,96.400,39.602,0.918,201,74.582,74,99.015,1,1 9 | UCLUST,15k,0.87,0:01.69,1.6,10.328125,80.5%,92.908,39.545,0.548,275,54.095,63,80.645,1,0.979 10 | CD-HIT,15k,0.9,0:01.87,1.8,41.171875,100%,92.673,39.643,0.864,200,74.965,74,99.502,1,1 11 | DNACLUST,15k,0.9,1:37.91,328.9,401.484375,0%,94.982,39.631,0.370,423,17.381,14,5.877,1,0.795 12 | MeShClust,15k,0.9,0:04.28,14.7,93.98046875,99.5%,96.413,39.602,0.918,201,74.572,74,99.015,1,1 13 | UCLUST,15k,0.9,0:01.01,0.9,11.3515625,53%,93.148,39.604,0.362,394,36.977,33,62.342,1,0.950 14 | CD-HIT,15k,0.93,0:01.38,1.3,41.16796875,95%,92.575,39.633,0.759,223,66.906,71,82.900,1,0.993 15 | DNACLUST,15k,0.93,1:12.26,229.8,401.46875,0%,96.632,39.737,0.709,262,9.893,8,2.219,1,0.735 16 | MeShClust,15k,0.93,0:04.15,14.2,93.9453125,99.5%,96.404,39.600,0.918,201,74.592,74,99.505,1,1 17 | UCLUST,15k,0.93,0:01.31,1.3,15.48828125,9.5%,93.930,39.601,0.209,639,20.343,14,34.709,1,0.885 18 | CD-HIT,15k,0.96,0:01.41,1.3,55.8671875,4.5%,94.370,39.627,0.282,493,20.744,15,11.251,1,0.843 19 | DNACLUST,15k,0.96,0:28.71,73.4,401.44140625,0%,98.214,44.197,0.889,34,5.765,5,0.237,1,0.713 20 | MeShClust,15k,0.96,0:04.09,13.7,94.046875,98.5%,96.352,39.577,0.873,206,72.684,74,93.636,1,0.998 21 | UCLUST,15k,0.96,0:02.34,2.3,31.90625,0%,95.643,39.633,0.210,650,10.766,8,9.884,1,0.784 22 | WCD,15k,0.96,0:04.70,18.13,22.914,66.5%,96.848,39.975,0.941,135,73.970,74,2.624,1,0.879 23 | CD-HIT,15k,0.97,0:01.26,1.2,71.84765625,0%,95.889,39.658,0.370,448,11.891,9,5,1,0.765 24 | DNACLUST,15k,0.97,0:18.03,40.0,401.453125,0%,98.787,100,0.988,5,5.200,5,0.034,1,0.712 25 | MeShClust,15k,0.97,0:05.56,15.1,96.609375,99%,96.483,39.576,0.874,205,70.927,71,31.202,1,0.984 26 | UCLUST,15k,0.97,0:02.67,2.6,43.5078125,0%,96.421,39.692,0.373,459,8.669,7,4.888,1,0.752 27 | 
CD-HIT,150k,0.83,0:59.29,59.1,106.05078125,100%,92.592,39.393,0.859,2000,75.147,75,99.950,1,1 28 | DNACLUST,150k,0.83,35:06.76,7892.5,489.19140625,90.05%,93.031,39.302,0.579,2353,61.605,68,35.004,1,0.984 29 | MeShClust,150k,0.83,1:36.63,327.3,230.171875,100%,96.369,39.373,0.930,2001,75.104,75,99.503,1,1 30 | UCLUST,150k,0.83,0:21.28,21.0,60.30078125,90.3%,92.721,39.314,0.654,2330,64.385,70,94.218,1,0.994 31 | CD-HIT,150k,0.87,0:59.56,59.4,105.8984375,100%,92.592,39.393,0.859,2000,75.147,75,99.950,1,1 32 | DNACLUST,150k,0.87,1:40:27,22709.3,489.19140625,27.95%,93.568,39.357,0.443,3532,33.367,31,11.439,1,0.919 33 | MeShClust,150k,0.87,1:21.26,279.5,230.19921875,100%,96.368,39.374,0.930,2001,75.104,75,99.552,1,1 34 | UCLUST,150k,0.87,0:25.66,25.4,60.30078125,79.45%,92.787,39.337,0.536,2795,53.325,62,80.524,1,0.985 35 | CD-HIT,150k,0.9,0:58.85,58.6,105.8984375,99.8%,92.584,39.383,0.856,2006,74.922,75,99.851,1,1 36 | DNACLUST,150k,0.9,2:18:32,30863.9,489.19140625,0.3%,95.001,39.409,0.394,4135,17.639,14,5.752,1,0.848 37 | MeShClust,150k,0.9,1:17.54,253.0,230.3515625,100%,96.367,39.373,0.929,2001,75.106,75,99.751,1,1 38 | UCLUST,150k,0.9,0:10.44,10.2,60.296875,51.15%,93.077,39.330,0.337,4032,36.185,31,62.280,1,0.963 39 | CD-HIT,150k,0.93,0:14.18,14.0,105.89453125,92.9%,92.495,39.314,0.723,2311,64.693,70,83.070,1,0.994 40 | DNACLUST,150k,0.93,1:41:11,21792.5,489.19140625,0%,96.638,39.336,0.707,2572,9.524,8,2.164,1,0.798 41 | MeShClust,150k,0.93,1:01.88,208.9,231.12109375,100%,96.365,39.374,0.929,2001,75.103,75,99.503,1,1 42 | UCLUST,150k,0.93,0:14.05,13.8,87.5078125,8.15%,93.891,39.383,0.195,6780,19.319,13,36.615,1,0.914 43 | CD-HIT,150k,0.96,0:18.00,17.7,199.2265625,4.2%,94.337,39.388,0.278,4961,20.685,15,11.349,1,0.885 44 | DNACLUST,150k,0.96,37:00.99,7067.7,489.52734375,0%,98.176,40.704,0.966,278,6.151,6,0.193,1,0.781 45 | MeShClust,150k,0.96,0:55.82,187.6,231.90234375,100%,96.373,39.372,0.930,2001,75.054,75,95.195,1,1 46 | 
UCLUST,150k,0.96,0:26.88,26.4,282.81640625,0%,95.643,39.407,0.226,6497,10.597,8,9.716,1,0.838 47 | WCD,150k,0.96,5:21.11,1229.33,45.867,31.2%,96.915,39.452,0.941,631,74.633,75,0.607,1,0.836 48 | CD-HIT,150k,0.97,0:18.56,18.2,334.578125,0%,95.861,39.393,0.362,4388,12.182,9,4.911,1,0.824 49 | DNACLUST,150k,0.97,21:36.88,3882.7,489.19140625,0%,98.680,44.850,0.976,42,5.619,5,0.029,1,0.780 50 | MeShClust,150k,0.97,1:27.05,274.9,256.46875,96.2%,96.291,39.361,0.926,2006,71.457,72,22.486,1,0.984 51 | UCLUST,150k,0.97,0:32.39,32.0,390.3828125,0%,96.416,39.356,0.367,4543,8.799,7,4.812,1,0.813 52 | CD-HIT,Costello,0.83,10:17.57,616.4,491.36328125,NA,84.643,62.627,0.320,568,1885.092,74,63.111,NA,NA 53 | DNACLUST,Costello,0.83,2:26.16,425.1,1127.2734375,NA,85.109,62.942,0.306,615,1740.629,52,57.584,NA,NA 54 | MeShClust,Costello,0.83,6:18.42,591.5,927.1171875,NA,88.611,65.021,0.437,341,3139.625,20,43.550,NA,NA 55 | UCLUST,Costello,0.83,2:18.56,136.1,412.37890625,NA,87.247,63.451,0.363,962,1112.932,55,72.824,NA,NA 56 | CD-HIT,Costello,0.87,10:16.83,615.7,492.328125,NA,85.965,64.089,0.247,1077,993.171,35,54.421,NA,NA 57 | DNACLUST,Costello,0.87,3:32.53,613.8,1123.4296875,NA,86.102,64.214,0.218,1197,893.310,31,51.395,NA,NA 58 | MeShClust,Costello,0.87,6:57.30,723.5,1079.828125,NA,91.162,65.312,0.530,403,2653.918,12,25.362,NA,NA 59 | UCLUST,Costello,0.87,3:30.73,209.4,412.37890625,NA,88.611,64.469,0.310,1668,641.192,34,63.835,NA,NA 60 | CD-HIT,Costello,0.9,11:40.13,699.0,494.1015625,NA,88.258,64.704,0.246,1746,611.727,24,48.581,NA,NA 61 | DNACLUST,Costello,0.9,4:22.61,707.4,1115.68359375,NA,88.395,64.936,0.197,1927,553.852,23,45.664,NA,NA 62 | MeShClust,Costello,0.9,9:25.28,1181.2,941.80859375,NA,91.911,65.225,0.437,922,1159.304,19,37.388,NA,NA 63 | UCLUST,Costello,0.9,1:08.47,67.2,412.3828125,NA,90.672,65.308,0.302,2605,409.837,27,56.594,NA,NA 64 | CD-HIT,Costello,0.93,9:05.19,544.1,497.96875,NA,90.610,65.585,0.228,2832,375.775,19,41.163,NA,NA 65 | 
DNACLUST,Costello,0.93,5:08.55,742.4,1130.90625,NA,91.155,65.828,0.194,3120,340.554,18,37.896,NA,NA 66 | MeShClust,Costello,0.93,9:22.34,1193.9,950.96875,NA,92.950,65.553,0.445,1204,886.146,18,30.130,NA,NA 67 | UCLUST,Costello,0.93,1:32.25,90.9,412.37890625,NA,92.612,66.343,0.246,4388,242.153,20,47.941,NA,NA 68 | CD-HIT,Costello,0.96,14:06.25,844.3,510.43359375,NA,93.447,66.501,0.207,4722,223.110,14,30.086,NA,NA 69 | DNACLUST,Costello,0.96,4:16.88,558.2,1122.30859375,NA,94.014,66.866,0.179,4946,212.043,13,25.644,NA,NA 70 | MeShClust,Costello,0.96,9:28.71,1208.5,931.26953125,NA,94.448,66.255,0.453,1735,612.986,15,25.545,NA,NA 71 | UCLUST,Costello,0.96,2:09.83,124.8,412.37890625,NA,94.985,67.037,0.157,7560,138.613,15,35.615,NA,NA 72 | WCD,Costello,0.96,39:11.69,9089.53,204.906,NA,90.704,63.989,0.681,24,44596.583,10,2.424,NA,NA 73 | CD-HIT,Costello,0.97,17:27.64,1046.4,520.4140625,NA,94.462,66.806,0.189,5630,185.592,13,25.158,NA,NA 74 | DNACLUST,Costello,0.97,3:36.42,455.6,1123.41796875,NA,94.958,67.185,0.156,5731,181.083,12,20.561,NA,NA 75 | MeShClust,Costello,0.97,9:47.09,1294.5,999.23828125,NA,95.281,66.370,0.464,2029,523.007,15,23.073,NA,NA 76 | UCLUST,Costello,0.97,2:43.78,162.3,412.37890625,NA,95.905,67.304,0.073,9770,106.043,15,31.340,NA,NA 77 | -------------------------------------------------------------------------------- /Tables/Synth.csv: -------------------------------------------------------------------------------- 1 | Name,Data,Id,Realtime,User time,Memory (MB),1-1,Intra,Inter,Silhouette,Clusters >= 5,avg size >= 5,median size >= 5,pct of clusters >= 5,Purity,NMI 2 | CD-HIT,0.10,0.75,0:00.01,0.0,2.5,0%,-,-,-,-,-,-,-,-,- 3 | DNACLUST,0.10,0.75,0:04.56,15.0,392.1328125,0%,-,-,-,-,-,-,-,1,0.597 4 | MeShClust,0.10,0.75,0:07.73,29.8,79.0703125,100%,89.598,48.371,0.812,10,23.600,24,100,1,1 5 | UCLUST,0.10,0.75,0:00.17,0.1,4.40234375,100%,81.259,48.398,0.664,10,23.100,23,66.667,1,0.983 6 | 
CD-HIT,0.10,0.8,0:00.20,0.2,34.71484375,50%,81.791,47.952,0.336,15,14.333,12,68.182,1,0.891 7 | DNACLUST,0.10,0.8,0:03.06,9.6,392.1328125,0%,-,-,-,-,-,-,-,1,0.597 8 | MeShClust,0.10,0.8,0:07.75,29.8,79.109375,100%,89.598,48.371,0.812,10,23.600,24,100,1,1 9 | UCLUST,0.10,0.8,0:00.30,0.3,5.86328125,10%,83.796,48.287,0.401,15,10.867,11,23.810,1,0.786 10 | CD-HIT,0.10,0.85,0:00.23,0.2,34.98046875,10%,82.434,48.065,0.287,18,10.889,12,50,1,0.828 11 | DNACLUST,0.10,0.85,0:01.99,5.9,392.1328125,0%,-,-,-,-,-,-,-,1,0.594 12 | MeShClust,0.10,0.85,0:05.79,21.9,79,100%,89.598,48.371,0.812,10,23.600,24,100,1,1 13 | UCLUST,0.10,0.85,0:00.55,0.5,8.53515625,0%,-,-,-,-,-,-,-,1,0.603 14 | CD-HIT,0.10,0.9,0:00.97,1.0,38.5859375,0%,-,-,-,-,-,-,-,1,0.602 15 | DNACLUST,0.10,0.9,0:01.10,2.8,392.1328125,0%,-,-,-,-,-,-,-,1,0.593 16 | MeShClust,0.10,0.9,0:05.55,21.3,78.8359375,100%,89.598,48.371,0.812,10,23.600,24,100,1,1 17 | UCLUST,0.10,0.9,0:00.32,0.3,8.625,0%,-,-,-,-,-,-,-,1,0.597 18 | CD-HIT,0.10,0.95,0:00.14,0.1,38.84375,0%,-,-,-,-,-,-,-,1,0.595 19 | DNACLUST,0.10,0.95,0:00.51,0.8,392.1328125,0%,-,-,-,-,-,-,-,1,0.593 20 | MeShClust,0.10,0.95,0:05.47,20.9,79.5390625,0%,100,46.489,1.000,236,1,1,0,1,0.593 21 | UCLUST,0.10,0.95,0:00.19,0.2,8.66796875,0%,-,-,-,-,-,-,-,1,0.593 22 | CD-HIT,0.25,0.6,0:00.00,0.0,2.5,0%,-,-,-,-,-,-,-,-,- 23 | DNACLUST,0.25,0.6,0:09.49,35.1,392.1328125,0%,-,-,-,-,-,-,-,1,0.594 24 | MeShClust,0.25,0.6,0:07.92,30.7,79.53125,100%,73.027,48.293,0.515,10,23.800,23,100,0.983,0.972 25 | UCLUST,0.25,0.6,0:00.40,0.4,6.69140625,0%,66.383,47.092,0.276,15,10.267,8,22.059,1,0.767 26 | CD-HIT,0.25,0.65,0:00.00,0.0,2.49609375,0%,-,-,-,-,-,-,-,-,- 27 | DNACLUST,0.25,0.65,0:07.56,27.6,392.1328125,0%,-,-,-,-,-,-,-,1,0.592 28 | MeShClust,0.25,0.65,0:06.77,26.1,79.41015625,100%,74.702,48.307,0.546,10,23.800,24,100,1,1 29 | UCLUST,0.25,0.65,0:00.82,0.8,8.6796875,0%,-,-,-,-,-,-,-,1,0.604 30 | CD-HIT,0.25,0.7,0:00.00,0.0,2.50390625,0%,-,-,-,-,-,-,-,-,- 31 | 
DNACLUST,0.25,0.7,0:05.71,20.3,392.1328125,0%,-,-,-,-,-,-,-,1,0.592 32 | MeShClust,0.25,0.7,0:05.71,21.9,79.40625,100%,81.960,48.100,0.573,14,17,23,71.429,1,0.985 33 | UCLUST,0.25,0.7,0:00.83,0.8,8.80078125,0%,-,-,-,-,-,-,-,1,0.597 34 | CD-HIT,0.25,0.75,0:00.01,0.0,2.5,0%,-,-,-,-,-,-,-,-,- 35 | DNACLUST,0.25,0.75,0:04.59,14.1,392.1328125,0%,-,-,-,-,-,-,-,1,0.592 36 | MeShClust,0.25,0.75,0:05.83,21.9,79.421875,100%,83.157,47.954,0.560,15,15.867,23,66.667,1,0.981 37 | UCLUST,0.25,0.75,0:00.83,0.8,8.80078125,0%,-,-,-,-,-,-,-,1,0.597 38 | CD-HIT,0.25,0.8,0:01.15,1.1,40.140625,0%,-,-,-,-,-,-,-,1,0.596 39 | DNACLUST,0.25,0.8,0:03.40,9.2,392.1328125,0%,-,-,-,-,-,-,-,1,0.592 40 | MeShClust,0.25,0.8,0:05.90,21.8,80.18359375,0%,98.236,44.676,0.927,134,1.776,1,7.463,1,0.698 41 | UCLUST,0.25,0.8,0:01.56,0.8,8.8828125,0%,-,-,-,-,-,-,-,1,0.594 42 | CD-HIT,100 Centers,0.6,0:00.01,0.0,2.50390625,0%,-,-,-,-,-,-,-,-,- 43 | DNACLUST,100 Centers,0.6,2:43.65,619.9,393.03515625,0%,-,-,-,-,-,-,-,1,0.801 44 | MeShClust,100 Centers,0.6,0:07.21,27.6,81.46875,78%,73.099,43.273,0.503,105,9.524,10,93.333,0.911,0.939 45 | UCLUST,100 Centers,0.6,0:02.11,2.1,11.33984375,9%,68.770,44.709,0.439,40,6.325,6,8.811,1,0.878 46 | CD-HIT,100 Centers,0.65,0:00.01,0.0,2.5078125,0%,-,-,-,-,-,-,-,-,- 47 | DNACLUST,100 Centers,0.65,2:09.14,481.4,393.03515625,0%,-,-,-,-,-,-,-,1,0.800 48 | MeShClust,100 Centers,0.65,0:06.96,26.4,81.48828125,97%,74.532,43.317,0.527,103,9.709,10,97.087,0.979,0.984 49 | UCLUST,100 Centers,0.65,0:03.36,3.3,16.23828125,0%,-,-,-,-,-,-,-,1,0.811 50 | CD-HIT,100 Centers,0.7,0:00.01,0.0,2.5078125,0%,-,-,-,-,-,-,-,-,- 51 | DNACLUST,100 Centers,0.7,1:33.88,352.6,393.0390625,0%,-,-,-,-,-,-,-,1,0.800 52 | MeShClust,100 Centers,0.7,0:06.71,25.5,81.60546875,97%,80.218,43.392,0.609,123,8.130,10,79.675,1,0.992 53 | UCLUST,100 Centers,0.7,0:03.41,3.4,16.49609375,0%,-,-,-,-,-,-,-,1,0.810 54 | CD-HIT,100 Centers,0.75,0:00.01,0.0,2.5078125,0%,-,-,-,-,-,-,-,-,- 55 | DNACLUST,100 
Centers,0.75,1:04.67,244.3,393.171875,0%,-,-,-,-,-,-,-,1,0.800 56 | MeShClust,100 Centers,0.75,0:06.71,25.6,81.796875,98%,78.896,43.406,0.597,114,8.772,10,86.842,1,0.995 57 | UCLUST,100 Centers,0.75,0:03.35,3.3,18.234375,0%,-,-,-,-,-,-,-,1,0.810 58 | CD-HIT,100 Centers,0.8,0:10.61,10.6,54.1796875,0%,-,-,-,-,-,-,-,1,0.804 59 | DNACLUST,100 Centers,0.8,0:42.32,156.6,393.03515625,0%,-,-,-,-,-,-,-,1,0.800 60 | MeShClust,100 Centers,0.8,0:06.88,26.2,84.92578125,12%,95.542,42.882,0.828,494,2.024,1,17.611,0.999,0.885 61 | UCLUST,100 Centers,0.8,0:03.69,3.6,17.52734375,0%,-,-,-,-,-,-,-,1,0.801 62 | -------------------------------------------------------------------------------- /Tables/Viral.csv: -------------------------------------------------------------------------------- 1 | Name,Data,Id,Realtime,Utime,Mem(MB),Intra,Inter,Silhouette,Clusters,avg size,median size,pct of clusters >= 5,Purity,NMI 2 | MeShClust,6 Viruses,0.43,0:15.45,46.5,80.56640625,51.596,51.631,0.184,3,18,22,66.667,0.426,0.567 3 | UCLUST,6 Viruses,0.43,0:05.15,5.1,62.77734375,35.027,100,0.350,1,48,48,100,0.229,0 4 | MeShClust,6 Viruses,0.47,0:19.38,56.4,80.80078125,54.507,45.086,0.277,4,13.500,15,100,0.611,0.728 5 | UCLUST,6 Viruses,0.47,0:14.80,14.8,64.75390625,59.972,41.527,0.236,10,4.800,5,50,0.667,0.546 6 | MeShClust,6 Viruses,0.5,0:39.17,120.7,80.90234375,74.288,40.182,0.549,11,4.909,4,45.455,0.907,0.889 7 | UCLUST,6 Viruses,0.5,0:24.94,24.9,66.1796875,95.095,38.759,0.899,30,1.600,1,3.333,1,0.711 8 | MeShClust,6 Viruses,0.53,0:49.89,161.9,80.9296875,75.905,39.344,0.569,13,4.154,4,46.154,1,0.926 9 | UCLUST,6 Viruses,0.53,0:25.86,25.8,66.4609375,95.263,37.779,0.897,34,1.412,1,0,1,0.681 10 | MeShClust,6 Viruses,0.57,1:35.44,307.0,81.171875,89.628,36.241,0.792,29,1.862,1,3.448,1,0.778 11 | UCLUST,6 Viruses,0.57,0:26.29,26.2,66.65625,97.610,37.534,0.946,37,1.297,1,0,1,0.671 12 | Template,6 Viruses,,,,,59.647,40.053,0.288,,,,,1,1 13 | MeShClust,7 
Viruses,0.43,0:25.00,85.3,81.38671875,51.742,49.928,0.297,3,32,28,100,0.573,0.583 14 | UCLUST,7 Viruses,0.43,0:11.06,11.0,72.18359375,38.428,100,0.384,1,90,90,100,0.467,0 15 | MeShClust,7 Viruses,0.47,0:31.33,102.7,81.26953125,54.705,45.016,0.269,4,24,15,100,0.667,0.677 16 | UCLUST,7 Viruses,0.47,0:18.98,18.9,74.48046875,63.044,42.667,0.302,11,8.182,4,45.455,0.700,0.485 17 | MeShClust,7 Viruses,0.5,1:28.06,290.0,81.53515625,72.016,39.093,0.501,12,8,6,50,0.917,0.865 18 | UCLUST,7 Viruses,0.5,0:32.13,32.1,75.79296875,91.413,37.626,0.830,33,2.727,1,12.121,1,0.729 19 | MeShClust,7 Viruses,0.53,2:48.81,555.9,81.66796875,77.635,37.424,0.585,19,5.053,4,42.105,1,0.866 20 | UCLUST,7 Viruses,0.53,0:34.09,34.0,77.33984375,92.527,37.824,0.840,55,1.636,1,1.818,1,0.595 21 | MeShClust,7 Viruses,0.57,6:15.50,1259.3,82.109375,91.206,37.864,0.824,47,2.043,1,8.511,1,0.687 22 | UCLUST,7 Viruses,0.57,0:35.55,35.5,77.85546875,96.024,38.651,0.905,65,1.385,1,3.077,1,0.575 23 | Template,7 Viruses,-,-,-,-,59.004,39.764,0.274,-,-,-,-,1,1 24 | -------------------------------------------------------------------------------- /Tables/Viral.org: -------------------------------------------------------------------------------- 1 | | Name | Data | Id | Realtime | Utime | Mem(MB) | Intra | Inter | Silhouette | Clusters | avg size | median size | pct of clusters >= 5 | Purity | NMI | 2 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 3 | | MeShClust | 6 Viruses | 0.43 | 0:15.45 | 46.5 | 80.56640625 | 51.596 | 51.631 | 0.184 | 3 | 18 | 22 | 66.667 | 0.426 | 0.567 | 4 | | UCLUST | 6 Viruses | 0.43 | 0:05.15 | 5.1 | 62.77734375 | 35.027 | 100 | 0.350 | 1 | 48 | 48 | 100 | 0.229 | 0 | 5 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 6 | | MeShClust | 6 
Viruses | 0.47 | 0:19.38 | 56.4 | 80.80078125 | 54.507 | 45.086 | 0.277 | 4 | 13.500 | 15 | 100 | 0.611 | 0.728 | 7 | | UCLUST | 6 Viruses | 0.47 | 0:14.80 | 14.8 | 64.75390625 | 59.972 | 41.527 | 0.236 | 10 | 4.800 | 5 | 50 | 0.667 | 0.546 | 8 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 9 | | MeShClust | 6 Viruses | 0.5 | 0:39.17 | 120.7 | 80.90234375 | 74.288 | 40.182 | 0.549 | 11 | 4.909 | 4 | 45.455 | 0.907 | 0.889 | 10 | | UCLUST | 6 Viruses | 0.5 | 0:24.94 | 24.9 | 66.1796875 | 95.095 | 38.759 | 0.899 | 30 | 1.600 | 1 | 3.333 | 1 | 0.711 | 11 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 12 | | MeShClust | 6 Viruses | 0.53 | 0:49.89 | 161.9 | 80.9296875 | 75.905 | 39.344 | 0.569 | 13 | 4.154 | 4 | 46.154 | 1 | 0.926 | 13 | | UCLUST | 6 Viruses | 0.53 | 0:25.86 | 25.8 | 66.4609375 | 95.263 | 37.779 | 0.897 | 34 | 1.412 | 1 | 0 | 1 | 0.681 | 14 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 15 | | MeShClust | 6 Viruses | 0.57 | 1:35.44 | 307.0 | 81.171875 | 89.628 | 36.241 | 0.792 | 29 | 1.862 | 1 | 3.448 | 1 | 0.778 | 16 | | UCLUST | 6 Viruses | 0.57 | 0:26.29 | 26.2 | 66.65625 | 97.610 | 37.534 | 0.946 | 37 | 1.297 | 1 | 0 | 1 | 0.671 | 17 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 18 | | Template | 6 Viruses | | | | | 59.647 | 40.053 | 0.288 | | | | | 1 | 1 | 19 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 20 | 
|-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 21 | | MeShClust | 7 Viruses | 0.43 | 0:25.00 | 85.3 | 81.38671875 | 51.742 | 49.928 | 0.297 | 3 | 32 | 28 | 100 | 0.573 | 0.583 | 22 | | UCLUST | 7 Viruses | 0.43 | 0:11.06 | 11.0 | 72.18359375 | 38.428 | 100 | 0.384 | 1 | 90 | 90 | 100 | 0.467 | 0 | 23 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 24 | | MeShClust | 7 Viruses | 0.47 | 0:31.33 | 102.7 | 81.26953125 | 54.705 | 45.016 | 0.269 | 4 | 24 | 15 | 100 | 0.667 | 0.677 | 25 | | UCLUST | 7 Viruses | 0.47 | 0:18.98 | 18.9 | 74.48046875 | 63.044 | 42.667 | 0.302 | 11 | 8.182 | 4 | 45.455 | 0.700 | 0.485 | 26 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 27 | | MeShClust | 7 Viruses | 0.5 | 1:28.06 | 290.0 | 81.53515625 | 72.016 | 39.093 | 0.501 | 12 | 8 | 6 | 50 | 0.917 | 0.865 | 28 | | UCLUST | 7 Viruses | 0.5 | 0:32.13 | 32.1 | 75.79296875 | 91.413 | 37.626 | 0.830 | 33 | 2.727 | 1 | 12.121 | 1 | 0.729 | 29 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 30 | | MeShClust | 7 Viruses | 0.53 | 2:48.81 | 555.9 | 81.66796875 | 77.635 | 37.424 | 0.585 | 19 | 5.053 | 4 | 42.105 | 1 | 0.866 | 31 | | UCLUST | 7 Viruses | 0.53 | 0:34.09 | 34.0 | 77.33984375 | 92.527 | 37.824 | 0.840 | 55 | 1.636 | 1 | 1.818 | 1 | 0.595 | 32 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 33 | | MeShClust | 7 Viruses | 0.57 | 6:15.50 | 1259.3 | 82.109375 | 
91.206 | 37.864 | 0.824 | 47 | 2.043 | 1 | 8.511 | 1 | 0.687 | 34 | | UCLUST | 7 Viruses | 0.57 | 0:35.55 | 35.5 | 77.85546875 | 96.024 | 38.651 | 0.905 | 65 | 1.385 | 1 | 3.077 | 1 | 0.575 | 35 | |-----------+-----------+------+----------+--------+-------------+--------+--------+------------+----------+----------+-------------+----------------------+--------+-------| 36 | | Template | 7 Viruses | - | - | - | - | 59.004 | 39.764 | 0.274 | - | - | - | - | 1 | 1 | 37 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # CXX = /usr/bin/c++ 2 | CXX ?= g++ 3 | 4 | CXXFLAGS = -O3 -g -fmessage-length=0 -Wall -march=native -std=c++11 5 | 6 | # 7 | # Objects 8 | # 9 | 10 | ORed = ../bin/Red.o 11 | 12 | # Exception 13 | OInvalidInputException = ../bin/exception/InvalidInputException.o 14 | OInvalidStateException = ../bin/exception/InvalidStateException.o 15 | OFileDoesNotExistException = ../bin/exception/FileDoesNotExistException.o 16 | OInvalidOrderOfOperationsException = ../bin/exception/InvalidOrderOfOperationsException.o 17 | OInvalidScoreException = ../bin/exception/InvalidScoreException.o 18 | OInvalidOperationException = ../bin/exception/InvalidOperationException.o 19 | 20 | # Utility 21 | OUtil = ../bin/utility/Util.o 22 | OLocation = ../bin/utility/Location.o 23 | OEmptyLocation = ../bin/utility/EmptyLocation.o 24 | OLCSLen = ../bin/utility/LCSLen.o 25 | OAffineId = ../bin/utility/AffineId.o 26 | OGlobAlignE = ../bin/utility/GlobAlignE.o 27 | 28 | # Non TR 29 | OChromosome = ../bin/nonltr/Chromosome.o 30 | OChromosomeOneDigit = ../bin/nonltr/ChromosomeOneDigit.o 31 | OChromosomeRandom = ../bin/nonltr/ChromosomeRandom.o 32 | OChromListMaker = ../bin/nonltr/ChromListMaker.o 33 | OTableBuilder = ../bin/nonltr/TableBuilder.o 34 | OScorer = ../bin/nonltr/Scorer.o 35 | ODetectorMaxima = ../bin/nonltr/DetectorMaxima.o 36 | OChromDetectorMaxima 
= ../bin/nonltr/ChromDetectorMaxima.o 37 | OHMM = ../bin/nonltr/HMM.o 38 | OScanner = ../bin/nonltr/Scanner.o 39 | OTrainer = ../bin/nonltr/Trainer.o 40 | OLocationList = ../bin/nonltr/LocationList.o 41 | OLocationListCollection = ../bin/nonltr/LocationListCollection.o 42 | 43 | OBJS = $(ORed) $(OInvalidInputException) $(OInvalidStateException) $(OFileDoesNotExistException) $(OInvalidOrderOfOperationsException) $(OInvalidOperationException) $(OInvalidScoreException) $(OUtil) $(OLocation) $(OEmptyLocation) $(OChromosome) $(OChromosomeOneDigit) $(OChromosomeRandom) $(OChromListMaker) $(OTableBuilder) $(OScorer) $(ODetectorMaxima) $(OChromDetector) $(OChromDetectorMaxima) $(OHMM) $(OScanner) $(OTrainer) $(OLocationList) $(OLocationListCollection) $(OLCSLen) $(OAffineId) $(OGlobAlignE) 44 | 45 | # 46 | # Target 47 | # 48 | 49 | TRed = ../bin/Red 50 | 51 | # 52 | # Make RepeatsDetector 53 | # 54 | 55 | $(TRed): $(OBJS) 56 | $(CXX) -o $(TRed) $(OBJS) 57 | 58 | # 59 | # RepeatsDetector 60 | # 61 | 62 | $(ORed): RepeatsDetector.cpp nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/TableBuilder.h nonltr/HMM.h nonltr/Scanner.h nonltr/Trainer.h utility/Util.h 63 | $(CXX) $(CXXFLAGS) -c RepeatsDetector.cpp -o $(ORed) 64 | 65 | # 66 | # Exception 67 | # 68 | $(OInvalidInputException): exception/InvalidInputException.cpp exception/InvalidInputException.h 69 | $(CXX) $(CXXFLAGS) -c exception/InvalidInputException.cpp -o $(OInvalidInputException) 70 | 71 | $(OInvalidStateException): exception/InvalidStateException.cpp exception/InvalidStateException.h 72 | $(CXX) $(CXXFLAGS) -c exception/InvalidStateException.cpp -o $(OInvalidStateException) 73 | 74 | $(OFileDoesNotExistException): exception/FileDoesNotExistException.cpp exception/FileDoesNotExistException.h 75 | $(CXX) $(CXXFLAGS) -c exception/FileDoesNotExistException.cpp -o $(OFileDoesNotExistException) 76 | 77 | $(OInvalidOrderOfOperationsException): exception/InvalidOrderOfOperationsException.cpp 
exception/InvalidOrderOfOperationsException.h 78 | $(CXX) $(CXXFLAGS) -c exception/InvalidOrderOfOperationsException.cpp -o $(OInvalidOrderOfOperationsException) 79 | 80 | $(OInvalidScoreException): exception/InvalidScoreException.cpp exception/InvalidScoreException.h 81 | $(CXX) $(CXXFLAGS) -c exception/InvalidScoreException.cpp -o $(OInvalidScoreException) 82 | 83 | $(OInvalidOperationException): exception/InvalidOperationException.cpp exception/InvalidOperationException.h 84 | $(CXX) $(CXXFLAGS) -c exception/InvalidOperationException.cpp -o $(OInvalidOperationException) 85 | 86 | # 87 | # Utility 88 | # 89 | 90 | $(OUtil): utility/Util.cpp utility/Util.h utility/Location.h exception/FileDoesNotExistException.h 91 | $(CXX) $(CXXFLAGS) -c utility/Util.cpp -o $(OUtil) 92 | 93 | $(OLocation): utility/Location.cpp utility/Location.h utility/ILocation.h exception/InvalidInputException.h utility/Util.h 94 | $(CXX) $(CXXFLAGS) -c utility/Location.cpp -o $(OLocation) 95 | 96 | $(OEmptyLocation): utility/EmptyLocation.cpp utility/EmptyLocation.h utility/ILocation.h exception/InvalidOperationException.h 97 | $(CXX) $(CXXFLAGS) -c utility/EmptyLocation.cpp -o $(OEmptyLocation) 98 | 99 | $(OLCSLen): utility/LCSLen.cpp utility/LCSLen.h 100 | $(CXX) $(CXXFLAGS) -c utility/LCSLen.cpp -o $(OLCSLen) 101 | 102 | $(OAffineId): utility/AffineId.cpp utility/AffineId.h 103 | $(CXX) $(CXXFLAGS) -c utility/AffineId.cpp -o $(OAffineId) 104 | 105 | $(OGlobAlignE): utility/GlobAlignE.cpp utility/GlobAlignE.h 106 | $(CXX) $(CXXFLAGS) -c utility/GlobAlignE.cpp -o $(OGlobAlignE) 107 | # 108 | # Non LTR 109 | # 110 | 111 | $(OChromosome): nonltr/Chromosome.cpp nonltr/Chromosome.h nonltr/IChromosome.h utility/Util.h exception/InvalidInputException.h exception/InvalidOperationException.h 112 | $(CXX) $(CXXFLAGS) -c nonltr/Chromosome.cpp -o $(OChromosome) 113 | 114 | $(OChromosomeOneDigit): nonltr/ChromosomeOneDigit.cpp nonltr/ChromosomeOneDigit.h nonltr/Chromosome.h 
exception/InvalidInputException.h 115 | $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeOneDigit.cpp -o $(OChromosomeOneDigit) 116 | 117 | $(OChromosomeRandom): nonltr/ChromosomeRandom.cpp nonltr/ChromosomeRandom.h nonltr/IChromosome.h exception/InvalidInputException.h exception/InvalidStateException.h utility/Util.h 118 | $(CXX) $(CXXFLAGS) -c nonltr/ChromosomeRandom.cpp -o $(OChromosomeRandom) 119 | 120 | $(OTableBuilder): nonltr/TableBuilder.cpp nonltr/TableBuilder.h utility/Util.h nonltr/ChromosomeOneDigit.h nonltr/ITableView.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/EnrichmentMarkovView.h nonltr/EnrichmentMarkovView.cpp exception/InvalidStateException.h nonltr/ChromListMaker.h nonltr/IChromosome.h 121 | $(CXX) $(CXXFLAGS) -c nonltr/TableBuilder.cpp -o $(OTableBuilder) 122 | 123 | $(OScorer): nonltr/Scorer.cpp nonltr/Scorer.h nonltr/ChromosomeOneDigit.h utility/Util.h exception/InvalidStateException.h 124 | $(CXX) $(CXXFLAGS) -c nonltr/Scorer.cpp -o $(OScorer) 125 | 126 | $(ODetectorMaxima): nonltr/DetectorMaxima.cpp nonltr/DetectorMaxima.h utility/ILocation.h exception/InvalidStateException.h 127 | $(CXX) $(CXXFLAGS) -c nonltr/DetectorMaxima.cpp -o $(ODetectorMaxima) 128 | 129 | $(OChromDetectorMaxima): nonltr/ChromDetectorMaxima.cpp nonltr/ChromDetectorMaxima.h nonltr/DetectorMaxima.h nonltr/ChromosomeOneDigit.h utility/Util.h utility/ILocation.h utility/Location.h 130 | $(CXX) $(CXXFLAGS) -c nonltr/ChromDetectorMaxima.cpp -o $(OChromDetectorMaxima) 131 | 132 | $(OHMM): nonltr/HMM.cpp nonltr/HMM.h utility/ILocation.h exception/InvalidStateException.h exception/InvalidInputException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h 133 | $(CXX) $(CXXFLAGS) -c nonltr/HMM.cpp -o $(OHMM) 134 | 135 | $(OScanner): nonltr/Scanner.cpp nonltr/Scanner.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h nonltr/HMM.h nonltr/ITableView.h nonltr/Scorer.h utility/Util.h utility/ILocation.h exception/InvalidInputException.h 
exception/InvalidStateException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h 136 | $(CXX) $(CXXFLAGS) -c nonltr/Scanner.cpp -o $(OScanner) 137 | 138 | $(OTrainer): nonltr/Trainer.cpp nonltr/Trainer.h nonltr/TableBuilder.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/HMM.h nonltr/ChromDetectorMaxima.h nonltr/Scorer.h nonltr/ChromListMaker.h utility/Util.h nonltr/LocationListCollection.h 139 | $(CXX) $(CXXFLAGS) -c nonltr/Trainer.cpp -o $(OTrainer) 140 | 141 | $(OChromListMaker): nonltr/ChromListMaker.cpp nonltr/ChromListMaker.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h utility/Util.h 142 | $(CXX) $(CXXFLAGS) -c nonltr/ChromListMaker.cpp -o $(OChromListMaker) 143 | 144 | $(OCluster): nonltr/Cluster.cpp nonltr/Cluster.h utility/Util.h exception/InvalidStateException.h exception/InvalidInputException.h 145 | $(CXX) $(CXXFLAGS) -c nonltr/Cluster.cpp -o $(OCluster) 146 | 147 | $(OLocationList): nonltr/LocationList.cpp nonltr/LocationList.h utility/ILocation.h utility/Location.h exception/InvalidStateException.h 148 | $(CXX) $(CXXFLAGS) -c nonltr/LocationList.cpp -o $(OLocationList) 149 | 150 | $(OLocationListCollection): nonltr/LocationListCollection.cpp nonltr/LocationListCollection.h utility/Location.h exception/InvalidStateException.h 151 | $(CXX) $(CXXFLAGS) -c nonltr/LocationListCollection.cpp -o $(OLocationListCollection) 152 | 153 | 154 | # 155 | # Make Red (convenience alias for the linked binary $(TRed)) 156 | # 157 | 158 | red: $(TRed) 159 | 160 | # 161 | # Make binary directories (-p: succeed when a directory already exists) 162 | # 163 | 164 | bin: 165 | mkdir -p ../bin 166 | mkdir -p ../bin/exception 167 | mkdir -p ../bin/utility 168 | mkdir -p ../bin/nonltr 169 | 170 | # 171 | # Make clean 172 | # 173 | 174 | clean: 175 | rm -f ../bin/*.o ../bin/exception/*.o ../bin/ms/*.o ../bin/nonltr/*.o ../bin/test/*.o ../bin/utility/*.o ../bin/tr/*.o *.o $(TRed) 176 | -------------------------------------------------------------------------------- /src/cluster/Makefile:
-------------------------------------------------------------------------------- 1 | TARGET ?= meshclust 2 | VERSION ?= 1.2.0 3 | CXX ?= g++ 4 | ifeq ($(debug),yes) 5 | CXXFLAGS += -ggdb -fopenmp 6 | else 7 | CXXFLAGS += -fopenmp -O3 -march=native -g 8 | endif 9 | CXXFLAGS += -std=c++11 -DVERSION=\"$(VERSION)\" 10 | LDFLAGS += -lm 11 | 12 | SOURCES := $(shell find ./src -name '*.cpp') 13 | OBJECTS = $(SOURCES:%.cpp=bin/%.o) 14 | BIN_OBJECTS := $(shell find ../../bin/ -mindepth 2 -name '*.o') 15 | 16 | all: clean $(TARGET) 17 | 18 | $(TARGET): $(OBJECTS) $(BIN_OBJECTS) 19 | $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) 20 | 21 | bin/%.o: %.cpp 22 | mkdir -p $(@D) 23 | $(CXX) $(CXXFLAGS) -c $< -o $@ 24 | 25 | clean: 26 | $(RM) $(OBJECTS) $(TARGET) 27 | 28 | install: $(TARGET) 29 | cp $(TARGET) ~/bin 30 | 31 | .PHONY: all clean install 32 | -------------------------------------------------------------------------------- /src/cluster/src/Center.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Center.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef CENTER_H 8 | #define CENTER_H 9 | 10 | #include "Point.h" 11 | 12 | template 13 | struct Center { 14 | Center(Point* c, const vector*> &pts) : center(c->clone()), points(pts), is_to_delete(false) { 15 | } 16 | Center(const Center &cc) : center(cc.center->clone()), points(cc.points), is_to_delete(cc.is_to_delete) {} 17 | 18 | // Center(const Center& c) { 19 | // center = c.get_clone(); 20 | // points = c.getPoints_c(); 21 | // is_to_delete = c.is_delete(); 22 | // } 23 | ~Center() { if (is_to_delete) { delete center; }} 24 | void setCenter(Point* c) { 25 | delete center; 26 | center = c->clone(); 27 | } 28 | Point* getCenter() { return center; } 29 | vector*> &getPoints() { return points; } 30 | 31 | const vector*> &getPoints_c() const { return points; }; 32 | bool is_delete() const { return is_to_delete; } 33 | void lazy_remove() { is_to_delete = true; } 34 | 
size_t size() const { return points.size(); } 35 | bool empty() const { return points.empty(); } 36 | Point* get_clone() const { 37 | return center->clone(); 38 | } 39 | Point *center; 40 | vector*> points; 41 | bool is_to_delete; 42 | }; 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/cluster/src/ClusterFactory.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * ClusterFactory.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef CLUSTERFACTORY_H 9 | #define CLUSTERFACTORY_H 10 | 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "../../nonltr/ChromosomeOneDigit.h" 17 | #include "../../nonltr/KmerHashTable.h" 18 | #include "Point.h" 19 | #include "Trainer.h" 20 | #include "bvec.h" 21 | 22 | template 23 | class ClusterFactory { 24 | public: 25 | ClusterFactory(int k_len, int npp=std::numeric_limits::max()) : k(k_len), num_per_partition(npp) {} 26 | std::vector*> build_points(vector files, std::function*(ChromosomeOneDigit*)> get_point); 27 | Point* get_histogram(ChromosomeOneDigit *chrom); 28 | Point* get_divergence_point(ChromosomeOneDigit *chrom); 29 | T find_h(const std::vector*> ¢ers) const; 30 | void sort_nn(std::vector*> &points, Point* nearest_to=NULL, int arg=3) const; 31 | void MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta); 32 | private: 33 | vector lookup_table; 34 | vector*> m_centers; 35 | const int num_per_partition; 36 | int k; 37 | //void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values); 38 | }; 39 | 40 | template 41 | void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) 42 | { 43 | const int k = table.getK(); 44 | auto segment = chrom->getSegment(); 45 | const char *seg_bases = chrom->getBase()->c_str(); 46 | for (vector *v : *segment) { 47 | int start = v->at(0); 48 | int end = 
v->at(1); 49 | table.wholesaleIncrement(seg_bases, start, end - k + 1); 50 | } 51 | unsigned long tableSize = table.getMaxTableSize(); 52 | values.reserve(values.size() + tableSize); 53 | const V * valueArray = table.getValues(); 54 | std::copy(&valueArray[0], &valueArray[tableSize], std::back_inserter(values)); 55 | } 56 | 57 | #ifdef HEADER_HACK 58 | #ifndef CLUSTERFACTORY_C 59 | #define CLUSTERFACTORY_C 60 | #include "ClusterFactory.cpp" 61 | #endif 62 | #endif 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /src/cluster/src/DivergencePoint.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * DivergencePoint.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "DivergencePoint.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | template 15 | double DivergencePoint::prob_under(Point &p) const 16 | { 17 | const DivergencePoint& c = dynamic_cast&>(p); 18 | double sum = 0; 19 | const size_t s = points.size(); 20 | double total = 0; 21 | std::feclearexcept(FE_OVERFLOW); 22 | std::feclearexcept(FE_UNDERFLOW); 23 | for (int i = 0; i < s; i++) { 24 | sum += c.points[i]; 25 | if (i % 4 == 3) { 26 | for (int j = i - 3; j <= i; j++) { 27 | double prob = c.points[j] / sum; 28 | double log_prob = log(prob); 29 | total += (points[j] - 1) * log_prob; 30 | if ((bool)std::fetestexcept(FE_UNDERFLOW)) { 31 | cout << "Underflow!" 
<< endl; 32 | } 33 | // cond.push_back(log(prob)/log4); 34 | } 35 | sum = 0; 36 | } 37 | } 38 | // for (size_t q = 0; q < s; q += 4) { 39 | // double sum = 0; 40 | // for (int i = q; i < q + 4; i++) { 41 | // sum += c.points[i]; 42 | // } 43 | // for (int i = q; i < q + 4; i++) { 44 | // double prob = c.points[i] / sum; 45 | // double log_prob = log(prob); 46 | // total += (points[i] - 1) * log_prob; 47 | // } 48 | // } 49 | return exp(total / s); 50 | } 51 | 52 | 53 | template 54 | double DivergencePoint::distance_d(Point& p) const 55 | { 56 | const DivergencePoint& c = dynamic_cast&>(p); 57 | uint64_t dist = 0; 58 | uint64_t mag = 0; 59 | for (auto i = 0; i < points.size(); i++) { 60 | dist += 2 * min(points[i],(T)c.points[i]); 61 | mag += points[i] + c.points[i]; 62 | } 63 | double frac = (double)dist / mag; 64 | return 10000.0 * (1.0 - frac * frac); 65 | } 66 | 67 | /* Distance between this point and p: 10000 * (1 - f*f), where f = 2 * sum_i min(points[i], c.points[i]) / (getPseudoMagnitude() + c.getPseudoMagnitude()). The double result is converted to uint64_t on return. p must be a DivergencePoint, otherwise the dynamic_cast to a reference throws std::bad_cast. */ 68 | template 69 | uint64_t DivergencePoint::distance(const Point& p) const 70 | { 71 | const DivergencePoint& c = dynamic_cast&>(p); 72 | uint64_t dist = 0; 73 | const uint64_t mag = getPseudoMagnitude() + c.getPseudoMagnitude(); /* dist is carried from one iteration to the next, so it has to be declared as a reduction for the simd loop below to be a conforming OpenMP loop */ 74 | #pragma omp simd reduction(+:dist) 75 | for (auto i = 0; i < points.size(); i++) { 76 | dist += min(points[i], c.points[i]); 77 | } 78 | dist *= 2; 79 | double frac = (double)dist / mag; 80 | return 10000.0 * (1.0 - frac * frac); 81 | } 82 | 83 | template 84 | double DivergencePoint::distance_k1(const Point &p) const 85 | { 86 | uint64_t dist = 0; 87 | 88 | auto a = Point::get_1mers(), b = p.get_1mers(); 89 | uint64_t mag = 0; 90 | for (auto i = 0; i < 4; i++) { 91 | dist += std::min(a[i], b[i]); 92 | mag += a[i]; 93 | } 94 | return (double)dist / (double)mag; 95 | 96 | } 97 | template 98 | DivergencePoint::DivergencePoint(const std::vector& pts, uint64_t len) 99 | { 100 | mag = 0; 101 | for (unsigned int i = 0; i < pts.size(); i++) { 102 | points.push_back(pts.at(i)); 103 | mag += pts.at(i); 104 | } 105 | // display(); 106 | nucl_length = len; 107 | to_delete = false; 108 | id = 0;
109 | } 110 | 111 | /* Builds a point with `size` zero counts. */ 112 | template 113 | DivergencePoint::DivergencePoint(unsigned int size) 114 | { mag = 0; /* every count is zero, so the cached sum of counts is zero as well; it was previously left uninitialized although getPseudoMagnitude() returns it */ 115 | for (unsigned int i = 0; i < size; i++) { 116 | points.push_back(0); 117 | } 118 | to_delete = false; 119 | nucl_length = 0; 120 | id = 0; 121 | } 122 | 123 | template 124 | void DivergencePoint::operator*=(double d) 125 | { 126 | unsigned int size = points.size(); 127 | for (auto& pt : points) { 128 | pt *= d; 129 | } 130 | } 131 | 132 | template 133 | bool DivergencePoint::operator<(Point& p) const 134 | { 135 | const DivergencePoint& h = dynamic_cast&>(p); 136 | unsigned int size = std::min(points.size(),h.points.size()); 137 | /*int boundary = 0; 138 | for (unsigned int i = 0; i < size; i++) { 139 | if (points.at(i) > h.points.at(i)) { 140 | boundary++; 141 | } else if (points.at(i) < h.points.at(i)) { 142 | boundary--; 143 | } 144 | } 145 | return boundary < 0;*/ 146 | for (unsigned int i = 0; i < size; i++) { 147 | if (points.at(i) >= h.points.at(i)) { 148 | return false; 149 | } 150 | } 151 | return true; 152 | } 153 | 154 | template 155 | void DivergencePoint::operator/=(double d) 156 | { 157 | unsigned int size = points.size(); 158 | for (unsigned int i = 0; i < size; i++) { 159 | points[i] /= d; 160 | } 161 | // cout << endl; 162 | } 163 | 164 | template 165 | void DivergencePoint::operator+=(Point& p) 166 | { 167 | const DivergencePoint& h = dynamic_cast&>(p); 168 | unsigned int size = std::min(points.size(),h.points.size()); 169 | for (unsigned int i = 0; i < size; i++) { 170 | points.at(i) += h.points.at(i); 171 | } 172 | } 173 | 174 | template 175 | uint64_t DivergencePoint::operator-(const Point& p) const 176 | { 177 | return distance(p); 178 | } 179 | 180 | template 181 | void DivergencePoint::set(Point& p) 182 | { 183 | const DivergencePoint& h = dynamic_cast&>(p); 184 | 185 | points = h.points; mag = h.mag; /* keep the cached magnitude in sync with the copied counts; getPseudoMagnitude() returns it and distance() divides by it */ 186 | // points[0] = std::numeric_limits::max(); 187 | set_length(h.get_length()); 188 | to_delete = h.to_delete; 189 | cout << "old header: " <<
h.get_header() << endl; 190 | Point::set_header(h.get_header()); 191 | cout << "new header: " << Point::get_header() << endl; 192 | set_id(h.get_id()); 193 | } 194 | 195 | template 196 | void DivergencePoint::display() const 197 | { 198 | unsigned size = points.size(); 199 | for (unsigned i = 0; i < size; i++) { 200 | std::cout << points.at(i) << " "; 201 | } 202 | std::cout << std::endl; 203 | } 204 | 205 | template 206 | void DivergencePoint::zero() 207 | { 208 | for (auto &i : points) { 209 | i = 0; 210 | } 211 | } 212 | 213 | template 214 | void DivergencePoint::addOne() 215 | { 216 | for (auto& a : points) { 217 | a++; 218 | } 219 | } 220 | 221 | template 222 | void DivergencePoint::subOne() 223 | { 224 | for (auto& a : points) { 225 | a--; 226 | } 227 | } 228 | 229 | /* 230 | * p(y|x) = cond_p 231 | * q(y|x) = cond_p 232 | */ 233 | template 234 | double DivergencePoint::divergence(Point& p) const 235 | { 236 | const DivergencePoint& d = dynamic_cast&>(p); 237 | T sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides 238 | double total_sum_p = 0, total_sum_q = 0; // Total running sum of all nucleotides 239 | double outer_sum_p = 0, outer_sum_q = 0; // Prior K-mer sum 240 | for (int i = 0; i < points.size(); i++) { // Compute divergence for P and Q simultaneously 241 | sum4_p += points[i]; 242 | sum4_q += d.points[i]; 243 | if (i % 4 == 3) { //finished counting word, now compute probabilities 244 | double inner_sum_p = 0; // Sum of p(X|Y) * log(p(X|Y) / q(X|Y)) 245 | double inner_sum_q = 0; // Sum of q(X|Y) * log(q(X|Y) / p(X|Y)) 246 | for (int j = i - 3; j <= i; j++) { 247 | double conditional_p = points[j] / sum4_p; 248 | double conditional_q = d.points[j] / sum4_q; 249 | double lg = log(conditional_p) - log(conditional_q); 250 | inner_sum_p += conditional_p * lg; 251 | inner_sum_q += -1 * conditional_q * lg; 252 | } 253 | outer_sum_p += sum4_p * inner_sum_p; 254 | outer_sum_q += sum4_q * inner_sum_q; 255 | 256 | total_sum_p += sum4_p; 257 | total_sum_q += 
sum4_q; 258 | sum4_p = 0; 259 | sum4_q = 0; 260 | } 261 | } 262 | double left = outer_sum_p / total_sum_p; 263 | double right = outer_sum_q / total_sum_q; 264 | return (left + right) / 2.0; 265 | } 266 | 267 | template 268 | uint64_t DivergencePoint::getPseudoMagnitude() const 269 | { 270 | return mag; 271 | } 272 | 273 | 274 | template 275 | uint64_t DivergencePoint::getRealMagnitude() const 276 | { 277 | return mag - points.size(); 278 | } 279 | 280 | #ifndef HEADER_HACK 281 | template class DivergencePoint; 282 | template class DivergencePoint; 283 | template class DivergencePoint; 284 | template class DivergencePoint; 285 | template class DivergencePoint; 286 | template class DivergencePoint; 287 | #endif 288 | -------------------------------------------------------------------------------- /src/cluster/src/DivergencePoint.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * DivergencePoint.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef DIVERGENCE_POINT_H 8 | #define DIVERGENCE_POINT_H 9 | #include "Point.h" 10 | #include 11 | template 12 | class DivergencePoint : public Point { 13 | public: 14 | DivergencePoint(const std::vector& pts, uint64_t len); 15 | DivergencePoint(unsigned int size); 16 | ~DivergencePoint() {} 17 | void operator*=(double d); 18 | void operator/=(double d); 19 | uint64_t operator-(const Point& p) const; 20 | bool operator<(Point& p) const; 21 | void operator+=(Point& p); 22 | 23 | bool equals(Point& p) { 24 | DivergencePoint& c = dynamic_cast< DivergencePoint&>(p); 25 | return p.get_id() == get_id() && c.points == points; 26 | } 27 | void set(Point& p); 28 | void display() const; 29 | void zero(); 30 | void addOne(); 31 | void subOne(); 32 | double prob_under(Point& p) const; 33 | uint64_t getRealMagnitude() const; 34 | uint64_t getPseudoMagnitude() const; 35 | // T magnitude() const { return getRealMagnitude(); }; 36 | double distance_k1(const Point& p) const; 37 | 
DivergencePoint* clone() const { 38 | auto d = new DivergencePoint(points, to_delete); 39 | d->set_header(Point::get_header()); 40 | d->set_id(get_id()); 41 | d->set_length(get_length()); 42 | return d; 43 | } 44 | DivergencePoint* create() const { 45 | return new DivergencePoint(points.size()); 46 | } 47 | Point* create_double() const { 48 | vector v; 49 | for (auto val : points) { 50 | v.push_back(val); 51 | } 52 | return new DivergencePoint(v, nucl_length); 53 | } 54 | void set_arg_to_this_d(Point& p) const { 55 | DivergencePoint& c = dynamic_cast< DivergencePoint&>(p); 56 | for (int i = 0; i < points.size(); i++) { 57 | c.points[i] = points[i]; 58 | } 59 | c.set_id(id); 60 | }; 61 | 62 | 63 | bool is_to_delete() const { 64 | return to_delete; 65 | } 66 | void set_to_delete(bool b) { 67 | to_delete = b; 68 | } 69 | double divergence(Point& p) const; 70 | double distance_d(Point& p) const; 71 | uint64_t distance(const Point& p) const; 72 | const vector& get_data() const { return points; } 73 | void set_id(int c_id) { id = c_id; }; 74 | const int get_id() const { return id; }; 75 | 76 | void set_length(unsigned long len) { nucl_length = len; }; 77 | unsigned long get_length() const { return nucl_length; }; 78 | unsigned long size() const { return points.size(); }; 79 | std::vector points; 80 | 81 | private: 82 | uintmax_t mag; 83 | bool to_delete; 84 | uint64_t id; 85 | uint64_t nucl_length; 86 | }; 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /src/cluster/src/Feature.h: -------------------------------------------------------------------------------- 1 | #ifndef FEATURES_H 2 | #define FEATURES_H 3 | 4 | #include "SingleFeature.h" 5 | #include 6 | #include "LogTable.h" 7 | #include 8 | 9 | #define FEAT_ALIGN (1 << 0) 10 | #define FEAT_LD (1 << 1) 11 | #define FEAT_MANHATTAN (1 << 2) 12 | #define FEAT_SQCHORD (1 << 3) 13 | #define FEAT_INTERSECTION (1 << 4) 14 | #define FEAT_PEARSON (1 << 5) 15 | #define 
FEAT_SIMRATIO (1 << 6) 16 | #define FEAT_N2RRC (1 << 7) 17 | #define FEAT_JENSONSHANNON (1 << 8) 18 | #define FEAT_RREE_K_R (1 << 9) 19 | #define FEAT_KULCZYNSKI2 (1 << 10) 20 | 21 | #define COMBO_SQUARED 1 22 | #define COMBO_SELF 2 23 | 24 | 25 | /* 26 | * Usage: 27 | * add_feature(FEAT_LD | FEAT_INTERSECTION, COMBO_SELF); 28 | * add_feature(FEAT_LD | FEAT_JENSONSHANNON, COMBO_SELF); 29 | * 30 | * normalize(some_pairs_to_normalize) 31 | * normalize(more_pairs_to_normalize) 32 | * finalize() 33 | * 34 | * add_feature(....); 35 | * 36 | * normalize(some_pairs_to_normalize) 37 | * normalize(more_pairs_to_normalize) 38 | * finalize() 39 | * 40 | * compute(p,q) 41 | * for (size_t i = 0; i < feature.size(); i++) { 42 | * cout << feature[i] << endl; 43 | * } 44 | */ 45 | #include "LogTable.h" 46 | template 47 | class Feature { 48 | public: 49 | Feature(const uint64_t N, const double* tbl_, double coeff_) : tbl(tbl_) { 50 | flags = 0; 51 | coeff = coeff_; 52 | // tbl = new LogTable(1000000, 2); 53 | } 54 | void add_feature(uint16_t f_flags, int combo=COMBO_SELF); 55 | 56 | void finalize(); 57 | void remove_feature() { 58 | auto indices_to_rm = combos.back().second; 59 | combos.pop_back(); 60 | throw "not implemented"; 61 | 62 | } 63 | void normalize(const vector*,Point*> > &pairs); 64 | vector compute(Point& p, Point& q) { 65 | vector cache = compute_all_raw(p, q); 66 | normalize_cache(cache); 67 | return cache; 68 | }; 69 | double operator()(int col, const vector& cache) const { 70 | auto pr = combos.at(col); 71 | int combo = pr.first; 72 | auto indices = pr.second; 73 | if (combo == COMBO_SELF) { 74 | double prod = 1; 75 | for (auto idx : indices) { 76 | prod *= cache[idx]; 77 | } 78 | return prod; 79 | } else if (combo == COMBO_SQUARED) { 80 | double prod = 1; 81 | for (auto idx : indices) { 82 | prod *= cache[idx] * cache[idx]; 83 | } 84 | return prod; 85 | } else { 86 | throw "invalid combo"; 87 | } 88 | } 89 | size_t size() const { return combos.size(); } 90 | void 
print_bounds() const { 91 | for (size_t i = 0; i < lookup.size(); i++) { 92 | cout << "bounds[" << i << "]: " << mins[i] << " to " << maxs[i] << endl; 93 | } 94 | } 95 | 96 | static double manhattan(Point& p, Point& q); 97 | static double length_difference(Point& p, Point& q); 98 | static double n2rrc(Point& p, Point& q, const vector&, const vector &); 99 | static double rree_k_r(Point& p, Point& q); 100 | static double intersection(Point& p, Point& q); 101 | double jenson_shannon(Point& p, Point& q) const; 102 | static double pearson(Point& p, Point& q); 103 | static double simratio(Point& a, Point& b); 104 | static double squaredchord(Point& a, Point& b); 105 | static double kulczynski2(Point& a, Point& b); 106 | static double align(Point& a, Point& b, std::map, double> &atable); 107 | private: 108 | vector compute_all_raw(Point& p, Point& q); 109 | void normalize_cache(vector& cache) const; 110 | 111 | bool feat_is_sim(uint16_t single_flag) const; 112 | double raw(uint16_t single_flag, Point& a, Point& b); 113 | int index_of(uint16_t single_flag) const { 114 | for (size_t i = 0; i < lookup.size(); i++) { 115 | if (lookup[i] == single_flag) { 116 | return i; 117 | } 118 | } 119 | return -1; 120 | } 121 | 122 | uint16_t flags; 123 | std::vector 125 | > > combos; 126 | std::vector mins, maxs; 127 | std::vector is_sims, is_finalized; 128 | std::vector lookup; 129 | const double* tbl; 130 | double coeff; 131 | 132 | 133 | std::map, double> atable; 134 | }; 135 | 136 | // template 137 | // class Feature { 138 | // public: 139 | // Feature(std::function)> combination, std::vector > sf) 140 | // : features(sf), combo(combination) {} 141 | // double operator()(Point*, Point*) const; 142 | 143 | 144 | // static double manhattan(Point& p, Point& q); 145 | // static double length_difference(Point& p, Point& q); 146 | // static double n2rrc(Point& p, Point& q, const vector&, const vector &); 147 | // static double rree_k_r(Point& p, Point& q); 148 | // static double 
intersection(Point& p, Point& q); 149 | // static double jenson_shannon(Point& p, Point& q); 150 | // static double pearson(Point& p, Point& q); 151 | // static double simratio(Point& a, Point& b); 152 | // static double squaredchord(Point& a, Point& b); 153 | // private: 154 | // vector > features; 155 | // std::function)> combo; 156 | // }; 157 | #endif 158 | -------------------------------------------------------------------------------- /src/cluster/src/GLM.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * glm.cpp 3 | * 4 | * Created on: May 29, 2017 5 | * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa 6 | * 7 | */ 8 | 9 | #include "GLM.h" 10 | #include "Matrix.h" 11 | 12 | #include 13 | #include 14 | using namespace std; 15 | // using namespace matrix; 16 | 17 | namespace matrix{ 18 | 19 | void GLM::train(Matrix& features, Matrix& labels){ 20 | weights = features.transpose() * features; 21 | weights = weights.pseudoInverse() * features.transpose() * labels; 22 | } 23 | 24 | Matrix GLM::predict(Matrix& features) const { 25 | Matrix labels; 26 | labels = features * weights; 27 | double log; 28 | for(int i = 0; i < labels.getNumRow(); i++){ 29 | log = round(1/(1 + exp(-(labels.get(i,0))))); 30 | labels.set(i,0, log); 31 | } 32 | return labels; 33 | } 34 | 35 | std::tuple GLM::accuracy(Matrix& oLabels, Matrix& pLabels){ 36 | int sum = 0; 37 | int negSum = 0; 38 | int negSame = 0; 39 | int posSum = 0; 40 | int posSame = 0; 41 | for(int i = 0; i < oLabels.getNumRow(); i++){ 42 | if(oLabels.get(i,0) == -1){ 43 | negSum++; 44 | if(oLabels.get(i,0) == pLabels.get(i, 0)){ 45 | sum++; 46 | negSame++; 47 | } 48 | }else{ 49 | posSum++; 50 | if(oLabels.get(i,0) == pLabels.get(i, 0)){ 51 | sum++; 52 | posSame++; 53 | } 54 | } 55 | } 56 | double acc = (((double)sum*100)/(oLabels.getNumRow())); 57 | double sens = (((double)posSame*100)/(posSum)); 58 | double spec = 
(((double)negSame*100)/(negSum)); 59 | cout << "Accuracy: " << acc << "% "; 60 | cout << "Sensitivity: " << sens << "% "; 61 | cout << "Specificity: " << spec << "% " << endl; 62 | return make_tuple(acc, sens, spec); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/cluster/src/GLM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * glm.h 3 | * 4 | * Created on: May 29, 2017 5 | * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa 6 | * Modified by Benjamin James 7 | */ 8 | 9 | #ifndef SRC_MATRIX_GLM_H_ 10 | #define SRC_MATRIX_GLM_H_ 11 | 12 | #include "Matrix.h" 13 | #include 14 | namespace matrix { 15 | 16 | class GLM { 17 | private: 18 | Matrix weights; 19 | 20 | public: 21 | void train(matrix::Matrix& features, matrix::Matrix& labels); 22 | Matrix predict(matrix::Matrix& features) const; 23 | std::tuple accuracy(matrix::Matrix& oLabels, matrix::Matrix& pLabels); 24 | const Matrix& get_weights() const { return weights; }; 25 | }; 26 | 27 | } 28 | 29 | #endif /* SRC_MATRIX_GLM_H_ */ 30 | -------------------------------------------------------------------------------- /src/cluster/src/Histogram.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Histogram.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef HEADER_HACK 8 | #include "Histogram.h" 9 | #endif 10 | 11 | #include 12 | #include 13 | 14 | template 15 | double Histogram::distance_k1(const Point &p) const 16 | { 17 | throw "Not implemented"; 18 | const Histogram& h = dynamic_cast&>(p); 19 | uint64_t dist = 0; 20 | auto size = std::min(points.size(),h.points.size()); 21 | /* 22 | for (unsigned int i = 0; i < size; i++) { 23 | T l = points.at(i); 24 | T r = h.points.at(i); 25 | dist += (l > r) ? 
(l - r) : (r - l); 26 | } 27 | */ 28 | uint64_t avg_mag = (magnitude() + h.magnitude()) / 2.0; 29 | for (auto i = 0; i < size; i++) { 30 | T l = points[i]; 31 | T r = h.points[i]; 32 | dist += min(l, r); 33 | } 34 | return 1.0 - dist / avg_mag; 35 | } 36 | template 37 | Histogram::Histogram(std::vector pts, char mark) 38 | { 39 | for (T t : pts) { 40 | points.push_back(t); 41 | } 42 | to_delete = false; 43 | } 44 | template 45 | Histogram::Histogram(std::vector pts) 46 | { 47 | for (T t : pts) { 48 | points.push_back(t); 49 | } 50 | to_delete = false; 51 | } 52 | 53 | template 54 | Histogram::Histogram(std::vector pts, bool toDelete) 55 | { 56 | for (T t : pts) { 57 | points.push_back(t); 58 | } 59 | to_delete = toDelete; 60 | } 61 | 62 | template 63 | Histogram::Histogram(unsigned int size) 64 | { 65 | for (unsigned int i = 0; i < size; i++) { 66 | points.push_back(0); 67 | } 68 | to_delete = false; 69 | } 70 | 71 | template 72 | void Histogram::operator*=(double d) 73 | { 74 | for (T &t : points) { 75 | t *= d; 76 | } 77 | } 78 | 79 | template 80 | bool Histogram::operator<(Point& p) const 81 | { 82 | const Histogram& h = dynamic_cast&>(p); 83 | unsigned int size = std::min(points.size(),h.points.size()); 84 | for (unsigned int i = 0; i < size; i++) { 85 | if (points.at(i) >= h.points.at(i)) { 86 | return false; 87 | } 88 | } 89 | return true; 90 | } 91 | 92 | template 93 | void Histogram::operator/=(double d) 94 | { 95 | unsigned int size = points.size(); 96 | for (unsigned int i = 0; i < size; i++) { 97 | points.at(i) = points.at(i) / d; 98 | } 99 | } 100 | 101 | template 102 | void Histogram::operator+=(Point& p) 103 | { 104 | const Histogram& h = dynamic_cast&>(p); 105 | unsigned int size = std::min(points.size(),h.points.size()); 106 | for (unsigned int i = 0; i < size; i++) { 107 | points.at(i) += h.points.at(i); 108 | } 109 | } 110 | 111 | template 112 | uint64_t Histogram::operator-(const Point& p) const 113 | { 114 | return distance(p); 115 | } 116 | 117 
| template 118 | void Histogram::set(Point& p) 119 | { 120 | const Histogram& h = dynamic_cast&>(p); 121 | points = h.points; 122 | } 123 | 124 | template 125 | void Histogram::display() const 126 | { 127 | unsigned size = points.size(); 128 | for (unsigned i = 0; i < size; i++) { 129 | std::cout << points.at(i) << " "; 130 | } 131 | std::cout << std::endl; 132 | } 133 | 134 | template 135 | void Histogram::addOne() 136 | { 137 | for (auto &a : points) { 138 | a++; 139 | } 140 | } 141 | template 142 | void Histogram::subOne() 143 | { 144 | for (auto &a : points) { 145 | a--; 146 | } 147 | } 148 | 149 | template 150 | void Histogram::zero() 151 | { 152 | for (typename std::vector::iterator it = points.begin(); it != points.end(); ++it) { 153 | *it = 0; 154 | } 155 | } 156 | 157 | template 158 | uint64_t Histogram::distance(const Point& p) const 159 | { 160 | /* 161 | // Vectors should be the same width 162 | const Histogram& h = dynamic_cast&>(p); 163 | T dist = 0; 164 | unsigned int size = std::min(points.size(),h.points.size()); 165 | for (unsigned int i = 0; i < size; i++) { 166 | T l = points.at(i); 167 | T r = h.points.at(i); 168 | dist += (l > r) ? 
(l - r) : (r - l); 169 | } 170 | return dist; 171 | */ 172 | throw "Not implemented"; 173 | return 0; 174 | } 175 | 176 | template 177 | uint64_t Histogram::magnitude() const 178 | { 179 | uint64_t dist = 0; 180 | for (auto const& p : points) { 181 | dist += p; 182 | } 183 | return dist; 184 | } 185 | 186 | #ifndef HEADER_HACK 187 | template class Histogram; 188 | template class Histogram; 189 | template class Histogram; 190 | template class Histogram; 191 | template class Histogram; 192 | template class Histogram; 193 | #endif 194 | -------------------------------------------------------------------------------- /src/cluster/src/Histogram.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Histogram.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef HISTOGRAM_H 8 | #define HISTOGRAM_H 9 | #include 10 | #include "Point.h" 11 | 12 | template 13 | class Histogram : public Point { 14 | public: 15 | Histogram(std::vector pts); 16 | Histogram(std::vector pts, char marker); 17 | Histogram(std::vector pts, bool to_delete); 18 | Histogram(unsigned int size); 19 | ~Histogram() {} 20 | void operator*=(double d); 21 | void operator/=(double d); 22 | uint64_t operator-(const Point& p) const; 23 | bool operator<(Point& p) const; 24 | void operator+=(Point& p); 25 | void set(Point& p); 26 | void display() const; 27 | void zero(); 28 | void addOne(); 29 | void subOne(); 30 | double distance_k1(const Point& p) const; 31 | double prob_under(Point& p) const { return distance(p); }; 32 | uint64_t distance(const Point& p) const; 33 | uint64_t magnitude() const; 34 | uint64_t getRealMagnitude() const { return 0; }; 35 | double distance_d(Point& p) const { 36 | throw "not implemented"; 37 | return 0; 38 | } 39 | void set_arg_to_this_d(Point& p) const { 40 | throw "not implemented"; 41 | } 42 | Point* create_double() const { 43 | throw "not implemented"; 44 | return NULL; 45 | } 46 | Histogram* clone() const { 47 | 
return new Histogram(points, to_delete); 48 | } 49 | Histogram* create() const { 50 | return new Histogram(points.size()); 51 | } 52 | bool is_to_delete() const { 53 | return to_delete; 54 | } 55 | void set_to_delete(bool b) { 56 | to_delete = b; 57 | } 58 | const vector& get_data() const { return points; } 59 | void set_id(int c_id) { id = c_id; }; 60 | const int get_id() const { return id; }; 61 | void set_length(unsigned long len) { nucl_length = len; }; 62 | unsigned long get_length() const { return nucl_length; }; 63 | unsigned long size() const { return points.size(); }; 64 | private: 65 | std::vector points; 66 | bool to_delete; 67 | int id; 68 | unsigned long nucl_length; 69 | }; 70 | 71 | #ifdef HEADER_HACK 72 | #ifndef HISTOGRAM_C 73 | #define HISTORGRAM_C 74 | #include "Histogram.cpp" 75 | #endif 76 | #endif 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/cluster/src/LogTable.cpp: -------------------------------------------------------------------------------- 1 | #include "LogTable.h" 2 | 3 | #include 4 | #include 5 | 6 | LogTable::LogTable() : coeff(1000000 / 2) 7 | { 8 | uintmax_t size = 1000000; 9 | double imax = 2; 10 | // map = new double[size]; 11 | double lsize = log(size); 12 | for (uintmax_t i = 0; i < size; i++) { 13 | map[i] = log(imax * (i + 1)) - lsize; 14 | } 15 | std::cout << "dmax: " << coeff << std::endl; 16 | } 17 | LogTable::LogTable(uintmax_t size, double imax) : coeff(size / imax) 18 | { 19 | //map = new double[size]; 20 | double lsize = log(size); 21 | for (uintmax_t i = 0; i < size; i++) { 22 | map[i] = log(imax * (i + 1)) - lsize; 23 | } 24 | std::cout << "dmax: " << coeff << std::endl; 25 | } 26 | 27 | LogTable::~LogTable() 28 | { 29 | //delete[] map; 30 | } 31 | 32 | double LogTable::at(double d) const 33 | { 34 | size_t idx = d * coeff; 35 | return map[idx]; 36 | } 37 | double LogTable::operator[](double d) const 38 | { 39 | size_t index = d * coeff; 40 | return 
map[index]; 41 | } 42 | -------------------------------------------------------------------------------- /src/cluster/src/LogTable.h: -------------------------------------------------------------------------------- 1 | #ifndef LOGTABLE_H 2 | #define LOGTABLE_H 3 | 4 | #include 5 | #include 6 | 7 | #define TBLSIZE 1000000 8 | class LogTable { 9 | public: 10 | LogTable(); 11 | LogTable(uintmax_t _size, double imax=2); 12 | ~LogTable(); 13 | double at(double d) const; 14 | double operator[](double d) const; 15 | private: 16 | double map[TBLSIZE]; 17 | 18 | const double coeff; 19 | }; 20 | #endif 21 | -------------------------------------------------------------------------------- /src/cluster/src/Mat.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Mat.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef MAT_H 8 | #define MAT_H 9 | #include 10 | #include 11 | using namespace std; 12 | template 13 | class Mat { 14 | public: 15 | Mat(function func, const long size) : n(size), table_size(size*(size+1)/2), compute(func) { 16 | if (size <= 0) { 17 | throw "Invalid size"; 18 | } 19 | table = new T[table_size]; 20 | set = new bool[table_size](); 21 | }; 22 | ~Mat() { 23 | delete[] table; 24 | delete[] set; 25 | }; 26 | void fill() { 27 | unsigned long long count = 0; 28 | #ifdef OPENMP 29 | #pragma omp parallel for collapse(2) shared(set) 30 | #endif 31 | for (long i = 0; i < n; i++) { 32 | for (long j = 0; j < n; j++) { 33 | const auto idx = addr(i, j); 34 | if (!set[idx]) { 35 | auto res = compute(i, j); 36 | table[idx] = res; 37 | set[idx] = true; 38 | count++; 39 | } 40 | if (count % 10000 == 0) { 41 | cout << count << " / " << table_size << endl; 42 | } 43 | } 44 | } 45 | 46 | }; 47 | T& operator[](pair index) { 48 | const unsigned long idx = addr(index.first, index.second); 49 | if (!set[idx]) { 50 | table[idx] = compute(index.first, index.second); 51 | set[idx] = true; 52 | } 53 | return 
table[idx]; 54 | }; 55 | bool exists(int i, int j) const { 56 | return set[addr(i, j)]; 57 | } 58 | private: 59 | T* table; 60 | bool* set; 61 | const unsigned long table_size; 62 | const unsigned long n; 63 | function compute; 64 | 65 | unsigned long addr(unsigned long i, unsigned long j) const { 66 | if (i <= j) { 67 | return i * n - (i - 1) * i / 2 + j - i; 68 | } else { 69 | return j * n - (j - 1) * j / 2 + i - j; 70 | } 71 | }; 72 | }; 73 | #endif 74 | -------------------------------------------------------------------------------- /src/cluster/src/Matrix.h: -------------------------------------------------------------------------------- 1 | /* 2 | * matrix.h 3 | * 4 | * Created on: May 10, 2017 5 | * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa 6 | */ 7 | 8 | 9 | #ifndef MATRIX_H_ 10 | #define MATRIX_H_ 11 | 12 | #include 13 | #include 14 | 15 | namespace matrix { 16 | 17 | class Matrix 18 | { 19 | private: 20 | std::vector > m; 21 | int numRow; 22 | int numCol; 23 | 24 | 25 | public: 26 | 27 | Matrix(int r, int c); 28 | Matrix(); 29 | ~Matrix(); 30 | Matrix operator+(Matrix n); 31 | Matrix operator-(Matrix n); 32 | Matrix operator*(Matrix n); 33 | Matrix transpose(); 34 | Matrix gaussJordanInverse(); 35 | Matrix pseudoInverse(); 36 | void userFill(); 37 | double determinant(); 38 | double get(int r, int c) const; 39 | void set(int r, int c, double val); 40 | void addRow(double); 41 | void addCol(double); 42 | void print(); 43 | void printToFile(std::string); 44 | void randFill(double low, double high); 45 | void fileFill(std::string filename); 46 | void normalize(double a, double b); 47 | void rowToVector(int, std::vector&); 48 | void colToVector(int, std::vector&); 49 | int getNumRow() const; 50 | }; 51 | } 52 | #endif /* MATRIX_H_ */ 53 | -------------------------------------------------------------------------------- /src/cluster/src/NearestNeighbor.h: 
-------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * NearestNeighbor.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef NEARESTNEIGHBOR_H 8 | #define NEARESTNEIGHBOR_H 9 | // #include 10 | // #include "Point.h" 11 | // template 12 | // class NearestNeighbor { 13 | // public: 14 | // NearestNeighbor(const vector*> &pts) : points(pts) { 15 | // const int dim = pts[0]->get_data().size(); 16 | // const int maxPts = pts.size(); 17 | // dataPts = annAllocPts(maxPts, dim); 18 | // queryPt = annAllocPt(dim); 19 | // for (int nPts = 0; nPts < maxPts; nPts++) { 20 | // auto vec = pts[nPts]->get_data(); 21 | // for (int i = 0; i < vec.size(); i++) { 22 | // dataPts[nPts][i] = vec[i]; 23 | // } 24 | // } 25 | // kd_tree = new ANNkd_tree(dataPts, maxPts, dim); 26 | // nnIdx = new ANNidx[1]; 27 | // dists = new ANNdist[1]; 28 | // }; 29 | // ~NearestNeighbor() { 30 | // delete[] nnIdx; 31 | // delete[] dists; 32 | // delete kd_tree; 33 | // annClose(); 34 | // }; 35 | // void find_nearest_neighbor(Point &center) const { 36 | // auto vec = center.get_data(); 37 | // for (int i = 0; i < vec.size(); i++) { 38 | // queryPt[i] = vec[i]; 39 | // } 40 | // kd_tree->annkSearch(queryPt, 1, nnIdx, dists); 41 | // ANNidx idx = nnIdx[0]; 42 | // center.set(*points[idx]); 43 | // }; 44 | // private: 45 | // ANNkd_tree *kd_tree = NULL; 46 | // ANNpointArray dataPts; 47 | // ANNpoint queryPt; 48 | // ANNidxArray nnIdx; 49 | // ANNdistArray dists; 50 | // const vector*> &points; 51 | // }; 52 | #endif 53 | -------------------------------------------------------------------------------- /src/cluster/src/Point.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Point.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef POINT_H 8 | #define POINT_H 9 | 10 | #include 11 | #include "../../nonltr/ChromosomeOneDigit.h" 12 | 13 | /* 14 | * Pure virtual class that defines
behavior for 15 | * points. Has clone() and create() that allow for 16 | * polymorphic behavior 17 | */ 18 | template 19 | class Point { 20 | public: 21 | virtual ~Point() { }; 22 | virtual void operator*=(double d) = 0; 23 | virtual void operator/=(double d) = 0; 24 | virtual bool operator<(Point& p) const = 0; 25 | virtual uint64_t operator-(const Point& p) const = 0; 26 | virtual void operator+=(Point& p) = 0; 27 | virtual void set(Point& p) = 0; 28 | virtual void display() const = 0; 29 | virtual uint64_t distance(const Point& p) const = 0; 30 | virtual double distance_d(Point& p) const = 0; 31 | virtual Point* clone() const = 0; 32 | virtual Point* create() const = 0; 33 | 34 | virtual void zero() = 0; 35 | virtual void addOne() = 0; 36 | virtual double distance_k1(const Point& p) const = 0; 37 | virtual double prob_under(Point& center) const = 0; 38 | virtual void subOne() = 0; 39 | virtual uint64_t getRealMagnitude() const = 0; 40 | // virtual T magnitude() const = 0; 41 | virtual bool is_to_delete() const = 0; 42 | virtual void set_to_delete(bool b) = 0; 43 | 44 | virtual Point* create_double() const = 0; 45 | virtual void set_arg_to_this_d(Point& p) const = 0; 46 | 47 | virtual const vector& get_data() const = 0; 48 | 49 | void set_header(const std::string c) { header = string(c); }; 50 | const std::string get_header() const { return header; }; 51 | 52 | void set_data_str(const std::string c) { data = c; }; 53 | const std::string & get_data_str() const { return data; }; 54 | 55 | void set_1mers(const vector &vec) { 56 | for (auto i = 0; i < 4; i++) { 57 | one_mers[i] = vec[i]; 58 | } 59 | } 60 | vector get_1mers() const { 61 | vector vec; 62 | for (auto i = 0; i < 4; i++) { 63 | vec.push_back(one_mers[i]); 64 | } 65 | return vec; 66 | } 67 | virtual unsigned long size() const = 0; 68 | virtual void set_id(int c_id) = 0;//{ id = c_id; }; 69 | virtual const int get_id() const = 0;//{ return id; }; 70 | virtual void set_length(unsigned long len) = 0; 71 | 
virtual unsigned long get_length() const = 0; 72 | private: 73 | uint64_t one_mers[4]; 74 | std::string header; 75 | std::string data; 76 | }; 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/cluster/src/Progress.cpp: -------------------------------------------------------------------------------- 1 | #include "Progress.h" 2 | #include 3 | Progress::Progress(long num, std::string prefix_) 4 | { 5 | pmax = num; 6 | ended = 0; 7 | pcur = 0; 8 | prefix = prefix_; 9 | barWidth = 70 - (prefix.size()+1); 10 | print(); 11 | } 12 | 13 | void Progress::print() 14 | { 15 | double prog = (double)pcur / pmax; 16 | std::cout << prefix << " ["; 17 | int pos = barWidth * prog; 18 | for (int i = 0; i < barWidth; i++) { 19 | if (i < pos) { 20 | std::cout << "="; 21 | } else if (i == pos) { 22 | std::cout << ">"; 23 | } else { 24 | std::cout << " "; 25 | } 26 | } 27 | std::cout << "] " << int(prog * 100.0) << " %\r"; 28 | std::cout.flush(); 29 | } 30 | 31 | void Progress::end() 32 | { 33 | if (!ended) { 34 | pcur = pmax; 35 | print(); 36 | std::cout << std::endl; 37 | } 38 | ended = true; 39 | } 40 | 41 | void Progress::operator++() 42 | { 43 | pcur++; 44 | print(); 45 | } 46 | void Progress::operator++(int) 47 | { 48 | print(); 49 | pcur++; 50 | } 51 | 52 | 53 | void Progress::operator+=(size_t num) 54 | { 55 | pcur += num; 56 | print(); 57 | } 58 | -------------------------------------------------------------------------------- /src/cluster/src/Progress.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Progress.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include 8 | #ifndef PROGRESS_H 9 | #define PROGRESS_H 10 | 11 | class Progress { 12 | public: 13 | Progress(long num, std::string prefix_); 14 | ~Progress() { end(); } 15 | void end(); 16 | void operator++(); 17 | void operator++(int); 18 | void operator+=(size_t); 19 | private: 20 | void print(); 21 | 
long pmax; 22 | long pcur; 23 | bool ended; 24 | std::string prefix; 25 | int barWidth; 26 | }; 27 | #endif 28 | -------------------------------------------------------------------------------- /src/cluster/src/Runner.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Runner.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef RUNNER_H 8 | #define RUNNER_H 9 | 10 | #include 11 | #include 12 | #include "Point.h" 13 | using namespace std; 14 | 15 | class Runner { 16 | public: 17 | Runner(int argc, char** argv); 18 | ~Runner() {}; 19 | int run(); 20 | private: 21 | template int do_run(); 22 | template void print_output(const map*, vector*>*> &m) const; 23 | int k = -1; 24 | int bandwidth; 25 | double similarity = 0.90; 26 | long largest_count = 0; 27 | int iterations = 15; 28 | int delta = 5; 29 | bool align = false; 30 | int sample_size = 0; 31 | int pivots = 20; 32 | std::vector files; 33 | string output = "output.clstr"; 34 | void get_opts(int argc, char** argv); 35 | pair find_k(); 36 | }; 37 | #endif 38 | -------------------------------------------------------------------------------- /src/cluster/src/SingleFeature.cpp: -------------------------------------------------------------------------------- 1 | #include "SingleFeature.h" 2 | 3 | template 4 | void SingleFeature::normalize(const vector*,Point*> > &pairs) 5 | { 6 | for (auto p : pairs) { 7 | double d; 8 | if (rc.empty()) { 9 | d = raw(p.first, p.second); 10 | } else { 11 | d = rraw(p.first, p.second, rc, rv); 12 | } 13 | if (!min_set || d < min) { 14 | min = d; 15 | min_set = true; 16 | } 17 | if (!max_set || d > max) { 18 | max = d; 19 | max_set = true; 20 | } 21 | } 22 | } 23 | 24 | template 25 | double SingleFeature::operator()(Point *a, Point *b) const 26 | { 27 | double d; 28 | if (rc.empty()) { 29 | d = raw(a, b); 30 | } else { 31 | d = rraw(a, b, rc, rv); 32 | } 33 | // std::cout << "Raw: " << d << std::endl; 34 | double f = (d - 
min) / (max - min); 35 | // std::cout << "Normalized: " << f << std::endl; 36 | f = std::min(1.0, std::max(0.0, f)); 37 | if (is_sim) { 38 | return f; 39 | } else { 40 | return 1.0 - f; 41 | } 42 | } 43 | 44 | 45 | template class SingleFeature; 46 | template class SingleFeature; 47 | template class SingleFeature; 48 | template class SingleFeature; 49 | template class SingleFeature; 50 | template class SingleFeature; 51 | -------------------------------------------------------------------------------- /src/cluster/src/SingleFeature.h: -------------------------------------------------------------------------------- 1 | #ifndef SINGLEFEATURE_H 2 | #define SINGLEFEATURE_H 3 | 4 | #include "Point.h" 5 | #include 6 | 7 | template 8 | class SingleFeature { 9 | public: 10 | SingleFeature(std::function*, Point*)> f, bool is_sim_=true) 11 | : raw(f), is_sim(is_sim_), min_set(false), max_set(false) {} 12 | SingleFeature(std::function*, Point*, const vector&, const vector&)> f, vector rrv, vector rrc, bool is_sim_=true) 13 | : rraw(f), is_sim(is_sim_), min_set(false), max_set(false), rv(rrv), rc(rrc) {} 14 | void normalize(const vector*,Point*> > &pairs); 15 | double operator()(Point*, Point*) const; 16 | double min, max; 17 | private: 18 | std::function*, Point*)> raw; 19 | std::function*, Point*, const vector&, const vector&)> rraw; 20 | vector rv, rc; 21 | const bool is_sim; 22 | bool max_set, min_set; 23 | 24 | }; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/cluster/src/Trainer.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | #ifndef TRAINER_H 3 | #define TRAINER_H 4 | 5 | #include "Point.h" 6 | #include "GLM.h" 7 | #include "Feature.h" 8 | #include "bvec.h" 9 | #include "Center.h" 10 | #include "LogTable.h" 11 | #include 12 | template 13 | class Trainer { 14 | public: 15 | Trainer(std::vector*> v, size_t num_points, int largest_count, double cutoff_, 
size_t max_pts_from_one_, double (&matrix)[4][4], double sig, double eps, int ksize) : points(v), n_points(num_points), cutoff(cutoff_), max_pts_from_one(max_pts_from_one_), k(ksize) { 16 | init(matrix, sig, eps); 17 | uintmax_t size = 1000 * 1000 * 10; 18 | log_table = new double[size]; 19 | log_coeff = size / 2; 20 | double lsize = log(size); 21 | log_table[0] = 0; 22 | for (uintmax_t i = 1; i < size; i++) { 23 | log_table[i] = log(2 * i) - lsize; 24 | } 25 | feat = new Feature(largest_count, log_table, log_coeff); 26 | }; 27 | ~Trainer() { delete feat_mat; delete feat; delete[] log_table;} 28 | std::pair*, Point*>, double>, 29 | std::map*, Point*>, double> > split_old(); 30 | vector*,Point*> > split(); 31 | double train_n(pair*, 32 | Point* 33 | > >, 34 | vector*, 35 | Point*> > > &data, int ncols); 36 | void train(double acc_cutoff=97.5); 37 | std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; 38 | // vector > get_close(Point*, const vector*,int> > &, bool& is_min) const; 39 | void filter(Point*, vector*,bool> >&) const; 40 | Point* closest(Point*, vector*,bool> >&) const; 41 | long merge(vector > ¢ers, long current, long begin, long end) const; 42 | // Point* merge(Point*, vector*,double> >&) const; 43 | double raw_classify(Point*,Point*) const; 44 | private: 45 | matrix::GLM glm; 46 | matrix::Matrix weights; 47 | double align(Point* a, Point* b) const; 48 | std::pair generate_feat_mat(pair*, 49 | Point* 50 | > >, 51 | vector*, 52 | Point*> > > &data, int ncols); 53 | void init(double (&matrix)[4][4], double sig, double eps); 54 | 55 | 56 | pair*,Point*>, double> >, 57 | vector*,Point*>, double > > > get_labels(vector*,Point*> >&, double cutoff) const; 58 | Feature *feat; 59 | double *log_table; 60 | int mat[4][4]; 61 | int sigma, epsilon; 62 | std::vector*> points; 63 | matrix::Matrix *feat_mat = NULL; 64 | size_t n_points, max_pts_from_one; 65 | double cutoff, log_coeff; 66 | int k; 67 | 
LogTable *tbl = NULL; 68 | }; 69 | #endif 70 | -------------------------------------------------------------------------------- /src/cluster/src/bvec.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * bvec.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "bvec.h" 8 | #include 9 | template 10 | bvec::bvec(vector& lengths, uint64_t bin_size) 11 | { 12 | uint64_t num_points = lengths.size(); 13 | std::sort(std::begin(lengths), std::end(lengths)); 14 | for (uint64_t i = 0; i < lengths.size(); i += bin_size) { 15 | begin_bounds.push_back(lengths[i]); 16 | // uint64_t last_index = std::min((uint64_t)lengths.size() - 1, 17 | // i + bin_size - 1); 18 | //std::cout << "[" << i << " " << last_index << "]" << std::endl; 19 | } 20 | data.reserve(begin_bounds.size()); 21 | for (uint64_t i = 0; i < begin_bounds.size(); i++) { 22 | data.push_back({}); 23 | } 24 | } 25 | 26 | template 27 | Point* bvec::pop() 28 | { 29 | for (auto& bin : data) { 30 | if (!bin.empty()) { 31 | Point* p = bin[0].first; 32 | bin.erase(std::begin(bin)); 33 | return p; 34 | } 35 | } 36 | return NULL; 37 | } 38 | 39 | template 40 | Point* bvec::peek() const 41 | { 42 | for (auto& bin : data) { 43 | if (!bin.empty()) { 44 | Point* p = bin[0].first; 45 | return p; 46 | } 47 | } 48 | return NULL; 49 | } 50 | 51 | template 52 | bool bvec::inner_index_of(uint64_t length, size_t &idx, size_t *pfront, size_t *pback) const 53 | { 54 | 55 | if (data.at(idx).empty() || idx == data.size()) { 56 | if (pfront) { 57 | for (size_t i = 0; i < data.size(); i++) { 58 | if (!data.at(i).empty()) { 59 | idx = i; 60 | *pfront = 0; 61 | break; 62 | } 63 | } 64 | } 65 | if (pback) { 66 | for (int i = data.size()-1; i >= 0; i--) { 67 | if (!data.at(i).empty()) { 68 | idx = i; 69 | *pback = 0; 70 | break; 71 | } 72 | } 73 | } 74 | return true; 75 | } 76 | size_t front = 0, back = 0; 77 | size_t low = 0, high = data.at(idx).size() - 1; 78 | bool found = 
false; 79 | if (length < data[idx][low].first->get_length() && pfront != NULL) { 80 | *pfront = low; 81 | } 82 | if (length > data[idx][high].first->get_length() && pback != NULL) { 83 | *pback = high; 84 | } 85 | for (;low <= high;) { 86 | size_t mid = (low + high) / 2; 87 | uint64_t d = data[idx][mid].first->get_length(); 88 | if (d == length) { 89 | front = mid; 90 | back = mid; 91 | found = true; 92 | break; 93 | } else if (length < d) { 94 | high = mid; 95 | } else if (length > d) { 96 | low = mid + 1; 97 | } 98 | if (low == high) { 99 | found = true; 100 | front = low; 101 | back = high; 102 | break; 103 | } 104 | } 105 | if (pfront) { 106 | for (long i = front; i >= 0 107 | && data[idx][i].first->get_length() == length; i--) { 108 | front = i; 109 | } 110 | *pfront = front; 111 | } 112 | if (pback) { 113 | for (long i = back; i < data[idx].size() 114 | && data[idx][i].first->get_length() == length; i++) { 115 | back = i; 116 | } 117 | *pback = back; 118 | } 119 | return true; 120 | } 121 | 122 | template 123 | bool bvec::index_of(uint64_t point, size_t* pfront, size_t* pback) const 124 | { 125 | size_t low = begin_bounds.size()-1, high = 0; 126 | 127 | for (size_t i = 0; i < begin_bounds.size(); i++) { 128 | size_t prev = 0; 129 | size_t prev_index = 0; 130 | if (i > 0) { 131 | prev_index = i - 1; 132 | prev = begin_bounds[i-1]; 133 | } 134 | if (point >= prev && point <= begin_bounds[i]) { 135 | low = std::min(low, prev_index); 136 | high = std::max(high, prev_index); 137 | } 138 | } 139 | if (point >= begin_bounds[begin_bounds.size()-1]) { 140 | high = std::max(high, begin_bounds.size()-1); 141 | } 142 | if (pfront) { 143 | *pfront = low; 144 | } 145 | if (pback) { 146 | *pback = high; 147 | } 148 | return true; 149 | } 150 | 151 | template 152 | void bvec::insert(Point *p) 153 | { 154 | uint64_t len = p->get_length(); 155 | size_t front = 0, back = 0; 156 | bool good = index_of(len, &front, &back); 157 | if (!good || front > back) { 158 | std::cerr << 
"error: list is not sorted" << std::endl; 159 | } 160 | std::vector min_sizes; 161 | size_t minimum = std::numeric_limits::max(); 162 | for (size_t i = front; i <= back; i++) { 163 | size_t sz = data[i].size(); 164 | if (sz < minimum) { 165 | minimum = sz; 166 | min_sizes.clear(); 167 | min_sizes.push_back(i); 168 | } else if (sz == minimum) { 169 | min_sizes.push_back(i); 170 | } 171 | } 172 | if (min_sizes.empty()) { 173 | std::cerr << "error: no bins to insert into, item not inserted" << std::endl; 174 | } 175 | auto mid_min = min_sizes[min_sizes.size() / 2]; 176 | data.at(mid_min).push_back(std::make_pair(p, false)); 177 | } 178 | 179 | template 180 | size_t bvec::size() const 181 | { 182 | size_t num_bins = data.size(); 183 | size_t total_size = 0; 184 | for (size_t i = 0; i < num_bins; i++) { 185 | total_size += data[i].size(); 186 | } 187 | return total_size; 188 | } 189 | 190 | template 191 | size_t bvec::report() const 192 | { 193 | cout << "BVec: "; 194 | size_t num_bins = data.size(); 195 | cout << "num_bins=" << num_bins << endl; 196 | size_t total_size = 0; 197 | for (size_t i = 0; i < num_bins; i++) { 198 | uint64_t next_bound = std::numeric_limits::max(); 199 | if (i + 1 < num_bins) { 200 | next_bound = begin_bounds[i+1]; 201 | } 202 | cout << "Bin " << i << ": [" << begin_bounds[i] << " " << next_bound << "] size=" << data[i].size() << endl; 203 | total_size += data[i].size(); 204 | } 205 | cout << "total_size=" << total_size << endl; 206 | return total_size; 207 | } 208 | template 209 | void bvec::insert_finalize() 210 | { 211 | auto sorter = [](const std::pair*,bool> a, const std::pair*,bool> b) { 212 | return a.first->get_length() < b.first->get_length(); 213 | }; 214 | for (size_t i = 0; i < data.size(); i++) { 215 | std::sort(std::begin(data[i]), std::end(data[i]), sorter); 216 | data[i].shrink_to_fit(); 217 | } 218 | } 219 | 220 | template 221 | bool bvec::empty() const 222 | { 223 | bool is_empty = true; 224 | for (auto bin : data) { 225 | if 
(!bin.empty()) { 226 | is_empty = false; 227 | break; 228 | } 229 | } 230 | return is_empty; 231 | } 232 | 233 | 234 | template 235 | uint64_t bvec::absolute_idx(bvec_idx_t idx) const 236 | { 237 | uint64_t ptr = 0; 238 | for (int i = 0; i < idx.first; i++) { 239 | ptr += data[i].size(); 240 | } 241 | ptr += idx.second; 242 | return ptr; 243 | } 244 | 245 | template 246 | std::pair 247 | bvec::get_range(uint64_t begin_len, uint64_t end_len) const 248 | { 249 | /* perform binary search to find bin */ 250 | bvec_idx_t front, back; 251 | front.first = 0; 252 | front.second = 0; 253 | back.first = data.size()-1; 254 | back.second = data[back.first].size() - 1; 255 | if (!index_of(begin_len, &front.first, NULL)) { 256 | throw 100; 257 | } 258 | if (!index_of(end_len, NULL, &back.first)) { 259 | throw 100; 260 | } 261 | if (!inner_index_of(begin_len, front.first, &front.second, NULL)) { 262 | throw 100; 263 | } 264 | if (!inner_index_of(end_len, back.first, NULL, &back.second)) { 265 | throw 100; 266 | } 267 | // if (back.first != data.size()) { // ++ to make it an end iterator 268 | // if (back.second != data[back.first].size()) { 269 | // back.second++; 270 | // } else { 271 | // back.first++; 272 | // back.second = 0; 273 | // } 274 | // } else { 275 | // throw 101; 276 | // } 277 | return std::make_pair(front, back); 278 | } 279 | 280 | template 281 | void bvec::erase(size_t r, size_t c) 282 | { 283 | data.at(r).erase(data.at(r).begin() + c); 284 | } 285 | 286 | /* 287 | * TODO: change available to Center class so no intermediate copying is done 288 | */ 289 | template 290 | void bvec::remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector*> &available) 291 | { 292 | size_t a = begin.first; 293 | size_t b = end.first; 294 | int num = 0, new_num = 0; 295 | auto func = [](const bv_data_type d) { return d.second; }; 296 | auto inserter = [&](const std::pair*,bool> p) { 297 | if (p.second) { 298 | #pragma omp critical 299 | available.push_back(p.first); 300 | } 
301 | }; 302 | #pragma omp parallel for 303 | for (size_t i = a; i <= b; i++) { 304 | /* move marked points to end of vector, then copy, then erase */ 305 | //const auto last = std::remove_if(std::begin(data[i]), std::end(data[i]), func); 306 | for (int j = 0; j < data[i].size(); j++) { 307 | auto kv = data[i][j]; 308 | if (kv.second) { 309 | #pragma omp critical 310 | { 311 | available.push_back(kv.first); 312 | } 313 | } 314 | } 315 | data[i].erase(std::remove_if(std::begin(data[i]), std::end(data[i]), func), std::end(data[i])); 316 | } 317 | } 318 | 319 | 320 | template 321 | bvec_iterator bvec::iter(bvec_idx_t idx) 322 | { 323 | return bvec_iterator(idx.first, idx.second, &data); 324 | } 325 | 326 | 327 | template class bvec; 328 | template class bvec; 329 | template class bvec; 330 | template class bvec; 331 | template class bvec; 332 | template class bvec; 333 | -------------------------------------------------------------------------------- /src/cluster/src/bvec.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * bvec.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef BVEC_H 8 | #define BVEC_H 9 | 10 | #include "Point.h" 11 | #include "bvec_iterator.h" 12 | 13 | typedef struct bvec_idx { 14 | size_t first, second; 15 | } bvec_idx_t; 16 | 17 | /* 18 | * operations needed: 19 | * 20 | * find bounds (range) 21 | * get available or min and remove 22 | * 23 | */ 24 | template 25 | using bv_data_type = std::pair*, bool>; 26 | 27 | template 28 | using bv_row_type = vector >; 29 | 30 | template 31 | using bv_col_type = vector >; 32 | 33 | template 34 | class bvec { 35 | public: 36 | bvec(vector& lengths, uint64_t bin_size=1000); 37 | 38 | Point* pop(); 39 | Point* peek() const; 40 | void insert(Point* data); 41 | void insert_finalize(); /* sorts bins */ 42 | 43 | 44 | bool index_of(uint64_t length, size_t* front, size_t* back) const; 45 | bool inner_index_of(uint64_t length, size_t& idx, size_t 
*front, size_t *back) const; 46 | bool empty() const; 47 | 48 | std::pair 49 | get_range(uint64_t begin_len, uint64_t end_len) const; 50 | 51 | void remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector*> &); 52 | 53 | uint64_t absolute_idx(bvec_idx_t idx) const; 54 | 55 | bvec_iterator iter(bvec_idx_t idx); 56 | typedef bvec_iterator iterator; 57 | typedef bvec_iterator const_iterator; 58 | 59 | size_t report() const; 60 | size_t size() const; 61 | 62 | void erase(size_t r, size_t c); 63 | private: 64 | bv_col_type data; 65 | vector begin_bounds; 66 | }; 67 | 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /src/cluster/src/bvec_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "bvec_iterator.h" 2 | 3 | template 4 | bvec_iterator bvec_iterator::operator++() 5 | { 6 | if (r != col->size()) { 7 | if (c + 1 < col->at(r).size()) { 8 | c++; 9 | } else { 10 | r++; 11 | c = 0; 12 | while (r < col->size() && col->at(r).empty()) { 13 | r++; 14 | } 15 | } 16 | } else { 17 | cerr << "tried incrementing null iterator" << endl; 18 | throw 10; 19 | } 20 | return *this; 21 | } 22 | 23 | template class bvec_iterator; 24 | template class bvec_iterator; 25 | template class bvec_iterator; 26 | template class bvec_iterator; 27 | template class bvec_iterator; 28 | template class bvec_iterator; 29 | -------------------------------------------------------------------------------- /src/cluster/src/bvec_iterator.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * bvec_iterator.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "bvec.h" 8 | #ifndef BVEC_ITERATOR_H 9 | #define BVEC_ITERATOR_H 10 | 11 | 12 | template 13 | class bvec_iterator { 14 | public: 15 | // iterator: split ALL possible points into chunks by indices 16 | using dtype = std::pair*,bool>; 17 | using vtype = vector >; 18 | 
bvec_iterator(size_t _r, 19 | size_t _c, 20 | vtype* col_) : r(_r), c(_c), col(col_) {} 21 | 22 | bvec_iterator operator++(); 23 | bvec_iterator operator++(int x) { 24 | return ++(*this); 25 | } 26 | dtype& operator*() { 27 | return col->at(r).at(c); 28 | } 29 | void operator+=(int64_t n) { 30 | if (n < 0) { 31 | throw "oops"; 32 | } 33 | for (int i = 0; i < n; i++) { 34 | operator++(); 35 | } 36 | } 37 | bool operator==(const bvec_iterator& rhs) const { 38 | return rhs.c == c && rhs.r == r; 39 | } 40 | bool operator<(const bvec_iterator& rhs) const { 41 | if (r < rhs.r) { 42 | return true; 43 | } else if (r == rhs.r) { 44 | return c < rhs.c; 45 | } else { 46 | return false; 47 | } 48 | } 49 | bool operator<=(const bvec_iterator& rhs) const { 50 | if (r < rhs.r) { 51 | return true; 52 | } else if (r == rhs.r) { 53 | return c <= rhs.c; 54 | } else { 55 | return false; 56 | } 57 | } 58 | bool operator!=(const bvec_iterator& rhs) const { 59 | return r != rhs.r || c != rhs.c; 60 | } 61 | int64_t operator-(const bvec_iterator& rhs) const { 62 | int64_t sum = 0; 63 | if (*this < rhs) { 64 | return -1 * (rhs - *this); 65 | } 66 | // subtract cols until last row is reached 67 | if (r == rhs.r) { 68 | return c - rhs.c; 69 | } 70 | sum += c; 71 | sum += col->at(rhs.r).size() - rhs.c; 72 | for (size_t i = rhs.r + 1; i < r; i++) { 73 | sum += col->at(i).size(); 74 | } 75 | return sum; 76 | } 77 | // bvec_iterator operator[](uint64_t idx) { 78 | 79 | // } 80 | //private: 81 | size_t r,c; 82 | vtype* col; 83 | }; 84 | #endif 85 | -------------------------------------------------------------------------------- /src/cluster/src/main.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * main.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "Runner.h" 8 | int main(int argc, char **argv) 9 | { 10 | 11 | // const rlim_t kStackSize = 10024 * 1024L * 1024L; // min stack size = 1024 Mb 12 | // struct rlimit rl; 13 
/* -*- C++ -*-
 *
 * needleman_wunsch.h  (declaration)
 *
 * Author: Benjamin T James
 */

#ifndef NEEDLEMAN_WUNSCH_H
#define NEEDLEMAN_WUNSCH_H

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

/**
 * Global pairwise alignment by dynamic programming.
 *
 * The constructor stores the longer input as s1 and the shorter as s2, so
 * the pair returned by align() is always (longer, shorter) regardless of
 * argument order. sigma_ and epsilon_ are stored negated and combined by
 * gap() into a per-step gap cost.
 *
 * The matrix has l1 * l2 cells; cell (0, 0) pairs the first characters of
 * both sequences with score 0.
 *
 * All tables live in std::vector, so the object can be copied, moved and
 * destroyed safely (the earlier raw new[]/delete[] buffers were freed twice
 * when an instance was copied).
 */
class needleman_wunsch {
public:
	needleman_wunsch(const std::string& s1, const std::string& s2, int match_, int mismatch_, int sigma_, int epsilon_);

	/**
	 * Fraction of alignment columns whose two characters are equal.
	 * Returns 0.0 for an empty alignment instead of dividing by zero.
	 */
	double identity(std::pair<std::string, std::string> p) const;

	/** Fills the whole matrix row by row and returns the traced-back alignment. */
	std::pair<std::string, std::string>
	align();
private:
	int gap(int gap_len) const;
	int match_score(char a, char b) const;
	/* Row-major offset of cell (a, b); size_t so long inputs cannot overflow int. */
	inline std::size_t at(int a, int b) const {
		return static_cast<std::size_t>(a) * static_cast<std::size_t>(l2)
			+ static_cast<std::size_t>(b);
	};
	void fill(int,int);
	std::pair<std::string, std::string> backtrack();
	int match, mismatch;
	int sigma, epsilon;
	std::string s1, s2;
	int l1, l2;

	std::vector<int> score;
	std::vector<uint8_t> direction;
	std::vector<int> horiz_gap_len;
	std::vector<int> vert_gap_len;
};

#endif

/* -*- C++ -*-
 *
 * needleman_wunsch.cpp  (definitions)
 *
 * Author: Benjamin T James
 */

namespace {
// direction flags; several may be set on one cell when scores tie
const uint8_t HORIZ = 1;
const uint8_t VERT = 2;
const uint8_t DIAG = 4;
}

/**
 * Computes cell (i, j). Requires (i-1, j-1), (i-1, j) and (i, j-1) to have
 * been filled already, which align() guarantees by iterating row-major.
 */
void needleman_wunsch::fill(int i, int j)
{
	const std::size_t cur = at(i, j);

	if (i == 0 && j == 0) {
		score[cur] = 0;
		direction[cur] = DIAG; // for backtracking
		horiz_gap_len[cur] = 0;
		vert_gap_len[cur] = 0;
		return;
	}
	if (i == 0) {
		// first row: only reachable from the left neighbour
		score[cur] = score[at(0, j - 1)] + gap(j);
		horiz_gap_len[cur] = 0;
		vert_gap_len[cur] = j;
		direction[cur] = VERT;
		return;
	}
	if (j == 0) {
		// first column: only reachable from the cell above
		score[cur] = score[at(i - 1, 0)] + gap(i);
		horiz_gap_len[cur] = i;
		vert_gap_len[cur] = 0;
		direction[cur] = HORIZ;
		return;
	}

	const std::size_t i_diag = at(i - 1, j - 1);
	const std::size_t i_horiz = at(i - 1, j);
	const std::size_t i_vert = at(i, j - 1);

	// length the gap run would have if it were extended into this cell
	const int hlen = horiz_gap_len[i_horiz] + 1;
	const int vlen = vert_gap_len[i_vert] + 1;

	const int diag_score = score[i_diag] + match_score(s1[i], s2[j]);
	const int horiz_score = score[i_horiz] + gap(hlen);
	const int vert_score = score[i_vert] + gap(vlen);

	const int best = std::max(std::max(diag_score, horiz_score), vert_score);
	score[cur] = best;

	// we could match multiple high scores
	uint8_t dir = 0;
	if (best == diag_score) {
		dir |= DIAG;
	}
	if (best == vert_score) {
		dir |= VERT;
		vert_gap_len[cur] = vlen;
	} else {
		vert_gap_len[cur] = 0;
	}
	if (best == horiz_score) {
		dir |= HORIZ;
		horiz_gap_len[cur] = hlen;
	} else {
		horiz_gap_len[cur] = 0;
	}
	direction[cur] = dir;
}

/**
 * Walks from the last cell back to the origin, preferring DIAG, then HORIZ,
 * then VERT when several flags are set, and returns both gapped strings.
 */
std::pair<std::string, std::string>
needleman_wunsch::backtrack()
{
	std::string a1, a2;
	int cur_i = l1 - 1;
	int cur_j = l2 - 1;
	while (cur_i >= 0 && cur_j >= 0) {
		const uint8_t dir = direction[at(cur_i, cur_j)];
		if (dir & DIAG) {
			a1 += s1[cur_i--];
			a2 += s2[cur_j--];
		} else if (dir & HORIZ) {
			a1 += s1[cur_i--];
			a2 += '-';
		} else if (dir & VERT) {
			a1 += '-';
			a2 += s2[cur_j--];
		} else {
			// fill() always sets a flag; stop rather than spin forever
			break;
		}
	}
	std::string r1(a1.rbegin(), a1.rend());
	std::string r2(a2.rbegin(), a2.rend());
	return std::make_pair(r1, r2);
}


std::pair<std::string, std::string>
needleman_wunsch::align()
{
	for (int i = 0; i < l1; i++) {
		for (int j = 0; j < l2; j++) {
			fill(i, j);
		}
	}
	return backtrack();
}

double needleman_wunsch::identity(std::pair<std::string, std::string> alignment) const
{
	const std::size_t len = alignment.first.length();
	if (len == 0) {
		return 0.0; // avoid 0/0 (NaN) on an empty alignment
	}
	double count = 0;
	for (std::size_t i = 0; i < len; i++) {
		if (alignment.first[i] == alignment.second[i]) {
			count++;
		}
	}
	return 1.0 * count / len;
}

/** Cost added for a gap step whose run length is gaplen. */
int needleman_wunsch::gap(int gaplen) const
{
	return sigma + (gaplen - 1) * epsilon;
}

int needleman_wunsch::match_score(char a, char b) const
{
	return a == b ? match : mismatch;
}

needleman_wunsch::needleman_wunsch(const std::string &s1_, const std::string& s2_, int match_, int mismatch_, int sigma_, int epsilon_)
{
	const int l1_ = s1_.length();
	const int l2_ = s2_.length();
	// keep the longer sequence in s1
	if (l1_ >= l2_) {
		l1 = l1_;
		l2 = l2_;
		s1 = s1_;
		s2 = s2_;
	} else {
		l1 = l2_;
		l2 = l1_;
		s1 = s2_;
		s2 = s1_;
	}
	sigma = -sigma_;
	epsilon = -epsilon_;
	match = match_;
	mismatch = mismatch_;

	// product computed in size_t: l1 * l2 as int overflows for long inputs
	const std::size_t matlen = static_cast<std::size_t>(l1) * static_cast<std::size_t>(l2);
	score.assign(matlen, 0);
	direction.assign(matlen, 0);
	horiz_gap_len.assign(matlen, 0);
	vert_gap_len.assign(matlen, 0);
}
/*
 * FileDoesNotExistException.h
 *
 *  Created on: Apr 30, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#ifndef FILEDOESNOTEXISTEXCEPTION_H_
#define FILEDOESNOTEXISTEXCEPTION_H_

#include <stdexcept>
#include <string>

using namespace std;

namespace exception {
	/**
	 * Thrown when an input file cannot be found.
	 *
	 * Derives from std::runtime_error, like InvalidOperationException and
	 * InvalidStateException, so that the message is kept (what()) and a
	 * generic `catch (const std::exception&)` handler sees it.
	 */
	class FileDoesNotExistException : public std::runtime_error {
	public:
		FileDoesNotExistException(string);
		~FileDoesNotExistException();
	};
}

#endif /* FILEDOESNOTEXISTEXCEPTION_H_ */

/*
 * FileDoesNotExistException.cpp
 *
 *  Created on: Apr 30, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#include <iostream>

namespace exception{

/* Stores the message and reports it on stderr at construction time. */
FileDoesNotExistException::FileDoesNotExistException(string message) :
		std::runtime_error(message) {
	cerr << "File Does Not Exist Exception" << endl;
	cerr << what() << endl;
}

FileDoesNotExistException::~FileDoesNotExistException() {
}
}
/*
 * InvalidOperationException.h
 *
 *  Created on: Dec 20, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#ifndef INVALIDOPERATIONEXCEPTION_H_
#define INVALIDOPERATIONEXCEPTION_H_

#include <stdexcept>
#include <string>

using namespace std;

namespace exception {

/**
 * Signals that an operation was requested on an object that does not allow
 * it. The message is available through what().
 */
class InvalidOperationException : public std::runtime_error{
public:
	InvalidOperationException(string msg);
};

}

#endif /* INVALIDOPERATIONEXCEPTION_H_ */

/*
 * InvalidOperationException.cpp
 *
 *  Created on: Dec 20, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#include <iostream>

namespace exception {

/* Keeps the message in the base class and echoes it on stderr. */
InvalidOperationException::InvalidOperationException(string msg)
		: std::runtime_error(msg)
{
	cerr << "Invalid Operation Exception." << endl
	     << what() << endl;
}

}
/*
 * InvalidScoreException.h
 *
 *  Created on: Apr 27, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#ifndef INVALIDSCOREEXCEPTION_H_
#define INVALIDSCOREEXCEPTION_H_

#include <stdexcept>
#include <string>

using namespace std;

namespace exception{
	/**
	 * Thrown when a score is not acceptable.
	 *
	 * Derives from std::runtime_error, like InvalidOperationException and
	 * InvalidStateException, so that the message is kept (what()) and a
	 * generic `catch (const std::exception&)` handler sees it.
	 */
	class InvalidScoreException : public std::runtime_error {
	public:
		InvalidScoreException(string);
		virtual ~InvalidScoreException();
	};
}

#endif /* INVALIDSCOREEXCEPTION_H_ */

/*
 * InvalidScoreException.cpp
 *
 *  Created on: Apr 27, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#include <iostream>

namespace exception{

/* Stores the message and reports it on stderr at construction time. */
InvalidScoreException::InvalidScoreException(string message) :
		std::runtime_error(message) {
	cerr << "Invalid Score Exception." << endl;
	cerr << what() << endl;
}

InvalidScoreException::~InvalidScoreException() {
}
}
/*
 * InvalidStateException.h
 *
 *  Created on: Aug 9, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#ifndef INVALIDSTATEEXCEPTION_H_
#define INVALIDSTATEEXCEPTION_H_

#include <stdexcept>
#include <string>

using namespace std;

namespace exception {
	/**
	 * Signals that an object is in a state that does not permit the
	 * requested action. The message is available through what().
	 */
	class InvalidStateException : public std::runtime_error{
	public:
		InvalidStateException(string);
	};
}

#endif /* INVALIDSTATEEXCEPTION_H_ */

/*
 * InvalidStateException.cpp
 *
 *  Created on: Aug 9, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#include <iostream>

namespace exception {

/* Keeps the message in the base class and echoes it on stderr. */
InvalidStateException::InvalidStateException(string msg)
		: std::runtime_error(msg)
{
	cerr << "Invalid State Exception." << endl
	     << what() << endl;
}

}
regions = new vector *>(); 23 | 24 | for (int i = 0; i < segmentList->size(); i++) { 25 | Detector * detector = new Detector(segmentList->at(i)->at(0), 26 | segmentList->at(i)->at(1), s, w, pDelta, b, mDelta, scores); 27 | vector *> * segRegions = detector->getRegions(); 28 | regions->insert(regions->end(), segRegions->begin(), segRegions->end()); 29 | delete detector; 30 | } 31 | } 32 | 33 | ChromDetector::~ChromDetector() { 34 | Util::deleteInVector(regions); 35 | regions->clear(); 36 | delete regions; 37 | } 38 | 39 | vector *> * ChromDetector::getRegions() { 40 | return regions; 41 | } 42 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetector.h 3 | * 4 | * Created on: Nov 8, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMDETECTOR_H_ 9 | #define CHROMDETECTOR_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace nonltr{ 16 | class ChromDetector { 17 | 18 | private: 19 | vector *> * regions; 20 | 21 | public: 22 | ChromDetector(double, double, double, double, double, vector *, 23 | const vector *> *); 24 | virtual ~ChromDetector(); 25 | vector *> * getRegions(); 26 | }; 27 | } 28 | 29 | #endif /* CHROMDETECTOR_H_ */ 30 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetectorMaxima.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetectorMaxima.cpp 3 | * 4 | * Created on: Jun 6, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "ChromDetectorMaxima.h" 9 | 10 | namespace nonltr { 11 | 12 | ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, 13 | double t, double p, int e, vector * oScores, 14 | ChromosomeOneDigit * chrom) { 15 | header = chrom->getHeader(); 16 | start(s, w, m, t, p, e, oScores, chrom->getSegment()); 17 
| 18 | } 19 | 20 | ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, 21 | double t, double p, int e, vector * oScores, const vector *> * segmentList) { 23 | header = string("chrUnknown"); 24 | start(s, w, m, t, p, e, oScores, segmentList); 25 | } 26 | 27 | void ChromDetectorMaxima::start(double s, double w, double m, double t, 28 | double p, int e, vector * oScores, 29 | const vector *> * segmentList) { 30 | 31 | regionList = new vector (); 32 | 33 | int segmentCount = segmentList->size(); 34 | for (int i = 0; i < segmentCount; i++) { 35 | int segStart = segmentList->at(i)->at(0); 36 | int segEnd = segmentList->at(i)->at(1); 37 | 38 | // The effective length is shorter than the actual length by 2w 39 | int effLen = 2 * w + 10; 40 | int segLen = segEnd - segStart + 1; 41 | 42 | if (segLen > effLen) { 43 | DetectorMaxima * detector = new DetectorMaxima(segStart, segEnd, s, 44 | w, m, t, p, e, oScores); 45 | 46 | const vector * segRegions = detector->getRegionList(); 47 | int segRegionCount = segRegions->size(); 48 | for (int h = 0; h < segRegionCount; h++) { 49 | regionList->push_back(new Location(*(segRegions->at(h)))); 50 | } 51 | 52 | delete detector; 53 | } else { 54 | cout << "\tSkipping a short segment: "; 55 | cout << segStart << "-" << segEnd << endl; 56 | } 57 | } 58 | } 59 | 60 | ChromDetectorMaxima::~ChromDetectorMaxima() { 61 | Util::deleteInVector(regionList); 62 | regionList->clear(); 63 | delete regionList; 64 | } 65 | 66 | void ChromDetectorMaxima::printIndex(string outputFile) { 67 | printIndex(outputFile, false); 68 | } 69 | 70 | void ChromDetectorMaxima::printIndex(string outputFile, bool canAppend) { 71 | ofstream outIndex; 72 | 73 | if (canAppend) { 74 | outIndex.open(outputFile.c_str(), ios::out | ios::app); 75 | } else { 76 | outIndex.open(outputFile.c_str(), ios::out); 77 | } 78 | 79 | // Write the index of the repeat segment [x,y[ 80 | for (int j = 0; j < regionList->size(); j++) { 81 | outIndex << header << ":"; 82 | 
outIndex << ((int) (regionList->at(j)->getStart())) << "-"; 83 | outIndex << ((int) (regionList->at(j)->getEnd() + 1)) << " "; 84 | outIndex << endl; 85 | } 86 | 87 | outIndex.close(); 88 | } 89 | 90 | const vector* ChromDetectorMaxima::getRegionList() const { 91 | return regionList; 92 | } 93 | 94 | } /* namespace nonltr */ 95 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetectorMaxima.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetectorMaxima.h 3 | * 4 | * Created on: Jun 6, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMDETECTORMAXIMA_H_ 9 | #define CHROMDETECTORMAXIMA_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "ChromosomeOneDigit.h" 15 | #include "DetectorMaxima.h" 16 | 17 | #include "../utility/Util.h" 18 | #include "../utility/ILocation.h" 19 | #include "../utility/Location.h" 20 | 21 | using namespace std; 22 | using namespace utility; 23 | 24 | namespace nonltr { 25 | 26 | class ChromDetectorMaxima { 27 | private: 28 | vector * regionList; 29 | string header; 30 | 31 | void start(double, double, double, double, double, int, vector *, 32 | const vector *> *); 33 | 34 | public: 35 | ChromDetectorMaxima(double, double, double, double, double, int, 36 | vector *, ChromosomeOneDigit *); 37 | ChromDetectorMaxima(double, double, double, double, double, int, 38 | vector *, const vector *> *); 39 | virtual ~ChromDetectorMaxima(); 40 | const vector* getRegionList() const; 41 | void printIndex(string); 42 | void printIndex(string, bool); 43 | 44 | }; 45 | 46 | } /* namespace nonltr */ 47 | #endif /* CHROMDETECTORMAXIMA_H_ */ 48 | -------------------------------------------------------------------------------- /src/nonltr/ChromListMaker.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromListMaker.cpp 3 | * 4 | * Created on: Mar 13, 2014 5 | * Author: Hani Zakaira Girgis 
6 | */ 7 | 8 | #include "ChromListMaker.h" 9 | 10 | namespace nonltr { 11 | 12 | ChromListMaker::ChromListMaker(string seqFileIn) { 13 | seqFile = seqFileIn; 14 | chromList = new vector(); 15 | } 16 | 17 | ChromListMaker::~ChromListMaker() { 18 | Util::deleteInVector(chromList); 19 | delete chromList; 20 | } 21 | 22 | 23 | std::istream& safe_getline(std::istream& is, std::string& t) 24 | { 25 | t.clear(); 26 | std::istream::sentry se(is, true); 27 | std::streambuf* sb = is.rdbuf(); 28 | for(;;) { 29 | int c = sb->sbumpc(); 30 | switch (c) { 31 | case '\n': 32 | return is; 33 | case '\r': 34 | if (sb->sgetc() == '\n') { 35 | sb->sbumpc(); 36 | } 37 | return is; 38 | case std::streambuf::traits_type::eof(): 39 | if (t.empty()) { 40 | is.setstate(std::ios::eofbit); 41 | } 42 | return is; 43 | default: 44 | t += (char)c; 45 | } 46 | } 47 | } 48 | 49 | const vector * ChromListMaker::makeChromList() { 50 | ifstream in(seqFile.c_str()); 51 | bool isFirst = true; 52 | Chromosome * chrom; 53 | 54 | while (in.good()) { 55 | string line; 56 | safe_getline(in, line); 57 | if (line[0] == '>') { 58 | if (!isFirst) { 59 | chrom->finalize(); 60 | chromList->push_back(chrom); 61 | } else { 62 | isFirst = false; 63 | } 64 | 65 | chrom = new Chromosome(); 66 | chrom->setHeader(line); 67 | } else if (line[0] == ' ' || line[0] == '\t') { 68 | bool all_spaces = true; 69 | for (auto c : line) { 70 | if (c != ' ' && c != '\t') { 71 | all_spaces = false; 72 | } 73 | } 74 | if (all_spaces) { 75 | continue; 76 | } 77 | std::ostringstream oss; 78 | oss << chrom->getHeader() << line; 79 | std::string new_header = oss.str(); 80 | chrom->setHeader(new_header); 81 | } else { 82 | chrom->appendToSequence(line); 83 | } 84 | } 85 | chrom->finalize(); 86 | chromList->push_back(chrom); 87 | in.close(); 88 | 89 | return chromList; 90 | } 91 | 92 | const vector * ChromListMaker::makeChromOneDigitList() { 93 | ifstream in(seqFile.c_str()); 94 | bool isFirst = true; 95 | ChromosomeOneDigit * chrom; 96 | 
97 | while (in.good()) { 98 | string line; 99 | safe_getline(in, line); 100 | if (line[0] == '>') { 101 | if (!isFirst) { 102 | chrom->finalize(); 103 | chromList->push_back(chrom); 104 | } else { 105 | isFirst = false; 106 | } 107 | 108 | chrom = new ChromosomeOneDigit(); 109 | chrom->setHeader(line); 110 | } else { 111 | chrom->appendToSequence(line); 112 | } 113 | } 114 | 115 | chrom->finalize(); 116 | chromList->push_back(chrom); 117 | in.close(); 118 | 119 | return chromList; 120 | } 121 | 122 | } 123 | /* namespace nonltr */ 124 | -------------------------------------------------------------------------------- /src/nonltr/ChromListMaker.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromListMaker.h 3 | * 4 | * Created on: Mar 13, 2014 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMLISTMAKER_H_ 9 | #define CHROMLISTMAKER_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "Chromosome.h" 15 | #include "ChromosomeOneDigit.h" 16 | 17 | #include "../utility/Util.h" 18 | 19 | using namespace std; 20 | using namespace utility; 21 | 22 | namespace nonltr { 23 | 24 | class ChromListMaker { 25 | private: 26 | vector * chromList; 27 | string seqFile; 28 | 29 | public: 30 | ChromListMaker(string); 31 | virtual ~ChromListMaker(); 32 | const vector * makeChromList(); 33 | const vector * makeChromOneDigitList(); 34 | 35 | }; 36 | 37 | } /* namespace nonltr */ 38 | #endif /* CHROMLISTMAKER_H_ */ 39 | -------------------------------------------------------------------------------- /src/nonltr/Chromosome.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Chromosome.cpp 3 | * 4 | * Created on: Mar 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | #include "Chromosome.h" 8 | 9 | Chromosome::Chromosome() { 10 | header = string(""); 11 | base = string(""); 12 | isHeaderReady = false; 13 | isBaseReady = false; 14 | isFinalized = false; 15 
| } 16 | 17 | Chromosome::Chromosome(string fileName) { 18 | chromFile = fileName; 19 | readFasta(); 20 | help(1000000, true); 21 | } 22 | 23 | Chromosome::Chromosome(string fileName, bool canMerge) { 24 | chromFile = fileName; 25 | readFasta(); 26 | help(1000000, canMerge); 27 | } 28 | 29 | Chromosome::Chromosome(string fileName, int len) { 30 | chromFile = fileName; 31 | readFasta(); 32 | help(len, true); 33 | } 34 | 35 | Chromosome::Chromosome(string &seq, string &info) { 36 | header = info; 37 | base = seq; 38 | help(1000000, true); 39 | } 40 | 41 | Chromosome::Chromosome(string &seq, string &info, int len) { 42 | header = info; 43 | base = seq; 44 | help(len, true); 45 | } 46 | 47 | void Chromosome::setHeader(string& info) { 48 | if (isFinalized) { 49 | string msg("This chromosome has been finalized. "); 50 | msg.append("The header cannot be modified."); 51 | throw InvalidOperationException(msg); 52 | } else { 53 | header = info; 54 | isHeaderReady = true; 55 | } 56 | } 57 | 58 | /** 59 | * This method can waste memory if the sequence is large. 60 | * Consider using the method appendToSequence instead 61 | */ 62 | void Chromosome::setSequence(string& seq) { 63 | if (isFinalized) { 64 | string msg("This chromosome has been finalized. "); 65 | msg.append("The sequence cannot be modified."); 66 | throw InvalidOperationException(msg); 67 | } else { 68 | base = seq; 69 | isBaseReady = true; 70 | } 71 | } 72 | 73 | void Chromosome::appendToSequence(string& line) { 74 | if (isFinalized) { 75 | string msg("This chromosome has been finalized. "); 76 | msg.append("The sequence cannot be modified."); 77 | throw InvalidOperationException(msg); 78 | } else { 79 | base.append(line); 80 | isBaseReady = true; 81 | } 82 | } 83 | 84 | void Chromosome::finalize() { 85 | if (isFinalized) { 86 | string msg("This chromosome has been already finalized. 
"); 87 | msg.append("Finalize can be only called once."); 88 | throw InvalidOperationException(msg); 89 | } else if (!(isHeaderReady && isBaseReady)) { 90 | string msg( 91 | "The header and the sequence must be set before calling finalize"); 92 | throw InvalidOperationException(msg); 93 | } else { 94 | help(1000000, true); 95 | isFinalized = true; 96 | } 97 | } 98 | 99 | void Chromosome::help(int len, bool canMerge) { 100 | effectiveSize = 0; 101 | segLength = len; 102 | segment = new vector *>(); 103 | // segment->reserve(100); 104 | 105 | toUpperCase(); 106 | removeN(); 107 | if (canMerge) { 108 | mergeSegments(); 109 | } 110 | makeSegmentList(); 111 | calculateEffectiveSize(); 112 | } 113 | 114 | Chromosome::~Chromosome() { 115 | base.clear(); 116 | 117 | Util::deleteInVector(segment); 118 | segment->clear(); 119 | delete segment; 120 | } 121 | 122 | void Chromosome::readFasta() { 123 | bool isFirst = true; 124 | header = string(""); 125 | base = string(""); 126 | 127 | ifstream in(chromFile.c_str()); 128 | while (in.good()) { 129 | string line; 130 | getline(in, line); 131 | if (line[0] == '>') { 132 | if (!isFirst) { 133 | string msg = "Chromosome file: "; 134 | msg = msg + chromFile; 135 | msg = 136 | msg 137 | + " must have one sequence only. 
But it has more than one."; 138 | throw InvalidInputException(msg); 139 | } else { 140 | header = line; 141 | isFirst = false; 142 | } 143 | } else { 144 | base.append(line); 145 | } 146 | } 147 | in.close(); 148 | } 149 | 150 | /** 151 | * Convert alphabet to upper case if it has not been done before 152 | **/ 153 | void Chromosome::toUpperCase() { 154 | for (int i = 0; i < base.length(); i++) { 155 | base[i] = toupper(base[i]); 156 | } 157 | } 158 | 159 | /** 160 | * Segment coordinates are inclusive [s,e] 161 | **/ 162 | void Chromosome::removeN() { 163 | // Store non-N index 164 | int start = -1; 165 | for (int i = 0; i < base.size(); i++) { 166 | if (base[i] != 'N' && start == -1) { 167 | start = i; 168 | } else if (base[i] == 'N' && start != -1) { 169 | vector * v = new vector(); 170 | v->push_back(start); 171 | v->push_back(i - 1); 172 | segment->push_back(v); 173 | 174 | start = -1; 175 | } else if (i == base.size() - 1 && base[i] != 'N' && start != -1) { 176 | vector * v = new vector(); 177 | v->push_back(start); 178 | v->push_back(i); 179 | 180 | segment->push_back(v); 181 | start = -1; 182 | } 183 | } 184 | } 185 | 186 | /** 187 | * If the gap between two consecutive segments is less than 10 bp. 188 | * Segments that are shorter than 20 bp are not added. 
189 | */ 190 | void Chromosome::mergeSegments() { 191 | vector *> * mSegment = new vector *>(); 192 | 193 | int s = segment->at(0)->at(0); 194 | int e = segment->at(0)->at(1); 195 | 196 | for (int i = 1; i < segment->size(); i++) { 197 | int s1 = segment->at(i)->at(0); 198 | int e1 = segment->at(i)->at(1); 199 | 200 | if (s1 - e < 10) { 201 | e = e1; 202 | } else { 203 | if (e - s + 1 >= 20) { 204 | vector * seg = new vector(); 205 | seg->push_back(s); 206 | seg->push_back(e); 207 | mSegment->push_back(seg); 208 | } 209 | 210 | s = s1; 211 | e = e1; 212 | } 213 | } 214 | 215 | // Handle the last index 216 | if (e - s + 1 >= 20) { 217 | vector * seg = new vector(); 218 | seg->push_back(s); 219 | seg->push_back(e); 220 | mSegment->push_back(seg); 221 | } 222 | 223 | Util::deleteInVector(segment); 224 | segment->clear(); 225 | segment = mSegment; 226 | } 227 | 228 | void Chromosome::makeSegmentList() { 229 | vector *> * segmentList = new vector *>(); 230 | int segmentCount = segment->size(); 231 | for (int oo = 0; oo < segmentCount; oo++) { 232 | int s = segment->at(oo)->at(0); 233 | int e = segment->at(oo)->at(1); 234 | 235 | if (e - s + 1 > segLength) { 236 | int fragNum = (int) (e - s + 1) / segLength; 237 | 238 | for (int h = 0; h < fragNum; h++) { 239 | int fragStart = s + (h * segLength); 240 | int fragEnd = 241 | (h == fragNum - 1) ? 
e : fragStart + segLength - 1; 242 | vector * v = new vector(); 243 | v->push_back(fragStart); 244 | v->push_back(fragEnd); 245 | segmentList->push_back(v); 246 | } 247 | } else { 248 | vector * v = new vector(); 249 | v->push_back(segment->at(oo)->at(0)); 250 | v->push_back(segment->at(oo)->at(1)); 251 | segmentList->push_back(v); 252 | } 253 | } 254 | 255 | Util::deleteInVector(segment); 256 | delete segment; 257 | segment = segmentList; 258 | } 259 | 260 | const string* Chromosome::getBase() { 261 | return &base; 262 | } 263 | 264 | const vector *> * Chromosome::getSegment() { 265 | return segment; 266 | } 267 | 268 | void Chromosome::printSegmentList(){ 269 | int l = segment->size(); 270 | cout << "Segment list size = " << l << endl; 271 | for(int i = 0; i < l; i++){ 272 | cout << segment->at(i)->at(0) << "\t"; 273 | cout << segment->at(i)->at(1) << endl; 274 | } 275 | } 276 | 277 | string Chromosome::getHeader() { 278 | return header; 279 | } 280 | 281 | int Chromosome::size() { 282 | return base.size(); 283 | } 284 | 285 | void Chromosome::calculateEffectiveSize() { 286 | int segmentCount = segment->size(); 287 | for (int oo = 0; oo < segmentCount; oo++) { 288 | int s = segment->at(oo)->at(0); 289 | int e = segment->at(oo)->at(1); 290 | effectiveSize += (e - s + 1); 291 | } 292 | } 293 | 294 | int Chromosome::getEffectiveSize() { 295 | return effectiveSize; 296 | } 297 | 298 | int Chromosome::getGcContent() { 299 | int gc = 0; 300 | int size = base.size(); 301 | for (int i = 0; i < size; i++) { 302 | char n = base.at(i); 303 | if (n == 'C' || n == 'G') { 304 | gc++; 305 | } 306 | } 307 | return gc; 308 | } 309 | -------------------------------------------------------------------------------- /src/nonltr/Chromosome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Chromosome.h 3 | * 4 | * Created on: Mar 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | #ifndef CHROMOSOME_H_ 8 | #define 
CHROMOSOME_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "IChromosome.h" 17 | #include "../exception/InvalidOperationException.h" 18 | #include "../exception/InvalidInputException.h" 19 | #include "../utility/Util.h" 20 | 21 | using namespace std; 22 | using namespace nonltr; 23 | using namespace utility; 24 | using namespace exception; 25 | 26 | namespace nonltr { 27 | class Chromosome: public IChromosome { 28 | public: 29 | Chromosome(); 30 | Chromosome(string); 31 | Chromosome(string, bool); 32 | Chromosome(string, int); 33 | Chromosome(string &, string&); 34 | Chromosome(string &, string&, int); 35 | 36 | int getGcContent(); 37 | 38 | virtual ~Chromosome(); 39 | 40 | virtual const string* getBase(); 41 | virtual const vector *> * getSegment(); 42 | virtual void printSegmentList(); 43 | virtual string getHeader(); 44 | virtual int size(); 45 | virtual int getEffectiveSize(); 46 | virtual void setHeader(string&); 47 | virtual void setSequence(string&); 48 | virtual void appendToSequence(string&); 49 | virtual void finalize(); 50 | 51 | 52 | protected: 53 | string chromFile; 54 | string header; 55 | string base; 56 | int effectiveSize; 57 | int segLength; 58 | 59 | vector *> * segment; 60 | void readFasta(); 61 | void toUpperCase(); 62 | void removeN(); 63 | void mergeSegments(); 64 | virtual void help(int, bool); 65 | void makeSegmentList(); 66 | void calculateEffectiveSize(); 67 | 68 | private: 69 | bool isHeaderReady; 70 | bool isBaseReady; 71 | bool isFinalized; 72 | 73 | void reverseSegments(); 74 | 75 | }; 76 | } 77 | 78 | #endif /* CHROMOSOME_H_ */ 79 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigit.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigit.cpp 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD at the NCB1/NLM/NIH 6 | * A A 7 | * T T 8 | * G G 9 
| * C C 10 | * R G or A 11 | * Y T or C 12 | * M A or C 13 | * K G or T 14 | * S G or C 15 | * W A or T 16 | * H A or C or T 17 | * B G or T or C 18 | * V G or C or A 19 | * D G or T or A 20 | * N G or T or A or C 21 | */ 22 | #include 23 | #include 24 | 25 | #include "Chromosome.h" 26 | #include "ChromosomeOneDigit.h" 27 | #include "../exception/InvalidInputException.h" 28 | 29 | using namespace exception; 30 | 31 | namespace nonltr { 32 | 33 | ChromosomeOneDigit::ChromosomeOneDigit() : 34 | Chromosome() { 35 | } 36 | 37 | ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : 38 | Chromosome(fileName) { 39 | help(); 40 | } 41 | 42 | ChromosomeOneDigit::ChromosomeOneDigit(string seq, string info) : 43 | Chromosome(seq, info) { 44 | help(); 45 | } 46 | 47 | void ChromosomeOneDigit::help() { 48 | // Build codes 49 | buildCodes(); 50 | // Modify the sequence in the super class 51 | encodeNucleotides(); 52 | } 53 | 54 | void ChromosomeOneDigit::finalize() { 55 | Chromosome::finalize(); 56 | help(); 57 | } 58 | 59 | void ChromosomeOneDigit::buildCodes() { 60 | // Make map 61 | codes = new map(); 62 | 63 | // Certain nucleotides 64 | codes->insert(map::value_type('A', (char) 0)); 65 | codes->insert(map::value_type('C', (char) 1)); 66 | codes->insert(map::value_type('G', (char) 2)); 67 | codes->insert(map::value_type('T', (char) 3)); 68 | 69 | // Common uncertain nucleotide 70 | // codes->insert(map::value_type('N', (char) 4)); 71 | 72 | // Uncertain nucleotides 73 | codes->insert(map::value_type('R', codes->at('G'))); 74 | codes->insert(map::value_type('Y', codes->at('C'))); 75 | codes->insert(map::value_type('M', codes->at('A'))); 76 | codes->insert(map::value_type('K', codes->at('T'))); 77 | codes->insert(map::value_type('S', codes->at('G'))); 78 | codes->insert(map::value_type('W', codes->at('T'))); 79 | codes->insert(map::value_type('H', codes->at('C'))); 80 | codes->insert(map::value_type('B', codes->at('T'))); 81 | codes->insert(map::value_type('V', 
codes->at('A'))); 82 | codes->insert(map::value_type('D', codes->at('T'))); 83 | codes->insert(map::value_type('N', codes->at('C'))); 84 | codes->insert(map::value_type('X', codes->at('G'))); 85 | } 86 | 87 | ChromosomeOneDigit::~ChromosomeOneDigit() { 88 | codes->clear(); 89 | delete codes; 90 | } 91 | 92 | /** 93 | * This method converts nucleotides in the segments to single digit codes 94 | */ 95 | void ChromosomeOneDigit::encodeNucleotides() { 96 | 97 | for (int s = 0; s < segment->size(); s++) { 98 | int segStart = segment->at(s)->at(0); 99 | int segEnd = segment->at(s)->at(1); 100 | for (int i = segStart; i <= segEnd; i++) { 101 | if (codes->count(base[i]) > 0) { 102 | base[i] = codes->at(base[i]); 103 | } else { 104 | string msg = "Invalid nucleotide: "; 105 | msg.append(1, base[i]); 106 | throw InvalidInputException(msg); 107 | } 108 | } 109 | } 110 | 111 | // Digitize skipped segments 112 | int segNum = segment->size(); 113 | if(segNum > 0){ 114 | // The first interval - before the first segment 115 | int segStart = 0; 116 | int segEnd = segment->at(0)->at(0)-1; 117 | 118 | for (int s = 0; s <= segNum; s++) { 119 | for (int i = segStart; i <= segEnd; i++) { 120 | char c = base[i]; 121 | if(c != 'N'){ 122 | if (codes->count(c) > 0) { 123 | base[i] = codes->at(c); 124 | } else { 125 | string msg = "Invalid nucleotide: "; 126 | msg.append(1, c); 127 | throw InvalidInputException(msg); 128 | } 129 | } 130 | } 131 | 132 | // The regular intervals between two segments 133 | if(s < segNum-1){ 134 | segStart = segment->at(s)->at(1)+1; 135 | segEnd = segment->at(s+1)->at(0)-1; 136 | } 137 | // The last interval - after the last segment 138 | else if(s == segNum - 1){ 139 | segStart = segment->at(s)->at(1)+1; 140 | segEnd = base.size()-1; 141 | } 142 | } 143 | } 144 | } 145 | 146 | /* 147 | void ChromosomeOneDigit::encodeNucleotides() { 148 | int seqLen = base.size(); 149 | 150 | for (int i = 0; i < seqLen; i++) { 151 | if (codes->count(base[i]) > 0) { 152 | base[i] 
= codes->at(base[i]); 153 | } else { 154 | string msg = "Invalid nucleotide: "; 155 | msg.append(1, base[i]); 156 | throw InvalidInputException(msg); 157 | } 158 | } 159 | 160 | } 161 | */ 162 | 163 | /** 164 | * Cannot be called on already finalized object. 165 | */ 166 | void ChromosomeOneDigit::makeR() { 167 | //cout << "Making reverse ..." << endl; 168 | makeReverse(); 169 | reverseSegments(); 170 | } 171 | 172 | /** 173 | * Cannot be called on already finalized object. 174 | */ 175 | void ChromosomeOneDigit::makeRC() { 176 | //cout << "Making reverse complement ..." << endl; 177 | makeComplement(); 178 | makeReverse(); 179 | reverseSegments(); 180 | } 181 | 182 | void ChromosomeOneDigit::makeComplement() { 183 | map complement; 184 | 185 | // Certain nucleotides 186 | complement.insert(map::value_type((char) 0, (char) 3)); 187 | complement.insert(map::value_type((char) 1, (char) 2)); 188 | complement.insert(map::value_type((char) 2, (char) 1)); 189 | complement.insert(map::value_type((char) 3, (char) 0)); 190 | 191 | // Unknown nucleotide 192 | complement.insert(map::value_type('N', 'N')); 193 | // complement.insert(map::value_type((char) 4, (char) 4)); 194 | 195 | // Convert a sequence to its complement 196 | int seqLen = base.size(); 197 | for (int i = 0; i < seqLen; i++) { 198 | if (complement.count(base[i]) > 0) { 199 | base[i] = complement.at(base[i]); 200 | } else { 201 | cerr << "Error: The digit " << (char) base[i]; 202 | cerr << " does not represent a base." 
<< endl; 203 | exit(2); 204 | } 205 | } 206 | } 207 | 208 | void ChromosomeOneDigit::makeReverse() { 209 | int last = base.size() - 1; 210 | 211 | // Last index to be switched 212 | int middle = base.size() / 2; 213 | 214 | for (int i = 0; i < middle; i++) { 215 | char temp = base[last - i]; 216 | base[last - i] = base[i]; 217 | base[i] = temp; 218 | } 219 | } 220 | 221 | void ChromosomeOneDigit::reverseSegments() { 222 | int segNum = segment->size(); 223 | int lastBase = size() - 1; 224 | 225 | // Calculate the coordinate on the main strand 226 | for (int i = 0; i < segNum; i++) { 227 | vector * seg = segment->at(i); 228 | 229 | int s = lastBase - seg->at(1); 230 | int e = lastBase - seg->at(0); 231 | seg->clear(); 232 | seg->push_back(s); 233 | seg->push_back(e); 234 | } 235 | 236 | // Reverse the regions within the list 237 | int lastRegion = segNum - 1; 238 | int middle = segNum / 2; 239 | for (int i = 0; i < middle; i++) { 240 | vector * temp = segment->at(lastRegion - i); 241 | (*segment)[lastRegion - i] = segment->at(i); 242 | (*segment)[i] = temp; 243 | } 244 | } 245 | 246 | } 247 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigit.h 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef CHROMOSOMEONEDIGIT_H_ 9 | #define CHROMOSOMEONEDIGIT_H_ 10 | 11 | #include 12 | #include "Chromosome.h" 13 | 14 | namespace nonltr { 15 | class ChromosomeOneDigit: public Chromosome { 16 | 17 | private: 18 | /* Fields */ 19 | map * codes; 20 | 21 | /* Methods */ 22 | void help(); 23 | void buildCodes(); 24 | void encodeNucleotides(); 25 | 26 | void makeReverse(); 27 | void makeComplement(); 28 | void reverseSegments(); 29 | 30 | public: 31 | /* Methods */ 32 | ChromosomeOneDigit(); 33 | ChromosomeOneDigit(string); 34 | 
ChromosomeOneDigit(string, string); 35 | virtual ~ChromosomeOneDigit(); 36 | virtual void finalize(); 37 | 38 | void makeR(); 39 | void makeRC(); 40 | }; 41 | } 42 | 43 | #endif /* CHROMOSOMEONEDIGIT_H_ */ 44 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeRandom.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeRandom.h 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMOSOMERANDOM_H_ 9 | #define CHROMOSOMERANDOM_H_ 10 | 11 | #include 12 | 13 | #include "IChromosome.h" 14 | 15 | namespace nonltr { 16 | 17 | class ChromosomeRandom: public nonltr::IChromosome { 18 | // Key-value pair type. 19 | typedef map::value_type valType; 20 | 21 | private: 22 | int n; 23 | char unread; 24 | IChromosome * oChrom; 25 | vector * alpha; 26 | map * table; 27 | string * rBase; 28 | vector * keyList; 29 | map * codes; 30 | 31 | void fillKeyList(); 32 | void initializeTable(); 33 | void countWords(); 34 | void convertToProbabilities(); 35 | void printTable(); 36 | void generateRandomSequence(); 37 | 38 | public: 39 | ChromosomeRandom(int, IChromosome*, char, vector*); 40 | virtual ~ChromosomeRandom(); 41 | 42 | virtual const string* getBase(); 43 | virtual const vector *> * getSegment(); 44 | virtual string getHeader(); 45 | virtual void printSequence(string); 46 | void printSequence(string, string *); 47 | void printEffectiveSequence(string); 48 | }; 49 | 50 | } /* namespace nonltr */ 51 | #endif /* CHROMOSOMERANDOM_H_ */ 52 | -------------------------------------------------------------------------------- /src/nonltr/DetectorMaxima.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DetectorMaxima.h 3 | * 4 | * Created on: May 31, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef DETECTORMAXIMA_H_ 9 | #define DETECTORMAXIMA_H_ 10 | 11 | #include 12 | #include 
13 | 14 | #include "../utility/ILocation.h" 15 | 16 | using namespace std; 17 | using namespace utility; 18 | 19 | namespace nonltr { 20 | 21 | class DetectorMaxima { 22 | private: 23 | 24 | int segStart; 25 | int segEnd; 26 | double s; 27 | double w; 28 | double m; 29 | double t; 30 | double p; 31 | int e; 32 | int halfS; 33 | 34 | vector * oScores; 35 | vector * scores; 36 | vector * mask; 37 | vector * first; 38 | vector * second; 39 | vector * maxima; 40 | // vector *> * allMaxima; 41 | 42 | vector * separatorList; 43 | vector * regionList; 44 | 45 | void makeMask(); 46 | void smooth(); 47 | void deriveFirst(); 48 | void deriveSecond(); 49 | void findMaxima(); 50 | 51 | void findSeparators(); 52 | void findRegions(); 53 | 54 | void extendRegions(); 55 | 56 | int countLessThan(vector *, int, int, double); 57 | 58 | /** 59 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 60 | */ 61 | inline double round(double number) { 62 | return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); 63 | } 64 | 65 | public: 66 | DetectorMaxima(int, int, double, double, double, double, double, int, 67 | vector *); 68 | virtual ~DetectorMaxima(); 69 | const vector* getRegionList() const; 70 | const vector* getFirst() const; 71 | const vector* getSecond() const; 72 | 73 | // const vector *>* getAllMaxima() const; 74 | }; 75 | 76 | } /* namespace nonltr */ 77 | #endif /* DETECTORMAXIMA_H_ */ 78 | -------------------------------------------------------------------------------- /src/nonltr/EnrichmentMarkovView.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * EnrichmentMarkovView.cpp 3 | * 4 | * Created on: Apr 17, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | namespace nonltr { 9 | 10 | /** 11 | * The Markov order. It start at 0. 
12 | */ 13 | template 14 | EnrichmentMarkovView::EnrichmentMarkovView(int k, int order, int m) : 15 | minObs(m), factor(10000.00), KmerHashTable(k) { 16 | initialize(order); 17 | } 18 | 19 | template 20 | EnrichmentMarkovView::EnrichmentMarkovView(int k, V initValue, int order, 21 | int m) : 22 | minObs(m), factor(10000.00), KmerHashTable(k, initValue) { 23 | initialize(order); 24 | } 25 | 26 | template 27 | void EnrichmentMarkovView::initialize(int order) { 28 | // Test start 29 | // cout << "Testing: " << minObs << endl; 30 | // Test end 31 | 32 | o = order; 33 | if (o < 0) { 34 | string msg("The Markov order must be non-negative integer. "); 35 | msg.append("The invalid input is: "); 36 | msg.append(Util::int2string(o)); 37 | msg.append("."); 38 | throw InvalidInputException(msg); 39 | } 40 | 41 | if (o >= KmerHashTable::k) { 42 | string msg("The Markov order cannot be >= k (k-mer)."); 43 | throw InvalidInputException(msg); 44 | } 45 | 46 | l = 0; 47 | modelList = new vector *>(); 48 | 49 | for (int i = 1; i <= o + 1; i++) { 50 | modelList->push_back(new KmerHashTable(i)); 51 | } 52 | } 53 | 54 | template 55 | EnrichmentMarkovView::~EnrichmentMarkovView() { 56 | Util::deleteInVector(modelList); 57 | delete modelList; 58 | } 59 | 60 | /** 61 | * This method count words of size 1 to order+1 in the input sequence. 62 | * In other words, it updates the background tables. In addition, it 63 | * updates the length of the genome. 64 | * 65 | * sequence: is the input sequence. 66 | * start: the start index - inclosing. 67 | * end: the end index - inclosing. 
68 | */ 69 | template 70 | void EnrichmentMarkovView::count(const char * sequence, int start, 71 | int end) { 72 | 73 | // Multiple by 2 if scanning the forward strand and its reverse complement 74 | // l = l + (2 * (end - start + 1)); 75 | l = l + (end - start + 1); 76 | 77 | int modelNumber = modelList->size(); 78 | for (int i = 0; i < modelNumber; i++) { 79 | KmerHashTable * t = modelList->at(i); 80 | t->wholesaleIncrement(sequence, start, end - i); 81 | } 82 | } 83 | 84 | /** 85 | * Normalize the count of words in each model. 86 | * Values stored in these models are multiplied by "factor." 87 | */ 88 | template 89 | void EnrichmentMarkovView::generateProbapilities() { 90 | int modelNumber = modelList->size(); 91 | 92 | for (int m = 0; m < modelNumber; m++) { 93 | KmerHashTable * t = modelList->at(m); 94 | int tSize = t->getMaxTableSize(); 95 | 96 | for (int i = 0; i < tSize; i += 4) { 97 | double sum = 0.0; 98 | 99 | for (int j = i; j < i + 4; j++) { 100 | sum += t->valueOf(j); 101 | } 102 | 103 | for (int j = i; j < i + 4; j++) { 104 | t->insert(j, round(factor * ((double) t->valueOf(j) / sum))); 105 | } 106 | } 107 | } 108 | } 109 | 110 | template 111 | void EnrichmentMarkovView::processTable() { 112 | char base = 4; 113 | int modelNumber = modelList->size(); 114 | 115 | // Make a zero in quaternary form as a string of length k. 116 | string q(""); 117 | for (int x = 0; x < KmerHashTable::k; x++) { 118 | q.append(1, 0); 119 | } 120 | 121 | double lowerP; 122 | double upperP; 123 | for (I y = 0; y < KmerHashTable::maxTableSize; y++) { 124 | if (y % 10000000 == 0) { 125 | cout << "Processing " << y << " keys out of " 126 | << KmerHashTable::maxTableSize; 127 | cout << endl; 128 | } 129 | 130 | const char * qc = q.c_str(); 131 | 132 | // Calculate the expected number of occurrences. 133 | 134 | // a. Calculate probability from lower order models. 
135 | // Lower probabilities are the same for four consecutive words of length of k-1 136 | if (y % 4 == 0) { 137 | lowerP = 1.0; 138 | for (int m = 0; m < modelNumber - 1; m++) { 139 | KmerHashTable * oTable = modelList->at(m); 140 | lowerP *= (((double) oTable->valueOf(qc, 0)) / factor); 141 | } 142 | } 143 | 144 | // b. Calculate probability based on the specified order. 145 | KmerHashTable * oTable = modelList->at(modelNumber - 1); 146 | int resultsSize = KmerHashTable::k - o - 1; 147 | 148 | // Upper probabilities are the same for four consecutive words of length of k-1 149 | // The scanning of words or length corresponding to the highest order + 1 150 | // This step is not needed if k = o + 1, i.e. resultsSize = 0. 151 | if (y % 4 == 0) { 152 | if (resultsSize > 0) { 153 | //Initialize the elements of the vector invalid index 154 | vector results = vector(resultsSize, -987); 155 | oTable->wholesaleValueOf(qc, 0, resultsSize - 1, &results, 0); 156 | 157 | upperP = 1.0; 158 | for (int i = 0; i < resultsSize; i++) { 159 | upperP *= (((double) results.at(i)) / factor); 160 | } 161 | results.clear(); 162 | 163 | } else { 164 | upperP = 1.0; 165 | } 166 | } 167 | 168 | // The expected number of occurances 169 | double exp = l * lowerP * upperP 170 | * (((double) oTable->valueOf(qc, resultsSize)) / factor); 171 | 172 | // Calculate the enrichment value. 
173 | // Log value 174 | // values[y] = round((log((double) values[y] + 1.0) - log(exp + 1.0))); 175 | 176 | // Raw value 177 | // Requirement: if observed is >= 5 && observed > expected then the value is the difference 178 | // otherwise the value is zero 179 | 180 | V observed = KmerHashTable::values[y]; 181 | 182 | if (observed >= minObs && observed > exp) { 183 | 184 | KmerHashTable::values[y] = round(observed - exp); 185 | } else { 186 | KmerHashTable::values[y] = 0; 187 | } 188 | 189 | /* 190 | KmerHashTable::values[y] = 191 | round( 192 | (((double) KmerHashTable::values[y] + 1.0) 193 | / (exp + 1.0))); 194 | */ 195 | 196 | // Increment the quaternary number: 197 | // 1 - guard against overflow. 198 | if (q[0] == base - 1) { 199 | string z(""); 200 | z.append(1, 0); 201 | q = z + q; 202 | } 203 | 204 | // 2 - increment the quaternary number by 1. 205 | int qLen = q.size(); 206 | for (int i = qLen - 1; i >= 0; i--) { 207 | if (q[i] + 1 < base) { 208 | q[i] = q[i] + 1; 209 | break; 210 | } else { 211 | q[i] = 0; 212 | } 213 | } 214 | } 215 | } 216 | 217 | } /* namespace nonltr */ 218 | -------------------------------------------------------------------------------- /src/nonltr/EnrichmentMarkovView.h: -------------------------------------------------------------------------------- 1 | /* 2 | * EnrichmentMarkovView.h 3 | * 4 | * Created on: Apr 17, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef ENRICHMENTMARKOVVIEW_H_ 9 | #define ENRICHMENTMARKOVVIEW_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "KmerHashTable.h" 16 | #include "../utility/Util.h" 17 | #include "../exception/InvalidInputException.h" 18 | 19 | using namespace std; 20 | using namespace utility; 21 | using namespace exception; 22 | 23 | namespace nonltr { 24 | 25 | template 26 | class EnrichmentMarkovView: public KmerHashTable{ 27 | 28 | private: 29 | // The minimum number of the observed k-mers 30 | const int minObs; 31 | 32 | // This template 
specification should work up to order of 14, 33 | // i.e. word length = 15 34 | vector *> * modelList; 35 | 36 | // Markov order 37 | int o; 38 | 39 | // Total length 40 | long l; 41 | 42 | // Multiplied the probability of word by this factor 43 | // Equivalent to four decimal points. 44 | const double factor; // = 10000.00; 45 | 46 | // Initialize data members 47 | void initialize(int); 48 | 49 | /** 50 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 51 | */ 52 | inline double round(double number) { 53 | return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); 54 | } 55 | 56 | public: 57 | EnrichmentMarkovView(int, int, int); 58 | EnrichmentMarkovView(int, V, int, int); 59 | virtual ~EnrichmentMarkovView(); 60 | 61 | void count(const char *, int, int); 62 | void generateProbapilities(); 63 | void processTable(); 64 | }; 65 | } /* namespace nonltr */ 66 | 67 | #include "EnrichmentMarkovView.cpp" 68 | 69 | #endif /* ENRICHMENTMARKOVVIEW_H_ */ 70 | -------------------------------------------------------------------------------- /src/nonltr/HMM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * HMM.h 3 | * 4 | * Created on: Jun 21, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef HMM_H_ 9 | #define HMM_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "../utility/ILocation.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | 21 | namespace nonltr { 22 | 23 | class HMM { 24 | private: 25 | const int PRECISION; 26 | double minusInf; 27 | vector * pList; 28 | vector *> * tList; 29 | vector * oList; 30 | 31 | void initializeHelper(); 32 | // Returns the index of the last candidate in the segment 33 | int trainHelper1(int, int, int); 34 | void trainHelper2(int, int, int, int); 35 | void trainPositive(int, int); 36 | void trainNegative(int, int); 37 | void move(int, int); 38 | void checkBase(double); 39 | 40 | /* 41 | inline int 
getPstvState(int score) { 42 | int state = ceil(log(score) / logBase); 43 | if (state < 0) { 44 | state = 0; 45 | } 46 | return state; 47 | } 48 | 49 | inline int getNgtvState(int score) { 50 | int state = ceil(log(score) / logBase); 51 | if (state < 0) { 52 | state = 0; 53 | } 54 | return state + positiveStateNumber; 55 | } 56 | */ 57 | 58 | inline int getPstvState(int index) { 59 | int state = scoreList->at(index); 60 | return state; 61 | } 62 | 63 | inline int getNgtvState(int index) { 64 | int state = scoreList->at(index); 65 | return state + positiveStateNumber; 66 | } 67 | 68 | protected: 69 | double base; 70 | double logBase; 71 | int stateNumber; 72 | int positiveStateNumber; 73 | 74 | vector * scoreList; 75 | const vector *> * segmentList; 76 | const vector * candidateList; 77 | 78 | void initialize(double, int); 79 | /** 80 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 81 | */ 82 | inline double round(double number) { 83 | return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); 84 | } 85 | 86 | public: 87 | HMM(string); // Build a model from file 88 | HMM(double, int); 89 | // HMM(vector *, const vector *> *, 90 | // const vector *, double); 91 | virtual ~HMM(); 92 | void train(vector *, const vector *> *, const vector *); 93 | void normalize(); 94 | double decode(int, int, vector *, vector&); 95 | double decode(int, int, vector *, vector&); 96 | int getPositiveStateNumber(); 97 | void print(); 98 | void print(string); 99 | double getBase(); 100 | }; 101 | 102 | } /* namespace nonltr */ 103 | #endif /* HMM_H_ */ 104 | -------------------------------------------------------------------------------- /src/nonltr/IChromosome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * IChromosome.h 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef ICHROMOSOME_H_ 9 | #define ICHROMOSOME_H_ 10 | 11 | #include 12 | #include 13 | 14 | using 
/**
 * Interface of a chromosome: its sequence, its segment list and its header.
 *
 * The destructor is virtual so that an object of a derived class can be
 * destroyed safely through an IChromosome pointer.
 */
class IChromosome {
public:
	//IChromosome();
	virtual ~IChromosome() {
	}

	/** Pointer to the sequence of the chromosome. */
	virtual const string* getBase() = 0;

	/** Pointer to the list of segments; each segment is a vector of ints. */
	virtual const vector<vector<int> *> * getSegment() = 0;

	/** The header of the chromosome. */
	virtual string getHeader() = 0;
};
maxTableSize; 30 | 31 | // The hashed values, i.e. the values of the hash table. 32 | // The index is the 4ry representation of the key 33 | V * values; 34 | V initialValue; 35 | 36 | private: 37 | // [4^0, 4^1, ... , 4^(k-1)] 38 | I * bases; 39 | I * mMinusOne; 40 | void initialize(int, V); 41 | 42 | public: 43 | /* Methods */ 44 | KmerHashTable(int); 45 | KmerHashTable(int, V); 46 | 47 | virtual ~KmerHashTable(); 48 | 49 | I hash(const char *); 50 | I hash(const char *, int); 51 | void hash(const char *, int, int, vector *); 52 | 53 | void insert(const char*, V); 54 | void insert(const char*, int, V); 55 | void insert(I, V); 56 | 57 | void increment(const char*); 58 | void increment(const char*, int); 59 | void wholesaleIncrement(const char*, int, int); 60 | 61 | void addReverseComplement(); 62 | I countNonInitialEntries(); 63 | vector *getKeys(); 64 | void printTable(string); 65 | void checkOverflow(); 66 | 67 | /*Vritual methods from ITableView*/ 68 | virtual V valueOf(const char*); 69 | virtual V valueOf(const char*, int); 70 | virtual V valueOf(I); 71 | virtual void wholesaleValueOf(const char *, int, int, vector *); 72 | virtual void wholesaleValueOf(const char *, int, int, vector *, int); 73 | 74 | virtual int getK(); 75 | virtual I getMaxTableSize(); 76 | virtual V getMaxValue(); 77 | virtual const V * getValues() const; 78 | }; 79 | } 80 | 81 | #include "KmerHashTable.cpp" 82 | 83 | #endif /* KMERHASHTABLE_H_ */ 84 | -------------------------------------------------------------------------------- /src/nonltr/LocationList.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationList.cpp 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | * 7 | * 8 | * An instance of this class holds a list of merged locations. 
9 | */ 10 | 11 | #include "LocationList.h" 12 | 13 | namespace nonltr { 14 | 15 | LocationList::LocationList(string chromNameIn) { 16 | chromName = chromNameIn; 17 | regionList = new vector(); 18 | merge(); 19 | } 20 | 21 | LocationList::~LocationList() { 22 | Util::deleteInVector(regionList); 23 | delete regionList; 24 | } 25 | 26 | void LocationList::add(int start, int end) { 27 | regionList->push_back(new Location(start, end)); 28 | } 29 | 30 | void LocationList::merge() { 31 | int regionCount = regionList->size(); 32 | int gg = 0; 33 | while (gg < regionCount) { 34 | ILocation * region = regionList->at(gg); 35 | 36 | int regionStart = region->getStart(); 37 | int regionEnd = region->getEnd(); 38 | 39 | if (gg > 0) { 40 | ILocation * pRegion = regionList->at(gg - 1); 41 | int pStart = pRegion->getStart(); 42 | int pEnd = pRegion->getEnd(); 43 | 44 | if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { 45 | pRegion->setEnd(regionEnd > pEnd ? regionEnd : pEnd); 46 | regionList->erase(regionList->begin() + gg); 47 | delete region; 48 | regionCount = regionList->size(); 49 | } else { 50 | gg++; 51 | } 52 | } 53 | 54 | if (gg == 0) { 55 | gg++; 56 | } 57 | } 58 | } 59 | 60 | void LocationList::mergeWithAnotherList( 61 | const vector * const otherList) { 62 | //A pre-condition: Ensure that the other list is sorted 63 | for (int h = 1; h < otherList->size(); h++) { 64 | if (otherList->at(h)->getStart() < otherList->at(h - 1)->getStart()) { 65 | throw InvalidStateException( 66 | string("LocationList - The other list is not sorted.")); 67 | } 68 | } 69 | 70 | // Start 71 | vector * mergedList = new vector(); 72 | 73 | int i = 0; 74 | int j = 0; 75 | int iLimit = regionList->size(); 76 | int jLimit = otherList->size(); 77 | 78 | // Continue until one list is finished 79 | while (i < iLimit && j < jLimit) { 80 | ILocation * iLoc = regionList->at(i); 81 | ILocation * jLoc = otherList->at(j); 82 | 83 | if (iLoc->getStart() < jLoc->getStart()) { 84 | 
mergedList->push_back(iLoc); 85 | i++; 86 | } else { 87 | mergedList->push_back(new Location(*jLoc)); 88 | j++; 89 | } 90 | } 91 | 92 | // Once one list is finished, copy the rest of the other list 93 | if (i == iLimit) { 94 | for (; j < jLimit; j++) { 95 | mergedList->push_back(new Location(*(otherList->at(j)))); 96 | } 97 | } else if (j == jLimit) { 98 | for (; i < iLimit; i++) { 99 | mergedList->push_back(regionList->at(i)); 100 | } 101 | } 102 | 103 | // Once done 104 | // Util::deleteInVector(regionList); 105 | regionList->clear(); // Need to test this line 106 | delete regionList; 107 | regionList = mergedList; 108 | 109 | merge(); 110 | 111 | //A post-condition: Ensure that the list is sorted 112 | for (int h = 1; h < regionList->size(); h++) { 113 | if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) { 114 | throw InvalidStateException(string("This list is not sorted.")); 115 | } 116 | } 117 | } 118 | 119 | void LocationList::print() { 120 | cout << endl << chromName << endl; 121 | for (int i = 0; i < regionList->size(); i++) { 122 | int s = regionList->at(i)->getStart(); 123 | int e = regionList->at(i)->getEnd(); 124 | cout << s << "-" << e << endl; 125 | } 126 | } 127 | 128 | const vector * LocationList::getList() { 129 | return regionList; 130 | } 131 | 132 | void LocationList::convertToRedFormat() { 133 | trim(1); 134 | } 135 | 136 | void LocationList::trim(int x) { 137 | for (int i = regionList->size() - 1; i >= 0; i--) { 138 | ILocation * region = regionList->at(i); 139 | int start = region->getStart(); 140 | int newEnd = region->getEnd() - x; 141 | 142 | if (newEnd < 0 || start > newEnd) { 143 | regionList->erase(regionList->begin() + i); 144 | delete region; 145 | } else { 146 | region->setEnd(newEnd); 147 | } 148 | } 149 | } 150 | 151 | } 152 | 153 | /* namespace nonltr */ 154 | -------------------------------------------------------------------------------- /src/nonltr/LocationList.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * LocationList.h 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Z. Girgis, PhD 6 | */ 7 | 8 | #ifndef SRC_NONLTR_LOCATIONLIST_H_ 9 | #define SRC_NONLTR_LOCATIONLIST_H_ 10 | 11 | #include 12 | #include "../utility/Util.h" 13 | #include "../utility/ILocation.h" 14 | #include "../utility/Location.h" 15 | #include "../exception/InvalidStateException.h" 16 | 17 | using namespace std; 18 | using namespace utility; 19 | using namespace exception; 20 | 21 | namespace nonltr { 22 | 23 | class LocationList { 24 | private: 25 | string chromName; 26 | vector * regionList; 27 | void merge(); 28 | 29 | public: 30 | LocationList(string); 31 | virtual ~LocationList(); 32 | 33 | void add(int, int); 34 | 35 | /** 36 | * Take a sorted list 37 | */ 38 | void mergeWithAnotherList(const vector * const); 39 | 40 | 41 | /** 42 | * Print locations 43 | */ 44 | void print(); 45 | 46 | const vector * getList(); 47 | void convertToRedFormat(); 48 | void trim(int ); 49 | }; 50 | 51 | } /* namespace nonltr */ 52 | 53 | #endif /* SRC_NONLTR_LOCATIONLIST_H_ */ 54 | -------------------------------------------------------------------------------- /src/nonltr/LocationListCollection.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationListCollection.cpp 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "LocationListCollection.h" 9 | 10 | namespace nonltr { 11 | 12 | LocationListCollection::LocationListCollection(string fileNameIn) { 13 | fileName = fileNameIn; 14 | collection = new map(); 15 | readCoordinates(); 16 | } 17 | 18 | LocationListCollection::~LocationListCollection() { 19 | collection->clear(); 20 | delete collection; 21 | } 22 | 23 | void LocationListCollection::readCoordinates() { 24 | Util::checkFile(fileName); 25 | 26 | ifstream in(fileName.c_str()); 27 | LocationList * locList; 28 | 
string previousChromName(""); 29 | 30 | while (in.good()) { 31 | string line; 32 | getline(in, line); 33 | 34 | if (line.compare(string("")) != 0) { 35 | int colIndex = line.find_last_of(':'); 36 | int dashIndex = line.find_last_of('-'); 37 | 38 | string chromName = line.substr(0, colIndex); 39 | 40 | if (previousChromName.compare(chromName) != 0) { 41 | 42 | cout << "Processing regions of " << chromName << endl; 43 | 44 | locList = new LocationList(chromName); 45 | collection->insert( 46 | map::value_type(chromName, 47 | locList)); 48 | 49 | previousChromName = chromName; 50 | } 51 | 52 | int start = 53 | atoi( 54 | line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str()); 55 | int end = atoi(line.substr(dashIndex + 1).c_str()); 56 | locList->add(start, end); 57 | } 58 | } 59 | 60 | in.close(); 61 | } 62 | 63 | void LocationListCollection::print() { 64 | map::iterator itr_s = collection->begin(); 65 | map::iterator itr_e = collection->end(); 66 | while (itr_s != itr_e) { 67 | collection->at(itr_s->first)->print(); 68 | ++itr_s; 69 | } 70 | } 71 | 72 | LocationList * const LocationListCollection::getLocationList(string header) { 73 | if (collection->count(header) == 0) { 74 | string msg("Regions of "); 75 | msg.append(header); 76 | msg.append(" cannot be found.\n"); 77 | throw InvalidStateException(msg); 78 | } 79 | 80 | return collection->at(header); 81 | } 82 | 83 | void LocationListCollection::convertToRedFormat() { 84 | map::iterator itr_s = collection->begin(); 85 | map::iterator itr_e = collection->end(); 86 | while (itr_s != itr_e) { 87 | collection->at(itr_s->first)->convertToRedFormat(); 88 | ++itr_s; 89 | } 90 | } 91 | 92 | void LocationListCollection::trim(int x) { 93 | map::iterator itr_s = collection->begin(); 94 | map::iterator itr_e = collection->end(); 95 | while (itr_s != itr_e) { 96 | collection->at(itr_s->first)->trim(x); 97 | ++itr_s; 98 | } 99 | } 100 | 101 | } /* namespace nonltr */ 102 | 
-------------------------------------------------------------------------------- /src/nonltr/LocationListCollection.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationListCollection.h 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ 9 | #define SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "LocationList.h" 15 | #include "../utility/Util.h" 16 | #include "../exception/InvalidStateException.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | 21 | namespace nonltr { 22 | 23 | class LocationListCollection { 24 | 25 | private: 26 | string fileName; 27 | map * collection; 28 | void readCoordinates(); 29 | 30 | public: 31 | LocationListCollection(string); 32 | virtual ~LocationListCollection(); 33 | LocationList * const getLocationList(string); 34 | void print(); 35 | void convertToRedFormat(); 36 | void trim(int ); 37 | }; 38 | 39 | } /* namespace nonltr */ 40 | 41 | #endif /* SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ */ 42 | -------------------------------------------------------------------------------- /src/nonltr/Scanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Scanner.h 3 | * 4 | * Created on: Aug 19, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SCANNER_H_ 9 | #define SCANNER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "Chromosome.h" 16 | #include "ChromosomeOneDigit.h" 17 | #include "HMM.h" 18 | #include "ITableView.h" 19 | #include "Scorer.h" 20 | #include "../utility/Util.h" 21 | #include "../utility/ILocation.h" 22 | #include "../utility/Location.h" 23 | #include "../exception/InvalidInputException.h" 24 | #include "../exception/InvalidStateException.h" 25 | #include "../exception/FileDoesNotExistException.h" 26 | #include "../exception/InvalidOperationException.h" 27 
| 28 | using namespace std; 29 | using namespace utility; 30 | using namespace exception; 31 | 32 | namespace nonltr { 33 | 34 | class Scanner { 35 | private: 36 | //string chromFile; 37 | ChromosomeOneDigit * chrom; 38 | const vector *> * segmentList; 39 | Scorer * scorer; 40 | vector * scoreList; 41 | vector * regionList; 42 | int k; 43 | HMM * hmm; 44 | // bool isTrainMode; 45 | 46 | // Methods 47 | void start(); 48 | void check(); 49 | void decode(); 50 | void extendByK(); 51 | int extendByKHelper(int, int, int); 52 | void merge(); 53 | 54 | public: 55 | static const int FRMT_POS = 1; 56 | static const int FRMT_BED = 2; 57 | 58 | Scanner(HMM *, int, ChromosomeOneDigit *, string); 59 | Scanner(HMM *, int, ChromosomeOneDigit *, ITableView *); 60 | virtual ~Scanner(); 61 | void makeForwardCoordinates(); 62 | 63 | void printScores(string, bool); 64 | void printIndex(string, bool, int); 65 | void printMasked(string, Chromosome&, bool); 66 | void mergeWithOtherRegions(const vector *); 67 | const vector* getRegionList(); 68 | }; 69 | 70 | } /* namespace nonltr */ 71 | #endif /* SCANNER_H_ */ 72 | -------------------------------------------------------------------------------- /src/nonltr/Scorer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Scorer.cpp 3 | * 4 | * Created on: Aug 3, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | #include "Scorer.h" 8 | 9 | Scorer::Scorer(ChromosomeOneDigit * chromIn, 10 | ITableView * const table) { 11 | chrom = chromIn; 12 | kmerTable = table; 13 | scores = new vector(chrom->getBase()->size(), 0); 14 | k = kmerTable->getK(); 15 | max = -1; 16 | score(); 17 | calculateMax(); 18 | } 19 | 20 | Scorer::~Scorer() { 21 | scores->clear(); 22 | delete scores; 23 | } 24 | 25 | /** 26 | * This method scores each nucleotide in the chromosome. 27 | * The nucleotides represented by 'N' are assigned zero. 
28 | */ 29 | void Scorer::score() { 30 | const vector *> * segment = chrom->getSegment(); 31 | const char * segBases = chrom->getBase()->c_str(); 32 | 33 | for (int s = 0; s < segment->size(); s++) { 34 | int start = segment->at(s)->at(0); 35 | int end = segment->at(s)->at(1); 36 | kmerTable->wholesaleValueOf(segBases, start, end - k + 1, scores, 37 | start); 38 | 39 | // Handle the last word from end - k + 2 till the end, inclusive. 40 | for (int i = end - k + 2; i <= end; i++) { 41 | (*scores)[i] = scores->at(i - 1); 42 | } 43 | } 44 | } 45 | 46 | /** 47 | * This method takes the logarithm of the scores according to the base. 48 | * If the score equals zero, it is left the same. 49 | */ 50 | void Scorer::takeLog(double base) { 51 | // Handle the case where base is one 52 | bool isOne = false; 53 | if (fabs(base - 1.0) < std::numeric_limits::epsilon()) { 54 | isOne = true; 55 | } 56 | double logBase = isOne ? log(1.5) : log(base); 57 | 58 | const vector *> * segment = chrom->getSegment(); 59 | for (int s = 0; s < segment->size(); s++) { 60 | int start = segment->at(s)->at(0); 61 | int end = segment->at(s)->at(1); 62 | for (int h = start; h <= end; h++) { 63 | int score = scores->at(h); 64 | 65 | if (score != 0) { 66 | if (!isOne || (isOne && score > 1)) { 67 | (*scores)[h] = ceil(log(score) / logBase); 68 | } 69 | } 70 | } 71 | } 72 | } 73 | 74 | int Scorer::getK() { 75 | return k; 76 | } 77 | 78 | vector* Scorer::getScores() { 79 | return scores; 80 | } 81 | 82 | void Scorer::printScores(string outputFile, bool canAppend) { 83 | ofstream outScores; 84 | if (canAppend) { 85 | outScores.open(outputFile.c_str(), ios::out | ios::app); 86 | } else { 87 | outScores.open(outputFile.c_str(), ios::out); 88 | } 89 | 90 | int step = 50; 91 | outScores << chrom->getHeader() << endl; 92 | int len = scores->size(); 93 | for (int i = 0; i < len; i = i + step) { 94 | int e = (i + step - 1 > len - 1) ? 
len - 1 : i + step - 1; 95 | for (int k = i; k <= e; k++) { 96 | outScores << scores->at(k) << " "; 97 | } 98 | outScores << endl; 99 | } 100 | outScores << endl; 101 | 102 | outScores.close(); 103 | } 104 | 105 | int Scorer::countLessOrEqual(int thr) { 106 | int count = 0; 107 | const vector *> * segment = chrom->getSegment(); 108 | for (int s = 0; s < segment->size(); s++) { 109 | int start = segment->at(s)->at(0); 110 | int end = segment->at(s)->at(1); 111 | for (int h = start; h <= end; h++) { 112 | if (scores->at(h) <= thr) { 113 | count++; 114 | } 115 | } 116 | } 117 | return count; 118 | } 119 | 120 | void Scorer::calculateMax() { 121 | const vector *> * segmentList = chrom->getSegment(); 122 | int segmentCount = segmentList->size(); 123 | for (int jj = 0; jj < segmentCount; jj++) { 124 | vector * segment = segmentList->at(jj); 125 | int start = segment->at(0); 126 | int end = segment->at(1); 127 | for (int ss = start; ss <= end; ss++) { 128 | int score = scores->at(ss); 129 | if (score > max) { 130 | max = score; 131 | } 132 | } 133 | } 134 | 135 | if (max == -1) { 136 | string msg("Error occurred while finding the maximum score."); 137 | throw InvalidStateException(msg); 138 | } 139 | } 140 | 141 | int Scorer::getMax() { 142 | return max; 143 | } 144 | -------------------------------------------------------------------------------- /src/nonltr/Scorer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Scorer.h 3 | * 4 | * Created on: Aug 3, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SCORER_H_ 9 | #define SCORER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "ITableView.h" 18 | #include "ChromosomeOneDigit.h" 19 | #include "../utility/Util.h" 20 | #include "../exception/InvalidStateException.h" 21 | 22 | using namespace std; 23 | using namespace nonltr; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace nonltr 
{ 28 | class Scorer { 29 | private: 30 | /* Fields */ 31 | ChromosomeOneDigit * chrom; 32 | ITableView * kmerTable; 33 | vector * scores; 34 | int k; 35 | int max; 36 | 37 | /* Methods */ 38 | void score(); 39 | void calculateMax(); 40 | 41 | public: 42 | /* Methods */ 43 | Scorer(ChromosomeOneDigit *, ITableView *); 44 | virtual ~Scorer(); 45 | void printScores(string, bool); 46 | vector* getScores(); 47 | int getK(); 48 | void takeLog(double); 49 | int countLessOrEqual(int); 50 | int getMax(); 51 | }; 52 | } 53 | 54 | #endif /* Scorer_H_ */ 55 | -------------------------------------------------------------------------------- /src/nonltr/TableBuilder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * TableBuilder.cpp 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "TableBuilder.h" 9 | 10 | TableBuilder::TableBuilder(string dir, int motifSize, int order, int minObs) { 11 | genomeDir = dir; 12 | k = motifSize; 13 | genomeLength = 0; 14 | // kmerTable = new KmerHashTable(k); 15 | // kmerTable = new EnrichmentView(k); 16 | 17 | // Whenever you change the template, modify line 50 and 70 and the header file line 35 18 | kmerTable = new EnrichmentMarkovView(k, order, minObs); 19 | 20 | buildTable(); 21 | } 22 | 23 | TableBuilder::~TableBuilder() { 24 | delete kmerTable; 25 | } 26 | 27 | void TableBuilder::buildTable() { 28 | vector * fileList = new vector(); 29 | Util::readChromList(genomeDir, fileList, "fa"); 30 | 31 | for (int i = 0; i < fileList->size(); i++) { 32 | cout << "Counting k-mers in " << fileList->at(i) << " ..." 
<< endl; 33 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 34 | const vector * chromList = maker->makeChromOneDigitList(); 35 | 36 | for (int h = 0; h < chromList->size(); h++) { 37 | ChromosomeOneDigit * chrom = 38 | dynamic_cast(chromList->at(h)); 39 | if (chrom) { 40 | genomeLength += chrom->getEffectiveSize(); 41 | updateTable(chrom); 42 | } else { 43 | throw InvalidStateException(string("Dynamic cast failed.")); 44 | } 45 | } 46 | 47 | delete maker; 48 | } 49 | // Check if overflow has occurred 50 | kmerTable->checkOverflow(); 51 | 52 | // View 53 | // EnrichmentView * view = dynamic_cast(kmerTable); 54 | EnrichmentMarkovView * view = 55 | dynamic_cast *>(kmerTable); 56 | 57 | if (view) { 58 | view->generateProbapilities(); 59 | view->processTable(); 60 | maxValue = view->getMaxValue(); 61 | } else { 62 | throw InvalidStateException(string("Dynamic cast failed.")); 63 | } 64 | cout << "Enrichment view is ready." << endl; 65 | 66 | fileList->clear(); 67 | delete fileList; 68 | 69 | /* If you would like to see the contents of the table.*/ 70 | // kmerTable-> printTable(); 71 | } 72 | 73 | void TableBuilder::updateTable(ChromosomeOneDigit * chrom) { 74 | // EnrichmentView * view = dynamic_cast(kmerTable); 75 | EnrichmentMarkovView * view = 76 | dynamic_cast *>(kmerTable); 77 | 78 | const vector *> * segment = chrom->getSegment(); 79 | const char * segBases = chrom->getBase()->c_str(); 80 | 81 | for (int s = 0; s < segment->size(); s++) { 82 | int start = segment->at(s)->at(0); 83 | int end = segment->at(s)->at(1); 84 | // cerr << "The segment length is: " << (end-start+1) << endl; 85 | 86 | // Fast, but require some memory proportional to the segment length. 
87 | kmerTable->wholesaleIncrement(segBases, start, end - k + 1); 88 | if (view) { 89 | view->count(segBases, start, end); 90 | } else { 91 | throw InvalidStateException(string("Dynamic cast failed.")); 92 | } 93 | 94 | // Slow, but memory efficient 95 | /* 96 | vector hashList = vector(); 97 | kmerTable->hash(segBases, start, end - k + 1, &hashList); 98 | 99 | for (int i = start; i <= end - k + 1; i++) { 100 | kmerTable->increment(segBases, i); 101 | } 102 | */ 103 | } 104 | } 105 | 106 | KmerHashTable * const TableBuilder::getKmerTable() { 107 | return kmerTable; 108 | } 109 | 110 | long TableBuilder::getGenomeLength() { 111 | if (genomeLength < 0) { 112 | string msg("The length of the genome cannot be negative."); 113 | throw InvalidStateException(msg); 114 | } 115 | 116 | return genomeLength; 117 | } 118 | 119 | int TableBuilder::getMaxValue() { 120 | return maxValue; 121 | } 122 | -------------------------------------------------------------------------------- /src/nonltr/TableBuilder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TableBuilder.h 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef TABLEBUILDER_H_ 9 | #define TABLEBUILDER_H_ 10 | 11 | #include "KmerHashTable.h" 12 | #include "EnrichmentMarkovView.h" 13 | #include "ChromosomeOneDigit.h" 14 | #include "ChromListMaker.h" 15 | #include "IChromosome.h" 16 | 17 | #include "../utility/Util.h" 18 | #include "../exception/InvalidStateException.h" 19 | 20 | #include 21 | 22 | using namespace std; 23 | using namespace nonltr; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace nonltr { 28 | class TableBuilder { 29 | private: 30 | /** 31 | * k-mer table 32 | */ 33 | KmerHashTable * kmerTable; 34 | int maxValue; 35 | 36 | /** 37 | * Directory including the FASTA files comprising the genome. 
38 | * These files must have the 39 | */ 40 | string genomeDir; 41 | 42 | /** 43 | * The size of the motif 44 | */ 45 | int k; 46 | 47 | /** 48 | * The total length of the whole genome 49 | */ 50 | long genomeLength; 51 | 52 | /** 53 | * Methods 54 | */ 55 | void buildTable(); 56 | void updateTable(ChromosomeOneDigit *); 57 | 58 | public: 59 | TableBuilder(string, int, int, int); 60 | virtual ~TableBuilder(); 61 | KmerHashTable * const getKmerTable(); 62 | void printTable(); 63 | long getGenomeLength(); 64 | int getMaxValue(); 65 | }; 66 | } 67 | 68 | #endif /* TABLEBUILDER_H_ */ 69 | -------------------------------------------------------------------------------- /src/nonltr/Trainer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Trainer.cpp 3 | * 4 | * Created on: Aug 20, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "Trainer.h" 9 | 10 | namespace nonltr { 11 | 12 | // Pass the isCND and the isCON parameters 13 | 14 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 15 | double tIn, string candidateDirIn, int m) : minObs(m) { 16 | candidateDir = candidateDirIn; 17 | canPrintCandidates = true; 18 | isCND = true; 19 | isCON = false; 20 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 21 | } 22 | 23 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 24 | double tIn, string candidateDirIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) { 25 | candidateDir = candidateDirIn; 26 | canPrintCandidates = true; 27 | isCND = isCNDIn; 28 | isCON = true; 29 | otherDir = otherDirIn; 30 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 31 | } 32 | 33 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 34 | double tIn, int m) : minObs(m) { 35 | canPrintCandidates = false; 36 | isCND = true; 37 | isCON = false; 38 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 39 | } 40 | 41 | Trainer::Trainer(string genomeDirIn, int orderIn, int 
kIn, double sIn, 42 | double tIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) { 43 | canPrintCandidates = false; 44 | isCND = isCNDIn; 45 | isCON = true; 46 | otherDir = otherDirIn; 47 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 48 | } 49 | 50 | void Trainer::initialize(string genomeDirIn, int orderIn, int kIn, double sIn, 51 | double tIn) { 52 | 53 | if (isCND == false && isCON == false) { 54 | string msg("Training using the candidates or the other repeats is required. "); 55 | msg.append("Please specify which regions to be used for training. "); 56 | msg.append("Any of the two sets or a combination of both can be used."); 57 | throw InvalidStateException(msg); 58 | } 59 | 60 | genomeDir = genomeDirIn; 61 | fileList = new vector(); 62 | Util::readChromList(genomeDir, fileList, string("fa")); 63 | chromCount = fileList->size(); 64 | order = orderIn; 65 | k = kIn; 66 | s = sIn; 67 | t = tIn; 68 | p = 0.0; 69 | tDetector = tIn + 0.1; 70 | max = -1; 71 | 72 | stage1(); 73 | 74 | if (isCND) { 75 | stage2(); 76 | } 77 | stage3(); 78 | } 79 | 80 | Trainer::~Trainer() { 81 | fileList->clear(); 82 | delete fileList; 83 | delete builder; 84 | delete hmm; 85 | } 86 | 87 | /** 88 | * Stage 1: Building the table 89 | */ 90 | void Trainer::stage1() { 91 | cout << endl << endl; 92 | cout << "Stage 1: Building the table ..." << endl; 93 | builder = new TableBuilder(genomeDir, k, order, minObs); 94 | table = builder->getKmerTable(); 95 | genomeLength = builder->getGenomeLength(); 96 | max = builder->getMaxValue(); 97 | } 98 | 99 | void Trainer::stage2() { 100 | cout << endl << endl; 101 | cout << "Stage 2: Calculating the percentage ..." 
<< endl; 102 | 103 | double effectiveSize = 0.0; 104 | double countLessOrEqual = 0.0; 105 | for (int i = 0; i < chromCount; i++) { 106 | cout << "Calculating the percentage in: " << fileList->at(i) << " ..."; 107 | cout << endl; 108 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 109 | const vector * chromList = maker->makeChromOneDigitList(); 110 | 111 | for (int h = 0; h < chromList->size(); h++) { 112 | ChromosomeOneDigit * chrom = 113 | dynamic_cast(chromList->at(h)); 114 | Scorer * scorer = new Scorer(chrom, table); 115 | 116 | effectiveSize += chrom->getEffectiveSize(); 117 | countLessOrEqual += scorer->countLessOrEqual(t); 118 | 119 | delete scorer; 120 | } 121 | delete maker; 122 | } 123 | 124 | if (effectiveSize == 0) { 125 | string msg("The size of the genome cannot be zero."); 126 | throw InvalidStateException(msg); 127 | } else { 128 | p = 100.00 * countLessOrEqual / effectiveSize; 129 | cout << "The percentage is " << p << endl; 130 | if (p < 52.5) { 131 | p = 52.5; 132 | cout << "The percentage is increased to " << p << endl; 133 | } 134 | } 135 | } 136 | 137 | /** 138 | * Stage 3: Training 139 | */ 140 | void Trainer::stage3() { 141 | cout << endl << endl; 142 | cout << "Stage 3: Training ..." << endl; 143 | 144 | // Handle the case when the threshold is one. 145 | bool isOne = false; 146 | if (fabs(t - 1.0) < std::numeric_limits::epsilon()) { 147 | isOne = true; 148 | } 149 | double hmmBase = isOne ? 
1.5 : t; 150 | 151 | // Make a list of candidate HMM 152 | int stateCount = 2 * (ceil(log(max) / log(hmmBase)) + 1); 153 | 154 | // Initialize the HMM 155 | hmm = new HMM(hmmBase, stateCount); 156 | 157 | // Start training the models 158 | for (int i = 0; i < chromCount; i++) { 159 | cout << "Training on: " << fileList->at(i) << endl; 160 | // Name of candidates file 161 | string path(fileList->at(i)); 162 | int slashLastIndex = path.find_last_of(Util::fileSeparator); 163 | int dotLastIndex = path.find_last_of("."); 164 | string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1); 165 | 166 | // May or may not be used 167 | string cndFile = candidateDir + Util::fileSeparator + nickName + ".cnd"; 168 | 169 | // Work on the other repeats if desired 170 | LocationListCollection * otherRegionListCollection; 171 | bool isConRepAvailable = false; 172 | if (isCON) { 173 | string otherFile = otherDir + Util::fileSeparator + nickName + ".rpt"; 174 | ifstream f1(otherFile.c_str()); 175 | if (!f1) { 176 | string message = string("Warning: "); 177 | message.append(otherFile); 178 | message.append(" does not exist. 
"); 179 | message.append("Repeats of this sequence will not used for training the HMM."); 180 | cout << message << endl; 181 | } else { 182 | otherRegionListCollection = new LocationListCollection(otherFile); 183 | otherRegionListCollection->convertToRedFormat(); 184 | otherRegionListCollection->trim(k - 1); 185 | 186 | isConRepAvailable = true; 187 | } 188 | f1.close(); 189 | } 190 | 191 | // Read sequences in the file 192 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 193 | const vector * chromList = maker->makeChromOneDigitList(); 194 | 195 | for (int h = 0; h < chromList->size(); h++) { 196 | ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); 197 | Scorer * scorer = new Scorer(chrom, table); 198 | vector * scoreList = scorer->getScores(); 199 | 200 | // Detect candidates if desired 201 | ChromDetectorMaxima * detector; 202 | const vector * trainingRegionList; 203 | bool canDeleteDetector = true; 204 | if (isCND) { 205 | if (canPrintCandidates) { 206 | detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p,s, scoreList, chrom); 207 | if (h > 0) { 208 | bool canAppend = true; 209 | detector->printIndex(cndFile, canAppend); 210 | } else { 211 | cout << "Printing candidates to: " << cndFile << endl; 212 | detector->printIndex(cndFile); 213 | } 214 | } else { 215 | detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p, s, scoreList, chrom->getSegment()); 216 | } 217 | trainingRegionList = detector->getRegionList(); 218 | 219 | 220 | } 221 | 222 | if (isCON && isConRepAvailable) { 223 | LocationList * const locList = otherRegionListCollection->getLocationList(chrom->getHeader()); 224 | if (isCND) { 225 | locList->mergeWithAnotherList(detector->getRegionList()); 226 | } 227 | trainingRegionList = locList->getList(); 228 | 229 | } 230 | 231 | // The candidate regions are already copied to the location list 232 | if (isCND && isCON && isConRepAvailable) { 233 | delete detector; 234 | canDeleteDetector = false; 235 | } 236 | 237 | // 
Train the HMM 238 | if(isCND || (isCON && isConRepAvailable)){ 239 | 240 | scorer->takeLog(t); 241 | scoreList = scorer->getScores(); 242 | hmm->train(scoreList, chrom->getSegment(), trainingRegionList); 243 | } 244 | 245 | // Free more memory 246 | if (isCND && canDeleteDetector) { 247 | delete detector; 248 | } 249 | delete scorer; 250 | } 251 | 252 | if (isCON && isConRepAvailable) { 253 | delete otherRegionListCollection; 254 | } 255 | delete maker; 256 | } 257 | 258 | // Normalize HMM's once training is finished 259 | hmm->normalize(); 260 | } 261 | 262 | void Trainer::printTable(string fileName) { 263 | table->printTable(fileName); 264 | } 265 | 266 | HMM*& Trainer::getHmm() { 267 | return hmm; 268 | } 269 | 270 | KmerHashTable * Trainer::getTable() { 271 | return table; 272 | } 273 | 274 | void Trainer::printHmm(string fileName) { 275 | hmm->print(fileName); 276 | } 277 | 278 | } /* namespace nonltr */ 279 | -------------------------------------------------------------------------------- /src/nonltr/Trainer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Trainer.h 3 | * 4 | * Created on: Aug 20, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef TRAINER_H_ 9 | #define TRAINER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "TableBuilder.h" 18 | #include "KmerHashTable.h" 19 | #include "HMM.h" 20 | #include "ChromDetectorMaxima.h" 21 | #include "Scorer.h" 22 | #include "ChromListMaker.h" 23 | #include "LocationListCollection.h" 24 | #include "../utility/Util.h" 25 | #include "../exception/InvalidStateException.h" 26 | 27 | using namespace std; 28 | using namespace utility; 29 | using namespace exception; 30 | 31 | namespace nonltr { 32 | 33 | class Trainer { 34 | private: 35 | string genomeDir; 36 | string candidateDir; 37 | string otherDir; 38 | bool canPrintCandidates; 39 | vector * fileList; 40 | int chromCount; 41 | int order; 42 | int k; 43 | 
int max; // Maximum score in the entire genome 44 | double t; // Score threshold 45 | double tDetector; // threshold for the detector because it uses < not <=; 46 | double p; // Percentage of scores below the threshold, t, in non-repeats 47 | //double r; 48 | double s; // Half width of the mask 49 | long genomeLength; 50 | //vector * sampleList; 51 | TableBuilder * builder; 52 | KmerHashTable * table; 53 | HMM * hmm; 54 | int isCND; 55 | int isCON; 56 | // The minimum number of the observed k-mers 57 | const int minObs; 58 | 59 | void stage1(); 60 | void stage2(); 61 | void stage3(); 62 | //void stage4(); 63 | 64 | public: 65 | Trainer(string, int, int, double, double, string, int); 66 | Trainer(string, int, int, double, double, string, bool, string, int); 67 | Trainer(string, int, int, double, double, int); 68 | Trainer(string, int, int, double, double, bool, string, int); 69 | 70 | void initialize(string, int, int, double, double); 71 | virtual ~Trainer(); 72 | void printTable(string); 73 | void printHmm(string); 74 | HMM*& getHmm(); 75 | KmerHashTable * getTable(); 76 | 77 | }; 78 | 79 | } /* namespace nonltr */ 80 | #endif /* TRAINER_H_ */ 81 | -------------------------------------------------------------------------------- /src/utility/AffineId.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * AffineId.cpp 3 | * 4 | * Created on: Dec 6, 2012 5 | * Modified on: Nov 6, 2017 6 | * Author: Hani Zakaria Girgis, PhD 7 | */ 8 | 9 | // ToDo: 10 | // 1. 
Add pre-conditions after testing 11 | #include "AffineId.h" 12 | 13 | #include "Util.h" 14 | #include "../exception/InvalidInputException.h" 15 | 16 | #include 17 | #include 18 | using namespace std; 19 | //using namespace exception; 20 | 21 | namespace utility { 22 | 23 | AffineId::AffineId(const char * seq1In, int start1In, int end1In, 24 | const char * seq2In, int start2In, int end2In) { 25 | 26 | // The shorter of the two sequences is seq2 27 | seq1 = seq1In; 28 | start1 = start1In; 29 | end1 = end1In; 30 | 31 | seq2 = seq2In; 32 | start2 = start2In; 33 | end2 = end2In; 34 | 35 | if (end1 - start1 < end2 - start2) { 36 | seq1 = seq2In; 37 | start1 = start2In; 38 | end1 = end2In; 39 | 40 | seq2 = seq1In; 41 | start2 = start1In; 42 | end2 = end1In; 43 | } 44 | 45 | /* if (start1 < 0 || end1 < 0 || start1 > end1) { 46 | string msg("Invalid Input. Start1 is "); 47 | msg.append(Util::int2string(start1)); 48 | msg.append(". End 1 is "); 49 | msg.append(Util::int2string(end1)); 50 | msg.append("."); 51 | //throw InvalidInputException(msg); 52 | 53 | cerr << msg << endl; 54 | throw exception(); 55 | } 56 | 57 | if (start2 < 0 || end2 < 0 || start2 > end2) { 58 | string msg("Invalid Input. Start2 is "); 59 | msg.append(Util::int2string(start2)); 60 | msg.append(". 
End2 is "); 61 | msg.append(Util::int2string(end2)); 62 | msg.append("."); 63 | //throw InvalidInputException(msg); 64 | 65 | cerr << msg << endl; 66 | throw exception(); 67 | }*/ 68 | 69 | // Validate input 70 | // cout << start1 << " " << end1 << endl; 71 | // cout << start2 << " " << end2 << endl; 72 | 73 | len1 = end1 - start1 + 2; 74 | len2 = end2 - start2 + 2; 75 | 76 | align(); 77 | } 78 | 79 | AffineId::~AffineId() { 80 | } 81 | 82 | void AffineId::align() { 83 | // Initialize needed arrays 84 | auto m = new int[len2][2](); // Middle level array 85 | auto u = new int[len2][2](); // Upper level array 86 | auto mId = new int[len2][2](); // Array storing number of matches in the middle array 87 | auto uId = new int[len2][2](); // Array storing number of matches in the upper array 88 | auto mPath = new int[len2][2](); // Array storing number of steps in the middle array 89 | auto uPath = new int[len2][2](); // Array storing number of steps in the upper array 90 | 91 | // Apply the DP 92 | // The i index is only used to get a character from the first sequence 93 | // It is not used for filling the DP matrix 94 | for (int i = 1; i < len1; i++) { 95 | char base1 = seq1[start1 + i - 1]; 96 | int lower = 0; 97 | int lowerId = 0; 98 | int lowerPath = 0; 99 | 100 | // j is the row. 
There are only two columns 0 and 1 101 | for (int j = 1; j < len2; j++) { 102 | // Update the lower value 103 | int extLower = lower + EXT; 104 | int openLower = m[j - 1][0] + OPEN; 105 | if (extLower > openLower) { 106 | lower = extLower; 107 | lowerPath++; 108 | } else { 109 | lower = openLower; 110 | lowerId = mId[j - 1][0]; 111 | lowerPath = mPath[j - 1][0] + 1; 112 | } 113 | 114 | // Fill the array of the upper level 115 | int extUpper = u[j][0] + EXT; 116 | int openUpper = m[j][0] + OPEN; 117 | if (extUpper > openUpper) { 118 | u[j][1] = extUpper; 119 | uId[j][1] = uId[j][0]; 120 | uPath[j][1] = uPath[j][0] + 1; 121 | } else { 122 | u[j][1] = openUpper; 123 | uId[j][1] = mId[j][0]; 124 | uPath[j][1] = mPath[j][0] + 1; 125 | } 126 | 127 | // Fill the array of the middle level 128 | int matchOrMis; 129 | if (base1 == seq2[start2 + j - 1]) { 130 | matchOrMis = m[j - 1][0] + MATCH; 131 | } else { 132 | matchOrMis = m[j - 1][0] + MIS; 133 | } 134 | 135 | int lowerOrUpper; 136 | if (lower > u[j][1]) { 137 | lowerOrUpper = lower; 138 | } else { 139 | lowerOrUpper = u[j][1]; 140 | } 141 | 142 | if (matchOrMis > lowerOrUpper) { 143 | m[j][1] = matchOrMis; 144 | mPath[j][1] = mPath[j - 1][0] + 1; 145 | if (base1 == seq2[start2 + j - 1]) { 146 | mId[j][1] = mId[j - 1][0] + 1; 147 | } else { 148 | mId[j][1] = mId[j - 1][0]; 149 | } 150 | } else { 151 | m[j][1] = lowerOrUpper; 152 | if (lower > u[j][1]) { 153 | mId[j][1] = lowerId; 154 | mPath[j][1] = lowerPath; 155 | } else { 156 | mId[j][1] = uId[j][1]; 157 | mPath[j][1] = uPath[j][1]; 158 | } 159 | } 160 | } 161 | 162 | // // Test 163 | // for (int h = 0; h < len2; h++) { 164 | // cout << m[h][0] << "\t" << m[h][1] << "----" << mId[h][0] << "\t" 165 | // << mId[h][1] << endl; 166 | // } 167 | // cout << "---------------------------------------------------" << endl; 168 | // // End of test 169 | 170 | // Copy the second column to the first one 171 | if (i != len1 - 1) { 172 | for (int h = 0; h < len2; h++) { 173 | 
m[h][0] = m[h][1]; 174 | u[h][0] = u[h][1]; 175 | mId[h][0] = mId[h][1]; 176 | uId[h][0] = uId[h][1]; 177 | mPath[h][0] = mPath[h][1]; 178 | uPath[h][0] = uPath[h][1]; 179 | } 180 | } 181 | } 182 | 183 | lenCS = mId[len2 - 1][1]; 184 | lenPath = mPath[len2 - 1][1]; 185 | //cout << "Alignment length = " << lenPath << endl; 186 | delete[] u; 187 | delete[] m; 188 | delete[] mId; 189 | delete[] uId; 190 | delete[] mPath; 191 | delete[] uPath; 192 | } 193 | 194 | double AffineId::getAlign() { 195 | double amt = lenCS; 196 | return amt / (double)lenPath; 197 | } 198 | 199 | } 200 | /* namespace utility */ 201 | 202 | // // Testing code 203 | // int main() { 204 | // string s1("GATCTCAG"); 205 | // string s2("GACAG"); 206 | 207 | // utility::AffineId id(s1.c_str(), 0, s1.length() - 1, s2.c_str(), 0, 208 | // s2.length() - 1); 209 | // cout << "Length = " << id.getLenCS() << endl; 210 | 211 | // return 0; 212 | // } 213 | -------------------------------------------------------------------------------- /src/utility/AffineId.h: -------------------------------------------------------------------------------- 1 | /* 2 | * AffineId.h 3 | * 4 | * Created on: Dec 6, 2012 5 | * Modified on: Nov 6, 2017 6 | * Author: Hani Zakaria Girgis, PhD 7 | */ 8 | 9 | #ifndef AFFINEID_H_ 10 | #define AFFINEID_H_ 11 | 12 | namespace utility { 13 | 14 | class AffineId { 15 | private: 16 | const char * seq1; 17 | int start1; 18 | int end1; 19 | const char * seq2; 20 | int start2; 21 | int end2; 22 | 23 | int len1; 24 | int len2; 25 | //int lenTotal; 26 | int lenCS; 27 | int lenPath; 28 | int * m; // Middle level 29 | //int * l; // Lower level 30 | int * u; // Upper level 31 | 32 | // const int MATCH = 4; // Score of a match 33 | // const int MIS = -4; // Score of a mismatch 34 | // const int OPEN = -2; // Score of a gap opening 35 | // const int EXT = -1; // Score of a gap extension 36 | 37 | const int MATCH = 1; 38 | const int MIS = -1; 39 | const int OPEN = -2; 40 | const int EXT = -1; 41 | 
void align(); 42 | 43 | public: 44 | AffineId(const char *, int, int, const char *, int, int); 45 | virtual ~AffineId(); 46 | double getAlign(); 47 | }; 48 | 49 | } /* namespace utility */ 50 | #endif /* AFFINEID_H_ */ 51 | -------------------------------------------------------------------------------- /src/utility/EmptyLocation.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * EmptyLocation.cpp 3 | * 4 | * Created on: Dec 28, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "EmptyLocation.h" 9 | #include "../exception/InvalidOperationException.h" 10 | 11 | using namespace exception; 12 | 13 | namespace utility { 14 | 15 | EmptyLocation * EmptyLocation::INSTANCE = new EmptyLocation(); 16 | 17 | EmptyLocation * EmptyLocation::getInstance(){ 18 | return INSTANCE; 19 | } 20 | 21 | EmptyLocation::EmptyLocation() { 22 | msg = new string("Empty location does not allow this operation."); 23 | } 24 | 25 | EmptyLocation::~EmptyLocation() { 26 | delete msg; 27 | } 28 | 29 | string EmptyLocation::toString() { 30 | return string("Empty"); 31 | } 32 | 33 | int EmptyLocation::getEnd() const { 34 | throw InvalidOperationException(*msg); 35 | } 36 | 37 | int EmptyLocation::getStart() const { 38 | throw InvalidOperationException(*msg); 39 | } 40 | 41 | void EmptyLocation::setEnd(int int1) { 42 | throw InvalidOperationException(*msg); 43 | } 44 | 45 | void EmptyLocation::setStart(int int1) { 46 | throw InvalidOperationException(*msg); 47 | } 48 | 49 | int EmptyLocation::getLength() { 50 | throw InvalidOperationException(*msg); 51 | } 52 | 53 | } /* namespace tr */ 54 | -------------------------------------------------------------------------------- /src/utility/EmptyLocation.h: -------------------------------------------------------------------------------- 1 | /* 2 | * EmptyLocation.h 3 | * 4 | * Created on: Dec 28, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef EMPTYLOCATION_H_ 9 | #define 
EMPTYLOCATION_H_ 10 | 11 | #include "ILocation.h" 12 | 13 | namespace utility { 14 | 15 | class EmptyLocation: public ILocation { 16 | private: 17 | string * msg; 18 | static EmptyLocation * INSTANCE; 19 | EmptyLocation(); 20 | virtual ~EmptyLocation(); 21 | 22 | public: 23 | virtual int getEnd() const; 24 | virtual int getStart() const; 25 | virtual void setEnd(int); 26 | virtual void setStart(int); 27 | virtual int getLength(); 28 | virtual string toString(); 29 | 30 | static EmptyLocation * getInstance(); 31 | 32 | }; 33 | 34 | } /* namespace tr */ 35 | #endif /* EMPTYLOCATION_H_ */ 36 | -------------------------------------------------------------------------------- /src/utility/GlobAlignE.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Joseph Valencia 3 | * Modified by Benjamin James 4 | * Date: 12/14/17 5 | * Bioinformatics Toolsmith Laboratory, University of Tulsa 6 | * */ 7 | #include 8 | #include "../exception/InvalidStateException.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "GlobAlignE.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | using namespace exception; 21 | 22 | GlobAlignE::GlobAlignE(const char * seq1In, int start1In, int end1In, const char * seq2In, 23 | int start2In, int end2In, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn){ 24 | 25 | seq1 = seq1In; 26 | start1 = start1In; 27 | end1 = end1In; 28 | 29 | seq2 = seq2In; 30 | start2 = start2In; 31 | end2 = end2In; 32 | 33 | len1 = end1 - start1 + 2; 34 | len2 = end2 - start2 + 2; 35 | 36 | //Incremental score storage 37 | matches = new int[len1]; 38 | upperGap = new int[len1]; 39 | lowerGap = new int[len1]; 40 | 41 | 42 | 43 | //Incremental length storage 44 | matchLen = new int[len1]; 45 | upperLen = new int[len1]; 46 | lowerLen = new int[len1]; 47 | 48 | //Incremental identity storage 49 | matchId = new int[len1]; 50 | upperId = new 
int[len1]; 51 | lowerId = new int[len1]; 52 | 53 | match = matchIn; 54 | mismatch = mismatchIn; 55 | gapOpen = gapOpenIn; 56 | gapContinue = gapContinueIn; 57 | findAlignment(); 58 | 59 | } 60 | /* 61 | GlobAlignE::GlobAlignE(string filename1,string filename2, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn):GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn){ 62 | 63 | ifstream ifs; 64 | 65 | ifs.open (filename1, ifstream::in); 66 | cout<<"FILE OPENED"<'){ 70 | 71 | while(c!='\n'){ 72 | c = ifs.get(); 73 | 74 | } 75 | } 76 | 77 | string string1 =""; 78 | 79 | while (ifs.good()) { 80 | 81 | 82 | if (c!='\n'){ 83 | string1+=c; 84 | } 85 | c = ifs.get(); 86 | } 87 | 88 | ifs.close(); 89 | 90 | 91 | ifstream ifs2; 92 | 93 | ifs2.open (filename2, ifstream::in); 94 | 95 | c = ifs2.get(); 96 | 97 | if(c == '>'){ 98 | 99 | while(c!='\n'){ 100 | c = ifs2.get(); 101 | } 102 | } 103 | 104 | string string2 =""; 105 | 106 | while (ifs2.good()) { 107 | 108 | if(c!='\n'){ 109 | string2+=c; 110 | } 111 | c = ifs2.get(); 112 | } 113 | 114 | ifs2.close(); 115 | 116 | std::transform(string1.begin(),string1.end(),string1.begin(),::toupper); 117 | std::transform(string2.begin(),string2.end(),string2.begin(),::toupper); 118 | 119 | // return GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn); 120 | 121 | } 122 | */ 123 | void GlobAlignE::findAlignment(){ 124 | 125 | int shorter = min(len2,len1)-1; 126 | int lenDiff = abs(len2-len1); 127 | int maxDiff=0; 128 | 129 | if (lenDiff >=1){ 130 | maxDiff += -gapOpen- (lenDiff*gapContinue); 131 | } 132 | 133 | maxDiff+= (mismatch* shorter)-1; 134 | 135 | const int negativeInf = maxDiff; 136 | 137 | matches[0]= 0; 138 | upperGap[0] = negativeInf; 139 | lowerGap[0] = negativeInf; 140 | 141 | matchLen[0] =0; 142 | upperLen[0] =0; 143 | lowerLen[0] =0; 144 | 145 | matchId[0] =0; 146 | upperId[0] = 0; 
namespace utility {

/**
 * Abstract interface for a location described by a start and an end
 * coordinate, a length, and a printable form.
 */
class ILocation {
public:
	/**
	 * Virtual so that destroying a derived object through an ILocation
	 * pointer runs the derived destructor; without it such a delete is
	 * undefined behaviour.
	 */
	virtual ~ILocation() {
	}

	/** @return the end coordinate. */
	virtual int getEnd() const = 0;

	/** @return the start coordinate. */
	virtual int getStart() const = 0;

	/** Replaces the end coordinate. */
	virtual void setEnd(int) = 0;

	/** Replaces the start coordinate. */
	virtual void setStart(int) = 0;

	/** @return the length of the location. */
	virtual int getLength() = 0;

	/** @return a textual representation of the location. */
	virtual std::string toString() = 0;
};

}
-------------------------------------------------------------------------------- /src/utility/LCSLen.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LCSLen.cpp 3 | * 4 | * Created on: Dec 6, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "LCSLen.h" 9 | #include "../utility/Util.h" 10 | #include "../exception/InvalidInputException.h" 11 | 12 | #include 13 | 14 | using namespace std; 15 | using namespace exception; 16 | 17 | namespace utility { 18 | 19 | LCSLen::LCSLen(const char * seq1In, int start1In, int end1In, 20 | const char * seq2In, int start2In, int end2In) { 21 | seq1 = seq1In; 22 | start1 = start1In; 23 | end1 = end1In; 24 | 25 | seq2 = seq2In; 26 | start2 = start2In; 27 | end2 = end2In; 28 | 29 | if(start1 < 0 || end1 < 0 || start1 > end1){ 30 | string msg("Invalid Input. Start1 is "); 31 | msg.append(Util::int2string(start1)); 32 | msg.append(". End 1 is "); 33 | msg.append(Util::int2string(end1)); 34 | msg.append("."); 35 | throw InvalidInputException(msg); 36 | } 37 | 38 | if(start2 < 0 || end2 < 0 || start2 > end2){ 39 | string msg("Invalid Input. Start2 is "); 40 | msg.append(Util::int2string(start2)); 41 | msg.append(". 
End2 is "); 42 | msg.append(Util::int2string(end2)); 43 | msg.append("."); 44 | throw InvalidInputException(msg); 45 | } 46 | 47 | // Validate input 48 | // cout << start1 << " " << end1 << endl; 49 | // cout << start2 << " " << end2 << endl; 50 | 51 | 52 | len1 = end1 - start1 + 2; 53 | len2 = end2 - start2 + 2; 54 | 55 | lenTotal = 2 * len2; 56 | cTable = new int[lenTotal]; 57 | 58 | for (int i = 0; i < lenTotal; i++) { 59 | cTable[i] = 0; 60 | } 61 | 62 | findLcs(); 63 | } 64 | 65 | LCSLen::~LCSLen() { 66 | delete[] cTable; 67 | } 68 | 69 | void LCSLen::findLcs() { 70 | int iM1Index = 0; 71 | int iIndex = len2; 72 | 73 | for (int i = 1; i < len1; i++) { 74 | char base1 = seq1[start1 + i - 1]; 75 | 76 | for (int j = 1; j < len2; j++) { 77 | int ijIndex = iIndex + j; 78 | if (base1 == seq2[start2 + j - 1]) { 79 | cTable[ijIndex] = cTable[iM1Index + j - 1] + 1; 80 | } else { 81 | if (cTable[iM1Index + j] > cTable[iIndex + j - 1]) { 82 | cTable[ijIndex] = cTable[iM1Index + j]; 83 | } else { 84 | cTable[ijIndex] = cTable[iIndex + j - 1]; 85 | } 86 | } 87 | } 88 | 89 | if(i != len1-1){ 90 | for(int h = 0; h < len2; h++){ 91 | cTable[h] = cTable[len2+h]; 92 | } 93 | } 94 | } 95 | lenCS = cTable[lenTotal-1]; 96 | } 97 | 98 | int LCSLen::getLenCS(){ 99 | return lenCS; 100 | } 101 | 102 | } 103 | /* namespace utility */ 104 | -------------------------------------------------------------------------------- /src/utility/LCSLen.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LCSLen.h 3 | * 4 | * Created on: Dec 6, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef LCSLEN_H_ 9 | #define LCSLEN_H_ 10 | 11 | namespace utility { 12 | 13 | class LCSLen { 14 | private: 15 | const char * seq1; 16 | int start1; 17 | int end1; 18 | const char * seq2; 19 | int start2; 20 | int end2; 21 | 22 | int len1; 23 | int len2; 24 | int lenTotal; 25 | int lenCS; 26 | 27 | int * cTable; 28 | void findLcs(); 29 | 30 | public: 31 | 
LCSLen(const char *, int, int, const char *, int, int); 32 | virtual ~LCSLen(); 33 | int getLenCS(); 34 | }; 35 | 36 | } /* namespace utility */ 37 | #endif /* LCSLEN_H_ */ 38 | -------------------------------------------------------------------------------- /src/utility/Location.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Location.cpp 3 | * 4 | * Created on: Dec 19, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "Location.h" 9 | #include "Util.h" 10 | #include "../exception/InvalidInputException.h" 11 | 12 | using namespace exception; 13 | 14 | namespace utility { 15 | 16 | Location::Location(int startIn, int endIn) { 17 | initialize(startIn, endIn); 18 | } 19 | 20 | Location::Location(ILocation& cp) { 21 | initialize(cp.getStart(), cp.getEnd()); 22 | } 23 | 24 | void Location::initialize(int startIn, int endIn) { 25 | start = startIn; 26 | end = endIn; 27 | check(); 28 | 29 | } 30 | 31 | void Location::check() { 32 | if (start < 0 || end < 0 || start > end) { 33 | string msg("Invalid Input. Start is "); 34 | msg.append(Util::int2string(start)); 35 | msg.append(". 
End is "); 36 | msg.append(Util::int2string(end)); 37 | msg.append("."); 38 | throw InvalidInputException(msg); 39 | } 40 | } 41 | 42 | Location::~Location() { 43 | } 44 | 45 | int Location::getEnd() const { 46 | return end; 47 | } 48 | 49 | int Location::getStart() const { 50 | return start; 51 | } 52 | 53 | void Location::setEnd(int endIn) { 54 | end = endIn; 55 | check(); 56 | } 57 | 58 | void Location::setStart(int startIn) { 59 | start = startIn; 60 | check(); 61 | } 62 | 63 | int Location::getLength() { 64 | return end - start + 1; 65 | } 66 | 67 | string Location::toString() { 68 | string msg = (Util::int2string(start)); 69 | msg.append("-"); 70 | msg.append(Util::int2string(end)); 71 | 72 | return msg; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/utility/Location.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Location.h 3 | * 4 | * Created on: Dec 19, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef LOCATION_H_ 9 | #define LOCATION_H_ 10 | 11 | #include "ILocation.h" 12 | 13 | #include 14 | 15 | using namespace std; 16 | 17 | namespace utility { 18 | 19 | class Location : public ILocation{ 20 | private: 21 | int start; 22 | int end; 23 | void initialize(int, int); 24 | void check(); 25 | 26 | public: 27 | Location(int, int); 28 | Location(ILocation&); 29 | virtual ~Location(); 30 | 31 | int getEnd() const; 32 | int getStart() const; 33 | void setEnd(int); 34 | void setStart(int); 35 | int getLength(); 36 | string toString(); 37 | }; 38 | 39 | } 40 | 41 | #endif /* LOCATION_H_ */ 42 | -------------------------------------------------------------------------------- /src/utility/Util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Util.h 3 | * 4 | * Created on: Apr 24, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef UTIL_H_ 9 | #define UTIL_H_ 10 | 11 | #include 
"Location.h" 12 | #include "../exception/FileDoesNotExistException.h" 13 | #include "../exception/InvalidInputException.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | using namespace std; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace utility { 28 | class Util { 29 | private: 30 | Util(); 31 | ~Util(); 32 | 33 | public: 34 | static string * emptyString; 35 | static string fileSeparator; 36 | static void readFasta(string, vector *, vector *, bool); 37 | static void readFasta(string, vector *, vector *); 38 | static void readCoordinates(string, vector *); 39 | static void readChromList(string, vector *, string); 40 | static void toUpperCase(string*); 41 | static void toUpperCase(string&); 42 | static string int2string(int); 43 | static string double2string(double); 44 | static string long2string(long); 45 | static void deleteFile(string); 46 | static void deleteFilesUnderDirectory(string); 47 | static void checkFile(string); 48 | static bool isOverlapping(int, int, int, int); 49 | static void revCompDig(string *, string *); 50 | static void revCompDig(const char* sequence, int, int, string *); 51 | 52 | static void writeFasta(const string&, const string&, const string&); 53 | 54 | static int sumTotalLength(const vector *); 55 | 56 | /** 57 | * Delete the objects pointed to by pointers in a vector. 58 | * It does not delete the vector itself. 
59 | * 60 | * Credit: http://stackoverflow.com/questions/594089/does-stdvector-clear-do-delete-free-memory-on-each-element 61 | */ 62 | template 63 | static void deleteInVector(vector * deleteMe) { 64 | while (!deleteMe->empty()) { 65 | delete deleteMe->back(); 66 | deleteMe->pop_back(); 67 | } 68 | 69 | // Set the size to zero 70 | deleteMe->clear(); 71 | 72 | // Set the capacity to zero 73 | vector empty; 74 | deleteMe->swap(empty); 75 | } 76 | }; 77 | } 78 | 79 | #endif /* UTIL_H_ */ 80 | --------------------------------------------------------------------------------