├── README.md └── src_2.0 ├── HowToCompile.txt ├── Makefile ├── RepeatsDetector.cpp ├── exception ├── FileDoesNotExistException.cpp ├── FileDoesNotExistException.h ├── InvalidInputException.cpp ├── InvalidInputException.h ├── InvalidOperationException.cpp ├── InvalidOperationException.h ├── InvalidOrderOfOperationsException.cpp ├── InvalidOrderOfOperationsException.h ├── InvalidScoreException.cpp ├── InvalidScoreException.h ├── InvalidStateException.cpp └── InvalidStateException.h ├── nonltr ├── ChromDetector.cpp ├── ChromDetector.h ├── ChromDetectorMaxima.cpp ├── ChromDetectorMaxima.h ├── ChromListMaker.cpp ├── ChromListMaker.h ├── Chromosome.cpp ├── Chromosome.h ├── ChromosomeOneDigit.cpp ├── ChromosomeOneDigit.h ├── ChromosomeRandom.cpp ├── ChromosomeRandom.h ├── DetectorMaxima.cpp ├── DetectorMaxima.h ├── EnrichmentMarkovView.cpp ├── EnrichmentMarkovView.h ├── HMM.cpp ├── HMM.h ├── IChromosome.h ├── ITableView.h ├── KmerHashTable.cpp ├── KmerHashTable.h ├── LocationList.cpp ├── LocationList.h ├── LocationListCollection.cpp ├── LocationListCollection.h ├── Scanner.cpp ├── Scanner.h ├── Scorer.cpp ├── Scorer.h ├── TableBuilder.cpp ├── TableBuilder.h ├── Trainer.cpp └── Trainer.h └── utility ├── EmptyLocation.cpp ├── EmptyLocation.h ├── ILocation.h ├── Location.cpp ├── Location.h ├── Util.cpp └── Util.h /README.md: -------------------------------------------------------------------------------- 1 | # Red 2 | Red: an intelligent, rapid, accurate tool for detecting repeats de-novo on the genomic scale. 3 | 4 | Requirement: GNU gcc8.2 or higher. Please change the name (CXX) of the compiler in the Makefile. 5 | 6 | Compiling the source code 7 | 8 | The following command makes the required directories: 9 | > make bin 10 | 11 | The following command makes the binary that is located under the ``bin'' directory: 12 | > make 13 | 14 | To find the binary: 15 | > cd ../bin 16 | 17 | Please cite the following paper: 18 | 19 | Girgis, H.Z. 
(2015) Red: an intelligent, rapid, accurate tool for 20 | detecting repeats de-novo on the genomic scale. BMC Bioinformatics, 21 | 16, 227. 22 | -------------------------------------------------------------------------------- /src_2.0/HowToCompile.txt: -------------------------------------------------------------------------------- 1 | Compiling the source code 2 | 3 | Requirement: GNU gcc8.2 or higher. Please change the name (CXX) of the compiler 4 | in the Makefile. 5 | 6 | The following command makes the required directories: 7 | > make bin 8 | 9 | The following command makes the binary that is located under the ``bin'' directory: 10 | > make 11 | 12 | To find the binary: 13 | > cd ../bin 14 | 15 | Please cite the following paper: 16 | 17 | Girgis, H.Z. (2015) Red: an intelligent, rapid, accurate tool for 18 | detecting repeats de-novo on the genomic scale. BMC Bioinformatics, 19 | 16, 227. -------------------------------------------------------------------------------- /src_2.0/Makefile: --------------------------------------------------------------------------------
#
# Build script for Red (RepeatsDetector).
#
# Usage:
#   make bin    create the ../bin object directories (safe to run repeatedly)
#   make        compile and link ../bin/Red (default goal: red)
#   make clean  remove the object files and the linked binary
#

.DEFAULT_GOAL := red

# None of these targets names a file that the build produces.
.PHONY: red bin clean

# CXX = /usr/bin/c++
# CXX = /usr/bin/g++
CXX = g++-8

CXXFLAGS = -std=c++14 -fopenmp -O3 -g -fmessage-length=0 -Wall -fpermissive

#
# Objects
#

ORed = ../bin/Red.o

# Exception
OInvalidInputException = ../bin/exception/InvalidInputException.o
OInvalidStateException = ../bin/exception/InvalidStateException.o
OFileDoesNotExistException = ../bin/exception/FileDoesNotExistException.o
OInvalidOrderOfOperationsException = ../bin/exception/InvalidOrderOfOperationsException.o
OInvalidScoreException = ../bin/exception/InvalidScoreException.o
OInvalidOperationException = ../bin/exception/InvalidOperationException.o

# Utility
OUtil = ../bin/utility/Util.o
OLocation = ../bin/utility/Location.o
OEmptyLocation = ../bin/utility/EmptyLocation.o

# Non LTR
OChromosome = ../bin/nonltr/Chromosome.o
OChromosomeOneDigit = ../bin/nonltr/ChromosomeOneDigit.o
OChromosomeRandom = ../bin/nonltr/ChromosomeRandom.o
OChromListMaker = ../bin/nonltr/ChromListMaker.o
OTableBuilder = ../bin/nonltr/TableBuilder.o
OScorer = ../bin/nonltr/Scorer.o
ODetectorMaxima = ../bin/nonltr/DetectorMaxima.o
OChromDetectorMaxima = ../bin/nonltr/ChromDetectorMaxima.o
OHMM = ../bin/nonltr/HMM.o
OScanner = ../bin/nonltr/Scanner.o
OTrainer = ../bin/nonltr/Trainer.o
OLocationList = ../bin/nonltr/LocationList.o
OLocationListCollection = ../bin/nonltr/LocationListCollection.o

# Every object linked into the binary. The former $(OChromDetector) entry was
# dropped: that variable was never defined, so it always expanded to nothing.
OBJS = $(ORed) \
	$(OInvalidInputException) \
	$(OInvalidStateException) \
	$(OFileDoesNotExistException) \
	$(OInvalidOrderOfOperationsException) \
	$(OInvalidOperationException) \
	$(OInvalidScoreException) \
	$(OUtil) \
	$(OLocation) \
	$(OEmptyLocation) \
	$(OChromosome) \
	$(OChromosomeOneDigit) \
	$(OChromosomeRandom) \
	$(OChromListMaker) \
	$(OTableBuilder) \
	$(OScorer) \
	$(ODetectorMaxima) \
	$(OChromDetectorMaxima) \
	$(OHMM) \
	$(OScanner) \
	$(OTrainer) \
	$(OLocationList) \
	$(OLocationListCollection)

#
# Target
#

TRed = ../bin/Red

#
# Make RepeatsDetector
#

$(TRed): $(OBJS)
	$(CXX) $(CXXFLAGS) -o $(TRed) $(OBJS)

#
# RepeatsDetector
#

$(ORed): RepeatsDetector.cpp nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/TableBuilder.h nonltr/HMM.h nonltr/Scanner.h nonltr/Trainer.h utility/Util.h
	$(CXX) $(CXXFLAGS) -c RepeatsDetector.cpp -o $(ORed)

#
# Exception
#
$(OInvalidInputException): exception/InvalidInputException.cpp exception/InvalidInputException.h
	$(CXX) $(CXXFLAGS) -c exception/InvalidInputException.cpp -o $(OInvalidInputException)

$(OInvalidStateException): exception/InvalidStateException.cpp exception/InvalidStateException.h
	$(CXX) $(CXXFLAGS) -c exception/InvalidStateException.cpp -o $(OInvalidStateException)

$(OFileDoesNotExistException): exception/FileDoesNotExistException.cpp exception/FileDoesNotExistException.h
	$(CXX) $(CXXFLAGS) -c exception/FileDoesNotExistException.cpp -o $(OFileDoesNotExistException)

$(OInvalidOrderOfOperationsException): exception/InvalidOrderOfOperationsException.cpp exception/InvalidOrderOfOperationsException.h
	$(CXX) $(CXXFLAGS) -c exception/InvalidOrderOfOperationsException.cpp -o $(OInvalidOrderOfOperationsException)

$(OInvalidScoreException): exception/InvalidScoreException.cpp exception/InvalidScoreException.h
	$(CXX) $(CXXFLAGS) -c exception/InvalidScoreException.cpp -o $(OInvalidScoreException)

$(OInvalidOperationException): exception/InvalidOperationException.cpp exception/InvalidOperationException.h
	$(CXX) $(CXXFLAGS) -c exception/InvalidOperationException.cpp -o $(OInvalidOperationException)

#
# Utility
#

$(OUtil): utility/Util.cpp utility/Util.h utility/Location.h exception/FileDoesNotExistException.h
	$(CXX) $(CXXFLAGS) -c utility/Util.cpp -o $(OUtil)

$(OLocation): utility/Location.cpp utility/Location.h utility/ILocation.h exception/InvalidInputException.h utility/Util.h
	$(CXX) $(CXXFLAGS) -c utility/Location.cpp -o $(OLocation)

$(OEmptyLocation): utility/EmptyLocation.cpp utility/EmptyLocation.h utility/ILocation.h exception/InvalidOperationException.h
	$(CXX) $(CXXFLAGS) -c utility/EmptyLocation.cpp -o $(OEmptyLocation)

#
# Non LTR
#

$(OChromosome): nonltr/Chromosome.cpp nonltr/Chromosome.h nonltr/IChromosome.h utility/Util.h exception/InvalidInputException.h exception/InvalidOperationException.h
	$(CXX) $(CXXFLAGS) -c nonltr/Chromosome.cpp -o $(OChromosome)

$(OChromosomeOneDigit): nonltr/ChromosomeOneDigit.cpp nonltr/ChromosomeOneDigit.h nonltr/Chromosome.h exception/InvalidInputException.h
	$(CXX) $(CXXFLAGS) -c nonltr/ChromosomeOneDigit.cpp -o $(OChromosomeOneDigit)

$(OChromosomeRandom): nonltr/ChromosomeRandom.cpp nonltr/ChromosomeRandom.h nonltr/IChromosome.h exception/InvalidInputException.h exception/InvalidStateException.h utility/Util.h
	$(CXX) $(CXXFLAGS) -c nonltr/ChromosomeRandom.cpp -o $(OChromosomeRandom)

$(OTableBuilder): nonltr/TableBuilder.cpp nonltr/TableBuilder.h utility/Util.h nonltr/ChromosomeOneDigit.h nonltr/ITableView.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/EnrichmentMarkovView.h nonltr/EnrichmentMarkovView.cpp exception/InvalidStateException.h nonltr/ChromListMaker.h nonltr/IChromosome.h
	$(CXX) $(CXXFLAGS) -c nonltr/TableBuilder.cpp -o $(OTableBuilder)

$(OScorer): nonltr/Scorer.cpp nonltr/Scorer.h nonltr/ChromosomeOneDigit.h utility/Util.h exception/InvalidStateException.h
	$(CXX) $(CXXFLAGS) -c nonltr/Scorer.cpp -o $(OScorer)

$(ODetectorMaxima): nonltr/DetectorMaxima.cpp nonltr/DetectorMaxima.h utility/ILocation.h exception/InvalidStateException.h
	$(CXX) $(CXXFLAGS) -c nonltr/DetectorMaxima.cpp -o $(ODetectorMaxima)

$(OChromDetectorMaxima): nonltr/ChromDetectorMaxima.cpp nonltr/ChromDetectorMaxima.h nonltr/DetectorMaxima.h nonltr/ChromosomeOneDigit.h utility/Util.h utility/ILocation.h utility/Location.h
	$(CXX) $(CXXFLAGS) -c nonltr/ChromDetectorMaxima.cpp -o $(OChromDetectorMaxima)

$(OHMM): nonltr/HMM.cpp nonltr/HMM.h utility/ILocation.h exception/InvalidStateException.h exception/InvalidInputException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h
	$(CXX) $(CXXFLAGS) -c nonltr/HMM.cpp -o $(OHMM)

$(OScanner): nonltr/Scanner.cpp nonltr/Scanner.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h nonltr/HMM.h nonltr/ITableView.h nonltr/Scorer.h utility/Util.h utility/ILocation.h exception/InvalidInputException.h exception/InvalidStateException.h exception/FileDoesNotExistException.h exception/InvalidOperationException.h
	$(CXX) $(CXXFLAGS) -c nonltr/Scanner.cpp -o $(OScanner)

$(OTrainer): nonltr/Trainer.cpp nonltr/Trainer.h nonltr/TableBuilder.h nonltr/KmerHashTable.h nonltr/KmerHashTable.cpp nonltr/HMM.h nonltr/ChromDetectorMaxima.h nonltr/Scorer.h nonltr/ChromListMaker.h utility/Util.h nonltr/LocationListCollection.h
	$(CXX) $(CXXFLAGS) -c nonltr/Trainer.cpp -o $(OTrainer)

$(OChromListMaker): nonltr/ChromListMaker.cpp nonltr/ChromListMaker.h nonltr/Chromosome.h nonltr/ChromosomeOneDigit.h utility/Util.h
	$(CXX) $(CXXFLAGS) -c nonltr/ChromListMaker.cpp -o $(OChromListMaker)

# The former $(OCluster) rule was removed: the variable was never defined and
# nonltr/Cluster.cpp / nonltr/Cluster.h are not part of this source tree.

$(OLocationList): nonltr/LocationList.cpp nonltr/LocationList.h utility/ILocation.h utility/Location.h exception/InvalidStateException.h
	$(CXX) $(CXXFLAGS) -c nonltr/LocationList.cpp -o $(OLocationList)

$(OLocationListCollection): nonltr/LocationListCollection.cpp nonltr/LocationListCollection.h utility/Location.h exception/InvalidStateException.h
	$(CXX) $(CXXFLAGS) -c nonltr/LocationListCollection.cpp -o $(OLocationListCollection)


#
# Make Red
#

red: $(TRed)

#
# Make binary directories
#

# -p keeps "make bin" from failing when the directories already exist.
bin:
	mkdir -p ../bin
	mkdir -p ../bin/exception
	mkdir -p ../bin/utility
	mkdir -p ../bin/nonltr

#
# Make clean
#

clean:
	rm -f ../bin/*.o ../bin/exception/*.o ../bin/ms/*.o ../bin/nonltr/*.o ../bin/test/*.o ../bin/utility/*.o ../bin/tr/*.o *.o $(TRed)
-------------------------------------------------------------------------------- /src_2.0/RepeatsDetector.cpp: -------------------------------------------------------------------------------- 1 |
//============================================================================ 2 | // Name : RepeatsDetector.cpp 3 | // Author : Hani Zakaria Girgis, PhD 4 | // Version : 2.0 5 | // Description : Red (RepeatsDetector) 6 | //============================================================================ 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "nonltr/Trainer.h" 16 | #include "nonltr/KmerHashTable.h" 17 | #include "nonltr/TableBuilder.h" 18 | #include "nonltr/HMM.h" 19 | #include "nonltr/Scanner.h" 20 | #include "nonltr/ChromListMaker.h" 21 | #include "utility/Util.h" 22 | 23 | using namespace std; 24 | using namespace nonltr; 25 | using namespace utility; 26 | using namespace exception; 27 | 28 | /** 29 | * Parameters 30 | */ 31 | // Required parameters 32 | const static string LEN_PRM = string("-len"); // k - length of the motif. 33 | 34 | // Train and Scan the whole genome 35 | const static string GNM_PRM = string("-gnm"); // Train and scan. 36 | const static string ORD_PRM = string("-ord"); // order of background markov chain. 37 | const static string GAU_PRM = string("-gau"); // Half width of the Gaussian mask. 
38 | const static string THR_PRM = string("-thr"); // The threshold part of the definition of non-repeats 39 | const static string MIN_PRM = string("-min"); // The minimum number of observations 40 | 41 | // Scan using pre-calculated scores and a trained HMM 42 | const static string HMI_PRM = string("-hmi"); // File including the trained model 43 | const static string SEQ_PRM = string("-seq"); // File including the sequence 44 | const static string SCI_PRM = string("-sci"); // File including the scores of the sequence 45 | 46 | // Output options with -gnm only 47 | const static string TBL_PRM = string("-tbl"); // Write the k-mer to the provided file 48 | const static string SCO_PRM = string("-sco"); // Write the scores to the 49 | const static string HMO_PRM = string("-hmo"); // The Markov model is writen to this file. 50 | const static string CND_PRM = string("-cnd"); // Write candidate region to a directory 51 | 52 | // Output options with -gnm and -hmm 53 | const static string MSK_PRM = string("-msk"); // Write masked sequence(s) to file or directory 54 | const static string RPT_PRM = string("-rpt"); // Write coordinates to file or directory 55 | const static string DIR_PRM = string("-dir"); // Read additional sequences(.fa) or scores (.sc) under directory 56 | const static string FRM_PRM = string("-frm"); // Format of the ouput 57 | 58 | // Cores 59 | const static string COR_PRM = string("-cor"); 60 | 61 | void drive(map * const param) { 62 | // Delete old output files 63 | if (param->count(MSK_PRM) > 0) { 64 | if (param->count(GNM_PRM) > 0) { 65 | cout << "Deleting pre-existing files under " << param->at(MSK_PRM); 66 | cout << endl; 67 | Util::deleteFilesUnderDirectory(param->at(MSK_PRM)); 68 | } else if (param->count(HMI_PRM) > 0) { 69 | cout << "Deleting pre-existing " << param->at(MSK_PRM) << endl; 70 | Util::deleteFile(param->at(MSK_PRM)); 71 | } 72 | } 73 | 74 | if (param->count(RPT_PRM) > 0) { 75 | if (param->count(GNM_PRM) > 0) { 76 | cout << "Deleting 
pre-existing files under " << param->at(RPT_PRM); 77 | cout << endl; 78 | Util::deleteFilesUnderDirectory(param->at(RPT_PRM)); 79 | } else if (param->count(HMI_PRM) > 0) { 80 | cout << "Deleting pre-existing " << param->at(RPT_PRM) << endl; 81 | Util::deleteFile(param->at(RPT_PRM)); 82 | } 83 | } 84 | 85 | if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { 86 | cout << "Deleting pre-existing files under " << param->at(SCO_PRM); 87 | cout << endl; 88 | Util::deleteFilesUnderDirectory(param->at(SCO_PRM)); 89 | } 90 | 91 | if (param->count(HMO_PRM) > 0 && param->count(GNM_PRM) > 0) { 92 | cout << "Deleting pre-existing " << param->at(HMO_PRM) << endl; 93 | Util::deleteFile(param->at(HMO_PRM)); 94 | } 95 | 96 | if (param->count(TBL_PRM) > 0 && param->count(GNM_PRM) > 0) { 97 | cout << "Deleting pre-existing " << param->at(TBL_PRM) << endl; 98 | Util::deleteFile(param->at(TBL_PRM)); 99 | } 100 | 101 | // Process the input 102 | int k = atoi(param->at(LEN_PRM).c_str()); 103 | 104 | if (param->count(GNM_PRM) > 0) { 105 | string genomeDir = param->at(GNM_PRM); 106 | int order = atoi(param->at(ORD_PRM).c_str()); 107 | double s = atoi(param->at(GAU_PRM).c_str()); 108 | double t = atoi(param->at(THR_PRM).c_str()); 109 | int minObs = atoi(param->at(MIN_PRM).c_str()); 110 | 111 | // Adjust the threshold when it is one because of the log base. 112 | if (((int) t) == 1) { 113 | t = 1.5; 114 | cout << "The base of the logarithmic function is adjusted." 
<< endl; 115 | } 116 | 117 | // This part or the next 118 | Trainer * trainer; 119 | if (param->count(CND_PRM) > 0) { 120 | trainer = new Trainer(genomeDir, order, k, s, t, param->at(CND_PRM), 121 | minObs); 122 | } else { 123 | trainer = new Trainer(genomeDir, order, k, s, t, minObs); 124 | } 125 | 126 | if (param->count(TBL_PRM)) { 127 | cout << "Printing the count of the kmer's to: "; 128 | cout << param->at(TBL_PRM) << endl; 129 | trainer->printTable(param->at(TBL_PRM)); 130 | } 131 | 132 | if (param->count(HMO_PRM) > 0) { 133 | cout << "Printing the HMM to: " << endl; 134 | cout << param->at(HMO_PRM) << endl; 135 | trainer->printHmm(param->at(HMO_PRM)); 136 | } 137 | 138 | // Stage 4: Scan 139 | cout << endl << endl; 140 | cout << "Stage 4: Scanning ..." << endl; 141 | vector *fileList = new vector(); 142 | Util::readChromList(genomeDir, fileList, string("fa")); 143 | if (param->count(DIR_PRM) > 0) { 144 | Util::readChromList(param->at(DIR_PRM), fileList, string("fa")); 145 | } 146 | 147 | unsigned long genomeLen = 0; 148 | unsigned long repeatLen = 0; 149 | 150 | unsigned int chromCount = fileList->size(); 151 | # pragma omp parallel for schedule(dynamic) num_threads(Util::CORE_NUM) 152 | for (unsigned int i = 0; i < chromCount; i++) { 153 | # pragma omp critical 154 | { 155 | cout << "Scanning: " << fileList->at(i) << endl; 156 | } 157 | // Output file name 158 | string path(fileList->at(i)); 159 | int slashLastIndex = path.find_last_of(Util::fileSeparator); 160 | int dotLastIndex = path.find_last_of("."); 161 | string nickName = path.substr(slashLastIndex + 1, 162 | dotLastIndex - slashLastIndex - 1); 163 | 164 | // Process each sequence with the ith file 165 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 166 | const vector * chromList = 167 | maker->makeChromOneDigitList(); 168 | 169 | ChromListMaker * oMaker = new ChromListMaker(fileList->at(i)); 170 | const vector * oChromList; 171 | if (param->count(MSK_PRM) > 0) { 172 | oChromList = 
oMaker->makeChromList(); 173 | } 174 | 175 | for (unsigned int h = 0; h < chromList->size(); h++) { 176 | ChromosomeOneDigit * chrom = 177 | dynamic_cast(chromList->at(h)); 178 | genomeLen += chrom->size(); 179 | 180 | HMM * copyHMM = new HMM(*trainer->getHmm()); 181 | 182 | // Scan the forward strand 183 | Scanner * scanner = new Scanner(copyHMM, k, chrom, 184 | trainer->getTable()); 185 | 186 | // Scan the reverse complement 187 | chrom->makeRC(); 188 | Scanner * scannerRC = new Scanner(copyHMM, k, chrom, 189 | trainer->getTable()); 190 | scannerRC->makeForwardCoordinates(); 191 | scanner->mergeWithOtherRegions(scannerRC->getRegionList()); 192 | delete scannerRC; 193 | chrom->makeRC(); 194 | 195 | // Scan the reverse 196 | chrom->makeR(); 197 | Scanner * scannerR = new Scanner(copyHMM, k, chrom, 198 | trainer->getTable()); 199 | scannerR->makeForwardCoordinates(); 200 | scanner->mergeWithOtherRegions(scannerR->getRegionList()); 201 | delete scannerR; 202 | 203 | repeatLen += scanner->getTotalRegionLength(); 204 | 205 | //@@ The chromosome now has the sequence of the reverse strand 206 | // The actual strand is calculated if the user requested the scores. 207 | 208 | // Print according to the user's requests 209 | bool canAppend = (h == 0) ? 
false : true; 210 | 211 | if (param->count(SCO_PRM) > 0) { 212 | // Calculate the forward strand from the reverse 213 | chrom->makeR(); 214 | 215 | string scoFile = param->at(SCO_PRM) + Util::fileSeparator 216 | + nickName + ".scr"; 217 | if (!canAppend) { 218 | # pragma omp critical 219 | { 220 | cout << "Printing scores to: " << scoFile << endl; 221 | } 222 | } 223 | // Make sure to print the original E-values not their logarithm 224 | Scorer * scorer = new Scorer(chrom, trainer->getTable()); 225 | scorer->printScores(scoFile, canAppend); 226 | delete scorer; 227 | } 228 | 229 | if (param->count(RPT_PRM) > 0) { 230 | string ext(".rpt"); 231 | if (atoi(param->at(FRM_PRM).c_str()) == 2) { 232 | ext = string(".bed"); 233 | } 234 | string rptFile = param->at(RPT_PRM) + Util::fileSeparator 235 | + nickName + ext; 236 | if (!canAppend) { 237 | # pragma omp critical 238 | { 239 | cout << "Printing locations to: " << rptFile << endl; 240 | } 241 | } 242 | scanner->printIndex(rptFile, canAppend, 243 | atoi(param->at(FRM_PRM).c_str())); 244 | } 245 | 246 | if (param->count(MSK_PRM) > 0) { 247 | string mskFile = param->at(MSK_PRM) + Util::fileSeparator 248 | + nickName + ".msk"; 249 | if (!canAppend) { 250 | # pragma omp critical 251 | { 252 | cout << "Printing masked sequence to: " << mskFile 253 | << endl; 254 | } 255 | } 256 | Chromosome * oChrom = oChromList->at(h); 257 | scanner->printMasked(mskFile, *oChrom, canAppend); 258 | } 259 | 260 | // Free memory 261 | delete scanner; 262 | delete copyHMM; 263 | } 264 | 265 | delete maker; 266 | delete oMaker; 267 | } 268 | 269 | cout << "Genome length: " << genomeLen; 270 | cout << " - Repeats length: " << repeatLen; 271 | cout << " - Repeats content: " 272 | << 100.00 * ((double) repeatLen / genomeLen) << "%" << endl; 273 | 274 | // Free memory 275 | fileList->clear(); 276 | delete fileList; 277 | delete trainer; 278 | } else if (param->count(HMI_PRM) > 0) { 279 | HMM * hmm = new HMM(param->at(HMI_PRM)); 280 | 281 | string 
chromFile = param->at(SEQ_PRM); 282 | string scoresFile = param->at(SCI_PRM); 283 | 284 | ChromosomeOneDigit * chrom = new ChromosomeOneDigit(chromFile); 285 | Scanner * scanner = new Scanner(hmm, k, chrom, scoresFile); 286 | 287 | if (param->count(RPT_PRM) > 0) { 288 | string rptFile = param->at(RPT_PRM); 289 | cout << "Printing locations to: " << rptFile << endl; 290 | scanner->printIndex(rptFile, false, 291 | atoi(param->at(FRM_PRM).c_str())); 292 | } 293 | 294 | if (param->count(MSK_PRM) > 0) { 295 | string mskFile = param->at(MSK_PRM); 296 | cout << "Printing masked sequence to: " << mskFile << endl; 297 | Chromosome oChrom(chromFile); 298 | scanner->printMasked(mskFile, oChrom, false); 299 | } 300 | 301 | // Free memory 302 | delete scanner; 303 | delete chrom; 304 | delete hmm; 305 | } 306 | } 307 | 308 | int main(int argc, char * argv[]) { 309 | cout << endl << endl; 310 | cout << "This is Red (REpeat Detector) designed and developed by "; 311 | cout << "Hani Zakaria Girgis, PhD." << endl << endl; 312 | 313 | cout << "Version: 2.0" << endl << endl; 314 | 315 | string message = string("Valid argument pairs:\n"); 316 | 317 | message.append("\t-gnm input genome directory, required.\n"); 318 | message.append( 319 | "\t\tFiles with \".fa\" extension in this directory are used for completing the table of the adjusted counts.\n"); 320 | message.append("\t\tThese Files are scanned for repeats.\n"); 321 | message.append( 322 | "\t-dir directory including additional input sequences, optional.\n"); 323 | message.append( 324 | "\t\tFiles with \".fa\" extension in this directory are NOT used for completing the table.\n"); 325 | message.append( 326 | "\t\tThese Files MUST have different names from those in the genome directory.\n"); 327 | message.append("\t\tThese Files are scanned for repeats.\n"); 328 | 329 | message.append( 330 | "\t-len word length equals k defining the k-mer. 
The default is floor(log_4(genome size)).\n"); 331 | message.append( 332 | "\t-ord order of the background Markov chain. The default is floor(k/2)-1.\n"); 333 | message.append( 334 | "\t-gau half width of the mask. The default is based on the GC content.\n"); 335 | message.append("\t\t20 if the GC content > 33% and < 67%, 40 otherwise.\n"); 336 | 337 | message.append( 338 | "\t-thr the threshold score of the low adjusted scores of non-repeats. The default is 2.\n"); 339 | message.append( 340 | "\t-min the minimum number of the observed k-mers. The default is 3.\n"); 341 | message.append( 342 | "\t-tbl file where the table of the adjusted counts is written, optional.\n"); 343 | message.append("\t-sco directory where scores are saved, optional.\n"); 344 | message.append("\t\tScore files have the \".scr\" extension.\n"); 345 | 346 | message.append( 347 | "\t-cnd directory where candidate regions are saved, optional.\n"); 348 | message.append("\t\tCandidates files have the \".cnd\" extension.\n"); 349 | message.append( 350 | "\t-rpt directory where repeats locations are saved, optional.\n"); 351 | message.append("\t\tRepeats files have the \".rpt\" extension.\n"); 352 | message.append( 353 | "\t-msk directory where masked sequences are saved, optional.\n"); 354 | message.append("\t\tMasked sequences files have the \".msk\" extension.\n"); 355 | 356 | message.append( 357 | "\t-frm the format of the output: 1 (chrName:start-end) or 2 (chrName\tstart\tend).\n"); 358 | message.append( 359 | "\t\tThe output format are zero based and the end is exclusive.\n"); 360 | message.append("\t-hmo file where the HMM is saved, optional.\n"); 361 | message.append("\t-cor integer of the number of threads, optional.\n"); 362 | message.append("\t\tThe more threads, the higher the memory requirement.\n"); 363 | message.append("\t\tThe defaul is the number of cores - 1, or 1 if single core is found.\n\n"); 364 | 365 | message.append("Examples:\n"); 366 | message.append( 367 | "\tThe 
following command runs Red with the defaults and generates the masked sequences.\n"); 368 | message.append("\tRed -gnm genome_directory -msk output_directory\n\n"); 369 | message.append( 370 | "\tThe following command runs Red with the defaults and generates the masked sequences and the locations of repeats.\n"); 371 | message.append( 372 | "\tRed -gnm genome_directory -msk output_directory -rpt output_directory\n\n"); 373 | 374 | 375 | // Table of valid argument pairs 376 | map *validParam = new map(); 377 | validParam->insert(map::value_type(LEN_PRM, "DUMMY")); 378 | validParam->insert(map::value_type(GNM_PRM, "DUMMY")); 379 | validParam->insert(map::value_type(ORD_PRM, "DUMMY")); 380 | validParam->insert(map::value_type(GAU_PRM, "DUMMY")); 381 | validParam->insert(map::value_type(THR_PRM, "DUMMY")); 382 | validParam->insert(map::value_type(HMI_PRM, "DUMMY")); 383 | validParam->insert(map::value_type(SEQ_PRM, "DUMMY")); 384 | validParam->insert(map::value_type(SCI_PRM, "DUMMY")); 385 | validParam->insert(map::value_type(TBL_PRM, "DUMMY")); 386 | validParam->insert(map::value_type(SCO_PRM, "DUMMY")); 387 | validParam->insert(map::value_type(HMO_PRM, "DUMMY")); 388 | validParam->insert(map::value_type(MSK_PRM, "DUMMY")); 389 | validParam->insert(map::value_type(RPT_PRM, "DUMMY")); 390 | validParam->insert(map::value_type(CND_PRM, "DUMMY")); 391 | validParam->insert(map::value_type(DIR_PRM, "DUMMY")); 392 | validParam->insert(map::value_type(MIN_PRM, "DUMMY")); 393 | validParam->insert(map::value_type(FRM_PRM, "DUMMY")); 394 | validParam->insert(map::value_type(COR_PRM, "DUMMY")); 395 | 396 | // Make a table of the user provided arguments 397 | map *param = new map(); 398 | if (argc > 1 && argc % 2 == 1) { 399 | for (int i = 1; i < argc - 1; i += 2) { 400 | if (validParam->count(argv[i]) > 0) { 401 | param->insert( 402 | map::value_type(argv[i], argv[i + 1])); 403 | } else { 404 | cerr << "Invalid argument: " << argv[i] << " " << argv[i + 1]; 405 | cerr << endl; 406 
| cerr << message << endl; 407 | return 1; 408 | } 409 | } 410 | 411 | // Set the number of threads 412 | if(param->count(COR_PRM) > 0) { 413 | int corNum = atoi(param->at(COR_PRM).c_str()); 414 | if( corNum < Util::CORE_NUM) { 415 | Util::CORE_NUM = corNum; 416 | cout << "Using " << Util::CORE_NUM << " threads."; 417 | cout << endl; 418 | } else { 419 | cout << "The number of threads provided it too high. "; 420 | cout << "Using the default of "; 421 | cout << Util::CORE_NUM << " threads." << endl; 422 | } 423 | } 424 | 425 | // Check if the user provided the essential arguments 426 | 427 | if (param->count(LEN_PRM) == 0) { 428 | if (param->count(GNM_PRM) > 0) { 429 | // Calculate the size of the genome 430 | long genomeLength = 0; 431 | vector *fileList = new vector(); 432 | Util::readChromList(param->at(GNM_PRM), fileList, "fa"); 433 | cout << "Calculating the length, k, of the k-mer "; 434 | cout << "based on the input genome ... " << endl; 435 | 436 | # pragma omp parallel for schedule(dynamic) num_threads(Util::CORE_NUM) reduction(+: genomeLength) 437 | for (unsigned int i = 0; i < fileList->size(); i++) { 438 | # pragma omp critical 439 | { 440 | cout << "\t" << fileList->at(i) << endl; 441 | } 442 | ChromListMaker * maker = new ChromListMaker( 443 | fileList->at(i)); 444 | const vector * chromList = 445 | maker->makeChromList(); 446 | for (unsigned int h = 0; h < chromList->size(); h++) { 447 | genomeLength += chromList->at(h)->getEffectiveSize(); 448 | } 449 | delete maker; 450 | } 451 | fileList->clear(); 452 | delete fileList; 453 | 454 | double temp = log(genomeLength) / log(4.0); 455 | 456 | int k = floor(temp); 457 | cout << "The recommended k is " << k << "." 
<< endl; 458 | if (k > 15) { 459 | cout << "Due to a memory constraint, k is set to 15."; 460 | cout << endl; 461 | k = 15; 462 | } 463 | 464 | if (k < 12) { 465 | cout 466 | << "Due to a statistical consideration, k is set to 12."; 467 | cout << endl; 468 | k = 12; 469 | } 470 | cout << endl; 471 | 472 | string kString = Util::int2string(k); 473 | param->insert( 474 | map::value_type(LEN_PRM, kString)); 475 | 476 | } else { 477 | cerr << "The word length is required." << endl; 478 | cerr << message << endl; 479 | return 1; 480 | } 481 | } 482 | 483 | if (param->count(FRM_PRM) == 0) { 484 | cout << "Using the default output format chrName:start-end" << endl; 485 | param->insert( 486 | map::value_type(FRM_PRM, 487 | Util::int2string(Scanner::FRMT_POS))); 488 | } else { 489 | if (atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_POS 490 | && atoi(param->at(FRM_PRM).c_str()) != Scanner::FRMT_BED) { 491 | cerr << "The output format must be " << Scanner::FRMT_POS 492 | << " or "; 493 | cerr << Scanner::FRMT_BED << ". The format received is "; 494 | cerr << param->at(FRM_PRM) << "." << endl; 495 | return 1; 496 | } 497 | } 498 | 499 | if (param->count(GNM_PRM) > 0) { 500 | Util::checkFile(param->at(GNM_PRM)); 501 | 502 | if (param->count(ORD_PRM) == 0) { 503 | double k = atoi(param->at(LEN_PRM).c_str()); 504 | int o = floor(k / 2.0) - 1; 505 | 506 | cout << "Using the default background order: " << o << "."; 507 | cout << endl; 508 | 509 | string oString = Util::int2string(o); 510 | param->insert( 511 | map::value_type(ORD_PRM, oString)); 512 | } 513 | 514 | if (param->count(THR_PRM) == 0) { 515 | cout << "Using the default threshold: 2." 
<< endl; 516 | param->insert( 517 | map::value_type(THR_PRM, string("2"))); 518 | } else { 519 | if (atoi(param->at(THR_PRM).c_str()) < 1) { 520 | cerr << "The threshold cannot be less than 1."; 521 | cerr << endl; 522 | cerr << message << endl; 523 | return 1; 524 | } 525 | } 526 | 527 | if (param->count(MIN_PRM) == 0) { 528 | cout 529 | << "Using the default minimum of the observed count of k-mers: 3." 530 | << endl; 531 | param->insert( 532 | map::value_type(MIN_PRM, string("3"))); 533 | } else { 534 | if (atoi(param->at(MIN_PRM).c_str()) < 0) { 535 | cerr 536 | << "The minimum of the observed count of k-mers cannot be less than 0."; 537 | cerr << endl; 538 | cerr << message << endl; 539 | return 1; 540 | } 541 | } 542 | 543 | if (param->count(GAU_PRM) == 0) { 544 | cout << "Calculating GC content ..." << endl; 545 | 546 | // 1: Count the gc content of the input genome 547 | long genomeLength = 0; 548 | long genomeGc = 0; 549 | vector *fileList = new vector(); 550 | Util::readChromList(param->at(GNM_PRM), fileList, "fa"); 551 | 552 | # pragma omp parallel for num_threads(Util::CORE_NUM) schedule(dynamic) reduction(+: genomeGc, genomeLength) 553 | for (unsigned int i = 0; i < fileList->size(); i++) { 554 | # pragma omp critical 555 | { 556 | cout << "\t" << fileList->at(i) << endl; 557 | } 558 | 559 | 560 | ChromListMaker * maker = new ChromListMaker( 561 | fileList->at(i)); 562 | const vector * chromList = 563 | maker->makeChromList(); 564 | 565 | for (unsigned int h = 0; h < chromList->size(); h++) { 566 | genomeGc += chromList->at(h)->getGcContent(); 567 | genomeLength += chromList->at(h)->getEffectiveSize(); 568 | } 569 | 570 | delete maker; 571 | } 572 | fileList->clear(); 573 | delete fileList; 574 | 575 | // 2: Calculate the gc content of the input genome 576 | double gc = 100.00 * genomeGc / genomeLength; 577 | int w = 20; 578 | if (gc < 33 || gc > 67) { 579 | w = 40; 580 | } 581 | cout << "Using the default half width: " << w; 582 | cout << " based on 
the GC content of " << gc << endl; 583 | string wString = Util::int2string(w); 584 | param->insert( 585 | map::value_type(GAU_PRM, wString)); 586 | } 587 | } else if (param->count(HMI_PRM) > 0) { 588 | Util::checkFile(param->at(HMI_PRM)); 589 | 590 | if (param->count(SEQ_PRM) == 0) { 591 | cerr << "The sequence file is required."; 592 | cerr << endl; 593 | cerr << message << endl; 594 | return 1; 595 | } else { 596 | Util::checkFile(param->at(SEQ_PRM)); 597 | } 598 | 599 | if (param->count(SCI_PRM) == 0) { 600 | cerr << "The scores file is required."; 601 | cerr << endl; 602 | cerr << message << endl; 603 | return 1; 604 | } else { 605 | Util::checkFile(param->at(SCI_PRM)); 606 | } 607 | 608 | } else { 609 | cerr << "A mode is required: training and scanning (-gnm) or "; 610 | cerr << "scanning only (-hmi)." << endl; 611 | cerr << message << endl; 612 | return 1; 613 | } 614 | 615 | // Check optional parameters 616 | if (param->count(TBL_PRM) > 0 && param->count(GNM_PRM) == 0) { 617 | cerr << "Printing the k-mer table is optional with -gnm only."; 618 | cerr << endl; 619 | cerr << message << endl; 620 | return 1; 621 | } 622 | 623 | if (param->count(HMO_PRM) > 0 && param->count(GNM_PRM) == 0) { 624 | cerr << "Printing the HMM is optional with -gnm only."; 625 | cerr << endl; 626 | cerr << message << endl; 627 | return 1; 628 | } 629 | 630 | if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) == 0) { 631 | cerr << "Printing the scores is optional with -gnm only."; 632 | cerr << endl; 633 | cerr << message << endl; 634 | return 1; 635 | } else if (param->count(SCO_PRM) > 0 && param->count(GNM_PRM) > 0) { 636 | Util::checkFile(param->at(SCO_PRM)); 637 | } 638 | 639 | if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) == 0) { 640 | cerr << "Printing candidate regions is optional with -gnm only."; 641 | cerr << endl; 642 | cerr << message << endl; 643 | return 1; 644 | } else if (param->count(CND_PRM) > 0 && param->count(GNM_PRM) > 0) { 645 | 
Util::checkFile(param->at(CND_PRM)); 646 | } 647 | 648 | if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) == 0) { 649 | cerr 650 | << "Processing additional sequences is optional with -gnm only."; 651 | cerr << endl; 652 | cerr << message << endl; 653 | return 1; 654 | } else if (param->count(DIR_PRM) > 0 && param->count(GNM_PRM) > 0) { 655 | Util::checkFile(param->at(DIR_PRM)); 656 | } 657 | 658 | if (param->count(MSK_PRM) > 0 && param->count(GNM_PRM) > 0) { 659 | Util::checkFile(param->at(MSK_PRM)); 660 | } 661 | 662 | if (param->count(RPT_PRM) > 0 && param->count(GNM_PRM) > 0) { 663 | Util::checkFile(param->at(RPT_PRM)); 664 | } 665 | 666 | // Print out the parameters table 667 | typedef map myMap; 668 | myMap::iterator sIter = param->begin(); 669 | myMap::iterator eIter = param->end(); 670 | cout << endl << "List of final parameters: " << endl; 671 | while (sIter != eIter) { 672 | cout << (*sIter).first << ": " << (*sIter).second << endl; 673 | sIter++; 674 | } 675 | cout << endl; 676 | 677 | // Start! 678 | drive(param); 679 | 680 | // Clear parameters when done. 
681 | param->clear(); 682 | delete param; 683 | } else { 684 | cerr << "Argument pairs of the form: -flag value are required."; 685 | cerr << endl; 686 | cerr << message << endl; 687 | } 688 | 689 | //return EXIT_SUCCESS; 690 | return 0; 691 | } 692 | -------------------------------------------------------------------------------- /src_2.0/exception/FileDoesNotExistException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FileDoesNotExistException.cpp 3 | * 4 | * Created on: Apr 30, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "FileDoesNotExistException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception{ 16 | 17 | FileDoesNotExistException::FileDoesNotExistException(string massage) { 18 | cerr << "File Does Not Exist Exception" << endl; 19 | cerr << massage << endl; 20 | } 21 | 22 | FileDoesNotExistException::~FileDoesNotExistException() { 23 | // TODO Auto-generated destructor stub 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src_2.0/exception/FileDoesNotExistException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FileDoesNotExistException.h 3 | * 4 | * Created on: Apr 30, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef FILEDOESNOTEXISTEXCEPTION_H_ 9 | #define FILEDOESNOTEXISTEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception { 16 | class FileDoesNotExistException { 17 | public: 18 | FileDoesNotExistException(string); 19 | ~FileDoesNotExistException(); 20 | }; 21 | } 22 | 23 | #endif /* FILEDOESNOTEXISTEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidInputException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidInputException.cpp 3 | * 4 | * 
Created on: May 1, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "InvalidInputException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | namespace exception{ 15 | 16 | /** Writes a fixed title line and then msg to cerr; msg is not stored in the object. */ InvalidInputException::InvalidInputException(string msg) { 17 | cerr << "Invalid Input Exception" << endl; 18 | cerr << msg << endl; 19 | } 20 | 21 | /** Empty destructor: the class declares no data members. */ InvalidInputException::~InvalidInputException() { 22 | // TODO Auto-generated destructor stub 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidInputException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidInputException.h 3 | * 4 | * Created on: May 1, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDINPUTEXCEPTION_H_ 9 | #define INVALIDINPUTEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception { 16 | /** Exception type thrown for invalid input. It declares no base class, so a handler for std::exception does not catch it; the message is only printed by the constructor. */ class InvalidInputException { 17 | public: 18 | InvalidInputException(string); 19 | ~InvalidInputException(); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDINPUTEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidOperationException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOperationException.cpp 3 | * 4 | * Created on: Dec 20, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include 9 | #include "InvalidOperationException.h" 10 | 11 | 12 | namespace exception { 13 | 14 | /** Stores msg in the std::runtime_error base, then writes a title line and what() to cerr. */ InvalidOperationException::InvalidOperationException(string msg) : std::runtime_error(msg) { 15 | cerr << "Invalid Operation Exception." 
<< endl; 16 | cerr << what() << endl; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidOperationException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOperationException.h 3 | * 4 | * Created on: Dec 20, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDOPERATIONEXCEPTION_H_ 9 | #define INVALIDOPERATIONEXCEPTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | namespace exception { 17 | 18 | /** Exception derived from std::runtime_error; the message passed to the constructor is available through what(). */ class InvalidOperationException : public std::runtime_error{ 19 | public: 20 | InvalidOperationException(string msg); 21 | //virtual ~InvalidOperationException(); 22 | }; 23 | 24 | } 25 | 26 | #endif /* INVALIDOPERATIONEXCEPTION_H_ */ 27 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidOrderOfOperationsException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOrderOfOperationsException.cpp 3 | * 4 | * Created on: Apr 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "InvalidOrderOfOperationsException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | namespace exception{ 15 | 16 | /** Writes a fixed title line and then the message text (parameter massage) to cerr; nothing is stored in the object. */ InvalidOrderOfOperationsException::InvalidOrderOfOperationsException(string massage) { 17 | cerr << "Invalid Order Of Operations Exception" << endl; 18 | cerr << massage << endl; 19 | } 20 | 21 | /** Empty destructor. */ InvalidOrderOfOperationsException::~InvalidOrderOfOperationsException() { 22 | // TODO Auto-generated destructor stub 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidOrderOfOperationsException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOrderOfOperationsException.h 3 | * 4 | * Created on: Apr 26, 2012 5 | * Author: Hani Zakaria 
Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDORDEROFOPERATIONSEXCEPTION_H_ 9 | #define INVALIDORDEROFOPERATIONSEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception{ 16 | /** Exception type that declares no base class (a std::exception handler does not catch it); its constructor takes the message text. */ class InvalidOrderOfOperationsException { 17 | public: 18 | InvalidOrderOfOperationsException(string); 19 | ~InvalidOrderOfOperationsException(); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDORDEROFOPERATIONSEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidScoreException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidScoreException.cpp 3 | * 4 | * Created on: Apr 27, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "InvalidScoreException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | namespace exception{ 15 | 16 | /** Writes a fixed title line and then the message text (parameter massage) to cerr; nothing is stored in the object. */ InvalidScoreException::InvalidScoreException(string massage) { 17 | cerr << "Invalid Score Exception." 
<< endl; 18 | cerr << massage << endl; 19 | } 20 | 21 | /** Empty destructor. */ InvalidScoreException::~InvalidScoreException() { 22 | // TODO Auto-generated destructor stub 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidScoreException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidScoreException.h 3 | * 4 | * Created on: Apr 27, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDSCOREEXCEPTION_H_ 9 | #define INVALIDSCOREEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception{ 16 | /** Exception type that declares no base class; unlike the sibling exception classes its destructor is virtual. */ class InvalidScoreException { 17 | public: 18 | InvalidScoreException(string); 19 | virtual ~InvalidScoreException(); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDSCOREEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidStateException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidStateException.cpp 3 | * 4 | * Created on: Aug 9, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include 9 | #include 10 | #include "InvalidStateException.h" 11 | 12 | using namespace std; 13 | 14 | 15 | namespace exception { 16 | /** Stores msg in the std::runtime_error base, then writes a title line and what() to cerr. */ InvalidStateException::InvalidStateException(string msg) : 17 | std::runtime_error(msg) { 18 | cerr << "Invalid State Exception." 
<< endl; 19 | cerr << what() << endl; 20 | } 21 | } 22 | 23 | //InvalidStateException::~InvalidStateException() { 24 | // TODO Auto-generated destructor stub 25 | //} 26 | -------------------------------------------------------------------------------- /src_2.0/exception/InvalidStateException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidStateException.h 3 | * 4 | * Created on: Aug 9, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDSTATEEXCEPTION_H_ 9 | #define INVALIDSTATEEXCEPTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | namespace exception { 17 | /** Exception derived from std::runtime_error; the message is available through what(). */ class InvalidStateException : public std::runtime_error{ 18 | public: 19 | InvalidStateException(string); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDSTATEEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromDetector.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetector.cpp 3 | * 4 | * Created on: Nov 8, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include 9 | 10 | #include "ChromDetector.h" 11 | #include "Detector.h" 12 | #include "../utility/Util.h" 13 | 14 | using namespace std; 15 | using namespace nonltr; 16 | using namespace utility; 17 | 18 | /** Runs one Detector per segment (element 0 = start, element 1 = end of each entry in segmentList) with the given parameters and scores, and appends the region pointers each detector reports to regions. NOTE(review): the pointers are copied before the detector is deleted, so this assumes Detector's destructor does not free them - confirm against Detector. */ ChromDetector::ChromDetector(double s, double w, double pDelta, double b, 19 | double mDelta, vector * scores, 20 | const vector *> * segmentList) { 21 | 22 | regions = new vector *>(); 23 | 24 | for (unsigned int i = 0; i < segmentList->size(); i++) { 25 | Detector * detector = new Detector(segmentList->at(i)->at(0), 26 | segmentList->at(i)->at(1), s, w, pDelta, b, mDelta, scores); 27 | vector *> * segRegions = detector->getRegions(); 28 | regions->insert(regions->end(), segRegions->begin(), segRegions->end()); 29 | delete detector; 30 | } 31 | } 32 | 33 | /** Releases the collected regions (through Util::deleteInVector) and the list itself. */ ChromDetector::~ChromDetector() { 34 | 
Util::deleteInVector(regions); 35 | regions->clear(); 36 | delete regions; 37 | } 38 | 39 | /** Returns the internal region list; it stays owned by this object and is freed by the destructor. */ vector *> * ChromDetector::getRegions() { 40 | return regions; 41 | } 42 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromDetector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetector.h 3 | * 4 | * Created on: Nov 8, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMDETECTOR_H_ 9 | #define CHROMDETECTOR_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace nonltr{ 16 | /** Collects the regions that a Detector reports for every segment of a chromosome. */ class ChromDetector { 17 | 18 | private: 19 | /* Owned by this object; filled by the constructor. */ vector *> * regions; 20 | 21 | public: 22 | ChromDetector(double, double, double, double, double, vector *, 23 | const vector *> *); 24 | virtual ~ChromDetector(); 25 | vector *> * getRegions(); 26 | }; 27 | } 28 | 29 | #endif /* CHROMDETECTOR_H_ */ 30 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromDetectorMaxima.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetectorMaxima.cpp 3 | * 4 | * Created on: Jun 6, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "ChromDetectorMaxima.h" 9 | 10 | namespace nonltr { 11 | 12 | /** Detects regions on every segment of chrom; the header of chrom labels the printed output. */ ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, 13 | double t, double p, int e, vector * oScores, 14 | ChromosomeOneDigit * chrom) { 15 | header = chrom->getHeader(); 16 | start(s, w, m, t, p, e, oScores, chrom->getSegment()); 17 | 18 | } 19 | 20 | /** Same as above for a bare segment list; the header is set to "chrUnknown". */ ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, 21 | double t, double p, int e, vector * oScores, const vector *> * segmentList) { 23 | header = string("chrUnknown"); 24 | start(s, w, m, t, p, e, oScores, segmentList); 25 | } 26 | 27 | /** Shared constructor body: runs a DetectorMaxima on every segment longer than 2*w + 10 and stores a Location copy of each region it reports; shorter segments are skipped with a message on cout. */ void ChromDetectorMaxima::start(double s, double w, double m, double t, 28 | double p, int e, vector * oScores, 29 | const vector *> * segmentList) 
{ 30 | 31 | regionList = new vector (); 32 | 33 | int segmentCount = segmentList->size(); 34 | for (int i = 0; i < segmentCount; i++) { 35 | int segStart = segmentList->at(i)->at(0); 36 | int segEnd = segmentList->at(i)->at(1); 37 | 38 | // The effective length is shorter than the actual length by 2w 39 | int effLen = 2 * w + 10; 40 | int segLen = segEnd - segStart + 1; 41 | 42 | if (segLen > effLen) { 43 | DetectorMaxima * detector = new DetectorMaxima(segStart, segEnd, s, 44 | w, m, t, p, e, oScores); 45 | 46 | const vector * segRegions = detector->getRegionList(); 47 | int segRegionCount = segRegions->size(); 48 | for (int h = 0; h < segRegionCount; h++) { 49 | /* Copy the region so that it outlives the detector deleted below. */ regionList->push_back(new Location(*(segRegions->at(h)))); 50 | } 51 | 52 | delete detector; 53 | } else { 54 | cout << "\tSkipping a short segment: "; 55 | cout << segStart << "-" << segEnd << endl; 56 | } 57 | } 58 | } 59 | 60 | /** Releases the copied regions (through Util::deleteInVector) and the list itself. */ ChromDetectorMaxima::~ChromDetectorMaxima() { 61 | Util::deleteInVector(regionList); 62 | regionList->clear(); 63 | delete regionList; 64 | } 65 | 66 | /** Writes the regions to outputFile, replacing any existing content. */ void ChromDetectorMaxima::printIndex(string outputFile) { 67 | printIndex(outputFile, false); 68 | } 69 | 70 | /** Writes one line per region as header:start-end followed by a space, where end is getEnd() + 1; appends to outputFile when canAppend is true, otherwise overwrites it. */ void ChromDetectorMaxima::printIndex(string outputFile, bool canAppend) { 71 | ofstream outIndex; 72 | 73 | if (canAppend) { 74 | outIndex.open(outputFile.c_str(), ios::out | ios::app); 75 | } else { 76 | outIndex.open(outputFile.c_str(), ios::out); 77 | } 78 | 79 | // Write the index of the repeat segment [x,y[ 80 | for (unsigned int j = 0; j < regionList->size(); j++) { 81 | outIndex << header << ":"; 82 | outIndex << ((int) (regionList->at(j)->getStart())) << "-"; 83 | outIndex << ((int) (regionList->at(j)->getEnd() + 1)) << " "; 84 | outIndex << endl; 85 | } 86 | 87 | outIndex.close(); 88 | } 89 | 90 | /** Returns the internal region list; it stays owned by this object. */ const vector* ChromDetectorMaxima::getRegionList() const { 91 | return regionList; 92 | } 93 | 94 | } /* namespace nonltr */ 95 | -------------------------------------------------------------------------------- 
/src_2.0/nonltr/ChromDetectorMaxima.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetectorMaxima.h 3 | * 4 | * Created on: Jun 6, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMDETECTORMAXIMA_H_ 9 | #define CHROMDETECTORMAXIMA_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "ChromosomeOneDigit.h" 15 | #include "DetectorMaxima.h" 16 | 17 | #include "../utility/Util.h" 18 | #include "../utility/ILocation.h" 19 | #include "../utility/Location.h" 20 | 21 | using namespace std; 22 | using namespace utility; 23 | 24 | namespace nonltr { 25 | 26 | /** Collects the regions that DetectorMaxima reports on the segments of one chromosome and can print them as header:start-end lines. */ class ChromDetectorMaxima { 27 | private: 28 | /* Regions found; owned by this object and freed by the destructor. */ vector * regionList; 29 | /* Label printed in front of every region by printIndex(). */ string header; 30 | 31 | /* Shared body of the two constructors. */ void start(double, double, double, double, double, int, vector *, 32 | const vector *> *); 33 | 34 | public: 35 | /* Takes the header and the segment list from the given chromosome. */ ChromDetectorMaxima(double, double, double, double, double, int, 36 | vector *, ChromosomeOneDigit *); 37 | /* Takes a bare segment list; the header becomes "chrUnknown". */ ChromDetectorMaxima(double, double, double, double, double, int, 38 | vector *, const vector *> *); 39 | virtual ~ChromDetectorMaxima(); 40 | const vector* getRegionList() const; 41 | /* Overwrites the output file. */ void printIndex(string); 42 | /* Appends to the output file when the flag is true, otherwise overwrites it. */ void printIndex(string, bool); 43 | 44 | }; 45 | 46 | } /* namespace nonltr */ 47 | #endif /* CHROMDETECTORMAXIMA_H_ */ 48 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromListMaker.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromListMaker.cpp 3 | * 4 | * Created on: Mar 13, 2014 5 | * Author: Hani Zakaria Girgis 6 | */ 7 | 8 | #include "ChromListMaker.h" 9 | 10 | namespace nonltr { 11 | 12 | /** Remembers the sequence file name and allocates an empty chromosome list; the file is not read here. */ ChromListMaker::ChromListMaker(string seqFileIn) { 13 | seqFile = seqFileIn; 14 | chromList = new vector(); 15 | } 16 | 17 | /** Frees the list; Util::deleteInVector presumably deletes the chromosomes stored in it - confirm against Util. */ ChromListMaker::~ChromListMaker() { 18 | Util::deleteInVector(chromList); 19 | delete chromList; 20 | } 21 | 22 | /* 23 | void ChromListMaker::makeChromList(vector& chromList) { 24 | ifstream 
in(seqFile.c_str()); 25 | bool isFirst = true; 26 | 27 | Chromosome * chrom; 28 | while (in.good()) { 29 | string line; 30 | getline(in, line); 31 | if (line[0] == '>') { 32 | if (!isFirst) { 33 | chrom->finalize(); 34 | if (chrom->getEffectiveSize() > 0) { 35 | chromList.push_back(chrom); 36 | } 37 | } else { 38 | isFirst = false; 39 | } 40 | chrom = new Chromosome(); 41 | 42 | chrom->setHeader(line); 43 | } else { 44 | chrom->appendToSequence(line); 45 | } 46 | } 47 | chrom->finalize(); 48 | 49 | if (chrom->getEffectiveSize() > 0) { 50 | chromList.push_back(chrom); 51 | } 52 | 53 | in.close(); 54 | } 55 | */ 56 | 57 | const vector * ChromListMaker::makeChromList() { 58 | ifstream in(seqFile.c_str()); 59 | bool isFirst = true; 60 | Chromosome * chrom; 61 | 62 | while (in.good()) { 63 | string line; 64 | getline(in, line); 65 | if (line[0] == '>') { 66 | if (!isFirst) { 67 | chrom->finalize(); 68 | if (chrom->getEffectiveSize() > 0) { 69 | chromList->push_back(chrom); 70 | } else { 71 | delete chrom; 72 | } 73 | } else { 74 | isFirst = false; 75 | } 76 | 77 | chrom = new Chromosome(); 78 | chrom->setHeader(line); 79 | } else { 80 | chrom->appendToSequence(line); 81 | } 82 | line.clear(); 83 | } 84 | chrom->finalize(); 85 | 86 | if (chrom->getEffectiveSize() > 0) { 87 | chromList->push_back(chrom); 88 | } else { 89 | delete chrom; 90 | } 91 | 92 | in.close(); 93 | 94 | // 3/31/2016 95 | chromList->shrink_to_fit(); 96 | 97 | return chromList; 98 | } 99 | 100 | const vector * ChromListMaker::makeChromOneDigitList() { 101 | ifstream in(seqFile.c_str()); 102 | bool isFirst = true; 103 | ChromosomeOneDigit * chrom; 104 | 105 | while (in.good()) { 106 | string line; 107 | getline(in, line); 108 | if (line[0] == '>') { 109 | if (!isFirst) { 110 | chrom->finalize(); 111 | if (chrom->getEffectiveSize() > 0) { 112 | chromList->push_back(chrom); 113 | } else { 114 | delete chrom; 115 | } 116 | } else { 117 | isFirst = false; 118 | } 119 | 120 | chrom = new 
ChromosomeOneDigit(); 121 | chrom->setHeader(line); 122 | } else { 123 | chrom->appendToSequence(line); 124 | } 125 | } 126 | 127 | chrom->finalize(); 128 | if (chrom->getEffectiveSize() > 0) { 129 | chromList->push_back(chrom); 130 | } else { 131 | delete chrom; 132 | } 133 | in.close(); 134 | 135 | // 3/31/2016 136 | chromList->shrink_to_fit(); 137 | 138 | return chromList; 139 | } 140 | 141 | } 142 | /* namespace nonltr */ 143 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromListMaker.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromListMaker.h 3 | * 4 | * Created on: Mar 13, 2014 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMLISTMAKER_H_ 9 | #define CHROMLISTMAKER_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "Chromosome.h" 15 | #include "ChromosomeOneDigit.h" 16 | 17 | #include "../utility/Util.h" 18 | 19 | using namespace std; 20 | using namespace utility; 21 | 22 | namespace nonltr { 23 | 24 | class ChromListMaker { 25 | private: 26 | vector * chromList; 27 | string seqFile; 28 | 29 | public: 30 | ChromListMaker (string); 31 | virtual ~ChromListMaker(); 32 | const vector * makeChromList(); 33 | //void makeChromList(vector&); 34 | const vector * makeChromOneDigitList(); 35 | }; 36 | 37 | } /* namespace nonltr */ 38 | #endif /* CHROMLISTMAKER_H_ */ 39 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Chromosome.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Chromosome.cpp 3 | * 4 | * Created on: Mar 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | #include "Chromosome.h" 8 | 9 | Chromosome::Chromosome() { 10 | header = string(""); 11 | base = string(""); 12 | isHeaderReady = false; 13 | isBaseReady = false; 14 | isFinalized = false; 15 | } 16 | 17 | Chromosome::Chromosome(string fileName) { 18 | 
chromFile = fileName; 19 | readFasta(); 20 | help(1000000, true); 21 | } 22 | 23 | Chromosome::Chromosome(string fileName, bool canMerge) { 24 | chromFile = fileName; 25 | readFasta(); 26 | help(1000000, canMerge); 27 | } 28 | 29 | Chromosome::Chromosome(string fileName, int len) { 30 | chromFile = fileName; 31 | readFasta(); 32 | help(len, true); 33 | } 34 | 35 | Chromosome::Chromosome(string &seq, string &info) { 36 | header = info; 37 | base = seq; 38 | help(1000000, true); 39 | } 40 | 41 | Chromosome::Chromosome(string &seq, string &info, int len) { 42 | header = info; 43 | base = seq; 44 | help(len, true); 45 | } 46 | 47 | void Chromosome::setHeader(string& info) { 48 | if (isFinalized) { 49 | string msg("This chromosome has been finalized. "); 50 | msg.append("The header cannot be modified."); 51 | throw InvalidOperationException(msg); 52 | } else { 53 | header = info; 54 | isHeaderReady = true; 55 | } 56 | } 57 | 58 | /** 59 | * This method can waste memory if the sequence is large. 60 | * Consider using the method appendToSequence instead 61 | */ 62 | void Chromosome::setSequence(string& seq) { 63 | if (isFinalized) { 64 | string msg("This chromosome has been finalized. "); 65 | msg.append("The sequence cannot be modified."); 66 | throw InvalidOperationException(msg); 67 | } else { 68 | base = seq; 69 | isBaseReady = true; 70 | } 71 | } 72 | 73 | void Chromosome::appendToSequence(string& line) { 74 | if (isFinalized) { 75 | string msg("This chromosome has been finalized. "); 76 | msg.append("The sequence cannot be modified."); 77 | throw InvalidOperationException(msg); 78 | } else { 79 | base.append(line); 80 | isBaseReady = true; 81 | } 82 | } 83 | 84 | void Chromosome::finalize() { 85 | if (isFinalized) { 86 | string msg("This chromosome has been already finalized. 
"); 87 | msg.append("Finalize can be only called once."); 88 | throw InvalidOperationException(msg); 89 | } else if (!(isHeaderReady && isBaseReady)) { 90 | string msg( 91 | "The header and the sequence must be set before calling finalize"); 92 | throw InvalidOperationException(msg); 93 | } else { 94 | help(1000000, true); 95 | isFinalized = true; 96 | } 97 | } 98 | 99 | void Chromosome::help(int len, bool canMerge) { 100 | effectiveSize = 0; 101 | segLength = len; 102 | segment = new vector *>(); 103 | // 3/31/2016 104 | //segment->reserve(100); 105 | 106 | toUpperCase(); 107 | removeN(); 108 | if (canMerge) { 109 | mergeSegments(); 110 | } 111 | 112 | if( size() > segLength ){ 113 | makeSegmentList(); 114 | } 115 | 116 | calculateEffectiveSize(); 117 | } 118 | 119 | Chromosome::~Chromosome() { 120 | header.clear(); 121 | base.clear(); 122 | 123 | Util::deleteInVector(segment); 124 | segment->clear(); 125 | delete segment; 126 | } 127 | 128 | void Chromosome::readFasta() { 129 | bool isFirst = true; 130 | header = string(""); 131 | base = string(""); 132 | 133 | ifstream in(chromFile.c_str()); 134 | while (in.good()) { 135 | string line; 136 | getline(in, line); 137 | if (line[0] == '>') { 138 | if (!isFirst) { 139 | string msg = "Chromosome file: "; 140 | msg = msg + chromFile; 141 | msg = 142 | msg 143 | + " must have one sequence only. 
But it has more than one."; 144 | throw InvalidInputException(msg); 145 | } else { 146 | header = line; 147 | isFirst = false; 148 | } 149 | } else { 150 | base.append(line); 151 | } 152 | } 153 | in.close(); 154 | } 155 | 156 | /** 157 | * Convert alphabet to upper case if it has not been done before 158 | **/ 159 | void Chromosome::toUpperCase() { 160 | for (unsigned int i = 0; i < base.length(); i++) { 161 | base[i] = toupper(base[i]); 162 | } 163 | } 164 | 165 | /** 166 | * Segment coordinates are inclusive [s,e] 167 | **/ 168 | void Chromosome::removeN() { 169 | // Store non-N index 170 | int start = -1; 171 | for (unsigned int i = 0; i < base.size(); i++) { 172 | if (base[i] != 'N' && start == -1) { 173 | start = i; 174 | } else if (base[i] == 'N' && start != -1) { 175 | vector * v = new vector(); 176 | 177 | v->push_back(start); 178 | v->push_back(i - 1); 179 | 180 | // 3/31/201 181 | v->shrink_to_fit(); 182 | 183 | segment->push_back(v); 184 | 185 | start = -1; 186 | } else if (i == base.size() - 1 && base[i] != 'N' && start != -1) { 187 | vector * v = new vector(); 188 | v->push_back(start); 189 | v->push_back(i); 190 | 191 | // 3/31/201 192 | v->shrink_to_fit(); 193 | 194 | 195 | segment->push_back(v); 196 | 197 | start = -1; 198 | } 199 | } 200 | } 201 | 202 | /** 203 | * If the gap between two consecutive segments is less than 10 bp. 204 | * Segments that are shorter than 20 bp are not added. 
205 | */ 206 | void Chromosome::mergeSegments() { 207 | // To do: set the size of the segment to 2 208 | 209 | vector *> * mSegment = new vector *>(); 210 | 211 | int s = segment->at(0)->at(0); 212 | int e = segment->at(0)->at(1); 213 | 214 | for (unsigned int i = 1; i < segment->size(); i++) { 215 | int s1 = segment->at(i)->at(0); 216 | int e1 = segment->at(i)->at(1); 217 | 218 | if (s1 - e < 10) { 219 | e = e1; 220 | } else { 221 | if (e - s + 1 >= 20) { 222 | vector * seg = new vector(); 223 | seg->push_back(s); 224 | seg->push_back(e); 225 | 226 | // 3/31/201 227 | seg->shrink_to_fit(); 228 | 229 | 230 | mSegment->push_back(seg); 231 | } 232 | 233 | s = s1; 234 | e = e1; 235 | } 236 | } 237 | 238 | // Handle the last index 239 | if (e - s + 1 >= 20) { 240 | vector * seg = new vector(); 241 | seg->push_back(s); 242 | seg->push_back(e); 243 | 244 | // 3/31/201 245 | seg->shrink_to_fit(); 246 | 247 | mSegment->push_back(seg); 248 | } 249 | 250 | Util::deleteInVector(segment); 251 | segment->clear(); 252 | segment = mSegment; 253 | 254 | } 255 | 256 | void Chromosome::makeSegmentList() { 257 | vector *> * segmentList = new vector *>(); 258 | int segmentCount = segment->size(); 259 | for (int oo = 0; oo < segmentCount; oo++) { 260 | int s = segment->at(oo)->at(0); 261 | int e = segment->at(oo)->at(1); 262 | 263 | if (e - s + 1 > segLength) { 264 | int fragNum = (int) (e - s + 1) / segLength; 265 | 266 | for (int h = 0; h < fragNum; h++) { 267 | int fragStart = s + (h * segLength); 268 | int fragEnd = (h == fragNum - 1) ? 
e : fragStart + segLength - 1; 269 | vector * v = new vector(); 270 | v->push_back(fragStart); 271 | v->push_back(fragEnd); 272 | 273 | // 3/31/201 274 | v->shrink_to_fit(); 275 | 276 | segmentList->push_back(v); 277 | } 278 | } else { 279 | vector * v = new vector(); 280 | v->push_back(segment->at(oo)->at(0)); 281 | v->push_back(segment->at(oo)->at(1)); 282 | 283 | // 3/31/201 284 | v->shrink_to_fit(); 285 | 286 | segmentList->push_back(v); 287 | } 288 | } 289 | 290 | Util::deleteInVector(segment); 291 | delete segment; 292 | segment = segmentList; 293 | } 294 | 295 | 296 | const string* Chromosome::getBase() { 297 | return &base; 298 | } 299 | 300 | const vector *> * Chromosome::getSegment() { 301 | return segment; 302 | } 303 | 304 | void Chromosome::printSegmentList() { 305 | int l = segment->size(); 306 | cout << "Segment list size = " << l << endl; 307 | for (int i = 0; i < l; i++) { 308 | cout << segment->at(i)->at(0) << "\t"; 309 | cout << segment->at(i)->at(1) << endl; 310 | } 311 | } 312 | 313 | string Chromosome::getHeader() { 314 | return header; 315 | } 316 | 317 | int Chromosome::size() { 318 | return base.size(); 319 | } 320 | 321 | void Chromosome::calculateEffectiveSize() { 322 | int segmentCount = segment->size(); 323 | for (int oo = 0; oo < segmentCount; oo++) { 324 | int s = segment->at(oo)->at(0); 325 | int e = segment->at(oo)->at(1); 326 | effectiveSize += (e - s + 1); 327 | } 328 | } 329 | 330 | int Chromosome::getEffectiveSize() { 331 | return effectiveSize; 332 | } 333 | 334 | int Chromosome::getGcContent() { 335 | int gc = 0; 336 | int size = base.size(); 337 | for (int i = 0; i < size; i++) { 338 | char n = base.at(i); 339 | if (n == 'C' || n == 'G') { 340 | gc++; 341 | } 342 | } 343 | return gc; 344 | } 345 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Chromosome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Chromosome.h 3 | * 4 | * 
Created on: Mar 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | #ifndef CHROMOSOME_H_ 8 | #define CHROMOSOME_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "IChromosome.h" 17 | #include "../exception/InvalidOperationException.h" 18 | #include "../exception/InvalidInputException.h" 19 | #include "../utility/Util.h" 20 | 21 | using namespace std; 22 | using namespace nonltr; 23 | using namespace utility; 24 | using namespace exception; 25 | 26 | namespace nonltr { 27 | /** One sequence (FASTA record): its header, its bases converted to upper case, and the list of inclusive [start, end] segments that contain no 'N'. */ class Chromosome: public IChromosome { 28 | public: 29 | /* Empty chromosome: fill it with setHeader() and setSequence()/appendToSequence(), then call finalize(). */ Chromosome(); 30 | /* Reads a FASTA file that must hold one sequence only; segments are at most 1000000 long and close segments are merged. */ Chromosome(string); 31 | /* As above; the flag selects whether close segments are merged. */ Chromosome(string, bool); 32 | /* As above; the int is the maximum segment length. */ Chromosome(string, int); 33 | /* Builds from a sequence (first) and its header (second). */ Chromosome(string &, string&); 34 | /* As above; the int is the maximum segment length. */ Chromosome(string &, string&, int); 35 | 36 | /* Number of 'C' and 'G' characters in the sequence (a count, not a percentage). */ int getGcContent(); 37 | 38 | virtual ~Chromosome(); 39 | 40 | virtual const string* getBase(); 41 | virtual const vector *> * getSegment(); 42 | virtual void printSegmentList(); 43 | virtual string getHeader(); 44 | /* Length of the whole sequence, Ns included. */ virtual int size(); 45 | /* Sum of the lengths of the segments. */ virtual int getEffectiveSize(); 46 | /* The three mutators throw InvalidOperationException once finalize() has been called. */ virtual void setHeader(string&); 47 | virtual void setSequence(string&); 48 | virtual void appendToSequence(string&); 49 | /* Processes the sequence; may be called once, after the header and the sequence are set. */ virtual void finalize(); 50 | 51 | 52 | protected: 53 | string chromFile; 54 | string header; 55 | string base; 56 | int effectiveSize; 57 | /* Maximum segment length used by makeSegmentList(). */ int segLength; 58 | 59 | /* Owned list of inclusive [start, end] pairs; allocated by help() and freed by the destructor. */ vector *> * segment; 60 | void readFasta(); 61 | void toUpperCase(); 62 | void removeN(); 63 | void mergeSegments(); 64 | /* Shared processing step: takes the maximum segment length and the merge flag. */ virtual void help(int, bool); 65 | void makeSegmentList(); 66 | void calculateEffectiveSize(); 67 | 68 | private: 69 | /* State flags checked by the mutators and by finalize(); only the default constructor initializes them. */ bool isHeaderReady; 70 | bool isBaseReady; 71 | bool isFinalized; 72 | 73 | /* NOTE(review): no definition of this method appears in Chromosome.cpp - confirm that it is still needed. */ void reverseSegments(); 74 | 75 | }; 76 | } 77 | 78 | #endif /* CHROMOSOME_H_ */ 79 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromosomeOneDigit.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigit.cpp 
3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD at the NCB1/NLM/NIH 6 | * A A 7 | * T T 8 | * G G 9 | * C C 10 | * R G or A 11 | * Y T or C 12 | * M A or C 13 | * K G or T 14 | * S G or C 15 | * W A or T 16 | * H A or C or T 17 | * B G or T or C 18 | * V G or C or A 19 | * D G or T or A 20 | * N G or T or A or C 21 | */ 22 | #include 23 | #include 24 | 25 | #include "Chromosome.h" 26 | #include "ChromosomeOneDigit.h" 27 | #include "../exception/InvalidInputException.h" 28 | 29 | using namespace exception; 30 | 31 | namespace nonltr { 32 | 33 | ChromosomeOneDigit::ChromosomeOneDigit() : 34 | Chromosome() { 35 | } 36 | 37 | ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : 38 | Chromosome(fileName) { 39 | help(); 40 | } 41 | 42 | ChromosomeOneDigit::ChromosomeOneDigit(string seq, string info) : 43 | Chromosome(seq, info) { 44 | help(); 45 | } 46 | 47 | void ChromosomeOneDigit::help() { 48 | // Build codes 49 | buildCodes(); 50 | // Modify the sequence in the super class 51 | encodeNucleotides(); 52 | } 53 | 54 | void ChromosomeOneDigit::finalize() { 55 | Chromosome::finalize(); 56 | help(); 57 | } 58 | 59 | void ChromosomeOneDigit::buildCodes() { 60 | // Make map 61 | codes = new map(); 62 | 63 | // Certain nucleotides 64 | codes->insert(map::value_type('A', (char) 0)); 65 | codes->insert(map::value_type('C', (char) 1)); 66 | codes->insert(map::value_type('G', (char) 2)); 67 | codes->insert(map::value_type('T', (char) 3)); 68 | 69 | // Common uncertain nucleotide 70 | // codes->insert(map::value_type('N', (char) 4)); 71 | 72 | // Uncertain nucleotides 73 | codes->insert(map::value_type('R', codes->at('G'))); 74 | codes->insert(map::value_type('Y', codes->at('C'))); 75 | codes->insert(map::value_type('M', codes->at('A'))); 76 | codes->insert(map::value_type('K', codes->at('T'))); 77 | codes->insert(map::value_type('S', codes->at('G'))); 78 | codes->insert(map::value_type('W', codes->at('T'))); 79 | codes->insert(map::value_type('H', 
codes->at('C'))); 80 | codes->insert(map::value_type('B', codes->at('T'))); 81 | codes->insert(map::value_type('V', codes->at('A'))); 82 | codes->insert(map::value_type('D', codes->at('T'))); 83 | codes->insert(map::value_type('N', codes->at('C'))); 84 | codes->insert(map::value_type('X', codes->at('G'))); 85 | } 86 | 87 | ChromosomeOneDigit::~ChromosomeOneDigit() { 88 | codes->clear(); 89 | delete codes; 90 | } 91 | 92 | /** 93 | * This method converts nucleotides in the segments to single digit codes 94 | */ 95 | void ChromosomeOneDigit::encodeNucleotides() { 96 | 97 | for (unsigned int s = 0; s < segment->size(); s++) { 98 | int segStart = segment->at(s)->at(0); 99 | int segEnd = segment->at(s)->at(1); 100 | for (int i = segStart; i <= segEnd; i++) { 101 | if (codes->count(base[i]) > 0) { 102 | base[i] = codes->at(base[i]); 103 | } else { 104 | string msg = "Invalid nucleotide: "; 105 | msg.append(1, base[i]); 106 | throw InvalidInputException(msg); 107 | } 108 | } 109 | } 110 | 111 | // Digitize skipped segments 112 | int segNum = segment->size(); 113 | if(segNum > 0){ 114 | // The first interval - before the first segment 115 | int segStart = 0; 116 | int segEnd = segment->at(0)->at(0)-1; 117 | 118 | for (int s = 0; s <= segNum; s++) { 119 | for (int i = segStart; i <= segEnd; i++) { 120 | char c = base[i]; 121 | if(c != 'N'){ 122 | if (codes->count(c) > 0) { 123 | base[i] = codes->at(c); 124 | } else { 125 | string msg = "Invalid nucleotide: "; 126 | msg.append(1, c); 127 | throw InvalidInputException(msg); 128 | } 129 | } 130 | } 131 | 132 | // The regular intervals between two segments 133 | if(s < segNum-1){ 134 | segStart = segment->at(s)->at(1)+1; 135 | segEnd = segment->at(s+1)->at(0)-1; 136 | } 137 | // The last interval - after the last segment 138 | else if(s == segNum - 1){ 139 | segStart = segment->at(s)->at(1)+1; 140 | segEnd = base.size()-1; 141 | } 142 | } 143 | } 144 | } 145 | 146 | /* 147 | void ChromosomeOneDigit::encodeNucleotides() { 148 | 
int seqLen = base.size(); 149 | 150 | for (int i = 0; i < seqLen; i++) { 151 | if (codes->count(base[i]) > 0) { 152 | base[i] = codes->at(base[i]); 153 | } else { 154 | string msg = "Invalid nucleotide: "; 155 | msg.append(1, base[i]); 156 | throw InvalidInputException(msg); 157 | } 158 | } 159 | 160 | } 161 | */ 162 | 163 | /** 164 | * Cannot be called on already finalized object. 165 | */ 166 | void ChromosomeOneDigit::makeR() { 167 | //cout << "Making reverse ..." << endl; 168 | makeReverse(); 169 | reverseSegments(); 170 | } 171 | 172 | /** 173 | * Cannot be called on already finalized object. 174 | */ 175 | void ChromosomeOneDigit::makeRC() { 176 | //cout << "Making reverse complement ..." << endl; 177 | makeComplement(); 178 | makeReverse(); 179 | reverseSegments(); 180 | } 181 | 182 | void ChromosomeOneDigit::makeComplement() { 183 | map complement; 184 | 185 | // Certain nucleotides 186 | complement.insert(map::value_type((char) 0, (char) 3)); 187 | complement.insert(map::value_type((char) 1, (char) 2)); 188 | complement.insert(map::value_type((char) 2, (char) 1)); 189 | complement.insert(map::value_type((char) 3, (char) 0)); 190 | 191 | // Unknown nucleotide 192 | complement.insert(map::value_type('N', 'N')); 193 | // complement.insert(map::value_type((char) 4, (char) 4)); 194 | 195 | // Convert a sequence to its complement 196 | int seqLen = base.size(); 197 | for (int i = 0; i < seqLen; i++) { 198 | if (complement.count(base[i]) > 0) { 199 | base[i] = complement.at(base[i]); 200 | } else { 201 | cerr << "Error: The digit " << (char) base[i]; 202 | cerr << " does not represent a base." 
<< endl; 203 | exit(2); 204 | } 205 | } 206 | } 207 | 208 | void ChromosomeOneDigit::makeReverse() { 209 | int last = base.size() - 1; 210 | 211 | // Last index to be switched 212 | int middle = base.size() / 2; 213 | 214 | for (int i = 0; i < middle; i++) { 215 | char temp = base[last - i]; 216 | base[last - i] = base[i]; 217 | base[i] = temp; 218 | } 219 | } 220 | 221 | void ChromosomeOneDigit::reverseSegments() { 222 | int segNum = segment->size(); 223 | int lastBase = size() - 1; 224 | 225 | // Calculate the coordinate on the main strand 226 | for (int i = 0; i < segNum; i++) { 227 | vector * seg = segment->at(i); 228 | 229 | int s = lastBase - seg->at(1); 230 | int e = lastBase - seg->at(0); 231 | seg->clear(); 232 | seg->push_back(s); 233 | seg->push_back(e); 234 | } 235 | 236 | // Reverse the regions within the list 237 | int lastRegion = segNum - 1; 238 | int middle = segNum / 2; 239 | for (int i = 0; i < middle; i++) { 240 | vector * temp = segment->at(lastRegion - i); 241 | (*segment)[lastRegion - i] = segment->at(i); 242 | (*segment)[i] = temp; 243 | } 244 | } 245 | 246 | } 247 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromosomeOneDigit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigit.h 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef CHROMOSOMEONEDIGIT_H_ 9 | #define CHROMOSOMEONEDIGIT_H_ 10 | 11 | #include 12 | #include "Chromosome.h" 13 | 14 | namespace nonltr { 15 | class ChromosomeOneDigit: public Chromosome { 16 | 17 | private: 18 | /* Fields */ 19 | map * codes; 20 | 21 | /* Methods */ 22 | void help(); 23 | void buildCodes(); 24 | void encodeNucleotides(); 25 | 26 | void makeReverse(); 27 | void makeComplement(); 28 | void reverseSegments(); 29 | 30 | public: 31 | /* Methods */ 32 | ChromosomeOneDigit(); 33 | ChromosomeOneDigit(string); 34 | 
ChromosomeOneDigit(string, string); 35 | virtual ~ChromosomeOneDigit(); 36 | virtual void finalize(); 37 | 38 | void makeR(); 39 | void makeRC(); 40 | }; 41 | } 42 | 43 | #endif /* CHROMOSOMEONEDIGIT_H_ */ 44 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromosomeRandom.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeRandom.cpp 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "ChromosomeRandom.h" 17 | #include "../exception/InvalidInputException.h" 18 | #include "../exception/InvalidStateException.h" 19 | #include "../utility/Util.h" 20 | 21 | using namespace std; 22 | using namespace exception; 23 | using namespace utility; 24 | 25 | namespace nonltr { 26 | 27 | ChromosomeRandom::ChromosomeRandom(int nIn, IChromosome* oChromIn, 28 | char unreadIn, vector* alphaIn) { 29 | // Check the order 30 | if (nIn < 0) { 31 | string msg("The Markov order must be non-negative. "); 32 | msg.append("The order received is: "); 33 | msg.append(Util::int2string(nIn)); 34 | msg.append("."); 35 | throw InvalidInputException(msg); 36 | } 37 | 38 | // n here is the length of the word, i.e. 
the order + 1 39 | n = nIn + 1; 40 | oChrom = oChromIn; 41 | unread = unreadIn; 42 | alpha = alphaIn; 43 | 44 | // Initialize the random sequence 45 | int size = oChrom->getBase()->size(); 46 | rBase = new string(size, unread); 47 | 48 | // Initialize key list 49 | keyList = new vector(); 50 | 51 | // Initialize the table 52 | table = new map(); 53 | 54 | // Handle unusual characters in the first word of a segment 55 | // Make map 56 | codes = new map(); 57 | codes->insert(map::value_type('A', 'A')); 58 | codes->insert(map::value_type('C', 'C')); 59 | codes->insert(map::value_type('G', 'G')); 60 | codes->insert(map::value_type('T', 'T')); 61 | codes->insert(map::value_type('R', 'G')); 62 | codes->insert(map::value_type('Y', 'C')); 63 | codes->insert(map::value_type('M', 'A')); 64 | codes->insert(map::value_type('K', 'T')); 65 | codes->insert(map::value_type('S', 'G')); 66 | codes->insert(map::value_type('W', 'T')); 67 | codes->insert(map::value_type('H', 'C')); 68 | codes->insert(map::value_type('B', 'T')); 69 | codes->insert(map::value_type('V', 'A')); 70 | codes->insert(map::value_type('D', 'T')); 71 | codes->insert(map::value_type('N', 'C')); 72 | codes->insert(map::value_type('X', 'G')); 73 | 74 | // Start operations 75 | cout << "\tFilling key list ..." << endl; 76 | fillKeyList(); 77 | 78 | cout << "\tInitializing table ..." << endl; 79 | initializeTable(); 80 | 81 | cout << "\tCounting words ..." << endl; 82 | countWords(); 83 | 84 | cout << "\tCalculating probabilities ..." << endl; 85 | convertToProbabilities(); 86 | 87 | //cout << "\tPrinting the table ..." << endl; 88 | //printTable(); 89 | 90 | cout << "\tGenerating the random sequence ..." 
<< endl; 91 | generateRandomSequence(); 92 | } 93 | 94 | ChromosomeRandom::~ChromosomeRandom() { 95 | codes->clear(); 96 | delete codes; 97 | 98 | keyList->clear(); 99 | delete keyList; 100 | 101 | table->clear(); 102 | delete table; 103 | 104 | delete rBase; 105 | } 106 | 107 | void ChromosomeRandom::fillKeyList() { 108 | // Collect keys 109 | int alphaCount = alpha->size(); 110 | 111 | // Order 0 112 | 113 | for (int h = 0; h < alphaCount; h++) { 114 | string s(""); 115 | s.append(1, alpha->at(h)); 116 | keyList->push_back(s); 117 | } 118 | 119 | // Order 1 and higher 120 | for (int g = 1; g < n; g++) { 121 | vector o; 122 | int keyListSize = keyList->size(); 123 | for (int i = 0; i < keyListSize; i++) { 124 | for (int j = 0; j < alphaCount; j++) { 125 | string s(keyList->at(i)); 126 | s.append(1, alpha->at(j)); 127 | o.push_back(s); 128 | } 129 | } 130 | keyList->clear(); 131 | (*keyList) = o; 132 | } 133 | } 134 | 135 | void ChromosomeRandom::initializeTable() { 136 | int keyListSize = keyList->size(); 137 | for (int i = 0; i < keyListSize; i++) { 138 | table->insert(valType(keyList->at(i), 1)); 139 | } 140 | } 141 | 142 | void ChromosomeRandom::countWords() { 143 | // Get the original sequence 144 | const string* oBase = oChrom->getBase(); 145 | 146 | // Count words 147 | const vector *> * segmentList = oChrom->getSegment(); 148 | int segmentCount = segmentList->size(); 149 | for (int i = 0; i < segmentCount; i++) { 150 | int s = segmentList->at(i)->at(0); 151 | int e = segmentList->at(i)->at(1); 152 | if (e - s + 1 >= n) { 153 | 154 | int limit = e - n + 1; 155 | 156 | for (int h = s; h <= limit; h++) { 157 | // Check if the current base is a standard one. 158 | // Words including non-standard bases are not counted. 
159 | 160 | char c = oBase->at(h); 161 | 162 | int alphaCount = alpha->size(); 163 | bool isStandard = false; 164 | for (int a = 0; a < alphaCount; a++) { 165 | if (alpha->at(a) == c) { 166 | isStandard = true; 167 | break; 168 | } 169 | } 170 | 171 | // Increment the count 172 | if (isStandard) { 173 | string word = oBase->substr(h, n); 174 | if (table->count(word) > 0) { 175 | (*table)[word] = table->at(word) + 1; 176 | } else { 177 | cout << "\t\tIgnoring " << word << endl; 178 | } 179 | } 180 | } 181 | } 182 | } 183 | } 184 | 185 | void ChromosomeRandom::convertToProbabilities() { 186 | int alphaCount = alpha->size(); 187 | int keyListSize = keyList->size(); 188 | for (int i = 0; i < keyListSize; i += alphaCount) { 189 | double sum = 0; 190 | for (int j = 0; j < alphaCount; j++) { 191 | string key = keyList->at(i + j); 192 | sum += table->at(key); 193 | } 194 | for (int j = 0; j < alphaCount; j++) { 195 | string key = keyList->at(i + j); 196 | (*table)[key] = ((double) table->at(key)) / sum; 197 | } 198 | } 199 | } 200 | 201 | void ChromosomeRandom::generateRandomSequence() { 202 | // Get the original sequence 203 | const string* oBase = oChrom->getBase(); 204 | 205 | // Alphabet count 206 | int alphaCount = alpha->size(); 207 | 208 | // Get the original segments 209 | const vector *> * segmentList = oChrom->getSegment(); 210 | int segmentCount = segmentList->size(); 211 | 212 | // Generate random segments 213 | for (int i = 0; i < segmentCount; i++) { 214 | int s = segmentList->at(i)->at(0); 215 | int e = segmentList->at(i)->at(1); 216 | 217 | if (e - s + 1 > n) { 218 | //string order = oBase->substr(s, n - 1); 219 | string order(""); 220 | // The first order is based on the original sequence. 221 | for (int w = s; w < s + n - 1; w++) { 222 | (*rBase)[w] = codes->at(oBase->at(w)); 223 | order.append(1, codes->at(oBase->at(w))); 224 | } 225 | 226 | for (int h = s + n - 1; h <= e; h++) { 227 | // Subsequent orders are based on the random sequence. 
228 | order = rBase->substr(h - n + 1, n - 1); 229 | vector > lottery; 230 | int chanceSoFar = 0; 231 | for (int k = 0; k < alphaCount; k++) { 232 | string temp = order; 233 | temp.append(1, alpha->at(k)); 234 | if (table->count(temp) > 0) { 235 | int periodStart = chanceSoFar; 236 | int periodEnd = periodStart + (100 * table->at(temp)); 237 | chanceSoFar = periodEnd + 1; 238 | vector entry; 239 | entry.push_back(alpha->at(k)); 240 | entry.push_back(periodStart); 241 | entry.push_back(periodEnd); 242 | lottery.push_back(entry); 243 | } else { 244 | string msg("This word must exist in the table: "); 245 | msg.append(temp); 246 | msg.append("."); 247 | throw InvalidStateException(msg); 248 | } 249 | } 250 | 251 | if (lottery.size() > 0) { 252 | int randInt = rand() % chanceSoFar; 253 | 254 | for (int tt = 0; tt < alphaCount; tt++) { 255 | vector entry = lottery.at(tt); 256 | if (randInt >= entry.at(1) && randInt <= entry.at(2)) { 257 | (*rBase)[h] = entry.at(0); 258 | break; 259 | } 260 | } 261 | lottery.clear(); 262 | } else { 263 | string msg("The lottery vector cannot be empty."); 264 | throw InvalidStateException(msg); 265 | } 266 | } 267 | } 268 | } 269 | 270 | // Make sure that the generated sequence has the same length as the original sequence 271 | if (oBase->size() != rBase->size()) { 272 | cerr << "The original sequence and the random sequence "; 273 | cerr << "do not have the same size." 
<< endl; 274 | cerr << "Original sequence size is: " << oBase->size() << endl; 275 | cerr << "Generated sequence size is: " << rBase->size() << endl; 276 | } 277 | } 278 | 279 | void ChromosomeRandom::printTable() { 280 | map::iterator iterStart = table->begin(); 281 | map::iterator iterEnd = table->end(); 282 | while (iterStart != iterEnd) { 283 | cout << (*iterStart).first << " -> " << (*iterStart).second << endl; 284 | iterStart++; 285 | } 286 | } 287 | 288 | /** 289 | * Returns the segments of the original chromosome 290 | */ 291 | const vector *> * ChromosomeRandom::getSegment() { 292 | return oChrom->getSegment(); 293 | } 294 | 295 | /** 296 | * Returns the random sequence 297 | */ 298 | const string* ChromosomeRandom::getBase() { 299 | return rBase; 300 | } 301 | 302 | /** 303 | * Returns the header indicating the order of the Markov chain 304 | */ 305 | string ChromosomeRandom::getHeader() { 306 | string header = oChrom->getHeader(); 307 | //header.append(" - Random based on "); 308 | //header.append(Util::int2string(n - 1)); 309 | //header.append("-order Markov chain."); 310 | return header; 311 | } 312 | 313 | void ChromosomeRandom::printEffectiveSequence(string outputFile) { 314 | int totalSize = rBase->size(); 315 | string * effectiveRBase = new string(""); 316 | for (int i = 0; i < totalSize; i++) { 317 | char b = rBase->at(i); 318 | if (b != unread) { 319 | effectiveRBase->append(1, b); 320 | } 321 | } 322 | 323 | // Make sure that the effective sequence is shorter than the original 324 | // length 325 | if (effectiveRBase->size() > totalSize) { 326 | cerr << "The effective length must be <= the original length." 
<< endl; 327 | cerr << "Generated sequence size is: " << totalSize << endl; 328 | cerr << "The effective size is: " << effectiveRBase->size() << endl; 329 | 330 | } 331 | 332 | printSequence(outputFile, effectiveRBase); 333 | 334 | delete effectiveRBase; 335 | } 336 | 337 | void ChromosomeRandom::printSequence(string outputFile) { 338 | printSequence(outputFile, rBase); 339 | } 340 | 341 | void ChromosomeRandom::printSequence(string outputFile, string * baseToPrint) { 342 | cout << "Printing chromosome to file ..." << endl; 343 | ofstream outSequence; 344 | outSequence.open(outputFile.c_str(), ios::out); 345 | 346 | int step = 50; 347 | 348 | outSequence << getHeader() << endl; 349 | int len = baseToPrint->size(); 350 | 351 | for (int i = 0; i < len; i = i + step) { 352 | int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; 353 | for (int k = i; k <= e; k++) { 354 | outSequence << baseToPrint->at(k); 355 | } 356 | outSequence << endl; 357 | } 358 | outSequence << endl; 359 | 360 | outSequence.close(); 361 | } 362 | 363 | } /* namespace nonltr */ 364 | -------------------------------------------------------------------------------- /src_2.0/nonltr/ChromosomeRandom.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeRandom.h 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMOSOMERANDOM_H_ 9 | #define CHROMOSOMERANDOM_H_ 10 | 11 | #include 12 | 13 | #include "IChromosome.h" 14 | 15 | namespace nonltr { 16 | 17 | class ChromosomeRandom: public nonltr::IChromosome { 18 | // Key-value pair type. 
19 | typedef map::value_type valType; 20 | 21 | private: 22 | int n; 23 | char unread; 24 | IChromosome * oChrom; 25 | vector * alpha; 26 | map * table; 27 | string * rBase; 28 | vector * keyList; 29 | map * codes; 30 | 31 | void fillKeyList(); 32 | void initializeTable(); 33 | void countWords(); 34 | void convertToProbabilities(); 35 | void printTable(); 36 | void generateRandomSequence(); 37 | 38 | public: 39 | ChromosomeRandom(int, IChromosome*, char, vector*); 40 | virtual ~ChromosomeRandom(); 41 | 42 | virtual const string* getBase(); 43 | virtual const vector *> * getSegment(); 44 | virtual string getHeader(); 45 | virtual void printSequence(string); 46 | void printSequence(string, string *); 47 | void printEffectiveSequence(string); 48 | }; 49 | 50 | } /* namespace nonltr */ 51 | #endif /* CHROMOSOMERANDOM_H_ */ 52 | -------------------------------------------------------------------------------- /src_2.0/nonltr/DetectorMaxima.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * DetectorMaxima.cpp 3 | * 4 | * Created on: May 31, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "DetectorMaxima.h" 9 | #include "../utility/Util.h" 10 | #include "../utility/Location.h" 11 | #include "../exception/InvalidStateException.h" 12 | 13 | #include 14 | // Delete start 15 | #include 16 | using namespace std; 17 | // Delete end 18 | 19 | using namespace exception; 20 | 21 | namespace nonltr { 22 | 23 | DetectorMaxima::DetectorMaxima(int segStartIn, int segEndIn, double sIn, 24 | double wIn, double mIn, double tIn, double pIn, int eIn, 25 | vector * oScoresIn) { 26 | 27 | // ToDo: make sure that segStart and segEnd are within the input scores. 
28 | segStart = segStartIn; 29 | segEnd = segEndIn; 30 | s = sIn; 31 | w = wIn; 32 | m = mIn; 33 | t = tIn; 34 | p = pIn; 35 | e = eIn; 36 | oScores = oScoresIn; 37 | 38 | halfS = s; 39 | //s / 2; 40 | 41 | mask = new vector(); 42 | // Complete 43 | scores = new vector(); 44 | 45 | // Trimmed on both sides 46 | first = new vector(); 47 | 48 | // Trimmed on both sides 49 | second = new vector(); 50 | 51 | // Coordinates according to the complete sequence 52 | maxima = new vector(); 53 | 54 | // Coordinates according to the complete sequence 55 | // allMaxima = new vector *>(); 56 | 57 | // Coordinates according to the complete sequence 58 | separatorList = new vector(); 59 | 60 | // Coordinates according to the complete sequence 61 | regionList = new vector(); 62 | 63 | makeMask(); 64 | 65 | smooth(); 66 | 67 | deriveFirst(); 68 | 69 | deriveSecond(); 70 | 71 | // Free memory start 72 | mask->clear(); 73 | delete mask; 74 | scores->clear(); 75 | delete scores; 76 | // Free memory end 77 | 78 | findMaxima(); 79 | 80 | // Free memory start 81 | first->clear(); 82 | delete first; 83 | second->clear(); 84 | delete second; 85 | // Free memory end 86 | 87 | findSeparators(); 88 | 89 | findRegions(); 90 | 91 | // Free memory start 92 | maxima->clear(); 93 | delete maxima; 94 | Util::deleteInVector(separatorList); 95 | separatorList->clear(); 96 | delete separatorList; 97 | // Free memory end 98 | 99 | extendRegions(); 100 | } 101 | 102 | /* 103 | const vector *>* DetectorMaxima::getAllMaxima() const { 104 | return allMaxima; 105 | } 106 | */ 107 | 108 | const vector* DetectorMaxima::getFirst() const { 109 | return first; 110 | } 111 | 112 | const vector* DetectorMaxima::getSecond() const { 113 | return second; 114 | } 115 | 116 | const vector * DetectorMaxima::getRegionList() const { 117 | return regionList; 118 | } 119 | 120 | DetectorMaxima::~DetectorMaxima() { 121 | /* 122 | Util::deleteInVector (allMaxima); 123 | allMaxima->clear(); 124 | delete allMaxima; 125 | */ 126 
| 127 | Util::deleteInVector(regionList); 128 | regionList->clear(); 129 | delete regionList; 130 | } 131 | 132 | void DetectorMaxima::makeMask() { 133 | const double PI = 3.14159265358979323846; 134 | double sigma = (double) s / 3.5; 135 | const double PART_1 = 1 / sqrt(2 * PI * pow(sigma, 2)); 136 | 137 | int l = 2 * s + 1; 138 | for (int i = 0; i < l; i++) { 139 | double g = PART_1 * exp(-1 * pow(i - s, 2) / (2 * pow(sigma, 2))); 140 | mask->push_back(g); 141 | } 142 | 143 | // For testing only 144 | /* 145 | for (int i = 0; i < l; i++) { 146 | cout << i << "\t" << mask->at(i) << endl; 147 | } 148 | cout << endl; 149 | cout << endl; 150 | */ 151 | // End testing 152 | } 153 | 154 | void DetectorMaxima::smooth() { 155 | for (int i = segStart; i <= segEnd; i++) { 156 | int winS = i - s; 157 | int maskS = 0; 158 | if (winS < segStart) { 159 | maskS = -1 * (winS - segStart); 160 | winS = segStart; 161 | } 162 | 163 | int winE = (i + s > segEnd) ? segEnd : i + s; 164 | // int winL = winE - winS + 1; 165 | 166 | double sum = 0.0; 167 | double maskSum = 0.0; 168 | 169 | int j = winS; 170 | int h = maskS; 171 | 172 | while (j <= winE) { 173 | double weight = mask->at(h); 174 | sum += oScores->at(j) * weight; 175 | maskSum += weight; 176 | 177 | j++; 178 | h++; 179 | } 180 | 181 | if (maskSum <= 0.0) { 182 | string msg("The sum of the weights in the mask must be > 0"); 183 | throw InvalidStateException(msg); 184 | } 185 | 186 | scores->push_back(sum / maskSum); 187 | // scores->push_back(sum / winL); 188 | } 189 | 190 | // Testing - start 191 | /* 192 | cout << "The smoothed scores ... 
" << endl; 193 | for (int k = 0; k < scores->size(); k++) { 194 | if (k % 25 == 0) { 195 | cout << endl; 196 | } 197 | cout << scores->at(k) << " "; 198 | } 199 | cout << endl; 200 | cout << endl; 201 | */ 202 | // Testing - end 203 | } 204 | 205 | void DetectorMaxima::deriveFirst() { 206 | double l = 0.0; 207 | double r = 0.0; 208 | 209 | for (int i = 0; i < w; i++) { 210 | l += scores->at(i); 211 | } 212 | 213 | for (int i = w + 1; i <= 2 * w; i++) { 214 | r += scores->at(i); 215 | } 216 | 217 | first->push_back(round(-1 * l + r)); 218 | 219 | for (int i = w + 1; i < scores->size() - w; i++) { 220 | l -= scores->at(i - w - 1); 221 | l += scores->at(i - 1); 222 | r -= scores->at(i); 223 | r += scores->at(i + w); 224 | first->push_back(round(-1 * l + r)); 225 | } 226 | 227 | // For testing only 228 | /* 229 | for (int i = 0; i < first->size(); i++) { 230 | cout << first->at(i) << " "; 231 | } 232 | cout << endl; 233 | */ 234 | } 235 | 236 | void DetectorMaxima::deriveSecond() { 237 | double l = 0.0; 238 | double r = 0.0; 239 | double d = 2 * w; 240 | 241 | for (int i = 0; i < w; i++) { 242 | l += scores->at(i); 243 | } 244 | 245 | for (int i = w + 1; i <= 2 * w; i++) { 246 | r += scores->at(i); 247 | } 248 | 249 | second->push_back(round(l + r - d * scores->at(w))); 250 | 251 | for (int i = w + 1; i < scores->size() - w; i++) { 252 | l -= scores->at(i - w - 1); 253 | l += scores->at(i - 1); 254 | r -= scores->at(i); 255 | r += scores->at(i + w); 256 | second->push_back(round(l + r - d * scores->at(i))); 257 | } 258 | 259 | // For testing only 260 | /* 261 | for (int i = 0; i < second->size(); i++) { 262 | cout << second->at(i) << " "; 263 | } 264 | cout << endl; 265 | */ 266 | } 267 | 268 | void DetectorMaxima::findMaxima() { 269 | int firstSize = first->size(); 270 | 271 | for (int i = 1; i < firstSize; i++) { 272 | double magnitude = abs(first->at(i - 1) - first->at(i)); 273 | 274 | if (first->at(i) == 0 || (first->at(i - 1) < 0 & first->at(i) > 0) 275 | || 
(first->at(i - 1) > 0 && first->at(i) < 0)) { 276 | if (second->at(i) < 0) { 277 | // Adjust index 278 | int peakIndex = i + w + segStart; 279 | 280 | // Record the index of the peak and its magnitude 281 | /* 282 | vector * pair = new vector(); 283 | pair->push_back(peakIndex); 284 | pair->push_back(magnitude); 285 | allMaxima->push_back(pair); 286 | */ 287 | 288 | // Make sure that the peak is in a high-scoring region of width s centered on the peak 289 | if (magnitude > m) { 290 | // Make sure that the peak is in a high-scoring region of width s centered on the peak 291 | int peakStart = peakIndex - halfS; 292 | if (peakStart < segStart) { 293 | peakStart = segStart; 294 | } 295 | int peakEnd = peakIndex + halfS; 296 | if (peakEnd > segEnd) { 297 | peakEnd = segEnd; 298 | } 299 | 300 | double count = countLessThan(oScores, peakStart, peakEnd, 301 | t); 302 | double v = (100.00 * count) 303 | / ((double) peakEnd - peakStart + 1); 304 | if (v < p) { 305 | maxima->push_back(peakIndex); 306 | } 307 | } 308 | } 309 | } 310 | } 311 | 312 | // Testing - start 313 | /* 314 | cout << "Maxima: " << endl; 315 | for (int i = 0; i < maxima->size(); i++) { 316 | cout << maxima->at(i) << " "; 317 | } 318 | cout << endl << endl; 319 | */ 320 | // Testing - end 321 | } 322 | 323 | int DetectorMaxima::countLessThan(vector * list, int s, int e, double t) { 324 | int count = 0; 325 | for (int u = s; u <= e; u++) { 326 | if (list->at(u) < t) { 327 | count++; 328 | } 329 | } 330 | return count; 331 | } 332 | 333 | void DetectorMaxima::findSeparators() { 334 | int n = maxima->size(); 335 | 336 | if (n > 0) { 337 | for (int i = 0; i < n - 1; i++) { 338 | int j = i + 1; 339 | int s = maxima->at(i); 340 | int e = maxima->at(j); 341 | 342 | double count = countLessThan(oScores, s, e, t); 343 | double v = (100.00 * count) / ((double) e - s + 1); 344 | if (v >= p) { 345 | separatorList->push_back(new Location(s, e)); 346 | } 347 | } 348 | } 349 | 350 | // For testing only 351 | /* 352 | 
cout << "Separators: " << endl; 353 | for (int h = 0; h < separatorList->size(); h++) { 354 | cout << separatorList->at(h)->toString() << endl; 355 | } 356 | cout << endl; 357 | */ 358 | } 359 | 360 | void DetectorMaxima::findRegions() { 361 | // Determine regions: a region runs from the first maximum, or from the end of the previous separator, to the start of the next separator; the last region ends at the last maximum. The locals segStart and segEnd shadow the members of the same name. 362 | int maximaCount = maxima->size(); 363 | if (maximaCount > 0) { 364 | int segStart = maxima->at(0); 365 | int separatorCount = separatorList->size(); 366 | for (int k = 0; k < separatorCount; k++) { 367 | int segEnd = separatorList->at(k)->getStart(); 368 | regionList->push_back(new Location(segStart, segEnd)); 369 | segStart = separatorList->at(k)->getEnd(); 370 | } 371 | regionList->push_back( 372 | new Location(segStart, maxima->at(maximaCount - 1))); 373 | } 374 | 375 | // For testing only 376 | /* 377 | cout << "Regions: " << endl; 378 | for (int r = 0; r < regionList->size(); r++) { 379 | cout << regionList->at(r)->toString() << endl; 380 | } 381 | cout << endl; 382 | */ 383 | // End testing 384 | } 385 | 386 | /* 387 | * Widens every region in regionList. A region made of a single position is first padded by halfS on each side (clamped to segStart and segEnd). Each end is then moved outward in steps of e for as long as 100 * countLessThan(...) / e stays below p, and is finally adjusted base by base against the threshold t. A region that overlaps its predecessor is merged into it. 388 | */ 389 | void DetectorMaxima::extendRegions() { 390 | int regionCount = regionList->size(); 391 | int gg = 0; 392 | while (gg < regionCount) { 393 | ILocation * region = regionList->at(gg); 394 | 395 | int regionStart = region->getStart(); 396 | int regionEnd = region->getEnd(); 397 | 398 | // Handle the case where the region is made of one nucleotide 399 | if (regionStart == regionEnd) { 400 | regionStart = regionStart - halfS; 401 | if (regionStart < segStart) { 402 | regionStart = segStart; 403 | } 404 | region->setStart(regionStart); 405 | 406 | regionEnd = regionEnd + halfS; 407 | if (regionEnd > segEnd) { 408 | regionEnd = segEnd; 409 | } 410 | region->setEnd(regionEnd); 411 | } 412 | 413 | // Left end: Extend step by step (windows of e positions; the last window may be clipped to lEnd, but the percentage is still computed over e) 414 | int lEnd = (gg == 0) ? 
segStart : regionList->at(gg - 1)->getEnd(); 415 | for (int u = regionStart; u >= lEnd; u = u - e) { 416 | int d = u - e + 1; 417 | if (d < lEnd) { 418 | d = lEnd; 419 | } 420 | double v = (100.0 * countLessThan(oScores, d, u, t)) / ((double) e); 421 | if (v >= p) { 422 | break; 423 | } else { 424 | regionStart = d; 425 | } 426 | } 427 | 428 | // Left end: Extend or erode base by base (if the score at the boundary is below t, erode inward to the first score >= t; otherwise extend outward while scores stay >= t) 429 | if (oScores->at(regionStart) < t) { 430 | for (int a = regionStart; a < regionEnd; a++) { 431 | if (oScores->at(a) >= t) { 432 | regionStart = a; 433 | break; 434 | } 435 | } 436 | } else { 437 | for (int a = regionStart; a >= lEnd; a--) { 438 | if (oScores->at(a) >= t) { 439 | regionStart = a; 440 | } else { 441 | break; 442 | } 443 | } 444 | } 445 | 446 | // Set new start to check for validity 447 | region->setStart(regionStart); 448 | 449 | // Right end: extend to the right step by step (the last window may be clipped to rEnd, but the percentage is still computed over e) 450 | int rEnd = 451 | (gg == regionCount - 1) ? 452 | segEnd : regionList->at(gg + 1)->getStart(); 453 | for (int u = regionEnd; u <= rEnd; u = u + e) { 454 | int d = u + e - 1; 455 | if (d > rEnd) { 456 | d = rEnd; 457 | } 458 | double v = (100.0 * countLessThan(oScores, u, d, t)) / ((double) e); 459 | if (v >= p) { 460 | break; 461 | } else { 462 | regionEnd = d; 463 | } 464 | } 465 | 466 | // Right end: extend or erode base by base (mirror image of the left end) 467 | if (oScores->at(regionEnd) < t) { 468 | for (int a = regionEnd; a > regionStart; a--) { 469 | if (oScores->at(a) >= t) { 470 | regionEnd = a; 471 | break; 472 | } 473 | } 474 | } else { 475 | for (int a = regionEnd; a <= rEnd; a++) { 476 | if (oScores->at(a) >= t) { 477 | regionEnd = a; 478 | } else { 479 | break; 480 | } 481 | } 482 | } 483 | 484 | // Set new end to check for validity 485 | region->setEnd(regionEnd); 486 | 487 | // Merge this region into the previous one when the two overlap 488 | if (gg > 0) { 489 | ILocation * pRegion = regionList->at(gg - 1); 490 | int pStart = pRegion->getStart(); 491 | int pEnd = pRegion->getEnd(); 492 | 493 | if (Util::isOverlapping(pStart, pEnd, 
regionStart, regionEnd)) { 494 | pRegion->setEnd(regionEnd); 495 | regionList->erase(regionList->begin() + gg); /* NOTE(review): the erased ILocation pointer is not deleted here; confirm who owns it, otherwise the merged-away region leaks */ 496 | regionCount = regionList->size(); 497 | } else { 498 | gg++; 499 | } 500 | } 501 | 502 | if (gg == 0) { 503 | gg++; 504 | } 505 | } 506 | 507 | // Testing - Start 508 | /* 509 | cout << "Extended regions: " << endl; 510 | for (int r = 0; r < regionList->size(); r++) { 511 | cout << regionList->at(r)->toString() << endl; 512 | } 513 | cout << endl; 514 | */ 515 | // Testing - End 516 | } 517 | 518 | } /* namespace nonltr */ 519 | -------------------------------------------------------------------------------- /src_2.0/nonltr/DetectorMaxima.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DetectorMaxima.h 3 | * 4 | * Created on: May 31, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef DETECTORMAXIMA_H_ 9 | #define DETECTORMAXIMA_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "../utility/ILocation.h" 15 | 16 | using namespace std; 17 | using namespace utility; 18 | 19 | namespace nonltr { 20 | /* Builds a list of regions (regionList) from a vector of scores. The private helpers are named after the processing steps; the order in which they run is set by the constructor, which is not shown in this header. */ 21 | class DetectorMaxima { 22 | private: 23 | 24 | int segStart; 25 | int segEnd; 26 | double s; 27 | double w; 28 | double m; 29 | double t; 30 | double p; 31 | int e; 32 | int halfS; 33 | 34 | vector * oScores; 35 | vector * scores; 36 | vector * mask; 37 | vector * first; 38 | vector * second; 39 | vector * maxima; 40 | // vector *> * allMaxima; 41 | 42 | vector * separatorList; 43 | vector * regionList; 44 | 45 | void makeMask(); 46 | void smooth(); 47 | void deriveFirst(); 48 | void deriveSecond(); 49 | void findMaxima(); 50 | 51 | void findSeparators(); 52 | void findRegions(); 53 | 54 | void extendRegions(); 55 | 56 | int countLessThan(vector *, int, int, double); 57 | 58 | /** 59 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 60 | */ 61 | inline double round(double number) { 62 | return number < 0.0 ? 
ceil(number - 0.5) : floor(number + 0.5); 63 | } 64 | 65 | public: 66 | DetectorMaxima(int, int, double, double, double, double, double, int, 67 | vector *); 68 | virtual ~DetectorMaxima(); 69 | const vector* getRegionList() const; 70 | const vector* getFirst() const; 71 | const vector* getSecond() const; 72 | 73 | // const vector *>* getAllMaxima() const; 74 | }; 75 | 76 | } /* namespace nonltr */ 77 | #endif /* DETECTORMAXIMA_H_ */ 78 | -------------------------------------------------------------------------------- /src_2.0/nonltr/EnrichmentMarkovView.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * EnrichmentMarkovView.cpp 3 | * 4 | * Created on: Apr 17, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | namespace nonltr { 9 | 10 | /** 11 | * k is the k-mer length, order is the Markov order (it starts at 0), and m is the minimum observed count (minObs). 12 | */ 13 | template 14 | EnrichmentMarkovView::EnrichmentMarkovView(int k, int order, int m) : 15 | minObs(m), factor(10000.00), KmerHashTable(k) { 16 | initialize(order); 17 | } 18 | 19 | template 20 | EnrichmentMarkovView::EnrichmentMarkovView(int k, V initValue, int order, 21 | int m) : 22 | minObs(m), factor(10000.00), KmerHashTable(k, initValue) { 23 | initialize(order); 24 | } 25 | /* Validates the order (0 <= order < k), resets the total length and creates one KmerHashTable per word length from 1 to order + 1. */ 26 | template 27 | void EnrichmentMarkovView::initialize(int order) { 28 | o = order; 29 | if (o < 0) { 30 | string msg("The Markov order must be non-negative integer. 
"); 31 | msg.append("The invalid input is: "); 32 | msg.append(Util::int2string(o)); 33 | msg.append("."); 34 | throw InvalidInputException(msg); 35 | } 36 | 37 | if (o >= KmerHashTable::k) { 38 | string msg("The Markov order cannot be >= k (k-mer)."); 39 | throw InvalidInputException(msg); 40 | } 41 | 42 | l = 0; 43 | modelList = new vector *>(); 44 | 45 | for (int i = 1; i <= o + 1; i++) { 46 | modelList->push_back(new KmerHashTable(i)); 47 | } 48 | } 49 | 50 | template 51 | EnrichmentMarkovView::~EnrichmentMarkovView() { 52 | Util::deleteInVector(modelList); 53 | delete modelList; 54 | } 55 | 56 | /** 57 | * This method/function converts a decimal number to a quaternay one 58 | */ 59 | template 60 | string EnrichmentMarkovView::convertToQuaternary(I decimal){ 61 | string q(""); 62 | for (int x = 0; x < KmerHashTable::k; x++) { 63 | q.append(1, 0); 64 | } 65 | int index = 0; 66 | while (decimal > 0){ 67 | q.at(index++) = decimal % 4; 68 | decimal /= 4; 69 | } 70 | std::reverse(q.begin(), q.end()); 71 | 72 | return q; 73 | } 74 | 75 | /** 76 | * This method count words of size 1 to order+1 in the input sequence. 77 | * In other words, it updates the background tables. In addition, it 78 | * updates the length of the genome. 79 | * 80 | * sequence: is the input sequence. 81 | * start: the start index - inclosing. 82 | * end: the end index - inclosing. 83 | */ 84 | template 85 | void EnrichmentMarkovView::count(const char * sequence, int start, 86 | int end) { 87 | 88 | // Multiple by 2 if scanning the forward strand and its reverse complement 89 | // l = l + (2 * (end - start + 1)); 90 | l = l + (end - start + 1); 91 | 92 | int modelNumber = modelList->size(); 93 | for (int i = 0; i < modelNumber; i++) { 94 | KmerHashTable * t = modelList->at(i); 95 | t->wholesaleIncrement(sequence, start, end - i); 96 | } 97 | } 98 | 99 | /** 100 | * Normalize the count of words in each model. 101 | * Values stored in these models are multiplied by "factor." 
102 | */ 103 | template 104 | void EnrichmentMarkovView::generateProbapilities() { 105 | int modelNumber = modelList->size(); 106 | 107 | for (int m = 0; m < modelNumber; m++) { 108 | KmerHashTable * t = modelList->at(m); 109 | int tSize = t->getMaxTableSize(); 110 | 111 | for (int i = 0; i < tSize; i += 4) { 112 | double sum = 0.0; 113 | /* NOTE(review): if all four counts of a group are zero, sum stays 0 and the division below is 0/0; confirm that the tables are created with non-zero initial counts */ 114 | for (int j = i; j < i + 4; j++) { 115 | sum += t->valueOf(j); 116 | } 117 | 118 | for (int j = i; j < i + 4; j++) { 119 | t->insert(j, round(factor * ((double) t->valueOf(j) / sum))); 120 | } 121 | } 122 | } 123 | } 124 | /* Replaces every k-mer count in values with an enrichment score: round(observed - expected) when observed >= minObs and observed > expected, otherwise 0. The expected count is l times the product of the model probabilities of the k-mer. The table is split into segments that are handled by parallel OpenMP iterations. */ 125 | template 126 | void EnrichmentMarkovView::processTable() { 127 | char base = 4; 128 | int modelNumber = modelList->size(); 129 | 130 | // Dividing this loop into multiple loops starting at different starts 131 | // to take advantage of concurrency. 132 | // Every core will process about the same segment of the table. 133 | // int segLength = ceil(KmerHashTable::maxTableSize / (double) Util::CORE_NUM); 134 | 135 | 136 | std::pair segInfo = getSegmentLengthAndCount(); 137 | # pragma omp parallel for num_threads(Util::CORE_NUM) schedule(dynamic) 138 | for(I hani = 0; hani < std::get<1>(segInfo); hani++){ 139 | // Set the boundaries of the inner loop 140 | I segLength = std::get<0>(segInfo); 141 | I haniStart = hani * segLength; 142 | I haniLimit = haniStart + segLength; 143 | if(haniLimit > KmerHashTable::maxTableSize){ 144 | haniLimit = KmerHashTable::maxTableSize; 145 | } 146 | 147 | # pragma omp critical 148 | { 149 | // cout << "Processing " << y << " keys out of "; 150 | // cout << KmerHashTable::maxTableSize; 151 | // cout << endl; 152 | cout << "Processing " << haniStart << " to " << haniLimit-1; 153 | cout << endl; 154 | } 155 | 156 | // Make a zero in quaternary form as a string of length k. 
157 | // string q(""); 158 | // for (int x = 0; x < KmerHashTable::k; x++) { 159 | // q.append(1, 0); 160 | // } 161 | string q = convertToQuaternary(haniStart); 162 | double lowerP; 163 | double upperP; /* NOTE(review): lowerP and upperP are only assigned when y % 4 == 0, so haniStart is assumed to be a multiple of 4 */ 164 | for (I y = haniStart; y < haniLimit; y++) { 165 | //if (y % 10000000 == 0) { 166 | // # pragma omp critical 167 | // { 168 | // // cout << "Processing " << y << " keys out of "; 169 | // // cout << KmerHashTable::maxTableSize; 170 | // // cout << endl; 171 | // cout << "Processing " << haniStart << " to " << haniLimit-1; 172 | // cout << endl; 173 | // } 174 | //} 175 | 176 | const char * qc = q.c_str(); 177 | 178 | // Calculate the expected number of occurrences. 179 | 180 | // a. Calculate probability from lower order models. 181 | // Lower probabilities are the same for four consecutive words of length of k-1 182 | if (y % 4 == 0) { 183 | lowerP = 1.0; 184 | for (int m = 0; m < modelNumber - 1; m++) { 185 | KmerHashTable * oTable = modelList->at(m); 186 | lowerP *= (((double) oTable->valueOf(qc, 0)) / factor); 187 | } 188 | } 189 | 190 | // b. Calculate probability based on the specified order. 191 | KmerHashTable * oTable = modelList->at(modelNumber - 1); 192 | int resultsSize = KmerHashTable::k - o - 1; 193 | 194 | // Upper probabilities are the same for four consecutive words of length of k-1 195 | // The scanning of words of length corresponding to the highest order + 1 196 | // This step is not needed if k = o + 1, i.e. resultsSize = 0. 
197 | if (y % 4 == 0) { 198 | if (resultsSize > 0) { 199 | // Initialize the elements of the vector with an invalid sentinel value 200 | vector results = vector(resultsSize, -987); 201 | oTable->wholesaleValueOf(qc, 0, resultsSize - 1, &results, 0); 202 | 203 | upperP = 1.0; 204 | for (int i = 0; i < resultsSize; i++) { 205 | upperP *= (((double) results.at(i)) / factor); 206 | } 207 | results.clear(); 208 | 209 | } else { 210 | upperP = 1.0; 211 | } 212 | } 213 | 214 | // The expected number of occurrences 215 | double exp = l * lowerP * upperP 216 | * (((double) oTable->valueOf(qc, resultsSize)) / factor); 217 | 218 | // Calculate the enrichment value. 219 | // Log value 220 | // values[y] = round((log((double) values[y] + 1.0) - log(exp + 1.0))); 221 | 222 | // Raw value 223 | // Requirement: if observed is >= minObs && observed > expected then the value is the difference 224 | // otherwise the value is zero 225 | 226 | V observed = KmerHashTable::values[y]; 227 | 228 | if (observed >= minObs && observed > exp) { 229 | KmerHashTable::values[y] = round(observed - exp); 230 | } else { 231 | KmerHashTable::values[y] = 0; 232 | } 233 | 234 | /* 235 | KmerHashTable::values[y] = 236 | round( 237 | (((double) KmerHashTable::values[y] + 1.0) 238 | / (exp + 1.0))); 239 | */ 240 | 241 | // Increment the quaternary number: 242 | // 1 - guard against overflow. 243 | // @@ Need to review this part (prepending a digit makes q one character longer once its leading digit equals base - 1) 244 | if (q[0] == base - 1) { 245 | string z(""); 246 | z.append(1, 0); 247 | q = z + q; 248 | } 249 | 250 | // 2 - increment the quaternary number by 1. 
251 | int qLen = q.size(); 252 | for (int i = qLen - 1; i >= 0; i--) { 253 | if (q[i] + 1 < base) { 254 | q[i] = q[i] + 1; 255 | break; 256 | } else { 257 | q[i] = 0; 258 | } 259 | } 260 | } 261 | } 262 | } 263 | 264 | } /* namespace nonltr */ 265 | -------------------------------------------------------------------------------- /src_2.0/nonltr/EnrichmentMarkovView.h: -------------------------------------------------------------------------------- 1 | /* 2 | * EnrichmentMarkovView.h 3 | * 4 | * Created on: Apr 17, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef ENRICHMENTMARKOVVIEW_H_ 9 | #define ENRICHMENTMARKOVVIEW_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "KmerHashTable.h" 18 | #include "../utility/Util.h" 19 | #include "../exception/InvalidInputException.h" 20 | 21 | using namespace std; 22 | using namespace utility; 23 | using namespace exception; 24 | 25 | namespace nonltr { 26 | 27 | template 28 | class EnrichmentMarkovView: public KmerHashTable{ 29 | 30 | private: 31 | // Minimum observed count a k-mer needs before processTable() gives it a non-zero value 32 | const int minObs; 33 | 34 | // This template specification should work up to order of 14, 35 | // i.e. word length = 15 36 | vector *> * modelList; 37 | 38 | // Markov order 39 | int o; 40 | 41 | // Total length of the counted sequence(s); accumulated by count() 42 | long l; 43 | 44 | // Probabilities are stored multiplied by this factor 45 | // Equivalent to four decimal points. 46 | const double factor; // = 10000.00; 47 | 48 | // Initialize data members 49 | void initialize(int); 50 | 51 | /** 52 | * Based on code from 53 | * //www.geeksforgeeks.org/convert-base-decimal-vice-versa/ 54 | */ 55 | string convertToQuaternary(I); 56 | 57 | /** 58 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 59 | */ 60 | inline double round(double number) { 61 | return number < 0.0 ? 
ceil(number - 0.5) : floor(number + 0.5); 62 | } 63 | 64 | public: 65 | EnrichmentMarkovView(int, int, int); 66 | EnrichmentMarkovView(int, V, int, int); 67 | virtual ~EnrichmentMarkovView(); 68 | 69 | void count(const char *, int, int); 70 | void generateProbapilities(); 71 | void processTable(); 72 | }; 73 | } /* namespace nonltr */ 74 | 75 | #include "EnrichmentMarkovView.cpp" 76 | 77 | #endif /* ENRICHMENTMARKOVVIEW_H_ */ 78 | -------------------------------------------------------------------------------- /src_2.0/nonltr/HMM.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * HMM.cpp 3 | * 4 | * Created on: Jun 21, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "HMM.h" 9 | 10 | #include 11 | #include 12 | 13 | #include "../utility/Util.h" 14 | #include "../exception/InvalidStateException.h" 15 | #include "../exception/InvalidInputException.h" 16 | #include "../exception/FileDoesNotExistException.h" 17 | #include "../exception/InvalidOperationException.h" 18 | 19 | using namespace std; 20 | using namespace utility; 21 | using namespace exception; 22 | 23 | namespace nonltr { 24 | /* Deep-copies the prior, output and transition tables of other. NOTE(review): logBase is not copied, and normalized is set to true whatever the state of other; confirm that both are intended. */ 25 | HMM::HMM(HMM& other) { 26 | oList = new vector(*other.getOList()); 27 | pList = new vector(*other.getPList()); 28 | tList = new vector *>(); 29 | auto otherTList = other.getTList(); 30 | for (unsigned int i = 0; i < otherTList->size(); i++) { 31 | tList->push_back(new vector(*otherTList->at(i))); 32 | } 33 | 34 | stateNumber = other.getStateNumber(); 35 | positiveStateNumber = other.getPositiveStateNumber(); 36 | base = other.getBase(); 37 | minusInf = other.getMinusInf(); 38 | normalized = true; 39 | } 40 | 41 | HMM::HMM(string hmmFile) { 42 | normalized = false; 43 | 44 | // ToDo: Fix this operation. The constructor always throws here, so the parsing code below is currently unreachable. 45 | string msg("Reading HMM from file is temporarily disabled."); 46 | throw InvalidOperationException(msg); 47 | 48 | ifstream in(hmmFile.c_str()); 49 | in.precision(PRECISION); 50 | 51 | if (in) { 52 | string 
token; 53 | bool isLogBase = false; 54 | bool isStates = false; 55 | bool isPriors = false; 56 | bool isTransition = false; 57 | 58 | while (in >> token) { 59 | if (isLogBase) { 60 | base = atof(token.c_str()); 61 | 62 | checkBase(base); 63 | 64 | logBase = log(base); 65 | isLogBase = false; 66 | } else if (isStates) { 67 | stateNumber = atoi(token.c_str()); 68 | positiveStateNumber = stateNumber / 2; 69 | initializeHelper(); 70 | 71 | isStates = false; 72 | } else if (isPriors) { 73 | // Skip the remaining state names (the first one is already in token) 74 | for (int i = 1; i < stateNumber; i++) { 75 | in >> token; 76 | } 77 | for (int i = 0; i < stateNumber; i++) { 78 | in >> token; 79 | (*pList)[i] = atof(token.c_str()); 80 | } 81 | 82 | isPriors = false; 83 | } else if (isTransition) { 84 | // Skip the remaining state names (the first one is already in token) 85 | for (int i = 1; i < stateNumber; i++) { 86 | in >> token; 87 | } 88 | 89 | for (int i = 0; i < stateNumber; i++) { 90 | //Skip the name of the state at the beginning of the line 91 | for (int j = -1; j < stateNumber; j++) { 92 | in >> token; 93 | if (j > -1) { 94 | (*(tList->at(i)))[j] = atof(token.c_str()); 95 | } 96 | } 97 | } 98 | 99 | isTransition = false; 100 | } 101 | 102 | if (token.compare("Base") == 0) { 103 | isLogBase = true; 104 | } else if (token.compare("States") == 0) { 105 | isStates = true; 106 | } else if (token.compare("Priors") == 0) { 107 | isPriors = true; 108 | } else if (token.compare("Transition") == 0) { 109 | isTransition = true; 110 | } 111 | } 112 | 113 | in.close(); 114 | } else { 115 | string msg(hmmFile); 116 | msg.append(" does not exist."); 117 | throw FileDoesNotExistException(msg); 118 | } 119 | in.close(); 120 | } 121 | 122 | /** 123 | * Use this constructor to train on the entire genome. 124 | * The client has to call train on each chromosome. 125 | * base is the threshold. 
126 | 127 | */ 128 | HMM::HMM(double base, int stateNumber) : 129 | PRECISION(numeric_limits::digits10 + 1) { 130 | normalized = false; 131 | initialize(base, stateNumber); 132 | } 133 | 134 | void HMM::initialize(double baseIn, int stateNumberIn) { 135 | base = baseIn; 136 | checkBase(base); 137 | 138 | logBase = log(baseIn); 139 | 140 | stateNumber = stateNumberIn; 141 | // Make sure that the number of states is even and non-zero; negative even values are rejected later by initializeHelper() 142 | if (stateNumber % 2 != 0 || stateNumber == 0) { 143 | string msg("The number of states must be even and > zero."); 144 | throw InvalidInputException(msg); 145 | } 146 | 147 | positiveStateNumber = stateNumber / 2; 148 | initializeHelper(); 149 | } 150 | 151 | /** 152 | * This method makes sure that the base is not zero (its magnitude must be at least the machine epsilon). 153 | */ 154 | void HMM::checkBase(double base) { 155 | if (fabs(base - 0.0) < std::numeric_limits::epsilon()) { 156 | string msg("The base cannot be zero because log(base) is not defined."); 157 | throw InvalidInputException(msg); 158 | } 159 | } 160 | 161 | void HMM::initializeHelper() { 162 | // Ensure that the number of the states is positive 163 | if (stateNumber < 1) { 164 | string msg("The number of states must be positive."); 165 | throw InvalidStateException(msg); 166 | } 167 | /* All prior, transition and output counts start at 1 rather than 0. */ 168 | pList = new vector(stateNumber, 1); 169 | tList = new vector *>; 170 | for (int i = 0; i < stateNumber; i++) { 171 | tList->push_back(new vector(stateNumber, 1)); 172 | } 173 | oList = new vector(stateNumber, 1); 174 | 175 | // Check if infinity can be handled 176 | if (!std::numeric_limits::has_infinity) { 177 | string msg("This compiler does not handle infinite values. 
"); 178 | msg.append(string("The decoding algorithm will not function.")); 179 | throw InvalidStateException(msg); 180 | } else { 181 | minusInf = -1.0 * std::numeric_limits::infinity(); 182 | } 183 | } 184 | 185 | HMM::~HMM() { 186 | pList->clear(); 187 | delete pList; 188 | 189 | Util::deleteInVector(tList); 190 | delete tList; 191 | 192 | oList->clear(); 193 | delete oList; 194 | } 195 | 196 | void HMM::train(vector * scoreListIn, 197 | const vector *> * segmentListIn, 198 | const vector * candidateListIn) { 199 | 200 | scoreList = scoreListIn; 201 | segmentList = segmentListIn; 202 | candidateList = candidateListIn; 203 | 204 | int candidateCount = candidateList->size(); 205 | if (candidateCount > 0) { 206 | int firstCandIndex = 0; 207 | int lastCandIndex = 0; 208 | int segmentNumber = segmentList->size(); 209 | for (int i = 0; i < segmentNumber; i++) { 210 | vector * s = segmentList->at(i); 211 | ILocation * c = candidateList->at(firstCandIndex); 212 | // A segment may have no detections 213 | if (Util::isOverlapping(s->at(0), s->at(1), c->getStart(), 214 | c->getEnd())) { 215 | lastCandIndex = trainHelper1(s->at(0), s->at(1), 216 | firstCandIndex); 217 | trainHelper2(s->at(0), s->at(1), firstCandIndex, lastCandIndex); 218 | firstCandIndex = lastCandIndex + 1; 219 | if (firstCandIndex >= candidateCount) { 220 | break; 221 | } 222 | } 223 | } 224 | } 225 | scoreList = NULL; 226 | } 227 | 228 | int HMM::trainHelper1(int segStart, int segEnd, int firstCandIndex) { 229 | 230 | ILocation * cand = candidateList->at(firstCandIndex); 231 | if (!Util::isOverlapping(segStart, segEnd, cand->getStart(), 232 | cand->getEnd())) { 233 | string msg("The first candidate is not overlapping with the segment. 
"); 234 | msg.append("Candidate location is: "); 235 | msg.append(cand->toString()); 236 | msg.append(" Segment location is: "); 237 | msg.append(Util::int2string(segStart)); 238 | msg.append("-"); 239 | msg.append(Util::int2string(segEnd)); 240 | throw InvalidInputException(msg); 241 | } 242 | 243 | int lastCandIndex = -1; 244 | int candidateNumber = candidateList->size(); 245 | for (int c = firstCandIndex; c < candidateNumber; c++) { 246 | ILocation * cand = candidateList->at(c); 247 | if (Util::isOverlapping(segStart, segEnd, cand->getStart(), 248 | cand->getEnd())) { 249 | lastCandIndex = c; 250 | } else { 251 | break; 252 | } 253 | } 254 | 255 | if (lastCandIndex < 0) { 256 | string msg("The index of the last candidate cannot be negative."); 257 | throw InvalidStateException(msg); 258 | } 259 | 260 | return lastCandIndex; 261 | } 262 | 263 | void HMM::trainHelper2(int segStart, int segEnd, int firstCandIndex, 264 | int lastCandIndex) { 265 | 266 | ILocation * f = candidateList->at(firstCandIndex); 267 | 268 | // First negative region if present 269 | int fStart = f->getStart(); 270 | if (fStart > segStart) { 271 | trainNegative(segStart, fStart - 1); 272 | 273 | move(getNgtvState(fStart - 1), getPstvState(fStart)); 274 | } 275 | 276 | // Alternating positive and negative regions 277 | for (int i = firstCandIndex; i < lastCandIndex; i++) { 278 | ILocation * c = candidateList->at(i); 279 | int cStart = c->getStart(); 280 | int cEnd = c->getEnd(); 281 | trainPositive(cStart, cEnd); 282 | 283 | move(getPstvState(cEnd), getNgtvState(cEnd + 1)); 284 | 285 | int nextStart = candidateList->at(i + 1)->getStart(); 286 | trainNegative(cEnd + 1, nextStart - 1); 287 | move(getNgtvState(nextStart - 1), getPstvState(nextStart)); 288 | } 289 | 290 | // Last positive region 291 | ILocation * l = candidateList->at(lastCandIndex); 292 | int lEnd = l->getEnd(); 293 | trainPositive(l->getStart(), lEnd); 294 | 295 | // Last negative region if present 296 | if (segEnd > lEnd) { 297 
| move(getPstvState(lEnd), getNgtvState(lEnd + 1)); 298 | trainNegative(lEnd + 1, segEnd); 299 | } 300 | } 301 | 302 | void HMM::trainPositive(int s, int e) { 303 | int pIndex = getPstvState(s); 304 | (*pList)[pIndex] = pList->at(pIndex) + 1; 305 | for (int i = s; i <= e; i++) { 306 | int index = getPstvState(i); 307 | (*oList)[index] = oList->at(index) + 1; 308 | } 309 | 310 | for (int i = s; i < e; i++) { 311 | move(getPstvState(i), getPstvState(i + 1)); 312 | } 313 | } 314 | 315 | void HMM::trainNegative(int s, int e) { 316 | int pIndex = getNgtvState(s); 317 | (*pList)[pIndex] = pList->at(pIndex) + 1; 318 | 319 | for (int i = s; i <= e; i++) { 320 | int index = getNgtvState(i); 321 | //Error in below line (index out of bounds somehow). 322 | (*oList)[index] = oList->at(index) + 1; 323 | } 324 | for (int i = s; i < e; i++) { 325 | move(getNgtvState(i), getNgtvState(i + 1)); 326 | } 327 | } 328 | 329 | void HMM::move(int state1, int state2) { 330 | vector * state1Row = tList->at(state1); 331 | (*state1Row)[state2] = state1Row->at(state2) + 1; 332 | } 333 | 334 | void HMM::normalize() { 335 | if (normalized) { 336 | cerr << "HMM already normalized. Exiting..." 
<< endl; 337 | exit(0); 338 | } 339 | // Priors 340 | double sum = 0.0; 341 | for (int i = 0; i < stateNumber; i++) { 342 | sum += pList->at(i); 343 | } 344 | for (int i = 0; i < stateNumber; i++) { 345 | (*pList)[i] = log(pList->at(i) / sum); 346 | } 347 | 348 | // Output 349 | for (int i = 0; i < stateNumber; i++) { 350 | (*oList)[i] = log(1.0); 351 | } 352 | 353 | // Transition 354 | for (int i = 0; i < stateNumber; i++) { 355 | vector * row = tList->at(i); 356 | double sum = 0.0; 357 | for (int j = 0; j < stateNumber; j++) { 358 | sum += row->at(j); 359 | } 360 | 361 | for (int j = 0; j < stateNumber; j++) { 362 | (*row)[j] = log(row->at(j) / sum); 363 | } 364 | } 365 | normalized = true; 366 | } 367 | 368 | void HMM::print() { 369 | cout.precision(PRECISION); 370 | 371 | // State names 372 | vector v; 373 | for (int j = 0; j < positiveStateNumber; j++) { 374 | v.push_back(Util::int2string(j)); 375 | } 376 | string m("-"); 377 | for (int j = 0; j < positiveStateNumber; j++) { 378 | v.push_back(m + Util::int2string(j)); 379 | } 380 | 381 | cout << "Priors:" << endl; 382 | for (int g = 0; g < 2; g++) { 383 | for (int i = 0; i < positiveStateNumber; i++) { 384 | cout << v.at(i + (g * positiveStateNumber)) << "\t"; 385 | } 386 | 387 | for (int i = 0; i < positiveStateNumber; i++) { 388 | cout << pList->at(i + (g * positiveStateNumber)) << "\t"; 389 | } 390 | cout << endl; 391 | } 392 | cout << endl; 393 | 394 | cout << "Transition:" << endl << "\t"; 395 | for (unsigned int i = 0; i < v.size(); i++) { 396 | cout << v.at(i) << "\t"; 397 | } 398 | cout << endl; 399 | 400 | for (int i = 0; i < stateNumber; i++) { 401 | vector * row = tList->at(i); 402 | cout << v.at(i) << "\t"; 403 | for (int j = 0; j < stateNumber; j++) { 404 | cout << row->at(j) << "\t"; 405 | } 406 | cout << endl; 407 | } 408 | cout << endl << endl; 409 | } 410 | 411 | void HMM::print(string hmo) { 412 | ofstream out(hmo.c_str()); 413 | out.precision(PRECISION); 414 | 415 | out << "Base" << endl << 
base << endl; 416 | 417 | out << "States" << endl << stateNumber << endl; 418 | 419 | vector v; 420 | for (int j = 0; j < positiveStateNumber; j++) { 421 | v.push_back(Util::int2string(j)); 422 | } 423 | string m("-"); 424 | for (int j = 0; j < positiveStateNumber; j++) { 425 | v.push_back(m + Util::int2string(j)); 426 | } 427 | 428 | out << "Priors" << endl; 429 | for (unsigned int i = 0; i < v.size(); i++) { 430 | out << v.at(i) << "\t"; 431 | } 432 | out << endl; 433 | 434 | for (unsigned int i = 0; i < v.size(); i++) { 435 | out << pList->at(i) << "\t"; 436 | } 437 | out << endl; 438 | 439 | out << "Transition" << endl << "\t"; 440 | for (unsigned int i = 0; i < v.size(); i++) { 441 | out << v.at(i) << "\t"; 442 | } 443 | out << endl; 444 | 445 | for (int i = 0; i < stateNumber; i++) { 446 | vector * row = tList->at(i); 447 | out << v.at(i) << "\t"; 448 | for (int j = 0; j < stateNumber; j++) { 449 | out << row->at(j) << "\t"; 450 | } 451 | out << endl; 452 | } 453 | out << endl << endl; 454 | 455 | out.close(); 456 | } 457 | 458 | /** 459 | * Finds the highest-scoring state path for positions rStart to rEnd (both inclusive) and appends one state per position to the end of the input state list. 460 | * This method returns the log likelihood of that path. 461 | */ 462 | double HMM::decode(int rStart, int rEnd, vector * scoreListIn, 463 | vector& stateList) { 464 | scoreList = scoreListIn; 465 | 466 | // Make sure that the coordinates represent valid location (check is not used afterwards; presumably the Location constructor validates the range) 467 | Location check(rStart, rEnd); 468 | // End check 469 | 470 | vector > v(stateNumber); 471 | int size = rEnd - rStart + 1; 472 | for (int i = 0; i < stateNumber; i++) { 473 | v[i] = vector(size, minusInf); 474 | } 475 | 476 | vector > p(stateNumber); 477 | for (int i = 0; i < stateNumber; i++) { 478 | p[i] = vector(size, -1); 479 | } 480 | 481 | // Initialize 482 | int firstPstvState = getPstvState(rStart); 483 | int firstNgtvState = positiveStateNumber + firstPstvState; 484 | v[firstPstvState][0] = pList->at(firstPstvState); 485 | v[firstNgtvState][0] = pList->at(firstNgtvState); 486 | 487 
| // Recurse: at every position only the positive and the negative state of the current score are updated 488 | for (int i = rStart + 1; i <= rEnd; i++) { 489 | int vIndex = i - rStart; 490 | 491 | // Obtain states from scores 492 | int pPstvState = getPstvState(i - 1); 493 | int pNgtvState = positiveStateNumber + pPstvState; 494 | int cPstvState = getPstvState(i); 495 | int cNgtvState = positiveStateNumber + cPstvState; 496 | 497 | // Set positive state 498 | double p1 = v[pPstvState][vIndex - 1] 499 | + (*(*tList)[pPstvState])[cPstvState]; 500 | double p2 = v[pNgtvState][vIndex - 1] 501 | + (*(*tList)[pNgtvState])[cPstvState]; 502 | if (p1 > p2) { 503 | v[cPstvState][vIndex] = p1; 504 | p[cPstvState][vIndex] = pPstvState; 505 | } else { 506 | v[cPstvState][vIndex] = p2; 507 | p[cPstvState][vIndex] = pNgtvState; 508 | } 509 | 510 | // Set negative state 511 | double p3 = v[pPstvState][vIndex - 1] 512 | + (*(*tList)[pPstvState])[cNgtvState]; 513 | double p4 = v[pNgtvState][vIndex - 1] 514 | + (*(*tList)[pNgtvState])[cNgtvState]; 515 | if (p3 > p4) { 516 | v[cNgtvState][vIndex] = p3; 517 | p[cNgtvState][vIndex] = pPstvState; 518 | } else { 519 | v[cNgtvState][vIndex] = p4; 520 | p[cNgtvState][vIndex] = pNgtvState; 521 | } 522 | } 523 | 524 | // Decode: pick the best final state, then follow the back-pointers stored in p 525 | int lastBestState = 0; 526 | double lastBestValue = v[0][size - 1]; 527 | for (int i = 1; i < stateNumber; i++) { 528 | double currentValue = v[i][size - 1]; 529 | if (currentValue > lastBestValue) { 530 | lastBestState = i; 531 | lastBestValue = currentValue; 532 | } 533 | } 534 | 535 | int stateListOriginalSize = stateList.size(); 536 | for (int i = stateListOriginalSize; i < stateListOriginalSize + size; i++) { 537 | stateList.push_back(-1); 538 | } 539 | 540 | stateList[stateListOriginalSize + size - 1] = lastBestState; 541 | for (int i = size - 1; i > 0; i--) { 542 | lastBestState = p[lastBestState][i]; 543 | stateList[stateListOriginalSize + i - 1] = lastBestState; 544 | } 545 | 546 | // Make sure that no state in the results has the value of -1 547 | for (int i = stateListOriginalSize; i < 
stateListOriginalSize + size; i++) { 548 | if (stateList[i] == -1) { 549 | string msg("At least one state was not determined properly."); 550 | throw InvalidStateException(msg); 551 | } 552 | } 553 | 554 | return lastBestValue; 555 | } 556 | 557 | /** 558 | * Decodes rStart to rEnd and appends one new Location per run of consecutive positive states (state index < positiveStateNumber) at the end of regionList. Returns the log likelihood. 559 | */ 560 | double HMM::decode(int rStart, int rEnd, vector * scoreListIn, 561 | vector& regionList) { 562 | 563 | vector stateList; 564 | double logLikelihood = decode(rStart, rEnd, scoreListIn, stateList); 565 | 566 | int size = stateList.size(); 567 | bool inRpt = false; 568 | bool canFill = false; 569 | int s = -1; 570 | int e = -1; 571 | 572 | for (int i = 0; i < size; i++) { 573 | // Start a new repeat 574 | if (stateList.at(i) < positiveStateNumber && !inRpt) { 575 | inRpt = true; 576 | s = i; 577 | } 578 | // End the current repeat 579 | else if (stateList.at(i) >= positiveStateNumber && inRpt) { 580 | e = i - 1; 581 | inRpt = false; 582 | canFill = true; 583 | } 584 | // If the current repeat reaches the end of the segment (NOTE(review): a repeat that starts on the last position is opened by the first branch and never emitted) 585 | else if (i == size - 1 && inRpt) { 586 | e = i; 587 | inRpt = false; 588 | canFill = true; 589 | } 590 | // Extract features of the just recognized repeat 591 | if (canFill) { 592 | regionList.push_back(new Location(s + rStart, e + rStart)); 593 | s = -1; 594 | e = -1; 595 | canFill = false; 596 | } 597 | } 598 | 599 | return logLikelihood; 600 | } 601 | 602 | /** 603 | * Append positive regions at the end of regionList (same post-processing as decode, but on the path produced by decodeNew) 604 | */ 605 | double HMM::decodeNew(int rStart, int rEnd, vector * scoreListIn, 606 | vector& regionList) { 607 | 608 | vector stateList; 609 | double logLikelihood = decodeNew(rStart, rEnd, scoreListIn, stateList); 610 | 611 | int size = stateList.size(); 612 | bool inRpt = false; 613 | bool canFill = false; 614 | int s = -1; 615 | int e = -1; 616 | 617 | for (int i = 0; i < size; i++) { 618 | // Start a new repeat 619 | if (stateList.at(i) < positiveStateNumber && !inRpt) { 620 | inRpt = true; 621 | s = i; 622 
| } 623 | // End the current repeat 624 | else if (stateList.at(i) >= positiveStateNumber && inRpt) { 625 | e = i - 1; 626 | inRpt = false; 627 | canFill = true; 628 | } 629 | // If the current repeat reaches the end of the segment 630 | else if (i == size - 1 && inRpt) { 631 | e = i; 632 | inRpt = false; 633 | canFill = true; 634 | } 635 | // Extract features of the just recognized repeat 636 | if (canFill) { 637 | regionList.push_back(new Location(s + rStart, e + rStart)); 638 | s = -1; 639 | e = -1; 640 | canFill = false; 641 | } 642 | } 643 | 644 | return logLikelihood; 645 | } 646 | 647 | // Replacement Viterbi algorithm function. Overwrites stateList with one state per position of rStart..rEnd (inclusive) and returns the log likelihood of the best path. The HMM must have been normalized first. 648 | // Caused a crash on chromosome 20 but worked on 21: stateList used to be sized one element short of the positions written during backtracking. 649 | double HMM::decodeNew(int rStart, int rEnd, vector * scoreListIn, 650 | vector& stateList) { 651 | if (!normalized) { 652 | cerr 653 | << "Invalid operation: HMM cannot be traversed until after it is normalized. Exiting..." 654 | << endl; 655 | exit(1); 656 | } 657 | // print(); 658 | scoreList = scoreListIn; 659 | int len = rEnd - rStart; 660 | stateList.resize(len + 1); 661 | 662 | // Make sure that the coordinates represent valid location (copied from current method) 663 | Location check(rStart, rEnd); 664 | // End check 665 | 666 | //this is a simple list of the probabilities and previous states: entry 2 * j is the positive state and entry 2 * j + 1 the negative state of position rStart + j 667 | //the list will hold pairs of the score, and a boolean marking + or -, which will indicate whether the state came from a + or - previous state 668 | vector > stateProbs(len * 2 + 2); 669 | 670 | //start the list by adding the prior probabilities 671 | int firstPstvState = getPstvState(rStart); 672 | int firstNgtvState = positiveStateNumber + firstPstvState; 673 | 674 | stateProbs.at(0).first = pList->at(firstPstvState); 675 | stateProbs.at(1).first = pList->at(firstNgtvState); 676 | 677 | //compute the most likely transition at each point and record if it was from + or - 678 | // int score, j; 679 | int j; 680 | double p1, p2, p3, p4; 681 | 682 | for (int i = rStart + 1; i <= 
rEnd; i++) { 683 | j = i - rStart; 684 | 685 | int pPstvState = getPstvState(i - 1); 686 | int pNgtvState = positiveStateNumber + pPstvState; 687 | int cPstvState = getPstvState(i); 688 | int cNgtvState = positiveStateNumber + cPstvState; 689 | 690 | p1 = stateProbs.at(2 * j - 2).first 691 | + (*(*tList)[pPstvState])[cPstvState]; 692 | 693 | p2 = stateProbs.at(2 * j - 1).first 694 | + (*(*tList)[pNgtvState])[cPstvState]; 695 | 696 | if (p1 > p2) { 697 | stateProbs.at(2 * j).first = p1; 698 | stateProbs.at(2 * j).second = true; 699 | } else { 700 | stateProbs.at(2 * j).first = p2; 701 | stateProbs.at(2 * j).second = false; 702 | } 703 | //- to - probability *check this syntax* 704 | p3 = stateProbs.at(2 * j - 1).first 705 | + (*(*tList)[pNgtvState])[cNgtvState]; 706 | 707 | //+ to - probability *check this syntax* 708 | p4 = stateProbs.at(2 * j - 2).first 709 | + (*(*tList)[pPstvState])[cNgtvState]; 710 | 711 | if (p4 > p3) { 712 | stateProbs.at(2 * j + 1).first = p4; 713 | stateProbs.at(2 * j + 1).second = true; 714 | } else { 715 | stateProbs.at(2 * j + 1).first = p3; 716 | stateProbs.at(2 * j + 1).second = false; 717 | } 718 | } 719 | //backtrack through the table to generate the complete state list; the first write goes to index len, which is why stateList holds len + 1 elements 720 | int k = len * 2; 721 | if (stateProbs[k].first < stateProbs[len * 2 + 1].first) { 722 | k += 1; 723 | } 724 | bool sign = stateProbs[k].second; 725 | while (k >= 0) { 726 | 727 | if (k % 2 == 0) { 728 | //adds the positive state of this index's emission to the state list. *ask if format is good* 729 | stateList[k / 2] = scoreList->at(rStart + k / 2); 730 | if (sign) { 731 | k -= 2; 732 | } else { 733 | k -= 1; 734 | } 735 | } else { 736 | //adds the negative state of this index's emission to the state list. 
*ask if format is good* 737 | stateList[(k - 1) / 2] = scoreList->at(rStart + (k) / 2) 738 | + positiveStateNumber; 739 | if (sign) { 740 | k -= 3; 741 | } else { 742 | k -= 2; 743 | } 744 | } 745 | if (k > -1) { 746 | sign = stateProbs[k].second; 747 | } 748 | 749 | } 750 | if (stateProbs[len * 2 + 1].first > stateProbs[len * 2].first) { 751 | return stateProbs[len * 2 + 1].first; 752 | } else { 753 | return stateProbs[len * 2].first; 754 | } 755 | /* An unreachable delete of scoreList was removed here: both branches above return, and the score list belongs to the caller. */ 756 | } 757 | 758 | vector* HMM::getOList() { 759 | return oList; 760 | } 761 | 762 | vector*>* HMM::getTList() { 763 | return tList; 764 | } 765 | 766 | vector* HMM::getPList() { 767 | return pList; 768 | } 769 | 770 | int HMM::getStateNumber() { 771 | return stateNumber; 772 | } 773 | 774 | int HMM::getPositiveStateNumber() { 775 | return positiveStateNumber; 776 | } 777 | 778 | double HMM::getBase() { 779 | return base; 780 | } 781 | 782 | double HMM::getMinusInf() { 783 | return minusInf; 784 | } 785 | 786 | } 787 | /* namespace nonltr */ 788 | -------------------------------------------------------------------------------- /src_2.0/nonltr/HMM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * HMM.h 3 | * 4 | * Created on: Jun 21, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef HMM_H_ 9 | #define HMM_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "../utility/ILocation.h" 18 | 19 | using namespace std; 20 | using namespace utility; 21 | 22 | namespace nonltr { 23 | 24 | class HMM { 25 | private: 26 | const int PRECISION = numeric_limits::digits10 + 1; 27 | double minusInf; 28 | bool normalized; 29 | vector * pList; 30 | vector *> * tList; 31 | vector * oList; 32 | 33 | void initializeHelper(); 34 | // Returns the index of the last candidate in the segment 35 | int trainHelper1(int, int, int); 36 | void trainHelper2(int, int, int, int); 37 | void trainPositive(int, int); 38 | void 
trainNegative(int, int); 39 | void move(int, int); 40 | void checkBase(double); 41 | 42 | inline int getPstvState(int index) { 43 | int state = scoreList->at(index); 44 | if (state > (stateNumber - 2) / 2) { 45 | state = (stateNumber - 2) / 2; 46 | } 47 | return state; 48 | } 49 | 50 | inline int getNgtvState(int index) { 51 | int state = scoreList->at(index); 52 | if (state > (stateNumber - 2) / 2) { 53 | state = (stateNumber - 2) / 2; 54 | } 55 | return state + positiveStateNumber; 56 | } 57 | 58 | protected: 59 | double base; 60 | double logBase; 61 | int stateNumber; 62 | int positiveStateNumber; 63 | 64 | vector * scoreList; 65 | const vector *> * segmentList; 66 | const vector * candidateList; 67 | 68 | void initialize(double, int); 69 | /** 70 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 71 | */ 72 | inline double round(double number) { 73 | return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); 74 | } 75 | 76 | public: 77 | HMM(string); // Build a model from file 78 | HMM(double, int); 79 | HMM(HMM&); 80 | virtual ~HMM(); 81 | void train(vector *, const vector *> *, 82 | const vector *); 83 | void normalize(); 84 | double decode(int, int, vector *, vector&); 85 | double decode(int, int, vector *, vector&); 86 | double decodeNew(int, int, vector *, vector&); 87 | double decodeNew(int, int, vector *, vector&); 88 | 89 | void print(); 90 | void print(string); 91 | 92 | vector * getPList(); 93 | vector *> * getTList(); 94 | vector * getOList(); 95 | double getBase(); 96 | int getStateNumber(); 97 | int getPositiveStateNumber(); 98 | double getMinusInf(); 99 | 100 | }; 101 | 102 | } /* namespace nonltr */ 103 | 104 | #endif /* HMM_H_ */ 105 | -------------------------------------------------------------------------------- /src_2.0/nonltr/IChromosome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * IChromosome.h 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria 
/**
 * Interface of a chromosome.
 *
 * The virtual destructor makes it safe to destroy an implementation through
 * an IChromosome pointer.
 */
class IChromosome {
public:
	virtual ~IChromosome() = default;

	/** Returns a pointer to the base string. */
	virtual const std::string* getBase() = 0;

	/**
	 * Returns the list of segments.
	 * NOTE(review): callers read elements 0 and 1 of each segment as its
	 * start and end - confirm against the implementations.
	 */
	virtual const std::vector<std::vector<int> *> * getSegment() = 0;

	/** Returns the header. */
	virtual std::string getHeader() = 0;
};
using namespace nonltr; 19 | using namespace utility; 20 | 21 | template 22 | KmerHashTable::KmerHashTable(int keyLength) { 23 | initialize(keyLength, 0); 24 | } 25 | 26 | template 27 | KmerHashTable::KmerHashTable(int keyLength, V initValue) { 28 | initialize(keyLength, initValue); 29 | } 30 | 31 | template 32 | void KmerHashTable::initialize(int keyLength, V initialValueIn) { 33 | /* 34 | if (keyLength > maxKeyLength) { 35 | string msg = "The maximum size (k) of the k-mer is "; 36 | char temp[3]; 37 | sprintf(temp, "%d", maxKeyLength); 38 | msg += temp; 39 | throw InvalidInputException(msg); 40 | } 41 | */ 42 | 43 | k = keyLength; 44 | initialValue = initialValueIn; 45 | 46 | // Initialize bases 47 | bases = new I[k]; 48 | for (int i = k - 1; i >= 0; i--) { 49 | bases[k - 1 - i] = (I) pow(4.0, i); 50 | } 51 | 52 | // Initialize mMinusOne 53 | mMinusOne = new I[4]; 54 | for (int i = 0; i < 4; i++) { 55 | mMinusOne[i] = i * bases[0]; 56 | } 57 | 58 | // Get maximum size of table 59 | char * temp = new char[k]; 60 | for (int i = 0; i < k; i++) { 61 | temp[i] = 3; 62 | } 63 | 64 | maxTableSize = hash(temp) + 1; 65 | delete[] temp; 66 | 67 | // Initialize values 68 | values = new V[maxTableSize]; 69 | for (I i = 0; i < maxTableSize; i++) { 70 | values[i] = initialValue; 71 | } 72 | 73 | // Test 74 | /* 75 | char key[] = { 3, 3, 3, 3, 0, 0, 0, 0 }; 76 | long value = 100; 77 | insert(key, 4, value); 78 | long index = hash(key, 4); 79 | cout << "Index: " << index << " " << values[index] << endl; 80 | cout << "Index: " << index << " " << valueOf(key, 4) << endl; 81 | cout << "Number of filled entries: " << countNonZeroEntries() << endl; 82 | */ 83 | } 84 | 85 | template 86 | KmerHashTable::~KmerHashTable() { 87 | delete[] bases; 88 | delete[] mMinusOne; 89 | delete[] values; 90 | } 91 | 92 | /** 93 | * word: an array of characters. 
94 | * The maximum integer value is 3 and the minimum is 0 95 | */ 96 | template 97 | I KmerHashTable::hash(const char * key) { 98 | return hash(key, 0); 99 | } 100 | 101 | /** 102 | * seq: an array of characters e.g. [0,0,1,1,1,3,2]. 103 | * start: the start index of the key. 104 | * This method is designed to process a long sequence. 105 | */ 106 | template 107 | I KmerHashTable::hash(const char * sequence, int keyStart) { 108 | I index = 0; 109 | for (int i = 0; i < k; i++) { 110 | char nucleotide = sequence[keyStart + i]; 111 | if (nucleotide >= 0 && nucleotide <= 3) { 112 | index += bases[i] * sequence[keyStart + i]; 113 | } else { 114 | string msg("The value of the char representing the nucleotide "); 115 | msg.append("must be between 0 and 3."); 116 | msg.append("The int value is "); 117 | msg.append(Util::int2string((int) nucleotide)); 118 | msg.append(" of nucleotide at index "); 119 | msg.append(Util::int2string(keyStart + i)); 120 | 121 | for (int h = 0 + keyStart; h < k + keyStart; h++) { 122 | cerr << (int) sequence[h]; 123 | } 124 | cerr << endl; 125 | 126 | throw InvalidInputException(msg); 127 | } 128 | } 129 | return index; 130 | } 131 | 132 | template 133 | void KmerHashTable::hash(const char * sequence, int start, int end, 134 | vector * hashList) { 135 | 136 | for (int i = start; i <= end; i++) { 137 | char nucleotide = sequence[i]; 138 | if (!(nucleotide >= 0 && nucleotide <= 3)) { 139 | string msg("The value of the char representing the nucleotide "); 140 | msg.append("must be between 0 and 3."); 141 | msg.append("The int value is "); 142 | msg.append(Util::int2string((int) nucleotide)); 143 | msg.append(" of nucleotide at index "); 144 | msg.append(Util::int2string(i)); 145 | 146 | throw InvalidInputException(msg); 147 | } 148 | } 149 | 150 | I lastHash = hash(sequence, start); 151 | hashList->push_back(lastHash); 152 | 153 | for (int i = start + 1; i <= end; i++) { 154 | I s1 = 4 * (lastHash - mMinusOne[(int) sequence[i - 1]]) 155 | + (int) 
sequence[i + k - 1]; 156 | hashList->push_back(s1); 157 | lastHash = s1; 158 | } 159 | } 160 | 161 | /** 162 | * This method put the key-value pair in the table. 163 | * Note: keys are unique, i.e. no duplicate keys. 164 | */ 165 | template 166 | void KmerHashTable::insert(const char* key, V value) { 167 | insert(key, 0, value); 168 | } 169 | 170 | /** 171 | * Similar to the above method. 172 | * The key begins at start in seq. 173 | * The length of the key is k. 174 | */ 175 | template 176 | void KmerHashTable::insert(const char* sequence, int keyStart, V value) { 177 | values[hash(sequence, keyStart)] = value; 178 | } 179 | 180 | template 181 | void KmerHashTable::insert(I keyHash, V value) { 182 | values[keyHash] = value; 183 | } 184 | 185 | /** 186 | * Call wholesaleIncrement on the segment itself. 187 | * Then, call it again on the reverse complement of this segment. 188 | * 189 | * sequence: is a long sequence usually a long segment of a chromosome. 190 | * sFirstKmer: is the start index of the first k-mer. 191 | * sLastKmer: is the start index of the last k-mer. 
192 | */ 193 | template 194 | void KmerHashTable::wholesaleIncrement(const char* sequence, 195 | int firstKmerStart, int lastKmerStart) { 196 | // Increment k-mer's in the forward strand 197 | vector hashList = vector(); 198 | hash(sequence, firstKmerStart, lastKmerStart, &hashList); 199 | 200 | # pragma omp critical 201 | { 202 | unsigned int size = hashList.size(); 203 | for (unsigned int i = 0; i < size; i++) { 204 | I keyHash = hashList.at(i); 205 | values[keyHash]++; 206 | } 207 | } 208 | // Increment k-mer's in the reverse complement 209 | /* 210 | string rc(""); 211 | Util::revCompDig(sequence, firstKmerStart, lastKmerStart + k - 1, &rc); 212 | 213 | hashList.clear(); 214 | hash(rc.c_str(), 0, rc.size() - k, &hashList); 215 | size = hashList.size(); 216 | 217 | for (int i = 0; i < size; i++) { 218 | I keyHash = hashList.at(i); 219 | values[keyHash]++; 220 | }*/ 221 | } 222 | 223 | /** 224 | * Increment the entry associated with the key by one. 225 | */ 226 | template 227 | void KmerHashTable::increment(const char* key) { 228 | increment(key, 0); 229 | } 230 | 231 | /** 232 | * Increment the value associated with the key starting at keyStart in the 233 | * sequence by one. Also, this method increments the count of the reverse complement 234 | * of the kmer by one. 235 | */ 236 | template 237 | void KmerHashTable::increment(const char* sequence, int keyStart) { 238 | // Increment the count of the kmer by one. 239 | I index = hash(sequence, keyStart); 240 | values[index]++; 241 | 242 | // Generate the reverse complement of the kmer. 
243 | char * rcKmer = new char[k]; 244 | for (int j = 0; j < k; j++) { 245 | switch (sequence[j + keyStart]) { 246 | case 0: 247 | rcKmer[k - 1 - j] = 3; 248 | break; 249 | case 1: 250 | rcKmer[k - 1 - j] = 2; 251 | break; 252 | case 2: 253 | rcKmer[k - 1 - j] = 1; 254 | break; 255 | case 3: 256 | rcKmer[k - 1 - j] = 0; 257 | break; 258 | default: 259 | string msg = string("Invalid code of a nucleotide: "); 260 | msg.append(1, sequence[j + keyStart]); 261 | msg.append(". Valid codes are 0, 1, 2, and 3."); 262 | throw InvalidInputException(msg); 263 | } 264 | } 265 | 266 | // Update the count of the reverse complement of the kmer by one. 267 | I rcIndex = hash(rcKmer, 0); 268 | values[rcIndex]++; 269 | 270 | // Free memory 271 | delete[] rcKmer; 272 | } 273 | 274 | /** 275 | * Return the value associated with the key 276 | */ 277 | template 278 | V KmerHashTable::valueOf(const char* key) { 279 | return valueOf(key, 0); 280 | } 281 | 282 | /** 283 | * Return the value associated with the key 284 | * The key is a substring of length k starting at keyStart in the sequence 285 | */ 286 | template 287 | V KmerHashTable::valueOf(const char* sequence, int keyStart) { 288 | return values[hash(sequence, keyStart)]; 289 | } 290 | 291 | template 292 | V KmerHashTable::valueOf(I keyHash) { 293 | return values[keyHash]; 294 | } 295 | 296 | template 297 | void KmerHashTable::wholesaleValueOf(const char * sequence, 298 | int firstKmerStart, int lastKmerStart, vector * results) { 299 | wholesaleValueOf(sequence, firstKmerStart, lastKmerStart, results, 0); 300 | } 301 | 302 | /** 303 | * The values are set in the results vector starting at the resultsStart. 304 | * The contents of vector "results" must be initialized. 
305 | * Otherwise, the program will crash outputting: "segmentation fault 11" 306 | */ 307 | template 308 | void KmerHashTable::wholesaleValueOf(const char * sequence, 309 | int firstKmerStart, int lastKmerStart, vector * results, 310 | int resultsStart) { 311 | 312 | int index = resultsStart; 313 | vector hashList = vector(); 314 | hash(sequence, firstKmerStart, lastKmerStart, &hashList); 315 | int size = hashList.size(); 316 | 317 | for (int i = 0; i < size; i++) { 318 | (*results)[index] = values[hashList.at(i)]; 319 | index++; 320 | } 321 | } 322 | 323 | /** 324 | * This method returns the number of occupied entries in the table. 325 | * A non-occupied entry has the initial value. 326 | */ 327 | template 328 | I KmerHashTable::countNonInitialEntries() { 329 | I count = 0; 330 | for (I i = 0; i < maxTableSize; i++) { 331 | if (values[i] != initialValue) { 332 | count++; 333 | } 334 | } 335 | return count; 336 | } 337 | 338 | /** 339 | * Make a list of the k-mers. 340 | */ 341 | template 342 | void KmerHashTable::getKeys(vector& keys) { 343 | vector * alpha = new vector(); 344 | alpha->push_back((char) 0); 345 | alpha->push_back((char) 1); 346 | alpha->push_back((char) 2); 347 | alpha->push_back((char) 3); 348 | 349 | vector *words = new vector(); 350 | for (unsigned int h = 0; h < alpha->size(); h++) { 351 | words->push_back(string(1, alpha->at(h))); 352 | } 353 | 354 | int wLen = k; 355 | for (int i = 1; i < wLen; i++) { 356 | vector *wordsAtItrI = new vector(); 357 | for (I j = 0; j < words->size(); j++) { 358 | for (unsigned int h = 0; h < alpha->size(); h++) { 359 | string w = string(words->at(j)); 360 | w.append(1, alpha->at(h)); 361 | wordsAtItrI->push_back(w); 362 | } 363 | } 364 | words->clear(); 365 | delete words; 366 | words = new vector(*wordsAtItrI); 367 | 368 | // Free memory 369 | wordsAtItrI->clear(); 370 | delete wordsAtItrI; 371 | } 372 | 373 | // Change the type of the elements 374 | for (I j = 0; j < words->size(); j++) { 375 | 
keys.push_back(words->at(j).c_str()); 376 | } 377 | 378 | // Free memory 379 | alpha->clear(); 380 | delete alpha; 381 | } 382 | 383 | /** 384 | * Print the contents of the whole table 385 | */ 386 | template 387 | void KmerHashTable::printTable(string output) { 388 | vector keys; 389 | getKeys(keys); 390 | 391 | ofstream out(output.c_str()); 392 | 393 | for (I i = 0; i < keys.size(); i++) { 394 | const char * kmer = keys.at(i); 395 | for (int j = 0; j < k; j++) { 396 | out << (int) kmer[j]; 397 | } 398 | cerr << "Hash: " << hash(keys.at(i), 0) << endl; 399 | 400 | out << " -> " << values[hash(keys.at(i), 0)] << endl; 401 | } 402 | 403 | out.close(); 404 | keys.clear(); 405 | } 406 | 407 | template 408 | int KmerHashTable::getK() { 409 | return k; 410 | } 411 | 412 | template 413 | I KmerHashTable::getMaxTableSize() { 414 | return maxTableSize; 415 | } 416 | 417 | template 418 | const V * KmerHashTable::getValues() const { 419 | return values; 420 | } 421 | 422 | /** 423 | * Call after building the table. 424 | * A negative value is a likely indication of overflow. 425 | */ 426 | template 427 | void KmerHashTable::checkOverflow() { 428 | for (I y = 0; y < maxTableSize; y++) { 429 | if (values[y] < 0) { 430 | string msg("A negative value is a likely indication of overflow. "); 431 | msg.append( 432 | "To the developer, consider larger data type in KmerHashTable."); 433 | throw InvalidStateException(msg); 434 | } 435 | } 436 | } 437 | 438 | template 439 | V KmerHashTable::getMaxValue() { 440 | V max = 0; 441 | for (I y = 0; y < maxTableSize; y++) { 442 | if (values[y] > max) { 443 | max = values[y]; 444 | } 445 | } 446 | return max; 447 | } 448 | 449 | /** 450 | * Divide the size of the table into roughly equal segment, 451 | * each of which is divisible by 4 and will be processed by 452 | * a separate core. 
453 | * Return: 454 | */ 455 | template 456 | std::pair KmerHashTable::getSegmentLengthAndCount(){ 457 | I segLength = ceil(KmerHashTable::maxTableSize / (double) Util::CORE_NUM); 458 | 459 | if(segLength % 4 != 0){ 460 | segLength = segLength + 4 - (segLength % 4 ); 461 | } 462 | 463 | if(segLength % 4 != 0){ 464 | cerr << "The table segment must be divisable by 4."; 465 | cerr << endl; 466 | throw std::exception(); 467 | } 468 | I segCount = ceil(KmerHashTable::maxTableSize / (long double) segLength); 469 | return std::make_pair(segLength, segCount); 470 | } 471 | -------------------------------------------------------------------------------- /src_2.0/nonltr/KmerHashTable.h: -------------------------------------------------------------------------------- 1 | /* 2 | * KmerHashTable.h 3 | * 4 | * Created on: Jul 25, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef KMERHASHTABLE_H_ 9 | #define KMERHASHTABLE_H_ 10 | 11 | #include 12 | #include 13 | #include "ITableView.h" 14 | 15 | using namespace std; 16 | using namespace nonltr; 17 | 18 | namespace nonltr { 19 | 20 | template 21 | class KmerHashTable: public ITableView { 22 | 23 | protected: 24 | /* Fields */ 25 | static const int maxKeyLength = 15; 26 | int k; 27 | 28 | I maxTableSize; 29 | 30 | // The hashed values, i.e. the values of the hash table. 31 | // The index is the 4ry representation of the key 32 | V * values; 33 | V initialValue; 34 | virtual std::pair getSegmentLengthAndCount(); 35 | 36 | private: 37 | // [4^0, 4^1, ... 
, 4^(k-1)] 38 | I * bases; 39 | I * mMinusOne; 40 | void initialize(int, V); 41 | 42 | public: 43 | /* Methods */ 44 | KmerHashTable(int); 45 | KmerHashTable(int, V); 46 | 47 | virtual ~KmerHashTable(); 48 | 49 | I hash(const char *); 50 | I hash(const char *, int); 51 | void hash(const char *, int, int, vector *); 52 | 53 | void insert(const char*, V); 54 | void insert(const char*, int, V); 55 | void insert(I, V); 56 | 57 | void increment(const char*); 58 | void increment(const char*, int); 59 | void wholesaleIncrement(const char*, int, int); 60 | 61 | void addReverseComplement(); 62 | I countNonInitialEntries(); 63 | void getKeys(vector& keys); 64 | void printTable(string); 65 | void checkOverflow(); 66 | 67 | /*Vritual methods from ITableView*/ 68 | virtual V valueOf(const char*); 69 | virtual V valueOf(const char*, int); 70 | virtual V valueOf(I); 71 | virtual void wholesaleValueOf(const char *, int, int, vector *); 72 | virtual void wholesaleValueOf(const char *, int, int, vector *, int); 73 | 74 | virtual int getK(); 75 | virtual I getMaxTableSize(); 76 | virtual V getMaxValue(); 77 | virtual const V * getValues() const; 78 | }; 79 | } 80 | 81 | #include "KmerHashTable.cpp" 82 | 83 | #endif /* KMERHASHTABLE_H_ */ 84 | -------------------------------------------------------------------------------- /src_2.0/nonltr/LocationList.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationList.cpp 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | * 7 | * 8 | * An instance of this class holds a list of merged locations. 
9 | */ 10 | 11 | #include "LocationList.h" 12 | 13 | namespace nonltr { 14 | 15 | LocationList::LocationList(string chromNameIn) { 16 | chromName = chromNameIn; 17 | regionList = new vector(); 18 | merge(); 19 | } 20 | 21 | LocationList::~LocationList() { 22 | Util::deleteInVector(regionList); 23 | delete regionList; 24 | } 25 | 26 | void LocationList::add(int start, int end) { 27 | regionList->push_back(new Location(start, end)); 28 | } 29 | 30 | void LocationList::merge() { 31 | int regionCount = regionList->size(); 32 | int gg = 0; 33 | while (gg < regionCount) { 34 | ILocation * region = regionList->at(gg); 35 | 36 | int regionStart = region->getStart(); 37 | int regionEnd = region->getEnd(); 38 | 39 | if (gg > 0) { 40 | ILocation * pRegion = regionList->at(gg - 1); 41 | int pStart = pRegion->getStart(); 42 | int pEnd = pRegion->getEnd(); 43 | 44 | if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { 45 | pRegion->setEnd(regionEnd > pEnd ? regionEnd : pEnd); 46 | regionList->erase(regionList->begin() + gg); 47 | delete region; 48 | regionCount = regionList->size(); 49 | } else { 50 | gg++; 51 | } 52 | } 53 | 54 | if (gg == 0) { 55 | gg++; 56 | } 57 | } 58 | } 59 | 60 | void LocationList::mergeWithAnotherList( 61 | const vector * const otherList) { 62 | //A pre-condition: Ensure that the other list is sorted 63 | for (unsigned int h = 1; h < otherList->size(); h++) { 64 | if (otherList->at(h)->getStart() < otherList->at(h - 1)->getStart()) { 65 | throw InvalidStateException( 66 | string("LocationList - The other list is not sorted.")); 67 | } 68 | } 69 | 70 | // Start 71 | vector * mergedList = new vector(); 72 | 73 | int i = 0; 74 | int j = 0; 75 | int iLimit = regionList->size(); 76 | int jLimit = otherList->size(); 77 | 78 | // Continue until one list is finished 79 | while (i < iLimit && j < jLimit) { 80 | ILocation * iLoc = regionList->at(i); 81 | ILocation * jLoc = otherList->at(j); 82 | 83 | if (iLoc->getStart() < jLoc->getStart()) { 84 | 
mergedList->push_back(iLoc); 85 | i++; 86 | } else { 87 | mergedList->push_back(new Location(*jLoc)); 88 | j++; 89 | } 90 | } 91 | 92 | // Once one list is finished, copy the rest of the other list 93 | if (i == iLimit) { 94 | for (; j < jLimit; j++) { 95 | mergedList->push_back(new Location(*(otherList->at(j)))); 96 | } 97 | } else if (j == jLimit) { 98 | for (; i < iLimit; i++) { 99 | mergedList->push_back(regionList->at(i)); 100 | } 101 | } 102 | 103 | // Once done 104 | // Util::deleteInVector(regionList); 105 | regionList->clear(); // Need to test this line 106 | delete regionList; 107 | regionList = mergedList; 108 | 109 | merge(); 110 | 111 | //A post-condition: Ensure that the list is sorted 112 | for (unsigned int h = 1; h < regionList->size(); h++) { 113 | if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) { 114 | throw InvalidStateException(string("This list is not sorted.")); 115 | } 116 | } 117 | } 118 | 119 | void LocationList::print() { 120 | cout << endl << chromName << endl; 121 | for (unsigned int i = 0; i < regionList->size(); i++) { 122 | int s = regionList->at(i)->getStart(); 123 | int e = regionList->at(i)->getEnd(); 124 | cout << s << "-" << e << endl; 125 | } 126 | } 127 | 128 | const vector * LocationList::getList() { 129 | return regionList; 130 | } 131 | 132 | void LocationList::convertToRedFormat() { 133 | trim(1); 134 | } 135 | 136 | void LocationList::trim(int x) { 137 | for (unsigned int i = regionList->size() - 1; i >= 0; i--) { 138 | ILocation * region = regionList->at(i); 139 | int start = region->getStart(); 140 | int newEnd = region->getEnd() - x; 141 | 142 | if (newEnd < 0 || start > newEnd) { 143 | regionList->erase(regionList->begin() + i); 144 | delete region; 145 | } else { 146 | region->setEnd(newEnd); 147 | } 148 | } 149 | } 150 | 151 | } 152 | 153 | /* namespace nonltr */ 154 | -------------------------------------------------------------------------------- /src_2.0/nonltr/LocationList.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * LocationList.h 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Z. Girgis, PhD 6 | */ 7 | 8 | #ifndef SRC_NONLTR_LOCATIONLIST_H_ 9 | #define SRC_NONLTR_LOCATIONLIST_H_ 10 | 11 | #include 12 | #include "../utility/Util.h" 13 | #include "../utility/ILocation.h" 14 | #include "../utility/Location.h" 15 | #include "../exception/InvalidStateException.h" 16 | 17 | using namespace std; 18 | using namespace utility; 19 | using namespace exception; 20 | 21 | namespace nonltr { 22 | 23 | class LocationList { 24 | private: 25 | string chromName; 26 | vector * regionList; 27 | void merge(); 28 | 29 | public: 30 | LocationList(string); 31 | virtual ~LocationList(); 32 | 33 | void add(int, int); 34 | 35 | /** 36 | * Take a sorted list 37 | */ 38 | void mergeWithAnotherList(const vector * const); 39 | 40 | 41 | /** 42 | * Print locations 43 | */ 44 | void print(); 45 | 46 | const vector * getList(); 47 | void convertToRedFormat(); 48 | void trim(int ); 49 | }; 50 | 51 | } /* namespace nonltr */ 52 | 53 | #endif /* SRC_NONLTR_LOCATIONLIST_H_ */ 54 | -------------------------------------------------------------------------------- /src_2.0/nonltr/LocationListCollection.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationListCollection.cpp 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "LocationListCollection.h" 9 | 10 | namespace nonltr { 11 | 12 | LocationListCollection::LocationListCollection(string fileNameIn) { 13 | fileName = fileNameIn; 14 | collection = new map(); 15 | readCoordinates(); 16 | } 17 | 18 | LocationListCollection::~LocationListCollection() { 19 | collection->clear(); 20 | delete collection; 21 | } 22 | 23 | void LocationListCollection::readCoordinates() { 24 | Util::checkFile(fileName); 25 | 26 | ifstream in(fileName.c_str()); 27 | LocationList * locList; 
28 | string previousChromName(""); 29 | 30 | while (in.good()) { 31 | string line; 32 | getline(in, line); 33 | 34 | if (line.compare(string("")) != 0) { 35 | int colIndex = line.find_last_of(':'); 36 | int dashIndex = line.find_last_of('-'); 37 | 38 | string chromName = line.substr(0, colIndex); 39 | 40 | if (previousChromName.compare(chromName) != 0) { 41 | 42 | cout << "Processing regions of " << chromName << endl; 43 | 44 | locList = new LocationList(chromName); 45 | collection->insert( 46 | map::value_type(chromName, 47 | locList)); 48 | 49 | previousChromName = chromName; 50 | } 51 | 52 | int start = 53 | atoi( 54 | line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str()); 55 | int end = atoi(line.substr(dashIndex + 1).c_str()); 56 | locList->add(start, end); 57 | } 58 | } 59 | 60 | in.close(); 61 | } 62 | 63 | void LocationListCollection::print() { 64 | map::iterator itr_s = collection->begin(); 65 | map::iterator itr_e = collection->end(); 66 | while (itr_s != itr_e) { 67 | collection->at(itr_s->first)->print(); 68 | ++itr_s; 69 | } 70 | } 71 | 72 | LocationList * const LocationListCollection::getLocationList(string header) { 73 | if (collection->count(header) == 0) { 74 | string msg("Regions of "); 75 | msg.append(header); 76 | msg.append(" cannot be found.\n"); 77 | throw InvalidStateException(msg); 78 | } 79 | 80 | return collection->at(header); 81 | } 82 | 83 | void LocationListCollection::convertToRedFormat() { 84 | map::iterator itr_s = collection->begin(); 85 | map::iterator itr_e = collection->end(); 86 | while (itr_s != itr_e) { 87 | collection->at(itr_s->first)->convertToRedFormat(); 88 | ++itr_s; 89 | } 90 | } 91 | 92 | void LocationListCollection::trim(int x) { 93 | map::iterator itr_s = collection->begin(); 94 | map::iterator itr_e = collection->end(); 95 | while (itr_s != itr_e) { 96 | collection->at(itr_s->first)->trim(x); 97 | ++itr_s; 98 | } 99 | } 100 | 101 | } /* namespace nonltr */ 102 | 
-------------------------------------------------------------------------------- /src_2.0/nonltr/LocationListCollection.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationListCollection.h 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ 9 | #define SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "LocationList.h" 15 | #include "../utility/Util.h" 16 | #include "../exception/InvalidStateException.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | 21 | namespace nonltr { 22 | 23 | class LocationListCollection { 24 | 25 | private: 26 | string fileName; 27 | map * collection; 28 | void readCoordinates(); 29 | 30 | public: 31 | LocationListCollection(string); 32 | virtual ~LocationListCollection(); 33 | LocationList * const getLocationList(string); 34 | void print(); 35 | void convertToRedFormat(); 36 | void trim(int ); 37 | }; 38 | 39 | } /* namespace nonltr */ 40 | 41 | #endif /* SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ */ 42 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Scanner.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Scanner.cpp 3 | * 4 | * Created on: Aug 19, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | #include "Scanner.h" 8 | 9 | namespace nonltr { 10 | 11 | Scanner::Scanner(HMM * hmmIn, int kIn, ChromosomeOneDigit * chromIn, 12 | string scoresFile) { 13 | // ToDo: Fix this operation 14 | string msg("Scanning file of scores is temporarily disabled."); 15 | throw InvalidOperationException(msg); 16 | 17 | hmm = hmmIn; 18 | k = kIn; 19 | chrom = chromIn; 20 | segmentList = chrom->getSegment(); 21 | scorer = NULL; 22 | scoreList = new vector(); 23 | ifstream in(scoresFile.c_str()); 24 | if (in) { 25 | string header; 26 | getline(in, header); 27 | 28 | 
string score; 29 | while (in >> score) { 30 | scoreList->push_back(atoi(score.c_str())); 31 | } 32 | in.close(); 33 | } else { 34 | string msg(scoresFile); 35 | msg.append(" does not exist."); 36 | throw FileDoesNotExistException(msg); 37 | } 38 | 39 | regionList = new vector(); 40 | 41 | // Start scanning 42 | start(); 43 | } 44 | 45 | Scanner::Scanner(HMM * hmmIn, int kIn, ChromosomeOneDigit * chromIn, 46 | ITableView * table) { 47 | hmm = hmmIn; 48 | k = kIn; 49 | 50 | chrom = chromIn; 51 | segmentList = chrom->getSegment(); 52 | scorer = new Scorer(chrom, table); 53 | scorer->takeLog(hmm->getBase()); 54 | scoreList = scorer->getScores(); 55 | regionList = new vector(); 56 | 57 | // Start scanning 58 | start(); 59 | } 60 | 61 | Scanner::~Scanner() { 62 | if (scorer == NULL) { 63 | scoreList->clear(); 64 | delete scoreList; 65 | } else { 66 | delete scorer; 67 | } 68 | 69 | Util::deleteInVector(regionList); 70 | delete regionList; 71 | } 72 | 73 | void Scanner::start() { 74 | check(); 75 | 76 | decode(); 77 | 78 | extendByK(); 79 | 80 | merge(); 81 | } 82 | 83 | void Scanner::check() { 84 | if (chrom->size() != scoreList->size()) { 85 | string msg("The size of the sequence is not the same as the size of "); 86 | msg.append("the scores. The size of sequence is: "); 87 | msg.append(Util::int2string(chrom->size())); 88 | msg.append(". 
The size of the scores is: "); 89 | msg.append(Util::int2string(scoreList->size())); 90 | msg.append("."); 91 | throw InvalidStateException(msg); 92 | } 93 | } 94 | 95 | void Scanner::decode() { 96 | int segmentCount = segmentList->size(); 97 | for (int tt = 0; tt < segmentCount; tt++) { 98 | vector * segment = segmentList->at(tt); 99 | hmm->decode(segment->at(0), segment->at(1), scoreList, *regionList); 100 | } 101 | } 102 | 103 | void Scanner::extendByK() { 104 | int regionCount = regionList->size(); 105 | if (regionCount > 0) { 106 | int firstCandIndex = 0; 107 | int lastCandIndex = 0; 108 | int segmentNumber = segmentList->size(); 109 | for (int i = 0; i < segmentNumber; i++) { 110 | vector * s = segmentList->at(i); 111 | ILocation * c = regionList->at(firstCandIndex); 112 | // Sometimes a segment have no repeats 113 | if (Util::isOverlapping(s->at(0), s->at(1), c->getStart(), 114 | c->getEnd())) { 115 | lastCandIndex = extendByKHelper(s->at(0), s->at(1), 116 | firstCandIndex); 117 | firstCandIndex = lastCandIndex + 1; 118 | if (firstCandIndex >= regionCount) { 119 | break; 120 | } 121 | } 122 | } 123 | } 124 | } 125 | 126 | int Scanner::extendByKHelper(int segStart, int segEnd, int firstCandIndex) { 127 | ILocation * cand = regionList->at(firstCandIndex); 128 | 129 | // Make sure that the first region is overlapping with the segment 130 | if (!Util::isOverlapping(segStart, segEnd, cand->getStart(), 131 | cand->getEnd())) { 132 | string msg("The first region is not overlapping with the segment."); 133 | msg.append(" Region: "); 134 | msg.append(Util::int2string(cand->getStart())); 135 | msg.append(":"); 136 | msg.append(Util::int2string(cand->getEnd())); 137 | msg.append(" Segment: "); 138 | msg.append(Util::int2string(segStart)); 139 | msg.append(":"); 140 | msg.append(Util::int2string(segEnd)); 141 | throw InvalidInputException(msg); 142 | } 143 | 144 | int lastCandIndex = -1; 145 | int candidateNumber = regionList->size(); 146 | for (int c = firstCandIndex; 
c < candidateNumber; c++) { 147 | ILocation * cand = regionList->at(c); 148 | if (Util::isOverlapping(segStart, segEnd, cand->getStart(), 149 | cand->getEnd())) { 150 | int newEnd = cand->getEnd() + k - 1; 151 | if (newEnd > segEnd) { 152 | newEnd = segEnd; 153 | } 154 | cand->setEnd(newEnd); 155 | lastCandIndex = c; 156 | } else { 157 | break; 158 | } 159 | } 160 | 161 | if (lastCandIndex < 0) { 162 | string msg("The index of the last region cannot be negative."); 163 | throw InvalidStateException(msg); 164 | } 165 | 166 | return lastCandIndex; 167 | } 168 | 169 | void Scanner::merge() { 170 | int regionCount = regionList->size(); 171 | int gg = 0; 172 | while (gg < regionCount) { 173 | ILocation * region = regionList->at(gg); 174 | 175 | int regionStart = region->getStart(); 176 | int regionEnd = region->getEnd(); 177 | 178 | if (gg > 0) { 179 | ILocation * pRegion = regionList->at(gg - 1); 180 | int pStart = pRegion->getStart(); 181 | int pEnd = pRegion->getEnd(); 182 | 183 | if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { 184 | pRegion->setEnd(regionEnd > pEnd ? 
regionEnd : pEnd); 185 | regionList->erase(regionList->begin() + gg); 186 | delete region; 187 | regionCount = regionList->size(); 188 | } else { 189 | gg++; 190 | } 191 | } 192 | 193 | if (gg == 0) { 194 | gg++; 195 | } 196 | } 197 | } 198 | 199 | void Scanner::mergeWithOtherRegions(const vector * otherList) { 200 | vector * mergedList = new vector(); 201 | 202 | int i = 0; 203 | int j = 0; 204 | int iLimit = regionList->size(); 205 | int jLimit = otherList->size(); 206 | 207 | // Continue until one list is finished 208 | while (i < iLimit && j < jLimit) { 209 | ILocation * iLoc = regionList->at(i); 210 | ILocation * jLoc = otherList->at(j); 211 | 212 | if (iLoc->getStart() < jLoc->getStart()) { 213 | mergedList->push_back(iLoc); 214 | i++; 215 | } else { 216 | mergedList->push_back(new Location(*jLoc)); 217 | j++; 218 | } 219 | } 220 | 221 | // Once one list is finished, copy the rest of the other list 222 | if (i == iLimit) { 223 | for (; j < jLimit; j++) { 224 | mergedList->push_back(new Location(*(otherList->at(j)))); 225 | } 226 | } else if (j == jLimit) { 227 | for (; i < iLimit; i++) { 228 | mergedList->push_back(regionList->at(i)); 229 | } 230 | } 231 | 232 | // Once done 233 | // Util::deleteInVector(regionList); 234 | // @@ Need to be tested 235 | regionList->clear(); 236 | delete regionList; 237 | regionList = mergedList; 238 | 239 | merge(); 240 | 241 | //Ensure that the list is sorted 242 | for (unsigned int h = 1; h < regionList->size(); h++) { 243 | if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) { 244 | throw InvalidStateException(string("This list is not sorted.")); 245 | } 246 | } 247 | } 248 | 249 | void Scanner::makeForwardCoordinates() { 250 | int regionNum = regionList->size(); 251 | int lastBase = chrom->size() - 1; 252 | 253 | // Calculate the coordinate on the main strand 254 | for (int i = 0; i < regionNum; i++) { 255 | ILocation * oldLoc = regionList->at(i); 256 | regionList->at(i) = new Location(lastBase - 
oldLoc->getEnd(), 257 | lastBase - oldLoc->getStart()); 258 | delete oldLoc; 259 | } 260 | 261 | // Reverse the regions within the list 262 | int lastRegion = regionNum - 1; 263 | int middle = regionNum / 2; 264 | for (int i = 0; i < middle; i++) { 265 | ILocation * temp = regionList->at(lastRegion - i); 266 | regionList->at(lastRegion - i) = regionList->at(i); 267 | regionList->at(i) = temp; 268 | } 269 | 270 | } 271 | 272 | /** 273 | * Warning: this method prints the logarithm values of the scores 274 | */ 275 | void Scanner::printScores(string outputFile, bool canAppend) { 276 | cout << "Printing the logarithmic values of the scores "; 277 | cout << "NOT the original scores." << endl; 278 | 279 | ofstream outScores; 280 | if (canAppend) { 281 | outScores.open(outputFile.c_str(), ios::out | ios::app); 282 | } else { 283 | outScores.open(outputFile.c_str(), ios::out); 284 | } 285 | 286 | int step = 50; 287 | outScores << chrom->getHeader() << endl; 288 | int len = scoreList->size(); 289 | for (int i = 0; i < len; i = i + step) { 290 | int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; 291 | for (int k = i; k <= e; k++) { 292 | outScores << scoreList->at(k) << " "; 293 | } 294 | outScores << endl; 295 | } 296 | outScores << endl; 297 | outScores.close(); 298 | } 299 | 300 | void Scanner::printIndex(string outputFile, bool canAppend, int frmt) { 301 | 302 | if(frmt != FRMT_POS && frmt != FRMT_BED){ 303 | string msg("Unknown output format: "); 304 | msg.append(Util::int2string(frmt)); 305 | msg.append(". 
The known formats are: "); 306 | msg.append(Util::int2string(FRMT_POS)); 307 | msg.append(" and "); 308 | msg.append(Util::int2string(FRMT_BED)); 309 | msg.append("."); 310 | throw InvalidInputException(msg); 311 | } 312 | 313 | ofstream outIndex; 314 | if (canAppend) { 315 | outIndex.open(outputFile.c_str(), ios::out | ios::app); 316 | } else { 317 | outIndex.open(outputFile.c_str(), ios::out); 318 | } 319 | 320 | // Write the index of the repeat segment [x,y[ 321 | string header = chrom->getHeader(); 322 | 323 | if(frmt == FRMT_POS){ 324 | for (unsigned int j = 0; j < regionList->size(); j++) { 325 | outIndex << header << ":"; 326 | outIndex << ((int) (regionList->at(j)->getStart())) << "-"; 327 | outIndex << ((int) (regionList->at(j)->getEnd() + 1)); 328 | outIndex << endl; 329 | } 330 | }else if(frmt == FRMT_BED){ 331 | for (unsigned int j = 0; j < regionList->size(); j++) { 332 | outIndex << header.substr(1) << "\t"; 333 | outIndex << ((int) (regionList->at(j)->getStart())) << "\t"; 334 | outIndex << ((int) (regionList->at(j)->getEnd() + 1)); 335 | outIndex << endl; 336 | } 337 | } 338 | 339 | outIndex.close(); 340 | } 341 | 342 | unsigned int Scanner::getTotalRegionLength() { 343 | unsigned int l = 0; 344 | for (unsigned int j = 0; j < regionList->size(); j++) { 345 | l += regionList->at(j)->getEnd() - regionList->at(j)->getStart() + 1; 346 | } 347 | return l; 348 | } 349 | 350 | void Scanner::printMasked(string outputFile, Chromosome& oChrom, 351 | bool canAppend) { 352 | 353 | string baseCopy = *(oChrom.getBase()); 354 | int regionCount = regionList->size(); 355 | for (int j = 0; j < regionCount; j++) { 356 | for (int h = regionList->at(j)->getStart(); 357 | h <= regionList->at(j)->getEnd(); h++) { 358 | baseCopy[h] = tolower(baseCopy[h]); 359 | } 360 | } 361 | 362 | ofstream outMask; 363 | 364 | if (canAppend) { 365 | outMask.open(outputFile.c_str(), ios::out | ios::app); 366 | } else { 367 | outMask.open(outputFile.c_str(), ios::out); 368 | } 369 | 370 | 
outMask << oChrom.getHeader() << endl; 371 | int step = 50; 372 | int len = baseCopy.size(); 373 | for (int i = 0; i < len; i = i + step) { 374 | int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; 375 | for (int k = i; k <= e; k++) { 376 | outMask << baseCopy[k]; 377 | } 378 | outMask << endl; 379 | } 380 | outMask.close(); 381 | } 382 | 383 | const vector* Scanner::getRegionList() { 384 | return regionList; 385 | } 386 | 387 | } /* namespace nonltr */ 388 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Scanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Scanner.h 3 | * 4 | * Created on: Aug 19, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SCANNER_H_ 9 | #define SCANNER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "Chromosome.h" 16 | #include "ChromosomeOneDigit.h" 17 | #include "HMM.h" 18 | #include "ITableView.h" 19 | #include "Scorer.h" 20 | #include "../utility/Util.h" 21 | #include "../utility/ILocation.h" 22 | #include "../utility/Location.h" 23 | #include "../exception/InvalidInputException.h" 24 | #include "../exception/InvalidStateException.h" 25 | #include "../exception/FileDoesNotExistException.h" 26 | #include "../exception/InvalidOperationException.h" 27 | 28 | using namespace std; 29 | using namespace utility; 30 | using namespace exception; 31 | 32 | namespace nonltr { 33 | 34 | class Scanner { 35 | private: 36 | //string chromFile; 37 | ChromosomeOneDigit * chrom; 38 | const vector *> * segmentList; 39 | Scorer * scorer; 40 | vector * scoreList; 41 | vector * regionList; 42 | int k; 43 | HMM * hmm; 44 | // bool isTrainMode; 45 | 46 | // Methods 47 | void start(); 48 | void check(); 49 | void decode(); 50 | void extendByK(); 51 | int extendByKHelper(int, int, int); 52 | void merge(); 53 | 54 | public: 55 | static const int FRMT_POS = 1; 56 | static const int FRMT_BED = 2; 57 | 58 | Scanner(HMM 
*, int, ChromosomeOneDigit *, string); 59 | Scanner(HMM *, int, ChromosomeOneDigit *, ITableView *); 60 | virtual ~Scanner(); 61 | void makeForwardCoordinates(); 62 | 63 | void printScores(string, bool); 64 | void printIndex(string, bool, int); 65 | void printMasked(string, Chromosome&, bool); 66 | void mergeWithOtherRegions(const vector *); 67 | const vector* getRegionList(); 68 | unsigned int getTotalRegionLength(); 69 | }; 70 | 71 | } /* namespace nonltr */ 72 | #endif /* SCANNER_H_ */ 73 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Scorer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Scorer.cpp 3 | * 4 | * Created on: Aug 3, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | #include "Scorer.h" 8 | 9 | Scorer::Scorer(ChromosomeOneDigit * chromIn, 10 | ITableView * const table) { 11 | chrom = chromIn; 12 | kmerTable = table; 13 | scores = new vector(chrom->getBase()->size(), 0); 14 | k = kmerTable->getK(); 15 | max = -1; 16 | score(); 17 | calculateMax(); 18 | } 19 | 20 | Scorer::~Scorer() { 21 | scores->clear(); 22 | delete scores; 23 | } 24 | 25 | /** 26 | * This method scores each nucleotide in the chromosome. 27 | * The nucleotides represented by 'N' are assigned zero. 28 | */ 29 | void Scorer::score() { 30 | const vector *> * segment = chrom->getSegment(); 31 | const char * segBases = chrom->getBase()->c_str(); 32 | 33 | for (int s = 0; s < segment->size(); s++) { 34 | int start = segment->at(s)->at(0); 35 | int end = segment->at(s)->at(1); 36 | kmerTable->wholesaleValueOf(segBases, start, end - k + 1, scores, 37 | start); 38 | 39 | // Handle the last word from end - k + 2 till the end, inclusive. 40 | for (int i = end - k + 2; i <= end; i++) { 41 | (*scores)[i] = scores->at(i - 1); 42 | } 43 | } 44 | } 45 | 46 | /** 47 | * This method takes the logarithm of the scores according to the base. 48 | * If the score equals zero, it is left the same. 
49 | */ 50 | void Scorer::takeLog(double base) { 51 | // Handle the case where base is one 52 | bool isOne = false; 53 | if (fabs(base - 1.0) < std::numeric_limits::epsilon()) { 54 | isOne = true; 55 | } 56 | double logBase = isOne ? log(1.5) : log(base); 57 | 58 | const vector *> * segment = chrom->getSegment(); 59 | for (int s = 0; s < segment->size(); s++) { 60 | int start = segment->at(s)->at(0); 61 | int end = segment->at(s)->at(1); 62 | for (int h = start; h <= end; h++) { 63 | int score = scores->at(h); 64 | 65 | if (score != 0) { 66 | if (!isOne || (isOne && score > 1)) { 67 | (*scores)[h] = ceil(log(score) / logBase); 68 | } 69 | } 70 | } 71 | } 72 | } 73 | 74 | int Scorer::getK() { 75 | return k; 76 | } 77 | 78 | vector* Scorer::getScores() { 79 | return scores; 80 | } 81 | 82 | void Scorer::printScores(string outputFile, bool canAppend) { 83 | ofstream outScores; 84 | if (canAppend) { 85 | outScores.open(outputFile.c_str(), ios::out | ios::app); 86 | } else { 87 | outScores.open(outputFile.c_str(), ios::out); 88 | } 89 | 90 | int step = 50; 91 | outScores << chrom->getHeader() << endl; 92 | int len = scores->size(); 93 | for (int i = 0; i < len; i = i + step) { 94 | int e = (i + step - 1 > len - 1) ? 
len - 1 : i + step - 1; 95 | for (int k = i; k <= e; k++) { 96 | outScores << scores->at(k) << " "; 97 | } 98 | outScores << endl; 99 | } 100 | outScores << endl; 101 | 102 | outScores.close(); 103 | } 104 | 105 | int Scorer::countLessOrEqual(int thr) { 106 | int count = 0; 107 | const vector *> * segment = chrom->getSegment(); 108 | for (int s = 0; s < segment->size(); s++) { 109 | int start = segment->at(s)->at(0); 110 | int end = segment->at(s)->at(1); 111 | for (int h = start; h <= end; h++) { 112 | if (scores->at(h) <= thr) { 113 | count++; 114 | } 115 | } 116 | } 117 | return count; 118 | } 119 | 120 | void Scorer::calculateMax() { 121 | const vector *> * segmentList = chrom->getSegment(); 122 | int segmentCount = segmentList->size(); 123 | for (int jj = 0; jj < segmentCount; jj++) { 124 | vector * segment = segmentList->at(jj); 125 | int start = segment->at(0); 126 | int end = segment->at(1); 127 | for (int ss = start; ss <= end; ss++) { 128 | int score = scores->at(ss); 129 | if (score > max) { 130 | max = score; 131 | } 132 | } 133 | } 134 | 135 | if (max == -1) { 136 | string msg("Error occurred while finding the maximum score."); 137 | throw InvalidStateException(msg); 138 | } 139 | } 140 | 141 | int Scorer::getMax() { 142 | return max; 143 | } 144 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Scorer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Scorer.h 3 | * 4 | * Created on: Aug 3, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SCORER_H_ 9 | #define SCORER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "ITableView.h" 18 | #include "ChromosomeOneDigit.h" 19 | #include "../utility/Util.h" 20 | #include "../exception/InvalidStateException.h" 21 | 22 | using namespace std; 23 | using namespace nonltr; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace 
nonltr { 28 | class Scorer { 29 | private: 30 | /* Fields */ 31 | ChromosomeOneDigit * chrom; 32 | ITableView * kmerTable; 33 | vector * scores; 34 | int k; 35 | int max; 36 | 37 | /* Methods */ 38 | void score(); 39 | void calculateMax(); 40 | 41 | public: 42 | /* Methods */ 43 | Scorer(ChromosomeOneDigit *, ITableView *); 44 | virtual ~Scorer(); 45 | void printScores(string, bool); 46 | vector* getScores(); 47 | int getK(); 48 | void takeLog(double); 49 | int countLessOrEqual(int); 50 | int getMax(); 51 | }; 52 | } 53 | 54 | #endif /* Scorer_H_ */ 55 | -------------------------------------------------------------------------------- /src_2.0/nonltr/TableBuilder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * TableBuilder.cpp 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "TableBuilder.h" 9 | 10 | TableBuilder::TableBuilder(string dir, int motifSize, int order, int minObs) { 11 | genomeDir = dir; 12 | k = motifSize; 13 | genomeLength = 0; 14 | // kmerTable = new KmerHashTable(k); 15 | // kmerTable = new EnrichmentView(k); 16 | 17 | // Whenever you change the template, modify line 50 and 70 and the header file line 35 18 | kmerTable = new EnrichmentMarkovView(k, order, minObs); 19 | 20 | buildTable(); 21 | } 22 | 23 | TableBuilder::~TableBuilder() { 24 | delete kmerTable; 25 | } 26 | 27 | void TableBuilder::buildTable() { 28 | vector * fileList = new vector(); 29 | Util::readChromList(genomeDir, fileList, "fa"); 30 | 31 | # pragma omp parallel for schedule(dynamic) num_threads(Util::CORE_NUM) reduction(+: genomeLength) 32 | for (unsigned int i = 0; i < fileList->size(); i++) { 33 | # pragma omp critical 34 | { 35 | cout << "Counting k-mers in " << fileList->at(i) << " ..." 
<< endl; 36 | } 37 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 38 | const vector * chromList = maker->makeChromOneDigitList(); 39 | 40 | for (unsigned int h = 0; h < chromList->size(); h++) { 41 | ChromosomeOneDigit * chrom = 42 | dynamic_cast(chromList->at(h)); 43 | if (chrom) { 44 | genomeLength += chrom->getEffectiveSize(); 45 | updateTable(chrom); 46 | } else { 47 | throw InvalidStateException(string("Dynamic cast failed.")); 48 | } 49 | } 50 | 51 | delete maker; 52 | } 53 | // Check if overflow has occurred 54 | kmerTable->checkOverflow(); 55 | 56 | // View 57 | // EnrichmentView * view = dynamic_cast(kmerTable); 58 | EnrichmentMarkovView * view = 59 | dynamic_cast *>(kmerTable); 60 | 61 | if (view) { 62 | view->generateProbapilities(); 63 | view->processTable(); 64 | maxValue = view->getMaxValue(); 65 | } else { 66 | throw InvalidStateException(string("Dynamic cast failed.")); 67 | } 68 | cout << "Enrichment view is ready." << endl; 69 | 70 | fileList->clear(); 71 | delete fileList; 72 | 73 | /* If you would like to see the contents of the table.*/ 74 | // kmerTable-> printTable(); 75 | } 76 | 77 | void TableBuilder::updateTable(ChromosomeOneDigit * chrom) { 78 | // EnrichmentView * view = dynamic_cast(kmerTable); 79 | EnrichmentMarkovView * view = 80 | dynamic_cast *>(kmerTable); 81 | 82 | const vector *> * segment = chrom->getSegment(); 83 | const char * segBases = chrom->getBase()->c_str(); 84 | 85 | for (unsigned int s = 0; s < segment->size(); s++) { 86 | int start = segment->at(s)->at(0); 87 | int end = segment->at(s)->at(1); 88 | // cerr << "The segment length is: " << (end-start+1) << endl; 89 | 90 | // Fast, but require some memory proportional to the segment length. 
91 | kmerTable->wholesaleIncrement(segBases, start, end - k + 1); 92 | if (view) { 93 | view->count(segBases, start, end); 94 | } else { 95 | throw InvalidStateException(string("Dynamic cast failed.")); 96 | } 97 | 98 | // Slow, but memory efficient 99 | /* 100 | vector hashList = vector(); 101 | kmerTable->hash(segBases, start, end - k + 1, &hashList); 102 | 103 | for (int i = start; i <= end - k + 1; i++) { 104 | kmerTable->increment(segBases, i); 105 | } 106 | */ 107 | } 108 | } 109 | 110 | KmerHashTable * const TableBuilder::getKmerTable() { 111 | return kmerTable; 112 | } 113 | 114 | long TableBuilder::getGenomeLength() { 115 | if (genomeLength < 0) { 116 | string msg("The length of the genome cannot be negative."); 117 | throw InvalidStateException(msg); 118 | } 119 | 120 | return genomeLength; 121 | } 122 | 123 | int TableBuilder::getMaxValue() { 124 | return maxValue; 125 | } 126 | -------------------------------------------------------------------------------- /src_2.0/nonltr/TableBuilder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TableBuilder.h 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef TABLEBUILDER_H_ 9 | #define TABLEBUILDER_H_ 10 | 11 | #include "KmerHashTable.h" 12 | #include "EnrichmentMarkovView.h" 13 | #include "ChromosomeOneDigit.h" 14 | #include "ChromListMaker.h" 15 | #include "IChromosome.h" 16 | 17 | #include "../utility/Util.h" 18 | #include "../exception/InvalidStateException.h" 19 | 20 | #include 21 | 22 | using namespace std; 23 | using namespace nonltr; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace nonltr { 28 | class TableBuilder { 29 | private: 30 | /** 31 | * k-mer table 32 | */ 33 | KmerHashTable * kmerTable; 34 | int maxValue; 35 | 36 | /** 37 | * Directory including the FASTA files comprising the genome. 
38 | * These files must have the 39 | */ 40 | string genomeDir; 41 | 42 | /** 43 | * The size of the motif 44 | */ 45 | int k; 46 | 47 | /** 48 | * The total length of the whole genome 49 | */ 50 | long genomeLength; 51 | 52 | /** 53 | * Methods 54 | */ 55 | void buildTable(); 56 | void updateTable(ChromosomeOneDigit *); 57 | 58 | public: 59 | TableBuilder(string, int, int, int); 60 | virtual ~TableBuilder(); 61 | KmerHashTable * const getKmerTable(); 62 | void printTable(); 63 | long getGenomeLength(); 64 | int getMaxValue(); 65 | }; 66 | } 67 | 68 | #endif /* TABLEBUILDER_H_ */ 69 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Trainer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Trainer.cpp 3 | * 4 | * Created on: Aug 20, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "Trainer.h" 9 | 10 | namespace nonltr { 11 | 12 | // Pass the isCND and the isCON parameters 13 | 14 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 15 | double tIn, string candidateDirIn, int m) : 16 | minObs(m) { 17 | candidateDir = candidateDirIn; 18 | canPrintCandidates = true; 19 | isCND = true; 20 | isCON = false; 21 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 22 | } 23 | 24 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 25 | double tIn, string candidateDirIn, bool isCNDIn, string otherDirIn, 26 | int m) : 27 | minObs(m) { 28 | candidateDir = candidateDirIn; 29 | canPrintCandidates = true; 30 | isCND = isCNDIn; 31 | isCON = true; 32 | otherDir = otherDirIn; 33 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 34 | } 35 | 36 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 37 | double tIn, int m) : 38 | minObs(m) { 39 | canPrintCandidates = false; 40 | isCND = true; 41 | isCON = false; 42 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 43 | } 44 | 45 | Trainer::Trainer(string 
genomeDirIn, int orderIn, int kIn, double sIn, 46 | double tIn, bool isCNDIn, string otherDirIn, int m) : 47 | minObs(m) { 48 | canPrintCandidates = false; 49 | isCND = isCNDIn; 50 | isCON = true; 51 | otherDir = otherDirIn; 52 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 53 | } 54 | 55 | void Trainer::initialize(string genomeDirIn, int orderIn, int kIn, double sIn, 56 | double tIn) { 57 | 58 | if (isCND == false && isCON == false) { 59 | string msg( 60 | "Training using the candidates or the other repeats is required. "); 61 | msg.append("Please specify which regions to be used for training. "); 62 | msg.append("Any of the two sets or a combination of both can be used."); 63 | throw InvalidStateException(msg); 64 | } 65 | 66 | genomeDir = genomeDirIn; 67 | fileList = new vector(); 68 | Util::readChromList(genomeDir, fileList, string("fa")); 69 | chromCount = fileList->size(); 70 | order = orderIn; 71 | k = kIn; 72 | s = sIn; 73 | t = tIn; 74 | p = 0.0; 75 | tDetector = tIn + 0.1; 76 | max = -1; 77 | 78 | stage1(); 79 | 80 | if (isCND) { 81 | stage2(); 82 | } 83 | stage3(); 84 | } 85 | 86 | Trainer::~Trainer() { 87 | fileList->clear(); 88 | delete fileList; 89 | delete builder; 90 | delete hmm; 91 | } 92 | 93 | /** 94 | * Stage 1: Building the table 95 | */ 96 | void Trainer::stage1() { 97 | cout << endl << endl; 98 | cout << "Stage 1: Building the table ..." << endl; 99 | builder = new TableBuilder(genomeDir, k, order, minObs); 100 | table = builder->getKmerTable(); 101 | genomeLength = builder->getGenomeLength(); 102 | max = builder->getMaxValue(); 103 | } 104 | 105 | void Trainer::stage2() { 106 | cout << endl << endl; 107 | cout << "Stage 2: Calculating the percentage ..." 
<< endl; 108 | 109 | double effectiveSize = 0.0; 110 | double countLessOrEqual = 0.0; 111 | 112 | # pragma omp parallel for schedule(dynamic) num_threads(Util::CORE_NUM) \ 113 | reduction(+: effectiveSize, countLessOrEqual) 114 | for (int i = 0; i < chromCount; i++) { 115 | # pragma omp critical 116 | { 117 | cout << "Calculating the percentage in: " << fileList->at(i) << " ..."; 118 | cout << endl; 119 | } 120 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 121 | const vector * chromList = maker->makeChromOneDigitList(); 122 | 123 | for (unsigned int h = 0; h < chromList->size(); h++) { 124 | ChromosomeOneDigit * chrom = 125 | dynamic_cast(chromList->at(h)); 126 | int effSize = chrom->getEffectiveSize(); 127 | 128 | effectiveSize += effSize; 129 | Scorer * scorer = new Scorer(chrom, table); 130 | countLessOrEqual += scorer->countLessOrEqual(t); 131 | delete scorer; 132 | } 133 | delete maker; 134 | } 135 | 136 | if (effectiveSize == 0) { 137 | string msg("The size of the genome cannot be zero."); 138 | throw InvalidStateException(msg); 139 | } else { 140 | p = 100.00 * countLessOrEqual / effectiveSize; 141 | cout << "The percentage is " << p << endl; 142 | if (p < 52.5) { 143 | p = 52.5; 144 | cout << "The percentage is increased to " << p << endl; 145 | } 146 | } 147 | } 148 | 149 | /** 150 | * Stage 3: Training 151 | */ 152 | void Trainer::stage3() { 153 | cout << endl << endl; 154 | cout << "Stage 3: Training ..." << endl; 155 | 156 | // Handle the case when the threshold is one. 157 | bool isOne = false; 158 | if (fabs(t - 1.0) < std::numeric_limits::epsilon()) { 159 | isOne = true; 160 | } 161 | double hmmBase = isOne ? 
1.5 : t; 162 | 163 | // Make a list of candidate HMM 164 | int stateCount = 2 * (ceil(log(max) / log(hmmBase)) + 1); 165 | 166 | // Initialize the HMM 167 | hmm = new HMM(hmmBase, stateCount); 168 | 169 | // Start training the models 170 | #pragma omp parallel for schedule(dynamic) num_threads(Util::CORE_NUM) 171 | for (int i = 0; i < chromCount; i++) { 172 | # pragma omp critical 173 | { 174 | cout << "Training on: " << fileList->at(i) << endl; 175 | } 176 | // Name of candidates file 177 | string path(fileList->at(i)); 178 | int slashLastIndex = path.find_last_of(Util::fileSeparator); 179 | int dotLastIndex = path.find_last_of("."); 180 | string nickName = path.substr(slashLastIndex + 1, 181 | dotLastIndex - slashLastIndex - 1); 182 | 183 | // May or may not be used 184 | string cndFile = candidateDir + Util::fileSeparator + nickName + ".cnd"; 185 | 186 | // Work on the other repeats if desired 187 | LocationListCollection * otherRegionListCollection; 188 | bool isConRepAvailable = false; 189 | if (isCON) { 190 | string otherFile = otherDir + Util::fileSeparator + nickName 191 | + ".rpt"; 192 | ifstream f1(otherFile.c_str()); 193 | if (!f1) { 194 | string message = string("Warning: "); 195 | message.append(otherFile); 196 | message.append(" does not exist. 
"); 197 | message.append( 198 | "Repeats of this sequence will not used for training the HMM."); 199 | # pragma omp critical 200 | { 201 | cout << message << endl; 202 | } 203 | } else { 204 | otherRegionListCollection = new LocationListCollection( 205 | otherFile); 206 | otherRegionListCollection->convertToRedFormat(); 207 | otherRegionListCollection->trim(k - 1); 208 | 209 | isConRepAvailable = true; 210 | } 211 | f1.close(); 212 | } 213 | 214 | // Read sequences in the file 215 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 216 | const vector * chromList = maker->makeChromOneDigitList(); 217 | 218 | for (unsigned int h = 0; h < chromList->size(); h++) { 219 | ChromosomeOneDigit * chrom = 220 | dynamic_cast(chromList->at(h)); 221 | Scorer * scorer = new Scorer(chrom, table); 222 | vector * scoreList = scorer->getScores(); 223 | 224 | // Detect candidates if desired 225 | ChromDetectorMaxima * detector; 226 | const vector * trainingRegionList; 227 | bool canDeleteDetector = true; 228 | if (isCND) { 229 | if (canPrintCandidates) { 230 | detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p, 231 | s, scoreList, chrom); 232 | if (h > 0) { 233 | bool canAppend = true; 234 | detector->printIndex(cndFile, canAppend); 235 | } else { 236 | # pragma omp critical 237 | { 238 | cout << "Printing candidates to: " << cndFile << endl; 239 | } 240 | detector->printIndex(cndFile); 241 | } 242 | } else { 243 | detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p, 244 | s, scoreList, chrom->getSegment()); 245 | } 246 | trainingRegionList = detector->getRegionList(); 247 | } 248 | 249 | if (isCON && isConRepAvailable) { 250 | LocationList * const locList = 251 | otherRegionListCollection->getLocationList( 252 | chrom->getHeader()); 253 | if (isCND) { 254 | locList->mergeWithAnotherList(detector->getRegionList()); 255 | } 256 | trainingRegionList = locList->getList(); 257 | } 258 | 259 | // The candidate regions are already copied to the location list 260 | 
if (isCND && isCON && isConRepAvailable) { 261 | delete detector; 262 | canDeleteDetector = false; 263 | } 264 | 265 | // Train the HMM 266 | if (isCND || (isCON && isConRepAvailable)) { 267 | scorer->takeLog(t); 268 | scoreList = scorer->getScores(); 269 | # pragma omp critical 270 | { 271 | hmm->train(scoreList, chrom->getSegment(), trainingRegionList); 272 | } 273 | } 274 | 275 | // Free more memory 276 | if (isCND && canDeleteDetector) { 277 | delete detector; 278 | } 279 | delete scorer; 280 | } 281 | 282 | if (isCON && isConRepAvailable) { 283 | delete otherRegionListCollection; 284 | } 285 | delete maker; 286 | } 287 | 288 | // Normalize HMM's once training is finished 289 | hmm->normalize(); 290 | } 291 | 292 | void Trainer::printTable(string fileName) { 293 | table->printTable(fileName); 294 | } 295 | 296 | HMM*& Trainer::getHmm() { 297 | return hmm; 298 | } 299 | 300 | KmerHashTable * Trainer::getTable() { 301 | return table; 302 | } 303 | 304 | void Trainer::printHmm(string fileName) { 305 | hmm->print(fileName); 306 | } 307 | 308 | } /* namespace nonltr */ 309 | -------------------------------------------------------------------------------- /src_2.0/nonltr/Trainer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Trainer.h 3 | * 4 | * Created on: Aug 20, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef TRAINER_H_ 9 | #define TRAINER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "TableBuilder.h" 18 | #include "KmerHashTable.h" 19 | #include "HMM.h" 20 | #include "ChromDetectorMaxima.h" 21 | #include "Scorer.h" 22 | #include "ChromListMaker.h" 23 | #include "LocationListCollection.h" 24 | #include "../utility/Util.h" 25 | #include "../exception/InvalidStateException.h" 26 | 27 | using namespace std; 28 | using namespace utility; 29 | using namespace exception; 30 | 31 | namespace nonltr { 32 | 33 | class Trainer { 34 | private: 35 | string 
genomeDir;
    string candidateDir;
    string otherDir;
    bool canPrintCandidates;
    // NOTE(review): the element type of this vector is missing in this copy
    // of the file (template arguments appear to have been stripped).
    vector * fileList;
    int chromCount;
    int order;
    int k;
    int max; // Maximum score in the entire genome
    double t; // Score threshold
    double tDetector; // threshold for the detector because it uses < not <=;
    double p; // Percentage of scores below the threshold, t, in non-repeats
    //double r;
    double s; // Half width of the mask
    long genomeLength;
    //vector * sampleList;
    TableBuilder * builder;
    KmerHashTable * table;
    HMM * hmm;
    int isCND;
    int isCON;
    // The minimum number of the observed k-mers
    const int minObs;

    void stage1();
    void stage2();
    void stage3();
    //void stage4();

public:
    Trainer(string, int, int, double, double, string, int);
    Trainer(string, int, int, double, double, string, bool, string, int);
    Trainer(string, int, int, double, double, int);
    Trainer(string, int, int, double, double, bool, string, int);

    void initialize(string, int, int, double, double);
    virtual ~Trainer();
    void printTable(string);
    void printHmm(string);
    // Returns a reference to the hmm pointer member itself.
    HMM*& getHmm();
    KmerHashTable * getTable();

};

} /* namespace nonltr */
#endif /* TRAINER_H_ */

--------------------------------------------------------------------------------
/src_2.0/utility/EmptyLocation.cpp:
--------------------------------------------------------------------------------
/*
 * EmptyLocation.cpp
 *
 *  Created on: Dec 28, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#include "EmptyLocation.h"
#include "../exception/InvalidOperationException.h"

using namespace exception;

namespace utility {

// The one shared EmptyLocation object, allocated during static
// initialization.
EmptyLocation * EmptyLocation::INSTANCE = new EmptyLocation();

/**
 * Return the shared EmptyLocation object.
 */
EmptyLocation * EmptyLocation::getInstance(){
    return INSTANCE;
}

/**
 * Allocate the message used by every operation that an empty location
 * rejects.
 */
EmptyLocation::EmptyLocation() {
    msg = new string("Empty location does not allow this operation.");
}

/**
 * Release the message allocated by the constructor.
 */
EmptyLocation::~EmptyLocation() {
    delete msg;
}

/**
 * Return the fixed text "Empty".
 */
string EmptyLocation::toString() {
    return string("Empty");
}

// Every accessor and mutator below is unsupported on an empty location and
// always throws InvalidOperationException carrying *msg.

int EmptyLocation::getEnd() const {
    throw InvalidOperationException(*msg);
}

int EmptyLocation::getStart() const {
    throw InvalidOperationException(*msg);
}

void EmptyLocation::setEnd(int int1) {
    throw InvalidOperationException(*msg);
}

void EmptyLocation::setStart(int int1) {
    throw InvalidOperationException(*msg);
}

int EmptyLocation::getLength() {
    throw InvalidOperationException(*msg);
}

} /* namespace utility */

--------------------------------------------------------------------------------
/src_2.0/utility/EmptyLocation.h:
--------------------------------------------------------------------------------
/*
 * EmptyLocation.h
 *
 *  Created on: Dec 28, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#ifndef EMPTYLOCATION_H_
#define EMPTYLOCATION_H_

#include "ILocation.h"

namespace utility {

/**
 * An ILocation with no coordinates. The constructor and destructor are
 * private; the only way to obtain an object is getInstance().
 */
class EmptyLocation: public ILocation {
private:
    string * msg; // Text of the exception thrown by the rejected operations
    static EmptyLocation * INSTANCE;
    EmptyLocation();
    virtual ~EmptyLocation();

public:
    virtual int getEnd() const;
    virtual int getStart() const;
    virtual void setEnd(int);
    virtual void setStart(int);
    virtual int getLength();
    virtual string toString();

    static EmptyLocation * getInstance();

};

} /* namespace utility */
#endif /* EMPTYLOCATION_H_ */

--------------------------------------------------------------------------------
/src_2.0/utility/ILocation.h:
--------------------------------------------------------------------------------
/*
 * ILocation.h
 *
 *  Created on: Dec 20, 2012
 *
Author: Hani Zakaria Girgis, PhD
 */

#ifndef ILOCATION_H_
#define ILOCATION_H_

// NOTE(review): the target of this #include is missing in this copy of the
// file (apparently stripped during extraction).
#include

using namespace std;

namespace utility {

/**
 * Interface of a location described by a start and an end coordinate.
 */
class ILocation {
public:
    virtual int getEnd() const = 0;
    virtual int getStart() const = 0;
    virtual void setEnd(int) = 0;
    virtual void setStart(int) = 0;
    virtual int getLength() = 0;
    virtual string toString() = 0;
    virtual ~ILocation(){}
};

}

#endif /* ILOCATION_H_ */

--------------------------------------------------------------------------------
/src_2.0/utility/Location.cpp:
--------------------------------------------------------------------------------
/*
 * Location.cpp
 *
 *  Created on: Dec 19, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#include "Location.h"
#include "Util.h"
#include "../exception/InvalidInputException.h"

using namespace exception;

namespace utility {

/**
 * Build a location from a start and an end coordinate.
 * Throws InvalidInputException if the pair fails check().
 */
Location::Location(int startIn, int endIn) {
    initialize(startIn, endIn);
}

/**
 * Build a location with the same start and end as another ILocation.
 */
Location::Location(ILocation& cp) {
    initialize(cp.getStart(), cp.getEnd());
}

/**
 * Store both coordinates, then validate them.
 */
void Location::initialize(int startIn, int endIn) {
    start = startIn;
    end = endIn;
    check();

}

/**
 * Validate the stored coordinates.
 * Throws InvalidInputException when start or end is negative or when start
 * is greater than end; the message reports both values.
 */
void Location::check() {
    if (start < 0 || end < 0 || start > end) {
        string msg("Invalid Input. Start is ");
        msg.append(Util::int2string(start));
        msg.append(". End is ");
        msg.append(Util::int2string(end));
        msg.append(".");
        throw InvalidInputException(msg);
    }
}

Location::~Location() {
}

int Location::getEnd() const {
    return end;
}

int Location::getStart() const {
    return start;
}

/**
 * Replace the end coordinate and validate.
 * The new value is stored before check() runs, so it stays stored even when
 * check() throws.
 */
void Location::setEnd(int endIn) {
    end = endIn;
    check();
}

/**
 * Replace the start coordinate and validate.
 * The new value is stored before check() runs, so it stays stored even when
 * check() throws.
 */
void Location::setStart(int startIn) {
    start = startIn;
    check();
}

/**
 * Number of positions covered; both start and end are counted.
 */
int Location::getLength() {
    return end - start + 1;
}

/**
 * Format the location as "start-end".
 */
string Location::toString() {
    string msg = (Util::int2string(start));
    msg.append("-");
    msg.append(Util::int2string(end));

    return msg;
}
}

--------------------------------------------------------------------------------
/src_2.0/utility/Location.h:
--------------------------------------------------------------------------------
/*
 * Location.h
 *
 *  Created on: Dec 19, 2012
 *      Author: Hani Zakaria Girgis, PhD
 */

#ifndef LOCATION_H_
#define LOCATION_H_

#include "ILocation.h"

// NOTE(review): the target of this #include is missing in this copy of the
// file (apparently stripped during extraction).
#include

using namespace std;

namespace utility {

/**
 * Concrete ILocation holding a validated (start, end) pair.
 */
class Location : public ILocation{
private:
    int start;
    int end;
    void initialize(int, int);
    void check();

public:
    Location(int, int);
    Location(ILocation&);
    virtual ~Location();

    int getEnd() const;
    int getStart() const;
    void setEnd(int);
    void setStart(int);
    int getLength();
    string toString();
};

}

#endif /* LOCATION_H_ */

--------------------------------------------------------------------------------
/src_2.0/utility/Util.cpp:
--------------------------------------------------------------------------------
/*
 * Util.cpp
 *
 *  Created on: Apr 24, 2012
 *      Author: Hani Zakaria Girgis, PhD
 *      This class has a collection of utilities.
7 | */ 8 | #include "Util.h" 9 | 10 | Util::Util() { 11 | // TODO Auto-generated constructor stub 12 | 13 | } 14 | 15 | Util::~Util() { 16 | // TODO Auto-generated destructor stub 17 | } 18 | 19 | string Util::fileSeparator("/"); 20 | 21 | string * Util::emptyString = new string(""); 22 | 23 | unsigned int Util::CORE_NUM = (std::thread::hardware_concurrency() == 1)? 1 : std::thread::hardware_concurrency() - 1; 24 | 25 | void Util::readFasta(string seqFile, vector * infoList, 26 | vector * seqList, bool canCheckFormat) { 27 | ifstream in(seqFile.c_str()); 28 | string info; 29 | 30 | bool isFirst = true; 31 | string basePtr(""); 32 | 33 | while (in.good()) { 34 | string line; 35 | getline(in, line); 36 | if (line[0] == '>') { 37 | if (canCheckFormat) { 38 | int colIndex = line.find_first_of(':'); 39 | int dashIndex = line.find_first_of('-'); 40 | if (colIndex < 0 || dashIndex < 0) { 41 | string msg = 42 | "The header must be in the following format: chromosome:start-end\n"; 43 | msg += "The current input: " + line; 44 | throw InvalidInputException(msg); 45 | } 46 | } 47 | 48 | infoList->push_back(line); 49 | if (!isFirst) { 50 | seqList->push_back(basePtr); 51 | basePtr = string(""); 52 | } else { 53 | isFirst = false; 54 | } 55 | } else { 56 | basePtr.append(line); 57 | } 58 | } 59 | seqList->push_back(basePtr); 60 | in.close(); 61 | 62 | // cout << "The system read " << infoList->size() << " sequences." << endl; 63 | 64 | // Post condition 65 | if (infoList->size() != seqList->size()) { 66 | cerr << "Error while reading the fasta input file. 
" 67 | << "Header count = " << infoList->size() << " " 68 | << "Sequence count = " << seqList->size() << endl; 69 | exit(1); 70 | } 71 | } 72 | 73 | void Util::readFasta(string seqFile, vector * infoList, 74 | vector * seqList) { 75 | ifstream in(seqFile.c_str()); 76 | string info; 77 | 78 | bool isFirst = true; 79 | string * basePtr = new string(""); 80 | while (in.good()) { 81 | string line; 82 | getline(in, line); 83 | if (line[0] == '>') { 84 | infoList->push_back(line); 85 | if (!isFirst) { 86 | seqList->push_back(*basePtr); 87 | basePtr = new string(""); 88 | } else { 89 | isFirst = false; 90 | } 91 | } else { 92 | basePtr->append(line); 93 | } 94 | } 95 | seqList->push_back(*basePtr); 96 | in.close(); 97 | 98 | // Post condition 99 | if (infoList->size() != seqList->size()) { 100 | cerr << "Error while reading the fasta input file. " 101 | << "Header count = " << infoList->size() << " " 102 | << "Sequence count = " << seqList->size() << endl; 103 | exit(1); 104 | } 105 | } 106 | 107 | void Util::readCoordinates(string fileName, vector * coor) { 108 | checkFile(fileName); 109 | 110 | ifstream in(fileName.c_str()); 111 | string line; 112 | 113 | while (in >> line) { 114 | int colIndex = line.find_first_of(':'); 115 | int dashIndex = line.find_first_of('-'); 116 | 117 | int start = atoi( 118 | line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str()); 119 | int end = atoi(line.substr(dashIndex + 1).c_str()); 120 | Location * loc = new Location(start, end); 121 | coor->push_back(loc); 122 | } 123 | 124 | in.close(); 125 | } 126 | 127 | std::vector Util::tokenize(std::string s, char delim) { 128 | std::vector v; 129 | std::string token = ""; 130 | for (unsigned int i = 0; i < s.size(); i++) { 131 | if (s[i] == delim) { 132 | v.push_back(token); 133 | token = ""; 134 | } else { 135 | token += s[i]; 136 | } 137 | } 138 | v.push_back(token); 139 | 140 | return v; 141 | } 142 | 143 | void Util::readCoordinates(string fileName, 144 | unordered_map *> * coor) { 145 
| checkFile(fileName); 146 | 147 | ifstream in(fileName.c_str()); 148 | string line; 149 | 150 | while (getline(in, line)) { 151 | std::vector splitLine = tokenize(line, '\t'); 152 | 153 | string header = splitLine[0]; 154 | int start = atoi(splitLine[1].c_str()); 155 | int end = atoi(splitLine[2].c_str()); 156 | if (coor->count(header) == 0) { 157 | vector * a = new vector(); 158 | coor->emplace(header, a); 159 | } 160 | Location * loc = new Location(start, end); 161 | coor->at(header)->push_back(loc); 162 | } 163 | 164 | in.close(); 165 | } 166 | 167 | void Util::readCoordinates(string fileName, 168 | unordered_map *> * coor) { 169 | checkFile(fileName); 170 | 171 | ifstream in(fileName.c_str()); 172 | string line; 173 | 174 | while (getline(in, line)) { 175 | std::vector splitLine = tokenize(line, '\t'); 176 | 177 | string header = splitLine[0]; 178 | int start = atoi(splitLine[1].c_str()); 179 | int end = atoi(splitLine[2].c_str()); 180 | if (coor->count(header) == 0) { 181 | deque * a = new deque(); 182 | coor->emplace(header, a); 183 | } 184 | Location * loc = new Location(start, end); 185 | coor->at(header)->push_back(loc); 186 | } 187 | 188 | in.close(); 189 | } 190 | 191 | void Util::readChromList(string genomeDir, vector * chromList, 192 | string ext) { 193 | // This function may not be platform-independent 194 | // Credit: http://www.cplusplus.com/forum/beginner/9173/ 195 | DIR * dirPtr; 196 | 197 | if (!(dirPtr = opendir(genomeDir.c_str()))) { 198 | cerr << "Error with directory: " << genomeDir << endl; 199 | throw std::exception(); 200 | } 201 | 202 | struct dirent * entry; 203 | entry = readdir(dirPtr); 204 | if (entry == NULL) { 205 | cerr << "invalid directory" << endl; 206 | exit(0); 207 | } 208 | while (entry) { 209 | string file(entry->d_name); 210 | // Credit: http://stackoverflow.com/questions/51949/how-to-get-file-extension-from-string-in-c 211 | if (file.substr(file.find_last_of(".") + 1) == ext) { 212 | chromList->push_back(genomeDir + 
fileSeparator + entry->d_name); 213 | } 214 | entry = readdir(dirPtr); 215 | } 216 | 217 | closedir(dirPtr); 218 | } 219 | 220 | // This method will modify the contents of its parameter basePtr! 221 | void Util::toUpperCase(string * basePtr) { 222 | string base = *basePtr; 223 | // Convert alphabet to upper case 224 | for (unsigned int i = 0; i < base.length(); i++) { 225 | base[i] = toupper(base[i]); 226 | } 227 | } 228 | 229 | void Util::toUpperCase(string& base) { 230 | // Convert alphabet to upper case 231 | for (unsigned int i = 0; i < base.length(); i++) { 232 | base[i] = toupper(base[i]); 233 | } 234 | } 235 | 236 | // credit: http://stackoverflow.com/questions/228005/alternative-to-itoa-for-converting-integer-to-string-c 237 | string Util::int2string(int i) { 238 | string s; 239 | stringstream out; 240 | out << i; 241 | s = out.str(); 242 | return s; 243 | } 244 | 245 | // Need to use templates 246 | string Util::double2string(double i) { 247 | string s; 248 | stringstream out; 249 | out << i; 250 | s = out.str(); 251 | return s; 252 | } 253 | 254 | string Util::long2string(long i) { 255 | string s; 256 | stringstream out; 257 | out << i; 258 | s = out.str(); 259 | return s; 260 | } 261 | 262 | void Util::checkFile(string fileName) { 263 | ifstream f1(fileName.c_str()); 264 | if (!f1) { 265 | string message = string("ERROR: "); 266 | message.append(fileName); 267 | message.append(" does not exist.\n"); 268 | throw FileDoesNotExistException(message); 269 | } 270 | f1.close(); 271 | } 272 | 273 | /* 274 | https://stackoverflow.com/questions/18100097/portable-way-to-check-if-directory-exists-windows-linux-c 275 | */ 276 | void Util::checkDir(string dirName) { 277 | string message = string("ERROR: "); 278 | message.append(dirName); 279 | message.append(" does not exist.\n"); 280 | 281 | struct stat info; 282 | 283 | if (stat(dirName.c_str(), &info) != 0) { 284 | throw FileDoesNotExistException(message); 285 | } else if (info.st_mode & S_IFDIR) // S_ISDIR() 
doesn't exist on my windows 286 | { 287 | 288 | } else { 289 | throw FileDoesNotExistException(message); 290 | } 291 | 292 | } 293 | 294 | void Util::deleteFile(string fileName) { 295 | ifstream f1(fileName.c_str()); 296 | if (f1) { 297 | if (remove(fileName.c_str()) != 0) { 298 | cerr << "Could not remove: " << fileName << endl; 299 | } else { 300 | cout << "Deleting: " << fileName << endl; 301 | } 302 | } else { 303 | cerr << "Warning! This file does not exist: " << fileName << endl; 304 | } 305 | f1.close(); 306 | } 307 | 308 | void Util::deleteFilesUnderDirectory(string dirName) { 309 | // This function may not be platform-independent 310 | // Credit: http://www.cplusplus.com/forum/beginner/9173/ 311 | DIR * dirPtr = opendir(dirName.c_str()); 312 | struct dirent * entry; 313 | entry = readdir(dirPtr); 314 | while (entry) { 315 | string file(entry->d_name); 316 | if (file.compare(string(".")) == 0 || file.compare(string("..")) == 0) { 317 | // Skip current and parent directories 318 | } else { 319 | string url = dirName; 320 | url.append(fileSeparator); 321 | url.append(file); 322 | deleteFile(url); 323 | } 324 | entry = readdir(dirPtr); 325 | } 326 | closedir(dirPtr); 327 | } 328 | 329 | bool Util::isOverlapping(int s1, int e1, int s2, int e2) { 330 | if (s1 > e1) { 331 | string msg("Util::isOverlapping. Invalid Input. s1 is "); 332 | msg.append(Util::int2string(s1)); 333 | msg.append(". e1 is "); 334 | msg.append(Util::int2string(e1)); 335 | msg.append("."); 336 | throw InvalidInputException(msg); 337 | } 338 | 339 | if (s2 > e2) { 340 | string msg("Util::isOverlapping. Invalid Input. s2 is "); 341 | msg.append(Util::int2string(s2)); 342 | msg.append(". 
e2 is "); 343 | msg.append(Util::int2string(e2)); 344 | msg.append("."); 345 | throw InvalidInputException(msg); 346 | } 347 | 348 | bool isStartWithin = s2 >= s1 && s2 <= e1; 349 | bool isEndWithin = e2 >= s1 && e2 <= e1; 350 | bool isIncluding = s2 >= s1 && e2 <= e1; 351 | bool isIncluded = s1 >= s2 && e1 <= e2; 352 | bool isAdjacent = (e1 == (s2 + 1)) || (e2 == (s1 + 1)); 353 | 354 | return (isStartWithin || isEndWithin || isIncluding || isIncluded 355 | || isAdjacent); 356 | } 357 | 358 | bool Util::merge(ILocation* a, ILocation* b) { 359 | int s1 = a->getStart(); 360 | int e1 = a->getEnd(); 361 | int s2 = b->getStart(); 362 | int e2 = b->getEnd(); 363 | 364 | bool isStartWithin = s2 >= s1 && s2 <= e1; 365 | bool isEndWithin = e2 >= s1 && e2 <= e1; 366 | bool isIncluding = s2 >= s1 && e2 <= e1; 367 | bool isIncluded = s1 >= s2 && e1 <= e2; 368 | bool isAdjacent = (e1 == (s2 + 1)) || (e2 == (s1 + 1)); 369 | 370 | if (isIncluded) { 371 | a->setStart(s2); 372 | a->setEnd(e2); 373 | } else if (isStartWithin) { 374 | a->setEnd(e2); 375 | } else if (isEndWithin) { 376 | a->setStart(s2); 377 | } 378 | return (isStartWithin || isEndWithin || isIncluding || isIncluded 379 | || isAdjacent); 380 | } 381 | 382 | /** 383 | * The input string is s. 384 | * The reverse complement is rc. 385 | * The start, and the end are inclusive. 386 | */ 387 | void Util::revCompDig(const char * s, int start, int end, string * rc) { 388 | for (int i = end; i >= start; i--) { 389 | char b = s[i]; 390 | switch (b) { 391 | case 0: 392 | rc->append(1, 3); 393 | break; 394 | case 3: 395 | rc->append(1, 0); 396 | break; 397 | case 1: 398 | rc->append(1, 2); 399 | break; 400 | case 2: 401 | rc->append(1, 1); 402 | break; 403 | default: 404 | string msg("Valid codes are 0-3. 
The invalid code is "); 405 | msg.append(1, b); 406 | throw InvalidInputException(msg); 407 | } 408 | } 409 | } 410 | 411 | void Util::revCompDig(string * s, string * rc) { 412 | revCompDig(s->c_str(), 0, s->size() - 1, rc); 413 | } 414 | 415 | void Util::writeFasta(const string& sequence, const string& header, 416 | const string& outputFile) { 417 | ofstream outMask; 418 | outMask.open(outputFile.c_str(), ios::out); 419 | outMask << header << endl; 420 | int step = 50; 421 | int len = sequence.size(); 422 | for (int i = 0; i < len; i = i + step) { 423 | int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1; 424 | for (int k = i; k <= e; k++) { 425 | outMask << sequence[k]; 426 | } 427 | outMask << endl; 428 | } 429 | outMask.close(); 430 | } 431 | 432 | int Util::sumTotalLength(const vector * list) { 433 | int size = list->size(); 434 | int sum = 0; 435 | for (int i = 0; i < size; i++) { 436 | sum += list->at(i)->getLength(); 437 | } 438 | return sum; 439 | } 440 | 441 | string Util::getLargestFile(const string& dirName) { 442 | /* 443 | Adapted from http://www.cplusplus.com/doc/tutorial/files/ 444 | */ 445 | DIR * dirPtr = opendir(dirName.c_str()); 446 | struct dirent * entry; 447 | entry = readdir(dirPtr); 448 | string largestFile = ""; 449 | streampos largestFileSize = -1; 450 | while (entry) { 451 | string file(entry->d_name); 452 | if (file.compare(string(".")) == 0 || file.compare(string("..")) == 0) { 453 | // Skip current and parent directories 454 | } else { 455 | string url = dirName; 456 | url.append(fileSeparator); 457 | url.append(file); 458 | ifstream candidateFile(url, ios::in | ios::binary | ios::ate); 459 | if (candidateFile.is_open()) { 460 | if (candidateFile.tellg() > largestFileSize) { 461 | largestFile = url; 462 | largestFileSize = candidateFile.tellg(); 463 | } 464 | } else { 465 | cout << "Cannot open " << url << "!" 
<< endl; 466 | throw std::exception(); 467 | } 468 | candidateFile.close(); 469 | 470 | } 471 | entry = readdir(dirPtr); 472 | } 473 | closedir(dirPtr); 474 | if (largestFileSize == -1) { 475 | cerr << "There are no files under " << dirName << endl; 476 | throw std::exception(); 477 | } 478 | return largestFile; 479 | } 480 | -------------------------------------------------------------------------------- /src_2.0/utility/Util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Util.h 3 | * 4 | * Created on: Apr 24, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef UTIL_H_ 9 | #define UTIL_H_ 10 | 11 | #include "Location.h" 12 | #include "../exception/FileDoesNotExistException.h" 13 | #include "../exception/InvalidInputException.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | using namespace std; 29 | using namespace utility; 30 | using namespace exception; 31 | 32 | namespace utility { 33 | class Util { 34 | private: 35 | Util(); 36 | ~Util(); 37 | 38 | public: 39 | static string * emptyString; 40 | static string fileSeparator; 41 | static unsigned int CORE_NUM; 42 | static void readFasta(string, vector *, vector *, bool); 43 | static void readFasta(string, vector *, vector *); 44 | static vector tokenize(std::string, char); 45 | static void readCoordinates(string, 46 | unordered_map *> *); 47 | static void readCoordinates(string, 48 | unordered_map *> *); 49 | static void readCoordinates(string, vector *); 50 | // static void readCoordinates(string, RegionList *); 51 | static void readChromList(string, vector *, string); 52 | static void toUpperCase(string*); 53 | static void toUpperCase(string&); 54 | static string int2string(int); 55 | static string double2string(double); 56 | static string long2string(long); 57 | static void deleteFile(string); 58 | static void 
deleteFilesUnderDirectory(string);
    static void checkFile(string);
    static void checkDir(string);
    static bool isOverlapping(int, int, int, int);
    static bool merge(utility::ILocation*, utility::ILocation*);
    static void revCompDig(string *, string *);
    static void revCompDig(const char* sequence, int, int, string *);

    static void writeFasta(const string&, const string&, const string&);

    // NOTE(review): the element type of this vector is missing in this copy
    // of the file (template arguments appear to have been stripped).
    static int sumTotalLength(const vector *);
    static string getLargestFile(const string&);

    /**
     * Delete the objects pointed to by pointers in a vector.
     * It does not delete the vector itself.
     *
     * On return the vector is empty and has been swapped with a freshly
     * constructed one.
     *
     * NOTE(review): the template parameter list and the vector's template
     * arguments are missing in this copy of the file.
     *
     * Credit: http://stackoverflow.com/questions/594089/does-stdvector-clear-do-delete-free-memory-on-each-element
     */
    template
    static void deleteInVector(vector * deleteMe) {
        // Delete from the back so that no element is ever shifted
        while (!deleteMe->empty()) {
            delete deleteMe->back();
            deleteMe->pop_back();
        }

        // Set the size to zero
        deleteMe->clear();

        // Set the capacity to zero
        vector empty;
        deleteMe->swap(empty);
    }
};
}

#endif /* UTIL_H_ */

--------------------------------------------------------------------------------