├── .gitignore ├── CMakeLists.txt ├── README.md └── src ├── cluster ├── CRunner.cpp ├── CRunner.h ├── Center.h ├── ClusterFactory.cpp ├── ClusterFactory.h ├── Trainer.cpp ├── Trainer.h ├── bvec.cpp ├── bvec.h ├── bvec_iterator.cpp ├── bvec_iterator.h └── meshclust2.cpp ├── clutil ├── Clock.cpp ├── Clock.h ├── Datatype.cpp ├── Datatype.h ├── DivergencePoint.cpp ├── DivergencePoint.h ├── Histogram.cpp ├── Histogram.h ├── LCG.h ├── Loader.cpp ├── Loader.h ├── Point.h ├── Progress.cpp ├── Progress.h ├── Random.h ├── SingleFileLoader.cpp └── SingleFileLoader.h ├── exception ├── FileDoesNotExistException.cpp ├── FileDoesNotExistException.h ├── InvalidInputException.cpp ├── InvalidInputException.h ├── InvalidOperationException.cpp ├── InvalidOperationException.h ├── InvalidOrderOfOperationsException.cpp ├── InvalidOrderOfOperationsException.h ├── InvalidScoreException.cpp ├── InvalidScoreException.h ├── InvalidStateException.cpp └── InvalidStateException.h ├── fastcar ├── FC_Runner.cpp ├── FC_Runner.h └── fastcar.cpp ├── nonltr ├── ChromDetector.cpp ├── ChromDetector.h ├── ChromDetectorMaxima.cpp ├── ChromDetectorMaxima.h ├── ChromListMaker.cpp ├── ChromListMaker.h ├── Chromosome.cpp ├── Chromosome.h ├── ChromosomeOneDigit.cpp ├── ChromosomeOneDigit.h ├── ChromosomeOneDigitDna.cpp ├── ChromosomeOneDigitDna.h ├── ChromosomeOneDigitProtein.cpp ├── ChromosomeOneDigitProtein.h ├── ChromosomeRandom.cpp ├── ChromosomeRandom.h ├── DetectorMaxima.cpp ├── DetectorMaxima.h ├── EnrichmentMarkovView.cpp ├── EnrichmentMarkovView.h ├── HMM.cpp ├── HMM.h ├── IChromosome.h ├── ITableView.h ├── KmerHashTable.cpp ├── KmerHashTable.h ├── LocationList.cpp ├── LocationList.h ├── LocationListCollection.cpp ├── LocationListCollection.h ├── RepeatsDetector.cpp ├── Scanner.cpp ├── Scanner.h ├── Scorer.cpp ├── Scorer.h ├── TableBuilder.cpp ├── TableBuilder.h ├── Trainer.cpp └── Trainer.h ├── predict ├── BestFirstSelector.cpp ├── BestFirstSelector.h ├── Feature.cpp ├── Feature.h ├── 
FeatureSelector.cpp ├── FeatureSelector.h ├── GLM.cpp ├── GLM.h ├── GreedySelector.cpp ├── GreedySelector.h ├── HandleSeq.cpp ├── HandleSeq.h ├── Matrix.cpp ├── Matrix.h ├── MultiMute.cpp ├── MultiMute.h ├── Predictor.cpp ├── Predictor.h ├── SingMute.cpp └── SingMute.h └── utility ├── EmptyLocation.cpp ├── EmptyLocation.h ├── GlobAlignE.cpp ├── GlobAlignE.h ├── ILocation.h ├── LCSLen.cpp ├── LCSLen.h ├── Location.cpp ├── Location.h ├── Util.cpp └── Util.h /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.1)

# README.md asks for g++ (the sources rely on OpenMP). Default to it only when
# the user has not chosen a compiler through -DCMAKE_CXX_COMPILER=... or the
# CXX environment variable. The choice must be made before project(), because
# that is where CMake detects and fixes the compiler; setting it afterwards
# (as this file used to do) leaves the detected settings and the compiler
# actually invoked out of sync.
if(NOT CMAKE_CXX_COMPILER AND "$ENV{CXX}" STREQUAL "")
  set(CMAKE_CXX_COMPILER g++)
endif()

project(MeshClust2 LANGUAGES CXX)

# Executables and static libraries are written to <source>/bin (the directory
# listed in .gitignore). CMAKE_BINARY_DIR itself is read-only by convention
# and is no longer overwritten.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bin")
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bin")
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bin")

# These variables only initialise the CXX_STANDARD properties of targets that
# are created afterwards, so they have to precede every add_library() /
# add_executable() call.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)  # -std=c++11, as the old hard-coded flag requested

# -march=native produces binaries tied to the build machine; it stays enabled
# by default (previous behaviour) but can now be switched off with
# -DMESHCLUST_NATIVE_ARCH=OFF.
option(MESHCLUST_NATIVE_ARCH "Optimise for the CPU of the build machine (-march=native)" ON)

find_package(OpenMP REQUIRED)

# Usage requirements shared by every target: include paths, optimisation flags
# and OpenMP. Targets pick them up by linking to this INTERFACE library, so
# no directory-wide state (include_directories, CMAKE_CXX_FLAGS) is needed and
# flags supplied by the user are no longer overwritten.
add_library(meshclust_build_settings INTERFACE)
target_include_directories(meshclust_build_settings INTERFACE
  "${PROJECT_SOURCE_DIR}"
  "${PROJECT_SOURCE_DIR}/src/exception"
  "${PROJECT_SOURCE_DIR}/src/nonltr"
  "${PROJECT_SOURCE_DIR}/src/utility"
  "${PROJECT_SOURCE_DIR}/src/cluster"
  "${PROJECT_SOURCE_DIR}/src/predict"   # was src/prediction, which does not exist
  "${PROJECT_SOURCE_DIR}/src/clutil"
  "${PROJECT_SOURCE_DIR}/src/fastcar"
)
target_compile_options(meshclust_build_settings INTERFACE -g -O3)
if(MESHCLUST_NATIVE_ARCH)
  target_compile_options(meshclust_build_settings INTERFACE -march=native)
endif()
if(TARGET OpenMP::OpenMP_CXX)
  target_link_libraries(meshclust_build_settings INTERFACE OpenMP::OpenMP_CXX)
else()
  # FindOpenMP before CMake 3.9 only provides a flag string; the flag is
  # needed both when compiling and when linking.
  separate_arguments(meshclust_openmp_flags UNIX_COMMAND "${OpenMP_CXX_FLAGS}")
  target_compile_options(meshclust_build_settings INTERFACE ${meshclust_openmp_flags})
  target_link_libraries(meshclust_build_settings INTERFACE ${meshclust_openmp_flags})
endif()

add_library(Fastcar
  src/fastcar/FC_Runner.cpp
)

add_library(ClusterUtil
  src/clutil/DivergencePoint.cpp
  src/clutil/Histogram.cpp
  src/clutil/Loader.cpp
  src/clutil/SingleFileLoader.cpp
  src/clutil/Progress.cpp
  src/clutil/Datatype.cpp
  src/clutil/Clock.cpp
)

add_library(Predict
  src/predict/Feature.cpp
  src/predict/GLM.cpp
  src/predict/HandleSeq.cpp
  src/predict/Matrix.cpp
  src/predict/MultiMute.cpp
  src/predict/Predictor.cpp
  src/predict/SingMute.cpp
  src/predict/FeatureSelector.cpp
  src/predict/GreedySelector.cpp
  src/predict/BestFirstSelector.cpp
)

add_library(Cluster
  src/cluster/ClusterFactory.cpp
  src/cluster/CRunner.cpp
  src/cluster/Trainer.cpp
  src/cluster/bvec.cpp
  src/cluster/bvec_iterator.cpp
)

add_library(Exception
  src/exception/FileDoesNotExistException.cpp
  src/exception/InvalidInputException.cpp
  src/exception/InvalidOperationException.cpp
  src/exception/InvalidOrderOfOperationsException.cpp
  src/exception/InvalidScoreException.cpp
  src/exception/InvalidStateException.cpp
)

# The three headers were previously handed to target_link_libraries() through
# HEADER_FILES, which puts header paths on the link line. They are listed here
# instead: headers in a source list are tracked (and shown by IDE generators)
# but are neither compiled nor linked.
add_library(Nonltr
  src/nonltr/ChromDetectorMaxima.cpp
  src/nonltr/ChromListMaker.cpp
  src/nonltr/Chromosome.cpp
  src/nonltr/ChromosomeOneDigit.cpp
  src/nonltr/ChromosomeOneDigitDna.cpp
  src/nonltr/ChromosomeOneDigitProtein.cpp
  src/nonltr/ChromosomeRandom.cpp
  src/nonltr/DetectorMaxima.cpp
  src/nonltr/HMM.cpp
  src/nonltr/LocationList.cpp
  src/nonltr/LocationListCollection.cpp
  src/nonltr/Scanner.cpp
  src/nonltr/Scorer.cpp
  src/nonltr/TableBuilder.cpp
  src/nonltr/Trainer.cpp
  src/nonltr/KmerHashTable.h
  src/nonltr/EnrichmentMarkovView.h
  src/nonltr/TableBuilder.h
)

add_library(Utility
  src/utility/EmptyLocation.cpp
  src/utility/GlobAlignE.cpp
  src/utility/Location.cpp
  src/utility/Util.cpp
)

# Version strings compiled into the two runner libraries.
target_compile_definitions(Cluster PRIVATE VERSION="2.3.0")
target_compile_definitions(Fastcar PRIVATE VERSION="0.7.1")

add_executable(Red src/nonltr/RepeatsDetector.cpp)
add_executable(meshclust2 src/cluster/meshclust2.cpp)
add_executable(fastcar src/fastcar/fastcar.cpp)

# Library dependencies (same edges as before). PUBLIC keeps them transitive,
# which is what the keyword-less signature did, and also forwards the shared
# build settings to every consumer, including the executables below.
target_link_libraries(Exception   PUBLIC meshclust_build_settings)
target_link_libraries(Utility     PUBLIC Exception meshclust_build_settings)
target_link_libraries(Nonltr      PUBLIC Utility Exception meshclust_build_settings)
target_link_libraries(ClusterUtil PUBLIC Nonltr meshclust_build_settings)
target_link_libraries(Predict     PUBLIC ClusterUtil Nonltr meshclust_build_settings)
target_link_libraries(Cluster     PUBLIC meshclust_build_settings)
target_link_libraries(Fastcar     PUBLIC meshclust_build_settings)

target_link_libraries(Red        PRIVATE Exception Nonltr Utility)
target_link_libraries(meshclust2 PRIVATE Cluster Nonltr ClusterUtil Predict)
target_link_libraries(fastcar    PRIVATE Nonltr ClusterUtil Fastcar Predict)
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
## MeShClust2 2 | 3 | The newest version of MeShClust (v3.0) can be obtained from https://github.com/BioinformaticsToolsmith/Identity.git 4 | 5 | This repository is no longer supported. 6 | 7 | Release version - 2.3.0 8 | 9 | ### Requirements 10 | g++ 4.9.1 or later, requires Homebrew on Mac OS X 11 | Compilation using g++ (homebrew) and CMake on Mac OS X see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite) 12 | 13 | ### Linux/Unix compilation 14 | ``` sh 15 | mkdir bin && cd bin 16 | cmake .. 17 | make 18 | ``` 19 | 20 | ### Citation 21 | If you find this tool helpful, please cite: 22 | 23 | [James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278) 24 | 25 | ### Usage 26 | 27 | Usage: meshclust2 --id 0.x [OPTIONS] *.fasta 28 | 29 | --id The most important parameter, --id, controls the identity cutoff of the sequences. 30 | Needs to be between 0 and 1. 31 | If it is not specified, an identity of 0.9 is used. 32 | 33 | --kmer decides the size of the kmers. It is by default automatically decided by average sequence 34 | length, but if provided, MeShClust can speed up a little by not having to find the largest 35 | sequence length. Increasing kmer size can increase accuracy, but increases memory consumption. 36 | 37 | --dump Run until the classifier is trained, and then dump the weights to the file, 38 | default 'weights.txt'. Can be used with --recover to recover the weights 39 | instead of re-training. 40 | 41 | --recover Recover weights for the classifier trained by a previous run which used --dump to dump 42 | the weights. 43 | 44 | --list Instead of specifying files as extra arguments, provide a text file with 45 | a list of files. 
Can use pipes or process substitutions such as "--list <(ls *.fasta) " 46 | 47 | --no-train-list Same as --list, but these files are not passed to the classifier, 48 | e.g. unassembled genomes 49 | 50 | --mut-type {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation} 51 | changes the mutation generation algorithm. By default, "both" is used, utilizing 52 | single point and block mutations. On higher identity data sets, "single", which includes only single point mutations, 53 | is preferable. The option "nonsingle-typical" uses only block mutations, 54 | disallowing single point mutations. Other options include "all", which includes single, 55 | block, and nontypical mutations translocation and reversion. 56 | 57 | --feat determines the combinations of features to be used. By default, "slow" allows 11 58 | combinations to be selected from. "fast" removes 2 slower features from "slow" 59 | which include logarithm based features. 60 | 61 | --single-file Using this option, (no value is needed), each file is treated as a single sequence. 62 | If multiple sequences in a file are encountered, they are joined with 50 Ns, 63 | and the k-mers are not counted in that region. 64 | However, to be most accurate, it is advised to not use these sequences in the 65 | training step (for mutations) and instead 1) train using un-joined sequences and 66 | use --dump to dump to a file, and 2) use --recover with --single-file for the 67 | file list. 68 | 69 | --sample selects the total number of sequences used for both training and testing. 70 | 2000 is the default value. That is, --sample 2000 provides 2000 training 71 | pairs and 2000 testing pairs. 72 | 73 | --num-templates selects the number of "template" sequences from which to mutate. 
74 | For example, if 300 (the default) templates are requested, and the number of 75 | "samples" is requested to be 2000 (the default), 300 sequences will be read in 76 | and mutated 2000/300 times each to create 2000 semi-synthetic pairs. 77 | 78 | --min-feat (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs 79 | will be used. Recall that features include pairwise combinations of the "feat" option. 80 | 81 | --max-feat (default 4) sets the maximum feature pairs to be used. Diminishing returns appear quickly, 82 | so a very large maximum (>10) is not advised. 83 | 84 | --min-id (default 0.35) sets the lower bound for mutation identity scores to be calculated. 85 | Shouldn't need to be set normally, as lower identities take much longer, 86 | especially with single mutations only. 87 | 88 | --datatype (8,16,32,64) Decides the integer size of the histograms. If not provided, 89 | all sequences are read in and counted to ensure the largest k-mer does not 90 | overflow. If the provided k-mer is too small, it will overflow. 91 | 92 | --threads sets the number of threads to be used. By default OpenMP uses the number of available cores 93 | on your machine, but this parameter overrides that. 94 | 95 | --output specifies the output file, in CD-HIT's CLSTR format, described below: 96 | A '>Cluster ' followed by an increasing index designates a cluster. 97 | Otherwise, the sequence is printed out. 98 | A '*' at the end of a sequence designates the center of the cluster. 99 | An example of a small data set: 100 | 101 | >Cluster 0 102 | 0 993nt, >seq128 template_6... * 103 | >Cluster 1 104 | 0 1043nt, >seq235 template_10... 105 | 1 1000nt, >seq216 template_10... * 106 | 2 1015nt, >seq237 template_10... 107 | 108 | --delta decides how many clusters are looked around in the final clustering stage. 109 | Increasing it creates more accuracy, but takes more time. Default value is 5. 
110 | 111 | --iterations specifies how many iterations in the final stage of merging are done until convergence. 112 | Default value is 15. 113 | 114 | If the argument is not listed here, it is interpreted as an input (FASTA format) file. 115 | 116 | 117 | ### License 118 | 119 | Academic use: The software is provided as-is under the GNU GPLv3. 120 | Any restrictions to use for-profit or non-academics: License needed. 121 | -------------------------------------------------------------------------------- /src/cluster/CRunner.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * CRunner.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef CRUNNER_H 8 | #define CRUNNER_H 9 | 10 | #include 11 | #include 12 | #include "../clutil/Point.h" 13 | #include "../predict/HandleSeq.h" 14 | #include "../predict/Predictor.h" 15 | using namespace std; 16 | /* Command-line driver used by meshclust2's main(): it is constructed from argc/argv and run() supplies the value main() returns. The in-class initializers below are the values used when nothing else assigns the member; several match the defaults documented in the README (id 0.9, iterations 15, delta 5, sample 2000, num-templates 300, min-feat/max-feat 4, min-id 0.35). bandwidth has no initializer. NOTE(review): feat_type starts as PRED_FEAT_FAST although the README describes "slow" as the default for --feat -- confirm which is intended. NOTE(review): pred64 is a raw pointer and the destructor body is empty -- confirm who releases it. */ 17 | class Runner { 18 | public: 19 | Runner(int argc, char** argv); 20 | ~Runner() {}; 21 | int run(); 22 | private: 23 | template int do_run(); 24 | template void print_output(const map*, vector*>*> &m) const; 25 | int k = -1; 26 | int bandwidth; 27 | double similarity = 0.90; 28 | long largest_count = 0; 29 | int iterations = 15; 30 | int delta = 5; 31 | bool align = false; 32 | int total_sample_size = 2000; 33 | int n_templates = 300; 34 | int min_n_feat = 4; 35 | int max_n_feat = 4; 36 | bool is_single_file = false; 37 | double bias = 0; 38 | int mut_type = HandleSeq::BOTH; 39 | uint64_t feat_type = PRED_FEAT_FAST; 40 | double min_id = 0.35; 41 | std::vector files, notrain_files, all_files; 42 | string output = "output.clstr"; 43 | void get_opts(int argc, char** argv); 44 | int find_k(); 45 | 46 | bool dump = false; 47 | bool recover = false; 48 | std::string dump_str = "weights.txt"; 49 | Predictor *pred64 = NULL; 50 | }; 51 | #endif 52 | -------------------------------------------------------------------------------- /src/cluster/Center.h:
-------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Center.h 4 | * 5 | * Author: Benjamin T James 6 | * A cluster center: a clone of the Point passed in (both constructors call clone()) together with a copy of the vector of member point pointers. lazy_remove() only raises is_to_delete; the destructor deletes the cloned center only when that flag is raised. NOTE(review): when the flag is never raised the clone is not freed by this struct -- confirm where it is released. */ 7 | #ifndef CENTER_H 8 | #define CENTER_H 9 | 10 | #include "../clutil/Point.h" 11 | 12 | template 13 | struct Center { 14 | Center(Point* c, const vector*> &pts) : center(c->clone()), points(pts), is_to_delete(false) { 15 | } 16 | Center(const Center &cc) : center(cc.center->clone()), points(cc.points), is_to_delete(cc.is_to_delete) {} 17 | 18 | 19 | // Center(const Center& c) { 20 | // center = c.get_clone(); 21 | // points = c.getPoints_c(); 22 | // is_to_delete = c.is_delete(); 23 | // } 24 | ~Center() { if (is_to_delete) { delete center; }} 25 | 26 | Point* getCenter() { return center; } 27 | vector*> &getPoints() { return points; } 28 | 29 | const vector*> &getPoints_c() const { return points; }; 30 | bool is_delete() const { return is_to_delete; } 31 | void lazy_remove() { is_to_delete = true; } 32 | size_t size() const { return points.size(); } 33 | bool empty() const { return points.empty(); } 34 | Point* get_clone() const { 35 | return center->clone(); 36 | } 37 | Point *center; 38 | vector*> points; 39 | bool is_to_delete; 40 | }; 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /src/cluster/ClusterFactory.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * ClusterFactory.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef CLUSTERFACTORY_H 9 | #define CLUSTERFACTORY_H 10 | 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "../nonltr/ChromosomeOneDigit.h" 17 | #include "../nonltr/KmerHashTable.h" 18 | #include "../clutil/Point.h" 19 | #include "Trainer.h" 20 | #include "bvec.h" 21 | /* Declares the point-building and clustering entry points (build_points, get_histogram, get_divergence_point, find_h, sort_nn, MS); their definitions are not in this header. The constructor stores k_len as k and npp as num_per_partition; npp defaults to the std::numeric_limits maximum. */ 22 | template 23 | class ClusterFactory { 24 | public: 25 | ClusterFactory(int k_len, int npp=std::numeric_limits::max()) : k(k_len), num_per_partition(npp) {} 26 | 
std::vector*> build_points(vector files, std::function*(ChromosomeOneDigit*)> get_point); 27 | Point* get_histogram(ChromosomeOneDigit *chrom); 28 | Point* get_divergence_point(ChromosomeOneDigit *chrom); 29 | T find_h(const std::vector*> ¢ers) const; 30 | void sort_nn(std::vector*> &points, Point* nearest_to=NULL, int arg=3) const; 31 | void MS(bvec &points, T bandwidth, double sim, const Trainer& trn, string output, int iter, int delta); 32 | private: 33 | vector lookup_table; 34 | const int num_per_partition; 35 | int k; 36 | //void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values); 37 | }; 38 | /* fill_table: for every segment of chrom (each segment is a vector whose elements 0 and 1 are read as start and end), calls table.wholesaleIncrement(bases, start, end - k + 1) with k taken from the table, then appends all getMaxTableSize() entries of the table's value array to the end of values. Existing contents of values are kept; reserve() is called first so the append does not reallocate repeatedly. */ 39 | template 40 | void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) 41 | { 42 | const int k = table.getK(); 43 | auto segment = chrom->getSegment(); 44 | const char *seg_bases = chrom->getBase()->c_str(); 45 | for (vector *v : *segment) { 46 | int start = v->at(0); 47 | int end = v->at(1); 48 | table.wholesaleIncrement(seg_bases, start, end - k + 1); 49 | } 50 | unsigned long tableSize = table.getMaxTableSize(); 51 | values.reserve(values.size() + tableSize); 52 | const V * valueArray = table.getValues(); 53 | std::copy(&valueArray[0], &valueArray[tableSize], std::back_inserter(values)); 54 | } 55 | // template 56 | // void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) 57 | // { 58 | // const int k = table.getK(); 59 | // auto segment = chrom->getSegment(); 60 | // const char *seg_bases = chrom->getBase()->c_str(); 61 | // for (vector *v : *segment) { 62 | // int start = v->at(0); 63 | // int end = v->at(1); 64 | // table.wholesaleIncrement(seg_bases, start, end - k + 1); 65 | // } 66 | // std::vector *keys = table.getKeys(); 67 | // for (std::string str : *keys) { 68 | // values.push_back(table.valueOf(str.c_str())); 69 | // } 70 | // keys->clear(); 71 | // delete keys; 72 | // } 73 | 74 | #ifdef HEADER_HACK 75 | #ifndef CLUSTERFACTORY_C 76 | #define CLUSTERFACTORY_C 
77 | #include "ClusterFactory.cpp" 78 | #endif 79 | #endif 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /src/cluster/Trainer.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * Trainer.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "Trainer.h" 8 | #include "../predict/HandleSeq.h" 9 | #include "../clutil/Datatype.h" 10 | #include "../clutil/Loader.h" 11 | #include "ClusterFactory.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "../predict/Predictor.h" 17 | #include "../predict/GLM.h" 18 | #include "../predict/Feature.h" 19 | #include "../clutil/Progress.h" 20 | #include "../clutil/Random.h" 21 | /* get_close: scans the entries in [istart, iend) with an OpenMP parallel loop. Entries whose length lies outside [len(p) * cutoff, len(p) / cutoff] are skipped. For the others it (1) keeps, through the custom "pmax" reduction, the tuple (point, value of feature 0, iterator row, iterator column) with the largest feature-0 value, and (2) replaces the entry by (point, true) when round(classify(pt, p)) > 0. is_min_r receives true only if no visited entry was classified that way. Returns the kept tuple, or (NULL, -1, 0, 0) when no entry passed the length filter. The local ncols is not used. */ 22 | template 23 | std::tuple*,double,size_t,size_t> Trainer::get_close(Point *p, bvec_iterator istart, bvec_iterator iend, bool &is_min_r) const 24 | { 25 | int ncols = weights.getNumRow(); 26 | #pragma omp declare reduction(pmax:std::tuple*,double,size_t,size_t>: \ 27 | omp_out = get<1>(omp_in) > get<1>(omp_out) ? 
omp_in : omp_out ) \ 28 | initializer (omp_priv=std::make_tuple((Point*)NULL,-1,0,0)) 29 | 30 | std::tuple*, 31 | double, 32 | size_t, 33 | size_t> result = std::tuple*, double, size_t, size_t>(NULL, 34 | -1, 35 | 0, 36 | 0); 37 | bool has_found = false; 38 | bool is_min = true; 39 | /* NOTE(review): the length window below uses the raw cutoff, whereas merge() and filter() use get_id(), which rescales cutoffs greater than 1 by 1/100 -- confirm the difference is intended. has_found above is only referenced from commented-out code. */ uint64_t min_len = p->get_length() * cutoff; 40 | uint64_t max_len = p->get_length() / cutoff; 41 | #pragma omp parallel for reduction(pmax:result), reduction(&&:is_min) 42 | for (bvec_iterator i = istart; i < iend; ++i) { 43 | Point* pt = (*i).first; 44 | 45 | uint64_t len = pt->get_length(); 46 | if (len < min_len || len > max_len) { 47 | continue; 48 | } 49 | auto cache = feat->compute(*pt, *p); 50 | double dist = (*feat)(0, cache); 51 | double sum = classify(pt, p); 52 | double res = round(sum) > 0; 53 | // #pragma omp critical 54 | // cout << "Result: " << sum << " raw_sigmoid: " << matrix::GLM::logistic(sum) << " classify_sum: " << Predictor::classify_sum(sum) << " final: " << res << endl; 55 | // NOTE(review): the code below sets second to true when res == 1.0 56 | // (the earlier wording here said "not 1.0") 57 | result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result; 58 | is_min = is_min && (res != 1.0); 59 | // has_found = has_found || (res != 1.0); 60 | if (res == 1.0) { 61 | *i = std::make_pair(pt, true); 62 | // (*i).second = true; 63 | } 64 | } 65 | 66 | // is_min = !has_found; 67 | is_min_r = is_min; 68 | // return get<0>(result); 69 | return result; 70 | 71 | } 72 | /* merge: among centers[begin..last] (both ends inclusive) whose length lies within [len * get_id(), len / get_id()] of centers[current] and whose rounded classify_sum score equals 1, keeps the index with the largest first-feature value and returns it. NOTE(review): the initial best index is 0, so 0 is also returned when no center qualifies -- confirm callers can tell the two cases apart. */ 73 | template 74 | long Trainer::merge(vector > ¢ers, long current, long begin, long last) const 75 | { 76 | #pragma omp declare reduction(ldpmax:std::pair: \ 77 | omp_out = omp_in.second > omp_out.second ? 
omp_in : omp_out ) \ 78 | initializer (omp_priv=std::make_pair(0, std::numeric_limits::min())) 79 | std::pair best = std::make_pair(0, std::numeric_limits::min()); 80 | Point* p = centers[current].getCenter(); 81 | uint64_t cen_length = p->get_length(); 82 | uint64_t min_length = cen_length * get_id(); 83 | uint64_t max_length = cen_length / get_id(); 84 | #pragma omp parallel for reduction(ldpmax:best) 85 | for (long i = begin; i <= last; i++) { 86 | /* weights(0,0) is the constant term; weights(col,0) multiplies feature col-1 */ double sum = weights.get(0, 0); 87 | double dist = 0; 88 | 89 | Point* cen = centers[i].getCenter(); 90 | uint64_t cen_len = cen->get_length(); 91 | bool length_pass = cen_len >= min_length && cen_len <= max_length; 92 | if (length_pass) { 93 | auto cache = feat->compute(*cen, *p); 94 | for (int col = 1; col < weights.getNumRow(); col++) { 95 | double d = (*feat)(col-1, cache); 96 | if (col == 1) { 97 | /* the first feature value doubles as the ranking key */ dist = d; 98 | } 99 | sum += weights.get(col, 0) * d; 100 | } 101 | double res = round(Predictor::classify_sum(sum)); 102 | 103 | if (res == 1) { 104 | best = best.second > dist ? 
best : std::make_pair(i, dist); 105 | } 106 | } 107 | } 108 | return best.first; 109 | } 110 | /* classify: weights(0,0) plus the sum over col >= 1 of weights(col,0) * feature(col-1) computed for the pair (a, b), passed through Predictor::classify_sum. */ 111 | template 112 | double Trainer::classify(Point*a, Point*b) const 113 | { 114 | double sum = weights.get(0, 0); 115 | auto cache = feat->compute(*a, *b); 116 | for (int col = 1; col < weights.getNumRow(); col++) { 117 | sum += weights.get(col, 0) * (*feat)(col-1, cache); 118 | } 119 | return Predictor::classify_sum(sum); 120 | } 121 | /* filter: erases from vec every entry whose length is outside [len(p) * get_id(), len(p) / get_id()] or for which round(classify(p, entry)) equals 0. The bool of each pair is used as the removal mark and overwritten here. */ 122 | template 123 | void Trainer::filter(Point *p, vector *, bool> > &vec) const 124 | { 125 | uint64_t cen_length = p->get_length(); 126 | uint64_t min_length = cen_length * get_id(); 127 | uint64_t max_length = cen_length / get_id(); 128 | for (auto& pt : vec) { 129 | uint64_t pt_len = pt.first->get_length(); 130 | bool length_pass = pt_len >= min_length && pt_len <= max_length; 131 | pt.second = true; 132 | if (length_pass) { 133 | double sum = classify(p, pt.first); 134 | double res = round(sum); 135 | pt.second = (res == 0); 136 | } 137 | } 138 | vec.erase(std::remove_if(vec.begin(), vec.end(), [](pair*, bool> p) { 139 | return p.second; 140 | }), vec.end()); 141 | } 142 | /* closest: returns the point of vec with the smallest distance_d to p (the first one wins on ties), or NULL when vec is empty. The local sum is computed but never used. */ 143 | template 144 | Point* Trainer::closest(Point *p, vector *, bool> > &vec) const 145 | { 146 | Point* best_pt = NULL; 147 | double best_dist = 0; 148 | for (auto& pt : vec) { 149 | double sum = weights.get(0, 0); 150 | double dist = pt.first->distance_d(*p); 151 | if (best_pt == NULL || dist < best_dist) { 152 | best_dist = dist; 153 | best_pt = pt.first; 154 | } 155 | } 156 | return best_pt; 157 | } 158 | /* train(dump_str): builds a Predictor from dump_str and adopts its feature object and GLM weights; the previous feat is deleted and set_save(false) is called on the new one. */ 159 | template 160 | void Trainer::train(std::string dump_str) 161 | { 162 | Predictor pred(dump_str); 163 | delete feat; 164 | auto pr = pred.get_class(); 165 | feat = pr.first; 166 | feat->set_save(false); 167 | matrix::GLM glm = pr.second; 168 | weights = glm.get_weights(); 169 | } 170 | /* train(...): trains a new Predictor on the stored points and adopts its features and weights. acc_cutoff is not referenced in the body. */ 171 | template 172 | void Trainer::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff) 173 | 
{ 174 | std::cout << "Splitting data" << endl; 175 | uintmax_t _id = points.size(); 176 | Predictor pred(k, cutoff, PRED_MODE_CLASS, feat_type, 177 | mut_type, min_n_feat, max_n_feat, min_id); 178 | pred.train(points, _id, n_samples, n_templates); 179 | delete feat; 180 | auto pr = pred.get_class(); 181 | feat = pr.first; 182 | matrix::GLM glm = pr.second; 183 | weights = glm.get_weights(); 184 | /* A non-empty dump_str means "save and quit": the whole process exits right after the weights are written. Otherwise the weights are still saved, to weights.txt. */ 185 | if (dump_str != "") { 186 | pred.save(dump_str, Datatype::get()); 187 | exit(0); 188 | } else { 189 | pred.save("weights.txt", Datatype::get()); 190 | } 191 | } 192 | 193 | template class Trainer; 194 | template class Trainer; 195 | template class Trainer; 196 | template class Trainer; 197 | template class Trainer; 198 | template class Trainer; 199 | -------------------------------------------------------------------------------- /src/cluster/Trainer.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * Trainer.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef TRAINER_H 9 | #define TRAINER_H 10 | 11 | #include "../clutil/Point.h" 12 | #include "../predict/GLM.h" 13 | #include "../predict/Feature.h" 14 | #include "../predict/Predictor.h" 15 | #include "bvec.h" 16 | #include "Center.h" 17 | #include 18 | /* Holds the trained classifier (feature object plus weight matrix) used by the clustering code. The constructor copies the point vector and allocates feat; the destructor deletes it, and both train() overloads replace it. The constructor parameter largest_count and the local size are not used. NOTE(review): feat is an owning raw pointer and no copy constructor or assignment operator is declared, so copying a Trainer would delete feat twice -- confirm instances are never copied. */ 19 | template 20 | class Trainer { 21 | public: 22 | Trainer(std::vector*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, int ksize) : points(v), n_samples(num_points), cutoff(cutoff_), n_templates(max_pts_from_one_), k(ksize) { 23 | uintmax_t size = 1000 * 1000 * 10; 24 | feat = new Feature(k); 25 | }; 26 | ~Trainer() { delete feat; } 27 | void train(std::string); 28 | void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff=97.5); 29 | 30 | std::tuple*,double,size_t,size_t> get_close(Point*, bvec_iterator istart, bvec_iterator iend, bool& is_min) const; 31 | 32 | void 
filter(Point*, vector*,bool> >&) const; 33 | Point* closest(Point*, vector*,bool> >&) const; 34 | long merge(vector > ¢ers, long current, long begin, long end) const; 35 | /* get_id: the cutoff may be given as a fraction or as a percentage; values greater than 1 are divided by 100. */ 36 | double get_id() const { return cutoff > 1 ? cutoff / 100.0 : cutoff; } 37 | private: 38 | double classify(Point*, Point*) const; 39 | matrix::Matrix weights; 40 | Feature *feat; 41 | std::vector*> points; 42 | size_t n_samples, n_templates; 43 | double cutoff; 44 | int k; 45 | }; 46 | #endif 47 | -------------------------------------------------------------------------------- /src/cluster/bvec.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * bvec.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #ifndef BVEC_H 8 | #define BVEC_H 9 | 10 | #include "../clutil/Point.h" 11 | #include "bvec_iterator.h" 12 | /* Position inside a bvec. Presumably first is the bin (row) and second the offset inside that bin -- TODO confirm against bvec.cpp. is_empty defaults to false. */ 13 | typedef struct bvec_idx { 14 | size_t first, second; 15 | bool is_empty = false; 16 | } bvec_idx_t; 17 | 18 | /* 19 | * operations needed: 20 | * 21 | * find bounds (range) 22 | * get available or min and remove 23 | * 24 | */ 25 | template 26 | using bv_data_type = std::pair*, bool>; 27 | 28 | template 29 | using bv_row_type = vector >; 30 | 31 | template 32 | using bv_col_type = vector >; 33 | /* Two-level container: data is a vector of rows, each row a vector of (Point pointer, bool) pairs. The constructor takes a list of lengths and a bin size (default 1000); begin_bounds is presumably the lower length bound of each row -- TODO confirm against bvec.cpp. Only declarations are given in this header. */ 34 | template 35 | class bvec { 36 | public: 37 | bvec(vector& lengths, uint64_t bin_size=1000); 38 | 39 | Point* pop(); 40 | Point* peek() const; 41 | void insert(Point* data); 42 | void insert_finalize(); /* sorts bins */ 43 | 44 | 45 | bool index_of(uint64_t length, size_t* front, size_t* back) const; 46 | bool inner_index_of(uint64_t length, size_t& idx, size_t *front, size_t *back) const; 47 | bool empty() const; 48 | 49 | std::pair 50 | get_range(uint64_t begin_len, uint64_t end_len) const; 51 | 52 | void remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector*> &); 53 | 54 | uint64_t absolute_idx(bvec_idx_t idx) const; 55 | 56 | bvec_iterator iter(bvec_idx_t idx); 57 | typedef bvec_iterator iterator; 58 | typedef 
bvec_iterator const_iterator; 59 | 60 | size_t report() const; 61 | size_t size() const; 62 | 63 | void erase(size_t r, size_t c); 64 | private: 65 | bv_col_type data; 66 | vector begin_bounds; 67 | }; 68 | 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /src/cluster/bvec_iterator.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * bvec_iterator.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "bvec_iterator.h" 8 | /* Pre-increment: moves to the next column of the current row, or, when the row is exhausted, to column 0 of the next non-empty row. The end position is r == col->size() with c == 0. Incrementing an iterator that is already at the end prints a message to cerr and throws the int 10. The updated iterator is returned by value. */ 9 | template 10 | bvec_iterator bvec_iterator::operator++() 11 | { 12 | if (r != col->size()) { 13 | if (c + 1 < col->at(r).size()) { 14 | c++; 15 | } else { 16 | r++; 17 | c = 0; 18 | while (r < col->size() && col->at(r).empty()) { 19 | r++; 20 | } 21 | } 22 | } else { 23 | cerr << "tried incrementing null iterator" << endl; 24 | throw 10; 25 | } 26 | return *this; 27 | } 28 | 29 | template class bvec_iterator; 30 | template class bvec_iterator; 31 | template class bvec_iterator; 32 | template class bvec_iterator; 33 | template class bvec_iterator; 34 | template class bvec_iterator; 35 | -------------------------------------------------------------------------------- /src/cluster/bvec_iterator.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * bvec_iterator.h 4 | * 5 | * Author: Benjamin T James 6 | * Iterator over a vector of rows addressed by (row r, column c). NOTE(review): bvec.h is included before this file's include guard, and bvec.h includes this file in turn -- confirm the include order works from both entry points. NOTE(review): the postfix operator++ returns the already-advanced iterator, not the previous position that postfix increment conventionally yields. */ 7 | #include "bvec.h" 8 | #ifndef BVEC_ITERATOR_H 9 | #define BVEC_ITERATOR_H 10 | 11 | 12 | template 13 | class bvec_iterator { 14 | public: 15 | // iterator: split ALL possible points into chunks by indices 16 | using dtype = std::pair*,bool>; 17 | using vtype = vector >; 18 | bvec_iterator(size_t _r, 19 | size_t _c, 20 | vtype* col_) : r(_r), c(_c), col(col_) {} 21 | 22 | bvec_iterator operator++(); 23 | bvec_iterator operator++(int x) { 24 | return ++(*this); 25 | } 26 | dtype& operator*() { 27 | return col->at(r).at(c); 28 | } 29 | void 
operator+=(int64_t n) { 30 | /* advances by n single steps (linear in n); a negative n throws the string literal "oops" */ if (n < 0) { 31 | throw "oops"; 32 | } 33 | for (int i = 0; i < n; i++) { 34 | operator++(); 35 | } 36 | } 37 | /* All comparisons look only at (r, c); the col pointers are not compared. Ordering is by row first, then by column. */ bool operator==(const bvec_iterator& rhs) const { 38 | return rhs.c == c && rhs.r == r; 39 | } 40 | bool operator<(const bvec_iterator& rhs) const { 41 | if (r < rhs.r) { 42 | return true; 43 | } else if (r == rhs.r) { 44 | return c < rhs.c; 45 | } else { 46 | return false; 47 | } 48 | } 49 | bool operator<=(const bvec_iterator& rhs) const { 50 | if (r < rhs.r) { 51 | return true; 52 | } else if (r == rhs.r) { 53 | return c <= rhs.c; 54 | } else { 55 | return false; 56 | } 57 | } 58 | bool operator!=(const bvec_iterator& rhs) const { 59 | return r != rhs.r || c != rhs.c; 60 | } 61 | /* Number of elements from rhs up to *this, counted across rows using the row sizes of this iterator's col; negative when *this precedes rhs. */ int64_t operator-(const bvec_iterator& rhs) const { 62 | int64_t sum = 0; 63 | if (*this < rhs) { 64 | return -1 * (rhs - *this); 65 | } 66 | // subtract cols until last row is reached 67 | if (r == rhs.r) { 68 | return c - rhs.c; 69 | } 70 | sum += c; 71 | sum += col->at(rhs.r).size() - rhs.c; 72 | for (size_t i = rhs.r + 1; i < r; i++) { 73 | sum += col->at(i).size(); 74 | } 75 | return sum; 76 | } 77 | // bvec_iterator operator[](uint64_t idx) { 78 | 79 | // } 80 | //private: 81 | size_t r,c; 82 | vtype* col; 83 | }; 84 | #endif 85 | -------------------------------------------------------------------------------- /src/cluster/meshclust2.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * meshclust2.cpp 4 | * 5 | * Author: Benjamin T James 6 | * Program entry point: builds a Runner from the command line and returns the value of Runner::run() as the exit status. */ 7 | #include "CRunner.h" 8 | int main(int argc, char **argv) 9 | { 10 | Runner runner(argc, argv); 11 | return runner.run(); 12 | } 13 | -------------------------------------------------------------------------------- /src/clutil/Clock.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * Clock.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #include "Clock.h" 9 | #include 
10 | #include 11 | 12 | static const auto _begin = std::chrono::system_clock::now(); 13 | /* Prints "timestamp <desc> <elapsed>" to stdout, where elapsed is the time between the initialization of _begin and this call. */ 14 | void Clock::stamp(std::string desc) 15 | { 16 | auto end = std::chrono::system_clock::now(); 17 | std::chrono::duration diff = end - _begin; 18 | std::cout << "timestamp " << desc << " " << diff.count() << std::endl; 19 | } 20 | -------------------------------------------------------------------------------- /src/clutil/Clock.h: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * Clock.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef CLOCK_H 9 | #define CLOCK_H 10 | #include 11 | 12 | class Clock { 13 | public: 14 | static void stamp(std::string desc); 15 | }; 16 | #endif 17 | -------------------------------------------------------------------------------- /src/clutil/Datatype.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * Datatype.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #include "Datatype.h" 9 | std::string _dt_datatype = ""; 10 | /* Returns the datatype name currently stored in the file-level _dt_datatype string. */ 11 | std::string Datatype::get() 12 | { 13 | return _dt_datatype; 14 | } 15 | /* Overwrites the stored datatype name with s. */ 16 | void Datatype::set(std::string s) 17 | { 18 | _dt_datatype = s; 19 | } 20 | -------------------------------------------------------------------------------- /src/clutil/Datatype.h: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * Datatype.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef DATATYPE_H 9 | #define DATATYPE_H 10 | #include 11 | 12 | class Datatype { 13 | public: 14 | static std::string get(); 15 | static void set(std::string s); 16 | }; 17 | #endif 18 | -------------------------------------------------------------------------------- /src/clutil/DivergencePoint.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * DivergencePoint.cpp 4 | * 5 | * Author:
Benjamin T James 6 | * 7 | * Main histogram type, includes distance() which is intersection() in Feature.cpp 8 | */ 9 | #include "DivergencePoint.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | template 17 | double DivergencePoint::prob_under(Point &p) const 18 | { 19 | const DivergencePoint& c = dynamic_cast&>(p); 20 | double sum = 0; 21 | const size_t s = points.size(); 22 | double total = 0; 23 | std::feclearexcept(FE_OVERFLOW); 24 | std::feclearexcept(FE_UNDERFLOW); 25 | for (int i = 0; i < s; i++) { 26 | sum += c.points[i]; 27 | if (i % 4 == 3) { 28 | for (int j = i - 3; j <= i; j++) { 29 | double prob = c.points[j] / sum; 30 | double log_prob = log(prob); 31 | total += (points[j] - 1) * log_prob; 32 | if ((bool)std::fetestexcept(FE_UNDERFLOW)) { 33 | cout << "Underflow!" << endl; 34 | } 35 | // cond.push_back(log(prob)/log4); 36 | } 37 | sum = 0; 38 | } 39 | } 40 | // for (size_t q = 0; q < s; q += 4) { 41 | // double sum = 0; 42 | // for (int i = q; i < q + 4; i++) { 43 | // sum += c.points[i]; 44 | // } 45 | // for (int i = q; i < q + 4; i++) { 46 | // double prob = c.points[i] / sum; 47 | // double log_prob = log(prob); 48 | // total += (points[i] - 1) * log_prob; 49 | // } 50 | // } 51 | return exp(total / s); 52 | } 53 | 54 | template 55 | double DivergencePoint::distance_d(Point& p) const 56 | { 57 | const DivergencePoint& c = dynamic_cast&>(p); 58 | uint64_t dist = 0; 59 | uint64_t mag = 0; 60 | for (auto i = 0; i < points.size(); i++) { 61 | dist += 2 * min(points[i],(T)round(c.points[i])); 62 | mag += points[i] + c.points[i]; 63 | } 64 | double frac = (double)dist / mag; 65 | return 10000.0 * (1.0 - frac * frac); 66 | } 67 | 68 | 69 | template 70 | uint64_t DivergencePoint::distance(const Point& p) const 71 | { 72 | const DivergencePoint& c = dynamic_cast&>(p); 73 | uint64_t dist = 0; 74 | const uint64_t mag = getPseudoMagnitude() + c.getPseudoMagnitude(); 75 | #pragma omp simd 76 | for (auto i = 0; i < points.size(); i++) { 
77 | dist += min(points[i], c.points[i]); 78 | } 79 | dist *= 2; 80 | double frac = (double)dist / mag; 81 | return 10000.0 * (1.0 - frac * frac); 82 | } 83 | /* 1-mer overlap: sum over the first 4 entries of min(a[i], b[i]), divided by the sum of this point's own 1-mer counts. */ 84 | template 85 | double DivergencePoint::distance_k1(const Point &p) const 86 | { 87 | uint64_t dist = 0; 88 | 89 | auto a = Point::get_1mers(), b = p.get_1mers(); 90 | uint64_t mag = 0; 91 | for (auto i = 0; i < 4; i++) { 92 | dist += std::min(a[i], b[i]); 93 | mag += a[i]; 94 | } 95 | return (double)dist / (double)mag; 96 | 97 | } 98 | template 99 | DivergencePoint::DivergencePoint(const std::vector& pts, uint64_t len) 100 | { // Copies pts into points and caches their sum in mag; len is stored as nucl_length. 101 | mag = 0; 102 | for (unsigned int i = 0; i < pts.size(); i++) { 103 | points.push_back(pts.at(i)); 104 | mag += pts.at(i); 105 | } 106 | // display(); 107 | nucl_length = len; 108 | to_delete = false; 109 | id = 0; s_dev = 0; // s_dev was left indeterminate until set_stddev() was called 110 | } 111 | 112 | /* Builds a histogram of `size` zero bins; every scalar member starts from a defined value. */ 113 | template 114 | DivergencePoint::DivergencePoint(unsigned int size) 115 | { 116 | for (unsigned int i = 0; i < size; i++) { 117 | points.push_back(0); 118 | } 119 | to_delete = false; 120 | nucl_length = 0; 121 | id = 0; mag = 0; s_dev = 0; // mag is the sum of the bins (all zero here); it was previously never initialized 122 | } 123 | /* Scales every bin by d in place. The cached mag is not updated. */ 124 | template 125 | void DivergencePoint::operator*=(double d) 126 | { 127 | unsigned int size = points.size(); 128 | for (auto& pt : points) { 129 | pt *= d; 130 | } 131 | } 132 | /* True only when every bin, up to the shorter of the two lengths, is strictly less than the matching bin of p. */ 133 | template 134 | bool DivergencePoint::operator<(Point& p) const 135 | { 136 | const DivergencePoint& h = dynamic_cast&>(p); 137 | unsigned int size = std::min(points.size(),h.points.size()); 138 | /*int boundary = 0; 139 | for (unsigned int i = 0; i < size; i++) { 140 | if (points.at(i) > h.points.at(i)) { 141 | boundary++; 142 | } else if (points.at(i) < h.points.at(i)) { 143 | boundary--; 144 | } 145 | } 146 | return boundary < 0;*/ 147 | for (unsigned int i = 0; i < size; i++) { 148 | if (points.at(i) >= h.points.at(i)) { 149 | return false; 150 | } 151 | } 152 | return true; 153 | } 154 | 155 | template 156 | void DivergencePoint::operator/=(double d) 157 | { 158 | unsigned int size = points.size(); 159 | for (unsigned int i
= 0; i < size; i++) { 160 | points[i] /= d; 161 | } 162 | // cout << endl; 163 | } 164 | /* Adds p's bins to this point's bins, up to the shorter of the two lengths. The cached mag is not updated. */ 165 | template 166 | void DivergencePoint::operator+=(Point& p) 167 | { 168 | const DivergencePoint& h = dynamic_cast&>(p); 169 | unsigned int size = std::min(points.size(),h.points.size()); 170 | for (unsigned int i = 0; i < size; i++) { 171 | points.at(i) += h.points.at(i); 172 | } 173 | } 174 | /* Alias for distance(p). */ 175 | template 176 | uint64_t DivergencePoint::operator-(const Point& p) const 177 | { 178 | return distance(p); 179 | } 180 | /* Copies bins, length, to_delete, header and id from p, then re-derives the cached bin total so getPseudoMagnitude()/distance() match the copied bins. */ 181 | template 182 | void DivergencePoint::set(Point& p) 183 | { 184 | const DivergencePoint& h = dynamic_cast&>(p); 185 | points = std::vector(h.points); 186 | set_length(h.get_length()); 187 | to_delete = h.to_delete; 188 | Point::set_header(h.get_header()); 189 | set_id(h.get_id()); mag = 0; for (const auto& v : points) { mag += v; } // mag used to keep the total of the old bins 190 | } 191 | /* Prints the bins to stdout, space separated, followed by a newline. */ 192 | template 193 | void DivergencePoint::display() const 194 | { 195 | unsigned size = points.size(); 196 | for (unsigned i = 0; i < size; i++) { 197 | std::cout << points.at(i) << " "; 198 | } 199 | std::cout << std::endl; 200 | } 201 | /* Sets every bin to 0. The cached mag is not updated. */ 202 | template 203 | void DivergencePoint::zero() 204 | { 205 | for (auto &i : points) { 206 | i = 0; 207 | } 208 | } 209 | /* Increments every bin. The cached mag is not updated. */ 210 | template 211 | void DivergencePoint::addOne() 212 | { 213 | for (auto& a : points) { 214 | a++; 215 | } 216 | } 217 | /* Decrements every bin. The cached mag is not updated. */ 218 | template 219 | void DivergencePoint::subOne() 220 | { 221 | for (auto& a : points) { 222 | a--; 223 | } 224 | } 225 | 226 | /* 227 | * p(y|x) = cond_p 228 | * q(y|x) = cond_q 229 | */ 230 | template 231 | double DivergencePoint::divergence(Point& p) const 232 | { 233 | const DivergencePoint& d = dynamic_cast&>(p); 234 | double sum4_p = 0, sum4_q = 0; // Sum for every 4 nucleotides; double so four counts of a narrow T cannot wrap 235 | double total_sum_p = 0, total_sum_q = 0; // Total running sum of all nucleotides 236 | double outer_sum_p = 0, outer_sum_q = 0; // Prior K-mer sum 237 | for (int i = 0; i < points.size(); i++) { // Compute divergence for P and Q simultaneously 238 | sum4_p += points[i]; 239 | sum4_q += d.points[i]; 240
| if (i % 4 == 3) { //finished counting word, now compute probabilities 241 | double inner_sum_p = 0; // Sum of p(X|Y) * log(p(X|Y) / q(X|Y)) 242 | double inner_sum_q = 0; // Sum of q(X|Y) * log(q(X|Y) / p(X|Y)) 243 | for (int j = i - 3; j <= i; j++) { 244 | double conditional_p = (double)points[j] / sum4_p; // force floating-point division: with integral T the quotient truncated to 0 or 1 245 | double conditional_q = (double)d.points[j] / sum4_q; 246 | double lg = log(conditional_p) - log(conditional_q); 247 | inner_sum_p += conditional_p * lg; 248 | inner_sum_q += -1 * conditional_q * lg; 249 | } 250 | outer_sum_p += sum4_p * inner_sum_p; 251 | outer_sum_q += sum4_q * inner_sum_q; 252 | 253 | total_sum_p += sum4_p; 254 | total_sum_q += sum4_q; 255 | sum4_p = 0; 256 | sum4_q = 0; 257 | } 258 | } 259 | double left = outer_sum_p / total_sum_p; 260 | double right = outer_sum_q / total_sum_q; 261 | return (left + right) / 2.0; 262 | } 263 | /* Returns the cached bin total (mag). */ 264 | template 265 | uint64_t DivergencePoint::getPseudoMagnitude() const 266 | { 267 | return mag; 268 | } 269 | 270 | /* Returns mag minus the number of bins; presumably this removes one pseudo-count per bin - TODO confirm against the table initialization. */ 271 | template 272 | uint64_t DivergencePoint::getRealMagnitude() const 273 | { 274 | return mag - points.size(); 275 | } 276 | 277 | #ifndef HEADER_HACK 278 | template class DivergencePoint; 279 | template class DivergencePoint; 280 | template class DivergencePoint; 281 | template class DivergencePoint; 282 | template class DivergencePoint; 283 | template class DivergencePoint; 284 | #endif 285 | -------------------------------------------------------------------------------- /src/clutil/DivergencePoint.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * DivergencePoint.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Header for most often used k-mer histogram type 8 | */ 9 | #ifndef DIVERGENCE_POINT_H 10 | #define DIVERGENCE_POINT_H 11 | #include "Point.h" 12 | #include 13 | template 14 | class DivergencePoint : public Point { 15 | public: 16 | DivergencePoint(const std::vector& pts, uint64_t len); 17 | DivergencePoint(unsigned int size); 18 |
~DivergencePoint() { points.clear(); } 19 | void operator*=(double d); 20 | void operator/=(double d); 21 | uint64_t operator-(const Point& p) const; 22 | bool operator<(Point& p) const; 23 | void operator+=(Point& p); 24 | void set(Point& p); 25 | void display() const; 26 | void zero(); 27 | void addOne(); 28 | void subOne(); 29 | double prob_under(Point& p) const; 30 | uint64_t getRealMagnitude() const; 31 | uint64_t getPseudoMagnitude() const; 32 | // T magnitude() const { return getRealMagnitude(); }; 33 | double distance_k1(const Point& p) const; 34 | double get_stddev() const { return s_dev; }; 35 | DivergencePoint* clone() const { // Heap-allocates a copy of the bins, header, id, length, stddev and data string. NOTE(review): to_delete and the 1-mer counts are not carried over - confirm that is intended. 36 | auto d = new DivergencePoint(points, nucl_length); // second constructor argument is the sequence length; the bool to_delete used to be passed here 37 | d->set_header(Point::get_header()); 38 | d->set_id(get_id()); 39 | d->set_length(get_length()); 40 | d->set_stddev(get_stddev()); 41 | d->set_data_str(Point::get_data_str()); 42 | return d; 43 | } 44 | DivergencePoint* create() const { 45 | return new DivergencePoint(points.size()); 46 | } 47 | Point* create_double() const { 48 | vector v; 49 | for (auto val : points) { 50 | v.push_back(val); 51 | } 52 | return new DivergencePoint(v, nucl_length); 53 | } 54 | void set_arg_to_this_d(Point& p) const { 55 | DivergencePoint& c = dynamic_cast< DivergencePoint&>(p); 56 | for (int i = 0; i < points.size(); i++) { 57 | c.points[i] = points[i]; 58 | } 59 | c.set_id(id); 60 | }; 61 | 62 | 63 | bool is_to_delete() const { 64 | return to_delete; 65 | } 66 | void set_to_delete(bool b) { 67 | to_delete = b; 68 | } 69 | double divergence(Point& p) const; 70 | double distance_d(Point& p) const; 71 | uint64_t distance(const Point& p) const; 72 | const vector& get_data() const { return points; } 73 | void set_id(uintmax_t c_id) { id = c_id; }; 74 | const uintmax_t get_id() const { return id; }; 75 | 76 | void set_length(unsigned long len) { nucl_length = len; }; 77 | void set_stddev(double s_dev_) { s_dev = s_dev_; }; 78 | unsigned long get_length() const { return nucl_length; }; 79 |
unsigned long size() const { return points.size(); }; 80 | std::vector points; 81 | 82 | private: 83 | uintmax_t mag; 84 | bool to_delete; 85 | uint64_t id; 86 | uint64_t nucl_length; 87 | double s_dev; 88 | }; 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /src/clutil/Histogram.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Histogram.cpp 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Artifact from early development of MeShClust 8 | */ 9 | #ifndef HEADER_HACK 10 | #include "Histogram.h" 11 | #endif 12 | 13 | #include 14 | #include 15 | 16 | template 17 | double Histogram::distance_k1(const Point &p) const 18 | { 19 | throw "Not implemented"; 20 | const Histogram& h = dynamic_cast&>(p); 21 | uint64_t dist = 0; 22 | auto size = std::min(points.size(),h.points.size()); 23 | /* 24 | for (unsigned int i = 0; i < size; i++) { 25 | T l = points.at(i); 26 | T r = h.points.at(i); 27 | dist += (l > r) ? 
(l - r) : (r - l); 28 | } 29 | */ 30 | uint64_t avg_mag = (magnitude() + h.magnitude()) / 2.0; 31 | for (auto i = 0; i < size; i++) { 32 | T l = points[i]; 33 | T r = h.points[i]; 34 | dist += min(l, r); 35 | } 36 | return 1.0 - dist / avg_mag; 37 | } 38 | template 39 | Histogram::Histogram(std::vector pts, char mark) 40 | { 41 | for (T t : pts) { 42 | points.push_back(t); 43 | } 44 | to_delete = false; 45 | } 46 | template 47 | Histogram::Histogram(std::vector pts) 48 | { 49 | for (T t : pts) { 50 | points.push_back(t); 51 | } 52 | to_delete = false; 53 | } 54 | 55 | template 56 | Histogram::Histogram(std::vector pts, bool toDelete) 57 | { 58 | for (T t : pts) { 59 | points.push_back(t); 60 | } 61 | to_delete = toDelete; 62 | } 63 | 64 | template 65 | Histogram::Histogram(unsigned int size) 66 | { 67 | for (unsigned int i = 0; i < size; i++) { 68 | points.push_back(0); 69 | } 70 | to_delete = false; 71 | } 72 | 73 | template 74 | void Histogram::operator*=(double d) 75 | { 76 | for (T &t : points) { 77 | t *= d; 78 | } 79 | } 80 | 81 | template 82 | bool Histogram::operator<(Point& p) const 83 | { 84 | const Histogram& h = dynamic_cast&>(p); 85 | unsigned int size = std::min(points.size(),h.points.size()); 86 | for (unsigned int i = 0; i < size; i++) { 87 | if (points.at(i) >= h.points.at(i)) { 88 | return false; 89 | } 90 | } 91 | return true; 92 | } 93 | 94 | template 95 | void Histogram::operator/=(double d) 96 | { 97 | unsigned int size = points.size(); 98 | for (unsigned int i = 0; i < size; i++) { 99 | points.at(i) = points.at(i) / d; 100 | } 101 | } 102 | 103 | template 104 | void Histogram::operator+=(Point& p) 105 | { 106 | const Histogram& h = dynamic_cast&>(p); 107 | unsigned int size = std::min(points.size(),h.points.size()); 108 | for (unsigned int i = 0; i < size; i++) { 109 | points.at(i) += h.points.at(i); 110 | } 111 | } 112 | 113 | template 114 | uint64_t Histogram::operator-(const Point& p) const 115 | { 116 | return distance(p); 117 | } 118 | 
119 | template 120 | void Histogram::set(Point& p) 121 | { 122 | const Histogram& h = dynamic_cast&>(p); 123 | points = h.points; 124 | } 125 | 126 | template 127 | void Histogram::display() const 128 | { 129 | unsigned size = points.size(); 130 | for (unsigned i = 0; i < size; i++) { 131 | std::cout << points.at(i) << " "; 132 | } 133 | std::cout << std::endl; 134 | } 135 | 136 | template 137 | void Histogram::addOne() 138 | { 139 | for (auto &a : points) { 140 | a++; 141 | } 142 | } 143 | template 144 | void Histogram::subOne() 145 | { 146 | for (auto &a : points) { 147 | a--; 148 | } 149 | } 150 | 151 | template 152 | void Histogram::zero() 153 | { 154 | for (typename std::vector::iterator it = points.begin(); it != points.end(); ++it) { 155 | *it = 0; 156 | } 157 | } 158 | 159 | template 160 | uint64_t Histogram::distance(const Point& p) const 161 | { 162 | /* 163 | // Vectors should be the same width 164 | const Histogram& h = dynamic_cast&>(p); 165 | T dist = 0; 166 | unsigned int size = std::min(points.size(),h.points.size()); 167 | for (unsigned int i = 0; i < size; i++) { 168 | T l = points.at(i); 169 | T r = h.points.at(i); 170 | dist += (l > r) ? 
(l - r) : (r - l); 171 | } 172 | return dist; 173 | */ 174 | throw "Not implemented"; 175 | return 0; 176 | } 177 | 178 | template 179 | uint64_t Histogram::magnitude() const 180 | { 181 | uint64_t dist = 0; 182 | for (auto const& p : points) { 183 | dist += p; 184 | } 185 | return dist; 186 | } 187 | 188 | #ifndef HEADER_HACK 189 | template class Histogram; 190 | template class Histogram; 191 | template class Histogram; 192 | template class Histogram; 193 | template class Histogram; 194 | template class Histogram; 195 | #endif 196 | -------------------------------------------------------------------------------- /src/clutil/Histogram.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Histogram.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Artifact from early development of MeShClust 8 | */ 9 | #ifndef HISTOGRAM_H 10 | #define HISTOGRAM_H 11 | #include 12 | #include "Point.h" 13 | 14 | template 15 | class Histogram : public Point { 16 | public: 17 | Histogram(std::vector pts); 18 | Histogram(std::vector pts, char marker); 19 | Histogram(std::vector pts, bool to_delete); 20 | Histogram(unsigned int size); 21 | ~Histogram() {} 22 | void operator*=(double d); 23 | void operator/=(double d); 24 | uint64_t operator-(const Point& p) const; 25 | bool operator<(Point& p) const; 26 | void operator+=(Point& p); 27 | void set(Point& p); 28 | void display() const; 29 | void zero(); 30 | void addOne(); 31 | void subOne(); 32 | double distance_k1(const Point& p) const; 33 | double prob_under(Point& p) const { return distance(p); }; 34 | uint64_t distance(const Point& p) const; 35 | uint64_t magnitude() const; 36 | uint64_t getRealMagnitude() const { return 0; }; 37 | double distance_d(Point& p) const { 38 | throw "not implemented"; 39 | return 0; 40 | } 41 | void set_arg_to_this_d(Point& p) const { 42 | throw "not implemented"; 43 | } 44 | Point* create_double() const { 45 | throw "not implemented"; 46 | return 
NULL; 47 | } 48 | Histogram* clone() const { 49 | return new Histogram(points, to_delete); 50 | } 51 | Histogram* create() const { 52 | return new Histogram(points.size()); 53 | } 54 | bool is_to_delete() const { 55 | return to_delete; 56 | } 57 | void set_to_delete(bool b) { 58 | to_delete = b; 59 | } 60 | const vector& get_data() const { return points; } 61 | void set_id(uintmax_t c_id) { id = c_id; }; 62 | const uintmax_t get_id() const { return id; }; 63 | void set_length(unsigned long len) { nucl_length = len; }; 64 | unsigned long get_length() const { return nucl_length; }; 65 | unsigned long size() const { return points.size(); }; 66 | private: 67 | std::vector points; 68 | bool to_delete; 69 | uintmax_t id; 70 | unsigned long nucl_length; 71 | }; 72 | /* With HEADER_HACK defined the template definitions are pulled in from Histogram.cpp; the guard macro tested and the one defined must be the same name. */ 73 | #ifdef HEADER_HACK 74 | #ifndef HISTOGRAM_C 75 | #define HISTOGRAM_C 76 | #include "Histogram.cpp" 77 | #endif 78 | #endif 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /src/clutil/LCG.h: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * LCG.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef LCG_H 9 | #define LCG_H 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | class LCG { 17 | public: 18 | LCG(uint64_t seed_) { 19 | seed = seed_; 20 | } 21 | /* Returns random() % max converted to T, or 0 when max is 0. */ 22 | template 23 | T randMod(T max) { 24 | if (max == 0) { 25 | return 0; 26 | } else { 27 | uint64_t x = random() % max; 28 | return (T)x; 29 | } 30 | } 31 | /* Advances the generator and returns the new state. */ 32 | uint64_t nextRandSeed() { 33 | return random(); 34 | } 35 | double rand_between(double id, double range, double low, double high) { 36 | uint64_t rnd = random(); 37 | double res = (double)rnd / std::numeric_limits::max(); 38 | double mn = std::max(id - range, low); 39 | double mx = std::min(id + range, high); 40 | return mn + (mx - mn) * res; 41 | } 42 | uint64_t random() { 43 | // MMIX random, from
https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use 44 | // Not synchronized: seed is updated with a plain read-modify-write, so this is only safe when each thread uses its own LCG instance 45 | seed = seed * 6364136223846793005 + 1442695040888963407; 46 | return seed; 47 | } 48 | private: 49 | uint64_t seed; 50 | }; 51 | #endif 52 | -------------------------------------------------------------------------------- /src/clutil/Loader.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Loader.cpp 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Class which can 'preload' chunks of sequences from a file list, 8 | * and then count the k-mers separately, which can be done in 9 | * multiple threads 10 | */ 11 | #include "Loader.h" 12 | #include "Datatype.h" 13 | /* Incremented by fill_table each time wholesaleIncrementNoOverflow returns -1; read by get_warning(). The increment is not synchronized. */ 14 | static uint64_t num_overflow = 0; 15 | std::string next_histogram(std::string cur_type) 16 | { 17 | if (cur_type == "uint8_t") { 18 | return "uint16_t"; 19 | } else if (cur_type == "uint16_t") { 20 | return "uint32_t"; 21 | } else { 22 | return "uint64_t"; 23 | } 24 | } 25 | /* Returns "" when num_overflow is 0, otherwise an advisory naming the current datatype and the next wider one from next_histogram(). */ 26 | template 27 | std::string Loader::get_warning() 28 | { 29 | if (num_overflow == 0) { 30 | return ""; 31 | } else { 32 | std::ostringstream oss; 33 | oss << "For " << num_overflow << " sequences, the histogram type " << Datatype::get() << " was too small for holding sequences."
<< endl; 34 | oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Datatype::get()) << ")" << endl; 35 | return oss.str(); 36 | } 37 | } 38 | 39 | 40 | 41 | template 42 | void Loader::fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values) 43 | { 44 | const int k = table.getK(); 45 | auto segment = chrom->getSegment(); 46 | const char *seg_bases = chrom->getBase()->c_str(); 47 | for (vector *v : *segment) { 48 | int start = v->at(0); 49 | int end = v->at(1); 50 | 51 | // Hani Z Girgis added the following line 52 | // It is possible 53 | if(end - start + 1 >= k){ 54 | int r = table.wholesaleIncrementNoOverflow(seg_bases, start, end - k + 1); 55 | if (r == -1) { 56 | num_overflow++; 57 | // #pragma omp critical 58 | // { 59 | // std::ostringstream oss; 60 | // oss << "In header \"" << chrom->getHeader() << "\"" << endl; 61 | // oss << "Histogram type " << Runner::get_datatype() << " is too small for holding sequences." << endl; 62 | // oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Runner::get_datatype()) << ")" << endl; 63 | // _loader_warning = oss.str(); 64 | // cerr << get_warning() << endl; 65 | // } 66 | } 67 | } 68 | } 69 | std::string header = chrom->getHeader(); 70 | header = header.substr(1, header.find(' ')-1); 71 | // Hani Z. Girgis added the following lines on 10/3/2018 72 | // This should result in significant speed up. 73 | unsigned long tableSize = table.getMaxTableSize(); 74 | values.reserve(values.size() + tableSize); 75 | const V * valueArray = table.getValues(); 76 | 77 | copy(&valueArray[0], &valueArray[tableSize], back_inserter(values)); 78 | 79 | // Commented out by Hani Z. 
Girgis on 10/3/2018 and replaced by the code above 80 | // std::vector *keys = table.getKeys(); 81 | // for (std::string str : *keys) { 82 | // values.push_back(table.valueOf(str.c_str())); 83 | // } 84 | // keys->clear(); 85 | // delete keys; 86 | } 87 | 88 | template 89 | bool Loader::done() const 90 | { 91 | return file_idx == files.size(); 92 | } 93 | 94 | template 95 | void Loader::preload(int tid) 96 | { 97 | if (file_idx == files.size()) { 98 | return; 99 | } 100 | for (uint64_t j = 0; j < chunk_size; j++) { 101 | auto chrom = next(); 102 | if (chrom.first == "") { 103 | return; 104 | } 105 | cache_list.at(tid).emplace_back(chrom.first, chrom.second); 106 | } 107 | } 108 | 109 | 110 | // Modified by Hani Z. Girgis on Oct 2, 2018 111 | template 112 | Point* Loader::get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq) 113 | { 114 | ostringstream obase; 115 | for (int i = 0; i < base.length(); i++) { 116 | if (base[i] == 'A' || base[i] == 'C' || 117 | base[i] == 'G' || base[i] == 'T') { 118 | obase << base[i]; 119 | } 120 | } 121 | ChromosomeOneDigit * chrom; 122 | if(Util::isDna){ 123 | chrom = new ChromosomeOneDigitDna(); 124 | }else{ 125 | chrom = new ChromosomeOneDigitProtein(); 126 | } 127 | 128 | chrom->setHeader(header); 129 | chrom->appendToSequence(obase.str()); 130 | chrom->finalize(); 131 | Point *p = Loader::get_point(chrom, id, k, set_seq); 132 | delete chrom; 133 | return p; 134 | } 135 | 136 | // Modified by Hani Z. Girgis on Oct 2, 2018 137 | template 138 | Point* Loader::get_point(ChromosomeOneDigit* chrom, uintmax_t& id, int k, bool set_seq) 139 | { 140 | 141 | KmerHashTable table(k, 1); 142 | // Hani Z. 
Girgis changed the following line 143 | // The table_k1 was initialized from 0 now it is 1 144 | KmerHashTable table_k1(1, 1); 145 | std::vector values; 146 | vector values_k1; 147 | // values.clear(); 148 | 149 | Loader::fill_table(table, chrom, values); 150 | Loader::fill_table(table_k1, chrom, values_k1); 151 | // int tmplate = get_template(chrom->getHeader(), templates); 152 | Point *p = new DivergencePoint(values, chrom->size()); 153 | // cout << "mag: " << ((DivergencePoint*)p)->getPseudoMagnitude() << std::endl; 154 | p->set_1mers(values_k1); 155 | p->set_header(chrom->getHeader()); 156 | p->set_length(chrom->getEffectiveSize()); 157 | if (set_seq) { 158 | p->set_data_str(*chrom->getBase()); 159 | } 160 | // Added by Hani Z. Girgis on Oct 7 2018 161 | p->setK(k); 162 | DivergencePoint* q = dynamic_cast*>(p); 163 | const auto N = q->points.size(); 164 | double aq = (double) q->getPseudoMagnitude() / N; 165 | double sq = 0; 166 | for (auto i = 0; i < N; i++) { 167 | double qdiff = q->points[i] - aq; 168 | sq += qdiff * qdiff; 169 | } 170 | sq = sqrt(sq / N); 171 | q->set_stddev(sq); 172 | p->set_id(id); 173 | #pragma omp atomic 174 | id++; 175 | 176 | // Clean 177 | 178 | return p; 179 | } 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | template 188 | std::vector*> Loader::load_next(int tid) 189 | { 190 | std::vector*> points; 191 | for (size_t i = 0; i < cache_list.at(tid).size(); i++) { 192 | auto pr = cache_list.at(tid).at(i); 193 | Point* p = get_point(pr.first, *pr.second, id_list.at(tid), k); 194 | points.push_back(p); 195 | delete pr.second; 196 | } 197 | cache_list.at(tid).clear(); 198 | return points; 199 | } 200 | 201 | template 202 | std::pair Loader::next() 203 | { 204 | auto n = maker->next(); 205 | if (n.first != "") { 206 | return n; 207 | } 208 | delete maker; 209 | maker = NULL; 210 | file_idx++; 211 | if (file_idx >= files.size()) { 212 | return n; 213 | } 214 | maker = new SingleFileLoader(files.at(file_idx)); 215 | return maker->next(); 216 
| } 217 | 218 | template class Loader; 219 | template class Loader; 220 | template class Loader; 221 | template class Loader; 222 | template class Loader; 223 | template class Loader; 224 | -------------------------------------------------------------------------------- /src/clutil/Loader.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Loader.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Class which can 'preload' chunks of sequences from a file list, 8 | * and then count the k-mers separately, which can be done in 9 | * multiple threads 10 | */ 11 | #ifndef LOADER_H 12 | #define LOADER_H 13 | 14 | #include 15 | 16 | #include "SingleFileLoader.h" 17 | #include "Point.h" 18 | #include "DivergencePoint.h" 19 | #include "../nonltr/KmerHashTable.h" 20 | // Add by Hani Z. Girgis, PhD on Oct 2, 2018 21 | #include "../nonltr/ChromosomeOneDigit.h" 22 | #include "../nonltr/ChromosomeOneDigitDna.h" 23 | #include "../nonltr/ChromosomeOneDigitProtein.h" 24 | 25 | 26 | 27 | template 28 | class Loader { 29 | public: 30 | Loader(std::vector files_, 31 | uint64_t total_num_points_, 32 | uint64_t chunk_size_, 33 | int num_threads_, 34 | int k_, 35 | uint64_t start_id=0) 36 | : 37 | chunk_size(chunk_size_), 38 | num_threads(num_threads_), 39 | k(k_), 40 | files(files_) { 41 | 42 | maker = new SingleFileLoader(files.at(0)); 43 | uint64_t total_id = start_id; 44 | for (int i = 0; i < num_threads_; i++) { 45 | id_list.push_back(total_id); 46 | total_id += total_num_points_; 47 | cache_list.push_back(std::vector >()); 48 | } 49 | // preload(); 50 | }; 51 | 52 | ~Loader() { 53 | if (get_warning() != "") { 54 | cerr << get_warning() << endl; 55 | } 56 | cache_list.clear(); 57 | id_list.clear(); 58 | if (maker != NULL) { 59 | delete maker; 60 | } 61 | } 62 | 63 | // single threaded 64 | void preload(int tnum); 65 | 66 | bool done() const; 67 | // multi-thread accessible 68 | std::vector*> load_next(int tid); 69 | 70 | 
static Point* get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq=true); 71 | static Point* get_point(ChromosomeOneDigit* dna, uintmax_t& id, int k, bool set_seq=true); 72 | 73 | static void fill_table(KmerHashTable &table, ChromosomeOneDigit *chrom, std::vector& values); 74 | static std::string get_warning(); 75 | private: 76 | 77 | std::pair next(); 78 | 79 | uint64_t chunk_size; 80 | int num_threads, k; 81 | 82 | std::vector > > cache_list; 83 | std::vector id_list; 84 | 85 | std::vector files; 86 | size_t file_idx = 0; 87 | SingleFileLoader *maker = NULL; 88 | 89 | }; 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /src/clutil/Point.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Point.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * For some reason this class was made pure virtual 8 | * in early development of MeShClust, making Histogram 9 | * and DivergencePoint both derivatives that essentially 10 | * did the same thing 11 | */ 12 | #ifndef POINT_H 13 | #define POINT_H 14 | 15 | #include 16 | #include "../nonltr/ChromosomeOneDigit.h" 17 | 18 | /* 19 | * Pure virtual class that defines behavior for 20 | * points. 
Has clone() and create() that allow for 21 | * polymorphic behavior 22 | */ 23 | template 24 | class Point { 25 | public: 26 | virtual ~Point() { data.clear(); }; 27 | virtual void operator*=(double d) = 0; 28 | virtual void operator/=(double d) = 0; 29 | virtual bool operator<(Point& p) const = 0; 30 | virtual uint64_t operator-(const Point& p) const = 0; 31 | virtual void operator+=(Point& p) = 0; 32 | virtual void set(Point& p) = 0; 33 | virtual void display() const = 0; 34 | virtual uint64_t distance(const Point& p) const = 0; 35 | virtual double distance_d(Point& p) const = 0; 36 | virtual Point* clone() const = 0; 37 | virtual Point* create() const = 0; 38 | 39 | virtual void zero() = 0; 40 | virtual void addOne() = 0; 41 | virtual double distance_k1(const Point& p) const = 0; 42 | virtual double prob_under(Point& center) const = 0; 43 | virtual void subOne() = 0; 44 | virtual uint64_t getRealMagnitude() const = 0; 45 | // virtual T magnitude() const = 0; 46 | virtual bool is_to_delete() const = 0; 47 | virtual void set_to_delete(bool b) = 0; 48 | 49 | virtual Point* create_double() const = 0; 50 | virtual void set_arg_to_this_d(Point& p) const = 0; 51 | 52 | virtual const vector& get_data() const = 0; 53 | 54 | void set_header(const std::string c) { header = string(c); }; 55 | const std::string get_header() const { return header; }; 56 | 57 | void set_data_str(const std::string& c) { data = c; }; 58 | const std::string & get_data_str() const { return data; }; 59 | 60 | void set_1mers(const vector &vec) { 61 | // for (auto i = 0; i < Util::getAlphabetSize(); i++) { 62 | // one_mers[i] = vec[i]; 63 | // } 64 | one_mers = vector(vec); 65 | } 66 | 67 | vector get_1mers() const { 68 | // vector vec; 69 | // for (auto i = 0; i < Util::getAlphabetSize(); i++) { 70 | // vec.push_back(one_mers[i]); 71 | // } 72 | // return vec; 73 | return one_mers; 74 | } 75 | virtual unsigned long size() const = 0; 76 | virtual void set_id(uintmax_t c_id) = 0;//{ id = c_id; }; 77 | 
virtual const uintmax_t get_id() const = 0;//{ return id; }; 78 | virtual void set_length(unsigned long len) = 0; 79 | virtual unsigned long get_length() const = 0; 80 | 81 | // Added by Hani Z. Girgis on Oct 7 2018 82 | int getK(){ 83 | return k; 84 | } 85 | void setK(int k){ 86 | this->k = k; 87 | } 88 | 89 | private: 90 | vector one_mers; 91 | std::string header; 92 | std::string data; 93 | // Added by Hani Z. Girgis on Oct 7 2018 94 | // The k in k-mer used to build the table 95 | int k; 96 | }; 97 | 98 | #endif 99 | -------------------------------------------------------------------------------- /src/clutil/Progress.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Progress.cpp 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Progress bar that uses carriage return '\r' 8 | * to seek to the beginning of a line to redraw 9 | */ 10 | #include "Progress.h" 11 | #include 12 | Progress::Progress(long num, std::string prefix_) 13 | { 14 | pmax = num; 15 | ended = 0; 16 | pcur = 0; 17 | old_prog = -1; 18 | prefix = prefix_; 19 | barWidth = 70 - (prefix.size()+1); 20 | print(); 21 | } 22 | 23 | void Progress::print() 24 | { 25 | #ifndef NOPROG 26 | double prog = (double)pcur / pmax; 27 | if (old_prog != int(prog * 100)) { 28 | std::cout << prefix << " ["; 29 | int pos = barWidth * prog; 30 | for (int i = 0; i < barWidth; i++) { 31 | if (i < pos) { 32 | std::cout << "="; 33 | } else if (i == pos) { 34 | std::cout << ">"; 35 | } else { 36 | std::cout << " "; 37 | } 38 | } 39 | std::cout << "] " << int(prog * 100.0) << " %\r"; 40 | std::cout.flush(); 41 | } 42 | old_prog = int(prog * 100); 43 | #endif 44 | } 45 | 46 | void Progress::end() 47 | { 48 | if (!ended) { 49 | pcur = pmax; 50 | print(); 51 | std::cout << std::endl; 52 | } 53 | ended = true; 54 | } 55 | 56 | 57 | void Progress::set(int num) 58 | { 59 | pcur = num; 60 | print(); 61 | } 62 | 63 | void Progress::operator++() 64 | { 65 | pcur++; 66 | 
print(); 67 | } 68 | void Progress::operator++(int) 69 | { 70 | print(); 71 | pcur++; 72 | } 73 | 74 | 75 | void Progress::operator+=(size_t num) 76 | { 77 | pcur += num; 78 | print(); 79 | } 80 | -------------------------------------------------------------------------------- /src/clutil/Progress.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Progress.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Progress bar that uses carriage return '\r' 8 | * to seek to the beginning of a line to redraw 9 | * 10 | */ 11 | #include 12 | #ifndef PROGRESS_H 13 | #define PROGRESS_H 14 | 15 | class Progress { 16 | public: 17 | Progress(long num, std::string prefix_); 18 | ~Progress() { end(); } 19 | void end(); 20 | void operator++(); 21 | void operator++(int); 22 | void operator+=(size_t); 23 | void set(int); 24 | private: 25 | void print(); 26 | long pmax; 27 | long pcur; 28 | long old_prog; 29 | bool ended; 30 | std::string prefix; 31 | int barWidth; 32 | }; 33 | #endif 34 | -------------------------------------------------------------------------------- /src/clutil/Random.h: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * Random.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef RANDOM_H 9 | #define RANDOM_H 10 | #include 11 | #include 12 | class Random { 13 | public: 14 | Random(std::random_device::result_type seed=0xAA) : mt(seed) {} 15 | 16 | template 17 | T randMod(T max) { 18 | T res; 19 | #pragma omp critical 20 | { 21 | if (max == 0) { 22 | res = 0; 23 | } else { 24 | std::uniform_int_distribution distribution(0, max-1); 25 | res = distribution(mt); 26 | } 27 | } 28 | return res; 29 | } 30 | 31 | double random() { 32 | double res = 0; 33 | #pragma omp critical 34 | { 35 | std::uniform_real_distribution distribution(0.0, 1.0); 36 | res = distribution(mt); 37 | } 38 | return res; 39 | } 40 | double rand_between(double id, 
double range, double low, double high) { 41 | double res = 0; 42 | #pragma omp critical 43 | { 44 | double mn = std::max(id - range, low); 45 | double mx = std::min(id + range, high); 46 | std::uniform_real_distribution distribution(mn, mx); 47 | 48 | res = distribution(mt); 49 | } 50 | return res; 51 | } 52 | std::random_device::result_type nextRandSeed() { 53 | using rt = std::random_device::result_type; 54 | return randMod(std::numeric_limits::max()); 55 | } 56 | std::mt19937& gen() { return mt; } 57 | private: 58 | std::mt19937 mt; 59 | 60 | }; 61 | #endif 62 | -------------------------------------------------------------------------------- /src/clutil/SingleFileLoader.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * SingleFileLoader.cpp 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Reads sequences one by one from a file 8 | */ 9 | #include "SingleFileLoader.h" 10 | #include 11 | #include 12 | 13 | std::istream& safe_getline(std::istream& is, std::string& t) 14 | { 15 | t.clear(); 16 | std::istream::sentry se(is, true); 17 | std::streambuf* sb = is.rdbuf(); 18 | for(;;) { 19 | int c = sb->sbumpc(); 20 | switch (c) { 21 | case '\n': 22 | return is; 23 | case '\r': 24 | if (sb->sgetc() == '\n') { 25 | sb->sbumpc(); 26 | } 27 | return is; 28 | case std::streambuf::traits_type::eof(): 29 | if (t.empty()) { 30 | is.setstate(std::ios::eofbit); 31 | } 32 | return is; 33 | default: 34 | t += (char)c; 35 | } 36 | } 37 | } 38 | 39 | 40 | SingleFileLoader::SingleFileLoader(std::string filename) 41 | { 42 | in = new std::ifstream(filename); 43 | is_first = true; 44 | } 45 | std::pair SingleFileLoader::next() 46 | { 47 | std::pair ret = std::make_pair("", (std::string*)NULL); 48 | if (!in->good()) { 49 | return ret; 50 | } 51 | clock_t begin = clock(); 52 | ret.second = new std::string(""); 53 | if (is_first) { 54 | safe_getline(*in, buffer); 55 | is_first = false; 56 | } 57 | do { 58 | if (buffer[0] == 
'>') { 59 | if (ret.first != "") { 60 | return ret; 61 | } 62 | ret.first = buffer; 63 | } else if (buffer[0] == ' ' || buffer[0] == '\t') { 64 | bool all_spaces = true; 65 | for (auto c : buffer) { 66 | if (c != ' ' && c != '\t') { 67 | all_spaces = false; 68 | } 69 | } 70 | if (!all_spaces) { 71 | std::ostringstream oss; 72 | oss << ret.first << buffer; 73 | std::string new_header = oss.str(); 74 | ret.first = new_header; 75 | } 76 | } else { 77 | ret.second->append(buffer); 78 | } 79 | safe_getline(*in, buffer); 80 | } while (in->good()); 81 | double diff = clock() - begin; 82 | // std::cout << "next(): " << diff / CLOCKS_PER_SEC << std::endl; 83 | return ret; 84 | } 85 | ChromosomeOneDigitDna* SingleFileLoader::nextChrom() 86 | { 87 | ChromosomeOneDigitDna* ret = NULL; 88 | if (!in->good()) { 89 | return ret; 90 | } 91 | if (is_first) { 92 | safe_getline(*in, buffer); 93 | is_first = false; 94 | } 95 | do { 96 | if (buffer[0] == '>') { 97 | if (ret != NULL) { 98 | ret->finalize(); 99 | return ret; 100 | } 101 | ret = new ChromosomeOneDigitDna(); 102 | ret->setHeader(buffer); 103 | } else if (buffer[0] == ' ' || buffer[0] == '\t') { 104 | bool all_spaces = true; 105 | for (auto c : buffer) { 106 | if (c != ' ' && c != '\t') { 107 | all_spaces = false; 108 | } 109 | } 110 | if (!all_spaces) { 111 | std::ostringstream oss; 112 | oss << ret->getHeader() << buffer; 113 | std::string new_header = oss.str(); 114 | ret->setHeader(new_header); 115 | } 116 | } else { 117 | ret->appendToSequence(buffer); 118 | } 119 | safe_getline(*in, buffer); 120 | } while (in->good()); 121 | ret->finalize(); 122 | return ret; 123 | } 124 | -------------------------------------------------------------------------------- /src/clutil/SingleFileLoader.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * SingleFileLoader.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * A way of reading in 1 sequence at a time 8 | * from FASTA, 
sequence is heap allocated 9 | */ 10 | #ifndef SINGLEFILELOADER_H 11 | #define SINGLEFILELOADER_H 12 | 13 | #include 14 | #include "../nonltr/ChromosomeOneDigitDna.h" 15 | class SingleFileLoader { 16 | public: 17 | SingleFileLoader(std::string file); 18 | ~SingleFileLoader() { 19 | if (in != NULL) { 20 | delete in; 21 | } 22 | } 23 | std::pair next(); 24 | ChromosomeOneDigitDna* nextChrom(); 25 | private: 26 | std::ifstream *in; 27 | std::string buffer; 28 | bool is_first; 29 | }; 30 | #endif 31 | -------------------------------------------------------------------------------- /src/exception/FileDoesNotExistException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FileDoesNotExistException.cpp 3 | * 4 | * Created on: Apr 30, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "FileDoesNotExistException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception{ 16 | 17 | FileDoesNotExistException::FileDoesNotExistException(string massage) { 18 | cerr << "File Does Not Exist Exception" << endl; 19 | cerr << massage << endl; 20 | } 21 | 22 | FileDoesNotExistException::~FileDoesNotExistException() { 23 | // TODO Auto-generated destructor stub 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/exception/FileDoesNotExistException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FileDoesNotExistException.h 3 | * 4 | * Created on: Apr 30, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef FILEDOESNOTEXISTEXCEPTION_H_ 9 | #define FILEDOESNOTEXISTEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception { 16 | class FileDoesNotExistException { 17 | public: 18 | FileDoesNotExistException(string); 19 | ~FileDoesNotExistException(); 20 | }; 21 | } 22 | 23 | #endif /* FILEDOESNOTEXISTEXCEPTION_H_ */ 24 | 
-------------------------------------------------------------------------------- /src/exception/InvalidInputException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidInputException.cpp 3 | * 4 | * Created on: May 1, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "InvalidInputException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | namespace exception{ 15 | 16 | InvalidInputException::InvalidInputException(string msg) { 17 | cerr << "Invalid Input Exception" << endl; 18 | cerr << msg << endl; 19 | } 20 | 21 | InvalidInputException::~InvalidInputException() { 22 | // TODO Auto-generated destructor stub 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/exception/InvalidInputException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidInputException.h 3 | * 4 | * Created on: May 1, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDINPUTEXCEPTION_H_ 9 | #define INVALIDINPUTEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception { 16 | class InvalidInputException { 17 | public: 18 | InvalidInputException(string); 19 | ~InvalidInputException(); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDINPUTEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src/exception/InvalidOperationException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOperationException.cpp 3 | * 4 | * Created on: Dec 20, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include 9 | #include "InvalidOperationException.h" 10 | 11 | 12 | namespace exception { 13 | 14 | InvalidOperationException::InvalidOperationException(string msg) : std::runtime_error(msg) { 15 | cerr << "Invalid Operation Exception." 
<< endl; 16 | cerr << what() << endl; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/exception/InvalidOperationException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOperationException.h 3 | * 4 | * Created on: Dec 20, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDOPERATIONEXCEPTION_H_ 9 | #define INVALIDOPERATIONEXCEPTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | namespace exception { 17 | 18 | class InvalidOperationException : public std::runtime_error{ 19 | public: 20 | InvalidOperationException(string msg); 21 | //virtual ~InvalidOperationException(); 22 | }; 23 | 24 | } 25 | 26 | #endif /* INVALIDOPERATIONEXCEPTION_H_ */ 27 | -------------------------------------------------------------------------------- /src/exception/InvalidOrderOfOperationsException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOrderOfOperationsException.cpp 3 | * 4 | * Created on: Apr 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "InvalidOrderOfOperationsException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | namespace exception{ 15 | 16 | InvalidOrderOfOperationsException::InvalidOrderOfOperationsException(string massage) { 17 | cerr << "Invalid Order Of Operations Exception" << endl; 18 | cerr << massage << endl; 19 | } 20 | 21 | InvalidOrderOfOperationsException::~InvalidOrderOfOperationsException() { 22 | // TODO Auto-generated destructor stub 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/exception/InvalidOrderOfOperationsException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidOrderOfOperationsException.h 3 | * 4 | * Created on: Apr 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD 
6 | */ 7 | 8 | #ifndef INVALIDORDEROFOPERATIONSEXCEPTION_H_ 9 | #define INVALIDORDEROFOPERATIONSEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception{ 16 | class InvalidOrderOfOperationsException { 17 | public: 18 | InvalidOrderOfOperationsException(string); 19 | ~InvalidOrderOfOperationsException(); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDORDEROFOPERATIONSEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src/exception/InvalidScoreException.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidScoreException.cpp 3 | * 4 | * Created on: Apr 27, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "InvalidScoreException.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | namespace exception{ 15 | 16 | InvalidScoreException::InvalidScoreException(string massage) { 17 | cerr << "Invalid Score Exception." << endl; 18 | cerr << massage << endl; 19 | } 20 | 21 | InvalidScoreException::~InvalidScoreException() { 22 | // TODO Auto-generated destructor stub 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/exception/InvalidScoreException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidScoreException.h 3 | * 4 | * Created on: Apr 27, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDSCOREEXCEPTION_H_ 9 | #define INVALIDSCOREEXCEPTION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace exception{ 16 | class InvalidScoreException { 17 | public: 18 | InvalidScoreException(string); 19 | virtual ~InvalidScoreException(); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDSCOREEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src/exception/InvalidStateException.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * InvalidStateException.cpp 3 | * 4 | * Created on: Aug 9, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include 9 | #include 10 | #include "InvalidStateException.h" 11 | 12 | using namespace std; 13 | 14 | 15 | namespace exception { 16 | InvalidStateException::InvalidStateException(string msg) : 17 | std::runtime_error(msg) { 18 | cerr << "Invalid State Exception." << endl; 19 | cerr << what() << endl; 20 | } 21 | } 22 | 23 | //InvalidStateException::~InvalidStateException() { 24 | // TODO Auto-generated destructor stub 25 | //} 26 | -------------------------------------------------------------------------------- /src/exception/InvalidStateException.h: -------------------------------------------------------------------------------- 1 | /* 2 | * InvalidStateException.h 3 | * 4 | * Created on: Aug 9, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef INVALIDSTATEEXCEPTION_H_ 9 | #define INVALIDSTATEEXCEPTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | namespace exception { 17 | class InvalidStateException : public std::runtime_error{ 18 | public: 19 | InvalidStateException(string); 20 | }; 21 | } 22 | 23 | #endif /* INVALIDSTATEEXCEPTION_H_ */ 24 | -------------------------------------------------------------------------------- /src/fastcar/FC_Runner.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Runner.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Runner class, sets default params 8 | * and runs program 9 | */ 10 | #ifndef FC_RUNNER_H 11 | #define FC_RUNNER_H 12 | 13 | #include 14 | #include 15 | #include 16 | #include "../clutil/Point.h" 17 | #include "../predict/Predictor.h" 18 | #include "../predict/HandleSeq.h" 19 | #include "../nonltr/ChromosomeOneDigitDna.h" 20 | using namespace std; 21 | 22 | class Runner { 23 | public: 24 | 
Runner(int argc, char** argv); 25 | ~Runner() { indices.clear(); files.clear(); qfiles.clear(); if (pred64) {delete pred64;}}; 26 | int run(); 27 | private: 28 | void usage(std::string progname) const; 29 | template int do_run(std::vector &sequences); 30 | template void print_output(const map*, vector*>*> &m) const; 31 | int k = -1; 32 | int bandwidth; 33 | double similarity = -1; 34 | long largest_count = 0; 35 | bool align = false; 36 | bool recover = false; 37 | int sample_size = 300; 38 | int mut_type = HandleSeq::SINGLE; 39 | uint8_t mode = 0; 40 | uint64_t feats = 0; 41 | uint64_t chunk_size = 10000; 42 | std::vector files, qfiles; 43 | std::vector indices; 44 | bool dump = false; 45 | bool format = true; 46 | string output = "output.search"; 47 | string dump_str = "weights.txt"; 48 | void get_opts(int argc, char** argv); 49 | Predictor *pred64 = NULL; 50 | 51 | 52 | }; 53 | #endif 54 | -------------------------------------------------------------------------------- /src/fastcar/fastcar.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * main.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "FC_Runner.h" 8 | int main(int argc, char **argv) 9 | { 10 | Runner runner(argc, argv); 11 | return runner.run(); 12 | } 13 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetector.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetector.cpp 3 | * 4 | * Created on: Nov 8, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include 9 | 10 | #include "ChromDetector.h" 11 | #include "Detector.h" 12 | #include "../utility/Util.h" 13 | 14 | using namespace std; 15 | using namespace nonltr; 16 | using namespace utility; 17 | 18 | ChromDetector::ChromDetector(double s, double w, double pDelta, double b, 19 | double mDelta, vector * scores, 20 | const vector *> * segmentList) { 21 | 22 | 
regions = new vector *>(); 23 | 24 | for (int i = 0; i < segmentList->size(); i++) { 25 | Detector * detector = new Detector(segmentList->at(i)->at(0), 26 | segmentList->at(i)->at(1), s, w, pDelta, b, mDelta, scores); 27 | vector *> * segRegions = detector->getRegions(); 28 | regions->insert(regions->end(), segRegions->begin(), segRegions->end()); 29 | delete detector; 30 | } 31 | } 32 | 33 | ChromDetector::~ChromDetector() { 34 | Util::deleteInVector(regions); 35 | regions->clear(); 36 | delete regions; 37 | } 38 | 39 | vector *> * ChromDetector::getRegions() { 40 | return regions; 41 | } 42 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetector.h 3 | * 4 | * Created on: Nov 8, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMDETECTOR_H_ 9 | #define CHROMDETECTOR_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace nonltr{ 16 | class ChromDetector { 17 | 18 | private: 19 | vector *> * regions; 20 | 21 | public: 22 | ChromDetector(double, double, double, double, double, vector *, 23 | const vector *> *); 24 | virtual ~ChromDetector(); 25 | vector *> * getRegions(); 26 | }; 27 | } 28 | 29 | #endif /* CHROMDETECTOR_H_ */ 30 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetectorMaxima.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetectorMaxima.cpp 3 | * 4 | * Created on: Jun 6, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "ChromDetectorMaxima.h" 9 | 10 | namespace nonltr { 11 | 12 | ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, 13 | double t, double p, int e, vector * oScores, 14 | ChromosomeOneDigit * chrom) { 15 | header = chrom->getHeader(); 16 | start(s, w, m, t, p, e, oScores, chrom->getSegment()); 17 
| 18 | } 19 | 20 | ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m, 21 | double t, double p, int e, vector * oScores, const vector *> * segmentList) { 23 | header = string("chrUnknown"); 24 | start(s, w, m, t, p, e, oScores, segmentList); 25 | } 26 | 27 | void ChromDetectorMaxima::start(double s, double w, double m, double t, 28 | double p, int e, vector * oScores, 29 | const vector *> * segmentList) { 30 | 31 | regionList = new vector (); 32 | 33 | int segmentCount = segmentList->size(); 34 | for (int i = 0; i < segmentCount; i++) { 35 | int segStart = segmentList->at(i)->at(0); 36 | int segEnd = segmentList->at(i)->at(1); 37 | 38 | // The effective length is shorter than the actual length by 2w 39 | int effLen = 2 * w + 10; 40 | int segLen = segEnd - segStart + 1; 41 | 42 | if (segLen > effLen) { 43 | DetectorMaxima * detector = new DetectorMaxima(segStart, segEnd, s, 44 | w, m, t, p, e, oScores); 45 | 46 | const vector * segRegions = detector->getRegionList(); 47 | int segRegionCount = segRegions->size(); 48 | for (int h = 0; h < segRegionCount; h++) { 49 | regionList->push_back(new Location(*(segRegions->at(h)))); 50 | } 51 | 52 | delete detector; 53 | } else { 54 | cout << "\tSkipping a short segment: "; 55 | cout << segStart << "-" << segEnd << endl; 56 | } 57 | } 58 | } 59 | 60 | ChromDetectorMaxima::~ChromDetectorMaxima() { 61 | Util::deleteInVector(regionList); 62 | regionList->clear(); 63 | delete regionList; 64 | } 65 | 66 | void ChromDetectorMaxima::printIndex(string outputFile) { 67 | printIndex(outputFile, false); 68 | } 69 | 70 | void ChromDetectorMaxima::printIndex(string outputFile, bool canAppend) { 71 | ofstream outIndex; 72 | 73 | if (canAppend) { 74 | outIndex.open(outputFile.c_str(), ios::out | ios::app); 75 | } else { 76 | outIndex.open(outputFile.c_str(), ios::out); 77 | } 78 | 79 | // Write the index of the repeat segment [x,y[ 80 | for (int j = 0; j < regionList->size(); j++) { 81 | outIndex << header << ":"; 82 | 
outIndex << ((int) (regionList->at(j)->getStart())) << "-"; 83 | outIndex << ((int) (regionList->at(j)->getEnd() + 1)) << " "; 84 | outIndex << endl; 85 | } 86 | 87 | outIndex.close(); 88 | } 89 | 90 | const vector* ChromDetectorMaxima::getRegionList() const { 91 | return regionList; 92 | } 93 | 94 | } /* namespace nonltr */ 95 | -------------------------------------------------------------------------------- /src/nonltr/ChromDetectorMaxima.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromDetectorMaxima.h 3 | * 4 | * Created on: Jun 6, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMDETECTORMAXIMA_H_ 9 | #define CHROMDETECTORMAXIMA_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "ChromosomeOneDigit.h" 15 | #include "DetectorMaxima.h" 16 | 17 | #include "../utility/Util.h" 18 | #include "../utility/ILocation.h" 19 | #include "../utility/Location.h" 20 | 21 | using namespace std; 22 | using namespace utility; 23 | 24 | namespace nonltr { 25 | 26 | class ChromDetectorMaxima { 27 | private: 28 | vector * regionList; 29 | string header; 30 | 31 | void start(double, double, double, double, double, int, vector *, 32 | const vector *> *); 33 | 34 | public: 35 | ChromDetectorMaxima(double, double, double, double, double, int, 36 | vector *, ChromosomeOneDigit *); 37 | ChromDetectorMaxima(double, double, double, double, double, int, 38 | vector *, const vector *> *); 39 | virtual ~ChromDetectorMaxima(); 40 | const vector* getRegionList() const; 41 | void printIndex(string); 42 | void printIndex(string, bool); 43 | 44 | }; 45 | 46 | } /* namespace nonltr */ 47 | #endif /* CHROMDETECTORMAXIMA_H_ */ 48 | -------------------------------------------------------------------------------- /src/nonltr/ChromListMaker.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromListMaker.cpp 3 | * 4 | * Created on: Mar 13, 2014 5 | * Author: Hani Zakaira Girgis 
6 | */ 7 | 8 | #include "ChromListMaker.h" 9 | 10 | namespace nonltr { 11 | 12 | ChromListMaker::ChromListMaker(string seqFileIn, bool is_oneseq_) { 13 | seqFile = seqFileIn; 14 | is_oneseq = is_oneseq_; 15 | chromList = new vector(); 16 | } 17 | 18 | ChromListMaker::~ChromListMaker() { 19 | Util::deleteInVector(chromList); 20 | delete chromList; 21 | } 22 | 23 | 24 | std::istream& safe_getline(std::istream& is, std::string& t) 25 | { 26 | t.clear(); 27 | std::istream::sentry se(is, true); 28 | std::streambuf* sb = is.rdbuf(); 29 | for(;;) { 30 | int c = sb->sbumpc(); 31 | switch (c) { 32 | case '\n': 33 | return is; 34 | case '\r': 35 | if (sb->sgetc() == '\n') { 36 | sb->sbumpc(); 37 | } 38 | return is; 39 | case std::streambuf::traits_type::eof(): 40 | if (t.empty()) { 41 | is.setstate(std::ios::eofbit); 42 | } 43 | return is; 44 | default: 45 | t += (char)c; 46 | } 47 | } 48 | } 49 | 50 | const vector * ChromListMaker::makeChromList() { 51 | ifstream in(seqFile.c_str()); 52 | bool isFirst = true; 53 | Chromosome * chrom; 54 | vector size_list = getSize(); 55 | uint64_t cur_seq = 0; 56 | if (is_oneseq) { 57 | uint64_t sum = 0; 58 | for (uint64_t len : size_list) { 59 | sum += len + 50; 60 | } 61 | size_list.clear(); 62 | size_list.push_back(sum); 63 | } 64 | while (in.good()) { 65 | string line; 66 | safe_getline(in, line); 67 | if (line[0] == '>') { 68 | if (!isFirst) { 69 | if (is_oneseq) { 70 | std::string interseq(50, 'N'); 71 | // chrom->insert(interseq); 72 | chrom->appendToSequence(interseq); 73 | } else { 74 | chrom->finalize(); 75 | chromList->push_back(chrom); 76 | chrom = new Chromosome(size_list.at(cur_seq++)); 77 | chrom->setHeader(line); 78 | } 79 | } else { 80 | isFirst = false; 81 | chrom = new Chromosome(size_list.at(cur_seq++)); 82 | chrom->setHeader(line); 83 | } 84 | } else if (line[0] == ' ' || line[0] == '\t') { 85 | } else { 86 | // chrom->insert(line); 87 | chrom->appendToSequence(line); 88 | } 89 | } 90 | chrom->finalize(); 91 | 
chromList->push_back(chrom); 92 | in.close(); 93 | 94 | return chromList; 95 | } 96 | 97 | const vector ChromListMaker::getSize() { 98 | ifstream in(seqFile.c_str()); 99 | vector size_list; 100 | uint64_t current_size = 0; 101 | while (in.good()) { 102 | string line; 103 | safe_getline(in, line); 104 | if (line[0] == '>') { 105 | if (current_size > 0) { 106 | size_list.push_back(current_size); 107 | } 108 | current_size = 0; 109 | } else if (line[0] == ' ' || line[0] == '\t') { 110 | } else { 111 | current_size += line.length(); 112 | } 113 | } 114 | size_list.push_back(current_size); 115 | return size_list; 116 | } 117 | const vector * ChromListMaker::makeChromOneDigitDnaList() { 118 | ifstream in(seqFile.c_str()); 119 | bool isFirst = true; 120 | ChromosomeOneDigitDna * chrom; 121 | vector size_list = getSize(); 122 | uint64_t cur_seq = 0; 123 | if (is_oneseq) { 124 | uint64_t sum = 0; 125 | for (uint64_t len : size_list) { 126 | sum += len + 50; 127 | } 128 | if (sum > 0) { 129 | sum -= 50; 130 | } 131 | size_list.clear(); 132 | size_list.push_back(sum); 133 | } 134 | while (in.good()) { 135 | string line; 136 | safe_getline(in, line); 137 | if (line[0] == '>') { 138 | if (!isFirst) { 139 | if (is_oneseq) { 140 | std::string interseq(50, 'N'); 141 | chrom->insert(interseq); 142 | } else { 143 | chrom->finalize(); 144 | chromList->push_back(chrom); 145 | chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++)); 146 | chrom->setHeader(line); 147 | } 148 | } else { 149 | isFirst = false; 150 | chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++)); 151 | chrom->setHeader(line); 152 | 153 | } 154 | } else if (line[0] == ' ' || line[0] == '\t') { 155 | } else { 156 | chrom->insert(line); 157 | // chrom->appendToSequence(line); 158 | } 159 | } 160 | chrom->finalize(); 161 | chromList->push_back(chrom); 162 | in.close(); 163 | 164 | return chromList; 165 | } 166 | 167 | const vector * ChromListMaker::makeChromOneDigitProteinList() { 168 | ifstream 
in(seqFile.c_str()); 169 | bool isFirst = true; 170 | ChromosomeOneDigitProtein * chrom; 171 | 172 | while (in.good()) { 173 | string line; 174 | safe_getline(in, line); 175 | if (line[0] == '>') { 176 | if (!isFirst) { 177 | chrom->finalize(); 178 | chromList->push_back(chrom); 179 | } else { 180 | isFirst = false; 181 | } 182 | 183 | chrom = new ChromosomeOneDigitProtein(); 184 | chrom->setHeader(line); 185 | } else { 186 | chrom->appendToSequence(line); 187 | } 188 | } 189 | 190 | chrom->finalize(); 191 | chromList->push_back(chrom); 192 | in.close(); 193 | 194 | return chromList; 195 | } 196 | 197 | } 198 | /* namespace nonltr */ 199 | -------------------------------------------------------------------------------- /src/nonltr/ChromListMaker.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromListMaker.h 3 | * 4 | * Created on: Mar 13, 2014 5 | * Modified on: Oct 2, 2018 6 | * Author: Hani Zakaria Girgis, PhD 7 | */ 8 | 9 | #ifndef CHROMLISTMAKER_H_ 10 | #define CHROMLISTMAKER_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include "Chromosome.h" 16 | #include "ChromosomeOneDigitDna.h" 17 | #include "ChromosomeOneDigitProtein.h" 18 | 19 | #include "../utility/Util.h" 20 | 21 | using namespace std; 22 | using namespace utility; 23 | 24 | namespace nonltr { 25 | 26 | class ChromListMaker { 27 | private: 28 | vector * chromList; 29 | string seqFile; 30 | bool is_oneseq; 31 | public: 32 | ChromListMaker(string, bool is_oneseq_=false); 33 | virtual ~ChromListMaker(); 34 | const vector getSize(); 35 | const vector * makeChromList(); 36 | const vector * makeChromOneDigitDnaList(); 37 | const vector * makeChromOneDigitProteinList(); 38 | 39 | }; 40 | 41 | } /* namespace nonltr */ 42 | #endif /* CHROMLISTMAKER_H_ */ 43 | -------------------------------------------------------------------------------- /src/nonltr/Chromosome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
Chromosome.h 3 | * 4 | * Created on: Mar 26, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | #ifndef CHROMOSOME_H_ 8 | #define CHROMOSOME_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "IChromosome.h" 17 | #include "../exception/InvalidOperationException.h" 18 | #include "../exception/InvalidInputException.h" 19 | #include "../utility/Util.h" 20 | 21 | using namespace std; 22 | using namespace nonltr; 23 | using namespace utility; 24 | using namespace exception; 25 | 26 | namespace nonltr { 27 | class Chromosome: public IChromosome { 28 | public: 29 | Chromosome(); 30 | Chromosome(uint64_t); 31 | Chromosome(string); 32 | Chromosome(string, bool); 33 | Chromosome(string, int); 34 | Chromosome(string, int, int); 35 | Chromosome(string &, string&); 36 | Chromosome(string &, string&, int); 37 | 38 | int getGcContent(); 39 | 40 | virtual ~Chromosome(); 41 | 42 | virtual string& getBaseRef(); 43 | virtual string& getHeaderRef(); 44 | 45 | virtual const string* getBase(); 46 | virtual const vector *> * getSegment(); 47 | virtual void printSegmentList(); 48 | virtual string getHeader(); 49 | virtual int size(); 50 | virtual int getEffectiveSize(); 51 | virtual void setHeader(string&); 52 | virtual void setSequence(string&); 53 | virtual void appendToSequence(const string&); 54 | virtual void finalize(); 55 | virtual vector * getBaseCount(); 56 | virtual void insert(const string&); 57 | 58 | protected: 59 | string chromFile; 60 | string header; 61 | string base; 62 | int str_len; 63 | 64 | int effectiveSize; 65 | int segLength; 66 | 67 | vector *> * segment; 68 | void readFasta(); 69 | void readFasta(int); 70 | void toUpperCase(); 71 | void removeAmbiguous(); 72 | void mergeSegments(); 73 | virtual void help(int, bool); 74 | void makeSegmentList(); 75 | void calculateEffectiveSize(); 76 | 77 | private: 78 | bool isHeaderReady; 79 | bool isBaseReady; 80 | bool isFinalized; 81 | bool canClean = false; 82 | 
83 | void reverseSegments(); 84 | void makeBaseCount(); 85 | vector * baseCount; 86 | }; 87 | } 88 | 89 | #endif /* CHROMOSOME_H_ */ 90 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigit.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigit.cpp 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD at the NCB1/NLM/NIH 6 | */ 7 | #include 8 | #include 9 | #include 10 | #include "Chromosome.h" 11 | #include "ChromosomeOneDigit.h" 12 | #include "../exception/InvalidInputException.h" 13 | 14 | using namespace exception; 15 | 16 | namespace nonltr { 17 | 18 | ChromosomeOneDigit::ChromosomeOneDigit() : 19 | Chromosome() { 20 | //cout << "The no args constructor is called" << endl; 21 | } 22 | 23 | ChromosomeOneDigit::ChromosomeOneDigit(uint64_t s) : 24 | Chromosome(s) { 25 | //cout << "The no args constructor is called" << endl; 26 | } 27 | 28 | ChromosomeOneDigit::ChromosomeOneDigit(string fileName) : 29 | Chromosome(fileName) { 30 | help(); 31 | } 32 | 33 | ChromosomeOneDigit::ChromosomeOneDigit(string fileName, int segmentLength, 34 | int maxLength) : 35 | Chromosome(fileName, segmentLength, maxLength) { 36 | help(); 37 | } 38 | 39 | ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info) : 40 | Chromosome(seq, info) { 41 | //cout << "Two string constructor is called" << endl; 42 | help(); 43 | } 44 | 45 | ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info, int length) : 46 | Chromosome(seq, info, length) { 47 | //cout << "Two string constructor is called" << endl; 48 | help(); 49 | } 50 | 51 | void ChromosomeOneDigit::finalize() { 52 | Chromosome::finalize(); 53 | help(); 54 | } 55 | 56 | void ChromosomeOneDigit::help() { 57 | // Can delete the codes 58 | canClean = true; 59 | 60 | // Make map 61 | codes = new map(); 62 | 63 | // Build codes 64 | buildCodes(); 65 | // Modify the sequence in the 
super class 66 | encode(); 67 | } 68 | 69 | ChromosomeOneDigit::~ChromosomeOneDigit() { 70 | if (canClean) { 71 | codes->clear(); 72 | delete codes; 73 | } 74 | } 75 | 76 | /** 77 | * This method converts nucleotides in the segments to single digit codes 78 | */ 79 | void ChromosomeOneDigit::encode() { 80 | 81 | for (int s = 0; s < segment->size(); s++) { 82 | int segStart = segment->at(s)->at(0); 83 | int segEnd = segment->at(s)->at(1); 84 | for (int i = segStart; i <= segEnd; i++) { 85 | 86 | if (codes->count(base[i]) > 0) { 87 | base[i] = codes->at(base[i]); 88 | } else { 89 | string msg = "Invalid nucleotide: "; 90 | std::ostringstream oss; 91 | int b_int = base[i]; 92 | oss << msg << b_int; 93 | throw InvalidInputException(oss.str()); 94 | } 95 | } 96 | } 97 | 98 | // Digitize skipped segments 99 | char uncertainChar = Util::isDna? 'N' : 'X'; 100 | int segNum = segment->size(); 101 | if (segNum > 0) { 102 | // The first interval - before the first segment 103 | int segStart = 0; 104 | int segEnd = segment->at(0)->at(0) - 1; 105 | 106 | for (int s = 0; s <= segNum; s++) { 107 | for (int i = segStart; i <= segEnd; i++) { 108 | char c = base[i]; 109 | 110 | if (c != uncertainChar) { 111 | if (codes->count(c) > 0) { 112 | base[i] = codes->at(c); 113 | } else { 114 | string msg = "ChromosomeOneDigit::encode() found invalid letter: "; 115 | msg.append(1, c); 116 | throw InvalidInputException(msg); 117 | } 118 | } 119 | } 120 | 121 | // The regular intervals between two segments 122 | if (s < segNum - 1) { 123 | segStart = segment->at(s)->at(1) + 1; 124 | segEnd = segment->at(s + 1)->at(0) - 1; 125 | } 126 | // The last interval - after the last segment 127 | else if (s == segNum - 1) { 128 | segStart = segment->at(s)->at(1) + 1; 129 | segEnd = base.size() - 1; 130 | } 131 | } 132 | } 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigit.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigit.h 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef CHROMOSOMEONEDIGIT_H_ 9 | #define CHROMOSOMEONEDIGIT_H_ 10 | 11 | #include 12 | #include "Chromosome.h" 13 | 14 | namespace nonltr { 15 | class ChromosomeOneDigit: public Chromosome { 16 | 17 | private: 18 | void encode(); 19 | void help(); 20 | 21 | 22 | protected: 23 | bool canClean = false; 24 | map * codes; 25 | virtual void buildCodes() = 0; 26 | 27 | 28 | public: 29 | /* Methods */ 30 | ChromosomeOneDigit(); 31 | ChromosomeOneDigit(uint64_t); 32 | ChromosomeOneDigit(string); 33 | ChromosomeOneDigit(string, int, int); 34 | ChromosomeOneDigit(string&, string&); 35 | ChromosomeOneDigit(string&, string&, int); 36 | virtual ~ChromosomeOneDigit(); 37 | virtual void finalize(); 38 | 39 | 40 | }; 41 | } 42 | 43 | #endif /* CHROMOSOMEONEDIGIT_H_ */ 44 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigitDna.cpp: -------------------------------------------------------------------------------- 1 | #include "ChromosomeOneDigitDna.h" 2 | 3 | namespace nonltr{ 4 | 5 | ChromosomeOneDigitDna::ChromosomeOneDigitDna() : ChromosomeOneDigit() {} 6 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(uint64_t s) : ChromosomeOneDigit(s) {} 7 | 8 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName) : 9 | ChromosomeOneDigit(fileName){ 10 | 11 | } 12 | 13 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName, int segmentLength, int maxLength) : 14 | ChromosomeOneDigit(fileName, segmentLength, maxLength) { 15 | 16 | } 17 | 18 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info) : 19 | ChromosomeOneDigit(seq, info){ 20 | 21 | } 22 | 23 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info, int length) : 24 | ChromosomeOneDigit(seq, info, length) { 
25 | } 26 | 27 | ChromosomeOneDigitDna::~ChromosomeOneDigitDna(){ 28 | 29 | } 30 | 31 | /** 32 | * A A 33 | * T T 34 | * G G 35 | * C C 36 | * R G or A 37 | * Y T or C 38 | * M A or C 39 | * K G or T 40 | * S G or C 41 | * W A or T 42 | * H A or C or T 43 | * B G or T or C 44 | * V G or C or A 45 | * D G or T or A 46 | * N G or T or A or C 47 | */ 48 | void ChromosomeOneDigitDna::buildCodes() { 49 | // Certain nucleotides 50 | codes->insert(map::value_type('A', (char) 0)); 51 | codes->insert(map::value_type('C', (char) 1)); 52 | codes->insert(map::value_type('G', (char) 2)); 53 | codes->insert(map::value_type('T', (char) 3)); 54 | 55 | // Uncertain nucleotides 56 | codes->insert(map::value_type('R', codes->at('G'))); 57 | codes->insert(map::value_type('Y', codes->at('C'))); 58 | codes->insert(map::value_type('M', codes->at('A'))); 59 | codes->insert(map::value_type('K', codes->at('T'))); 60 | codes->insert(map::value_type('S', codes->at('G'))); 61 | codes->insert(map::value_type('W', codes->at('T'))); 62 | codes->insert(map::value_type('H', codes->at('C'))); 63 | codes->insert(map::value_type('B', codes->at('T'))); 64 | codes->insert(map::value_type('V', codes->at('A'))); 65 | codes->insert(map::value_type('D', codes->at('T'))); 66 | codes->insert(map::value_type('N', codes->at('C'))); 67 | codes->insert(map::value_type('X', codes->at('G'))); 68 | } 69 | 70 | /** 71 | * Cannot be called on already finalized object. 72 | */ 73 | void ChromosomeOneDigitDna::makeR() { 74 | //cout << "Making reverse ..." << endl; 75 | makeReverse(); 76 | reverseSegments(); 77 | } 78 | 79 | /** 80 | * Cannot be called on already finalized object. 81 | */ 82 | void ChromosomeOneDigitDna::makeRC() { 83 | //cout << "Making reverse complement ..." 
<< endl; 84 | makeComplement(); 85 | makeReverse(); 86 | reverseSegments(); 87 | } 88 | 89 | void ChromosomeOneDigitDna::makeComplement() { 90 | map complement; 91 | 92 | // Certain nucleotides 93 | complement.insert(map::value_type((char) 0, (char) 3)); 94 | complement.insert(map::value_type((char) 1, (char) 2)); 95 | complement.insert(map::value_type((char) 2, (char) 1)); 96 | complement.insert(map::value_type((char) 3, (char) 0)); 97 | 98 | // Unknown nucleotide 99 | complement.insert(map::value_type('N', 'N')); 100 | // complement.insert(map::value_type((char) 4, (char) 4)); 101 | 102 | // Convert a sequence to its complement 103 | int seqLen = base.size(); 104 | for (int i = 0; i < seqLen; i++) { 105 | if (complement.count(base[i]) > 0) { 106 | base[i] = complement.at(base[i]); 107 | } else { 108 | cerr << "Error: The digit " << (char) base[i]; 109 | cerr << " does not represent a base." << endl; 110 | exit(2); 111 | } 112 | } 113 | } 114 | 115 | void ChromosomeOneDigitDna::makeReverse() { 116 | int last = base.size() - 1; 117 | 118 | // Last index to be switched 119 | int middle = base.size() / 2; 120 | 121 | for (int i = 0; i < middle; i++) { 122 | char temp = base[last - i]; 123 | base[last - i] = base[i]; 124 | base[i] = temp; 125 | } 126 | } 127 | 128 | void ChromosomeOneDigitDna::reverseSegments() { 129 | int segNum = segment->size(); 130 | int lastBase = size() - 1; 131 | 132 | // Calculate the coordinate on the main strand 133 | for (int i = 0; i < segNum; i++) { 134 | vector * seg = segment->at(i); 135 | 136 | int s = lastBase - seg->at(1); 137 | int e = lastBase - seg->at(0); 138 | seg->clear(); 139 | seg->push_back(s); 140 | seg->push_back(e); 141 | } 142 | 143 | // Reverse the regions within the list 144 | int lastRegion = segNum - 1; 145 | int middle = segNum / 2; 146 | for (int i = 0; i < middle; i++) { 147 | vector * temp = segment->at(lastRegion - i); 148 | (*segment)[lastRegion - i] = segment->at(i); 149 | (*segment)[i] = temp; 150 | } 151 | } 
152 | 153 | 154 | } 155 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigitDna.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigitDna.h 3 | * Created on: September 28, 2018 4 | * Author: Hani Z. Girgis, PhD 5 | */ 6 | 7 | #ifndef HROMOSOMEONEDIGITDNA_H_ 8 | #define HROMOSOMEONEDIGITDNA_H_ 9 | 10 | #include "ChromosomeOneDigit.h" 11 | 12 | namespace nonltr{ 13 | class ChromosomeOneDigitDna: public ChromosomeOneDigit{ 14 | private: 15 | void makeReverse(); 16 | void makeComplement(); 17 | void reverseSegments(); 18 | 19 | protected: 20 | virtual void buildCodes(); 21 | 22 | public: 23 | ChromosomeOneDigitDna(); 24 | ChromosomeOneDigitDna(uint64_t); 25 | ChromosomeOneDigitDna(string); 26 | ChromosomeOneDigitDna(string, int, int); 27 | ChromosomeOneDigitDna(string&, string&); 28 | ChromosomeOneDigitDna(string&, string&, int); 29 | virtual ~ChromosomeOneDigitDna(); 30 | 31 | void makeR(); 32 | void makeRC(); 33 | }; 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigitProtein.cpp: -------------------------------------------------------------------------------- 1 | #include "ChromosomeOneDigitProtein.h" 2 | 3 | namespace nonltr{ 4 | 5 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein() : 6 | ChromosomeOneDigit() { 7 | 8 | } 9 | 10 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName) : 11 | ChromosomeOneDigit(fileName){ 12 | 13 | } 14 | 15 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName, int segmentLength, int maxLength) : 16 | ChromosomeOneDigit(fileName, segmentLength, maxLength) { 17 | 18 | } 19 | 20 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info) : 21 | ChromosomeOneDigit(seq, info){ 22 | 23 | } 24 | 25 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& 
info, int length) : 26 | ChromosomeOneDigit(seq, info, length) { 27 | } 28 | 29 | ChromosomeOneDigitProtein::~ChromosomeOneDigitProtein(){ 30 | 31 | } 32 | 33 | void ChromosomeOneDigitProtein::buildCodes() { 34 | // https://en.wikipedia.org/wiki/Proteinogenic_amino_acid 35 | codes->insert(map::value_type('A', (char) 0)); 36 | codes->insert(map::value_type('C', (char) 1)); 37 | codes->insert(map::value_type('D', (char) 2)); 38 | codes->insert(map::value_type('E', (char) 3)); 39 | codes->insert(map::value_type('F', (char) 4)); 40 | codes->insert(map::value_type('G', (char) 5)); 41 | codes->insert(map::value_type('H', (char) 6)); 42 | codes->insert(map::value_type('I', (char) 7)); 43 | codes->insert(map::value_type('K', (char) 8)); 44 | codes->insert(map::value_type('L', (char) 9)); 45 | codes->insert(map::value_type('M', (char) 10)); 46 | codes->insert(map::value_type('N', (char) 11)); 47 | codes->insert(map::value_type('O', (char) 12)); 48 | codes->insert(map::value_type('P', (char) 13)); 49 | codes->insert(map::value_type('Q', (char) 14)); 50 | codes->insert(map::value_type('R', (char) 15)); 51 | codes->insert(map::value_type('S', (char) 16)); 52 | codes->insert(map::value_type('T', (char) 17)); 53 | codes->insert(map::value_type('U', (char) 18)); 54 | codes->insert(map::value_type('V', (char) 19)); 55 | codes->insert(map::value_type('W', (char) 20)); 56 | codes->insert(map::value_type('Y', (char) 21)); 57 | 58 | // Uncertain uncleotides 59 | codes->insert(map::value_type('B', codes->at('D'))); 60 | codes->insert(map::value_type('Z', codes->at('E'))); 61 | codes->insert(map::value_type('J', codes->at('L'))); 62 | } 63 | 64 | }// End namespace -------------------------------------------------------------------------------- /src/nonltr/ChromosomeOneDigitProtein.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeOneDigitProtein.h 3 | * Created on: October 2, 2018 4 | * Author: Hani Z. 
Girgis, PhD 5 | */ 6 | 7 | #ifndef HROMOSOMEONEDIGITPROTEIN_H_ 8 | #define HROMOSOMEONEDIGITPROTEIN_H_ 9 | 10 | #include "ChromosomeOneDigit.h" 11 | 12 | namespace nonltr{ 13 | class ChromosomeOneDigitProtein: public ChromosomeOneDigit{ 14 | 15 | protected: 16 | virtual void buildCodes(); 17 | 18 | public: 19 | ChromosomeOneDigitProtein(); 20 | ChromosomeOneDigitProtein(string); 21 | ChromosomeOneDigitProtein(string, int, int); 22 | ChromosomeOneDigitProtein(string&, string&); 23 | ChromosomeOneDigitProtein(string&, string&, int); 24 | virtual ~ChromosomeOneDigitProtein(); 25 | }; 26 | } 27 | 28 | #endif -------------------------------------------------------------------------------- /src/nonltr/ChromosomeRandom.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ChromosomeRandom.h 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef CHROMOSOMERANDOM_H_ 9 | #define CHROMOSOMERANDOM_H_ 10 | 11 | #include 12 | 13 | #include "IChromosome.h" 14 | 15 | namespace nonltr { 16 | 17 | class ChromosomeRandom: public nonltr::IChromosome { 18 | // Key-value pair type. 
19 | typedef map::value_type valType; 20 | 21 | private: 22 | int n; 23 | char unread; 24 | IChromosome * oChrom; 25 | vector * alpha; 26 | map * table; 27 | string * rBase; 28 | vector * keyList; 29 | map * codes; 30 | 31 | void fillKeyList(); 32 | void initializeTable(); 33 | void countWords(); 34 | void convertToProbabilities(); 35 | void printTable(); 36 | void generateRandomSequence(); 37 | 38 | public: 39 | ChromosomeRandom(int, IChromosome*, char, vector*); 40 | virtual ~ChromosomeRandom(); 41 | 42 | virtual const string* getBase(); 43 | virtual const vector *> * getSegment(); 44 | virtual string getHeader(); 45 | virtual void printSequence(string); 46 | void printSequence(string, string *); 47 | void printEffectiveSequence(string); 48 | }; 49 | 50 | } /* namespace nonltr */ 51 | #endif /* CHROMOSOMERANDOM_H_ */ 52 | -------------------------------------------------------------------------------- /src/nonltr/DetectorMaxima.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DetectorMaxima.h 3 | * 4 | * Created on: May 31, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef DETECTORMAXIMA_H_ 9 | #define DETECTORMAXIMA_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "../utility/ILocation.h" 15 | 16 | using namespace std; 17 | using namespace utility; 18 | 19 | namespace nonltr { 20 | 21 | class DetectorMaxima { 22 | private: 23 | 24 | int segStart; 25 | int segEnd; 26 | double s; 27 | double w; 28 | double m; 29 | double t; 30 | double p; 31 | int e; 32 | int halfS; 33 | 34 | vector * oScores; 35 | vector * scores; 36 | vector * mask; 37 | vector * first; 38 | vector * second; 39 | vector * maxima; 40 | // vector *> * allMaxima; 41 | 42 | vector * separatorList; 43 | vector * regionList; 44 | 45 | void makeMask(); 46 | void smooth(); 47 | void deriveFirst(); 48 | void deriveSecond(); 49 | void findMaxima(); 50 | 51 | void findSeparators(); 52 | void findRegions(); 53 | 54 | void 
extendRegions(); 55 | 56 | int countLessThan(vector *, int, int, double); 57 | 58 | /** 59 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 60 | */ 61 | inline double round(double number) { 62 | return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); 63 | } 64 | 65 | public: 66 | DetectorMaxima(int, int, double, double, double, double, double, int, 67 | vector *); 68 | virtual ~DetectorMaxima(); 69 | const vector* getRegionList() const; 70 | const vector* getFirst() const; 71 | const vector* getSecond() const; 72 | 73 | // const vector *>* getAllMaxima() const; 74 | }; 75 | 76 | } /* namespace nonltr */ 77 | #endif /* DETECTORMAXIMA_H_ */ 78 | -------------------------------------------------------------------------------- /src/nonltr/EnrichmentMarkovView.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * EnrichmentMarkovView.cpp 3 | * 4 | * Created on: Apr 17, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | namespace nonltr { 9 | 10 | /** 11 | * The Markov order. It start at 0. 12 | */ 13 | template 14 | EnrichmentMarkovView::EnrichmentMarkovView(int k, int order, int m) : 15 | minObs(m), factor(10000.00), KmerHashTable(k) { 16 | initialize(order); 17 | } 18 | 19 | template 20 | EnrichmentMarkovView::EnrichmentMarkovView(int k, V initValue, int order, 21 | int m) : 22 | minObs(m), factor(10000.00), KmerHashTable(k, initValue) { 23 | initialize(order); 24 | } 25 | 26 | template 27 | void EnrichmentMarkovView::initialize(int order) { 28 | // Test start 29 | // cout << "Testing: " << minObs << endl; 30 | // Test end 31 | 32 | o = order; 33 | if (o < 0) { 34 | string msg("The Markov order must be non-negative integer. 
"); 35 | msg.append("The invalid input is: "); 36 | msg.append(Util::int2string(o)); 37 | msg.append("."); 38 | throw InvalidInputException(msg); 39 | } 40 | 41 | if (o >= KmerHashTable::k) { 42 | string msg("The Markov order cannot be >= k (k-mer)."); 43 | throw InvalidInputException(msg); 44 | } 45 | 46 | l = 0; 47 | modelList = new vector *>(); 48 | 49 | for (int i = 1; i <= o + 1; i++) { 50 | modelList->push_back(new KmerHashTable(i)); 51 | } 52 | } 53 | 54 | template 55 | EnrichmentMarkovView::~EnrichmentMarkovView() { 56 | Util::deleteInVector(modelList); 57 | delete modelList; 58 | } 59 | 60 | /** 61 | * This method count words of size 1 to order+1 in the input sequence. 62 | * In other words, it updates the background tables. In addition, it 63 | * updates the length of the genome. 64 | * 65 | * sequence: is the input sequence. 66 | * start: the start index - inclosing. 67 | * end: the end index - inclosing. 68 | */ 69 | template 70 | void EnrichmentMarkovView::count(const char * sequence, int start, 71 | int end) { 72 | 73 | // Multiple by 2 if scanning the forward strand and its reverse complement 74 | // l = l + (2 * (end - start + 1)); 75 | l = l + (end - start + 1); 76 | 77 | int modelNumber = modelList->size(); 78 | for (int i = 0; i < modelNumber; i++) { 79 | KmerHashTable * t = modelList->at(i); 80 | t->wholesaleIncrement(sequence, start, end - i); 81 | } 82 | } 83 | 84 | /** 85 | * Normalize the count of words in each model. 86 | * Values stored in these models are multiplied by "factor." 
87 | */ 88 | template 89 | void EnrichmentMarkovView::generateProbapilities() { 90 | int modelNumber = modelList->size(); 91 | 92 | for (int m = 0; m < modelNumber; m++) { 93 | KmerHashTable * t = modelList->at(m); 94 | int tSize = t->getMaxTableSize(); 95 | 96 | for (int i = 0; i < tSize; i += 4) { 97 | double sum = 0.0; 98 | 99 | for (int j = i; j < i + 4; j++) { 100 | sum += t->valueOf(j); 101 | } 102 | 103 | for (int j = i; j < i + 4; j++) { 104 | t->insert(j, round(factor * ((double) t->valueOf(j) / sum))); 105 | } 106 | } 107 | } 108 | } 109 | 110 | template 111 | void EnrichmentMarkovView::processTable() { 112 | char base = 4; 113 | int modelNumber = modelList->size(); 114 | 115 | // Make a zero in quaternary form as a string of length k. 116 | string q(""); 117 | for (int x = 0; x < KmerHashTable::k; x++) { 118 | q.append(1, 0); 119 | } 120 | 121 | double lowerP; 122 | double upperP; 123 | for (I y = 0; y < KmerHashTable::maxTableSize; y++) { 124 | if (y % 10000000 == 0) { 125 | cout << "Processing " << y << " keys out of " 126 | << KmerHashTable::maxTableSize; 127 | cout << endl; 128 | } 129 | 130 | const char * qc = q.c_str(); 131 | 132 | // Calculate the expected number of occurrences. 133 | 134 | // a. Calculate probability from lower order models. 135 | // Lower probabilities are the same for four consecutive words of length of k-1 136 | if (y % 4 == 0) { 137 | lowerP = 1.0; 138 | for (int m = 0; m < modelNumber - 1; m++) { 139 | KmerHashTable * oTable = modelList->at(m); 140 | lowerP *= (((double) oTable->valueOf(qc, 0)) / factor); 141 | } 142 | } 143 | 144 | // b. Calculate probability based on the specified order. 145 | KmerHashTable * oTable = modelList->at(modelNumber - 1); 146 | int resultsSize = KmerHashTable::k - o - 1; 147 | 148 | // Upper probabilities are the same for four consecutive words of length of k-1 149 | // The scanning of words or length corresponding to the highest order + 1 150 | // This step is not needed if k = o + 1, i.e. 
resultsSize = 0. 151 | if (y % 4 == 0) { 152 | if (resultsSize > 0) { 153 | //Initialize the elements of the vector invalid index 154 | vector results = vector(resultsSize, -987); 155 | oTable->wholesaleValueOf(qc, 0, resultsSize - 1, &results, 0); 156 | 157 | upperP = 1.0; 158 | for (int i = 0; i < resultsSize; i++) { 159 | upperP *= (((double) results.at(i)) / factor); 160 | } 161 | results.clear(); 162 | 163 | } else { 164 | upperP = 1.0; 165 | } 166 | } 167 | 168 | // The expected number of occurances 169 | double exp = l * lowerP * upperP 170 | * (((double) oTable->valueOf(qc, resultsSize)) / factor); 171 | 172 | // Calculate the enrichment value. 173 | // Log value 174 | // values[y] = round((log((double) values[y] + 1.0) - log(exp + 1.0))); 175 | 176 | // Raw value 177 | // Requirement: if observed is >= 5 && observed > expected then the value is the difference 178 | // otherwise the value is zero 179 | 180 | V observed = KmerHashTable::values[y]; 181 | 182 | if (observed >= minObs && observed > exp) { 183 | 184 | KmerHashTable::values[y] = round(observed - exp); 185 | } else { 186 | KmerHashTable::values[y] = 0; 187 | } 188 | 189 | /* 190 | KmerHashTable::values[y] = 191 | round( 192 | (((double) KmerHashTable::values[y] + 1.0) 193 | / (exp + 1.0))); 194 | */ 195 | 196 | // Increment the quaternary number: 197 | // 1 - guard against overflow. 198 | if (q[0] == base - 1) { 199 | string z(""); 200 | z.append(1, 0); 201 | q = z + q; 202 | } 203 | 204 | // 2 - increment the quaternary number by 1. 
205 | int qLen = q.size(); 206 | for (int i = qLen - 1; i >= 0; i--) { 207 | if (q[i] + 1 < base) { 208 | q[i] = q[i] + 1; 209 | break; 210 | } else { 211 | q[i] = 0; 212 | } 213 | } 214 | } 215 | } 216 | 217 | } /* namespace nonltr */ 218 | -------------------------------------------------------------------------------- /src/nonltr/EnrichmentMarkovView.h: -------------------------------------------------------------------------------- 1 | /* 2 | * EnrichmentMarkovView.h 3 | * 4 | * Created on: Apr 17, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef ENRICHMENTMARKOVVIEW_H_ 9 | #define ENRICHMENTMARKOVVIEW_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "KmerHashTable.h" 16 | #include "../utility/Util.h" 17 | #include "../exception/InvalidInputException.h" 18 | 19 | using namespace std; 20 | using namespace utility; 21 | using namespace exception; 22 | 23 | namespace nonltr { 24 | 25 | template 26 | class EnrichmentMarkovView: public KmerHashTable{ 27 | 28 | private: 29 | // The minimum number of the observed k-mers 30 | const int minObs; 31 | 32 | // This template specification should work up to order of 14, 33 | // i.e. word length = 15 34 | vector *> * modelList; 35 | 36 | // Markov order 37 | int o; 38 | 39 | // Total length 40 | long l; 41 | 42 | // Multiplied the probability of word by this factor 43 | // Equivalent to four decimal points. 44 | const double factor; // = 10000.00; 45 | 46 | // Initialize data members 47 | void initialize(int); 48 | 49 | /** 50 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 51 | */ 52 | inline double round(double number) { 53 | return number < 0.0 ? 
ceil(number - 0.5) : floor(number + 0.5); 54 | } 55 | 56 | public: 57 | EnrichmentMarkovView(int, int, int); 58 | EnrichmentMarkovView(int, V, int, int); 59 | virtual ~EnrichmentMarkovView(); 60 | 61 | void count(const char *, int, int); 62 | void generateProbapilities(); 63 | void processTable(); 64 | }; 65 | } /* namespace nonltr */ 66 | 67 | #include "EnrichmentMarkovView.cpp" 68 | 69 | #endif /* ENRICHMENTMARKOVVIEW_H_ */ 70 | -------------------------------------------------------------------------------- /src/nonltr/HMM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * HMM.h 3 | * 4 | * Created on: Jun 21, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef HMM_H_ 9 | #define HMM_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "../utility/ILocation.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | 21 | namespace nonltr { 22 | 23 | class HMM { 24 | private: 25 | const int PRECISION; 26 | double minusInf; 27 | vector * pList; 28 | vector *> * tList; 29 | vector * oList; 30 | 31 | void initializeHelper(); 32 | // Returns the index of the last candidate in the segment 33 | int trainHelper1(int, int, int); 34 | void trainHelper2(int, int, int, int); 35 | void trainPositive(int, int); 36 | void trainNegative(int, int); 37 | void move(int, int); 38 | void checkBase(double); 39 | 40 | /* 41 | inline int getPstvState(int score) { 42 | int state = ceil(log(score) / logBase); 43 | if (state < 0) { 44 | state = 0; 45 | } 46 | return state; 47 | } 48 | 49 | inline int getNgtvState(int score) { 50 | int state = ceil(log(score) / logBase); 51 | if (state < 0) { 52 | state = 0; 53 | } 54 | return state + positiveStateNumber; 55 | } 56 | */ 57 | 58 | inline int getPstvState(int index) { 59 | int state = scoreList->at(index); 60 | return state; 61 | } 62 | 63 | inline int getNgtvState(int index) { 64 | int state = scoreList->at(index); 65 | return state + 
positiveStateNumber; 66 | } 67 | 68 | protected: 69 | double base; 70 | double logBase; 71 | int stateNumber; 72 | int positiveStateNumber; 73 | 74 | vector * scoreList; 75 | const vector *> * segmentList; 76 | const vector * candidateList; 77 | 78 | void initialize(double, int); 79 | /** 80 | * Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c 81 | */ 82 | inline double round(double number) { 83 | return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5); 84 | } 85 | 86 | public: 87 | HMM(string); // Build a model from file 88 | HMM(double, int); 89 | // HMM(vector *, const vector *> *, 90 | // const vector *, double); 91 | virtual ~HMM(); 92 | void train(vector *, const vector *> *, const vector *); 93 | void normalize(); 94 | double decode(int, int, vector *, vector&); 95 | double decode(int, int, vector *, vector&); 96 | int getPositiveStateNumber(); 97 | void print(); 98 | void print(string); 99 | double getBase(); 100 | }; 101 | 102 | } /* namespace nonltr */ 103 | #endif /* HMM_H_ */ 104 | -------------------------------------------------------------------------------- /src/nonltr/IChromosome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * IChromosome.h 3 | * 4 | * Created on: Feb 4, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef ICHROMOSOME_H_ 9 | #define ICHROMOSOME_H_ 10 | 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | namespace nonltr { 17 | 18 | class IChromosome { 19 | public: 20 | //IChromosome(); 21 | //virtual ~IChromosome(); 22 | virtual const string* getBase() = 0; 23 | virtual const vector *> * getSegment() = 0; 24 | virtual string getHeader() = 0; 25 | }; 26 | 27 | } /* namespace tr */ 28 | #endif /* ICHROMOSOME_H_ */ 29 | -------------------------------------------------------------------------------- /src/nonltr/ITableView.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
/**
 * Abstract, read-oriented view of a k-mer table.
 *
 * I is the type used for table indexes and sizes (see valueOf(I) and
 * getMaxTableSize()); V is the type of the stored values.
 *
 * NOTE(review): the template parameter list was lost in the extracted
 * text. The order <I, V> and the vector<V> argument of wholesaleValueOf
 * are presumed -- confirm against the original header before applying.
 */
template<class I, class V>
class ITableView {
public:
	/**
	 * Virtual destructor so that a table deleted through an ITableView
	 * pointer runs the destructor of the concrete table.
	 */
	virtual ~ITableView() {
	}

	// Value look-ups: by key text, by key text plus an int argument
	// (presumably a start offset -- TODO confirm), or by index.
	virtual V valueOf(const char*) = 0;
	virtual V valueOf(const char*, int) = 0;
	virtual V valueOf(I) = 0;

	virtual int getK() = 0;
	virtual I getMaxTableSize() = 0;
	virtual const V * getValues() const = 0;

	// Bulk look-ups that write their results into the supplied vector.
	virtual void wholesaleValueOf(const char *, int, int, vector<V> *) = 0;
	virtual void wholesaleValueOf(const char *, int, int, vector<V> *, int) = 0;
};
, 4^(k-1)] 38 | I * bases; 39 | I * mMinusOne; 40 | void initialize(int, V); 41 | 42 | public: 43 | /* Methods */ 44 | KmerHashTable(int); 45 | KmerHashTable(int, V); 46 | 47 | virtual ~KmerHashTable(); 48 | 49 | I hash(const char *); 50 | I hash(const char *, int); 51 | void hash(const char *, int, int, vector *); 52 | 53 | void insert(const char*, V); 54 | void insert(const char*, int, V); 55 | void insert(I, V); 56 | 57 | void increment(const char*); 58 | void increment(const char*, int); 59 | void wholesaleIncrement(const char*, int, int); 60 | int wholesaleIncrementNoOverflow(const char*, int, int); 61 | 62 | void addReverseComplement(); 63 | I countNonInitialEntries(); 64 | vector *getKeys(); 65 | void printTable(string); 66 | void checkOverflow(); 67 | 68 | /*Vritual methods from ITableView*/ 69 | virtual V valueOf(const char*); 70 | virtual V valueOf(const char*, int); 71 | virtual V valueOf(I); 72 | virtual void wholesaleValueOf(const char *, int, int, vector *); 73 | virtual void wholesaleValueOf(const char *, int, int, vector *, int); 74 | 75 | virtual int getK(); 76 | virtual I getMaxTableSize(); 77 | virtual V getMaxValue(); 78 | virtual const V * getValues() const; 79 | }; 80 | } 81 | 82 | #include "KmerHashTable.cpp" 83 | 84 | #endif /* KMERHASHTABLE_H_ */ 85 | -------------------------------------------------------------------------------- /src/nonltr/LocationList.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationList.cpp 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | * 7 | * 8 | * An instance of this class holds a list of merged locations. 
9 | */ 10 | 11 | #include "LocationList.h" 12 | 13 | namespace nonltr { 14 | 15 | LocationList::LocationList(string chromNameIn) { 16 | chromName = chromNameIn; 17 | regionList = new vector(); 18 | merge(); 19 | } 20 | 21 | LocationList::~LocationList() { 22 | Util::deleteInVector(regionList); 23 | delete regionList; 24 | } 25 | 26 | void LocationList::add(int start, int end) { 27 | regionList->push_back(new Location(start, end)); 28 | } 29 | 30 | void LocationList::merge() { 31 | int regionCount = regionList->size(); 32 | int gg = 0; 33 | while (gg < regionCount) { 34 | ILocation * region = regionList->at(gg); 35 | 36 | int regionStart = region->getStart(); 37 | int regionEnd = region->getEnd(); 38 | 39 | if (gg > 0) { 40 | ILocation * pRegion = regionList->at(gg - 1); 41 | int pStart = pRegion->getStart(); 42 | int pEnd = pRegion->getEnd(); 43 | 44 | if (Util::isOverlapping(pStart, pEnd, regionStart, regionEnd)) { 45 | pRegion->setEnd(regionEnd > pEnd ? regionEnd : pEnd); 46 | regionList->erase(regionList->begin() + gg); 47 | delete region; 48 | regionCount = regionList->size(); 49 | } else { 50 | gg++; 51 | } 52 | } 53 | 54 | if (gg == 0) { 55 | gg++; 56 | } 57 | } 58 | } 59 | 60 | void LocationList::mergeWithAnotherList( 61 | const vector * const otherList) { 62 | //A pre-condition: Ensure that the other list is sorted 63 | for (int h = 1; h < otherList->size(); h++) { 64 | if (otherList->at(h)->getStart() < otherList->at(h - 1)->getStart()) { 65 | throw InvalidStateException( 66 | string("LocationList - The other list is not sorted.")); 67 | } 68 | } 69 | 70 | // Start 71 | vector * mergedList = new vector(); 72 | 73 | int i = 0; 74 | int j = 0; 75 | int iLimit = regionList->size(); 76 | int jLimit = otherList->size(); 77 | 78 | // Continue until one list is finished 79 | while (i < iLimit && j < jLimit) { 80 | ILocation * iLoc = regionList->at(i); 81 | ILocation * jLoc = otherList->at(j); 82 | 83 | if (iLoc->getStart() < jLoc->getStart()) { 84 | 
mergedList->push_back(iLoc); 85 | i++; 86 | } else { 87 | mergedList->push_back(new Location(*jLoc)); 88 | j++; 89 | } 90 | } 91 | 92 | // Once one list is finished, copy the rest of the other list 93 | if (i == iLimit) { 94 | for (; j < jLimit; j++) { 95 | mergedList->push_back(new Location(*(otherList->at(j)))); 96 | } 97 | } else if (j == jLimit) { 98 | for (; i < iLimit; i++) { 99 | mergedList->push_back(regionList->at(i)); 100 | } 101 | } 102 | 103 | // Once done 104 | // Util::deleteInVector(regionList); 105 | regionList->clear(); // Need to test this line 106 | delete regionList; 107 | regionList = mergedList; 108 | 109 | merge(); 110 | 111 | //A post-condition: Ensure that the list is sorted 112 | for (int h = 1; h < regionList->size(); h++) { 113 | if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) { 114 | throw InvalidStateException(string("This list is not sorted.")); 115 | } 116 | } 117 | } 118 | 119 | void LocationList::print() { 120 | cout << endl << chromName << endl; 121 | for (int i = 0; i < regionList->size(); i++) { 122 | int s = regionList->at(i)->getStart(); 123 | int e = regionList->at(i)->getEnd(); 124 | cout << s << "-" << e << endl; 125 | } 126 | } 127 | 128 | const vector * LocationList::getList() { 129 | return regionList; 130 | } 131 | 132 | void LocationList::convertToRedFormat() { 133 | trim(1); 134 | } 135 | 136 | void LocationList::trim(int x) { 137 | for (int i = regionList->size() - 1; i >= 0; i--) { 138 | ILocation * region = regionList->at(i); 139 | int start = region->getStart(); 140 | int newEnd = region->getEnd() - x; 141 | 142 | if (newEnd < 0 || start > newEnd) { 143 | regionList->erase(regionList->begin() + i); 144 | delete region; 145 | } else { 146 | region->setEnd(newEnd); 147 | } 148 | } 149 | } 150 | 151 | } 152 | 153 | /* namespace nonltr */ 154 | -------------------------------------------------------------------------------- /src/nonltr/LocationList.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * LocationList.h 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Z. Girgis, PhD 6 | */ 7 | 8 | #ifndef SRC_NONLTR_LOCATIONLIST_H_ 9 | #define SRC_NONLTR_LOCATIONLIST_H_ 10 | 11 | #include 12 | #include "../utility/Util.h" 13 | #include "../utility/ILocation.h" 14 | #include "../utility/Location.h" 15 | #include "../exception/InvalidStateException.h" 16 | 17 | using namespace std; 18 | using namespace utility; 19 | using namespace exception; 20 | 21 | namespace nonltr { 22 | 23 | class LocationList { 24 | private: 25 | string chromName; 26 | vector * regionList; 27 | void merge(); 28 | 29 | public: 30 | LocationList(string); 31 | virtual ~LocationList(); 32 | 33 | void add(int, int); 34 | 35 | /** 36 | * Take a sorted list 37 | */ 38 | void mergeWithAnotherList(const vector * const); 39 | 40 | 41 | /** 42 | * Print locations 43 | */ 44 | void print(); 45 | 46 | const vector * getList(); 47 | void convertToRedFormat(); 48 | void trim(int ); 49 | }; 50 | 51 | } /* namespace nonltr */ 52 | 53 | #endif /* SRC_NONLTR_LOCATIONLIST_H_ */ 54 | -------------------------------------------------------------------------------- /src/nonltr/LocationListCollection.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationListCollection.cpp 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "LocationListCollection.h" 9 | 10 | namespace nonltr { 11 | 12 | LocationListCollection::LocationListCollection(string fileNameIn) { 13 | fileName = fileNameIn; 14 | collection = new map(); 15 | readCoordinates(); 16 | } 17 | 18 | LocationListCollection::~LocationListCollection() { 19 | collection->clear(); 20 | delete collection; 21 | } 22 | 23 | void LocationListCollection::readCoordinates() { 24 | Util::checkFile(fileName); 25 | 26 | ifstream in(fileName.c_str()); 27 | LocationList * locList; 28 | 
string previousChromName(""); 29 | 30 | while (in.good()) { 31 | string line; 32 | getline(in, line); 33 | 34 | if (line.compare(string("")) != 0) { 35 | int colIndex = line.find_last_of(':'); 36 | int dashIndex = line.find_last_of('-'); 37 | 38 | string chromName = line.substr(0, colIndex); 39 | 40 | if (previousChromName.compare(chromName) != 0) { 41 | 42 | cout << "Processing regions of " << chromName << endl; 43 | 44 | locList = new LocationList(chromName); 45 | collection->insert( 46 | map::value_type(chromName, 47 | locList)); 48 | 49 | previousChromName = chromName; 50 | } 51 | 52 | int start = 53 | atoi( 54 | line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str()); 55 | int end = atoi(line.substr(dashIndex + 1).c_str()); 56 | locList->add(start, end); 57 | } 58 | } 59 | 60 | in.close(); 61 | } 62 | 63 | void LocationListCollection::print() { 64 | map::iterator itr_s = collection->begin(); 65 | map::iterator itr_e = collection->end(); 66 | while (itr_s != itr_e) { 67 | collection->at(itr_s->first)->print(); 68 | ++itr_s; 69 | } 70 | } 71 | 72 | LocationList * const LocationListCollection::getLocationList(string header) { 73 | if (collection->count(header) == 0) { 74 | string msg("Regions of "); 75 | msg.append(header); 76 | msg.append(" cannot be found.\n"); 77 | throw InvalidStateException(msg); 78 | } 79 | 80 | return collection->at(header); 81 | } 82 | 83 | void LocationListCollection::convertToRedFormat() { 84 | map::iterator itr_s = collection->begin(); 85 | map::iterator itr_e = collection->end(); 86 | while (itr_s != itr_e) { 87 | collection->at(itr_s->first)->convertToRedFormat(); 88 | ++itr_s; 89 | } 90 | } 91 | 92 | void LocationListCollection::trim(int x) { 93 | map::iterator itr_s = collection->begin(); 94 | map::iterator itr_e = collection->end(); 95 | while (itr_s != itr_e) { 96 | collection->at(itr_s->first)->trim(x); 97 | ++itr_s; 98 | } 99 | } 100 | 101 | } /* namespace nonltr */ 102 | 
-------------------------------------------------------------------------------- /src/nonltr/LocationListCollection.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LocationListCollection.h 3 | * 4 | * Created on: Feb 19, 2015 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ 9 | #define SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ 10 | 11 | #include 12 | #include 13 | 14 | #include "LocationList.h" 15 | #include "../utility/Util.h" 16 | #include "../exception/InvalidStateException.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | 21 | namespace nonltr { 22 | 23 | class LocationListCollection { 24 | 25 | private: 26 | string fileName; 27 | map * collection; 28 | void readCoordinates(); 29 | 30 | public: 31 | LocationListCollection(string); 32 | virtual ~LocationListCollection(); 33 | LocationList * const getLocationList(string); 34 | void print(); 35 | void convertToRedFormat(); 36 | void trim(int ); 37 | }; 38 | 39 | } /* namespace nonltr */ 40 | 41 | #endif /* SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ */ 42 | -------------------------------------------------------------------------------- /src/nonltr/Scanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Scanner.h 3 | * 4 | * Created on: Aug 19, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SCANNER_H_ 9 | #define SCANNER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "Chromosome.h" 16 | #include "ChromosomeOneDigit.h" 17 | #include "HMM.h" 18 | #include "ITableView.h" 19 | #include "Scorer.h" 20 | #include "../utility/Util.h" 21 | #include "../utility/ILocation.h" 22 | #include "../utility/Location.h" 23 | #include "../exception/InvalidInputException.h" 24 | #include "../exception/InvalidStateException.h" 25 | #include "../exception/FileDoesNotExistException.h" 26 | #include "../exception/InvalidOperationException.h" 27 
| 28 | using namespace std; 29 | using namespace utility; 30 | using namespace exception; 31 | 32 | namespace nonltr { 33 | 34 | class Scanner { 35 | private: 36 | //string chromFile; 37 | ChromosomeOneDigit * chrom; 38 | const vector *> * segmentList; 39 | Scorer * scorer; 40 | vector * scoreList; 41 | vector * regionList; 42 | int k; 43 | HMM * hmm; 44 | // bool isTrainMode; 45 | 46 | // Methods 47 | void start(); 48 | void check(); 49 | void decode(); 50 | void extendByK(); 51 | int extendByKHelper(int, int, int); 52 | void merge(); 53 | 54 | public: 55 | static const int FRMT_POS = 1; 56 | static const int FRMT_BED = 2; 57 | 58 | Scanner(HMM *, int, ChromosomeOneDigit *, string); 59 | Scanner(HMM *, int, ChromosomeOneDigit *, ITableView *); 60 | virtual ~Scanner(); 61 | void makeForwardCoordinates(); 62 | 63 | void printScores(string, bool); 64 | void printIndex(string, bool, int); 65 | void printMasked(string, Chromosome&, bool); 66 | void mergeWithOtherRegions(const vector *); 67 | const vector* getRegionList(); 68 | }; 69 | 70 | } /* namespace nonltr */ 71 | #endif /* SCANNER_H_ */ 72 | -------------------------------------------------------------------------------- /src/nonltr/Scorer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Scorer.cpp 3 | * 4 | * Created on: Aug 3, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | #include "Scorer.h" 8 | 9 | Scorer::Scorer(ChromosomeOneDigit * chromIn, 10 | ITableView * const table) { 11 | chrom = chromIn; 12 | kmerTable = table; 13 | scores = new vector(chrom->getBase()->size(), 0); 14 | k = kmerTable->getK(); 15 | max = -1; 16 | score(); 17 | calculateMax(); 18 | } 19 | 20 | Scorer::~Scorer() { 21 | scores->clear(); 22 | delete scores; 23 | } 24 | 25 | /** 26 | * This method scores each nucleotide in the chromosome. 27 | * The nucleotides represented by 'N' are assigned zero. 
28 | */ 29 | void Scorer::score() { 30 | const vector *> * segment = chrom->getSegment(); 31 | const char * segBases = chrom->getBase()->c_str(); 32 | 33 | for (int s = 0; s < segment->size(); s++) { 34 | int start = segment->at(s)->at(0); 35 | int end = segment->at(s)->at(1); 36 | kmerTable->wholesaleValueOf(segBases, start, end - k + 1, scores, 37 | start); 38 | 39 | // Handle the last word from end - k + 2 till the end, inclusive. 40 | for (int i = end - k + 2; i <= end; i++) { 41 | (*scores)[i] = scores->at(i - 1); 42 | } 43 | } 44 | } 45 | 46 | /** 47 | * This method takes the logarithm of the scores according to the base. 48 | * If the score equals zero, it is left the same. 49 | */ 50 | void Scorer::takeLog(double base) { 51 | // Handle the case where base is one 52 | bool isOne = false; 53 | if (fabs(base - 1.0) < std::numeric_limits::epsilon()) { 54 | isOne = true; 55 | } 56 | double logBase = isOne ? log(1.5) : log(base); 57 | 58 | const vector *> * segment = chrom->getSegment(); 59 | for (int s = 0; s < segment->size(); s++) { 60 | int start = segment->at(s)->at(0); 61 | int end = segment->at(s)->at(1); 62 | for (int h = start; h <= end; h++) { 63 | int score = scores->at(h); 64 | 65 | if (score != 0) { 66 | if (!isOne || (isOne && score > 1)) { 67 | (*scores)[h] = ceil(log(score) / logBase); 68 | } 69 | } 70 | } 71 | } 72 | } 73 | 74 | int Scorer::getK() { 75 | return k; 76 | } 77 | 78 | vector* Scorer::getScores() { 79 | return scores; 80 | } 81 | 82 | void Scorer::printScores(string outputFile, bool canAppend) { 83 | ofstream outScores; 84 | if (canAppend) { 85 | outScores.open(outputFile.c_str(), ios::out | ios::app); 86 | } else { 87 | outScores.open(outputFile.c_str(), ios::out); 88 | } 89 | 90 | int step = 50; 91 | outScores << chrom->getHeader() << endl; 92 | int len = scores->size(); 93 | for (int i = 0; i < len; i = i + step) { 94 | int e = (i + step - 1 > len - 1) ? 
len - 1 : i + step - 1; 95 | for (int k = i; k <= e; k++) { 96 | outScores << scores->at(k) << " "; 97 | } 98 | outScores << endl; 99 | } 100 | outScores << endl; 101 | 102 | outScores.close(); 103 | } 104 | 105 | int Scorer::countLessOrEqual(int thr) { 106 | int count = 0; 107 | const vector *> * segment = chrom->getSegment(); 108 | for (int s = 0; s < segment->size(); s++) { 109 | int start = segment->at(s)->at(0); 110 | int end = segment->at(s)->at(1); 111 | for (int h = start; h <= end; h++) { 112 | if (scores->at(h) <= thr) { 113 | count++; 114 | } 115 | } 116 | } 117 | return count; 118 | } 119 | 120 | void Scorer::calculateMax() { 121 | const vector *> * segmentList = chrom->getSegment(); 122 | int segmentCount = segmentList->size(); 123 | for (int jj = 0; jj < segmentCount; jj++) { 124 | vector * segment = segmentList->at(jj); 125 | int start = segment->at(0); 126 | int end = segment->at(1); 127 | for (int ss = start; ss <= end; ss++) { 128 | int score = scores->at(ss); 129 | if (score > max) { 130 | max = score; 131 | } 132 | } 133 | } 134 | 135 | if (max == -1) { 136 | string msg("Error occurred while finding the maximum score."); 137 | throw InvalidStateException(msg); 138 | } 139 | } 140 | 141 | int Scorer::getMax() { 142 | return max; 143 | } 144 | -------------------------------------------------------------------------------- /src/nonltr/Scorer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Scorer.h 3 | * 4 | * Created on: Aug 3, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef SCORER_H_ 9 | #define SCORER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "ITableView.h" 18 | #include "ChromosomeOneDigit.h" 19 | #include "../utility/Util.h" 20 | #include "../exception/InvalidStateException.h" 21 | 22 | using namespace std; 23 | using namespace nonltr; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace nonltr 
{ 28 | class Scorer { 29 | private: 30 | /* Fields */ 31 | ChromosomeOneDigit * chrom; 32 | ITableView * kmerTable; 33 | vector * scores; 34 | int k; 35 | int max; 36 | 37 | /* Methods */ 38 | void score(); 39 | void calculateMax(); 40 | 41 | public: 42 | /* Methods */ 43 | Scorer(ChromosomeOneDigit *, ITableView *); 44 | virtual ~Scorer(); 45 | void printScores(string, bool); 46 | vector* getScores(); 47 | int getK(); 48 | void takeLog(double); 49 | int countLessOrEqual(int); 50 | int getMax(); 51 | }; 52 | } 53 | 54 | #endif /* Scorer_H_ */ 55 | -------------------------------------------------------------------------------- /src/nonltr/TableBuilder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * TableBuilder.cpp 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "TableBuilder.h" 9 | 10 | TableBuilder::TableBuilder(string dir, int motifSize, int order, int minObs) { 11 | genomeDir = dir; 12 | k = motifSize; 13 | genomeLength = 0; 14 | // kmerTable = new KmerHashTable(k); 15 | // kmerTable = new EnrichmentView(k); 16 | 17 | // Whenever you change the template, modify line 50 and 70 and the header file line 35 18 | kmerTable = new EnrichmentMarkovView(k, order, minObs); 19 | 20 | buildTable(); 21 | } 22 | 23 | TableBuilder::~TableBuilder() { 24 | delete kmerTable; 25 | } 26 | 27 | void TableBuilder::buildTable() { 28 | vector * fileList = new vector(); 29 | Util::readChromList(genomeDir, fileList, "fa"); 30 | 31 | for (int i = 0; i < fileList->size(); i++) { 32 | cout << "Counting k-mers in " << fileList->at(i) << " ..." 
<< endl; 33 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 34 | const vector * chromList = maker->makeChromOneDigitDnaList(); 35 | 36 | for (int h = 0; h < chromList->size(); h++) { 37 | ChromosomeOneDigit * chrom = 38 | dynamic_cast(chromList->at(h)); 39 | if (chrom) { 40 | genomeLength += chrom->getEffectiveSize(); 41 | updateTable(chrom); 42 | } else { 43 | throw InvalidStateException(string("Dynamic cast failed.")); 44 | } 45 | } 46 | 47 | delete maker; 48 | } 49 | // Check if overflow has occurred 50 | kmerTable->checkOverflow(); 51 | 52 | // View 53 | // EnrichmentView * view = dynamic_cast(kmerTable); 54 | EnrichmentMarkovView * view = 55 | dynamic_cast *>(kmerTable); 56 | 57 | if (view) { 58 | view->generateProbapilities(); 59 | view->processTable(); 60 | maxValue = view->getMaxValue(); 61 | } else { 62 | throw InvalidStateException(string("Dynamic cast failed.")); 63 | } 64 | cout << "Enrichment view is ready." << endl; 65 | 66 | fileList->clear(); 67 | delete fileList; 68 | 69 | /* If you would like to see the contents of the table.*/ 70 | // kmerTable-> printTable(); 71 | } 72 | 73 | void TableBuilder::updateTable(ChromosomeOneDigit * chrom) { 74 | // EnrichmentView * view = dynamic_cast(kmerTable); 75 | EnrichmentMarkovView * view = 76 | dynamic_cast *>(kmerTable); 77 | 78 | const vector *> * segment = chrom->getSegment(); 79 | const char * segBases = chrom->getBase()->c_str(); 80 | 81 | for (int s = 0; s < segment->size(); s++) { 82 | int start = segment->at(s)->at(0); 83 | int end = segment->at(s)->at(1); 84 | // cerr << "The segment length is: " << (end-start+1) << endl; 85 | 86 | // Fast, but require some memory proportional to the segment length. 
87 | kmerTable->wholesaleIncrement(segBases, start, end - k + 1); 88 | if (view) { 89 | view->count(segBases, start, end); 90 | } else { 91 | throw InvalidStateException(string("Dynamic cast failed.")); 92 | } 93 | 94 | // Slow, but memory efficient 95 | /* 96 | vector hashList = vector(); 97 | kmerTable->hash(segBases, start, end - k + 1, &hashList); 98 | 99 | for (int i = start; i <= end - k + 1; i++) { 100 | kmerTable->increment(segBases, i); 101 | } 102 | */ 103 | } 104 | } 105 | 106 | KmerHashTable * const TableBuilder::getKmerTable() { 107 | return kmerTable; 108 | } 109 | 110 | long TableBuilder::getGenomeLength() { 111 | if (genomeLength < 0) { 112 | string msg("The length of the genome cannot be negative."); 113 | throw InvalidStateException(msg); 114 | } 115 | 116 | return genomeLength; 117 | } 118 | 119 | int TableBuilder::getMaxValue() { 120 | return maxValue; 121 | } 122 | -------------------------------------------------------------------------------- /src/nonltr/TableBuilder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TableBuilder.h 3 | * 4 | * Created on: Jul 31, 2012 5 | * Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH 6 | */ 7 | 8 | #ifndef TABLEBUILDER_H_ 9 | #define TABLEBUILDER_H_ 10 | 11 | #include "KmerHashTable.h" 12 | #include "EnrichmentMarkovView.h" 13 | #include "ChromosomeOneDigit.h" 14 | #include "ChromListMaker.h" 15 | #include "IChromosome.h" 16 | 17 | #include "../utility/Util.h" 18 | #include "../exception/InvalidStateException.h" 19 | 20 | #include 21 | 22 | using namespace std; 23 | using namespace nonltr; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace nonltr { 28 | class TableBuilder { 29 | private: 30 | /** 31 | * k-mer table 32 | */ 33 | KmerHashTable * kmerTable; 34 | int maxValue; 35 | 36 | /** 37 | * Directory including the FASTA files comprising the genome. 
38 | * These files must have the 39 | */ 40 | string genomeDir; 41 | 42 | /** 43 | * The size of the motif 44 | */ 45 | int k; 46 | 47 | /** 48 | * The total length of the whole genome 49 | */ 50 | long genomeLength; 51 | 52 | /** 53 | * Methods 54 | */ 55 | void buildTable(); 56 | void updateTable(ChromosomeOneDigit *); 57 | 58 | public: 59 | TableBuilder(string, int, int, int); 60 | virtual ~TableBuilder(); 61 | KmerHashTable * const getKmerTable(); 62 | void printTable(); 63 | long getGenomeLength(); 64 | int getMaxValue(); 65 | }; 66 | } 67 | 68 | #endif /* TABLEBUILDER_H_ */ 69 | -------------------------------------------------------------------------------- /src/nonltr/Trainer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Trainer.cpp 3 | * 4 | * Created on: Aug 20, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "Trainer.h" 9 | 10 | namespace nonltr { 11 | 12 | // Pass the isCND and the isCON parameters 13 | 14 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 15 | double tIn, string candidateDirIn, int m) : minObs(m) { 16 | candidateDir = candidateDirIn; 17 | canPrintCandidates = true; 18 | isCND = true; 19 | isCON = false; 20 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 21 | } 22 | 23 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 24 | double tIn, string candidateDirIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) { 25 | candidateDir = candidateDirIn; 26 | canPrintCandidates = true; 27 | isCND = isCNDIn; 28 | isCON = true; 29 | otherDir = otherDirIn; 30 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 31 | } 32 | 33 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn, 34 | double tIn, int m) : minObs(m) { 35 | canPrintCandidates = false; 36 | isCND = true; 37 | isCON = false; 38 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 39 | } 40 | 41 | Trainer::Trainer(string genomeDirIn, int orderIn, int 
kIn, double sIn, 42 | double tIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) { 43 | canPrintCandidates = false; 44 | isCND = isCNDIn; 45 | isCON = true; 46 | otherDir = otherDirIn; 47 | initialize(genomeDirIn, orderIn, kIn, sIn, tIn); 48 | } 49 | 50 | void Trainer::initialize(string genomeDirIn, int orderIn, int kIn, double sIn, 51 | double tIn) { 52 | 53 | if (isCND == false && isCON == false) { 54 | string msg("Training using the candidates or the other repeats is required. "); 55 | msg.append("Please specify which regions to be used for training. "); 56 | msg.append("Any of the two sets or a combination of both can be used."); 57 | throw InvalidStateException(msg); 58 | } 59 | 60 | genomeDir = genomeDirIn; 61 | fileList = new vector(); 62 | Util::readChromList(genomeDir, fileList, string("fa")); 63 | chromCount = fileList->size(); 64 | order = orderIn; 65 | k = kIn; 66 | s = sIn; 67 | t = tIn; 68 | p = 0.0; 69 | tDetector = tIn + 0.1; 70 | max = -1; 71 | 72 | stage1(); 73 | 74 | if (isCND) { 75 | stage2(); 76 | } 77 | stage3(); 78 | } 79 | 80 | Trainer::~Trainer() { 81 | fileList->clear(); 82 | delete fileList; 83 | delete builder; 84 | delete hmm; 85 | } 86 | 87 | /** 88 | * Stage 1: Building the table 89 | */ 90 | void Trainer::stage1() { 91 | cout << endl << endl; 92 | cout << "Stage 1: Building the table ..." << endl; 93 | builder = new TableBuilder(genomeDir, k, order, minObs); 94 | table = builder->getKmerTable(); 95 | genomeLength = builder->getGenomeLength(); 96 | max = builder->getMaxValue(); 97 | } 98 | 99 | void Trainer::stage2() { 100 | cout << endl << endl; 101 | cout << "Stage 2: Calculating the percentage ..." 
<< endl; 102 | 103 | double effectiveSize = 0.0; 104 | double countLessOrEqual = 0.0; 105 | for (int i = 0; i < chromCount; i++) { 106 | cout << "Calculating the percentage in: " << fileList->at(i) << " ..."; 107 | cout << endl; 108 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 109 | const vector * chromList = maker->makeChromOneDigitDnaList(); 110 | 111 | for (int h = 0; h < chromList->size(); h++) { 112 | ChromosomeOneDigit * chrom = 113 | dynamic_cast(chromList->at(h)); 114 | Scorer * scorer = new Scorer(chrom, table); 115 | 116 | effectiveSize += chrom->getEffectiveSize(); 117 | countLessOrEqual += scorer->countLessOrEqual(t); 118 | 119 | delete scorer; 120 | } 121 | delete maker; 122 | } 123 | 124 | if (effectiveSize == 0) { 125 | string msg("The size of the genome cannot be zero."); 126 | throw InvalidStateException(msg); 127 | } else { 128 | p = 100.00 * countLessOrEqual / effectiveSize; 129 | cout << "The percentage is " << p << endl; 130 | if (p < 52.5) { 131 | p = 52.5; 132 | cout << "The percentage is increased to " << p << endl; 133 | } 134 | } 135 | } 136 | 137 | /** 138 | * Stage 3: Training 139 | */ 140 | void Trainer::stage3() { 141 | cout << endl << endl; 142 | cout << "Stage 3: Training ..." << endl; 143 | 144 | // Handle the case when the threshold is one. 145 | bool isOne = false; 146 | if (fabs(t - 1.0) < std::numeric_limits::epsilon()) { 147 | isOne = true; 148 | } 149 | double hmmBase = isOne ? 
1.5 : t; 150 | 151 | // Make a list of candidate HMM 152 | int stateCount = 2 * (ceil(log(max) / log(hmmBase)) + 1); 153 | 154 | // Initialize the HMM 155 | hmm = new HMM(hmmBase, stateCount); 156 | 157 | // Start training the models 158 | for (int i = 0; i < chromCount; i++) { 159 | cout << "Training on: " << fileList->at(i) << endl; 160 | // Name of candidates file 161 | string path(fileList->at(i)); 162 | int slashLastIndex = path.find_last_of(Util::fileSeparator); 163 | int dotLastIndex = path.find_last_of("."); 164 | string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1); 165 | 166 | // May or may not be used 167 | string cndFile = candidateDir + Util::fileSeparator + nickName + ".cnd"; 168 | 169 | // Work on the other repeats if desired 170 | LocationListCollection * otherRegionListCollection; 171 | bool isConRepAvailable = false; 172 | if (isCON) { 173 | string otherFile = otherDir + Util::fileSeparator + nickName + ".rpt"; 174 | ifstream f1(otherFile.c_str()); 175 | if (!f1) { 176 | string message = string("Warning: "); 177 | message.append(otherFile); 178 | message.append(" does not exist. 
"); 179 | message.append("Repeats of this sequence will not used for training the HMM."); 180 | cout << message << endl; 181 | } else { 182 | otherRegionListCollection = new LocationListCollection(otherFile); 183 | otherRegionListCollection->convertToRedFormat(); 184 | otherRegionListCollection->trim(k - 1); 185 | 186 | isConRepAvailable = true; 187 | } 188 | f1.close(); 189 | } 190 | 191 | // Read sequences in the file 192 | ChromListMaker * maker = new ChromListMaker(fileList->at(i)); 193 | const vector * chromList = maker->makeChromOneDigitDnaList(); 194 | 195 | for (int h = 0; h < chromList->size(); h++) { 196 | ChromosomeOneDigit * chrom = dynamic_cast(chromList->at(h)); 197 | Scorer * scorer = new Scorer(chrom, table); 198 | vector * scoreList = scorer->getScores(); 199 | 200 | // Detect candidates if desired 201 | ChromDetectorMaxima * detector; 202 | const vector * trainingRegionList; 203 | bool canDeleteDetector = true; 204 | if (isCND) { 205 | if (canPrintCandidates) { 206 | detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p,s, scoreList, chrom); 207 | if (h > 0) { 208 | bool canAppend = true; 209 | detector->printIndex(cndFile, canAppend); 210 | } else { 211 | cout << "Printing candidates to: " << cndFile << endl; 212 | detector->printIndex(cndFile); 213 | } 214 | } else { 215 | detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p, s, scoreList, chrom->getSegment()); 216 | } 217 | trainingRegionList = detector->getRegionList(); 218 | 219 | 220 | } 221 | 222 | if (isCON && isConRepAvailable) { 223 | LocationList * const locList = otherRegionListCollection->getLocationList(chrom->getHeader()); 224 | if (isCND) { 225 | locList->mergeWithAnotherList(detector->getRegionList()); 226 | } 227 | trainingRegionList = locList->getList(); 228 | 229 | } 230 | 231 | // The candidate regions are already copied to the location list 232 | if (isCND && isCON && isConRepAvailable) { 233 | delete detector; 234 | canDeleteDetector = false; 235 | } 236 | 237 | // 
Train the HMM 238 | if(isCND || (isCON && isConRepAvailable)){ 239 | 240 | scorer->takeLog(t); 241 | scoreList = scorer->getScores(); 242 | hmm->train(scoreList, chrom->getSegment(), trainingRegionList); 243 | } 244 | 245 | // Free more memory 246 | if (isCND && canDeleteDetector) { 247 | delete detector; 248 | } 249 | delete scorer; 250 | } 251 | 252 | if (isCON && isConRepAvailable) { 253 | delete otherRegionListCollection; 254 | } 255 | delete maker; 256 | } 257 | 258 | // Normalize HMM's once training is finished 259 | hmm->normalize(); 260 | } 261 | 262 | void Trainer::printTable(string fileName) { 263 | table->printTable(fileName); 264 | } 265 | 266 | HMM*& Trainer::getHmm() { 267 | return hmm; 268 | } 269 | 270 | KmerHashTable * Trainer::getTable() { 271 | return table; 272 | } 273 | 274 | void Trainer::printHmm(string fileName) { 275 | hmm->print(fileName); 276 | } 277 | 278 | } /* namespace nonltr */ 279 | -------------------------------------------------------------------------------- /src/nonltr/Trainer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Trainer.h 3 | * 4 | * Created on: Aug 20, 2013 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef TRAINER_H_ 9 | #define TRAINER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "TableBuilder.h" 18 | #include "KmerHashTable.h" 19 | #include "HMM.h" 20 | #include "ChromDetectorMaxima.h" 21 | #include "Scorer.h" 22 | #include "ChromListMaker.h" 23 | #include "LocationListCollection.h" 24 | #include "../utility/Util.h" 25 | #include "../exception/InvalidStateException.h" 26 | 27 | using namespace std; 28 | using namespace utility; 29 | using namespace exception; 30 | 31 | namespace nonltr { 32 | 33 | class Trainer { 34 | private: 35 | string genomeDir; 36 | string candidateDir; 37 | string otherDir; 38 | bool canPrintCandidates; 39 | vector * fileList; 40 | int chromCount; 41 | int order; 42 | int k; 43 | 
int max; // Maximum score in the entire genome 44 | double t; // Score threshold 45 | double tDetector; // threshold for the detector because it uses < not <=; 46 | double p; // Percentage of scores below the threshold, t, in non-repeats 47 | //double r; 48 | double s; // Half width of the mask 49 | long genomeLength; 50 | //vector * sampleList; 51 | TableBuilder * builder; 52 | KmerHashTable * table; 53 | HMM * hmm; 54 | int isCND; 55 | int isCON; 56 | // The minimum number of the observed k-mers 57 | const int minObs; 58 | 59 | void stage1(); 60 | void stage2(); 61 | void stage3(); 62 | //void stage4(); 63 | 64 | public: 65 | Trainer(string, int, int, double, double, string, int); 66 | Trainer(string, int, int, double, double, string, bool, string, int); 67 | Trainer(string, int, int, double, double, int); 68 | Trainer(string, int, int, double, double, bool, string, int); 69 | 70 | void initialize(string, int, int, double, double); 71 | virtual ~Trainer(); 72 | void printTable(string); 73 | void printHmm(string); 74 | HMM*& getHmm(); 75 | KmerHashTable * getTable(); 76 | 77 | }; 78 | 79 | } /* namespace nonltr */ 80 | #endif /* TRAINER_H_ */ 81 | -------------------------------------------------------------------------------- /src/predict/BestFirstSelector.h: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * BestFirstSelector.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef BEST_FIRST_SELECTOR_H 9 | #define BEST_FIRST_SELECTOR_H 10 | #include "FeatureSelector.h" 11 | #include 12 | template 13 | class BestFirstSelector : public FeatureSelector { 14 | public: 15 | BestFirstSelector(vector > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {} 16 | ~BestFirstSelector() {} 17 | 18 | pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing); 19 | pair*,matrix::GLM> 
train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id); 20 | 21 | private: 22 | int max_num_feat, min_num_feat; 23 | vector > possible_feats; 24 | }; 25 | #endif 26 | -------------------------------------------------------------------------------- /src/predict/FeatureSelector.cpp: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * FeatureSelector.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #include "FeatureSelector.h" 9 | template 10 | std::pair FeatureSelector::generate_feat_mat(const vector > &data, Feature& feat, double cutoff) 11 | { 12 | bool classify = (cutoff > 0); 13 | int nrows = data.size(); 14 | int ncols = feat.size()+1; 15 | matrix::Matrix feat_mat(nrows, ncols); 16 | matrix::Matrix labels(nrows, 1); 17 | // #pragma omp parallel for 18 | for (int row = 0; row < data.size(); row++) { 19 | auto kv = data.at(row); 20 | vector cache; 21 | // #pragma omp critical 22 | // { 23 | cache = feat.compute(*kv.first, *kv.second); 24 | // } 25 | feat_mat.set(row, 0, 1); 26 | if (classify) { 27 | labels.set(row, 0, kv.val >= cutoff ? 
1 : -1); 28 | } else { 29 | labels.set(row, 0, kv.val); 30 | // labels.set(row, 0, (kv.val - smin) / (smax - smin)); 31 | } 32 | for (int col = 1; col < ncols; col++) { 33 | double val = feat(col-1, cache); 34 | feat_mat.set(row, col, val); 35 | } 36 | } 37 | return std::make_pair(feat_mat, labels); 38 | } 39 | 40 | 41 | template 42 | std::pair FeatureSelector::regression_train(const vector > &data, Feature& feat) 43 | { 44 | auto pr = generate_feat_mat(data, feat, -1); 45 | matrix::GLM glm; 46 | glm.train(pr.first, pr.second); 47 | auto result1 = pr.first * glm.get_weights(); 48 | auto diff1 = result1 - pr.second; 49 | double sum = 0; 50 | for (int i = 0; i < diff1.getNumRow(); i++) { 51 | sum += fabs(diff1.get(i, 0)); 52 | } 53 | sum /= diff1.getNumRow(); 54 | return {sum, glm}; 55 | } 56 | 57 | template 58 | std::pair FeatureSelector::class_train(const vector > &data, Feature& feat, double cutoff) 59 | { 60 | auto pr = generate_feat_mat(data, feat, cutoff); 61 | matrix::GLM glm; 62 | glm.train(pr.first, pr.second); 63 | matrix::Matrix p = glm.predict(pr.first); 64 | for (int row = 0; row < p.getNumRow(); row++) { 65 | if (p.get(row, 0) == 0) { 66 | p.set(row, 0, -1); 67 | } 68 | } 69 | auto tup = glm.accuracy(pr.second, p); 70 | double acc = get<0>(tup); 71 | double sens = get<1>(tup); 72 | double spec = get<2>(tup); 73 | return {acc, glm}; 74 | } 75 | 76 | template 77 | double FeatureSelector::regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm) 78 | { 79 | auto pr = generate_feat_mat(data, feat, -1); 80 | auto result1 = pr.first * glm.get_weights(); 81 | auto diff1 = result1 - pr.second; 82 | double sum = 0; 83 | for (int i = 0; i < diff1.getNumRow(); i++) { 84 | sum += fabs(diff1.get(i, 0)); 85 | } 86 | sum /= diff1.getNumRow(); 87 | return sum; 88 | } 89 | 90 | template 91 | tuple FeatureSelector::class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff) 92 | { 93 | auto pr = generate_feat_mat(data, feat, 
cutoff); 94 | matrix::Matrix p = glm.predict(pr.first); 95 | for (int row = 0; row < p.getNumRow(); row++) { 96 | if (p.get(row, 0) == 0) { 97 | p.set(row, 0, -1); 98 | } 99 | } 100 | auto tup = glm.accuracy(pr.second, p); 101 | return tup; 102 | 103 | } 104 | 105 | template class FeatureSelector; 106 | template class FeatureSelector; 107 | template class FeatureSelector; 108 | template class FeatureSelector; 109 | template class FeatureSelector; 110 | template class FeatureSelector; 111 | -------------------------------------------------------------------------------- /src/predict/FeatureSelector.h: -------------------------------------------------------------------------------- 1 | // -*- C++ -*- 2 | /* 3 | * FeatureSelector.h 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | 8 | #ifndef FEATURE_SELECTOR_H 9 | #define FEATURE_SELECTOR_H 10 | 11 | #include "GLM.h" 12 | #include "Feature.h" 13 | 14 | template 15 | class FeatureSelector { 16 | public: 17 | virtual ~FeatureSelector() {}; 18 | static std::pair generate_feat_mat(const vector > &data, Feature& feat, double cutoff); 19 | static std::pair class_train(const vector > &data, Feature& feat, double cutoff); 20 | static std::pair regression_train(const vector > &data, Feature& feat); 21 | static double regression_test(const vector >& data, Feature& feat, const matrix::GLM& glm); 22 | static tuple class_test(const vector >& data, Feature& feat, const matrix::GLM& glm, double cutoff); 23 | 24 | virtual pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing) = 0; 25 | virtual pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id) = 0; 26 | }; 27 | #endif 28 | -------------------------------------------------------------------------------- /src/predict/GLM.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * glm.cpp 3 | * 4 | * Created on: May 29, 2017 5 | * Author: Robert 
Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa 6 | * 7 | * Modified by Benjamin T James 8 | */ 9 | 10 | #include "GLM.h" 11 | #include "Matrix.h" 12 | 13 | #include 14 | #include 15 | using namespace std; 16 | // using namespace matrix; 17 | 18 | namespace matrix{ 19 | 20 | void GLM::train(Matrix& features, Matrix& labels){ 21 | weights = features.transpose() * features; 22 | weights = weights.pseudoInverse() * features.transpose() * labels; 23 | } 24 | 25 | 26 | double GLM::logistic(double x) 27 | { 28 | return 1.0 / (1 + exp(-x)); 29 | } 30 | Matrix GLM::predict(Matrix& features) const { 31 | Matrix labels; 32 | labels = features * weights; 33 | double log; 34 | for(int i = 0; i < labels.getNumRow(); i++){ 35 | //log = round(1/(1 + exp(-(labels.get(i,0)))) + 0.1); 36 | labels.set(i,0, round(logistic(labels.get(i, 0)))); 37 | } 38 | return labels; 39 | } 40 | 41 | std::tuple GLM::accuracy(Matrix& oLabels, Matrix& pLabels) const { 42 | int sum = 0; 43 | int negSum = 0; 44 | int negSame = 0; 45 | int posSum = 0; 46 | int posSame = 0; 47 | for(int i = 0; i < oLabels.getNumRow(); i++){ 48 | if(oLabels.get(i,0) == -1){ 49 | negSum++; 50 | if(oLabels.get(i,0) == pLabels.get(i, 0)){ 51 | sum++; 52 | negSame++; 53 | } 54 | }else{ 55 | posSum++; 56 | if(oLabels.get(i,0) == pLabels.get(i, 0)){ 57 | sum++; 58 | posSame++; 59 | } 60 | } 61 | } 62 | double acc = (((double)sum*100)/(oLabels.getNumRow())); 63 | double sens = (((double)posSame*100)/(posSum)); 64 | double spec = (((double)negSame*100)/(negSum)); 65 | // cout << "Accuracy: " << acc << "% "; 66 | // cout << "Sensitivity: " << sens << "% "; 67 | // cout << "Specificity: " << spec << "% " << endl; 68 | return make_tuple(acc, sens, spec); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/predict/GLM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * glm.h 3 | * 4 | * Created on: May 29, 
2017 5 | * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa 6 | * 7 | * Modified by Benjamin T James 8 | */ 9 | 10 | #ifndef SRC_MATRIX_GLM_H_ 11 | #define SRC_MATRIX_GLM_H_ 12 | 13 | #include "Matrix.h" 14 | #include 15 | namespace matrix { 16 | 17 | class GLM { 18 | private: 19 | Matrix weights; 20 | 21 | public: 22 | void load(Matrix weights_) { weights = weights_; } 23 | void train(matrix::Matrix& features, matrix::Matrix& labels); 24 | Matrix predict(matrix::Matrix& features) const; 25 | static double logistic(double x); 26 | static double linear(double x); 27 | std::tuple accuracy(matrix::Matrix& oLabels, matrix::Matrix& pLabels) const; 28 | const Matrix& get_weights() const { return weights; }; 29 | }; 30 | 31 | } 32 | 33 | #endif /* SRC_MATRIX_GLM_H_ */ 34 | -------------------------------------------------------------------------------- /src/predict/GreedySelector.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * GreedySelector.cpp 4 | * 5 | * Author: Benjamin T James 6 | */ 7 | #include "GreedySelector.h" 8 | #include "../clutil/Progress.h" 9 | 10 | template 11 | pair*,matrix::GLM> GreedySelector::train_regression(Feature* feat, const vector > &training,const vector > &testing) 12 | { 13 | auto c_size = feat->get_combos().size(); 14 | for (int i = 0; i < c_size; i++) { 15 | feat->remove_feature(); 16 | } 17 | vector used_list; 18 | double abs_best_regr = 1000000; 19 | // Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); 20 | for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { 21 | double best_regr_err = abs_best_regr; 22 | uintmax_t best_idx = -1, cur_idx = 1; 23 | auto best_regr_feat = possible_feats.front(); 24 | for (uint64_t i = 0; i < possible_feats.size(); i++) { 25 | if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { 26 | continue; 27 | } 28 | auto rfeat = possible_feats[i]; 29 
| feat->add_feature(rfeat.first, rfeat.second); 30 | feat->normalize(training); 31 | feat->finalize(); 32 | auto pr = FeatureSelector::regression_train(training, *feat); 33 | auto name = feat->feat_names().back(); 34 | double regr_mse = FeatureSelector::regression_test(testing, *feat, pr.second); 35 | feat->remove_feature(); 36 | // prog++; 37 | //cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl; 38 | if (regr_mse < best_regr_err) { 39 | best_regr_err = regr_mse; 40 | best_regr_feat = rfeat; 41 | best_idx = i; 42 | } 43 | } 44 | if (best_regr_err < abs_best_regr) { 45 | feat->add_feature(best_regr_feat.first, best_regr_feat.second); 46 | feat->normalize(training); 47 | feat->finalize(); 48 | abs_best_regr = best_regr_err; 49 | used_list.push_back(best_idx); 50 | //possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end()); 51 | } 52 | } 53 | // prog.end(); 54 | 55 | Feature* feat_r = new Feature(*feat); 56 | feat_r->set_save(false); 57 | auto pr = FeatureSelector::regression_train(training, *feat_r); 58 | matrix::GLM r_glm = pr.second; 59 | double tr_regr_mse = FeatureSelector::regression_test(testing, *feat_r, r_glm); // "training" 60 | cout << "Training Mean Error: " << pr.first << endl; 61 | double regr_mse = FeatureSelector::regression_test(testing, *feat_r, r_glm);//, "testing"); 62 | cout << "Testing Mean Error: " << regr_mse << endl; 63 | cout << "Features: "<< endl; 64 | for (auto line : feat_r->feat_names()) { 65 | cout << "\t" << line << endl; 66 | } 67 | auto w = r_glm.get_weights(); 68 | for (int r = 0; r < w.getNumRow(); r++) { 69 | cout << "weight: "; 70 | for (int c = 0; c < w.getNumCol(); c++) { 71 | cout << w.get(r, c) << " "; 72 | } 73 | cout << endl; 74 | } 75 | 76 | } 77 | 78 | template 79 | std::pair*,matrix::GLM> GreedySelector::train_class(Feature* feat, const 
vector > &training,const vector > &testing, double id) 80 | { 81 | auto c_size = feat->get_combos().size(); 82 | for (int i = 0; i < c_size; i++) { 83 | feat->remove_feature(); 84 | } 85 | vector used_list; 86 | double abs_best_acc = 0; 87 | // cout << "possible feats at one step: " << possible_feats.size() << endl; 88 | Progress prog(possible_feats.size() * max_num_feat, "Feature selection:"); 89 | 90 | std::ostringstream oss; 91 | for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) { 92 | double best_class_acc = abs_best_acc; 93 | uintmax_t best_idx = -1, cur_idx = 1; 94 | auto best_class_feat = possible_feats.front(); 95 | for (uint64_t i = 0; i < possible_feats.size(); i++) { 96 | if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) { 97 | continue; 98 | } 99 | auto rfeat = possible_feats[i]; 100 | feat->add_feature(rfeat.first, rfeat.second); 101 | feat->normalize(training); 102 | feat->finalize(); 103 | auto name = feat->feat_names().back(); 104 | auto pr = FeatureSelector::class_train(training, *feat, id); 105 | auto class_ac = FeatureSelector::class_test(testing, *feat, pr.second, id); 106 | double class_accuracy = get<0>(class_ac);//sqrt(get<1>(class_ac) * get<2>(class_ac)); 107 | feat->remove_feature(); 108 | prog++; 109 | // cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " acc: " << get<0>(class_ac) << " sens: " << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl; 110 | if (class_accuracy > best_class_acc) { 111 | best_class_acc = class_accuracy; 112 | best_class_feat = rfeat; 113 | best_idx = i; 114 | } 115 | } 116 | /* accept the feature if either 1. we don't have enough features 117 | * or 2. 
it improves accuracy by over 0.5% 118 | */ 119 | if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) { 120 | feat->add_feature(best_class_feat.first, best_class_feat.second); 121 | feat->normalize(training); 122 | feat->finalize(); 123 | abs_best_acc = best_class_acc; 124 | used_list.push_back(best_idx); 125 | oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl; 126 | oss << "Accuracy: " << best_class_acc << endl; 127 | possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end()); 128 | } 129 | } 130 | prog.end(); 131 | cout << oss.str(); 132 | Feature* feat_c = new Feature(*feat); 133 | feat_c->set_save(false); 134 | auto pr = FeatureSelector::class_train(training, *feat_c, id); 135 | matrix::GLM c_glm = pr.second; 136 | auto train_results = FeatureSelector::class_test(training, *feat_c, c_glm, id);//, "train"); 137 | cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl; 138 | auto test_results = FeatureSelector::class_test(testing, *feat_c, c_glm, id);//, "test"); 139 | double class_acc = get<0>(test_results); 140 | cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl; 141 | 142 | cout << "Features: "<< endl; 143 | for (auto line : feat_c->feat_names()) { 144 | cout << "\t" << line << endl; 145 | } 146 | return std::make_pair(feat_c, c_glm); 147 | } 148 | 149 | template class GreedySelector; 150 | template class GreedySelector; 151 | template class GreedySelector; 152 | template class GreedySelector; 153 | template class GreedySelector; 154 | template class GreedySelector; 155 | -------------------------------------------------------------------------------- /src/predict/GreedySelector.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * GreedySelector.h 4 | * 5 | * 
Author: Benjamin T James 6 | */ 7 | 8 | #ifndef GREEDY_SELECTOR_H 9 | #define GREEDY_SELECTOR_H 10 | #include "FeatureSelector.h" 11 | 12 | template 13 | class GreedySelector : public FeatureSelector { 14 | public: 15 | GreedySelector(vector > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {} 16 | ~GreedySelector() {} 17 | pair*,matrix::GLM> train_regression(Feature* tfeat, const vector > &training,const vector > &testing); 18 | pair*,matrix::GLM> train_class(Feature* tfeat, const vector > &training,const vector > &testing, double id); 19 | private: 20 | int max_num_feat, min_num_feat; 21 | vector > possible_feats; 22 | }; 23 | #endif 24 | -------------------------------------------------------------------------------- /src/predict/HandleSeq.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Alex Baumgartner 3 | * Modified by Benjamin T James 4 | * The Bioinformatics Toolsmith Laboratory, the University of Tulsa 5 | * 5/15/2018 6 | * 7 | * Purpose: 8 | * The pupose of this module is to take a sequence and mutate it to returns 9 | It also serves as a way to parse a file for all sequences 10 | */ 11 | 12 | #include "HandleSeq.h" 13 | #include 14 | // d 15 | HandleSeq::HandleSeq(int m, std::random_device::result_type rnd) { 16 | 17 | mode = m & HandleSeq::BOTH; 18 | enableTrans = m & HandleSeq::TRANSLOCATION; 19 | enableRev = m & HandleSeq::REVERSION; 20 | random = new LCG(rnd); 21 | // disable = (m & HandleSeq::ATYPICAL) > 0 ? 
0 : 1; 22 | } 23 | 24 | pair, vector> HandleSeq::parseFile(string fileName) { 25 | ifstream fileIn; 26 | //Uses the file the user supplies to take in sequences 27 | fileIn.open(fileName, ifstream::in); 28 | if(fileIn.is_open()){ 29 | vector sequences; 30 | vector names; 31 | string inString; 32 | //Boolean to make sure that the first sequence 33 | //has already been found, prevents a null string being written 34 | bool foundFirst = false; 35 | string currentLine; 36 | while (!fileIn.eof()) { 37 | getline(fileIn, currentLine); 38 | //Skip the line if nothing is on it 39 | if (currentLine.length() == 0) { 40 | continue; 41 | } 42 | //If the line has a '>' symbol, the start of a new sequence 43 | else if (currentLine.at(0) == '>' && foundFirst) { 44 | //Push the current saved sequene onto the vector, 45 | //then reset the strings value 46 | sequences.push_back(inString); 47 | names.push_back(currentLine.substr(1, currentLine.find_first_of(' '))); 48 | inString = ""; 49 | } 50 | else if(currentLine.at(0) == '>' && !foundFirst){ 51 | foundFirst = true; 52 | names.push_back(currentLine.substr(1, currentLine.find_first_of(' '))); 53 | } 54 | //If this is the first >, set found first to true 55 | else if (!foundFirst) { 56 | foundFirst = true; 57 | } 58 | //Otherwise, add the current Line to 59 | //the string of current lines 60 | else { 61 | inString = inString + currentLine; 62 | } 63 | } 64 | //Push the last found string on 65 | //(There is no > at the end of a .fa file) 66 | sequences.push_back(inString); 67 | fileIn.close(); 68 | return {names, sequences}; 69 | } 70 | else{ 71 | cout << "Could not find File" << endl; 72 | exit(2); 73 | } 74 | } 75 | 76 | pair HandleSeq::mutate(string sequence, int muteRate, int split) { 77 | percMute = muteRate; 78 | if (muteRate == 0) { 79 | return std::make_pair(1, sequence); 80 | } 81 | auto nucls = countNucl(sequence); 82 | //Assing the percent of each nucleotide in the sequence 83 | int percAs = (nucls.at(0) * 100) / 
sequence.length(); 84 | int percCs = (nucls.at(1) * 100) / sequence.length(); 85 | int percGs = (nucls.at(2) * 100) / sequence.length(); 86 | int percTs = (nucls.at(3) * 100) / sequence.length(); 87 | int percMulti, percSing; 88 | string * seq = new string(sequence); 89 | int length = sequence.length(); 90 | //If the user only wants single 91 | if (mode == 1) { 92 | percMulti = 0; 93 | //Allocate all mutations to single 94 | percSing = percMute; 95 | } 96 | //Or if the user only wants non single 97 | else if (mode == 2) { 98 | //Allocate all mutations to non-single 99 | percSing = 0; 100 | percMulti = percMute; 101 | } 102 | //Otherwise, assing a random percentage to both 103 | else { 104 | percMulti = split; 105 | // percMulti = random.randMod(percMute); 106 | percSing = percMute - percMulti; 107 | } 108 | //Define a new multiple mutation 109 | MultiMute multi(percAs, percCs, percGs, percTs, 110 | percMulti, enableTrans, enableRev, random->nextRandSeed()); 111 | //Run the multiple mutations, 112 | //get back its vector of what is valid to mutate and what isn't 113 | vector mutes = multi.genMulti(seq); 114 | uint64_t cnt = 0; 115 | for (bool b : mutes) { 116 | cnt += b ? 
1 : 0; 117 | } 118 | if (mutes.size() != seq->length()) { 119 | cerr << "mutation size is not matching the multi-sequence" << endl; 120 | throw 100; 121 | } 122 | SingMute sing(percAs, percCs, percGs, percTs, 123 | percSing, seq, mutes, random->nextRandSeed()); 124 | float alignmentLength = multi.getAlignmentLength() + sing.getAlignmentLength() + length; 125 | // cout << "alignLength: " << alignmentLength << endl; 126 | float IBP = length - multi.getIBP() - sing.getIBP(); 127 | // cout << "ibp: " << IBP << endl; 128 | float alignment = IBP / alignmentLength; 129 | // cout << "ratio: size: " << mutes.size() << " expected: " << (float)cnt / mutes.size() << " found: " << ((float)length - multi.getIBP()) / ((float)multi.getAlignmentLength() + length) << " align: " << alignment << endl; 130 | //assign the sequence to the 131 | //value that the seq pointer stores to 132 | //clear the heap 133 | delete seq; 134 | //Return the now mutated sequence 135 | std::string outseq = sing.getSeq(); 136 | return make_pair(alignment, outseq); 137 | } 138 | 139 | vector HandleSeq::countNucl(const string& sequence) { 140 | int a = 0; 141 | int c = 0; 142 | int g = 0; 143 | int t = 0; 144 | for (int i = 0; i < sequence.length(); i++) { 145 | if (sequence.at(i) == 'A') { 146 | a++; 147 | } else if (sequence.at(i) == 'C') { 148 | c++; 149 | } else if (sequence.at(i) == 'G') { 150 | g++; 151 | } else if (sequence.at(i) == 'T') { 152 | t++; 153 | } 154 | } 155 | vector values; 156 | values.push_back(a); 157 | values.push_back(c); 158 | values.push_back(g); 159 | values.push_back(t); 160 | return values; 161 | } 162 | -------------------------------------------------------------------------------- /src/predict/HandleSeq.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Alex Baumgartner 3 | * Modified by Benjamin T James 4 | * The Bioinformatics Toolsmith Laboratory, the University of Tulsa 5 | * 5/15/2018 6 | * 7 | * Purpose: 8 | * The 
pupose of this module is to take a sequence and mutate it to returns 9 | It also serves as a way to parse a file for all sequences 10 | */ 11 | #ifndef HANDLESEQ_H 12 | #define HANDLESEQ_H 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "LCG.h" 19 | #include "MultiMute.h" 20 | #include "SingMute.h" 21 | 22 | using namespace std; 23 | 24 | class HandleSeq { 25 | public: 26 | // Single — point — mutations only 27 | static const int SINGLE = (1 << 0); 28 | // Non-single point mutations only 29 | static const int NON_SINGLE = (1 << 1); 30 | // Single and non-single mutations 31 | static const int BOTH = SINGLE | NON_SINGLE; 32 | // translocations and reversions 33 | static const int TRANSLOCATION = (1 << 2); 34 | static const int REVERSION = (1 << 3); 35 | static const int ATYPICAL = TRANSLOCATION | REVERSION; 36 | static const int ALL = ATYPICAL | BOTH; 37 | /* 38 | constructor 39 | 40 | @param: 41 | int: the mode of the program 42 | (Single only = 1, nonsingle only = 2, both = 3) 43 | */ 44 | HandleSeq(int, std::random_device::result_type seed); 45 | ~HandleSeq() { if (random != NULL) { delete random; }} 46 | /* 47 | returns a vector of all sequences in a file inputted 48 | 49 | @param: 50 | std::string: file name 51 | int: the mutation rate 52 | 53 | @return: 54 | std::vector: Vector of all found sequences 55 | */ 56 | pair, vector> parseFile(string); 57 | /* 58 | Mutates a sequence based on parameters inputted in constructor, 59 | and returns the mutated sequence 60 | */ 61 | pair mutate(string, int, int); 62 | 63 | uint32_t getSeed() const { return seed; } 64 | private: 65 | uint32_t seed; 66 | int mode; 67 | int percMute; 68 | bool enableTrans, enableRev; 69 | LCG *random = NULL; 70 | /* 71 | Counts the nucleotides in a file, 72 | and returns a vector corresponding to their values {A, C, G, T} 73 | 74 | @param: 75 | std::string: the sequences 76 | 77 | @return: 78 | std::vector: vector containing ints of each nucleotide count 79 | */ 80 
| vector countNucl(const string&); 81 | 82 | }; 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /src/predict/Matrix.h: -------------------------------------------------------------------------------- 1 | /* 2 | * matrix.h 3 | * 4 | * Created on: May 10, 2017 5 | * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa 6 | * Modified by Benjamin T James 7 | */ 8 | 9 | 10 | #ifndef MATRIX_H_ 11 | #define MATRIX_H_ 12 | 13 | #include 14 | #include 15 | 16 | namespace matrix { 17 | 18 | class Matrix 19 | { 20 | private: 21 | std::vector > m; 22 | int numRow; 23 | int numCol; 24 | 25 | 26 | public: 27 | Matrix(std::vector m); 28 | Matrix(int r, int c); 29 | Matrix(); 30 | ~Matrix(); 31 | Matrix operator+(Matrix n); 32 | Matrix operator-(Matrix n); 33 | Matrix operator*(Matrix n); 34 | Matrix transpose(); 35 | Matrix gaussJordanInverse(); 36 | Matrix pseudoInverse(); 37 | void userFill(); 38 | double determinant(); 39 | double get(int r, int c) const; 40 | void set(int r, int c, double val); 41 | void addRow(double); 42 | void addCol(double); 43 | void print(); 44 | void printToFile(std::string); 45 | void randFill(double low, double high); 46 | void fileFill(std::string filename); 47 | void normalize(double a, double b); 48 | void rowToVector(int, std::vector&); 49 | void colToVector(int, std::vector&); 50 | int getNumRow() const; 51 | int getNumCol() const { return numCol; }; 52 | }; 53 | } 54 | #endif /* MATRIX_H_ */ 55 | -------------------------------------------------------------------------------- /src/predict/MultiMute.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Alex Baumgartner 3 | * Modified by Benjamin T James 4 | * The Bioinformatics Toolsmith Laboratory, the University of Tulsa 5 | * 5/15/2018 6 | * 7 | * Purpose: 8 | * The pupose of this module is to perform non single mutations on sequences 9 | */ 10 | 11 | 
#ifndef MULTIMUTE_H 12 | #define MULTIMUTE_H 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "Random.h" 20 | #include "LCG.h" 21 | using namespace std; 22 | 23 | class MultiMute { 24 | public: 25 | /* 26 | Constructor, creates values 27 | and assignes allocations based on inputted data 28 | 29 | @param: 30 | int: percentage of A's 31 | int: percentage of C's 32 | int: percentage of G's 33 | int: percentage of T's 34 | int: The total allocation for non-single mutations 35 | int: bool to exclude Translocate and reverse, 1 for disable, any other umber for include 36 | */ 37 | MultiMute(int, int, int, int, int, bool, bool, std::random_device::result_type); 38 | /* 39 | Takes in a string pointer, 40 | and mutates it based on the allocation given to the constructor. 41 | Returns a vector of all valid and invalid indexes 42 | 43 | @param: 44 | std::string *: pointer to the string to be mutated 45 | 46 | @return: 47 | std::vector: vector of mutations, 48 | false means that index has been mutated 49 | */ 50 | std::vector genMulti(std::string *); 51 | int getAlignmentLength(); 52 | int getIBP(); 53 | 54 | private: 55 | int percAs; 56 | int percCs; 57 | int percGs; 58 | int percTs; 59 | int64_t maxReverse; 60 | int64_t maxInsert; 61 | int64_t maxTrans; 62 | int64_t maxDel; 63 | int64_t maxDup; 64 | int64_t maxNonMutations; 65 | int64_t alignmentLength; 66 | int64_t IBP; 67 | int64_t total_alloc; 68 | LCG rng; 69 | 70 | int64_t max_block_size; 71 | std::vector * insertions; 72 | std::vector * mutationStrings; 73 | std::string * seq; 74 | /* 75 | Takes in a vector 76 | */ 77 | void reverse(vector *); 78 | /* 79 | Translocates a random, nonmutaded part of the sequence, 80 | no larger than its max allocation 81 | */ 82 | void translocate(vector *); 83 | /* 84 | Inserts at random, nonmutaded part of the sequence, 85 | no larger than its max allocation 86 | */ 87 | void insert(vector *); 88 | /* 89 | Deletes a random, nonmutaded part of the 
sequence, 90 | no larger than its max allocation 91 | */ 92 | void deleteNucl(vector *); 93 | /* 94 | Duplicates a random, nonmutaded part of the sequence, 95 | no larger than its max allocation 96 | to an index directly after the duplicated string 97 | */ 98 | void duplicate(vector *); 99 | /* 100 | Checks inclusively, [first, last], if a portion is valid 101 | 102 | @param: 103 | int: The starting index (first) 104 | int: The ending index (last) 105 | 106 | @return: 107 | bool: true if all indexes in range are valid 108 | */ 109 | bool checkPalindrome(int, int); 110 | void checkForAllPalindromes(vector *); 111 | /* 112 | Marks all indexes in the range as invalid 113 | 114 | @param: 115 | int: first index to be marked false 116 | int: last index tobe marked false 117 | */ 118 | vector formatString(int, vector *); 119 | 120 | /* 121 | Generates a randomized string based on the inputed size 122 | @param: 123 | int: size of string to generate 124 | @return 125 | string: randomized string 126 | */ 127 | std::string genInsert(int); 128 | /* 129 | Adds all translocations to the insertions array 130 | @param: 131 | vector *: pointer to a char vector with mutation characters 132 | */ 133 | void getTranslocations(vector *); 134 | /* 135 | converts a vector of strings into a vector of chars 136 | @param: 137 | vector *: the vector to be converted 138 | @return: 139 | vector *: the vector of characters 140 | */ 141 | vector * genCharVector(vector *); 142 | }; 143 | #endif 144 | -------------------------------------------------------------------------------- /src/predict/Predictor.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- 2 | * 3 | * Predictor.h 4 | * 5 | * Author: Benjamin T James 6 | * 7 | * Main class for training and prediction 8 | * Does bulk training, but can be adapted for on-line training 9 | */ 10 | 11 | #ifndef PREDICTOR_H 12 | #define PREDICTOR_H 13 | 14 | #include "GLM.h" 15 | #include "Point.h" 16 | 
#include "Feature.h" 17 | #include 18 | #include "Random.h" 19 | #include 20 | #define PRED_MODE_CLASS 1 21 | #define PRED_MODE_REGR 2 22 | 23 | #define PRED_FEAT_FAST (FEAT_EUCLIDEAN | FEAT_MANHATTAN | FEAT_INTERSECTION | FEAT_KULCZYNSKI2 | FEAT_SIMRATIO | FEAT_NORMALIZED_VECTORS | FEAT_PEARSON_COEFF | FEAT_EMD | FEAT_LENGTHD ) 24 | #define PRED_FEAT_DIV (FEAT_JEFFEREY_DIV | FEAT_JENSEN_SHANNON) 25 | #define PRED_FEAT_ALL (FEAT_HELLINGER|FEAT_MANHATTAN|FEAT_EUCLIDEAN|FEAT_CHI_SQUARED|FEAT_NORMALIZED_VECTORS|FEAT_HARMONIC_MEAN|FEAT_JEFFEREY_DIV|FEAT_K_DIV|FEAT_PEARSON_COEFF|FEAT_SQCHORD|FEAT_KL_COND|FEAT_MARKOV|FEAT_INTERSECTION|FEAT_RRE_K_R|FEAT_D2z|FEAT_SIM_MM|FEAT_EUCLIDEAN_Z|FEAT_EMD|FEAT_SPEARMAN|FEAT_JACCARD|FEAT_LENGTHD|FEAT_D2s|FEAT_AFD|FEAT_MISMATCH|FEAT_CANBERRA|FEAT_KULCZYNSKI1|FEAT_KULCZYNSKI2|FEAT_SIMRATIO|FEAT_JENSEN_SHANNON|FEAT_D2_star|FEAT_N2R|FEAT_N2RC|FEAT_N2RRC) 26 | 27 | template 28 | class Predictor { /* NOTE(review): this class deletes feat_c / feat_r and destroys an OpenMP lock in its destructor, but no copy constructor or copy assignment is declared in this header; copying an instance would free the same pointers twice - confirm instances are never copied */ 29 | public: 30 | /* Stores the settings, registers the requested feature flags through add_feats, clears both feature pointers and initializes the OpenMP lock. min_id_ is stored multiplied by 100. */ Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100), feats64(feats) { 31 | add_feats(possible_feats, feats); 32 | feat_c = NULL; 33 | feat_r = NULL; 34 | omp_init_lock(&lock); 35 | }; 36 | Predictor(const std::string filename); 37 | /* Destroys the OpenMP lock, deletes feat_c and feat_r when they are set, and clears the feature and sample containers. */ ~Predictor() { 38 | possible_feats.clear(); 39 | omp_destroy_lock(&lock); 40 | if (feat_c) { 41 | delete feat_c; 42 | } 43 | if (feat_r) { 44 | delete feat_r; 45 | } 46 | training.clear(); 47 | testing.clear(); 48 | } 49 | static double classify_sum(double sum); 50 | static void set_bias(double bias); 51 | void train(const std::vector* >& vec, uintmax_t& _id, size_t num_sample, size_t n_templates); 52 | double similarity(Point* a, Point* b); 53 | bool close(Point* a, Point* b); 54 | void save(std::string file, std::string datatype); 55 | void
check(); 56 | uint8_t get_mode() const { return mode; } 57 | /* Returns a heap-allocated copy of *feat_c paired with c_glm. NOTE(review): feat_c is dereferenced without a null check, and nothing in this header frees the returned copy, so the caller presumably owns it - confirm */ pair*, matrix::GLM> get_class() { return std::make_pair(new Feature(*feat_c), c_glm); } 58 | void mutate_seqs(Point* p, size_t num_seq, vector > &,vector > & , double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed); 59 | void mutate_seqs(Point* p, size_t num_seq,vector > &,double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed); 60 | std::string get_datatype() const { return datatype; } 61 | int get_k() const { return k; } 62 | double get_id() const { return id; } 63 | private: 64 | static void add_feats(std::vector >& vec, uint64_t flags); 65 | static pair*> read_from(std::ifstream &in, int k_); 66 | static void write_to(std::ofstream &out, Feature* f, matrix::GLM glm); 67 | void filter(std::vector > &s, std::string prefix=""); 68 | void train(); 69 | void train_class(Feature* feat); 70 | void train_regr(Feature* feat); 71 | void train_class_regr(Feature* feat); 72 | double predict(Point* a, Point* b); 73 | bool p_close(Point* a, Point* b); 74 | double p_predict(Point* a, Point* b); 75 | 76 | /* Both pointers are deleted by the destructor when non-null. */ Feature *feat_c, *feat_r; 77 | matrix::GLM c_glm, r_glm; 78 | vector > training, testing; 79 | bool is_trained, is_training; 80 | int min_num_feat, max_num_feat, k, mut_type; 81 | uint8_t mode; 82 | double id, min_id; 83 | vector > possible_feats; 84 | omp_lock_t lock; 85 | Random random; 86 | uint64_t feats64; 87 | std::string datatype; 88 | double scale_min = 1000; 89 | double scale_max = -1000; 90 | }; 91 | #endif 92 | -------------------------------------------------------------------------------- /src/predict/SingMute.cpp: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * SingMute.cpp 4 | * 5 | * Original Author: Alexander Baumgartner 6 | * Modified by Benjamin T James 7 | */ 8 | #include "SingMute.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #ifdef MUTDEBUG 15 |
static const std::string INSERT_BEGIN = "["; 16 | static const std::string INSERT_END = "]"; 17 | static const std::string SWITCH_BEGIN = "("; 18 | static const std::string SWITCH_END = ")"; 19 | static const std::string DEL = "-"; 20 | #else 21 | static const std::string INSERT_BEGIN = ""; 22 | static const std::string INSERT_END = ""; 23 | static const std::string SWITCH_BEGIN = ""; 24 | static const std::string SWITCH_END = ""; 25 | static const std::string DEL = ""; 26 | #endif 27 | /* With MUTDEBUG defined, init() wraps inserted and switched characters in visible markers and writes "-" for a deleted one; otherwise all five markers are empty strings. */ 28 | /* Picks one of 'A', 'C', 'G', 'T': a value drawn with rng.randMod over the sum of the four percentages is matched against the cumulative ranges in that order. */ 29 | char SingMute::randNucl() 30 | { 31 | char character; 32 | int value = rng.randMod(percAs + percCs + percGs + percTs); 33 | // int value = 40436 % (percAs + percCs + percGs + percTs); 34 | if (value < percAs) { 35 | character = 'A'; 36 | } else if (value < percAs + percCs) { 37 | character = 'C'; 38 | } else if (value < percAs + percCs + percGs) { 39 | character = 'G'; 40 | } else { 41 | character = 'T'; 42 | } 43 | return character; 44 | } 45 | /* Builds out_seq from *seq. num_mut == 0 copies the sequence unchanged; num_mut == 1 uses a single insertion unit; otherwise num_mut is split at random into switch, insert and delete shares. The shares are scaled by seq_len / 100.0, laid out as a shuffled per-position command vector ('S' keep, 'I' insert, 'D' delete, 'W' switch), moved off positions where valid is false, and then applied. */ void SingMute::init(const std::vector &valid) 46 | { 47 | maxInsert = 0; 48 | maxDel = 0; 49 | maxSwitch = 0; 50 | if (num_mut == 0) { 51 | out_seq = std::string(*seq); 52 | IBP = 0; 53 | alignmentLength = 0; 54 | return; 55 | } else if (num_mut == 1) { 56 | maxInsert = 1; 57 | maxDel = 0; 58 | maxSwitch = 0; 59 | } else { 60 | maxSwitch = rng.randMod(num_mut); 61 | num_mut -= maxSwitch; 62 | 63 | if (maxSwitch % 2 == 1 && num_mut >= 1) { 64 | maxSwitch++; 65 | num_mut--; 66 | } else if (num_mut == 0) { 67 | maxSwitch--; 68 | num_mut++; 69 | } 70 | if (num_mut > 1) { 71 | maxInsert = rng.randMod(num_mut); 72 | num_mut -= maxInsert; 73 | } else { 74 | maxInsert = num_mut; 75 | num_mut -= maxInsert; 76 | } 77 | maxDel = num_mut; 78 | } 79 | size_t seq_len = seq->length(); 80 | /* Up to here the three counts are shares out of 100; the lines below convert them to numbers of positions (the product is truncated when stored back into the long members). */ 81 | maxDel *= seq_len / 100.0; 82 | maxInsert *= seq_len / 100.0; 83 | maxSwitch *= seq_len / 100.0; 84 | alignmentLength = maxInsert; 85 | IBP = maxDel + maxSwitch; 86 | /* NOTE(review): idx is never checked against seq_len, so the three fill loops below rely on maxInsert + maxDel + maxSwitch <= seq_len. */ 87 | 88 | std::vector command_str(seq_len, 'S'); 89 | long idx = 0; 90 | long nons_len =
maxInsert + maxDel + maxSwitch; 91 | for (long i = 0; i < maxInsert; i++) { 92 | command_str[idx++] = 'I'; 93 | } 94 | for (long i = 0; i < maxDel; i++) { 95 | command_str[idx++] = 'D'; 96 | } 97 | for (long i = 0; i < maxSwitch; i++) { 98 | command_str[idx++] = 'W'; 99 | } 100 | //std::shuffle(command_str.begin(), command_str.end(), rng.gen()); 101 | std::shuffle(command_str.begin(), command_str.end(), std::minstd_rand0(rng.nextRandSeed())); 102 | std::vector valid_indices; 103 | long repl = command_str.size() - 1; 104 | /* Relocate every non-'S' command that sits on a position where valid[i] is false: swap it with a valid 'S' position seen earlier (kept in valid_indices) or, when none is stored, with the first valid position found scanning downwards from repl. NOTE(review): in the fallback path repl is not decremented after the swap and the command at repl is not checked to be 'S' - confirm this is intended. */ for (long i = 0; i < command_str.size(); i++) { 105 | if (command_str[i] != 'S' && !valid[i]) { 106 | if (!valid_indices.empty()) { 107 | repl = valid_indices.back(); 108 | valid_indices.pop_back(); 109 | } else { 110 | for (; repl > 0; repl--) { 111 | if (valid[repl]) { 112 | break; 113 | } 114 | } 115 | } 116 | std::swap(command_str[i], command_str[repl]); 117 | } else if (command_str[i] == 'S' 118 | && valid[i] 119 | && valid_indices.size() < nons_len) { 120 | 121 | valid_indices.push_back(i); 122 | } 123 | } 124 | // std::set s_ins, s_del, s_switch; 125 | // generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid); 126 | // generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid); 127 | // generate_unique_set(command_str.size(), s_switch, maxSwitch, s_ins, s_del, valid); 128 | // for (auto idx : s_ins) { 129 | // command_str[idx] = 'I'; 130 | // } 131 | // for (auto idx : s_del) { 132 | // command_str[idx] = 'D'; 133 | // } 134 | // for (auto idx : s_switch) { 135 | // command_str[idx] = 'W'; 136 | // } 137 | out_seq = ""; 138 | out_seq.reserve(maxInsert + seq_len - maxDel + 1); 139 | /* Apply the commands: 'I' emits a random nucleotide followed by the original character, 'S' copies the character, 'D' emits only the DEL marker, 'W' emits a random nucleotide in place of the character. */ 140 | for (long i = 0; i < seq_len; i++) { 141 | auto cmd = command_str.at(i); 142 | switch (cmd) { 143 | case 'I': { 144 | out_seq += INSERT_BEGIN + randNucl() + INSERT_END; 145 | out_seq += seq->at(i); 146 | break; 147 | } 148 | case 'S': { 149 | out_seq += seq->at(i); 150 | break; 151 | } 152 | case 'D': { 153 |
out_seq += DEL; 154 | break; 155 | } 156 | case 'W': { 157 | out_seq += SWITCH_BEGIN + randNucl() + SWITCH_END; 158 | break; 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/predict/SingMute.h: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * SingMute.h 4 | * 5 | * Original Author: Alexander Baumgartner 6 | * Modified by Benjamin T James 7 | */ 8 | 9 | #ifndef SINGMUTE_H 10 | #define SINGMUTE_H 11 | 12 | #include 13 | #include 14 | #include "Random.h" 15 | #include "LCG.h" 16 | 17 | class SingMute { 18 | public: 19 | /* 20 | Constructor, stores its arguments 21 | and immediately runs init(valid_) to build the mutated sequence 22 | 23 | @param: 24 | int pa: percentage of A's 25 | int pc: percentage of C's 26 | int pg: percentage of G's 27 | int pt: percentage of T's 28 | uintmax_t tt: stored as num_mut, the total allocation that init() splits among switches, insertions and deletions; const std::string* s: the sequence to mutate (the pointer is stored, the string is not copied); valid_: passed straight to init(); seed: seeds the LCG rng 29 | */ 30 | SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector &valid_, std::random_device::result_type seed) : percAs(pa), 31 | percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s), rng(seed) { 32 | init(valid_); 33 | } 34 | long getAlignmentLength() { return alignmentLength; } 35 | long getIBP() { return IBP; } 36 | void init(const std::vector &valid); 37 | /* Returns a reference to the mutated sequence built by init(). */ std::string& getSeq() { return out_seq; }; 38 | private: 39 | uintmax_t num_mut; 40 | int percAs; 41 | int percCs; 42 | int percGs; 43 | int percTs; 44 | 45 | long maxDel; 46 | long maxInsert; 47 | long maxSwitch; 48 | 49 | long alignmentLength; 50 | long IBP; 51 | const std::string * seq; 52 | std::string out_seq; 53 | char randNucl(); 54 | LCG rng; 55 | }; 56 | #endif 57 | -------------------------------------------------------------------------------- /src/utility/EmptyLocation.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * EmptyLocation.cpp 3 | * 4 | * Created on: Dec 28, 2012 5 | *
Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "EmptyLocation.h" 9 | #include "../exception/InvalidOperationException.h" 10 | 11 | using namespace exception; 12 | 13 | namespace utility { 14 | /* The shared instance, created during static initialization. */ 15 | EmptyLocation * EmptyLocation::INSTANCE = new EmptyLocation(); 16 | 17 | EmptyLocation * EmptyLocation::getInstance(){ 18 | return INSTANCE; 19 | } 20 | 21 | EmptyLocation::EmptyLocation() { 22 | msg = new string("Empty location does not allow this operation."); 23 | } 24 | 25 | EmptyLocation::~EmptyLocation() { 26 | delete msg; 27 | } 28 | 29 | string EmptyLocation::toString() { 30 | return string("Empty"); 31 | } 32 | /* Every accessor and mutator below throws InvalidOperationException carrying *msg; toString() above is the only member that returns a value. */ 33 | int EmptyLocation::getEnd() const { 34 | throw InvalidOperationException(*msg); 35 | } 36 | 37 | int EmptyLocation::getStart() const { 38 | throw InvalidOperationException(*msg); 39 | } 40 | 41 | void EmptyLocation::setEnd(int int1) { 42 | throw InvalidOperationException(*msg); 43 | } 44 | 45 | void EmptyLocation::setStart(int int1) { 46 | throw InvalidOperationException(*msg); 47 | } 48 | 49 | int EmptyLocation::getLength() { 50 | throw InvalidOperationException(*msg); 51 | } 52 | 53 | } /* namespace utility */ 54 | -------------------------------------------------------------------------------- /src/utility/EmptyLocation.h: -------------------------------------------------------------------------------- 1 | /* 2 | * EmptyLocation.h 3 | * 4 | * Created on: Dec 28, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef EMPTYLOCATION_H_ 9 | #define EMPTYLOCATION_H_ 10 | 11 | #include "ILocation.h" 12 | 13 | namespace utility { 14 | /* ILocation implementation reached through getInstance(); the constructor and destructor are private, so code outside the class cannot create or delete instances. */ 15 | class EmptyLocation: public ILocation { 16 | private: 17 | string * msg; 18 | static EmptyLocation * INSTANCE; 19 | EmptyLocation(); 20 | virtual ~EmptyLocation(); 21 | 22 | public: 23 | virtual int getEnd() const; 24 | virtual int getStart() const; 25 | virtual void setEnd(int); 26 | virtual void setStart(int); 27 | virtual int getLength(); 28 | virtual string toString(); 29 | 30 | static
EmptyLocation * getInstance(); 31 | 32 | }; 33 | 34 | } /* namespace utility */ 35 | #endif /* EMPTYLOCATION_H_ */ 36 | -------------------------------------------------------------------------------- /src/utility/GlobAlignE.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Joseph Valencia 3 | * Modified by Benjamin James 4 | * Date: 12/14/17 5 | * Bioinformatics Toolsmith Laboratory, University of Tulsa 6 | * */ 7 | #include 8 | #include "../exception/InvalidStateException.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "GlobAlignE.h" 17 | 18 | using namespace std; 19 | using namespace utility; 20 | using namespace exception; 21 | /* Stores the two sequence pointers with their start/end indexes and the four scoring values, allocates nine int work arrays of len1 entries each, then runs findAlignment(). len1 and len2 are end - start + 2, i.e. the inclusive range length plus one. No argument is validated here. */ 22 | GlobAlignE::GlobAlignE(const char * seq1In, int start1In, int end1In, const char * seq2In, 23 | int start2In, int end2In, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn){ 24 | 25 | seq1 = seq1In; 26 | start1 = start1In; 27 | end1 = end1In; 28 | 29 | seq2 = seq2In; 30 | start2 = start2In; 31 | end2 = end2In; 32 | 33 | len1 = end1 - start1 + 2; 34 | len2 = end2 - start2 + 2; 35 | 36 | //Incremental score storage 37 | matches = new int[len1]; 38 | upperGap = new int[len1]; 39 | lowerGap = new int[len1]; 40 | 41 | 42 | 43 | //Incremental length storage 44 | matchLen = new int[len1]; 45 | upperLen = new int[len1]; 46 | lowerLen = new int[len1]; 47 | 48 | //Incremental identity storage 49 | matchId = new int[len1]; 50 | upperId = new int[len1]; 51 | lowerId = new int[len1]; 52 | 53 | match = matchIn; 54 | mismatch = mismatchIn; 55 | gapOpen = gapOpenIn; 56 | gapContinue = gapContinueIn; 57 | findAlignment(); 58 | 59 | } 60 | /* 61 | GlobAlignE::GlobAlignE(string filename1,string filename2, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn):GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn){ 62 | 63 | ifstream ifs; 64 | 65 |
ifs.open (filename1, ifstream::in); 66 | cout<<"FILE OPENED"<'){ 70 | 71 | while(c!='\n'){ 72 | c = ifs.get(); 73 | 74 | } 75 | } 76 | 77 | string string1 =""; 78 | 79 | while (ifs.good()) { 80 | 81 | 82 | if (c!='\n'){ 83 | string1+=c; 84 | } 85 | c = ifs.get(); 86 | } 87 | 88 | ifs.close(); 89 | 90 | 91 | ifstream ifs2; 92 | 93 | ifs2.open (filename2, ifstream::in); 94 | 95 | c = ifs2.get(); 96 | 97 | if(c == '>'){ 98 | 99 | while(c!='\n'){ 100 | c = ifs2.get(); 101 | } 102 | } 103 | 104 | string string2 =""; 105 | 106 | while (ifs2.good()) { 107 | 108 | if(c!='\n'){ 109 | string2+=c; 110 | } 111 | c = ifs2.get(); 112 | } 113 | 114 | ifs2.close(); 115 | 116 | std::transform(string1.begin(),string1.end(),string1.begin(),::toupper); 117 | std::transform(string2.begin(),string2.end(),string2.begin(),::toupper); 118 | 119 | // return GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn); 120 | 121 | } 122 | */ 123 | void GlobAlignE::findAlignment(){ 124 | 125 | int shorter = min(len2,len1)-1; 126 | int lenDiff = abs(len2-len1); 127 | int maxDiff=0; 128 | 129 | if (lenDiff >=1){ 130 | maxDiff += -gapOpen- (lenDiff*gapContinue); 131 | } 132 | 133 | maxDiff+= (mismatch* shorter)-1; 134 | 135 | const int negativeInf = maxDiff; 136 | 137 | matches[0]= 0; 138 | upperGap[0] = negativeInf; 139 | lowerGap[0] = negativeInf; 140 | 141 | matchLen[0] =0; 142 | upperLen[0] =0; 143 | lowerLen[0] =0; 144 | 145 | matchId[0] =0; 146 | upperId[0] = 0; 147 | lowerId[0] =0; 148 | 149 | //initial values 150 | for (int i = 1; i 10 | 11 | using namespace std; 12 | 13 | namespace utility{ 14 | 15 | class GlobAlignE{ 16 | 17 | private: 18 | const char * seq1; //first sequence to be aligned 19 | int start1; 20 | int end1; 21 | const char * seq2;//second sequence to be aligned 22 | int start2; 23 | int end2; 24 | int len1; 25 | int len2; 26 | int lenTotal; 27 | int match; //score for base pair match 28 | int mismatch;//score for 
base pair mismatch 29 | int gapOpen; //cost to open a gap 30 | int gapContinue; //cost to continue a gap 31 | int * matches; 32 | int * upperGap; 33 | int * lowerGap; 34 | int * matchLen; 35 | int * upperLen; 36 | int * lowerLen; 37 | int * matchId; 38 | int * upperId; 39 | int * lowerId; 40 | int alignmentScore; 41 | int alignmentLength; 42 | int totalMatches; 43 | string topString; 44 | string bottomString; 45 | public: 46 | GlobAlignE(const char*,int,int,const char *,int,int, int,int,int,int); 47 | /* NOTE(review): the only definition of this (string, string, ...) overload in GlobAlignE.cpp is inside a commented-out block - confirm it is defined elsewhere before calling it */ GlobAlignE(string,string,int,int,int,int); 48 | virtual ~GlobAlignE(); 49 | void findAlignment(); 50 | double getIdentity(); 51 | int getLength(); 52 | void printAlignment(); 53 | int getScore(); 54 | int getLengthAlignment(); 55 | 56 | }; 57 | } 58 | #endif 59 | -------------------------------------------------------------------------------- /src/utility/ILocation.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ILocation.h 3 | * 4 | * Created on: Dec 20, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef ILOCATION_H_ 9 | #define ILOCATION_H_ 10 | 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace utility { 16 | /* Abstract interface for a location: start/end accessors and mutators, a length and a string form. Every member is pure virtual. NOTE(review): no virtual destructor is declared here, so deleting an implementation through an ILocation pointer would be undefined behavior - confirm no caller does that. */ 17 | class ILocation { 18 | public: 19 | virtual int getEnd() const = 0; 20 | virtual int getStart() const = 0; 21 | virtual void setEnd(int) = 0; 22 | virtual void setStart(int) = 0; 23 | virtual int getLength() = 0; 24 | virtual string toString() = 0; 25 | }; 26 | 27 | } 28 | 29 | #endif /* ILOCATION_H_ */ 30 | -------------------------------------------------------------------------------- /src/utility/LCSLen.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * LCSLen.cpp 3 | * 4 | * Created on: Dec 6, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "LCSLen.h" 9 | #include "../utility/Util.h" 10 | #include "../exception/InvalidInputException.h" 11 | 12 | #include 13 | 14 | using namespace std; 15 | using namespace
exception; 16 | 17 | namespace utility { 18 | /* Stores both sequence ranges, rejects a negative or reversed range with InvalidInputException, allocates a zero-filled table of 2 * len2 ints and runs findLcs(). The checks run before the allocation, so nothing is leaked when they throw. */ 19 | LCSLen::LCSLen(const char * seq1In, int start1In, int end1In, 20 | const char * seq2In, int start2In, int end2In) { 21 | seq1 = seq1In; 22 | start1 = start1In; 23 | end1 = end1In; 24 | 25 | seq2 = seq2In; 26 | start2 = start2In; 27 | end2 = end2In; 28 | /* Validate input: both [start, end] ranges must be non-negative and ordered. */ 29 | if(start1 < 0 || end1 < 0 || start1 > end1){ 30 | string msg("Invalid Input. Start1 is "); 31 | msg.append(Util::int2string(start1)); 32 | msg.append(". End 1 is "); 33 | msg.append(Util::int2string(end1)); 34 | msg.append("."); 35 | throw InvalidInputException(msg); 36 | } 37 | 38 | if(start2 < 0 || end2 < 0 || start2 > end2){ 39 | string msg("Invalid Input. Start2 is "); 40 | msg.append(Util::int2string(start2)); 41 | msg.append(". End2 is "); 42 | msg.append(Util::int2string(end2)); 43 | msg.append("."); 44 | throw InvalidInputException(msg); 45 | } 46 | 47 | // Disabled debug output 48 | // cout << start1 << " " << end1 << endl; 49 | // cout << start2 << " " << end2 << endl; 50 | 51 | /* Each length is the inclusive range length plus one, leaving index 0 as the empty-prefix row/column. */ 52 | len1 = end1 - start1 + 2; 53 | len2 = end2 - start2 + 2; 54 | /* Two rows of len2 entries: previous row first, current row second. */ 55 | lenTotal = 2 * len2; 56 | cTable = new int[lenTotal]; 57 | 58 | for (int i = 0; i < lenTotal; i++) { 59 | cTable[i] = 0; 60 | } 61 | 62 | findLcs(); 63 | } 64 | 65 | LCSLen::~LCSLen() { 66 | delete[] cTable; 67 | } 68 | /* Computes the common-subsequence length of the two ranges with a two-row table: cTable[0 .. len2-1] is the previous row and cTable[len2 .. 2*len2-1] the current one. After every row except the last, the current row is copied over the previous row. The result, the last cell of the current row, is stored in lenCS. */ 69 | void LCSLen::findLcs() { 70 | int iM1Index = 0; 71 | int iIndex = len2; 72 | 73 | for (int i = 1; i < len1; i++) { 74 | char base1 = seq1[start1 + i - 1]; 75 | 76 | for (int j = 1; j < len2; j++) { 77 | int ijIndex = iIndex + j; 78 | if (base1 == seq2[start2 + j - 1]) { 79 | cTable[ijIndex] = cTable[iM1Index + j - 1] + 1; 80 | } else { 81 | if (cTable[iM1Index + j] > cTable[iIndex + j - 1]) { 82 | cTable[ijIndex] = cTable[iM1Index + j]; 83 | } else { 84 | cTable[ijIndex] = cTable[iIndex + j - 1]; 85 | } 86 | } 87 | } 88 | 89 | if(i != len1-1){ 90 | for(int h = 0; h < len2; h++){ 91 | cTable[h] = cTable[len2+h]; 92 | } 93 | } 94 | } 95 | lenCS = cTable[lenTotal-1]; 96 | } 97 | 98 | int
LCSLen::getLenCS(){ 99 | return lenCS; 100 | } 101 | 102 | } 103 | /* namespace utility */ 104 | -------------------------------------------------------------------------------- /src/utility/LCSLen.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LCSLen.h 3 | * 4 | * Created on: Dec 6, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef LCSLEN_H_ 9 | #define LCSLEN_H_ 10 | 11 | namespace utility { 12 | /* Computes, at construction time, the common-subsequence length of two character ranges; read it with getLenCS(). NOTE(review): the class owns cTable through a raw pointer that the destructor delete[]s, but declares no copy constructor or copy assignment, so copying an instance would free the table twice - confirm instances are never copied. */ 13 | class LCSLen { 14 | private: 15 | const char * seq1; 16 | int start1; 17 | int end1; 18 | const char * seq2; 19 | int start2; 20 | int end2; 21 | 22 | int len1; 23 | int len2; 24 | int lenTotal; 25 | int lenCS; 26 | 27 | int * cTable; 28 | void findLcs(); 29 | 30 | public: 31 | LCSLen(const char *, int, int, const char *, int, int); 32 | virtual ~LCSLen(); 33 | int getLenCS(); 34 | }; 35 | 36 | } /* namespace utility */ 37 | #endif /* LCSLEN_H_ */ 38 | -------------------------------------------------------------------------------- /src/utility/Location.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Location.cpp 3 | * 4 | * Created on: Dec 19, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #include "Location.h" 9 | #include "Util.h" 10 | #include "../exception/InvalidInputException.h" 11 | 12 | using namespace exception; 13 | 14 | namespace utility { 15 | 16 | Location::Location(int startIn, int endIn) { 17 | initialize(startIn, endIn); 18 | } 19 | /* Copies the start and end of another location; cp.getStart() / cp.getEnd() may themselves throw (EmptyLocation does). */ 20 | Location::Location(ILocation& cp) { 21 | initialize(cp.getStart(), cp.getEnd()); 22 | } 23 | 24 | void Location::initialize(int startIn, int endIn) { 25 | start = startIn; 26 | end = endIn; 27 | check(); 28 | 29 | } 30 | /* Throws InvalidInputException when start or end is negative or start is greater than end. */ 31 | void Location::check() { 32 | if (start < 0 || end < 0 || start > end) { 33 | string msg("Invalid Input. Start is "); 34 | msg.append(Util::int2string(start)); 35 | msg.append(".
End is "); 36 | msg.append(Util::int2string(end)); 37 | msg.append("."); 38 | throw InvalidInputException(msg); 39 | } 40 | } 41 | 42 | Location::~Location() { 43 | } 44 | 45 | int Location::getEnd() const { 46 | return end; 47 | } 48 | 49 | int Location::getStart() const { 50 | return start; 51 | } 52 | /* NOTE(review): in both setters the member is assigned before check() runs, so when check() throws the object keeps the rejected value. */ 53 | void Location::setEnd(int endIn) { 54 | end = endIn; 55 | check(); 56 | } 57 | 58 | void Location::setStart(int startIn) { 59 | start = startIn; 60 | check(); 61 | } 62 | /* Length of the inclusive range [start, end]. */ 63 | int Location::getLength() { 64 | return end - start + 1; 65 | } 66 | /* Formats the location as "start-end". */ 67 | string Location::toString() { 68 | string msg = (Util::int2string(start)); 69 | msg.append("-"); 70 | msg.append(Util::int2string(end)); 71 | 72 | return msg; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/utility/Location.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Location.h 3 | * 4 | * Created on: Dec 19, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef LOCATION_H_ 9 | #define LOCATION_H_ 10 | 11 | #include "ILocation.h" 12 | 13 | #include 14 | 15 | using namespace std; 16 | 17 | namespace utility { 18 | /* Concrete ILocation holding an inclusive [start, end] pair; construction and both setters go through check(). */ 19 | class Location : public ILocation{ 20 | private: 21 | int start; 22 | int end; 23 | void initialize(int, int); 24 | void check(); 25 | 26 | public: 27 | Location(int, int); 28 | Location(ILocation&); 29 | virtual ~Location(); 30 | 31 | int getEnd() const; 32 | int getStart() const; 33 | void setEnd(int); 34 | void setStart(int); 35 | int getLength(); 36 | string toString(); 37 | }; 38 | 39 | } 40 | 41 | #endif /* LOCATION_H_ */ 42 | -------------------------------------------------------------------------------- /src/utility/Util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Util.h 3 | * 4 | * Created on: Apr 24, 2012 5 | * Author: Hani Zakaria Girgis, PhD 6 | */ 7 | 8 | #ifndef UTIL_H_ 9 | #define UTIL_H_ 10 | 11 | #include
"Location.h" 12 | #include "../exception/FileDoesNotExistException.h" 13 | #include "../exception/InvalidInputException.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | using namespace std; 24 | using namespace utility; 25 | using namespace exception; 26 | 27 | namespace utility { 28 | /* Collection of static helpers; the constructor and destructor are private, so the class cannot be instantiated from outside. */ class Util { 29 | private: 30 | Util(); 31 | ~Util(); 32 | 33 | public: 34 | static string * emptyString; 35 | static string fileSeparator; 36 | static bool isDna; 37 | static void readFasta(string, vector *, vector *, bool); 38 | static void readFasta(string, vector *, vector *); 39 | static void readCoordinates(string, vector *); 40 | static void readChromList(string, vector *, string); 41 | static void toUpperCase(string*); 42 | static void toUpperCase(string&); 43 | static string int2string(int); 44 | static string double2string(double); 45 | static string long2string(long); 46 | static void deleteFile(string); 47 | static void deleteFilesUnderDirectory(string); 48 | static void checkFile(string); 49 | static bool isOverlapping(int, int, int, int); 50 | static void revCompDig(string *, string *); 51 | static void revCompDig(const char* sequence, int, int, string *); 52 | 53 | static void writeFasta(const string&, const string&, const string&); 54 | 55 | static int sumTotalLength(const vector *); 56 | 57 | // Added on Oct 6 2018 (the const on the by-value int return type has no effect for callers) 58 | static const int getAlphabetSize(); 59 | 60 | /** 61 | * Delete the objects pointed to by pointers in a vector. 62 | * It does not delete the vector itself.
63 | * The vector pointer is dereferenced unconditionally, so it must not be null. 64 | * Credit: http://stackoverflow.com/questions/594089/does-stdvector-clear-do-delete-free-memory-on-each-element 65 | */ 66 | template 67 | static void deleteInVector(vector * deleteMe) { 68 | while (!deleteMe->empty()) { 69 | delete deleteMe->back(); 70 | deleteMe->pop_back(); 71 | } 72 | 73 | // The loop above already emptied the vector; this clear() is a no-op kept as a safeguard 74 | deleteMe->clear(); 75 | 76 | // Swap with a fresh empty vector so the old storage is released when the temporary goes out of scope 77 | vector empty; 78 | deleteMe->swap(empty); 79 | } 80 | }; 81 | } 82 | 83 | #endif /* UTIL_H_ */ 84 | --------------------------------------------------------------------------------