├── .gitignore
├── CMakeLists.txt
├── README.md
└── src
    ├── cluster
        ├── CRunner.cpp
        ├── CRunner.h
        ├── Center.h
        ├── ClusterFactory.cpp
        ├── ClusterFactory.h
        ├── Trainer.cpp
        ├── Trainer.h
        ├── bvec.cpp
        ├── bvec.h
        ├── bvec_iterator.cpp
        ├── bvec_iterator.h
        └── meshclust2.cpp
    ├── clutil
        ├── Clock.cpp
        ├── Clock.h
        ├── Datatype.cpp
        ├── Datatype.h
        ├── DivergencePoint.cpp
        ├── DivergencePoint.h
        ├── Histogram.cpp
        ├── Histogram.h
        ├── LCG.h
        ├── Loader.cpp
        ├── Loader.h
        ├── Point.h
        ├── Progress.cpp
        ├── Progress.h
        ├── Random.h
        ├── SingleFileLoader.cpp
        └── SingleFileLoader.h
    ├── exception
        ├── FileDoesNotExistException.cpp
        ├── FileDoesNotExistException.h
        ├── InvalidInputException.cpp
        ├── InvalidInputException.h
        ├── InvalidOperationException.cpp
        ├── InvalidOperationException.h
        ├── InvalidOrderOfOperationsException.cpp
        ├── InvalidOrderOfOperationsException.h
        ├── InvalidScoreException.cpp
        ├── InvalidScoreException.h
        ├── InvalidStateException.cpp
        └── InvalidStateException.h
    ├── fastcar
        ├── FC_Runner.cpp
        ├── FC_Runner.h
        └── fastcar.cpp
    ├── nonltr
        ├── ChromDetector.cpp
        ├── ChromDetector.h
        ├── ChromDetectorMaxima.cpp
        ├── ChromDetectorMaxima.h
        ├── ChromListMaker.cpp
        ├── ChromListMaker.h
        ├── Chromosome.cpp
        ├── Chromosome.h
        ├── ChromosomeOneDigit.cpp
        ├── ChromosomeOneDigit.h
        ├── ChromosomeOneDigitDna.cpp
        ├── ChromosomeOneDigitDna.h
        ├── ChromosomeOneDigitProtein.cpp
        ├── ChromosomeOneDigitProtein.h
        ├── ChromosomeRandom.cpp
        ├── ChromosomeRandom.h
        ├── DetectorMaxima.cpp
        ├── DetectorMaxima.h
        ├── EnrichmentMarkovView.cpp
        ├── EnrichmentMarkovView.h
        ├── HMM.cpp
        ├── HMM.h
        ├── IChromosome.h
        ├── ITableView.h
        ├── KmerHashTable.cpp
        ├── KmerHashTable.h
        ├── LocationList.cpp
        ├── LocationList.h
        ├── LocationListCollection.cpp
        ├── LocationListCollection.h
        ├── RepeatsDetector.cpp
        ├── Scanner.cpp
        ├── Scanner.h
        ├── Scorer.cpp
        ├── Scorer.h
        ├── TableBuilder.cpp
        ├── TableBuilder.h
        ├── Trainer.cpp
        └── Trainer.h
    ├── predict
        ├── BestFirstSelector.cpp
        ├── BestFirstSelector.h
        ├── Feature.cpp
        ├── Feature.h
        ├── FeatureSelector.cpp
        ├── FeatureSelector.h
        ├── GLM.cpp
        ├── GLM.h
        ├── GreedySelector.cpp
        ├── GreedySelector.h
        ├── HandleSeq.cpp
        ├── HandleSeq.h
        ├── Matrix.cpp
        ├── Matrix.h
        ├── MultiMute.cpp
        ├── MultiMute.h
        ├── Predictor.cpp
        ├── Predictor.h
        ├── SingMute.cpp
        └── SingMute.h
    └── utility
        ├── EmptyLocation.cpp
        ├── EmptyLocation.h
        ├── GlobAlignE.cpp
        ├── GlobAlignE.h
        ├── ILocation.h
        ├── LCSLen.cpp
        ├── LCSLen.h
        ├── Location.cpp
        ├── Location.h
        ├── Util.cpp
        └── Util.h


/.gitignore:
--------------------------------------------------------------------------------
1 | bin/
2 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required (VERSION 3.1)
  2 | project (MeshClust2)
  3 | 
  4 | include_directories(src/exception src/nonltr src/utility src/cluster src/predict src/clutil src/fastcar)  # one entry per module directory under src/
  5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)  # executables are written to <source>/bin
  6 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)  # static libraries are written to <source>/bin
  7 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)  # shared libraries (if BUILD_SHARED_LIBS is ON) go there too
  8 | 
  9 | 
 10 | add_library(Fastcar  # library part of the fastcar tool; fastcar.cpp is compiled by add_executable(fastcar ...)
 11 |   ${CMAKE_SOURCE_DIR}/src/fastcar/FC_Runner.cpp
 12 | )
 13 | 
 14 | add_library(ClusterUtil  # lists every .cpp under src/clutil (Point.h, LCG.h and Random.h have no .cpp there)
 15 |   ${CMAKE_SOURCE_DIR}/src/clutil/DivergencePoint.cpp
 16 |   ${CMAKE_SOURCE_DIR}/src/clutil/Histogram.cpp
 17 |   ${CMAKE_SOURCE_DIR}/src/clutil/Loader.cpp
 18 |   ${CMAKE_SOURCE_DIR}/src/clutil/SingleFileLoader.cpp
 19 |   ${CMAKE_SOURCE_DIR}/src/clutil/Progress.cpp
 20 |   ${CMAKE_SOURCE_DIR}/src/clutil/Datatype.cpp
 21 |   ${CMAKE_SOURCE_DIR}/src/clutil/Clock.cpp
 22 | )
 23 | 
 24 | add_library(Predict  # lists every .cpp under src/predict
 25 |   ${CMAKE_SOURCE_DIR}/src/predict/Feature.cpp
 26 |   ${CMAKE_SOURCE_DIR}/src/predict/GLM.cpp
 27 |   ${CMAKE_SOURCE_DIR}/src/predict/HandleSeq.cpp
 28 |   ${CMAKE_SOURCE_DIR}/src/predict/Matrix.cpp
 29 |   ${CMAKE_SOURCE_DIR}/src/predict/MultiMute.cpp
 30 |   ${CMAKE_SOURCE_DIR}/src/predict/Predictor.cpp
 31 |   ${CMAKE_SOURCE_DIR}/src/predict/SingMute.cpp
 32 |   ${CMAKE_SOURCE_DIR}/src/predict/FeatureSelector.cpp
 33 |   ${CMAKE_SOURCE_DIR}/src/predict/GreedySelector.cpp
 34 |   ${CMAKE_SOURCE_DIR}/src/predict/BestFirstSelector.cpp
 35 | )
 36 | 
 37 | add_library(Cluster  # clustering sources of src/cluster
 38 |   ${CMAKE_SOURCE_DIR}/src/cluster/ClusterFactory.cpp
 39 |   ${CMAKE_SOURCE_DIR}/src/cluster/CRunner.cpp
 40 |   ${CMAKE_SOURCE_DIR}/src/cluster/Trainer.cpp
 41 |   ${CMAKE_SOURCE_DIR}/src/cluster/bvec.cpp
 42 |   ${CMAKE_SOURCE_DIR}/src/cluster/bvec_iterator.cpp
 43 |   # meshclust2.cpp is not listed here: it is compiled by add_executable(meshclust2 ...)
 44 | )
 45 | 
 46 | add_library(Exception  # lists every .cpp under src/exception
 47 |   ${CMAKE_SOURCE_DIR}/src/exception/FileDoesNotExistException.cpp
 48 |   ${CMAKE_SOURCE_DIR}/src/exception/InvalidInputException.cpp
 49 |   ${CMAKE_SOURCE_DIR}/src/exception/InvalidOperationException.cpp
 50 |   ${CMAKE_SOURCE_DIR}/src/exception/InvalidOrderOfOperationsException.cpp
 51 |   ${CMAKE_SOURCE_DIR}/src/exception/InvalidScoreException.cpp
 52 |   ${CMAKE_SOURCE_DIR}/src/exception/InvalidStateException.cpp
 53 | )
 54 | 
 55 | add_library(Nonltr  # NOTE(review): ChromDetector.cpp, EnrichmentMarkovView.cpp and KmerHashTable.cpp exist in src/nonltr but are not listed — confirm this is intended
 56 |   ${CMAKE_SOURCE_DIR}/src/nonltr/ChromDetectorMaxima.cpp
 57 |   ${CMAKE_SOURCE_DIR}/src/nonltr/ChromListMaker.cpp
 58 |   ${CMAKE_SOURCE_DIR}/src/nonltr/Chromosome.cpp
 59 |   ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigit.cpp
 60 |   ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitDna.cpp
 61 |   ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeOneDigitProtein.cpp
 62 |   ${CMAKE_SOURCE_DIR}/src/nonltr/ChromosomeRandom.cpp
 63 |   ${CMAKE_SOURCE_DIR}/src/nonltr/DetectorMaxima.cpp
 64 |   ${CMAKE_SOURCE_DIR}/src/nonltr/HMM.cpp
 65 |   ${CMAKE_SOURCE_DIR}/src/nonltr/LocationList.cpp
 66 |   ${CMAKE_SOURCE_DIR}/src/nonltr/LocationListCollection.cpp
 67 |   ${CMAKE_SOURCE_DIR}/src/nonltr/Scanner.cpp
 68 |   ${CMAKE_SOURCE_DIR}/src/nonltr/Scorer.cpp
 69 |   ${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.cpp
 70 |   ${CMAKE_SOURCE_DIR}/src/nonltr/Trainer.cpp  # RepeatsDetector.cpp is compiled separately by add_executable(Red ...)
 71 | )
 72 | 
 73 | add_library(Utility  # NOTE(review): src/utility/LCSLen.cpp exists in the tree but is not listed — confirm this is intended
 74 |   ${CMAKE_SOURCE_DIR}/src/utility/EmptyLocation.cpp
 75 |   ${CMAKE_SOURCE_DIR}/src/utility/GlobAlignE.cpp
 76 |   ${CMAKE_SOURCE_DIR}/src/utility/Location.cpp
 77 |   ${CMAKE_SOURCE_DIR}/src/utility/Util.cpp
 78 | )
 79 | 
 80 | target_include_directories(Exception PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})  # each library adds the directory of this CMakeLists.txt as a PUBLIC include path
 81 | target_include_directories(Nonltr PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 82 | target_include_directories(Utility PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 83 | target_include_directories(Cluster PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 84 | target_include_directories(Fastcar PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 85 | target_include_directories(ClusterUtil PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 86 | target_include_directories(Predict PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 87 | 
 88 | set (HEADER_FILES  # list variable holding the absolute paths of three src/nonltr headers
 89 |   ${CMAKE_SOURCE_DIR}/src/nonltr/KmerHashTable.h
 90 |   ${CMAKE_SOURCE_DIR}/src/nonltr/EnrichmentMarkovView.h
 91 |   ${CMAKE_SOURCE_DIR}/src/nonltr/TableBuilder.h
 92 | )
 93 | 
 94 | set (CMAKE_CXX_COMPILER g++)  # NOTE(review): set after project(); CMake expects the compiler to be chosen before project(), e.g. with -DCMAKE_CXX_COMPILER=g++
 95 | set (CMAKE_CXX_STANDARD 11)  # initializes CXX_STANDARD only for targets created after this line (the executables below)
 96 | set (CMAKE_CXX_STANDARD_REQUIRED on)
 97 | set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -g -O3 -march=native -std=c++11")  # appended so user/environment flags are kept; -std=c++11 also reaches the libraries defined above
 98 | 
 99 | target_compile_definitions(Cluster PRIVATE VERSION="2.3.0")  # defines the VERSION macro for Cluster's own sources only (PRIVATE)
100 | target_compile_definitions(Fastcar PRIVATE VERSION="0.7.1")  # Fastcar carries its own, different VERSION value
101 | 
102 | add_executable(Red ${CMAKE_SOURCE_DIR}/src/nonltr/RepeatsDetector.cpp )  # one executable per tool, each built from a single source file
103 | add_executable(meshclust2 ${CMAKE_SOURCE_DIR}/src/cluster/meshclust2.cpp)
104 | add_executable(fastcar ${CMAKE_SOURCE_DIR}/src/fastcar/fastcar.cpp)
105 | 
106 | target_link_libraries(Red PRIVATE Exception Nonltr Utility)  # executables link PRIVATE; only library targets are link items, never header files
107 | target_link_libraries(Utility PUBLIC Exception)  # libraries link PUBLIC so their dependencies reach whatever links them
108 | target_link_libraries(Nonltr PUBLIC Utility Exception)
109 | target_link_libraries(ClusterUtil PUBLIC Nonltr)
110 | target_link_libraries(Predict PUBLIC ClusterUtil Nonltr)
111 | target_link_libraries(meshclust2 PRIVATE Cluster Nonltr ClusterUtil Predict)
112 | target_link_libraries(fastcar PRIVATE Nonltr ClusterUtil Fastcar Predict)
113 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## MeShClust2
  2 | 
  3 | The newest version of MeShClust (v3.0) can be obtained from https://github.com/BioinformaticsToolsmith/Identity.git
  4 | 
  5 | This repository is no longer supported.
  6 | 
  7 | Release version - 2.3.0
  8 | 
  9 | ### Requirements
 10 | g++ 4.9.1 or later, requires Homebrew on Mac OS X
 11 | For compilation using g++ (Homebrew) and CMake on Mac OS X, see [this link](https://stackoverflow.com/questions/29057437/compile-openmp-programs-with-gcc-compiler-on-os-x-yosemite)
 12 | 
 13 | ### Linux/Unix compilation
 14 | ``` sh
 15 | mkdir bin && cd bin
 16 | cmake ..
 17 | make
 18 | ```
 19 | 
 20 | ### Citation
 21 | If you find this tool helpful, please cite:
 22 | 
 23 | [James, Benjamin T. et al. (2018), MeShClust2: Application of alignment-free identity scores in clustering long DNA sequences. bioRxiv, 451278.](https://doi.org/10.1101/451278)
 24 | 
 25 | ### Usage
 26 | 
 27 |     Usage: meshclust2 --id 0.x [OPTIONS] *.fasta
 28 | 
 29 |     --id          The most important parameter, --id, controls the identity cutoff of the sequences.
 30 |                   Needs to be between 0 and 1.
 31 |                   If it is not specified, an identity of 0.9 is used.
 32 | 
 33 |     --kmer        decides the size of the kmers. It is by default automatically decided by average sequence
 34 |                   length, but if provided, MeShClust can speed up a little by not having to find the largest
 35 |                   sequence length. Increasing kmer size can increase accuracy, but increases memory consumption.
 36 | 
 37 |     --dump       Run until the classifier is trained, and then dump the weights to the file,
 38 |                  default 'weights.txt'. Can be used with --recover to recover the weights
 39 |                  instead of re-training.
 40 | 
 41 |     --recover    Recover weights for the classifier trained by a previous run which used --dump to dump
 42 |                  the weights.
 43 | 
 44 |     --list       Instead of specifying files as extra arguments, provide a text file with
 45 |                  a list of files. Can use pipes or process substitutions such as "--list <(ls *.fasta) "
 46 | 
 47 |     --no-train-list    Same as --list, but these files are not passed to the classifier,
 48 |                        e.g. unassembled genomes
 49 | 
 50 |     --mut-type   {single, both, nonsingle-typical, nonsingle-all, all-but-reversion, all-but-translocation}
 51 |                  changes the mutation generation algorithm. By default, "both" is used, utilizing
 52 |                  single point and block mutations. On higher identity data sets, "single", which includes only single point mutations,
 53 |                  is preferable. The option "nonsingle-typical" uses only block mutations,
 54 |                  disallowing single point mutations. Other options include "all", which includes single,
 55 |                  block, and nontypical mutations translocation and reversion.
 56 | 
 57 |     --feat       determines the combinations of features to be used. By default, "slow" allows 11
 58 |                  combinations to be selected from. "fast" removes 2 slower features from "slow"
 59 |                  which include logarithm based features.
 60 | 
 61 |     --single-file  Using this option, (no value is needed), each file is treated as a single sequence.
 62 |                    If multiple sequences in a file are encountered, they are joined with 50 Ns,
 63 |                    and the k-mers are not counted in that region.
 64 |                    However, to be most accurate, it is advised to not use these sequences in the
 65 |                    training step (for mutations) and instead 1) train using un-joined sequences and
 66 |                    use --dump to dump to a file, and 2) use --recover with --single-file for the
 67 |                    file list.
 68 | 
 69 |     --sample     selects the total number of sequences used for both training and testing.
 70 |                  2000 is the default value. That is, --sample 2000 provides 2000 training
 71 |                  pairs and 2000 testing pairs.
 72 | 
 73 |     --num-templates   selects the number of "template" sequences from which to mutate.
 74 |                  For example, if 300 (the default) templates are requested, and the number of
 75 |                  "samples" is requested to be 2000 (the default), 300 sequences will be read in
 76 |                  and mutated 2000/300 times each to create 2000 semi-synthetic pairs.
 77 | 
 78 |     --min-feat   (default 4) sets the minimum feature pairs to be used. If set to 2, at least 2 feature pairs
 79 |                  will be used. Recall that features include pairwise combinations of the "feat" option.
 80 | 
 81 |     --max-feat   (default 4) sets the maximum feature pairs to be used. Diminishing returns appear quickly,
 82 |                  so a very large maximum (>10) is not advised.
 83 | 
 84 |     --min-id     (default 0.35) sets the lower bound for mutation identity scores to be calculated.
 85 |                  Shouldn't need to be set normally, as lower identities take much longer,
 86 |                  especially with single mutations only.
 87 | 
 88 |     --datatype   (8,16,32,64) Decides the integer size of the histograms. If not provided,
 89 |                  all sequences are read in and counted to ensure the largest k-mer does not
 90 |                  overflow. If the provided datatype is too small, it will overflow.
 91 | 
 92 |     --threads    sets the number of threads to be used. By default OpenMP uses the number of available cores
 93 |                  on your machine, but this parameter overrides that.
 94 | 
 95 |     --output     specifies the output file, in CD-HIT's CLSTR format, described below:
 96 |                  A '>Cluster ' followed by an increasing index designates a cluster.
 97 |                  Otherwise, the sequence is printed out.
 98 |                  A '*' at the end of a sequence designates the center of the cluster.
 99 |                  An example of a small data set:
100 | 
101 |                  >Cluster 0
102 |                  0       993nt, >seq128 template_6... *
103 |                  >Cluster 1
104 |                  0       1043nt, >seq235 template_10...
105 |                  1       1000nt, >seq216 template_10... *
106 |                  2       1015nt, >seq237 template_10...
107 | 
108 |     --delta      decides how many clusters are looked around in the final clustering stage.
109 |                  Increasing it creates more accuracy, but takes more time. Default value is 5.
110 | 
111 |     --iterations specifies how many iterations in the final stage of merging are done until convergence.
112 |                  Default value is 15.
113 | 
114 |     If the argument is not listed here, it is interpreted as an input (FASTA format) file.
115 | 
116 | 
117 | ### License
118 | 
119 | Academic use: The software is provided as-is under the GNU GPLv3.
120 | Any restrictions to use for-profit or non-academics: License needed.
121 | 


--------------------------------------------------------------------------------
/src/cluster/CRunner.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * CRunner.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #ifndef CRUNNER_H
 8 | #define CRUNNER_H
 9 | 
10 | #include <iostream>
11 | #include <map>
12 | #include "../clutil/Point.h"
13 | #include "../predict/HandleSeq.h"
14 | #include "../predict/Predictor.h"
15 | using namespace std;
16 | 
17 | class Runner { // Holds option values with their defaults and declares run()/get_opts(); member functions are defined outside this header
18 | public:
19 | 	Runner(int argc, char** argv); // receives the raw command line; presumably forwards it to get_opts() — confirm in CRunner.cpp
20 | 	~Runner() {};
21 | 	int run();
22 | private:
23 | 	template<class T> int do_run(); // T is presumably the histogram integer type (README --datatype) — confirm
24 | 	template<class T> void print_output(const map<Point<T>*, vector<Point<T>*>*> &m) const;
25 | 	int k = -1; // presumably the k-mer size, with -1 meaning "not given" (README --kmer) — confirm
26 |         int bandwidth; // NOTE(review): the only data member without a default initializer; confirm it is assigned before it is read
27 | 	double similarity = 0.90; // same value as the README default for --id
28 | 	long largest_count = 0;
29 | 	int iterations = 15; // same value as the README default for --iterations
30 | 	int delta = 5; // same value as the README default for --delta
31 | 	bool align = false;
32 | 	int total_sample_size = 2000; // same value as the README default for --sample
33 | 	int n_templates = 300; // same value as the README default for --num-templates
34 | 	int min_n_feat = 4; // same value as the README default for --min-feat
35 | 	int max_n_feat = 4; // same value as the README default for --max-feat
36 | 	bool is_single_file = false;
37 | 	double bias = 0;
38 | 	int mut_type = HandleSeq::BOTH; // README: "both" is the default --mut-type
39 | 	uint64_t feat_type = PRED_FEAT_FAST; // NOTE(review): README describes "slow" as the default for --feat — confirm which is right
40 | 	double min_id = 0.35; // same value as the README default for --min-id
41 | 	std::vector<std::string> files, notrain_files, all_files;
42 | 	string output = "output.clstr";
43 | 	void get_opts(int argc, char** argv);
44 |         int find_k();
45 | 
46 | 	bool dump = false;
47 | 	bool recover = false;
48 | 	std::string dump_str = "weights.txt"; // same file name as the README default for --dump
49 | 	Predictor<uint64_t> *pred64 = NULL; // NOTE(review): raw pointer and the destructor body is empty — confirm who deletes it
50 | };
51 | #endif
52 | 


--------------------------------------------------------------------------------
/src/cluster/Center.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Center.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #ifndef CENTER_H
 8 | #define CENTER_H
 9 | 
10 | #include "../clutil/Point.h"
11 | 
12 | template<class T>
13 | struct Center { // A cluster center (a cloned Point) together with the list of member points
14 | 	Center(Point<T>* c, const vector<Point<T>*> &pts) : center(c->clone()), points(pts), is_to_delete(false) { // stores a clone of c, not c itself
15 | 	}
16 | 	Center(const Center<T> &cc) : center(cc.center->clone()), points(cc.points), is_to_delete(cc.is_to_delete) {} // every copy clones the center again; the points vector is copied pointer-wise
17 | 
18 | 
19 | 	// Center(const Center<T>& c) {
20 | 	// 	center = c.get_clone();
21 | 	// 	points = c.getPoints_c();
22 | 	// 	is_to_delete = c.is_delete();
23 | 	// }
24 | 	~Center() { if (is_to_delete) { delete center; }} // NOTE(review): the clone is freed only after lazy_remove(); otherwise it outlives this object — confirm who frees it
25 | 
26 | 	Point<T>* getCenter() { return center; }
27 | 	vector<Point<T>*> &getPoints() { return points; }
28 | 
29 | 	const vector<Point<T>*> &getPoints_c() const { return points; };
30 | 	bool is_delete() const { return is_to_delete; }
31 | 	void lazy_remove() { is_to_delete = true; } // only sets the flag; the delete happens in the destructor
32 | 	size_t size() const { return points.size(); } // number of member points
33 | 	bool empty() const { return points.empty(); }
34 | 	Point<T>* get_clone() const { // returns a fresh clone of the center on every call
35 | 		return center->clone();
36 | 	}
37 | 	Point<T> *center; // NOTE(review): no copy-assignment operator is declared, so assignment copies this pointer as-is
38 | 	vector<Point<T>*> points;
39 | 	bool is_to_delete;
40 | };
41 | 
42 | #endif
43 | 


--------------------------------------------------------------------------------
/src/cluster/ClusterFactory.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * ClusterFactory.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef CLUSTERFACTORY_H
 9 | #define CLUSTERFACTORY_H
10 | 
11 | 
12 | #include <iostream>
13 | #include <vector>
14 | #include <functional>
15 | #include <limits>
16 | #include "../nonltr/ChromosomeOneDigit.h"
17 | #include "../nonltr/KmerHashTable.h"
18 | #include "../clutil/Point.h"
19 | #include "Trainer.h"
20 | #include "bvec.h"
21 | 
22 | template<class T>
23 | class ClusterFactory { // Declares point construction from sequence files and the MS() clustering step; member definitions are outside this header
24 | public:
25 | 	ClusterFactory(int k_len, int npp=std::numeric_limits<int>::max()) : num_per_partition(npp), k(k_len) {} // initializers follow the member declaration order
26 | 	std::vector<Point<T>*> build_points(vector<string> files, std::function<Point<T>*(ChromosomeOneDigit*)> get_point); // get_point turns one chromosome into one Point
27 |         Point<T>* get_histogram(ChromosomeOneDigit *chrom);
28 | 	Point<T>* get_divergence_point(ChromosomeOneDigit *chrom);
29 | 	T find_h(const std::vector<Point<T>*> &centers) const;
30 | 	void sort_nn(std::vector<Point<T>*> &points, Point<T>* nearest_to=NULL, int arg=3) const;
31 | 	void MS(bvec<T> &points, T bandwidth, double sim, const Trainer<T>& trn, string output, int iter, int delta);
32 | private:
33 | 	vector<int> lookup_table;
34 | 	const int num_per_partition; // defaults to the largest int when the caller gives no value
35 | 	int k; // k-mer length given to the constructor
36 | 	//void fill_table(KmerHashTable<unsigned long, T> &table, ChromosomeOneDigit *chrom, std::vector<T>& values);
37 | };
38 | 
39 | template<class V>
40 | void fill_table(KmerHashTable<unsigned long, V> &table, ChromosomeOneDigit *chrom, std::vector<V>& values)
41 | { // Counts the k-mers of every segment of chrom in table, then appends all table slots to values.
42 | 	const int kmer_len = table.getK();
43 | 	auto segments = chrom->getSegment();
44 | 	const char *bases = chrom->getBase()->c_str();
45 | 	for (vector<int> *bounds : *segments) {
46 | 		const int first = bounds->at(0); // element 0 is the segment start
47 | 		const int past_last = bounds->at(1) - kmer_len + 1; // element 1 is the segment end
48 | 		table.wholesaleIncrement(bases, first, past_last);
49 | 	}
50 | 	const unsigned long n_slots = table.getMaxTableSize();
51 | 	const V *slots = table.getValues();
52 | 	values.reserve(values.size() + n_slots); // one allocation for the whole append
53 | 	values.insert(values.end(), slots, slots + n_slots);
54 | }
55 | // template<class V>
56 | // void fill_table(KmerHashTable<unsigned long, V> &table, ChromosomeOneDigit *chrom, std::vector<V>& values)
57 | // {
58 | // 	const int k = table.getK();
59 | // 	auto segment = chrom->getSegment();
60 | // 	const char *seg_bases = chrom->getBase()->c_str();
61 | // 	for (vector<int> *v : *segment) {
62 | // 		int start = v->at(0);
63 | // 		int end = v->at(1);
64 | // 		table.wholesaleIncrement(seg_bases, start, end - k + 1);
65 | // 	}
66 | // 	std::vector<std::string> *keys = table.getKeys();
67 | // 	for (std::string str : *keys) {
68 | // 		values.push_back(table.valueOf(str.c_str()));
69 | // 	}
70 | // 	keys->clear();
71 | // 	delete keys;
72 | // }
73 | 
74 | #ifdef HEADER_HACK
75 | #ifndef CLUSTERFACTORY_C
76 | #define CLUSTERFACTORY_C
77 | #include "ClusterFactory.cpp"
78 | #endif
79 | #endif
80 | 
81 | #endif
82 | 


--------------------------------------------------------------------------------
/src/cluster/Trainer.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*- */
  2 | /*
  3 |  * Trainer.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  */
  7 | #include "Trainer.h"
  8 | #include "../predict/HandleSeq.h"
  9 | #include "../clutil/Datatype.h"
 10 | #include "../clutil/Loader.h"
 11 | #include "ClusterFactory.h"
 12 | #include <algorithm>
 13 | #include <set>
 14 | #include <map>
 15 | #include <cmath>
 16 | #include "../predict/Predictor.h"
 17 | #include "../predict/GLM.h"
 18 | #include "../predict/Feature.h"
 19 | #include "../clutil/Progress.h"
 20 | #include "../clutil/Random.h"
 21 | 
 22 | template<class T>
 23 | std::tuple<Point<T>*,double,size_t,size_t> Trainer<T>::get_close(Point<T> *p, bvec_iterator<T> istart, bvec_iterator<T> iend, bool &is_min_r) const
 24 | {
 25 | 	// Scans [istart, iend) for points whose length is compatible with p; outputs are described at the end.
 26 | #pragma omp declare reduction(pmax:std::tuple<Point<T>*,double,size_t,size_t>: \
 27 | 			      omp_out = get<1>(omp_in) > get<1>(omp_out) ? omp_in : omp_out ) \
 28 | 	initializer (omp_priv=std::make_tuple((Point<T>*)NULL,-1,0,0))
 29 | 
 30 | 	std::tuple<Point<T>*,
 31 | 		   double,
 32 | 		   size_t,
 33 | 		   size_t> result = std::tuple<Point<T>*, double, size_t, size_t>(NULL,
 34 | 				     -1,
 35 | 				     0,
 36 | 				     0);
 37 | 	// The length window uses get_id(), as merge() and filter() do, so a percentage-style cutoff (> 1) is normalized.
 38 | 	bool is_min = true;
 39 | 	uint64_t min_len = p->get_length() * get_id();
 40 | 	uint64_t max_len = p->get_length() / get_id();
 41 | #pragma omp parallel for reduction(pmax:result), reduction(&&:is_min)
 42 | 	for (bvec_iterator<T> i = istart; i < iend; ++i) {
 43 | 		Point<T>* pt = (*i).first;
 44 | 
 45 | 		uint64_t len = pt->get_length();
 46 | 		if (len < min_len || len > max_len) {
 47 | 			continue;
 48 | 		}
 49 | 		auto cache = feat->compute(*pt, *p);
 50 | 		double dist = (*feat)(0, cache);
 51 | 		double sum = classify(pt, p);
 52 | 		double res = round(sum) > 0;
 53 | 		// res is 1.0 when the rounded classifier output is positive, otherwise 0.0.
 54 | 		// result keeps the candidate with the largest feature-0 value and its (r, c)
 55 | 		// iterator position; the pmax reduction merges the per-thread winners.
 56 | 		// is_min stays true only while no candidate has res == 1.0.
 57 | 		result = (dist > std::get<1>(result)) ? std::make_tuple(pt, dist, i.r, i.c) : result;
 58 | 		is_min = is_min && (res != 1.0);
 59 | 		// Candidates classified as 1.0 get their flag set to true in the container.
 60 | 		if (res == 1.0) {
 61 | 			*i = std::make_pair(pt, true);
 62 | 			// each iteration writes only its own element
 63 | 		}
 64 | 	}
 65 | 
 66 | 	// is_min_r: true when no length-compatible candidate was classified as 1.0.
 67 | 	is_min_r = is_min;
 68 | 	// Returned tuple: (point, feature-0 value, i.r, i.c); (NULL, -1, 0, 0) when no candidate passed the length window.
 69 | 	return result;
 70 | 
 71 | }
 72 | 
 73 | template<class T>
 74 | long Trainer<T>::merge(vector<Center<T> > &centers, long current, long begin, long last) const
 75 | { // Among centers[begin..last] (inclusive), returns the index classified as 1 against centers[current] that has the largest feature-0 value; returns 0 when none qualifies.
 76 | #pragma omp declare reduction(ldpmax:std::pair<long,double>:			\
 77 | 			      omp_out = omp_in.second > omp_out.second ? omp_in : omp_out ) \
 78 | 	initializer (omp_priv=std::make_pair(0, std::numeric_limits<double>::lowest()))
 79 | 	std::pair<long,double> best = std::make_pair(0, std::numeric_limits<double>::lowest()); // lowest(), not min(): min() is the smallest positive double
 80 | 	Point<T>* p = centers[current].getCenter();
 81 | 	uint64_t cen_length = p->get_length();
 82 | 	uint64_t min_length = cen_length * get_id();
 83 | 	uint64_t max_length = cen_length / get_id();
 84 | #pragma omp parallel for reduction(ldpmax:best)
 85 | 	for (long i = begin; i <= last; i++) {
 86 | 		double sum = weights.get(0, 0); // intercept term of the linear model
 87 | 		double dist = 0; // feature 0 of the pair, filled in when col == 1 below
 88 | 
 89 | 		Point<T>* cen = centers[i].getCenter();
 90 | 		uint64_t cen_len = cen->get_length();
 91 | 		bool length_pass = cen_len >= min_length && cen_len <= max_length;
 92 | 		if (length_pass) {
 93 | 			auto cache = feat->compute(*cen, *p);
 94 | 			for (int col = 1; col < weights.getNumRow(); col++) {
 95 | 				double d = (*feat)(col-1, cache);
 96 | 				if (col == 1) {
 97 | 					dist = d;
 98 | 				}
 99 | 				sum += weights.get(col, 0) * d;
100 | 			}
101 | 			double res = round(Predictor<T>::classify_sum(sum));
102 | 			// only pairs classified as 1 compete; the larger feature-0 value wins
103 | 			if (res == 1) {
104 | 				best = best.second > dist ? best : std::make_pair(i, dist);
105 | 			}
106 | 		}
107 | 	}
108 | 	return best.first; // NOTE(review): index 0 and "no match" are indistinguishable to the caller
109 | }
110 | 
111 | template<class T>
112 | double Trainer<T>::classify(Point<T>*a, Point<T>*b) const
113 | { // Linear model on the pair (a, b): weights(0,0) plus weights(f+1,0) * feature f, passed through Predictor<T>::classify_sum.
114 | 	double acc = weights.get(0, 0);
115 | 	auto pair_cache = feat->compute(*a, *b);
116 | 	for (int f = 0; f + 1 < weights.getNumRow(); f++) {
117 | 		acc += weights.get(f + 1, 0) * (*feat)(f, pair_cache);
118 | 	}
119 | 	return Predictor<T>::classify_sum(acc);
120 | }
121 | 
122 | template<class T>
123 | void Trainer<T>::filter(Point<T> *p, vector<pair<Point<T> *, bool> > &vec) const
124 | { // Keeps only the entries of vec that are length-compatible with p and whose rounded classify() result is non-zero.
125 | 	const uint64_t ref_len = p->get_length();
126 | 	const uint64_t shortest = ref_len * get_id();
127 | 	const uint64_t longest = ref_len / get_id();
128 | 	for (auto& entry : vec) {
129 | 		const uint64_t len = entry.first->get_length();
130 | 		entry.second = true; // marked for removal unless the classifier clears it below
131 | 		if (len < shortest || len > longest) {
132 | 			continue; // outside the length window: never classified
133 | 		}
134 | 		const double label = round(classify(p, entry.first));
135 | 		entry.second = (label == 0);
136 | 	}
137 | 	// Erase every marked entry; the survivors are left with second == false.
138 | 	auto is_marked = [](pair<Point<T>*, bool> e) { return e.second; };
139 | 	auto new_end = std::remove_if(vec.begin(), vec.end(), is_marked);
140 | 	vec.erase(new_end, vec.end());
141 | }
142 | 
143 | template<class T>
144 | Point<T>* Trainer<T>::closest(Point<double> *p, vector<pair<Point<T> *, bool> > &vec) const
145 | { // Returns the point of vec with the smallest distance_d() to p, or NULL when vec is empty.
146 | 	Point<T>* best_pt = NULL;
147 | 	double best_dist = 0;
148 | 	for (auto& pt : vec) {
149 | 		// the strict '<' below keeps the earliest entry on ties
150 | 		double dist = pt.first->distance_d(*p);
151 | 		if (best_pt == NULL || dist < best_dist) {
152 | 			best_dist = dist;
153 | 			best_pt = pt.first;
154 | 		}
155 | 	}
156 | 	return best_pt;
157 | }
158 | 
159 | template<class T>
160 | void Trainer<T>::train(std::string dump_str) // Takes feat and weights from a Predictor built from dump_str (presumably a weights file written by --dump — confirm)
161 | {
162 | 	Predictor<T> pred(dump_str);
163 | 	delete feat; // drop the Feature installed by the constructor or by an earlier train()
164 | 	auto pr = pred.get_class(); // pair: first becomes the new feat, second is the GLM
165 | 	feat = pr.first; // NOTE(review): pred is destroyed when this function returns — confirm it does not own this Feature
166 | 	feat->set_save(false);
167 | 	matrix::GLM glm = pr.second;
168 | 	weights = glm.get_weights();
169 | }
170 | 
171 | template<class T>
172 | void Trainer<T>::train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff) // acc_cutoff is not read anywhere in this body
173 | {
174 | 	std::cout << "Splitting data" << endl;
175 | 	uintmax_t _id = points.size();
176 | 	Predictor<T> pred(k, cutoff, PRED_MODE_CLASS, feat_type,
177 | 			  mut_type, min_n_feat, max_n_feat, min_id);
178 | 	pred.train(points, _id, n_samples, n_templates); // _id starts at the number of stored points
179 | 	delete feat; // drop the Feature installed by the constructor or by an earlier train()
180 | 	auto pr = pred.get_class(); // pair: first becomes the new feat, second is the GLM
181 | 	feat = pr.first; // NOTE(review): pred is destroyed when this function returns — confirm it does not own this Feature
182 | 	matrix::GLM glm = pr.second;
183 | 	weights = glm.get_weights();
184 | 
185 | 	if (dump_str != "") {
186 | 		pred.save(dump_str, Datatype::get());
187 | 		exit(0); // a non-empty dump_str ends the whole process right after saving
188 | 	} else {
189 | 		pred.save("weights.txt", Datatype::get()); // otherwise the model is still saved, under a fixed file name
190 | 	}
191 | }
192 | 
193 | template class Trainer<uint8_t>; // explicit instantiations: the member templates are defined in this .cpp, so each supported T is listed here
194 | template class Trainer<uint16_t>;
195 | template class Trainer<uint32_t>;
196 | template class Trainer<uint64_t>;
197 | template class Trainer<int>;
198 | template class Trainer<double>;
199 | 


--------------------------------------------------------------------------------
/src/cluster/Trainer.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * Trainer.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef TRAINER_H
 9 | #define TRAINER_H
10 | 
11 | #include "../clutil/Point.h"
12 | #include "../predict/GLM.h"
13 | #include "../predict/Feature.h"
14 | #include "../predict/Predictor.h"
15 | #include "bvec.h"
16 | #include "Center.h"
17 | #include <set>
18 | 
19 | template<class T>
20 | class Trainer { // Owns a Feature<T> and a weight matrix; train() replaces both, the const queries below read them
21 | public:
22 | 	Trainer(std::vector<Point<T>*> v, size_t num_points, size_t largest_count, double cutoff_, size_t max_pts_from_one_, int ksize) : points(v), n_samples(num_points), n_templates(max_pts_from_one_), cutoff(cutoff_), k(ksize) {
23 | 		// largest_count is accepted for interface compatibility but is not used here
24 | 		feat = new Feature<T>(k);
25 | 	};
26 | 	~Trainer() { delete feat; } // NOTE(review): copying a Trainer would copy the raw feat pointer and delete it twice
27 | 	void train(std::string); // takes feat and weights from a Predictor built from the given string
28 | 	void train(int min_n_feat, int max_n_feat, uint64_t feat_type, int mut_type, double min_id, std::string dump_str, double acc_cutoff=97.5); // trains on the stored points
29 | 
30 | 	std::tuple<Point<T>*,double,size_t,size_t> get_close(Point<T>*, bvec_iterator<T> istart, bvec_iterator<T> iend,  bool& is_min) const;
31 | 
32 | 	void filter(Point<T>*, vector<pair<Point<T>*,bool> >&) const; // erases entries in place
33 | 	Point<T>* closest(Point<double>*, vector<pair<Point<T>*,bool> >&) const; // NULL for an empty vector
34 | 	long merge(vector<Center<T> > &centers, long current, long begin, long end) const;
35 | 
36 | 	double get_id() const { return cutoff > 1 ? cutoff / 100.0 : cutoff; } // a cutoff above 1 is read as a percentage
37 | private:
38 | 	double classify(Point<T>*, Point<T>*) const;
39 | 	matrix::Matrix weights;
40 | 	Feature<T> *feat; // owned: allocated in the constructor, replaced by train(), deleted in the destructor
41 | 	std::vector<Point<T>*> points;
42 | 	size_t n_samples, n_templates;
43 | 	double cutoff;
44 | 	int k;
45 | };
46 | #endif
47 | 


--------------------------------------------------------------------------------
/src/cluster/bvec.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * bvec.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #ifndef BVEC_H
 8 | #define BVEC_H
 9 | 
10 | #include "../clutil/Point.h"
11 | #include "bvec_iterator.h"
12 | 
13 | typedef struct bvec_idx { // index pair into a bvec; used as an argument and return type by bvec's member functions
14 | 	size_t first = 0, second = 0; // zero-initialized so a default-constructed index never holds indeterminate values
15 | 	bool is_empty = false;
16 | } bvec_idx_t;
17 | 
18 | /*
19 |  * operations needed:
20 |  *
21 |  * find bounds (range)
22 |  * get available or min and remove
23 |  *
24 |  */
25 | template<class T>
26 | using bv_data_type = std::pair<Point<T>*, bool>;
27 | 
28 | template<class T>
29 | using bv_row_type = vector<bv_data_type<T> >;
30 | 
31 | template<class T>
32 | using bv_col_type = vector<bv_row_type<T> >;
33 | 
34 | template<class T>
35 | class bvec {
36 | public:
37 | 	bvec(vector<uint64_t>& lengths, uint64_t bin_size=1000);
38 | 
39 | 	Point<T>* pop();
40 | 	Point<T>* peek() const;
41 | 	void insert(Point<T>* data);
42 | 	void insert_finalize(); /* sorts bins */
43 | 
44 | 
45 | 	bool index_of(uint64_t length, size_t* front, size_t* back) const;
46 | 	bool inner_index_of(uint64_t length, size_t& idx, size_t *front, size_t *back) const;
47 | 	bool empty() const;
48 | 
49 | 	std::pair<bvec_idx_t, bvec_idx_t>
50 | 	get_range(uint64_t begin_len, uint64_t end_len) const;
51 | 
52 | 	void remove_available(bvec_idx_t begin, bvec_idx_t end, std::vector<Point<T>*> &);
53 | 
54 | 	uint64_t absolute_idx(bvec_idx_t idx) const;
55 | 
56 |         bvec_iterator<T> iter(bvec_idx_t idx);
57 | 	typedef bvec_iterator<T> iterator;
58 | 	typedef bvec_iterator<T> const_iterator;
59 | 
60 | 	size_t report() const;
61 | 	size_t size() const;
62 | 
63 | 	void erase(size_t r, size_t c);
64 | private:
65 |         bv_col_type<T> data;
66 | 	vector<uint64_t> begin_bounds;
67 | };
68 | 
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/src/cluster/bvec_iterator.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * bvec_iterator.cpp
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #include "bvec_iterator.h"
 8 | 
 9 | template<class T>
10 | bvec_iterator<T> bvec_iterator<T>::operator++()
11 | {
12 | 	if (r != col->size()) {
13 | 		if (c + 1 < col->at(r).size()) {
14 | 			c++;
15 | 		} else {
16 | 			r++;
17 | 			c = 0;
18 | 			while (r < col->size() && col->at(r).empty()) {
19 | 				r++;
20 | 			}
21 | 		}
22 | 	} else {
23 | 		cerr << "tried incrementing null iterator" << endl;
24 | 		throw 10;
25 | 	}
26 | 	return *this;
27 | }
28 | 
29 | template class bvec_iterator<uint8_t>;
30 | template class bvec_iterator<uint16_t>;
31 | template class bvec_iterator<uint32_t>;
32 | template class bvec_iterator<uint64_t>;
33 | template class bvec_iterator<int>;
34 | template class bvec_iterator<double>;
35 | 


--------------------------------------------------------------------------------
/src/cluster/bvec_iterator.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * bvec_iterator.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #include "bvec.h"
 8 | #ifndef BVEC_ITERATOR_H
 9 | #define BVEC_ITERATOR_H
10 | 
11 | 
12 | template<class T>
13 | class bvec_iterator {
14 | public:
15 | 	// iterator: split ALL possible points into chunks by indices
16 | 	using dtype = std::pair<Point<T>*,bool>;
17 | 	using vtype = vector<vector<dtype> >;
18 | 	bvec_iterator(size_t _r,
19 | 		      size_t _c,
20 | 		      vtype* col_) : r(_r), c(_c), col(col_) {}
21 | 
22 | 	bvec_iterator operator++();
23 | 	bvec_iterator operator++(int x) {
24 | 		return ++(*this);
25 | 	}
26 | 	dtype& operator*() {
27 | 		return col->at(r).at(c);
28 | 	}
29 | 	void operator+=(int64_t n) {
30 | 		if (n < 0) {
31 | 			throw "oops";
32 | 		}
33 | 		for (int i = 0; i < n; i++) {
34 | 			operator++();
35 | 		}
36 | 	}
37 | 	bool operator==(const bvec_iterator& rhs) const {
38 | 		return rhs.c == c && rhs.r == r;
39 | 	}
40 | 	bool operator<(const bvec_iterator& rhs) const {
41 | 		if (r < rhs.r) {
42 | 			return true;
43 | 		} else if (r == rhs.r) {
44 | 			return c < rhs.c;
45 | 		} else {
46 | 			return false;
47 | 		}
48 | 	}
49 | 	bool operator<=(const bvec_iterator& rhs) const {
50 | 		if (r < rhs.r) {
51 | 			return true;
52 | 		} else if (r == rhs.r) {
53 | 			return c <= rhs.c;
54 | 		} else {
55 | 			return false;
56 | 		}
57 | 	}
58 | 	bool operator!=(const bvec_iterator& rhs) const {
59 | 		return r != rhs.r || c != rhs.c;
60 | 	}
61 | 	int64_t operator-(const bvec_iterator& rhs) const {
62 | 		int64_t sum = 0;
63 | 		if (*this < rhs) {
64 | 			return -1 * (rhs - *this);
65 | 		}
66 | 		// subtract cols until last row is reached
67 | 		if (r == rhs.r) {
68 | 			return c - rhs.c;
69 | 		}
70 | 		sum += c;
71 | 		sum += col->at(rhs.r).size() - rhs.c;
72 | 		for (size_t i = rhs.r + 1; i < r; i++) {
73 | 			sum += col->at(i).size();
74 | 		}
75 | 		return sum;
76 | 	}
77 | 	// bvec_iterator operator[](uint64_t idx) {
78 | 
79 | 	// }
80 | //private:
81 | 	size_t r,c;
82 |         vtype* col;
83 | };
84 | #endif
85 | 


--------------------------------------------------------------------------------
/src/cluster/meshclust2.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * meshclust2.cpp
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #include "CRunner.h"
 8 | int main(int argc, char **argv)
 9 | {
10 | 	Runner runner(argc, argv);
11 | 	return runner.run();
12 | }
13 | 


--------------------------------------------------------------------------------
/src/clutil/Clock.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * Clock.cpp
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #include "Clock.h"
 9 | #include <chrono>
10 | #include <ctime>
11 | 
12 | static const auto _begin = std::chrono::system_clock::now();
13 | 
14 | void Clock::stamp(std::string desc)
15 | {
16 | 	auto end = std::chrono::system_clock::now();
17 | 	std::chrono::duration<double> diff = end - _begin;
18 | 	std::cout << "timestamp " << desc << " " << diff.count() << std::endl;
19 | }
20 | 


--------------------------------------------------------------------------------
/src/clutil/Clock.h:
--------------------------------------------------------------------------------
 1 | // -*- C++ -*-
 2 | /*
 3 |  * Clock.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef CLOCK_H
 9 | #define CLOCK_H
10 | #include <iostream>
11 | 
/*
 * Clock
 *
 * Stateless helper exposing a single static function that writes a
 * labelled timestamp line (see Clock.cpp for the format).
 */
class Clock {
public:
	/* desc: label printed alongside the elapsed time */
	static void stamp(std::string desc);
};
16 | #endif
17 | 


--------------------------------------------------------------------------------
/src/clutil/Datatype.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * Datatype.cpp
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #include "Datatype.h"
 9 | std::string _dt_datatype = "";
10 | 
11 | std::string Datatype::get()
12 | {
13 | 	return _dt_datatype;
14 | }
15 | 
16 | void Datatype::set(std::string s)
17 | {
18 | 	_dt_datatype = s;
19 | }
20 | 


--------------------------------------------------------------------------------
/src/clutil/Datatype.h:
--------------------------------------------------------------------------------
 1 | // -*- C++ -*-
 2 | /*
 3 |  * Datatype.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef DATATYPE_H
 9 | #define DATATYPE_H
10 | #include <string>
11 | 
/*
 * Datatype
 *
 * Static accessor pair for one process-wide string (defined in
 * Datatype.cpp) naming the selected datatype.
 */
class Datatype {
public:
	/* Stores s as the current datatype name. */
	static void set(std::string s);
	/* Returns the current datatype name. */
	static std::string get();
};
17 | #endif
18 | 


--------------------------------------------------------------------------------
/src/clutil/DivergencePoint.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*-
  2 |  *
  3 |  * DivergencePoint.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  *
  7 |  * Main histogram type, includes distance() which is intersection() in Feature.cpp
  8 |  */
  9 | #include "DivergencePoint.h"
 10 | #include <cmath>
 11 | #include <cstring>
 12 | #include <cfenv>
 13 | #include <iostream>
 14 | 
 15 | 
 16 | template<class T>
 17 | double DivergencePoint<T>::prob_under(Point<T> &p) const
 18 | {
 19 | 	const DivergencePoint<T>& c = dynamic_cast<const DivergencePoint<T>&>(p);
 20 | 	double sum = 0;
 21 | 	const size_t s = points.size();
 22 | 	double total = 0;
 23 | 	std::feclearexcept(FE_OVERFLOW);
 24 | 	std::feclearexcept(FE_UNDERFLOW);
 25 | 	for (int i = 0; i < s; i++) {
 26 | 		sum += c.points[i];
 27 | 		if (i % 4 == 3) {
 28 | 			for (int j = i - 3; j <= i; j++) {
 29 | 				double prob = c.points[j] / sum;
 30 | 				double log_prob = log(prob);
 31 | 				total += (points[j] - 1) * log_prob;
 32 | 				if ((bool)std::fetestexcept(FE_UNDERFLOW)) {
 33 | 					cout << "Underflow!" << endl;
 34 | 				}
 35 | 				//	cond.push_back(log(prob)/log4);
 36 | 			}
 37 | 			sum = 0;
 38 | 		}
 39 | 	}
 40 | 	// for (size_t q = 0; q < s; q += 4) {
 41 | 	// 	double sum = 0;
 42 | 	// 	for (int i = q; i < q + 4; i++) {
 43 | 	// 		sum += c.points[i];
 44 | 	// 	}
 45 | 	// 	for (int i = q; i < q + 4; i++) {
 46 | 	// 		double prob = c.points[i] / sum;
 47 | 	// 		double log_prob = log(prob);
 48 | 	// 		total += (points[i] - 1) * log_prob;
 49 | 	// 	}
 50 | 	// }
 51 | 	return exp(total / s);
 52 | }
 53 | 
 54 | template<class T>
 55 | double DivergencePoint<T>::distance_d(Point<double>& p) const
 56 | {
 57 | 	const DivergencePoint<double>& c = dynamic_cast<const DivergencePoint<double>&>(p);
 58 | 	uint64_t dist = 0;
 59 | 	uint64_t mag = 0;
 60 | 	for (auto i = 0; i < points.size(); i++) {
 61 | 		dist += 2 * min(points[i],(T)round(c.points[i]));
 62 | 		mag += points[i] + c.points[i];
 63 | 	}
 64 | 	double frac = (double)dist / mag;
 65 | 	return 10000.0 * (1.0 - frac * frac);
 66 | }
 67 | 
 68 | 
 69 | template<class T>
 70 | uint64_t DivergencePoint<T>::distance(const Point<T>& p) const
 71 | {
 72 | 	const DivergencePoint<T>& c = dynamic_cast<const DivergencePoint<T>&>(p);
 73 | 	uint64_t dist = 0;
 74 | 	const uint64_t mag = getPseudoMagnitude() + c.getPseudoMagnitude();
 75 | 	#pragma omp simd
 76 | 	for (auto i = 0; i < points.size(); i++) {
 77 | 		dist += min(points[i], c.points[i]);
 78 | 	}
 79 | 	dist *= 2;
 80 | 	double frac = (double)dist / mag;
 81 | 	return 10000.0 * (1.0 - frac * frac);
 82 | }
 83 | 
 84 | template<class T>
 85 | double DivergencePoint<T>::distance_k1(const Point<T> &p) const
 86 | {
 87 | 	uint64_t dist = 0;
 88 | 
 89 | 	auto a = Point<T>::get_1mers(), b = p.get_1mers();
 90 | 	uint64_t mag = 0;
 91 | 	for (auto i = 0; i < 4; i++) {
 92 | 		dist += std::min(a[i], b[i]);
 93 | 		mag += a[i];
 94 | 	}
 95 | 	return (double)dist / (double)mag;
 96 | 
 97 | }
 98 | template<class T>
 99 | DivergencePoint<T>::DivergencePoint(const std::vector<T>& pts, uint64_t len)
100 | {
101 | 	mag = 0;
102 | 	for (unsigned int i = 0; i < pts.size(); i++) {
103 | 		points.push_back(pts.at(i));
104 | 		mag += pts.at(i);
105 | 	}
106 | //	display();
107 | 	nucl_length = len;
108 | 	to_delete = false;
109 | 	id = 0;
110 | }
111 | 
112 | 
113 | template<class T>
114 | DivergencePoint<T>::DivergencePoint(unsigned int size)
115 | {
116 | 	for (unsigned int i = 0; i < size; i++) {
117 | 		points.push_back(0);
118 | 	}
119 | 	to_delete = false;
120 | 	nucl_length = 0;
121 | 	id = 0;
122 | }
123 | 
124 | template<class T>
125 | void DivergencePoint<T>::operator*=(double d)
126 | {
127 | 	unsigned int size = points.size();
128 | 	for (auto& pt : points) {
129 | 		pt *= d;
130 | 	}
131 | }
132 | 
133 | template<class T>
134 | bool DivergencePoint<T>::operator<(Point<T>& p) const
135 | {
136 | 	const DivergencePoint<T>& h = dynamic_cast<const DivergencePoint<T>&>(p);
137 | 	unsigned int size = std::min(points.size(),h.points.size());
138 | 	/*int boundary = 0;
139 | 	for (unsigned int i = 0; i < size; i++) {
140 | 		if (points.at(i) > h.points.at(i)) {
141 | 			boundary++;
142 | 		} else if (points.at(i) < h.points.at(i)) {
143 | 			boundary--;
144 | 		}
145 | 	}
146 | 	return boundary < 0;*/
147 | 	for (unsigned int i = 0; i < size; i++) {
148 | 		if (points.at(i) >= h.points.at(i)) {
149 | 			return false;
150 | 		}
151 | 	}
152 | 	return true;
153 | }
154 | 
155 | template<class T>
156 | void DivergencePoint<T>::operator/=(double d)
157 | {
158 | 	unsigned int size = points.size();
159 | 	for (unsigned int i = 0; i < size; i++) {
160 | 		points[i] /= d;
161 | 	}
162 | //	cout << endl;
163 | }
164 | 
165 | template<class T>
166 | void DivergencePoint<T>::operator+=(Point<T>& p)
167 | {
168 | 	const DivergencePoint<T>& h = dynamic_cast<const DivergencePoint<T>&>(p);
169 | 	unsigned int size = std::min(points.size(),h.points.size());
170 | 	for (unsigned int i = 0; i < size; i++) {
171 | 		points.at(i) += h.points.at(i);
172 | 	}
173 | }
174 | 
175 | template<class T>
176 | uint64_t DivergencePoint<T>::operator-(const Point<T>& p) const
177 | {
178 | 	return distance(p);
179 | }
180 | 
181 | template<class T>
182 | void DivergencePoint<T>::set(Point<T>& p)
183 | {
184 | 	const DivergencePoint<T>& h = dynamic_cast<const DivergencePoint<T>&>(p);
185 | 	points = std::vector<T>(h.points);
186 | 	set_length(h.get_length());
187 | 	to_delete = h.to_delete;
188 | 	Point<T>::set_header(h.get_header());
189 | 	set_id(h.get_id());
190 | }
191 | 
192 | template<class T>
193 | void DivergencePoint<T>::display() const
194 | {
195 | 	unsigned size = points.size();
196 | 	for (unsigned i = 0; i < size; i++) {
197 | 		std::cout << points.at(i) << " ";
198 | 	}
199 | 	std::cout << std::endl;
200 | }
201 | 
202 | template<class T>
203 | void DivergencePoint<T>::zero()
204 | {
205 | 	for (auto &i : points) {
206 | 		i = 0;
207 | 	}
208 | }
209 | 
210 | template<class T>
211 | void DivergencePoint<T>::addOne()
212 | {
213 | 	for (auto& a : points) {
214 | 		a++;
215 | 	}
216 | }
217 | 
218 | template<class T>
219 | void DivergencePoint<T>::subOne()
220 | {
221 | 	for (auto& a : points) {
222 | 		a--;
223 | 	}
224 | }
225 | 
226 | /*
227 |  * p(y|x) = cond_p
228 |  * q(y|x) = cond_p
229 |  */
230 | template<class T>
231 | double DivergencePoint<T>::divergence(Point<T>& p) const
232 | {
233 | 	const DivergencePoint<T>& d = dynamic_cast<const DivergencePoint<T>&>(p);
234 | 	T sum4_p = 0,      sum4_q = 0;                 // Sum for every 4 nucleotides
235 |         double total_sum_p = 0, total_sum_q = 0;       // Total running sum of all nucleotides
236 | 	double outer_sum_p = 0, outer_sum_q = 0;       // Prior K-mer sum
237 | 	for (int i = 0; i < points.size(); i++) { // Compute divergence for P and Q simultaneously
238 | 		sum4_p += points[i];
239 | 		sum4_q += d.points[i];
240 | 		if (i % 4 == 3) { //finished counting word, now compute probabilities
241 | 			double inner_sum_p = 0;        // Sum of p(X|Y) * log(p(X|Y) / q(X|Y))
242 | 			double inner_sum_q = 0;        // Sum of q(X|Y) * log(q(X|Y) / p(X|Y))
243 | 			for (int j = i - 3; j <= i; j++) {
244 | 				double conditional_p =   points[j] / sum4_p;
245 | 				double conditional_q = d.points[j] / sum4_q;
246 | 				double lg = log(conditional_p) - log(conditional_q);
247 | 				inner_sum_p +=      conditional_p * lg;
248 | 				inner_sum_q += -1 * conditional_q * lg;
249 | 			}
250 | 			outer_sum_p += sum4_p * inner_sum_p;
251 | 			outer_sum_q += sum4_q * inner_sum_q;
252 | 
253 | 			total_sum_p += sum4_p;
254 | 			total_sum_q += sum4_q;
255 | 			sum4_p = 0;
256 | 			sum4_q = 0;
257 | 		}
258 | 	}
259 | 	double left = outer_sum_p / total_sum_p;
260 | 	double right = outer_sum_q / total_sum_q;
261 | 	return (left + right) / 2.0;
262 | }
263 | 
264 | template<class T>
265 | uint64_t DivergencePoint<T>::getPseudoMagnitude() const
266 | {
267 | 	return mag;
268 | }
269 | 
270 | 
271 | template<class T>
272 | uint64_t DivergencePoint<T>::getRealMagnitude() const
273 | {
274 | 	return mag - points.size();
275 | }
276 | 
277 | #ifndef HEADER_HACK
278 | template class DivergencePoint<int>;
279 | template class DivergencePoint<double>;
280 | template class DivergencePoint<uint64_t>;
281 | template class DivergencePoint<uint32_t>;
282 | template class DivergencePoint<uint16_t>;
283 | template class DivergencePoint<uint8_t>;
284 | #endif
285 | 


--------------------------------------------------------------------------------
/src/clutil/DivergencePoint.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * DivergencePoint.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Header for most often used k-mer histogram type
 8 |  */
 9 | #ifndef DIVERGENCE_POINT_H
10 | #define DIVERGENCE_POINT_H
11 | #include "Point.h"
12 | #include <vector>
13 | template<class T>
14 | class DivergencePoint : public Point<T> {
15 | public:
16 | 	DivergencePoint(const std::vector<T>& pts, uint64_t len);
17 | 	DivergencePoint(unsigned int size);
18 | 	~DivergencePoint() { points.clear(); }
19 | 	void operator*=(double d);
20 | 	void operator/=(double d);
21 | 	uint64_t operator-(const Point<T>& p) const;
22 | 	bool operator<(Point<T>& p) const;
23 | 	void operator+=(Point<T>& p);
24 | 	void set(Point<T>& p);
25 | 	void display() const;
26 | 	void zero();
27 | 	void addOne();
28 | 	void subOne();
29 | 	double prob_under(Point<T>& p) const;
30 | 	uint64_t getRealMagnitude() const;
31 | 	uint64_t getPseudoMagnitude() const;
32 | //	T magnitude() const { return getRealMagnitude(); };
33 | 	double distance_k1(const Point<T>& p) const;
34 | 	double get_stddev() const { return s_dev; };
35 | 	DivergencePoint* clone() const {
36 | 		auto d = new DivergencePoint(points, to_delete);
37 | 		d->set_header(Point<T>::get_header());
38 | 		d->set_id(get_id());
39 | 		d->set_length(get_length());
40 | 		d->set_stddev(get_stddev());
41 | 		d->set_data_str(Point<T>::get_data_str());
42 | 		return d;
43 | 	}
44 | 	DivergencePoint* create() const {
45 | 		return new DivergencePoint(points.size());
46 | 	}
47 | 	Point<double>* create_double() const {
48 | 		vector<double> v;
49 | 		for (auto val : points) {
50 | 			v.push_back(val);
51 | 		}
52 | 		return new DivergencePoint<double>(v, nucl_length);
53 | 	}
54 | 	void set_arg_to_this_d(Point<double>& p) const {
55 | 		DivergencePoint<double>& c = dynamic_cast< DivergencePoint<double>&>(p);
56 | 		for (int i = 0; i < points.size(); i++) {
57 | 			c.points[i] = points[i];
58 | 		}
59 | 		c.set_id(id);
60 | 	};
61 | 
62 | 
63 | 	bool is_to_delete() const {
64 | 		return to_delete;
65 | 	}
66 | 	void set_to_delete(bool b) {
67 | 		to_delete = b;
68 | 	}
69 | 	double divergence(Point<T>& p) const;
70 | 	double distance_d(Point<double>& p) const;
71 | 	uint64_t distance(const Point<T>& p) const;
72 | 	const vector<T>& get_data() const { return points; }
73 | 	void set_id(uintmax_t c_id) { id = c_id; };
74 | 	const uintmax_t get_id() const { return id; };
75 | 
76 | 	void set_length(unsigned long len) { nucl_length = len; };
77 | 	void set_stddev(double s_dev_) { s_dev = s_dev_; };
78 | 	unsigned long get_length() const { return nucl_length; };
79 | 	unsigned long size() const { return points.size(); };
80 | 	std::vector<T> points;
81 | 
82 | private:
83 | 	uintmax_t mag;
84 | 	bool to_delete;
85 | 	uint64_t id;
86 | 	uint64_t nucl_length;
87 | 	double s_dev;
88 | };
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/src/clutil/Histogram.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*-
  2 |  *
  3 |  * Histogram.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  *
  7 |  * Artifact from early development of MeShClust
  8 |  */
  9 | #ifndef HEADER_HACK
 10 | #include "Histogram.h"
 11 | #endif
 12 | 
 13 | #include <vector>
 14 | #include <iostream>
 15 | 
 16 | template<class T>
 17 | double Histogram<T>::distance_k1(const Point<T> &p) const
 18 | {
 19 | 	throw "Not implemented";
 20 | 	const Histogram<T>& h = dynamic_cast<const Histogram<T>&>(p);
 21 | 	uint64_t dist = 0;
 22 |         auto size = std::min(points.size(),h.points.size());
 23 | /*
 24 | 	for (unsigned int i = 0; i < size; i++) {
 25 | 		T l = points.at(i);
 26 | 		T r = h.points.at(i);
 27 | 		dist += (l > r) ? (l - r) : (r - l);
 28 | 	}
 29 | */
 30 | 	uint64_t avg_mag = (magnitude() + h.magnitude()) / 2.0;
 31 | 	for (auto i = 0; i < size; i++) {
 32 | 		T l = points[i];
 33 | 		T r = h.points[i];
 34 | 		dist += min(l, r);
 35 | 	}
 36 | 	return 1.0 - dist / avg_mag;
 37 | }
 38 | template<class T>
 39 | Histogram<T>::Histogram(std::vector<T> pts, char mark)
 40 | {
 41 | 	for (T t : pts) {
 42 | 		points.push_back(t);
 43 | 	}
 44 | 	to_delete = false;
 45 | }
 46 | template<class T>
 47 | Histogram<T>::Histogram(std::vector<T> pts)
 48 | {
 49 | 	for (T t : pts) {
 50 | 		points.push_back(t);
 51 | 	}
 52 | 	to_delete = false;
 53 | }
 54 | 
 55 | template<class T>
 56 | Histogram<T>::Histogram(std::vector<T> pts, bool toDelete)
 57 | {
 58 | 	for (T t : pts) {
 59 | 		points.push_back(t);
 60 | 	}
 61 | 	to_delete = toDelete;
 62 | }
 63 | 
 64 | template<class T>
 65 | Histogram<T>::Histogram(unsigned int size)
 66 | {
 67 | 	for (unsigned int i = 0; i < size; i++) {
 68 | 		points.push_back(0);
 69 | 	}
 70 | 	to_delete = false;
 71 | }
 72 | 
 73 | template<class T>
 74 | void Histogram<T>::operator*=(double d)
 75 | {
 76 | 	for (T &t : points) {
 77 | 		t *= d;
 78 | 	}
 79 | }
 80 | 
 81 | template<class T>
 82 | bool Histogram<T>::operator<(Point<T>& p) const
 83 | {
 84 | 	const Histogram<T>& h = dynamic_cast<const Histogram<T>&>(p);
 85 | 	unsigned int size = std::min(points.size(),h.points.size());
 86 | 	for (unsigned int i = 0; i < size; i++) {
 87 | 		if (points.at(i) >= h.points.at(i)) {
 88 | 			return false;
 89 | 		}
 90 | 	}
 91 | 	return true;
 92 | }
 93 | 
 94 | template<class T>
 95 | void Histogram<T>::operator/=(double d)
 96 | {
 97 | 	unsigned int size = points.size();
 98 | 	for (unsigned int i = 0; i < size; i++) {
 99 | 		points.at(i) = points.at(i) / d;
100 | 	}
101 | }
102 | 
103 | template<class T>
104 | void Histogram<T>::operator+=(Point<T>& p)
105 | {
106 | 	const Histogram<T>& h = dynamic_cast<const Histogram<T>&>(p);
107 | 	unsigned int size = std::min(points.size(),h.points.size());
108 | 	for (unsigned int i = 0; i < size; i++) {
109 | 		points.at(i) += h.points.at(i);
110 | 	}
111 | }
112 | 
113 | template<class T>
114 | uint64_t Histogram<T>::operator-(const Point<T>& p) const
115 | {
116 | 	return distance(p);
117 | }
118 | 
119 | template<class T>
120 | void Histogram<T>::set(Point<T>& p)
121 | {
122 | 	const Histogram<T>& h = dynamic_cast<const Histogram<T>&>(p);
123 | 	points = h.points;
124 | }
125 | 
126 | template<class T>
127 | void Histogram<T>::display() const
128 | {
129 | 	unsigned size = points.size();
130 | 	for (unsigned i = 0; i < size; i++) {
131 | 		std::cout << points.at(i) << " ";
132 | 	}
133 | 	std::cout << std::endl;
134 | }
135 | 
136 | template<class T>
137 | void Histogram<T>::addOne()
138 | {
139 | 	for (auto &a : points) {
140 | 		a++;
141 | 	}
142 | }
143 | template<class T>
144 | void Histogram<T>::subOne()
145 | {
146 | 	for (auto &a : points) {
147 | 		a--;
148 | 	}
149 | }
150 | 
151 | template<class T>
152 | void Histogram<T>::zero()
153 | {
154 | 	for (typename std::vector<T>::iterator it = points.begin(); it != points.end(); ++it) {
155 | 		*it = 0;
156 | 	}
157 | }
158 | 
159 | template<class T>
160 | uint64_t Histogram<T>::distance(const Point<T>& p) const
161 | {
162 | /*
163 | 	// Vectors should be the same width
164 | 	const Histogram<T>& h = dynamic_cast<const Histogram<T>&>(p);
165 | 	T dist = 0;
166 | 	unsigned int size = std::min(points.size(),h.points.size());
167 | 	for (unsigned int i = 0; i < size; i++) {
168 | 		T l = points.at(i);
169 | 		T r = h.points.at(i);
170 | 		dist += (l > r) ? (l - r) : (r - l);
171 | 	}
172 | 	return dist;
173 | */
174 | 	throw "Not implemented";
175 | 	return 0;
176 | }
177 | 
178 | template<class T>
179 | uint64_t Histogram<T>::magnitude() const
180 | {
181 | 	uint64_t dist = 0;
182 | 	for (auto const& p : points) {
183 | 		dist += p;
184 | 	}
185 | 	return dist;
186 | }
187 | 
188 | #ifndef HEADER_HACK
189 | template class Histogram<int>;
190 | template class Histogram<double>;
191 | template class Histogram<uint64_t>;
192 | template class Histogram<uint32_t>;
193 | template class Histogram<uint16_t>;
194 | template class Histogram<uint8_t>;
195 | #endif
196 | 


--------------------------------------------------------------------------------
/src/clutil/Histogram.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Histogram.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Artifact from early development of MeShClust
 8 |  */
 9 | #ifndef HISTOGRAM_H
10 | #define HISTOGRAM_H
11 | #include <vector>
12 | #include "Point.h"
13 | 
14 | template<class T>
15 | class Histogram : public Point<T> {
16 | public:
17 | 	Histogram(std::vector<T> pts);
18 | 	Histogram(std::vector<T> pts, char marker);
19 | 	Histogram(std::vector<T> pts, bool to_delete);
20 | 	Histogram(unsigned int size);
21 | 	~Histogram() {}
22 | 	void operator*=(double d);
23 | 	void operator/=(double d);
24 | 	uint64_t operator-(const Point<T>& p) const;
25 | 	bool operator<(Point<T>& p) const;
26 | 	void operator+=(Point<T>& p);
27 | 	void set(Point<T>& p);
28 | 	void display() const;
29 | 	void zero();
30 | 	void addOne();
31 | 	void subOne();
32 | 	double distance_k1(const Point<T>& p) const;
33 | 	double prob_under(Point<T>& p) const { return distance(p); };
34 | 	uint64_t distance(const Point<T>& p) const;
35 | 	uint64_t magnitude() const;
36 | 	uint64_t getRealMagnitude() const { return 0; };
37 | 	double distance_d(Point<double>& p) const {
38 | 		throw "not implemented";
39 | 		return 0;
40 | 	}
41 | 	void set_arg_to_this_d(Point<double>& p) const {
42 | 		throw "not implemented";
43 | 	}
44 | 	Point<double>* create_double() const {
45 | 		throw "not implemented";
46 | 		return NULL;
47 | 	}
48 | 	Histogram* clone() const {
49 | 		return new Histogram(points, to_delete);
50 | 	}
51 | 	Histogram* create() const {
52 | 		return new Histogram(points.size());
53 | 	}
54 | 	bool is_to_delete() const {
55 | 		return to_delete;
56 | 	}
57 | 	void set_to_delete(bool b) {
58 | 		to_delete = b;
59 | 	}
60 | 	const vector<T>& get_data() const { return points; }
61 | 	void set_id(uintmax_t c_id) { id = c_id; };
62 | 	const uintmax_t get_id() const { return id; };
63 | 	void set_length(unsigned long len) { nucl_length = len; };
64 | 	unsigned long get_length() const { return nucl_length; };
65 |         unsigned long size() const { return points.size(); };
66 | private:
67 | 	std::vector<T> points;
68 | 	bool to_delete;
69 | 	uintmax_t id;
70 | 	unsigned long nucl_length;
71 | };
72 | 
/*
 * Header-only mode: when HEADER_HACK is defined, pull the template
 * definitions in from Histogram.cpp, guarded by HISTOGRAM_C. The macro
 * defined here now matches the one tested (it was misspelled
 * HISTORGRAM_C, so the guard was never actually set).
 */
#ifdef HEADER_HACK
#ifndef HISTOGRAM_C
#define HISTOGRAM_C
#include "Histogram.cpp"
#endif
#endif
79 | 
80 | #endif
81 | 


--------------------------------------------------------------------------------
/src/clutil/LCG.h:
--------------------------------------------------------------------------------
 1 | // -*- C++ -*-
 2 | /*
 3 |  * LCG.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef LCG_H
 9 | #define LCG_H
10 | 
11 | #include <limits>
12 | #include <algorithm>
13 | #include <stdint.h>
14 | #include <iostream>
15 | 
/*
 * LCG
 *
 * 64-bit linear congruential generator using the MMIX multiplier and
 * increment. All state is the single `seed` word, which every draw
 * overwrites.
 */
class LCG {
public:
	LCG(uint64_t seed_) : seed(seed_) {}

	/*
	 * Next draw reduced modulo max and converted to T.
	 * Returns 0 without drawing when max is 0.
	 */
	template<class T>
	T randMod(T max) {
		if (max == 0) {
			return 0;
		}
		uint64_t reduced = random() % max;
		return (T)reduced;
	}

	/* Same as random(); named for callers that want a seed for another generator. */
	uint64_t nextRandSeed() {
		return random();
	}

	/*
	 * Draws a value in [max(id - range, low), min(id + range, high)],
	 * obtained by scaling the next draw by the largest uint64_t value.
	 */
	double rand_between(double id, double range, double low, double high) {
		const uint64_t draw = random();
		const double unit = (double)draw / std::numeric_limits<uint64_t>::max();
		const double lower = std::max(id - range, low);
		const double upper = std::min(id + range, high);
		return lower + (upper - lower) * unit;
	}

	/* Advances the state by one step and returns the new state. */
	uint64_t random() {
		// MMIX random, from https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use
		// NOTE(review): the original comment says "Should be thread safe";
		// the update below is a plain read-modify-write of `seed`, so that
		// presumably assumes one LCG instance per thread -- confirm.
		const uint64_t next = seed * 6364136223846793005 + 1442695040888963407;
		seed = next;
		return seed;
	}
private:
	uint64_t seed;
};
51 | #endif
52 | 


--------------------------------------------------------------------------------
/src/clutil/Loader.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*-
  2 |  *
  3 |  * Loader.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  *
  7 |  * Class which can 'preload' chunks of sequences from a file list,
  8 |  * and then count the k-mers separately, which can be done in
  9 |  * multiple threads
 10 |  */
 11 | #include "Loader.h"
 12 | #include "Datatype.h"
 13 | 
 14 | static uint64_t num_overflow = 0;
 15 | std::string next_histogram(std::string cur_type)
 16 | {
 17 | 	if (cur_type == "uint8_t") {
 18 | 		return "uint16_t";
 19 | 	} else if (cur_type == "uint16_t") {
 20 | 		return "uint32_t";
 21 | 	} else {
 22 | 		return "uint64_t";
 23 | 	}
 24 | }
 25 | 
 26 | template<class T>
 27 | std::string Loader<T>::get_warning()
 28 | {
 29 | 	if (num_overflow == 0) {
 30 | 		return "";
 31 | 	} else {
 32 | 		std::ostringstream oss;
 33 | 		oss << "For " << num_overflow << " sequences, the histogram type " << Datatype::get() << " was too small for holding sequences." << endl;
 34 | 		oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Datatype::get())  << ")" << endl;
 35 | 		return oss.str();
 36 | 	}
 37 | }
 38 | 
 39 | 
 40 | 
 41 | template<class V>
 42 | void Loader<V>::fill_table(KmerHashTable<unsigned long, V> &table, ChromosomeOneDigit *chrom, std::vector<V>& values)
 43 | {
 44 | 	const int k = table.getK();
 45 | 	auto segment = chrom->getSegment();
 46 | 	const char *seg_bases = chrom->getBase()->c_str();
 47 | 	for (vector<int> *v : *segment) {
 48 | 		int start = v->at(0);
 49 | 		int end = v->at(1);
 50 | 
 51 | 		// Hani Z Girgis added the following line
 52 | 		// It is possible
 53 | 		if(end - start + 1 >= k){
 54 | 			int r = table.wholesaleIncrementNoOverflow(seg_bases, start, end - k + 1);
 55 | 			if (r == -1) {
 56 | 				num_overflow++;
 57 | 				// #pragma omp critical
 58 | 				// {
 59 | 				// 	std::ostringstream oss;
 60 | 				// 	oss << "In header \"" << chrom->getHeader() << "\"" << endl;
 61 | 				// 	oss << "Histogram type " << Runner::get_datatype() << " is too small for holding sequences." << endl;
 62 | 				// 	oss << "Performance may be slightly hindered, but can be improved by increasing the integral type (--datatype " << next_histogram(Runner::get_datatype())  << ")" << endl;
 63 | 				// 	_loader_warning = oss.str();
 64 | 				// 	cerr << get_warning() << endl;
 65 | 				// }
 66 | 			}
 67 | 		}
 68 | 	}
 69 | 	std::string header = chrom->getHeader();
 70 | 	header = header.substr(1, header.find(' ')-1);
 71 | 	// Hani Z. Girgis added the following lines on 10/3/2018
 72 | 	// This should result in significant speed up.
 73 | 	unsigned long tableSize = table.getMaxTableSize();
 74 | 	values.reserve(values.size() + tableSize);
 75 | 	const V * valueArray = table.getValues();
 76 | 
 77 | 	copy(&valueArray[0], &valueArray[tableSize], back_inserter(values));
 78 | 
 79 |     // Commented out by Hani Z. Girgis on 10/3/2018 and replaced by the code above
 80 | 	// std::vector<std::string> *keys = table.getKeys();
 81 | 	// for (std::string str : *keys) {
 82 | 	// 	values.push_back(table.valueOf(str.c_str()));
 83 | 	// }
 84 | 	// keys->clear();
 85 | 	// delete keys;
 86 | }
 87 | 
 88 | template<class T>
 89 | bool Loader<T>::done() const
 90 | {
 91 | 	return file_idx == files.size();
 92 | }
 93 | 
 94 | template<class T>
 95 | void Loader<T>::preload(int tid)
 96 | {
 97 | 	if (file_idx == files.size()) {
 98 | 		return;
 99 | 	}
100 | 	for (uint64_t j = 0; j < chunk_size; j++) {
101 | 		auto chrom = next();
102 | 		if (chrom.first == "") {
103 | 			return;
104 | 		}
105 | 		cache_list.at(tid).emplace_back(chrom.first, chrom.second);
106 | 	}
107 | }
108 | 
109 | 
110 | // Modified by Hani Z. Girgis on Oct 2, 2018
111 | template<class T>
112 | Point<T>* Loader<T>::get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq)
113 | {
114 | 	ostringstream obase;
115 | 	for (int i = 0; i < base.length(); i++) {
116 | 		if (base[i] == 'A' || base[i] == 'C' ||
117 | 		    base[i] == 'G' || base[i] == 'T') {
118 | 			obase << base[i];
119 | 		}
120 | 	}
121 | 	ChromosomeOneDigit * chrom;
122 | 	if(Util::isDna){
123 | 		chrom = new ChromosomeOneDigitDna();
124 | 	}else{
125 | 		chrom = new ChromosomeOneDigitProtein();
126 | 	}
127 | 
128 | 	chrom->setHeader(header);
129 | 	chrom->appendToSequence(obase.str());
130 | 	chrom->finalize();
131 | 	Point<T> *p = Loader<T>::get_point(chrom, id, k, set_seq);
132 | 	delete chrom;
133 | 	return p;
134 | }
135 | 
136 | // Modified by Hani Z. Girgis on Oct 2, 2018
137 | template<class T>
138 | Point<T>* Loader<T>::get_point(ChromosomeOneDigit* chrom, uintmax_t& id, int k, bool set_seq)
139 | {
140 | 
141 | 	KmerHashTable<unsigned long, T> table(k, 1);
142 | 	// Hani Z. Girgis changed the following line
143 | 	// The table_k1 was initialized from 0 now it is 1
144 | 	KmerHashTable<unsigned long, uint64_t> table_k1(1, 1);
145 | 	std::vector<T> values;
146 | 	vector<uint64_t> values_k1;
147 | 	// values.clear();
148 | 
149 | 	Loader<T>::fill_table(table, chrom, values);
150 | 	Loader<uint64_t>::fill_table(table_k1, chrom, values_k1);
151 | //	int tmplate = get_template(chrom->getHeader(), templates);
152 | 	Point<T> *p = new DivergencePoint<T>(values, chrom->size());
153 | //	cout << "mag: " << ((DivergencePoint<T>*)p)->getPseudoMagnitude() << std::endl;
154 | 	p->set_1mers(values_k1);
155 | 	p->set_header(chrom->getHeader());
156 | 	p->set_length(chrom->getEffectiveSize());
157 | 	if (set_seq) {
158 | 		p->set_data_str(*chrom->getBase());
159 | 	}
160 | 	// Added by Hani Z. Girgis on Oct 7 2018
161 | 	p->setK(k);
162 | 	DivergencePoint<T>* q = dynamic_cast<DivergencePoint<T>*>(p);
163 | 	const auto N = q->points.size();
164 | 	double aq = (double) q->getPseudoMagnitude() / N;
165 | 	double sq = 0;
166 | 	for (auto i = 0; i < N; i++) {
167 | 		double qdiff = q->points[i] - aq;
168 | 		sq += qdiff * qdiff;
169 | 	}
170 | 	sq = sqrt(sq / N);
171 | 	q->set_stddev(sq);
172 | 	p->set_id(id);
173 | 	#pragma omp atomic
174 | 	id++;
175 | 
176 | 	// Clean
177 | 
178 | 	return p;
179 | }
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | template<class T>
188 | std::vector<Point<T>*> Loader<T>::load_next(int tid)
189 | {
190 | 	std::vector<Point<T>*> points;
191 | 	for (size_t i = 0; i < cache_list.at(tid).size(); i++) {
192 | 	        auto pr = cache_list.at(tid).at(i);
193 | 		Point<T>* p = get_point(pr.first, *pr.second, id_list.at(tid), k);
194 | 		points.push_back(p);
195 | 		delete pr.second;
196 | 	}
197 | 	cache_list.at(tid).clear();
198 | 	return points;
199 | }
200 | 
201 | template<class T>
202 | std::pair<std::string,std::string*> Loader<T>::next()
203 | {
204 | 	auto n = maker->next();
205 | 	if (n.first != "") {
206 | 		return n;
207 | 	}
208 | 	delete maker;
209 | 	maker = NULL;
210 | 	file_idx++;
211 | 	if (file_idx >= files.size()) {
212 | 		return n;
213 | 	}
214 | 	maker = new SingleFileLoader(files.at(file_idx));
215 | 	return maker->next();
216 | }
217 | 
// Explicit instantiations of Loader for the element types listed
// below; the member templates above are defined in this file.
template class Loader<double>;
template class Loader<int>;
template class Loader<uint64_t>;
template class Loader<uint32_t>;
template class Loader<uint16_t>;
template class Loader<uint8_t>;
224 | 


--------------------------------------------------------------------------------
/src/clutil/Loader.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Loader.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Class which can 'preload' chunks of sequences from a file list,
 8 |  * and then count the k-mers separately, which can be done in
 9 |  * multiple threads
10 |  */
11 | #ifndef LOADER_H
12 | #define LOADER_H
13 | 
14 | #include <omp.h>
15 | 
16 | #include "SingleFileLoader.h"
17 | #include "Point.h"
18 | #include "DivergencePoint.h"
19 | #include "../nonltr/KmerHashTable.h"
20 | // Add by Hani Z. Girgis, PhD on Oct 2, 2018
21 | #include "../nonltr/ChromosomeOneDigit.h"
22 | #include "../nonltr/ChromosomeOneDigitDna.h"
23 | #include "../nonltr/ChromosomeOneDigitProtein.h"
24 | 
25 | 
26 | 
27 | template<class T>
28 | class Loader {
29 | public:
30 | 	Loader(std::vector<std::string> files_,
31 | 	       uint64_t total_num_points_,
32 | 	       uint64_t chunk_size_,
33 | 	       int num_threads_,
34 | 	       int k_,
35 | 	       uint64_t start_id=0)
36 | 		:
37 | 		chunk_size(chunk_size_),
38 | 		num_threads(num_threads_),
39 | 		k(k_),
40 | 		files(files_) {
41 | 
42 | 		maker = new SingleFileLoader(files.at(0));
43 | 		uint64_t total_id = start_id;
44 | 		for (int i = 0; i < num_threads_; i++) {
45 | 			id_list.push_back(total_id);
46 | 			total_id += total_num_points_;
47 | 			cache_list.push_back(std::vector<std::pair<std::string,std::string*> >());
48 | 		}
49 | //		preload();
50 | 	};
51 | 
52 | 	~Loader() {
53 | 		if (get_warning() != "") {
54 | 			cerr << get_warning() << endl;
55 | 		}
56 | 		cache_list.clear();
57 | 		id_list.clear();
58 | 		if (maker != NULL) {
59 | 			delete maker;
60 | 		}
61 | 	}
62 | 
63 | 	// single threaded
64 | 	void preload(int tnum);
65 | 
66 | 	bool done() const;
67 | 	// multi-thread accessible
68 | 	std::vector<Point<T>*> load_next(int tid);
69 | 
70 | 	static Point<T>* get_point(std::string header, const std::string &base, uintmax_t& id, int k, bool set_seq=true);
71 | 	static Point<T>* get_point(ChromosomeOneDigit* dna, uintmax_t& id, int k, bool set_seq=true);
72 | 
73 | 	static void fill_table(KmerHashTable<unsigned long, T> &table, ChromosomeOneDigit *chrom, std::vector<T>& values);
74 | 	static std::string get_warning();
75 | private:
76 | 
77 | 	std::pair<std::string,std::string*> next();
78 | 
79 | 	uint64_t chunk_size;
80 | 	int num_threads, k;
81 | 
82 | 	std::vector<std::vector<std::pair<std::string,std::string*> > > cache_list;
83 | 	std::vector<uint64_t> id_list;
84 | 
85 | 	std::vector<std::string> files;
86 | 	size_t file_idx = 0;
87 | 	SingleFileLoader *maker = NULL;
88 | 
89 | };
90 | 
91 | #endif
92 | 


--------------------------------------------------------------------------------
/src/clutil/Point.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Point.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * For some reason this class was made pure virtual
 8 |  * in early development of MeShClust, making Histogram
 9 |  * and DivergencePoint both derivatives that essentially
10 |  * did the same thing
11 |  */
12 | #ifndef POINT_H
13 | #define POINT_H
14 | 
15 | #include <string>
16 | #include "../nonltr/ChromosomeOneDigit.h"
17 | 
18 | /*
19 |  * Pure virtual class that defines behavior for
20 |  * points. Has clone() and create() that allow for
21 |  * polymorphic behavior
22 |  */
23 | template<class T>
24 | class Point {
25 | public:
26 | 	virtual ~Point() { data.clear(); };
27 | 	virtual void operator*=(double d) = 0;
28 | 	virtual void operator/=(double d) = 0;
29 | 	virtual bool operator<(Point<T>& p) const = 0;
30 | 	virtual uint64_t operator-(const Point<T>& p) const = 0;
31 | 	virtual void operator+=(Point<T>& p) = 0;
32 | 	virtual void set(Point<T>& p) = 0;
33 | 	virtual void display() const = 0;
34 | 	virtual uint64_t distance(const Point<T>& p) const = 0;
35 | 	virtual double distance_d(Point<double>& p) const = 0;
36 | 	virtual Point* clone() const = 0;
37 | 	virtual Point* create() const = 0;
38 | 
39 | 	virtual void zero() = 0;
40 | 	virtual void addOne() = 0;
41 | 	virtual double distance_k1(const Point<T>& p) const = 0;
42 | 	virtual double prob_under(Point<T>& center) const = 0;
43 | 	virtual void subOne() = 0;
44 | 	virtual uint64_t getRealMagnitude() const = 0;
45 | //	virtual T magnitude() const = 0;
46 | 	virtual bool is_to_delete() const = 0;
47 | 	virtual void set_to_delete(bool b) = 0;
48 | 
49 | 	virtual Point<double>* create_double() const = 0;
50 | 	virtual void set_arg_to_this_d(Point<double>& p) const = 0;
51 | 
52 | 	virtual const vector<T>& get_data() const = 0;
53 | 
54 | 	void set_header(const std::string c) { header = string(c); };
55 | 	const std::string get_header() const { return header; };
56 | 
57 | 	void set_data_str(const std::string& c) { data = c; };
58 | 	const std::string & get_data_str() const { return data; };
59 | 
60 | 	void set_1mers(const vector<uint64_t> &vec) {
61 | 		// for (auto i = 0; i < Util::getAlphabetSize(); i++) {
62 | 		// 	one_mers[i] = vec[i];
63 | 		// }
64 | 		one_mers = vector<uint64_t>(vec);
65 | 	}
66 | 
67 | 	vector<uint64_t> get_1mers() const {
68 | 		// vector<uint64_t> vec;
69 | 		// for (auto i = 0; i < Util::getAlphabetSize(); i++) {
70 | 		// 	vec.push_back(one_mers[i]);
71 | 		// }
72 | 		// return vec;
73 | 		return one_mers;
74 | 	}
75 | 	virtual unsigned long size() const = 0;
76 | 	virtual void set_id(uintmax_t c_id) = 0;//{ id = c_id; };
77 | 	virtual const uintmax_t get_id() const = 0;//{ return id; };
78 | 	virtual void set_length(unsigned long len) = 0;
79 | 	virtual unsigned long get_length() const = 0;
80 | 
81 | 	// Added by Hani Z. Girgis on Oct 7 2018
82 | 	int getK(){
83 | 		return k;
84 | 	}
85 | 	void setK(int k){
86 | 		this->k = k;
87 | 	}
88 | 
89 | private:
90 | 	vector<uint64_t> one_mers;
91 |     std::string header;
92 | 	std::string data;
93 | 	// Added by Hani Z. Girgis on Oct 7 2018
94 | 	// The k in k-mer used to build the table
95 | 	int k;
96 | };
97 | 
98 | #endif
99 | 


--------------------------------------------------------------------------------
/src/clutil/Progress.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Progress.cpp
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Progress bar that uses carriage return '\r'
 8 |  * to seek to the beginning of a line to redraw
 9 |  */
10 | #include "Progress.h"
11 | #include <iostream>
12 | Progress::Progress(long num, std::string prefix_)
13 | {
14 | 	pmax = num;
15 | 	ended = 0;
16 | 	pcur = 0;
17 | 	old_prog = -1;
18 | 	prefix = prefix_;
19 | 	barWidth = 70 - (prefix.size()+1);
20 | 	print();
21 | }
22 | 
23 | void Progress::print()
24 | {
25 | 	#ifndef NOPROG
26 | 	double prog = (double)pcur / pmax;
27 | 	if (old_prog != int(prog * 100)) {
28 | 		std::cout << prefix << " [";
29 | 		int pos = barWidth * prog;
30 | 		for (int i = 0; i < barWidth; i++) {
31 | 			if (i < pos) {
32 | 				std::cout << "=";
33 | 			} else if (i == pos) {
34 | 				std::cout << ">";
35 | 			} else {
36 | 				std::cout << " ";
37 | 			}
38 | 		}
39 | 		std::cout << "] " << int(prog * 100.0) << " %\r";
40 | 		std::cout.flush();
41 | 	}
42 | 	old_prog = int(prog * 100);
43 | 	#endif
44 | }
45 | 
46 | void Progress::end()
47 | {
48 | 	if (!ended) {
49 | 		pcur = pmax;
50 | 		print();
51 | 		std::cout << std::endl;
52 | 	}
53 | 	ended = true;
54 | }
55 | 
56 | 
57 | void Progress::set(int num)
58 | {
59 | 	pcur = num;
60 | 	print();
61 | }
62 | 
63 | void Progress::operator++()
64 | {
65 | 	pcur++;
66 | 	print();
67 | }
68 | void Progress::operator++(int)
69 | {
70 | 	print();
71 | 	pcur++;
72 | }
73 | 
74 | 
75 | void Progress::operator+=(size_t num)
76 | {
77 | 	pcur += num;
78 | 	print();
79 | }
80 | 


--------------------------------------------------------------------------------
/src/clutil/Progress.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Progress.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Progress bar that uses carriage return '\r'
 8 |  * to seek to the beginning of a line to redraw
 9 |  *
10 |  */
11 | #include <iostream>
12 | #ifndef PROGRESS_H
13 | #define PROGRESS_H
14 | 
// Text progress bar written to standard output.
class Progress {
public:
	// num: value of the counter that corresponds to 100 %
	// prefix_: label printed in front of the bar
	Progress(long num, std::string prefix_);
	// Finishes the bar if end() was not called explicitly
	~Progress() { end(); }
	// Moves the counter to the maximum, draws once more and ends the line
	void end();
	// Increment by one, then redraw
	void operator++();
	// Redraw, then increment by one
	void operator++(int);
	// Increment by the given amount, then redraw
	void operator+=(size_t);
	// Set the counter to the given value, then redraw
	void set(int);
private:
	// Draws the bar when the integer percentage changed
	void print();
	long pmax;        // counter value meaning "complete"
	long pcur;        // current counter value
	long old_prog;    // percentage shown by the last draw (-1 before the first)
	bool ended;       // true once end() ran
	std::string prefix;
	int barWidth;     // number of characters between '[' and ']'
};
33 | #endif
34 | 


--------------------------------------------------------------------------------
/src/clutil/Random.h:
--------------------------------------------------------------------------------
 1 | // -*- C++ -*-
 2 | /*
 3 |  * Random.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef RANDOM_H
 9 | #define RANDOM_H
#include <algorithm>
#include <iostream>
#include <limits>
#include <random>
12 | class Random {
13 | public:
14 | 	Random(std::random_device::result_type seed=0xAA) : mt(seed) {}
15 | 
16 | 	template<class T>
17 | 	T randMod(T max) {
18 | 		T res;
19 | #pragma omp critical
20 | 		{
21 | 			if (max == 0) {
22 | 				res = 0;
23 | 			} else {
24 | 				std::uniform_int_distribution<T> distribution(0, max-1);
25 | 				res = distribution(mt);
26 | 			}
27 | 		}
28 | 		return res;
29 | 	}
30 | 
31 | 	double random() {
32 | 		double res = 0;
33 | 		#pragma omp critical
34 | 		{
35 | 		std::uniform_real_distribution<double> distribution(0.0, 1.0);
36 | 		res = distribution(mt);
37 | 		}
38 | 		return res;
39 | 	}
40 | 	double rand_between(double id, double range, double low, double high) {
41 | 		double res = 0;
42 | 		#pragma omp critical
43 | 		{
44 | 		double mn = std::max(id - range, low);
45 | 		double mx = std::min(id + range, high);
46 | 		std::uniform_real_distribution<double> distribution(mn, mx);
47 | 
48 | 		res = distribution(mt);
49 | 		}
50 | 		return res;
51 | 	}
52 | 	std::random_device::result_type nextRandSeed() {
53 | 		using rt = std::random_device::result_type;
54 | 		return randMod<rt>(std::numeric_limits<rt>::max());
55 | 	}
56 | 	std::mt19937& gen() { return mt; }
57 | private:
58 |         std::mt19937 mt;
59 | 
60 | };
61 | #endif
62 | 


--------------------------------------------------------------------------------
/src/clutil/SingleFileLoader.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*-
  2 |  *
  3 |  * SingleFileLoader.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  *
  7 |  * Reads sequences one by one from a file
  8 |  */
  9 | #include "SingleFileLoader.h"
 10 | #include <sstream>
 11 | #include <iostream>
 12 | 
 13 | std::istream& safe_getline(std::istream& is, std::string& t)
 14 | {
 15 | 	t.clear();
 16 | 	std::istream::sentry se(is, true);
 17 | 	std::streambuf* sb = is.rdbuf();
 18 | 	for(;;) {
 19 | 		int c = sb->sbumpc();
 20 | 		switch (c) {
 21 | 		case '\n':
 22 | 			return is;
 23 | 		case '\r':
 24 | 			if (sb->sgetc() == '\n') {
 25 | 				sb->sbumpc();
 26 | 			}
 27 | 			return is;
 28 | 		case std::streambuf::traits_type::eof():
 29 | 			if (t.empty()) {
 30 | 				is.setstate(std::ios::eofbit);
 31 | 			}
 32 | 			return is;
 33 | 		default:
 34 | 			t += (char)c;
 35 | 		}
 36 | 	}
 37 | }
 38 | 
 39 | 
 40 | SingleFileLoader::SingleFileLoader(std::string filename)
 41 | {
 42 | 	in = new std::ifstream(filename);
 43 | 	is_first = true;
 44 | }
 45 | std::pair<std::string, std::string*> SingleFileLoader::next()
 46 | {
 47 | 	std::pair<std::string,std::string*> ret = std::make_pair("", (std::string*)NULL);
 48 | 	if (!in->good()) {
 49 | 		return ret;
 50 | 	}
 51 | 	clock_t begin = clock();
 52 | 	ret.second = new std::string("");
 53 | 	if (is_first) {
 54 | 		safe_getline(*in, buffer);
 55 | 		is_first = false;
 56 | 	}
 57 | 	do {
 58 | 		if (buffer[0] == '>') {
 59 | 			if (ret.first != "")  {
 60 | 				return ret;
 61 | 			}
 62 | 			ret.first = buffer;
 63 | 		} else if (buffer[0] == ' ' || buffer[0] == '\t') {
 64 | 			bool all_spaces = true;
 65 | 			for (auto c : buffer) {
 66 | 				if (c != ' ' && c != '\t') {
 67 | 					all_spaces = false;
 68 | 				}
 69 | 			}
 70 | 			if (!all_spaces) {
 71 | 				std::ostringstream oss;
 72 | 				oss << ret.first << buffer;
 73 | 				std::string new_header = oss.str();
 74 | 				ret.first = new_header;
 75 | 			}
 76 | 		} else {
 77 | 			ret.second->append(buffer);
 78 | 		}
 79 | 		safe_getline(*in, buffer);
 80 | 	} while (in->good());
 81 | 	double diff = clock() - begin;
 82 | //	std::cout << "next(): " << diff / CLOCKS_PER_SEC << std::endl;
 83 | 	return ret;
 84 | }
 85 | ChromosomeOneDigitDna* SingleFileLoader::nextChrom()
 86 | {
 87 | 	ChromosomeOneDigitDna* ret = NULL;
 88 | 	if (!in->good()) {
 89 | 		return ret;
 90 | 	}
 91 | 	if (is_first) {
 92 | 		safe_getline(*in, buffer);
 93 | 		is_first = false;
 94 | 	}
 95 | 	do {
 96 | 		if (buffer[0] == '>') {
 97 | 			if (ret != NULL)  {
 98 | 				ret->finalize();
 99 | 				return ret;
100 | 			}
101 | 			ret = new ChromosomeOneDigitDna();
102 | 			ret->setHeader(buffer);
103 | 		} else if (buffer[0] == ' ' || buffer[0] == '\t') {
104 | 			bool all_spaces = true;
105 | 			for (auto c : buffer) {
106 | 				if (c != ' ' && c != '\t') {
107 | 					all_spaces = false;
108 | 				}
109 | 			}
110 | 			if (!all_spaces) {
111 | 				std::ostringstream oss;
112 | 				oss << ret->getHeader() << buffer;
113 | 				std::string new_header = oss.str();
114 | 				ret->setHeader(new_header);
115 | 			}
116 | 		} else {
117 | 			ret->appendToSequence(buffer);
118 | 		}
119 | 		safe_getline(*in, buffer);
120 | 	} while (in->good());
121 | 	ret->finalize();
122 | 	return ret;
123 | }
124 | 


--------------------------------------------------------------------------------
/src/clutil/SingleFileLoader.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * SingleFileLoader.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * A way of reading in 1 sequence at a time
 8 |  * from FASTA, sequence is heap allocated
 9 |  */
10 | #ifndef SINGLEFILELOADER_H
11 | #define SINGLEFILELOADER_H
12 | 
13 | #include <fstream>
14 | #include "../nonltr/ChromosomeOneDigitDna.h"
15 | class SingleFileLoader {
16 | public:
17 | 	SingleFileLoader(std::string file);
18 | 	~SingleFileLoader() {
19 | 		if (in != NULL) {
20 | 			delete in;
21 | 		}
22 | 	}
23 | 	std::pair<std::string,std::string*> next();
24 | 	ChromosomeOneDigitDna* nextChrom();
25 | private:
26 | 	std::ifstream *in;
27 | 	std::string buffer;
28 | 	bool is_first;
29 | };
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/exception/FileDoesNotExistException.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * FileDoesNotExistException.cpp
 3 |  *
 4 |  *  Created on: Apr 30, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "FileDoesNotExistException.h"
 9 | 
10 | #include <iostream>
11 | #include <string>
12 | 
13 | using namespace std;
14 | 
15 | namespace exception{
16 | 
17 | FileDoesNotExistException::FileDoesNotExistException(string massage) {
18 | 	cerr << "File Does Not Exist Exception" << endl;
19 | 	cerr << massage << endl;
20 | }
21 | 
22 | FileDoesNotExistException::~FileDoesNotExistException() {
23 | 	// TODO Auto-generated destructor stub
24 | }
25 | }
26 | 


--------------------------------------------------------------------------------
/src/exception/FileDoesNotExistException.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * FileDoesNotExistException.h
 3 |  *
 4 |  *  Created on: Apr 30, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef FILEDOESNOTEXISTEXCEPTION_H_
 9 | #define FILEDOESNOTEXISTEXCEPTION_H_
10 | 
11 | #include <string>
12 | 
13 | using namespace std;
14 | 
namespace exception {
	// Exception type constructed from a message string.
	class FileDoesNotExistException {
	public:
		FileDoesNotExistException(std::string);
		~FileDoesNotExistException();
	};
}
22 | 
23 | #endif /* FILEDOESNOTEXISTEXCEPTION_H_ */
24 | 


--------------------------------------------------------------------------------
/src/exception/InvalidInputException.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidInputException.cpp
 3 |  *
 4 |  *  Created on: May 1, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "InvalidInputException.h"
 9 | 
10 | #include <string>
11 | #include <iostream>
12 | 
13 | using namespace std;
14 | namespace exception{
15 | 
16 | InvalidInputException::InvalidInputException(string msg) {
17 | 	cerr << "Invalid Input Exception" << endl;
18 | 	cerr << msg << endl;
19 | }
20 | 
21 | InvalidInputException::~InvalidInputException() {
22 | 	// TODO Auto-generated destructor stub
23 | }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/exception/InvalidInputException.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidInputException.h
 3 |  *
 4 |  *  Created on: May 1, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef INVALIDINPUTEXCEPTION_H_
 9 | #define INVALIDINPUTEXCEPTION_H_
10 | 
11 | #include<string>
12 | 
13 | using namespace std;
14 | 
namespace exception {
	// Exception type constructed from a message string.
	class InvalidInputException {
	public:
		InvalidInputException(std::string);
		~InvalidInputException();
	};
}
22 | 
23 | #endif /* INVALIDINPUTEXCEPTION_H_ */
24 | 


--------------------------------------------------------------------------------
/src/exception/InvalidOperationException.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidOperationException.cpp
 3 |  *
 4 |  *  Created on: Dec 20, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include <iostream>
 9 | #include "InvalidOperationException.h"
10 | 
11 | 
12 | namespace exception {
13 | 
14 | InvalidOperationException::InvalidOperationException(string msg) : std::runtime_error(msg) {
15 | 	cerr << "Invalid Operation Exception." << endl;
16 | 	cerr << what() << endl;
17 | }
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/src/exception/InvalidOperationException.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidOperationException.h
 3 |  *
 4 |  *  Created on: Dec 20, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef INVALIDOPERATIONEXCEPTION_H_
 9 | #define INVALIDOPERATIONEXCEPTION_H_
10 | 
11 | #include <string>
12 | #include <stdexcept>
13 | 
14 | using namespace std;
15 | 
namespace exception {

// Runtime error constructed from a message string.
class InvalidOperationException : public std::runtime_error {
public:
	InvalidOperationException(std::string msg);
};

}
25 | 
26 | #endif /* INVALIDOPERATIONEXCEPTION_H_ */
27 | 


--------------------------------------------------------------------------------
/src/exception/InvalidOrderOfOperationsException.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidOrderOfOperationsException.cpp
 3 |  *
 4 |  *  Created on: Apr 26, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "InvalidOrderOfOperationsException.h"
 9 | 
10 | #include <string>
11 | #include <iostream>
12 | 
13 | using namespace std;
14 | namespace exception{
15 | 
16 | InvalidOrderOfOperationsException::InvalidOrderOfOperationsException(string massage) {
17 | 	cerr << "Invalid Order Of Operations Exception" << endl;
18 | 	cerr << massage << endl;
19 | }
20 | 
21 | InvalidOrderOfOperationsException::~InvalidOrderOfOperationsException() {
22 | 	// TODO Auto-generated destructor stub
23 | }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/exception/InvalidOrderOfOperationsException.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidOrderOfOperationsException.h
 3 |  *
 4 |  *  Created on: Apr 26, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef INVALIDORDEROFOPERATIONSEXCEPTION_H_
 9 | #define INVALIDORDEROFOPERATIONSEXCEPTION_H_
10 | 
11 | #include <string>
12 | 
13 | using namespace std;
14 | 
namespace exception{
	// Exception type constructed from a message string.
	class InvalidOrderOfOperationsException {
	public:
		InvalidOrderOfOperationsException(std::string);
		~InvalidOrderOfOperationsException();
	};
}
22 | 
23 | #endif /* INVALIDORDEROFOPERATIONSEXCEPTION_H_ */
24 | 


--------------------------------------------------------------------------------
/src/exception/InvalidScoreException.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidScoreException.cpp
 3 |  *
 4 |  *  Created on: Apr 27, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "InvalidScoreException.h"
 9 | 
10 | #include <string>
11 | #include <iostream>
12 | 
13 | using namespace std;
14 | namespace exception{
15 | 
16 | InvalidScoreException::InvalidScoreException(string massage) {
17 | 	cerr << "Invalid Score Exception." << endl;
18 | 	cerr << massage << endl;
19 | }
20 | 
21 | InvalidScoreException::~InvalidScoreException() {
22 | 	// TODO Auto-generated destructor stub
23 | }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/exception/InvalidScoreException.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidScoreException.h
 3 |  *
 4 |  *  Created on: Apr 27, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef INVALIDSCOREEXCEPTION_H_
 9 | #define INVALIDSCOREEXCEPTION_H_
10 | 
11 | #include <string>
12 | 
13 | using namespace std;
14 | 
namespace exception{
	// Exception type constructed from a message string.
	class InvalidScoreException {
	public:
		InvalidScoreException(std::string);
		virtual ~InvalidScoreException();
	};
}
22 | 
23 | #endif /* INVALIDSCOREEXCEPTION_H_ */
24 | 


--------------------------------------------------------------------------------
/src/exception/InvalidStateException.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidStateException.cpp
 3 |  *
 4 |  *  Created on: Aug 9, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include <iostream>
 9 | #include <string>
10 | #include "InvalidStateException.h"
11 | 
12 | using namespace std;
13 | 
14 | 
15 | namespace exception {
16 | InvalidStateException::InvalidStateException(string msg) :
17 | 		std::runtime_error(msg) {
18 | 	cerr << "Invalid State Exception." << endl;
19 | 	cerr << what() << endl;
20 | }
21 | }
22 | 
23 | //InvalidStateException::~InvalidStateException() {
24 | // TODO Auto-generated destructor stub
25 | //}
26 | 


--------------------------------------------------------------------------------
/src/exception/InvalidStateException.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * InvalidStateException.h
 3 |  *
 4 |  *  Created on: Aug 9, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef INVALIDSTATEEXCEPTION_H_
 9 | #define INVALIDSTATEEXCEPTION_H_
10 | 
11 | #include <string>
12 | #include <stdexcept>
13 | 
14 | using namespace std;
15 | 
namespace exception {
	// Runtime error constructed from a message string.
	class InvalidStateException : public std::runtime_error {
	public:
		InvalidStateException(std::string);
	};
}
22 | 
23 | #endif /* INVALIDSTATEEXCEPTION_H_ */
24 | 


--------------------------------------------------------------------------------
/src/fastcar/FC_Runner.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Runner.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Runner class, sets default params
 8 |  * and runs program
 9 |  */
10 | #ifndef FC_RUNNER_H
11 | #define FC_RUNNER_H
12 | 
13 | #include <iostream>
14 | #include <map>
15 | #include <set>
16 | #include "../clutil/Point.h"
17 | #include "../predict/Predictor.h"
18 | #include "../predict/HandleSeq.h"
19 | #include "../nonltr/ChromosomeOneDigitDna.h"
20 | using namespace std;
21 | 
22 | class Runner {
23 | public:
24 | 	Runner(int argc, char** argv);
25 | 	~Runner() { indices.clear(); files.clear(); qfiles.clear(); if (pred64) {delete pred64;}};
26 | 	int run();
27 | private:
28 | 	void usage(std::string progname) const;
29 | 	template<class T> int do_run(std::vector<ChromosomeOneDigit*> &sequences);
30 | 	template<class T> void print_output(const map<Point<T>*, vector<Point<T>*>*> &m) const;
31 | 	int k = -1;
32 |         int bandwidth;
33 | 	double similarity = -1;
34 | 	long largest_count = 0;
35 | 	bool align = false;
36 | 	bool recover = false;
37 | 	int sample_size = 300;
38 | 	int mut_type = HandleSeq::SINGLE;
39 | 	uint8_t mode = 0;
40 | 	uint64_t feats = 0;
41 | 	uint64_t chunk_size = 10000;
42 | 	std::vector<std::string> files, qfiles;
43 | 	std::vector<size_t> indices;
44 | 	bool dump = false;
45 | 	bool format = true;
46 | 	string output = "output.search";
47 | 	string dump_str = "weights.txt";
48 | 	void get_opts(int argc, char** argv);
49 | 	Predictor<uint64_t> *pred64 = NULL;
50 | 
51 | 
52 | };
53 | #endif
54 | 


--------------------------------------------------------------------------------
/src/fastcar/fastcar.cpp:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * main.cpp
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | #include "FC_Runner.h"
 8 | int main(int argc, char **argv)
 9 | {
10 | 	Runner runner(argc, argv);
11 | 	return runner.run();
12 | }
13 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromDetector.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromDetector.cpp
 3 |  *
 4 |  *  Created on: Nov 8, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include <vector>
 9 | 
10 | #include "ChromDetector.h"
11 | #include "Detector.h"
12 | #include "../utility/Util.h"
13 | 
14 | using namespace std;
15 | using namespace nonltr;
16 | using namespace utility;
17 | 
18 | ChromDetector::ChromDetector(double s, double w, double pDelta, double b,
19 | 		double mDelta, vector<int> * scores,
20 | 		const vector<vector<int> *> * segmentList) {
21 | 
22 | 	regions = new vector<vector<int> *>();
23 | 
24 | 	for (int i = 0; i < segmentList->size(); i++) {
25 | 		Detector * detector = new Detector(segmentList->at(i)->at(0),
26 | 				segmentList->at(i)->at(1), s, w, pDelta, b, mDelta, scores);
27 | 		vector<vector<int> *> * segRegions = detector->getRegions();
28 | 		regions->insert(regions->end(), segRegions->begin(), segRegions->end());
29 | 		delete detector;
30 | 	}
31 | }
32 | 
// Releases the region list: the elements are handed to
// Util::deleteInVector (presumably deleting each stored vector -
// confirm in Util), then the list is emptied and deleted.
ChromDetector::~ChromDetector() {
	Util::deleteInVector(regions);
	regions->clear();
	delete regions;
}
38 | 
39 | vector<vector<int> *> * ChromDetector::getRegions() {
40 | 	return regions;
41 | }
42 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromDetector.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromDetector.h
 3 |  *
 4 |  *  Created on: Nov 8, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef CHROMDETECTOR_H_
 9 | #define CHROMDETECTOR_H_
10 | 
11 | #include <vector>
12 | 
13 | using namespace std;
14 | 
namespace nonltr {

/**
 * Holds a list of regions, exposed through getRegions() and released by the
 * destructor.
 */
class ChromDetector {

public:
	// Five numeric parameters, a score vector, and the list of segments
	ChromDetector(double, double, double, double, double, std::vector<int> *,
			const std::vector<std::vector<int> *> *);
	virtual ~ChromDetector();

	// Access to the region list owned by this object
	std::vector<std::vector<int> *> * getRegions();

private:
	// Region list; each region is a separately allocated vector<int>
	std::vector<std::vector<int> *> * regions;
};

}
28 | 
29 | #endif /* CHROMDETECTOR_H_ */
30 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromDetectorMaxima.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromDetectorMaxima.cpp
 3 |  *
 4 |  *  Created on: Jun 6, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "ChromDetectorMaxima.h"
 9 | 
10 | namespace nonltr {
11 | 
12 | ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m,
13 | 		double t, double p, int e, vector<int> * oScores,
14 | 		ChromosomeOneDigit * chrom) {
15 | 	header = chrom->getHeader();
16 | 	start(s, w, m, t, p, e, oScores, chrom->getSegment());
17 | 
18 | }
19 | 
20 | ChromDetectorMaxima::ChromDetectorMaxima(double s, double w, double m,
21 | 		double t, double p, int e, vector<int> * oScores, const vector<vector<
22 | 				int> *> * segmentList) {
23 | 	header = string("chrUnknown");
24 | 	start(s, w, m, t, p, e, oScores, segmentList);
25 | }
26 | 
27 | void ChromDetectorMaxima::start(double s, double w, double m, double t,
28 | 		double p, int e, vector<int> * oScores,
29 | 		const vector<vector<int> *> * segmentList) {
30 | 
31 | 	regionList = new vector<ILocation *> ();
32 | 
33 | 	int segmentCount = segmentList->size();
34 | 	for (int i = 0; i < segmentCount; i++) {
35 | 		int segStart = segmentList->at(i)->at(0);
36 | 		int segEnd = segmentList->at(i)->at(1);
37 | 
38 | 		// The effective length is shorter than the actual length by 2w
39 | 		int effLen = 2 * w + 10;
40 | 		int segLen = segEnd - segStart + 1;
41 | 
42 | 		if (segLen > effLen) {
43 | 			DetectorMaxima * detector = new DetectorMaxima(segStart, segEnd, s,
44 | 					w, m, t, p, e, oScores);
45 | 
46 | 			const vector<ILocation *> * segRegions = detector->getRegionList();
47 | 			int segRegionCount = segRegions->size();
48 | 			for (int h = 0; h < segRegionCount; h++) {
49 | 				regionList->push_back(new Location(*(segRegions->at(h))));
50 | 			}
51 | 
52 | 			delete detector;
53 | 		} else {
54 | 			cout << "\tSkipping a short segment: ";
55 | 			cout << segStart << "-" << segEnd << endl;
56 | 		}
57 | 	}
58 | }
59 | 
60 | ChromDetectorMaxima::~ChromDetectorMaxima() {
61 | 	Util::deleteInVector(regionList);
62 | 	regionList->clear();
63 | 	delete regionList;
64 | }
65 | 
66 | void ChromDetectorMaxima::printIndex(string outputFile) {
67 | 	printIndex(outputFile, false);
68 | }
69 | 
70 | void ChromDetectorMaxima::printIndex(string outputFile, bool canAppend) {
71 | 	ofstream outIndex;
72 | 
73 | 	if (canAppend) {
74 | 		outIndex.open(outputFile.c_str(), ios::out | ios::app);
75 | 	} else {
76 | 		outIndex.open(outputFile.c_str(), ios::out);
77 | 	}
78 | 
79 | 	// Write the index of the repeat segment [x,y[
80 | 	for (int j = 0; j < regionList->size(); j++) {
81 | 		outIndex << header << ":";
82 | 		outIndex << ((int) (regionList->at(j)->getStart())) << "-";
83 | 		outIndex << ((int) (regionList->at(j)->getEnd() + 1)) << " ";
84 | 		outIndex << endl;
85 | 	}
86 | 
87 | 	outIndex.close();
88 | }
89 | 
90 | const vector<ILocation*>* ChromDetectorMaxima::getRegionList() const {
91 | 	return regionList;
92 | }
93 | 
94 | } /* namespace nonltr */
95 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromDetectorMaxima.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromDetectorMaxima.h
 3 |  *
 4 |  *  Created on: Jun 6, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef CHROMDETECTORMAXIMA_H_
 9 | #define CHROMDETECTORMAXIMA_H_
10 | 
11 | #include <fstream>
12 | #include <vector>
13 | 
14 | #include "ChromosomeOneDigit.h"
15 | #include "DetectorMaxima.h"
16 | 
17 | #include "../utility/Util.h"
18 | #include "../utility/ILocation.h"
19 | #include "../utility/Location.h"
20 | 
21 | using namespace std;
22 | using namespace utility;
23 | 
24 | namespace nonltr {
25 | 
26 | class ChromDetectorMaxima {
27 | private:
28 | 	vector<ILocation *> * regionList;
29 | 	string header;
30 | 
31 | 	void start(double, double, double, double, double, int, vector<int> *,
32 | 			const vector<vector<int> *> *);
33 | 
34 | public:
35 | 	ChromDetectorMaxima(double, double, double, double, double, int,
36 | 			vector<int> *, ChromosomeOneDigit *);
37 | 	ChromDetectorMaxima(double, double, double, double, double, int,
38 | 			vector<int> *, const vector<vector<int> *> *);
39 | 	virtual ~ChromDetectorMaxima();
40 | 	const vector<ILocation*>* getRegionList() const;
41 | 	void printIndex(string);
42 | 	void printIndex(string, bool);
43 | 
44 | };
45 | 
46 | } /* namespace nonltr */
47 | #endif /* CHROMDETECTORMAXIMA_H_ */
48 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromListMaker.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ChromListMaker.cpp
  3 |  *
  4 |  *  Created on: Mar 13, 2014
  5 |  *      Author: Hani Zakaria Girgis
  6 |  */
  7 | 
  8 | #include "ChromListMaker.h"
  9 | 
 10 | namespace nonltr {
 11 | 
 12 | ChromListMaker::ChromListMaker(string seqFileIn, bool is_oneseq_) {
 13 | 	seqFile = seqFileIn;
 14 | 	is_oneseq = is_oneseq_;
 15 | 	chromList = new vector<Chromosome *>();
 16 | }
 17 | 
 18 | ChromListMaker::~ChromListMaker() {
 19 | 	Util::deleteInVector(chromList);
 20 | 	delete chromList;
 21 | }
 22 | 
 23 | 
 24 | std::istream& safe_getline(std::istream& is, std::string& t)
 25 | {
 26 | 	t.clear();
 27 | 	std::istream::sentry se(is, true);
 28 | 	std::streambuf* sb = is.rdbuf();
 29 | 	for(;;) {
 30 | 		int c = sb->sbumpc();
 31 | 		switch (c) {
 32 | 		case '\n':
 33 | 			return is;
 34 | 		case '\r':
 35 | 			if (sb->sgetc() == '\n') {
 36 | 				sb->sbumpc();
 37 | 			}
 38 | 			return is;
 39 | 		case std::streambuf::traits_type::eof():
 40 | 			if (t.empty()) {
 41 | 				is.setstate(std::ios::eofbit);
 42 | 			}
 43 | 			return is;
 44 | 		default:
 45 | 			t += (char)c;
 46 | 		}
 47 | 	}
 48 | }
 49 | 
 50 | const vector<Chromosome *> * ChromListMaker::makeChromList() {
 51 | 	ifstream in(seqFile.c_str());
 52 | 	bool isFirst = true;
 53 | 	Chromosome * chrom;
 54 | 	vector<uint64_t> size_list = getSize();
 55 | 	uint64_t cur_seq = 0;
 56 | 	if (is_oneseq) {
 57 | 		uint64_t sum = 0;
 58 | 		for (uint64_t len : size_list) {
 59 | 			sum += len + 50;
 60 | 		}
 61 | 		size_list.clear();
 62 | 		size_list.push_back(sum);
 63 | 	}
 64 | 	while (in.good()) {
 65 | 		string line;
 66 | 		safe_getline(in, line);
 67 | 		if (line[0] == '>') {
 68 | 			if (!isFirst) {
 69 | 				if (is_oneseq) {
 70 | 					std::string interseq(50, 'N');
 71 | 					//	chrom->insert(interseq);
 72 | 					chrom->appendToSequence(interseq);
 73 | 				} else {
 74 | 					chrom->finalize();
 75 | 					chromList->push_back(chrom);
 76 | 					chrom = new Chromosome(size_list.at(cur_seq++));
 77 | 					chrom->setHeader(line);
 78 | 				}
 79 | 			} else {
 80 | 				isFirst = false;
 81 | 				chrom = new Chromosome(size_list.at(cur_seq++));
 82 | 				chrom->setHeader(line);
 83 | 			}
 84 | 		} else if (line[0] == ' ' || line[0] == '\t') {
 85 | 		} else {
 86 | 			//	chrom->insert(line);
 87 | 			chrom->appendToSequence(line);
 88 | 		}
 89 | 	}
 90 | 	chrom->finalize();
 91 | 	chromList->push_back(chrom);
 92 | 	in.close();
 93 | 
 94 | 	return chromList;
 95 | }
 96 | 
 97 | const vector<uint64_t> ChromListMaker::getSize() {
 98 | 	ifstream in(seqFile.c_str());
 99 | 	vector<uint64_t> size_list;
100 | 	uint64_t current_size = 0;
101 | 	while (in.good()) {
102 | 		string line;
103 | 		safe_getline(in, line);
104 | 		if (line[0] == '>') {
105 | 			if (current_size > 0) {
106 | 				size_list.push_back(current_size);
107 | 			}
108 | 			current_size = 0;
109 | 		} else if (line[0] == ' ' || line[0] == '\t') {
110 | 		} else {
111 | 			current_size += line.length();
112 | 		}
113 | 	}
114 | 	size_list.push_back(current_size);
115 | 	return size_list;
116 | }
117 | const vector<Chromosome *> * ChromListMaker::makeChromOneDigitDnaList() {
118 | 	ifstream in(seqFile.c_str());
119 | 	bool isFirst = true;
120 | 	ChromosomeOneDigitDna * chrom;
121 | 	vector<uint64_t> size_list = getSize();
122 | 	uint64_t cur_seq = 0;
123 | 	if (is_oneseq) {
124 | 		uint64_t sum = 0;
125 | 		for (uint64_t len : size_list) {
126 | 			sum += len + 50;
127 | 		}
128 | 		if (sum > 0) {
129 | 			sum -= 50;
130 | 		}
131 | 		size_list.clear();
132 | 		size_list.push_back(sum);
133 | 	}
134 | 	while (in.good()) {
135 | 		string line;
136 | 		safe_getline(in, line);
137 | 		if (line[0] == '>') {
138 | 			if (!isFirst) {
139 | 				if (is_oneseq) {
140 | 					std::string interseq(50, 'N');
141 | 					chrom->insert(interseq);
142 | 				} else {
143 | 					chrom->finalize();
144 | 					chromList->push_back(chrom);
145 | 					chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++));
146 | 					chrom->setHeader(line);
147 | 				}
148 | 			} else {
149 | 				isFirst = false;
150 | 				chrom = new ChromosomeOneDigitDna(size_list.at(cur_seq++));
151 | 				chrom->setHeader(line);
152 | 
153 | 			}
154 | 		} else if (line[0] == ' ' || line[0] == '\t') {
155 | 		} else {
156 | 			chrom->insert(line);
157 | //			chrom->appendToSequence(line);
158 | 		}
159 | 	}
160 | 	chrom->finalize();
161 | 	chromList->push_back(chrom);
162 | 	in.close();
163 | 
164 | 	return chromList;
165 | }
166 | 
167 | const vector<Chromosome *> * ChromListMaker::makeChromOneDigitProteinList() {
168 | 	ifstream in(seqFile.c_str());
169 | 	bool isFirst = true;
170 | 	ChromosomeOneDigitProtein * chrom;
171 | 
172 | 	while (in.good()) {
173 | 		string line;
174 | 		safe_getline(in, line);
175 | 		if (line[0] == '>') {
176 | 			if (!isFirst) {
177 | 				chrom->finalize();
178 | 				chromList->push_back(chrom);
179 | 			} else {
180 | 				isFirst = false;
181 | 			}
182 | 
183 | 			chrom = new ChromosomeOneDigitProtein();
184 | 			chrom->setHeader(line);
185 | 		} else {
186 | 			chrom->appendToSequence(line);
187 | 		}
188 | 	}
189 | 
190 | 	chrom->finalize();
191 | 	chromList->push_back(chrom);
192 | 	in.close();
193 | 
194 | 	return chromList;
195 | }
196 | 
197 | }
198 | /* namespace nonltr */
199 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromListMaker.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromListMaker.h
 3 |  *
 4 |  *   Created on: Mar 13, 2014
 5 |  *  Modified on: Oct 2, 2018
 6 |  *       Author: Hani Zakaria Girgis, PhD
 7 |  */
 8 | 
 9 | #ifndef CHROMLISTMAKER_H_
10 | #define CHROMLISTMAKER_H_
11 | 
12 | #include <string>
13 | #include <vector>
14 | 
15 | #include "Chromosome.h"
16 | #include "ChromosomeOneDigitDna.h"
17 | #include "ChromosomeOneDigitProtein.h"
18 | 
19 | #include "../utility/Util.h"
20 | 
21 | using namespace std;
22 | using namespace utility;
23 | 
24 | namespace nonltr {
25 | 
26 | class ChromListMaker {
27 | private:
28 | 	vector<Chromosome *> * chromList;
29 | 	string seqFile;
30 | 	bool is_oneseq;
31 | public:
32 | 	ChromListMaker(string, bool is_oneseq_=false);
33 | 	virtual ~ChromListMaker();
34 | 	const vector<uint64_t> getSize();
35 | 	const vector<Chromosome *> * makeChromList();
36 | 	const vector<Chromosome *> * makeChromOneDigitDnaList();
37 | 	const vector<Chromosome *> * makeChromOneDigitProteinList();
38 | 
39 | };
40 | 
41 | } /* namespace nonltr */
42 | #endif /* CHROMLISTMAKER_H_ */
43 | 


--------------------------------------------------------------------------------
/src/nonltr/Chromosome.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Chromosome.h
 3 |  *
 4 |  *  Created on: Mar 26, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH
 6 |  */
 7 | #ifndef CHROMOSOME_H_
 8 | #define CHROMOSOME_H_
 9 | 
10 | #include <string>
11 | #include <fstream>
12 | #include <vector>
13 | #include <iostream>
14 | #include <map>
15 | 
16 | #include "IChromosome.h"
17 | #include "../exception/InvalidOperationException.h"
18 | #include "../exception/InvalidInputException.h"
19 | #include "../utility/Util.h"
20 | 
21 | using namespace std;
22 | using namespace nonltr;
23 | using namespace utility;
24 | using namespace exception;
25 | 
26 | namespace nonltr {
27 | class Chromosome: public IChromosome {
28 | public:
29 | 	Chromosome();
30 | 	Chromosome(uint64_t);
31 | 	Chromosome(string);
32 | 	Chromosome(string, bool);
33 | 	Chromosome(string, int);
34 | 	Chromosome(string, int, int);
35 | 	Chromosome(string &, string&);
36 | 	Chromosome(string &, string&, int);
37 | 
38 | 	int getGcContent();
39 | 
40 | 	virtual ~Chromosome();
41 | 
42 | 	virtual string& getBaseRef();
43 | 	virtual string& getHeaderRef();
44 | 
45 | 	virtual const string* getBase();
46 | 	virtual const vector<vector<int> *> * getSegment();
47 | 	virtual void printSegmentList();
48 | 	virtual string getHeader();
49 | 	virtual int size();
50 | 	virtual int getEffectiveSize();
51 | 	virtual void setHeader(string&);
52 | 	virtual void setSequence(string&);
53 | 	virtual void appendToSequence(const string&);
54 | 	virtual void finalize();
55 | 	virtual vector<int> * getBaseCount();
56 | 	virtual void insert(const string&);
57 | 
58 | protected:
59 | 	string chromFile;
60 | 	string header;
61 | 	string base;
62 | 	int str_len;
63 | 
64 | 	int effectiveSize;
65 | 	int segLength;
66 | 
67 | 	vector<vector<int> *> * segment;
68 | 	void readFasta();
69 | 	void readFasta(int);
70 | 	void toUpperCase();
71 | 	void removeAmbiguous();
72 | 	void mergeSegments();
73 | 	virtual void help(int, bool);
74 | 	void makeSegmentList();
75 | 	void calculateEffectiveSize();
76 | 
77 | private:
78 | 	bool isHeaderReady;
79 | 	bool isBaseReady;
80 | 	bool isFinalized;
81 | 	bool canClean = false;
82 | 
83 | 	void reverseSegments();
84 | 	void makeBaseCount();
85 | 	vector<int> * baseCount;
86 | };
87 | }
88 | 
89 | #endif /* CHROMOSOME_H_ */
90 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeOneDigit.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ChromosomeOneDigit.cpp
  3 |  *
  4 |  *  Created on: Jul 31, 2012
  5 |  *      Author: Hani Zakaria Girgis, PhD at the NCBI/NLM/NIH
  6 |  */
  7 | #include <iostream>
  8 | #include <map>
  9 | #include <sstream>
 10 | #include "Chromosome.h"
 11 | #include "ChromosomeOneDigit.h"
 12 | #include "../exception/InvalidInputException.h"
 13 | 
 14 | using namespace exception;
 15 | 
 16 | namespace nonltr {
 17 | 
 18 | ChromosomeOneDigit::ChromosomeOneDigit() :
 19 | 		Chromosome() {
 20 | 	//cout << "The no args constructor is called" << endl;
 21 | }
 22 | 
 23 | ChromosomeOneDigit::ChromosomeOneDigit(uint64_t s) :
 24 | 		Chromosome(s) {
 25 | 	//cout << "The no args constructor is called" << endl;
 26 | }
 27 | 
 28 | ChromosomeOneDigit::ChromosomeOneDigit(string fileName) :
 29 | 		Chromosome(fileName) {
 30 | 	help();
 31 | }
 32 | 
 33 | ChromosomeOneDigit::ChromosomeOneDigit(string fileName, int segmentLength,
 34 | 		int maxLength) :
 35 | 		Chromosome(fileName, segmentLength, maxLength) {
 36 | 	help();
 37 | }
 38 | 
 39 | ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info) :
 40 | 		Chromosome(seq, info) {
 41 | 	//cout << "Two string constructor is called" << endl;
 42 | 	help();
 43 | }
 44 | 
 45 | ChromosomeOneDigit::ChromosomeOneDigit(string& seq, string& info, int length) :
 46 | 		Chromosome(seq, info, length) {
 47 | 	//cout << "Two string constructor is called" << endl;
 48 | 	help();
 49 | }
 50 | 
 51 | void ChromosomeOneDigit::finalize() {
 52 | 	Chromosome::finalize();
 53 | 	help();
 54 | }
 55 | 
 56 | void ChromosomeOneDigit::help() {
 57 | 	// Can delete the codes
 58 | 	canClean = true;
 59 | 
 60 | 	// Make map
 61 | 	codes = new map<char, char>();
 62 | 
 63 | 	// Build codes
 64 | 	buildCodes();
 65 | 	// Modify the sequence in the super class
 66 | 	encode();
 67 | }
 68 | 
 69 | ChromosomeOneDigit::~ChromosomeOneDigit() {
 70 | 	if (canClean) {
 71 | 		codes->clear();
 72 | 		delete codes;
 73 | 	}
 74 | }
 75 | 
 76 | /**
 77 |  * This method converts nucleotides in the segments to single digit codes
 78 |  */
 79 | void ChromosomeOneDigit::encode() {
 80 | 
 81 | 	for (int s = 0; s < segment->size(); s++) {
 82 | 		int segStart = segment->at(s)->at(0);
 83 | 		int segEnd = segment->at(s)->at(1);
 84 | 		for (int i = segStart; i <= segEnd; i++) {
 85 | 
 86 | 			if (codes->count(base[i]) > 0) {
 87 | 				base[i] = codes->at(base[i]);
 88 | 			} else {
 89 | 				string msg = "Invalid nucleotide: ";
 90 | 				std::ostringstream oss;
 91 | 				int b_int = base[i];
 92 | 				oss << msg << b_int;
 93 | 				throw InvalidInputException(oss.str());
 94 | 			}
 95 | 		}
 96 | 	}
 97 | 
 98 | 	// Digitize skipped segments
 99 | 	char uncertainChar = Util::isDna? 'N' : 'X';
100 | 	int segNum = segment->size();
101 | 	if (segNum > 0) {
102 | 		// The first interval - before the first segment
103 | 		int segStart = 0;
104 | 		int segEnd = segment->at(0)->at(0) - 1;
105 | 
106 | 		for (int s = 0; s <= segNum; s++) {
107 | 			for (int i = segStart; i <= segEnd; i++) {
108 | 				char c = base[i];
109 | 
110 | 				if (c != uncertainChar) {
111 | 					if (codes->count(c) > 0) {
112 | 						base[i] = codes->at(c);
113 | 					} else {
114 | 						string msg = "ChromosomeOneDigit::encode() found invalid letter: ";
115 | 						msg.append(1, c);
116 | 						throw InvalidInputException(msg);
117 | 					}
118 | 				}
119 | 			}
120 | 
121 | 			// The regular intervals between two segments
122 | 			if (s < segNum - 1) {
123 | 				segStart = segment->at(s)->at(1) + 1;
124 | 				segEnd = segment->at(s + 1)->at(0) - 1;
125 | 			}
126 | 			// The last interval - after the last segment
127 | 			else if (s == segNum - 1) {
128 | 				segStart = segment->at(s)->at(1) + 1;
129 | 				segEnd = base.size() - 1;
130 | 			}
131 | 		}
132 | 	}
133 | }
134 | 
135 | }
136 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeOneDigit.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromosomeOneDigit.h
 3 |  *
 4 |  *  Created on: Jul 31, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH
 6 |  */
 7 | 
 8 | #ifndef CHROMOSOMEONEDIGIT_H_
 9 | #define CHROMOSOMEONEDIGIT_H_
10 | 
11 | #include <map>
12 | #include "Chromosome.h"
13 | 
14 | namespace nonltr {
15 | class ChromosomeOneDigit: public Chromosome {
16 | 
17 | private:
18 | 	void encode();
19 | 	void help();
20 | 
21 | 
22 | protected:
23 | 	bool canClean = false;
24 | 	map<char, char> * codes;
25 | 	virtual void buildCodes() = 0;
26 | 
27 | 
28 | public:
29 | 	/* Methods */
30 | 	ChromosomeOneDigit();
31 | 	ChromosomeOneDigit(uint64_t);
32 | 	ChromosomeOneDigit(string);
33 | 	ChromosomeOneDigit(string, int, int);
34 | 	ChromosomeOneDigit(string&, string&);
35 | 	ChromosomeOneDigit(string&, string&, int);
36 | 	virtual ~ChromosomeOneDigit();
37 | 	virtual void finalize();
38 | 
39 | 
40 | };
41 | }
42 | 
43 | #endif /* CHROMOSOMEONEDIGIT_H_ */
44 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeOneDigitDna.cpp:
--------------------------------------------------------------------------------
  1 | #include "ChromosomeOneDigitDna.h"
  2 | 
  3 | namespace nonltr{
  4 | 
  5 | ChromosomeOneDigitDna::ChromosomeOneDigitDna() : ChromosomeOneDigit() {}
  6 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(uint64_t s) : ChromosomeOneDigit(s) {}
  7 | 
  8 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName) :
  9 | 	ChromosomeOneDigit(fileName){
 10 | 
 11 | }
 12 | 
 13 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string fileName, int segmentLength, int maxLength) :
 14 | 	ChromosomeOneDigit(fileName, segmentLength, maxLength) {
 15 | 
 16 | }
 17 | 
 18 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info) :
 19 | 	ChromosomeOneDigit(seq, info){
 20 | 
 21 | }
 22 | 
 23 | ChromosomeOneDigitDna::ChromosomeOneDigitDna(string& seq, string& info, int length) :
 24 | 	ChromosomeOneDigit(seq, info, length) {
 25 | }
 26 | 
 27 | ChromosomeOneDigitDna::~ChromosomeOneDigitDna(){
 28 | 
 29 | }
 30 | 
 31 | /**
 32 |  * A	A
 33 |  * T	T
 34 |  * G	G
 35 |  * C	C
 36 |  * R	G or A
 37 |  * Y	T or C
 38 |  * M	A or C
 39 |  * K	G or T
 40 |  * S	G or C
 41 |  * W	A or T
 42 |  * H	A or C or T
 43 |  * B	G or T or C
 44 |  * V	G or C or A
 45 |  * D	G or T or A
 46 |  * N	G or T or A or C
 47 |  */
 48 | void ChromosomeOneDigitDna::buildCodes() {
 49 | 	// Certain nucleotides
 50 | 	codes->insert(map<char, char>::value_type('A', (char) 0));
 51 | 	codes->insert(map<char, char>::value_type('C', (char) 1));
 52 | 	codes->insert(map<char, char>::value_type('G', (char) 2));
 53 | 	codes->insert(map<char, char>::value_type('T', (char) 3));
 54 | 
 55 | 	// Uncertain nucleotides
 56 | 	codes->insert(map<char, char>::value_type('R', codes->at('G')));
 57 | 	codes->insert(map<char, char>::value_type('Y', codes->at('C')));
 58 | 	codes->insert(map<char, char>::value_type('M', codes->at('A')));
 59 | 	codes->insert(map<char, char>::value_type('K', codes->at('T')));
 60 | 	codes->insert(map<char, char>::value_type('S', codes->at('G')));
 61 | 	codes->insert(map<char, char>::value_type('W', codes->at('T')));
 62 | 	codes->insert(map<char, char>::value_type('H', codes->at('C')));
 63 | 	codes->insert(map<char, char>::value_type('B', codes->at('T')));
 64 | 	codes->insert(map<char, char>::value_type('V', codes->at('A')));
 65 | 	codes->insert(map<char, char>::value_type('D', codes->at('T')));
 66 | 	codes->insert(map<char, char>::value_type('N', codes->at('C')));
 67 | 	codes->insert(map<char, char>::value_type('X', codes->at('G')));
 68 | }
 69 | 
 70 | /**
 71 |  * Cannot be called on already finalized object.
 72 |  */
 73 | void ChromosomeOneDigitDna::makeR() {
 74 | 	//cout << "Making reverse ..." << endl;
 75 | 	makeReverse();
 76 | 	reverseSegments();
 77 | }
 78 | 
 79 | /**
 80 |  * Cannot be called on already finalized object.
 81 |  */
 82 | void ChromosomeOneDigitDna::makeRC() {
 83 | 	//cout << "Making reverse complement ..." << endl;
 84 | 	makeComplement();
 85 | 	makeReverse();
 86 | 	reverseSegments();
 87 | }
 88 | 
 89 | void ChromosomeOneDigitDna::makeComplement() {
 90 | 	map<char, char> complement;
 91 | 
 92 | 	// Certain nucleotides
 93 | 	complement.insert(map<char, char>::value_type((char) 0, (char) 3));
 94 | 	complement.insert(map<char, char>::value_type((char) 1, (char) 2));
 95 | 	complement.insert(map<char, char>::value_type((char) 2, (char) 1));
 96 | 	complement.insert(map<char, char>::value_type((char) 3, (char) 0));
 97 | 
 98 | 	// Unknown nucleotide
 99 | 	complement.insert(map<char, char>::value_type('N', 'N'));
100 | 	// complement.insert(map<char, char>::value_type((char) 4, (char) 4));
101 | 
102 | 	// Convert a sequence to its complement
103 | 	int seqLen = base.size();
104 | 	for (int i = 0; i < seqLen; i++) {
105 | 		if (complement.count(base[i]) > 0) {
106 | 			base[i] = complement.at(base[i]);
107 | 		} else {
108 | 			cerr << "Error: The digit " << (char) base[i];
109 | 			cerr << " does not represent a base." << endl;
110 | 			exit(2);
111 | 		}
112 | 	}
113 | }
114 | 
115 | void ChromosomeOneDigitDna::makeReverse() {
116 | 	int last = base.size() - 1;
117 | 
118 | 	// Last index to be switched
119 | 	int middle = base.size() / 2;
120 | 
121 | 	for (int i = 0; i < middle; i++) {
122 | 		char temp = base[last - i];
123 | 		base[last - i] = base[i];
124 | 		base[i] = temp;
125 | 	}
126 | }
127 | 
128 | void ChromosomeOneDigitDna::reverseSegments() {
129 | 	int segNum = segment->size();
130 | 	int lastBase = size() - 1;
131 | 
132 | 	// Calculate the coordinate on the main strand
133 | 	for (int i = 0; i < segNum; i++) {
134 | 		vector<int> * seg = segment->at(i);
135 | 
136 | 		int s = lastBase - seg->at(1);
137 | 		int e = lastBase - seg->at(0);
138 | 		seg->clear();
139 | 		seg->push_back(s);
140 | 		seg->push_back(e);
141 | 	}
142 | 
143 | 	// Reverse the regions within the list
144 | 	int lastRegion = segNum - 1;
145 | 	int middle = segNum / 2;
146 | 	for (int i = 0; i < middle; i++) {
147 | 		vector<int> * temp = segment->at(lastRegion - i);
148 | 		(*segment)[lastRegion - i] = segment->at(i);
149 | 		(*segment)[i] = temp;
150 | 	}
151 | }
152 | 
153 | 
154 | }
155 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeOneDigitDna.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromosomeOneDigitDna.h
 3 |  * Created on: September 28, 2018
 4 |  *     Author: Hani Z. Girgis, PhD
 5 |  */
 6 | 
 7 |  #ifndef HROMOSOMEONEDIGITDNA_H_
 8 |  #define HROMOSOMEONEDIGITDNA_H_
 9 | 
10 | #include "ChromosomeOneDigit.h"
11 | 
12 | namespace nonltr{
13 | 	class ChromosomeOneDigitDna: public ChromosomeOneDigit{
14 | 		private:
15 | 			void makeReverse();
16 | 			void makeComplement();
17 | 			void reverseSegments();
18 | 
19 | 		protected:
20 | 			virtual void buildCodes();
21 | 
22 | 		public:
23 | 			ChromosomeOneDigitDna();
24 | 		        ChromosomeOneDigitDna(uint64_t);
25 | 			ChromosomeOneDigitDna(string);
26 | 			ChromosomeOneDigitDna(string, int, int);
27 | 			ChromosomeOneDigitDna(string&, string&);
28 | 			ChromosomeOneDigitDna(string&, string&, int);
29 | 			virtual ~ChromosomeOneDigitDna();
30 | 
31 | 			void makeR();
32 | 			void makeRC();
33 | 	};
34 | }
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeOneDigitProtein.cpp:
--------------------------------------------------------------------------------
 1 | #include "ChromosomeOneDigitProtein.h"
 2 | 
 3 | namespace nonltr{
 4 | 
 5 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein() : 
 6 | 	ChromosomeOneDigit() {
 7 | 
 8 | }
 9 | 
10 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName) : 
11 | 	ChromosomeOneDigit(fileName){
12 | 
13 | }
14 | 
15 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string fileName, int segmentLength, int maxLength) : 
16 | 	ChromosomeOneDigit(fileName, segmentLength, maxLength) {
17 | 
18 | }
19 | 
20 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info) : 
21 | 	ChromosomeOneDigit(seq, info){
22 | 
23 | }
24 | 
25 | ChromosomeOneDigitProtein::ChromosomeOneDigitProtein(string& seq, string& info, int length) : 
26 | 	ChromosomeOneDigit(seq, info, length) {
27 | }
28 | 
29 | ChromosomeOneDigitProtein::~ChromosomeOneDigitProtein(){
30 | 
31 | }
32 | 
33 | void ChromosomeOneDigitProtein::buildCodes() {
34 | 	// https://en.wikipedia.org/wiki/Proteinogenic_amino_acid
35 | 	codes->insert(map<char, char>::value_type('A', (char) 0));
36 | 	codes->insert(map<char, char>::value_type('C', (char) 1));
37 | 	codes->insert(map<char, char>::value_type('D', (char) 2));
38 | 	codes->insert(map<char, char>::value_type('E', (char) 3));
39 | 	codes->insert(map<char, char>::value_type('F', (char) 4));
40 | 	codes->insert(map<char, char>::value_type('G', (char) 5));
41 | 	codes->insert(map<char, char>::value_type('H', (char) 6));
42 | 	codes->insert(map<char, char>::value_type('I', (char) 7));
43 | 	codes->insert(map<char, char>::value_type('K', (char) 8));
44 | 	codes->insert(map<char, char>::value_type('L', (char) 9));
45 | 	codes->insert(map<char, char>::value_type('M', (char) 10));
46 | 	codes->insert(map<char, char>::value_type('N', (char) 11));
47 | 	codes->insert(map<char, char>::value_type('O', (char) 12));
48 | 	codes->insert(map<char, char>::value_type('P', (char) 13));
49 | 	codes->insert(map<char, char>::value_type('Q', (char) 14));
50 | 	codes->insert(map<char, char>::value_type('R', (char) 15));
51 | 	codes->insert(map<char, char>::value_type('S', (char) 16));
52 | 	codes->insert(map<char, char>::value_type('T', (char) 17));
53 | 	codes->insert(map<char, char>::value_type('U', (char) 18));
54 | 	codes->insert(map<char, char>::value_type('V', (char) 19));
55 | 	codes->insert(map<char, char>::value_type('W', (char) 20));
56 | 	codes->insert(map<char, char>::value_type('Y', (char) 21));
57 | 
58 | 	// Uncertain uncleotides
59 | 	codes->insert(map<char, char>::value_type('B', codes->at('D')));
60 | 	codes->insert(map<char, char>::value_type('Z', codes->at('E')));
61 | 	codes->insert(map<char, char>::value_type('J', codes->at('L')));
62 | }
63 | 
64 | }// End namespace


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeOneDigitProtein.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromosomeOneDigitProtein.h
 3 |  * Created on: October 2, 2018
 4 |  *     Author: Hani Z. Girgis, PhD
 5 |  */
 6 | 
 7 |  #ifndef HROMOSOMEONEDIGITPROTEIN_H_
 8 |  #define HROMOSOMEONEDIGITPROTEIN_H_
 9 | 
10 | #include "ChromosomeOneDigit.h"
11 | 
12 | namespace nonltr{
13 | 	class ChromosomeOneDigitProtein: public ChromosomeOneDigit{
14 | 
15 | 		protected:
16 | 			virtual void buildCodes();
17 | 
18 | 		public:
19 | 			ChromosomeOneDigitProtein();
20 | 			ChromosomeOneDigitProtein(string);
21 | 			ChromosomeOneDigitProtein(string, int, int);
22 | 			ChromosomeOneDigitProtein(string&, string&);
23 | 			ChromosomeOneDigitProtein(string&, string&, int);
24 | 			virtual ~ChromosomeOneDigitProtein();
25 | 	};
26 | }
27 | 
28 | #endif


--------------------------------------------------------------------------------
/src/nonltr/ChromosomeRandom.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ChromosomeRandom.h
 3 |  *
 4 |  *  Created on: Feb 4, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef CHROMOSOMERANDOM_H_
 9 | #define CHROMOSOMERANDOM_H_
10 | 
11 | #include <map>
12 | 
13 | #include "IChromosome.h"
14 | 
15 | namespace nonltr {
16 | 
17 | class ChromosomeRandom: public nonltr::IChromosome {
18 | 	// Key-value pair type.
19 | 	typedef map<string, double>::value_type valType;
20 | 
21 | private:
22 | 	int n;
23 | 	char unread;
24 | 	IChromosome * oChrom;
25 | 	vector<char> * alpha;
26 | 	map<string, double> * table;
27 | 	string * rBase;
28 | 	vector<string> * keyList;
29 | 	map<char, char> * codes;
30 | 
31 | 	void fillKeyList();
32 | 	void initializeTable();
33 | 	void countWords();
34 | 	void convertToProbabilities();
35 | 	void printTable();
36 | 	void generateRandomSequence();
37 | 
38 | public:
39 | 	ChromosomeRandom(int, IChromosome*, char, vector<char>*);
40 | 	virtual ~ChromosomeRandom();
41 | 
42 | 	virtual const string* getBase();
43 | 	virtual const vector<vector<int> *> * getSegment();
44 | 	virtual string getHeader();
45 | 	virtual void printSequence(string);
46 | 	void printSequence(string, string *);
47 | 	void printEffectiveSequence(string);
48 | };
49 | 
50 | } /* namespace nonltr */
51 | #endif /* CHROMOSOMERANDOM_H_ */
52 | 


--------------------------------------------------------------------------------
/src/nonltr/DetectorMaxima.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DetectorMaxima.h
 3 |  *
 4 |  *  Created on: May 31, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef DETECTORMAXIMA_H_
 9 | #define DETECTORMAXIMA_H_
10 | 
11 | #include <vector>
12 | #include <math.h>
13 | 
14 | #include "../utility/ILocation.h"
15 | 
16 | using namespace std;
17 | using namespace utility;
18 | 
19 | namespace nonltr {
20 | 
21 | class DetectorMaxima { // NOTE(review): judging by the member names, finds maxima in a score list and derives regions from them -- confirm in DetectorMaxima.cpp
22 | private:
23 | 
24 | 	int segStart;
25 | 	int segEnd;
26 | 	double s;
27 | 	double w;
28 | 	double m;
29 | 	double t;
30 | 	double p;
31 | 	int e;
32 | 	int halfS;
33 | 
34 | 	vector<int> * oScores; // NOTE(review): presumably the original scores handed to the constructor -- confirm
35 | 	vector<double> * scores;
36 | 	vector<double> * mask;
37 | 	vector<double> * first; // exposed through getFirst()
38 | 	vector<double> * second; // exposed through getSecond()
39 | 	vector<int> * maxima;
40 | 	// vector<vector<double> *> * allMaxima;
41 | 
42 | 	vector<ILocation *> * separatorList;
43 | 	vector<ILocation *> * regionList; // exposed through getRegionList()
44 | 
45 | 	void makeMask();
46 | 	void smooth();
47 | 	void deriveFirst();
48 | 	void deriveSecond();
49 | 	void findMaxima();
50 | 
51 | 	void findSeparators();
52 | 	void findRegions();
53 | 
54 | 	void extendRegions();
55 | 
56 | 	int countLessThan(vector<int> *, int, int, double);
57 | 
58 | 	/**
59 | 	 * Rounds half away from zero. Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c
60 | 	 */
61 | 	inline double round(double number) {
62 | 		return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5);
63 | 	}
64 | 
65 | public:
66 | 	DetectorMaxima(int, int, double, double, double, double, double, int,
67 | 			vector<int> *);
68 | 	virtual ~DetectorMaxima();
69 | 	const vector<ILocation*>* getRegionList() const;
70 | 	const vector<double>* getFirst() const;
71 | 	const vector<double>* getSecond() const;
72 | 
73 | 	// const vector<vector<double> *>* getAllMaxima() const;
74 | };
75 | 
76 | } /* namespace nonltr */
77 | #endif /* DETECTORMAXIMA_H_ */
78 | 


--------------------------------------------------------------------------------
/src/nonltr/EnrichmentMarkovView.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * EnrichmentMarkovView.cpp
  3 |  *
  4 |  *  Created on: Apr 17, 2013
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | 
  8 | namespace nonltr {
  9 | 
 10 | /**
 11 |  * Builds the view. k: word length passed to the KmerHashTable base; order: the Markov order (starts at 0); m: minimum observed count (minObs). initialize() throws InvalidInputException for an invalid order.
 12 |  */
 13 | template<class I, class V>
 14 | EnrichmentMarkovView<I, V>::EnrichmentMarkovView(int k, int order, int m) :
 15 | 		KmerHashTable<I, V>(k), minObs(m), factor(10000.00) { // written in the real initialization order: base class first, then members as declared
 16 | 	initialize(order);
 17 | }
 18 | 
 19 | template<class I, class V> // Overload that also passes initValue to the KmerHashTable base; order is the Markov order, m becomes minObs
 20 | EnrichmentMarkovView<I, V>::EnrichmentMarkovView(int k, V initValue, int order,
 21 | 		int m) :
 22 | 		KmerHashTable<I, V>(k, initValue), minObs(m), factor(10000.00) { // written in the real initialization order: base class first, then members as declared
 23 | 	initialize(order); // throws InvalidInputException for an invalid order
 24 | }
 25 | 
 26 | template<class I, class V>
 27 | void EnrichmentMarkovView<I, V>::initialize(int order) {
 28 | 	// Stores the Markov order, rejects invalid values with
 29 | 	// InvalidInputException, resets the total length and creates one
 30 | 	// background KmerHashTable<int, int> per word length 1 .. order + 1.
 31 | 	o = order;
 32 | 
 33 | 	if (order < 0) {
 34 | 		string negativeMsg("The Markov order must be non-negative integer. ");
 35 | 		negativeMsg.append("The invalid input is: ");
 36 | 		negativeMsg.append(Util::int2string(o));
 37 | 		negativeMsg.append(".");
 38 | 		throw InvalidInputException(negativeMsg);
 39 | 	}
 40 | 
 41 | 	if (!(order < KmerHashTable<I, V>::k)) {
 42 | 		string tooLargeMsg("The Markov order cannot be >= k (k-mer).");
 43 | 		throw InvalidInputException(tooLargeMsg);
 44 | 	}
 45 | 
 46 | 	modelList = new vector<KmerHashTable<int, int> *>();
 47 | 	l = 0;
 48 | 	const int modelCount = order + 1;
 49 | 	for (int built = 0; built < modelCount; ++built) {
 50 | 		modelList->push_back(new KmerHashTable<int, int>(built + 1));
 51 | 	}
 52 | }
 53 | 
 54 | template<class I, class V>
 55 | EnrichmentMarkovView<I, V>::~EnrichmentMarkovView() {
 56 | 	Util::deleteInVector(modelList); // presumably deletes every table the vector points to (see Util); the tables are created with new in initialize()
 57 | 	delete modelList; // then release the vector itself
 58 | }
 59 | 
 60 | /**
 61 |  * Adds the words of one sequence stretch to the background tables:
 62 |  * every table in modelList (word sizes 1 to order + 1) is incremented
 63 |  * wholesale, and the stretch length is added to the running total l.
 64 |  *
 65 |  * sequence: the input sequence.
 66 |  * start: the start index - inclusive.
 67 |  * end: the end index - inclusive.
 68 |  */
 69 | template<class I, class V>
 70 | void EnrichmentMarkovView<I, V>::count(const char * sequence, int start,
 71 | 		int end) {
 72 | 	// Only this strand is counted; the length would have to be doubled
 73 | 	// here if the reverse complement were scanned as well.
 74 | 	l += (end - start + 1);
 75 | 
 76 | 	// Table number idx holds words of idx + 1 letters, so the limit
 77 | 	// handed to it shrinks by idx.
 78 | 	const int tableCount = modelList->size();
 79 | 	for (int idx = 0; idx != tableCount; ++idx) {
 80 | 		modelList->at(idx)->wholesaleIncrement(sequence, start, end - idx);
 81 | 	}
 82 | }
 83 | 
 84 | /**
 85 |  * Normalizes the counts in each model: every group of four consecutive entries is divided by the group sum,
 86 |  * multiplied by "factor" and rounded. A group whose sum is zero is left as is to avoid dividing by zero.
 87 |  */
 88 | template<class I, class V>
 89 | void EnrichmentMarkovView<I, V>::generateProbapilities() {
 90 | 	int modelNumber = modelList->size();
 91 | 
 92 | 	for (int m = 0; m < modelNumber; m++) {
 93 | 		KmerHashTable<int, int> * t = modelList->at(m);
 94 | 		int tSize = t->getMaxTableSize();
 95 | 
 96 | 		for (int i = 0; i < tSize; i += 4) { // entries i .. i+3 are normalized together
 97 | 			double sum = 0.0;
 98 | 			for (int j = i; j < i + 4; j++) {
 99 | 				sum += t->valueOf(j);
100 | 			}
101 | 			// 0/0 is NaN, and converting NaN to the int value type is undefined.
102 | 			if (sum == 0.0) { continue; }
103 | 			for (int j = i; j < i + 4; j++) {
104 | 				t->insert(j, round(factor * ((double) t->valueOf(j) / sum)));
105 | 			}
106 | 		}
107 | 	}
108 | }
109 | 
110 | template<class I, class V>
111 | void EnrichmentMarkovView<I, V>::processTable() {
112 | 	char base = 4;
113 | 	int modelNumber = modelList->size();
114 | 	// Replaces every observed count by its surplus over the count expected from the background models (0 if not enriched).
115 | 	// Make a zero in quaternary form as a string of length k.
116 | 	string q("");
117 | 	for (int x = 0; x < KmerHashTable<I, V>::k; x++) {
118 | 		q.append(1, 0);
119 | 	}
120 | 	// q mirrors key y digit by digit (raw values 0..3); the model tables are queried with it.
121 | 	double lowerP;
122 | 	double upperP;
123 | 	for (I y = 0; y < KmerHashTable<I, V>::maxTableSize; y++) {
124 | 		if (y % 10000000 == 0) {
125 | 			cout << "Processing " << y << " keys out of "
126 | 					<< KmerHashTable<I, V>::maxTableSize;
127 | 			cout << endl;
128 | 		}
129 | 
130 | 		const char * qc = q.c_str();
131 | 
132 | 		// Calculate the expected number of occurrences.
133 | 
134 | 		// a. Calculate probability from lower order models.
135 | 		// Lower probabilities are the same for four consecutive words of length of k-1
136 | 		if (y % 4 == 0) {
137 | 			lowerP = 1.0;
138 | 			for (int m = 0; m < modelNumber - 1; m++) {
139 | 				KmerHashTable<int, int> * oTable = modelList->at(m);
140 | 				lowerP *= (((double) oTable->valueOf(qc, 0)) / factor);
141 | 			}
142 | 		}
143 | 
144 | 		// b. Calculate probability based on the specified order.
145 | 		KmerHashTable<int, int> * oTable = modelList->at(modelNumber - 1);
146 | 		int resultsSize = KmerHashTable<I, V>::k - o - 1;
147 | 
148 | 		// Upper probabilities are the same for four consecutive words of length of k-1
149 | 		// The scanning of words of length corresponding to the highest order + 1
150 | 		// This step is not needed if k = o + 1, i.e. resultsSize = 0.
151 | 		if (y % 4 == 0) {
152 | 			if (resultsSize > 0) {
153 | 				//Initialize the elements of the vector invalid index
154 | 				vector<int> results = vector<int>(resultsSize, -987);
155 | 				oTable->wholesaleValueOf(qc, 0, resultsSize - 1, &results, 0);
156 | 
157 | 				upperP = 1.0;
158 | 				for (int i = 0; i < resultsSize; i++) {
159 | 					upperP *= (((double) results.at(i)) / factor);
160 | 				}
161 | 				results.clear();
162 | 
163 | 			} else {
164 | 				upperP = 1.0;
165 | 			}
166 | 		}
167 | 
168 | 		// The expected number of occurrences
169 | 		double exp = l * lowerP * upperP
170 | 				* (((double) oTable->valueOf(qc, resultsSize)) / factor);
171 | 
172 | 		// Calculate the enrichment value.
173 | 		// Log value
174 | 		// values[y] = round((log((double) values[y] + 1.0) - log(exp + 1.0)));
175 | 
176 | 		// Raw value
177 | 		// Requirement: if observed is >= minObs && observed > expected then the value is the difference
178 | 		// otherwise the value is zero
179 | 
180 | 		V observed = KmerHashTable<I, V>::values[y];
181 | 
182 | 		if (observed >= minObs && observed > exp) {
183 | 
184 | 			KmerHashTable<I, V>::values[y] = round(observed - exp);
185 | 		} else {
186 | 			KmerHashTable<I, V>::values[y] = 0;
187 | 		}
188 | 
189 | 		/*
190 | 		 KmerHashTable<I, V>::values[y] =
191 | 		 round(
192 | 		 (((double) KmerHashTable<I, V>::values[y] + 1.0)
193 | 		 / (exp + 1.0)));
194 | 		 */
195 | 
196 | 		// Advance q to the next key.
197 | 		// q must keep exactly k digits: every lookup above reads the word
198 | 		// from offset 0, so prepending a digit when q[0] == base - 1 would
199 | 		// shift the word for all keys whose first digit is 3.
200 | 		// After the last key the increment wraps to all zeros, which is
201 | 		// harmless because the loop ends there
202 | 		// (assuming maxTableSize == 4^k -- TODO confirm in KmerHashTable).
203 | 
204 | 		// Increment the quaternary number by 1.
205 | 		int qLen = q.size();
206 | 		for (int i = qLen - 1; i >= 0; i--) {
207 | 			if (q[i] + 1 < base) {
208 | 				q[i] = q[i] + 1;
209 | 				break;
210 | 			} else {
211 | 				q[i] = 0;
212 | 			}
213 | 		}
214 | 	}
215 | }
216 | 
217 | } /* namespace nonltr */
218 | 


--------------------------------------------------------------------------------
/src/nonltr/EnrichmentMarkovView.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * EnrichmentMarkovView.h
 3 |  *
 4 |  *  Created on: Apr 17, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef ENRICHMENTMARKOVVIEW_H_
 9 | #define ENRICHMENTMARKOVVIEW_H_
10 | 
11 | #include <cmath>
12 | #include <vector>
13 | #include <iostream>
14 | 
15 | #include "KmerHashTable.h"
16 | #include "../utility/Util.h"
17 | #include "../exception/InvalidInputException.h"
18 | 
19 | using namespace std;
20 | using namespace utility;
21 | using namespace exception;
22 | 
23 | namespace nonltr {
24 | 
25 | template<class I, class V> // I: key/index type and V: value type of the underlying KmerHashTable
26 | class EnrichmentMarkovView: public KmerHashTable<I,V>{
27 | 
28 | private:
29 | 	// The minimum observed count a k-mer needs to receive a non-zero value in processTable()
30 | 	const int minObs;
31 | 
32 | 	// Background tables, one per word length 1 .. order + 1 (built in initialize()).
33 | 	// This template specification should work up to order of 14, i.e. word length = 15
34 | 	vector<KmerHashTable<int,int> *> * modelList;
35 | 
36 | 	// Markov order
37 | 	int o;
38 | 
39 | 	// Total length of the sequences passed to count()
40 | 	long l;
41 | 
42 | 	// Probabilities of words are stored multiplied by this factor
43 | 	// Equivalent to four decimal points.
44 | 	const double factor;	// = 10000.00;
45 | 
46 | 	// Initialize data members; throws InvalidInputException for an invalid order
47 | 	void initialize(int);
48 | 
49 | 	/**
50 | 	 * Rounds half away from zero. Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c
51 | 	 */
52 | 	inline double round(double number) {
53 | 		return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5);
54 | 	}
55 | 
56 | public:
57 | 	EnrichmentMarkovView(int, int, int); // (k, order, minObs)
58 | 	EnrichmentMarkovView(int, V, int, int); // (k, initial value, order, minObs)
59 | 	virtual ~EnrichmentMarkovView();
60 | 
61 | 	void count(const char *, int, int); // update the background tables from sequence[start..end]
62 | 	void generateProbapilities(); // turn the background counts into probabilities scaled by factor
63 | 	void processTable(); // replace the observed counts by enrichment values
64 | };
65 | } /* namespace nonltr */
66 | 
67 | #include "EnrichmentMarkovView.cpp"
68 | 
69 | #endif /* ENRICHMENTMARKOVVIEW_H_ */
70 | 


--------------------------------------------------------------------------------
/src/nonltr/HMM.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * HMM.h
  3 |  *
  4 |  *  Created on: Jun 21, 2013
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | 
  8 | #ifndef HMM_H_
  9 | #define HMM_H_
 10 | 
 11 | #include <vector>
 12 | #include <math.h>
 13 | #include <limits>
 14 | #include <stdlib.h>
 15 | 
 16 | #include "../utility/ILocation.h"
 17 | 
 18 | using namespace std;
 19 | using namespace utility;
 20 | 
 21 | namespace nonltr {
 22 | 
 23 | class HMM { // Hidden Markov model that can be trained, normalized, printed and used for decoding (see the public methods)
 24 | private:
 25 | 	const int PRECISION;
 26 | 	double minusInf;
 27 | 	vector<double> * pList; // NOTE(review): presumably the prior probabilities -- confirm in HMM.cpp
 28 | 	vector<vector<double> *> * tList; // NOTE(review): presumably the transition table -- confirm
 29 | 	vector<double> * oList; // NOTE(review): presumably the output probabilities -- confirm
 30 | 
 31 | 	void initializeHelper();
 32 | 	// Returns the index of the last candidate in the segment
 33 | 	int trainHelper1(int, int, int);
 34 | 	void trainHelper2(int, int, int, int);
 35 | 	void trainPositive(int, int);
 36 | 	void trainNegative(int, int);
 37 | 	void move(int, int);
 38 | 	void checkBase(double);
 39 | 
 40 | 	/*
 41 | 	 inline int getPstvState(int score) {
 42 | 	 int state = ceil(log(score) / logBase);
 43 | 	 if (state < 0) {
 44 | 	 state = 0;
 45 | 	 }
 46 | 	 return state;
 47 | 	 }
 48 | 
 49 | 	 inline int getNgtvState(int score) {
 50 | 	 int state = ceil(log(score) / logBase);
 51 | 	 if (state < 0) {
 52 | 	 state = 0;
 53 | 	 }
 54 | 	 return state + positiveStateNumber;
 55 | 	 }
 56 | 	 */
 57 | 
 58 | 	inline int getPstvState(int index) { // positive state = the score stored at index
 59 | 		int state = scoreList->at(index);
 60 | 		return state;
 61 | 	}
 62 | 
 63 | 	inline int getNgtvState(int index) { // negative state = the score at index shifted by positiveStateNumber
 64 | 		int state = scoreList->at(index);
 65 | 		return state + positiveStateNumber;
 66 | 	}
 67 | 
 68 | protected:
 69 | 	double base;
 70 | 	double logBase;
 71 | 	int stateNumber;
 72 | 	int positiveStateNumber;
 73 | 
 74 | 	vector<int> * scoreList;
 75 | 	const vector<vector<int> *> * segmentList;
 76 | 	const vector<ILocation*> * candidateList;
 77 | 
 78 | 	void initialize(double, int);
 79 | 	/**
 80 | 	 * Rounds half away from zero. Credit: http://stackoverflow.com/questions/554204/where-is-round-in-c
 81 | 	 */
 82 | 	inline double round(double number) {
 83 | 		return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5);
 84 | 	}
 85 | 
 86 | public:
 87 | 	HMM(string); // Build a model from file
 88 | 	HMM(double, int);
 89 | 	// HMM(vector<int> *, const vector<vector<int> *> *,
 90 | 	//		const vector<ILocation*> *, double);
 91 | 	virtual ~HMM();
 92 | 	void train(vector<int> *, const vector<vector<int> *> *, const vector<ILocation*> *);
 93 | 	void normalize();
 94 | 	double decode(int, int, vector<int> *, vector<int>&);
 95 | 	double decode(int, int, vector<int> *, vector<ILocation *>&);
 96 | 	int getPositiveStateNumber();
 97 | 	void print();
 98 | 	void print(string);
 99 | 	double getBase();
100 | };
101 | 
102 | } /* namespace nonltr */
103 | #endif /* HMM_H_ */
104 | 


--------------------------------------------------------------------------------
/src/nonltr/IChromosome.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * IChromosome.h
 3 |  *
 4 |  *  Created on: Feb 4, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef ICHROMOSOME_H_
 9 | #define ICHROMOSOME_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | 
14 | using namespace std;
15 | 
16 | namespace nonltr {
17 | 
18 | class IChromosome { // Interface of a chromosome: its sequence, its segments and its header
19 | public:
20 | 	// Virtual so that deleting an implementation through an IChromosome* runs the derived destructor.
21 | 	virtual ~IChromosome() {}
22 | 	virtual const string* getBase() = 0; // The sequence of this chromosome
23 | 	virtual const vector<vector<int> *> * getSegment() = 0; // List of segments; users such as Scorer::score read element 0 as the start and element 1 as the end
24 | 	virtual string getHeader() = 0; // The header of this chromosome
25 | };
26 | 
27 | } /* namespace nonltr */
28 | #endif /* ICHROMOSOME_H_ */
29 | 


--------------------------------------------------------------------------------
/src/nonltr/ITableView.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ITableView.h
 3 |  *
 4 |  *  Created on: Aug 9, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef ITABLEVIEW_H_
 9 | #define ITABLEVIEW_H_
10 | 
11 | #include <vector>
12 | 
13 | using namespace std;
14 | 
15 | namespace nonltr {
16 | 
17 | template<class I, class V>
18 | class ITableView { // Read-only interface of a k-mer table: I is the index (key) type, V the stored value type
19 | public:
20 | 	virtual ~ITableView() {} // virtual: tables are handled, and may be destroyed, through ITableView pointers
21 | 	virtual V valueOf(const char*) = 0 ; // value of a word given as a character sequence
22 | 	virtual V valueOf(const char*, int) = 0; // same, with an int that is presumably the offset of the word -- confirm in KmerHashTable.cpp
23 | 	virtual V valueOf(I) = 0; // value stored at a key
24 | 	virtual int getK() = 0; // word length k
25 | 	virtual I getMaxTableSize() = 0;
26 | 	virtual const V * getValues() const = 0;
27 | 
28 | 	virtual void wholesaleValueOf(const char *, int, int, vector<V> *) = 0; // values of many words of a sequence, written to the vector
29 | 	virtual void wholesaleValueOf(const char *, int, int, vector<V> *, int) = 0; // the final int is presumably the first index to fill in the vector (see Scorer::score) -- confirm
30 | };
31 | 
32 | }
33 | 
34 | #endif /* ITABLEVIEW_H_ */
35 | 


--------------------------------------------------------------------------------
/src/nonltr/KmerHashTable.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * KmerHashTable.h
 3 |  *
 4 |  *  Created on: Jul 25, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH
 6 |  */
 7 | 
 8 | #ifndef KMERHASHTABLE_H_
 9 | #define KMERHASHTABLE_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | #include "ITableView.h"
14 | 
15 | using namespace std;
16 | using namespace nonltr;
17 | 
18 | namespace nonltr {
19 | 
20 | template<class I, class V> // I: index (key) type, V: value type
21 | class KmerHashTable: public ITableView<I,V> {
22 | 
23 | protected:
24 | 	/* Fields */
25 | 	static const int maxKeyLength = 15;
26 | 	int k; // word (k-mer) length, returned by getK()
27 | 
28 | 
29 | 	I maxTableSize; // NOTE(review): presumably 4^k, the number of possible keys -- confirm in KmerHashTable.cpp
30 | 
31 | 	// The hashed values, i.e. the values of the hash table.
32 | 	// The index is the 4ry representation of the key
33 | 	V * values;
34 | 	V initialValue;
35 | 
36 | private:
37 | 	// [4^0, 4^1, ... , 4^(k-1)]
38 | 	I * bases;
39 | 	I * mMinusOne;
40 | 	void initialize(int, V);
41 | 
42 | public:
43 | 	/* Methods */
44 | 	KmerHashTable(int); // (k)
45 | 	KmerHashTable(int, V); // (k, initial value)
46 | 
47 | 	virtual ~KmerHashTable();
48 | 
49 | 	I hash(const char *);
50 | 	I hash(const char *, int);
51 | 	void hash(const char *, int, int, vector<I> *);
52 | 
53 | 	void insert(const char*, V);
54 | 	void insert(const char*, int, V);
55 | 	void insert(I, V);
56 | 
57 | 	void increment(const char*);
58 | 	void increment(const char*, int);
59 | 	void wholesaleIncrement(const char*, int, int);
60 | 	int wholesaleIncrementNoOverflow(const char*, int, int);
61 | 
62 | 	void addReverseComplement();
63 | 	I countNonInitialEntries();
64 | 	vector<string> *getKeys();
65 | 	void printTable(string);
66 | 	void checkOverflow();
67 | 
68 | 	/* Virtual methods from ITableView */
69 | 	virtual V valueOf(const char*);
70 | 	virtual V valueOf(const char*, int);
71 | 	virtual V valueOf(I);
72 | 	virtual void wholesaleValueOf(const char *, int, int, vector<V> *);
73 | 	virtual void wholesaleValueOf(const char *, int, int, vector<V> *, int);
74 | 
75 | 	virtual int getK();
76 | 	virtual I getMaxTableSize();
77 | 	virtual V getMaxValue(); // not part of ITableView
78 | 	virtual const V * getValues() const;
79 | };
80 | }
81 | 
82 | #include "KmerHashTable.cpp"
83 | 
84 | #endif /* KMERHASHTABLE_H_ */
85 | 


--------------------------------------------------------------------------------
/src/nonltr/LocationList.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * LocationList.cpp
  3 |  *
  4 |  *  Created on: Feb 19, 2015
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  *
  7 |  *
  8 |  * An instance of this class holds a list of merged locations.
  9 |  */
 10 | 
 11 | #include "LocationList.h"
 12 | 
 13 | namespace nonltr {
 14 | 
 15 | LocationList::LocationList(string chromNameIn) { // Creates an empty list of regions for the named chromosome
 16 | 	chromName = chromNameIn;
 17 | 	regionList = new vector<ILocation *>(); // owned by this object, released in the destructor
 18 | 	merge(); // does nothing at this point: the list is still empty
 19 | }
 20 | 
 21 | LocationList::~LocationList() {
 22 | 	Util::deleteInVector(regionList); // presumably deletes every location the vector points to (see Util)
 23 | 	delete regionList; // then release the vector itself
 24 | }
 25 | 
 26 | void LocationList::add(int start, int end) { // Appends a new region at the back of the list; no sorting or merging happens here
 27 | 	regionList->push_back(new Location(start, end));
 28 | }
 29 | 
 30 | void LocationList::merge() {
 31 | 	// Single left-to-right pass: whenever a region overlaps the one just
 32 | 	// before it, the earlier region absorbs it (its end becomes the larger
 33 | 	// of the two ends) and the later region is removed and freed. The
 34 | 	// comparison then repeats at the same position; otherwise the pass
 35 | 	// moves on. The first region has no predecessor, so the pass starts
 36 | 	// at the second one.
 37 | 	// NOTE(review): only neighbours are compared, so this presumably
 38 | 	// relies on the list being sorted by start -- confirm with callers.
 39 | 	int idx = 1;
 40 | 	int total = regionList->size();
 41 | 
 42 | 	while (idx < total) {
 43 | 		ILocation * current = regionList->at(idx);
 44 | 		ILocation * previous = regionList->at(idx - 1);
 45 | 		int prevStart = previous->getStart(), prevEnd = previous->getEnd();
 46 | 		int curStart = current->getStart(), curEnd = current->getEnd();
 47 | 
 48 | 		if (!Util::isOverlapping(prevStart, prevEnd, curStart, curEnd)) {
 49 | 			// Disjoint neighbours: keep both and move on.
 50 | 			idx++;
 51 | 			continue;
 52 | 		}
 53 | 		previous->setEnd(prevEnd < curEnd ? curEnd : prevEnd);
 54 | 		regionList->erase(regionList->begin() + idx);
 55 | 		delete current;
 56 | 		total = regionList->size();
 57 | 	}
 58 | }
 59 | 
 60 | void LocationList::mergeWithAnotherList( // Interleaves otherList into this list by start position, then merges overlapping regions
 61 | 		const vector<ILocation *> * const otherList) {
 62 | 	//A pre-condition: Ensure that the other list is sorted
 63 | 	for (int h = 1; h < otherList->size(); h++) {
 64 | 		if (otherList->at(h)->getStart() < otherList->at(h - 1)->getStart()) {
 65 | 			throw InvalidStateException(
 66 | 					string("LocationList - The other list is not sorted."));
 67 | 		}
 68 | 	}
 69 | 
 70 | 	// Start
 71 | 	vector<ILocation *> * mergedList = new vector<ILocation *>();
 72 | 
 73 | 	int i = 0; // position in this list
 74 | 	int j = 0; // position in the other list
 75 | 	int iLimit = regionList->size();
 76 | 	int jLimit = otherList->size();
 77 | 
 78 | 	// Continue until one list is finished
 79 | 	while (i < iLimit && j < jLimit) {
 80 | 		ILocation * iLoc = regionList->at(i);
 81 | 		ILocation * jLoc = otherList->at(j);
 82 | 
 83 | 		if (iLoc->getStart() < jLoc->getStart()) {
 84 | 			mergedList->push_back(iLoc); // own regions are moved: the same pointer goes into the new list
 85 | 			i++;
 86 | 		} else {
 87 | 			mergedList->push_back(new Location(*jLoc)); // foreign regions are copied: otherList keeps its objects
 88 | 			j++;
 89 | 		}
 90 | 	}
 91 | 
 92 | 	// Once one list is finished, copy the rest of the other list
 93 | 	if (i == iLimit) {
 94 | 		for (; j < jLimit; j++) {
 95 | 			mergedList->push_back(new Location(*(otherList->at(j))));
 96 | 		}
 97 | 	} else if (j == jLimit) {
 98 | 		for (; i < iLimit; i++) {
 99 | 			mergedList->push_back(regionList->at(i));
100 | 		}
101 | 	}
102 | 
103 | 	// Once done: drop the old vector only. Its elements must not be deleted,
104 | 	// because all of them were moved into mergedList above.
105 | 	regionList->clear();	// Need to test this line
106 | 	delete regionList;
107 | 	regionList = mergedList;
108 | 
109 | 	merge(); // fuse overlapping neighbours of the combined list
110 | 
111 | 	//A post-condition: Ensure that the list is sorted
112 | 	for (int h = 1; h < regionList->size(); h++) {
113 | 		if (regionList->at(h)->getStart() < regionList->at(h - 1)->getStart()) {
114 | 			throw InvalidStateException(string("This list is not sorted."));
115 | 		}
116 | 	}
117 | }
118 | 
119 | void LocationList::print() {
120 | 	cout << endl << chromName << endl; // header: an empty line, then the chromosome name
121 | 	// One "start-end" line per region, in list order.
122 | 	typedef vector<ILocation *>::const_iterator RegionItr;
123 | 	for (RegionItr r = regionList->begin(); r != regionList->end(); ++r) {
124 | 		cout << (*r)->getStart() << "-" << (*r)->getEnd() << endl;
125 | 	}
126 | }
127 | 
128 | const vector<ILocation*> * LocationList::getList() { // Read-only access to the internal vector; the pointer changes whenever mergeWithAnotherList() replaces the vector
129 | 	return regionList;
130 | }
131 | 
132 | void LocationList::convertToRedFormat() { // Shortens every region by one position at its end (see trim)
133 | 	trim(1);
134 | }
135 | 
136 | void LocationList::trim(int x) {
137 | 	// Walk backwards so erasing an entry never shifts the ones still to visit.
138 | 	int pos = regionList->size();
139 | 	while (--pos >= 0) {
140 | 		ILocation * loc = regionList->at(pos);
141 | 		int shortenedEnd = loc->getEnd() - x;
142 | 		if (shortenedEnd >= 0 && loc->getStart() <= shortenedEnd) {
143 | 			loc->setEnd(shortenedEnd);
144 | 		} else {
145 | 			// Nothing is left of the region: remove and free it.
146 | 			regionList->erase(regionList->begin() + pos);
147 | 			delete loc;
148 | 		}
149 | 	}
150 | 
151 | }
152 | 
153 | /* namespace nonltr */
154 | 


--------------------------------------------------------------------------------
/src/nonltr/LocationList.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * LocationList.h
 3 |  *
 4 |  *  Created on: Feb 19, 2015
 5 |  *      Author: Hani Z. Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef SRC_NONLTR_LOCATIONLIST_H_
 9 | #define SRC_NONLTR_LOCATIONLIST_H_
10 | 
11 | #include <vector>
12 | #include "../utility/Util.h"
13 | #include "../utility/ILocation.h"
14 | #include "../utility/Location.h"
15 | #include "../exception/InvalidStateException.h"
16 | 
17 | using namespace std;
18 | using namespace utility;
19 | using namespace exception;
20 | 
21 | namespace nonltr {
22 | 
23 | class LocationList { // The regions of one chromosome, kept as a vector of ILocation pointers
24 | private:
25 | 	string chromName;
26 | 	vector<ILocation *> * regionList; // owned: the elements and the vector are released in the destructor
27 | 	void merge(); // fuses each region with the previous one when the two overlap
28 | 
29 | public:
30 | 	LocationList(string); // (chromosome name)
31 | 	virtual ~LocationList();
32 | 
33 | 	void add(int, int); // appends the region (start, end)
34 | 
35 | 	/**
36 | 	 * Take a sorted list; throws InvalidStateException if it is not sorted by start
37 | 	 */
38 | 	void mergeWithAnotherList(const vector<ILocation *> * const);
39 | 
40 | 
41 | 	/**
42 | 	 * Print locations
43 | 	 */
44 | 	void print();
45 | 
46 | 	const vector<ILocation*> * getList();
47 | 	void convertToRedFormat(); // same as trim(1)
48 | 	void trim(int ); // moves every end back by the given amount; regions left empty are removed
49 | };
50 | 
51 | } /* namespace nonltr */
52 | 
53 | #endif /* SRC_NONLTR_LOCATIONLIST_H_ */
54 | 


--------------------------------------------------------------------------------
/src/nonltr/LocationListCollection.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * LocationListCollection.cpp
  3 |  *
  4 |  *  Created on: Feb 19, 2015
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | 
  8 | #include "LocationListCollection.h"
  9 | 
 10 | namespace nonltr {
 11 | 
 12 | LocationListCollection::LocationListCollection(string fileNameIn) { // Reads all regions of the given file right away
 13 | 	fileName = fileNameIn;
 14 | 	collection = new map<string, LocationList *>(); // chromosome name -> its list of regions
 15 | 	readCoordinates();
 16 | }
 17 | 
 18 | LocationListCollection::~LocationListCollection() { // Frees every LocationList created in readCoordinates(), then the map itself
 19 | 	for (map<string, LocationList *>::iterator it = collection->begin(); it != collection->end(); ++it) { delete it->second; }
 20 | 	delete collection;
 21 | }
 22 | 
 23 | void LocationListCollection::readCoordinates() { // Parses lines of the form "chrom:start-end" into one LocationList per chromosome
 24 | 	Util::checkFile(fileName);
 25 | 
 26 | 	ifstream in(fileName.c_str());
 27 | 	LocationList * locList = 0; // list of the chromosome currently being read
 28 | 	string previousChromName("");
 29 | 	string line;
 30 | 
 31 | 	while (getline(in, line)) { // stops at end of file or on a read error
 32 | 		if (line.empty()) {
 33 | 			continue; // empty lines carry no region
 34 | 		}
 35 | 
 36 | 		// The last ':' and the last '-' delimit name, start and end. NOTE(review): lines lacking them are not rejected.
 37 | 		int colIndex = line.find_last_of(':');
 38 | 		int dashIndex = line.find_last_of('-');
 39 | 		string chromName = line.substr(0, colIndex);
 40 | 
 41 | 		if (locList == 0 || previousChromName.compare(chromName) != 0) {
 42 | 			cout << "Processing regions of " << chromName << endl;
 43 | 
 44 | 			// Reuse the list of a chromosome seen earlier in the file; inserting
 45 | 			// a second list under the same key would fail and orphan its regions.
 46 | 			map<string, LocationList *>::iterator known = collection->find(chromName);
 47 | 			if (known != collection->end()) {
 48 | 				locList = known->second;
 49 | 			} else {
 50 | 				locList = new LocationList(chromName);
 51 | 				collection->insert(map<string, LocationList *>::value_type(chromName, locList));
 52 | 			}
 53 | 			previousChromName = chromName;
 54 | 		}
 55 | 
 56 | 		int start = atoi(line.substr(colIndex + 1, dashIndex - colIndex - 1).c_str());
 57 | 		int end = atoi(line.substr(dashIndex + 1).c_str());
 58 | 		locList->add(start, end);
 59 | 	}
 60 | 	in.close();
 61 | }
 62 | 
 63 | void LocationListCollection::print() {
 64 | 	// Prints every list, visiting the map in its iteration order.
 65 | 	typedef map<string, LocationList *>::iterator EntryItr;
 66 | 	const EntryItr last = collection->end();
 67 | 	for (EntryItr entry = collection->begin(); entry != last; ++entry) {
 68 | 		entry->second->print();
 69 | 	}
 70 | }
 71 | 
 72 | LocationList * const LocationListCollection::getLocationList(string header) {
 73 | 	// Returns the list stored under header; throws InvalidStateException if there is none.
 74 | 	map<string, LocationList *>::iterator hit = collection->find(header);
 75 | 	if (hit == collection->end()) {
 76 | 		string notFound("Regions of ");
 77 | 		notFound.append(header).append(" cannot be found.\n");
 78 | 		throw InvalidStateException(notFound);
 79 | 	}
 80 | 	return hit->second;
 81 | }
 82 | 
 83 | void LocationListCollection::convertToRedFormat() {
 84 | 	// Forwards the conversion to every list in the collection.
 85 | 	typedef map<string, LocationList *>::iterator EntryItr;
 86 | 	const EntryItr last = collection->end();
 87 | 	for (EntryItr entry = collection->begin(); entry != last; ++entry) {
 88 | 		entry->second->convertToRedFormat();
 89 | 	}
 90 | }
 91 | 
 92 | void LocationListCollection::trim(int x) {
 93 | 	// Trims every list in the collection by x.
 94 | 	typedef map<string, LocationList *>::iterator EntryItr;
 95 | 	const EntryItr last = collection->end();
 96 | 	for (EntryItr entry = collection->begin(); entry != last; ++entry) {
 97 | 		entry->second->trim(x);
 98 | 	}
 99 | }
100 | 
101 | } /* namespace nonltr */
102 | 


--------------------------------------------------------------------------------
/src/nonltr/LocationListCollection.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * LocationListCollection.h
 3 |  *
 4 |  *  Created on: Feb 19, 2015
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef SRC_NONLTR_LOCATIONLISTCOLLECTION_H_
 9 | #define SRC_NONLTR_LOCATIONLISTCOLLECTION_H_
10 | 
11 | #include <fstream>
12 | #include <map>
13 | 
14 | #include "LocationList.h"
15 | #include "../utility/Util.h"
16 | #include "../exception/InvalidStateException.h"
17 | 
18 | using namespace std;
19 | using namespace utility;
20 | 
21 | namespace nonltr {
22 | 
23 | class LocationListCollection { // The regions of a coordinate file, grouped into one LocationList per chromosome name
24 | 
25 | private:
26 | 	string fileName; // the file the coordinates are read from
27 | 	map<string, LocationList *> * collection; // chromosome name -> list of its regions
28 | 	void readCoordinates(); // fills collection from fileName; called by the constructor
29 | 
30 | public:
31 | 	LocationListCollection(string); // (file name)
32 | 	virtual ~LocationListCollection();
33 | 	LocationList * const getLocationList(string); // throws InvalidStateException when the name is unknown
34 | 	void print();
35 | 	void convertToRedFormat(); // applied to every list
36 | 	void trim(int ); // applied to every list
37 | };
38 | 
39 | } /* namespace nonltr */
40 | 
41 | #endif /* SRC_NONLTR_LOCATIONLISTCOLLECTION_H_ */
42 | 


--------------------------------------------------------------------------------
/src/nonltr/Scanner.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Scanner.h
 3 |  *
 4 |  *  Created on: Aug 19, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef SCANNER_H_
 9 | #define SCANNER_H_
10 | 
11 | #include <vector>
12 | #include <iostream>
13 | #include <fstream>
14 | 
15 | #include "Chromosome.h"
16 | #include "ChromosomeOneDigit.h"
17 | #include "HMM.h"
18 | #include "ITableView.h"
19 | #include "Scorer.h"
20 | #include "../utility/Util.h"
21 | #include "../utility/ILocation.h"
22 | #include "../utility/Location.h"
23 | #include "../exception/InvalidInputException.h"
24 | #include "../exception/InvalidStateException.h"
25 | #include "../exception/FileDoesNotExistException.h"
26 | #include "../exception/InvalidOperationException.h"
27 | 
28 | using namespace std;
29 | using namespace utility;
30 | using namespace exception;
31 | 
32 | namespace nonltr {
33 | 
34 | class Scanner { // NOTE(review): judging by its members, scores a chromosome and decodes regions with an HMM -- confirm in Scanner.cpp
35 | private:
36 | 	//string chromFile;
37 | 	ChromosomeOneDigit * chrom;
38 | 	const vector<vector<int> *> * segmentList;
39 | 	Scorer * scorer;
40 | 	vector<int> * scoreList;
41 | 	vector<ILocation *> * regionList; // exposed read-only through getRegionList()
42 | 	int k;
43 | 	HMM * hmm;
44 | 	// bool isTrainMode;
45 | 
46 | 	// Methods
47 | 	void start();
48 | 	void check();
49 | 	void decode();
50 | 	void extendByK();
51 | 	int extendByKHelper(int, int, int);
52 | 	void merge();
53 | 
54 | public:
55 | 	static const int FRMT_POS = 1; // NOTE(review): presumably an output-format selector for printIndex -- confirm
56 | 	static const int FRMT_BED = 2; // NOTE(review): presumably an output-format selector for printIndex -- confirm
57 | 
58 | 	Scanner(HMM *, int, ChromosomeOneDigit *, string);
59 | 	Scanner(HMM *, int, ChromosomeOneDigit *, ITableView<unsigned long, int> *);
60 | 	virtual ~Scanner();
61 | 	void makeForwardCoordinates();
62 | 
63 | 	void printScores(string, bool);
64 | 	void printIndex(string, bool, int);
65 | 	void printMasked(string, Chromosome&, bool);
66 | 	void mergeWithOtherRegions(const vector<ILocation *> *);
67 | 	const vector<ILocation*>* getRegionList();
68 | };
69 | 
70 | } /* namespace nonltr */
71 | #endif /* SCANNER_H_ */
72 | 


--------------------------------------------------------------------------------
/src/nonltr/Scorer.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Scorer.cpp
  3 |  *
  4 |  *  Created on: Aug 3, 2012
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | #include "Scorer.h"
  8 | 
  9 | Scorer::Scorer(ChromosomeOneDigit * chromIn, // Scores the whole chromosome against the k-mer table at construction
 10 | 		ITableView<unsigned long, int> * const table) {
 11 | 	chrom = chromIn;
 12 | 	kmerTable = table;
 13 | 	scores = new vector<int>(chrom->getBase()->size(), 0); // one score per base, all starting at 0
 14 | 	k = kmerTable->getK();
 15 | 	max = -1;
 16 | 	score(); // needs chrom, kmerTable, scores and k, all set above
 17 | 	calculateMax();
 18 | }
 19 | 
 20 | Scorer::~Scorer() { // Releases only the score vector; chrom and kmerTable are not deleted here
 21 | 	scores->clear();
 22 | 	delete scores;
 23 | }
 24 | 
 25 | /**
 26 |  * This method scores each nucleotide inside the segments of the chromosome.
 27 |  * Positions outside the segments keep the initial score of zero.
 28 |  */
 29 | void Scorer::score() {
 30 | 	const vector<vector<int> *> * segment = chrom->getSegment();
 31 | 	const char * segBases = chrom->getBase()->c_str();
 32 | 
 33 | 	for (int s = 0; s < segment->size(); s++) {
 34 | 		int start = segment->at(s)->at(0); // segment bounds: element 0 is the start, element 1 the end
 35 | 		int end = segment->at(s)->at(1);
 36 | 		kmerTable->wholesaleValueOf(segBases, start, end - k + 1, scores, // NOTE(review): a segment shorter than k makes end - k + 1 < start -- confirm this cannot happen
 37 | 				start);
 38 | 
 39 | 		// The last k - 1 positions (end - k + 2 till the end, inclusive) copy the score just before them.
 40 | 		for (int i = end - k + 2; i <= end; i++) {
 41 | 			(*scores)[i] = scores->at(i - 1);
 42 | 		}
 43 | 	}
 44 | }
 45 | 
 46 | /**
 47 |  * This method takes the logarithm of the scores according to the base.
 48 |  * If the score equals zero, it is left the same.
 49 |  */
 50 | void Scorer::takeLog(double base) {
 51 | 	// Handle the case where base is one
 52 | 	bool isOne = false;
 53 | 	if (fabs(base - 1.0) < std::numeric_limits<double>::epsilon()) {
 54 | 		isOne = true;
 55 | 	}
 56 | 	double logBase = isOne ? log(1.5) : log(base);
 57 | 
 58 | 	const vector<vector<int> *> * segment = chrom->getSegment();
 59 | 	for (int s = 0; s < segment->size(); s++) {
 60 | 		int start = segment->at(s)->at(0);
 61 | 		int end = segment->at(s)->at(1);
 62 | 		for (int h = start; h <= end; h++) {
 63 | 			int score = scores->at(h);
 64 | 
 65 | 			if (score != 0) {
 66 | 				if (!isOne || (isOne && score > 1)) {
 67 | 					(*scores)[h] = ceil(log(score) / logBase);
 68 | 				}
 69 | 			}
 70 | 		}
 71 | 	}
 72 | }
 73 | 
 74 | int Scorer::getK() {
 75 | 	return k;
 76 | }
 77 | 
 78 | vector<int>* Scorer::getScores() {
 79 | 	return scores;
 80 | }
 81 | 
 82 | void Scorer::printScores(string outputFile, bool canAppend) {
 83 | 	ofstream outScores;
 84 | 	if (canAppend) {
 85 | 		outScores.open(outputFile.c_str(), ios::out | ios::app);
 86 | 	} else {
 87 | 		outScores.open(outputFile.c_str(), ios::out);
 88 | 	}
 89 | 
 90 | 	int step = 50;
 91 | 	outScores << chrom->getHeader() << endl;
 92 | 	int len = scores->size();
 93 | 	for (int i = 0; i < len; i = i + step) {
 94 | 		int e = (i + step - 1 > len - 1) ? len - 1 : i + step - 1;
 95 | 		for (int k = i; k <= e; k++) {
 96 | 			outScores << scores->at(k) << " ";
 97 | 		}
 98 | 		outScores << endl;
 99 | 	}
100 | 	outScores << endl;
101 | 
102 | 	outScores.close();
103 | }
104 | 
105 | int Scorer::countLessOrEqual(int thr) {
106 | 	int count = 0;
107 | 	const vector<vector<int> *> * segment = chrom->getSegment();
108 | 	for (int s = 0; s < segment->size(); s++) {
109 | 		int start = segment->at(s)->at(0);
110 | 		int end = segment->at(s)->at(1);
111 | 		for (int h = start; h <= end; h++) {
112 | 			if (scores->at(h) <= thr) {
113 | 				count++;
114 | 			}
115 | 		}
116 | 	}
117 | 	return count;
118 | }
119 | 
120 | void Scorer::calculateMax() {
121 | 	const vector<vector<int> *> * segmentList = chrom->getSegment();
122 | 	int segmentCount = segmentList->size();
123 | 	for (int jj = 0; jj < segmentCount; jj++) {
124 | 		vector<int> * segment = segmentList->at(jj);
125 | 		int start = segment->at(0);
126 | 		int end = segment->at(1);
127 | 		for (int ss = start; ss <= end; ss++) {
128 | 			int score = scores->at(ss);
129 | 			if (score > max) {
130 | 				max = score;
131 | 			}
132 | 		}
133 | 	}
134 | 
135 | 	if (max == -1) {
136 | 		string msg("Error occurred while finding the maximum score.");
137 | 		throw InvalidStateException(msg);
138 | 	}
139 | }
140 | 
141 | int Scorer::getMax() {
142 | 	return max;
143 | }
144 | 


--------------------------------------------------------------------------------
/src/nonltr/Scorer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Scorer.h
 3 |  *
 4 |  *  Created on: Aug 3, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef SCORER_H_
 9 | #define SCORER_H_
10 | 
11 | #include <vector>
12 | #include <fstream>
13 | #include <iostream>
14 | #include <math.h>
15 | #include <limits>
16 | 
17 | #include "ITableView.h"
18 | #include "ChromosomeOneDigit.h"
19 | #include "../utility/Util.h"
20 | #include "../exception/InvalidStateException.h"
21 | 
22 | using namespace std;
23 | using namespace nonltr;
24 | using namespace utility;
25 | using namespace exception;
26 | 
namespace nonltr {
/**
 * Assigns a score, read from a k-mer table, to the positions of a
 * chromosome and offers a few operations on the resulting score list.
 *
 * NOTE(review): the class deletes 'scores' in its destructor but declares
 * no copy constructor or assignment operator, so copying a Scorer would
 * presumably lead to a double delete -- confirm that it is never copied.
 */
class Scorer {
private:
	/* Fields */
	// The chromosome being scored; not deleted by this class.
	ChromosomeOneDigit * chrom;
	// The table the scores are read from; not deleted by this class.
	ITableView<unsigned long, int> * kmerTable;
	// One score per base of the chromosome; allocated in the constructor
	// and deleted in the destructor.
	vector<int> * scores;
	// The k reported by the k-mer table.
	int k;
	// The largest score inside the segments; -1 until calculateMax() runs.
	int max;

	/* Methods */
	// Fills 'scores' segment by segment.
	void score();
	// Sets 'max'; throws InvalidStateException if no maximum was found.
	void calculateMax();

public:
	/* Methods */
	// Scores the chromosome with the table and determines the maximum.
	Scorer(ChromosomeOneDigit *, ITableView<unsigned long, int> *);
	virtual ~Scorer();
	// Writes the header and the scores to a file (name, append flag).
	void printScores(string, bool);
	// Returns the internal score vector, not a copy.
	vector<int>* getScores();
	int getK();
	// Replaces the non-zero scores by the ceiling of their logarithm to
	// the given base.
	void takeLog(double);
	// Counts the segment positions whose score is <= the argument.
	int countLessOrEqual(int);
	int getMax();
};
}
53 | 
#endif /* SCORER_H_ */
55 | 


--------------------------------------------------------------------------------
/src/nonltr/TableBuilder.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * TableBuilder.cpp
  3 |  *
  4 |  *  Created on: Jul 31, 2012
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | 
  8 | #include "TableBuilder.h"
  9 | 
 10 | TableBuilder::TableBuilder(string dir, int motifSize, int order, int minObs) {
 11 | 	genomeDir = dir;
 12 | 	k = motifSize;
 13 | 	genomeLength = 0;
 14 | 	// kmerTable = new KmerHashTable(k);
 15 | 	// kmerTable = new EnrichmentView(k);
 16 | 
 17 | 	// Whenever you change the template, modify line 50 and 70 and the header file line 35
 18 | 	kmerTable = new EnrichmentMarkovView<unsigned long, int>(k, order, minObs);
 19 | 
 20 | 	buildTable();
 21 | }
 22 | 
 23 | TableBuilder::~TableBuilder() {
 24 | 	delete kmerTable;
 25 | }
 26 | 
 27 | void TableBuilder::buildTable() {
 28 | 	vector<string> * fileList = new vector<string>();
 29 | 	Util::readChromList(genomeDir, fileList, "fa");
 30 | 
 31 | 	for (int i = 0; i < fileList->size(); i++) {
 32 | 		cout << "Counting k-mers in " << fileList->at(i) << " ..." << endl;
 33 | 		ChromListMaker * maker = new ChromListMaker(fileList->at(i));
 34 | 		const vector<Chromosome *> * chromList = maker->makeChromOneDigitDnaList();
 35 | 
 36 | 		for (int h = 0; h < chromList->size(); h++) {
 37 | 			ChromosomeOneDigit * chrom =
 38 | 					dynamic_cast<ChromosomeOneDigit *>(chromList->at(h));
 39 | 			if (chrom) {
 40 | 				genomeLength += chrom->getEffectiveSize();
 41 | 				updateTable(chrom);
 42 | 			} else {
 43 | 				throw InvalidStateException(string("Dynamic cast failed."));
 44 | 			}
 45 | 		}
 46 | 
 47 | 		delete maker;
 48 | 	}
 49 | 	// Check if overflow has occurred
 50 | 	kmerTable->checkOverflow();
 51 | 
 52 | 	// View
 53 | 	// EnrichmentView * view = dynamic_cast<EnrichmentView *>(kmerTable);
 54 | 	EnrichmentMarkovView<unsigned long, int> * view =
 55 | 			dynamic_cast<EnrichmentMarkovView<unsigned long, int> *>(kmerTable);
 56 | 
 57 | 	if (view) {
 58 | 		view->generateProbapilities();
 59 | 		view->processTable();
 60 | 		maxValue = view->getMaxValue();
 61 | 	} else {
 62 | 		throw InvalidStateException(string("Dynamic cast failed."));
 63 | 	}
 64 | 	cout << "Enrichment view is ready." << endl;
 65 | 
 66 | 	fileList->clear();
 67 | 	delete fileList;
 68 | 
 69 | 	/* If you would like to see the contents of the table.*/
 70 | 	// kmerTable-> printTable();
 71 | }
 72 | 
 73 | void TableBuilder::updateTable(ChromosomeOneDigit * chrom) {
 74 | 	// EnrichmentView * view = dynamic_cast<EnrichmentView *>(kmerTable);
 75 | 	EnrichmentMarkovView<unsigned long, int> * view =
 76 | 			dynamic_cast<EnrichmentMarkovView<unsigned long, int> *>(kmerTable);
 77 | 
 78 | 	const vector<vector<int> *> * segment = chrom->getSegment();
 79 | 	const char * segBases = chrom->getBase()->c_str();
 80 | 
 81 | 	for (int s = 0; s < segment->size(); s++) {
 82 | 		int start = segment->at(s)->at(0);
 83 | 		int end = segment->at(s)->at(1);
 84 | 		// cerr << "The segment length is: " << (end-start+1) << endl;
 85 | 
 86 | 		// Fast, but require some memory proportional to the segment length.
 87 | 		kmerTable->wholesaleIncrement(segBases, start, end - k + 1);
 88 | 		if (view) {
 89 | 			view->count(segBases, start, end);
 90 | 		} else {
 91 | 			throw InvalidStateException(string("Dynamic cast failed."));
 92 | 		}
 93 | 
 94 | 		// Slow, but memory efficient
 95 | 		/*
 96 | 		 vector<int> hashList = vector<int>();
 97 | 		 kmerTable->hash(segBases, start, end - k + 1, &hashList);
 98 | 
 99 | 		 for (int i = start; i <= end - k + 1; i++) {
100 | 		 kmerTable->increment(segBases, i);
101 | 		 }
102 | 		 */
103 | 	}
104 | }
105 | 
106 | KmerHashTable<unsigned long, int> * const TableBuilder::getKmerTable() {
107 | 	return kmerTable;
108 | }
109 | 
110 | long TableBuilder::getGenomeLength() {
111 | 	if (genomeLength < 0) {
112 | 		string msg("The length of the genome cannot be negative.");
113 | 		throw InvalidStateException(msg);
114 | 	}
115 | 
116 | 	return genomeLength;
117 | }
118 | 
119 | int TableBuilder::getMaxValue() {
120 | 	return maxValue;
121 | }
122 | 


--------------------------------------------------------------------------------
/src/nonltr/TableBuilder.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * TableBuilder.h
 3 |  *
 4 |  *  Created on: Jul 31, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD - NCBI/NLM/NIH
 6 |  */
 7 | 
 8 | #ifndef TABLEBUILDER_H_
 9 | #define TABLEBUILDER_H_
10 | 
11 | #include "KmerHashTable.h"
12 | #include "EnrichmentMarkovView.h"
13 | #include "ChromosomeOneDigit.h"
14 | #include "ChromListMaker.h"
15 | #include "IChromosome.h"
16 | 
17 | #include "../utility/Util.h"
18 | #include "../exception/InvalidStateException.h"
19 | 
20 | #include <iostream>
21 | 
22 | using namespace std;
23 | using namespace nonltr;
24 | using namespace utility;
25 | using namespace exception;
26 | 
namespace nonltr {
/**
 * Builds a k-mer table from all the FASTA files of a genome directory.
 * The table is built in the constructor and deleted in the destructor.
 */
class TableBuilder {
private:
	/**
	 * k-mer table; created in the constructor as an
	 * EnrichmentMarkovView<unsigned long, int> and owned by this object.
	 */
	KmerHashTable<unsigned long,int> * kmerTable;

	/**
	 * The maximum value reported by the enrichment view; set at the end
	 * of buildTable().
	 */
	int maxValue;

	/**
	 * Directory including the FASTA files comprising the genome.
	 * NOTE(review): the original sentence "These files must have the" was
	 * cut off; buildTable() lists the files with the "fa" extension, which
	 * is presumably what it referred to.
	 */
	string genomeDir;

	/**
	 * The size of the motif
	 */
	int k;

	/**
	 * The total length of the whole genome; the sum of the effective
	 * sizes of the chromosomes.
	 */
	long genomeLength;

	/**
	 * Methods
	 */
	// Reads every file of the genome and fills the table.
	void buildTable();
	// Adds the segments of one chromosome to the table.
	void updateTable(ChromosomeOneDigit *);

public:
	// Arguments: genome directory, k, order, minimum observations. The
	// last two are forwarded to the EnrichmentMarkovView constructor.
	TableBuilder(string, int, int, int);
	virtual ~TableBuilder();
	// Returns the table owned by this builder.
	KmerHashTable<unsigned long,int> * const getKmerTable();
	// NOTE(review): no definition of this method appears in
	// TableBuilder.cpp -- confirm it is defined elsewhere or unused.
	void printTable();
	// Throws InvalidStateException if the length is negative.
	long getGenomeLength();
	int getMaxValue();
};
}
67 | 
68 | #endif /* TABLEBUILDER_H_ */
69 | 


--------------------------------------------------------------------------------
/src/nonltr/Trainer.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Trainer.cpp
  3 |  *
  4 |  *  Created on: Aug 20, 2013
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | 
  8 | #include "Trainer.h"
  9 | 
 10 | namespace nonltr {
 11 | 
 12 | // Pass the isCND and the isCON parameters
 13 | 
 14 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn,
 15 | 		double tIn, string candidateDirIn, int m) : minObs(m) {
 16 | 	candidateDir = candidateDirIn;
 17 | 	canPrintCandidates = true;
 18 | 	isCND = true;
 19 | 	isCON = false;
 20 | 	initialize(genomeDirIn, orderIn, kIn, sIn, tIn);
 21 | }
 22 | 
 23 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn,
 24 | 		double tIn, string candidateDirIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) {
 25 | 	candidateDir = candidateDirIn;
 26 | 	canPrintCandidates = true;
 27 | 	isCND = isCNDIn;
 28 | 	isCON = true;
 29 | 	otherDir = otherDirIn;
 30 | 	initialize(genomeDirIn, orderIn, kIn, sIn, tIn);
 31 | }
 32 | 
 33 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn,
 34 | 		double tIn, int m) : minObs(m) {
 35 | 	canPrintCandidates = false;
 36 | 	isCND = true;
 37 | 	isCON = false;
 38 | 	initialize(genomeDirIn, orderIn, kIn, sIn, tIn);
 39 | }
 40 | 
 41 | Trainer::Trainer(string genomeDirIn, int orderIn, int kIn, double sIn,
 42 | 		double tIn, bool isCNDIn, string otherDirIn, int m) : minObs(m) {
 43 | 	canPrintCandidates = false;
 44 | 	isCND = isCNDIn;
 45 | 	isCON = true;
 46 | 	otherDir = otherDirIn;
 47 | 	initialize(genomeDirIn, orderIn, kIn, sIn, tIn);
 48 | }
 49 | 
 50 | void Trainer::initialize(string genomeDirIn, int orderIn, int kIn, double sIn,
 51 | 		double tIn) {
 52 | 
 53 | 	if (isCND == false && isCON == false) {
 54 | 		string msg("Training using the candidates or the other repeats is required. ");
 55 | 		msg.append("Please specify which regions to be used for training. ");
 56 | 		msg.append("Any of the two sets or a combination of both can be used.");
 57 | 		throw InvalidStateException(msg);
 58 | 	}
 59 | 
 60 | 	genomeDir = genomeDirIn;
 61 | 	fileList = new vector<string>();
 62 | 	Util::readChromList(genomeDir, fileList, string("fa"));
 63 | 	chromCount = fileList->size();
 64 | 	order = orderIn;
 65 | 	k = kIn;
 66 | 	s = sIn;
 67 | 	t = tIn;
 68 | 	p = 0.0;
 69 | 	tDetector = tIn + 0.1;
 70 | 	max = -1;
 71 | 
 72 | 	stage1();
 73 | 
 74 | 	if (isCND) {
 75 | 		stage2();
 76 | 	}
 77 | 	stage3();
 78 | }
 79 | 
 80 | Trainer::~Trainer() {
 81 | 	fileList->clear();
 82 | 	delete fileList;
 83 | 	delete builder;
 84 | 	delete hmm;
 85 | }
 86 | 
 87 | /**
 88 |  * Stage 1: Building the table
 89 |  */
 90 | void Trainer::stage1() {
 91 | 	cout << endl << endl;
 92 | 	cout << "Stage 1: Building the table ..." << endl;
 93 | 	builder = new TableBuilder(genomeDir, k, order, minObs);
 94 | 	table = builder->getKmerTable();
 95 | 	genomeLength = builder->getGenomeLength();
 96 | 	max = builder->getMaxValue();
 97 | }
 98 | 
 99 | void Trainer::stage2() {
100 | 	cout << endl << endl;
101 | 	cout << "Stage 2: Calculating the percentage ..." << endl;
102 | 
103 | 	double effectiveSize = 0.0;
104 | 	double countLessOrEqual = 0.0;
105 | 	for (int i = 0; i < chromCount; i++) {
106 | 		cout << "Calculating the percentage in: " << fileList->at(i) << " ...";
107 | 		cout << endl;
108 | 		ChromListMaker * maker = new ChromListMaker(fileList->at(i));
109 | 		const vector<Chromosome *> * chromList = maker->makeChromOneDigitDnaList();
110 | 
111 | 		for (int h = 0; h < chromList->size(); h++) {
112 | 			ChromosomeOneDigit * chrom =
113 | 					dynamic_cast<ChromosomeOneDigit *>(chromList->at(h));
114 | 			Scorer * scorer = new Scorer(chrom, table);
115 | 
116 | 			effectiveSize += chrom->getEffectiveSize();
117 | 			countLessOrEqual += scorer->countLessOrEqual(t);
118 | 
119 | 			delete scorer;
120 | 		}
121 | 		delete maker;
122 | 	}
123 | 
124 | 	if (effectiveSize == 0) {
125 | 		string msg("The size of the genome cannot be zero.");
126 | 		throw InvalidStateException(msg);
127 | 	} else {
128 | 		p = 100.00 * countLessOrEqual / effectiveSize;
129 | 		cout << "The percentage is " << p << endl;
130 | 		if (p < 52.5) {
131 | 			p = 52.5;
132 | 			cout << "The percentage is increased to " << p << endl;
133 | 		}
134 | 	}
135 | }
136 | 
/**
 * Stage 3: Training
 *
 * Creates the HMM and trains it on every sequence of every genome file.
 * The training regions of a sequence are the detected candidates
 * (isCND), the regions read from the matching ".rpt" file in otherDir
 * (isCON), or the merge of both. The HMM is normalized once at the end.
 *
 * NOTE(review): the results of the dynamic_cast calls are not checked
 * here, unlike in TableBuilder::buildTable().
 * NOTE(review): nothing allocated inside the loops is released if an
 * exception is thrown part-way through.
 */
void Trainer::stage3() {
	cout << endl << endl;
	cout << "Stage 3: Training ..." << endl;

	// Handle the case when the threshold is one.
	// One cannot be the base of a logarithm, so 1.5 is used instead.
	bool isOne = false;
	if (fabs(t - 1.0) < std::numeric_limits<double>::epsilon()) {
		isOne = true;
	}
	double hmmBase = isOne ? 1.5 : t;

	// Make a list of candidate HMM
	// The count is derived from the logarithm of the maximum score
	// obtained in stage 1.
	int stateCount = 2 * (ceil(log(max) / log(hmmBase)) + 1);

	// Initialize the HMM
	hmm = new HMM(hmmBase, stateCount);

	// Start training the models
	for (int i = 0; i < chromCount; i++) {
		cout << "Training on: " << fileList->at(i) << endl;
		// Name of candidates file
		// The nick name is the file name without directory and extension.
		string path(fileList->at(i));
		int slashLastIndex = path.find_last_of(Util::fileSeparator);
		int dotLastIndex = path.find_last_of(".");
		string nickName = path.substr(slashLastIndex + 1, dotLastIndex - slashLastIndex - 1);

		// May or may not be used
		string cndFile = candidateDir + Util::fileSeparator + nickName + ".cnd";

		// Work on the other repeats if desired
		// The pointer is only assigned, used and deleted when
		// isConRepAvailable is true.
		LocationListCollection * otherRegionListCollection;
		bool isConRepAvailable = false;
		if (isCON) {
			string otherFile = otherDir + Util::fileSeparator + nickName + ".rpt";
			// Opening the file is only a test for its existence.
			ifstream f1(otherFile.c_str());
			if (!f1) {
				string message = string("Warning: ");
				message.append(otherFile);
				message.append(" does not exist. ");
				message.append("Repeats of this sequence will not used for training the HMM.");
				cout << message << endl;
			} else {
				otherRegionListCollection = new LocationListCollection(otherFile);
				otherRegionListCollection->convertToRedFormat();
				otherRegionListCollection->trim(k - 1);

				isConRepAvailable = true;
			}
			f1.close();
		}

		// Read sequences in the file
		ChromListMaker * maker = new ChromListMaker(fileList->at(i));
		const vector<Chromosome *> * chromList = maker->makeChromOneDigitDnaList();

		for (int h = 0; h < chromList->size(); h++) {
			ChromosomeOneDigit * chrom = dynamic_cast<ChromosomeOneDigit *>(chromList->at(h));
			Scorer * scorer = new Scorer(chrom, table);
			vector<int> * scoreList = scorer->getScores();

			// Detect candidates if desired
			// 'detector' is only assigned when isCND is set;
			// 'trainingRegionList' is only assigned when training happens.
			ChromDetectorMaxima * detector;
			const vector<ILocation*> * trainingRegionList;
			bool canDeleteDetector = true;
			if (isCND) {
				if (canPrintCandidates) {
					detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p,s, scoreList, chrom);
					// Only the first sequence of a file starts the
					// candidate file; the later ones are appended.
					if (h > 0) {
						bool canAppend = true;
						detector->printIndex(cndFile, canAppend);
					} else {
						cout << "Printing candidates to: " << cndFile << endl;
						detector->printIndex(cndFile);
					}
				} else {
					detector = new ChromDetectorMaxima(s, 10, 0, tDetector, p, s, scoreList, chrom->getSegment());
				}
				trainingRegionList = detector->getRegionList();


			}

			// When the other repeats are available they replace the
			// training list, after the candidates were merged into them.
			if (isCON && isConRepAvailable) {
				LocationList * const locList = otherRegionListCollection->getLocationList(chrom->getHeader());
				if (isCND) {
					locList->mergeWithAnotherList(detector->getRegionList());
				}
				trainingRegionList = locList->getList();

			}

			// The candidate regions are already copied to the location list
			if (isCND && isCON && isConRepAvailable) {
				delete detector;
				canDeleteDetector = false;
			}

			// Train the HMM
			if(isCND || (isCON && isConRepAvailable)){

				// The HMM is trained on the logarithm of the scores.
				scorer->takeLog(t);
				scoreList = scorer->getScores();
				hmm->train(scoreList, chrom->getSegment(), trainingRegionList);
			}

			// Free more memory
			// The detector is deleted here unless it was deleted above.
			if (isCND && canDeleteDetector) {
				delete detector;
			}
			delete scorer;
		}

		if (isCON && isConRepAvailable) {
			delete otherRegionListCollection;
		}
		delete maker;
	}

	// Normalize HMM's once training is finished
	hmm->normalize();
}
261 | 
262 | void Trainer::printTable(string fileName) {
263 | 	table->printTable(fileName);
264 | }
265 | 
266 | HMM*& Trainer::getHmm() {
267 | 	return hmm;
268 | }
269 | 
270 | KmerHashTable<unsigned long, int> * Trainer::getTable() {
271 | 	return table;
272 | }
273 | 
274 | void Trainer::printHmm(string fileName) {
275 | 	hmm->print(fileName);
276 | }
277 | 
278 | } /* namespace nonltr */
279 | 


--------------------------------------------------------------------------------
/src/nonltr/Trainer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Trainer.h
 3 |  *
 4 |  *  Created on: Aug 20, 2013
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef TRAINER_H_
 9 | #define TRAINER_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | #include <iostream>
14 | #include <limits>
15 | #include <algorithm>
16 | 
17 | #include "TableBuilder.h"
18 | #include "KmerHashTable.h"
19 | #include "HMM.h"
20 | #include "ChromDetectorMaxima.h"
21 | #include "Scorer.h"
22 | #include "ChromListMaker.h"
23 | #include "LocationListCollection.h"
24 | #include "../utility/Util.h"
25 | #include "../exception/InvalidStateException.h"
26 | 
27 | using namespace std;
28 | using namespace utility;
29 | using namespace exception;
30 | 
31 | namespace nonltr {
32 | 
33 | class Trainer {
34 | private:
35 | 	string genomeDir;
36 | 	string candidateDir;
37 | 	string otherDir;
38 | 	bool canPrintCandidates;
39 | 	vector<string> * fileList;
40 | 	int chromCount;
41 | 	int order;
42 | 	int k;
43 | 	int max; // Maximum score in the entire genome
44 | 	double t; // Score threshold
45 | 	double tDetector; // threshold for the detector because it uses < not <=;
46 | 	double p; // Percentage of scores below the threshold, t, in non-repeats
47 | 	//double r;
48 | 	double s; // Half width of the mask
49 | 	long genomeLength;
50 | 	//vector<int> * sampleList;
51 | 	TableBuilder * builder;
52 | 	KmerHashTable<unsigned long, int> * table;
53 | 	HMM * hmm;
54 | 	int isCND;
55 | 	int isCON;
56 | 	// The minimum number of the observed k-mers
57 | 	const int minObs;
58 | 
59 | 	void stage1();
60 | 	void stage2();
61 | 	void stage3();
62 | 	//void stage4();
63 | 
64 | public:
65 | 	Trainer(string, int, int, double, double, string, int);
66 | 	Trainer(string, int, int, double, double, string, bool, string, int);
67 | 	Trainer(string, int, int, double, double, int);
68 | 	Trainer(string, int, int, double, double, bool, string, int);
69 | 
70 | 	void initialize(string, int, int, double, double);
71 | 	virtual ~Trainer();
72 | 	void printTable(string);
73 | 	void printHmm(string);
74 | 	HMM*& getHmm();
75 | 	KmerHashTable<unsigned long, int> * getTable();
76 | 
77 | };
78 | 
79 | } /* namespace nonltr */
80 | #endif /* TRAINER_H_ */
81 | 


--------------------------------------------------------------------------------
/src/predict/BestFirstSelector.h:
--------------------------------------------------------------------------------
 1 | // -*- C++ -*-
 2 | /*
 3 |  * BestFirstSelector.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef BEST_FIRST_SELECTOR_H
 9 | #define BEST_FIRST_SELECTOR_H
10 | #include "FeatureSelector.h"
11 | #include <set>
12 | template<class T>
13 | class BestFirstSelector : public FeatureSelector<T> {
14 | public:
15 | 	BestFirstSelector(vector<std::pair<uint64_t, Combo> > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {}
16 | 	~BestFirstSelector() {}
17 | 
18 | 	pair<Feature<T>*,matrix::GLM> train_regression(Feature<T>* tfeat, const vector<pra<T> > &training,const vector<pra<T> > &testing);
19 | 	pair<Feature<T>*,matrix::GLM> train_class(Feature<T>* tfeat, const vector<pra<T> > &training,const vector<pra<T> > &testing, double id);
20 | 
21 | private:
22 | 	int max_num_feat, min_num_feat;
23 | 	vector<std::pair<uint64_t, Combo> > possible_feats;
24 | };
25 | #endif
26 | 


--------------------------------------------------------------------------------
/src/predict/FeatureSelector.cpp:
--------------------------------------------------------------------------------
  1 | // -*- C++ -*-
  2 | /*
  3 |  * FeatureSelector.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  */
  7 | 
  8 | #include "FeatureSelector.h"
  9 | template<class T>
 10 | std::pair<matrix::Matrix,matrix::Matrix> FeatureSelector<T>::generate_feat_mat(const vector<pra<T> > &data, Feature<T>& feat, double cutoff)
 11 | {
 12 | 	bool classify = (cutoff > 0);
 13 | 	int nrows = data.size();
 14 | 	int ncols = feat.size()+1;
 15 | 	matrix::Matrix feat_mat(nrows, ncols);
 16 | 	matrix::Matrix labels(nrows, 1);
 17 | //	#pragma omp parallel for
 18 | 	for (int row = 0; row < data.size(); row++) {
 19 | 		auto kv = data.at(row);
 20 | 		vector<double> cache;
 21 |  		// #pragma omp critical
 22 | 		// {
 23 | 			cache = feat.compute(*kv.first, *kv.second);
 24 | 		// }
 25 | 		feat_mat.set(row, 0, 1);
 26 | 		if (classify) {
 27 | 			labels.set(row, 0, kv.val >= cutoff ? 1 : -1);
 28 | 		} else {
 29 | 			labels.set(row, 0, kv.val);
 30 | 			//	labels.set(row, 0, (kv.val - smin) / (smax - smin));
 31 | 		}
 32 | 		for (int col = 1; col < ncols; col++) {
 33 | 			double val = feat(col-1, cache);
 34 | 			feat_mat.set(row, col, val);
 35 | 		}
 36 | 	}
 37 | 	return std::make_pair(feat_mat, labels);
 38 | }
 39 | 
 40 | 
 41 | template<class T>
 42 | std::pair<double, matrix::GLM> FeatureSelector<T>::regression_train(const vector<pra<T> > &data, Feature<T>& feat)
 43 | {
 44 | 	auto pr = generate_feat_mat(data, feat, -1);
 45 | 	matrix::GLM glm;
 46 | 	glm.train(pr.first, pr.second);
 47 | 	auto result1 = pr.first * glm.get_weights();
 48 | 	auto diff1 = result1 - pr.second;
 49 | 	double sum = 0;
 50 | 	for (int i = 0; i < diff1.getNumRow(); i++) {
 51 | 		sum += fabs(diff1.get(i, 0));
 52 | 	}
 53 | 	sum /= diff1.getNumRow();
 54 | 	return {sum, glm};
 55 | }
 56 | 
 57 | template<class T>
 58 | std::pair<double, matrix::GLM> FeatureSelector<T>::class_train(const vector<pra<T> > &data, Feature<T>& feat, double cutoff)
 59 | {
 60 | 	auto pr = generate_feat_mat(data, feat, cutoff);
 61 | 	matrix::GLM glm;
 62 | 	glm.train(pr.first, pr.second);
 63 | 	matrix::Matrix p = glm.predict(pr.first);
 64 | 	for (int row = 0; row < p.getNumRow(); row++) {
 65 | 		if (p.get(row, 0) == 0) {
 66 | 			p.set(row, 0, -1);
 67 | 		}
 68 | 	}
 69 | 	auto tup = glm.accuracy(pr.second, p);
 70 | 	double acc = get<0>(tup);
 71 | 	double sens = get<1>(tup);
 72 | 	double spec = get<2>(tup);
 73 | 	return {acc, glm};
 74 | }
 75 | 
 76 | template<class T>
 77 | double FeatureSelector<T>::regression_test(const vector<pra<T> >& data, Feature<T>& feat, const matrix::GLM& glm)
 78 | {
 79 | 	auto pr = generate_feat_mat(data, feat, -1);
 80 | 	auto result1 = pr.first * glm.get_weights();
 81 | 	auto diff1 = result1 - pr.second;
 82 | 	double sum = 0;
 83 | 	for (int i = 0; i < diff1.getNumRow(); i++) {
 84 | 		sum += fabs(diff1.get(i, 0));
 85 | 	}
 86 | 	sum /= diff1.getNumRow();
 87 | 	return sum;
 88 | }
 89 | 
 90 | template<class T>
 91 | tuple<double,double,double> FeatureSelector<T>::class_test(const vector<pra<T> >& data, Feature<T>& feat, const matrix::GLM& glm, double cutoff)
 92 | {
 93 | 	auto pr = generate_feat_mat(data, feat, cutoff);
 94 | 	matrix::Matrix p = glm.predict(pr.first);
 95 | 	for (int row = 0; row < p.getNumRow(); row++) {
 96 | 		if (p.get(row, 0) == 0) {
 97 | 			p.set(row, 0, -1);
 98 | 		}
 99 | 	}
100 | 	auto tup = glm.accuracy(pr.second, p);
101 | 	return tup;
102 | 
103 | }
104 | 
// Explicit instantiations: the member templates above are defined in
// this source file, so the element types that are used have to be
// instantiated here.
template class FeatureSelector<uint8_t>;
template class FeatureSelector<uint16_t>;
template class FeatureSelector<uint32_t>;
template class FeatureSelector<uint64_t>;
template class FeatureSelector<int>;
template class FeatureSelector<double>;
111 | 


--------------------------------------------------------------------------------
/src/predict/FeatureSelector.h:
--------------------------------------------------------------------------------
 1 | // -*- C++ -*-
 2 | /*
 3 |  * FeatureSelector.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef FEATURE_SELECTOR_H
 9 | #define FEATURE_SELECTOR_H
10 | 
11 | #include "GLM.h"
12 | #include "Feature.h"
13 | 
// Base class of the feature selectors. The static helpers turn a data
// set into matrices and train or test a GLM on it; the two pure virtual
// methods are the selection strategies implemented by the subclasses.
template<class T>
class FeatureSelector {
public:
	virtual ~FeatureSelector() {};
	// Returns (feature matrix, labels). A positive cutoff produces 1 / -1
	// labels, any other cutoff keeps the raw values.
	static std::pair<matrix::Matrix,matrix::Matrix> generate_feat_mat(const vector<pra<T> > &data, Feature<T>& feat, double cutoff);
	// Trains a classifier; returns (accuracy on the data, model).
	static std::pair<double, matrix::GLM> class_train(const vector<pra<T> > &data, Feature<T>& feat, double cutoff);
	// Trains a regression; returns (mean absolute error on the data, model).
	static std::pair<double, matrix::GLM> regression_train(const vector<pra<T> > &data, Feature<T>& feat);
	// Returns the mean absolute error of the model on the data.
	static double regression_test(const vector<pra<T> >& data, Feature<T>& feat, const matrix::GLM& glm);
	// Returns the tuple produced by GLM::accuracy for the model on the data.
	static tuple<double,double,double> class_test(const vector<pra<T> >& data, Feature<T>& feat, const matrix::GLM& glm, double cutoff);

	// Selection strategy for regression; implemented by the subclasses.
	virtual pair<Feature<T>*,matrix::GLM> train_regression(Feature<T>* tfeat, const vector<pra<T> > &training,const vector<pra<T> > &testing) = 0;
	// Selection strategy for classification; implemented by the subclasses.
	virtual pair<Feature<T>*,matrix::GLM> train_class(Feature<T>* tfeat, const vector<pra<T> > &training,const vector<pra<T> > &testing, double id) = 0;
};
27 | #endif
28 | 


--------------------------------------------------------------------------------
/src/predict/GLM.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * glm.cpp
 3 |  *
 4 |  * Created on: May 29, 2017
 5 |  * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa
 6 |  *
 7 |  * Modified by Benjamin T James
 8 |  */
 9 | 
10 | #include "GLM.h"
11 | #include "Matrix.h"
12 | 
13 | #include <math.h>
14 | #include <iostream>
15 | using namespace std;
16 | // using namespace matrix;
17 | 
18 | namespace matrix{
19 | 
20 | void GLM::train(Matrix& features, Matrix& labels){
21 | 	weights = features.transpose() * features;
22 | 	weights = weights.pseudoInverse() * features.transpose() * labels;
23 | }
24 | 
25 | 
26 | double GLM::logistic(double x)
27 | {
28 | 	return 1.0 / (1 + exp(-x));
29 | }
30 | Matrix GLM::predict(Matrix& features) const {
31 | 	Matrix labels;
32 | 	labels	= features * weights;
33 | 	double log;
34 | 	for(int i = 0; i < labels.getNumRow(); i++){
35 | 		//log = round(1/(1 + exp(-(labels.get(i,0)))) + 0.1);
36 | 		labels.set(i,0, round(logistic(labels.get(i, 0))));
37 | 	}
38 | 	return labels;
39 | }
40 | 
41 | std::tuple<double,double,double> GLM::accuracy(Matrix& oLabels, Matrix& pLabels) const {
42 | 	int sum = 0;
43 | 	int negSum = 0;
44 | 	int negSame = 0;
45 | 	int posSum = 0;
46 | 	int posSame = 0;
47 | 	for(int i = 0; i < oLabels.getNumRow(); i++){
48 | 		if(oLabels.get(i,0) == -1){
49 | 			negSum++;
50 | 			if(oLabels.get(i,0) == pLabels.get(i, 0)){
51 | 				sum++;
52 | 				negSame++;
53 | 			}
54 | 		}else{
55 | 			posSum++;
56 | 			if(oLabels.get(i,0) == pLabels.get(i, 0)){
57 | 				sum++;
58 | 				posSame++;
59 | 			}
60 | 		}
61 | 	}
62 | 	double acc = (((double)sum*100)/(oLabels.getNumRow()));
63 | 	double sens =  (((double)posSame*100)/(posSum));
64 | 	double spec = (((double)negSame*100)/(negSum));
65 | 	// cout << "Accuracy: " << acc << "% ";
66 | 	// cout << "Sensitivity: " << sens << "% ";
67 | 	// cout << "Specificity: " << spec << "% " << endl;
68 | 	return make_tuple(acc, sens, spec);
69 | }
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/src/predict/GLM.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * glm.h
 3 |  *
 4 |  * Created on: May 29, 2017
 5 |  * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa
 6 |  *
 7 |  * Modified by Benjamin T James
 8 |  */
 9 | 
10 | #ifndef SRC_MATRIX_GLM_H_
11 | #define SRC_MATRIX_GLM_H_
12 | 
13 | #include "Matrix.h"
14 | #include <tuple>
15 | namespace matrix {
16 | 
// A linear model whose weights are either fitted by train() or supplied
// through load().
class GLM {
private:
	// The weights; multiplied with a feature matrix to get predictions.
	Matrix weights;

public:
	// Replaces the weights by a copy of the given matrix.
	void load(Matrix weights_) { weights = weights_; }
	// Fits the weights to the features and labels.
	void train(matrix::Matrix& features, matrix::Matrix& labels);
	// Returns one rounded logistic prediction (0 or 1) per feature row.
	Matrix predict(matrix::Matrix& features) const;
	// The logistic function 1 / (1 + e^(-x)).
	static double logistic(double x);
	// NOTE(review): no definition of this function appears in GLM.cpp --
	// confirm it is defined elsewhere or unused.
	static double linear(double x);
	// Returns (accuracy, sensitivity, specificity) in percent for the
	// original labels versus the predicted labels.
	std::tuple<double,double,double> accuracy(matrix::Matrix& oLabels, matrix::Matrix& pLabels) const;
	// Returns a reference to the weights; valid while this object lives.
	const Matrix& get_weights() const { return weights; };
};
30 | 
31 | }
32 | 
33 | #endif /* SRC_MATRIX_GLM_H_ */
34 | 


--------------------------------------------------------------------------------
/src/predict/GreedySelector.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*- */
  2 | /*
  3 |  * GreedySelector.cpp
  4 |  *
  5 |  * Author: Benjamin T James
  6 |  */
  7 | #include "GreedySelector.h"
  8 | #include "../clutil/Progress.h"
  9 | 
 10 | template<class T>
 11 | pair<Feature<T>*,matrix::GLM> GreedySelector<T>::train_regression(Feature<T>* feat, const vector<pra<T> > &training,const vector<pra<T> > &testing)
 12 | {
 13 | 	auto c_size = feat->get_combos().size();
 14 | 	for (int i = 0; i < c_size; i++) {
 15 | 		feat->remove_feature();
 16 | 	}
 17 | 	vector<uintmax_t> used_list;
 18 | 	double abs_best_regr = 1000000;
 19 | //	Progress prog(possible_feats.size() * max_num_feat, "Feature selection:");
 20 | 	for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) {
 21 | 		double best_regr_err = abs_best_regr;
 22 | 		uintmax_t best_idx = -1, cur_idx = 1;
 23 | 		auto best_regr_feat = possible_feats.front();
 24 | 		for (uint64_t i = 0; i < possible_feats.size(); i++) {
 25 | 			if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) {
 26 | 				continue;
 27 | 			}
 28 | 			auto rfeat = possible_feats[i];
 29 | 		        feat->add_feature(rfeat.first, rfeat.second);
 30 | 			feat->normalize(training);
 31 | 			feat->finalize();
 32 | 			auto pr = FeatureSelector<T>::regression_train(training, *feat);
 33 | 			auto name = feat->feat_names().back();
 34 | 			double regr_mse = FeatureSelector<T>::regression_test(testing, *feat, pr.second);
 35 | 			feat->remove_feature();
 36 | 			//	prog++;
 37 | 			//cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name << " err: " << regr_mse << endl;
 38 | 			if (regr_mse < best_regr_err) {
 39 | 				best_regr_err = regr_mse;
 40 | 				best_regr_feat = rfeat;
 41 | 				best_idx = i;
 42 | 			}
 43 | 		}
 44 | 		if (best_regr_err < abs_best_regr) {
 45 | 			feat->add_feature(best_regr_feat.first, best_regr_feat.second);
 46 | 			feat->normalize(training);
 47 | 			feat->finalize();
 48 | 			abs_best_regr = best_regr_err;
 49 | 			used_list.push_back(best_idx);
 50 | 			//possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_regr_feat), possible_feats.end());
 51 | 		}
 52 | 	}
 53 | //	prog.end();
 54 | 
 55 | 	Feature<T>* feat_r = new Feature<T>(*feat);
 56 | 	feat_r->set_save(false);
 57 | 	auto pr = FeatureSelector<T>::regression_train(training, *feat_r);
 58 | 	matrix::GLM r_glm = pr.second;
 59 | 	double tr_regr_mse = FeatureSelector<T>::regression_test(testing, *feat_r, r_glm); // "training"
 60 | 	cout << "Training Mean Error: " << pr.first << endl;
 61 | 	double regr_mse = FeatureSelector<T>::regression_test(testing, *feat_r, r_glm);//, "testing");
 62 | 	cout << "Testing Mean Error: " << regr_mse << endl;
 63 | 	cout << "Features: "<< endl;
 64 | 	for (auto line : feat_r->feat_names()) {
 65 | 		cout << "\t" << line << endl;
 66 | 	}
 67 | 	auto w = r_glm.get_weights();
 68 | 	for (int r = 0; r < w.getNumRow(); r++) {
 69 | 		cout << "weight: ";
 70 | 		for (int c = 0; c < w.getNumCol(); c++) {
 71 | 			cout << w.get(r, c) << " ";
 72 | 		}
 73 | 		cout << endl;
 74 | 	}
 75 | 
 76 | }
 77 | 
 78 | template<class T>
 79 | std::pair<Feature<T>*,matrix::GLM> GreedySelector<T>::train_class(Feature<T>* feat, const vector<pra<T> > &training,const vector<pra<T> > &testing, double id)
 80 | {
 81 | 	auto c_size = feat->get_combos().size();
 82 | 	for (int i = 0; i < c_size; i++) {
 83 | 		feat->remove_feature();
 84 | 	}
 85 | 	vector<uintmax_t> used_list;
 86 | 	double abs_best_acc = 0;
 87 | //	cout << "possible feats at one step: " << possible_feats.size() << endl;
 88 | 	Progress prog(possible_feats.size() * max_num_feat, "Feature selection:");
 89 | 
 90 | 	std::ostringstream oss;
 91 | 	for (auto num_feat = 1; num_feat <= max_num_feat; num_feat++) {
 92 | 		double best_class_acc = abs_best_acc;
 93 | 		uintmax_t best_idx = -1, cur_idx = 1;
 94 | 		auto best_class_feat = possible_feats.front();
 95 | 		for (uint64_t i = 0; i < possible_feats.size(); i++) {
 96 | 			if (std::find(used_list.begin(), used_list.end(), i) != used_list.end()) {
 97 | 				continue;
 98 | 			}
 99 | 			auto rfeat = possible_feats[i];
100 | 		        feat->add_feature(rfeat.first, rfeat.second);
101 | 			feat->normalize(training);
102 | 			feat->finalize();
103 | 			auto name = feat->feat_names().back();
104 | 			auto pr = FeatureSelector<T>::class_train(training, *feat, id);
105 | 			auto class_ac = FeatureSelector<T>::class_test(testing, *feat, pr.second, id);
106 | 			double class_accuracy = get<0>(class_ac);//sqrt(get<1>(class_ac) * get<2>(class_ac));
107 | 			feat->remove_feature();
108 | 			prog++;
109 | //			cout << "Feature: " << cur_idx++ << "/" << possible_feats.size() - used_list.size() << " " << num_feat << "/" << max_num_feat << " " << name  << " acc: " << get<0>(class_ac) << " sens: " << get<1>(class_ac) << " spec: " << get<2>(class_ac) << endl;
110 | 			if (class_accuracy > best_class_acc) {
111 | 				best_class_acc = class_accuracy;
112 | 				best_class_feat = rfeat;
113 | 				best_idx = i;
114 | 			}
115 | 		}
116 | 		/* accept the feature if either 1. we don't have enough features
117 | 		 * or 2. it improves accuracy by over 0.5%
118 | 		 */
119 | 		if (best_class_acc > abs_best_acc || num_feat <= min_num_feat) {
120 | 			feat->add_feature(best_class_feat.first, best_class_feat.second);
121 | 			feat->normalize(training);
122 | 			feat->finalize();
123 | 			abs_best_acc = best_class_acc;
124 | 			used_list.push_back(best_idx);
125 | 			oss << "Feature added: " << best_class_feat.first << " " << (int)best_class_feat.second << endl;
126 | 			oss << "Accuracy: " << best_class_acc << endl;
127 | 			possible_feats.erase(std::remove(possible_feats.begin(), possible_feats.end(), best_class_feat), possible_feats.end());
128 | 		}
129 | 	}
130 | 	prog.end();
131 | 	cout << oss.str();
132 | 	Feature<T>* feat_c = new Feature<T>(*feat);
133 | 	feat_c->set_save(false);
134 | 	auto pr = FeatureSelector<T>::class_train(training, *feat_c, id);
135 | 	matrix::GLM c_glm = pr.second;
136 | 	auto train_results = FeatureSelector<T>::class_test(training, *feat_c, c_glm, id);//, "train");
137 | 	cout << "Training ACC: " << get<0>(train_results) << " " << get<1>(train_results) << " " << get<2>(train_results) << endl;
138 | 	auto test_results = FeatureSelector<T>::class_test(testing, *feat_c, c_glm, id);//, "test");
139 | 	double class_acc = get<0>(test_results);
140 | 	cout << "Testing ACC: " << class_acc << " " << get<1>(test_results) << " " << get<2>(test_results) << endl;
141 | 
142 | 	cout << "Features: "<< endl;
143 | 	for (auto line : feat_c->feat_names()) {
144 | 		cout << "\t" << line << endl;
145 | 	}
146 | 	return std::make_pair(feat_c, c_glm);
147 | }
148 | 
// Explicit instantiations: the member templates above are defined in this
// .cpp file rather than in the header, so each element type the selector is
// used with has to be instantiated here.
template class GreedySelector<uint8_t>;
template class GreedySelector<uint16_t>;
template class GreedySelector<uint32_t>;
template class GreedySelector<uint64_t>;
template class GreedySelector<int>;
template class GreedySelector<double>;
155 | 


--------------------------------------------------------------------------------
/src/predict/GreedySelector.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * GreedySelector.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  */
 7 | 
 8 | #ifndef GREEDY_SELECTOR_H
 9 | #define GREEDY_SELECTOR_H
10 | #include "FeatureSelector.h"
11 | 
12 | template<class T>
13 | class GreedySelector : public FeatureSelector<T> {
14 | public:
15 | 	GreedySelector(vector<std::pair<uint64_t, Combo> > possible_feats_, int min_n_feat, int max_n_feat) : possible_feats(possible_feats_), min_num_feat(min_n_feat), max_num_feat(max_n_feat) {}
16 | 	~GreedySelector() {}
17 | 	pair<Feature<T>*,matrix::GLM> train_regression(Feature<T>* tfeat, const vector<pra<T> > &training,const vector<pra<T> > &testing);
18 | 	pair<Feature<T>*,matrix::GLM> train_class(Feature<T>* tfeat, const vector<pra<T> > &training,const vector<pra<T> > &testing, double id);
19 | private:
20 | 	int max_num_feat, min_num_feat;
21 | 	vector<std::pair<uint64_t, Combo> > possible_feats;
22 | };
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/predict/HandleSeq.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Author: Alex Baumgartner
  3 |  * Modified by Benjamin T James
  4 |  * The Bioinformatics Toolsmith Laboratory, the University of Tulsa
  5 |  * 5/15/2018
  6 |  *
  7 |  * Purpose:
  8 |  *	The purpose of this module is to take a sequence, mutate it, and return the result.
  9 |  *	It also serves as a way to parse a file for all sequences.
 10 |  */
 11 | 
 12 | #include "HandleSeq.h"
 13 | #include <omp.h>
 14 | // d
 15 | HandleSeq::HandleSeq(int m, std::random_device::result_type rnd) {
 16 | 
 17 | 	mode = m & HandleSeq::BOTH;
 18 | 	enableTrans = m & HandleSeq::TRANSLOCATION;
 19 | 	enableRev = m & HandleSeq::REVERSION;
 20 | 	random = new LCG(rnd);
 21 | 	// disable = (m & HandleSeq::ATYPICAL) > 0 ? 0 : 1;
 22 | }
 23 | 
 24 | pair<vector<string>, vector<string>> HandleSeq::parseFile(string fileName) {
 25 | 	ifstream fileIn;
 26 | 	//Uses the file the user supplies to take in sequences
 27 | 	fileIn.open(fileName, ifstream::in);
 28 | 	if(fileIn.is_open()){
 29 | 	vector<string> sequences;
 30 | 	vector<string> names;
 31 | 	string inString;
 32 | 	//Boolean to make sure that the first sequence
 33 | 	//has already been found, prevents a null string being written
 34 | 	bool foundFirst = false;
 35 | 	string currentLine;
 36 | 	while (!fileIn.eof()) {
 37 | 		getline(fileIn, currentLine);
 38 | 		//Skip the line if nothing is on it
 39 | 		if (currentLine.length() == 0) {
 40 | 			continue;
 41 | 		}
 42 | 		//If the line has a '>' symbol, the start of a new sequence
 43 | 		else if (currentLine.at(0) == '>' && foundFirst) {
 44 | 			//Push the current saved sequene onto the vector,
 45 | 			//then reset the strings value
 46 | 			sequences.push_back(inString);
 47 | 			names.push_back(currentLine.substr(1, currentLine.find_first_of(' ')));
 48 | 			inString = "";
 49 | 		}
 50 | 		else if(currentLine.at(0) == '>' && !foundFirst){
 51 | 			foundFirst = true;
 52 | 			names.push_back(currentLine.substr(1, currentLine.find_first_of(' ')));
 53 | 		}
 54 | 		//If this is the first >, set found first to true
 55 | 		else if (!foundFirst) {
 56 | 			foundFirst = true;
 57 | 		}
 58 | 		//Otherwise, add the current Line to
 59 | 		//the string of current lines
 60 | 		else {
 61 | 			inString = inString + currentLine;
 62 | 		}
 63 | 	}
 64 | 	//Push the last found string on
 65 | 	//(There is no > at the end of a .fa file)
 66 | 	sequences.push_back(inString);
 67 | 	fileIn.close();
 68 | 	return {names, sequences};
 69 | 	}
 70 | 	else{
 71 | 		cout << "Could not find File" << endl;
 72 | 		exit(2);
 73 | 	}
 74 | }
 75 | 
 76 | pair<float, string> HandleSeq::mutate(string sequence, int muteRate, int split) {
 77 | 	percMute = muteRate;
 78 | 	if (muteRate == 0) {
 79 | 		return std::make_pair(1, sequence);
 80 | 	}
 81 | 	auto nucls = countNucl(sequence);
 82 | 	//Assing the percent of each nucleotide in the sequence
 83 | 	int percAs = (nucls.at(0) * 100) / sequence.length();
 84 | 	int percCs = (nucls.at(1) * 100) / sequence.length();
 85 | 	int percGs = (nucls.at(2) * 100) / sequence.length();
 86 | 	int percTs = (nucls.at(3) * 100) / sequence.length();
 87 | 	int percMulti, percSing;
 88 | 	string * seq = new string(sequence);
 89 | 	int length = sequence.length();
 90 | 	//If the user only wants single
 91 | 	if (mode == 1) {
 92 | 		percMulti = 0;
 93 | 		//Allocate all mutations to single
 94 | 		percSing = percMute;
 95 | 	}
 96 | 	//Or if the user only wants non single
 97 | 	else if (mode == 2) {
 98 | 		//Allocate all mutations to non-single
 99 | 		percSing = 0;
100 | 		percMulti = percMute;
101 | 	}
102 | 	//Otherwise, assing a random percentage to both
103 | 	else {
104 | 		percMulti = split;
105 | //		percMulti = random.randMod<int>(percMute);
106 | 		percSing = percMute - percMulti;
107 | 	}
108 | 	//Define a new multiple mutation
109 | 	MultiMute multi(percAs, percCs, percGs, percTs,
110 | 			percMulti, enableTrans, enableRev, random->nextRandSeed());
111 | 	//Run the multiple mutations,
112 | 	//get back its vector of what is valid to mutate and what isn't
113 | 	vector<bool> mutes = multi.genMulti(seq);
114 | 	uint64_t cnt = 0;
115 | 	for (bool b : mutes) {
116 | 		cnt += b ? 1 : 0;
117 | 	}
118 | 	if (mutes.size() != seq->length()) {
119 | 		cerr << "mutation size is not matching the multi-sequence" << endl;
120 | 		throw 100;
121 | 	}
122 | 	SingMute sing(percAs, percCs, percGs, percTs,
123 | 		      percSing, seq, mutes, random->nextRandSeed());
124 | 	float alignmentLength = multi.getAlignmentLength() + sing.getAlignmentLength() + length;
125 | //	cout << "alignLength: " << alignmentLength << endl;
126 | 	float IBP = length - multi.getIBP() - sing.getIBP();
127 | //	cout << "ibp: " << IBP << endl;
128 | 	float alignment = IBP / alignmentLength;
129 | //	cout << "ratio: size: " << mutes.size() << " expected: " << (float)cnt / mutes.size() << " found: " << ((float)length - multi.getIBP()) / ((float)multi.getAlignmentLength() + length) << " align: " << alignment << endl;
130 | 	//assign the sequence to the
131 | 	//value that the seq pointer stores to
132 | 	//clear the heap
133 | 	delete seq;
134 | 	//Return the now mutated sequence
135 | 	std::string outseq = sing.getSeq();
136 | 	return make_pair(alignment, outseq);
137 | }
138 | 
139 | vector<int> HandleSeq::countNucl(const string& sequence) {
140 | 	int a = 0;
141 | 	int c = 0;
142 | 	int g = 0;
143 | 	int t = 0;
144 | 	for (int i = 0; i < sequence.length(); i++) {
145 | 		if (sequence.at(i) == 'A') {
146 | 			a++;
147 | 		} else if (sequence.at(i) == 'C') {
148 | 			c++;
149 | 		} else if (sequence.at(i) == 'G') {
150 | 			g++;
151 | 		} else if (sequence.at(i) == 'T') {
152 | 			t++;
153 | 		}
154 | 	}
155 | 	vector<int> values;
156 | 	values.push_back(a);
157 | 	values.push_back(c);
158 | 	values.push_back(g);
159 | 	values.push_back(t);
160 | 	return values;
161 | }
162 | 


--------------------------------------------------------------------------------
/src/predict/HandleSeq.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Author: Alex Baumgartner
 3 |  * Modified by Benjamin T James
 4 |  * The Bioinformatics Toolsmith Laboratory, the University of Tulsa
 5 |  * 5/15/2018
 6 |  *
 7 |  * Purpose:
 8 |  *	The purpose of this module is to take a sequence, mutate it, and return the result.
 9 |  *	It also serves as a way to parse a file for all sequences.
10 |  */
11 | #ifndef HANDLESEQ_H
12 | #define  HANDLESEQ_H
13 | 
14 | #include <iostream>
15 | #include <vector>
16 | #include <fstream>
17 | #include <string>
18 | #include "LCG.h"
19 | #include "MultiMute.h"
20 | #include "SingMute.h"
21 | 
22 | using namespace std;
23 | 
24 | class HandleSeq {
25 | public:
26 | 	// Single — point — mutations only
27 | 	static const int SINGLE = (1 << 0);
28 | 	// Non-single point mutations only
29 | 	static const int NON_SINGLE = (1 << 1);
30 | 	// Single and non-single mutations
31 | 	static const int BOTH = SINGLE | NON_SINGLE;
32 | // translocations and reversions
33 | 	static const int TRANSLOCATION = (1 << 2);
34 | 	static const int REVERSION = (1 << 3);
35 | 	static const int ATYPICAL = TRANSLOCATION | REVERSION;
36 | 	static const int ALL = ATYPICAL | BOTH;
37 | 	/*
38 | 	 constructor
39 | 
40 | 	 @param:
41 | 	 int: the mode of the program
42 | 	 				(Single only = 1, nonsingle only = 2, both = 3)
43 | 	 */
44 | 	HandleSeq(int, std::random_device::result_type seed);
45 | 	~HandleSeq() { if (random != NULL) { delete random; }}
46 | 	/*
47 | 	 returns a vector of all sequences in a file inputted
48 | 
49 | 	 @param:
50 | 	 std::string: file name
51 | 	 int: the mutation rate
52 | 
53 | 	 @return:
54 | 	 std::vector<std::string>: Vector of all found sequences
55 | 	 */
56 | 	pair<vector<string>, vector<string>> parseFile(string);
57 | 	/*
58 | 	 Mutates a sequence based on parameters inputted in constructor,
59 | 	 	and returns the mutated sequence
60 | 	 */
61 | 	pair<float, string> mutate(string, int, int);
62 | 
63 | 	uint32_t getSeed() const { return seed; }
64 | private:
65 | 	uint32_t seed;
66 | 	int mode;
67 | 	int percMute;
68 | 	bool enableTrans, enableRev;
69 |         LCG *random = NULL;
70 | 	/*
71 | 	 Counts the nucleotides in a file,
72 | 	 	and returns a vector corresponding to their values {A, C, G, T}
73 | 
74 | 	 @param:
75 | 	 std::string: the sequences
76 | 
77 | 	 @return:
78 | 	 std::vector<int>: vector containing ints of each nucleotide count
79 | 	 */
80 | 	vector<int> countNucl(const string&);
81 | 
82 | };
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/src/predict/Matrix.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * matrix.h
 3 |  *
 4 |  * Created on: May 10, 2017
 5 |  * Author: Robert Geraghty, The Bioinformatics Toolsmith Laboratory, The University of Tulsa
 6 |  * Modified by Benjamin T James
 7 |  */
 8 | 
 9 | 
10 | #ifndef MATRIX_H_
11 | #define MATRIX_H_
12 | 
13 | #include <vector>
14 | #include <string>
15 | 
16 | namespace matrix {
17 | 
/*
 * Matrix of doubles stored as a vector of vectors, with the dimensions kept
 * in numRow / numCol.
 * Only the declarations are visible here; the notes on individual members
 * are inferred from names and signatures and marked where unconfirmed.
 */
class Matrix
{
private:
	std::vector<std::vector<double> > m;	// element storage
	int numRow;
	int numCol;


public:
	// Build from a flat vector; the resulting shape is not visible here -- TODO confirm.
	Matrix(std::vector<double> m);
	// Presumably an r-by-c matrix -- TODO confirm the initial element values.
	Matrix(int r, int c);
	Matrix();
	~Matrix();
	// Arithmetic: operands are taken by value and a new Matrix is returned.
	Matrix operator+(Matrix n);
	Matrix operator-(Matrix n);
	Matrix operator*(Matrix n);
	Matrix transpose();
	// Inverses; behaviour on singular input is not visible here.
	Matrix gaussJordanInverse();
	Matrix pseudoInverse();
	void userFill();
	double determinant();
	// Element access by (row, column); whether indices are range-checked is
	// not visible here.
	double get(int r, int c) const;
	void set(int r, int c, double val);
	// Presumably append a row / column filled with the given value -- TODO confirm.
	void addRow(double);
	void addCol(double);
	void print();
	void printToFile(std::string);
	void randFill(double low, double high);
	void fileFill(std::string filename);
	void normalize(double a, double b);
	// Presumably copy row / column number `int` into the output vector -- TODO confirm.
	void rowToVector(int, std::vector<double>&);
	void colToVector(int, std::vector<double>&);
	int getNumRow() const;
	int getNumCol() const { return numCol; };
};
53 | }
54 | #endif /* MATRIX_H_ */
55 | 


--------------------------------------------------------------------------------
/src/predict/MultiMute.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Author: Alex Baumgartner
  3 |  * Modified by Benjamin T James
  4 |  * The Bioinformatics Toolsmith Laboratory, the University of Tulsa
  5 |  * 5/15/2018
  6 |  *
  7 |  * Purpose:
  8 |  *	The purpose of this module is to perform non-single mutations on sequences
  9 |  */
 10 | 
 11 | #ifndef MULTIMUTE_H
 12 | #define MULTIMUTE_H
 13 | 
 14 | #include <iostream>
 15 | #include <vector>
 16 | #include <string>
 17 | #include <algorithm>
 18 | #include <random>
 19 | #include "Random.h"
 20 | #include "LCG.h"
 21 | using namespace std;
 22 | 
/*
 * Applies non-single mutations to a sequence.
 * Only the declarations are visible here; the notes on the private helpers
 * are based on their names and on the comments that were already present,
 * and should be confirmed against the implementation.
 */
class MultiMute {
public:
	/*
	 Constructor, creates values
	 	and assigns allocations based on the given data

	 @param:
	 int: percentage of A's
	 int: percentage of C's
	 int: percentage of G's
	 int: percentage of T's
	 int: the total allocation for non-single mutations
	 bool: flag controlling translocation
	 bool: flag controlling reversion
	 	NOTE(review): an earlier version of this comment described a single
	 	"1 = disable" flag, while the caller passes values named enableTrans
	 	and enableRev -- confirm the polarity in the implementation
	 std::random_device::result_type: seed for the internal generator
	 */
	MultiMute(int, int, int, int, int, bool, bool, std::random_device::result_type);
	/*
	 Takes in a string pointer,
	 	and mutates it based on the allocation given to the constructor.
	 	Returns a vector of all valid and invalid indexes

	 @param:
	 std::string *: pointer to the string to be mutated

	 @return:
	 std::vector<bool>: vector of mutations,
	 	false means that index has been mutated
	 */
	std::vector<bool> genMulti(std::string *);
	// Accessors for the two tallies kept in alignmentLength and IBP below;
	// how they are accumulated is not visible here.
	int getAlignmentLength();
	int getIBP();

	private:
	// Nucleotide composition in percent, as passed to the constructor.
	int percAs;
	int percCs;
	int percGs;
	int percTs;
	// Presumably the budget available to each mutation type -- TODO confirm.
	int64_t maxReverse;
	int64_t maxInsert;
	int64_t maxTrans;
	int64_t maxDel;
	int64_t maxDup;
	int64_t maxNonMutations;
	int64_t alignmentLength;
	int64_t IBP;
	int64_t total_alloc;
	LCG rng;

	int64_t max_block_size;
	std::vector<std::string> * insertions;
	std::vector<string> * mutationStrings;
	std::string * seq;
	/*
	 Reversal step; works on the vector of mutation strings
	 	(details not visible here)
	 */
	void reverse(vector<string> *);
	/*
	 Translocates a random, non-mutated part of the sequence,
	 	no larger than its max allocation
	 */
	void translocate(vector<string> *);
	/*
	 Inserts at a random, non-mutated part of the sequence,
	 	no larger than its max allocation
	 */
	void insert(vector<string> *);
	/*
	 Deletes a random, non-mutated part of the sequence,
	 	no larger than its max allocation
	 */
	void deleteNucl(vector<string> *);
	/*
	 Duplicates a random, non-mutated part of the sequence,
	 	no larger than its max allocation
	 	to an index directly after the duplicated string
	 */
	void duplicate(vector<string> *);
	/*
	 Checks inclusively, [first, last], if a portion is valid

	 @param:
	 int: The starting index (first)
	 int: The ending index (last)

	 @return:
	 bool: true if all indexes in range are valid
	 */
	bool checkPalindrome(int, int);
	void checkForAllPalindromes(vector<string> *);
	/*
	 NOTE(review): the comment that used to sit here ("marks all indexes in
	 the range as invalid", two int parameters) does not match this
	 signature. Presumably builds the per-position validity mask from the
	 mutation characters -- TODO confirm.
	 */
	vector<bool> formatString(int, vector<char> *);

	/*
		Generates a randomized string of the given size
		@param:
		int: size of string to generate
		@return
		string: randomized string
	*/
	std::string genInsert(int);
	/*
		Adds all translocations to the insertions array
		@param:
		vector<char> *: pointer to a char vector with mutation characters
	*/
	void getTranslocations(vector<char> *);
	/*
		Converts a vector of strings into a vector of chars
		@param:
		vector<string> *: the vector to be converted
		@return:
		vector<char> *: the vector of characters
	*/
	vector<char> * genCharVector(vector<string> *);
};
143 | #endif
144 | 


--------------------------------------------------------------------------------
/src/predict/Predictor.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*-
 2 |  *
 3 |  * Predictor.h
 4 |  *
 5 |  * Author: Benjamin T James
 6 |  *
 7 |  * Main class for training and prediction
 8 |  * Does bulk training, but can be adapted for on-line training
 9 |  */
10 | 
11 | #ifndef PREDICTOR_H
12 | #define PREDICTOR_H
13 | 
14 | #include "GLM.h"
15 | #include "Point.h"
16 | #include "Feature.h"
17 | #include <set>
18 | #include "Random.h"
19 | #include <omp.h>
// Prediction modes. The values are distinct bits (1 and 2); presumably they
// can be OR-ed together in Predictor::mode to request both -- TODO confirm.
#define PRED_MODE_CLASS 1
#define PRED_MODE_REGR  2

// Feature-set presets, each a bitwise OR of FEAT_* flags declared elsewhere.
// PRED_FEAT_FAST: subset of nine features (presumably the cheap ones, going by the name).
#define PRED_FEAT_FAST (FEAT_EUCLIDEAN | FEAT_MANHATTAN | FEAT_INTERSECTION | FEAT_KULCZYNSKI2 | FEAT_SIMRATIO | FEAT_NORMALIZED_VECTORS | FEAT_PEARSON_COEFF | FEAT_EMD | FEAT_LENGTHD )
// PRED_FEAT_DIV: the two divergence features.
#define PRED_FEAT_DIV (FEAT_JEFFEREY_DIV | FEAT_JENSEN_SHANNON)
// PRED_FEAT_ALL: every feature flag listed below.
#define PRED_FEAT_ALL (FEAT_HELLINGER|FEAT_MANHATTAN|FEAT_EUCLIDEAN|FEAT_CHI_SQUARED|FEAT_NORMALIZED_VECTORS|FEAT_HARMONIC_MEAN|FEAT_JEFFEREY_DIV|FEAT_K_DIV|FEAT_PEARSON_COEFF|FEAT_SQCHORD|FEAT_KL_COND|FEAT_MARKOV|FEAT_INTERSECTION|FEAT_RRE_K_R|FEAT_D2z|FEAT_SIM_MM|FEAT_EUCLIDEAN_Z|FEAT_EMD|FEAT_SPEARMAN|FEAT_JACCARD|FEAT_LENGTHD|FEAT_D2s|FEAT_AFD|FEAT_MISMATCH|FEAT_CANBERRA|FEAT_KULCZYNSKI1|FEAT_KULCZYNSKI2|FEAT_SIMRATIO|FEAT_JENSEN_SHANNON|FEAT_D2_star|FEAT_N2R|FEAT_N2RC|FEAT_N2RRC)
26 | 
27 | template<class T>
28 | class Predictor {
29 | public:
30 | 	Predictor(int k_, double id_, uint8_t mode_, uint64_t feats, int mut_type_, int min_num_feat_=3, int max_num_feat_=5, double min_id_=0.35) : k(k_), id(id_), is_trained(false), is_training(false), mode(mode_), max_num_feat(max_num_feat_), mut_type(mut_type_), min_num_feat(min_num_feat_), min_id(min_id_ * 100), feats64(feats) {
31 | 		add_feats(possible_feats, feats);
32 | 		feat_c = NULL;
33 | 		feat_r = NULL;
34 | 		omp_init_lock(&lock);
35 | 	};
36 | 	Predictor(const std::string filename);
37 | 	~Predictor() {
38 | 		possible_feats.clear();
39 | 		omp_destroy_lock(&lock);
40 | 		if (feat_c) {
41 | 			delete feat_c;
42 | 		}
43 | 		if (feat_r) {
44 | 			delete feat_r;
45 | 		}
46 | 		training.clear();
47 | 		testing.clear();
48 | 	}
49 | 	static double classify_sum(double sum);
50 | 	static void set_bias(double bias);
51 | 	void train(const std::vector<Point<T>* >& vec, uintmax_t& _id, size_t num_sample, size_t n_templates);
52 | 	double similarity(Point<T>* a, Point<T>* b);
53 | 	bool close(Point<T>* a, Point<T>* b);
54 | 	void save(std::string file, std::string datatype);
55 | 	void check();
56 | 	uint8_t get_mode() const { return mode; }
57 | 	pair<Feature<T>*, matrix::GLM> get_class() { return std::make_pair(new Feature<T>(*feat_c), c_glm); }
58 | 	void mutate_seqs(Point<T>* p, size_t num_seq, vector<pra<T> > &,vector<pra<T> > & , double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed);
59 | 	void mutate_seqs(Point<T>* p, size_t num_seq,vector<pra<T> >  &,double id_begin, double id_end, uintmax_t& _id, std::random_device::result_type seed);
60 | 	std::string get_datatype() const { return datatype; }
61 | 	int get_k() const { return k; }
62 | 	double get_id() const { return id; }
63 | private:
64 | 	static void add_feats(std::vector<std::pair<uint64_t, Combo> >& vec, uint64_t flags);
65 | 	static pair<matrix::GLM, Feature<T>*> read_from(std::ifstream &in, int k_);
66 | 	static void write_to(std::ofstream &out, Feature<T>* f, matrix::GLM glm);
67 | 	void filter(std::vector<pra<T> > &s, std::string prefix="");
68 | 	void train();
69 | 	void train_class(Feature<T>* feat);
70 | 	void train_regr(Feature<T>* feat);
71 | 	void train_class_regr(Feature<T>* feat);
72 | 	double predict(Point<T>* a, Point<T>* b);
73 | 	bool p_close(Point<T>* a, Point<T>* b);
74 | 	double p_predict(Point<T>* a, Point<T>* b);
75 | 
76 | 	Feature<T> *feat_c, *feat_r;
77 | 	matrix::GLM c_glm, r_glm;
78 |         vector<pra<T> > training, testing;
79 | 	bool is_trained, is_training;
80 | 	int min_num_feat, max_num_feat, k, mut_type;
81 | 	uint8_t mode;
82 | 	double id, min_id;
83 | 	vector<std::pair<uint64_t, Combo> > possible_feats;
84 | 	omp_lock_t lock;
85 | 	Random random;
86 | 	uint64_t feats64;
87 | 	std::string datatype;
88 | 	double scale_min = 1000;
89 | 	double scale_max = -1000;
90 | };
91 | #endif
92 | 


--------------------------------------------------------------------------------
/src/predict/SingMute.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- C++ -*- */
  2 | /*
  3 |  * SingMute.cpp
  4 |  *
  5 |  * Original Author: Alexander Baumgartner
  6 |  * Modified by Benjamin T James
  7 |  */
  8 | #include "SingMute.h"
  9 | #include <set>
 10 | #include <random>
 11 | #include <algorithm>
 12 | #include <iostream>
 13 | 
// Markers written into the output sequence by SingMute::init().
// When MUTDEBUG is defined, inserted characters are wrapped in [ ],
// substituted characters in ( ), and each deleted position shows up as "-".
// Otherwise every marker is empty and the output contains sequence
// characters only.
#ifdef MUTDEBUG
static const std::string INSERT_BEGIN = "[";
static const std::string INSERT_END = "]";
static const std::string SWITCH_BEGIN = "(";
static const std::string SWITCH_END = ")";
static const std::string DEL = "-";
#else
static const std::string INSERT_BEGIN = "";
static const std::string INSERT_END = "";
static const std::string SWITCH_BEGIN = "";
static const std::string SWITCH_END = "";
static const std::string DEL = "";
#endif
 27 | 
 28 | 
 29 | char SingMute::randNucl()
 30 | {
 31 | 	char character;
 32 | 	int value = rng.randMod<int>(percAs + percCs + percGs + percTs);
 33 | //	int value = 40436 % (percAs + percCs + percGs + percTs);
 34 | 	if (value < percAs) {
 35 | 		character = 'A';
 36 | 	} else if (value < percAs + percCs) {
 37 | 		character = 'C';
 38 | 	} else if (value < percAs + percCs + percGs) {
 39 | 		character = 'G';
 40 | 	} else {
 41 | 		character = 'T';
 42 | 	}
 43 | 	return character;
 44 | }
/*
 * Apply the single mutations and build out_seq.
 *
 * The budget num_mut is split between substitutions ("switch"), insertions
 * and deletions; each share is then scaled by seq_len / 100. One command
 * character is assigned to every input position, the commands are shuffled,
 * commands that landed on positions not allowed by `valid` are moved, and
 * finally the output is written position by position.
 *
 * @param valid one flag per input position; only positions whose flag is
 *              true are meant to receive a mutation
 *
 * NOTE(review): the order of the calls into rng fixes the pseudo-random
 * stream, so statements must not be reordered.
 */
void SingMute::init(const std::vector<bool> &valid)
{
	maxInsert = 0;
	maxDel = 0;
	maxSwitch = 0;
	if (num_mut == 0) {
		// No budget: the output is a plain copy and both tallies are zero.
		out_seq = std::string(*seq);
		IBP = 0;
		alignmentLength = 0;
		return;
	} else if (num_mut == 1) {
		// A budget of one unit goes entirely to insertions.
		maxInsert = 1;
		maxDel = 0;
		maxSwitch = 0;
	} else {
		// Random share for substitutions (the range of randMod is not
		// visible here; presumably [0, num_mut) -- TODO confirm).
		maxSwitch = rng.randMod<long>(num_mut);
		num_mut -= maxSwitch;

		// Odd substitution share with budget left: move one unit into it.
		// Otherwise, if nothing is left, hand one unit back.
		if (maxSwitch % 2 == 1 && num_mut >= 1) {
			maxSwitch++;
			num_mut--;
		} else if (num_mut == 0) {
			maxSwitch--;
			num_mut++;
		}
		// Random share of the remainder for insertions ...
		if (num_mut > 1) {
			maxInsert = rng.randMod<long>(num_mut);
			num_mut -= maxInsert;
		} else {
			maxInsert = num_mut;
			num_mut -= maxInsert;
		}
		// ... and whatever is left for deletions.
		maxDel = num_mut;
	}
	size_t seq_len = seq->length();

	// Scale each share by seq_len / 100, i.e. treat it as a percentage of
	// the sequence length.
	maxDel *= seq_len / 100.0;
	maxInsert *= seq_len / 100.0;
	maxSwitch *= seq_len / 100.0;
	alignmentLength = maxInsert;
	IBP = maxDel + maxSwitch;


	// One command per input position: 'S' keep, 'I' insert before,
	// 'D' delete, 'W' substitute. The mutation commands are written to the
	// front of the array and spread out by the shuffle below.
	std::vector<char> command_str(seq_len, 'S');
	long idx = 0;
	long nons_len = maxInsert + maxDel + maxSwitch;
	for (long i = 0; i < maxInsert; i++) {
		command_str[idx++] = 'I';
	}
	for (long i = 0; i < maxDel; i++) {
		command_str[idx++] = 'D';
	}
	for (long i = 0; i < maxSwitch; i++) {
		command_str[idx++] = 'W';
	}
	//std::shuffle(command_str.begin(), command_str.end(), rng.gen());
	std::shuffle(command_str.begin(), command_str.end(), std::minstd_rand0(rng.nextRandSeed()));
	// Repair pass: a mutation command sitting on a position with
	// valid[i] == false is swapped to another position. valid_indices
	// collects earlier positions that are 'S' and valid (at most nons_len
	// of them) as swap targets.
	std::vector<long> valid_indices;
	long repl = command_str.size() - 1;
	for (long i = 0; i < command_str.size(); i++) {
		if (command_str[i] != 'S' && !valid[i]) {
			if (!valid_indices.empty()) {
				repl = valid_indices.back();
				valid_indices.pop_back();
			} else {
				// No stored target: walk repl down to a valid position.
				// NOTE(review): this scan does not check that
				// command_str[repl] is 'S', and it stops at repl == 0
				// without testing valid[0].
				for (; repl > 0; repl--) {
					if (valid[repl]) {
						break;
					}
				}
			}
			std::swap(command_str[i], command_str[repl]);
		} else if (command_str[i] == 'S'
			   && valid[i]
			   && valid_indices.size() < nons_len) {

			valid_indices.push_back(i);
		}
	}
	// std::set<long> s_ins, s_del, s_switch;
	// generate_unique_set(command_str.size(), s_ins, maxInsert, s_del, s_switch, valid);
	// generate_unique_set(command_str.size(), s_del, maxDel, s_ins, s_switch, valid);
	// generate_unique_set(command_str.size(), s_switch, maxSwitch, s_ins, s_del, valid);
	// for (auto idx : s_ins) {
	// 	command_str[idx] = 'I';
	// }
	// for (auto idx : s_del) {
	// 	command_str[idx] = 'D';
	// }
	// for (auto idx : s_switch) {
	// 	command_str[idx] = 'W';
	// }
	out_seq = "";
	out_seq.reserve(maxInsert + seq_len - maxDel + 1);

	// Emit the output one input position at a time.
	for (long i = 0; i < seq_len; i++) {
		auto cmd = command_str.at(i);
		switch (cmd) {
		case 'I': {
			// Random nucleotide first, then the original character.
			out_seq += INSERT_BEGIN + randNucl() + INSERT_END;
			out_seq += seq->at(i);
			break;
		}
		case 'S': {
			// Unchanged position.
			out_seq += seq->at(i);
			break;
		}
		case 'D': {
			// The original character is dropped; only the marker is written.
			out_seq += DEL;
			break;
		}
		case 'W': {
			// A random nucleotide replaces the original character; randNucl()
			// is not told what the original was, so it can return the same one.
			out_seq += SWITCH_BEGIN + randNucl() + SWITCH_END;
			break;
		}
		}
	}
}
163 | 


--------------------------------------------------------------------------------
/src/predict/SingMute.h:
--------------------------------------------------------------------------------
 1 | /* -*- C++ -*- */
 2 | /*
 3 |  * SingMute.h
 4 |  *
 5 |  * Original Author: Alexander Baumgartner
 6 |  * Modified by Benjamin T James
 7 |  */
 8 | 
 9 | #ifndef SINGMUTE_H
10 | #define SINGMUTE_H
11 | 
12 | #include <vector>
13 | #include <string>
14 | #include "Random.h"
15 | #include "LCG.h"
16 | 
17 | class SingMute {
18 | public:
19 | 	/*
20 | 	 Constructor, creates values
21 | 	 	and assignes allocations based on inputted data
22 | 
23 | 	 @param:
24 | 	 int: percentage of A's
25 | 	 int: percentage of C's
26 | 	 int: percentage of G's
27 | 	 int: percentage of T's
28 | 	 int: The total allocation for non-single mutations
29 | 	 */
30 | 	SingMute(int pa, int pc, int pg, int pt, uintmax_t tt, const std::string* s, const std::vector<bool> &valid_, std::random_device::result_type seed) : percAs(pa),
31 | 																	percCs(pc), percGs(pg), percTs(pt), num_mut(tt), seq(s), rng(seed) {
32 | 		init(valid_);
33 | 	}
34 | 	long getAlignmentLength() { return alignmentLength; }
35 | 	long getIBP() { return IBP; }
36 | 	void init(const std::vector<bool> &valid);
37 | 	std::string& getSeq() { return out_seq; };
38 |   private:
39 | 	uintmax_t num_mut;
40 | 	int percAs;
41 | 	int percCs;
42 | 	int percGs;
43 | 	int percTs;
44 | 
45 | 	long maxDel;
46 | 	long maxInsert;
47 | 	long maxSwitch;
48 | 
49 | 	long alignmentLength;
50 | 	long IBP;
51 | 	const std::string * seq;
52 | 	std::string out_seq;
53 | 	char randNucl();
54 | 	LCG rng;
55 | };
56 | #endif
57 | 


--------------------------------------------------------------------------------
/src/utility/EmptyLocation.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * EmptyLocation.cpp
 3 |  *
 4 |  *  Created on: Dec 28, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "EmptyLocation.h"
 9 | #include "../exception/InvalidOperationException.h"
10 | 
11 | using namespace exception;
12 | 
13 | namespace utility {
14 | 
15 | EmptyLocation * EmptyLocation::INSTANCE = new EmptyLocation();
16 | 
17 | EmptyLocation * EmptyLocation::getInstance(){
18 | 	return INSTANCE;
19 | }
20 | 
21 | EmptyLocation::EmptyLocation() {
22 | 	msg = new string("Empty location does not allow this operation.");
23 | }
24 | 
25 | EmptyLocation::~EmptyLocation() {
26 | 	delete msg;
27 | }
28 | 
29 | string EmptyLocation::toString() {
30 | 	return string("Empty");
31 | }
32 | 
33 | int EmptyLocation::getEnd() const {
34 | 	throw InvalidOperationException(*msg);
35 | }
36 | 
37 | int EmptyLocation::getStart() const {
38 | 	throw InvalidOperationException(*msg);
39 | }
40 | 
41 | void EmptyLocation::setEnd(int int1) {
42 | 	throw InvalidOperationException(*msg);
43 | }
44 | 
45 | void EmptyLocation::setStart(int int1) {
46 | 	throw InvalidOperationException(*msg);
47 | }
48 | 
49 | int EmptyLocation::getLength() {
50 | 	throw InvalidOperationException(*msg);
51 | }
52 | 
53 | } /* namespace tr */
54 | 


--------------------------------------------------------------------------------
/src/utility/EmptyLocation.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * EmptyLocation.h
 3 |  *
 4 |  *  Created on: Dec 28, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef EMPTYLOCATION_H_
 9 | #define EMPTYLOCATION_H_
10 | 
11 | #include "ILocation.h"
12 | 
13 | namespace utility {
14 | 
15 | class EmptyLocation: public ILocation {
16 | private:
17 | 	string * msg;
18 | 	static EmptyLocation * INSTANCE;
19 | 	EmptyLocation();
20 | 	virtual ~EmptyLocation();
21 | 
22 | public:
23 | 	virtual int getEnd() const;
24 | 	virtual int getStart() const;
25 | 	virtual void setEnd(int);
26 | 	virtual void setStart(int);
27 | 	virtual int getLength();
28 | 	virtual string toString();
29 | 
30 | 	static EmptyLocation * getInstance();
31 | 
32 | };
33 | 
34 | } /* namespace tr */
35 | #endif /* EMPTYLOCATION_H_ */
36 | 


--------------------------------------------------------------------------------
/src/utility/GlobAlignE.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Author: Joseph Valencia
  3 |  * Modified by Benjamin James
  4 |  * Date: 12/14/17
  5 |  * Bioinformatics Toolsmith Laboratory, University of Tulsa
  6 |  * */
  7 | #include <string>
  8 | #include "../exception/InvalidStateException.h"
  9 | #include <algorithm>
 10 | #include <vector>
 11 | #include <iostream>
 12 | #include <fstream>
 13 | #include <limits.h>
 14 | #include <string.h>
 15 | #include <cmath>
 16 | #include "GlobAlignE.h"
 17 | 
 18 | using namespace std;
 19 | using namespace utility;
 20 | using namespace exception;
 21 | 
 22 | GlobAlignE::GlobAlignE(const char * seq1In, int start1In, int end1In, const char * seq2In,
 23 |         int start2In, int end2In, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn){
 24 | 
 25 |     seq1 = seq1In;
 26 | 	start1 = start1In;
 27 | 	end1 = end1In;
 28 | 
 29 |     seq2 = seq2In;
 30 | 	start2 = start2In;
 31 | 	end2 = end2In;
 32 | 
 33 | 	len1 = end1 - start1 + 2;
 34 |     len2 = end2 - start2 + 2;
 35 | 
 36 |     //Incremental score storage
 37 |     matches = new int[len1];
 38 |     upperGap = new int[len1];
 39 |     lowerGap = new int[len1];
 40 | 
 41 | 
 42 | 
 43 |     //Incremental length storage
 44 |     matchLen = new int[len1];
 45 |     upperLen = new int[len1];
 46 |     lowerLen = new int[len1];
 47 | 
 48 |     //Incremental identity storage
 49 |     matchId = new int[len1];
 50 |     upperId = new int[len1];
 51 |     lowerId = new int[len1];
 52 | 
 53 |     match = matchIn;
 54 |     mismatch = mismatchIn;
 55 |     gapOpen = gapOpenIn;
 56 |     gapContinue = gapContinueIn;
 57 |     findAlignment();
 58 | 
 59 | }
 60 | /*
 61 | GlobAlignE::GlobAlignE(string filename1,string filename2, int matchIn, int mismatchIn, int gapOpenIn, int gapContinueIn):GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn){
 62 | 
 63 |     ifstream ifs;
 64 | 
 65 |     ifs.open (filename1, ifstream::in);
 66 |     cout<<"FILE OPENED"<<endl;
 67 |     char c = ifs.get();
 68 | 
 69 |     if(c == '>'){
 70 | 
 71 |         while(c!='\n'){
 72 |             c = ifs.get();
 73 | 
 74 |         }
 75 |     }
 76 | 
 77 |      string string1  ="";
 78 | 
 79 |       while (ifs.good()) {
 80 | 
 81 | 
 82 |         if (c!='\n'){
 83 |         string1+=c;
 84 |         }
 85 |         c = ifs.get();
 86 |       }
 87 | 
 88 |       ifs.close();
 89 | 
 90 | 
 91 |      ifstream ifs2;
 92 | 
 93 |      ifs2.open (filename2, ifstream::in);
 94 | 
 95 |      c = ifs2.get();
 96 | 
 97 |      if(c == '>'){
 98 | 
 99 |          while(c!='\n'){
100 |               c = ifs2.get();
101 |          }
102 |      }
103 | 
104 |      string string2  ="";
105 | 
106 |      while (ifs2.good()) {
107 | 
108 |          if(c!='\n'){
109 |          string2+=c;
110 |          }
111 |          c = ifs2.get();
112 |      }
113 | 
114 |      ifs2.close();
115 | 
116 |      std::transform(string1.begin(),string1.end(),string1.begin(),::toupper);
117 |      std::transform(string2.begin(),string2.end(),string2.begin(),::toupper);
118 | 
119 |    // return GlobAlignE(string1.c_str(),0,string.size(),string2.c_str(),0,string2.size(),matchIn,mismatchIn,gapOpenIn,gapContinueIn);
120 | 
121 | }
122 | */
/**
 * Fills in alignmentScore, alignmentLength and totalMatches for the two
 * input ranges.
 *
 * Three score arrays of len1 entries (matches, upperGap, lowerGap) are
 * updated in place once per value of j, so only one row of each is kept.
 * Parallel arrays hold, for the same cell, the number of alignment columns
 * (*Len) and the number of identical pairs (*Id) behind the stored score.
 *
 * Index i walks seq1 (character seq1[start1+i-1]) and index j walks seq2
 * (character seq2[start2+j-1]); index 0 stands for the empty prefix.
 */
void GlobAlignE::findAlignment(){

    // Build the sentinel used in place of minus infinity from the length
    // difference, the gap costs and the mismatch score.
    // NOTE(review): this is only a safe lower bound if mismatch <= 0 and the
    // gap costs are >= 0 -- confirm against callers.
    int shorter = min(len2,len1)-1;
    int lenDiff = abs(len2-len1);
    int maxDiff=0;

    if (lenDiff >=1){
        maxDiff += -gapOpen- (lenDiff*gapContinue);
    }

    maxDiff+= (mismatch* shorter)-1;

    const int negativeInf = maxDiff;

    // Cell 0 for j = 0: empty prefix against empty prefix.
    matches[0]= 0;
    upperGap[0] = negativeInf;
    lowerGap[0] = negativeInf;

    matchLen[0] =0;
    upperLen[0] =0;
    lowerLen[0] =0;

    matchId[0] =0;
    upperId[0] = 0;
    lowerId[0] =0;

    //initial values for j = 0: only lowerGap gets a real score (one gap of i positions)
    for (int i = 1; i<len1;i++){
        upperGap[i] = negativeInf;
        matches[i] = negativeInf;
        lowerGap[i] = (-gapOpen)- (i*gapContinue);
        matchLen[i]=i;
        upperLen[i]=i;
        lowerLen[i]=i;
        matchId[i] =0;
        upperId[i] =0;
        lowerId[i] =0;
    }

    for( int j = 1;j<len2;j++){

        // The *Lag variables carry the values of cell (i-1) from the previous
        // j, because the arrays are overwritten as i advances.
        int matchLag = matches[0]; //used for calculation of matches
        int matchLenLag = matchLen[0];
        int matchIdLag = matchId[0];

        // Values used as the upperGap state of cell 0 for the previous j.
        int upperGapLag = (-gapOpen)-((j-1)*gapContinue);
        int upperLenLag = j-1;
        int upperIdLag =0;

        for(int i =1;i<len1;i++){

            //compute values for upperGap
            // matches[i] and upperGap[i] still hold the previous j here.
            int ygapBegin = matches[i]-(gapOpen+gapContinue);
            int ygapCont = upperGap[i]-gapContinue;

            int ans = max(ygapBegin,ygapCont);

            // Keep the previous-j upperGap state of this cell; it becomes the
            // lag for the next i at the bottom of the loop.
            int store1 = upperGap[i];
            int store2 = upperLen[i];
            int store3 = upperId[i];

            upperGap[i] = ans;

            // Opening a gap is tested first, so it wins ties.
            if( ans == ygapBegin){
                upperLen[i] = matchLen[i]+1;
                upperId[i] = matchId[i];
            }
            else if(ans == ygapCont){
                upperLen[i] = upperLen[i]+1;
                upperId[i] = upperId[i]; // self-assignment: identity count is unchanged
            }


            // compute values for match/mismatch
            char a= seq1[start1+i-1];
            char b = seq2[start2+j-1];
            int matchScore = (a == b) ? match : mismatch;

            // Three predecessors, all taken from cell (i-1) of the previous j.
            int matched = matchLag + matchScore;

            int xgapEnd = lowerGap[i-1] + matchScore;

            int ygapEnd = upperGapLag+ matchScore;

            ans = max(max(matched,xgapEnd),ygapEnd);

            matchLag = matches[i]; //store current val matches in lag
            matches[i] =ans;

            // Previous-j length/identity of this cell, saved before overwriting.
            int temp = matchLen[i];
            int save = matchId[i];

            // Ties are resolved in the order: matched, xgapEnd, ygapEnd.
            if(ans == matched){
                matchLen[i] = matchLenLag+1;
                if(matchScore == match){
                    matchId[i] = matchIdLag+1;
                }
                else{
                    matchId[i] = matchIdLag;
                }
            }
            else if (ans == xgapEnd){
                matchLen[i] = lowerLen[i-1]+1;
                if(matchScore ==match){
                    matchId[i] = lowerId[i-1]+1;
                }
                else{
                    matchId[i] = lowerId[i-1];
                }
            }
            else{
                matchLen[i] = upperLenLag+1;
                if(matchScore ==match){
                matchId[i] = upperIdLag+1;
                }
                else{
                    matchId[i] = upperIdLag;
                }
            }
            // Hand the saved previous-j values to the next i.
            matchLenLag = temp;
            matchIdLag = save;
            upperGapLag= store1;
            upperLenLag = store2;
            upperIdLag = store3;

        }

        // Cell 0 for the current j.
        matches[0] = negativeInf;
        matchLen[0] = j;
        matchId[0] =0;

        lowerGap[0]= negativeInf;
        lowerLen[0] = j;
        lowerId[0] =0;

        // lowerGap for the current j: reads matches[i-1] and lowerGap[i-1],
        // both of which have already been updated for this j.
        for(int i = 1;i<len1;i++){

                int xgapBegin = matches[i-1] -(gapOpen+gapContinue);
                int xgapCont = lowerGap[i-1]- gapContinue;
                int ans = max(xgapBegin,xgapCont);
                lowerGap[i]=ans;
                if(ans ==xgapBegin){
                    lowerLen[i] = matchLen[i-1]+1;
                    lowerId[i] = matchId[i-1];
                }
                else{
                    lowerLen[i] = lowerLen[i-1]+1;
                    lowerId[i] = lowerId[i-1];
                }

        }


    }

   // Best of the three states in the last cell; ties prefer matches, then lowerGap.
   alignmentScore= max(max(matches[len1-1], lowerGap[len1-1]), upperGap[len1-1]);

   if(alignmentScore == matches[len1-1]){
           alignmentLength = matchLen[len1-1];
           totalMatches = matchId[len1-1];
    }
    else if(alignmentScore == lowerGap[len1-1]){
            alignmentLength = lowerLen[len1-1];
            totalMatches= lowerId[len1-1];
    }
    else{
            alignmentLength = upperLen[len1-1];
            totalMatches = upperId[len1-1];
    }
}
293 | 
294 | int GlobAlignE::getScore(){
295 |     return alignmentScore;
296 | }
297 | int GlobAlignE::getLength(){
298 |     return alignmentLength;
299 | }
300 | 
301 | double GlobAlignE::getIdentity(){
302 |    double totalMatch = (double) totalMatches;
303 | 
304 |     return totalMatch/alignmentLength;
305 | }
306 | GlobAlignE::~GlobAlignE(){
307 |     delete [] matches;
308 |     delete [] upperGap;
309 |     delete [] lowerGap;
310 |     delete [] matchLen;
311 |     delete [] upperLen;
312 |     delete [] lowerLen;
313 |     delete [] matchId;
314 |     delete [] upperId;
315 |     delete [] lowerId;
316 | 
317 | }
318 | 


--------------------------------------------------------------------------------
/src/utility/GlobAlignE.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /**
 3 |  * Author: Joseph Valencia
 4 |  * Modified by Benjamin James
 5 |  * Date: 12/14/17
 6 |  * Bioinformatics Toolsmith Laboratory, University of Tulsa
 7 |  * */
 8 | #ifndef Glob_AlignE_H_
 9 | #include <string>
10 | 
11 | using namespace std;
12 | 
13 | namespace utility{
14 | 
15 | class GlobAlignE{
16 | 
17 | private:
18 |     const char * seq1; //first sequence to be aligned
19 |     int start1;
20 |     int end1;
21 |     const char * seq2;//second sequence to be aligned
22 |     int start2;
23 |     int end2;
24 |     int len1;
25 |     int len2;
26 |     int lenTotal;
27 |     int match; //score for base pair match
28 |     int mismatch;//score for base pair mismatch
29 |     int gapOpen; //cost to open a gap
30 |     int gapContinue; //cost to continue a gap
31 |     int * matches;
32 |     int * upperGap;
33 |     int * lowerGap;
34 |     int * matchLen;
35 |     int * upperLen;
36 |     int * lowerLen;
37 |     int * matchId;
38 |     int * upperId;
39 |     int * lowerId;
40 |     int alignmentScore;
41 |     int alignmentLength;
42 |     int totalMatches;
43 |     string topString;
44 |     string bottomString;
45 | public:
46 |     GlobAlignE(const char*,int,int,const char *,int,int, int,int,int,int);
47 |     GlobAlignE(string,string,int,int,int,int);
48 |     virtual ~GlobAlignE();
49 |     void findAlignment();
50 |     double getIdentity();
51 |     int getLength();
52 |     void printAlignment();
53 |     int getScore();
54 |     int getLengthAlignment();
55 | 
56 | };
57 | }
58 | #endif
59 | 


--------------------------------------------------------------------------------
/src/utility/ILocation.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ILocation.h
 3 |  *
 4 |  *  Created on: Dec 20, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef ILOCATION_H_
 9 | #define ILOCATION_H_
10 | 
11 | #include <string>
12 | 
13 | using namespace std;
14 | 
namespace utility {

/**
 * Interface for a location described by a start and an end coordinate.
 */
class ILocation {
public:
	/**
	 * Virtual so that deleting an implementation through an ILocation
	 * pointer runs the implementation's destructor.
	 */
	virtual ~ILocation() {}

	virtual int getEnd() const = 0;
	virtual int getStart() const = 0;
	virtual void setEnd(int) = 0;
	virtual void setStart(int) = 0;
	virtual int getLength() = 0;
	virtual string toString() = 0;
};

}
28 | 
29 | #endif /* ILOCATION_H_ */
30 | 


--------------------------------------------------------------------------------
/src/utility/LCSLen.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * LCSLen.cpp
  3 |  *
  4 |  *  Created on: Dec 6, 2012
  5 |  *      Author: Hani Zakaria Girgis, PhD
  6 |  */
  7 | 
  8 | #include "LCSLen.h"
  9 | #include "../utility/Util.h"
 10 | #include "../exception/InvalidInputException.h"
 11 | 
 12 | #include <iostream>
 13 | 
 14 | using namespace std;
 15 | using namespace exception;
 16 | 
 17 | namespace utility {
 18 | 
 19 | LCSLen::LCSLen(const char * seq1In, int start1In, int end1In,
 20 | 		const char * seq2In, int start2In, int end2In) {
 21 | 	seq1 = seq1In;
 22 | 	start1 = start1In;
 23 | 	end1 = end1In;
 24 | 
 25 | 	seq2 = seq2In;
 26 | 	start2 = start2In;
 27 | 	end2 = end2In;
 28 | 
 29 | 	if(start1 < 0 || end1 < 0 || start1 > end1){
 30 | 		string msg("Invalid Input. Start1 is ");
 31 | 		msg.append(Util::int2string(start1));
 32 | 		msg.append(". End 1 is ");
 33 | 		msg.append(Util::int2string(end1));
 34 | 		msg.append(".");
 35 | 		throw InvalidInputException(msg);
 36 | 	}
 37 | 
 38 | 	if(start2 < 0 || end2 < 0 || start2 > end2){
 39 | 		string msg("Invalid Input. Start2 is ");
 40 | 		msg.append(Util::int2string(start2));
 41 | 		msg.append(". End2 is ");
 42 | 		msg.append(Util::int2string(end2));
 43 | 		msg.append(".");
 44 | 		throw InvalidInputException(msg);
 45 | 	}
 46 | 
 47 | 	// Validate input
 48 | 	// cout << start1 << " " << end1 << endl;
 49 | 	// cout << start2 << " " << end2 << endl;
 50 | 
 51 | 
 52 | 	len1 = end1 - start1 + 2;
 53 | 	len2 = end2 - start2 + 2;
 54 | 
 55 | 	lenTotal = 2 * len2;
 56 | 	cTable = new int[lenTotal];
 57 | 
 58 | 	for (int i = 0; i < lenTotal; i++) {
 59 | 		cTable[i] = 0;
 60 | 	}
 61 | 
 62 | 	findLcs();
 63 | }
 64 | 
 65 | LCSLen::~LCSLen() {
 66 | 	delete[] cTable;
 67 | }
 68 | 
 69 | void LCSLen::findLcs() {
 70 | 	int iM1Index = 0;
 71 | 	int iIndex = len2;
 72 | 
 73 | 	for (int i = 1; i < len1; i++) {
 74 | 		char base1 = seq1[start1 + i - 1];
 75 | 
 76 | 		for (int j = 1; j < len2; j++) {
 77 | 			int ijIndex = iIndex + j;
 78 | 			if (base1 == seq2[start2 + j - 1]) {
 79 | 				cTable[ijIndex] = cTable[iM1Index + j - 1] + 1;
 80 | 			} else {
 81 | 				if (cTable[iM1Index + j] > cTable[iIndex + j - 1]) {
 82 | 					cTable[ijIndex] = cTable[iM1Index + j];
 83 | 				} else {
 84 | 					cTable[ijIndex] = cTable[iIndex + j - 1];
 85 | 				}
 86 | 			}
 87 | 		}
 88 | 
 89 | 		if(i != len1-1){
 90 | 			for(int h = 0; h < len2; h++){
 91 | 				cTable[h] = cTable[len2+h];
 92 | 			}
 93 | 		}
 94 | 	}
 95 | 	lenCS =  cTable[lenTotal-1];
 96 | }
 97 | 
 98 | int LCSLen::getLenCS(){
 99 | 	return lenCS;
100 | }
101 | 
102 | }
103 | /* namespace utility */
104 | 


--------------------------------------------------------------------------------
/src/utility/LCSLen.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * LCSLen.h
 3 |  *
 4 |  *  Created on: Dec 6, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef LCSLEN_H_
 9 | #define LCSLEN_H_
10 | 
namespace utility {

/**
 * Computes a common-subsequence length for two character ranges.
 * The computation runs in the constructor; getLenCS() reports the result.
 */
class LCSLen {
public:
	// (seq1, start1, end1, seq2, start2, end2)
	LCSLen(const char *, int, int, const char *, int, int);
	virtual ~LCSLen();
	int getLenCS();

private:
	// First range
	const char * seq1;
	int start1;
	int end1;

	// Second range
	const char * seq2;
	int start2;
	int end2;

	// Table dimensions and result
	int len1;
	int len2;
	int lenTotal;
	int lenCS;

	// Working table, owned by this object
	int * cTable;

	void findLcs();
};

} /* namespace utility */
37 | #endif /* LCSLEN_H_ */
38 | 


--------------------------------------------------------------------------------
/src/utility/Location.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Location.cpp
 3 |  *
 4 |  *  Created on: Dec 19, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #include "Location.h"
 9 | #include "Util.h"
10 | #include "../exception/InvalidInputException.h"
11 | 
12 | using namespace exception;
13 | 
14 | namespace utility {
15 | 
16 | Location::Location(int startIn, int endIn) {
17 | 	initialize(startIn, endIn);
18 | }
19 | 
20 | Location::Location(ILocation& cp) {
21 | 	initialize(cp.getStart(), cp.getEnd());
22 | }
23 | 
24 | void Location::initialize(int startIn, int endIn) {
25 | 	start = startIn;
26 | 	end = endIn;
27 | 	check();
28 | 
29 | }
30 | 
31 | void Location::check() {
32 | 	if (start < 0 || end < 0 || start > end) {
33 | 		string msg("Invalid Input. Start is ");
34 | 		msg.append(Util::int2string(start));
35 | 		msg.append(". End is ");
36 | 		msg.append(Util::int2string(end));
37 | 		msg.append(".");
38 | 		throw InvalidInputException(msg);
39 | 	}
40 | }
41 | 
42 | Location::~Location() {
43 | }
44 | 
45 | int Location::getEnd() const {
46 | 	return end;
47 | }
48 | 
49 | int Location::getStart() const {
50 | 	return start;
51 | }
52 | 
53 | void Location::setEnd(int endIn) {
54 | 	end = endIn;
55 | 	check();
56 | }
57 | 
58 | void Location::setStart(int startIn) {
59 | 	start = startIn;
60 | 	check();
61 | }
62 | 
63 | int Location::getLength() {
64 | 	return end - start + 1;
65 | }
66 | 
67 | string Location::toString() {
68 | 	string msg = (Util::int2string(start));
69 | 	msg.append("-");
70 | 	msg.append(Util::int2string(end));
71 | 
72 | 	return msg;
73 | }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/utility/Location.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Location.h
 3 |  *
 4 |  *  Created on: Dec 19, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef LOCATION_H_
 9 | #define LOCATION_H_
10 | 
11 | #include "ILocation.h"
12 | 
13 | #include <string>
14 | 
15 | using namespace std;
16 | 
17 | namespace utility {
18 | 
19 | class Location : public ILocation{
20 | private:
21 | 	int start;
22 | 	int end;
23 | 	void initialize(int, int);
24 | 	void check();
25 | 
26 | public:
27 | 	Location(int, int);
28 | 	Location(ILocation&);
29 | 	virtual ~Location();
30 | 
31 | 	int getEnd() const;
32 | 	int getStart() const;
33 | 	void setEnd(int);
34 | 	void setStart(int);
35 | 	int getLength();
36 | 	string toString();
37 | };
38 | 
39 | }
40 | 
41 | #endif /* LOCATION_H_ */
42 | 


--------------------------------------------------------------------------------
/src/utility/Util.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Util.h
 3 |  *
 4 |  *  Created on: Apr 24, 2012
 5 |  *      Author: Hani Zakaria Girgis, PhD
 6 |  */
 7 | 
 8 | #ifndef UTIL_H_
 9 | #define UTIL_H_
10 | 
11 | #include "Location.h"
12 | #include "../exception/FileDoesNotExistException.h"
13 | #include "../exception/InvalidInputException.h"
14 | 
15 | #include <vector>
16 | #include <string>
17 | #include <iostream>
18 | #include <fstream>
19 | #include <sstream>
20 | #include <stdlib.h>
21 | #include <dirent.h>
22 | 
23 | using namespace std;
24 | using namespace utility;
25 | using namespace exception;
26 | 
27 | namespace utility {
28 | class Util {
29 | private:
30 | 	Util();
31 | 	~Util();
32 | 
33 | public:
34 | 	static string * emptyString;
35 | 	static string fileSeparator;
36 | 	static bool isDna;
37 | 	static void readFasta(string, vector<string> *, vector<string> *, bool);
38 | 	static void readFasta(string, vector<string> *, vector<string> *);
39 | 	static void readCoordinates(string, vector<Location *> *);
40 | 	static void readChromList(string, vector<string> *, string);
41 | 	static void toUpperCase(string*);
42 | 	static void toUpperCase(string&);
43 | 	static string int2string(int);
44 | 	static string double2string(double);
45 | 	static string long2string(long);
46 | 	static void deleteFile(string);
47 | 	static void deleteFilesUnderDirectory(string);
48 | 	static void checkFile(string);
49 | 	static bool isOverlapping(int, int, int, int);
50 | 	static void revCompDig(string *, string *);
51 | 	static void revCompDig(const char* sequence, int, int, string *);
52 | 
53 | 	static void writeFasta(const string&, const string&, const string&);
54 | 
55 | 	static int sumTotalLength(const vector<ILocation *> *);
56 | 
57 | 	// Added on Oct 6 2018
58 | 	static const int getAlphabetSize();
59 | 
60 | 	/**
61 | 	 * Delete the objects pointed to by pointers in a vector.
62 | 	 * It does not delete the vector itself.
63 | 	 *
64 | 	 * Credit: http://stackoverflow.com/questions/594089/does-stdvector-clear-do-delete-free-memory-on-each-element
65 | 	 */
66 | 	template<class T>
67 | 	static void deleteInVector(vector<T*> * deleteMe) {
68 | 		while (!deleteMe->empty()) {
69 | 			delete deleteMe->back();
70 | 			deleteMe->pop_back();
71 | 		}
72 | 
73 | 		// Set the size to zero
74 | 		deleteMe->clear();
75 | 
76 | 		// Set the capacity to zero
77 | 		vector<T*> empty;
78 | 		deleteMe->swap(empty);
79 | 	}
80 | };
81 | }
82 | 
83 | #endif /* UTIL_H_ */
84 | 


--------------------------------------------------------------------------------