├── .gitconfig ├── LICENSE.txt ├── README.md └── ftrie.hpp /.gitconfig: -------------------------------------------------------------------------------- 1 | [github] 2 | mirror = remuval 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 20129 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | The code in this repository is a complement to the paper [Practical universal _k_-mer sets for minimizer schemes](https://dl.acm.org/citation.cfm?id=3342144). 
// ftrie.hpp
//
// It provides a C++ library to read and query the k-mer data files available
// on Zenodo (All General and Human sK2-sK6: https://zenodo.org/record/3385067,
// Human sK7-sK10: https://zenodo.org/record/3385069).
//
// Usage
// -----
// To install, simply copy the file `ftrie.hpp` into your own project.
//
// The following code reads a file, and queries whether the k-mer `ACGT` is
// present in the trie.
//
//   #include <fstream>
//   #include <iostream>
//   #include "ftrie.hpp"
//
//   typedef ftrie<4> trie_type; // Trie with alphabet of size 4
//
//   int main(int argc, char *argv[]) {
//     std::ifstream ftrie_in("path/to/file");
//     trie_type uhmers(ftrie_in);
//     ftrie_in.close();
//
//     // Search for ACGT
//     switch(uhmers.find("ACGT")) {
//     case trie_type::NOT:
//       std::cerr << "Not found\n";
//       break;
//
//     case trie_type::HAS_PREFIX:
//       std::cerr << "A prefix of ACGT is present in the trie\n";
//       break;
//
//     case trie_type::PREFIX_OF:
//       std::cerr << "ACGT is a prefix of a k-mer in the trie\n";
//       break;
//     }
//
//     return 0;
//   }

#ifndef FTRIE_HPP_INCLUDED
#define FTRIE_HPP_INCLUDED

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>

// Trie over an alphabet of AS letters, stored as a flat vector of nodes that
// reference each other by index (type OFFSET) rather than by pointer.
//
// Two nodes have a fixed position: the root at index 0 and a single shared
// FINAL node at index 1. An edge value of 0 means "no edge" (the root is never
// the target of an edge). Inserting a string makes its last edge point to the
// shared FINAL node, which stands for "everything below here is present".
template<unsigned int AS, typename OFFSET = uint32_t>
class ftrie {
  static_assert(AS > 0,
                "Alphabet size cannot be 0");

protected:
  enum STATE { EMPTY, INTERNAL, FINAL };

  // One trie node: a state and one outgoing edge per letter. For a node on
  // the free list (state EMPTY), edges[0] holds the index of the next free
  // node instead.
  struct node {
    STATE  state;
    OFFSET edges[AS];

    explicit node(STATE s = EMPTY)
      : state(s)
    {
      clear();
    }

    // Reset every edge to 0 ("no edge"). The state is left untouched.
    void clear() {
      std::fill_n(edges, AS, OFFSET(0));
    }
  };
  static_assert(std::is_nothrow_move_constructible<node>::value,
                "That's a BUG. Node should be nothrow move constructible");

  static constexpr OFFSET root_offset  = 0; // Guard value. Index of the root node
  static constexpr OFFSET final_offset = 1; // Guard value. Index of the shared FINAL node
  const std::string      alphabet;   // the AS letters, in edge order
  const std::vector<int> translate;  // byte value -> letter index, or -1 if not in the alphabet
  OFFSET                 empty_head; // head of the list of empty nodes (0 if the list is empty)
  std::vector<node>      nodes;      // vector of nodes making up the trie

  // Build the byte -> letter index table for the s letters starting at a.
  // Throws std::range_error if s differs from AS.
  static std::vector<int> create_translate(const char* a, size_t s) {
    if(s != AS)
      throw std::range_error("Alphabet of wrong length");
    std::vector<int> res(256, -1);
    for(size_t i = 0; i < s; ++i) {
      // Go through unsigned char: plain char may be signed, and a negative
      // value must not be used as an index.
      res[static_cast<unsigned char>(a[i])] = static_cast<int>(i);
    }
    return res;
  }

  // Read the alphabet, which is the first line of a dumped trie.
  static std::string read_alphabet(std::istream& is) {
    std::string res;
    std::getline(is, res);
    return res;
  }

  // Index of letter c in the alphabet.
  // Throws std::range_error if c is not part of the alphabet.
  int letter_code(char c) const {
    const int v = translate[static_cast<unsigned char>(c)];
    if(v < 0) throw std::range_error("Invalid letter in alphabet");
    return v;
  }

  // Sanity checks on a trie read from a stream, so that later traversals
  // never index outside of the nodes vector.
  void validate_loaded() const {
    const size_t nb = nodes.size();
    if(nb < 2) throw std::range_error("Incomplete ftrie file");
    for(const node& n : nodes) {
      for(unsigned i = 0; i < AS; ++i) {
        if(static_cast<size_t>(n.edges[i]) >= nb)
          throw std::range_error("Edge offset out of range");
      }
    }
    if(empty_head != 0 && nodes[empty_head].state != EMPTY)
      throw std::range_error("Invalid head of empty list");
  }

public:
  // Result of find(), for a query string q:
  //   NOT        - no string in the trie is related to q,
  //   HAS_PREFIX - a string in the trie is a prefix of q (possibly q itself),
  //   PREFIX_OF  - q is a strict prefix of a string in the trie.
  enum FOUND { NOT, HAS_PREFIX, PREFIX_OF };

  // Create an empty trie over the alphabet made of the s letters at a.
  // Throws std::range_error if s differs from AS.
  ftrie(const char* a, size_t s)
    : alphabet(a, s)
    , translate(create_translate(a, s))
    , empty_head(0)
  {
    nodes.emplace_back(INTERNAL); // root, at root_offset
    nodes.emplace_back(FINAL);    // shared final node, at final_offset
  }
  ftrie(const char* a)
    : ftrie(a, strlen(a))
  { }
  ftrie(const std::string& s)
    : ftrie(s.c_str(), s.size())
  { }

  // Read a trie in the text format written by dump(): the alphabet on the
  // first line, then the head of the empty list, then for each node its
  // state followed by its AS edges.
  // Throws std::range_error on a truncated or inconsistent file.
  ftrie(std::istream& is)
    : ftrie(read_alphabet(is))
  {
    is >> empty_head;
    if(!is) throw std::range_error("Incomplete ftrie file");

    nodes.clear();
    node n;
    for(size_t cur = 0; true; ++cur) {
      unsigned s = 0;
      if(!(is >> s)) {
        // Running out of input between two nodes is the normal end of file.
        if(is.eof()) break;
        throw std::range_error("Invalid node state");
      }
      if(s > FINAL) throw std::range_error("Invalid node state");
      n.state = static_cast<STATE>(s);
      if(cur == 0 && n.state != INTERNAL)
        throw std::range_error("Root node must have INTERNAL state");
      if(cur == 1 && n.state != FINAL)
        throw std::range_error("Guard node must have FINAL state");
      for(unsigned i = 0; i < AS; ++i)
        is >> n.edges[i];
      // Only failbit matters here: eofbit alone is set when the last number
      // of the file is not followed by a newline, and that node is complete.
      if(!is) throw std::range_error("Incomplete ftrie file");
      nodes.push_back(n);
    }
    validate_loaded();
  }

protected:
  // Recursively delete an unused subtree and add the nodes to the
  // empty list for future reuse. The shared FINAL node is never deleted.
  void delete_subtree(OFFSET start) {
    if(nodes[start].state == FINAL) {
      assert(start == final_offset);
      return;
    }

    for(unsigned i = 0; i < AS; ++i) {
      OFFSET& e = nodes[start].edges[i];
      if(e) delete_subtree(e);
      e = 0;
    }
    // Mark node as empty and prepend to list of empty nodes
    nodes[start] = node(EMPTY);
    nodes[start].edges[0] = empty_head;
    empty_head = start;
  }

  // Pop the first empty node from the empty list, if any. Otherwise,
  // append a new one at the end of the nodes vector. The returned node is
  // INTERNAL with no edges. Appending may reallocate the nodes vector.
  // Throws std::length_error if the new index does not fit in OFFSET.
  OFFSET next_empty_node() {
    if(empty_head) {
      const OFFSET res = empty_head;
      empty_head = nodes[res].edges[0];
      nodes[res].clear();
      nodes[res].state = INTERNAL;
      return res;
    }
    if(nodes.size() > static_cast<size_t>(std::numeric_limits<OFFSET>::max()))
      throw std::length_error("Too many nodes for the offset type");
    nodes.emplace_back(INTERNAL);
    return static_cast<OFFSET>(nodes.size() - 1);
  }

  // Recursively check equality between the nodes n1 and n2 in the ftries f1
  // and f2: same state, same set of outgoing letters, and equal children.
  static bool equal_nodes(const ftrie& f1, const ftrie& f2, const node& n1, const node& n2) {
    if(n1.state != n2.state) return false;
    for(unsigned i = 0; i < AS; ++i) {
      const OFFSET e1 = n1.edges[i];
      const OFFSET e2 = n2.edges[i];
      if((e1 == 0) != (e2 == 0)) return false;
      if(e1 != 0 && !equal_nodes(f1, f2, f1.nodes[e1], f2.nodes[e2])) return false;
    }
    return true;
  }

public:
  // Two tries are equal when they use the same alphabet and have the same
  // shape; the indices at which the nodes are stored do not matter.
  bool operator==(const ftrie& rhs) const {
    return alphabet == rhs.alphabet
      && equal_nodes(*this, rhs, nodes[root_offset], rhs.nodes[root_offset]);
  }

  // Insert the string of length size starting at s in the ftrie. Nothing is
  // done for an empty string, or if a prefix of the string is already
  // present. Any longer string that starts with s is removed, as s now
  // covers it.
  // Throws std::range_error on a letter that is not in the alphabet.
  void insert(const char* s, size_t size) {
    if(size == 0) return;
    OFFSET cur = root_offset;

    // All letters but the last: walk down, creating nodes as needed
    for(size_t i = 0; i + 1 < size; ++i) {
      assert(nodes[cur].state != EMPTY); // BUG!
      if(nodes[cur].state == FINAL) return; // Entire subtree is already assumed present
      const int v = letter_code(s[i]);
      const OFFSET next = nodes[cur].edges[v];
      if(next) {
        cur = next;
      } else {
        // Get the new node first: it may reallocate the nodes vector, so the
        // parent must be indexed again afterwards.
        const OFFSET fresh = next_empty_node();
        nodes[cur].edges[v] = fresh;
        cur = fresh;
      }
    }

    // Last letter
    assert(nodes[cur].state != EMPTY); // BUG!
    if(nodes[cur].state == FINAL) return; // Entire subtree is already assumed present
    const int v = letter_code(s[size - 1]);
    OFFSET& last = nodes[cur].edges[v]; // stays valid: delete_subtree never resizes nodes
    if(last)
      delete_subtree(last);
    last = final_offset;
  }

  void insert(const char* s) { insert(s, strlen(s)); }
  void insert(const std::string& s) { insert(s.c_str(), s.size()); }

  // Look up the string of length size starting at s. See FOUND for the
  // meaning of the result.
  // Throws std::range_error on a letter that is not in the alphabet, unless
  // the search stopped before reaching that letter.
  FOUND find(const char* s, size_t size) const {
    const node* const base = nodes.data();
    const node* cur = base;

    for(size_t i = 0; i < size; ++i) {
      assert(cur->state != EMPTY); // BUG!
      if(cur->state == FINAL) return HAS_PREFIX;
      const OFFSET next = cur->edges[letter_code(s[i])];
      if(next == 0) return NOT;
      cur = base + next;
    }

    assert(cur->state != EMPTY);
    return cur->state == FINAL ? HAS_PREFIX : PREFIX_OF;
  }

  FOUND find(const char* s) const { return find(s, strlen(s)); }
  FOUND find(const std::string& s) const { return find(s.c_str(), s.size()); }

  // Write the trie as text: the alphabet, the head of the empty list, then
  // one line per node with its state followed by its edges. The stream
  // constructor reads this format back.
  std::ostream& dump(std::ostream& os) const {
    os << alphabet << '\n'
       << empty_head << '\n';
    for(const auto& n : nodes) {
      os << n.state;
      for(unsigned i = 0; i < AS; ++i)
        os << ' ' << n.edges[i];
      os << '\n';
    }

    return os;
  }

  // Recursive worker for the other all_mers() overloads. Fills the last
  // depth characters of mer by walking down from node start, and calls
  // mers.push_back(mer) each time mer is complete. Below the FINAL node every
  // letter is valid, so all the extensions are generated.
  template<typename Container>
  void all_mers(unsigned depth, Container& mers, OFFSET start, std::string& mer) const {
    if(depth == 0) {
      mers.push_back(mer);
      return;
    }
    if(nodes[start].state == FINAL) {
      for(unsigned i = 0; i < AS; ++i) {
        mer[mer.size() - depth] = alphabet[i];
        all_mers(depth-1, mers, start, mer);
      }
      return;
    }
    for(unsigned i = 0; i < AS; ++i) {
      const OFFSET next = nodes[start].edges[i];
      if(next) {
        mer[mer.size() - depth] = alphabet[i];
        all_mers(depth-1, mers, next, mer);
      }
    }
  }

  // Write all mers into any container that has a push_back(std::string) method.
  // Streams are excluded so that they select the std::ostream overload.
  template<typename Container>
  auto all_mers(unsigned depth, Container& mers) const
    -> typename std::enable_if<!std::is_base_of<std::ostream, Container>::value, void>::type
  {
    std::string mer(depth, '\0');
    all_mers(depth, mers, root_offset, mer);
  }

  // Create a vector with all mers
  std::vector<std::string> all_mers(unsigned depth) const {
    std::vector<std::string> res;
    all_mers(depth, res);
    return res;
  }

  // Write all mers, one per line, into an ostream
  std::ostream& all_mers(unsigned depth, std::ostream& os) const {
    struct StreamContainer {
      std::ostream& os;
      void push_back(const std::string& s) { os << s << '\n'; }
    };
    StreamContainer sc{os};
    all_mers(depth, sc);
    return os;
  }

  // Size of the encoded set of mers at a given depth. It is equal to
  // all_mers(depth).size(), but much faster.
  double size(unsigned depth, OFFSET start = root_offset) const {
    if(depth == 0 || nodes[start].state == FINAL)
      return std::pow(static_cast<double>(AS), static_cast<double>(depth));
    double res = 0;
    for(unsigned i = 0; i < AS; ++i) {
      const OFFSET next = nodes[start].edges[i];
      if(next)
        res += size(depth - 1, next);
    }
    return res;
  }
};

#endif /* FTRIE_HPP_INCLUDED */