├── Darragh_Bonsai_93 ├── Bonsai.cpp ├── Bonsai.h ├── compile_run.sh └── main.cpp ├── Hash ├── hash.cpp ├── hash.h ├── mBonsaiHash.cpp ├── mBonsaiHash.h ├── orighashFunc.cpp └── orighashFunc.h ├── README.md ├── SPIRE2015-Improved Practical Compact Dynamic Tries.pdf ├── datasets ├── SRR034939.dat ├── datasets.tar.gz └── tranSortedAccidents.dat ├── mBonsai_gamma ├── DArray │ ├── blockedDArray.cpp │ ├── blockedDArray.h │ ├── gammaBlock.cpp │ └── gammaBlock.h ├── compile_run.sh ├── mBonsaiGm.cpp ├── mBonsaiGm.h └── main.cpp ├── mBonsai_recursive ├── cht.cpp ├── cht.h ├── compile_run.sh ├── mBonsai.cpp ├── mBonsai.h └── main.cpp └── readio ├── data.cpp ├── data.h └── dataTST.cpp /Darragh_Bonsai_93/Bonsai.cpp: -------------------------------------------------------------------------------- 1 | #include "Bonsai.h" 2 | 3 | Bonsai::Bonsai(uint32_t nodeNumber, uint32_t alphabet, double loadFactor, 4 | char* file) 5 | : sigma(alphabet + 1) 6 | , M((nodeNumber * 1) / (loadFactor)) 7 | , noValue(M + 10) 8 | , nodeNumberCount(1) 9 | , origNodeCount(1) 10 | , lambda(32) 11 | { 12 | srand(time(NULL)); 13 | setData(file); 14 | cmax = (((lambda * (sigma - 1)) + lambda - 1) * M) + (M - 1); 15 | prime = nextPrimeNumber(cmax); 16 | a = ULONG_MAX / prime; 17 | emptySymbol = lambda * sigma + 2; 18 | rootID = (uint32_t)lambda * sigma + 1; 19 | rootLambda = 0; 20 | uint32_t w = sdsl::bits::hi(emptySymbol) + 1; 21 | quotient = sdsl::int_vector<0>(M, emptySymbol, w); 22 | V = sdsl::bit_vector(M, 0); 23 | C = sdsl::bit_vector(M, 1); 24 | rootAddress = (long)(rand() % M); 25 | quotient[rootAddress] = rootID; 26 | V[rootAddress] = 1; 27 | } 28 | 29 | /* Function that checks whether or not a given number is 30 | * a prime number or not. 
31 | */ 32 | bool Bonsai::isPrime(uint64_t input) 33 | { 34 | int i; 35 | bool prime = true; 36 | if (input == 2) 37 | return true; 38 | 39 | if (input % 2 == 0 || input <= 1) { 40 | prime = false; 41 | } else { 42 | for (i = 3; i <= sqrt(input); i += 2) { 43 | if (input % i == 0) { 44 | prime = false; 45 | } 46 | } 47 | } 48 | return prime; 49 | } // end isPrime 50 | 51 | /* 52 | * Function for determining the next prime number 53 | */ 54 | uint64_t Bonsai::nextPrimeNumber(uint64_t inputNumber) 55 | { 56 | uint64_t nextPrimeNumber; 57 | if (inputNumber <= 0) { 58 | std::cout << "The number you have entered is zero or negative.\n"; 59 | } else { 60 | while (inputNumber != 0) { 61 | 62 | nextPrimeNumber = inputNumber + 1; 63 | // if the number is even, make it odd (2 is special case) 64 | if (nextPrimeNumber % 2 == 0 && nextPrimeNumber != 2) { 65 | nextPrimeNumber += 1; 66 | } 67 | // while its not a prime number, check the next odd number 68 | while (!isPrime(nextPrimeNumber)) { 69 | nextPrimeNumber += 2; 70 | } 71 | if (isPrime(nextPrimeNumber)) 72 | return nextPrimeNumber; 73 | } 74 | } 75 | return nextPrimeNumber; 76 | } // end nextPrimeNumber 77 | 78 | /* build phase 79 | * It goes through the dataset transaction by transaction 80 | */ 81 | void Bonsai::build() 82 | { 83 | int count = 0; 84 | while (Transaction* t = data->getNext()) { 85 | count++; 86 | insert(t, count); 87 | delete t; 88 | } 89 | } // end build 90 | 91 | /* 92 | * Inserts nodes to appropriate positions 93 | * handles Virgin and Change bit 94 | * handles collisions and groups according to const lambda 95 | */ 96 | uint32_t Bonsai::insert(Transaction* t, uint32_t line) 97 | { 98 | uint64_t prevInitAd = rootAddress; 99 | uint32_t prevJ = rootLambda; 100 | uint32_t curAddress; 101 | uint32_t associatedC; 102 | origHash* key = new origHash(); 103 | for (uint32_t i = 0; i < t->length; i++) { 104 | key->getKey(prevInitAd, t->t[i], prevJ, M, prime, a); 105 | if (quotient[key->initAd] == emptySymbol) { 
106 | quotient[key->initAd] = key->quotient; 107 | V[key->initAd] = 1; 108 | C[key->initAd] = 1; 109 | prevInitAd = key->initAd; 110 | prevJ = 0; 111 | nodeNumberCount++; 112 | if (t->t[i] == 5) 113 | origNodeCount++; 114 | } else { 115 | associatedC = getAssociatedC(key->initAd); 116 | if (V[key->initAd] == 0) { // start of block 117 | if (associatedC != noValue) 118 | startNewBlock(key->initAd, associatedC); 119 | prevJ = 0; 120 | nodeNumberCount++; 121 | if (t->t[i] == 5) 122 | origNodeCount++; 123 | V[key->initAd] = 1; 124 | C[curEmptySlot] = 1; 125 | quotient[curEmptySlot] = key->quotient; 126 | prevInitAd = key->initAd; 127 | } else { // block already exists 128 | prevJ = findSpace(associatedC, key->quotient); 129 | 130 | if (prevJ < lambda) { // if item doesn't exist 131 | if (t->t[i] == 5) 132 | origNodeCount++; 133 | nodeNumberCount++; 134 | quotient[curEmptySlot] = key->quotient; 135 | C[curEmptySlot] = 0; 136 | } else { // prepare for next insertion 137 | prevJ -= lambda; 138 | } 139 | prevInitAd = key->initAd; 140 | } 141 | prevInitAd = key->initAd; 142 | curEmptySlot = noValue; 143 | } 144 | } 145 | delete key; 146 | return curEmptySlot; 147 | } 148 | /* 149 | * returns 32 + curJ if item exists 150 | * returns curJ if item does not exist 151 | */ 152 | 153 | uint32_t Bonsai::findSpace(uint32_t cVal, uint32_t quo) 154 | { 155 | // we have curEmptySlot & assoC 156 | uint32_t curJ = 0; 157 | uint32_t curC; 158 | // check if the value is already inserted 159 | uint32_t tmpSlot; 160 | // j is 0 161 | if (itemExists(cVal, quo)) { 162 | curEmptySlot = cVal; 163 | return lambda; // 0+lambda 164 | } else if (quotient[cVal] == emptySymbol) { 165 | curEmptySlot = cVal; 166 | return 0; 167 | } 168 | // start going upwards until block ends where c!=0 169 | // increment curJ to return for the next item 170 | if (cVal == M - 1) 171 | curC = 0; 172 | else 173 | curC = cVal + 1; // curC=cVal++ 174 | curJ++; 175 | // go upwards towards the end of the block 176 | while 
(C[curC] == 0) { 177 | if (itemExists(curC, quo)) { 178 | curEmptySlot = curC; 179 | return lambda + curJ; 180 | } 181 | if (curC == M - 1) 182 | curC = 0; 183 | else 184 | curC++; 185 | curJ++; 186 | } 187 | 188 | if (curC == 0) 189 | curC = M - 1; 190 | else 191 | curC--; // go one back to stay in the block 192 | 193 | // push all the slots upto curC to insert it in curC 194 | while (curEmptySlot != curC) { 195 | if (curEmptySlot == M - 1) 196 | tmpSlot = 0; 197 | else 198 | tmpSlot = curEmptySlot + 1; 199 | 200 | quotient[curEmptySlot] = quotient[tmpSlot]; 201 | C[curEmptySlot] = C[tmpSlot]; 202 | 203 | if (curEmptySlot == M - 1) 204 | curEmptySlot = 0; 205 | else 206 | curEmptySlot++; 207 | } 208 | curEmptySlot = curC; 209 | return curJ; 210 | } // end findSpace 211 | 212 | /* 213 | * starts a new block/group of collisions 214 | */ 215 | void Bonsai::startNewBlock(uint32_t vVal, uint32_t cVal) 216 | { 217 | uint32_t tmpSlot; 218 | uint32_t curC; 219 | // increase c loc 220 | if (cVal == M - 1) 221 | curC = 0; 222 | else 223 | curC = cVal + 1; 224 | // reach the end of the current block 225 | while (C[curC] == 0) { 226 | if (curC == M - 1) 227 | curC = 0; 228 | else 229 | curC++; 230 | } 231 | // push items to make space 232 | while (curEmptySlot != curC) { 233 | if (curEmptySlot == M - 1) 234 | tmpSlot = 0; 235 | else 236 | tmpSlot = curEmptySlot + 1; 237 | 238 | quotient[curEmptySlot] = quotient[tmpSlot]; 239 | C[curEmptySlot] = C[tmpSlot]; 240 | curEmptySlot = tmpSlot; 241 | } 242 | if (curEmptySlot == 0) 243 | curEmptySlot = M - 1; 244 | else 245 | curEmptySlot--; 246 | } 247 | /* 248 | * returns the locations of the associated change bit 249 | * or noValue if not found. 
250 | */ 251 | uint32_t Bonsai::getAssociatedC(uint32_t curAddress) 252 | { 253 | // count ones in V and C 254 | uint32_t vOnesDown = 0; 255 | uint32_t cOnesUp = 0; 256 | uint32_t posMoves = 0; 257 | // count vOnes downwards including current address 258 | if (V[curAddress] == 1) 259 | vOnesDown++; 260 | // start moving downwards 261 | if (curAddress == 0) 262 | curAddress = M - 1; 263 | else 264 | curAddress--; 265 | posMoves++; 266 | // go downwards untill empty slot and count Vones 267 | while (quotient[curAddress] != emptySymbol) { 268 | if (V[curAddress] == 1) 269 | vOnesDown++; 270 | if (curAddress == 0) 271 | curAddress = M - 1; 272 | else 273 | curAddress--; 274 | posMoves++; 275 | } 276 | // get emptyslot and start moving upwards 277 | curEmptySlot = curAddress; 278 | if (vOnesDown == 0) 279 | return noValue; 280 | 281 | if (curAddress == M - 1) 282 | curAddress = 0; 283 | else 284 | curAddress++; 285 | 286 | // go upwards 287 | // count cOnes AFTER emptySlot until conesUp==vOnes down 288 | 289 | while (cOnesUp < vOnesDown) { 290 | if (C[curAddress] == 1) 291 | cOnesUp++; 292 | if (curAddress == M - 1) 293 | curAddress = 0; 294 | else 295 | curAddress++; 296 | } 297 | // return associated C value, sta 298 | if (curAddress == 0) 299 | return (M - 1); 300 | else 301 | return --curAddress; 302 | } 303 | 304 | /* 305 | * return if th item exists 306 | */ 307 | bool Bonsai::itemExists(uint32_t cVal, uint32_t quo) 308 | { 309 | if (quotient[cVal] == quo) 310 | return true; 311 | else 312 | return false; 313 | } 314 | 315 | uint32_t Bonsai::findItem(uint32_t vVal, uint32_t cVal, uint32_t quo) 316 | { 317 | uint32_t JVal = 0; 318 | uint32_t curC; 319 | // check if the value is already inserted 320 | bool itExists = itemExists(cVal, quo); 321 | if (itExists) 322 | return 0; 323 | 324 | if (cVal == M - 1) 325 | curC = 0; 326 | else 327 | curC = cVal + 1; 328 | // go upwards towards the end of the block 329 | while (C[curC] == 0) { 330 | itExists = itemExists(curC, 
quo); 331 | if (itExists) { 332 | return ++JVal; 333 | } 334 | if (curC == M - 1) 335 | curC = 0; 336 | else 337 | curC++; 338 | JVal++; 339 | } 340 | return noValue; 341 | } // end findItem 342 | 343 | /* Search phase 344 | * Used for searchBenchmarks 345 | * Goes through a search file searching transactions by transactions. 346 | * This bench is designed spesifically for successful search operations 347 | * Outputs error if search is unsuccessful. 348 | */ 349 | uint64_t Bonsai::searchBench(char* file) 350 | { 351 | uint64_t schCounter = 0, x = 0; 352 | std::ifstream infile; 353 | infile.open(file); 354 | std::vector str; 355 | std::string rawData; 356 | while (getline(infile, rawData)) { 357 | str = getVector(rawData); 358 | schCounter += str.size(); 359 | search(str); 360 | str.clear(); 361 | } 362 | std::cout << "x: " << x << " " << schCounter << std::endl; 363 | return schCounter; 364 | } 365 | 366 | /* 367 | * reads transaction by transaction 368 | * to be searched 369 | */ 370 | std::vector Bonsai::getVector(std::string s) 371 | { 372 | char *cstr, *p; 373 | std::vector items; 374 | cstr = new char[s.size() + 1]; 375 | strcpy(cstr, s.c_str()); 376 | p = strtok(cstr, " "); 377 | while (p != NULL) { 378 | items.push_back(atoi(p)); 379 | p = strtok(NULL, " "); 380 | } 381 | delete[] cstr; 382 | return items; 383 | } // getVector 384 | 385 | /* 386 | * searches Items if not found prints error 387 | */ 388 | uint32_t Bonsai::search(std::vector t) 389 | { 390 | // initialize variables 391 | uint64_t prevInitAd = rootAddress; 392 | uint32_t prevJ = rootLambda; 393 | uint32_t associatedC; 394 | origHash* key = new origHash(); 395 | for (uint32_t i = 0; i < t.size(); i++) { 396 | key->getKey(prevInitAd, (uint64_t)t[i], prevJ, M, prime, a); 397 | if (V[key->initAd] == 0) { 398 | std::cout << "We searched every corner of bonsai universe. Item is not " 399 | "found! 
:(" 400 | << std::endl; 401 | return noValue; 402 | } else { 403 | associatedC = getAssociatedC(key->initAd); 404 | prevJ = findItem(key->initAd, associatedC, key->quotient); 405 | } 406 | if (prevJ == noValue) 407 | return noValue; 408 | else 409 | prevInitAd = key->initAd; 410 | } 411 | return associatedC; 412 | } 413 | -------------------------------------------------------------------------------- /Darragh_Bonsai_93/Bonsai.h: -------------------------------------------------------------------------------- 1 | #ifndef DARRAGH_BONSAI 2 | #define DARRAGH_BONSAI 3 | #include "../Hash/orighashFunc.h" 4 | #include "../readio/data.h" 5 | #include "limits.h" 6 | #include "sdsl/int_vector.hpp" 7 | #include 8 | 9 | class Bonsai { 10 | 11 | public: 12 | // structure 13 | sdsl::int_vector<0> quotient; 14 | sdsl::bit_vector V; 15 | sdsl::bit_vector C; 16 | 17 | // init 18 | Bonsai() {} 19 | Bonsai(uint32_t nodeNumber, uint32_t sigma, double loadFactor, char* file); 20 | void setData(char* file) { data = new Data(file); } 21 | 22 | // build phase 23 | void build(); 24 | uint32_t insert(Transaction* t, uint32_t line); 25 | // navigations collision handling 26 | uint32_t getAssociatedC(uint32_t curAddress); 27 | uint32_t findSpace(uint32_t cVal, uint32_t quotient); 28 | bool itemExists(uint32_t cVal, uint32_t quotient); 29 | void startNewBlock(uint32_t vVal, uint32_t cVal); 30 | uint32_t findItem(uint32_t vVal, uint32_t cVal, uint32_t quotient); 31 | 32 | // misc 33 | bool isPrime(uint64_t input); 34 | uint64_t nextPrimeNumber(uint64_t inputNumber); 35 | 36 | // search phase 37 | uint64_t searchBench(char* file); 38 | std::vector getVector(std::string s); 39 | uint32_t search(std::vector t); 40 | 41 | // args for printing and counting 42 | uint64_t sigma; 43 | uint64_t M; 44 | uint32_t nodeNumberCount; 45 | uint32_t origNodeCount; 46 | 47 | private: 48 | uint64_t cmax; 49 | uint64_t noValue; 50 | uint32_t rootID; 51 | uint32_t rootLambda; 52 | uint64_t lambda; 53 | uint64_t 
prime; 54 | uint32_t a; 55 | uint32_t rootAddress; 56 | uint32_t emptySymbol; 57 | uint32_t curEmptySlot; 58 | Data* data; // readio 59 | }; 60 | #endif 61 | -------------------------------------------------------------------------------- /Darragh_Bonsai_93/compile_run.sh: -------------------------------------------------------------------------------- 1 | g++ -O3 -Wall -std=c++11 -w -Wall -Wextra -DNDEBUG -g -ffast-math -funroll-loops -msse4.2 -I ~/include -L ~/lib ../Hash/orighashFunc.cpp ../readio/data.cpp Bonsai.cpp main.cpp -o Bonsai -lsdsl -ldivsufsort -ldivsufsort64 2 | echo ready to run 3 | 4 | # ./Bonsai 4242318 442 ../datasets/sortedAccidents.dat 0.8 ../datasets/sortedAccidents.dat 5 | # ./Bonsai 3095560 5 ../datasets/SRR034939.dat 0.8 ../datasets/SRR034939.dat 6 | 7 | ./Bonsai 3095561 5 ../datasets/SRR034939.dat 0.8 ../datasets/SRR034939.dat 8 | ./Bonsai 21005059 5 /cxml-data/ap480/Datasets/dnaStrings/SRR034944.txt 0.8 /cxml-data/ap480/Datasets/dnaStrings/SRR034944.txt 9 | ./Bonsai 1556235309 5 /cxml-data/ap480/Datasets/dnaStrings/SRR034940_1.txt 0.8 /cxml-data/ap480/Datasets/dnaStrings/SRR034940_1.txt 10 | ./Bonsai 1728553810 5 /cxml-data/ap480/Datasets/dnaStrings/SRR034945_1.txt 0.8 /cxml-data/ap480/Datasets/dnaStrings/SRR034945_1.txt 11 | 12 | ./Bonsai 35004 119 /cxml-data/ap480/Datasets/tranSortedMushroom.dat 0.8 /cxml-data/ap480/Datasets/tranSortedMushroom.dat 13 | ./Bonsai 38610 75 /cxml-data/ap480/Datasets/tranSortedChess.dat 0.8 /cxml-data/ap480/Datasets/tranSortedChess.dat 14 | 15 | ./Bonsai 4242318 442 ../datasets/sortedAccidents.dat 0.8 ../datasets/sortedAccidents.dat 16 | 17 | ./Bonsai 63985704 364 /cxml-data/ap480/Datasets/tranSortedWebdocs8.dat 0.8 /cxml-data/ap480/Datasets/tranSortedWebdocs8.dat 18 | 19 | g++ -O3 -Wall -fpermissive -std=c++11 -w -Wall -Wextra -DNDEBUG -g -ffast-math -funroll-loops -msse4.2 -I ~/include -L ~/lib cht.cpp ../Hash/mBonsaiHash.cpp ../readio/data.cpp Bonsai.cpp main.cpp -o Bonsai -lsdsl -ldivsufsort 
-ldivsufsort64 -Dretail 20 | 21 | ./Bonsai 653217 8919 /cxml-data/ap480/Datasets/tranSortedRetail.dat 0.8 /cxml-data/ap480/Datasets/tranSortedRetail.dat 22 | 23 | g++ -O3 -Wall -fpermissive -std=c++11 -w -Wall -Wextra -DNDEBUG -g -ffast-math -funroll-loops -msse4.2 -I ~/include -L ~/lib cht.cpp ../Hash/mBonsaiHash.cpp ../readio/data.cpp Bonsai.cpp main.cpp -o Bonsai -lsdsl -ldivsufsort -ldivsufsort64 -Dwebdocs 24 | ./Bonsai 231232676 59717 /cxml-data/ap480/Datasets/tranSortedWebdocs.dat 0.8 /cxml-data/ap480/Datasets/tranSortedWebdocs.dat 25 | -------------------------------------------------------------------------------- /Darragh_Bonsai_93/main.cpp: -------------------------------------------------------------------------------- 1 | #include "Bonsai.h" 2 | #include 3 | using namespace std; 4 | 5 | 6 | // arguments 7 | // 1 nodeNum 8 | // 2 alphabet 9 | // 3 dataset 10 | // 4 loadFactor 11 | // 5 searchFile 12 | double printSpace(Bonsai); 13 | 14 | int main(int argc, char *argv[]) { 15 | ofstream outdata; 16 | outdata.open("benchmarksDarragh.csv", ios::app); 17 | 18 | uint32_t nodeNum = atol(argv[1]); 19 | uint32_t sigma = atoi(argv[2]); 20 | char *file = argv[3]; 21 | double loadFactor = atof(argv[4]); 22 | char *searchFile = argv[5]; 23 | 24 | Bonsai b(nodeNum, sigma, loadFactor, file); 25 | auto begin = std::chrono::high_resolution_clock::now(); // wall time 26 | b.build(); 27 | auto end = std::chrono::high_resolution_clock::now(); 28 | auto dur = end - begin; 29 | auto ns = std::chrono::duration_cast(dur).count(); 30 | std::cout << "Total wall time Build[ " << ns / 1000000 << " ms]" 31 | << std::endl; 32 | std::cout << "node Count: " << b.nodeNumberCount << std::endl; 33 | outdata << file<<", "<< b.nodeNumberCount<<", "<(dur).count(); 39 | std::cout << "Total wall time searchbench[ " << ns / 1000000 << " ms]" 40 | << std::endl; 41 | outdata << ns / 1000000 << ", "<<(ns /(double)schCounter) << ","< 3 | #include /* sqrt */ 4 | #define mulmod(a, b, m) 
(uint64_t)(((sdsl::uint128_t)(a) * (sdsl::uint128_t)(b)) % ((sdsl::uint128_t)(m))) 5 | 6 | 7 | // Function to find modulo inverse of a 8 | uint64_t hashFunction::modInverse(sdsl::uint128_t m) 9 | { 10 | sdsl::uint128_t x, y; 11 | sdsl::uint128_t g = gcdExtended(a, m, &x, &y); 12 | aInv = (x % m + m) % m; 13 | if ((a * aInv) % m != 1) { 14 | /*Inverse doesn't exist*/ 15 | ++a; 16 | return modInverse(m); 17 | } else { 18 | return aInv; 19 | } 20 | } 21 | 22 | // C function for extended Euclidean Algorithm 23 | sdsl::uint128_t hashFunction::gcdExtended(sdsl::uint128_t aTemp, sdsl::uint128_t m, sdsl::uint128_t* x, sdsl::uint128_t* y) 24 | { 25 | // Base Case 26 | if (!aTemp) { 27 | *x = 0, *y = 1; 28 | return m; 29 | } 30 | sdsl::uint128_t x1; 31 | sdsl::uint128_t y1; // To store results of recursive call 32 | sdsl::uint128_t gcd = gcdExtended(m % aTemp, aTemp, &x1, &y1); 33 | 34 | // Update x and y using results of recursive 35 | // call 36 | sdsl::uint128_t diff = ((m / aTemp) * x1); 37 | *x = y1 - diff; 38 | *y = x1; 39 | return gcd; 40 | } 41 | 42 | /* Function that checks whether or not a given number is 43 | * a prime number or not. 
44 | */ 45 | bool isPrime(uint64_t input) 46 | { 47 | uint64_t i; 48 | bool prime = true; 49 | if (input == 2) { 50 | return true; 51 | } 52 | if (input % 2 == 0 || input <= 1) { 53 | prime = false; 54 | } else { 55 | for (i = 3; i <= sqrt(input); i += 2) { 56 | if (input % i == 0) { 57 | prime = false; 58 | } 59 | } 60 | } 61 | return prime; 62 | } // end isPrime 63 | 64 | /* 65 | * Function for determining the next prime number 66 | */ 67 | uint64_t nextPrimeNumber(uint64_t inputNumber) 68 | { 69 | uint64_t nextPrimeNumber; 70 | if (inputNumber <= 0) { 71 | } else { 72 | while (inputNumber != 0) { 73 | nextPrimeNumber = inputNumber + 1; 74 | if (nextPrimeNumber % 2 == 0 && nextPrimeNumber != 2) { 75 | nextPrimeNumber += 1; 76 | } 77 | while (!isPrime(nextPrimeNumber)) { 78 | nextPrimeNumber += 2; 79 | } 80 | if (isPrime(nextPrimeNumber)) 81 | return nextPrimeNumber; 82 | } 83 | } 84 | return nextPrimeNumber; 85 | } // end nextPrimeNumber 86 | hashFunction::hashFunction(uint64_t M, uint64_t cmax): 87 | M(M) 88 | { 89 | prime = nextPrimeNumber(cmax); 90 | a = 0.66 * prime; 91 | aInv = modInverse(prime); 92 | } 93 | 94 | /* 95 | * calculates and returns the key which includes 96 | * quotient value (stored in the hashtable) 97 | * mod value (is the initial address of the key) 98 | */ 99 | void hashFunction::getKey(uint64_t parentLoc, 100 | uint64_t itemID) 101 | { 102 | uint64_t c = (itemID * M) + parentLoc; 103 | uint64_t cRand = mulmod(a, c, prime); 104 | initAd = cRand % M; 105 | quotient = cRand / M; 106 | } 107 | 108 | /* 109 | * reverses the hashfunction to get the itemID 110 | */ 111 | uint64_t hashFunction::recoverID(uint64_t initAd, uint64_t DIVM) 112 | { 113 | sdsl::uint128_t cRandRec = ((sdsl::uint128_t)DIVM * (sdsl::uint128_t)M + (sdsl::uint128_t)initAd); 114 | uint64_t cRec = mulmod(aInv, cRandRec, prime); //(aInv*cRandRec)%prime; 115 | uint64_t IDRec = cRec / M; 116 | return IDRec; 117 | } 118 | /* 119 | * reverses the hashfunction to get the parent 
location 120 | */ 121 | uint64_t hashFunction::recoverParentLoc(uint64_t initAd, uint64_t DIVM) 122 | { 123 | sdsl::uint128_t cRandRec = ((sdsl::uint128_t)DIVM * (sdsl::uint128_t)M + (sdsl::uint128_t)initAd); 124 | uint64_t cRec = mulmod(aInv, cRandRec, prime); //aInv % prime; 125 | uint64_t parentLoc = cRec % M; 126 | return parentLoc; 127 | } 128 | -------------------------------------------------------------------------------- /Hash/hash.h: -------------------------------------------------------------------------------- 1 | #ifndef HASH 2 | #define HASH 3 | #include "sdsl/uint128_t.hpp" 4 | #include 5 | 6 | class hashFunction { 7 | public: 8 | uint64_t initAd; 9 | uint64_t quotient; 10 | uint64_t a; 11 | uint64_t aInv; 12 | uint64_t prime; 13 | uint64_t M; 14 | 15 | hashFunction(){} 16 | hashFunction(uint64_t M, uint64_t cmax); 17 | void getKey(uint64_t parentLoc, uint64_t itemID); 18 | 19 | inline uint64_t getInitAd() { return initAd; } 20 | inline uint64_t getQuotient() { return quotient; } 21 | 22 | uint64_t recoverID(uint64_t initAd, uint64_t DIVM); 23 | uint64_t recoverParentLoc(uint64_t initAd, uint64_t DIVM); 24 | uint64_t modInverse(sdsl::uint128_t m); 25 | 26 | sdsl::uint128_t gcdExtended(sdsl::uint128_t aTemp, sdsl::uint128_t m, sdsl::uint128_t* x, sdsl::uint128_t* y); 27 | }; 28 | #endif 29 | -------------------------------------------------------------------------------- /Hash/mBonsaiHash.cpp: -------------------------------------------------------------------------------- 1 | #include "mBonsaiHash.h" 2 | #include 3 | #define mulmod(a, b, m) (uint64_t)(((sdsl::uint128_t)(a) * (sdsl::uint128_t)(b)) % ((sdsl::uint128_t)(m))) 4 | /* 5 | * calculates and returns the key which includes 6 | * quotient value (stored in the hashtable) 7 | * mod value (is the initial address of the key) 8 | */ 9 | void hashFunction::getKey(uint64_t parentLoc, 10 | uint64_t itemID, 11 | uint64_t M, 12 | uint64_t prime, 13 | uint64_t a) 14 | { 15 | uint64_t c = (itemID * M) + 
parentLoc; 16 | uint64_t cRand = mulmod(a, c, prime); //(c * a) % prime; 17 | initAd = cRand % M; 18 | quotient = cRand / M; 19 | } 20 | 21 | /* 22 | * reverses the hashfunction to get the itemID 23 | */ 24 | uint64_t hashFunction::recoverID(uint64_t initAd, uint64_t DIVM, uint64_t M, uint64_t prime, uint64_t aInv) 25 | { 26 | sdsl::uint128_t cRandRec = ((sdsl::uint128_t)DIVM * (sdsl::uint128_t)M + (sdsl::uint128_t)initAd); 27 | uint64_t cRec = mulmod(aInv, cRandRec, prime); //(aInv*cRandRec)%prime; 28 | uint64_t IDRec = cRec / M; 29 | return IDRec; 30 | } 31 | /* 32 | * reverses the hashfunction to get the parent location 33 | */ 34 | uint64_t hashFunction::recoverParentLoc(uint64_t initAd, uint64_t DIVM, uint64_t M, uint64_t prime, uint64_t aInv) 35 | { 36 | sdsl::uint128_t cRandRec = ((sdsl::uint128_t)DIVM * (sdsl::uint128_t)M + (sdsl::uint128_t)initAd); 37 | uint64_t cRec = mulmod(aInv, cRandRec, prime); //aInv % prime; 38 | //cRec = (cRec * cRandRec) % prime; (a*b)%c == (a%c)*(b%c) 39 | uint64_t parentLoc = cRec % M; 40 | return parentLoc; 41 | } 42 | 43 | /* 44 | // A naive method to find modulor multiplicative inverse of 45 | // 'a' under modulo 'm' 46 | long long mBonsai::getModInverse(long long a, uint64_t prime) { 47 | a = a % prime; 48 | for (uint64_t x = 1; x < prime; x++) 49 | if (((a * x) % prime == 1) && x!=0) 50 | return x; 51 | return 0; 52 | } 53 | 54 | void mBonsai::euclAlgorithm(uint64_t prime) { 55 | uint64_t aTemp = (uint64_t)(0.72 * (double)prime); 56 | uint64_t aInvTemp = 0; 57 | while (true) { 58 | aInvTemp = getModInverse(aTemp, prime); 59 | if (aInvTemp == 0) 60 | aTemp++; 61 | else 62 | break; 63 | } 64 | a = aTemp; 65 | aInv = aInvTemp; 66 | cout<<"a: "< 5 | 6 | class hashFunction { 7 | public: 8 | uint64_t initAd; 9 | uint64_t quotient; 10 | 11 | void getKey(uint64_t parentLoc, uint64_t itemID, 12 | uint64_t M, uint64_t prime, 13 | uint64_t a); 14 | 15 | inline uint64_t getInitAd() { return initAd; } 16 | inline uint64_t getQuotient() 
{ return quotient; } 17 | 18 | uint64_t recoverID(uint64_t initAd, uint64_t DIVM, uint64_t M, uint64_t prime, uint64_t aInv); 19 | uint64_t recoverParentLoc(uint64_t initAd, uint64_t DIVM, uint64_t M, uint64_t prime, uint64_t aInv); 20 | }; 21 | #endif 22 | -------------------------------------------------------------------------------- /Hash/orighashFunc.cpp: -------------------------------------------------------------------------------- 1 | #ifndef DARRAGH_HASH 2 | #define DARRAGH value 3 | #include "orighashFunc.h" 4 | 5 | /* 6 | * calculates and returns the key which includes 7 | * quotient value (stored in the hashtable) 8 | * mod value (is the initial address of the key) 9 | */ 10 | origHash origHash::getKey(uint64_t parentLoc, 11 | uint64_t itemID, uint64_t keyJ, 12 | uint64_t M, uint64_t prime, 13 | uint64_t a) 14 | { 15 | origHash key; 16 | uint64_t c = (itemID * (uint64_t)32 + keyJ) * M + parentLoc; 17 | uint64_t cRand = ((c % prime) * a); 18 | cRand = cRand % prime; 19 | initAd = cRand % M; 20 | quotient = cRand / M; // it includes the parents J 21 | return key; 22 | } 23 | 24 | #endif 25 | /* 26 | * reverses the hashfunction to get the itemID 27 | */ 28 | /*uint32_t origHash::recoverID(uint64_t initAd, uint32_t DIVM, 29 | uint64_t M, uint64_t prime, 30 | uint64_t aInv) { 31 | uint64_t cRandRec = (DIVM * M + initAd); 32 | uint64_t cRec = (aInv * cRandRec) % prime; 33 | uint32_t IDRec = cRec / (32 * M); 34 | return IDRec; 35 | }*/ 36 | 37 | /* 38 | * reverses the hashfunction to get the parent location 39 | */ 40 | /*uint64_t origHash::recoverParentLoc(uint64_t initAd, 41 | uint32_t DIVM, 42 | uint64_t M, 43 | uint64_t prime, 44 | uint64_t aInv) { 45 | uint64_t cRandRec = (DIVM * M + initAd); 46 | uint64_t cRec = (aInv * cRandRec) % prime; 47 | uint64_t parentLoc = cRec % M; 48 | return parentLoc; 49 | }*/ 50 | 51 | /* 52 | * reverses the hashfunction to get the lambda value 53 | */ 54 | /*uint64_t origHash::recoverJ(uint64_t initAd, 55 | uint32_t DIVM, 
uint64_t M, 56 | uint64_t prime, 57 | uint64_t aInv) { 58 | uint64_t cRandRec = (DIVM * M + initAd); 59 | uint64_t cRec = (aInv * cRandRec) % prime; 60 | uint64_t JNumber = (cRec % (32 * M)) / M; 61 | return JNumber; 62 | }*/ 63 | -------------------------------------------------------------------------------- /Hash/orighashFunc.h: -------------------------------------------------------------------------------- 1 | #ifndef CLEARY_HASH 2 | #define CLEARY_HASH 3 | #include 4 | class origHash { 5 | public: 6 | uint64_t initAd; 7 | uint64_t quotient; 8 | 9 | origHash getKey(uint64_t parentLoc, uint64_t itemID, uint64_t keyJ, 10 | uint64_t M, uint64_t prime, uint64_t a); 11 | uint64_t getInitAd() { return initAd; } 12 | uint32_t getQuotient() { return quotient; } 13 | /* 14 | *** The functions below work only if a is carefully selected such that its 15 | *mod inverse (aInv) 16 | *** will be able to reverse the hashfunction 17 | */ 18 | /* uint32_t recoverID(uint64_t initAd, uint32_t DIVM, 19 | uint64_t M, uint64_t prime, 20 | uint64_t aInv); 21 | uint64_t recoverParentLoc(uint64_t initAd, 22 | uint32_t DIVM, uint64_t M, 23 | uint64_t prime, 24 | uint64_t aInv); 25 | uint64_t recoverJ(uint64_t initAd, uint32_t DIVM, 26 | uint64_t M, uint64_t prime, 27 | uint64_t aInv);*/ 28 | }; 29 | #endif 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mame-Bonsai 2 | ========= 3 | Description 4 | ----------- 5 | mame-Bonsai or m-Bonsai is a compact representation of tries using hashtables. 6 | 7 | If you use any mame-Bonsai or Bonsai implementation for research or commercial purposes, please cite: 8 | 9 | A. Poyias and R. Raman. [Improved Practical Compact Dynamic Tries.](http://dblp.uni-trier.de/pers/hd/p/Poyias:Andreas) Proc. SPIRE 2015, pp. 324-336. 10 | 11 | Paper .pdf 12 | ----------- 13 | See the .pdf file for more details. 
14 | 15 | Implementations: 16 | ----------- 17 | 1. mame-Bonsai (recursive), proposed by A. Poyias and R. Raman. 18 | 2. mame-Bonsai (gamma), proposed by A. Poyias and R. Raman. 19 | 3. Bonsai, proposed by J. Darragh and J. Cleary. 20 | 21 | m-Bonsai approaches: 22 | 1. m-Bonsai (recursive): 23 | Fast and compact approach. 24 | 2. m-Bonsai (gamma): 25 | Even more compact approach but slower than m-Bonsai (recursive). 26 | 27 | ADT: 28 | ----------- 29 | 1. create(): Create a new empty tree. 30 | 2. getRoot(): return the root of the current tree. 31 | 3. getChild (v, c): return child node of node v with symbol c, if any (and return −1 if no such child exists). 32 | 4. getParent(v): return the parent of node v. 33 | 5. addChild (v, c): add a new child of v with symbol c and return the newly created node. 34 | 6. delLeaf (v, c): delete the child of v with symbol c, provided that the child indicated is a leaf (if the user asks to delete a child that is not a leaf, the subsequent operations may not execute correctly). 35 | 36 | Installation 37 | ----------- 38 | 39 | To compile the m-Bonsai libraries: 40 | 41 | 1. Install sdsl-lite. 42 | Follow the installation guide here: 43 | https://github.com/simongog/sdsl-lite 44 | 2. Run the compile_run.sh script in the directory of the implementation you want to build. 45 | 46 | 47 | Datasets 48 | ----------- 49 | Two datasets are provided, both in transaction-database format, where each line is a sequence of characters. 50 | 1. Frequent Itemset Mining (FIM) dataset 51 | The tranSortedAccidents.dat dataset is used for FIM applications and is taken from the FIMI repository. http://fimi.ua.ac.be/data/ 52 | 2. FASTQ format dataset 53 | FASTQ format is a text-based format for storing both a biological sequence (usually nucleotide sequence) and its corresponding quality scores. Both the sequence letter and quality score are each encoded with a single ASCII character for brevity. 54 | We transformed each character in the dataset from {A,G,C,T,N} to {0,1,2,3,4}. 
55 | 56 | 57 | -------------------------------------------------------------------------------- /SPIRE2015-Improved Practical Compact Dynamic Tries.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Poyias/mBonsai/5647df73919b733c80de5aaa714d7f05ede5cef6/SPIRE2015-Improved Practical Compact Dynamic Tries.pdf -------------------------------------------------------------------------------- /datasets/datasets.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Poyias/mBonsai/5647df73919b733c80de5aaa714d7f05ede5cef6/datasets/datasets.tar.gz -------------------------------------------------------------------------------- /mBonsai_gamma/DArray/blockedDArray.cpp: -------------------------------------------------------------------------------- 1 | #include "blockedDArray.h" 2 | 3 | blockedDArray::blockedDArray(uint64_t size, uint64_t numBlocks) : size(size) { 4 | darray = new gammaBlock *[numBlocks]; 5 | for (uint64_t i = 0; i < numBlocks; i++) { 6 | darray[i] = new gammaBlock(size); 7 | } 8 | } 9 | 10 | uint64_t blockedDArray::get(uint64_t loc) { 11 | uint64_t gammaBlock = loc / size; 12 | return darray[gammaBlock]->get(loc % size); 13 | } 14 | 15 | void blockedDArray::set(uint64_t loc, uint64_t item) { 16 | uint64_t gammaBlock = loc / size; // get gammaBlock 17 | darray[gammaBlock]->set((uint64_t)(loc % size), item); 18 | } 19 | -------------------------------------------------------------------------------- /mBonsai_gamma/DArray/blockedDArray.h: -------------------------------------------------------------------------------- 1 | #ifndef MBONSAIGAMMA_BLOCKEDDARRAY 2 | #define MBONSAIGAMMA_BLOCKEDDARRAY 3 | #include "gammaBlock.h" 4 | #include "sdsl/int_vector.hpp" 5 | class blockedDArray { 6 | public: 7 | gammaBlock **darray; 8 | uint64_t size; 9 | 10 | blockedDArray() {} 11 | blockedDArray(uint64_t size, uint64_t numBlocks); 12 | 
uint64_t get(uint64_t loc); 13 | void set(uint64_t loc, uint64_t item); 14 | }; 15 | #endif 16 | -------------------------------------------------------------------------------- /mBonsai_gamma/DArray/gammaBlock.cpp: -------------------------------------------------------------------------------- 1 | #include "gammaBlock.h" 2 | 3 | gammaBlock::gammaBlock(uint64_t size) { 4 | sdsl::int_vector<> tmp(size, 1); 5 | sdsl::coder::elias_gamma::encode(tmp, dBlock); 6 | tmp.bit_resize(0); 7 | } 8 | 9 | uint64_t gammaBlock::get(uint64_t loc) { 10 | uint64_t idx = loc / 1024; 11 | return (sdsl::coder::elias_gamma::decode( 12 | dBlock.data(), 0, loc + 1 - 1024 * idx)) - 13 | 1; 14 | } 15 | 16 | void gammaBlock::set(uint64_t loc, uint64_t item) { 17 | // decode values in tmp 18 | sdsl::int_vector<> tmp; 19 | sdsl::coder::elias_gamma::decode(dBlock, tmp); 20 | // insert item in location 21 | tmp[loc] = item + 1; 22 | // encode tmp in block and delete tmp 23 | dBlock.resize(0); 24 | sdsl::coder::elias_gamma::encode(tmp, dBlock); 25 | tmp.resize(0); 26 | } -------------------------------------------------------------------------------- /mBonsai_gamma/DArray/gammaBlock.h: -------------------------------------------------------------------------------- 1 | #ifndef MBONSAIGAMMA_GAMMABLOCK 2 | #define MBONSAIGAMMA_GAMMABLOCK 3 | #include "sdsl/coder_elias_gamma.hpp" 4 | #include "sdsl/int_vector.hpp" 5 | #include "sdsl/vlc_vector.hpp" 6 | class gammaBlock { 7 | 8 | public: 9 | sdsl::int_vector<> dBlock; 10 | gammaBlock() {} 11 | gammaBlock(uint64_t size); 12 | void set(uint64_t loc, uint64_t item); 13 | uint64_t get(uint64_t loc); 14 | }; 15 | #endif -------------------------------------------------------------------------------- /mBonsai_gamma/compile_run.sh: -------------------------------------------------------------------------------- 1 | g++ -O3 -Wall -fpermissive -std=c++11 -w -Wall -Wextra -DNDEBUG -g -ffast-math -funroll-loops -msse4.2 -I ~/include -L ~/lib 
DArray/gammaBlock.cpp DArray/blockedDArray.cpp ../Hash/mBonsaiHash.cpp ../readio/data.cpp mBonsaiGm.cpp main.cpp -o mBonsaiGm -lsdsl -ldivsufsort -ldivsufsort64 2 | 3 | echo ready to run 4 | 5 | ./mBonsaiGm 3095560 5 ../datasets/SRR034939.dat 0.8 256 ../datasets/SRR034939.dat 6 | ./mBonsaiGm 4242318 442 ../datasets/tranSortedAccidents.dat 0.8 256 ../datasets/tranSortedAccidents.dat 7 | -------------------------------------------------------------------------------- /mBonsai_gamma/mBonsaiGm.cpp: -------------------------------------------------------------------------------- 1 | #include "mBonsaiGm.h" 2 | #include "../Hash/mBonsaiHash.h" 3 | 4 | mBonsaiGm::mBonsaiGm(uint32_t nodeNumber, uint32_t sigma, double loadFactor, 5 | char* file, uint32_t blockSize, uint32_t numBlocks) 6 | : sigma(sigma) 7 | , M(nodeNumber * 1.25) 8 | , nodeNotFound(M + 10) 9 | , nodeNumberCount(1) 10 | 11 | { 12 | 13 | srand(time(NULL)); 14 | setData(file); // 3 is for flat numbered datasets 15 | uint64_t cmax = sigma * M + (M - 1); 16 | prime = nextPrimeNumber(cmax); 17 | 18 | // calcAandInv(prime+1); 19 | a = (ULONG_MAX - M) / prime; 20 | emptyLoc = sigma + 2; 21 | rootID = rand() % (sigma - 1); 22 | uint32_t w = sdsl::bits::hi(emptyLoc) + 1; 23 | quotient = sdsl::int_vector<0>(M, emptyLoc, w); 24 | D = blockedDArray(blockSize, numBlocks); 25 | 26 | randRootAdrs = (long)(rand() % M); // 17;// 27 | quotient[randRootAdrs] = rootID; 28 | std::cout << "w: " << w << std::endl; 29 | } 30 | 31 | /* Function that checks whether or not a given number is 32 | * a prime number or not. 
33 | */ 34 | bool mBonsaiGm::isPrime(uint64_t input) 35 | { 36 | int i; 37 | bool prime = true; 38 | if (input == 2) { 39 | return true; 40 | } 41 | if (input % 2 == 0 || input <= 1) { 42 | prime = false; 43 | } else { 44 | for (i = 3; i <= sqrt(input); i += 2) { 45 | if (input % i == 0) { 46 | prime = false; 47 | } 48 | } 49 | } 50 | return prime; 51 | } // end isPrime 52 | 53 | /* 54 | * Function for determining the next prime number 55 | */ 56 | uint64_t mBonsaiGm::nextPrimeNumber(uint64_t inputNumber) 57 | { 58 | uint64_t nextPrimeNumber; 59 | if (inputNumber <= 0) { 60 | std::cout << "The number you have entered is zero or negative.\n"; 61 | } else { 62 | while (inputNumber != 0) { 63 | 64 | nextPrimeNumber = inputNumber + 1; 65 | // if the number is even, make it odd (2 is special case) 66 | if (nextPrimeNumber % 2 == 0 && nextPrimeNumber != 2) { 67 | nextPrimeNumber += 1; 68 | } 69 | // while its not a prime number, check the next odd number 70 | while (!isPrime(nextPrimeNumber)) { 71 | nextPrimeNumber += 2; 72 | } 73 | if (isPrime(nextPrimeNumber)) 74 | return nextPrimeNumber; 75 | } 76 | } 77 | return nextPrimeNumber; 78 | } // end nextPrimeNumber 79 | 80 | /* uses Data to read next Transaction 81 | * creates transaction object passes it to insertTrans(t) 82 | */ 83 | void mBonsaiGm::build() 84 | { 85 | while (Transaction* t = data->getNext()) { 86 | insertTrans(t); 87 | delete t; 88 | } 89 | } // end build 90 | 91 | /* During the build phase 92 | * called in build 93 | * param: trans[] of items where trans[0] is parent of trans[1] and so on. 
94 | * it calculates the quotient value and mod hash location 95 | * it calles the setAddress with the required data to handle collision and 96 | * correct node match 97 | */ 98 | void mBonsaiGm::insertTrans(Transaction* t) 99 | { 100 | hashFunction* key = new hashFunction(); 101 | uint64_t prevInitAd = randRootAdrs; 102 | for (int i = 0; i < t->length; i++) { 103 | key->getKey(prevInitAd, t->t[i], M, prime, a); 104 | prevInitAd = setAddress(key->initAd, key->quotient); 105 | } // end first for 106 | delete key; 107 | } // end of manageDeltaTrie 108 | 109 | /* During the build phase 110 | * called in insertTrans 111 | * it sets the quotient value in the correct mod hash location 112 | * it inserts the respective value in DArray 113 | * returns the hash loc so the next item (its child) can use it 114 | */ 115 | uint64_t mBonsaiGm::setAddress(uint64_t initAd, uint32_t DIVM) 116 | { 117 | uint32_t DCount = 0; 118 | while (true) { 119 | // insert value in empty loc 120 | if (quotient[initAd] == emptyLoc) { 121 | quotient[initAd] = DIVM; 122 | nodeNumberCount++; 123 | if (DCount != 0) 124 | D.set(initAd, DCount); 125 | return initAd; 126 | } else if ((quotient[initAd] == DIVM) && (initAd != randRootAdrs)) { 127 | // item found don't set DArray 128 | if (DCount == D.get(initAd)) { 129 | return initAd; 130 | } else { 131 | DCount++; 132 | initAd++; 133 | if (initAd >= M) 134 | initAd = 0; 135 | } 136 | } else { 137 | DCount++; 138 | initAd++; 139 | if (initAd >= M) 140 | initAd = 0; 141 | } 142 | } 143 | } // end setAddress 144 | 145 | /* Search phase 146 | * Used for searchBenchmarks 147 | * Goes through a search file searching transactions by transactions. 148 | * This bench is designed spesifically for successful search operations 149 | * Outputs error if search is unsuccessful. 
150 | */ 151 | uint64_t mBonsaiGm::searchBench(char* file) 152 | { 153 | uint64_t schCounter = 0, x = 0; 154 | std::ifstream infile; 155 | infile.open(file); 156 | std::vector str; 157 | std::string rawData; 158 | 159 | while (getline(infile, rawData)) { 160 | str = getVector(rawData); 161 | hashFunction* key = new hashFunction(); 162 | uint64_t prevInitAd = randRootAdrs; 163 | for (int i = 0; i < str.size(); ++i) { 164 | key->getKey(prevInitAd, (uint64_t)str[i], M, prime, a); 165 | prevInitAd = searchItem(key->initAd, key->quotient); 166 | schCounter++; 167 | x += prevInitAd / (i + 1); 168 | } 169 | delete key; 170 | str.clear(); 171 | } 172 | std::cout << "x: " << x << " counter: " << schCounter << std::endl; 173 | return schCounter; 174 | } // end searchBench 175 | 176 | /* 177 | * reads transaction by transaction 178 | * to be searched 179 | */ 180 | std::vector mBonsaiGm::getVector(std::string s) 181 | { 182 | char *cstr, *p; 183 | std::vector items; 184 | cstr = new char[s.size() + 1]; 185 | strcpy(cstr, s.c_str()); 186 | p = strtok(cstr, " "); 187 | while (p != NULL) { 188 | items.push_back(atoi(p)); 189 | p = strtok(NULL, " "); 190 | } 191 | delete[] cstr; 192 | return items; 193 | } // end getVector 194 | 195 | /* 196 | * searches Items if not found prints error 197 | */ 198 | uint64_t mBonsaiGm::searchItem(uint64_t initAd, uint32_t DIVM) 199 | { 200 | uint32_t DCount = 0; 201 | while (true) { 202 | // insert value in empty loc 203 | if (quotient[initAd] == emptyLoc) { 204 | std::cout 205 | << "We searched every corner of mB universe. Item is not found! 
:(" 206 | << std::endl; 207 | return nodeNotFound; 208 | } else if ((quotient[initAd] == DIVM) && (initAd != randRootAdrs)) { 209 | // item found don't set DArray 210 | if (DCount == D.get(initAd)) { 211 | return initAd; 212 | } else { 213 | DCount++; 214 | initAd++; 215 | if (initAd >= M) 216 | initAd = 0; 217 | } 218 | } else { 219 | DCount++; 220 | initAd++; 221 | if (initAd >= M) 222 | initAd = 0; 223 | } 224 | } 225 | } // end setAddress 226 | -------------------------------------------------------------------------------- /mBonsai_gamma/mBonsaiGm.h: -------------------------------------------------------------------------------- 1 | #ifndef MBONSAIGAMMA_MBONSAIGM 2 | #define MBONSAIGAMMA_MBONSAIGM 3 | #include "../readio/data.h" 4 | #include "DArray/blockedDArray.h" 5 | #include "limits.h" 6 | class mBonsaiGm { 7 | 8 | public: 9 | // structure 10 | sdsl::int_vector<0> quotient; 11 | blockedDArray D; 12 | 13 | // init 14 | mBonsaiGm() {} 15 | mBonsaiGm(uint32_t nodeNumber, uint32_t sigma, double loadFactor, char* file, 16 | uint32_t blockSize, uint32_t numBlocks); 17 | void setData(char* file) { data = new Data(file); } 18 | 19 | // build 20 | void build(); 21 | void insertTrans(Transaction* t); 22 | uint64_t setAddress(uint64_t initAd, uint32_t DIVM); 23 | 24 | // search benchmarks 25 | uint64_t searchBench(char* file); 26 | std::vector getVector(std::string s); // readio 27 | uint64_t searchItem(uint64_t initAd, uint32_t DIVM); 28 | 29 | // misc 30 | bool isPrime(uint64_t input); 31 | uint64_t nextPrimeNumber(uint64_t inputNumber); 32 | 33 | // args for printing and counting 34 | uint64_t sigma; 35 | uint64_t M; 36 | uint32_t nodeNumberCount; 37 | 38 | private: 39 | uint64_t nodeNotFound; 40 | uint32_t rootID; 41 | uint64_t prime; 42 | uint64_t a; 43 | uint64_t aInv; 44 | uint32_t randRootAdrs; 45 | uint32_t emptyLoc; 46 | Data* data; // readio 47 | }; 48 | #endif 49 | -------------------------------------------------------------------------------- 
/mBonsai_gamma/main.cpp: -------------------------------------------------------------------------------- 1 | #include "mBonsaiGm.h" 2 | #include 3 | #include /* ceil */ 4 | using namespace std; 5 | 6 | // arguments 7 | // 1 nodeNum 8 | // 2 alphabet 9 | // 3 dataset 10 | // 4 loadFactor 11 | // 5 blockSize 12 | // 6 searchFile 13 | void printSpace(mBonsaiGm, uint32_t); 14 | 15 | int main(int argc, char* argv[]) 16 | { 17 | uint32_t nodeNum = atoi(argv[1]); 18 | uint32_t sigma = atoi(argv[2]); 19 | char* file = argv[3]; 20 | double loadFactor = atof(argv[4]); 21 | uint32_t blockSize = atoi(argv[5]); 22 | uint32_t numBlocks = (double)(((double)nodeNum / loadFactor) / (double)blockSize) + 1; 23 | char* searchFile = argv[6]; 24 | 25 | mBonsaiGm mbGm(nodeNum, sigma, loadFactor, file, blockSize, numBlocks); 26 | auto begin = std::chrono::high_resolution_clock::now(); // wall time 27 | mbGm.build(); 28 | auto end = std::chrono::high_resolution_clock::now(); 29 | auto dur = end - begin; 30 | auto ns = std::chrono::duration_cast(dur).count(); 31 | std::cout << "Total wall time Build[ " << ns / 1000000 << " ms]" << std::endl; 32 | std::cout << "NodeNumber : " << mbGm.nodeNumberCount << std::endl; 33 | 34 | begin = std::chrono::high_resolution_clock::now(); // wall time 35 | uint64_t schCounter = mbGm.searchBench(searchFile); 36 | end = std::chrono::high_resolution_clock::now(); 37 | dur = end - begin; 38 | ns = std::chrono::duration_cast(dur).count(); 39 | std::cout << "Total wall time searchbench[ " << ns / 1000000 << " ms]" 40 | << std::endl; 41 | printSpace(mbGm, numBlocks); 42 | return 0; 43 | } 44 | 45 | void printSpace(mBonsaiGm mbGm, uint32_t numBlocks) 46 | { 47 | double quotientSpace = sdsl::size_in_bytes(mbGm.quotient) / (double)mbGm.nodeNumberCount * 8.0; 48 | double sumBlockSpace = 0; 49 | for (uint32_t i = 0; i < numBlocks; i++) { 50 | sumBlockSpace += (sdsl::size_in_bytes(mbGm.D.darray[i]->dBlock) * 8.0); 51 | } 52 | sumBlockSpace /= 
(double)mbGm.nodeNumberCount; 53 | double pointerSpace = (double)numBlocks * 64.0 / (double)(mbGm.nodeNumberCount * 8.0); 54 | std::cout << "space in bits (in detail): " << std::endl; 55 | std::cout << "quotient: " << quotientSpace << std::endl; 56 | std::cout << "DArray Blocked Gamma: " << sumBlockSpace << std::endl; 57 | std::cout << "Block Pointers: " << pointerSpace << std::endl; 58 | // average space per M 59 | std::cout << "Total Space: " << quotientSpace + sumBlockSpace + pointerSpace 60 | << std::endl; 61 | } 62 | -------------------------------------------------------------------------------- /mBonsai_recursive/cht.cpp: -------------------------------------------------------------------------------- 1 | #include "cht.h" 2 | 3 | cht::cht(uint64_t universe, uint64_t Mnew, uint32_t sat, uint32_t difference, 4 | double up = 0.80, double down = 0.60) 5 | : dif(difference), M(Mnew), nodeNumberCount(0), satWidth(sat), up_limit(up), 6 | down_limit(down), expand_shrink_by((2.0 * up * down) / (up + down)) { 7 | if (M < 10) 8 | M = 10; 9 | uint64_t cmax = universe - 1; 10 | valNotFound = M + 10; 11 | valExists = valNotFound + 10; 12 | prime = nextPrimeNumber(cmax); 13 | euclAlgorithm(prime); 14 | keyNotFound = prime + 10; 15 | // mersenne_twister_engine::seed 16 | emptyLoc = ceil(prime / M) + 1; 17 | 18 | uint32_t w = sdsl::bits::hi(emptyLoc) + 1 + satWidth + 1; 19 | uint64_t initVal = (emptyLoc << (satWidth + 1)) | 1; 20 | quotient_items_C = sdsl::int_vector<0>(M, initVal, w); 21 | V = sdsl::bit_vector(M, 0); 22 | } 23 | cht::~cht() { 24 | quotient_items_C.resize(0); 25 | V.resize(0); 26 | } 27 | void cht::resize() { 28 | quotient_items_C.resize(0); 29 | V.resize(0); 30 | } 31 | 32 | void cht::resize(uint64_t universe, uint64_t Mnew, uint64_t satWidthnew, 33 | uint32_t difference, double up = 0.80, double down = 0.60) { 34 | if (M < 10) 35 | M = 10; 36 | 37 | up_limit = up; 38 | down_limit = down; 39 | expand_shrink_by = (2.0 * up * down) / (up + down); 40 | dif = 
difference; 41 | satWidth = satWidthnew; 42 | M = Mnew; 43 | nodeNumberCount = 0; 44 | expectedNodes = (0.80 * M) + 1; 45 | valNotFound = M + 10; 46 | valExists = valNotFound + 10; 47 | prime = nextPrimeNumber(universe); 48 | euclAlgorithm(prime); 49 | keyNotFound = prime + 10; 50 | 51 | emptyLoc = ceil(prime / M) + 1; 52 | uint32_t w = sdsl::bits::hi(emptyLoc) + 1 + satWidth + 1; 53 | uint64_t initVal = (emptyLoc << (satWidth + 1)) | 1; 54 | quotient_items_C = sdsl::int_vector<0>(M, initVal, w); 55 | V = sdsl::bit_vector(M, 0); 56 | } 57 | 58 | uint64_t cht::minorExpand(uint64_t universe, double expandRatio = 0) { 59 | uint64_t newM; 60 | if (expandRatio) 61 | newM = (uint64_t)((M * expandRatio) + 1); 62 | else 63 | newM = (uint64_t)((double)(up_limit * M) / expand_shrink_by); 64 | cht cht_new(universe, newM, satWidth, 0); 65 | uint64_t key; 66 | for (uint64_t i = 0; i < M; i++) { 67 | key = getKey(i); 68 | if (key != keyNotFound) { 69 | cht_new.insert(key, getSat(i)); 70 | } 71 | } 72 | quotient_items_C.resize(0); 73 | V.resize(0); 74 | resize(universe, newM, satWidth, 0); 75 | for (uint32_t i = 0; i < newM; i++) { 76 | quotient_items_C[i] = cht_new.quotient_items_C[i]; 77 | V[i] = cht_new.V[i]; 78 | } 79 | cht_new.quotient_items_C.resize(0); 80 | cht_new.V.resize(0); 81 | return newM; 82 | } 83 | 84 | uint64_t cht::minorShrink(uint64_t universe, double shrinkRatio = 0) { 85 | uint64_t newM; 86 | if (shrinkRatio) 87 | newM = (uint64_t)((M * shrinkRatio) + 1); 88 | else 89 | // newM = (uint64_t)((M * expand_shrink_by)+1); 90 | newM = (uint64_t)((double)(down_limit * M) / expand_shrink_by); 91 | cht cht_new(universe, newM, satWidth, 0); 92 | uint64_t key; 93 | for (uint64_t i = 0; i < M; i++) { 94 | key = getKey(i); 95 | if (key != keyNotFound) { 96 | cht_new.insert(key, getSat(i)); 97 | } 98 | } 99 | quotient_items_C.resize(0); 100 | V.resize(0); 101 | resize(universe, newM, satWidth, 0); 102 | for (uint32_t i = 0; i < newM; i++) { 103 | quotient_items_C[i] = 
cht_new.quotient_items_C[i]; 104 | V[i] = cht_new.V[i]; 105 | } 106 | cht_new.quotient_items_C.resize(0); 107 | cht_new.V.resize(0); 108 | return newM; 109 | } 110 | 111 | bool cht::remove(uint64_t key) { 112 | uint64_t cRand = ((key % prime) * a) % prime; 113 | uint64_t initAd = cRand % M; 114 | uint64_t quot = cRand / M; 115 | uint64_t c = findByC(key); 116 | uint64_t tmpSlot = c; 117 | 118 | if (c == valNotFound) 119 | return false; 120 | // if the group leader is in the initAd, then there was no collision 121 | if (c == initAd && getC(c) == 1) { 122 | V[c] = 0; 123 | setItem(c, emptyLoc, 0, 1); 124 | return true; 125 | } 126 | if (getC(c) == 1) { 127 | (tmpSlot == M - 1) ? tmpSlot = 0 : tmpSlot++; 128 | if (getC(tmpSlot) == 0) { 129 | quotient_items_C[tmpSlot] = quotient_items_C[tmpSlot] | 1; 130 | } else { 131 | V[initAd] = 0; 132 | } 133 | } 134 | tmpSlot = c; 135 | uint64_t vCnt = 0, cCnt = 0; 136 | while (getQuo(tmpSlot) != emptyLoc) 137 | (tmpSlot == 0) ? tmpSlot = M - 1 : tmpSlot--; 138 | uint64_t checkPoint = tmpSlot, empt = tmpSlot; 139 | // move one step after empty 140 | (tmpSlot == M - 1) ? tmpSlot = 0 : tmpSlot++; 141 | while (tmpSlot != c) { 142 | if (V[tmpSlot] == 1) 143 | vCnt++; 144 | if (getC(tmpSlot) == 1) 145 | cCnt++; 146 | (tmpSlot == M - 1) ? tmpSlot = 0 : tmpSlot++; 147 | if (vCnt == cCnt) 148 | checkPoint = tmpSlot; 149 | } 150 | uint64_t i, j; 151 | tmpSlot = c; 152 | j = tmpSlot; 153 | (tmpSlot == 0) ? i = M - 1 : i = tmpSlot - 1; 154 | while (j != checkPoint) { 155 | quotient_items_C[j] = quotient_items_C[i]; 156 | j = i; 157 | (i == 0) ? 
i = M - 1 : i--; 158 | } 159 | setItem(checkPoint, emptyLoc, 0, 1); 160 | return true; 161 | } 162 | 163 | uint64_t cht::findByC(uint64_t key) { 164 | uint64_t cRand = ((key % prime) * a) % prime; 165 | uint64_t curAddress = cRand % M; 166 | uint64_t quot = cRand / M; 167 | uint64_t cc = 0; 168 | uint64_t vOnesDown = 0; 169 | uint64_t cOnesUp = 0; 170 | // count vOnes downwards including current address 171 | if (V[curAddress] == 1) 172 | vOnesDown++; 173 | else 174 | return valNotFound; 175 | // start moving downwards 176 | (curAddress == 0) ? curAddress = M - 1 : curAddress--; 177 | // go downwards untill empty slot and count Vones 178 | while (getQuo(curAddress) != emptyLoc) { 179 | if (V[curAddress] == 1) 180 | vOnesDown++; 181 | (curAddress == 0) ? curAddress = M - 1 : curAddress--; 182 | } 183 | // get emptyslot and start moving upwards 184 | curEmptySlot = curAddress; 185 | if (vOnesDown == 0) 186 | return valNotFound; 187 | (curAddress == M - 1) ? curAddress = 0 : curAddress++; 188 | 189 | // go upwards 190 | // count cOnes AFTER emptySlot until conesUp==vOnes down 191 | while (cOnesUp < vOnesDown) { 192 | if (getC(curAddress) == 1) 193 | cOnesUp++; 194 | (curAddress == M - 1) ? curAddress = 0 : curAddress++; 195 | } 196 | // return associated C value, sta 197 | (curAddress == 0) ? curAddress = M - 1 : curAddress--; 198 | if (item_exists(curAddress, quot)) { 199 | return curAddress; 200 | } 201 | uint64_t cntr = 0; 202 | (curAddress == M - 1) ? curAddress = 0 : curAddress++; 203 | while (getC(curAddress) == 0) { 204 | if (item_exists(curAddress, quot)) 205 | return curAddress; 206 | (curAddress == M - 1) ? curAddress = 0 : curAddress++; 207 | } 208 | return valNotFound; 209 | } 210 | /* Function that checks whether or not a given number is 211 | * a prime number or not. 
212 | */ 213 | bool cht::isPrime(uint64_t input) { 214 | uint64_t i; 215 | bool prime = true; 216 | 217 | if (input == 2) { 218 | return true; 219 | } 220 | 221 | if (input % 2 == 0 || input <= 1) { 222 | prime = false; 223 | } else { 224 | for (i = 3; i <= sqrt(input); i += 2) { 225 | if (input % i == 0) { 226 | prime = false; 227 | } 228 | } 229 | } 230 | return prime; 231 | } // end isPrime 232 | 233 | /* 234 | * Function for determining the next prime number 235 | */ 236 | uint64_t cht::nextPrimeNumber(uint64_t inputNumber) { 237 | uint64_t nextPrimeNumber; 238 | if (inputNumber <= 0) { 239 | std::cout << "The number you have entered is zero or negative.\n"; 240 | } else { 241 | while (inputNumber != 0) { 242 | nextPrimeNumber = inputNumber + 1; 243 | if (nextPrimeNumber % 2 == 0 && nextPrimeNumber != 2) { 244 | nextPrimeNumber += 1; 245 | } 246 | while (!isPrime(nextPrimeNumber)) { 247 | nextPrimeNumber += 2; 248 | } 249 | if (isPrime(nextPrimeNumber)) 250 | return nextPrimeNumber; 251 | } 252 | } 253 | return nextPrimeNumber; 254 | } // end nextPrimeNumber 255 | 256 | // A naive method to find modulor multiplicative inverse of 257 | // 'a' under modulo 'm' 258 | uint64_t cht::getModInverse(uint64_t a, uint64_t prime) { 259 | a = a % prime; 260 | for (uint64_t x = 1; x < prime; x++) 261 | if ((a * x) % prime == 1) 262 | return x; 263 | return prime; 264 | } 265 | 266 | void cht::euclAlgorithm(uint64_t prime) { 267 | long long aTemp = (long long)(0.66 * (double)prime); 268 | long long aInvTemp = prime; 269 | while (true) { 270 | aInvTemp = getModInverse(aTemp, prime); 271 | if (aInvTemp == prime) 272 | aTemp++; 273 | else 274 | break; 275 | } 276 | a = aTemp; 277 | aInv = aInvTemp; 278 | } 279 | 280 | uint64_t cht::getInitAd(uint64_t loc) { 281 | // check if empty 282 | if (getQuo(loc) == emptyLoc) 283 | return valNotFound; 284 | // count Cones down including first not the last(the empty one) 285 | uint64_t cOnes = 0, vOnes = 0, index = loc; 286 | while 
(getQuo(index) != emptyLoc) { 287 | if (getC(index) == 1) 288 | cOnes++; 289 | (index == 0) ? index = M - 1 : index--; 290 | } 291 | while (vOnes != cOnes) { 292 | if (V[index] == 1) 293 | vOnes++; 294 | (index == M - 1) ? index = 0 : index++; 295 | } 296 | return (index == 0) ? index = M - 1 : index - 1; 297 | } 298 | 299 | uint64_t cht::getKey(uint64_t loc) { 300 | if (getQuo(loc) == emptyLoc) 301 | return keyNotFound; 302 | uint64_t initAd = getInitAd(loc); 303 | if (initAd == valNotFound) 304 | return keyNotFound; 305 | uint64_t cRand = (getQuo(loc) * M) + initAd; 306 | uint64_t key = (cRand * aInv) % prime; 307 | return key; 308 | } 309 | 310 | /* 311 | * Searches items by key, called in mBonsai.cpp. 312 | * Possible return: dif<=x<=(127+dif) //satelite data 313 | * If search fails return 127+dif+1 314 | */ 315 | uint64_t cht::find(uint64_t key) { 316 | uint64_t cRand = ((key % prime) * a) % prime; 317 | uint64_t initAd = cRand % M; 318 | uint64_t quotient = cRand / M; 319 | if (V[initAd] == 0) { 320 | return (1 << satWidth) + dif; // not found 321 | } else { 322 | uint64_t sog = getStartOfGroup(initAd); 323 | uint64_t exists = getSatelite(initAd, sog, quotient); 324 | if (exists != valNotFound) { 325 | return getSat(curEmptySlot); 326 | } 327 | return (1 << satWidth) + dif; // not found 328 | } 329 | } // end find 330 | 331 | /* Returns the location of the displacement value 332 | * which is stored as satelite data. 333 | * Called in find only when node exists. 
334 | */ 335 | uint64_t cht::getSatelite(uint64_t vVal, uint64_t cVal, uint64_t quotient) { 336 | uint64_t curC; 337 | uint64_t tmpSlot; 338 | // check if the value is already inserted 339 | if (item_exists(cVal, quotient)) { 340 | curEmptySlot = cVal; 341 | return cVal; 342 | } else if (is_empty(cVal)) { 343 | curEmptySlot = cVal; 344 | return valNotFound; 345 | } 346 | // start going upwards until block ends where c!=0 347 | curC = (cVal + 1) % M; 348 | // go upwards towards the end of the block 349 | while (getC(curC) == 0) { 350 | if (item_exists(curC, quotient)) { 351 | curEmptySlot = curC; 352 | return curC; 353 | } 354 | curC = (curC + 1) % M; 355 | } 356 | return valNotFound; 357 | } // end getSatelite 358 | 359 | /* 360 | * changeBit Loc must be in a specific location relative to the virgin bit. 361 | */ 362 | uint64_t cht::getChangeBitLoc(uint64_t curAddress) { 363 | uint32_t vOnesDown = 0; 364 | uint32_t cOnesUp = 0; 365 | // count vOnes downwards including current address 366 | if (V[curAddress] == 1) 367 | vOnesDown++; 368 | // start moving downwards 369 | curAddress = (curAddress + M - 1) % M; 370 | 371 | // go downwards untill empty slot and count Vones 372 | while (!is_empty(curAddress)) { 373 | if (V[curAddress] == 1) 374 | vOnesDown++; 375 | curAddress = (curAddress + M - 1) % M; 376 | } 377 | // get emptyslot and start moving upwards 378 | curEmptySlot = curAddress; 379 | if (vOnesDown == 0) 380 | return valNotFound; 381 | curAddress = (curAddress + 1) % M; 382 | 383 | // go upwards 384 | // count cOnes AFTER emptySlot until conesUp==vOnes down 385 | while (cOnesUp < vOnesDown) { 386 | if (getC(curAddress) == 1) 387 | cOnesUp++; 388 | curAddress = (curAddress + 1) % M; 389 | } 390 | // return associated C value, sta 391 | return ((curAddress + M - 1) % M); 392 | } 393 | bool cht::insert(uint64_t key, uint32_t value) { 394 | uint64_t cRand = ((key % prime) * a) % prime; 395 | uint64_t initAd = cRand % M; 396 | uint64_t quotient = cRand / M; 397 | 
bool nodeIncrease = true; 398 | 399 | if (is_empty(initAd)) { 400 | nodeNumberCount++; 401 | setItem(initAd, quotient, (value - dif), 1); 402 | V[initAd] = 1; 403 | return nodeIncrease; 404 | } else { 405 | uint64_t changeBit = getItemLocation(initAd); 406 | if (V[initAd] == 0) { 407 | if (changeBit != valNotFound) 408 | startNewBlock(initAd, changeBit); 409 | V[initAd] = 1; 410 | setItem(curEmptySlot, quotient, (value - dif), 1); 411 | nodeNumberCount++; 412 | return nodeIncrease; 413 | } else { 414 | nodeIncrease = findSpace(changeBit, quotient); 415 | if (nodeIncrease) 416 | nodeNumberCount++; 417 | setItem(curEmptySlot, quotient, (value - dif), 0); 418 | return nodeIncrease; 419 | } 420 | } 421 | } 422 | 423 | uint64_t cht::getItemLocation(uint64_t curAddress) { 424 | uint64_t vOnesDown = 0; 425 | uint64_t cOnesUp = 0; 426 | // count vOnes downwards including current address 427 | if (V[curAddress] == 1) 428 | vOnesDown++; 429 | // start moving downwards 430 | curAddress = (curAddress + M - 1) % M; 431 | // go downwards untill empty slot and count Vones 432 | while (!is_empty(curAddress)) { 433 | 434 | if (V[curAddress] == 1) 435 | vOnesDown++; 436 | curAddress = (curAddress + M - 1) % M; 437 | } 438 | // get emptyslot and start moving upwards 439 | curEmptySlot = curAddress; 440 | if (vOnesDown == 0) 441 | return valNotFound; 442 | // (curAddress == M - 1) ? curAddress = 0 : curAddress++; 443 | curAddress = (curAddress + 1) % M; 444 | // go upwards 445 | // count cOnes AFTER emptySlot until conesUp==vOnes down 446 | while (cOnesUp < vOnesDown) { 447 | if (getC(curAddress) == 1) 448 | cOnesUp++; 449 | curAddress = (curAddress + 1) % M; 450 | } 451 | // return associated C value, sta 452 | return ((curAddress + M - 1) % M); 453 | } 454 | 455 | /* We already found our associateC location. 
456 | * We push c values and hashtables accordingly, 457 | * so we make room to insert the new node in the associateC location 458 | * return boolean if we found the item in the 459 | */ 460 | bool cht::findSpace(uint64_t cVal, uint64_t quotient) { 461 | uint64_t curC; 462 | // check if the value is already inserted 463 | uint64_t tmpSlot; 464 | if (item_exists(cVal, quotient)) { 465 | curEmptySlot = cVal; 466 | return false; 467 | } 468 | if (is_empty(cVal)) { 469 | curEmptySlot = cVal; 470 | return true; 471 | } 472 | // start going upwards until block ends where c!=0 473 | curC = (cVal + 1) % M; 474 | // go upwards towards the end of the block 475 | while (getC(curC) == 0) { 476 | if (item_exists(cVal, quotient)) { 477 | curEmptySlot = curC; 478 | return false; 479 | } 480 | curC = (curC + 1) % M; 481 | } 482 | // go one back to stay in the block 483 | curC = (curC + M - 1) % M; 484 | 485 | // push all the slots upto curC to insert it in curC 486 | while (curEmptySlot != curC) { 487 | tmpSlot = (curEmptySlot + 1) % M; 488 | quotient_items_C[curEmptySlot] = quotient_items_C[tmpSlot]; 489 | curEmptySlot = (curEmptySlot + 1) % M; 490 | } 491 | curEmptySlot = curC; 492 | } // end findSpace 493 | 494 | /* In case we eant to start a new Block 495 | * and the changeBit lies in another block. 496 | * We handle this situation differently. 497 | */ 498 | void cht::startNewBlock(uint64_t vVal, uint64_t cVal) { 499 | 500 | uint32_t tmpSlot; 501 | uint64_t curC; 502 | curC = (cVal + 1) % M; 503 | 504 | while (getC(curC) == 0) 505 | curC = (curC + 1) % M; 506 | 507 | while (curEmptySlot != curC) { 508 | tmpSlot = (curEmptySlot + 1) % M; 509 | quotient_items_C[curEmptySlot] = quotient_items_C[tmpSlot]; 510 | curEmptySlot = tmpSlot; 511 | } 512 | curEmptySlot = (curEmptySlot + M - 1) % M; 513 | } // end startNewBlock 514 | 515 | /* 516 | * changeBit Loc must be in a specific location relative to the virgin bit. 
517 | */ 518 | uint64_t cht::getStartOfGroup(uint64_t curAddress) { 519 | uint32_t vOnesDown = 0; 520 | uint32_t cOnesUp = 0; 521 | // count vOnes downwards including current address 522 | if (V[curAddress] == 1) 523 | vOnesDown++; 524 | // start moving downwards 525 | curAddress = (curAddress + M - 1) % M; 526 | // go downwards untill empty slot and count Vones 527 | while (!is_empty(curAddress)) { 528 | if (V[curAddress] == 1) 529 | vOnesDown++; 530 | curAddress = (curAddress + M - 1) % M; 531 | } 532 | // get emptyslot and start moving upwards 533 | curEmptySlot = curAddress; 534 | if (vOnesDown == 0) 535 | return valNotFound; 536 | curAddress = (curAddress + 1) % M; 537 | 538 | // go upwards 539 | // count cOnes AFTER emptySlot until conesUp==vOnes down 540 | while (cOnesUp < vOnesDown) { 541 | if (getC(curAddress) == 1) 542 | cOnesUp++; 543 | curAddress = (curAddress + 1) % M; 544 | } 545 | // return associated C value, sta 546 | return (curAddress + M - 1) % M; 547 | } 548 | -------------------------------------------------------------------------------- /mBonsai_recursive/cht.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPACTHASHTABLE_CHT 2 | #define COMPACTHASHTABLE_CHT 3 | 4 | #include "limits.h" 5 | #include "sdsl/int_vector.hpp" 6 | #include 7 | class cht { 8 | 9 | public: 10 | // structure 11 | sdsl::int_vector<0> quotient_items_C; 12 | sdsl::bit_vector V; 13 | 14 | // init 15 | cht() {} 16 | cht(uint64_t universe, uint64_t M, uint32_t sat, uint32_t diff, 17 | double up = 0.80, double down = 0.60); 18 | ~cht(); 19 | void resize(uint64_t universe, uint64_t M, uint64_t sat, uint32_t diff, 20 | double up = 0.80, double down = 0.60); 21 | void resize(); 22 | uint64_t minorExpand(uint64_t universe, double expandRatio = 0); 23 | uint64_t minorShrink(uint64_t universe, double shrinkRatio = 0); 24 | // misc 25 | uint64_t getModInverse(uint64_t a, uint64_t prime); 26 | void euclAlgorithm(uint64_t prime); 27 | 
bool isPrime(uint64_t input); 28 | uint64_t nextPrimeNumber(uint64_t inputNumber); 29 | 30 | // reverse key 31 | uint64_t getInitAd(uint64_t loc); 32 | uint64_t getKey(uint64_t loc); 33 | 34 | // API 35 | bool insert(uint64_t key, uint32_t value); 36 | bool remove(uint64_t key); 37 | uint64_t find(uint64_t key); 38 | 39 | uint64_t findByC(uint64_t key); 40 | uint64_t getStartOfGroup(uint64_t curAddress); 41 | uint64_t getItemLocation(uint64_t curAddress); 42 | uint64_t getChangeBitLoc(uint64_t curAddress); 43 | bool findSpace(uint64_t cVal, uint64_t quotient); 44 | void startNewBlock(uint64_t vVal, uint64_t cVal); 45 | // fetch displacement value stored as satelite data 46 | uint64_t getSatelite(uint64_t vVal, uint64_t cVal, uint64_t quotients); 47 | 48 | inline void setItem(uint64_t loc, uint64_t quo, uint64_t sat, uint64_t c) { 49 | quotient_items_C[loc] = (((quo << satWidth) | sat) << 1) | c; 50 | } 51 | inline uint64_t getQuo(uint64_t loc) { 52 | return quotient_items_C[loc] >> (satWidth + 1); 53 | } 54 | inline uint64_t getSat(uint64_t loc) { 55 | return (quotient_items_C[loc] >> 1) & ((1 << satWidth) - 1); 56 | } 57 | inline uint64_t getC(uint64_t loc) { return quotient_items_C[loc] & 1; } 58 | inline bool item_exists(uint64_t loc, uint64_t quotient) { 59 | return ((quotient_items_C[loc] >> (satWidth + 1)) == quotient); 60 | } 61 | inline bool is_empty(uint64_t loc) { 62 | return ((quotient_items_C[loc] >> (satWidth + 1)) == emptyLoc); 63 | } 64 | uint32_t nodeNumberCount; 65 | uint64_t M; 66 | uint64_t satWidth; 67 | 68 | private: 69 | double up_limit, down_limit, expand_shrink_by; 70 | uint32_t dif; 71 | uint64_t cmax; 72 | uint64_t valNotFound; 73 | uint64_t valExists; 74 | uint64_t keyNotFound; 75 | uint64_t sigma; 76 | uint64_t prime; 77 | uint64_t a; 78 | uint64_t aInv; 79 | uint64_t emptyLoc; 80 | uint64_t curEmptySlot; 81 | uint64_t expectedNodes; 82 | }; 83 | #endif 84 | -------------------------------------------------------------------------------- 
/mBonsai_recursive/compile_run.sh: -------------------------------------------------------------------------------- 1 | g++ -O3 -Wall -fpermissive -std=c++11 -w -Wall -Wextra -DNDEBUG -g -ffast-math -funroll-loops -msse4.2 -I ~/include -L ~/lib cht.cpp ../Hash/hash.cpp ../readio/data.cpp mBonsai.cpp main.cpp -o mBonsai -lsdsl -ldivsufsort -ldivsufsort64 2 | 3 | ./mBonsai 3095561 5 ../datasets/SRR034939.dat 0.8 ../datasets/SRR034939.dat 4 | ./mBonsai 4242318 442 ../datasets/tranSortedAccidents.dat 0.8 ../datasets/tranSortedAccidents.dat -------------------------------------------------------------------------------- /mBonsai_recursive/mBonsai.cpp: -------------------------------------------------------------------------------- 1 | #include "mBonsai.h" 2 | using namespace std; 3 | 4 | /* 5 | * Constructor initialise trie 6 | */ 7 | mBonsai::mBonsai(uint64_t nodeNumber, uint64_t sigma, double loadFactor, 8 | char *file) 9 | : sigma(sigma), M((uint64_t)(nodeNumber) / loadFactor), valNotFound(M + 10), 10 | nodeNumberCount(1) { 11 | srand(time(NULL)); 12 | setData(file); 13 | uint64_t cmax = (sigma * M) + (M - 1); 14 | hashF = hashFunction(M, cmax); 15 | randRootAdrs = rand() % M; 16 | emptyLoc = sigma + 2; 17 | rootID = 1; 18 | uint64_t w = sdsl::bits::hi(emptyLoc) + 1 + dWidth; 19 | uint64_t initVal = emptyLoc << dWidth; 20 | quotient_D = sdsl::int_vector<0>(M, initVal, w); 21 | cht_sl = 22 | cht(M, 1.25 * ((double)M * chtRatio) + 1.0, (1 << dWidth) - 1, end_of_l1); 23 | quotient_D[randRootAdrs] = rootID << dWidth; 24 | } 25 | /* 26 | * It goes through the dataset transaction by transaction 27 | */ 28 | void mBonsai::build() { 29 | uint64_t trCounter = 0; 30 | while (Transaction *t = data->getNext()) { 31 | insert(t, trCounter); 32 | // cout<length; ++i) { 46 | hashF.getKey(prevInitAd, (uint64_t)t->t[i]); 47 | prevInitAd = setAddress(hashF.initAd, hashF.quotient); 48 | } // end first for 49 | } // end of insert 50 | 51 | /* 52 | * called by insert 53 | * it sets the 
quotient value in the correct location 54 | * It handles displacement value accordingly 55 | * returns the hash loc so the next item (its child) can use it 56 | */ 57 | uint64_t mBonsai::setAddress(uint64_t initAd, uint64_t DIVM) { 58 | uint64_t DCount = 0; 59 | while (true) { 60 | // EMPTY LOC soo insert 61 | if (getQuo(initAd) == emptyLoc && (initAd != randRootAdrs)) { 62 | singlepath = true; 63 | ++nodeNumberCount; 64 | if (DCount != 0) { 65 | if (DCount < end_of_l1) { 66 | setQuo_D(initAd, DIVM, DCount); 67 | } else if (DCount >= end_of_l1 && DCount <= end_of_l2) { 68 | setQuo_D(initAd, DIVM, end_of_l1); 69 | cht_sl.insert(initAd, DCount); 70 | } else if (DCount > end_of_l2) { 71 | setQuo_D(initAd, DIVM, end_of_l1); 72 | mapSl.insert(std::pair(initAd, DCount)); 73 | } 74 | } else { 75 | setQuo(initAd, DIVM); 76 | } 77 | return initAd; 78 | // check if it already exists 79 | } else if ((getQuo(initAd) == DIVM) && (singlepath == false) && 80 | (initAd != randRootAdrs)) { 81 | // option for main 82 | if (DCount < end_of_l1 || (getD(initAd) < end_of_l1)) { 83 | if (DCount == getD(initAd)) { 84 | return initAd; 85 | } else { 86 | ++DCount; 87 | initAd = (initAd + 1) % M; 88 | } 89 | // sublayer 90 | } else if (getD(initAd) == end_of_l1 && DCount >= end_of_l1 && 91 | DCount <= end_of_l2) { 92 | uint64_t tmpSat = cht_sl.find(initAd); 93 | if (tmpSat == end_of_l2 + 1) { 94 | ++DCount; 95 | initAd = (initAd + 1) % M; 96 | } else if ((tmpSat + end_of_l1) == DCount) { 97 | return initAd; 98 | } else { 99 | ++DCount; 100 | initAd = (initAd + 1) % M; 101 | } 102 | // final c++ hash map 103 | } else if (getD(initAd) == end_of_l1 && DCount > end_of_l2) { 104 | if (mapSl.find(initAd) == mapSl.end() || 105 | mapSl.find(initAd)->second != DCount) { 106 | ++DCount; 107 | initAd = (initAd + 1) % M; 108 | } else { 109 | return initAd; 110 | } 111 | } 112 | // NOT EMPTY_LOC and NOT SAME_DIV.. 
MOVE_ON then 113 | } else { 114 | ++DCount; 115 | initAd = (initAd + 1) % M; 116 | } 117 | } // end while 118 | } // end setAddress 119 | 120 | /* Used for searchBenchmarks 121 | * Goes through a search file searching transactions by transactions. 122 | * This bench is designed spesifically for successful search operations 123 | * Outputs error if search is unsuccessful.*/ 124 | uint64_t mBonsai::searchBench(char *file) { 125 | uint64_t counter = 0; 126 | std::ifstream infile; 127 | infile.open(file); 128 | std::vector str; 129 | std::string rawData; 130 | 131 | while (getline(infile, rawData)) { 132 | str = getVector(rawData); 133 | uint64_t prevInitAd = randRootAdrs; 134 | for (int i = 0; i < str.size(); ++i) { 135 | hashF.getKey(prevInitAd, (uint64_t)str[i]); 136 | prevInitAd = searchItem(hashF.initAd, hashF.quotient, str[i]); 137 | counter++; 138 | } 139 | str.clear(); 140 | } 141 | return counter; 142 | } // end searchBench 143 | 144 | /* reads transaction by transaction 145 | * to be searched */ 146 | std::vector mBonsai::getVector(std::string s) { 147 | char *cstr, *p; 148 | std::vector items; 149 | cstr = new char[s.size() + 1]; 150 | strcpy(cstr, s.c_str()); 151 | p = strtok(cstr, " "); 152 | while (p != NULL) { 153 | items.push_back(atoi(p)); 154 | p = strtok(NULL, " "); 155 | } 156 | delete[] cstr; 157 | return items; 158 | } // end getVector 159 | 160 | /* searches Items if not found prints error*/ 161 | uint64_t mBonsai::searchItem(uint64_t initAd, uint64_t DIVM, uint64_t itemID) { 162 | uint64_t DCount = 0; 163 | while (true) { 164 | // EMPTY LOC so item not Found 165 | if (getQuo(initAd) == emptyLoc) { 166 | return valNotFound; 167 | // check if it alreadey exists 168 | } else if ((getQuo(initAd) == DIVM) && (initAd != randRootAdrs)) { 169 | // option for main 170 | if (DCount < end_of_l1 || getD(initAd) < end_of_l1) { 171 | if (DCount == getD(initAd)) { 172 | return initAd; 173 | } else { 174 | ++DCount; 175 | initAd = (initAd + 1) % M; 176 | } 177 
| // option for sublayer 178 | } else if (getD(initAd) == end_of_l1 && DCount >= end_of_l1 && 179 | DCount <= end_of_l2) { 180 | uint64_t tmpSat = cht_sl.find(initAd); 181 | if (tmpSat == end_of_l2 + 1) { 182 | ++DCount; 183 | initAd = (initAd + 1) % M; 184 | } else if ((tmpSat + end_of_l1) == DCount) { 185 | return initAd; 186 | } else { 187 | ++DCount; 188 | initAd = (initAd + 1) % M; 189 | } 190 | // final option hash map 191 | } else if (getD(initAd) == end_of_l1 && DCount > end_of_l2) { 192 | if (mapSl.find(initAd) == mapSl.end() || 193 | mapSl.find(initAd)->second != DCount) { 194 | ++DCount; 195 | initAd = (initAd + 1) % M; 196 | } else { 197 | return initAd; 198 | } 199 | } 200 | // NOT EMPTY_LOC NOT SAME_DIV move to next one 201 | } else { 202 | ++DCount; 203 | initAd = (initAd + 1) % M; 204 | } 205 | } // end while 206 | } // end searchItem 207 | 208 | uint64_t mBonsai::getInitAd(uint64_t loc) { 209 | if (getQuo(loc) == emptyLoc) 210 | return valNotFound; 211 | if (getD(loc) < end_of_l1) 212 | return (loc + M - getD(loc)) % M; 213 | uint64_t tmpSat = cht_sl.find(loc); 214 | if (tmpSat != 135) 215 | return (loc + M - (tmpSat + end_of_l1)) % M; 216 | return (loc + M - mapSl.find(loc)->second) % M; 217 | } 218 | 219 | uint64_t mBonsai::getParent(uint64_t loc) { 220 | uint64_t initAd = getInitAd(loc); 221 | if (initAd == valNotFound) 222 | return valNotFound; 223 | return hashF.recoverParentLoc(initAd, getQuo(loc)); 224 | } 225 | uint64_t mBonsai::getItemLabel(uint64_t loc) { 226 | uint64_t initAd = getInitAd(loc); 227 | if (initAd == valNotFound) 228 | return valNotFound; 229 | if (loc == randRootAdrs) 230 | return rootID; 231 | return hashF.recoverID(initAd, getQuo(loc)); 232 | } 233 | 234 | mBonsai_expand::mBonsai_expand(uint64_t n, uint64_t M, uint64_t sigma) { 235 | uint64_t w = sdsl::bits::hi(sigma) + 1; 236 | degree_cntr = sdsl::int_vector<0>(M, 0, w); 237 | unary_bit = sdsl::bit_vector(M + n + 1); 238 | labels = sdsl::int_vector<0>(n, sigma + 1, w); 
239 | displ_level1 = sdsl::int_vector<0>(n, 0, 3); 240 | displ_level2 = cht(n, 1.25 * ((double)n * 0.1) + 1.0, 7, 7); 241 | } 242 | 243 | double mBonsai::traverse() { 244 | double timeMs = 0.0; 245 | auto begin = std::chrono::high_resolution_clock::now(); 246 | uint64_t n = nodeNumberCount; 247 | // 1) create degree_cntr array of Mlogsigma 248 | mbe = mBonsai_expand(n, M, sigma); 249 | for (uint64_t i = 0; i < M; ++i) { 250 | if (getQuo(i) != emptyLoc && i != randRootAdrs) { 251 | uint64_t parLoc = getParent(i); 252 | if (parLoc != valNotFound) 253 | mbe.degree_cntr[parLoc]++; 254 | } 255 | } // end for 1 256 | 257 | // 2) represent mbe.degree_cntr in unary bit_vec of M+n bits 258 | uint64_t offset = 0; 259 | for (uint64_t i = 0; i < M; ++i) { 260 | uint64_t end_of_unary = offset + mbe.degree_cntr[i]; 261 | while (offset < end_of_unary) 262 | mbe.unary_bit[offset++] = 1; 263 | mbe.unary_bit[offset++] = 0; 264 | } // end for 2 265 | for (uint64_t i = 0; i < M; ++i) { 266 | mbe.degree_cntr[i] = 0; 267 | } // end for 3 268 | // create select of unary bit 269 | mbe.unary_sel = sdsl::select_support_mcl<0, 1>(&mbe.unary_bit); 270 | 271 | // print time 272 | auto end = std::chrono::high_resolution_clock::now(); 273 | auto dur = end - begin; 274 | auto ns = std::chrono::duration_cast(dur).count(); 275 | std::cout << "Simple traverse - preparation phase:" << std::endl; 276 | std::cout << "init degree_counter(1) + init unary bitvector(2) + init " 277 | "select(3)+initlabels tmpDaray\n[ " 278 | << ns / 1000000 << " ms]" << std::endl; 279 | timeMs += ns / 1000000; 280 | // step 2 ->start insertion to n mbe.labels 281 | begin = std::chrono::high_resolution_clock::now(); // wall time 282 | uint64_t label, parLoc, labelPos, displValue; 283 | for (uint64_t i = 0; i < M; ++i) { 284 | if (!(getQuo(i) == emptyLoc || i == randRootAdrs)) { 285 | label = getItemLabel(i); 286 | parLoc = getParent(i) + 1; 287 | labelPos = 288 | mbe.unary_sel.select(parLoc) - parLoc - 
mbe.degree_cntr[parLoc - 1]; 289 | mbe.degree_cntr[parLoc - 1]++; // increase offset 290 | mbe.labels[labelPos] = label; // insert label 291 | } 292 | } // end for 4 293 | end = std::chrono::high_resolution_clock::now(); 294 | dur = end - begin; 295 | ns = std::chrono::duration_cast(dur).count(); 296 | std::cout << "set labels and displ_values in labels and displ array\n[ " 297 | << ns / 1000000 << " ms]" << std::endl; 298 | timeMs += ns / 1000000; 299 | 300 | // expand temp data structure 301 | // step 3 -> traverse old insert in new 302 | // inorder traversal 303 | nodeCounter = 0; 304 | std::cout << "Simple traverse - traverse phase:" << std::endl; 305 | begin = std::chrono::high_resolution_clock::now(); // wall time 306 | inOrderTraverse(randRootAdrs); 307 | end = std::chrono::high_resolution_clock::now(); 308 | dur = end - begin; 309 | ns = std::chrono::duration_cast(dur).count(); 310 | std::cout << "init new tmp Bonsai + traverse old and insert into new replace " 311 | "old with new\n[ " 312 | << ns / 1000000 << " ms] " << nodeCounter << std::endl; 313 | timeMs += ns / 1000000; 314 | return timeMs; 315 | } 316 | 317 | void mBonsai::inOrderTraverse(uint64_t oldParentId) { 318 | const uint64_t parent = oldParentId; 319 | uint64_t label, labelPos; 320 | uint64_t initLabel = mbe.unary_sel.select(parent + 1) - (parent + 1); 321 | // stop when siblings finish 322 | while (mbe.degree_cntr[parent] != 0) { 323 | mbe.degree_cntr[parent]--; 324 | labelPos = initLabel - mbe.degree_cntr[parent]; 325 | label = mbe.labels[labelPos]; 326 | hashF.getKey(parent, label); 327 | oldParentId = searchItem(hashF.initAd, hashF.quotient, label); 328 | inOrderTraverse(oldParentId); 329 | nodeCounter++; 330 | } 331 | } 332 | 333 | void mBonsai::naiveTraverse() { 334 | uint64_t w = sdsl::bits::hi(sigma) + 1; 335 | degreeCounter = sdsl::int_vector<0>(M, 0, w); 336 | testTraverse = 0; 337 | nodeCounter = 0; 338 | testTraverse += nSigmaTraversal(randRootAdrs); 339 | } 340 | uint64_t 
mBonsai::nSigmaTraversal(uint64_t oldParentId) { 341 | const uint64_t parent = oldParentId; 342 | // stop when siblings finish 343 | while (degreeCounter[parent] <= sigma) { 344 | // calc old child ID 345 | hashF.getKey(parent, degreeCounter[parent]); 346 | oldParentId = 347 | searchItem(hashF.initAd, hashF.quotient, degreeCounter[parent]); 348 | if (oldParentId != valNotFound) { 349 | testTraverse += nSigmaTraversal(oldParentId); 350 | nodeCounter++; 351 | } 352 | degreeCounter[parent]++; 353 | } 354 | return oldParentId; 355 | } 356 | uint64_t mBonsai::getDisplVal(uint64_t loc) { 357 | if (getD(loc) < end_of_l1) 358 | return getD(loc); 359 | uint64_t tmpSat = cht_sl.find(loc); 360 | if (tmpSat != end_of_l2 + 1) 361 | return (tmpSat + end_of_l1); 362 | return mapSl.find(loc)->second; 363 | } 364 | uint64_t mBonsai::getDisplVal_tmp(uint64_t loc) { 365 | if (mbe.displ_level1[loc] < end_of_l1) 366 | return mbe.displ_level1[loc]; 367 | uint64_t tmpSat = mbe.displ_level2.find(loc); 368 | if (tmpSat != end_of_l2 + 1) 369 | return (tmpSat + end_of_l1); 370 | return mbe.displ_level3.find(loc)->second; 371 | } 372 | 373 | void mBonsai::replace(const mBonsai &other) { 374 | sigma = other.sigma; 375 | M = other.M; 376 | valNotFound = other.valNotFound; 377 | aInv = other.aInv; 378 | emptyLoc = other.emptyLoc; 379 | rootID = other.rootID; 380 | randRootAdrs = other.randRootAdrs; 381 | uint64_t w = sdsl::bits::hi(emptyLoc) + 1 + dWidth; 382 | uint64_t initVal = emptyLoc << dWidth; 383 | quotient_D = sdsl::int_vector<0>(M, initVal, w); 384 | cht_sl = 385 | cht(M, 1.25 * ((double)M * chtRatio) + 1.0, (1 << dWidth) - 1, end_of_l1); 386 | mapSl = other.mapSl; 387 | for (uint64_t i = 0; i < M; ++i) { 388 | quotient_D[i] = other.quotient_D[i]; 389 | } 390 | for (uint64_t i = 0; i < cht_sl.M; ++i) { 391 | cht_sl.quotient_items_C[i] = other.cht_sl.quotient_items_C[i]; 392 | } 393 | for (uint64_t i = 0; i < cht_sl.M; ++i) { 394 | cht_sl.V[i] = other.cht_sl.V[i]; 395 | } 396 | 
other.resize(); 397 | } 398 | 399 | mBonsai::~mBonsai() { 400 | quotient_D.resize(0); 401 | cht_sl.resize(); 402 | mapSl = std::map(); 403 | } 404 | void mBonsai::resize() { 405 | quotient_D.resize(0); 406 | cht_sl.resize(); 407 | mapSl = std::map(); 408 | } 409 | 410 | void mBonsai::extendTrie(double expandRatio = 2.0) { 411 | auto begin = std::chrono::high_resolution_clock::now(); 412 | uint64_t n = nodeNumberCount; 413 | // 1) create degree_cntr array of Mlogsigma 414 | mbe = mBonsai_expand(n, M, sigma); 415 | for (uint64_t i = 0; i < M; ++i) { 416 | if (getQuo(i) != emptyLoc && i != randRootAdrs) { 417 | uint64_t parLoc = getParent(i); 418 | if (parLoc != valNotFound) 419 | mbe.degree_cntr[parLoc]++; 420 | } 421 | } // end for 1 422 | 423 | // 2) represent mbe.degree_cntr in unary bit_vec of M+n bits 424 | uint64_t offset = 0; 425 | for (uint64_t i = 0; i < M; ++i) { 426 | uint64_t end_of_unary = offset + mbe.degree_cntr[i]; 427 | while (offset < end_of_unary) 428 | mbe.unary_bit[offset++] = 1; 429 | mbe.unary_bit[offset++] = 0; 430 | } // end for 2 431 | for (uint64_t i = 0; i < M; ++i) { 432 | mbe.degree_cntr[i] = 0; 433 | } // end for 3 434 | // create select of unary bit 435 | mbe.unary_sel = sdsl::select_support_mcl<0, 1>(&mbe.unary_bit); 436 | 437 | // print time 438 | auto end = std::chrono::high_resolution_clock::now(); 439 | auto dur = end - begin; 440 | auto ns = std::chrono::duration_cast(dur).count(); 441 | std::cout << "init degree_counter(1) + init unary bitvector(2) + init " 442 | "select(3)+initlabels tmpDaray\n[ " 443 | << ns / 1000000 << " ms]" << std::endl; 444 | 445 | // step 2 ->start insertion to n mbe.labels 446 | begin = std::chrono::high_resolution_clock::now(); // wall time 447 | uint64_t label, parLoc, labelPos, displValue; 448 | for (uint64_t i = 0; i < M; ++i) { 449 | if (!(getQuo(i) == emptyLoc || i == randRootAdrs)) { 450 | label = getItemLabel(i); 451 | parLoc = getParent(i) + 1; 452 | labelPos = 453 | 
mbe.unary_sel.select(parLoc) - parLoc - mbe.degree_cntr[parLoc - 1]; 454 | mbe.degree_cntr[parLoc - 1]++; // increase offset 455 | displValue = getDisplVal(i); // get_D from original DArray 456 | // insert D in temp DArray 457 | if (displValue < end_of_l1) { 458 | mbe.displ_level1[labelPos] = displValue; 459 | } else if (displValue >= end_of_l1 && displValue <= end_of_l2) { 460 | mbe.displ_level1[labelPos] = end_of_l1; 461 | mbe.displ_level2.insert(labelPos, displValue); 462 | } else { 463 | mbe.displ_level1[labelPos] = end_of_l1; 464 | mbe.displ_level3.insert( 465 | std::pair(labelPos, displValue)); 466 | } 467 | mbe.labels[labelPos] = label; // insert label 468 | } 469 | } // end for 4 470 | end = std::chrono::high_resolution_clock::now(); 471 | dur = end - begin; 472 | ns = std::chrono::duration_cast(dur).count(); 473 | std::cout << "set labels and displ_values in labels and displ array\n[ " 474 | << ns / 1000000 << " ms]" << std::endl; 475 | 476 | // expand temp data structure 477 | // step 3 -> traverse old insert in new 478 | // inorder traversal 479 | mBonsai tmp_mbr(n * expandRatio, sigma, 0.8, ":p"); 480 | begin = std::chrono::high_resolution_clock::now(); // wall time 481 | inOrderExpand(randRootAdrs, tmp_mbr, tmp_mbr.randRootAdrs); 482 | end = std::chrono::high_resolution_clock::now(); 483 | dur = end - begin; 484 | ns = std::chrono::duration_cast(dur).count(); 485 | std::cout << "init new tmp Bonsai + traverse old and insert into new replace " 486 | "old with new\n[ " 487 | << ns / 1000000 << " ms]" << std::endl; 488 | resize(); 489 | replace(tmp_mbr); 490 | } 491 | void mBonsai::inOrderExpand(uint64_t oldParentId, const mBonsai &tmp_mbr, 492 | uint64_t newParentId) { 493 | const uint64_t parent = oldParentId; 494 | const uint64_t newParent = newParentId; 495 | // hashFunction key; 496 | uint64_t label, labelPos; 497 | uint64_t initLabel = mbe.unary_sel.select(parent + 1) - (parent + 1); 498 | // stop when siblings finish 499 | while 
(mbe.degree_cntr[parent] != 0) { 500 | mbe.degree_cntr[parent]--; 501 | labelPos = initLabel - mbe.degree_cntr[parent]; 502 | label = mbe.labels[labelPos]; 503 | // calc old child ID 504 | hashF.getKey(parent, label); 505 | oldParentId = (hashF.initAd + getDisplVal_tmp(labelPos)) % M; 506 | // calc new child ID 507 | tmp_mbr.hashF.getKey(newParent, label); 508 | newParentId = 509 | tmp_mbr.setAddress(tmp_mbr.hashF.initAd, tmp_mbr.hashF.quotient); 510 | // pass them as parents 511 | inOrderExpand(oldParentId, tmp_mbr, newParentId); 512 | } 513 | } -------------------------------------------------------------------------------- /mBonsai_recursive/mBonsai.h: -------------------------------------------------------------------------------- 1 | #ifndef MBONSAIRECURSIVE_MBONSAI 2 | #define MBONSAIRECURSIVE_MBONSAI 3 | // #include "../DArray/mBonsai_recursive/cht_subLayer.h" 4 | #include "../Hash/hash.h" 5 | #include "../readio/data.h" 6 | #include "cht.h" 7 | #include "limits.h" 8 | #include "sdsl/int_vector.hpp" 9 | #include "sdsl/select_support.hpp" 10 | #include "sdsl/uint128_t.hpp" 11 | 12 | class mBonsai; 13 | 14 | class mBonsai_expand { 15 | friend class mBonsai; 16 | 17 | private: 18 | mBonsai_expand() {} 19 | mBonsai_expand(uint64_t n, uint64_t M, uint64_t sigma); 20 | ~mBonsai_expand() {} 21 | sdsl::int_vector<0> labels; //= sdsl::int_vector<0>(n,sigma+1,w); 22 | sdsl::int_vector<0> displ_level1; // = sdsl::int_vector<3>(n,0); 23 | cht displ_level2; // = cht(0.062303, M, 7, 0.25, 7); 24 | std::map displ_level3; 25 | sdsl::select_support_mcl<0, 1> unary_sel; 26 | sdsl::bit_vector unary_bit; // n+num 27 | sdsl::int_vector<0> degree_cntr; 28 | }; 29 | 30 | class mBonsai { 31 | // friend class mBonsai_expand; 32 | 33 | public: 34 | // args for printing and counting 35 | uint64_t sigma; 36 | uint64_t M; 37 | uint64_t nodeNumberCount; 38 | uint64_t randRootAdrs; 39 | sdsl::int_vector<0> quotient_D; 40 | cht cht_sl; // sublayer displacement array 41 | std::map mapSl; // 
overflown Displacement 42 | // init 43 | mBonsai() {} 44 | mBonsai(uint64_t nodeNumber, uint64_t sigma, double loadFactor, char *file); 45 | ~mBonsai(); 46 | // build 47 | void build(); 48 | // search bencmarks 49 | uint64_t searchBench(char *file); 50 | // expand 51 | void extendTrie(double expandRatio = 2.0); 52 | void resize(); 53 | void replace(const mBonsai &other); 54 | // traverse 55 | double traverse(); 56 | void inOrderTraverse(uint64_t oldParentId); 57 | // naive traverse 58 | void naiveTraverse(); 59 | uint64_t nSigmaTraversal(uint64_t oldParentId); 60 | 61 | // private: 62 | void setData(char *file) { data = new Data(file); } 63 | void insert(Transaction *t, uint64_t trCounter); 64 | uint64_t setAddress(uint64_t initAd, uint64_t DIVM); 65 | uint64_t getParent(uint64_t location); 66 | uint64_t getInitAd(uint64_t); 67 | std::vector getVector(std::string s); // readio 68 | uint64_t searchItem(uint64_t initAd, uint64_t DIVM, uint64_t itemID); 69 | void inOrderExpand(uint64_t oldParentId, const mBonsai &tmp_mbr, 70 | uint64_t newParentId); 71 | // misc 72 | bool isPrime(uint64_t input); 73 | uint64_t nextPrimeNumber(uint64_t inputNumber); 74 | void checkInv(double prime); 75 | sdsl::uint128_t gcdExtended(sdsl::uint128_t aTemp, sdsl::uint128_t b, 76 | sdsl::uint128_t *x, sdsl::uint128_t *y); 77 | uint64_t modInverse(sdsl::uint128_t m); 78 | 79 | // access quotient_D 80 | inline uint64_t getQuo(uint64_t loc) { return quotient_D[loc] >> dWidth; } 81 | inline uint64_t getD(uint64_t loc) { 82 | return quotient_D[loc] & ((1 << dWidth) - 1); 83 | } 84 | inline uint64_t setQuo(uint64_t loc, uint64_t quo) { 85 | quotient_D[loc] = quo << dWidth; 86 | } 87 | inline uint64_t setQuo_D(uint64_t loc, uint64_t quo, uint64_t D) { 88 | quotient_D[loc] = (quo << dWidth) | D; 89 | } 90 | sdsl::int_vector<0> degreeCounter; 91 | mBonsai_expand mbe; 92 | Data *data; // readio 93 | hashFunction hashF; 94 | uint64_t getItemLabel(uint64_t loc); 95 | uint64_t getDisplVal(uint64_t 
loc); 96 | uint64_t getDisplVal_tmp(uint64_t loc); 97 | const uint64_t end_of_l1 = 7; 98 | const uint64_t end_of_l2 = 134; 99 | const uint64_t dWidth = 3; 100 | uint64_t rootID; 101 | bool singlepath; 102 | uint64_t prime; 103 | uint64_t a; 104 | uint64_t aInv; 105 | uint64_t valNotFound; 106 | uint64_t emptyLoc; 107 | const double chtRatio = 0.07; // 0.062303; 108 | uint64_t testTraverse; 109 | uint64_t nodeCounter; 110 | }; 111 | #endif 112 | -------------------------------------------------------------------------------- /mBonsai_recursive/main.cpp: -------------------------------------------------------------------------------- 1 | #include "limits.h" 2 | #include "mBonsai.h" 3 | 4 | /* arguments 5 | * 1 nodeNum 6 | * 2 alphabet = sigma 7 | * 3 dataset 8 | * 4 loadFactor = 0.8 9 | * 5 searchFile 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | void printSpace(mBonsai); 17 | void printCHTSpace(mBonsai); 18 | 19 | int main(int argc, char *argv[]) { 20 | // arguments 21 | uint64_t nodeNum = atoi(argv[1]); 22 | uint64_t sigma = atoi(argv[2]); 23 | char *file = argv[3]; 24 | double loadFactor = atof(argv[4]); 25 | char *searchFile = argv[5]; 26 | if (argc < 5) { 27 | std::cerr << "Use " << argv[0] 28 | << " <1+epsilon> [searchFile]" 29 | << std::endl; 30 | exit(-1); 31 | } 32 | mBonsai mbr(nodeNum, sigma, loadFactor, file); 33 | auto begin = std::chrono::high_resolution_clock::now(); // wall time 34 | mbr.build(); // build 35 | auto end = std::chrono::high_resolution_clock::now(); 36 | auto dur = end - begin; 37 | auto ns = std::chrono::duration_cast(dur).count(); 38 | std::cout << "Total wall time Build[ " << ns / 1000000 << " ms]" << std::endl; 39 | std::cout << "nodeNum: " << mbr.nodeNumberCount << std::endl; 40 | std::cout << "capacity: " 41 | << (double)mbr.cht_sl.nodeNumberCount / 42 | (double)mbr.cht_sl.quotient_items_C.size() 43 | << std::endl; 44 | 45 | double traverseTimeMs = mbr.traverse(); 46 | std::cout << 
"Total wall time simple traverse[ " << traverseTimeMs << " ms]" 47 | << std::endl; 48 | /*begin = std::chrono::high_resolution_clock::now(); // wall time 49 | mbr.naiveTraverse(); 50 | end = std::chrono::high_resolution_clock::now(); 51 | dur = end - begin; 52 | ns = std::chrono::duration_cast(dur).count(); 53 | std::cout << "Total wall time naiveTraverse[ " << ns / 1000000 << " ms]" 54 | << std::endl;*/ 55 | begin = std::chrono::high_resolution_clock::now(); // wall time 56 | uint64_t schCounter = mbr.searchBench(searchFile); 57 | end = std::chrono::high_resolution_clock::now(); 58 | dur = end - begin; 59 | ns = std::chrono::duration_cast(dur).count(); 60 | std::cout << "Total wall time searchbench[ " << ns / 1000000 << " ms]" 61 | << std::endl; 62 | std::cout << "===" << std::endl; 63 | // mbr.extendTrie(); 64 | printSpace(mbr); 65 | 66 | return 0; 67 | } 68 | 69 | /* 70 | * Prints the Compact hash table per CHT_size 71 | */ 72 | void printCHTSpace(mBonsai mbr) { 73 | std::cout << "CHT space(per cht_M): " << std::endl; 74 | std::cout << "quotients_sat_C: " 75 | << sdsl::size_in_bytes(mbr.cht_sl.quotient_items_C) / 76 | (double)mbr.cht_sl.M * 8.0 77 | << std::endl; 78 | std::cout << "bitvectors (V): " 79 | << sdsl::size_in_bytes(mbr.cht_sl.V) / (double)mbr.cht_sl.M * 8.0 80 | << std::endl; 81 | std::cout << "==========" << std::endl; 82 | } 83 | /* 84 | * Prints detailed space sage of mBonsai 85 | */ 86 | void printSpace(mBonsai mbr) { 87 | std::cout << "space in bits (in detail): " << std::endl; 88 | std::cout << "\tquotient_D: " 89 | << sdsl::size_in_bytes(mbr.quotient_D) / 90 | (double)mbr.nodeNumberCount * 8.0 91 | << std::endl; 92 | std::cout << "\tDArray cht_sublayer: " << std::endl; 93 | std::cout << "\t\t" 94 | << (sdsl::size_in_bytes(mbr.cht_sl.quotient_items_C) / 95 | (double)mbr.nodeNumberCount) * 96 | 8.0 97 | << std::endl; 98 | std::cout << "\tD subLayer (V): " << std::endl; 99 | std::cout << "\t\t" 100 | << (sdsl::size_in_bytes(mbr.cht_sl.V) / 101 | 
(double)mbr.nodeNumberCount) * 102 | 8.0 103 | << std::endl; 104 | std::cout << "\tmapsl : " << std::endl; 105 | std::cout << "\t\t" 106 | << 48.0 * 8.0 * mbr.mapSl.size() / (double)mbr.nodeNumberCount 107 | << std::endl; 108 | 109 | // average space per M 110 | double avgSize = 111 | (sdsl::size_in_bytes(mbr.cht_sl.quotient_items_C) + 112 | sdsl::size_in_bytes(mbr.cht_sl.V) + (mbr.mapSl.size() * 48.0)) * 113 | 8.0; 114 | avgSize = avgSize / (double)mbr.nodeNumberCount; 115 | std::cout << "Space summary: " << std::endl; 116 | std::cout 117 | << "quotient only: " 118 | << ((sdsl::size_in_bytes(mbr.quotient_D) / (double)mbr.nodeNumberCount) * 119 | 8.0) - 120 | 3.0 121 | << " bits \nTotal DArray size: " 122 | << (3.0 + 123 | (((sdsl::size_in_bytes(mbr.cht_sl.quotient_items_C) + 124 | sdsl::size_in_bytes(mbr.cht_sl.V) + 48.0 * mbr.mapSl.size()) * 125 | 8.0)) / 126 | (double)mbr.nodeNumberCount) 127 | << " bits\n" 128 | << "===========" << std::endl; 129 | } -------------------------------------------------------------------------------- /readio/data.cpp: -------------------------------------------------------------------------------- 1 | #include "data.h" 2 | #include 3 | using namespace std; 4 | 5 | Transaction::Transaction(const Transaction& tr) 6 | { 7 | length = tr.length; 8 | t = new uint64_t[tr.length]; 9 | for (uint64_t i = 0; i < length; i++) 10 | t[i] = tr.t[i]; 11 | } 12 | 13 | Data::Data(char* filename) 14 | { 15 | fn = filename; 16 | current = 0; 17 | in = fopen(fn, "rt"); 18 | } 19 | 20 | Data::~Data() 21 | { 22 | if (in) 23 | fclose(in); 24 | } 25 | 26 | Transaction* Data::getNext() 27 | { 28 | vector list; 29 | char c; 30 | 31 | // read row by row, push in list 32 | do { 33 | uint64_t item = 0, pos = 0; 34 | c = getc(in); 35 | while ((c >= '0') && (c <= '9')) { 36 | item *= 10; 37 | item += int(c) - int('0'); 38 | c = getc(in); 39 | pos++; 40 | } 41 | if (pos) 42 | list.push_back(item); 43 | } while (c != '\n' && !feof(in)); 44 | 45 | // if end of file 
is reached, rewind to beginning for next pass 46 | if (feof(in)) { 47 | rewind(in); 48 | return 0; 49 | } 50 | 51 | // put items in *t 52 | Transaction* t = new Transaction(list.size()); 53 | for (uint64_t i = 0; i < uint64_t(list.size()); i++) 54 | t->t[i] = list[i]; 55 | 56 | return t; 57 | } 58 | -------------------------------------------------------------------------------- /readio/data.h: -------------------------------------------------------------------------------- 1 | #ifndef READIO_DATA 2 | #define READIO_DATA 3 | #include 4 | #include 5 | 6 | class Transaction { 7 | public: 8 | uint64_t length; 9 | uint64_t* t; 10 | 11 | Transaction(uint64_t l) 12 | : length(l) 13 | { 14 | t = new uint64_t[l]; 15 | } 16 | Transaction(const Transaction& tr); 17 | ~Transaction() { delete[] t; } 18 | }; 19 | 20 | class Data { 21 | public: 22 | Data(char* filename); 23 | ~Data(); 24 | Transaction* getNext(); 25 | char* fn; 26 | 27 | private: 28 | FILE* in; 29 | uint64_t current; 30 | }; 31 | #endif 32 | -------------------------------------------------------------------------------- /readio/dataTST.cpp: -------------------------------------------------------------------------------- 1 | #include "data.h" 2 | #include 3 | #include 4 | using namespace std; 5 | 6 | Transaction::Transaction(const Transaction& tr) 7 | { 8 | length = tr.length; 9 | t = new int[tr.length]; 10 | for (int i = 0; i < length; i++) 11 | t[i] = tr.t[i]; 12 | } 13 | 14 | Data::Data(char* filename) 15 | { 16 | fn = filename; 17 | current = 0; 18 | in = fopen(fn, "rt"); 19 | } 20 | 21 | Data::~Data() 22 | { 23 | if (in) 24 | fclose(in); 25 | } 26 | 27 | Transaction* Data::getNext() 28 | { 29 | vector list; 30 | char c; 31 | 32 | // read row by row, push in list 33 | do { 34 | int item = 0, pos = 0; 35 | c = getc(in); 36 | while ((c >= '0') && (c <= '9')) { 37 | item *= 10; 38 | item += int(c) - int('0'); 39 | c = getc(in); 40 | pos++; 41 | } 42 | if (pos) 43 | list.push_back(item); 44 | } while (c != '\n' 
&& !feof(in)); 45 | 46 | // if end of file is reached, rewind to beginning for next pass 47 | if (feof(in)) { 48 | rewind(in); 49 | return 0; 50 | } 51 | 52 | // put items in *t 53 | Transaction* t = new Transaction(list.size() + 1); 54 | t->t[0] = 2000000000; 55 | for (int i = 0; i < int(list.size()); i++) { 56 | t->t[i + 1] = list[i] + 1; 57 | } 58 | /// 59 | /* if(t) current++; 60 | else { 61 | rewind(in); 62 | current=0; 63 | } 64 | */ 65 | return t; 66 | } 67 | --------------------------------------------------------------------------------