├── .gitignore ├── .travis.yml ├── .github └── workflows │ └── ubuntu22.yml ├── CMakeLists.txt ├── Makefile ├── include ├── adler32.h ├── ztimer.h ├── characterhash.h ├── threewisehash.h ├── generalhash.h ├── cyclichash.h ├── rabinkarphash.h └── mersennetwister.h ├── examples ├── example64bits.cpp ├── example4.cpp ├── example2.cpp ├── example3.cpp ├── example.cpp ├── example6.cpp └── example5.cpp ├── README.md ├── benchmarks └── speedtesting.cpp └── tests └── unit.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeSettings.json 2 | .vs 3 | out 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: false 3 | compiler: 4 | - clang 5 | 6 | script: make && ./unit 7 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu22.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu 22.04 CI (GCC 11) 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | ubuntu-build: 7 | runs-on: ubuntu-22.04 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Use cmake 11 | run: | 12 | mkdir build && 13 | cd build && 14 | cmake .. && 15 | cmake --build . 
# Build script for the rolling-hash examples, the benchmark and the unit test.
cmake_minimum_required(VERSION 3.0...3.23)

project(rollinghashcpp)
set(CMAKE_CXX_STANDARD 11)

if(WIN32)
  # /EHsc turns on standard C++ exception handling for the Windows build.
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
endif()
include_directories(include)

# One executable per example source file. example4/5/6 previously pointed at
# example2.cpp/example3.cpp, so their own sources were never compiled.
add_executable(example "examples/example.cpp")
add_executable(example2 "examples/example2.cpp")
add_executable(example3 "examples/example3.cpp")
add_executable(example4 "examples/example4.cpp")
add_executable(example5 "examples/example5.cpp")
add_executable(example6 "examples/example6.cpp")
add_executable(example64bits "examples/example64bits.cpp")
add_executable(speedtesting "benchmarks/speedtesting.cpp")
add_executable(unit "tests/unit.cpp")
enable_testing()
add_test(unit unit)
// contributed by Dmitry Artamonov
// this is *deterministic*
//
// Rolling Adler-32 checksum over a window of `len` bytes.
//   eat(c)          appends c to the hashed sequence (used to fill the window)
//   update(out, in) slides the window: forgets `out` (the oldest byte) and
//                   appends `in`
//   reset()         returns to the initial state
// `hashvalue` holds (sum2 << 16) | sum1 after every eat()/update().
class Adler32 {
  // Fixed-width types from <cstdint>; this header does not define `uint32`.
  uint32_t sum1, sum2;

public:
  static const uint32_t Base = 65521; // Adler-32 modulus
  uint32_t hashvalue;
  int len; // window length used by update()

  Adler32(int window) : sum1(1), sum2(0), hashvalue(0), len(window) {}

  // Appends one byte to the hashed sequence.
  void eat(uint8_t inchar) {
    sum1 = (sum1 + inchar) % Base;
    sum2 = (sum2 + sum1) % Base;

    hashvalue = (sum2 << 16) | sum1;
  }

  // Forgets everything eaten so far.
  void reset() {
    sum1 = 1;
    sum2 = 0;
    hashvalue = 0;
  }

  // Slides the window by one byte: removes `outchar`, appends `inchar`.
  void update(uint8_t outchar, uint8_t inchar) {
    // All arithmetic is done in signed types: the intermediate values can be
    // negative, and comparing a negative int against the unsigned Base would
    // silently convert it to a huge positive number.
    const int base = static_cast<int>(Base);
    int low = static_cast<int>(hashvalue & 0xffff);
    const int high = static_cast<int>((hashvalue >> 16) & 0xffff);

    low += static_cast<int>(inchar) - static_cast<int>(outchar);
    if (low >= base) {
      low -= base;
    } else if (low < 0) {
      low += base;
    }

    // sum2' = sum2 - len * outchar + sum1' - 1 (mod Base); 64-bit so that a
    // large window cannot overflow the product.
    int64_t rolled =
        (static_cast<int64_t>(high) -
         static_cast<int64_t>(len) * static_cast<int64_t>(outchar) + low - 1) %
        base;
    if (rolled < 0) {
      rolled += base;
    }

    // Keep the members in sync so that a later eat() continues from here.
    sum1 = static_cast<uint32_t>(low);
    sum2 = static_cast<uint32_t>(rolled);
    hashvalue = (sum2 << 16) | sum1;
  }
};
going the long way... 19 | const std::vector charvectslice(input.begin(), 20 | input.begin() + 4); 21 | uint64_t trueanswerslice = hf.hash(charvectslice); 22 | if (trueanswerslice != hf.hashvalue) 23 | throw runtime_error("bug"); 24 | // we continue 25 | hf.eat(input[4]); // E 26 | cout << "Hash value of ABCDE is " << hf.hashvalue << endl; 27 | // we check the answer going the long way 28 | const std::vector charvect(input.begin(), input.end()); 29 | uint64_t trueanswer = hf.hash(charvect); 30 | if (trueanswer != hf.hashvalue) 31 | throw runtime_error("bug"); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /examples/example4.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "cyclichash.h" 7 | 8 | /** 9 | * Test of the prepend and append functions to test slightly longer and slightly 10 | * shorter n-grams. 11 | */ 12 | 13 | int main(int argc, char *argv[]) { 14 | CyclicHash hf(4, 64); 15 | string input = "XABCDY"; 16 | string base(input.begin() + 1, input.end() - 1); 17 | string extend(input.begin() + 1, input.end()); 18 | string prepend(input.begin(), input.end() - 1); 19 | 20 | for (string::const_iterator j = base.begin(); j != base.end(); ++j) { 21 | hf.eat(*j); 22 | } 23 | 24 | std::cout << base << " " << hf.hash(base) << std::endl; 25 | std::cout << prepend << " " << hf.hash_prepend(input[0]) << " " 26 | << hf.hash(prepend) << std::endl; 27 | std::cout << extend << " " << hf.hash_extend(input.back()) << " " 28 | << hf.hash(extend) << std::endl; 29 | 30 | assert(hf.hashvalue == hf.hash(base)); 31 | assert(hf.hash_prepend(input[0]) == hf.hash(prepend)); 32 | assert(hf.hash_extend(input.back()) == hf.hash(extend)); 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /examples/example2.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // given hash value of "ABCD", can I have value of 7 | // "ABCDE", without computing the whole hash value? 8 | 9 | #include "cyclichash.h" 10 | 11 | int main() { 12 | CyclicHash<> hf(5, 19); 13 | string input = "ABCDE"; 14 | hf.eat(input[0]); // A 15 | hf.eat(input[1]); // B 16 | hf.eat(input[2]); // C 17 | hf.eat(input[3]); // D 18 | cout << "Hash value of ABCD is " << hf.hashvalue << endl; 19 | // we check the answer going the long way... 20 | const std::vector charvectslice(input.begin(), 21 | input.begin() + 4); 22 | uint32_t trueanswerslice = hf.hash(charvectslice); 23 | if (trueanswerslice != hf.hashvalue) 24 | throw runtime_error("bug"); 25 | // we continue 26 | hf.eat(input[4]); // E 27 | cout << "Hash value of ABCDE is " << hf.hashvalue << endl; 28 | // we check the answer going the long way 29 | const std::vector charvect(input.begin(), input.end()); 30 | uint32_t trueanswer = hf.hash(charvect); 31 | if (trueanswer != hf.hashvalue) 32 | throw runtime_error("bug"); 33 | return 0; 34 | } -------------------------------------------------------------------------------- /examples/example3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "cyclichash.h" 7 | 8 | // given hash value of "BCD", can I have value of 9 | // "ABC"quicky? 10 | 11 | int demo1() { 12 | CyclicHash<> hf(3, 32); 13 | string input = "ABCD"; 14 | hf.eat(input[1]); // B 15 | hf.eat(input[2]); // C 16 | hf.eat(input[3]); // D 17 | cout << "Hash value of BCD is " << hf.hashvalue << endl; 18 | // we check the answer going the long way... 
19 | const std::vector charvectslice(input.begin() + 1, 20 | input.begin() + 4); 21 | uint32_t trueanswerslice = hf.hash(charvectslice); 22 | if (trueanswerslice != hf.hashvalue) 23 | throw runtime_error("bug"); 24 | // we continue 25 | hf.reverse_update(input[0], input[3]); // remove D, prepend A 26 | cout << "Hash value of ABC is " << hf.hashvalue << endl; 27 | // we check the answer going the long way 28 | const std::vector charvect(input.begin(), input.begin() + 3); 29 | uint32_t trueanswer = hf.hash(charvect); 30 | if (trueanswer != hf.hashvalue) 31 | throw runtime_error("bug"); 32 | return 0; 33 | } 34 | 35 | int main() { demo1(); } 36 | -------------------------------------------------------------------------------- /examples/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "rabinkarphash.h" 7 | 8 | int main() { 9 | size_t q = 3; 10 | size_t k = 4; 11 | typedef KarpRabinHash<> HashFunction; 12 | std::vector> hashPtr(q); 13 | for (size_t z = 0; z < hashPtr.size(); ++z) { 14 | std::unique_ptr &ptr = hashPtr[z]; 15 | ptr.reset(new HashFunction(k, 12)); 16 | } 17 | 18 | std::string str = "ACGTAACGT"; 19 | for (size_t j = 0; j < k; j++) { 20 | for (size_t z = 0; z < hashPtr.size(); ++z) { 21 | std::unique_ptr &ptr = hashPtr[z]; 22 | ptr->eat(str[j]); 23 | } 24 | } 25 | 26 | for (size_t i = 0;; i++) { 27 | std::cout << std::string(str.begin() + i, str.begin() + i + k); 28 | for (size_t z = 0; z < hashPtr.size(); ++z) { 29 | std::unique_ptr &ptr = hashPtr[z]; 30 | std::cout << ' ' << ptr->hashvalue; 31 | } 32 | 33 | std::cout << std::endl; 34 | if (i + k < str.size()) { 35 | for (size_t z = 0; z < hashPtr.size(); ++z) { 36 | std::unique_ptr &ptr = hashPtr[z]; 37 | ptr->update(str[i], str[i + k]); 38 | } 39 | } else { 40 | break; 41 | } 42 | } 43 | 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- 
/* Structure describing CPU time used by a process and its children. */
/* Declared only inside the `#ifdef _WIN32` branch of this header. */
struct tms {
  clock_t tms_utime; /* User CPU time. */
  clock_t tms_stime; /* System CPU time. */

  clock_t tms_cutime; /* User CPU time of dead children. */
  clock_t tms_cstime; /* System CPU time of dead children. */
};
// Wall-clock stopwatch built on gettimeofday().
//   t1 is the start mark, t2 the most recent split mark.
//   elapsed() reports t2 - t1 in milliseconds; split() refreshes t2 first.
class ZTimer {
public:
  struct timeval t1, t2;

public:
  // Starts the stopwatch; both marks are set to "now".
  ZTimer() { reset(); }

  // Restarts the stopwatch: both marks are set to "now".
  void reset() {
    gettimeofday(&t1, 0);
    t2 = t1;
  }

  // Milliseconds between the two marks. The second and microsecond fields are
  // converted separately, each with integer arithmetic.
  int elapsed() {
    const auto from_seconds = (t2.tv_sec - t1.tv_sec) * 1000;
    const auto from_micros = (t2.tv_usec - t1.tv_usec) / 1000;
    return from_seconds + from_micros;
  }

  // Moves the split mark to "now" and returns the elapsed milliseconds.
  int split() {
    gettimeofday(&t2, 0);
    return elapsed();
  }
};
5 | */ 6 | 7 | #include "adler32.h" 8 | #include "cyclichash.h" 9 | #include "generalhash.h" 10 | #include "rabinkarphash.h" 11 | #include "threewisehash.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | void CalcHashes(const std::string &Inp, const int WindowSize, 18 | KarpRabinHash<> &h1, ThreeWiseHash<> &h2, GeneralHash<> &h3, 19 | CyclicHash<> &h4, Adler32 &h5) { 20 | 21 | int WindowPos = 0; 22 | h1.reset(); 23 | h2.reset(); 24 | h3.reset(); 25 | h4.reset(); 26 | h5.reset(); 27 | 28 | for (int i = 0; i < Inp.length(); i++) { 29 | unsigned char InChar = Inp[i]; 30 | 31 | bool Eat = (i < WindowSize); 32 | unsigned char OutChar = ' '; 33 | if (Eat) { 34 | h1.eat(InChar); 35 | h2.eat(InChar); 36 | h3.eat(InChar); 37 | h4.eat(InChar); 38 | h5.eat(InChar); 39 | } else { 40 | OutChar = Inp[i - WindowSize]; 41 | h1.update(OutChar, InChar); 42 | h2.update(OutChar, InChar); 43 | h3.update(OutChar, InChar); 44 | h4.update(OutChar, InChar); 45 | h5.update(OutChar, InChar); 46 | } 47 | if (i + 1 >= WindowSize) { 48 | auto current = Inp.substr(i + 1 - WindowSize, WindowSize); 49 | printf("%04d %02d %c %c %06x %06x %06x %06x %06x %c %s \n", i, WindowPos, 50 | InChar, OutChar, h1.hashvalue, h2.hashvalue, h3.hashvalue, 51 | h4.hashvalue, h5.hashvalue, (Eat) ? 
'*' : ' ', current.c_str()); 52 | assert(h1.hash(current) == h1.hashvalue); 53 | assert(h2.hash(current) == h2.hashvalue); 54 | assert(h3.hash(current) == h3.hashvalue); 55 | assert(h4.hash(current) == h4.hashvalue); 56 | } 57 | 58 | WindowPos = (WindowPos + 1) % WindowSize; 59 | } 60 | } 61 | 62 | // ---------------------------------------------------------------------------- 63 | 64 | void Compare() { 65 | const int WindowSize = 16; 66 | KarpRabinHash<> h1(WindowSize); 67 | ThreeWiseHash<> h2(WindowSize); 68 | GeneralHash<> h3(WindowSize); 69 | CyclicHash<> h4(WindowSize); 70 | Adler32 h5(WindowSize); 71 | 72 | std::string s1 = "Test string for rolling hashes."; // 32 chars 73 | CalcHashes(s1, WindowSize, h1, h2, h3, h4, h5); 74 | 75 | printf("---------------------------------------\n"); 76 | 77 | std::string s2 = "This is some preamble."; 78 | CalcHashes(s2 + s1, WindowSize, h1, h2, h3, h4, h5); 79 | } 80 | 81 | int main() { Compare(); } 82 | -------------------------------------------------------------------------------- /include/characterhash.h: -------------------------------------------------------------------------------- 1 | #ifndef CHARACTERHASH 2 | #define CHARACTERHASH 3 | 4 | typedef unsigned long long uint64; 5 | typedef unsigned int uint32; 6 | typedef unsigned int uint; 7 | 8 | #include "mersennetwister.h" 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | class mersenneRNG { 16 | public: 17 | mersenneRNG(uint32 maxval) : mtr(), n(maxval){}; 18 | uint32 operator()() { return mtr.randInt(n); } 19 | void seed(uint32 seedval) { mtr.seed(seedval); } 20 | void seed() { mtr.seed(); } 21 | uint32 rand_max() { return n; } 22 | 23 | private: 24 | MTRand mtr; 25 | int n; 26 | }; 27 | 28 | template 29 | #if __cplusplus >= 201402L 30 | constexpr 31 | #endif 32 | hashvaluetype 33 | maskfnc(int bits) { 34 | assert(bits > 0); 35 | assert(size_t(bits) <= sizeof(hashvaluetype) * 8); 36 | hashvaluetype x = static_cast(1) << (bits - 1); 
37 | return x ^ (x - 1); 38 | } 39 | 40 | template 41 | class CharacterHash { 42 | public: 43 | CharacterHash(hashvaluetype maxval) { 44 | if (sizeof(hashvaluetype) <= 4) { 45 | mersenneRNG randomgenerator(maxval); 46 | for (size_t k = 0; k < nbrofchars; ++k) 47 | hashvalues[k] = static_cast(randomgenerator()); 48 | } else if (sizeof(hashvaluetype) == 8) { 49 | mersenneRNG randomgenerator(maxval >> 32); 50 | mersenneRNG randomgeneratorbase((maxval >> 32) == 0 ? maxval 51 | : 0xFFFFFFFFU); 52 | for (size_t k = 0; k < nbrofchars; ++k) 53 | hashvalues[k] = static_cast(randomgeneratorbase()) | 54 | (static_cast(randomgenerator()) << 32); 55 | } else 56 | throw runtime_error("unsupported hash value type"); 57 | } 58 | 59 | CharacterHash(hashvaluetype maxval, uint32 seed1, uint32 seed2) { 60 | if (sizeof(hashvaluetype) <= 4) { 61 | mersenneRNG randomgenerator(maxval); 62 | randomgenerator.seed(seed1); 63 | for (size_t k = 0; k < nbrofchars; ++k) 64 | hashvalues[k] = static_cast(randomgenerator()); 65 | } else if (sizeof(hashvaluetype) == 8) { 66 | mersenneRNG randomgenerator(maxval >> 32); 67 | mersenneRNG randomgeneratorbase((maxval >> 32) == 0 ? 
maxval 68 | : 0xFFFFFFFFU); 69 | randomgenerator.seed(seed1); 70 | randomgeneratorbase.seed(seed2); 71 | for (size_t k = 0; k < nbrofchars; ++k) 72 | hashvalues[k] = static_cast(randomgeneratorbase()) | 73 | (static_cast(randomgenerator()) << 32); 74 | } else 75 | throw runtime_error("unsupported hash value type"); 76 | } 77 | 78 | enum { nbrofchars = 1 << (sizeof(chartype) * 8) }; 79 | 80 | hashvaluetype hashvalues[1 << (sizeof(chartype) * 8)]; 81 | }; 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Randomized rolling hash functions in C++ 2 | [![Ubuntu 22.04 CI (GCC 11)](https://github.com/lemire/rollinghashcpp/actions/workflows/ubuntu22.yml/badge.svg)](https://github.com/lemire/rollinghashcpp/actions/workflows/ubuntu22.yml) 3 | 4 | License: Apache 2.0 5 | 6 | 7 | ## What is this? 8 | 9 | This is a set of C++ classes implementing various recursive n-gram hashing techniques, also called rolling hashing (http://en.wikipedia.org/wiki/Rolling_hash), including: 10 | 11 | * Randomized Karp-Rabin (sometimes called Rabin-Karp) 12 | * Hashing by Cyclic Polynomials (also known as Buzhash) 13 | * Hashing by Irreducible Polynomials 14 | 15 | This library is used by [khmer](https://github.com/dib-lab/khmer/): the in-memory nucleotide sequence k-mer engine. 16 | 17 | 18 | These are randomized hash functions, meaning that each time you create a new hasher instance, you will 19 | get new hash values for a given input. 20 | 21 | ## Code sample 22 | ```cpp 23 | 24 | const uint n(3);//hash all sequences of 3 characters 25 | const uint L(7); // you need 7 bits 26 | CyclicHash hf(n,L );// if you want 64-bit values replace uint32 by uint64 27 | for(uint32 k = 0; k 6 | #include 7 | 8 | using namespace std; 9 | 10 | /** 11 | * Each instance is a rolling hash function meant to hash streams of characters. 
12 | * Each new instance of this class comes with new random keys. 13 | * 14 | * Recommended usage to get L-bit hash values over n-grams: 15 | * ThreeWiseHash<> hf(n,L ); 16 | * for(uint32 k = 0; k 28 | class ThreeWiseHash { 29 | 30 | public: 31 | // myn is the length of the sequences, e.g., 3 means that you want to hash 32 | // sequences of 3 characters mywordsize is the number of bits you which to 33 | // receive as hash values, e.g., 19 means that the hash values are 19-bit 34 | // integers 35 | ThreeWiseHash(int myn, int mywordsize = 19) 36 | : n(myn), wordsize(mywordsize), hashers(), hasher(0) { 37 | if (static_cast(wordsize) > 8 * sizeof(hashvaluetype)) { 38 | cerr << "Can't create " << wordsize << "-bit hash values" << endl; 39 | throw "abord"; 40 | } 41 | for (int i = 0; i < n; ++i) { 42 | CharacterHash ch( 43 | maskfnc(wordsize)); 44 | hashers.push_back(ch); 45 | } 46 | } 47 | 48 | // add inchar as an input, this is used typically only at the start 49 | // the hash value is updated to that of a longer string (one where inchar was 50 | // appended) 51 | void eat(chartype inchar) { 52 | ngram.push_back(inchar); 53 | __updateHashValue(); 54 | } 55 | 56 | // add inchar as an input and remove outchar, the hashvalue is updated 57 | // this function can be used to update the hash value from the hash value of 58 | // [outchar]ABC to the hash value of ABC[inchar] 59 | void update(chartype, chartype inchar) { 60 | ngram.push_back(inchar); 61 | ngram.pop_front(); 62 | __updateHashValue(); 63 | } 64 | 65 | // prepare to process a new string, you will need to call "eat" again 66 | void reset() { 67 | hashvalue = 0; 68 | ngram.clear(); 69 | } 70 | 71 | void __updateHashValue() { 72 | hashvalue = 0; 73 | for (size_t k = 0; k < ngram.size(); ++k) { 74 | hashvalue ^= hashers[k].hashvalues[ngram[k]]; 75 | } 76 | } 77 | 78 | // this is a convenience function, use eat,update and .hashvalue to use as a 79 | // rolling hash function 80 | template hashvaluetype hash(container 
&c) { 81 | hashvaluetype answer(0); 82 | for (size_t k = 0; k < c.size(); ++k) { 83 | answer ^= hashers[k].hashvalues[c[k]]; 84 | } 85 | return answer; 86 | } 87 | 88 | hashvaluetype hashvalue; 89 | int n; 90 | const int wordsize; 91 | deque ngram; 92 | vector> hashers; 93 | CharacterHash hasher; // placeholder 94 | }; 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /benchmarks/speedtesting.cpp: -------------------------------------------------------------------------------- 1 | #include "cyclichash.h" 2 | #include "generalhash.h" 3 | #include "rabinkarphash.h" 4 | #include "threewisehash.h" 5 | #include "ztimer.h" 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | template 12 | double hashALot(int n, int L, uint ttimes, uint sizeoftest, 13 | vector &recorder) { 14 | ZTimer t; 15 | for (uint times = 0; times < ttimes; ++times) { 16 | hashfunction hf(n, L); 17 | for (uint k = 0; k < static_cast(n); ++k) { 18 | hf.eat(static_cast(k)); 19 | } 20 | for (uint k = n; k < sizeoftest; ++k) { 21 | hf.update(static_cast(k - n), 22 | static_cast(k)); 23 | } 24 | /* The goal of the recorder is to prevent 25 | the compiler from deciding that this whole computation 26 | is not required! 27 | */ 28 | recorder.push_back(hf.hashvalue); 29 | } 30 | return t.split() / (1000.0 * ttimes); 31 | } 32 | 33 | template 34 | double hashALot(int n, int L, uint ttimes, vector &recorder, 35 | vector &data) { 36 | ZTimer t; 37 | for (uint times = 0; times < ttimes; ++times) { 38 | hashfunction hf(n, L); 39 | for (uint k = 0; k < static_cast(n); ++k) { 40 | hf.eat(data[k]); 41 | } 42 | for (uint k = n; k < data.size(); ++k) { 43 | hf.update(data[k - n], data[k]); 44 | } 45 | /* The goal of the recorder is to prevent 46 | the compiler from deciding that this whole computation 47 | is not required! 
// Appends to `data` the characters of every line of `filename` except the
// first one, which is read and thrown away. Line terminators are not copied.
// A file that cannot be opened leaves `data` untouched.
// NOTE(review): the skipped first line is presumably a header -- confirm.
void grabFileContent(std::vector<unsigned char> &data, std::string filename) {
  std::ifstream file(filename.c_str());
  std::string line;
  std::getline(file, line); // first line is discarded
  while (std::getline(file, line)) {
    // Append the whole line at once instead of character by character.
    data.insert(data.end(), line.begin(), line.end());
  }
  file.close();
}
// Define DNA's complementary nucleotides
//
// Daniel: This is probably inefficient. Needlessly so.
// if efficiency matters, you want to define the character hash so that it takes
// the key 'A' to the hash value of 'T' and so forth.
//
// Case-insensitive: A <-> T, C -> G, and every other character (including G)
// maps to 'C'. Written as a function rather than a macro so that the argument
// is evaluated exactly once and is converted to unsigned char before toupper.
static inline char nucleotide_complement(char ch) {
  switch (toupper(static_cast<unsigned char>(ch))) {
  case 'A':
    return 'T';
  case 'T':
    return 'A';
  case 'C':
    return 'G';
  default:
    return 'C';
  }
}
A sequence that is 46 | // equal to its reverse complement is a special case and should be handled 47 | // accordingly. 48 | // 49 | #define canonical_hash(fwd, rev) (fwd == rev ? rev : fwd ^ rev) 50 | 51 | #define WORDSIZE 5 52 | #define SEED1 42 53 | #define SEED2 1985 54 | #define HASHBITS 64 55 | 56 | // full string hash from scratch (for comparison) 57 | uint64_t fullhash(const string &input) { 58 | assert(input.size() == WORDSIZE); 59 | CyclicHash forward(input.size(), SEED1, SEED2, HASHBITS); 60 | CyclicHash reverse(input.size(), SEED1, SEED2, HASHBITS); 61 | for (int j = 0; j < input.size(); j++) { 62 | forward.eat(input[j]); 63 | reverse.eat(nucleotide_complement(input[input.size() - 1 - j])); 64 | } 65 | return canonical_hash(forward.hashvalue, reverse.hashvalue); 66 | } 67 | 68 | // check the rolling hash 69 | // k is the k-gram size, input is any string 70 | void demo(int k, string input) { 71 | // Initialize the hash function to compute the hash of the first k-mer. 72 | CyclicHash forward(k, SEED1, SEED2, HASHBITS); 73 | CyclicHash reverse(k, SEED1, SEED2, HASHBITS); 74 | for (int j = 0; j < k; j++) { 75 | forward.eat(input[j]); 76 | // going backward 77 | reverse.eat(nucleotide_complement(input[k - 1 - j])); 78 | } 79 | // rolling has 80 | uint64_t hashval = canonical_hash(forward.hashvalue, reverse.hashvalue); 81 | assert(fullhash(input.substr(0, k)) == hashval); 82 | std::cout << input.substr(0, k) << " " << hashval << std::endl; 83 | 84 | for (int j = k; j < input.size(); j++) { 85 | forward.update(input[j - k], input[j]); 86 | // note: you to flip the parameters of reverse_update 87 | reverse.reverse_update(nucleotide_complement(input[j]), 88 | nucleotide_complement(input[j - k])); 89 | // compute the rolling has 90 | hashval = canonical_hash(forward.hashvalue, reverse.hashvalue); 91 | // compare with full string hash 92 | assert(fullhash(input.substr(j - k + 1, k)) == hashval); 93 | std::cout << input.substr(j - k + 1, k) << " " << hashval << 
std::endl; 94 | } 95 | } 96 | 97 | int main(int argc, char *argv[]) { 98 | demo(5, "GATTACACAATAGCAAATT"); 99 | std::cout << " code looks good " << std::endl; 100 | return 0; 101 | } 102 | -------------------------------------------------------------------------------- /include/generalhash.h: -------------------------------------------------------------------------------- 1 | #ifndef GENERALHASH 2 | #define GENERALHASH 3 | 4 | #include 5 | #include 6 | 7 | #include "characterhash.h" 8 | 9 | using namespace std; 10 | 11 | enum { NOPRECOMP, FULLPRECOMP }; 12 | 13 | /** 14 | * Each instance is a rolling hash function meant to hash streams of characters. 15 | * Each new instance of this class comes with new random keys. 16 | * 17 | * Recommended usage to get L-bit hash values over n-grams: 18 | * GeneralHash<> hf(n,L ); 19 | * for(uint32 k = 0; k 32 | class GeneralHash { 33 | public: 34 | // myn is the length of the sequences, e.g., 3 means that you want to hash 35 | // sequences of 3 characters mywordsize is the number of bits you which to 36 | // receive as hash values, e.g., 19 means that the hash values are 19-bit 37 | // integers 38 | GeneralHash(int myn, int mywordsize = 19) 39 | : hashvalue(0), wordsize(mywordsize), n(myn), irreduciblepoly(0), 40 | hasher(maskfnc(wordsize)), 41 | lastbit(static_cast(1) << wordsize), 42 | precomputedshift(precomputationtype == FULLPRECOMP ? 
(1 << n) : 0) { 43 | if (wordsize == 19) { 44 | irreduciblepoly = 1 + (1 << 1) + (1 << 2) + (1 << 5) + (1 << 19); 45 | } else if (wordsize == 9) { 46 | irreduciblepoly = 1 + (1 << 2) + (1 << 3) + (1 << 5) + (1 << 9); 47 | } else { 48 | cerr << "unsupported wordsize " << wordsize << " bits, try 19 or 9" 49 | << endl; 50 | } 51 | // in case the precomp is activated at the template level 52 | if (precomputationtype == FULLPRECOMP) { 53 | for (hashvaluetype x = 0; x < precomputedshift.size(); ++x) { 54 | hashvaluetype leftover = x << (wordsize - n); 55 | fastleftshift(leftover, n); 56 | precomputedshift[x] = leftover; 57 | } 58 | } 59 | } 60 | // prepare to process a new string, you will need to call "eat" again 61 | void reset() { hashvalue = 0; } 62 | 63 | void fastleftshift(hashvaluetype &x, int r) const { 64 | for (int i = 0; i < r; ++i) { 65 | x <<= 1; 66 | if ((x & lastbit) == lastbit) 67 | x ^= irreduciblepoly; 68 | } 69 | } 70 | 71 | void fastleftshiftn(hashvaluetype &x) const { 72 | x = 73 | // take the last n bits and look-up the result 74 | precomputedshift[(x >> (wordsize - n))] ^ 75 | // then just shift the first L-n bits 76 | ((x << n) & (lastbit - 1)); 77 | } 78 | 79 | // add inchar as an input and remove outchar, the hashvalue is updated 80 | // this function can be used to update the hash value from the hash value of 81 | // [outchar]ABC to the hash value of ABC[inchar] 82 | void update(chartype outchar, chartype inchar) { 83 | hashvalue <<= 1; 84 | if ((hashvalue & lastbit) == lastbit) 85 | hashvalue ^= irreduciblepoly; 86 | // 87 | hashvaluetype z(hasher.hashvalues[outchar]); 88 | // the compiler should optimize away the next if/else 89 | if (precomputationtype == FULLPRECOMP) { 90 | fastleftshiftn(z); 91 | hashvalue ^= z ^ hasher.hashvalues[inchar]; 92 | } else { 93 | fastleftshift(z, n); 94 | hashvalue ^= z ^ hasher.hashvalues[inchar]; 95 | } 96 | } 97 | 98 | // add inchar as an input, this is used typically only at the start 99 | // the hash value 
is updated to that of a longer string (one where inchar was 100 | // appended) 101 | void eat(chartype inchar) { 102 | fastleftshift(hashvalue, 1); 103 | hashvalue ^= hasher.hashvalues[inchar]; 104 | } 105 | 106 | // this is a convenience function, use eat,update and .hashvalue to use as a 107 | // rolling hash function 108 | template hashvaluetype hash(container &c) const { 109 | hashvaluetype answer(0); 110 | for (uint k = 0; k < c.size(); ++k) { 111 | fastleftshift(answer, 1); 112 | answer ^= hasher.hashvalues[c[k]]; 113 | } 114 | return answer; 115 | } 116 | 117 | hashvaluetype hashvalue; 118 | const int wordsize; 119 | int n; 120 | hashvaluetype irreduciblepoly; 121 | CharacterHash hasher; 122 | const hashvaluetype lastbit; 123 | vector precomputedshift; 124 | }; 125 | 126 | #endif 127 | -------------------------------------------------------------------------------- /include/cyclichash.h: -------------------------------------------------------------------------------- 1 | #ifndef CYCLICHASH 2 | #define CYCLICHASH 3 | 4 | #include "characterhash.h" 5 | 6 | /** 7 | * Each instance is a rolling hash function meant to hash streams of characters. 8 | * Each new instance of this class comes with new random keys. 
9 | * 10 | * Recommended usage to get L-bit hash values over n-grams: 11 | * CyclicHash<> hf(n,L ); 12 | * for(uint32 k = 0; k 24 | class CyclicHash { 25 | 26 | public: 27 | // myn is the length of the sequences, e.g., 3 means that you want to hash 28 | // sequences of 3 characters mywordsize is the number of bits you which to 29 | // receive as hash values, e.g., 19 means that the hash values are 19-bit 30 | // integers 31 | CyclicHash(int myn, int mywordsize = 19) 32 | : hashvalue(0), n(myn), wordsize(mywordsize), 33 | hasher(maskfnc(wordsize)), 34 | mask1(maskfnc(wordsize - 1)), myr(n % wordsize), 35 | maskn(maskfnc(wordsize - myr)) { 36 | if (static_cast(wordsize) > 8 * sizeof(hashvaluetype)) { 37 | cerr << "Can't create " << wordsize << "-bit hash values" << endl; 38 | throw "abord"; 39 | } 40 | } 41 | 42 | CyclicHash(int myn, uint32 seed1, uint32 seed2, int mywordsize = 19) 43 | : hashvalue(0), n(myn), wordsize(mywordsize), 44 | hasher(maskfnc(wordsize), seed1, seed2), 45 | mask1(maskfnc(wordsize - 1)), myr(n % wordsize), 46 | maskn(maskfnc(wordsize - myr)) { 47 | if (static_cast(wordsize) > 8 * sizeof(hashvaluetype)) { 48 | cerr << "Can't create " << wordsize << "-bit hash values" << endl; 49 | throw "abord"; 50 | } 51 | } 52 | 53 | void fastleftshiftn(hashvaluetype &x) const { 54 | x = ((x & maskn) << myr) | (x >> (wordsize - myr)); 55 | } 56 | 57 | void fastleftshift1(hashvaluetype &x) const { 58 | x = ((x & mask1) << 1) | (x >> (wordsize - 1)); 59 | } 60 | 61 | void fastrightshift1(hashvaluetype &x) const { 62 | x = (x >> 1) | ((x & 1) << (wordsize - 1)); 63 | } 64 | 65 | hashvaluetype getfastleftshift1(hashvaluetype x) const { 66 | return ((x & mask1) << 1) | (x >> (wordsize - 1)); 67 | } 68 | 69 | hashvaluetype getfastrightshift1(hashvaluetype x) const { 70 | return (x >> 1) | ((x & 1) << (wordsize - 1)); 71 | } 72 | 73 | // this is a convenience function, use eat,update and .hashvalue to use as a 74 | // rolling hash function 75 | template 
hashvaluetype hash(container &c) { 76 | hashvaluetype answer(0); 77 | for (uint k = 0; k < c.size(); ++k) { 78 | fastleftshift1(answer); 79 | answer ^= hasher.hashvalues[static_cast(c[k])]; 80 | } 81 | return answer; 82 | } 83 | 84 | hashvaluetype hashz(chartype outchar, uint n) { 85 | hashvaluetype answer = 86 | hasher.hashvalues[static_cast(outchar)]; 87 | for (uint k = 0; k < n; ++k) { 88 | fastleftshift1(answer); 89 | } 90 | return answer; 91 | } 92 | 93 | // add inchar as an input and remove outchar, the hashvalue is updated 94 | // this function can be used to update the hash value from the hash value of 95 | // [outchar]ABC to the hash value of ABC[inchar] 96 | void update(chartype outchar, chartype inchar) { 97 | hashvaluetype z(hasher.hashvalues[outchar]); 98 | fastleftshiftn(z); 99 | hashvalue = getfastleftshift1(hashvalue) ^ z ^ hasher.hashvalues[inchar]; 100 | } 101 | 102 | // this is the reverse of the update function. 103 | // this function can be used to update the hash value from the hash value of 104 | // ABC[inchar] to the hash value of [outchar]ABC 105 | void reverse_update(chartype outchar, chartype inchar) { 106 | hashvaluetype z(hasher.hashvalues[outchar]); 107 | fastleftshiftn(z); 108 | hashvalue ^= z ^ hasher.hashvalues[inchar]; 109 | hashvalue = getfastrightshift1(hashvalue); 110 | } 111 | 112 | // add inchar as an input, this is used typically only at the start 113 | // the hash value is updated to that of a longer string (one where inchar was 114 | // appended) 115 | void eat(chartype inchar) { 116 | fastleftshift1(hashvalue); 117 | hashvalue ^= hasher.hashvalues[inchar]; 118 | } 119 | 120 | // for an n-gram X it returns hash value of (n + 1)-gram XY without changing 121 | // the object X. 
For example, if X = "ABC", then X.hash_extend("D") returns 122 | // value of "ABCD" without changing the state of X 123 | hashvaluetype hash_extend(chartype Y) { 124 | return getfastleftshift1(hashvalue) ^ hasher.hashvalues[Y]; 125 | } 126 | 127 | // same as hash_extend, but with prepending the n-gram with character Y. If X 128 | // = "ABC", then X.hash_prepend("D") returns value of "DABC" without changing 129 | // the state of X 130 | hashvaluetype hash_prepend(chartype Y) { 131 | hashvaluetype z(hasher.hashvalues[Y]); 132 | fastleftshiftn(z); 133 | return z ^ hashvalue; 134 | } 135 | 136 | // prepare to process a new string, you will need to call "eat" again 137 | void reset() { hashvalue = 0; } 138 | 139 | hashvaluetype hashvalue; 140 | int n; 141 | const int wordsize; 142 | CharacterHash hasher; 143 | const hashvaluetype mask1; 144 | const int myr; 145 | const hashvaluetype maskn; 146 | }; 147 | 148 | #endif 149 | -------------------------------------------------------------------------------- /tests/unit.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cyclichash.h" 5 | #include "generalhash.h" 6 | #include "rabinkarphash.h" 7 | 8 | #include "threewisehash.h" 9 | 10 | using namespace std; 11 | 12 | template bool testExtendAndPrepend(uint L = 19) { 13 | const uint n(4); // n-grams 14 | hashfunction hf(n, L); 15 | string input = "XABCDY"; 16 | string base(input.begin() + 1, input.end() - 1); 17 | assert(base.size() == n); 18 | string extend(input.begin() + 1, input.end()); 19 | string prepend(input.begin(), input.end() - 1); 20 | 21 | for (string::const_iterator j = base.begin(); j != base.end(); ++j) { 22 | hf.eat(*j); 23 | } 24 | if (hf.hashvalue != hf.hash(base)) { 25 | std::cout << "bug!" << std::endl; 26 | std::cout << base << " " << hf.hash(base) << std::endl; 27 | return false; 28 | } 29 | if (hf.hash_prepend(input[0]) != hf.hash(prepend)) { 30 | std::cout << "bug!" 
<< std::endl; 31 | std::cout << prepend << " " << hf.hash_prepend(input[0]) << " " 32 | << hf.hash(prepend) << std::endl; 33 | return false; 34 | } 35 | if (hf.hash_extend(input.back()) != hf.hash(extend)) { 36 | std::cout << "bug!" << std::endl; 37 | std::cout << extend << " " << hf.hash_extend(input.back()) << " " 38 | << hf.hash(extend) << std::endl; 39 | return false; 40 | } 41 | 42 | assert(hf.hashvalue == hf.hash(base)); 43 | assert(hf.hash_prepend(input[0]) == hf.hash(prepend)); 44 | assert(hf.hash_extend(input.back()) == hf.hash(extend)); 45 | 46 | return true; 47 | } 48 | 49 | template bool isItAFunction(uint L = 7) { 50 | mersenneRNG generator(5); 51 | const uint n(3); // n-grams 52 | hashfunction hf(n, L); 53 | deque s; 54 | for (uint32 k = 0; k < n; ++k) { 55 | unsigned char c = static_cast(generator() + 65); 56 | s.push_back(c); 57 | hf.eat(c); 58 | } 59 | for (uint32 k = 0; k < 100000; ++k) { 60 | unsigned char out = s.front(); 61 | s.pop_front(); 62 | char c(generator() + 65); 63 | 64 | s.push_back(c); 65 | hf.update(out, c); 66 | if (hf.hash(s) != hf.hashvalue) { 67 | for (deque::iterator ii = s.begin(); ii != s.end(); ++ii) 68 | cout << *ii << " " << static_cast(*ii) << endl; 69 | cerr << "bug" << endl; 70 | cerr << s[0] << s[1] << s[2] << " was hashed to " << hf.hashvalue 71 | << " when true hash value is " << hf.hash(s) << endl; 72 | for (uint j = 0; j < n; ++j) 73 | cerr << s[j] << "->" << hf.hasher.hashvalues[s[j]] << endl; 74 | return false; 75 | } 76 | } 77 | return true; 78 | } 79 | 80 | template bool doesReverseUpdateWorks(uint L = 7) { 81 | mersenneRNG generator(5); 82 | const uint n(3); // n-grams 83 | hashfunction hf(n, L); 84 | deque s; 85 | for (uint32 k = 0; k < n; ++k) { 86 | unsigned char c = static_cast(generator() + 65); 87 | s.push_back(c); 88 | hf.eat(c); 89 | } 90 | for (uint32 k = 0; k < 100000; ++k) { 91 | unsigned char out = s.front(); 92 | s.pop_front(); 93 | char c(generator() + 65); 94 | s.push_back(c); 95 | 
hf.update(out, c); 96 | hf.reverse_update(out, c); 97 | hf.update(out, c); 98 | if (hf.hash(s) != hf.hashvalue) { 99 | return false; 100 | } 101 | } 102 | return true; 103 | } 104 | 105 | template bool isItRandom(uint L = 19) { 106 | cout << "checking that it is randomized " << endl; 107 | int n = 5; 108 | vector data(n); 109 | for (int k = 0; k < n; ++k) { 110 | data[k] = static_cast(k); 111 | } 112 | hashfunction base(n, L); 113 | uint64 x = base.hash(data); 114 | for (int k = 0; k < 100; ++k) { 115 | hashfunction hf(n, L); 116 | uint64 y = hf.hash(data); 117 | if (y != x) { 118 | cout << "It is randomized! " << endl; 119 | return true; 120 | } 121 | cout << "collision " << y << endl; 122 | } 123 | cout << "Not randomized! " << endl; 124 | return false; // we conclude that it always hashes to the same value (this is 125 | // bad) 126 | } 127 | 128 | bool test() { 129 | bool ok(true); 130 | cout << "Karp-Rabin" << endl; 131 | for (uint L = 1; L <= 32; ++L) { 132 | if (!ok) 133 | return false; 134 | ok &= isItAFunction>(); 135 | } 136 | ok &= isItRandom>(); 137 | for (uint L = 1; L <= 64; ++L) { 138 | if (!ok) 139 | return false; 140 | ok &= isItAFunction>(); 141 | } 142 | ok &= isItRandom>(); 143 | if (!ok) 144 | return false; 145 | cout << "cyclic" << endl; 146 | for (uint L = 2; L <= 32; ++L) { 147 | if (!ok) 148 | return false; 149 | ok &= testExtendAndPrepend>(L); 150 | ok &= isItAFunction>(L); 151 | ok &= doesReverseUpdateWorks>(L); 152 | } 153 | for (uint L = 2; L <= 64; ++L) { 154 | if (!ok) 155 | return false; 156 | ok &= testExtendAndPrepend>(L); 157 | ok &= isItAFunction>(L); 158 | } 159 | ok &= isItRandom>(); 160 | ok &= isItRandom>(); 161 | 162 | cout << "three-wise" << endl; 163 | for (uint L = 1; L <= 32; ++L) { 164 | ok &= isItAFunction>(L); 165 | } 166 | ok &= isItRandom>(); 167 | for (uint L = 1; L <= 64; ++L) { 168 | ok &= isItAFunction>(L); 169 | } 170 | ok &= isItRandom>(); 171 | 172 | cout << "general" << endl; 173 | ok &= isItAFunction>(9); 
174 | if (!ok) 175 | return false; 176 | ok &= isItRandom>(); 177 | if (!ok) 178 | return false; 179 | ok &= isItAFunction>(19); 180 | cout << "general" << endl; 181 | ok &= isItAFunction>(9); 182 | if (!ok) 183 | return false; 184 | ok &= isItRandom>(); 185 | if (!ok) 186 | return false; 187 | ok &= isItAFunction>(19); 188 | return ok; 189 | } 190 | 191 | int main() { 192 | bool ok(test()); 193 | if (ok) 194 | cout << "your code is ok!" << endl; 195 | else 196 | cout << "you have a bug of some kind" << endl; 197 | return 0; 198 | } 199 | -------------------------------------------------------------------------------- /include/rabinkarphash.h: -------------------------------------------------------------------------------- 1 | #ifndef KARPRABINHASH 2 | #define KARPRABINHASH 3 | 4 | #include "characterhash.h" 5 | #include 6 | 7 | /** 8 | * This is a randomized version of the Karp-Rabin hash function. 9 | * Each instance is a rolling hash function meant to hash streams of characters. 10 | * Each new instance of this class comes with new random keys. 
11 | * 12 | * Recommended usage to get L-bit hash values over n-grams: 13 | * KarpRabinHash<> hf(n,L ); 14 | * for(uint32 k = 0; k 26 | class KarpRabinHash { 27 | 28 | public: 29 | // myn is the length of the sequences, e.g., 3 means that you want to hash 30 | // sequences of 3 characters mywordsize is the number of bits you which to 31 | // receive as hash values, e.g., 19 means that the hash values are 19-bit 32 | // integers 33 | KarpRabinHash(int myn, int mywordsize = 19) 34 | : hashvalue(0), n(myn), wordsize(mywordsize), 35 | hasher(maskfnc(wordsize)), 36 | HASHMASK(maskfnc(wordsize)), BtoN(1) { 37 | for (int i = 0; i < n; ++i) { 38 | BtoN *= B; 39 | BtoN &= HASHMASK; 40 | } 41 | } 42 | 43 | // prepare to process a new string, you will need to call "eat" again 44 | void reset() { hashvalue = 0; } 45 | 46 | // this is a convenience function, use eat,update and .hashvalue to use as a 47 | // rolling hash function 48 | template hashvaluetype hash(container &c) { 49 | hashvaluetype answer(0); 50 | for (uint k = 0; k < c.size(); ++k) { 51 | hashvaluetype x(1); 52 | for (uint j = 0; j < c.size() - 1 - k; ++j) { 53 | x = (x * B) & HASHMASK; 54 | } 55 | x = (x * hasher.hashvalues[c[k]]) & HASHMASK; 56 | answer = (answer + x) & HASHMASK; 57 | } 58 | return answer; 59 | } 60 | 61 | // add inchar as an input, this is used typically only at the start 62 | // the hash value is updated to that of a longer string (one where inchar was 63 | // appended) 64 | void eat(chartype inchar) { 65 | hashvalue = (B * hashvalue + hasher.hashvalues[inchar]) & HASHMASK; 66 | } 67 | 68 | // add inchar as an input and remove outchar, the hashvalue is updated 69 | // this function can be used to update the hash value from the hash value of 70 | // [outchar]ABC to the hash value of ABC[inchar] 71 | void update(chartype outchar, chartype inchar) { 72 | hashvalue = (B * hashvalue + hasher.hashvalues[inchar] - 73 | BtoN * hasher.hashvalues[outchar]) & 74 | HASHMASK; 75 | } 76 | 77 | 
hashvaluetype hashvalue; 78 | int n; 79 | const int wordsize; 80 | CharacterHash hasher; 81 | const hashvaluetype HASHMASK; 82 | hashvaluetype BtoN; 83 | static const hashvaluetype B = 37; 84 | }; 85 | 86 | template 88 | class KarpRabinHashBits { 89 | // The key difference between KarpRabinHashBits and KarpRabinHash is that 90 | // wordsize is now templated And the masking is only performed if nbits != the 91 | // number of bits in the type 92 | public: 93 | // myn is the length of the sequences, e.g., 3 means that you want to hash 94 | // sequences of 3 characters mywordsize is the number of bits you which to 95 | // receive as hash values, e.g., 19 means that the hash values are 19-bit 96 | // integers 97 | KarpRabinHashBits(int myn) 98 | : hashvalue(0), n(myn), hasher(maskfnc(wordsize)), 99 | HASHMASK(maskfnc(wordsize)), BtoN(1) { 100 | for (int i = 0; i < n; ++i) { 101 | BtoN *= B; 102 | if (!is_full_word()) 103 | BtoN &= HASHMASK; 104 | } 105 | } 106 | 107 | // prepare to process a new string, you will need to call "eat" again 108 | void reset() { hashvalue = 0; } 109 | static constexpr bool is_full_word() { 110 | return wordsize == (CHAR_BIT * sizeof(hashvaluetype)); 111 | } 112 | template void mask_value(T &val) const { 113 | #if __cplusplus >= 201703L 114 | #define CONSTIF if constexpr 115 | #else 116 | #define CONSTIF if 117 | #endif 118 | CONSTIF(!is_full_word()) val &= HASHMASK; 119 | #undef CONSTIF 120 | } 121 | 122 | // this is a convenience function, use eat,update and .hashvalue to use as a 123 | // rolling hash function 124 | template hashvaluetype hash(container &c) const { 125 | hashvaluetype answer(0); 126 | for (uint k = 0; k < c.size(); ++k) { 127 | hashvaluetype x(1); 128 | for (uint j = 0; j < c.size() - 1 - k; ++j) { 129 | x = (x * B); 130 | mask_value(x); 131 | } 132 | x = (x * hasher.hashvalues[c[k]]); 133 | mask_value(x); 134 | answer = (answer + x); 135 | mask_value(answer); 136 | } 137 | return answer; 138 | } 139 | hashvaluetype 
hash(char *s) const { 140 | return hash(static_cast(s)); 141 | } 142 | hashvaluetype hash(const char *s) const { 143 | hashvaluetype answer(0); 144 | uint csz = std::strlen(s); 145 | for (uint k = 0; k < csz; ++k) { 146 | hashvaluetype x(1); 147 | for (uint j = 0; j < csz - 1 - k; ++j) { 148 | x = (x * B); 149 | mask_value(x); 150 | } 151 | x = (x * hasher.hashvalues[s[k]]); 152 | mask_value(x); 153 | answer = (answer + x); 154 | mask_value(answer); 155 | } 156 | return answer; 157 | } 158 | 159 | // add inchar as an input, this is used typically only at the start 160 | // the hash value is updated to that of a longer string (one where inchar was 161 | // appended) 162 | void eat(chartype inchar) { 163 | hashvalue = (B * hashvalue + hasher.hashvalues[inchar]); 164 | mask_value(hashvalue); 165 | } 166 | 167 | // add inchar as an input and remove outchar, the hashvalue is updated 168 | // this function can be used to update the hash value from the hash value of 169 | // [outchar]ABC to the hash value of ABC[inchar] 170 | void update(chartype outchar, chartype inchar) { 171 | hashvalue = (B * hashvalue + hasher.hashvalues[inchar] - 172 | BtoN * hasher.hashvalues[outchar]); 173 | mask_value(hashvalue); 174 | } 175 | 176 | hashvaluetype hashvalue; 177 | int n; 178 | CharacterHash hasher; 179 | const hashvaluetype HASHMASK; 180 | hashvaluetype BtoN; 181 | static constexpr hashvaluetype B = 37; 182 | }; 183 | 184 | #endif 185 | -------------------------------------------------------------------------------- /include/mersennetwister.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * High performance random generator. 4 | * Mersenne Twister 5 | 6 | @article{matsumoto1998mtd, 7 | title={{Mersenne Twister: A 623-Dimensionally Equidistributed Uniform 8 | Pseudo-Random Number Generator}}, author={MATSUMOTO, M. 
and NISHIMURA, T.}, 9 | journal={ACM Transactions on Modeling and Computer Simulation}, 10 | volume={8}, 11 | number={1}, 12 | pages={3-30}, 13 | year={1998} 14 | } 15 | */ 16 | // MersenneTwister.h 17 | // Mersenne Twister random number generator -- a C++ class MTRand 18 | // Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus 19 | // Richard J. Wagner v1.0 15 May 2003 rjwagner@writeme.com 20 | 21 | // The Mersenne Twister is an algorithm for generating random numbers. It 22 | // was designed with consideration of the flaws in various other generators. 23 | // The period, 2^19937-1, and the order of equidistribution, 623 dimensions, 24 | // are far greater. The generator is also fast; it avoids multiplication and 25 | // division, and it benefits from caches and pipelines. For more information 26 | // see the inventors' web page at http://www.math.keio.ac.jp/~matumoto/emt.html 27 | 28 | // Reference 29 | // M. Matsumoto and T. Nishimura, "Mersenne Twister: A 623-Dimensionally 30 | // Equidistributed Uniform Pseudo-Random Number Generator", ACM Transactions on 31 | // Modeling and Computer Simulation, Vol. 8, No. 1, January 1998, pp 3-30. 32 | 33 | // Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 34 | // Copyright (C) 2000 - 2003, Richard J. Wagner 35 | // All rights reserved. 36 | // 37 | // Redistribution and use in source and binary forms, with or without 38 | // modification, are permitted provided that the following conditions 39 | // are met: 40 | // 41 | // 1. Redistributions of source code must retain the above copyright 42 | // notice, this list of conditions and the following disclaimer. 43 | // 44 | // 2. Redistributions in binary form must reproduce the above copyright 45 | // notice, this list of conditions and the following disclaimer in the 46 | // documentation and/or other materials provided with the distribution. 47 | // 48 | // 3. 
The names of its contributors may not be used to endorse or promote 49 | // products derived from this software without specific prior written 50 | // permission. 51 | // 52 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 53 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 54 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 55 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 56 | // OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 57 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 58 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 59 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 60 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 61 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 62 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 63 | 64 | // The original code included the following notice: 65 | // 66 | // When you use this, send an email to: matumoto@math.keio.ac.jp 67 | // with an appropriate reference to your work. 68 | // 69 | // It would be nice to CC: rjwagner@writeme.com and Cokus@math.washington.edu 70 | // when you write. 
#ifndef MERSENNETWISTER_H
#define MERSENNETWISTER_H

// Not thread safe (unless auto-initialization is avoided and each thread has
// its own MTRand object)

#include <climits>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <iostream>

// Mersenne Twister generator: keeps N state words and hands out tempered
// 32-bit values, refilling the state every N draws.
//
// All member functions are defined `inline` below so that this header can be
// included from several translation units without duplicate-symbol errors.
class MTRand {
  // Data
public:
  typedef unsigned long uint32; // unsigned integer type, at least 32 bits

  enum { N = 624 };      // length of state vector
  enum { SAVE = N + 1 }; // length of array for save()

protected:
  enum { M = 397 }; // period parameter

  uint32 state[N]; // internal state
  uint32 *pNext;   // next value to get from state
  int left;        // number of values left before reload needed

  // Methods
public:
  MTRand(const uint32 &oneSeed); // initialize with a simple uint32
  MTRand(uint32 *const bigSeed, uint32 const seedLength = N); // or an array
  MTRand(); // auto-initialize with /dev/urandom or time() and clock()

  // Do NOT use for CRYPTOGRAPHY without securely hashing several returned
  // values together, otherwise the generator state can be learned after
  // reading 624 consecutive values.

  // Access to 32-bit random numbers
  double rand();                      // real number in [0,1]
  double rand(const double &n);       // real number in [0,n]
  double randExc();                   // real number in [0,1)
  double randExc(const double &n);    // real number in [0,n)
  double randDblExc();                // real number in (0,1)
  double randDblExc(const double &n); // real number in (0,n)
  uint32 randInt();                   // integer in [0,2^32-1]
  uint32 randInt(const uint32 &n);    // integer in [0,n] for n < 2^32
  double operator()() {
    return rand(); // same as rand()
  }

  // Access to 53-bit random numbers (capacity of IEEE double precision)
  double rand53(); // real number in [0,1)

  // Access to nonuniform random number distributions.
  // The second argument multiplies the Box-Muller radius directly, i.e. it
  // acts as the spread (standard deviation) of the result. It defaults to 1.0;
  // the former default of 0.0 made randNorm() always return `mean`.
  double randNorm(const double &mean = 0.0, const double &variance = 1.0);

  // Re-seeding functions with same behavior as initializers
  void seed(const uint32 oneSeed);
  void seed(uint32 *const bigSeed, const uint32 seedLength = N);
  void seed();

  // Saving and loading generator state
  void save(uint32 *saveArray) const; // to array of size SAVE
  void load(uint32 *const loadArray); // from such array
  friend std::ostream &operator<<(std::ostream &os, const MTRand &mtrand);
  friend std::istream &operator>>(std::istream &is, MTRand &mtrand);

protected:
  void initialize(const uint32 oneSeed);
  void reload();
  uint32 hiBit(const uint32 &u) const { return u & 0x80000000UL; }
  uint32 loBit(const uint32 &u) const { return u & 0x00000001UL; }
  uint32 loBits(const uint32 &u) const { return u & 0x7fffffffUL; }
  uint32 mixBits(const uint32 &u, const uint32 &v) const {
    return hiBit(u) | loBits(v);
  }
  uint32 twist(const uint32 &m, const uint32 &s0, const uint32 &s1) const {
    // the magic constant is applied only when the low bit of s1 is set
    return m ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? 0x9908b0dfUL : 0UL);
  }
  static uint32 hash(std::time_t t, std::clock_t c);
};

inline MTRand::MTRand(const uint32 &oneSeed) { seed(oneSeed); }

inline MTRand::MTRand(uint32 *const bigSeed, const uint32 seedLength) {
  seed(bigSeed, seedLength);
}

inline MTRand::MTRand() { seed(); }

inline double MTRand::rand() {
  return static_cast<double>(randInt()) * (1.0 / 4294967295.0);
}

inline double MTRand::rand(const double &n) { return rand() * n; }

inline double MTRand::randExc() {
  return static_cast<double>(randInt()) * (1.0 / 4294967296.0);
}

inline double MTRand::randExc(const double &n) { return randExc() * n; }

inline double MTRand::randDblExc() {
  return (static_cast<double>(randInt()) + 0.5) * (1.0 / 4294967296.0);
}

inline double MTRand::randDblExc(const double &n) { return randDblExc() * n; }

inline double MTRand::rand53() {
  uint32 a = randInt() >> 5, b = randInt() >> 6;
  return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); // by Isaku Wada
}

inline double MTRand::randNorm(const double &mean, const double &variance) {
  // Return a real number from a normal (Gaussian) distribution centered on
  // mean by the Box-Muller method; `variance` scales the radius (see the
  // declaration).
  double r = std::sqrt(-2.0 * std::log(1.0 - randDblExc())) * variance;
  double phi = 2.0 * 3.14159265358979323846264338328 * randExc();
  return mean + r * std::cos(phi);
}

inline MTRand::uint32 MTRand::randInt() {
  // Pull a 32-bit integer from the generator state
  // Every other access function simply transforms the numbers extracted here

  if (left == 0)
    reload();
  --left;

  uint32 s1;
  s1 = *pNext++;
  s1 ^= (s1 >> 11);
  s1 ^= (s1 << 7) & 0x9d2c5680UL;
  s1 ^= (s1 << 15) & 0xefc60000UL;
  return (s1 ^ (s1 >> 18));
}

inline MTRand::uint32 MTRand::randInt(const uint32 &n) {
  // Find which bits are used in n
  // Optimized by Magnus Jonsson (magnus@smartelectronix.com)
  uint32 used = n;
  used |= used >> 1;
  used |= used >> 2;
  used |= used >> 4;
  used |= used >> 8;
  used |= used >> 16;

  // Draw numbers until one is found in [0,n]
  uint32 i;
  do
    i = randInt() & used; // toss unused bits to shorten search
  while (i > n);
  return i;
}

inline void MTRand::seed(const uint32 oneSeed) {
  // Seed the generator with a simple uint32
  initialize(oneSeed);
  reload();
}

inline void MTRand::seed(uint32 *const bigSeed, const uint32 seedLength) {
  // Seed the generator with an array of uint32's
  // There are 2^19937-1 possible initial states.  This function allows
  // all of those to be accessed by providing at least 19937 bits (with a
  // default seed length of N = 624 uint32's).  Any bits above the lower 32
  // in each element are discarded.
  // Just call seed() if you want to get array from /dev/urandom
  if (bigSeed == NULL || seedLength == 0) {
    // nothing to mix in: the loop below would read bigSeed[0], so fall back
    // to the fixed base seed it starts from
    seed(19650218UL);
    return;
  }
  initialize(19650218UL);
  int i = 1;
  uint32 j = 0;
  int k = (uint32(N) > seedLength ? int(N) : int(seedLength));
  for (; k; --k) {
    state[i] = state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1664525UL);
    state[i] += (bigSeed[j] & 0xffffffffUL) + j;
    state[i] &= 0xffffffffUL;
    ++i;
    ++j;
    if (i >= N) {
      state[0] = state[N - 1];
      i = 1;
    }
    if (j >= seedLength)
      j = 0;
  }
  for (k = N - 1; k; --k) {
    state[i] =
        state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1566083941UL);
    state[i] -= i;
    state[i] &= 0xffffffffUL;
    ++i;
    if (i >= N) {
      state[0] = state[N - 1];
      i = 1;
    }
  }
  state[0] = 0x80000000UL; // MSB is 1, assuring non-zero initial array
  reload();
}

inline void MTRand::seed() {
  // Seed the generator with an array from /dev/urandom if available
  // Otherwise use a hash of time() and clock() values

  // First try getting an array from /dev/urandom
  std::FILE *urandom = std::fopen("/dev/urandom", "rb");
  if (urandom) {
    uint32 bigSeed[N];
    bool success = true;
    for (int i = 0; success && i < N; ++i)
      success = (std::fread(&bigSeed[i], sizeof(uint32), 1, urandom) == 1);
    std::fclose(urandom);
    if (success) {
      seed(bigSeed, N);
      return;
    }
  }

  // Was not successful, so use time() and clock() instead
  seed(hash(std::time(NULL), std::clock()));
}

inline void MTRand::initialize(const uint32 oneSeed) {
  // Initialize generator state with seed
  // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier.
  // In previous versions, most significant bits (MSBs) of the seed affect
  // only MSBs of the state array.  Modified 9 Jan 2002 by Makoto Matsumoto.
  state[0] = oneSeed & 0xffffffffUL;
  for (int i = 1; i < N; ++i) {
    const uint32 prev = state[i - 1];
    state[i] = (1812433253UL * (prev ^ (prev >> 30)) + i) & 0xffffffffUL;
  }
}

inline void MTRand::reload() {
  // Generate N new values in state
  // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com)
  uint32 *p = state;
  int i;
  for (i = N - M; i--; ++p)
    *p = twist(p[M], p[0], p[1]);
  for (i = M; --i; ++p)
    *p = twist(p[M - N], p[0], p[1]);
  *p = twist(p[M - N], p[0], state[0]);

  left = N, pNext = state;
}

inline MTRand::uint32 MTRand::hash(std::time_t t, std::clock_t c) {
  // Get a uint32 from t and c
  // Better than uint32(x) in case x is floating point in [0,1]
  // Based on code by Lawrence Kirby (fred@genesis.demon.co.uk)

  static uint32 differ = 0; // guarantee time-based seeds will change

  uint32 h1 = 0;
  unsigned char *p = reinterpret_cast<unsigned char *>(&t);
  for (std::size_t i = 0; i < sizeof(t); ++i) {
    h1 *= UCHAR_MAX + 2U;
    h1 += p[i];
  }
  uint32 h2 = 0;
  p = reinterpret_cast<unsigned char *>(&c);
  for (std::size_t j = 0; j < sizeof(c); ++j) {
    h2 *= UCHAR_MAX + 2U;
    h2 += p[j];
  }
  return (h1 + differ++) ^ h2;
}

inline void MTRand::save(uint32 *saveArray) const {
  // N state words followed by the number of values left
  for (int i = 0; i < N; ++i)
    saveArray[i] = state[i];
  saveArray[N] = static_cast<uint32>(left);
}

inline void MTRand::load(uint32 *const loadArray) {
  for (int i = 0; i < N; ++i)
    state[i] = loadArray[i];
  // A count outside [0, N] would put pNext outside the state array; treat it
  // as 0 so that the next draw simply reloads the state.
  const uint32 saved = loadArray[N];
  left = (saved <= static_cast<uint32>(N)) ? static_cast<int>(saved) : 0;
  pNext = &state[N - left];
}

inline std::ostream &operator<<(std::ostream &os, const MTRand &mtrand) {
  for (int i = 0; i < MTRand::N; ++i)
    os << mtrand.state[i] << "\t";
  return os << mtrand.left;
}

inline std::istream &operator>>(std::istream &is, MTRand &mtrand) {
  for (int i = 0; i < MTRand::N; ++i)
    is >> mtrand.state[i];
  is >> mtrand.left;
  // same guard as load(): never let pNext leave the state array
  if (mtrand.left < 0 || mtrand.left > MTRand::N)
    mtrand.left = 0;
  mtrand.pNext = &mtrand.state[MTRand::N - mtrand.left];
  return is;
}

#endif // MERSENNETWISTER_H

// Change log:
//
// v0.1 - First release on 15 May 2000
//      - Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus
//      - Translated from C to C++
//      - Made completely ANSI compliant
//      - Designed convenient interface for initialization, seeding, and
//        obtaining numbers in default or user-defined ranges
//      - Added automatic seeding from /dev/urandom or time() and clock()
//      - Provided functions for saving and loading generator state
//
// v0.2 - Fixed bug which reloaded generator one step too late
//
// v0.3 - Switched to clearer, faster reload() code from Matthew Bellew
//
// v0.4 - Removed trailing newline in saved generator format to be consistent
//        with output format of built-in types
//
// v0.5 - Improved portability by replacing static const int's with enum's and
//        clarifying return values in seed(); suggested by Eric Heimburg
//      - Removed MAXINT constant; use 0xffffffffUL instead
//
// v0.6 - Eliminated seed overflow when uint32 is larger than 32 bits
//      - Changed integer [0,n] generator to give better uniformity
//
// v0.7 - Fixed operator precedence ambiguity in reload()
//      - Added access for real numbers in (0,1) and (0,n)
//
// v0.8 - Included time.h header to properly support time_t and clock_t
//
// v1.0 - Revised seeding to match 26 Jan 2002 update of Nishimura and Matsumoto
//      - Allowed for seeding with arrays of any length
//      - Added access for real numbers in [0,1) with 53-bit resolution
//      - Added access for
real numbers from normal (Gaussian) distributions 417 | // - Increased overall speed by optimizing twist() 418 | // - Doubled speed of integer [0,n] generation 419 | // - Fixed out-of-range number generation on 64-bit machines 420 | // - Improved portability by substituting literal constants for long enum's 421 | // - Changed license from GNU LGPL to BSD 422 | --------------------------------------------------------------------------------