├── .gitignore
├── .travis.yml
├── .github
    └── workflows
    │   └── ubuntu22.yml
├── CMakeLists.txt
├── Makefile
├── include
    ├── adler32.h
    ├── ztimer.h
    ├── characterhash.h
    ├── threewisehash.h
    ├── generalhash.h
    ├── cyclichash.h
    ├── rabinkarphash.h
    └── mersennetwister.h
├── examples
    ├── example64bits.cpp
    ├── example4.cpp
    ├── example2.cpp
    ├── example3.cpp
    ├── example.cpp
    ├── example6.cpp
    └── example5.cpp
├── README.md
├── benchmarks
    └── speedtesting.cpp
└── tests
    └── unit.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | CMakeSettings.json
2 | .vs
3 | out
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: cpp
2 | sudo: false
3 | compiler:
4 |   - clang
5 | # Build with the top-level Makefile, then run the resulting ./unit executable.
6 | script: make && ./unit
7 | 


--------------------------------------------------------------------------------
/.github/workflows/ubuntu22.yml:
--------------------------------------------------------------------------------
 1 | name: Ubuntu 22.04 CI (GCC 11)
 2 | # Runs on every push and pull request.
 3 | on: [push, pull_request]
 4 | # Single job: configure with CMake in ./build, build, then run the registered tests with ctest.
 5 | jobs:
 6 |   ubuntu-build:
 7 |     runs-on: ubuntu-22.04
 8 |     steps:
 9 |       - uses: actions/checkout@v3
10 |       - name: Use cmake
11 |         run: |
12 |           mkdir build &&
13 |           cd build &&
14 |           cmake  ..  &&
15 |           cmake --build .   &&
16 |           ctest --output-on-failure


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.0...3.23)
 2 | 
 3 | project(rollinghashcpp)
 4 | set(CMAKE_CXX_STANDARD 11)
 5 | 
 6 | if(MSVC)
 7 |   # /EHsc (standard C++ exception handling) is an MSVC flag; other Windows toolchains reject it.
 8 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
 9 | endif()
10 | include_directories(include)
11 | # Every example target is built from the source file that carries its own name.
12 | foreach(example_name example example2 example3 example4 example5 example6 example64bits)
13 |   add_executable(${example_name} "examples/${example_name}.cpp")
14 | endforeach()
15 | add_executable(speedtesting "benchmarks/speedtesting.cpp")
16 | add_executable(unit "tests/unit.cpp")
17 | # Register the unit-test executable with CTest (run it with `ctest`).
18 | enable_testing()
19 | add_test(NAME unit COMMAND unit)
20 | 
21 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Drop every suffix known to make ...
 2 | .SUFFIXES:
 3 | # ... and register only the ones used by this project.
 4 | .SUFFIXES: .cpp .o .c .h
 5 | # NOTE(review): all file names below lack a directory prefix although the tree keeps sources in tests/, benchmarks/, examples/ and headers in include/ - confirm this still builds.
 6 | CXXFLAGS =  -std=c++11 -fexceptions -pedantic -ggdb -g3 -O2  -Wall -Woverloaded-virtual  -Wsign-promo -Wold-style-cast 
 7 | #-DNDEBUG
 8 | all: unit speedtesting example example2 example3 example64bits example4 example5 example6
 9 | # Source list handed to makedepend by the "depend" target.
10 | SRCS = unit.cpp speedtesting.cpp example.cpp example2.cpp example3.cpp example4.cpp example5.cpp example6.cpp example64bits.cpp
11 | # Bundle the Makefile, README, headers and sources into a zip named after today's date.
12 | package:
13 | 	zip -9 ngramhashing_`date +%Y-%m-%d`.zip Makefile README *.h *.cpp
14 | # Regenerate dependency information with makedepend.
15 | depend:
16 | 	makedepend -- $(CXXFLAGS) -- $(SRCS)
17 | # Delete object files and the built executables.
18 | clean:
19 | 	rm -f *.o unit speedtesting example example2 example3 example4 example64bits example5 example6
20 | # Each object file below lists all of these headers as prerequisites.
21 | HEADERS=cyclichash.h characterhash.h mersennetwister.h rabinkarphash.h generalhash.h threewisehash.h
22 | unit.o: $(HEADERS)
23 | speedtesting.o: $(HEADERS)
24 | example.o: $(HEADERS)
25 | example2.o: $(HEADERS)
26 | example3.o: $(HEADERS)
27 | example4.o: $(HEADERS)
28 | example5.o: $(HEADERS)
29 | example6.o: $(HEADERS)
30 | example64bits.o: $(HEADERS)
31 | 
32 | 


--------------------------------------------------------------------------------
/include/adler32.h:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | 
 3 | // contributed by Dmitry Artamonov
 4 | // this is *deterministic*
 5 | class Adler32 {
 6 |   uint32_t sum1, sum2; // running sums behind hashvalue, each kept modulo Base
 7 | 
 8 | public:
 9 |   static const uint32_t Base = 65521; // modulus applied to both sums
10 |   uint32_t hashvalue;                 // (sum2 << 16) | sum1
11 |   int len;                            // window length given to the constructor
12 | 
13 |   Adler32(int window) : sum1(1), sum2(0), hashvalue(0), len(window) {}
14 | 
15 |   // Append inchar to the hashed sequence.
16 |   void eat(uint8_t inchar) {
17 |     sum1 = (sum1 + inchar) % Base;
18 |     sum2 = (sum2 + sum1) % Base;
19 |     hashvalue = (sum2 << 16) | sum1;
20 |   }
21 | 
22 |   // Return to the freshly constructed state (len is kept).
23 |   void reset() {
24 |     sum1 = 1;
25 |     sum2 = 0;
26 |     hashvalue = 0;
27 |   }
28 | 
29 |   // Slide the window: remove outchar (its oldest character) and append inchar.
30 |   void update(uint8_t outchar, uint8_t inchar) {
31 |     // Work in signed ints: comparing a negative int against the unsigned Base
32 |     // would convert it to a huge unsigned value and skip the "+= base" branch.
33 |     const int base = int(Base);
34 |     int s2 = int((hashvalue >> 16) & 0xffff);
35 |     int s1 = int(hashvalue & 0xffff);
36 |     s1 += inchar - outchar;
37 |     if (s1 >= base) s1 -= base;
38 |     else if (s1 < 0) s1 += base;
39 |     s2 = (s2 - len * outchar + s1 - 1) % base;
40 |     if (s2 < 0) s2 += base;
41 |     sum1 = uint32_t(s1); // keep the members in step with hashvalue
42 |     sum2 = uint32_t(s2);
43 |     hashvalue = (sum2 << 16) | sum1;
44 |   }
45 | };
46 | 


--------------------------------------------------------------------------------
/examples/example64bits.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <memory>
 3 | #include <string>
 4 | #include <vector>
 5 | 
 6 | // Example of 64-bit hashing
 7 | 
 8 | #include "cyclichash.h"
 9 | 
10 | int main() {
11 |   // 64-bit hash values over windows of five characters
12 |   CyclicHash<uint64> hasher(5, 64);
13 |   string text = "ABCDE";
14 |   for (size_t pos = 0; pos < 4; ++pos) {
15 |     hasher.eat(text[pos]); // A, B, C, D in turn
16 |   }
17 |   cout << "Hash value of ABCD is " << hasher.hashvalue << endl;
18 |   // the rolling value must equal a direct hash of the prefix "ABCD"
19 |   const std::vector<unsigned char> prefix(text.begin(), text.begin() + 4);
20 |   const uint64_t prefixhash = hasher.hash(prefix);
21 |   if (!(prefixhash == hasher.hashvalue)) {
22 |     throw runtime_error("bug");
23 |   }
24 |   hasher.eat(text[4]); // append E
25 |   cout << "Hash value of ABCDE is " << hasher.hashvalue << endl;
26 |   // the same check for the complete string
27 |   const std::vector<unsigned char> whole(text.begin(), text.end());
28 |   const uint64_t wholehash = hasher.hash(whole);
29 |   if (!(wholehash == hasher.hashvalue)) {
30 |     throw runtime_error("bug");
31 |   }
32 |   return 0;
33 | }
34 | 


--------------------------------------------------------------------------------
/examples/example4.cpp:
--------------------------------------------------------------------------------
 1 | #include <cassert>
 2 | #include <iostream>
 3 | #include <memory>
 4 | #include <string>
 5 | 
 6 | #include "cyclichash.h"
 7 | 
 8 | /**
 9 |  * Test of the prepend and append functions to test slightly longer and slightly
10 |  * shorter n-grams.
11 |  */
12 | 
13 | int main(int argc, char *argv[]) {
14 |   // 64-bit cyclic hash over windows of four characters
15 |   CyclicHash<uint64_t> hf(4, 64);
16 |   string input = "XABCDY";
17 |   string base = input.substr(1, input.size() - 2);   // "ABCD"
18 |   string extend = input.substr(1);                    // "ABCDY"
19 |   string prepend = input.substr(0, input.size() - 1); // "XABCD"
20 |   // load the base n-gram into the rolling hasher
21 |   for (char c : base)
22 |     hf.eat(c);
23 | 
24 |   // print each string next to its hash values
25 |   std::cout << base << " " << hf.hash(base) << std::endl;
26 |   std::cout << prepend << " " << hf.hash_prepend(input[0]) << " "
27 |             << hf.hash(prepend) << std::endl;
28 |   std::cout << extend << " " << hf.hash_extend(input.back()) << " "
29 |             << hf.hash(extend) << std::endl;
30 |   // values derived from the rolling state must agree with hashing from scratch
31 |   assert(hf.hash(base) == hf.hashvalue);
32 |   assert(hf.hash(prepend) == hf.hash_prepend(input[0]));
33 |   assert(hf.hash(extend) == hf.hash_extend(input.back()));
34 |   return 0;
35 | }
36 | 


--------------------------------------------------------------------------------
/examples/example2.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <memory>
 3 | #include <string>
 4 | #include <vector>
 5 | 
 6 | // given hash value of "ABCD", can I have value of
 7 | // "ABCDE", without computing the whole hash value?
 8 | 
 9 | #include "cyclichash.h"
10 | 
11 | int main() {
12 |   CyclicHash<> hf(5, 19);
13 |   string input = "ABCDE";
14 |   // consume the first four characters: A, B, C, D
15 |   for (int pos = 0; pos < 4; ++pos)
16 |     hf.eat(input[pos]);
17 |   cout << "Hash value of ABCD is " << hf.hashvalue << endl;
18 | 
19 |   // verify against a from-scratch hash of the same four characters
20 |   const std::vector<unsigned char> firstfour(input.begin(), input.begin() + 4);
21 |   const uint32_t expectedfour = hf.hash(firstfour);
22 |   if (hf.hashvalue != expectedfour)
23 |     throw runtime_error("bug");
24 | 
25 |   // extend the hashed sequence by one character: E
26 |   hf.eat(input[4]);
27 |   cout << "Hash value of ABCDE is " << hf.hashvalue << endl;
28 |   // verify against a from-scratch hash of all five characters
29 |   const std::vector<unsigned char> allfive(input.begin(), input.end());
30 |   const uint32_t expectedfive = hf.hash(allfive);
31 |   if (hf.hashvalue != expectedfive)
32 |     throw runtime_error("bug");
33 |   return 0;
34 | }


--------------------------------------------------------------------------------
/examples/example3.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <memory>
 3 | #include <string>
 4 | #include <vector>
 5 | 
 6 | #include "cyclichash.h"
 7 | 
 8 | // given hash value of "BCD", can I have value of
 9 | // "ABC" quickly?
10 | 
11 | int demo1() {
12 |   CyclicHash<> hf(3, 32);
13 |   string input = "ABCD";
14 |   // hash the trailing trigram B, C, D
15 |   for (size_t pos = 1; pos <= 3; ++pos)
16 |     hf.eat(input[pos]);
17 |   cout << "Hash value of BCD is " << hf.hashvalue << endl;
18 |   // compare with hashing "BCD" directly
19 |   const std::vector<unsigned char> tail(input.begin() + 1, input.begin() + 4);
20 |   const uint32_t tailhash = hf.hash(tail);
21 |   if (hf.hashvalue != tailhash)
22 |     throw runtime_error("bug");
23 | 
24 |   // roll backwards: drop D at the end and put A in front
25 |   hf.reverse_update(input[0], input[3]);
26 |   cout << "Hash value of ABC is " << hf.hashvalue << endl;
27 |   // compare with hashing "ABC" directly
28 |   const std::vector<unsigned char> head(input.begin(), input.begin() + 3);
29 |   const uint32_t headhash = hf.hash(head);
30 |   if (hf.hashvalue != headhash)
31 |     throw runtime_error("bug");
32 |   return 0;
33 | }
34 | 
35 | int main() { return demo1(); } // demo1 returns 0 unless it throws
36 | 


--------------------------------------------------------------------------------
/examples/example.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <memory>
 3 | #include <string>
 4 | #include <vector>
 5 | 
 6 | #include "rabinkarphash.h"
 7 | 
 8 | int main() {
 9 |   const size_t hashercount = 3; // number of hashers run side by side
10 |   const size_t k = 4;           // n-gram length
11 |   typedef KarpRabinHash<> HashFunction;
12 |   // allocate the hashers
13 |   std::vector<std::unique_ptr<HashFunction>> hashPtr(hashercount);
14 |   for (auto &slot : hashPtr) {
15 |     slot.reset(new HashFunction(k, 12));
16 |   }
17 | 
18 |   std::string str = "ACGTAACGT";
19 |   // prime every hasher with the first k characters
20 |   for (size_t j = 0; j < k; j++) {
21 |     for (auto &slot : hashPtr) {
22 |       slot->eat(str[j]);
23 |     }
24 |   }
25 | 
26 |   // print each k-gram followed by one hash value per hasher,
27 |   // then slide all hashers forward by one character
28 |   for (size_t i = 0;; i++) {
29 |     std::cout << str.substr(i, k);
30 |     for (auto &slot : hashPtr) {
31 |       std::cout << ' ' << slot->hashvalue;
32 |     }
33 |     std::cout << std::endl;
34 | 
35 |     if (i + k >= str.size()) {
36 |       break; // the window already ends at the last character
37 |     }
38 |     for (auto &slot : hashPtr) {
39 |       slot->update(str[i], str[i + k]);
40 |     }
41 |   }
42 | 
43 | 
44 |   return 0;
45 | }
46 | 


--------------------------------------------------------------------------------
/include/ztimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef ZTIMER
 2 | #define ZTIMER
 3 | 
 4 | #include <sys/stat.h>
 5 | #include <sys/types.h>
 6 | #ifdef _WIN32
 7 | /*Porting gettimeofday to Windows,
 8 | source: https://www.codefull.net/2015/12/systime-h-replacement-for-windows/
 9 | TODO: Consider simply using std::chrono for timing operations
10 | */
11 | #include <winsock2.h>
12 | 
13 | #define __need_clock_t
14 | #include <sys/timeb.h>
15 | #include <time.h>
16 | typedef long long suseconds_t;
17 | /* Structure describing CPU time used by a process and its children.  */
18 | struct tms {
19 |   clock_t tms_utime; /* User CPU time.  */
20 |   clock_t tms_stime; /* System CPU time.  */
21 |   /* The times() replacement defined in this header always stores 0 in the two fields below. */
22 |   clock_t tms_cutime; /* User CPU time of dead children.  */
23 |   clock_t tms_cstime; /* System CPU time of dead children.  */
24 | };
25 | 
26 | /* Store the CPU time used by this process and all its
27 |    dead children (and their dead children) in BUFFER.
28 |    Return the elapsed real time, or (clock_t) -1 for errors.
29 |    All times are in CLK_TCKths of a second.  */
30 | inline clock_t times(struct tms *__buffer) {
31 |   // inline: defined in a header, so several translation units may see this body; only tms_utime is filled (from clock()), the rest is zeroed
32 |   __buffer->tms_utime = clock();
33 |   __buffer->tms_stime = 0;
34 |   __buffer->tms_cstime = 0;
35 |   __buffer->tms_cutime = 0;
36 |   return __buffer->tms_utime;
37 | }
38 | 
39 | inline int gettimeofday(struct timeval *t, void *timezone) { // inline: header-defined, may be included from several translation units
40 |   struct _timeb timebuffer;
41 |   _ftime(&timebuffer);
42 |   t->tv_sec = timebuffer.time;
43 |   t->tv_usec = 1000 * timebuffer.millitm; // milliseconds to microseconds
44 |   return 0; // the timezone argument is not used
45 | }
46 | 
47 | #else
48 | #include <sys/time.h>
49 | #endif
50 | class ZTimer {
51 | public:
52 |   struct timeval t1, t2; // t1: start of the measurement, t2: last split
53 | 
54 |   // The clock starts at construction.
55 |   ZTimer() { reset(); }
56 | 
57 |   // Restart: both time stamps become "now".
58 |   void reset() {
59 |     gettimeofday(&t1, 0);
60 |     t2 = t1;
61 |   }
62 |   // Milliseconds from t1 to t2; the microsecond difference is truncated.
63 |   int elapsed() {
64 |     return (t2.tv_usec - t1.tv_usec) / 1000 + (t2.tv_sec - t1.tv_sec) * 1000;
65 |   }
66 |   // Stamp t2 with the current time and report the elapsed milliseconds.
67 |   int split() {
68 |     gettimeofday(&t2, 0);
69 |     return elapsed();
70 |   }
71 | };
72 | 
73 | #endif
74 | 


--------------------------------------------------------------------------------
/examples/example6.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This example is from Dmitry Artamonov, it shows that to get the same
 3 |  * hash values for the same substrings, you need to use the same hasher object
 4 |  * (since they are randomized).
 5 |  */
 6 | 
 7 | #include "adler32.h"
 8 | #include "cyclichash.h"
 9 | #include "generalhash.h"
10 | #include "rabinkarphash.h"
11 | #include "threewisehash.h"
12 | #include <cassert>
13 | #include <iostream>
14 | #include <memory>
15 | #include <string>
16 | 
17 | void CalcHashes(const std::string &Inp, const int WindowSize,
18 |                 KarpRabinHash<> &h1, ThreeWiseHash<> &h2, GeneralHash<> &h3,
19 |                 CyclicHash<> &h4, Adler32 &h5) {
20 |   // every hasher starts from a clean state
21 |   h1.reset();
22 |   h2.reset();
23 |   h3.reset();
24 |   h4.reset();
25 |   h5.reset();
26 |   const int InputLength = static_cast<int>(Inp.length());
27 |   int WindowPos = 0; // i modulo WindowSize, only printed
28 |   for (int i = 0; i < InputLength; ++i, WindowPos = (WindowPos + 1) % WindowSize) {
29 |     const unsigned char InChar = Inp[i];
30 |     const bool Eat = (i < WindowSize);
31 |     unsigned char OutChar = ' ';
32 |     if (!Eat) {
33 |       // the window is full: roll it forward by one character
34 |       OutChar = Inp[i - WindowSize];
35 |       h1.update(OutChar, InChar);
36 |       h2.update(OutChar, InChar);
37 |       h3.update(OutChar, InChar);
38 |       h4.update(OutChar, InChar);
39 |       h5.update(OutChar, InChar);
40 |     } else {
41 |       // still filling the first window
42 |       h1.eat(InChar);
43 |       h2.eat(InChar);
44 |       h3.eat(InChar);
45 |       h4.eat(InChar);
46 |       h5.eat(InChar);
47 |     }
48 |     if (i + 1 < WindowSize)
49 |       continue; // no complete window to report yet
50 |     // print one row for the current window, then check h1..h4 against hashing the window from scratch
51 |     auto current = Inp.substr(i + 1 - WindowSize, WindowSize);
52 |     printf("%04d %02d %c %c %06x %06x %06x %06x %06x %c %s \n", i, WindowPos,
53 |            InChar, OutChar, h1.hashvalue, h2.hashvalue, h3.hashvalue,
54 |            h4.hashvalue, h5.hashvalue, Eat ? '*' : ' ', current.c_str());
55 |     assert(h1.hash(current) == h1.hashvalue);
56 |     assert(h2.hash(current) == h2.hashvalue);
57 |     assert(h3.hash(current) == h3.hashvalue);
58 |     assert(h4.hash(current) == h4.hashvalue);
59 |   }
60 | }
61 | 
62 | // ----------------------------------------------------------------------------
63 | 
64 | void Compare() {
65 |   const int WindowSize = 16;
66 |   // one hasher of each kind, all reused for both runs below
67 |   KarpRabinHash<> h1(WindowSize);
68 |   ThreeWiseHash<> h2(WindowSize);
69 |   GeneralHash<> h3(WindowSize);
70 |   CyclicHash<> h4(WindowSize);
71 |   Adler32 h5(WindowSize);
72 | 
73 |   const std::string payload = "Test string for rolling hashes."; // 31 chars
74 |   const std::string preamble = "This is some preamble.";
75 |   // first the payload alone, then the payload behind a preamble
76 |   CalcHashes(payload, WindowSize, h1, h2, h3, h4, h5);
77 |   printf("---------------------------------------\n");
78 |   CalcHashes(preamble + payload, WindowSize, h1, h2, h3, h4, h5);
79 | }
80 | 
81 | int main() { Compare(); } // entry point: runs the comparison once
82 | 


--------------------------------------------------------------------------------
/include/characterhash.h:
--------------------------------------------------------------------------------
 1 | #ifndef CHARACTERHASH
 2 | #define CHARACTERHASH
 3 | 
 4 | typedef unsigned long long uint64;
 5 | typedef unsigned int uint32;
 6 | typedef unsigned int uint;
 7 | 
 8 | #include "mersennetwister.h"
 9 | #include <cassert>
10 | #include <iostream>
11 | #include <stdexcept>
12 | 
13 | using namespace std;
14 | 
15 | class mersenneRNG { // draws integers through MTRand::randInt(n)
16 | public:
17 |   mersenneRNG(uint32 maxval) : mtr(), n(maxval) {}
18 |   uint32 operator()() { return mtr.randInt(n); }   // next draw
19 |   void seed(uint32 seedval) { mtr.seed(seedval); } // reseed from a fixed value
20 |   void seed() { mtr.seed(); }                      // reseed without an explicit value
21 |   uint32 rand_max() { return n; }                  // the bound given to the constructor
22 | 
23 | private:
24 |   MTRand mtr;
25 |   uint32 n; // kept unsigned: an int would turn bounds above INT_MAX negative
26 | };
27 | 
28 | template <typename hashvaluetype>
29 | #if __cplusplus >= 201402L
30 | constexpr
31 | #endif
32 |     hashvaluetype
33 |     maskfnc(int bits) {
34 |   assert(bits > 0);
35 |   assert(size_t(bits) <= sizeof(hashvaluetype) * 8);
36 |   const hashvaluetype top = static_cast<hashvaluetype>(1) << (bits - 1);
37 |   return top | (top - 1); // bit (bits - 1) together with every bit below it
38 | }
39 | // Table holding one random hash value for every possible chartype value; filled at construction.
40 | template <typename hashvaluetype = uint32, typename chartype = unsigned char>
41 | class CharacterHash {
42 | public:
43 |   CharacterHash(hashvaluetype maxval) { // no explicit seed is supplied to the generators
44 |     if (sizeof(hashvaluetype) <= 4) {
45 |       mersenneRNG randomgenerator(maxval);
46 |       for (size_t k = 0; k < nbrofchars; ++k)
47 |         hashvalues[k] = static_cast<hashvaluetype>(randomgenerator());
48 |     } else if (sizeof(hashvaluetype) == 8) {
49 |       mersenneRNG randomgenerator(maxval >> 32); // supplies the upper 32 bits
50 |       mersenneRNG randomgeneratorbase((maxval >> 32) == 0 ? maxval
51 |                                                           : 0xFFFFFFFFU); // supplies the lower 32 bits
52 |       for (size_t k = 0; k < nbrofchars; ++k)
53 |         hashvalues[k] = static_cast<hashvaluetype>(randomgeneratorbase()) |
54 |                         (static_cast<hashvaluetype>(randomgenerator()) << 32);
55 |     } else
56 |       throw runtime_error("unsupported hash value type");
57 |   }
58 |   // Same construction with explicit seeds: seed1 for randomgenerator, seed2 for randomgeneratorbase (seed2 is unused for types of 4 bytes or less).
59 |   CharacterHash(hashvaluetype maxval, uint32 seed1, uint32 seed2) {
60 |     if (sizeof(hashvaluetype) <= 4) {
61 |       mersenneRNG randomgenerator(maxval);
62 |       randomgenerator.seed(seed1);
63 |       for (size_t k = 0; k < nbrofchars; ++k)
64 |         hashvalues[k] = static_cast<hashvaluetype>(randomgenerator());
65 |     } else if (sizeof(hashvaluetype) == 8) {
66 |       mersenneRNG randomgenerator(maxval >> 32);
67 |       mersenneRNG randomgeneratorbase((maxval >> 32) == 0 ? maxval
68 |                                                           : 0xFFFFFFFFU);
69 |       randomgenerator.seed(seed1);
70 |       randomgeneratorbase.seed(seed2);
71 |       for (size_t k = 0; k < nbrofchars; ++k)
72 |         hashvalues[k] = static_cast<hashvaluetype>(randomgeneratorbase()) |
73 |                         (static_cast<hashvaluetype>(randomgenerator()) << 32);
74 |     } else
75 |       throw runtime_error("unsupported hash value type");
76 |   }
77 |   // number of table entries: 2 to the power of the bit width of chartype
78 |   enum { nbrofchars = 1 << (sizeof(chartype) * 8) };
79 |   // the table itself, indexed by character value
80 |   hashvaluetype hashvalues[1 << (sizeof(chartype) * 8)];
81 | };
82 | 
83 | #endif
84 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Randomized rolling hash functions in C++
 2 | [![Ubuntu 22.04 CI (GCC 11)](https://github.com/lemire/rollinghashcpp/actions/workflows/ubuntu22.yml/badge.svg)](https://github.com/lemire/rollinghashcpp/actions/workflows/ubuntu22.yml)
 3 | 
 4 | License: Apache 2.0
 5 | 
 6 | 
 7 | ## What is this?
 8 | 
 9 | This is a set of C++ classes implementing various recursive n-gram hashing techniques, also called rolling hashing (http://en.wikipedia.org/wiki/Rolling_hash), including:
10 | 
11 | *   Randomized Karp-Rabin (sometimes called Rabin-Karp)
12 | *   Hashing by Cyclic Polynomials (also known as Buzhash)
13 | *   Hashing by Irreducible Polynomials
14 | 
15 | This library is used by [khmer](https://github.com/dib-lab/khmer/): the in-memory nucleotide sequence k-mer engine.
16 |  
17 | 
18 | These are randomized hash functions, meaning that each time you create a new hasher instance, you will
19 | get new hash values for a given input.
20 | 
21 | ##  Code sample
22 | ```cpp
23 | 
24 |         const uint n(3);//hash all sequences of 3 characters
25 |         const uint L(7); // you need 7 bits
26 |         CyclicHash<uint32> hf(n,L );// if you want 64-bit values replace uint32 by uint64
27 |         for(uint32 k = 0; k<n;++k) {
28 |                   chartype c = ... ; // grab some character
29 |                   hf.eat(c); // feed it to the hasher
30 |         }
31 |         while(...) { // go over your string
32 |            hf.hashvalue; // at all times, this contains the hash value
33 |            chartype c = ... ;// point to the next character
34 |            chartype out = ...; // character we want to forget
35 |            hf.update(out,c); // update hash value
36 |         }
37 |         hf.reset(); // you can now hash a new string
38 | ```
39 | 
40 | 
41 | ##  Requirements
42 | 
43 | A recent GNU GCC C++ compiler or a recent CLANG.
44 | 
45 | ##  What should I do after I download it?
46 | 
47 | It is a conventional CMake project.
48 | 
49 | ```
50 | cmake -B build
51 | cmake --build build
52 | ctest --test-dir build
53 | ```
54 | 
55 | 
56 | ## Nim version
57 | 
58 | See [Cyclic-Polynomial-Hash](https://github.com/MarcAzar/Cyclic-Polynomial-Hash) for a similar library written in Nim.
59 | 
60 | ##  References
61 | 
62 | * Daniel Lemire, Owen Kaser: Recursive n-gram hashing is pairwise independent, at best, Computer Speech & Language, Volume 24, Issue 4, October 2010, Pages 698-710 http://arxiv.org/abs/0705.4676
63 | * Daniel Lemire, The universality of iterated hashing over variable-length strings, Discrete Applied Mathematics 160 (4-5), 2012. http://arxiv.org/abs/1008.1715
64 | * Owen Kaser and Daniel Lemire, Strongly universal string hashing is fast, Computer Journal (2014) 57 (11): 1624-1638. http://arxiv.org/abs/1202.4961
65 | 
66 | 
67 | This work has been used in genomics, see
68 | 
69 | 
70 | * Ilia Minkin, Son Pham, Paul Medvedev, TwoPaCo: an efficient algorithm to build the compacted de Bruijn graph from many complete genomes, Bioinformatics (to appear). https://doi.org/10.1093/bioinformatics/btw609 and http://github.com/medvedevgroup/TwoPaCo
71 | * Xiaofei Zhao, BinDash, software for fast genome distance estimation on a typical personal laptop, Bioinformatics. https://academic.oup.com/bioinformatics/article/35/4/671/5058094?login=true and https://github.com/zhaoxiaofei/bindash?tab=readme-ov-file 
72 | 


--------------------------------------------------------------------------------
/include/threewisehash.h:
--------------------------------------------------------------------------------
 1 | #ifndef THREEWISEHASH
 2 | #define THREEWISEHASH
 3 | 
 4 | #include "characterhash.h"
 5 | #include <deque>
 6 | #include <vector>
 7 | 
 8 | using namespace std;
 9 | 
10 | /**
11 |  * Each instance is a rolling hash function meant to hash streams of characters.
12 |  * Each new instance of this class comes with new random keys.
13 |  *
14 |  * Recommended usage to get L-bit hash values over n-grams:
15 |  *        ThreeWiseHash<> hf(n,L );
16 |  *        for(uint32 k = 0; k<n;++k) {
17 |  *                  unsigned char c = ... ; // grab some character
18 |  *                  hf.eat(c); // feed it to the hasher
19 |  *        }
20 |  *        while(...) { // go over your string
21 |  *           hf.hashvalue; // at all times, this contains the hash value
22 |  *           unsigned char c = ... ;// points to the next character
23 |  *           unsigned char out = ...; // character we want to forget
24 |  *           hf.update(out,c); // update hash value
25 |  *        }
26 |  */
27 | template <typename hashvaluetype = uint32, typename chartype = unsigned char>
28 | class ThreeWiseHash {
29 | 
30 | public:
31 |   // myn is the length of the sequences, e.g., 3 means that you want to hash
32 |   // sequences of 3 characters; mywordsize is the number of bits you wish to
33 |   // receive as hash values, e.g., 19 means that the hash values are 19-bit
34 |   // integers. Throws if mywordsize exceeds the bit width of hashvaluetype.
35 |   ThreeWiseHash(int myn, int mywordsize = 19)
36 |       : hashvalue(0), n(myn), wordsize(mywordsize), hashers(), hasher(0) {
37 |     if (static_cast<uint>(wordsize) > 8 * sizeof(hashvaluetype)) {
38 |       cerr << "Can't create " << wordsize << "-bit hash values" << endl;
39 |       throw "abord";
40 |     }
41 |     for (int i = 0; i < n; ++i) { // one independent character table per position
42 |       CharacterHash<hashvaluetype, chartype> ch(
43 |           maskfnc<hashvaluetype>(wordsize));
44 |       hashers.push_back(ch);
45 |     }
46 |   }
47 | 
48 |   // add inchar as an input, this is used typically only at the start
49 |   // the hash value is updated to that of a longer string (one where inchar was
50 |   // appended)
51 |   void eat(chartype inchar) {
52 |     ngram.push_back(inchar);
53 |     __updateHashValue();
54 |   }
55 | 
56 |   // add inchar as an input and remove the oldest character, the hashvalue is
57 |   // updated: goes from the hash value of [outchar]ABC to that of ABC[inchar];
58 |   // the first argument is unused because the stored n-gram already knows it
59 |   void update(chartype, chartype inchar) {
60 |     ngram.push_back(inchar);
61 |     ngram.pop_front();
62 |     __updateHashValue();
63 |   }
64 | 
65 |   // prepare to process a new string, you will need to call "eat" again
66 |   void reset() {
67 |     hashvalue = 0;
68 |     ngram.clear();
69 |   }
70 |   // recompute hashvalue from scratch: XOR of table k looked up at character k
71 |   void __updateHashValue() {
72 |     hashvalue = 0;
73 |     for (size_t k = 0; k < ngram.size(); ++k) {
74 |       hashvalue ^= hashers[k].hashvalues[ngram[k]];
75 |     }
76 |   }
77 | 
78 |   // this is a convenience function, use eat,update and .hashvalue to use as a
79 |   // rolling hash function; c must not hold more than n characters
80 |   template <class container> hashvaluetype hash(container &c) {
81 |     hashvaluetype answer(0);
82 |     for (size_t k = 0; k < c.size(); ++k) {
83 |       answer ^= hashers[k].hashvalues[static_cast<chartype>(c[k])]; // cast: a plain char may be negative
84 |     }
85 |     return answer;
86 |   }
87 | 
88 |   hashvaluetype hashvalue;
89 |   int n;
90 |   const int wordsize;
91 |   deque<chartype> ngram;
92 |   vector<CharacterHash<hashvaluetype, chartype>> hashers;
93 |   CharacterHash<hashvaluetype, chartype> hasher; // placeholder
94 | };
95 | 
96 | #endif
97 | 


--------------------------------------------------------------------------------
/benchmarks/speedtesting.cpp:
--------------------------------------------------------------------------------
  1 | #include "cyclichash.h"
  2 | #include "generalhash.h"
  3 | #include "rabinkarphash.h"
  4 | #include "threewisehash.h"
  5 | #include "ztimer.h"
  6 | #include <fstream>
  7 | #include <string>
  8 | 
  9 | using namespace std;
 10 | 
 11 | template <class hashfunction>
 12 | double hashALot(int n, int L, uint ttimes, uint sizeoftest,
 13 |                 vector<uint32> &recorder) {
 14 |   // Times ttimes passes over sizeoftest synthetic characters (character k is k cut to a byte) and returns seconds per pass.
 15 |   const uint window = static_cast<uint>(n);
 16 |   ZTimer t;
 17 |   uint pass = 0;
 18 |   while (pass < ttimes) {
 19 |     hashfunction hf(n, L);
 20 |     uint k = 0;
 21 |     for (; k < window; ++k)
 22 |       hf.eat(static_cast<unsigned char>(k));
 23 |     for (; k < sizeoftest; ++k)
 24 |       hf.update(static_cast<unsigned char>(k - window),
 25 |                 static_cast<unsigned char>(k));
 26 |     // storing the result stops the compiler from discarding the hashing as dead code
 27 |     recorder.push_back(hf.hashvalue);
 28 |     ++pass;
 29 |   }
 30 |   return t.split() / (1000.0 * ttimes);
 31 | }
 32 | 
 33 | template <class hashfunction>
 34 | double hashALot(int n, int L, uint ttimes, vector<uint32> &recorder,
 35 |                 vector<unsigned char> &data) {
 36 |   // Times ttimes passes of a fresh hasher over data; returns the total in seconds (not divided by ttimes).
 37 |   ZTimer t;
 38 |   for (uint times = 0; times < ttimes; ++times) {
 39 |     hashfunction hf(n, L);
 40 |     // never read past the end when data holds fewer than n characters
 41 |     // (for instance when the input file was empty or could not be opened)
 42 |     for (uint k = 0; k < static_cast<uint>(n) && k < data.size(); ++k) {
 43 |       hf.eat(data[k]);
 44 |     }
 45 |     for (uint k = n; k < data.size(); ++k) {
 46 |       hf.update(data[k - n], data[k]);
 47 |     }
 48 |     // recording the value keeps the compiler from optimizing the hashing away
 49 |     recorder.push_back(hf.hashvalue);
 50 |   }
 51 |   return t.split() / 1000.0;
 52 | }
 53 | 
 54 | void synthetic() {
 55 |   const int L = 19;                  // passed to the hashers as their second argument
 56 |   const uint sizeoftest = 100000000; // characters hashed per measurement
 57 |   vector<uint32> recorder;
 58 |   cout << "#n three-wise General BufferedGeneral Cyclic Karp-Rabin " << endl;
 59 |   for (uint n = 1; n + L <= 32; ++n) { // one row per n, one timing per hasher
 60 |     cout << n << " ";
 61 |     cout << hashALot<ThreeWiseHash<>>(n, L, 1, sizeoftest, recorder) << " ";
 62 |     cout << hashALot<GeneralHash<NOPRECOMP>>(n, L, 1, sizeoftest, recorder)
 63 |          << " ";
 64 |     cout << hashALot<GeneralHash<FULLPRECOMP>>(n, L, 1, sizeoftest, recorder)
 65 |          << " ";
 66 |     cout << hashALot<CyclicHash<>>(n, L + n, 1, sizeoftest, recorder) << " ";
 67 |     cout << hashALot<KarpRabinHash<>>(n, L, 1, sizeoftest, recorder) << endl;
 68 |   }
 69 |   cout << "# L= " << L << " char-length= " << sizeoftest << endl;
 70 | }
 71 | 
 72 | void grabFileContent(vector<unsigned char> &data, string filename) {
 73 |   ifstream file(filename.c_str());
 74 |   string line;
 75 |   std::getline(file, line); // the first line is read but never stored
 76 |   while (file.good()) {
 77 |     // append the characters of the next line (line terminators are not kept)
 78 |     std::getline(file, line);
 79 |     data.insert(data.end(), line.begin(), line.end());
 80 |   }
 81 |   file.close();
 82 | }
 83 | void realdata(string filename) {
 84 |   // Same table as synthetic(), but the hashed characters come from the given file.
 85 |   const int L = 19;
 86 |   const uint repeats = 1;
 87 |   vector<uint32> recorder;
 88 |   vector<unsigned char> data;
 89 |   grabFileContent(data, filename);
 90 |   cout << "#n three-wise General BufferedGeneral Cyclic Karp-Rabin " << endl;
 91 |   // one row per n, one timing per hasher
 92 |   for (uint n = 1; n + L <= 32; ++n) {
 93 |     cout << n << " ";
 94 |     cout << hashALot<ThreeWiseHash<>>(n, L, repeats, recorder, data) << " ";
 95 |     cout << hashALot<GeneralHash<NOPRECOMP>>(n, L, repeats, recorder, data) << " ";
 96 |     cout << hashALot<GeneralHash<FULLPRECOMP>>(n, L, repeats, recorder, data) << " ";
 97 |     cout << hashALot<CyclicHash<>>(n, L + n, repeats, recorder, data) << " ";
 98 |     cout << hashALot<KarpRabinHash<>>(n, L, repeats, recorder, data) << endl;
 99 |   }
100 |   cout << "# L= " << L << " char-length= " << data.size()
101 |        << " repeats=" << repeats << endl;
102 | }
103 | 
104 | int main(int params, char **args) {
105 |   // without a command-line argument the synthetic benchmark runs; otherwise
106 |   if (params != 1) // args[1] is taken as the data file to hash
107 |     realdata(args[1]);
108 |   else
109 |     synthetic();
110 |   return 0;
111 | }
112 | 


--------------------------------------------------------------------------------
/examples/example5.cpp:
--------------------------------------------------------------------------------
  1 | #include "cyclichash.h"
  2 | #include <cassert>
  3 | #include <iostream>
  4 | #include <memory>
  5 | #include <string>
  6 | 
  7 | /*
  8 | An issue is application-specific and has to do with the nature of DNA. Even
  9 | though we usually represent DNA as a string of characters (such as `GATTACA`),
 10 | this is really only half the story. DNA is double stranded with `A` pairing
 11 | to `T` and `C` pairing to `G`, so the string `GATTACA` really represents the
 12 | following molecule.
 13 | 
 14 | ```
 15 | gattaca
 16 | |||||||
 17 | ɔʇɐɐʇƃʇ
 18 | ```
 19 | 
 20 | In most contexts, we have no way of knowing whether the original piece of DNA
 21 | sampled was from the top strand or the bottom strand, and so when we hash DNA
 22 | sequences we typically want the two complementary sequences to hash to the
 23 | same value.
 24 | 
 25 | I used two cyclic hashes: one for the "top" strand of DNA (observed from the
 26 | provided string, updated using forward updates) and one for the "bottom" strand
 27 | (inferred from the provided string, updated using reverse updates). Then to get
 28 | the hash for a particular k-mer (n-gram) in the DNA, I just XOR the current
 29 | forward and reverse hashes.
 30 | */
 31 | 
 32 | // Maps a nucleotide to its complement, case-insensitively: A->T, T->A, C->G;
 33 | // every other character (including G) yields 'C'. `ch` may be evaluated
 34 | // several times, so pass an expression without side effects.
 35 | // Daniel: This is probably inefficient. Needlessly so. If efficiency matters,
 36 | // you want to define the character hash so that it takes the key 'A' to the
 37 | // hash value of 'T' and so forth.
 38 | #define nucleotide_complement(ch)                                              \
 39 |   ((toupper(ch)) == 'A'   ? 'T'                                                \
 40 |    : (toupper(ch)) == 'T' ? 'A'                                                \
 41 |    : (toupper(ch)) == 'C' ? 'G'                                                \
 42 |                           : 'C')
 43 | 
 44 | // A sequence and its reverse complement (such as "GATTACA" and "TGTAATC") are
 45 | // biologically identical and should hash to the same value, hence the XOR of
 46 | // the two strand hashes. When both hashes are equal the XOR would be 0, so
 47 | // the shared value is returned instead. Arguments and result are fully
 48 | // parenthesized; each argument may be evaluated more than once.
 49 | #define canonical_hash(fwd, rev) ((fwd) == (rev) ? (rev) : ((fwd) ^ (rev)))
 50 | 
 51 | #define WORDSIZE 5 // k-mer length that fullhash() asserts on
 52 | #define SEED1 42 // first seed handed to every CyclicHash
 53 | #define SEED2 1985 // second seed handed to every CyclicHash
 54 | #define HASHBITS 64 // word size (in bits) requested from CyclicHash
 55 | 
 56 | // Reference hash of one k-mer computed from scratch; must be WORDSIZE long.
 57 | uint64_t fullhash(const string &input) {
 58 |   assert(input.size() == WORDSIZE);
 59 |   CyclicHash<uint64_t> forward(input.size(), SEED1, SEED2, HASHBITS);
 60 |   CyclicHash<uint64_t> reverse(input.size(), SEED1, SEED2, HASHBITS);
 61 |   for (string::size_type j = 0; j < input.size(); j++) { // unsigned index
 62 |     forward.eat(input[j]);
 63 |     reverse.eat(nucleotide_complement(input[input.size() - 1 - j])); // backward
 64 |   }
 65 |   return canonical_hash(forward.hashvalue, reverse.hashvalue);
 66 | }
 67 | 
 68 | // Checks the rolling canonical hash of each k-mer of `input` against
 69 | // fullhash(), printing every k-mer with its hash. k is the k-gram size.
 70 | void demo(int k, string input) {
 71 |   if (k <= 0 || input.size() < static_cast<string::size_type>(k)) {
 72 |     std::cerr << "demo: need 0 < k <= input length" << std::endl;
 73 |     return; // eating k characters would read past the end of input
 74 |   }
 75 |   // Hash the first k-mer: the forward strand and its reverse complement.
 76 |   CyclicHash<uint64_t> forward(k, SEED1, SEED2, HASHBITS);
 77 |   CyclicHash<uint64_t> reverse(k, SEED1, SEED2, HASHBITS);
 78 |   for (int j = 0; j < k; j++) {
 79 |     forward.eat(input[j]);
 80 |     reverse.eat(nucleotide_complement(input[k - 1 - j])); // going backward
 81 |   }
 82 |   uint64_t hashval = canonical_hash(forward.hashvalue, reverse.hashvalue);
 83 |   assert(fullhash(input.substr(0, k)) == hashval);
 84 |   std::cout << input.substr(0, k) << " " << hashval << std::endl;
 85 |   // Slide the window; every rolling hash is compared with the full hash.
 86 |   for (string::size_type j = k; j < input.size(); j++) {
 87 |     forward.update(input[j - k], input[j]);
 88 |     // note: you need to flip the parameters of reverse_update
 89 |     reverse.reverse_update(nucleotide_complement(input[j]),
 90 |                            nucleotide_complement(input[j - k]));
 91 |     hashval = canonical_hash(forward.hashvalue, reverse.hashvalue);
 92 |     assert(fullhash(input.substr(j - k + 1, k)) == hashval);
 93 |     std::cout << input.substr(j - k + 1, k) << " " << hashval << std::endl;
 94 |   }
 95 | }
 96 | 
 97 | int main(int argc, char *argv[]) {
 98 |   demo(WORDSIZE, "GATTACACAATAGCAAATT"); // fullhash() asserts this length
 99 |   std::cout << " code looks good " << std::endl;
100 |   return 0;
101 | }
102 | 


--------------------------------------------------------------------------------
/include/generalhash.h:
--------------------------------------------------------------------------------
  1 | #ifndef GENERALHASH
  2 | #define GENERALHASH
  3 | 
  4 | #include <iostream>
  5 | #include <vector>
  6 | 
  7 | #include "characterhash.h"
  8 | 
  9 | using namespace std;
 10 | 
 11 | enum { NOPRECOMP, FULLPRECOMP };
 12 | 
 13 | /**
 14 |  * Each instance is a rolling hash function meant to hash streams of characters.
 15 |  * Each new instance of this class comes with new random keys.
 16 |  *
 17 |  * Recommended usage to get L-bit hash values over n-grams:
 18 |  *        GeneralHash<> hf(n,L );
 19 |  *        for(uint32 k = 0; k<n;++k) {
 20 |  *                  unsigned char c = ... ; // grab some character
 21 |  *                  hf.eat(c); // feed it to the hasher
 22 |  *        }
 23 |  *        while(...) { // go over your string
 24 |  *           hf.hashvalue; // at all times, this contains the hash value
 25 |  *           unsigned char c = ... ;// points to the next character
 26 |  *           unsigned char out = ...; // character we want to forget
 27 |  *           hf.update(out,c); // update hash value
 28 |  *        }
 29 |  */
 30 | template <int precomputationtype = NOPRECOMP, typename hashvaluetype = uint32,
 31 |           typename chartype = unsigned char>
 32 | class GeneralHash {
 33 | public:
 34 |   // myn: length of the hashed sequences (e.g., 3 hashes 3-character windows).
 35 |   // mywordsize: number of bits per hash value. Only 19 and 9 come with an
 36 |   // irreducible polynomial; any other value is reported on cerr and leaves
 37 |   // irreduciblepoly at 0.
 38 |   GeneralHash(int myn, int mywordsize = 19)
 39 |       : hashvalue(0), wordsize(mywordsize), n(myn), irreduciblepoly(0),
 40 |         hasher(maskfnc<hashvaluetype>(wordsize)),
 41 |         lastbit(static_cast<hashvaluetype>(1) << wordsize),
 42 |         precomputedshift(precomputationtype == FULLPRECOMP ? (1 << n) : 0) {
 43 |     switch (wordsize) {
 44 |     case 19:
 45 |       irreduciblepoly = 1 + (1 << 1) + (1 << 2) + (1 << 5) + (1 << 19);
 46 |       break;
 47 |     case 9:
 48 |       irreduciblepoly = 1 + (1 << 2) + (1 << 3) + (1 << 5) + (1 << 9);
 49 |       break;
 50 |     default:
 51 |       cerr << "unsupported wordsize " << wordsize << " bits, try 19 or 9"
 52 |            << endl;
 53 |     }
 54 |     // the table is empty unless FULLPRECOMP was selected: no iteration then
 55 |     for (hashvaluetype x = 0; x < precomputedshift.size(); ++x) {
 56 |       hashvaluetype leftover = x << (wordsize - n);
 57 |       fastleftshift(leftover, n);
 58 |       precomputedshift[x] = leftover;
 59 |     }
 60 |   }
 61 |   // prepare to process a new string, you will need to call "eat" again
 62 |   void reset() { hashvalue = 0; }
 63 | 
 64 |   // shifts x left by one bit, r times; each time bit number `wordsize` ends
 65 |   // up set, irreduciblepoly is XORed into x
 66 |   void fastleftshift(hashvaluetype &x, int r) const {
 67 |     for (int remaining = r; remaining > 0; --remaining) {
 68 |       x <<= 1;
 69 |       const bool carried = (x & lastbit) == lastbit;
 70 |       if (carried)
 71 |         x ^= irreduciblepoly;
 72 |     }
 73 |   }
 74 | 
 75 |   // shifts x by n positions in one step with the help of precomputedshift
 76 |   void fastleftshiftn(hashvaluetype &x) const {
 77 |     // the bits of x above position wordsize - n select the table entry ...
 78 |     const hashvaluetype lookedup = precomputedshift[x >> (wordsize - n)];
 79 |     // ... and the remaining bits are shifted and kept below lastbit
 80 |     const hashvaluetype shifted = (x << n) & (lastbit - 1);
 81 |     x = lookedup ^ shifted;
 82 |   }
 83 | 
 84 |   // add inchar as an input and remove outchar, the hashvalue is updated
 85 |   // this function can be used to update the hash value from the hash value of
 86 |   // [outchar]ABC to the hash value of ABC[inchar]
 87 |   void update(chartype outchar, chartype inchar) {
 88 |     fastleftshift(hashvalue, 1);
 89 |     hashvaluetype z(hasher.hashvalues[outchar]);
 90 |     // the compiler should optimize away this compile-time choice
 91 |     if (precomputationtype == FULLPRECOMP)
 92 |       fastleftshiftn(z);
 93 |     else
 94 |       fastleftshift(z, n);
 95 |     hashvalue ^= z ^ hasher.hashvalues[inchar];
 96 |   }
 97 | 
 98 |   // feeds one more character, typically only while priming the first window:
 99 |   // the hash value becomes that of the longer string (the one where inchar
100 |   // was appended)
101 |   void eat(chartype inchar) {
102 |     fastleftshift(hashvalue, 1);
103 |     hashvalue ^= hasher.hashvalues[inchar];
104 |   }
105 | 
106 |   // convenience function hashing a whole container from scratch; use eat,
107 |   // update and .hashvalue to use this class as a rolling hash function
108 |   template <class container> hashvaluetype hash(container &c) const {
109 |     hashvaluetype answer(0);
110 |     for (uint k = 0; k < c.size(); ++k) {
111 |       fastleftshift(answer, 1);
112 |       answer ^= hasher.hashvalues[c[k]];
113 |     }
114 |     return answer;
115 |   }
116 | 
117 |   hashvaluetype hashvalue;
118 |   const int wordsize;
119 |   int n;
120 |   hashvaluetype irreduciblepoly;
121 |   CharacterHash<hashvaluetype, chartype> hasher;
122 |   const hashvaluetype lastbit;
123 |   vector<hashvaluetype> precomputedshift;
124 | };
125 | 
126 | #endif
127 | 


--------------------------------------------------------------------------------
/include/cyclichash.h:
--------------------------------------------------------------------------------
  1 | #ifndef CYCLICHASH
  2 | #define CYCLICHASH
  3 | 
  4 | #include "characterhash.h"
  5 | 
  6 | /**
  7 |  * Each instance is a rolling hash function meant to hash streams of characters.
  8 |  * Each new instance of this class comes with new random keys.
  9 |  *
 10 |  * Recommended usage to get L-bit hash values over n-grams:
 11 |  *        CyclicHash<> hf(n,L );
 12 |  *        for(uint32 k = 0; k<n;++k) {
 13 |  *                  unsigned char c = ... ; // grab some character
 14 |  *                  hf.eat(c); // feed it to the hasher
 15 |  *        }
 16 |  *        while(...) { // go over your string
 17 |  *           hf.hashvalue; // at all times, this contains the hash value
 18 |  *           unsigned char c = ... ;// points to the next character
 19 |  *           unsigned char out = ...; // character we want to forget
 20 |  *           hf.update(out,c); // update hash value
 21 |  *        }
 22 |  */
 23 | template <typename hashvaluetype = uint32, typename chartype = unsigned char>
 24 | class CyclicHash {
 25 | public:
 26 |   // myn is the length of the sequences, e.g., 3 means that you want to hash
 27 |   // sequences of 3 characters; mywordsize is the number of bits you wish to
 28 |   // receive as hash values, e.g., 19 means that the hash values are 19-bit
 29 |   // integers. Throws "abord" if hashvaluetype has fewer bits than mywordsize.
 30 |   CyclicHash(int myn, int mywordsize = 19)
 31 |       : hashvalue(0), n(myn), wordsize(mywordsize),
 32 |         hasher(maskfnc<hashvaluetype>(wordsize)),
 33 |         mask1(maskfnc<hashvaluetype>(wordsize - 1)), myr(n % wordsize),
 34 |         maskn(maskfnc<hashvaluetype>(wordsize - myr)) {
 35 |     if (static_cast<uint>(wordsize) > 8 * sizeof(hashvaluetype)) {
 36 |       cerr << "Can't create " << wordsize << "-bit hash values" << endl;
 37 |       throw "abord";
 38 |     }
 39 |   }
 40 | 
 41 |   // same as the constructor above, with two seeds handed to the hasher
 42 |   CyclicHash(int myn, uint32 seed1, uint32 seed2, int mywordsize = 19)
 43 |       : hashvalue(0), n(myn), wordsize(mywordsize),
 44 |         hasher(maskfnc<hashvaluetype>(wordsize), seed1, seed2),
 45 |         mask1(maskfnc<hashvaluetype>(wordsize - 1)), myr(n % wordsize),
 46 |         maskn(maskfnc<hashvaluetype>(wordsize - myr)) {
 47 |     if (static_cast<uint>(wordsize) > 8 * sizeof(hashvaluetype)) {
 48 |       cerr << "Can't create " << wordsize << "-bit hash values" << endl;
 49 |       throw "abord";
 50 |     }
 51 |   }
 52 | 
 53 |   // rotates x left by myr = n % wordsize positions within wordsize bits. A
 54 |   // rotation by zero must leave x as it is and is skipped: x >> wordsize is
 55 |   // undefined when wordsize is the full bit width of hashvaluetype.
 56 |   void fastleftshiftn(hashvaluetype &x) const {
 57 |     if (myr != 0)
 58 |       x = ((x & maskn) << myr) | (x >> (wordsize - myr));
 59 |   }
 60 | 
 61 |   // in-place versions of getfastleftshift1 and getfastrightshift1
 62 |   void fastleftshift1(hashvaluetype &x) const { x = getfastleftshift1(x); }
 63 |   void fastrightshift1(hashvaluetype &x) const { x = getfastrightshift1(x); }
 64 | 
 65 |   // returns x rotated left by one bit within wordsize bits
 66 |   hashvaluetype getfastleftshift1(hashvaluetype x) const {
 67 |     return ((x & mask1) << 1) | (x >> (wordsize - 1));
 68 |   }
 69 | 
 70 |   // returns x rotated right by one bit within wordsize bits
 71 |   hashvaluetype getfastrightshift1(hashvaluetype x) const {
 72 |     return (x >> 1) | ((x & 1) << (wordsize - 1));
 73 |   }
 74 | 
 75 |   // this is a convenience function, use eat,update and .hashvalue to use as a
 76 |   // rolling hash function
 77 |   template <class container> hashvaluetype hash(container &c) const {
 78 |     hashvaluetype answer(0);
 79 |     for (uint k = 0; k < c.size(); ++k) {
 80 |       fastleftshift1(answer);
 81 |       answer ^= hasher.hashvalues[static_cast<unsigned int>(c[k])];
 82 |     }
 83 |     return answer;
 84 |   }
 85 | 
 86 |   // character hash of outchar, rotated left n times (n is the argument here)
 87 |   hashvaluetype hashz(chartype outchar, uint n) const {
 88 |     hashvaluetype answer =
 89 |         hasher.hashvalues[static_cast<unsigned int>(outchar)];
 90 |     for (uint k = 0; k < n; ++k) {
 91 |       fastleftshift1(answer);
 92 |     }
 93 |     return answer;
 94 |   }
 95 | 
 96 |   // add inchar as an input and remove outchar: moves the hash value from
 97 |   // that of [outchar]ABC to that of ABC[inchar]
 98 |   void update(chartype outchar, chartype inchar) {
 99 |     hashvaluetype z(hasher.hashvalues[outchar]);
100 |     fastleftshiftn(z);
101 |     hashvalue = getfastleftshift1(hashvalue) ^ z ^ hasher.hashvalues[inchar];
102 |   }
103 | 
104 |   // the reverse of update: moves the hash value from that of ABC[inchar]
105 |   // to that of [outchar]ABC
106 |   void reverse_update(chartype outchar, chartype inchar) {
107 |     hashvaluetype z(hasher.hashvalues[outchar]);
108 |     fastleftshiftn(z);
109 |     hashvalue ^= z ^ hasher.hashvalues[inchar];
110 |     hashvalue = getfastrightshift1(hashvalue);
111 |   }
112 | 
113 |   // add inchar as an input, typically only at the start: the hash value
114 |   // becomes that of the longer string where inchar was appended
115 |   void eat(chartype inchar) {
116 |     fastleftshift1(hashvalue);
117 |     hashvalue ^= hasher.hashvalues[inchar];
118 |   }
119 | 
120 |   // for an n-gram X it returns hash value of (n + 1)-gram XY without changing
121 |   // the object X. For example, if X = "ABC", then X.hash_extend("D") returns
122 |   // value of "ABCD" without changing the state of X
123 |   hashvaluetype hash_extend(chartype Y) const {
124 |     return getfastleftshift1(hashvalue) ^ hasher.hashvalues[Y];
125 |   }
126 | 
127 |   // same as hash_extend, but with prepending the n-gram with character Y. If
128 |   // X = "ABC", then X.hash_prepend("D") returns value of "DABC" without
129 |   // changing the state of X
130 |   hashvaluetype hash_prepend(chartype Y) const {
131 |     hashvaluetype z(hasher.hashvalues[Y]);
132 |     fastleftshiftn(z);
133 |     return z ^ hashvalue;
134 |   }
135 | 
136 |   // prepare to process a new string, you will need to call "eat" again
137 |   void reset() { hashvalue = 0; }
138 | 
139 |   hashvaluetype hashvalue;
140 |   int n;
141 |   const int wordsize;
142 |   CharacterHash<hashvaluetype, chartype> hasher;
143 |   const hashvaluetype mask1;
144 |   const int myr;
145 |   const hashvaluetype maskn;
146 | };
147 | 
148 | #endif
149 | 


--------------------------------------------------------------------------------
/tests/unit.cpp:
--------------------------------------------------------------------------------
  1 | #include <deque>
  2 | #include <map>
  3 | 
  4 | #include "cyclichash.h"
  5 | #include "generalhash.h"
  6 | #include "rabinkarphash.h"
  7 | 
  8 | #include "threewisehash.h"
  9 | 
 10 | using namespace std;
 11 | 
 12 | template <class hashfunction> bool testExtendAndPrepend(uint L = 19) {
 13 |   // Eats the 4-gram "ABCD", then checks that hash_prepend and hash_extend
 14 |   // agree with hashing "XABCD" and "ABCDY" from scratch.
 15 |   const uint n(4); // n-grams
 16 |   hashfunction hf(n, L);
 17 |   string input = "XABCDY";
 18 |   string base = input.substr(1, input.size() - 2);
 19 |   string extend = input.substr(1);
 20 |   string prepend = input.substr(0, input.size() - 1);
 21 |   assert(base.size() == n);
 22 |   for (string::size_type k = 0; k < base.size(); ++k)
 23 |     hf.eat(base[k]);
 24 |   if (hf.hash(base) != hf.hashvalue) {
 25 |     std::cout << "bug!" << std::endl;
 26 |     std::cout << base << " " << hf.hash(base) << std::endl;
 27 |     return false;
 28 |   }
 29 |   const char first = input[0];
 30 |   if (hf.hash(prepend) != hf.hash_prepend(first)) {
 31 |     std::cout << "bug!" << std::endl;
 32 |     std::cout << prepend << " " << hf.hash_prepend(first) << " "
 33 |               << hf.hash(prepend) << std::endl;
 34 |     return false;
 35 |   }
 36 |   const char last = input.back();
 37 |   if (hf.hash(extend) != hf.hash_extend(last)) {
 38 |     std::cout << "bug!" << std::endl;
 39 |     std::cout << extend << " " << hf.hash_extend(last) << " "
 40 |               << hf.hash(extend) << std::endl;
 41 |     return false;
 42 |   }
 43 |   assert(hf.hashvalue == hf.hash(base));
 44 |   assert(hf.hash_prepend(first) == hf.hash(prepend));
 45 |   assert(hf.hash_extend(last) == hf.hash(extend));
 46 |   return true;
 47 | }
 48 | 
 49 | template <class hashfunction> bool isItAFunction(uint L = 7) {
 50 |   // the rolling value must equal the from-scratch hash of the window each step
 51 |   mersenneRNG generator(5);
 52 |   const uint n(3); // n-grams
 53 |   hashfunction hf(n, L);
 54 |   deque<unsigned char> s;
 55 |   while (s.size() < n) { // prime the window with the first n characters
 56 |     unsigned char c = static_cast<unsigned char>(generator() + 65);
 57 |     s.push_back(c);
 58 |     hf.eat(c);
 59 |   }
 60 |   for (uint32 step = 0; step != 100000; ++step) {
 61 |     unsigned char out = s.front();
 62 |     s.pop_front();
 63 |     char c(generator() + 65);
 64 |     s.push_back(c);
 65 |     hf.update(out, c);
 66 |     if (hf.hash(s) == hf.hashvalue)
 67 |       continue; // this step is consistent, move on
 68 |     for (deque<unsigned char>::iterator ii = s.begin(); ii != s.end(); ++ii)
 69 |       cout << *ii << " " << static_cast<uint32>(*ii) << endl;
 70 |     cerr << "bug" << endl;
 71 |     cerr << s[0] << s[1] << s[2] << " was hashed to " << hf.hashvalue
 72 |          << " when true hash value is " << hf.hash(s) << endl;
 73 |     for (uint j = 0; j < n; ++j)
 74 |       cerr << s[j] << "->" << hf.hasher.hashvalues[s[j]] << endl;
 75 |     return false;
 76 |   }
 77 |   return true;
 78 | }
 79 | 
 80 | template <class hashfunction> bool doesReverseUpdateWorks(uint L = 7) {
 81 |   // update, reverse_update, update must end where a single update would
 82 |   mersenneRNG generator(5);
 83 |   const uint n(3); // n-grams
 84 |   hashfunction hf(n, L);
 85 |   deque<unsigned char> s;
 86 |   while (s.size() < n) { // prime the window with the first n characters
 87 |     unsigned char c = static_cast<unsigned char>(generator() + 65);
 88 |     s.push_back(c);
 89 |     hf.eat(c);
 90 |   }
 91 |   for (uint32 step = 0; step != 100000; ++step) {
 92 |     unsigned char out = s.front();
 93 |     s.pop_front();
 94 |     char c(generator() + 65);
 95 |     s.push_back(c);
 96 |     hf.update(out, c);
 97 |     hf.reverse_update(out, c);
 98 |     hf.update(out, c);
 99 |     if (hf.hash(s) != hf.hashvalue)
100 |       return false;
101 |   }
102 |   return true;
103 | }
104 | 
105 | template <class hashfunction> bool isItRandom(uint L = 19) {
106 |   // passes once a fresh instance hashes bytes 0..4 differently from `base`
107 |   cout << "checking that it is randomized " << endl;
108 |   int n = 5;
109 |   vector<unsigned char> data;
110 |   for (int k = 0; k < n; ++k)
111 |     data.push_back(static_cast<unsigned char>(k));
112 |   hashfunction base(n, L);
113 |   const uint64 x = base.hash(data);
114 |   for (int attempt = 0; attempt < 100; ++attempt) {
115 |     hashfunction hf(n, L);
116 |     const uint64 y = hf.hash(data);
117 |     if (y == x) {
118 |       cout << "collision " << y << endl;
119 |       continue;
120 |     }
121 |     cout << "It is randomized! " << endl;
122 |     return true;
123 |   }
124 |   cout << "Not randomized! " << endl;
125 |   return false; // all 100 instances hashed to the same value (this is bad)
126 | }
127 | 
128 | bool test() { // runs the checks of every hash family; true when all pass
129 |   bool ok(true);
130 |   cout << "Karp-Rabin" << endl;
131 |   for (uint L = 1; L <= 32; ++L) {
132 |     if (!ok)
133 |       return false;
134 |     ok &= isItAFunction<KarpRabinHash<>>(L); // test each word size L
135 |   }
136 |   ok &= isItRandom<KarpRabinHash<>>();
137 |   for (uint L = 1; L <= 64; ++L) {
138 |     if (!ok)
139 |       return false;
140 |     ok &= isItAFunction<KarpRabinHash<uint64>>(L); // test each word size L
141 |   }
142 |   ok &= isItRandom<KarpRabinHash<uint64>>();
143 |   if (!ok)
144 |     return false;
145 |   cout << "cyclic" << endl;
146 |   for (uint L = 2; L <= 32; ++L) {
147 |     if (!ok)
148 |       return false;
149 |     ok &= testExtendAndPrepend<CyclicHash<>>(L);
150 |     ok &= isItAFunction<CyclicHash<>>(L);
151 |     ok &= doesReverseUpdateWorks<CyclicHash<>>(L);
152 |   }
153 |   for (uint L = 2; L <= 64; ++L) {
154 |     if (!ok)
155 |       return false;
156 |     ok &= testExtendAndPrepend<CyclicHash<uint64>>(L);
157 |     ok &= isItAFunction<CyclicHash<uint64>>(L);
158 |   }
159 |   ok &= isItRandom<CyclicHash<>>();
160 |   ok &= isItRandom<CyclicHash<uint64>>();
161 | 
162 |   cout << "three-wise" << endl;
163 |   for (uint L = 1; L <= 32; ++L) {
164 |     ok &= isItAFunction<ThreeWiseHash<>>(L);
165 |   }
166 |   ok &= isItRandom<ThreeWiseHash<>>();
167 |   for (uint L = 1; L <= 64; ++L) {
168 |     ok &= isItAFunction<ThreeWiseHash<uint64>>(L);
169 |   }
170 |   ok &= isItRandom<ThreeWiseHash<uint64>>();
171 | 
172 |   cout << "general" << endl; // GeneralHash only supports 9 and 19 bits
173 |   ok &= isItAFunction<GeneralHash<NOPRECOMP>>(9);
174 |   if (!ok)
175 |     return false;
176 |   ok &= isItRandom<GeneralHash<NOPRECOMP>>();
177 |   if (!ok)
178 |     return false;
179 |   ok &= isItAFunction<GeneralHash<NOPRECOMP>>(19);
180 |   cout << "general" << endl; // same checks with the precomputed table
181 |   ok &= isItAFunction<GeneralHash<FULLPRECOMP>>(9);
182 |   if (!ok)
183 |     return false;
184 |   ok &= isItRandom<GeneralHash<FULLPRECOMP>>();
185 |   if (!ok)
186 |     return false;
187 |   ok &= isItAFunction<GeneralHash<FULLPRECOMP>>(19);
188 |   return ok;
189 | }
190 | 
191 | int main() {
192 |   bool ok(test());
193 |   if (ok)
194 |     cout << "your code is ok!" << endl;
195 |   else
196 |     cout << "you have a bug of some kind" << endl;
197 |   return ok ? 0 : 1; // nonzero exit status lets ctest and scripts see failures
198 | }
199 | 


--------------------------------------------------------------------------------
/include/rabinkarphash.h:
--------------------------------------------------------------------------------
  1 | #ifndef KARPRABINHASH
  2 | #define KARPRABINHASH
  3 | 
  4 | #include "characterhash.h"
  5 | #include <cstring>
  6 | 
  7 | /**
  8 |  * This is a randomized version of the Karp-Rabin hash function.
  9 |  * Each instance is a rolling hash function meant to hash streams of characters.
 10 |  * Each new instance of this class comes with new random keys.
 11 |  *
 12 |  * Recommended usage to get L-bit hash values over n-grams:
 13 |  *        KarpRabinHash<> hf(n,L );
 14 |  *        for(uint32 k = 0; k<n;++k) {
 15 |  *                  unsigned char c = ... ; // grab some character
 16 |  *                  hf.eat(c); // feed it to the hasher
 17 |  *        }
 18 |  *        while(...) { // go over your string
 19 |  *           hf.hashvalue; // at all times, this contains the hash value
 20 |  *           unsigned char c = ... ;// points to the next character
 21 |  *           unsigned char out = ...; // character we want to forget
 22 |  *           hf.update(out,c); // update hash value
 23 |  *        }
 24 |  */
 25 | template <typename hashvaluetype = uint32, typename chartype = unsigned char>
 26 | class KarpRabinHash {
 27 | 
 28 | public:
 29 |   // myn is the length of the sequences, e.g., 3 means that you want to hash
 30 |   // sequences of 3 characters; mywordsize is the number of bits you wish to
 31 |   // receive as hash values, e.g., 19 means that the hash values are 19-bit
 32 |   // integers
 33 |   KarpRabinHash(int myn, int mywordsize = 19)
 34 |       : hashvalue(0), n(myn), wordsize(mywordsize),
 35 |         hasher(maskfnc<hashvaluetype>(wordsize)),
 36 |         HASHMASK(maskfnc<hashvaluetype>(wordsize)), BtoN(1) {
 37 |     // BtoN becomes B^n under HASHMASK; update() uses it to drop outchar
 38 |     for (int i = 0; i < n; ++i) {
 39 |       BtoN *= B;
 40 |       BtoN &= HASHMASK;
 41 |     }
 42 |   }
 43 | 
 44 |   // prepare to process a new string, you will need to call "eat" again
 45 |   void reset() { hashvalue = 0; }
 46 | 
 47 |   // Convenience function hashing a whole container from scratch; use eat,
 48 |   // update and .hashvalue to use this class as a rolling hash function.
 49 |   // Applies the same recurrence as eat (multiply by B, add the character
 50 |   // hash, mask), so the cost is linear in c.size(). With HASHMASK selecting
 51 |   // the low bits (presumably what maskfnc returns), the result is the masked
 52 |   // sum of B^(size-1-k) * hashvalues[c[k]].
 53 |   template <class container> hashvaluetype hash(container &c) const {
 54 |     hashvaluetype answer(0);
 55 |     for (uint k = 0; k < c.size(); ++k) {
 56 |       answer = (B * answer + hasher.hashvalues[c[k]]) & HASHMASK;
 57 |     }
 58 |     return answer;
 59 |   }
 60 | 
 61 |   // add inchar as an input, this is used typically only at the start
 62 |   // the hash value is updated to that of a longer string (one where inchar was
 63 |   // appended)
 64 |   void eat(chartype inchar) {
 65 |     hashvalue = (B * hashvalue + hasher.hashvalues[inchar]) & HASHMASK;
 66 |   }
 67 | 
 68 |   // add inchar as an input and remove outchar, the hashvalue is updated
 69 |   // this function can be used to update the hash value from the hash value of
 70 |   // [outchar]ABC to the hash value of ABC[inchar]
 71 |   void update(chartype outchar, chartype inchar) {
 72 |     hashvalue = (B * hashvalue + hasher.hashvalues[inchar] -
 73 |                  BtoN * hasher.hashvalues[outchar]) &
 74 |                 HASHMASK;
 75 |   }
 76 | 
 77 |   hashvaluetype hashvalue;
 78 |   int n;
 79 |   const int wordsize;
 80 |   CharacterHash<hashvaluetype, chartype> hasher;
 81 |   const hashvaluetype HASHMASK;
 82 |   hashvaluetype BtoN;
 83 |   static const hashvaluetype B = 37;
 84 | };
 85 | 
 86 | template <typename hashvaluetype = uint32, typename chartype = unsigned char,
 87 |           unsigned wordsize = CHAR_BIT * sizeof(hashvaluetype)>
 88 | class KarpRabinHashBits {
 89 |   // The key difference between KarpRabinHashBits and KarpRabinHash is that
 90 |   // wordsize is now a template parameter, and the masking is only performed
 91 |   // if wordsize != the number of bits in hashvaluetype
 92 | public:
 93 |   // myn is the length of the sequences, e.g., 3 means that you want to hash
 94 |   // sequences of 3 characters. The number of bits of the hash values is the
 95 |   // wordsize template parameter, e.g., 19 means that the hash values are
 96 |   // 19-bit integers
 97 |   KarpRabinHashBits(int myn)
 98 |       : hashvalue(0), n(myn), hasher(maskfnc<hashvaluetype>(wordsize)),
 99 |         HASHMASK(maskfnc<hashvaluetype>(wordsize)), BtoN(1) {
100 |     for (int i = 0; i < n; ++i) {
101 |       BtoN *= B;
102 |       if (!is_full_word())
103 |         BtoN &= HASHMASK;
104 |     }
105 |   }
106 | 
107 |   // prepare to process a new string, you will need to call "eat" again
108 |   void reset() { hashvalue = 0; }
109 | 
110 |   // true when wordsize covers every bit of hashvaluetype
111 |   static constexpr bool is_full_word() {
112 |     return wordsize == (CHAR_BIT * sizeof(hashvaluetype));
113 |   }
114 | 
115 |   // applies HASHMASK to val unless the word is full width; from C++17 on
116 |   // the test is an `if constexpr`
117 |   template <typename T> void mask_value(T &val) const {
118 | #if __cplusplus >= 201703L
119 | #define CONSTIF if constexpr
120 | #else
121 | #define CONSTIF if
122 | #endif
123 |     CONSTIF(!is_full_word()) val &= HASHMASK;
124 | #undef CONSTIF
125 |   }
126 | 
127 |   // Convenience function hashing a whole container from scratch; use eat,
128 |   // update and .hashvalue to use this class as a rolling hash function.
129 |   // Applies the same recurrence as eat (multiply by B, add the character
130 |   // hash, mask), so the cost is linear in c.size() and the value matches
131 |   // what eating the characters of c one by one produces.
132 |   template <class container> hashvaluetype hash(container &c) const {
133 |     hashvaluetype answer(0);
134 |     for (uint k = 0; k < c.size(); ++k) {
135 |       answer = B * answer + hasher.hashvalues[c[k]];
136 |       mask_value(answer);
137 |     }
138 |     return answer;
139 |   }
140 | 
141 |   // overload for mutable C strings, forwards to the const char * version
142 |   hashvaluetype hash(char *s) const {
143 |     return hash(static_cast<const char *>(s));
144 |   }
145 | 
146 |   // Hashes a NUL-terminated string from scratch. Each byte is converted to
147 |   // unsigned char before it indexes the table: plain char may be signed, and
148 |   // a negative index would read outside of hasher.hashvalues.
149 |   hashvaluetype hash(const char *s) const {
150 |     hashvaluetype answer(0);
151 |     for (; *s != '\0'; ++s) {
152 |       const unsigned char byte = static_cast<unsigned char>(*s);
153 |       answer = B * answer + hasher.hashvalues[byte];
154 |       mask_value(answer);
155 |     }
156 |     return answer;
157 |   }
158 | 
159 |   // add inchar as an input, this is used typically only at the start
160 |   // the hash value is updated to that of a longer string (one where inchar was
161 |   // appended)
162 |   void eat(chartype inchar) {
163 |     hashvalue = (B * hashvalue + hasher.hashvalues[inchar]);
164 |     mask_value(hashvalue);
165 |   }
166 | 
167 |   // add inchar as an input and remove outchar, the hashvalue is updated
168 |   // this function can be used to update the hash value from the hash value of
169 |   // [outchar]ABC to the hash value of ABC[inchar]
170 |   void update(chartype outchar, chartype inchar) {
171 |     hashvalue = (B * hashvalue + hasher.hashvalues[inchar] -
172 |                  BtoN * hasher.hashvalues[outchar]);
173 |     mask_value(hashvalue);
174 |   }
175 | 
176 |   hashvaluetype hashvalue;
177 |   int n;
178 |   CharacterHash<hashvaluetype, chartype> hasher;
179 |   const hashvaluetype HASHMASK;
180 |   hashvaluetype BtoN;
181 |   static constexpr hashvaluetype B = 37;
182 | };
183 | 
184 | #endif
185 | 


--------------------------------------------------------------------------------
/include/mersennetwister.h:
--------------------------------------------------------------------------------
  1 | 
  2 | /**
  3 | * High performance random generator.
  4 | * Mersenne Twister
  5 | 
  6 | @article{matsumoto1998mtd,
  7 |   title={{Mersenne Twister: A 623-Dimensionally Equidistributed Uniform
  8 | Pseudo-Random Number Generator}}, author={MATSUMOTO, M. and NISHIMURA, T.},
  9 |   journal={ACM Transactions on Modeling and Computer Simulation},
 10 |   volume={8},
 11 |   number={1},
 12 |   pages={3-30},
 13 |   year={1998}
 14 | }
 15 | */
 16 | // MersenneTwister.h
 17 | // Mersenne Twister random number generator -- a C++ class MTRand
 18 | // Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus
 19 | // Richard J. Wagner  v1.0  15 May 2003  rjwagner@writeme.com
 20 | 
 21 | // The Mersenne Twister is an algorithm for generating random numbers.  It
 22 | // was designed with consideration of the flaws in various other generators.
 23 | // The period, 2^19937-1, and the order of equidistribution, 623 dimensions,
 24 | // are far greater.  The generator is also fast; it avoids multiplication and
 25 | // division, and it benefits from caches and pipelines.  For more information
 26 | // see the inventors' web page at http://www.math.keio.ac.jp/~matumoto/emt.html
 27 | 
 28 | // Reference
 29 | // M. Matsumoto and T. Nishimura, "Mersenne Twister: A 623-Dimensionally
 30 | // Equidistributed Uniform Pseudo-Random Number Generator", ACM Transactions on
 31 | // Modeling and Computer Simulation, Vol. 8, No. 1, January 1998, pp 3-30.
 32 | 
 33 | // Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
 34 | // Copyright (C) 2000 - 2003, Richard J. Wagner
 35 | // All rights reserved.
 36 | //
 37 | // Redistribution and use in source and binary forms, with or without
 38 | // modification, are permitted provided that the following conditions
 39 | // are met:
 40 | //
 41 | //   1. Redistributions of source code must retain the above copyright
 42 | //      notice, this list of conditions and the following disclaimer.
 43 | //
 44 | //   2. Redistributions in binary form must reproduce the above copyright
 45 | //      notice, this list of conditions and the following disclaimer in the
 46 | //      documentation and/or other materials provided with the distribution.
 47 | //
 48 | //   3. The names of its contributors may not be used to endorse or promote
 49 | //      products derived from this software without specific prior written
 50 | //      permission.
 51 | //
 52 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 53 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 54 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 55 | // A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
 56 | // OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 57 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 58 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 59 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 60 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 61 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 62 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 63 | 
 64 | // The original code included the following notice:
 65 | //
 66 | //     When you use this, send an email to: matumoto@math.keio.ac.jp
 67 | //     with an appropriate reference to your work.
 68 | //
 69 | // It would be nice to CC: rjwagner@writeme.com and Cokus@math.washington.edu
 70 | // when you write.
 71 | 
 72 | #ifndef MERSENNETWISTER_H
 73 | #define MERSENNETWISTER_H
 74 | 
 75 | // Not thread safe (unless auto-initialization is avoided and each thread has
 76 | // its own MTRand object)
 77 | 
 78 | #include <iostream>
 79 | #include <limits.h>
 80 | #include <math.h>
 81 | #include <stdio.h>
 82 | #include <time.h>
 83 | 
 84 | class MTRand {
 85 |   // Data: public size constants first, then the protected generator state
 86 | public:
 87 |   typedef unsigned long uint32; // unsigned integer type, at least 32 bits
 88 | 
 89 |   enum { N = 624 };      // length of state vector
 90 |   enum { SAVE = N + 1 }; // length of array for save(): N words plus `left`
 91 | 
 92 | protected:
 93 |   enum { M = 397 }; // period parameter
 94 | 
 95 |   uint32 state[N]; // internal state
 96 |   uint32 *pNext;   // next value to get from state
 97 |   int left;        // number of values left before reload needed
 98 | 
 99 |   // Methods
100 | public:
101 |   MTRand(const uint32 &oneSeed); // initialize with a simple uint32
102 |   MTRand(uint32 *const bigSeed, uint32 const seedLength = N); // or an array
103 |   MTRand(); // auto-initialize with /dev/urandom or time() and clock()
104 | 
105 |   // Do NOT use for CRYPTOGRAPHY without securely hashing several returned
106 |   // values together, otherwise the generator state can be learned after
107 |   // reading 624 consecutive values.
108 | 
109 |   // Access to 32-bit random numbers
110 |   double rand();                      // real number in [0,1]
111 |   double rand(const double &n);       // real number in [0,n]
112 |   double randExc();                   // real number in [0,1)
113 |   double randExc(const double &n);    // real number in [0,n)
114 |   double randDblExc();                // real number in (0,1)
115 |   double randDblExc(const double &n); // real number in (0,n)
116 |   uint32 randInt();                   // integer in [0,2^32-1]
117 |   uint32 randInt(const uint32 &n);    // integer in [0,n] for n < 2^32
118 |   double operator()() {
119 |     return rand(); // same as rand()
120 |   }
121 | 
122 |   // Access to 53-bit random numbers (capacity of IEEE double precision)
123 |   double rand53(); // real number in [0,1)
124 |   // Access to nonuniform random number distributions. NOTE: randNorm uses
125 |   // `variance` as a linear scale factor, so the default 0.0 returns `mean`.
126 |   double randNorm(const double &mean = 0.0, const double &variance = 0.0);
127 | 
128 |   // Re-seeding functions with same behavior as initializers
129 |   void seed(const uint32 oneSeed);
130 |   void seed(uint32 *const bigSeed, const uint32 seedLength = N);
131 |   void seed();
132 | 
133 |   // Saving and loading generator state
134 |   void save(uint32 *saveArray) const; // to array of size SAVE
135 |   void load(uint32 *const loadArray); // from such array
136 |   friend std::ostream &operator<<(std::ostream &os, const MTRand &mtrand);
137 |   friend std::istream &operator>>(std::istream &is, MTRand &mtrand);
138 | 
139 | protected:
140 |   void initialize(const uint32 oneSeed); // fill state[] from one seed value
141 |   void reload();                         // regenerate all N state words
142 |   uint32 hiBit(const uint32 &u) const { return u & 0x80000000UL; } // bit 31
143 |   uint32 loBit(const uint32 &u) const { return u & 0x00000001UL; } // bit 0
144 |   uint32 loBits(const uint32 &u) const { return u & 0x7fffffffUL; } // bits 0-30
145 |   uint32 mixBits(const uint32 &u, const uint32 &v) const {
146 |     return hiBit(u) | loBits(v); // top bit of u joined with low 31 bits of v
147 |   }
148 |   uint32 twist(const uint32 &m, const uint32 &s0, const uint32 &s1) const {
149 |     return m ^ (mixBits(s0, s1) >> 1) ^
150 |            (-static_cast<long>(loBit(s1)) & 0x9908b0dfUL);
151 |   }
152 |   static uint32 hash(time_t t, clock_t c); // mixes t and c into one seed value
153 | };
154 | // Construct a generator seeded from one integer; inline for multi-TU use.
155 | inline MTRand::MTRand(const uint32 &oneSeed) { seed(oneSeed); }
156 | // Construct a generator seeded from the first seedLength entries of bigSeed.
157 | inline MTRand::MTRand(uint32 *const bigSeed, const uint32 seedLength) {
158 |   seed(bigSeed, seedLength);
159 | }
160 | // Construct a generator that seeds itself through the argument-less seed().
161 | inline MTRand::MTRand() { seed(); }
162 | // Real number in [0,1]: one 32-bit draw divided by 2^32 - 1.
163 | inline double MTRand::rand() { return double(randInt()) * (1.0 / 4294967295.0); }
164 | // Real number in [0,n]: rand() scaled by n.
165 | inline double MTRand::rand(const double &n) { return rand() * n; }
166 | // Real number in [0,1): one 32-bit draw divided by 2^32.
167 | inline double MTRand::randExc() { return double(randInt()) * (1.0 / 4294967296.0); }
168 | // Real number in [0,n): randExc() scaled by n.
169 | inline double MTRand::randExc(const double &n) { return randExc() * n; }
170 | // Real number in (0,1): the 32-bit draw is offset by 0.5, then divided by 2^32.
171 | inline double MTRand::randDblExc() {
172 |   return (double(randInt()) + 0.5) * (1.0 / 4294967296.0);
173 | }
174 | // Real number in (0,n): randDblExc() scaled by n.
175 | inline double MTRand::randDblExc(const double &n) { return randDblExc() * n; }
176 | // Real number in [0,1) built from two draws: 27 high bits + 26 high bits = 53.
177 | inline double MTRand::rand53() {
178 |   uint32 a = randInt() >> 5, b = randInt() >> 6;
179 |   return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); // by Isaku Wada
180 | }
181 | 
182 | inline double MTRand::randNorm(const double &mean, const double &variance) {
183 |   // Box-Muller draw: mean + variance * (standard normal deviate). `variance`
184 |   // scales the deviate linearly, i.e. it acts as a standard deviation.
185 |   double r = sqrt(-2.0 * log(1.0 - randDblExc())) * variance;
186 |   double phi = 2.0 * 3.14159265358979323846264338328 * randExc();
187 |   return mean + r * cos(phi);
188 | }
189 | 
190 | inline MTRand::uint32 MTRand::randInt() {
191 |   // Pull a 32-bit integer from the generator state.  Every other access
192 |   // function simply transforms the numbers extracted here.
193 |   // Refill the state block once every stored word has been handed out.
194 |   if (left == 0)
195 |     reload();
196 |   --left;
197 |   // Temper the raw state word with the fixed shift/mask sequence.
198 |   uint32 s1;
199 |   s1 = *pNext++;
200 |   s1 ^= (s1 >> 11);
201 |   s1 ^= (s1 << 7) & 0x9d2c5680UL;
202 |   s1 ^= (s1 << 15) & 0xefc60000UL;
203 |   return (s1 ^ (s1 >> 18));
204 | }
205 | 
206 | inline MTRand::uint32 MTRand::randInt(const uint32 &n) {
207 |   // Integer in [0,n] by rejection sampling.  First smear the highest set bit
208 |   // of n downwards so `used` is a mask covering every bit n can occupy.
209 |   uint32 used = n;
210 |   used |= used >> 1;
211 |   used |= used >> 2;
212 |   used |= used >> 4;
213 |   used |= used >> 8;
214 |   used |= used >> 16;
215 |   // Optimized by Magnus Jonsson (magnus@smartelectronix.com)
216 |   // Draw numbers until one is found in [0,n]
217 |   uint32 i;
218 |   do
219 |     i = randInt() & used; // toss unused bits to shorten search
220 |   while (i > n);
221 |   return i;
222 | }
223 | 
224 | inline void MTRand::seed(const uint32 oneSeed) {
225 |   // Seed from a single uint32: fill the state, then generate the first block
226 |   initialize(oneSeed);
227 |   reload();
228 | }
229 | 
230 | inline void MTRand::seed(uint32 *const bigSeed, const uint32 seedLength) {
231 |   // Seed the generator with an array of uint32's.  There are 2^19937-1
232 |   // possible initial states; this function allows all of those to be
233 |   // accessed by providing at least 19937 bits (default seed length of
234 |   // N = 624 uint32's).  Bits above the lower 32 of each element are
235 |   // discarded.  seedLength must be >= 1: bigSeed[0] is read regardless.
236 |   // Just call seed() if you want to get an array from /dev/urandom.
237 |   initialize(19650218UL);
238 |   int i = 1;
239 |   uint32 j = 0;
240 |   int k = (uint32(N) > seedLength ? int(N) : int(seedLength));
241 |   for (; k; --k) { // first pass: fold the seed words into the state
242 |     state[i] = state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1664525UL);
243 |     state[i] += (bigSeed[j] & 0xffffffffUL) + j;
244 |     state[i] &= 0xffffffffUL;
245 |     ++i;
246 |     ++j;
247 |     if (i >= N) { // wrap the state index, carrying the last word into slot 0
248 |       state[0] = state[N - 1];
249 |       i = 1;
250 |     }
251 |     if (j >= seedLength) // reuse the seed array cyclically
252 |       j = 0;
253 |   }
254 |   for (k = N - 1; k; --k) { // second pass: N - 1 further mixing steps
255 |     state[i] =
256 |         state[i] ^ ((state[i - 1] ^ (state[i - 1] >> 30)) * 1566083941UL);
257 |     state[i] -= i;
258 |     state[i] &= 0xffffffffUL;
259 |     ++i;
260 |     if (i >= N) {
261 |       state[0] = state[N - 1];
262 |       i = 1;
263 |     }
264 |   }
265 |   state[0] = 0x80000000UL; // MSB is 1, assuring non-zero initial array
266 |   reload();
267 | }
268 | 
269 | inline void MTRand::seed() {
270 |   // Seed the generator with an array from /dev/urandom if available,
271 |   // otherwise with a hash of the time() and clock() values.
272 |   //
273 |   // First try reading N words from /dev/urandom; stop at the first short read
274 |   FILE *urandom = fopen("/dev/urandom", "rb");
275 |   if (urandom) {
276 |     uint32 bigSeed[N];
277 |     uint32 *s = bigSeed;
278 |     int i = N;
279 |     bool success = true;
280 |     while (success && i--)
281 |       success = fread(s++, sizeof(uint32), 1, urandom);
282 |     fclose(urandom);
283 |     if (success) {
284 |       seed(bigSeed, N);
285 |       return;
286 |     }
287 |   }
288 |   // Reached when /dev/urandom could not be opened or a read came up short.
289 |   // Was not successful, so use time() and clock() instead
290 |   seed(hash(time(NULL), clock()));
291 | }
292 | 
293 | inline void MTRand::initialize(const uint32 seed) {
294 |   // Initialize generator state with seed
295 |   // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier.
296 |   // In previous versions, most significant bits (MSBs) of the seed affect
297 |   // only MSBs of the state array.  Modified 9 Jan 2002 by Makoto Matsumoto.
298 |   uint32 *s = state; // write cursor
299 |   uint32 *r = state; // read cursor, always one word behind s
300 |   int i = 1;
301 |   *s++ = seed & 0xffffffffUL; // state[0] is the seed's low 32 bits
302 |   for (; i < N; ++i) {        // each later word derives from its predecessor
303 |     *s++ = (1812433253UL * (*r ^ (*r >> 30)) + i) & 0xffffffffUL;
304 |     r++;
305 |   }
306 | }
307 | 
308 | inline void MTRand::reload() {
309 |   // Generate N new values in state
310 |   // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com)
311 |   uint32 *p = state;
312 |   int i;
313 |   for (i = N - M; i--; ++p) // first N - M words pair with the word M ahead
314 |     *p = twist(p[M], p[0], p[1]);
315 |   for (i = M; --i; ++p) // next M - 1 words pair with the word N - M behind
316 |     *p = twist(p[M - N], p[0], p[1]);
317 |   *p = twist(p[M - N], p[0], state[0]); // last word wraps around to state[0]
318 |   left = N; // all N fresh words are available again
319 |   pNext = state;
320 | }
321 | 
322 | inline MTRand::uint32 MTRand::hash(time_t t, clock_t c) {
323 |   // Get a uint32 from t and c
324 |   // Better than uint32(x) in case x is floating point in [0,1]
325 |   // Based on code by Lawrence Kirby (fred@genesis.demon.co.uk)
326 |   // Each value is folded byte by byte with the multiplier UCHAR_MAX + 2.
327 |   static uint32 differ = 0; // guarantee time-based seeds will change
328 |   // h1 folds the bytes of t; h2 folds the bytes of c.
329 |   uint32 h1 = 0;
330 |   unsigned char *p = reinterpret_cast<unsigned char *>(&t);
331 |   for (size_t i = 0; i < sizeof(t); ++i) {
332 |     h1 *= UCHAR_MAX + 2U;
333 |     h1 += p[i];
334 |   }
335 |   uint32 h2 = 0;
336 |   p = reinterpret_cast<unsigned char *>(&c);
337 |   for (size_t j = 0; j < sizeof(c); ++j) {
338 |     h2 *= UCHAR_MAX + 2U;
339 |     h2 += p[j];
340 |   }
341 |   return (h1 + differ++) ^ h2;
342 | }
343 | // Copy the N state words, then the `left` counter, into saveArray (SAVE slots).
344 | inline void MTRand::save(uint32 *saveArray) const {
345 |   uint32 *sa = saveArray;
346 |   const uint32 *s = state;
347 |   int i = N;
348 |   for (; i--; *sa++ = *s++) {
349 |   }
350 |   *sa = left;
351 | }
352 | // Restore state written by save(); loadArray[N] must hold a count in [0, N].
353 | inline void MTRand::load(uint32 *const loadArray) {
354 |   uint32 *s = state;
355 |   uint32 *la = loadArray;
356 |   int i = N;
357 |   for (; i--; *s++ = *la++) {
358 |   }
359 |   left = *la;
360 |   pNext = &state[N - left]; // the unread words are the last `left` entries
361 | }
362 | // Write the N state words, each followed by a tab, then the `left` counter.
363 | inline std::ostream &operator<<(std::ostream &os, const MTRand &mtrand) {
364 |   const MTRand::uint32 *s = mtrand.state;
365 |   int i = mtrand.N;
366 |   for (; i--; os << *s++ << "\t") {
367 |   }
368 |   return os << mtrand.left;
369 | }
370 | 
371 | std::istream &operator>>(std::istream &is, MTRand &mtrand) {
372 |   MTRand::uint32 *s = mtrand.state;
373 |   int i = mtrand.N;
374 |   for (; i--; is >> *s++) {
375 |   }
376 |   is >> mtrand.left;
377 |   mtrand.pNext = &mtrand.state[mtrand.N - mtrand.left];
378 |   return is;
379 | }
380 | 
381 | #endif // MERSENNETWISTER_H
382 | 
383 | // Change log:
384 | //
385 | // v0.1 - First release on 15 May 2000
386 | //      - Based on code by Makoto Matsumoto, Takuji Nishimura, and Shawn Cokus
387 | //      - Translated from C to C++
388 | //      - Made completely ANSI compliant
389 | //      - Designed convenient interface for initialization, seeding, and
390 | //        obtaining numbers in default or user-defined ranges
391 | //      - Added automatic seeding from /dev/urandom or time() and clock()
392 | //      - Provided functions for saving and loading generator state
393 | //
394 | // v0.2 - Fixed bug which reloaded generator one step too late
395 | //
396 | // v0.3 - Switched to clearer, faster reload() code from Matthew Bellew
397 | //
398 | // v0.4 - Removed trailing newline in saved generator format to be consistent
399 | //        with output format of built-in types
400 | //
401 | // v0.5 - Improved portability by replacing static const int's with enum's and
402 | //        clarifying return values in seed(); suggested by Eric Heimburg
403 | //      - Removed MAXINT constant; use 0xffffffffUL instead
404 | //
405 | // v0.6 - Eliminated seed overflow when uint32 is larger than 32 bits
406 | //      - Changed integer [0,n] generator to give better uniformity
407 | //
408 | // v0.7 - Fixed operator precedence ambiguity in reload()
409 | //      - Added access for real numbers in (0,1) and (0,n)
410 | //
411 | // v0.8 - Included time.h header to properly support time_t and clock_t
412 | //
413 | // v1.0 - Revised seeding to match 26 Jan 2002 update of Nishimura and Matsumoto
414 | //      - Allowed for seeding with arrays of any length
415 | //      - Added access for real numbers in [0,1) with 53-bit resolution
416 | //      - Added access for real numbers from normal (Gaussian) distributions
417 | //      - Increased overall speed by optimizing twist()
418 | //      - Doubled speed of integer [0,n] generation
419 | //      - Fixed out-of-range number generation on 64-bit machines
420 | //      - Improved portability by substituting literal constants for long enum's
421 | //      - Changed license from GNU LGPL to BSD
422 | 


--------------------------------------------------------------------------------