├── .travis.yml ├── LICENSE ├── README.md ├── go.mod ├── ntHash.go └── ntHash_test.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.14.x 5 | 6 | before_install: 7 | - go get -t -v ./... 8 | 9 | script: 10 | - go test -race -coverprofile=coverage.txt -covermode=atomic ./... 11 | 12 | after_success: 13 | - bash <(curl -s https://codecov.io/bash) 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2018 Will Rowe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |

ntHash

3 |

ntHash implementation in Go

4 |

5 |

6 |

7 |

8 |

9 |

10 | 11 | --- 12 | 13 | ## Overview 14 | 15 | This is a Go implementation of the [ntHash](https://github.com/bcgsc/ntHash) recursive hash function for hashing all possible k-mers in a DNA/RNA sequence. 16 | 17 | For more information, read the ntHash [paper](http://dx.doi.org/10.1093/bioinformatics/btw397) by Mohamadi et al. or check out their C++ [implementation](https://github.com/bcgsc/ntHash). 18 | 19 | This implementation was inspired by [Luiz Irber](https://luizirber.org/) and his recent [blog post](https://blog.luizirber.org/2018/09/13/nthash/) on his cool [Rust ntHash implementation](https://github.com/luizirber/nthash). 20 | 21 | I have coded this up in Go so that ntHash can be used in my [HULK](https://github.com/will-rowe/hulk) and [GROOT](https://github.com/will-rowe/groot) projects but feel free to use it for yourselves. 22 | 23 | ## Installation 24 | 25 | ```bash 26 | go get github.com/will-rowe/nthash 27 | ``` 28 | 29 | ## Example usage 30 | 31 | ### range over ntHash values for a sequence 32 | 33 | ```go 34 | package main 35 | 36 | import ( 37 | "log" 38 | "github.com/will-rowe/nthash" 39 | ) 40 | 41 | var ( 42 | sequence = []byte("ACGTCGTCAGTCGATGCAGTACGTCGTCAGTCGATGCAGT") 43 | kmerSize uint = 11 44 | ) 45 | 46 | func main() { 47 | 48 | // create the ntHash iterator using a pointer to the sequence and a k-mer size 49 | hasher, err := nthash.NewHasher(&sequence, kmerSize) 50 | 51 | // check for errors (e.g. 
bad k-mer size choice) 52 | if err != nil { 53 | log.Fatal(err) 54 | } 55 | 56 | // collect the hashes by ranging over the hash channel produced by the Hash method 57 | canonical := true 58 | for hash := range hasher.Hash(canonical) { 59 | log.Println(hash) 60 | } 61 | } 62 | ``` 63 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/will-rowe/nthash 2 | 3 | go 1.14 4 | -------------------------------------------------------------------------------- /ntHash.go: -------------------------------------------------------------------------------- 1 | // Package nthash is a port of the ntHash (https://github.com/bcgsc/ntHash) recursive hash function for DNA k-mers. 2 | // 3 | // It was inspired by the Rust port by Luiz Irber (https://github.com/luizirber/nthash) 4 | // 5 | package nthash 6 | 7 | import ( 8 | "fmt" 9 | "math" 10 | "sync" 11 | ) 12 | 13 | const ( 14 | // maxK is the maximum k-mer size permitted 15 | maxK uint = math.MaxUint32 16 | 17 | // bufferSize is the size of the buffer used by the channels in the Hash and MultiHash methods 18 | bufferSize uint = 128 19 | 20 | // offset is used as a mask to retrieve a base's complement in the seed table (base&offset indexes entries 0..7 of seedTab) 21 | offset uint8 = 0x07 22 | 23 | // seedA is the 64-bit random seed corresponding to base A 24 | seedA uint64 = 0x3c8bfbb395c60474 25 | 26 | // seedC is the 64-bit random seed corresponding to base C 27 | seedC uint64 = 0x3193c18562a02b4c 28 | 29 | // seedG is the 64-bit random seed corresponding to base G 30 | seedG uint64 = 0x20323ed082572324 31 | 32 | // seedT is the 64-bit random seed corresponding to base T 33 | seedT uint64 = 0x295549f54be24456 34 | 35 | // seedN is the 64-bit seed corresponding to N; it is zero, so XORing it into a hash leaves the hash unchanged 36 | seedN uint64 = 0x0000000000000000 37 | 38 | // multiSeed is the seed for generating multiple hash values 39 | multiSeed uint64 = 0x90b45d39fb6da1fa 40 | 41 | // multiShift is used for generating 
multiple hash values 42 | multiShift uint = 27 43 | ) 44 | 45 | // seedTab is the lookup table for the bases and their complements: entries 65..71 and 84..85 (plus the lower-case 97..103 and 116..117) hold the forward seeds for A, C, G and T/U, entries 0..7 hold the complement seeds reached via base&offset, and every other byte maps to seedN 46 | var seedTab = [256]uint64{ 47 | seedN, seedT, seedN, seedG, seedA, seedA, seedN, seedC, // 0..7 48 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15 49 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23 50 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31 51 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39 52 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47 53 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55 54 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63 55 | seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71 56 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79 57 | seedN, seedN, seedN, seedN, seedT, seedT, seedN, seedN, // 80..87 58 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95 59 | seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103 60 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111 61 | seedN, seedN, seedN, seedN, seedT, seedT, seedN, seedN, // 112..119 62 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127 63 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135 64 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143 65 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151 66 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159 67 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167 68 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175 69 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183 70 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191 71 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199 72 | seedN, seedN, seedN, seedN, seedN, 
seedN, seedN, seedN, // 200..207 73 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215 74 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223 75 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231 76 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239 77 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247 78 | seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 248..255 79 | } 80 | 81 | // NTHi is the ntHash iterator 82 | type NTHi struct { 83 | seq *[]byte // the sequence being hashed 84 | k uint // the k-mer size 85 | fh uint64 // the current forward hash value 86 | rh uint64 // the current reverse hash value 87 | currentIdx uint // the current index position in the sequence being hashed 88 | maxIdx uint // one past the last k-mer start index, i.e. the number of k-mers in the sequence 89 | } 90 | 91 | // poolNTHi is an object pool used to reduce GC load when hashing a huge number of sequences. 92 | var poolNTHi = &sync.Pool{New: func() interface{} { 93 | return &NTHi{} 94 | }} 95 | 96 | // NewHasher is the constructor function for the ntHash iterator; it returns an error if seq is nil, k is zero, k exceeds the sequence length or k exceeds maxK 97 | // seq is a pointer to the sequence being hashed 98 | // k is the k-mer size to use 99 | func NewHasher(seq *[]byte, k uint) (*NTHi, error) { 100 | if seq == nil { 101 | return nil, fmt.Errorf("sequence pointer is nil") 102 | } 103 | seqLen := uint(len(*seq)) 104 | switch { 105 | case k == 0: 106 | return nil, fmt.Errorf("k size must be greater than zero") 107 | case k > seqLen: 108 | return nil, fmt.Errorf("k size is greater than sequence length (%d vs %d)", k, seqLen) 109 | case k > maxK: 110 | return nil, fmt.Errorf("k size is greater than the maximum allowed k size (%d vs %d)", k, maxK) 111 | } 112 | 113 | // reuse a pooled iterator and overwrite every field (currentIdx included) so no state leaks from its previous use; fh and rh start as the hashes of the first k-mer 114 | nthi := poolNTHi.Get().(*NTHi) 115 | *nthi = NTHi{seq: seq, k: k, fh: ntf64((*seq)[0:k], 0, k), rh: ntr64((*seq)[0:k], 0, k)} 116 | nthi.maxIdx = seqLen - (k - 1) // cannot underflow because 1 <= k <= seqLen has been checked above 117 | 118 | return nthi, nil 119 | } 120 | 121 | // Next returns the next ntHash value from an ntHash iterator; the bool is false once every k-mer has been hashed, at which point the iterator is handed back to poolNTHi and must not be used again 122 | func (nthi *NTHi) Next(canonical bool) (uint64, bool) { 123 | 124 | // end 
the iterator if we have got to the maximum index position TODO: this needs to be done in a better way. 125 | if nthi.currentIdx >= nthi.maxIdx { 126 | poolNTHi.Put(nthi) 127 | return 0, false 128 | } 129 | 130 | // roll the hash if index>0 (the hashes for index 0 were already computed by NewHasher) 131 | if nthi.currentIdx != 0 { 132 | prevBase := (*nthi.seq)[nthi.currentIdx-1] 133 | endBase := (*nthi.seq)[nthi.currentIdx+nthi.k-1] 134 | // alg 3. of ntHash paper: prevBase is the base leaving the k-mer window, endBase the base entering it 135 | nthi.fh = roL(nthi.fh, 1) 136 | nthi.fh ^= roL(seedTab[prevBase], nthi.k) 137 | nthi.fh ^= seedTab[endBase] 138 | nthi.rh = roR(nthi.rh, 1) 139 | nthi.rh ^= roR(seedTab[prevBase&offset], 1) 140 | nthi.rh ^= roL(seedTab[endBase&offset], nthi.k-1) 141 | } 142 | nthi.currentIdx++ 143 | 144 | if canonical { 145 | return nthi.getCanonical(), true 146 | } 147 | return nthi.fh, true 148 | } 149 | 150 | // Hash returns a channel to range over the ntHash values of the k-mers from the iterator's current position to the end of the sequence; a goroutine fills the channel (buffered to bufferSize) and closes it after the last k-mer 151 | // canonical is set true to return the canonical k-mers, otherwise the forward hashes are returned 152 | func (nthi *NTHi) Hash(canonical bool) <-chan uint64 { 153 | hashChan := make(chan uint64, bufferSize) 154 | go func() { 155 | defer close(hashChan) 156 | 157 | // start the rolling hash 158 | for { 159 | 160 | // check that rolling can continue; once exhausted the iterator is handed back to poolNTHi 161 | if nthi.currentIdx >= nthi.maxIdx { 162 | poolNTHi.Put(nthi) 163 | return 164 | } 165 | 166 | // start the hashing (nothing to roll for index 0) 167 | if nthi.currentIdx != 0 { 168 | prevBase := (*nthi.seq)[nthi.currentIdx-1] 169 | endBase := (*nthi.seq)[nthi.currentIdx+nthi.k-1] 170 | // alg 3. 
of ntHash paper 171 | nthi.fh = roL(nthi.fh, 1) 172 | nthi.fh ^= roL(seedTab[prevBase], nthi.k) 173 | nthi.fh ^= seedTab[endBase] 174 | nthi.rh = roR(nthi.rh, 1) 175 | nthi.rh ^= roR(seedTab[prevBase&offset], 1) 176 | nthi.rh ^= roL(seedTab[endBase&offset], nthi.k-1) 177 | } 178 | 179 | // send the canonical ntHash if requested, otherwise the forward hash 180 | if canonical { 181 | hashChan <- nthi.getCanonical() 182 | } else { 183 | hashChan <- nthi.fh 184 | } 185 | 186 | // increment the index 187 | nthi.currentIdx++ 188 | } 189 | }() 190 | return hashChan 191 | } 192 | 193 | // MultiHash returns a channel to range over the multi ntHash values of a sequence; each element received is a slice holding the hashes for one k-mer, and the channel (buffered to bufferSize) is closed after the last k-mer 194 | // canonical is set true to return the canonical k-mers, otherwise the forward hashes are returned 195 | // numMultiHash sets the number of multi hashes to generate for each k-mer (should be at least 1) 196 | func (nthi *NTHi) MultiHash(canonical bool, numMultiHash uint) <-chan []uint64 { 197 | hashChan := make(chan []uint64, bufferSize) 198 | go func() { 199 | defer close(hashChan) 200 | 201 | // start the rolling hash 202 | for { 203 | 204 | // check that rolling can continue; once exhausted the iterator is handed back to poolNTHi 205 | if nthi.currentIdx >= nthi.maxIdx { 206 | poolNTHi.Put(nthi) 207 | return 208 | } 209 | 210 | // start the hashing (nothing to roll for index 0) 211 | if nthi.currentIdx != 0 { 212 | prevBase := (*nthi.seq)[nthi.currentIdx-1] 213 | endBase := (*nthi.seq)[nthi.currentIdx+nthi.k-1] 214 | // alg 3. 
of ntHash paper 215 | nthi.fh = roL(nthi.fh, 1) 216 | nthi.fh ^= roL(seedTab[prevBase], nthi.k) 217 | nthi.fh ^= seedTab[endBase] 218 | nthi.rh = roR(nthi.rh, 1) 219 | nthi.rh ^= roR(seedTab[prevBase&offset], 1) 220 | nthi.rh ^= roL(seedTab[endBase&offset], nthi.k-1) 221 | } 222 | 223 | // set up the return slice; the base hash is kept in a local so that numMultiHash == 0 yields an empty slice instead of an index-out-of-range panic 224 | multiHashes := make([]uint64, numMultiHash) 225 | baseHash := nthi.fh 226 | if canonical { 227 | baseHash = nthi.getCanonical() 228 | } 229 | if numMultiHash > 0 { 230 | multiHashes[0] = baseHash 231 | } 232 | for i := uint64(1); i < uint64(numMultiHash); i++ { 233 | tVal := baseHash * (i ^ uint64(nthi.k)*multiSeed) 234 | multiHashes[i] = tVal ^ (tVal >> multiShift) 235 | } 236 | 237 | // send the multihashes for this k-mer 238 | hashChan <- multiHashes 239 | 240 | // increment the index 241 | nthi.currentIdx++ 242 | } 243 | }() 244 | return hashChan 245 | } 246 | 247 | // getCanonical returns the canonical hash value currently held by the iterator (the smaller of the forward and reverse hashes) 248 | func (nthi *NTHi) getCanonical() uint64 { 249 | if nthi.rh < nthi.fh { 250 | return nthi.rh 251 | } 252 | return nthi.fh 253 | } 254 | 255 | // roL rotates v to the left by "n" bit positions; n is reduced modulo 64 first, because a Go shift of 64 or more yields 0 and 64-n would underflow 256 | func roL(v uint64, n uint) uint64 { 257 | if n &= 63; n == 0 { 258 | return v 259 | } 260 | return (v << n) | (v >> (64 - n)) 261 | } 262 | 263 | // roR rotates v to the right by "n" bit positions; n is reduced modulo 64 first, because a Go shift of 64 or more yields 0 and 64-n would underflow 264 | func roR(v uint64, n uint) uint64 { 265 | if n &= 63; n == 0 { 266 | return v 267 | } 268 | return (v >> n) | (v << (64 - n)) 269 | } 270 | 271 | // ntf64 generates the ntHash for the forward strand of the kmer 272 | func ntf64(seq []byte, i, k uint) uint64 { 273 | var hv uint64 274 | for i < k { 275 | hv = roL(hv, 1) 276 | hv ^= seedTab[seq[i]] 277 | i++ 278 | } 279 | return hv 280 | } 281 | 282 | // ntr64 generates the ntHash for the reverse strand of the kmer 283 | func ntr64(seq []byte, i, k uint) uint64 { 284 | var hv uint64 285 | for i < k { 286 | hv = roL(hv, 1) 287 | hv ^= 
seedTab[seq[k-1-i]&offset] 288 | i++ 289 | } 290 | return hv 291 | } 292 | 293 | // ntc64 generates the canonical ntHash (the smaller of the forward and reverse strand hashes) 294 | func ntc64(seq []byte, i, k uint) uint64 { 295 | fh := ntf64(seq, i, k) 296 | rh := ntr64(seq, i, k) 297 | if rh < fh { 298 | return rh 299 | } 300 | return fh 301 | } 302 | 303 | // nthash returns the canonical ntHash for each k-mer in a sequence; the result is empty when k is not in 1..len(seq) 304 | // it does not use the rolling hash properties of ntHash 305 | func nthash(seq []byte, k int) []uint64 { 306 | hvs := make([]uint64, 0, len(seq)) // capacity is an upper bound on the k-mer count; unlike len(seq)-(k-1) it can never be negative 307 | for i := 0; k > 0 && i+k <= len(seq); i++ { 308 | hvs = append(hvs, ntc64(seq[i:i+k], 0, uint(k))) 309 | } 310 | return hvs 311 | } 312 | -------------------------------------------------------------------------------- /ntHash_test.go: -------------------------------------------------------------------------------- 1 | // test values have been lifted from Luiz Irber -- all credit and my thanks to him! 2 | // see https://github.com/luizirber/nthash/blob/master/src/lib.rs 3 | package nthash 4 | 5 | import ( 6 | "fmt" 7 | "testing" 8 | ) 9 | 10 | var ( 11 | kmer = []byte("TGCAG") 12 | sequence = []byte("ACGTCGTCAGTCGATGCAGT") 13 | kmer2 = []byte("ACTGC") 14 | ) 15 | 16 | // test seed lookup 17 | func TestSeedLookup(t *testing.T) { 18 | if seedTab[kmer[0]] != 0x295549f54be24456 { 19 | t.Fatal("wrong seed for T") 20 | } 21 | if seedTab[kmer[1]] != 0x20323ed082572324 { 22 | t.Fatal("wrong seed for G") 23 | } 24 | if seedTab[kmer[2]] != 0x3193c18562a02b4c { 25 | t.Fatal("wrong seed for C") 26 | } 27 | if seedTab[kmer[3]] != 0x3c8bfbb395c60474 { 28 | t.Fatal("wrong seed for A") 29 | } 30 | } 31 | 32 | // test forward ntHash 33 | func TestNTF64hash(t *testing.T) { 34 | hv := ntf64(kmer, 0, 5) 35 | t.Log(fmt.Sprintf("%x", hv)) 36 | if hv != 0xbafa6728fc6dabf { 37 | t.Fatal("unexpected forward hash") 38 | } 39 | } 40 | 41 | // test reverse ntHash 42 | func TestNTR64(t *testing.T) { 43 | hv := ntr64(kmer, 0, 5) 44 | t.Log(fmt.Sprintf("%x", hv)) 45 | if hv != 0x8cf2d4072cca480e { 46 | t.Fatal("unexpected reverse hash") 47 | } 48 | } 49 | 50 | // test the canonical ntHash 51 | 
func TestNTC64(t *testing.T) { 52 | hv := ntc64(kmer, 0, 5) 53 | t.Log(fmt.Sprintf("%x", hv)) 54 | if hv != 0xbafa6728fc6dabf { 55 | t.Fatal("unexpected canonical hash") 56 | } 57 | } 58 | 59 | // test the ntHash function returns one hash per k-mer TODO: also check the hash values 60 | func TestNTHash(t *testing.T) { 61 | hvs := nthash(sequence, 5) 62 | if want := len(sequence) - 4; len(hvs) != want { 63 | t.Fatalf("got %d hashes, want %d", len(hvs), want) 64 | } 65 | } 66 | 67 | // test the ntHash iterator constructor 68 | func TestNewHasherNTHI(t *testing.T) { 69 | if _, err := NewHasher(&kmer, 10); err == nil { 70 | t.Fatal("should trigger k > seq error") 71 | } 72 | if _, err := NewHasher(&kmer, 200); err == nil { 73 | t.Fatal("should trigger k > seq error for a much larger k") 74 | } 75 | nthi, err := NewHasher(&sequence, 5) 76 | if err != nil { 77 | t.Fatal(err) 78 | } 79 | t.Log(nthi) 80 | } 81 | 82 | // test the ntHash iterator next method 83 | func TestNext(t *testing.T) { 84 | nthi, err := NewHasher(&kmer2, 3) 85 | if err != nil { 86 | t.Fatal(err) 87 | } 88 | // should return the pre-calculated ntHash for the first canonical k-mer 89 | 90 | if h, ok := nthi.Next(true); !ok || h != 0x9b1eda9a185413ce { 91 | t.Fatal("unexpected result for the first k-mer") 92 | } 93 | t.Log(nthi) 94 | // should calculate the next canonical k-mer ntHash and return it 95 | if h, ok := nthi.Next(true); !ok || h != 0x9f6acfa2235b86fc { 96 | t.Fatal("unexpected result for the second k-mer") 97 | } 98 | // should calculate the final canonical k-mer ntHash and return it 99 | if h, ok := nthi.Next(true); !ok || h != 0xd4a29bf149877c5c { 100 | t.Fatal("unexpected result for the final k-mer") 101 | } 102 | } 103 | 104 | // test the ntHash iterator hash method 105 | func TestHash(t *testing.T) { 106 | nthi, err := NewHasher(&kmer2, 3) 107 | if err != nil { 108 | t.Fatal(err) 109 | } 110 | counter := 0 111 | // use the canonical switch 112 | for hash := range nthi.Hash(true) { 113 | t.Log(hash) 114 | counter++ 115 | switch counter { 116 | case 1: 117 | if hash != 0x9b1eda9a185413ce { 118 | t.Fatal("unexpected hash for the first k-mer") 119 | } 120 | case 2: 121 | if hash != 0x9f6acfa2235b86fc { 122 | t.Fatal("unexpected hash for the second k-mer") 123 | } 124 | case 3: 125 | if hash != 0xd4a29bf149877c5c { 126 | t.Fatal("unexpected hash for the final k-mer") 127 | } 128 | 
default: 129 | t.Fatal("unexpected output from nthi") 130 | } 131 | } 132 | if counter != 3 { 133 | t.Fatal("wrong iteration") 134 | } 135 | } 136 | 137 | // test the ntHash iterator multihash method 138 | func TestMultiHash(t *testing.T) { 139 | nthi, err := NewHasher(&kmer2, 3) 140 | if err != nil { 141 | t.Fatal(err) 142 | } 143 | counter := 0 144 | 145 | // use the canonical switch and 3 multihashes; the first entry of each slice is the plain canonical ntHash 146 | for hashes := range nthi.MultiHash(true, 3) { 147 | t.Log(hashes) 148 | counter++ 149 | switch counter { 150 | case 1: 151 | if hashes[0] != 0x9b1eda9a185413ce { 152 | t.Fatal("unexpected base hash for the first k-mer") 153 | } 154 | case 2: 155 | if hashes[0] != 0x9f6acfa2235b86fc { 156 | t.Fatal("unexpected base hash for the second k-mer") 157 | } 158 | case 3: 159 | if hashes[0] != 0xd4a29bf149877c5c { 160 | t.Fatal("unexpected base hash for the final k-mer") 161 | } 162 | default: 163 | t.Fatal("unexpected output from nthi") 164 | } 165 | } 166 | if counter != 3 { 167 | t.Fatal("wrong iteration") 168 | } 169 | } 170 | 171 | // run benchmarks of ntHash (forward hashes only) 172 | func BenchmarkHash(b *testing.B) { 173 | // run the ntHash iterator b.N times 174 | for n := 0; n < b.N; n++ { 175 | nthi, err := NewHasher(&sequence, 7) 176 | if err != nil { 177 | b.Fatal(err) 178 | } 179 | for range nthi.Hash(false) { 180 | } 181 | } 182 | } 183 | 184 | func BenchmarkCanonicalHash(b *testing.B) { 185 | // run the ntHash iterator b.N times 186 | for n := 0; n < b.N; n++ { 187 | nthi, err := NewHasher(&sequence, 7) 188 | if err != nil { 189 | b.Fatal(err) 190 | } 191 | for range nthi.Hash(true) { 192 | } 193 | } 194 | } 195 | --------------------------------------------------------------------------------