├── speed.xlsx
├── illustration.xlsx
├── illustration_randstrobes_order2.jpg
├── illustration_randstrobes_order3.jpg
├── strobemers.go
├── evaluation
    ├── q1-snp7.fasta
    ├── q2-snp3.fasta
    ├── r0.s.fasta
    ├── r1.s.fasta
    ├── r2.s.fasta
    ├── q1-snp7.rc.fasta
    ├── q2-snp3-gap1.fasta
    ├── q0-snp1.fasta
    ├── q2-snp3.fasta.blastn
    ├── q2-snp3-gap1.fasta.blastn
    ├── q0-snp1.fasta.blastn
    ├── q1-snp7.fasta.blastn
    ├── README.md
    └── test1_matches.go
├── go.mod
├── .gitignore
├── LICENSE
├── util.go
├── randstrobes_test.go
├── minstrobes_test.go
├── go.sum
├── README.md
├── common.go
├── strobemers_test.go
├── randstrobes.go
└── minstrobes.go


/speed.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/speed.xlsx


--------------------------------------------------------------------------------
/illustration.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration.xlsx


--------------------------------------------------------------------------------
/illustration_randstrobes_order2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration_randstrobes_order2.jpg


--------------------------------------------------------------------------------
/illustration_randstrobes_order3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration_randstrobes_order3.jpg


--------------------------------------------------------------------------------
/strobemers.go:
--------------------------------------------------------------------------------
1 | // Package strobemers is a Go implementation of strobemers
2 | // (https://github.com/ksahlin/strobemers): minstrobes and randstrobes.
3 | package strobemers
4 | 


--------------------------------------------------------------------------------
/evaluation/q1-snp7.fasta:
--------------------------------------------------------------------------------
1 | >q1
2 | CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCTGAAGGGCT
3 | GCCGAAGGACAAGTCGCCGAGCAAGCCCGCCAGCTTCCGCTGTCTGGGCAAGCCGGAACC
4 | GGCGGTACAGAAGATCCTCGACCAACGACT
5 | 


--------------------------------------------------------------------------------
/evaluation/q2-snp3.fasta:
--------------------------------------------------------------------------------
1 | >q2
2 | GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG
3 | CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA
4 | GACACCCGTCACAAGCATGACATGATAATT
5 | 


--------------------------------------------------------------------------------
/evaluation/r0.s.fasta:
--------------------------------------------------------------------------------
1 | >r0
2 | CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC
3 | GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG
4 | AGGAAATGGAAGAAACCGTGGACTTCCTGG
5 | 


--------------------------------------------------------------------------------
/evaluation/r1.s.fasta:
--------------------------------------------------------------------------------
1 | >r1
2 | CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCCGAAGGGCT
3 | GCCGCAGGACAAGTCGCCGAGCAAGCACGCCAGCTTTCGCTGGCTGGGCAAGCCGGAACC
4 | GGCGGTACAGAAGATCCTCGACGAACAACT
5 | 


--------------------------------------------------------------------------------
/evaluation/r2.s.fasta:
--------------------------------------------------------------------------------
1 | >r2
2 | GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACTG
3 | CGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA
4 | GACACCCGTCACAAGCATGACATGATAATT
5 | 


--------------------------------------------------------------------------------
/evaluation/q1-snp7.rc.fasta:
--------------------------------------------------------------------------------
1 | >q1_rc
2 | AGTCGTTGGTCGAGGATCTTCTGTACCGCCGGTTCCGGCTTGCCCAGACAGCGGAAGCTG
3 | GCGGGCTTGCTCGGCGACTTGTCCTTCGGCAGCCCTTCAGGCACCTGCACGTAGAGCCCG
4 | TAGGCATGACTCTTGTCCCAATCGAAGGCG
5 | 


--------------------------------------------------------------------------------
/evaluation/q2-snp3-gap1.fasta:
--------------------------------------------------------------------------------
1 | >q2
2 | GAGGAATTAACGAACAGATACACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG
3 | CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA
4 | GACACCCGTCACAAGCATGACATGATAATT
5 | 


--------------------------------------------------------------------------------
/evaluation/q0-snp1.fasta:
--------------------------------------------------------------------------------
1 | >RL|S1|R634/1
2 | CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC
3 | GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG
4 | AGGAAATGCAAGAAACCGTGGACTTCCTGG
5 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/shenwei356/strobemers
 2 | 
 3 | go 1.16
 4 | 
 5 | require (
 6 | 	github.com/shenwei356/bio v0.1.0
 7 | 	github.com/shenwei356/util v0.3.0
 8 | 	github.com/will-rowe/nthash v0.3.0
 9 | 	github.com/zeebo/xxh3 v0.10.0
10 | )
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | 
 8 | # Test binary, built with `go test -c`
 9 | *.test
10 | 
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 | 
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 | 
17 | *.directory
18 | doc/site/*
19 | 
20 | *.brename_detail.txt
21 | 
22 | *cpu.pprof
23 | *mem.pprof
24 | *trace.out
25 | 
26 | t_*
27 | 
28 | blastdb
29 | test/test
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Wei Shen (shenwei356@gmail.com)
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
 1 | package strobemers
 2 | 
 3 | func roundup64(x uint64) uint64 {
 4 | 	if x == 0 {
 5 | 		return 1
 6 | 	}
 7 | 	x--
 8 | 	x |= x >> 1
 9 | 	x |= x >> 2
10 | 	x |= x >> 4
11 | 	x |= x >> 8
12 | 	x |= x >> 16
13 | 	x |= x >> 32
14 | 	return (x | x>>64) + 1
15 | }
16 | 
// cbases is a 256-entry lookup table that maps a byte to its complementary
// base. The letters A, C, G, T and U (either case) map to the upper-case
// complement, the values 0-3 map to 'T', 'G', 'C', 'A' (presumably the
// complements of 2-bit encoded bases; confirm against callers), and every
// other byte maps to 'N'.
//
// only used in tests
var cbases = func() [256]byte {
	var table [256]byte
	for i := range table {
		table[i] = 'N'
	}

	table[0], table[1], table[2], table[3] = 'T', 'G', 'C', 'A'

	pairs := [...][2]byte{
		{'A', 'T'},
		{'C', 'G'},
		{'G', 'C'},
		{'T', 'A'},
		{'U', 'A'},
	}
	for _, p := range pairs {
		table[p[0]] = p[1]      // upper-case input
		table[p[0]|0x20] = p[1] // lower-case input, still upper-case output
	}
	return table
}()
36 | 


--------------------------------------------------------------------------------
/randstrobes_test.go:
--------------------------------------------------------------------------------
 1 | package strobemers
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"strings"
 6 | 	"testing"
 7 | )
 8 | 
 9 | func TestRandStrobesOrder2(t *testing.T) {
10 | 	_s := "ACGATCTGGTACCTAG"
11 | 	s := []byte(_s)
12 | 
13 | 	n := 2
14 | 	l := 3
15 | 	wMin := 3
16 | 	wMax := 5
17 | 	rs, err := NewRandStrobes(&s, n, l, wMin, wMax)
18 | 	if err != nil {
19 | 		t.Error(err)
20 | 	}
21 | 
22 | 	var h uint64
23 | 	var ok bool
24 | 	var ps []int
25 | 	var i1, i2 int
26 | 	for {
27 | 		h, ok = rs.Next()
28 | 		if !ok {
29 | 			break
30 | 		}
31 | 
32 | 		if !debug {
33 | 			continue
34 | 		}
35 | 
36 | 		ps = rs.Indexes()
37 | 		i1, i2 = ps[0], ps[1]
38 | 		fmt.Printf("%s len:%d\n", _s, len(_s))
39 | 		fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
40 | 		fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
41 | 		fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
42 | 		fmt.Println()
43 | 	}
44 | }
45 | 
46 | func TestRandStrobesOrder3(t *testing.T) {
47 | 	_s := "ACGATCTGGTACCTAG"
48 | 	s := []byte(_s)
49 | 
50 | 	n := 3
51 | 	l := 3
52 | 	wMin := 3
53 | 	wMax := 5
54 | 	rs, err := NewRandStrobes(&s, n, l, wMin, wMax)
55 | 	if err != nil {
56 | 		t.Error(err)
57 | 	}
58 | 
59 | 	var h uint64
60 | 	var ok bool
61 | 	var ps []int
62 | 	var i1, i2, i3 int
63 | 	for {
64 | 		h, ok = rs.Next()
65 | 		if !ok {
66 | 			break
67 | 		}
68 | 
69 | 		if !debug {
70 | 			continue
71 | 		}
72 | 
73 | 		ps = rs.Indexes()
74 | 		i1, i2, i3 = ps[0], ps[1], ps[2]
75 | 		fmt.Printf("%s len:%d\n", _s, len(_s))
76 | 		fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
77 | 		fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
78 | 		fmt.Printf("%s%s i3:%d\n", strings.Repeat(" ", i3), _s[i3:i3+l], i3)
79 | 		fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
80 | 		fmt.Println()
81 | 	}
82 | }
83 | 


--------------------------------------------------------------------------------
/minstrobes_test.go:
--------------------------------------------------------------------------------
 1 | package strobemers
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"strings"
 6 | 	"testing"
 7 | )
 8 | 
 9 | func TestMinStrobesOrders2(t *testing.T) {
10 | 	_s := "ACGATCTGGTACCTAG"
11 | 	s := []byte(_s)
12 | 
13 | 	n := 2
14 | 	l := 3
15 | 	wMin := 3
16 | 	wMax := 5
17 | 	ms, err := NewMinStrobes(&s, n, l, wMin, wMax)
18 | 	if err != nil {
19 | 		t.Error(err)
20 | 	}
21 | 
22 | 	// for i, m := range ms.minhashes {
23 | 	// 	fmt.Println(i, m)
24 | 	// }
25 | 
26 | 	var h uint64
27 | 	var ok bool
28 | 	var ps []int
29 | 	var i1, i2 int
30 | 	for {
31 | 		h, ok = ms.Next()
32 | 		if !ok {
33 | 			break
34 | 		}
35 | 
36 | 		if !debug {
37 | 			continue
38 | 		}
39 | 
40 | 		ps = ms.Indexes()
41 | 		i1, i2 = ps[0], ps[1]
42 | 		fmt.Printf("%s len:%d\n", _s, len(_s))
43 | 		fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
44 | 		fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
45 | 		fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
46 | 		fmt.Println()
47 | 	}
48 | }
49 | 
50 | func TestMinStrobesOrder3(t *testing.T) {
51 | 	_s := "ACGATCTGGTACCTAG"
52 | 	s := []byte(_s)
53 | 
54 | 	n := 3
55 | 	l := 3
56 | 	wMin := 3
57 | 	wMax := 5
58 | 	rs, err := NewMinStrobes(&s, n, l, wMin, wMax)
59 | 	if err != nil {
60 | 		t.Error(err)
61 | 	}
62 | 
63 | 	var h uint64
64 | 	var ok bool
65 | 	var ps []int
66 | 	var i1, i2, i3 int
67 | 	for {
68 | 		h, ok = rs.Next()
69 | 		if !ok {
70 | 			break
71 | 		}
72 | 
73 | 		if !debug {
74 | 			continue
75 | 		}
76 | 
77 | 		ps = rs.Indexes()
78 | 		i1, i2, i3 = ps[0], ps[1], ps[2]
79 | 		fmt.Printf("%s len:%d\n", _s, len(_s))
80 | 		fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
81 | 		fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
82 | 		fmt.Printf("%s%s i3:%d\n", strings.Repeat(" ", i3), _s[i3:i3+l], i3)
83 | 		fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
84 | 		fmt.Println()
85 | 	}
86 | }
87 | 


--------------------------------------------------------------------------------
/evaluation/q2-snp3.fasta.blastn:
--------------------------------------------------------------------------------
 1 | BLASTN 2.11.0+
 2 | 
 3 | 
 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
 6 | Comput Biol 2000; 7(1-2):203-14.
 7 | 
 8 | 
 9 | 
10 | Database: r2.fasta
11 |            1 sequences; 1,688,298 total letters
12 | 
13 | 
14 | 
15 | Query= q2
16 | 
17 | Length=150
18 |                                                                       Score     E
19 | Sequences producing significant alignments:                          (Bits)  Value
20 | 
21 | r2                                                                    261     4e-71
22 | 
23 | 
24 | >r2
25 | Length=1688298
26 | 
27 |  Score = 261 bits (141),  Expect = 4e-71
28 |  Identities = 147/150 (98%), Gaps = 0/150 (0%)
29 |  Strand=Plus/Plus
30 | 
31 | Query  1       GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG  60
32 |                ||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||
33 | Sbjct  869619  GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACTG  869678
34 | 
35 | Query  61      CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA  120
36 |                ||| |||||||| |||||||||||||||||||||||||||||||||||||||||||||||
37 | Sbjct  869679  CGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA  869738
38 | 
39 | Query  121     GACACCCGTCACAAGCATGACATGATAATT  150
40 |                ||||||||||||||||||||||||||||||
41 | Sbjct  869739  GACACCCGTCACAAGCATGACATGATAATT  869768
42 | 
43 | 
44 | 
45 | Lambda      K        H
46 |     1.33    0.621     1.12 
47 | 
48 | Gapped
49 | Lambda      K        H
50 |     1.28    0.460    0.850 
51 | 
52 | Effective search space used: 221164549
53 | 
54 | 
55 |   Database: r2.fasta
56 |     Posted date:  Apr 14, 2021  7:44 PM
57 |   Number of letters in database: 1,688,298
58 |   Number of sequences in database:  1
59 | 
60 | 
61 | 
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 | 


--------------------------------------------------------------------------------
/evaluation/q2-snp3-gap1.fasta.blastn:
--------------------------------------------------------------------------------
 1 | BLASTN 2.11.0+
 2 | 
 3 | 
 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
 6 | Comput Biol 2000; 7(1-2):203-14.
 7 | 
 8 | 
 9 | 
10 | Database: r2.fasta
11 |            1 sequences; 1,688,298 total letters
12 | 
13 | 
14 | 
15 | Query= q2
16 | 
17 | Length=151
18 |                                                                       Score     E
19 | Sequences producing significant alignments:                          (Bits)  Value
20 | 
21 | r2                                                                    255     2e-69
22 | 
23 | 
24 | >r2
25 | Length=1688298
26 | 
27 |  Score = 255 bits (138),  Expect = 2e-69
28 |  Identities = 147/151 (97%), Gaps = 1/151 (1%)
29 |  Strand=Plus/Plus
30 | 
31 | Query  1       GAGGAATTAACGAACAGATACACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACT  60
32 |                |||||||||||||||||||| ||||||||||||||||||||||||| |||||||||||||
33 | Sbjct  869619  GAGGAATTAACGAACAGATA-ACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACT  869677
34 | 
35 | Query  61      GCGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCG  120
36 |                |||| |||||||| ||||||||||||||||||||||||||||||||||||||||||||||
37 | Sbjct  869678  GCGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCG  869737
38 | 
39 | Query  121     AGACACCCGTCACAAGCATGACATGATAATT  151
40 |                |||||||||||||||||||||||||||||||
41 | Sbjct  869738  AGACACCCGTCACAAGCATGACATGATAATT  869768
42 | 
43 | 
44 | 
45 | Lambda      K        H
46 |     1.33    0.621     1.12 
47 | 
48 | Gapped
49 | Lambda      K        H
50 |     1.28    0.460    0.850 
51 | 
52 | Effective search space used: 222852828
53 | 
54 | 
55 |   Database: r2.fasta
56 |     Posted date:  Apr 14, 2021  7:44 PM
57 |   Number of letters in database: 1,688,298
58 |   Number of sequences in database:  1
59 | 
60 | 
61 | 
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 | 


--------------------------------------------------------------------------------
/evaluation/q0-snp1.fasta.blastn:
--------------------------------------------------------------------------------
 1 | BLASTN 2.11.0+
 2 | 
 3 | 
 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
 6 | Comput Biol 2000; 7(1-2):203-14.
 7 | 
 8 | 
 9 | 
10 | Database: r0.fasta
11 |            1 sequences; 1,560,393 total letters
12 | 
13 | 
14 | 
15 | Query= RL|S1|R634/1
16 | 
17 | Length=150
18 |                                                                       Score     E
19 | Sequences producing significant alignments:                          (Bits)  Value
20 | 
21 | r0                                                                    272     2e-74
22 | 
23 | 
24 | >r0
25 | Length=1560393
26 | 
27 |  Score = 272 bits (147),  Expect = 2e-74
28 |  Identities = 149/150 (99%), Gaps = 0/150 (0%)
29 |  Strand=Plus/Plus
30 | 
31 | Query  1       CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC  60
32 |                ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
33 | Sbjct  396270  CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC  396329
34 | 
35 | Query  61      GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG  120
36 |                ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
37 | Sbjct  396330  GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG  396389
38 | 
39 | Query  121     AGGAAATGCAAGAAACCGTGGACTTCCTGG  150
40 |                |||||||| |||||||||||||||||||||
41 | Sbjct  396390  AGGAAATGGAAGAAACCGTGGACTTCCTGG  396419
42 | 
43 | 
44 | 
45 | Lambda      K        H
46 |     1.33    0.621     1.12 
47 | 
48 | Gapped
49 | Lambda      K        H
50 |     1.28    0.460    0.850 
51 | 
52 | Effective search space used: 204408994
53 | 
54 | 
55 |   Database: r0.fasta
56 |     Posted date:  Apr 14, 2021  7:44 PM
57 |   Number of letters in database: 1,560,393
58 |   Number of sequences in database:  1
59 | 
60 | 
61 | 
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 | 


--------------------------------------------------------------------------------
/evaluation/q1-snp7.fasta.blastn:
--------------------------------------------------------------------------------
 1 | BLASTN 2.11.0+
 2 | 
 3 | 
 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
 6 | Comput Biol 2000; 7(1-2):203-14.
 7 | 
 8 | 
 9 | 
10 | Database: r1.fasta
11 |            1 sequences; 2,833,277 total letters
12 | 
13 | 
14 | 
15 | Query= q1
16 | 
17 | Length=150
18 |                                                                       Score     E
19 | Sequences producing significant alignments:                          (Bits)  Value
20 | 
21 | r1                                                                    239     3e-64
22 | 
23 | 
24 | >r1
25 | Length=2833277
26 | 
27 |  Score = 239 bits (129),  Expect = 3e-64
28 |  Identities = 143/150 (95%), Gaps = 0/150 (0%)
29 |  Strand=Plus/Minus
30 | 
31 | Query  1        CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCTGAAGGGCT  60
32 |                 ||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||
33 | Sbjct  2741385  CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCCGAAGGGCT  2741326
34 | 
35 | Query  61       GCCGAAGGACAAGTCGCCGAGCAAGCCCGCCAGCTTCCGCTGTCTGGGCAAGCCGGAACC  120
36 |                 |||| ||||||||||||||||||||| ||||||||| ||||| |||||||||||||||||
37 | Sbjct  2741325  GCCGCAGGACAAGTCGCCGAGCAAGCACGCCAGCTTTCGCTGGCTGGGCAAGCCGGAACC  2741266
38 | 
39 | Query  121      GGCGGTACAGAAGATCCTCGACCAACGACT  150
40 |                 |||||||||||||||||||||| ||| |||
41 | Sbjct  2741265  GGCGGTACAGAAGATCCTCGACGAACAACT  2741236
42 | 
43 | 
44 | 
45 | Lambda      K        H
46 |     1.33    0.621     1.12 
47 | 
48 | Gapped
49 | Lambda      K        H
50 |     1.28    0.460    0.850 
51 | 
52 | Effective search space used: 368323410
53 | 
54 | 
55 |   Database: r1.fasta
56 |     Posted date:  Apr 14, 2021  7:44 PM
57 |   Number of letters in database: 2,833,277
58 |   Number of sequences in database:  1
59 | 
60 | 
61 | 
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548/go.mod h1:e6NPNENfs9mPDVNRekM7lKScauxd5kXTr1Mfyig6TDM=
 2 | github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8 h1:LpMLYGyy67BoAFGda1NeOBQwqlv7nUXpm+rIVHGxZZ4=
 3 | github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8/go.mod h1:q2w6Bg5jeox1B+QkJ6Wp/+Vn0G/bo3f1uY7Fn3vivIQ=
 4 | github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M=
 5 | github.com/klauspost/compress v1.11.4 h1:kz40R/YWls3iqT9zX9AHN3WoVsrAWVyui5sxuLqiXqU=
 6 | github.com/klauspost/compress v1.11.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 7 | github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE=
 8 | github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
 9 | github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
10 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
11 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
12 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
13 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
14 | github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
15 | github.com/shenwei356/bio v0.1.0 h1:VDnI28zcdybywdn6/tcZvplAJ1IxOAAYrTJhhTB1SLQ=
16 | github.com/shenwei356/bio v0.1.0/go.mod h1:NgFauYHlpmjCYEf2XP8foITht6ej6poggQkILpjraN4=
17 | github.com/shenwei356/bpool v0.0.0-20160710042833-f9e0ee4d0403 h1:/3JklLnHXiWUBxWc3joQYavDQJpncRhRA909cUb7eOw=
18 | github.com/shenwei356/bpool v0.0.0-20160710042833-f9e0ee4d0403/go.mod h1:YkgdTWfNnJgv5HVJbVSDmxQtkK3/jZWDoqcG26BVU8k=
19 | github.com/shenwei356/breader v0.1.0/go.mod h1:YXIrHIPtbJCP6Kv27qGp+cXQl7hyzD0iQrEVYCy/gqw=
20 | github.com/shenwei356/util v0.3.0 h1:gTVa3sGwcyGEHgNpXTzdL3MaaJN/bGAypVKSCnT4QfU=
21 | github.com/shenwei356/util v0.3.0/go.mod h1:n3qhc3bQzlqJ2/5v79hgl0Gd3WzJOkI8XcUix25Brdg=
22 | github.com/shenwei356/xopen v0.0.0-20181203091311-f4f16ddd3992 h1:RXEEyKj0JL3SrRIYsWIEyy4AwjHbI3I8aDGK6CA4+YI=
23 | github.com/shenwei356/xopen v0.0.0-20181203091311-f4f16ddd3992/go.mod h1:6EQUa6I7Zsl2GQKqcL9qGLrTzVE+oZyly+uhzovQYSk=
24 | github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553/go.mod h1:Rj7Csq/tZ/egz+Ltc2IVpsA5309AmSMEswjkTZmq2Xc=
25 | github.com/will-rowe/nthash v0.3.0 h1:yN+Il98GRWyp7HdaiEbsE7KC4ySEKtPatm+SLZ5uQBk=
26 | github.com/will-rowe/nthash v0.3.0/go.mod h1:5ezweuK0J5j+/7lih/RkrSmnxI3hoaPpQiVWJ7rd960=
27 | github.com/zeebo/xxh3 v0.10.0 h1:1+2Mov9zfxTNUeoDG9k9i13VfxTR0p1JQu8L0vikxB0=
28 | github.com/zeebo/xxh3 v0.10.0/go.mod h1:AQY73TOrhF3jNsdiM9zZOb8MThrYbZONHj7ryDBaLpg=
29 | golang.org/x/sys v0.0.0-20200727154430-2d971f7391a4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
30 | golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005 h1:pDMpM2zh2MT0kHy037cKlSby2nEhD50SYqwQk76Nm40=
31 | golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
32 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
33 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
34 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Strobemers in Go
  2 | 
  3 | [![GoDoc](https://godoc.org/github.com/shenwei356/strobemers?status.svg)](https://godoc.org/github.com/shenwei356/strobemers)
  4 | [![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/strobemers)](https://goreportcard.com/report/github.com/shenwei356/strobemers)
  5 | 
  6 | ## Introduction
  7 | 
  8 | This is a Go implementation of the [strobemers](https://github.com/ksahlin/strobemers) (minstrobes and randstrobes),
  9 | with some [differences](#differences).
 10 | 
 11 | The implementation of `Randstrobes` performs reasonably well: it is 2-3X slower than regular k-mers
 12 | and 10-20X slower than [ntHash](https://github.com/will-rowe/nthash/).
 13 | Besides, `Randstrobes` is only slightly slower than `MinStrobes` (see [benchmark](#benchmark)).
 14 | 
 15 | ### Attention
 16 | 
 17 | The current implementation only computes strobemers of the positive strand,
 18 | because the strobes are asymmetrical and the location matters.
 19 | 
 20 | ## Installation
 21 | 
 22 |     go get github.com/shenwei356/strobemers
 23 | 
 24 | ## Quick Start
 25 | 
 26 | We followed the code style of [ntHash](https://github.com/will-rowe/nthash/).
 27 | 
 28 | ```go
 29 | n := 2
 30 | l := 3
 31 | w_min := 3
 32 | w_max := 5
 33 | rs, err := strobemers.NewRandStrobes(seq, n, l, w_min, w_max)
 34 | checkError(err)
 35 | 
 36 | var hash uint64
 37 | var ok bool
 38 | var i int  // 0-based index
 39 | var positions []int // 0-based indexes of all strobes
 40 | 
 41 | rs.SetWindowShrink(true)
 42 | for {
 43 |     hash, ok = rs.Next()
 44 |     if !ok {
 45 |         break
 46 |     }
 47 | 
 48 |     i = rs.Index()
 49 |     positions = rs.Indexes()
 50 | }
 51 | 
 52 | ```
 53 | 
 54 | ## Differences
 55 | 
 56 | Here are some differences compared to the original implementation,
 57 | see discussion: [#1](https://github.com/ksahlin/strobemers/issues/1), [#2](https://github.com/ksahlin/strobemers/issues/2).
 58 | 
 59 | item                  |original               |this                              |comment
 60 | :---------------------|:----------------------|:---------------------------------|:-----------------------------------------
 61 | window range          |`w_min < w_max`        |`w_min <= w_max`                  |allow a fixed position
 62 | shrinking window      |all `w_min` and `w_max`|optional shrinking last `w_max`   |see figures below
 63 | number of strobemers  |`len(seq)-n*l+1`       |`len(seq)-n*l+1-(n-1)*l`          |window shrunk
 64 | number of strobemers  |                       |`len(seq)-n*l+1-(n-1)*(l+w_min-1)`|window not shrunk
 65 | choice of min hash    |`(h(m)+h(mj))%q`       |`(h(m)+h(mj))&q`                  |`&` is faster than `%`
 66 | final hash value (n=2)|`h(m1)-h(m2)`          |`h(m1)/2+h(m2)/3`                 |keep asymmetry and avoid `uint64` overflow
 67 | final hash value (n=3)|`h(m1)-h(m2)+2*h(m3)`  |`h(m1)/3+h(m2)/4+h(m3)/5`         |~
 68 | 
 69 | <img src="illustration_randstrobes_order2.jpg" width="750" />
 70 | 
 71 | <img src="illustration_randstrobes_order3.jpg" width="750" />
 72 | 
 73 | ## Benchmark
 74 | 
 75 | method                 |time  |relative_time
 76 | :----------------------|:-----|:------------
 77 | ntHashKmers(30)        |8590  |1
 78 | Kmers(30)              |55579 |6
 79 | MinStrobes(2,15,20,30) |104520|12
 80 | MinStrobes(3,10,20,30) |111662|13
 81 | RandStrobes(2,15,20,30)|93436 |11
 82 | RandStrobes(3,10,20,30)|152461|18
 83 | 
 84 |     $ go test . -bench=Benchmark* -benchmem \
 85 |         | grep Bench \
 86 |         | perl -pe 's/\s\s+/\t/g' \
 87 |         | csvtk cut -Ht -f 1,3-5 \
 88 |         | csvtk add-header -t -n test,time,memory,allocs \
 89 |         | csvtk pretty -t -r
 90 | 
 91 |                                      test           time       memory        allocs
 92 |     -------------------------------------   ------------   ----------   -----------
 93 |                BenchmarkNTHash/1.00_KB-16     8590 ns/op      48 B/op   1 allocs/op
 94 |                 BenchmarkKmers/1.00_KB-16    55579 ns/op      32 B/op   1 allocs/op
 95 |      BenchmarkMinStrobesOrder2/1.00_KB-16   104520 ns/op   25064 B/op   7 allocs/op
 96 |      BenchmarkMinStrobesOrder3/1.00_KB-16   111662 ns/op   25064 B/op   7 allocs/op
 97 |     BenchmarkRandStrobesOrder2/1.00_KB-16    93436 ns/op    8432 B/op   3 allocs/op
 98 |     BenchmarkRandStrobesOrder3/1.00_KB-16   152461 ns/op    8432 B/op   3 allocs/op
 99 | 
100 | 
101 | ## Similar Projects
102 | 
103 | - [strobemer_cpptest](https://github.com/BGI-Qingdao/strobemer_cpptest)
104 | 
105 | ## References
106 | 
107 | - [ntHash](http://dx.doi.org/10.1093/bioinformatics/btw397)
108 | - [strobemers](https://doi.org/10.1101/2021.01.28.428549)
109 | 


--------------------------------------------------------------------------------
/common.go:
--------------------------------------------------------------------------------
  1 | package strobemers
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"sort"
  6 | 
  7 | 	"github.com/will-rowe/nthash"
  8 | )
  9 | 
 10 | // defaultPrimeNumber is the prime number in minimizing h(m)+h(mj) mod q.
 11 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1
 12 | var defaultPrimeNumber uint64 = (1 << 20) - 1
 13 | 
 14 | // ------------------------------------------------------------------------
 15 | // errors
 16 | 
 17 | // ErrOrderNotSupported means a big strobemer order is not supported.
 18 | var ErrOrderNotSupported = fmt.Errorf("strobemers: strobemer order not supported")
 19 | 
 20 | // ErrInvalidOrder means
 21 | var ErrInvalidOrder = fmt.Errorf("strobemers: strobemer order too small")
 22 | 
 23 | // ErrInvalidSequence means the given sequence is invalid
 24 | var ErrInvalidSequence = fmt.Errorf("strobemers: invalid DNA sequence")
 25 | 
 26 | // ErrSequenceTooShort means the sequence is too short
 27 | var ErrSequenceTooShort = fmt.Errorf("strobemers: sequence too short")
 28 | 
 29 | // ErrStrobeLengthTooSmall means the strobe length is too small
 30 | var ErrStrobeLengthTooSmall = fmt.Errorf("strobemers: strobe length too small")
 31 | 
 32 | // ErrInvalidWindowOffsets means invalid window offsets
 33 | var ErrInvalidWindowOffsets = fmt.Errorf("strobemers: window offset should be > 0, and wMin <= wMax")
 34 | 
 35 | // ErrIncompleteHashValues means incomplete hash values
 36 | var ErrIncompleteHashValues = fmt.Errorf("strobemers: incomplete hash values")
 37 | 
 38 | var ErrPrimeNumberTooSmall = fmt.Errorf("strobemers: the primer number is too small")
 39 | 
 40 | // ------------------------------------------------------------------------
 41 | 
 42 | func computeHashes(sequence *[]byte, k int) ([]uint64, error) {
 43 | 	hasher, err := nthash.NewHasher(sequence, uint(k))
 44 | 	if err != nil {
 45 | 		return nil, err
 46 | 	}
 47 | 
 48 | 	hashes := make([]uint64, len(*sequence)-k+1)
 49 | 	var hash uint64
 50 | 	var ok bool
 51 | 	var i int
 52 | 	for {
 53 | 		hash, ok = hasher.Next(true)
 54 | 		if !ok {
 55 | 			break
 56 | 		}
 57 | 		hashes[i] = hash
 58 | 		i++
 59 | 	}
 60 | 
 61 | 	if i != len(*sequence)-k+1 {
 62 | 		return nil, ErrIncompleteHashValues
 63 | 	}
 64 | 
 65 | 	return hashes, nil
 66 | }
 67 | 
// computeMinHashes computes, for every position idx >= w-1, the minimum hash
// within the window of w consecutive hashes ending at idx, i.e.
// hashes[idx-w+1 : idx+1].
//
// It returns two slices of len(hashes): the location of the minimum and the
// minimum value itself. Entries for idx < w-1 (incomplete windows) are left
// at their zero value. For w == 1 every hash is its own minimum and the input
// slice is returned as the second result without copying.
//
// The window content is kept in buf, sorted by hash value, so buf[0] is
// always the current minimum.
func computeMinHashes(hashes []uint64, w int) ([]int, []uint64) {
	locs := make([]int, len(hashes))
	if w == 1 {
		for i := range hashes {
			locs[i] = i
		}
		return locs, hashes
	}

	minHashes := make([]uint64, len(hashes))

	var hash uint64
	var i, idxMw, b, e, t int
	var i2v IdxValue
	var flag bool

	buf := make([]IdxValue, 0, w)
	end := len(hashes)
	r := w - 1 // last position in the buffer

	for idx := 0; idx < end; idx++ { // idx is end position of a window
		hash = hashes[idx]

		if idx < r { // front of w
			buf = append(buf, IdxValue{Idx: idx, Val: hash}) // add current hash to buf
			continue
		}

		if idx == r { // position w
			// first complete window: sort it once, later updates keep it sorted
			buf = append(buf, IdxValue{Idx: idx, Val: hash}) // add current hash to buf
			sort.Sort(idxValues(buf))

			i2v = buf[0] // the smallest one
			locs[idx] = i2v.Idx
			minHashes[idx] = i2v.Val
			continue
		}

		// find min k-mer

		// remove k-mer not in this window.
		// have to check position/index one by one
		idxMw = idx - w // index of the element that just left the window
		for i, i2v = range buf {
			if i2v.Idx == idxMw {
				if i < r { // not the last element
					copy(buf[i:r], buf[i+1:])
				} // happen to be at the end
				buf = buf[:r]
				break
			}
		}

		// add new k-mer
		flag = false
		// using binary search, faster than linear search
		// after the loop: flag == false means append at the end,
		// flag == true means insert at (or right after) position i.
		b, e = 0, r-1
		for {
			t = b + (e-b)/2
			if hash < buf[t].Val {
				e = t - 1 // end search here
				if e <= b {
					flag = true
					i = b
					break
				}
			} else {
				b = t + 1 // start here
				if b >= r {
					flag = false
					break
				}
				if b >= e {
					flag = true
					i = e // right here
					break
				}
			}
		}
		if !flag { // it's the biggest one, append to the end
			buf = append(buf, IdxValue{idx, hash})
		} else {
			// the search may stop one slot before the insertion point
			if hash >= buf[i].Val { // have to check again
				i++
			}
			buf = append(buf, blankI2V) // append one element
			copy(buf[i+1:], buf[i:r])   // move right
			buf[i] = IdxValue{idx, hash}
		}

		i2v = buf[0] // the smallest one
		locs[idx] = i2v.Idx
		minHashes[idx] = i2v.Val
	}

	return locs, minHashes
}
165 | 
// IdxValue pairs a hash value with the position it was computed at.
type IdxValue struct {
	Idx int    // position of the hash in the hash list
	Val uint64 // hash value
}

// blankI2V is a zero-valued placeholder appended to grow a buffer by one
// element before the real value is written.
var blankI2V IdxValue

// idxValues sorts a list of IdxValue by ascending hash value.
type idxValues []IdxValue

// Len returns the number of elements.
func (l idxValues) Len() int { return len(l) }

// Less reports whether element i has a smaller hash value than element j.
func (l idxValues) Less(i, j int) bool { return l[i].Val < l[j].Val }

// Swap exchanges elements i and j.
func (l idxValues) Swap(i, j int) { l[j], l[i] = l[i], l[j] }
178 | 


--------------------------------------------------------------------------------
/strobemers_test.go:
--------------------------------------------------------------------------------
  1 | //Package strobemers is a Go implementation of the https://github.com/ksahlin/strobemers.
  2 | 
  3 | package strobemers
  4 | 
  5 | import (
  6 | 	"math/rand"
  7 | 	"testing"
  8 | 
  9 | 	"github.com/shenwei356/util/bytesize"
 10 | 	"github.com/will-rowe/nthash"
 11 | 	"github.com/zeebo/xxh3"
 12 | )
 13 | 
 14 | var debug = true
 15 | 
 16 | var seqs [][]byte
 17 | 
 18 | var bit2base = [4]byte{'A', 'C', 'G', 'T'}
 19 | 
 20 | func init() {
 21 | 	rand.Seed(11)
 22 | 
 23 | 	sizes := []int{1 << 10} //, 1 << 20} //, 10 << 20}
 24 | 	seqs = make([][]byte, len(sizes))
 25 | 	for i, size := range sizes {
 26 | 		sequence := make([]byte, size)
 27 | 		for j := 0; j < size; j++ {
 28 | 			sequence[j] = bit2base[rand.Intn(4)]
 29 | 		}
 30 | 		seqs[i] = sequence
 31 | 	}
 32 | }
 33 | 
 34 | var _hash uint64
 35 | var _k int = 30
 36 | var _n2 int = 2
 37 | var _l2 int = 15
 38 | var _n3 int = 3
 39 | var _l3 int = 10
 40 | var _w_min int = 20
 41 | var _w_max int = 30
 42 | 
 43 | func BenchmarkNTHash(b *testing.B) {
 44 | 	for i := range seqs {
 45 | 		size := len(seqs[i])
 46 | 		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
 47 | 			for j := 0; j < b.N; j++ {
 48 | 				var hash uint64
 49 | 				var ok bool
 50 | 				var hasher *nthash.NTHi
 51 | 				var err error
 52 | 
 53 | 				hasher, err = nthash.NewHasher(&seqs[i], uint(_k))
 54 | 				if err != nil {
 55 | 					b.Errorf("fail to create ntHasher iterator. seq length: %d", size)
 56 | 				}
 57 | 
 58 | 				for {
 59 | 					hash, ok = hasher.Next(true)
 60 | 					if !ok {
 61 | 						break
 62 | 					}
 63 | 
 64 | 					_hash = hash
 65 | 				}
 66 | 			}
 67 | 		})
 68 | 	}
 69 | }
 70 | 
 71 | func BenchmarkKmers(b *testing.B) {
 72 | 	for i := range seqs {
 73 | 		size := len(seqs[i])
 74 | 		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
 75 | 			for j := 0; j < b.N; j++ {
 76 | 				var hash, hashrc uint64
 77 | 				var end int
 78 | 				var seq []byte
 79 | 				var rc []byte
 80 | 				var _i, _j int
 81 | 
 82 | 				rc = make([]byte, _k)
 83 | 				seq = seqs[i]
 84 | 				end = len(seq) - _k + 1
 85 | 
 86 | 				for i := 0; i < end; i++ {
 87 | 					hash = xxh3.Hash(seq[i : i+_k])
 88 | 
 89 | 					// complementary sequence
 90 | 					for _i = 0; _i < _k; _i++ {
 91 | 						rc[_i] = cbases[seq[i+_i]]
 92 | 					}
 93 | 					// reverse
 94 | 					for _i, _j = 0, _k-1; _i < _j; _i, _j = _i+1, _j-1 {
 95 | 						rc[_i], rc[_j] = rc[_j], rc[_i]
 96 | 					}
 97 | 					hashrc = xxh3.Hash(rc)
 98 | 
 99 | 					// canonical kmer
100 | 					if hash < hashrc {
101 | 						_hash = hash
102 | 					} else {
103 | 						_hash = hashrc
104 | 					}
105 | 				}
106 | 			}
107 | 		})
108 | 	}
109 | }
110 | 
111 | func BenchmarkMinStrobesOrder2(b *testing.B) {
112 | 	for i := range seqs {
113 | 		size := len(seqs[i])
114 | 		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
115 | 			for j := 0; j < b.N; j++ {
116 | 				var hash uint64
117 | 				var ok bool
118 | 				var rs *MinStrobes
119 | 				var err error
120 | 
121 | 				rs, err = NewMinStrobes(&seqs[i], _n2, _l2, _w_min, _w_max)
122 | 				if err != nil {
123 | 					b.Errorf("fail to create MinStrobes. seq length: %d", size)
124 | 				}
125 | 
126 | 				for {
127 | 					hash, ok = rs.Next()
128 | 					if !ok {
129 | 						break
130 | 					}
131 | 
132 | 					_hash = hash
133 | 				}
134 | 			}
135 | 		})
136 | 	}
137 | }
138 | 
139 | func BenchmarkMinStrobesOrder3(b *testing.B) {
140 | 	for i := range seqs {
141 | 		size := len(seqs[i])
142 | 		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
143 | 			for j := 0; j < b.N; j++ {
144 | 				var hash uint64
145 | 				var ok bool
146 | 				var rs *MinStrobes
147 | 				var err error
148 | 
149 | 				rs, err = NewMinStrobes(&seqs[i], _n3, _l3, _w_min, _w_max)
150 | 				if err != nil {
151 | 					b.Errorf("fail to create MinStrobes. seq length: %d", size)
152 | 				}
153 | 
154 | 				for {
155 | 					hash, ok = rs.Next()
156 | 					if !ok {
157 | 						break
158 | 					}
159 | 
160 | 					_hash = hash
161 | 				}
162 | 			}
163 | 		})
164 | 	}
165 | }
166 | 
167 | func BenchmarkRandStrobesOrder2(b *testing.B) {
168 | 	for i := range seqs {
169 | 		size := len(seqs[i])
170 | 		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
171 | 			for j := 0; j < b.N; j++ {
172 | 				var hash uint64
173 | 				var ok bool
174 | 				var rs *RandStrobes
175 | 				var err error
176 | 
177 | 				rs, err = NewRandStrobes(&seqs[i], _n2, _l2, _w_min, _w_max)
178 | 				if err != nil {
179 | 					b.Errorf("fail to create RandStrobes. seq length: %d", size)
180 | 				}
181 | 
182 | 				for {
183 | 					hash, ok = rs.Next()
184 | 					if !ok {
185 | 						break
186 | 					}
187 | 
188 | 					_hash = hash
189 | 				}
190 | 			}
191 | 		})
192 | 	}
193 | }
194 | 
195 | func BenchmarkRandStrobesOrder3(b *testing.B) {
196 | 	for i := range seqs {
197 | 		size := len(seqs[i])
198 | 		b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
199 | 			for j := 0; j < b.N; j++ {
200 | 				var hash uint64
201 | 				var ok bool
202 | 				var rs *RandStrobes
203 | 				var err error
204 | 
205 | 				rs, err = NewRandStrobes(&seqs[i], _n3, _l3, _w_min, _w_max)
206 | 				if err != nil {
207 | 					b.Errorf("fail to create RandStrobes. seq length: %d", size)
208 | 				}
209 | 
210 | 				for {
211 | 					hash, ok = rs.Next()
212 | 					if !ok {
213 | 						break
214 | 					}
215 | 
216 | 					_hash = hash
217 | 				}
218 | 			}
219 | 		})
220 | 	}
221 | }
222 | 


--------------------------------------------------------------------------------
/randstrobes.go:
--------------------------------------------------------------------------------
  1 | package strobemers
  2 | 
  3 | import "math"
  4 | 
  5 | // RandStrobes is a iterator for randstrobes
  6 | type RandStrobes struct {
  7 | 	seq *[]byte // DNA sequence
  8 | 
  9 | 	n    int // strobemer order
 10 | 	l    int // strobes length
 11 | 	wMin int // minimum window offset
 12 | 	wMax int // maximum window offset
 13 | 
 14 | 	idx, idx2, idx3     int    // indexes of m1, m2, m3
 15 | 	hash1, hash2, hash3 uint64 // hash value of m1, m2, m3
 16 | 
 17 | 	hashes []uint64 // precomputed ntHash values of l-mers
 18 | 
 19 | 	endHash int // position of the last l-mer
 20 | 	endIdx  int // position of the last m1
 21 | 
 22 | 	wStart, wEnd, w2Start, w2End int // window start and end
 23 | 
 24 | 	prime uint64
 25 | 
 26 | 	// shrink the last searching window for positions near the end of sequence.
 27 | 	shrinkWindow bool
 28 | 
 29 | 	// tmp variable
 30 | 	i    int
 31 | 	hash uint64
 32 | }
 33 | 
 34 | // NewRandStrobes creates a RandStrobes iterator.
 35 | // Parameters:
 36 | //     n    - strobemer order
 37 | //     l    - strobes length
 38 | //     wMin - minimum window offset, wMin > 0
 39 | //     wMax - maximum window offset, wMin <= wMax.
 40 | func NewRandStrobes(seq *[]byte, n int, l int, wMin int, wMax int) (*RandStrobes, error) {
 41 | 	if seq == nil || len(*seq) == 0 {
 42 | 		return nil, ErrInvalidSequence
 43 | 	}
 44 | 	if n < 2 {
 45 | 		return nil, ErrInvalidOrder
 46 | 	}
 47 | 	if n > 3 {
 48 | 		return nil, ErrOrderNotSupported
 49 | 	}
 50 | 	if len(*seq) < (n-1)*(wMax+1) {
 51 | 		return nil, ErrSequenceTooShort
 52 | 	}
 53 | 	if l < 1 {
 54 | 		return nil, ErrStrobeLengthTooSmall
 55 | 	}
 56 | 	if !(wMin > 0 && wMax > 0 && wMin <= wMax) {
 57 | 		return nil, ErrInvalidWindowOffsets
 58 | 	}
 59 | 
 60 | 	rs := &RandStrobes{
 61 | 		seq:  seq,
 62 | 		n:    n,
 63 | 		l:    l,
 64 | 		wMin: wMin,
 65 | 		wMax: wMax,
 66 | 
 67 | 		endHash: len(*seq) - l,           // position of the last l-mer
 68 | 		endIdx:  len(*seq) - l - (n-1)*l, // position of the last m1
 69 | 
 70 | 		shrinkWindow: true,
 71 | 
 72 | 		prime: defaultPrimeNumber,
 73 | 	}
 74 | 
 75 | 	var err error
 76 | 	rs.hashes, err = computeHashes(seq, l)
 77 | 
 78 | 	return rs, err
 79 | }
 80 | 
 81 | // SetPrime sets the prime number (q) in minimizing h(m)+h(mj) mod q.
 82 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1.
 83 | // The value should not be too small, at least 256.
 84 | func (rs *RandStrobes) SetPrime(q uint64) {
 85 | 	if q < 256 {
 86 | 		q = 256
 87 | 	}
 88 | 	rs.prime = roundup64(q) - 1
 89 | }
 90 | 
// SetWindowShrink decides whether to shrink the last search window at
// positions near the end of the sequence. Default is true.
// When disabled, iteration stops as soon as the last window would extend
// past the last l-mer.
func (rs *RandStrobes) SetWindowShrink(shrink bool) {
	rs.shrinkWindow = shrink
}
 96 | 
// Index returns the index (0-based) of the strobemer returned by the last
// successful call of Next. It is -1 before the first call.
func (rs *RandStrobes) Index() int {
	return rs.idx - 1 // idx already points to the next strobemer
}
101 | 
102 | // Indexes returns current indexes (0-based) of strobes
103 | func (rs *RandStrobes) Indexes() []int {
104 | 	return []int{rs.idx - 1, rs.idx2, rs.idx3}
105 | }
106 | 
107 | // Next returns the next hash value of randstrobe
108 | func (rs *RandStrobes) Next() (uint64, bool) {
109 | 	switch rs.n {
110 | 	case 2:
111 | 		return rs.nextOrder2()
112 | 	case 3:
113 | 		return rs.nextOrder3()
114 | 	default:
115 | 	}
116 | 
117 | 	return 0, false
118 | }
119 | 
120 | func (rs *RandStrobes) nextOrder2() (uint64, bool) {
121 | 	if rs.idx > rs.endIdx {
122 | 		return 0, false
123 | 	}
124 | 
125 | 	rs.wStart = rs.idx + rs.wMin
126 | 	rs.wEnd = rs.idx + rs.wMax
127 | 
128 | 	// for positions near the end of the sequence, shrink the window size from the right
129 | 	if rs.wEnd > rs.endHash {
130 | 		if !rs.shrinkWindow {
131 | 			return 0, false
132 | 		}
133 | 		rs.wEnd = rs.endHash
134 | 	}
135 | 
136 | 	// fmt.Printf("i:%d, window (%d-%d)\n", rs.idx, rs.wStart, rs.wEnd)
137 | 
138 | 	rs.hash1 = rs.hashes[rs.idx]
139 | 	rs.hash2 = math.MaxUint64
140 | 	for rs.i = rs.wStart; rs.i <= rs.wEnd; rs.i++ {
141 | 		rs.hash = (rs.hash1 + rs.hashes[rs.i]) & rs.prime
142 | 		if rs.hash < rs.hash2 {
143 | 			rs.idx2 = rs.i
144 | 			rs.hash2 = rs.hash
145 | 		}
146 | 	}
147 | 	rs.hash2 = rs.hash1/2 + rs.hashes[rs.idx2]/3
148 | 
149 | 	rs.idx++
150 | 	return rs.hash2, true
151 | }
152 | 
153 | func (rs *RandStrobes) nextOrder3() (uint64, bool) {
154 | 	if rs.idx > rs.endIdx {
155 | 		return 0, false
156 | 	}
157 | 
158 | 	rs.w2Start = rs.idx + rs.wMax + rs.wMin
159 | 	rs.w2End = rs.idx + rs.wMax<<1
160 | 	if rs.w2Start > rs.endHash {
161 | 		return 0, false
162 | 	}
163 | 	// for positions near the end of the sequence, shrink the last window size from the right
164 | 	if rs.w2End > rs.endHash {
165 | 		if !rs.shrinkWindow {
166 | 			return 0, false
167 | 		}
168 | 		rs.w2End = rs.endHash
169 | 	}
170 | 
171 | 	rs.wStart = rs.idx + rs.wMin
172 | 	rs.wEnd = rs.idx + rs.wMax
173 | 
174 | 	// fmt.Printf("i:%d, window (%d-%d)\n", rs.idx, rs.wStart, rs.wEnd)
175 | 	// fmt.Printf("i:%d, window2 (%d-%d)\n", rs.idx, rs.w2Start, rs.w2End)
176 | 
177 | 	rs.hash1 = rs.hashes[rs.idx]
178 | 	rs.hash2 = math.MaxUint64
179 | 	for rs.i = rs.wStart; rs.i <= rs.wEnd; rs.i++ {
180 | 		rs.hash = (rs.hash1 + rs.hashes[rs.i]) & rs.prime
181 | 		if rs.hash < rs.hash2 {
182 | 			rs.idx2 = rs.i
183 | 			rs.hash2 = rs.hash
184 | 		}
185 | 	}
186 | 	rs.hash2 = rs.hash1/3 + rs.hashes[rs.idx2]/4
187 | 
188 | 	rs.hash3 = math.MaxUint64
189 | 	for rs.i = rs.w2Start; rs.i <= rs.w2End; rs.i++ {
190 | 		rs.hash = (rs.hash2 + rs.hashes[rs.i]) & rs.prime
191 | 		if rs.hash < rs.hash3 {
192 | 			rs.idx3 = rs.i
193 | 			rs.hash3 = rs.hash
194 | 		}
195 | 	}
196 | 	rs.hash3 = rs.hash2 + rs.hashes[rs.idx3]/5
197 | 
198 | 	rs.idx++
199 | 	return rs.hash3, true
200 | }
201 | 


--------------------------------------------------------------------------------
/minstrobes.go:
--------------------------------------------------------------------------------
  1 | package strobemers
  2 | 
  3 | import (
  4 | 	"math"
  5 | )
  6 | 
  7 | // MinStrobes is a iterator for MinStrobes
  8 | type MinStrobes struct {
  9 | 	seq *[]byte // DNA sequence
 10 | 
 11 | 	n    int // strobemer order
 12 | 	l    int // strobes length
 13 | 	wMin int // minimum window offset
 14 | 	wMax int // maximum window offset
 15 | 
 16 | 	idx, idx2, idx3     int    // indexes of m1, m2, m3
 17 | 	hash1, hash2, hash3 uint64 // hash value of m1, m2, m3
 18 | 
 19 | 	hashes []uint64 // precomputed ntHash values of l-mers
 20 | 
 21 | 	minlocs   []int    // locations of min hash
 22 | 	minhashes []uint64 // minhashes of window [i-w,i]
 23 | 
 24 | 	endHash int // position of the last l-mer
 25 | 	endIdx  int // position of the last m1
 26 | 
 27 | 	wStart, wEnd, w2Start, w2End int // window start and end
 28 | 
 29 | 	prime uint64
 30 | 
 31 | 	// shrink the last searching window for positions near the end of sequence.
 32 | 	shrinkWindow bool
 33 | 
 34 | 	// tmp variable
 35 | 	i    int
 36 | 	hash uint64
 37 | }
 38 | 
 39 | // NewMinStrobes creates a MinStrobes iterator.
 40 | // Parametems:
 41 | //     n    - strobemer order
 42 | //     l    - strobes length
 43 | //     wMin - minimum window offset, wMin > 0
 44 | //     wMax - maximum window offset, wMin <= wMax.
 45 | func NewMinStrobes(seq *[]byte, n int, l int, wMin int, wMax int) (*MinStrobes, error) {
 46 | 	if seq == nil || len(*seq) == 0 {
 47 | 		return nil, ErrInvalidSequence
 48 | 	}
 49 | 	if n < 2 {
 50 | 		return nil, ErrInvalidOrder
 51 | 	}
 52 | 	if n > 3 {
 53 | 		return nil, ErrOrderNotSupported
 54 | 	}
 55 | 	if len(*seq) < (n-1)*(wMax+1) {
 56 | 		return nil, ErrSequenceTooShort
 57 | 	}
 58 | 	if l < 1 {
 59 | 		return nil, ErrStrobeLengthTooSmall
 60 | 	}
 61 | 	if !(wMin > 0 && wMax > 0 && wMin <= wMax) {
 62 | 		return nil, ErrInvalidWindowOffsets
 63 | 	}
 64 | 
 65 | 	ms := &MinStrobes{
 66 | 		seq:  seq,
 67 | 		n:    n,
 68 | 		l:    l,
 69 | 		wMin: wMin,
 70 | 		wMax: wMax,
 71 | 
 72 | 		endHash: len(*seq) - l,           // position of the last l-mer
 73 | 		endIdx:  len(*seq) - l - (n-1)*l, // position of the last m1
 74 | 
 75 | 		shrinkWindow: true,
 76 | 
 77 | 		prime: defaultPrimeNumber,
 78 | 	}
 79 | 
 80 | 	var err error
 81 | 	ms.hashes, err = computeHashes(seq, l)
 82 | 	if err != nil {
 83 | 		return nil, err
 84 | 	}
 85 | 
 86 | 	ms.minlocs, ms.minhashes = computeMinHashes(ms.hashes, wMax-wMin+1)
 87 | 
 88 | 	return ms, err
 89 | }
 90 | 
 91 | // SetPrime sets the prime number (q) in minimizing h(m)+h(mj) mod q.
 92 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1.
 93 | // The value should not be too small, at least 256.
 94 | func (ms *MinStrobes) SetPrime(q uint64) {
 95 | 	if q < 256 {
 96 | 		q = 256
 97 | 	}
 98 | 	ms.prime = roundup64(q) - 1
 99 | }
100 | 
// SetWindowShrink decides whether to shrink the last search window at
// positions near the end of the sequence. Default is true.
// When disabled, iteration stops as soon as the last window would extend
// past the last l-mer.
func (ms *MinStrobes) SetWindowShrink(shrink bool) {
	ms.shrinkWindow = shrink
}
106 | 
// Index returns the index (0-based) of the strobemer returned by the last
// successful call of Next. It is -1 before the first call.
func (ms *MinStrobes) Index() int {
	return ms.idx - 1 // idx already points to the next strobemer
}
111 | 
112 | // Indexes returns current indexes (0-based) of strobes
113 | func (ms *MinStrobes) Indexes() []int {
114 | 	return []int{ms.idx - 1, ms.idx2, ms.idx3}
115 | }
116 | 
117 | // Next returns the next hash value of randstrobe
118 | func (ms *MinStrobes) Next() (uint64, bool) {
119 | 	switch ms.n {
120 | 	case 2:
121 | 		return ms.nextOrder2()
122 | 	case 3:
123 | 		return ms.nextOrder3()
124 | 	default:
125 | 	}
126 | 
127 | 	return 0, false
128 | }
129 | 
130 | func (ms *MinStrobes) nextOrder2() (uint64, bool) {
131 | 	if ms.idx > ms.endIdx {
132 | 		return 0, false
133 | 	}
134 | 
135 | 	ms.wStart = ms.idx + ms.wMin
136 | 	ms.wEnd = ms.idx + ms.wMax
137 | 
138 | 	// for positions near the end of the sequence, shrink the window size from the right
139 | 	if ms.wEnd > ms.endHash {
140 | 		if !ms.shrinkWindow {
141 | 			return 0, false
142 | 		}
143 | 		ms.wEnd = ms.endHash
144 | 
145 | 		// fmt.Printf("i:%d, window (%d-%d)\n", ms.idx, ms.wStart, ms.wEnd)
146 | 
147 | 		ms.hash1 = ms.hashes[ms.idx]
148 | 		ms.hash2 = math.MaxUint64
149 | 		for ms.i = ms.wStart; ms.i <= ms.wEnd; ms.i++ {
150 | 			ms.hash = ms.hashes[ms.i]
151 | 			if ms.hash < ms.hash2 {
152 | 				ms.idx2 = ms.i
153 | 				ms.hash2 = ms.hash
154 | 			}
155 | 		}
156 | 		// For 1) asymmetry, 2) avoid value overflow
157 | 		ms.hash2 = ms.hash1/2 + ms.hashes[ms.idx2]/3
158 | 	} else { // use precomputed min hashes
159 | 		ms.hash1 = ms.hashes[ms.idx]
160 | 		ms.idx2 = ms.minlocs[ms.wEnd]
161 | 		ms.hash2 = ms.hash1/2 + ms.minhashes[ms.wEnd]/3
162 | 	}
163 | 
164 | 	ms.idx++
165 | 	return ms.hash2, true
166 | }
167 | 
168 | func (ms *MinStrobes) nextOrder3() (uint64, bool) {
169 | 	if ms.idx > ms.endIdx {
170 | 		return 0, false
171 | 	}
172 | 
173 | 	ms.w2Start = ms.idx + ms.wMax + ms.wMin
174 | 	ms.w2End = ms.idx + ms.wMax<<1
175 | 	if ms.w2Start > ms.endHash {
176 | 		return 0, false
177 | 	}
178 | 
179 | 	ms.wStart = ms.idx + ms.wMin
180 | 	ms.wEnd = ms.idx + ms.wMax
181 | 
182 | 	// use precomputed min hashes
183 | 	ms.hash1 = ms.hashes[ms.idx]
184 | 	ms.idx2 = ms.minlocs[ms.wEnd]
185 | 	ms.hash2 = ms.hash1/3 + ms.minhashes[ms.wEnd]/4
186 | 
187 | 	// for positions near the end of the sequence, shrink the last window size from the right
188 | 	if ms.w2End > ms.endHash {
189 | 		if !ms.shrinkWindow {
190 | 			return 0, false
191 | 		}
192 | 		ms.w2End = ms.endHash
193 | 
194 | 		ms.hash3 = math.MaxUint64
195 | 		for ms.i = ms.w2Start; ms.i <= ms.w2End; ms.i++ {
196 | 			ms.hash = (ms.hash2 + ms.hashes[ms.i]) & ms.prime
197 | 			if ms.hash < ms.hash3 {
198 | 				ms.idx3 = ms.i
199 | 				ms.hash3 = ms.hash
200 | 			}
201 | 		}
202 | 		ms.hash3 = ms.hash2 + ms.hashes[ms.idx3]/5
203 | 	} else {
204 | 		ms.idx3 = ms.minlocs[ms.w2End]
205 | 		ms.hash3 = ms.hash2 + ms.minhashes[ms.w2End]/5
206 | 	}
207 | 
208 | 	// fmt.Printf("i:%d, window (%d-%d)\n", ms.idx, ms.wStart, ms.wEnd)
209 | 	// fmt.Printf("i:%d, window2 (%d-%d)\n", ms.idx, ms.w2Start, ms.w2End)
210 | 
211 | 	ms.idx++
212 | 	return ms.hash3, true
213 | }
214 | 


--------------------------------------------------------------------------------
/evaluation/README.md:
--------------------------------------------------------------------------------
 1 | # Evaluation
 2 | 
 3 | ## Number of matched strobemers
 4 | 
 5 | [A similar test](https://github.com/BGI-Qingdao/strobemer_cpptest#benchmark_sim-r-match-only) with approximate results.
 6 | 
 7 | query: 150bp, snp: 1 (0.006)
 8 | 
 9 |     $ go run test1_matches.go  q0-snp1.fasta r0.fasta  | csvtk pretty -t
10 |     query     ref   method                           nQuery   nRef      nCommon   qCov
11 |     -------   ---   ------------------------------   ------   -------   -------   -----
12 |     q0-snp1   r0    Kmer(20)                         131      1546586   111       84.73 *
13 |     q0-snp1   r0    MinStrobes(2,10,12,12,shrink)    131      1548767   109       83.21
14 |     q0-snp1   r0    MinStrobes(2,10,12,12)           129      1548765   109       84.50
15 |     q0-snp1   r0    RankStrobes(2,10,12,12,shrink)   131      1548767   109       83.21
16 |     q0-snp1   r0    RankStrobes(2,10,12,12)          129      1548765   109       84.50
17 |                                                                                 
18 |     q0-snp1   r0    Kmer(21)                         130      1547218   109       83.85
19 |     q0-snp1   r0    MinStrobes(3,7,9,9,shrink)       126      1549315   108       85.71 *
20 |     q0-snp1   r0    MinStrobes(3,7,9,9)              126      1549315   108       85.71 *
21 |     q0-snp1   r0    RankStrobes(3,7,9,9,shrink)      126      1549315   108       85.71 *
22 |     q0-snp1   r0    RankStrobes(3,7,9,9)             126      1549315   108       85.71 *
23 |                                                                                 
24 |     q0-snp1   r0    Kmer(20)                         131      1546586   111       84.73
25 |     q0-snp1   r0    MinStrobes(2,10,12,16,shrink)    131      1548376   107       81.68
26 |     q0-snp1   r0    MinStrobes(2,10,12,16)           125      1548370   107       85.60  *
27 |     q0-snp1   r0    RankStrobes(2,10,12,16,shrink)   131      1548438   108       82.44
28 |     q0-snp1   r0    RankStrobes(2,10,12,16)          125      1548432   108       86.40  *
29 |                                                                                 
30 |     q0-snp1   r0    Kmer(21)                         130      1547218   109       83.85
31 |     q0-snp1   r0    MinStrobes(3,7,9,13,shrink)      122      1545403   102       83.61
32 |     q0-snp1   r0    MinStrobes(3,7,9,13)             118      1545399   102       86.44 **
33 |     q0-snp1   r0    RankStrobes(3,7,9,13,shrink)     122      1545522   107       87.70 **
34 |     q0-snp1   r0    RankStrobes(3,7,9,13)            118      1545518   105       88.98 **
35 |     
36 | query: 150bp, snp: 3 (0.02)
37 | 
38 |     $ go run test1_matches.go  q2-snp3.fasta r2.fasta  | csvtk pretty -t
39 |     query     ref   method                           nQuery   nRef      nCommon   qCov
40 |     -------   ---   ------------------------------   ------   -------   -------   -----
41 |     q2-snp3   r2    Kmer(20)                         131      1687558   84        64.12 *
42 |     q2-snp3   r2    MinStrobes(2,10,12,12,shrink)    131      1687785   82        62.60
43 |     q2-snp3   r2    MinStrobes(2,10,12,12)           129      1687783   82        63.57
44 |     q2-snp3   r2    RankStrobes(2,10,12,12,shrink)   131      1687785   82        62.60
45 |     q2-snp3   r2    RankStrobes(2,10,12,12)          129      1687783   82        63.57
46 |                                                                                 
47 |     q2-snp3   r2    Kmer(21)                         130      1687656   82        63.08
48 |     q2-snp3   r2    MinStrobes(3,7,9,9,shrink)       126      1687865   84        66.67 *
49 |     q2-snp3   r2    MinStrobes(3,7,9,9)              126      1687865   84        66.67 *
50 |     q2-snp3   r2    RankStrobes(3,7,9,9,shrink)      126      1687865   84        66.67 *
51 |     q2-snp3   r2    RankStrobes(3,7,9,9)             126      1687865   84        66.67 *
52 |                                                                                 
53 |     q2-snp3   r2    Kmer(20)                         131      1687558   84        64.12 *
54 |     q2-snp3   r2    MinStrobes(2,10,12,16,shrink)    131      1687487   76        58.02
55 |     q2-snp3   r2    MinStrobes(2,10,12,16)           125      1687481   76        60.80
56 |     q2-snp3   r2    RankStrobes(2,10,12,16,shrink)   131      1687529   77        58.78
57 |     q2-snp3   r2    RankStrobes(2,10,12,16)          125      1687523   76        60.80
58 |                                                                                 
59 |     q2-snp3   r2    Kmer(21)                         130      1687656   82        63.08 *
60 |     q2-snp3   r2    MinStrobes(3,7,9,13,shrink)      122      1684611   74        60.66
61 |     q2-snp3   r2    MinStrobes(3,7,9,13)             118      1684607   72        61.02
62 |     q2-snp3   r2    RankStrobes(3,7,9,13,shrink)     122      1684661   71        58.20
63 |     q2-snp3   r2    RankStrobes(3,7,9,13)            118      1684657   68        57.63
64 |     
65 | query: 150bp, snp: 7 (0.047)
66 | 
67 |     $ go run test1_matches.go  q1-snp7.rc.fasta r1.fasta  | csvtk pretty -t
68 |     query        ref   method                           nQuery   nRef      nCommon   qCov
69 |     ----------   ---   ------------------------------   ------   -------   -------   -----
70 |     q1-snp7.rc   r1    Kmer(20)                         131      2802879   54        41.22 *
71 |     q1-snp7.rc   r1    MinStrobes(2,10,12,12,shrink)    131      2804781   52        39.69
72 |     q1-snp7.rc   r1    MinStrobes(2,10,12,12)           129      2804779   52        40.31
73 |     q1-snp7.rc   r1    RankStrobes(2,10,12,12,shrink)   131      2804781   52        39.69
74 |     q1-snp7.rc   r1    RankStrobes(2,10,12,12)          129      2804779   52        40.31
75 |                                                                                     
76 |     q1-snp7.rc   r1    Kmer(21)                         130      2804365   51        39.23 *
77 |     q1-snp7.rc   r1    MinStrobes(3,7,9,9,shrink)       126      2806161   48        38.10
78 |     q1-snp7.rc   r1    MinStrobes(3,7,9,9)              126      2806161   48        38.10
79 |     q1-snp7.rc   r1    RankStrobes(3,7,9,9,shrink)      126      2806161   48        38.10
80 |     q1-snp7.rc   r1    RankStrobes(3,7,9,9)             126      2806161   48        38.10
81 |                                                                                     
82 |     q1-snp7.rc   r1    Kmer(20)                         131      2802879   54        41.22 *
83 |     q1-snp7.rc   r1    MinStrobes(2,10,12,16,shrink)    131      2803507   51        38.93
84 |     q1-snp7.rc   r1    MinStrobes(2,10,12,16)           125      2803501   51        40.80
85 |     q1-snp7.rc   r1    RankStrobes(2,10,12,16,shrink)   131      2803659   47        35.88
86 |     q1-snp7.rc   r1    RankStrobes(2,10,12,16)          125      2803653   44        35.20
87 |                                                                                     
88 |     q1-snp7.rc   r1    Kmer(21)                         130      2804365   51        39.23 *
89 |     q1-snp7.rc   r1    MinStrobes(3,7,9,13,shrink)      122      2797218   36        29.51
90 |     q1-snp7.rc   r1    MinStrobes(3,7,9,13)             118      2797214   36        30.51
91 |     q1-snp7.rc   r1    RankStrobes(3,7,9,13,shrink)     122      2797918   42        34.43
92 |     q1-snp7.rc   r1    RankStrobes(3,7,9,13)            118      2797914   41        34.75
93 | 


--------------------------------------------------------------------------------
/evaluation/test1_matches.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"runtime"
  9 | 	"strings"
 10 | 	"sync"
 11 | 
 12 | 	"github.com/shenwei356/bio/seqio/fastx"
 13 | 	"github.com/shenwei356/strobemers"
 14 | 	"github.com/will-rowe/nthash"
 15 | )
 16 | 
 17 | func main() {
 18 | 	args := os.Args
 19 | 	if len(args) != 3 {
 20 | 		checkError(fmt.Errorf("usage: %s query.fasta ref.fasta", os.Args[0]))
 21 | 	}
 22 | 
 23 | 	fileQuery, fileRef := args[1], args[2]
 24 | 
 25 | 	q, _ := filepathTrimExtension(filepath.Base(fileQuery))
 26 | 	r, _ := filepathTrimExtension(filepath.Base(fileRef))
 27 | 
 28 | 	type Test struct {
 29 | 		n    int
 30 | 		l    int
 31 | 		wMin int
 32 | 		wMax int
 33 | 	}
 34 | 
 35 | 	tests := []Test{
 36 | 		{n: 2, l: 10, wMin: 12, wMax: 12},
 37 | 		{n: 3, l: 7, wMin: 9, wMax: 9},
 38 | 
 39 | 		{n: 2, l: 10, wMin: 12, wMax: 16},
 40 | 		{n: 3, l: 7, wMin: 9, wMax: 13},
 41 | 	}
 42 | 
 43 | 	seqsQ := readSeqs(fileQuery)
 44 | 	seqsR := readSeqs(fileRef)
 45 | 
 46 | 	var kmersQ, kmersR map[uint64]interface{}
 47 | 	var rstrobesSQ, rstrobesSR, rstrobesQ, rstrobesR map[uint64]interface{}
 48 | 	var mstrobesSQ, mstrobesSR, mstrobesQ, mstrobesR map[uint64]interface{}
 49 | 	var kmersInter, rstrobesSInter, rstrobesInter, mstrobesSInter, mstrobesInter int
 50 | 
 51 | 	fmt.Printf("query\tref\tmethod\tnQuery\tnRef\tnCommon\tqCov\n")
 52 | 	runtime.GOMAXPROCS(10)
 53 | 	for _, t := range tests {
 54 | 		var wg sync.WaitGroup
 55 | 		wg.Add(10)
 56 | 
 57 | 		// kmers
 58 | 		go func() {
 59 | 			kmersQ = list2map(computeKmers(seqsQ, t.n*t.l))
 60 | 			wg.Done()
 61 | 		}()
 62 | 		go func() {
 63 | 			kmersR = list2map(computeKmers(seqsR, t.n*t.l))
 64 | 			wg.Done()
 65 | 		}()
 66 | 
 67 | 		// randstrobes
 68 | 		go func() {
 69 | 			rstrobesSQ = list2map(computeRandStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, true))
 70 | 			wg.Done()
 71 | 		}()
 72 | 		go func() {
 73 | 			rstrobesSR = list2map(computeRandStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, true))
 74 | 			wg.Done()
 75 | 		}()
 76 | 		go func() {
 77 | 			rstrobesQ = list2map(computeRandStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, false))
 78 | 			wg.Done()
 79 | 		}()
 80 | 		go func() {
 81 | 			rstrobesR = list2map(computeRandStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, false))
 82 | 			wg.Done()
 83 | 		}()
 84 | 
 85 | 		// minstrobes
 86 | 		go func() {
 87 | 			mstrobesSQ = list2map(computeMinStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, true))
 88 | 			wg.Done()
 89 | 		}()
 90 | 		go func() {
 91 | 			mstrobesSR = list2map(computeMinStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, true))
 92 | 			wg.Done()
 93 | 		}()
 94 | 		go func() {
 95 | 			mstrobesQ = list2map(computeMinStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, false))
 96 | 			wg.Done()
 97 | 		}()
 98 | 		go func() {
 99 | 			mstrobesR = list2map(computeMinStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, false))
100 | 			wg.Done()
101 | 		}()
102 | 		wg.Wait()
103 | 
104 | 		// intersection
105 | 
106 | 		var wg2 sync.WaitGroup
107 | 		wg2.Add(5)
108 | 		go func() {
109 | 			kmersInter = intersection(kmersQ, kmersR)
110 | 			wg2.Done()
111 | 		}()
112 | 
113 | 		go func() {
114 | 			rstrobesSInter = intersection(rstrobesSQ, rstrobesSR)
115 | 			wg2.Done()
116 | 		}()
117 | 		go func() {
118 | 			rstrobesInter = intersection(rstrobesQ, rstrobesR)
119 | 			wg2.Done()
120 | 		}()
121 | 
122 | 		go func() {
123 | 			mstrobesSInter = intersection(mstrobesSQ, mstrobesSR)
124 | 			wg2.Done()
125 | 		}()
126 | 		go func() {
127 | 			mstrobesInter = intersection(mstrobesQ, mstrobesR)
128 | 			wg2.Done()
129 | 		}()
130 | 
131 | 		wg2.Wait()
132 | 
133 | 		// kmers
134 | 		fmt.Printf("%s\t%s\tKmer(%d)\t%d\t%d\t%d\t%.2f\n",
135 | 			q, r, t.n*t.l, len(kmersQ), len(kmersR),
136 | 			kmersInter, float64(kmersInter)/float64(len(kmersQ))*100)
137 | 
138 | 		// minstrobes
139 | 		fmt.Printf("%s\t%s\tMinStrobes(%d,%d,%d,%d,shrink)\t%d\t%d\t%d\t%.2f\n",
140 | 			q, r, t.n, t.l, t.wMin, t.wMax, len(mstrobesSQ), len(mstrobesSR),
141 | 			mstrobesSInter, float64(mstrobesSInter)/float64(len(mstrobesSQ))*100)
142 | 		fmt.Printf("%s\t%s\tMinStrobes(%d,%d,%d,%d)\t%d\t%d\t%d\t%.2f\n",
143 | 			q, r, t.n, t.l, t.wMin, t.wMax, len(mstrobesQ), len(mstrobesR),
144 | 			mstrobesInter, float64(mstrobesInter)/float64(len(mstrobesQ))*100)
145 | 
146 | 		// randstrobes
147 | 		fmt.Printf("%s\t%s\tRankStrobes(%d,%d,%d,%d,shrink)\t%d\t%d\t%d\t%.2f\n",
148 | 			q, r, t.n, t.l, t.wMin, t.wMax, len(rstrobesSQ), len(rstrobesSR),
149 | 			rstrobesSInter, float64(rstrobesSInter)/float64(len(rstrobesSQ))*100)
150 | 		fmt.Printf("%s\t%s\tRankStrobes(%d,%d,%d,%d)\t%d\t%d\t%d\t%.2f\n",
151 | 			q, r, t.n, t.l, t.wMin, t.wMax, len(rstrobesQ), len(rstrobesR),
152 | 			rstrobesInter, float64(rstrobesInter)/float64(len(rstrobesQ))*100)
153 | 
154 | 		fmt.Printf(" \t \t \t \t \t \t \n")
155 | 	}
156 | 
157 | }
158 | 
// checkError prints e to standard error and terminates the program when e is
// non-nil; it returns normally when e is nil.
//
// The process exits with status 1 so that shells and scripts can detect the
// failure (status 0 would report success).
func checkError(e error) {
	if e != nil {
		fmt.Fprintf(os.Stderr, "%s\n", e)
		os.Exit(1)
	}
}
165 | 
166 | func readSeqs(file string) [][]byte {
167 | 	reader, err := fastx.NewDefaultReader(file)
168 | 	checkError(err)
169 | 
170 | 	sequences := make([][]byte, 0, 8)
171 | 
172 | 	var record *fastx.Record
173 | 	for {
174 | 		record, err = reader.Read()
175 | 		if err != nil {
176 | 			if err == io.EOF {
177 | 				break
178 | 			}
179 | 			checkError(err)
180 | 			break
181 | 		}
182 | 
183 | 		sequences = append(sequences, record.Seq.Seq)
184 | 	}
185 | 
186 | 	return sequences
187 | }
188 | 
189 | func computeKmers(sequences [][]byte, k int) []uint64 {
190 | 	hashes := make([]uint64, 0, 1024)
191 | 
192 | 	var hash uint64
193 | 	var ok bool
194 | 	var hasher *nthash.NTHi
195 | 	var err error
196 | 	for _, _seq := range sequences {
197 | 		hasher, err = nthash.NewHasher(&_seq, uint(k))
198 | 		checkError(err)
199 | 
200 | 		for {
201 | 			hash, ok = hasher.Next(true)
202 | 			if !ok {
203 | 				break
204 | 			}
205 | 
206 | 			hashes = append(hashes, hash)
207 | 		}
208 | 	}
209 | 
210 | 	return hashes
211 | }
212 | 
213 | func computeRandStrobes(sequences [][]byte, n int, l int, wMin int, wMax int, shrink bool) []uint64 {
214 | 	hashes := make([]uint64, 0, 1024)
215 | 
216 | 	var hash uint64
217 | 	var ok bool
218 | 	var rs *strobemers.RandStrobes
219 | 	var err error
220 | 
221 | 	for _, _seq := range sequences {
222 | 		rs, err = strobemers.NewRandStrobes(&_seq, n, l, wMin, wMax)
223 | 		checkError(err)
224 | 
225 | 		rs.SetWindowShrink(shrink)
226 | 		for {
227 | 			hash, ok = rs.Next()
228 | 			if !ok {
229 | 				break
230 | 			}
231 | 
232 | 			hashes = append(hashes, hash)
233 | 		}
234 | 	}
235 | 
236 | 	return hashes
237 | }
238 | 
239 | func computeMinStrobes(sequences [][]byte, n int, l int, wMin int, wMax int, shrink bool) []uint64 {
240 | 	hashes := make([]uint64, 0, 1024)
241 | 
242 | 	var hash uint64
243 | 	var ok bool
244 | 	var rs *strobemers.MinStrobes
245 | 	var err error
246 | 
247 | 	for _, _seq := range sequences {
248 | 		rs, err = strobemers.NewMinStrobes(&_seq, n, l, wMin, wMax)
249 | 		checkError(err)
250 | 
251 | 		rs.SetWindowShrink(shrink)
252 | 		for {
253 | 			hash, ok = rs.Next()
254 | 			if !ok {
255 | 				break
256 | 			}
257 | 
258 | 			hashes = append(hashes, hash)
259 | 		}
260 | 	}
261 | 
262 | 	return hashes
263 | }
264 | 
// list2map builds a set from data: every distinct value becomes a key whose
// value is an empty struct. Duplicates collapse into a single key, and the
// result is always a non-nil map.
func list2map(data []uint64) map[uint64]interface{} {
	set := make(map[uint64]interface{}, len(data))
	var present struct{}
	for i := 0; i < len(data); i++ {
		set[data[i]] = present
	}
	return set
}
272 | 
// intersection returns the number of keys present in both m1 and m2.
// Nil or empty maps yield 0.
func intersection(m1, m2 map[uint64]interface{}) int {
	// The count is symmetric, so iterate over the smaller map and probe the
	// larger one: this needs only min(len(m1), len(m2)) lookups.
	if len(m2) < len(m1) {
		m1, m2 = m2, m1
	}

	n := 0
	for k := range m1 {
		if _, ok := m2[k]; ok {
			n++
		}
	}
	return n
}
283 | 
// filepathTrimExtension splits file into its name and its extension.
// A trailing ".gz" (matched case-insensitively) is treated as part of the
// extension, so "reads.fasta.gz" yields ("reads", ".fasta.gz").
//
// The suffix is returned with its original spelling, so name+extension always
// reproduces the input.
func filepathTrimExtension(file string) (string, string) {
	const gzLen = len(".gz")

	// Detach an optional gzip suffix first, remembering how it was written.
	var gzSuffix string
	if n := len(file); n >= gzLen && strings.EqualFold(file[n-gzLen:], ".gz") {
		gzSuffix = file[n-gzLen:]
		file = file[:n-gzLen]
	}

	extension := filepath.Ext(file)
	name := file[:len(file)-len(extension)]
	return name, extension + gzSuffix
}
296 | 


--------------------------------------------------------------------------------