├── speed.xlsx
├── illustration.xlsx
├── illustration_randstrobes_order2.jpg
├── illustration_randstrobes_order3.jpg
├── strobemers.go
├── evaluation
├── q1-snp7.fasta
├── q2-snp3.fasta
├── r0.s.fasta
├── r1.s.fasta
├── r2.s.fasta
├── q1-snp7.rc.fasta
├── q2-snp3-gap1.fasta
├── q0-snp1.fasta
├── q2-snp3.fasta.blastn
├── q2-snp3-gap1.fasta.blastn
├── q0-snp1.fasta.blastn
├── q1-snp7.fasta.blastn
├── README.md
└── test1_matches.go
├── go.mod
├── .gitignore
├── LICENSE
├── util.go
├── randstrobes_test.go
├── minstrobes_test.go
├── go.sum
├── README.md
├── common.go
├── strobemers_test.go
├── randstrobes.go
└── minstrobes.go
/speed.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/speed.xlsx
--------------------------------------------------------------------------------
/illustration.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration.xlsx
--------------------------------------------------------------------------------
/illustration_randstrobes_order2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration_randstrobes_order2.jpg
--------------------------------------------------------------------------------
/illustration_randstrobes_order3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration_randstrobes_order3.jpg
--------------------------------------------------------------------------------
/strobemers.go:
--------------------------------------------------------------------------------
1 | // Package strobemers is a Go implementation of strobemers (https://github.com/ksahlin/strobemers).
2 |
3 | package strobemers
4 |
--------------------------------------------------------------------------------
/evaluation/q1-snp7.fasta:
--------------------------------------------------------------------------------
1 | >q1
2 | CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCTGAAGGGCT
3 | GCCGAAGGACAAGTCGCCGAGCAAGCCCGCCAGCTTCCGCTGTCTGGGCAAGCCGGAACC
4 | GGCGGTACAGAAGATCCTCGACCAACGACT
5 |
--------------------------------------------------------------------------------
/evaluation/q2-snp3.fasta:
--------------------------------------------------------------------------------
1 | >q2
2 | GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG
3 | CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA
4 | GACACCCGTCACAAGCATGACATGATAATT
5 |
--------------------------------------------------------------------------------
/evaluation/r0.s.fasta:
--------------------------------------------------------------------------------
1 | >r0
2 | CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC
3 | GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG
4 | AGGAAATGGAAGAAACCGTGGACTTCCTGG
5 |
--------------------------------------------------------------------------------
/evaluation/r1.s.fasta:
--------------------------------------------------------------------------------
1 | >r1
2 | CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCCGAAGGGCT
3 | GCCGCAGGACAAGTCGCCGAGCAAGCACGCCAGCTTTCGCTGGCTGGGCAAGCCGGAACC
4 | GGCGGTACAGAAGATCCTCGACGAACAACT
5 |
--------------------------------------------------------------------------------
/evaluation/r2.s.fasta:
--------------------------------------------------------------------------------
1 | >r2
2 | GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACTG
3 | CGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA
4 | GACACCCGTCACAAGCATGACATGATAATT
5 |
--------------------------------------------------------------------------------
/evaluation/q1-snp7.rc.fasta:
--------------------------------------------------------------------------------
1 | >q1_rc
2 | AGTCGTTGGTCGAGGATCTTCTGTACCGCCGGTTCCGGCTTGCCCAGACAGCGGAAGCTG
3 | GCGGGCTTGCTCGGCGACTTGTCCTTCGGCAGCCCTTCAGGCACCTGCACGTAGAGCCCG
4 | TAGGCATGACTCTTGTCCCAATCGAAGGCG
5 |
--------------------------------------------------------------------------------
/evaluation/q2-snp3-gap1.fasta:
--------------------------------------------------------------------------------
1 | >q2
2 | GAGGAATTAACGAACAGATACACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG
3 | CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA
4 | GACACCCGTCACAAGCATGACATGATAATT
5 |
--------------------------------------------------------------------------------
/evaluation/q0-snp1.fasta:
--------------------------------------------------------------------------------
1 | >RL|S1|R634/1
2 | CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC
3 | GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG
4 | AGGAAATGCAAGAAACCGTGGACTTCCTGG
5 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/shenwei356/strobemers
2 |
3 | go 1.16
4 |
5 | require (
6 | github.com/shenwei356/bio v0.1.0
7 | github.com/shenwei356/util v0.3.0
8 | github.com/will-rowe/nthash v0.3.0
9 | github.com/zeebo/xxh3 v0.10.0
10 | )
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 |
17 | *.directory
18 | doc/site/*
19 |
20 | *.brename_detail.txt
21 |
22 | *cpu.pprof
23 | *mem.pprof
24 | *trace.out
25 |
26 | t_*
27 |
28 | blastdb
29 | test/test
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Wei Shen (shenwei356@gmail.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
1 | package strobemers
2 |
// roundup64 rounds x up to the nearest power of two.
//
// It returns 1 for x == 0 and x itself when x is already a power of two.
// For x > 1<<63 the next power of two does not fit in a uint64, so the
// result wraps around to 0.
func roundup64(x uint64) uint64 {
	if x == 0 {
		return 1
	}
	// Copy the highest set bit of x-1 into every lower bit position, so that
	// adding one afterwards yields the next power of two.
	x--
	x |= x >> 1
	x |= x >> 2
	x |= x >> 4
	x |= x >> 8
	x |= x >> 16
	x |= x >> 32
	// A further shift by 64 would always be zero for a 64-bit value, so the
	// smear is complete here.
	return x + 1
}
16 |
// cbases is a 256-entry lookup table from a byte to a complementary base.
//
// Indexes 0-3 hold 'T', 'G', 'C', 'A' (presumably the complements of 2-bit
// encoded A, C, G, T -- confirm against the callers). The ASCII letters
// A, C, G, T and U, in either case, map to the upper-case complement
// ('T', 'G', 'C', 'A' and 'A' respectively). Every other byte maps to 'N'.
//
// only used in tests
var cbases [256]byte = [256]byte{
	'T', 'G', 'C', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'A', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'A', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
	'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
}
36 |
--------------------------------------------------------------------------------
/randstrobes_test.go:
--------------------------------------------------------------------------------
1 | package strobemers
2 |
3 | import (
4 | "fmt"
5 | "strings"
6 | "testing"
7 | )
8 |
9 | func TestRandStrobesOrder2(t *testing.T) {
10 | _s := "ACGATCTGGTACCTAG"
11 | s := []byte(_s)
12 |
13 | n := 2
14 | l := 3
15 | wMin := 3
16 | wMax := 5
17 | rs, err := NewRandStrobes(&s, n, l, wMin, wMax)
18 | if err != nil {
19 | t.Error(err)
20 | }
21 |
22 | var h uint64
23 | var ok bool
24 | var ps []int
25 | var i1, i2 int
26 | for {
27 | h, ok = rs.Next()
28 | if !ok {
29 | break
30 | }
31 |
32 | if !debug {
33 | continue
34 | }
35 |
36 | ps = rs.Indexes()
37 | i1, i2 = ps[0], ps[1]
38 | fmt.Printf("%s len:%d\n", _s, len(_s))
39 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
40 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
41 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
42 | fmt.Println()
43 | }
44 | }
45 |
46 | func TestRandStrobesOrder3(t *testing.T) {
47 | _s := "ACGATCTGGTACCTAG"
48 | s := []byte(_s)
49 |
50 | n := 3
51 | l := 3
52 | wMin := 3
53 | wMax := 5
54 | rs, err := NewRandStrobes(&s, n, l, wMin, wMax)
55 | if err != nil {
56 | t.Error(err)
57 | }
58 |
59 | var h uint64
60 | var ok bool
61 | var ps []int
62 | var i1, i2, i3 int
63 | for {
64 | h, ok = rs.Next()
65 | if !ok {
66 | break
67 | }
68 |
69 | if !debug {
70 | continue
71 | }
72 |
73 | ps = rs.Indexes()
74 | i1, i2, i3 = ps[0], ps[1], ps[2]
75 | fmt.Printf("%s len:%d\n", _s, len(_s))
76 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
77 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
78 | fmt.Printf("%s%s i3:%d\n", strings.Repeat(" ", i3), _s[i3:i3+l], i3)
79 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
80 | fmt.Println()
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/minstrobes_test.go:
--------------------------------------------------------------------------------
1 | package strobemers
2 |
3 | import (
4 | "fmt"
5 | "strings"
6 | "testing"
7 | )
8 |
9 | func TestMinStrobesOrders2(t *testing.T) {
10 | _s := "ACGATCTGGTACCTAG"
11 | s := []byte(_s)
12 |
13 | n := 2
14 | l := 3
15 | wMin := 3
16 | wMax := 5
17 | ms, err := NewMinStrobes(&s, n, l, wMin, wMax)
18 | if err != nil {
19 | t.Error(err)
20 | }
21 |
22 | // for i, m := range ms.minhashes {
23 | // fmt.Println(i, m)
24 | // }
25 |
26 | var h uint64
27 | var ok bool
28 | var ps []int
29 | var i1, i2 int
30 | for {
31 | h, ok = ms.Next()
32 | if !ok {
33 | break
34 | }
35 |
36 | if !debug {
37 | continue
38 | }
39 |
40 | ps = ms.Indexes()
41 | i1, i2 = ps[0], ps[1]
42 | fmt.Printf("%s len:%d\n", _s, len(_s))
43 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
44 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
45 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
46 | fmt.Println()
47 | }
48 | }
49 |
50 | func TestMinStrobesOrder3(t *testing.T) {
51 | _s := "ACGATCTGGTACCTAG"
52 | s := []byte(_s)
53 |
54 | n := 3
55 | l := 3
56 | wMin := 3
57 | wMax := 5
58 | rs, err := NewMinStrobes(&s, n, l, wMin, wMax)
59 | if err != nil {
60 | t.Error(err)
61 | }
62 |
63 | var h uint64
64 | var ok bool
65 | var ps []int
66 | var i1, i2, i3 int
67 | for {
68 | h, ok = rs.Next()
69 | if !ok {
70 | break
71 | }
72 |
73 | if !debug {
74 | continue
75 | }
76 |
77 | ps = rs.Indexes()
78 | i1, i2, i3 = ps[0], ps[1], ps[2]
79 | fmt.Printf("%s len:%d\n", _s, len(_s))
80 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1)
81 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2)
82 | fmt.Printf("%s%s i3:%d\n", strings.Repeat(" ", i3), _s[i3:i3+l], i3)
83 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h)
84 | fmt.Println()
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/evaluation/q2-snp3.fasta.blastn:
--------------------------------------------------------------------------------
1 | BLASTN 2.11.0+
2 |
3 |
4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
6 | Comput Biol 2000; 7(1-2):203-14.
7 |
8 |
9 |
10 | Database: r2.fasta
11 | 1 sequences; 1,688,298 total letters
12 |
13 |
14 |
15 | Query= q2
16 |
17 | Length=150
18 | Score E
19 | Sequences producing significant alignments: (Bits) Value
20 |
21 | r2 261 4e-71
22 |
23 |
24 | >r2
25 | Length=1688298
26 |
27 | Score = 261 bits (141), Expect = 4e-71
28 | Identities = 147/150 (98%), Gaps = 0/150 (0%)
29 | Strand=Plus/Plus
30 |
31 | Query 1 GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG 60
32 | ||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||
33 | Sbjct 869619 GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACTG 869678
34 |
35 | Query 61 CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 120
36 | ||| |||||||| |||||||||||||||||||||||||||||||||||||||||||||||
37 | Sbjct 869679 CGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 869738
38 |
39 | Query 121 GACACCCGTCACAAGCATGACATGATAATT 150
40 | ||||||||||||||||||||||||||||||
41 | Sbjct 869739 GACACCCGTCACAAGCATGACATGATAATT 869768
42 |
43 |
44 |
45 | Lambda K H
46 | 1.33 0.621 1.12
47 |
48 | Gapped
49 | Lambda K H
50 | 1.28 0.460 0.850
51 |
52 | Effective search space used: 221164549
53 |
54 |
55 | Database: r2.fasta
56 | Posted date: Apr 14, 2021 7:44 PM
57 | Number of letters in database: 1,688,298
58 | Number of sequences in database: 1
59 |
60 |
61 |
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 |
--------------------------------------------------------------------------------
/evaluation/q2-snp3-gap1.fasta.blastn:
--------------------------------------------------------------------------------
1 | BLASTN 2.11.0+
2 |
3 |
4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
6 | Comput Biol 2000; 7(1-2):203-14.
7 |
8 |
9 |
10 | Database: r2.fasta
11 | 1 sequences; 1,688,298 total letters
12 |
13 |
14 |
15 | Query= q2
16 |
17 | Length=151
18 | Score E
19 | Sequences producing significant alignments: (Bits) Value
20 |
21 | r2 255 2e-69
22 |
23 |
24 | >r2
25 | Length=1688298
26 |
27 | Score = 255 bits (138), Expect = 2e-69
28 | Identities = 147/151 (97%), Gaps = 1/151 (1%)
29 | Strand=Plus/Plus
30 |
31 | Query 1 GAGGAATTAACGAACAGATACACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACT 60
32 | |||||||||||||||||||| ||||||||||||||||||||||||| |||||||||||||
33 | Sbjct 869619 GAGGAATTAACGAACAGATA-ACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACT 869677
34 |
35 | Query 61 GCGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCG 120
36 | |||| |||||||| ||||||||||||||||||||||||||||||||||||||||||||||
37 | Sbjct 869678 GCGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCG 869737
38 |
39 | Query 121 AGACACCCGTCACAAGCATGACATGATAATT 151
40 | |||||||||||||||||||||||||||||||
41 | Sbjct 869738 AGACACCCGTCACAAGCATGACATGATAATT 869768
42 |
43 |
44 |
45 | Lambda K H
46 | 1.33 0.621 1.12
47 |
48 | Gapped
49 | Lambda K H
50 | 1.28 0.460 0.850
51 |
52 | Effective search space used: 222852828
53 |
54 |
55 | Database: r2.fasta
56 | Posted date: Apr 14, 2021 7:44 PM
57 | Number of letters in database: 1,688,298
58 | Number of sequences in database: 1
59 |
60 |
61 |
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 |
--------------------------------------------------------------------------------
/evaluation/q0-snp1.fasta.blastn:
--------------------------------------------------------------------------------
1 | BLASTN 2.11.0+
2 |
3 |
4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
6 | Comput Biol 2000; 7(1-2):203-14.
7 |
8 |
9 |
10 | Database: r0.fasta
11 | 1 sequences; 1,560,393 total letters
12 |
13 |
14 |
15 | Query= RL|S1|R634/1
16 |
17 | Length=150
18 | Score E
19 | Sequences producing significant alignments: (Bits) Value
20 |
21 | r0 272 2e-74
22 |
23 |
24 | >r0
25 | Length=1560393
26 |
27 | Score = 272 bits (147), Expect = 2e-74
28 | Identities = 149/150 (99%), Gaps = 0/150 (0%)
29 | Strand=Plus/Plus
30 |
31 | Query 1 CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC 60
32 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
33 | Sbjct 396270 CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC 396329
34 |
35 | Query 61 GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG 120
36 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
37 | Sbjct 396330 GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG 396389
38 |
39 | Query 121 AGGAAATGCAAGAAACCGTGGACTTCCTGG 150
40 | |||||||| |||||||||||||||||||||
41 | Sbjct 396390 AGGAAATGGAAGAAACCGTGGACTTCCTGG 396419
42 |
43 |
44 |
45 | Lambda K H
46 | 1.33 0.621 1.12
47 |
48 | Gapped
49 | Lambda K H
50 | 1.28 0.460 0.850
51 |
52 | Effective search space used: 204408994
53 |
54 |
55 | Database: r0.fasta
56 | Posted date: Apr 14, 2021 7:44 PM
57 | Number of letters in database: 1,560,393
58 | Number of sequences in database: 1
59 |
60 |
61 |
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 |
--------------------------------------------------------------------------------
/evaluation/q1-snp7.fasta.blastn:
--------------------------------------------------------------------------------
1 | BLASTN 2.11.0+
2 |
3 |
4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J
6 | Comput Biol 2000; 7(1-2):203-14.
7 |
8 |
9 |
10 | Database: r1.fasta
11 | 1 sequences; 2,833,277 total letters
12 |
13 |
14 |
15 | Query= q1
16 |
17 | Length=150
18 | Score E
19 | Sequences producing significant alignments: (Bits) Value
20 |
21 | r1 239 3e-64
22 |
23 |
24 | >r1
25 | Length=2833277
26 |
27 | Score = 239 bits (129), Expect = 3e-64
28 | Identities = 143/150 (95%), Gaps = 0/150 (0%)
29 | Strand=Plus/Minus
30 |
31 | Query 1 CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCTGAAGGGCT 60
32 | ||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||
33 | Sbjct 2741385 CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCCGAAGGGCT 2741326
34 |
35 | Query 61 GCCGAAGGACAAGTCGCCGAGCAAGCCCGCCAGCTTCCGCTGTCTGGGCAAGCCGGAACC 120
36 | |||| ||||||||||||||||||||| ||||||||| ||||| |||||||||||||||||
37 | Sbjct 2741325 GCCGCAGGACAAGTCGCCGAGCAAGCACGCCAGCTTTCGCTGGCTGGGCAAGCCGGAACC 2741266
38 |
39 | Query 121 GGCGGTACAGAAGATCCTCGACCAACGACT 150
40 | |||||||||||||||||||||| ||| |||
41 | Sbjct 2741265 GGCGGTACAGAAGATCCTCGACGAACAACT 2741236
42 |
43 |
44 |
45 | Lambda K H
46 | 1.33 0.621 1.12
47 |
48 | Gapped
49 | Lambda K H
50 | 1.28 0.460 0.850
51 |
52 | Effective search space used: 368323410
53 |
54 |
55 | Database: r1.fasta
56 | Posted date: Apr 14, 2021 7:44 PM
57 | Number of letters in database: 2,833,277
58 | Number of sequences in database: 1
59 |
60 |
61 |
62 | Matrix: blastn matrix 1 -2
63 | Gap Penalties: Existence: 0, Extension: 2.5
64 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548/go.mod h1:e6NPNENfs9mPDVNRekM7lKScauxd5kXTr1Mfyig6TDM=
2 | github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8 h1:LpMLYGyy67BoAFGda1NeOBQwqlv7nUXpm+rIVHGxZZ4=
3 | github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8/go.mod h1:q2w6Bg5jeox1B+QkJ6Wp/+Vn0G/bo3f1uY7Fn3vivIQ=
4 | github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M=
5 | github.com/klauspost/compress v1.11.4 h1:kz40R/YWls3iqT9zX9AHN3WoVsrAWVyui5sxuLqiXqU=
6 | github.com/klauspost/compress v1.11.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
7 | github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE=
8 | github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
9 | github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
10 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
11 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
12 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
13 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
14 | github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
15 | github.com/shenwei356/bio v0.1.0 h1:VDnI28zcdybywdn6/tcZvplAJ1IxOAAYrTJhhTB1SLQ=
16 | github.com/shenwei356/bio v0.1.0/go.mod h1:NgFauYHlpmjCYEf2XP8foITht6ej6poggQkILpjraN4=
17 | github.com/shenwei356/bpool v0.0.0-20160710042833-f9e0ee4d0403 h1:/3JklLnHXiWUBxWc3joQYavDQJpncRhRA909cUb7eOw=
18 | github.com/shenwei356/bpool v0.0.0-20160710042833-f9e0ee4d0403/go.mod h1:YkgdTWfNnJgv5HVJbVSDmxQtkK3/jZWDoqcG26BVU8k=
19 | github.com/shenwei356/breader v0.1.0/go.mod h1:YXIrHIPtbJCP6Kv27qGp+cXQl7hyzD0iQrEVYCy/gqw=
20 | github.com/shenwei356/util v0.3.0 h1:gTVa3sGwcyGEHgNpXTzdL3MaaJN/bGAypVKSCnT4QfU=
21 | github.com/shenwei356/util v0.3.0/go.mod h1:n3qhc3bQzlqJ2/5v79hgl0Gd3WzJOkI8XcUix25Brdg=
22 | github.com/shenwei356/xopen v0.0.0-20181203091311-f4f16ddd3992 h1:RXEEyKj0JL3SrRIYsWIEyy4AwjHbI3I8aDGK6CA4+YI=
23 | github.com/shenwei356/xopen v0.0.0-20181203091311-f4f16ddd3992/go.mod h1:6EQUa6I7Zsl2GQKqcL9qGLrTzVE+oZyly+uhzovQYSk=
24 | github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553/go.mod h1:Rj7Csq/tZ/egz+Ltc2IVpsA5309AmSMEswjkTZmq2Xc=
25 | github.com/will-rowe/nthash v0.3.0 h1:yN+Il98GRWyp7HdaiEbsE7KC4ySEKtPatm+SLZ5uQBk=
26 | github.com/will-rowe/nthash v0.3.0/go.mod h1:5ezweuK0J5j+/7lih/RkrSmnxI3hoaPpQiVWJ7rd960=
27 | github.com/zeebo/xxh3 v0.10.0 h1:1+2Mov9zfxTNUeoDG9k9i13VfxTR0p1JQu8L0vikxB0=
28 | github.com/zeebo/xxh3 v0.10.0/go.mod h1:AQY73TOrhF3jNsdiM9zZOb8MThrYbZONHj7ryDBaLpg=
29 | golang.org/x/sys v0.0.0-20200727154430-2d971f7391a4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
30 | golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005 h1:pDMpM2zh2MT0kHy037cKlSby2nEhD50SYqwQk76Nm40=
31 | golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
32 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
33 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Strobemers in Go
2 |
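3 | [![GoDoc](https://godoc.org/github.com/shenwei356/strobemers?status.svg)](https://godoc.org/github.com/shenwei356/strobemers)
4 | [![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/strobemers)](https://goreportcard.com/report/github.com/shenwei356/strobemers)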
5 |
6 | ## Introduction
7 |
8 | This is a Go implementation of the [strobemers](https://github.com/ksahlin/strobemers) (minstrobes and randstrobes),
9 | with some [differences](#differences).
10 |
11 | The implementation of `Randstrobes` performs reasonably well: it is 2-3X slower than regular k-mer hashing
12 | and 10-20X slower than [ntHash](https://github.com/will-rowe/nthash/).
13 | Besides, `Randstrobes` is only slightly slower than `MinStrobes` (see [benchmark](#benchmark)).
14 |
15 | ### Attention
16 |
17 | The current implementation only computes strobemers of the positive strand,
18 | because the strobes are asymmetrical and the location matters.
19 |
20 | ## Installation
21 |
22 | go get github.com/shenwei356/strobemers
23 |
24 | ## Quick Start
25 |
26 | We followed the code style of [ntHash](https://github.com/will-rowe/nthash/).
27 |
28 | ```go
29 | n := 2
30 | l := 3
31 | w_min := 3
32 | w_max := 5
33 | rs, err := strobemers.NewRandStrobes(seq, n, l, w_min, w_max)
34 | checkError(err)
35 |
36 | var hash uint64
37 | var ok bool
38 | var i int // 0-based index
39 | var positions []int // 0-based indexes of all strobes
40 |
41 | rs.SetWindowShrink(true)
42 | for {
43 | hash, ok = rs.Next()
44 | if !ok {
45 | break
46 | }
47 |
48 | i = rs.Index()
49 | positions = rs.Indexes()
50 | }
51 |
52 | ```
53 |
54 | ## Differences
55 |
56 | Here are some differences compared to the original implementation,
57 | see discussion: [#1](https://github.com/ksahlin/strobemers/issues/1), [#2](https://github.com/ksahlin/strobemers/issues/2).
58 |
59 | item |original |this |comment
60 | :---------------------|:----------------------|:---------------------------------|:-----------------------------------------
61 | window range |`w_min < w_max` |`w_min <= w_max` |allow a fixed position
62 | shrinking window |all `w_min` and `w_max`|optional shrinking last `w_max` |see figures below
63 | number of strobemers |`len(seq)-n*l+1` |`len(seq)-n*l+1-(n-1)*l` |window shrunk
64 | number of strobemers | |`len(seq)-n*l+1-(n-1)*(l+w_min-1)`|window not shrunk
65 | choice of min hash |`(h(m)+h(mj))%q` |`(h(m)+h(mj))&q` |`&` is faster than `%`
66 | final hash value (n=2)|`h(m1)-h(m2)` |`h(m1)/2+h(m2)/3` |keep asymmetry and avoid `uint64` overflow
67 | final hash value (n=3)|`h(m1)-h(m2)+2*h(m3)` |`h(m1)/3+h(m2)/4+h(m3)/5` |~
68 |
69 |
70 |
71 |
72 |
73 | ## Benchmark
74 |
75 | method |time |relative_time
76 | :----------------------|:-----|:------------
77 | ntHashKmers(30) |8590 |1
78 | Kmers(30) |55579 |6
79 | MinStrobes(2,15,20,30) |104520|12
80 | MinStrobes(3,10,20,30) |111662|13
81 | RandStrobes(2,15,20,30)|93436 |11
82 | RandStrobes(3,10,20,30)|152461|18
83 |
84 | $ go test . -bench=Benchmark* -benchmem \
85 | | grep Bench \
86 | | perl -pe 's/\s\s+/\t/g' \
87 | | csvtk cut -Ht -f 1,3-5 \
88 | | csvtk add-header -t -n test,time,memory,allocs \
89 | | csvtk pretty -t -r
90 |
91 | test time memory allocs
92 | ------------------------------------- ------------ ---------- -----------
93 | BenchmarkNTHash/1.00_KB-16 8590 ns/op 48 B/op 1 allocs/op
94 | BenchmarkKmers/1.00_KB-16 55579 ns/op 32 B/op 1 allocs/op
95 | BenchmarkMinStrobesOrder2/1.00_KB-16 104520 ns/op 25064 B/op 7 allocs/op
96 | BenchmarkMinStrobesOrder3/1.00_KB-16 111662 ns/op 25064 B/op 7 allocs/op
97 | BenchmarkRandStrobesOrder2/1.00_KB-16 93436 ns/op 8432 B/op 3 allocs/op
98 | BenchmarkRandStrobesOrder3/1.00_KB-16 152461 ns/op 8432 B/op 3 allocs/op
99 |
100 |
101 | ## Similar Projects
102 |
103 | - [strobemer_cpptest](https://github.com/BGI-Qingdao/strobemer_cpptest)
104 |
105 | ## References
106 |
107 | - [ntHash](http://dx.doi.org/10.1093/bioinformatics/btw397)
108 | - [strobemers](https://doi.org/10.1101/2021.01.28.428549)
109 |
--------------------------------------------------------------------------------
/common.go:
--------------------------------------------------------------------------------
1 | package strobemers
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 |
7 | "github.com/will-rowe/nthash"
8 | )
9 |
// defaultPrimeNumber is the default q used when choosing the strobe that
// minimizes (h(m)+h(mj)) mod q.
// In this package the modulo is replaced by a bit mask, (h(m)+h(mj)) & q,
// where q = roundup(q) - 1. The default value, (1<<20)-1, has its 20
// lowest bits set.
var defaultPrimeNumber uint64 = (1 << 20) - 1
13 |
14 | // ------------------------------------------------------------------------
15 | // errors
16 |
// ErrOrderNotSupported means a big strobemer order is not supported.
var ErrOrderNotSupported = fmt.Errorf("strobemers: strobemer order not supported")

// ErrInvalidOrder means the strobemer order is too small.
var ErrInvalidOrder = fmt.Errorf("strobemers: strobemer order too small")

// ErrInvalidSequence means the given sequence is invalid.
var ErrInvalidSequence = fmt.Errorf("strobemers: invalid DNA sequence")

// ErrSequenceTooShort means the sequence is too short.
var ErrSequenceTooShort = fmt.Errorf("strobemers: sequence too short")

// ErrStrobeLengthTooSmall means the strobe length is too small.
var ErrStrobeLengthTooSmall = fmt.Errorf("strobemers: strobe length too small")

// ErrInvalidWindowOffsets means invalid window offsets.
var ErrInvalidWindowOffsets = fmt.Errorf("strobemers: window offset should be > 0, and wMin <= wMax")

// ErrIncompleteHashValues means incomplete hash values.
var ErrIncompleteHashValues = fmt.Errorf("strobemers: incomplete hash values")

// ErrPrimeNumberTooSmall means the prime number is too small.
var ErrPrimeNumberTooSmall = fmt.Errorf("strobemers: the prime number is too small")
39 |
40 | // ------------------------------------------------------------------------
41 |
42 | func computeHashes(sequence *[]byte, k int) ([]uint64, error) {
43 | hasher, err := nthash.NewHasher(sequence, uint(k))
44 | if err != nil {
45 | return nil, err
46 | }
47 |
48 | hashes := make([]uint64, len(*sequence)-k+1)
49 | var hash uint64
50 | var ok bool
51 | var i int
52 | for {
53 | hash, ok = hasher.Next(true)
54 | if !ok {
55 | break
56 | }
57 | hashes[i] = hash
58 | i++
59 | }
60 |
61 | if i != len(*sequence)-k+1 {
62 | return nil, ErrIncompleteHashValues
63 | }
64 |
65 | return hashes, nil
66 | }
67 |
// computeMinHashes slides a window of w consecutive values over hashes and
// records, for every window end position idx >= w-1, the smallest hash in
// the window (minHashes[idx]) and its position in hashes (locs[idx]).
//
// Both returned slices have len(hashes) elements. Entries for idx < w-1,
// where the first window is still being filled, are never written and stay
// zero. When w == 1 every value is its own minimum: locs is the identity
// mapping and the input slice itself (not a copy) is returned as the minima.
func computeMinHashes(hashes []uint64, w int) ([]int, []uint64) {
	locs := make([]int, len(hashes))
	if w == 1 {
		for i := range hashes {
			locs[i] = i
		}
		return locs, hashes
	}

	minHashes := make([]uint64, len(hashes))

	var hash uint64
	var i, idxMw, b, e, t int
	var i2v IdxValue
	var flag bool

	// buf holds the elements of the current window. Once the first window is
	// complete it is kept sorted by Val in ascending order, so buf[0] is
	// always the minimum.
	buf := make([]IdxValue, 0, w)
	end := len(hashes)
	r := w - 1 // last position in the buffer

	for idx := 0; idx < end; idx++ { // idx is end position of a window
		hash = hashes[idx]

		if idx < r { // front of w
			buf = append(buf, IdxValue{Idx: idx, Val: hash}) // add current hash to buf
			continue
		}

		if idx == r { // position w
			buf = append(buf, IdxValue{Idx: idx, Val: hash}) // add current hash to buf
			// first complete window: sort once, later windows keep the order
			sort.Sort(idxValues(buf))

			i2v = buf[0] // the smallest one
			locs[idx] = i2v.Idx
			minHashes[idx] = i2v.Val
			continue
		}

		// find min k-mer

		// remove k-mer not in this window.
		// have to check position/index one by one
		idxMw = idx - w
		for i, i2v = range buf {
			if i2v.Idx == idxMw {
				if i < r { // not the last element
					copy(buf[i:r], buf[i+1:])
				} // happen to be at the end
				buf = buf[:r]
				break
			}
		}

		// add new k-mer
		flag = false
		// using binary search, faster than linear search
		// buf now holds r elements; search positions [0, r-1] for the
		// insertion point of hash
		b, e = 0, r-1
		for {
			t = b + (e-b)/2
			if hash < buf[t].Val {
				e = t - 1 // end search here
				if e <= b {
					flag = true
					i = b
					break
				}
			} else {
				b = t + 1 // start here
				if b >= r {
					flag = false
					break
				}
				if b >= e {
					flag = true
					i = e // right here
					break
				}
			}
		}
		if !flag { // it's the biggest one, append to the end
			buf = append(buf, IdxValue{idx, hash})
		} else {
			// the search stops at a candidate slot; step past it when the
			// new hash is not smaller than the element stored there
			if hash >= buf[i].Val { // have to check again
				i++
			}
			buf = append(buf, blankI2V) // append one element
			copy(buf[i+1:], buf[i:r]) // move right
			buf[i] = IdxValue{idx, hash}
		}

		i2v = buf[0] // the smallest one
		locs[idx] = i2v.Idx
		minHashes[idx] = i2v.Val
	}

	return locs, minHashes
}
165 |
// IdxValue pairs a hash value with the index it belongs to.
type IdxValue struct {
	Idx int    // index
	Val uint64 // hash
}

// blankI2V is a zero-valued IdxValue, used as a placeholder when a slice of
// IdxValue has to be grown by one element.
var blankI2V = IdxValue{}

// idxValues implements sort.Interface and orders elements by ascending Val.
type idxValues []IdxValue

// Len returns the number of elements.
func (vs idxValues) Len() int {
	return len(vs)
}

// Less reports whether the hash at position i is smaller than the one at j.
func (vs idxValues) Less(i int, j int) bool {
	return vs[j].Val > vs[i].Val
}

// Swap exchanges the elements at positions i and j.
func (vs idxValues) Swap(i int, j int) {
	tmp := vs[i]
	vs[i] = vs[j]
	vs[j] = tmp
}
178 |
--------------------------------------------------------------------------------
/strobemers_test.go:
--------------------------------------------------------------------------------
1 | // Package strobemers is a Go implementation of strobemers (https://github.com/ksahlin/strobemers).
2 |
3 | package strobemers
4 |
5 | import (
6 | "math/rand"
7 | "testing"
8 |
9 | "github.com/shenwei356/util/bytesize"
10 | "github.com/will-rowe/nthash"
11 | "github.com/zeebo/xxh3"
12 | )
13 |
// Fixtures shared by the benchmarks in this file.
var (
	// debug is not read in this file; presumably other tests of the
	// package consult it.
	debug = true

	// seqs holds the benchmark input sequences; it is filled by init.
	seqs [][]byte

	// bit2base maps the numbers 0..3 to the four DNA bases.
	bit2base = [4]byte{'A', 'C', 'G', 'T'}
)
19 |
20 | func init() {
21 | rand.Seed(11)
22 |
23 | sizes := []int{1 << 10} //, 1 << 20} //, 10 << 20}
24 | seqs = make([][]byte, len(sizes))
25 | for i, size := range sizes {
26 | sequence := make([]byte, size)
27 | for j := 0; j < size; j++ {
28 | sequence[j] = bit2base[rand.Intn(4)]
29 | }
30 | seqs[i] = sequence
31 | }
32 | }
33 |
// Parameters and result sink of the benchmarks in this file.
var (
	// _hash receives every computed hash inside the benchmark loops.
	_hash uint64

	_k = 30 // k-mer size used by BenchmarkNTHash and BenchmarkKmers

	_n2 = 2  // strobemer order of the order-2 benchmarks
	_l2 = 15 // strobe length of the order-2 benchmarks

	_n3 = 3  // strobemer order of the order-3 benchmarks
	_l3 = 10 // strobe length of the order-3 benchmarks

	_w_min = 20 // minimum window offset passed to the strobemer constructors
	_w_max = 30 // maximum window offset passed to the strobemer constructors
)
42 |
43 | func BenchmarkNTHash(b *testing.B) {
44 | for i := range seqs {
45 | size := len(seqs[i])
46 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
47 | for j := 0; j < b.N; j++ {
48 | var hash uint64
49 | var ok bool
50 | var hasher *nthash.NTHi
51 | var err error
52 |
53 | hasher, err = nthash.NewHasher(&seqs[i], uint(_k))
54 | if err != nil {
55 | b.Errorf("fail to create ntHasher iterator. seq length: %d", size)
56 | }
57 |
58 | for {
59 | hash, ok = hasher.Next(true)
60 | if !ok {
61 | break
62 | }
63 |
64 | _hash = hash
65 | }
66 | }
67 | })
68 | }
69 | }
70 |
71 | func BenchmarkKmers(b *testing.B) {
72 | for i := range seqs {
73 | size := len(seqs[i])
74 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
75 | for j := 0; j < b.N; j++ {
76 | var hash, hashrc uint64
77 | var end int
78 | var seq []byte
79 | var rc []byte
80 | var _i, _j int
81 |
82 | rc = make([]byte, _k)
83 | seq = seqs[i]
84 | end = len(seq) - _k + 1
85 |
86 | for i := 0; i < end; i++ {
87 | hash = xxh3.Hash(seq[i : i+_k])
88 |
89 | // complementary sequence
90 | for _i = 0; _i < _k; _i++ {
91 | rc[_i] = cbases[seq[i+_i]]
92 | }
93 | // reverse
94 | for _i, _j = 0, _k-1; _i < _j; _i, _j = _i+1, _j-1 {
95 | rc[_i], rc[_j] = rc[_j], rc[_i]
96 | }
97 | hashrc = xxh3.Hash(rc)
98 |
99 | // canonical kmer
100 | if hash < hashrc {
101 | _hash = hash
102 | } else {
103 | _hash = hashrc
104 | }
105 | }
106 | }
107 | })
108 | }
109 | }
110 |
111 | func BenchmarkMinStrobesOrder2(b *testing.B) {
112 | for i := range seqs {
113 | size := len(seqs[i])
114 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
115 | for j := 0; j < b.N; j++ {
116 | var hash uint64
117 | var ok bool
118 | var rs *MinStrobes
119 | var err error
120 |
121 | rs, err = NewMinStrobes(&seqs[i], _n2, _l2, _w_min, _w_max)
122 | if err != nil {
123 | b.Errorf("fail to create MinStrobes. seq length: %d", size)
124 | }
125 |
126 | for {
127 | hash, ok = rs.Next()
128 | if !ok {
129 | break
130 | }
131 |
132 | _hash = hash
133 | }
134 | }
135 | })
136 | }
137 | }
138 |
139 | func BenchmarkMinStrobesOrder3(b *testing.B) {
140 | for i := range seqs {
141 | size := len(seqs[i])
142 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
143 | for j := 0; j < b.N; j++ {
144 | var hash uint64
145 | var ok bool
146 | var rs *MinStrobes
147 | var err error
148 |
149 | rs, err = NewMinStrobes(&seqs[i], _n3, _l3, _w_min, _w_max)
150 | if err != nil {
151 | b.Errorf("fail to create MinStrobes. seq length: %d", size)
152 | }
153 |
154 | for {
155 | hash, ok = rs.Next()
156 | if !ok {
157 | break
158 | }
159 |
160 | _hash = hash
161 | }
162 | }
163 | })
164 | }
165 | }
166 |
167 | func BenchmarkRandStrobesOrder2(b *testing.B) {
168 | for i := range seqs {
169 | size := len(seqs[i])
170 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
171 | for j := 0; j < b.N; j++ {
172 | var hash uint64
173 | var ok bool
174 | var rs *RandStrobes
175 | var err error
176 |
177 | rs, err = NewRandStrobes(&seqs[i], _n2, _l2, _w_min, _w_max)
178 | if err != nil {
179 | b.Errorf("fail to create RandStrobes. seq length: %d", size)
180 | }
181 |
182 | for {
183 | hash, ok = rs.Next()
184 | if !ok {
185 | break
186 | }
187 |
188 | _hash = hash
189 | }
190 | }
191 | })
192 | }
193 | }
194 |
195 | func BenchmarkRandStrobesOrder3(b *testing.B) {
196 | for i := range seqs {
197 | size := len(seqs[i])
198 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) {
199 | for j := 0; j < b.N; j++ {
200 | var hash uint64
201 | var ok bool
202 | var rs *RandStrobes
203 | var err error
204 |
205 | rs, err = NewRandStrobes(&seqs[i], _n3, _l3, _w_min, _w_max)
206 | if err != nil {
207 | b.Errorf("fail to create RandStrobes. seq length: %d", size)
208 | }
209 |
210 | for {
211 | hash, ok = rs.Next()
212 | if !ok {
213 | break
214 | }
215 |
216 | _hash = hash
217 | }
218 | }
219 | })
220 | }
221 | }
222 |
--------------------------------------------------------------------------------
/randstrobes.go:
--------------------------------------------------------------------------------
1 | package strobemers
2 |
3 | import "math"
4 |
// RandStrobes is an iterator over the randstrobes of a DNA sequence.
type RandStrobes struct {
	seq *[]byte // DNA sequence

	n    int // strobemer order
	l    int // strobe length
	wMin int // minimum window offset
	wMax int // maximum window offset

	// positions of the strobes m1, m2 and m3
	idx  int
	idx2 int
	idx3 int

	// hash values belonging to m1, m2 and m3
	hash1 uint64
	hash2 uint64
	hash3 uint64

	hashes []uint64 // precomputed hash values of all l-mers

	endHash int // position of the last l-mer
	endIdx  int // position of the last m1

	// start and end of the first and of the second search window
	wStart  int
	wEnd    int
	w2Start int
	w2End   int

	prime uint64 // mask applied to hash sums when selecting a strobe

	// shrink the last searching window for positions near the end of sequence.
	shrinkWindow bool

	// scratch variables used while scanning a window
	i    int
	hash uint64
}
33 |
34 | // NewRandStrobes creates a RandStrobes iterator.
35 | // Parameters:
36 | // n - strobemer order
37 | // l - strobes length
38 | // wMin - minimum window offset, wMin > 0
39 | // wMax - maximum window offset, wMin <= wMax.
40 | func NewRandStrobes(seq *[]byte, n int, l int, wMin int, wMax int) (*RandStrobes, error) {
41 | if seq == nil || len(*seq) == 0 {
42 | return nil, ErrInvalidSequence
43 | }
44 | if n < 2 {
45 | return nil, ErrInvalidOrder
46 | }
47 | if n > 3 {
48 | return nil, ErrOrderNotSupported
49 | }
50 | if len(*seq) < (n-1)*(wMax+1) {
51 | return nil, ErrSequenceTooShort
52 | }
53 | if l < 1 {
54 | return nil, ErrStrobeLengthTooSmall
55 | }
56 | if !(wMin > 0 && wMax > 0 && wMin <= wMax) {
57 | return nil, ErrInvalidWindowOffsets
58 | }
59 |
60 | rs := &RandStrobes{
61 | seq: seq,
62 | n: n,
63 | l: l,
64 | wMin: wMin,
65 | wMax: wMax,
66 |
67 | endHash: len(*seq) - l, // position of the last l-mer
68 | endIdx: len(*seq) - l - (n-1)*l, // position of the last m1
69 |
70 | shrinkWindow: true,
71 |
72 | prime: defaultPrimeNumber,
73 | }
74 |
75 | var err error
76 | rs.hashes, err = computeHashes(seq, l)
77 |
78 | return rs, err
79 | }
80 |
81 | // SetPrime sets the prime number (q) in minimizing h(m)+h(mj) mod q.
82 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1.
83 | // The value should not be too small, at least 256.
84 | func (rs *RandStrobes) SetPrime(q uint64) {
85 | if q < 256 {
86 | q = 256
87 | }
88 | rs.prime = roundup64(q) - 1
89 | }
90 |
91 | // SetWindowShrink decides whether shrink the search window at positions
92 | // near the end of the sequence. Default is true.
93 | func (rs *RandStrobes) SetWindowShrink(shrink bool) {
94 | rs.shrinkWindow = shrink
95 | }
96 |
97 | // Index returns the current index (0-based) of strobemers
98 | func (rs *RandStrobes) Index() int {
99 | return rs.idx - 1
100 | }
101 |
102 | // Indexes returns current indexes (0-based) of strobes
103 | func (rs *RandStrobes) Indexes() []int {
104 | return []int{rs.idx - 1, rs.idx2, rs.idx3}
105 | }
106 |
107 | // Next returns the next hash value of randstrobe
108 | func (rs *RandStrobes) Next() (uint64, bool) {
109 | switch rs.n {
110 | case 2:
111 | return rs.nextOrder2()
112 | case 3:
113 | return rs.nextOrder3()
114 | default:
115 | }
116 |
117 | return 0, false
118 | }
119 |
120 | func (rs *RandStrobes) nextOrder2() (uint64, bool) {
121 | if rs.idx > rs.endIdx {
122 | return 0, false
123 | }
124 |
125 | rs.wStart = rs.idx + rs.wMin
126 | rs.wEnd = rs.idx + rs.wMax
127 |
128 | // for positions near the end of the sequence, shrink the window size from the right
129 | if rs.wEnd > rs.endHash {
130 | if !rs.shrinkWindow {
131 | return 0, false
132 | }
133 | rs.wEnd = rs.endHash
134 | }
135 |
136 | // fmt.Printf("i:%d, window (%d-%d)\n", rs.idx, rs.wStart, rs.wEnd)
137 |
138 | rs.hash1 = rs.hashes[rs.idx]
139 | rs.hash2 = math.MaxUint64
140 | for rs.i = rs.wStart; rs.i <= rs.wEnd; rs.i++ {
141 | rs.hash = (rs.hash1 + rs.hashes[rs.i]) & rs.prime
142 | if rs.hash < rs.hash2 {
143 | rs.idx2 = rs.i
144 | rs.hash2 = rs.hash
145 | }
146 | }
147 | rs.hash2 = rs.hash1/2 + rs.hashes[rs.idx2]/3
148 |
149 | rs.idx++
150 | return rs.hash2, true
151 | }
152 |
153 | func (rs *RandStrobes) nextOrder3() (uint64, bool) {
154 | if rs.idx > rs.endIdx {
155 | return 0, false
156 | }
157 |
158 | rs.w2Start = rs.idx + rs.wMax + rs.wMin
159 | rs.w2End = rs.idx + rs.wMax<<1
160 | if rs.w2Start > rs.endHash {
161 | return 0, false
162 | }
163 | // for positions near the end of the sequence, shrink the last window size from the right
164 | if rs.w2End > rs.endHash {
165 | if !rs.shrinkWindow {
166 | return 0, false
167 | }
168 | rs.w2End = rs.endHash
169 | }
170 |
171 | rs.wStart = rs.idx + rs.wMin
172 | rs.wEnd = rs.idx + rs.wMax
173 |
174 | // fmt.Printf("i:%d, window (%d-%d)\n", rs.idx, rs.wStart, rs.wEnd)
175 | // fmt.Printf("i:%d, window2 (%d-%d)\n", rs.idx, rs.w2Start, rs.w2End)
176 |
177 | rs.hash1 = rs.hashes[rs.idx]
178 | rs.hash2 = math.MaxUint64
179 | for rs.i = rs.wStart; rs.i <= rs.wEnd; rs.i++ {
180 | rs.hash = (rs.hash1 + rs.hashes[rs.i]) & rs.prime
181 | if rs.hash < rs.hash2 {
182 | rs.idx2 = rs.i
183 | rs.hash2 = rs.hash
184 | }
185 | }
186 | rs.hash2 = rs.hash1/3 + rs.hashes[rs.idx2]/4
187 |
188 | rs.hash3 = math.MaxUint64
189 | for rs.i = rs.w2Start; rs.i <= rs.w2End; rs.i++ {
190 | rs.hash = (rs.hash2 + rs.hashes[rs.i]) & rs.prime
191 | if rs.hash < rs.hash3 {
192 | rs.idx3 = rs.i
193 | rs.hash3 = rs.hash
194 | }
195 | }
196 | rs.hash3 = rs.hash2 + rs.hashes[rs.idx3]/5
197 |
198 | rs.idx++
199 | return rs.hash3, true
200 | }
201 |
--------------------------------------------------------------------------------
/minstrobes.go:
--------------------------------------------------------------------------------
1 | package strobemers
2 |
3 | import (
4 | "math"
5 | )
6 |
// MinStrobes is an iterator over the minstrobes of a DNA sequence.
type MinStrobes struct {
	seq *[]byte // DNA sequence

	n    int // strobemer order
	l    int // strobe length
	wMin int // minimum window offset
	wMax int // maximum window offset

	// positions of the strobes m1, m2 and m3
	idx  int
	idx2 int
	idx3 int

	// hash values belonging to m1, m2 and m3
	hash1 uint64
	hash2 uint64
	hash3 uint64

	hashes []uint64 // precomputed hash values of all l-mers

	// Sliding-window minima as returned by computeMinHashes for a window
	// of wMax-wMin+1 l-mers: position and value of the smallest hash in
	// the window ending at each index.
	minlocs   []int
	minhashes []uint64

	endHash int // position of the last l-mer
	endIdx  int // position of the last m1

	// start and end of the first and of the second search window
	wStart  int
	wEnd    int
	w2Start int
	w2End   int

	prime uint64 // mask set by SetPrime

	// shrink the last searching window for positions near the end of sequence.
	shrinkWindow bool

	// scratch variables used while scanning a window
	i    int
	hash uint64
}
38 |
39 | // NewMinStrobes creates a MinStrobes iterator.
40 | // Parametems:
41 | // n - strobemer order
42 | // l - strobes length
43 | // wMin - minimum window offset, wMin > 0
44 | // wMax - maximum window offset, wMin <= wMax.
45 | func NewMinStrobes(seq *[]byte, n int, l int, wMin int, wMax int) (*MinStrobes, error) {
46 | if seq == nil || len(*seq) == 0 {
47 | return nil, ErrInvalidSequence
48 | }
49 | if n < 2 {
50 | return nil, ErrInvalidOrder
51 | }
52 | if n > 3 {
53 | return nil, ErrOrderNotSupported
54 | }
55 | if len(*seq) < (n-1)*(wMax+1) {
56 | return nil, ErrSequenceTooShort
57 | }
58 | if l < 1 {
59 | return nil, ErrStrobeLengthTooSmall
60 | }
61 | if !(wMin > 0 && wMax > 0 && wMin <= wMax) {
62 | return nil, ErrInvalidWindowOffsets
63 | }
64 |
65 | ms := &MinStrobes{
66 | seq: seq,
67 | n: n,
68 | l: l,
69 | wMin: wMin,
70 | wMax: wMax,
71 |
72 | endHash: len(*seq) - l, // position of the last l-mer
73 | endIdx: len(*seq) - l - (n-1)*l, // position of the last m1
74 |
75 | shrinkWindow: true,
76 |
77 | prime: defaultPrimeNumber,
78 | }
79 |
80 | var err error
81 | ms.hashes, err = computeHashes(seq, l)
82 | if err != nil {
83 | return nil, err
84 | }
85 |
86 | ms.minlocs, ms.minhashes = computeMinHashes(ms.hashes, wMax-wMin+1)
87 |
88 | return ms, err
89 | }
90 |
91 | // SetPrime sets the prime number (q) in minimizing h(m)+h(mj) mod q.
92 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1.
93 | // The value should not be too small, at least 256.
94 | func (ms *MinStrobes) SetPrime(q uint64) {
95 | if q < 256 {
96 | q = 256
97 | }
98 | ms.prime = roundup64(q) - 1
99 | }
100 |
101 | // SetWindowShrink decides whether shrink the search window at positions
102 | // near the end of the sequence. Default is true.
103 | func (ms *MinStrobes) SetWindowShrink(shrink bool) {
104 | ms.shrinkWindow = shrink
105 | }
106 |
107 | // Index returns the current index (0-based) of strobemers
108 | func (ms *MinStrobes) Index() int {
109 | return ms.idx - 1
110 | }
111 |
112 | // Indexes returns current indexes (0-based) of strobes
113 | func (ms *MinStrobes) Indexes() []int {
114 | return []int{ms.idx - 1, ms.idx2, ms.idx3}
115 | }
116 |
117 | // Next returns the next hash value of randstrobe
118 | func (ms *MinStrobes) Next() (uint64, bool) {
119 | switch ms.n {
120 | case 2:
121 | return ms.nextOrder2()
122 | case 3:
123 | return ms.nextOrder3()
124 | default:
125 | }
126 |
127 | return 0, false
128 | }
129 |
130 | func (ms *MinStrobes) nextOrder2() (uint64, bool) {
131 | if ms.idx > ms.endIdx {
132 | return 0, false
133 | }
134 |
135 | ms.wStart = ms.idx + ms.wMin
136 | ms.wEnd = ms.idx + ms.wMax
137 |
138 | // for positions near the end of the sequence, shrink the window size from the right
139 | if ms.wEnd > ms.endHash {
140 | if !ms.shrinkWindow {
141 | return 0, false
142 | }
143 | ms.wEnd = ms.endHash
144 |
145 | // fmt.Printf("i:%d, window (%d-%d)\n", ms.idx, ms.wStart, ms.wEnd)
146 |
147 | ms.hash1 = ms.hashes[ms.idx]
148 | ms.hash2 = math.MaxUint64
149 | for ms.i = ms.wStart; ms.i <= ms.wEnd; ms.i++ {
150 | ms.hash = ms.hashes[ms.i]
151 | if ms.hash < ms.hash2 {
152 | ms.idx2 = ms.i
153 | ms.hash2 = ms.hash
154 | }
155 | }
156 | // For 1) asymmetry, 2) avoid value overflow
157 | ms.hash2 = ms.hash1/2 + ms.hashes[ms.idx2]/3
158 | } else { // use precomputed min hashes
159 | ms.hash1 = ms.hashes[ms.idx]
160 | ms.idx2 = ms.minlocs[ms.wEnd]
161 | ms.hash2 = ms.hash1/2 + ms.minhashes[ms.wEnd]/3
162 | }
163 |
164 | ms.idx++
165 | return ms.hash2, true
166 | }
167 |
168 | func (ms *MinStrobes) nextOrder3() (uint64, bool) {
169 | if ms.idx > ms.endIdx {
170 | return 0, false
171 | }
172 |
173 | ms.w2Start = ms.idx + ms.wMax + ms.wMin
174 | ms.w2End = ms.idx + ms.wMax<<1
175 | if ms.w2Start > ms.endHash {
176 | return 0, false
177 | }
178 |
179 | ms.wStart = ms.idx + ms.wMin
180 | ms.wEnd = ms.idx + ms.wMax
181 |
182 | // use precomputed min hashes
183 | ms.hash1 = ms.hashes[ms.idx]
184 | ms.idx2 = ms.minlocs[ms.wEnd]
185 | ms.hash2 = ms.hash1/3 + ms.minhashes[ms.wEnd]/4
186 |
187 | // for positions near the end of the sequence, shrink the last window size from the right
188 | if ms.w2End > ms.endHash {
189 | if !ms.shrinkWindow {
190 | return 0, false
191 | }
192 | ms.w2End = ms.endHash
193 |
194 | ms.hash3 = math.MaxUint64
195 | for ms.i = ms.w2Start; ms.i <= ms.w2End; ms.i++ {
196 | ms.hash = (ms.hash2 + ms.hashes[ms.i]) & ms.prime
197 | if ms.hash < ms.hash3 {
198 | ms.idx3 = ms.i
199 | ms.hash3 = ms.hash
200 | }
201 | }
202 | ms.hash3 = ms.hash2 + ms.hashes[ms.idx3]/5
203 | } else {
204 | ms.idx3 = ms.minlocs[ms.w2End]
205 | ms.hash3 = ms.hash2 + ms.minhashes[ms.w2End]/5
206 | }
207 |
208 | // fmt.Printf("i:%d, window (%d-%d)\n", ms.idx, ms.wStart, ms.wEnd)
209 | // fmt.Printf("i:%d, window2 (%d-%d)\n", ms.idx, ms.w2Start, ms.w2End)
210 |
211 | ms.idx++
212 | return ms.hash3, true
213 | }
214 |
--------------------------------------------------------------------------------
/evaluation/README.md:
--------------------------------------------------------------------------------
1 | # Evaluation
2 |
3 | ## Number of matched strobemers
4 |
5 | [A similar test](https://github.com/BGI-Qingdao/strobemer_cpptest#benchmark_sim-r-match-only) with approximate results.
6 |
7 | query: 150bp, snp: 1 (0.006)
8 |
9 | $ go run test1_matches.go q0-snp1.fasta r0.fasta | csvtk pretty -t
10 | query ref method nQuery nRef nCommon qCov
11 | ------- --- ------------------------------ ------ ------- ------- -----
12 | q0-snp1 r0 Kmer(20) 131 1546586 111 84.73 *
13 | q0-snp1 r0 MinStrobes(2,10,12,12,shrink) 131 1548767 109 83.21
14 | q0-snp1 r0 MinStrobes(2,10,12,12) 129 1548765 109 84.50
15 | q0-snp1 r0 RankStrobes(2,10,12,12,shrink) 131 1548767 109 83.21
16 | q0-snp1 r0 RankStrobes(2,10,12,12) 129 1548765 109 84.50
17 |
18 | q0-snp1 r0 Kmer(21) 130 1547218 109 83.85
19 | q0-snp1 r0 MinStrobes(3,7,9,9,shrink) 126 1549315 108 85.71 *
20 | q0-snp1 r0 MinStrobes(3,7,9,9) 126 1549315 108 85.71 *
21 | q0-snp1 r0 RankStrobes(3,7,9,9,shrink) 126 1549315 108 85.71 *
22 | q0-snp1 r0 RankStrobes(3,7,9,9) 126 1549315 108 85.71 *
23 |
24 | q0-snp1 r0 Kmer(20) 131 1546586 111 84.73
25 | q0-snp1 r0 MinStrobes(2,10,12,16,shrink) 131 1548376 107 81.68
26 | q0-snp1 r0 MinStrobes(2,10,12,16) 125 1548370 107 85.60 *
27 | q0-snp1 r0 RankStrobes(2,10,12,16,shrink) 131 1548438 108 82.44
28 | q0-snp1 r0 RankStrobes(2,10,12,16) 125 1548432 108 86.40 *
29 |
30 | q0-snp1 r0 Kmer(21) 130 1547218 109 83.85
31 | q0-snp1 r0 MinStrobes(3,7,9,13,shrink) 122 1545403 102 83.61
32 | q0-snp1 r0 MinStrobes(3,7,9,13) 118 1545399 102 86.44 **
33 | q0-snp1 r0 RankStrobes(3,7,9,13,shrink) 122 1545522 107 87.70 **
34 | q0-snp1 r0 RankStrobes(3,7,9,13) 118 1545518 105 88.98 **
35 |
36 | query: 150bp, snp: 3 (0.02)
37 |
38 | $ go run test1_matches.go q2-snp3.fasta r2.fasta | csvtk pretty -t
39 | query ref method nQuery nRef nCommon qCov
40 | ------- --- ------------------------------ ------ ------- ------- -----
41 | q2-snp3 r2 Kmer(20) 131 1687558 84 64.12 *
42 | q2-snp3 r2 MinStrobes(2,10,12,12,shrink) 131 1687785 82 62.60
43 | q2-snp3 r2 MinStrobes(2,10,12,12) 129 1687783 82 63.57
44 | q2-snp3 r2 RankStrobes(2,10,12,12,shrink) 131 1687785 82 62.60
45 | q2-snp3 r2 RankStrobes(2,10,12,12) 129 1687783 82 63.57
46 |
47 | q2-snp3 r2 Kmer(21) 130 1687656 82 63.08
48 | q2-snp3 r2 MinStrobes(3,7,9,9,shrink) 126 1687865 84 66.67 *
49 | q2-snp3 r2 MinStrobes(3,7,9,9) 126 1687865 84 66.67 *
50 | q2-snp3 r2 RankStrobes(3,7,9,9,shrink) 126 1687865 84 66.67 *
51 | q2-snp3 r2 RankStrobes(3,7,9,9) 126 1687865 84 66.67 *
52 |
53 | q2-snp3 r2 Kmer(20) 131 1687558 84 64.12 *
54 | q2-snp3 r2 MinStrobes(2,10,12,16,shrink) 131 1687487 76 58.02
55 | q2-snp3 r2 MinStrobes(2,10,12,16) 125 1687481 76 60.80
56 | q2-snp3 r2 RankStrobes(2,10,12,16,shrink) 131 1687529 77 58.78
57 | q2-snp3 r2 RankStrobes(2,10,12,16) 125 1687523 76 60.80
58 |
59 | q2-snp3 r2 Kmer(21) 130 1687656 82 63.08 *
60 | q2-snp3 r2 MinStrobes(3,7,9,13,shrink) 122 1684611 74 60.66
61 | q2-snp3 r2 MinStrobes(3,7,9,13) 118 1684607 72 61.02
62 | q2-snp3 r2 RankStrobes(3,7,9,13,shrink) 122 1684661 71 58.20
63 | q2-snp3 r2 RankStrobes(3,7,9,13) 118 1684657 68 57.63
64 |
65 | query: 150bp, snp: 7 (0.047)
66 |
67 | $ go run test1_matches.go q1-snp7.rc.fasta r1.fasta | csvtk pretty -t
68 | query ref method nQuery nRef nCommon qCov
69 | ---------- --- ------------------------------ ------ ------- ------- -----
70 | q1-snp7.rc r1 Kmer(20) 131 2802879 54 41.22 *
71 | q1-snp7.rc r1 MinStrobes(2,10,12,12,shrink) 131 2804781 52 39.69
72 | q1-snp7.rc r1 MinStrobes(2,10,12,12) 129 2804779 52 40.31
73 | q1-snp7.rc r1 RankStrobes(2,10,12,12,shrink) 131 2804781 52 39.69
74 | q1-snp7.rc r1 RankStrobes(2,10,12,12) 129 2804779 52 40.31
75 |
76 | q1-snp7.rc r1 Kmer(21) 130 2804365 51 39.23 *
77 | q1-snp7.rc r1 MinStrobes(3,7,9,9,shrink) 126 2806161 48 38.10
78 | q1-snp7.rc r1 MinStrobes(3,7,9,9) 126 2806161 48 38.10
79 | q1-snp7.rc r1 RankStrobes(3,7,9,9,shrink) 126 2806161 48 38.10
80 | q1-snp7.rc r1 RankStrobes(3,7,9,9) 126 2806161 48 38.10
81 |
82 | q1-snp7.rc r1 Kmer(20) 131 2802879 54 41.22 *
83 | q1-snp7.rc r1 MinStrobes(2,10,12,16,shrink) 131 2803507 51 38.93
84 | q1-snp7.rc r1 MinStrobes(2,10,12,16) 125 2803501 51 40.80
85 | q1-snp7.rc r1 RankStrobes(2,10,12,16,shrink) 131 2803659 47 35.88
86 | q1-snp7.rc r1 RankStrobes(2,10,12,16) 125 2803653 44 35.20
87 |
88 | q1-snp7.rc r1 Kmer(21) 130 2804365 51 39.23 *
89 | q1-snp7.rc r1 MinStrobes(3,7,9,13,shrink) 122 2797218 36 29.51
90 | q1-snp7.rc r1 MinStrobes(3,7,9,13) 118 2797214 36 30.51
91 | q1-snp7.rc r1 RankStrobes(3,7,9,13,shrink) 122 2797918 42 34.43
92 | q1-snp7.rc r1 RankStrobes(3,7,9,13) 118 2797914 41 34.75
93 |
--------------------------------------------------------------------------------
/evaluation/test1_matches.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "os"
7 | "path/filepath"
8 | "runtime"
9 | "strings"
10 | "sync"
11 |
12 | "github.com/shenwei356/bio/seqio/fastx"
13 | "github.com/shenwei356/strobemers"
14 | "github.com/will-rowe/nthash"
15 | )
16 |
17 | func main() {
18 | args := os.Args
19 | if len(args) != 3 {
20 | checkError(fmt.Errorf("usage: %s query.fasta ref.fasta", os.Args[0]))
21 | }
22 |
23 | fileQuery, fileRef := args[1], args[2]
24 |
25 | q, _ := filepathTrimExtension(filepath.Base(fileQuery))
26 | r, _ := filepathTrimExtension(filepath.Base(fileRef))
27 |
28 | type Test struct {
29 | n int
30 | l int
31 | wMin int
32 | wMax int
33 | }
34 |
35 | tests := []Test{
36 | {n: 2, l: 10, wMin: 12, wMax: 12},
37 | {n: 3, l: 7, wMin: 9, wMax: 9},
38 |
39 | {n: 2, l: 10, wMin: 12, wMax: 16},
40 | {n: 3, l: 7, wMin: 9, wMax: 13},
41 | }
42 |
43 | seqsQ := readSeqs(fileQuery)
44 | seqsR := readSeqs(fileRef)
45 |
46 | var kmersQ, kmersR map[uint64]interface{}
47 | var rstrobesSQ, rstrobesSR, rstrobesQ, rstrobesR map[uint64]interface{}
48 | var mstrobesSQ, mstrobesSR, mstrobesQ, mstrobesR map[uint64]interface{}
49 | var kmersInter, rstrobesSInter, rstrobesInter, mstrobesSInter, mstrobesInter int
50 |
51 | fmt.Printf("query\tref\tmethod\tnQuery\tnRef\tnCommon\tqCov\n")
52 | runtime.GOMAXPROCS(10)
53 | for _, t := range tests {
54 | var wg sync.WaitGroup
55 | wg.Add(10)
56 |
57 | // kmers
58 | go func() {
59 | kmersQ = list2map(computeKmers(seqsQ, t.n*t.l))
60 | wg.Done()
61 | }()
62 | go func() {
63 | kmersR = list2map(computeKmers(seqsR, t.n*t.l))
64 | wg.Done()
65 | }()
66 |
67 | // randstrobes
68 | go func() {
69 | rstrobesSQ = list2map(computeRandStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, true))
70 | wg.Done()
71 | }()
72 | go func() {
73 | rstrobesSR = list2map(computeRandStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, true))
74 | wg.Done()
75 | }()
76 | go func() {
77 | rstrobesQ = list2map(computeRandStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, false))
78 | wg.Done()
79 | }()
80 | go func() {
81 | rstrobesR = list2map(computeRandStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, false))
82 | wg.Done()
83 | }()
84 |
85 | // minstrobes
86 | go func() {
87 | mstrobesSQ = list2map(computeMinStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, true))
88 | wg.Done()
89 | }()
90 | go func() {
91 | mstrobesSR = list2map(computeMinStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, true))
92 | wg.Done()
93 | }()
94 | go func() {
95 | mstrobesQ = list2map(computeMinStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, false))
96 | wg.Done()
97 | }()
98 | go func() {
99 | mstrobesR = list2map(computeMinStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, false))
100 | wg.Done()
101 | }()
102 | wg.Wait()
103 |
104 | // intersection
105 |
106 | var wg2 sync.WaitGroup
107 | wg2.Add(5)
108 | go func() {
109 | kmersInter = intersection(kmersQ, kmersR)
110 | wg2.Done()
111 | }()
112 |
113 | go func() {
114 | rstrobesSInter = intersection(rstrobesSQ, rstrobesSR)
115 | wg2.Done()
116 | }()
117 | go func() {
118 | rstrobesInter = intersection(rstrobesQ, rstrobesR)
119 | wg2.Done()
120 | }()
121 |
122 | go func() {
123 | mstrobesSInter = intersection(mstrobesSQ, mstrobesSR)
124 | wg2.Done()
125 | }()
126 | go func() {
127 | mstrobesInter = intersection(mstrobesQ, mstrobesR)
128 | wg2.Done()
129 | }()
130 |
131 | wg2.Wait()
132 |
133 | // kmers
134 | fmt.Printf("%s\t%s\tKmer(%d)\t%d\t%d\t%d\t%.2f\n",
135 | q, r, t.n*t.l, len(kmersQ), len(kmersR),
136 | kmersInter, float64(kmersInter)/float64(len(kmersQ))*100)
137 |
138 | // minstrobes
139 | fmt.Printf("%s\t%s\tMinStrobes(%d,%d,%d,%d,shrink)\t%d\t%d\t%d\t%.2f\n",
140 | q, r, t.n, t.l, t.wMin, t.wMax, len(mstrobesSQ), len(mstrobesSR),
141 | mstrobesSInter, float64(mstrobesSInter)/float64(len(mstrobesSQ))*100)
142 | fmt.Printf("%s\t%s\tMinStrobes(%d,%d,%d,%d)\t%d\t%d\t%d\t%.2f\n",
143 | q, r, t.n, t.l, t.wMin, t.wMax, len(mstrobesQ), len(mstrobesR),
144 | mstrobesInter, float64(mstrobesInter)/float64(len(mstrobesQ))*100)
145 |
146 | // randstrobes
147 | fmt.Printf("%s\t%s\tRankStrobes(%d,%d,%d,%d,shrink)\t%d\t%d\t%d\t%.2f\n",
148 | q, r, t.n, t.l, t.wMin, t.wMax, len(rstrobesSQ), len(rstrobesSR),
149 | rstrobesSInter, float64(rstrobesSInter)/float64(len(rstrobesSQ))*100)
150 | fmt.Printf("%s\t%s\tRankStrobes(%d,%d,%d,%d)\t%d\t%d\t%d\t%.2f\n",
151 | q, r, t.n, t.l, t.wMin, t.wMax, len(rstrobesQ), len(rstrobesR),
152 | rstrobesInter, float64(rstrobesInter)/float64(len(rstrobesQ))*100)
153 |
154 | fmt.Printf(" \t \t \t \t \t \t \n")
155 | }
156 |
157 | }
158 |
// checkError does nothing when e is nil. Otherwise it prints e to standard
// error and terminates the program with exit status 1, so that callers of
// the program can detect the failure.
func checkError(e error) {
	if e == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "%s\n", e)
	os.Exit(1)
}
165 |
166 | func readSeqs(file string) [][]byte {
167 | reader, err := fastx.NewDefaultReader(file)
168 | checkError(err)
169 |
170 | sequences := make([][]byte, 0, 8)
171 |
172 | var record *fastx.Record
173 | for {
174 | record, err = reader.Read()
175 | if err != nil {
176 | if err == io.EOF {
177 | break
178 | }
179 | checkError(err)
180 | break
181 | }
182 |
183 | sequences = append(sequences, record.Seq.Seq)
184 | }
185 |
186 | return sequences
187 | }
188 |
189 | func computeKmers(sequences [][]byte, k int) []uint64 {
190 | hashes := make([]uint64, 0, 1024)
191 |
192 | var hash uint64
193 | var ok bool
194 | var hasher *nthash.NTHi
195 | var err error
196 | for _, _seq := range sequences {
197 | hasher, err = nthash.NewHasher(&_seq, uint(k))
198 | checkError(err)
199 |
200 | for {
201 | hash, ok = hasher.Next(true)
202 | if !ok {
203 | break
204 | }
205 |
206 | hashes = append(hashes, hash)
207 | }
208 | }
209 |
210 | return hashes
211 | }
212 |
213 | func computeRandStrobes(sequences [][]byte, n int, l int, wMin int, wMax int, shrink bool) []uint64 {
214 | hashes := make([]uint64, 0, 1024)
215 |
216 | var hash uint64
217 | var ok bool
218 | var rs *strobemers.RandStrobes
219 | var err error
220 |
221 | for _, _seq := range sequences {
222 | rs, err = strobemers.NewRandStrobes(&_seq, n, l, wMin, wMax)
223 | checkError(err)
224 |
225 | rs.SetWindowShrink(shrink)
226 | for {
227 | hash, ok = rs.Next()
228 | if !ok {
229 | break
230 | }
231 |
232 | hashes = append(hashes, hash)
233 | }
234 | }
235 |
236 | return hashes
237 | }
238 |
239 | func computeMinStrobes(sequences [][]byte, n int, l int, wMin int, wMax int, shrink bool) []uint64 {
240 | hashes := make([]uint64, 0, 1024)
241 |
242 | var hash uint64
243 | var ok bool
244 | var rs *strobemers.MinStrobes
245 | var err error
246 |
247 | for _, _seq := range sequences {
248 | rs, err = strobemers.NewMinStrobes(&_seq, n, l, wMin, wMax)
249 | checkError(err)
250 |
251 | rs.SetWindowShrink(shrink)
252 | for {
253 | hash, ok = rs.Next()
254 | if !ok {
255 | break
256 | }
257 |
258 | hashes = append(hashes, hash)
259 | }
260 | }
261 |
262 | return hashes
263 | }
264 |
// list2map turns a list of hashes into a set: the returned map has one key
// per distinct value in data, each mapped to an empty struct.
func list2map(data []uint64) map[uint64]interface{} {
	set := make(map[uint64]interface{}, len(data))
	for i := range data {
		set[data[i]] = struct{}{}
	}
	return set
}
272 |
// intersection returns the number of keys of m1 that are also keys of m2.
func intersection(m1, m2 map[uint64]interface{}) int {
	shared := 0
	for key := range m1 {
		_, found := m2[key]
		if !found {
			continue
		}
		shared++
	}
	return shared
}
283 |
// filepathTrimExtension splits file into its name and its extension. A
// trailing ".gz" or ".GZ" is treated as part of the extension and is always
// reported as lower-case ".gz", e.g. "ref.fa.gz" gives ("ref", ".fa.gz").
func filepathTrimExtension(file string) (string, string) {
	const gzExt = ".gz"

	compressed := strings.HasSuffix(file, gzExt) || strings.HasSuffix(file, ".GZ")
	if compressed {
		file = file[:len(file)-len(gzExt)]
	}

	extension := filepath.Ext(file)
	name := strings.TrimSuffix(file, extension)

	if !compressed {
		return name, extension
	}
	return name, extension + gzExt
}
296 |
--------------------------------------------------------------------------------