├── speed.xlsx ├── illustration.xlsx ├── illustration_randstrobes_order2.jpg ├── illustration_randstrobes_order3.jpg ├── strobemers.go ├── evaluation ├── q1-snp7.fasta ├── q2-snp3.fasta ├── r0.s.fasta ├── r1.s.fasta ├── r2.s.fasta ├── q1-snp7.rc.fasta ├── q2-snp3-gap1.fasta ├── q0-snp1.fasta ├── q2-snp3.fasta.blastn ├── q2-snp3-gap1.fasta.blastn ├── q0-snp1.fasta.blastn ├── q1-snp7.fasta.blastn ├── README.md └── test1_matches.go ├── go.mod ├── .gitignore ├── LICENSE ├── util.go ├── randstrobes_test.go ├── minstrobes_test.go ├── go.sum ├── README.md ├── common.go ├── strobemers_test.go ├── randstrobes.go └── minstrobes.go /speed.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/speed.xlsx -------------------------------------------------------------------------------- /illustration.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration.xlsx -------------------------------------------------------------------------------- /illustration_randstrobes_order2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration_randstrobes_order2.jpg -------------------------------------------------------------------------------- /illustration_randstrobes_order3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/strobemers/HEAD/illustration_randstrobes_order3.jpg -------------------------------------------------------------------------------- /strobemers.go: -------------------------------------------------------------------------------- 1 | //Package strobemers is a Go implementation of the https://github.com/ksahlin/strobemers. 2 | 3 | package strobemers 4 | -------------------------------------------------------------------------------- /evaluation/q1-snp7.fasta: -------------------------------------------------------------------------------- 1 | >q1 2 | CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCTGAAGGGCT 3 | GCCGAAGGACAAGTCGCCGAGCAAGCCCGCCAGCTTCCGCTGTCTGGGCAAGCCGGAACC 4 | GGCGGTACAGAAGATCCTCGACCAACGACT 5 | -------------------------------------------------------------------------------- /evaluation/q2-snp3.fasta: -------------------------------------------------------------------------------- 1 | >q2 2 | GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG 3 | CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 4 | GACACCCGTCACAAGCATGACATGATAATT 5 | -------------------------------------------------------------------------------- /evaluation/r0.s.fasta: -------------------------------------------------------------------------------- 1 | >r0 2 | CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC 3 | GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG 4 | AGGAAATGGAAGAAACCGTGGACTTCCTGG 5 | -------------------------------------------------------------------------------- /evaluation/r1.s.fasta: -------------------------------------------------------------------------------- 1 | >r1 2 | CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCCGAAGGGCT 3 | GCCGCAGGACAAGTCGCCGAGCAAGCACGCCAGCTTTCGCTGGCTGGGCAAGCCGGAACC 4 | GGCGGTACAGAAGATCCTCGACGAACAACT 5 | -------------------------------------------------------------------------------- /evaluation/r2.s.fasta: -------------------------------------------------------------------------------- 1 | >r2 2 | GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACTG 3 | CGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 4 | GACACCCGTCACAAGCATGACATGATAATT 5 | -------------------------------------------------------------------------------- /evaluation/q1-snp7.rc.fasta: -------------------------------------------------------------------------------- 1 | >q1_rc 2 | AGTCGTTGGTCGAGGATCTTCTGTACCGCCGGTTCCGGCTTGCCCAGACAGCGGAAGCTG 3 | GCGGGCTTGCTCGGCGACTTGTCCTTCGGCAGCCCTTCAGGCACCTGCACGTAGAGCCCG 4 | TAGGCATGACTCTTGTCCCAATCGAAGGCG 5 | -------------------------------------------------------------------------------- /evaluation/q2-snp3-gap1.fasta: -------------------------------------------------------------------------------- 1 | >q2 2 | GAGGAATTAACGAACAGATACACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG 3 | CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 4 | GACACCCGTCACAAGCATGACATGATAATT 5 | -------------------------------------------------------------------------------- /evaluation/q0-snp1.fasta: -------------------------------------------------------------------------------- 1 | >RL|S1|R634/1 2 | CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC 3 | GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG 4 | AGGAAATGCAAGAAACCGTGGACTTCCTGG 5 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/shenwei356/strobemers 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/shenwei356/bio v0.1.0 7 | github.com/shenwei356/util v0.3.0 8 | github.com/will-rowe/nthash v0.3.0 9 | github.com/zeebo/xxh3 v0.10.0 10 | ) 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | *.directory 18 | doc/site/* 19 | 20 | *.brename_detail.txt 21 | 22 | *cpu.pprof 23 | *mem.pprof 24 | *trace.out 25 | 26 | t_* 27 | 28 | blastdb 29 | test/test 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Wei Shen (shenwei356@gmail.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package strobemers 2 | 3 | func roundup64(x uint64) uint64 { 4 | if x == 0 { 5 | return 1 6 | } 7 | x-- 8 | x |= x >> 1 9 | x |= x >> 2 10 | x |= x >> 4 11 | x |= x >> 8 12 | x |= x >> 16 13 | x |= x >> 32 14 | return (x | x>>64) + 1 15 | } 16 | 17 | // only used in tests 18 | var cbases [256]byte = [256]byte{ 19 | 'T', 'G', 'C', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 20 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 21 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 22 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 23 | 'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 24 | 'N', 'N', 'N', 'N', 'A', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 25 | 'N', 'T', 'N', 'G', 'N', 'N', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 26 | 'N', 'N', 'N', 'N', 'A', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 27 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 28 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 29 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 30 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 31 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 32 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 33 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 34 | 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 35 | } 36 | -------------------------------------------------------------------------------- /randstrobes_test.go: -------------------------------------------------------------------------------- 1 | package strobemers 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | func TestRandStrobesOrder2(t *testing.T) { 10 | _s := "ACGATCTGGTACCTAG" 11 | s := []byte(_s) 12 | 13 | n := 2 14 | l := 3 15 | wMin := 3 16 | wMax := 5 17 | rs, err := NewRandStrobes(&s, n, l, wMin, wMax) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | 22 | var h uint64 23 | var ok bool 24 | var ps []int 25 | var i1, i2 int 26 | for { 27 | h, ok = rs.Next() 28 | if !ok { 29 | break 30 | } 31 | 32 | if !debug { 33 | continue 34 | } 35 | 36 | ps = rs.Indexes() 37 | i1, i2 = ps[0], ps[1] 38 | fmt.Printf("%s len:%d\n", _s, len(_s)) 39 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1) 40 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2) 41 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h) 42 | fmt.Println() 43 | } 44 | } 45 | 46 | func TestRandStrobesOrder3(t *testing.T) { 47 | _s := "ACGATCTGGTACCTAG" 48 | s := []byte(_s) 49 | 50 | n := 3 51 | l := 3 52 | wMin := 3 53 | wMax := 5 54 | rs, err := NewRandStrobes(&s, n, l, wMin, wMax) 55 | if err != nil { 56 | t.Error(err) 57 | } 58 | 59 | var h uint64 60 | var ok bool 61 | var ps []int 62 | var i1, i2, i3 int 63 | for { 64 | h, ok = rs.Next() 65 | if !ok { 66 | break 67 | } 68 | 69 | if !debug { 70 | continue 71 | } 72 | 73 | ps = rs.Indexes() 74 | i1, i2, i3 = ps[0], ps[1], ps[2] 75 | fmt.Printf("%s len:%d\n", _s, len(_s)) 76 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1) 77 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2) 78 | fmt.Printf("%s%s i3:%d\n", strings.Repeat(" ", i3), _s[i3:i3+l], i3) 79 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h) 80 | fmt.Println() 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /minstrobes_test.go: -------------------------------------------------------------------------------- 1 | package strobemers 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | ) 8 | 9 | func TestMinStrobesOrders2(t *testing.T) { 10 | _s := "ACGATCTGGTACCTAG" 11 | s := []byte(_s) 12 | 13 | n := 2 14 | l := 3 15 | wMin := 3 16 | wMax := 5 17 | ms, err := NewMinStrobes(&s, n, l, wMin, wMax) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | 22 | // for i, m := range ms.minhashes { 23 | // fmt.Println(i, m) 24 | // } 25 | 26 | var h uint64 27 | var ok bool 28 | var ps []int 29 | var i1, i2 int 30 | for { 31 | h, ok = ms.Next() 32 | if !ok { 33 | break 34 | } 35 | 36 | if !debug { 37 | continue 38 | } 39 | 40 | ps = ms.Indexes() 41 | i1, i2 = ps[0], ps[1] 42 | fmt.Printf("%s len:%d\n", _s, len(_s)) 43 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1) 44 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2) 45 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h) 46 | fmt.Println() 47 | } 48 | } 49 | 50 | func TestMinStrobesOrder3(t *testing.T) { 51 | _s := "ACGATCTGGTACCTAG" 52 | s := []byte(_s) 53 | 54 | n := 3 55 | l := 3 56 | wMin := 3 57 | wMax := 5 58 | rs, err := NewMinStrobes(&s, n, l, wMin, wMax) 59 | if err != nil { 60 | t.Error(err) 61 | } 62 | 63 | var h uint64 64 | var ok bool 65 | var ps []int 66 | var i1, i2, i3 int 67 | for { 68 | h, ok = rs.Next() 69 | if !ok { 70 | break 71 | } 72 | 73 | if !debug { 74 | continue 75 | } 76 | 77 | ps = rs.Indexes() 78 | i1, i2, i3 = ps[0], ps[1], ps[2] 79 | fmt.Printf("%s len:%d\n", _s, len(_s)) 80 | fmt.Printf("%s%s i1:%d\n", strings.Repeat(" ", i1), _s[i1:i1+l], i1) 81 | fmt.Printf("%s%s i2:%d\n", strings.Repeat(" ", i2), _s[i2:i2+l], i2) 82 | fmt.Printf("%s%s i3:%d\n", strings.Repeat(" ", i3), _s[i3:i3+l], i3) 83 | fmt.Printf("%s%d\n", strings.Repeat(" ", len(_s)+1), h) 84 | fmt.Println() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /evaluation/q2-snp3.fasta.blastn: -------------------------------------------------------------------------------- 1 | BLASTN 2.11.0+ 2 | 3 | 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J 6 | Comput Biol 2000; 7(1-2):203-14. 7 | 8 | 9 | 10 | Database: r2.fasta 11 | 1 sequences; 1,688,298 total letters 12 | 13 | 14 | 15 | Query= q2 16 | 17 | Length=150 18 | Score E 19 | Sequences producing significant alignments: (Bits) Value 20 | 21 | r2 261 4e-71 22 | 23 | 24 | >r2 25 | Length=1688298 26 | 27 | Score = 261 bits (141), Expect = 4e-71 28 | Identities = 147/150 (98%), Gaps = 0/150 (0%) 29 | Strand=Plus/Plus 30 | 31 | Query 1 GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACTG 60 32 | ||||||||||||||||||||||||||||||||||||||||||||| |||||||||||||| 33 | Sbjct 869619 GAGGAATTAACGAACAGATAACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACTG 869678 34 | 35 | Query 61 CGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 120 36 | ||| |||||||| ||||||||||||||||||||||||||||||||||||||||||||||| 37 | Sbjct 869679 CGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCGA 869738 38 | 39 | Query 121 GACACCCGTCACAAGCATGACATGATAATT 150 40 | |||||||||||||||||||||||||||||| 41 | Sbjct 869739 GACACCCGTCACAAGCATGACATGATAATT 869768 42 | 43 | 44 | 45 | Lambda K H 46 | 1.33 0.621 1.12 47 | 48 | Gapped 49 | Lambda K H 50 | 1.28 0.460 0.850 51 | 52 | Effective search space used: 221164549 53 | 54 | 55 | Database: r2.fasta 56 | Posted date: Apr 14, 2021 7:44 PM 57 | Number of letters in database: 1,688,298 58 | Number of sequences in database: 1 59 | 60 | 61 | 62 | Matrix: blastn matrix 1 -2 63 | Gap Penalties: Existence: 0, Extension: 2.5 64 | -------------------------------------------------------------------------------- /evaluation/q2-snp3-gap1.fasta.blastn: -------------------------------------------------------------------------------- 1 | BLASTN 2.11.0+ 2 | 3 | 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J 6 | Comput Biol 2000; 7(1-2):203-14. 7 | 8 | 9 | 10 | Database: r2.fasta 11 | 1 sequences; 1,688,298 total letters 12 | 13 | 14 | 15 | Query= q2 16 | 17 | Length=151 18 | Score E 19 | Sequences producing significant alignments: (Bits) Value 20 | 21 | r2 255 2e-69 22 | 23 | 24 | >r2 25 | Length=1688298 26 | 27 | Score = 255 bits (138), Expect = 2e-69 28 | Identities = 147/151 (97%), Gaps = 1/151 (1%) 29 | Strand=Plus/Plus 30 | 31 | Query 1 GAGGAATTAACGAACAGATACACGCATATTGTCCCGTTTGATTGAAAACGGATGTGAACT 60 32 | |||||||||||||||||||| ||||||||||||||||||||||||| ||||||||||||| 33 | Sbjct 869619 GAGGAATTAACGAACAGATA-ACGCATATTGTCCCGTTTGATTGAAGACGGATGTGAACT 869677 34 | 35 | Query 61 GCGAGCGACTGACTCTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCG 120 36 | |||| |||||||| |||||||||||||||||||||||||||||||||||||||||||||| 37 | Sbjct 869678 GCGAACGACTGACACTATACTTGCCACGATATGGATCTGAAACGTAGAACGAACCGCTCG 869737 38 | 39 | Query 121 AGACACCCGTCACAAGCATGACATGATAATT 151 40 | ||||||||||||||||||||||||||||||| 41 | Sbjct 869738 AGACACCCGTCACAAGCATGACATGATAATT 869768 42 | 43 | 44 | 45 | Lambda K H 46 | 1.33 0.621 1.12 47 | 48 | Gapped 49 | Lambda K H 50 | 1.28 0.460 0.850 51 | 52 | Effective search space used: 222852828 53 | 54 | 55 | Database: r2.fasta 56 | Posted date: Apr 14, 2021 7:44 PM 57 | Number of letters in database: 1,688,298 58 | Number of sequences in database: 1 59 | 60 | 61 | 62 | Matrix: blastn matrix 1 -2 63 | Gap Penalties: Existence: 0, Extension: 2.5 64 | -------------------------------------------------------------------------------- /evaluation/q0-snp1.fasta.blastn: -------------------------------------------------------------------------------- 1 | BLASTN 2.11.0+ 2 | 3 | 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J 6 | Comput Biol 2000; 7(1-2):203-14. 7 | 8 | 9 | 10 | Database: r0.fasta 11 | 1 sequences; 1,560,393 total letters 12 | 13 | 14 | 15 | Query= RL|S1|R634/1 16 | 17 | Length=150 18 | Score E 19 | Sequences producing significant alignments: (Bits) Value 20 | 21 | r0 272 2e-74 22 | 23 | 24 | >r0 25 | Length=1560393 26 | 27 | Score = 272 bits (147), Expect = 2e-74 28 | Identities = 149/150 (99%), Gaps = 0/150 (0%) 29 | Strand=Plus/Plus 30 | 31 | Query 1 CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC 60 32 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 33 | Sbjct 396270 CCTGCGTGGTGGCCGACTTGCCGTTCGCCAGCTACCAGGAATCGCCCCGACAGGCGTTCC 396329 34 | 35 | Query 61 GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG 120 36 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 37 | Sbjct 396330 GCAACGCCGCACGCCTGCTGGCCGACAGCGGCGCCCAGGCGGTGAAGCTGGAAGGCGGTG 396389 38 | 39 | Query 121 AGGAAATGCAAGAAACCGTGGACTTCCTGG 150 40 | |||||||| ||||||||||||||||||||| 41 | Sbjct 396390 AGGAAATGGAAGAAACCGTGGACTTCCTGG 396419 42 | 43 | 44 | 45 | Lambda K H 46 | 1.33 0.621 1.12 47 | 48 | Gapped 49 | Lambda K H 50 | 1.28 0.460 0.850 51 | 52 | Effective search space used: 204408994 53 | 54 | 55 | Database: r0.fasta 56 | Posted date: Apr 14, 2021 7:44 PM 57 | Number of letters in database: 1,560,393 58 | Number of sequences in database: 1 59 | 60 | 61 | 62 | Matrix: blastn matrix 1 -2 63 | Gap Penalties: Existence: 0, Extension: 2.5 64 | -------------------------------------------------------------------------------- /evaluation/q1-snp7.fasta.blastn: -------------------------------------------------------------------------------- 1 | BLASTN 2.11.0+ 2 | 3 | 4 | Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb 5 | Miller (2000), "A greedy algorithm for aligning DNA sequences", J 6 | Comput Biol 2000; 7(1-2):203-14. 7 | 8 | 9 | 10 | Database: r1.fasta 11 | 1 sequences; 2,833,277 total letters 12 | 13 | 14 | 15 | Query= q1 16 | 17 | Length=150 18 | Score E 19 | Sequences producing significant alignments: (Bits) Value 20 | 21 | r1 239 3e-64 22 | 23 | 24 | >r1 25 | Length=2833277 26 | 27 | Score = 239 bits (129), Expect = 3e-64 28 | Identities = 143/150 (95%), Gaps = 0/150 (0%) 29 | Strand=Plus/Minus 30 | 31 | Query 1 CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCTGAAGGGCT 60 32 | ||||||||||||||||||||||||||||||||||||||||||||||||||| |||||||| 33 | Sbjct 2741385 CGCCTTCGATTGGGACAAGAGTCATGCCTACGGGCTCTACGTGCAGGTGCCCGAAGGGCT 2741326 34 | 35 | Query 61 GCCGAAGGACAAGTCGCCGAGCAAGCCCGCCAGCTTCCGCTGTCTGGGCAAGCCGGAACC 120 36 | |||| ||||||||||||||||||||| ||||||||| ||||| ||||||||||||||||| 37 | Sbjct 2741325 GCCGCAGGACAAGTCGCCGAGCAAGCACGCCAGCTTTCGCTGGCTGGGCAAGCCGGAACC 2741266 38 | 39 | Query 121 GGCGGTACAGAAGATCCTCGACCAACGACT 150 40 | |||||||||||||||||||||| ||| ||| 41 | Sbjct 2741265 GGCGGTACAGAAGATCCTCGACGAACAACT 2741236 42 | 43 | 44 | 45 | Lambda K H 46 | 1.33 0.621 1.12 47 | 48 | Gapped 49 | Lambda K H 50 | 1.28 0.460 0.850 51 | 52 | Effective search space used: 368323410 53 | 54 | 55 | Database: r1.fasta 56 | Posted date: Apr 14, 2021 7:44 PM 57 | Number of letters in database: 2,833,277 58 | Number of sequences in database: 1 59 | 60 | 61 | 62 | Matrix: blastn matrix 1 -2 63 | Gap Penalties: Existence: 0, Extension: 2.5 64 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548/go.mod h1:e6NPNENfs9mPDVNRekM7lKScauxd5kXTr1Mfyig6TDM= 2 | github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8 h1:LpMLYGyy67BoAFGda1NeOBQwqlv7nUXpm+rIVHGxZZ4= 3 | github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8/go.mod h1:q2w6Bg5jeox1B+QkJ6Wp/+Vn0G/bo3f1uY7Fn3vivIQ= 4 | github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M= 5 | github.com/klauspost/compress v1.11.4 h1:kz40R/YWls3iqT9zX9AHN3WoVsrAWVyui5sxuLqiXqU= 6 | github.com/klauspost/compress v1.11.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= 7 | github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE= 8 | github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= 9 | github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= 10 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 11 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 12 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 13 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 14 | github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= 15 | github.com/shenwei356/bio v0.1.0 h1:VDnI28zcdybywdn6/tcZvplAJ1IxOAAYrTJhhTB1SLQ= 16 | github.com/shenwei356/bio v0.1.0/go.mod h1:NgFauYHlpmjCYEf2XP8foITht6ej6poggQkILpjraN4= 17 | github.com/shenwei356/bpool v0.0.0-20160710042833-f9e0ee4d0403 h1:/3JklLnHXiWUBxWc3joQYavDQJpncRhRA909cUb7eOw= 18 | github.com/shenwei356/bpool v0.0.0-20160710042833-f9e0ee4d0403/go.mod h1:YkgdTWfNnJgv5HVJbVSDmxQtkK3/jZWDoqcG26BVU8k= 19 | github.com/shenwei356/breader v0.1.0/go.mod h1:YXIrHIPtbJCP6Kv27qGp+cXQl7hyzD0iQrEVYCy/gqw= 20 | github.com/shenwei356/util v0.3.0 h1:gTVa3sGwcyGEHgNpXTzdL3MaaJN/bGAypVKSCnT4QfU= 21 | github.com/shenwei356/util v0.3.0/go.mod h1:n3qhc3bQzlqJ2/5v79hgl0Gd3WzJOkI8XcUix25Brdg= 22 | github.com/shenwei356/xopen v0.0.0-20181203091311-f4f16ddd3992 h1:RXEEyKj0JL3SrRIYsWIEyy4AwjHbI3I8aDGK6CA4+YI= 23 | github.com/shenwei356/xopen v0.0.0-20181203091311-f4f16ddd3992/go.mod h1:6EQUa6I7Zsl2GQKqcL9qGLrTzVE+oZyly+uhzovQYSk= 24 | github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553/go.mod h1:Rj7Csq/tZ/egz+Ltc2IVpsA5309AmSMEswjkTZmq2Xc= 25 | github.com/will-rowe/nthash v0.3.0 h1:yN+Il98GRWyp7HdaiEbsE7KC4ySEKtPatm+SLZ5uQBk= 26 | github.com/will-rowe/nthash v0.3.0/go.mod h1:5ezweuK0J5j+/7lih/RkrSmnxI3hoaPpQiVWJ7rd960= 27 | github.com/zeebo/xxh3 v0.10.0 h1:1+2Mov9zfxTNUeoDG9k9i13VfxTR0p1JQu8L0vikxB0= 28 | github.com/zeebo/xxh3 v0.10.0/go.mod h1:AQY73TOrhF3jNsdiM9zZOb8MThrYbZONHj7ryDBaLpg= 29 | golang.org/x/sys v0.0.0-20200727154430-2d971f7391a4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 30 | golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005 h1:pDMpM2zh2MT0kHy037cKlSby2nEhD50SYqwQk76Nm40= 31 | golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 32 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 33 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Strobemers in Go 2 | 3 | [![GoDoc](https://godoc.org/github.com/shenwei356/strobemers?status.svg)](https://godoc.org/github.com/shenwei356/strobemers) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/strobemers)](https://goreportcard.com/report/github.com/shenwei356/strobemers) 5 | 6 | ## Introduction 7 | 8 | This is a Go implementation of the [strobemers](https://github.com/ksahlin/strobemers) (minstrobes and randstrobes), 9 | with some [differences](#differences). 10 | 11 | The implementation of `Randstrobes` has a not-bad performance (2-3X slower) compared to regular k-mer, 12 | while it's 10-20X slower than [ntHash](https://github.com/will-rowe/nthash/). 13 | Besides, `Randstrobes` is only slightly slower than `MinStrobes` (see [benchmark](#benchmark)). 14 | 15 | ### Attention 16 | 17 | The current implementation only computes strobemers of the positive strand, 18 | because the strobes are asymmetrical and the location matters. 19 | 20 | ## Installation 21 | 22 | go get github.com/shenwei356/strobemers 23 | 24 | ## Quick Start 25 | 26 | We followed the code style of [ntHash](https://github.com/will-rowe/nthash/). 27 | 28 | ```go 29 | n := 2 30 | l := 3 31 | w_min := 3 32 | w_max := 5 33 | rs, err := strobemers.NewRandStrobes(seq, n, l, w_min, w_max) 34 | checkError(err) 35 | 36 | var hash uint64 37 | var ok bool 38 | var i int // 0-based index 39 | var positions []int // 0-based indexes of all strobes 40 | 41 | rs.SetWindowShrink(true) 42 | for { 43 | hash, ok = rs.Next() 44 | if !ok { 45 | break 46 | } 47 | 48 | i = rs.Index() 49 | positions = rs.Indexes() 50 | } 51 | 52 | ``` 53 | 54 | ## Differences 55 | 56 | Here are some differences compared to the original implementation, 57 | see discussion: [#1](https://github.com/ksahlin/strobemers/issues/1), [#2](https://github.com/ksahlin/strobemers/issues/2). 58 | 59 | item |orginal |this |comment 60 | :---------------------|:----------------------|:---------------------------------|:----------------------------------------- 61 | window range |`w_min < w_max` |`w_min <= w_max` |allow a fixed position 62 | shrinking window |all `w_min` and `w_max`|optional shrinking last `w_max` |see figures below 63 | number of strobemers |`len(seq)-n*l+1` |`len(seq)-n*l+1-(n-1)*l` |window shrinked 64 | number of strobemers | |`len(seq)-n*l+1-(n-1)*(l+w_min-1)`|window not shrinked 65 | choice of min hash |`(h(m)+h(mj))%q` |`(h(m)+h(mj))&q` |`&` is faster than `%` 66 | final hash value (n=2)|`h(m1)-h(m2)` |`h(m1)/2+h(m2)/3` |keep asymmetry and avoid `uint64` overflow 67 | final hash value (n=3)|`h(m1)-h(m2)+2*h(m3)` |`h(m1)/3+h(m2)/4+h(m3)/5` |~ 68 | 69 | 70 | 71 | 72 | 73 | ## Benchmark 74 | 75 | method |time |relative_time 76 | :----------------------|:-----|:------------ 77 | ntHashKmers(30) |8590 |1 78 | Kmers(30) |55579 |6 79 | MinStrobes(2,15,20,30) |104520|12 80 | MinStrobes(3,10,20,30) |111662|13 81 | RandStrobes(2,15,20,30)|93436 |11 82 | RandStrobes(3,10,20,30)|152461|18 83 | 84 | $ go test . -bench=Benchmark* -benchmem \ 85 | | grep Bench \ 86 | | perl -pe 's/\s\s+/\t/g' \ 87 | | csvtk cut -Ht -f 1,3-5 \ 88 | | csvtk add-header -t -n test,time,memory,allocs \ 89 | | csvtk pretty -t -r 90 | 91 | test time memory allocs 92 | ------------------------------------- ------------ ---------- ----------- 93 | BenchmarkNTHash/1.00_KB-16 8590 ns/op 48 B/op 1 allocs/op 94 | BenchmarkKmers/1.00_KB-16 55579 ns/op 32 B/op 1 allocs/op 95 | BenchmarkMinStrobesOrder2/1.00_KB-16 104520 ns/op 25064 B/op 7 allocs/op 96 | BenchmarkMinStrobesOrder3/1.00_KB-16 111662 ns/op 25064 B/op 7 allocs/op 97 | BenchmarkRandStrobesOrder2/1.00_KB-16 93436 ns/op 8432 B/op 3 allocs/op 98 | BenchmarkRandStrobesOrder3/1.00_KB-16 152461 ns/op 8432 B/op 3 allocs/op 99 | 100 | 101 | ## Similar Projects 102 | 103 | - [strobemer_cpptest](https://github.com/BGI-Qingdao/strobemer_cpptest) 104 | 105 | ## References 106 | 107 | - [ntHash](http://dx.doi.org/10.1093/bioinformatics/btw397) 108 | - [strobemers](https://doi.org/10.1101/2021.01.28.428549) 109 | -------------------------------------------------------------------------------- /common.go: -------------------------------------------------------------------------------- 1 | package strobemers 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | 7 | "github.com/will-rowe/nthash" 8 | ) 9 | 10 | // defaultPrimeNumber is the prime number in minimizing h(m)+h(mj) mod q. 11 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1 12 | var defaultPrimeNumber uint64 = (1 << 20) - 1 13 | 14 | // ------------------------------------------------------------------------ 15 | // errors 16 | 17 | // ErrOrderNotSupported means a big strobemer order is not supported. 18 | var ErrOrderNotSupported = fmt.Errorf("strobemers: strobemer order not supported") 19 | 20 | // ErrInvalidOrder means 21 | var ErrInvalidOrder = fmt.Errorf("strobemers: strobemer order too small") 22 | 23 | // ErrInvalidSequence means the given sequence is invalid 24 | var ErrInvalidSequence = fmt.Errorf("strobemers: invalid DNA sequence") 25 | 26 | // ErrSequenceTooShort means the sequence is too short 27 | var ErrSequenceTooShort = fmt.Errorf("strobemers: sequence too short") 28 | 29 | // ErrStrobeLengthTooSmall means the strobe length is too small 30 | var ErrStrobeLengthTooSmall = fmt.Errorf("strobemers: strobe length too small") 31 | 32 | // ErrInvalidWindowOffsets means invalid window offsets 33 | var ErrInvalidWindowOffsets = fmt.Errorf("strobemers: window offset should be > 0, and wMin <= wMax") 34 | 35 | // ErrIncompleteHashValues means incomplete hash values 36 | var ErrIncompleteHashValues = fmt.Errorf("strobemers: incomplete hash values") 37 | 38 | var ErrPrimeNumberTooSmall = fmt.Errorf("strobemers: the primer number is too small") 39 | 40 | // ------------------------------------------------------------------------ 41 | 42 | func computeHashes(sequence *[]byte, k int) ([]uint64, error) { 43 | hasher, err := nthash.NewHasher(sequence, uint(k)) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | hashes := make([]uint64, len(*sequence)-k+1) 49 | var hash uint64 50 | var ok bool 51 | var i int 52 | for { 53 | hash, ok = hasher.Next(true) 54 | if !ok { 55 | break 56 | } 57 | hashes[i] = hash 58 | i++ 59 | } 60 | 61 | if i != len(*sequence)-k+1 { 62 | return nil, ErrIncompleteHashValues 63 | } 64 | 65 | return hashes, nil 66 | } 67 | 68 | func computeMinHashes(hashes []uint64, w int) ([]int, []uint64) { 69 | locs := make([]int, len(hashes)) 70 | if w == 1 { 71 | for i := range hashes { 72 | locs[i] = i 73 | } 74 | return locs, hashes 75 | } 76 | 77 | minHashes := make([]uint64, len(hashes)) 78 | 79 | var hash uint64 80 | var i, idxMw, b, e, t int 81 | var i2v IdxValue 82 | var flag bool 83 | 84 | buf := make([]IdxValue, 0, w) 85 | end := len(hashes) 86 | r := w - 1 // last position in the buffer 87 | 88 | for idx := 0; idx < end; idx++ { // idx is end position of a window 89 | hash = hashes[idx] 90 | 91 | if idx < r { // front of w 92 | buf = append(buf, IdxValue{Idx: idx, Val: hash}) // add current hash to buf 93 | continue 94 | } 95 | 96 | if idx == r { // position w 97 | buf = append(buf, IdxValue{Idx: idx, Val: hash}) // add current hash to buf 98 | sort.Sort(idxValues(buf)) 99 | 100 | i2v = buf[0] // the smallest one 101 | locs[idx] = i2v.Idx 102 | minHashes[idx] = i2v.Val 103 | continue 104 | } 105 | 106 | // find min k-mer 107 | 108 | // remove k-mer not in this window. 109 | // have to check position/index one by one 110 | idxMw = idx - w 111 | for i, i2v = range buf { 112 | if i2v.Idx == idxMw { 113 | if i < r { // not the last element 114 | copy(buf[i:r], buf[i+1:]) 115 | } // happen to be at the end 116 | buf = buf[:r] 117 | break 118 | } 119 | } 120 | 121 | // add new k-mer 122 | flag = false 123 | // using binary search, faster han linear search 124 | b, e = 0, r-1 125 | for { 126 | t = b + (e-b)/2 127 | if hash < buf[t].Val { 128 | e = t - 1 // end search here 129 | if e <= b { 130 | flag = true 131 | i = b 132 | break 133 | } 134 | } else { 135 | b = t + 1 // start here 136 | if b >= r { 137 | flag = false 138 | break 139 | } 140 | if b >= e { 141 | flag = true 142 | i = e // right here 143 | break 144 | } 145 | } 146 | } 147 | if !flag { // it's the biggest one, append to the end 148 | buf = append(buf, IdxValue{idx, hash}) 149 | } else { 150 | if hash >= buf[i].Val { // have to check again 151 | i++ 152 | } 153 | buf = append(buf, blankI2V) // append one element 154 | copy(buf[i+1:], buf[i:r]) // move right 155 | buf[i] = IdxValue{idx, hash} 156 | } 157 | 158 | i2v = buf[0] // the smallest one 159 | locs[idx] = i2v.Idx 160 | minHashes[idx] = i2v.Val 161 | } 162 | 163 | return locs, minHashes 164 | } 165 | 166 | type IdxValue struct { 167 | Idx int // index 168 | Val uint64 // hash 169 | } 170 | 171 | var blankI2V = IdxValue{0, 0} 172 | 173 | type idxValues []IdxValue 174 | 175 | func (l idxValues) Len() int { return len(l) } 176 | func (l idxValues) Less(i int, j int) bool { return l[i].Val < l[j].Val } 177 | func (l idxValues) Swap(i int, j int) { l[i], l[j] = l[j], l[i] } 178 | -------------------------------------------------------------------------------- /strobemers_test.go: -------------------------------------------------------------------------------- 1 | //Package strobemers is a Go implementation of the https://github.com/ksahlin/strobemers. 2 | 3 | package strobemers 4 | 5 | import ( 6 | "math/rand" 7 | "testing" 8 | 9 | "github.com/shenwei356/util/bytesize" 10 | "github.com/will-rowe/nthash" 11 | "github.com/zeebo/xxh3" 12 | ) 13 | 14 | var debug = true 15 | 16 | var seqs [][]byte 17 | 18 | var bit2base = [4]byte{'A', 'C', 'G', 'T'} 19 | 20 | func init() { 21 | rand.Seed(11) 22 | 23 | sizes := []int{1 << 10} //, 1 << 20} //, 10 << 20} 24 | seqs = make([][]byte, len(sizes)) 25 | for i, size := range sizes { 26 | sequence := make([]byte, size) 27 | for j := 0; j < size; j++ { 28 | sequence[j] = bit2base[rand.Intn(4)] 29 | } 30 | seqs[i] = sequence 31 | } 32 | } 33 | 34 | var _hash uint64 35 | var _k int = 30 36 | var _n2 int = 2 37 | var _l2 int = 15 38 | var _n3 int = 3 39 | var _l3 int = 10 40 | var _w_min int = 20 41 | var _w_max int = 30 42 | 43 | func BenchmarkNTHash(b *testing.B) { 44 | for i := range seqs { 45 | size := len(seqs[i]) 46 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) { 47 | for j := 0; j < b.N; j++ { 48 | var hash uint64 49 | var ok bool 50 | var hasher *nthash.NTHi 51 | var err error 52 | 53 | hasher, err = nthash.NewHasher(&seqs[i], uint(_k)) 54 | if err != nil { 55 | b.Errorf("fail to create ntHasher iterator. seq length: %d", size) 56 | } 57 | 58 | for { 59 | hash, ok = hasher.Next(true) 60 | if !ok { 61 | break 62 | } 63 | 64 | _hash = hash 65 | } 66 | } 67 | }) 68 | } 69 | } 70 | 71 | func BenchmarkKmers(b *testing.B) { 72 | for i := range seqs { 73 | size := len(seqs[i]) 74 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) { 75 | for j := 0; j < b.N; j++ { 76 | var hash, hashrc uint64 77 | var end int 78 | var seq []byte 79 | var rc []byte 80 | var _i, _j int 81 | 82 | rc = make([]byte, _k) 83 | seq = seqs[i] 84 | end = len(seq) - _k + 1 85 | 86 | for i := 0; i < end; i++ { 87 | hash = xxh3.Hash(seq[i : i+_k]) 88 | 89 | // complementary sequence 90 | for _i = 0; _i < _k; _i++ { 91 | rc[_i] = cbases[seq[i+_i]] 92 | } 93 | // reverse 94 | for _i, _j = 0, _k-1; _i < _j; _i, _j = _i+1, _j-1 { 95 | rc[_i], rc[_j] = rc[_j], rc[_i] 96 | } 97 | hashrc = xxh3.Hash(rc) 98 | 99 | // canonical kmer 100 | if hash < hashrc { 101 | _hash = hash 102 | } else { 103 | _hash = hashrc 104 | } 105 | } 106 | } 107 | }) 108 | } 109 | } 110 | 111 | func BenchmarkMinStrobesOrder2(b *testing.B) { 112 | for i := range seqs { 113 | size := len(seqs[i]) 114 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) { 115 | for j := 0; j < b.N; j++ { 116 | var hash uint64 117 | var ok bool 118 | var rs *MinStrobes 119 | var err error 120 | 121 | rs, err = NewMinStrobes(&seqs[i], _n2, _l2, _w_min, _w_max) 122 | if err != nil { 123 | b.Errorf("fail to create MinStrobes. seq length: %d", size) 124 | } 125 | 126 | for { 127 | hash, ok = rs.Next() 128 | if !ok { 129 | break 130 | } 131 | 132 | _hash = hash 133 | } 134 | } 135 | }) 136 | } 137 | } 138 | 139 | func BenchmarkMinStrobesOrder3(b *testing.B) { 140 | for i := range seqs { 141 | size := len(seqs[i]) 142 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) { 143 | for j := 0; j < b.N; j++ { 144 | var hash uint64 145 | var ok bool 146 | var rs *MinStrobes 147 | var err error 148 | 149 | rs, err = NewMinStrobes(&seqs[i], _n3, _l3, _w_min, _w_max) 150 | if err != nil { 151 | b.Errorf("fail to create MinStrobes. seq length: %d", size) 152 | } 153 | 154 | for { 155 | hash, ok = rs.Next() 156 | if !ok { 157 | break 158 | } 159 | 160 | _hash = hash 161 | } 162 | } 163 | }) 164 | } 165 | } 166 | 167 | func BenchmarkRandStrobesOrder2(b *testing.B) { 168 | for i := range seqs { 169 | size := len(seqs[i]) 170 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) { 171 | for j := 0; j < b.N; j++ { 172 | var hash uint64 173 | var ok bool 174 | var rs *RandStrobes 175 | var err error 176 | 177 | rs, err = NewRandStrobes(&seqs[i], _n2, _l2, _w_min, _w_max) 178 | if err != nil { 179 | b.Errorf("fail to create RandStrobes. seq length: %d", size) 180 | } 181 | 182 | for { 183 | hash, ok = rs.Next() 184 | if !ok { 185 | break 186 | } 187 | 188 | _hash = hash 189 | } 190 | } 191 | }) 192 | } 193 | } 194 | 195 | func BenchmarkRandStrobesOrder3(b *testing.B) { 196 | for i := range seqs { 197 | size := len(seqs[i]) 198 | b.Run(bytesize.ByteSize(size).String(), func(b *testing.B) { 199 | for j := 0; j < b.N; j++ { 200 | var hash uint64 201 | var ok bool 202 | var rs *RandStrobes 203 | var err error 204 | 205 | rs, err = NewRandStrobes(&seqs[i], _n3, _l3, _w_min, _w_max) 206 | if err != nil { 207 | b.Errorf("fail to create RandStrobes. seq length: %d", size) 208 | } 209 | 210 | for { 211 | hash, ok = rs.Next() 212 | if !ok { 213 | break 214 | } 215 | 216 | _hash = hash 217 | } 218 | } 219 | }) 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /randstrobes.go: -------------------------------------------------------------------------------- 1 | package strobemers 2 | 3 | import "math" 4 | 5 | // RandStrobes is a iterator for randstrobes 6 | type RandStrobes struct { 7 | seq *[]byte // DNA sequence 8 | 9 | n int // strobemer order 10 | l int // strobes length 11 | wMin int // minimum window offset 12 | wMax int // maximum window offset 13 | 14 | idx, idx2, idx3 int // indexes of m1, m2, m3 15 | hash1, hash2, hash3 uint64 // hash value of m1, m2, m3 16 | 17 | hashes []uint64 // precomputed ntHash values of l-mers 18 | 19 | endHash int // position of the last l-mer 20 | endIdx int // position of the last m1 21 | 22 | wStart, wEnd, w2Start, w2End int // window start and end 23 | 24 | prime uint64 25 | 26 | // shrink the last searching window for positions near the end of sequence. 27 | shrinkWindow bool 28 | 29 | // tmp variable 30 | i int 31 | hash uint64 32 | } 33 | 34 | // NewRandStrobes creates a RandStrobes iterator. 35 | // Parameters: 36 | // n - strobemer order 37 | // l - strobes length 38 | // wMin - minimum window offset, wMin > 0 39 | // wMax - maximum window offset, wMin <= wMax. 40 | func NewRandStrobes(seq *[]byte, n int, l int, wMin int, wMax int) (*RandStrobes, error) { 41 | if seq == nil || len(*seq) == 0 { 42 | return nil, ErrInvalidSequence 43 | } 44 | if n < 2 { 45 | return nil, ErrInvalidOrder 46 | } 47 | if n > 3 { 48 | return nil, ErrOrderNotSupported 49 | } 50 | if len(*seq) < (n-1)*(wMax+1) { 51 | return nil, ErrSequenceTooShort 52 | } 53 | if l < 1 { 54 | return nil, ErrStrobeLengthTooSmall 55 | } 56 | if !(wMin > 0 && wMax > 0 && wMin <= wMax) { 57 | return nil, ErrInvalidWindowOffsets 58 | } 59 | 60 | rs := &RandStrobes{ 61 | seq: seq, 62 | n: n, 63 | l: l, 64 | wMin: wMin, 65 | wMax: wMax, 66 | 67 | endHash: len(*seq) - l, // position of the last l-mer 68 | endIdx: len(*seq) - l - (n-1)*l, // position of the last m1 69 | 70 | shrinkWindow: true, 71 | 72 | prime: defaultPrimeNumber, 73 | } 74 | 75 | var err error 76 | rs.hashes, err = computeHashes(seq, l) 77 | 78 | return rs, err 79 | } 80 | 81 | // SetPrime sets the prime number (q) in minimizing h(m)+h(mj) mod q. 82 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1. 83 | // The value should not be too small, at least 256. 84 | func (rs *RandStrobes) SetPrime(q uint64) { 85 | if q < 256 { 86 | q = 256 87 | } 88 | rs.prime = roundup64(q) - 1 89 | } 90 | 91 | // SetWindowShrink decides whether shrink the search window at positions 92 | // near the end of the sequence. Default is true. 93 | func (rs *RandStrobes) SetWindowShrink(shrink bool) { 94 | rs.shrinkWindow = shrink 95 | } 96 | 97 | // Index returns the current index (0-based) of strobemers 98 | func (rs *RandStrobes) Index() int { 99 | return rs.idx - 1 100 | } 101 | 102 | // Indexes returns current indexes (0-based) of strobes 103 | func (rs *RandStrobes) Indexes() []int { 104 | return []int{rs.idx - 1, rs.idx2, rs.idx3} 105 | } 106 | 107 | // Next returns the next hash value of randstrobe 108 | func (rs *RandStrobes) Next() (uint64, bool) { 109 | switch rs.n { 110 | case 2: 111 | return rs.nextOrder2() 112 | case 3: 113 | return rs.nextOrder3() 114 | default: 115 | } 116 | 117 | return 0, false 118 | } 119 | 120 | func (rs *RandStrobes) nextOrder2() (uint64, bool) { 121 | if rs.idx > rs.endIdx { 122 | return 0, false 123 | } 124 | 125 | rs.wStart = rs.idx + rs.wMin 126 | rs.wEnd = rs.idx + rs.wMax 127 | 128 | // for positions near the end of the sequence, shrink the window size from the right 129 | if rs.wEnd > rs.endHash { 130 | if !rs.shrinkWindow { 131 | return 0, false 132 | } 133 | rs.wEnd = rs.endHash 134 | } 135 | 136 | // fmt.Printf("i:%d, window (%d-%d)\n", rs.idx, rs.wStart, rs.wEnd) 137 | 138 | rs.hash1 = rs.hashes[rs.idx] 139 | rs.hash2 = math.MaxUint64 140 | for rs.i = rs.wStart; rs.i <= rs.wEnd; rs.i++ { 141 | rs.hash = (rs.hash1 + rs.hashes[rs.i]) & rs.prime 142 | if rs.hash < rs.hash2 { 143 | rs.idx2 = rs.i 144 | rs.hash2 = rs.hash 145 | } 146 | } 147 | rs.hash2 = rs.hash1/2 + rs.hashes[rs.idx2]/3 148 | 149 | rs.idx++ 150 | return rs.hash2, true 151 | } 152 | 153 | func (rs *RandStrobes) nextOrder3() (uint64, bool) { 154 | if rs.idx > rs.endIdx { 155 | return 0, false 156 | } 157 | 158 | rs.w2Start = rs.idx + rs.wMax + rs.wMin 159 | rs.w2End = rs.idx + rs.wMax<<1 160 | if rs.w2Start > rs.endHash { 161 | return 0, false 162 | } 163 | // for positions near the end of the sequence, shrink the last window size from the right 164 | if rs.w2End > rs.endHash { 165 | if !rs.shrinkWindow { 166 | return 0, false 167 | } 168 | rs.w2End = rs.endHash 169 | } 170 | 171 | rs.wStart = rs.idx + rs.wMin 172 | rs.wEnd = rs.idx + rs.wMax 173 | 174 | // fmt.Printf("i:%d, window (%d-%d)\n", rs.idx, rs.wStart, rs.wEnd) 175 | // fmt.Printf("i:%d, window2 (%d-%d)\n", rs.idx, rs.w2Start, rs.w2End) 176 | 177 | rs.hash1 = rs.hashes[rs.idx] 178 | rs.hash2 = math.MaxUint64 179 | for rs.i = rs.wStart; rs.i <= rs.wEnd; rs.i++ { 180 | rs.hash = (rs.hash1 + rs.hashes[rs.i]) & rs.prime 181 | if rs.hash < rs.hash2 { 182 | rs.idx2 = rs.i 183 | rs.hash2 = rs.hash 184 | } 185 | } 186 | rs.hash2 = rs.hash1/3 + rs.hashes[rs.idx2]/4 187 | 188 | rs.hash3 = math.MaxUint64 189 | for rs.i = rs.w2Start; rs.i <= rs.w2End; rs.i++ { 190 | rs.hash = (rs.hash2 + rs.hashes[rs.i]) & rs.prime 191 | if rs.hash < rs.hash3 { 192 | rs.idx3 = rs.i 193 | rs.hash3 = rs.hash 194 | } 195 | } 196 | rs.hash3 = rs.hash2 + rs.hashes[rs.idx3]/5 197 | 198 | rs.idx++ 199 | return rs.hash3, true 200 | } 201 | -------------------------------------------------------------------------------- /minstrobes.go: -------------------------------------------------------------------------------- 1 | package strobemers 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // MinStrobes is a iterator for MinStrobes 8 | type MinStrobes struct { 9 | seq *[]byte // DNA sequence 10 | 11 | n int // strobemer order 12 | l int // strobes length 13 | wMin int // minimum window offset 14 | wMax int // maximum window offset 15 | 16 | idx, idx2, idx3 int // indexes of m1, m2, m3 17 | hash1, hash2, hash3 uint64 // hash value of m1, m2, m3 18 | 19 | hashes []uint64 // precomputed ntHash values of l-mers 20 | 21 | minlocs []int // locations of min hash 22 | minhashes []uint64 // minhashes of window [i-w,i] 23 | 24 | endHash int // position of the last l-mer 25 | endIdx int // position of the last m1 26 | 27 | wStart, wEnd, w2Start, w2End int // window start and end 28 | 29 | prime uint64 30 | 31 | // shrink the last searching window for positions near the end of sequence. 32 | shrinkWindow bool 33 | 34 | // tmp variable 35 | i int 36 | hash uint64 37 | } 38 | 39 | // NewMinStrobes creates a MinStrobes iterator. 40 | // Parametems: 41 | // n - strobemer order 42 | // l - strobes length 43 | // wMin - minimum window offset, wMin > 0 44 | // wMax - maximum window offset, wMin <= wMax. 45 | func NewMinStrobes(seq *[]byte, n int, l int, wMin int, wMax int) (*MinStrobes, error) { 46 | if seq == nil || len(*seq) == 0 { 47 | return nil, ErrInvalidSequence 48 | } 49 | if n < 2 { 50 | return nil, ErrInvalidOrder 51 | } 52 | if n > 3 { 53 | return nil, ErrOrderNotSupported 54 | } 55 | if len(*seq) < (n-1)*(wMax+1) { 56 | return nil, ErrSequenceTooShort 57 | } 58 | if l < 1 { 59 | return nil, ErrStrobeLengthTooSmall 60 | } 61 | if !(wMin > 0 && wMax > 0 && wMin <= wMax) { 62 | return nil, ErrInvalidWindowOffsets 63 | } 64 | 65 | ms := &MinStrobes{ 66 | seq: seq, 67 | n: n, 68 | l: l, 69 | wMin: wMin, 70 | wMax: wMax, 71 | 72 | endHash: len(*seq) - l, // position of the last l-mer 73 | endIdx: len(*seq) - l - (n-1)*l, // position of the last m1 74 | 75 | shrinkWindow: true, 76 | 77 | prime: defaultPrimeNumber, 78 | } 79 | 80 | var err error 81 | ms.hashes, err = computeHashes(seq, l) 82 | if err != nil { 83 | return nil, err 84 | } 85 | 86 | ms.minlocs, ms.minhashes = computeMinHashes(ms.hashes, wMax-wMin+1) 87 | 88 | return ms, err 89 | } 90 | 91 | // SetPrime sets the prime number (q) in minimizing h(m)+h(mj) mod q. 92 | // In this package, we use (h(m)+h(mj)) & q, where q = roundup(q) - 1. 93 | // The value should not be too small, at least 256. 94 | func (ms *MinStrobes) SetPrime(q uint64) { 95 | if q < 256 { 96 | q = 256 97 | } 98 | ms.prime = roundup64(q) - 1 99 | } 100 | 101 | // SetWindowShrink decides whether shrink the search window at positions 102 | // near the end of the sequence. Default is true. 103 | func (ms *MinStrobes) SetWindowShrink(shrink bool) { 104 | ms.shrinkWindow = shrink 105 | } 106 | 107 | // Index returns the current index (0-based) of strobemers 108 | func (ms *MinStrobes) Index() int { 109 | return ms.idx - 1 110 | } 111 | 112 | // Indexes returns current indexes (0-based) of strobes 113 | func (ms *MinStrobes) Indexes() []int { 114 | return []int{ms.idx - 1, ms.idx2, ms.idx3} 115 | } 116 | 117 | // Next returns the next hash value of randstrobe 118 | func (ms *MinStrobes) Next() (uint64, bool) { 119 | switch ms.n { 120 | case 2: 121 | return ms.nextOrder2() 122 | case 3: 123 | return ms.nextOrder3() 124 | default: 125 | } 126 | 127 | return 0, false 128 | } 129 | 130 | func (ms *MinStrobes) nextOrder2() (uint64, bool) { 131 | if ms.idx > ms.endIdx { 132 | return 0, false 133 | } 134 | 135 | ms.wStart = ms.idx + ms.wMin 136 | ms.wEnd = ms.idx + ms.wMax 137 | 138 | // for positions near the end of the sequence, shrink the window size from the right 139 | if ms.wEnd > ms.endHash { 140 | if !ms.shrinkWindow { 141 | return 0, false 142 | } 143 | ms.wEnd = ms.endHash 144 | 145 | // fmt.Printf("i:%d, window (%d-%d)\n", ms.idx, ms.wStart, ms.wEnd) 146 | 147 | ms.hash1 = ms.hashes[ms.idx] 148 | ms.hash2 = math.MaxUint64 149 | for ms.i = ms.wStart; ms.i <= ms.wEnd; ms.i++ { 150 | ms.hash = ms.hashes[ms.i] 151 | if ms.hash < ms.hash2 { 152 | ms.idx2 = ms.i 153 | ms.hash2 = ms.hash 154 | } 155 | } 156 | // For 1) asymmetry, 2) avoid value overflow 157 | ms.hash2 = ms.hash1/2 + ms.hashes[ms.idx2]/3 158 | } else { // use precomputed min hashes 159 | ms.hash1 = ms.hashes[ms.idx] 160 | ms.idx2 = ms.minlocs[ms.wEnd] 161 | ms.hash2 = ms.hash1/2 + ms.minhashes[ms.wEnd]/3 162 | } 163 | 164 | ms.idx++ 165 | return ms.hash2, true 166 | } 167 | 168 | func (ms *MinStrobes) nextOrder3() (uint64, bool) { 169 | if ms.idx > ms.endIdx { 170 | return 0, false 171 | } 172 | 173 | ms.w2Start = ms.idx + ms.wMax + ms.wMin 174 | ms.w2End = ms.idx + ms.wMax<<1 175 | if ms.w2Start > ms.endHash { 176 | return 0, false 177 | } 178 | 179 | ms.wStart = ms.idx + ms.wMin 180 | ms.wEnd = ms.idx + ms.wMax 181 | 182 | // use precomputed min hashes 183 | ms.hash1 = ms.hashes[ms.idx] 184 | ms.idx2 = ms.minlocs[ms.wEnd] 185 | ms.hash2 = ms.hash1/3 + ms.minhashes[ms.wEnd]/4 186 | 187 | // for positions near the end of the sequence, shrink the last window size from the right 188 | if ms.w2End > ms.endHash { 189 | if !ms.shrinkWindow { 190 | return 0, false 191 | } 192 | ms.w2End = ms.endHash 193 | 194 | ms.hash3 = math.MaxUint64 195 | for ms.i = ms.w2Start; ms.i <= ms.w2End; ms.i++ { 196 | ms.hash = (ms.hash2 + ms.hashes[ms.i]) & ms.prime 197 | if ms.hash < ms.hash3 { 198 | ms.idx3 = ms.i 199 | ms.hash3 = ms.hash 200 | } 201 | } 202 | ms.hash3 = ms.hash2 + ms.hashes[ms.idx3]/5 203 | } else { 204 | ms.idx3 = ms.minlocs[ms.w2End] 205 | ms.hash3 = ms.hash2 + ms.minhashes[ms.w2End]/5 206 | } 207 | 208 | // fmt.Printf("i:%d, window (%d-%d)\n", ms.idx, ms.wStart, ms.wEnd) 209 | // fmt.Printf("i:%d, window2 (%d-%d)\n", ms.idx, ms.w2Start, ms.w2End) 210 | 211 | ms.idx++ 212 | return ms.hash3, true 213 | } 214 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | ## Number of matched strobemers 4 | 5 | [A similar test](https://github.com/BGI-Qingdao/strobemer_cpptest#benchmark_sim-r-match-only) with approximate results. 6 | 7 | query: 150bp, snp: 1 (0.006) 8 | 9 | $ go run test1_matches.go q0-snp1.fasta r0.fasta | csvtk pretty -t 10 | query ref method nQuery nRef nCommon qCov 11 | ------- --- ------------------------------ ------ ------- ------- ----- 12 | q0-snp1 r0 Kmer(20) 131 1546586 111 84.73 * 13 | q0-snp1 r0 MinStrobes(2,10,12,12,shrink) 131 1548767 109 83.21 14 | q0-snp1 r0 MinStrobes(2,10,12,12) 129 1548765 109 84.50 15 | q0-snp1 r0 RankStrobes(2,10,12,12,shrink) 131 1548767 109 83.21 16 | q0-snp1 r0 RankStrobes(2,10,12,12) 129 1548765 109 84.50 17 | 18 | q0-snp1 r0 Kmer(21) 130 1547218 109 83.85 19 | q0-snp1 r0 MinStrobes(3,7,9,9,shrink) 126 1549315 108 85.71 * 20 | q0-snp1 r0 MinStrobes(3,7,9,9) 126 1549315 108 85.71 * 21 | q0-snp1 r0 RankStrobes(3,7,9,9,shrink) 126 1549315 108 85.71 * 22 | q0-snp1 r0 RankStrobes(3,7,9,9) 126 1549315 108 85.71 * 23 | 24 | q0-snp1 r0 Kmer(20) 131 1546586 111 84.73 25 | q0-snp1 r0 MinStrobes(2,10,12,16,shrink) 131 1548376 107 81.68 26 | q0-snp1 r0 MinStrobes(2,10,12,16) 125 1548370 107 85.60 * 27 | q0-snp1 r0 RankStrobes(2,10,12,16,shrink) 131 1548438 108 82.44 28 | q0-snp1 r0 RankStrobes(2,10,12,16) 125 1548432 108 86.40 * 29 | 30 | q0-snp1 r0 Kmer(21) 130 1547218 109 83.85 31 | q0-snp1 r0 MinStrobes(3,7,9,13,shrink) 122 1545403 102 83.61 32 | q0-snp1 r0 MinStrobes(3,7,9,13) 118 1545399 102 86.44 ** 33 | q0-snp1 r0 RankStrobes(3,7,9,13,shrink) 122 1545522 107 87.70 ** 34 | q0-snp1 r0 RankStrobes(3,7,9,13) 118 1545518 105 88.98 ** 35 | 36 | query: 150bp, snp: 3 (0.02) 37 | 38 | $ go run test1_matches.go q2-snp3.fasta r2.fasta | csvtk pretty -t 39 | query ref method nQuery nRef nCommon qCov 40 | ------- --- ------------------------------ ------ ------- ------- ----- 41 | q2-snp3 r2 Kmer(20) 131 1687558 84 64.12 * 42 | q2-snp3 r2 MinStrobes(2,10,12,12,shrink) 131 1687785 82 62.60 43 | q2-snp3 r2 MinStrobes(2,10,12,12) 129 1687783 82 63.57 44 | q2-snp3 r2 RankStrobes(2,10,12,12,shrink) 131 1687785 82 62.60 45 | q2-snp3 r2 RankStrobes(2,10,12,12) 129 1687783 82 63.57 46 | 47 | q2-snp3 r2 Kmer(21) 130 1687656 82 63.08 48 | q2-snp3 r2 MinStrobes(3,7,9,9,shrink) 126 1687865 84 66.67 * 49 | q2-snp3 r2 MinStrobes(3,7,9,9) 126 1687865 84 66.67 * 50 | q2-snp3 r2 RankStrobes(3,7,9,9,shrink) 126 1687865 84 66.67 * 51 | q2-snp3 r2 RankStrobes(3,7,9,9) 126 1687865 84 66.67 * 52 | 53 | q2-snp3 r2 Kmer(20) 131 1687558 84 64.12 * 54 | q2-snp3 r2 MinStrobes(2,10,12,16,shrink) 131 1687487 76 58.02 55 | q2-snp3 r2 MinStrobes(2,10,12,16) 125 1687481 76 60.80 56 | q2-snp3 r2 RankStrobes(2,10,12,16,shrink) 131 1687529 77 58.78 57 | q2-snp3 r2 RankStrobes(2,10,12,16) 125 1687523 76 60.80 58 | 59 | q2-snp3 r2 Kmer(21) 130 1687656 82 63.08 * 60 | q2-snp3 r2 MinStrobes(3,7,9,13,shrink) 122 1684611 74 60.66 61 | q2-snp3 r2 MinStrobes(3,7,9,13) 118 1684607 72 61.02 62 | q2-snp3 r2 RankStrobes(3,7,9,13,shrink) 122 1684661 71 58.20 63 | q2-snp3 r2 RankStrobes(3,7,9,13) 118 1684657 68 57.63 64 | 65 | query: 150bp, snp: 7 (0.47) 66 | 67 | $ go run test1_matches.go q1-snp7.rc.fasta r1.fasta | csvtk pretty -t 68 | query ref method nQuery nRef nCommon qCov 69 | ---------- --- ------------------------------ ------ ------- ------- ----- 70 | q1-snp7.rc r1 Kmer(20) 131 2802879 54 41.22 * 71 | q1-snp7.rc r1 MinStrobes(2,10,12,12,shrink) 131 2804781 52 39.69 72 | q1-snp7.rc r1 MinStrobes(2,10,12,12) 129 2804779 52 40.31 73 | q1-snp7.rc r1 RankStrobes(2,10,12,12,shrink) 131 2804781 52 39.69 74 | q1-snp7.rc r1 RankStrobes(2,10,12,12) 129 2804779 52 40.31 75 | 76 | q1-snp7.rc r1 Kmer(21) 130 2804365 51 39.23 * 77 | q1-snp7.rc r1 MinStrobes(3,7,9,9,shrink) 126 2806161 48 38.10 78 | q1-snp7.rc r1 MinStrobes(3,7,9,9) 126 2806161 48 38.10 79 | q1-snp7.rc r1 RankStrobes(3,7,9,9,shrink) 126 2806161 48 38.10 80 | q1-snp7.rc r1 RankStrobes(3,7,9,9) 126 2806161 48 38.10 81 | 82 | q1-snp7.rc r1 Kmer(20) 131 2802879 54 41.22 * 83 | q1-snp7.rc r1 MinStrobes(2,10,12,16,shrink) 131 2803507 51 38.93 84 | q1-snp7.rc r1 MinStrobes(2,10,12,16) 125 2803501 51 40.80 85 | q1-snp7.rc r1 RankStrobes(2,10,12,16,shrink) 131 2803659 47 35.88 86 | q1-snp7.rc r1 RankStrobes(2,10,12,16) 125 2803653 44 35.20 87 | 88 | q1-snp7.rc r1 Kmer(21) 130 2804365 51 39.23 * 89 | q1-snp7.rc r1 MinStrobes(3,7,9,13,shrink) 122 2797218 36 29.51 90 | q1-snp7.rc r1 MinStrobes(3,7,9,13) 118 2797214 36 30.51 91 | q1-snp7.rc r1 RankStrobes(3,7,9,13,shrink) 122 2797918 42 34.43 92 | q1-snp7.rc r1 RankStrobes(3,7,9,13) 118 2797914 41 34.75 93 | -------------------------------------------------------------------------------- /evaluation/test1_matches.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "path/filepath" 8 | "runtime" 9 | "strings" 10 | "sync" 11 | 12 | "github.com/shenwei356/bio/seqio/fastx" 13 | "github.com/shenwei356/strobemers" 14 | "github.com/will-rowe/nthash" 15 | ) 16 | 17 | func main() { 18 | args := os.Args 19 | if len(args) != 3 { 20 | checkError(fmt.Errorf("usage: %s query.fasta ref.fasta", os.Args[0])) 21 | } 22 | 23 | fileQuery, fileRef := args[1], args[2] 24 | 25 | q, _ := filepathTrimExtension(filepath.Base(fileQuery)) 26 | r, _ := filepathTrimExtension(filepath.Base(fileRef)) 27 | 28 | type Test struct { 29 | n int 30 | l int 31 | wMin int 32 | wMax int 33 | } 34 | 35 | tests := []Test{ 36 | {n: 2, l: 10, wMin: 12, wMax: 12}, 37 | {n: 3, l: 7, wMin: 9, wMax: 9}, 38 | 39 | {n: 2, l: 10, wMin: 12, wMax: 16}, 40 | {n: 3, l: 7, wMin: 9, wMax: 13}, 41 | } 42 | 43 | seqsQ := readSeqs(fileQuery) 44 | seqsR := readSeqs(fileRef) 45 | 46 | var kmersQ, kmersR map[uint64]interface{} 47 | var rstrobesSQ, rstrobesSR, rstrobesQ, rstrobesR map[uint64]interface{} 48 | var mstrobesSQ, mstrobesSR, mstrobesQ, mstrobesR map[uint64]interface{} 49 | var kmersInter, rstrobesSInter, rstrobesInter, mstrobesSInter, mstrobesInter int 50 | 51 | fmt.Printf("query\tref\tmethod\tnQuery\tnRef\tnCommon\tqCov\n") 52 | runtime.GOMAXPROCS(10) 53 | for _, t := range tests { 54 | var wg sync.WaitGroup 55 | wg.Add(10) 56 | 57 | // kmers 58 | go func() { 59 | kmersQ = list2map(computeKmers(seqsQ, t.n*t.l)) 60 | wg.Done() 61 | }() 62 | go func() { 63 | kmersR = list2map(computeKmers(seqsR, t.n*t.l)) 64 | wg.Done() 65 | }() 66 | 67 | // randstrobes 68 | go func() { 69 | rstrobesSQ = list2map(computeRandStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, true)) 70 | wg.Done() 71 | }() 72 | go func() { 73 | rstrobesSR = list2map(computeRandStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, true)) 74 | wg.Done() 75 | }() 76 | go func() { 77 | rstrobesQ = list2map(computeRandStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, false)) 78 | wg.Done() 79 | }() 80 | go func() { 81 | rstrobesR = list2map(computeRandStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, false)) 82 | wg.Done() 83 | }() 84 | 85 | // minstrobes 86 | go func() { 87 | mstrobesSQ = list2map(computeMinStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, true)) 88 | wg.Done() 89 | }() 90 | go func() { 91 | mstrobesSR = list2map(computeMinStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, true)) 92 | wg.Done() 93 | }() 94 | go func() { 95 | mstrobesQ = list2map(computeMinStrobes(seqsQ, t.n, t.l, t.wMin, t.wMax, false)) 96 | wg.Done() 97 | }() 98 | go func() { 99 | mstrobesR = list2map(computeMinStrobes(seqsR, t.n, t.l, t.wMin, t.wMax, false)) 100 | wg.Done() 101 | }() 102 | wg.Wait() 103 | 104 | // intersection 105 | 106 | var wg2 sync.WaitGroup 107 | wg2.Add(5) 108 | go func() { 109 | kmersInter = intersection(kmersQ, kmersR) 110 | wg2.Done() 111 | }() 112 | 113 | go func() { 114 | rstrobesSInter = intersection(rstrobesSQ, rstrobesSR) 115 | wg2.Done() 116 | }() 117 | go func() { 118 | rstrobesInter = intersection(rstrobesQ, rstrobesR) 119 | wg2.Done() 120 | }() 121 | 122 | go func() { 123 | mstrobesSInter = intersection(mstrobesSQ, mstrobesSR) 124 | wg2.Done() 125 | }() 126 | go func() { 127 | mstrobesInter = intersection(mstrobesQ, mstrobesR) 128 | wg2.Done() 129 | }() 130 | 131 | wg2.Wait() 132 | 133 | // kmers 134 | fmt.Printf("%s\t%s\tKmer(%d)\t%d\t%d\t%d\t%.2f\n", 135 | q, r, t.n*t.l, len(kmersQ), len(kmersR), 136 | kmersInter, float64(kmersInter)/float64(len(kmersQ))*100) 137 | 138 | // minstrobes 139 | fmt.Printf("%s\t%s\tMinStrobes(%d,%d,%d,%d,shrink)\t%d\t%d\t%d\t%.2f\n", 140 | q, r, t.n, t.l, t.wMin, t.wMax, len(mstrobesSQ), len(mstrobesSR), 141 | mstrobesSInter, float64(mstrobesSInter)/float64(len(mstrobesSQ))*100) 142 | fmt.Printf("%s\t%s\tMinStrobes(%d,%d,%d,%d)\t%d\t%d\t%d\t%.2f\n", 143 | q, r, t.n, t.l, t.wMin, t.wMax, len(mstrobesQ), len(mstrobesR), 144 | mstrobesInter, float64(mstrobesInter)/float64(len(mstrobesQ))*100) 145 | 146 | // randstrobes 147 | fmt.Printf("%s\t%s\tRankStrobes(%d,%d,%d,%d,shrink)\t%d\t%d\t%d\t%.2f\n", 148 | q, r, t.n, t.l, t.wMin, t.wMax, len(rstrobesSQ), len(rstrobesSR), 149 | rstrobesSInter, float64(rstrobesSInter)/float64(len(rstrobesSQ))*100) 150 | fmt.Printf("%s\t%s\tRankStrobes(%d,%d,%d,%d)\t%d\t%d\t%d\t%.2f\n", 151 | q, r, t.n, t.l, t.wMin, t.wMax, len(rstrobesQ), len(rstrobesR), 152 | rstrobesInter, float64(rstrobesInter)/float64(len(rstrobesQ))*100) 153 | 154 | fmt.Printf(" \t \t \t \t \t \t \n") 155 | } 156 | 157 | } 158 | 159 | func checkError(e error) { 160 | if e != nil { 161 | fmt.Fprintf(os.Stderr, "%s\n", e) 162 | os.Exit(0) 163 | } 164 | } 165 | 166 | func readSeqs(file string) [][]byte { 167 | reader, err := fastx.NewDefaultReader(file) 168 | checkError(err) 169 | 170 | sequences := make([][]byte, 0, 8) 171 | 172 | var record *fastx.Record 173 | for { 174 | record, err = reader.Read() 175 | if err != nil { 176 | if err == io.EOF { 177 | break 178 | } 179 | checkError(err) 180 | break 181 | } 182 | 183 | sequences = append(sequences, record.Seq.Seq) 184 | } 185 | 186 | return sequences 187 | } 188 | 189 | func computeKmers(sequences [][]byte, k int) []uint64 { 190 | hashes := make([]uint64, 0, 1024) 191 | 192 | var hash uint64 193 | var ok bool 194 | var hasher *nthash.NTHi 195 | var err error 196 | for _, _seq := range sequences { 197 | hasher, err = nthash.NewHasher(&_seq, uint(k)) 198 | checkError(err) 199 | 200 | for { 201 | hash, ok = hasher.Next(true) 202 | if !ok { 203 | break 204 | } 205 | 206 | hashes = append(hashes, hash) 207 | } 208 | } 209 | 210 | return hashes 211 | } 212 | 213 | func computeRandStrobes(sequences [][]byte, n int, l int, wMin int, wMax int, shrink bool) []uint64 { 214 | hashes := make([]uint64, 0, 1024) 215 | 216 | var hash uint64 217 | var ok bool 218 | var rs *strobemers.RandStrobes 219 | var err error 220 | 221 | for _, _seq := range sequences { 222 | rs, err = strobemers.NewRandStrobes(&_seq, n, l, wMin, wMax) 223 | checkError(err) 224 | 225 | rs.SetWindowShrink(shrink) 226 | for { 227 | hash, ok = rs.Next() 228 | if !ok { 229 | break 230 | } 231 | 232 | hashes = append(hashes, hash) 233 | } 234 | } 235 | 236 | return hashes 237 | } 238 | 239 | func computeMinStrobes(sequences [][]byte, n int, l int, wMin int, wMax int, shrink bool) []uint64 { 240 | hashes := make([]uint64, 0, 1024) 241 | 242 | var hash uint64 243 | var ok bool 244 | var rs *strobemers.MinStrobes 245 | var err error 246 | 247 | for _, _seq := range sequences { 248 | rs, err = strobemers.NewMinStrobes(&_seq, n, l, wMin, wMax) 249 | checkError(err) 250 | 251 | rs.SetWindowShrink(shrink) 252 | for { 253 | hash, ok = rs.Next() 254 | if !ok { 255 | break 256 | } 257 | 258 | hashes = append(hashes, hash) 259 | } 260 | } 261 | 262 | return hashes 263 | } 264 | 265 | func list2map(data []uint64) map[uint64]interface{} { 266 | m := make(map[uint64]interface{}, len(data)) 267 | for _, k := range data { 268 | m[k] = struct{}{} 269 | } 270 | return m 271 | } 272 | 273 | func intersection(m1, m2 map[uint64]interface{}) int { 274 | n := 0 275 | var ok bool 276 | for k := range m1 { 277 | if _, ok = m2[k]; ok { 278 | n++ 279 | } 280 | } 281 | return n 282 | } 283 | 284 | func filepathTrimExtension(file string) (string, string) { 285 | gz := strings.HasSuffix(file, ".gz") || strings.HasSuffix(file, ".GZ") 286 | if gz { 287 | file = file[0 : len(file)-3] 288 | } 289 | extension := filepath.Ext(file) 290 | name := file[0 : len(file)-len(extension)] 291 | if gz { 292 | extension += ".gz" 293 | } 294 | return name, extension 295 | } 296 | --------------------------------------------------------------------------------