├── .gitignore
├── CHANGELOG.md
├── INDEX_FORMAT_CHANGELOG.tsv
├── LICENSE
├── README.md
├── demo
├── README.md
├── ass2species.map
├── bench
│ ├── b.amr.fasta
│ ├── b.gene_E_coli_16S.fasta
│ ├── b.gene_E_faecalis_SecY.fasta
│ └── b.plasmid_pCUVET18-1784.4.fasta
├── files.txt
├── prefix.hist.png
├── q.gene.fasta
├── q.gene.fasta.lexicmap.tsv
├── q.long-reads.fasta.gz
├── q.prophage.fasta
├── q.prophage.fasta.lexicmap.tsv
├── refs
│ ├── GCF_000006945.2.fa.gz
│ ├── GCF_000017205.1.fa.gz
│ ├── GCF_000148585.2.fa.gz
│ ├── GCF_000392875.1.fa.gz
│ ├── GCF_000742135.1.fa.gz
│ ├── GCF_001027105.1.fa.gz
│ ├── GCF_001096185.1.fa.gz
│ ├── GCF_001457655.1.fa.gz
│ ├── GCF_001544255.1.fa.gz
│ ├── GCF_002949675.1.fa.gz
│ ├── GCF_002950215.1.fa.gz
│ ├── GCF_003697165.2.fa.gz
│ ├── GCF_006742205.1.fa.gz
│ ├── GCF_009759685.1.fa.gz
│ └── GCF_900638025.1.fa.gz
├── taxid.map
└── taxonomy.tsv
├── docs
├── archetypes
│ └── default.md
├── content
│ ├── _index.md
│ ├── faqs
│ │ └── _index.md
│ ├── installation
│ │ └── _index.md
│ ├── introduction
│ │ └── _index.md
│ ├── logo.svg
│ ├── notes
│ │ ├── _index.md
│ │ └── motivation.md
│ ├── performance@genbank.tsv
│ ├── performance@genbank.tsv.sh
│ ├── releases
│ │ └── _index.md
│ ├── tutorials
│ │ ├── _index.md
│ │ ├── index
│ │ │ ├── _index.md
│ │ │ ├── parameters-batches.tsv
│ │ │ ├── parameters-general.tsv
│ │ │ ├── parameters-masks.tsv
│ │ │ └── parameters-seeds.tsv
│ │ ├── misc
│ │ │ ├── _index.md
│ │ │ ├── index-allthebacteria.md
│ │ │ ├── index-genbank.md
│ │ │ ├── index-globdb.md
│ │ │ ├── index-gtdb.md
│ │ │ └── index-uhgg.md
│ │ ├── parameters-align.tsv
│ │ ├── parameters-general.tsv
│ │ ├── parameters-seeding.tsv
│ │ └── search.md
│ └── usage
│ │ ├── _index.md
│ │ ├── index
│ │ └── _index.md
│ │ ├── lexicmap.md
│ │ ├── search.md
│ │ └── utils
│ │ ├── 2blast.md
│ │ ├── _index.md
│ │ ├── genomes.md
│ │ ├── kmers.md
│ │ ├── masks.md
│ │ ├── reindex-seeds.md
│ │ ├── remerge.md
│ │ ├── seed-pos.md
│ │ └── subseq.md
├── data
│ └── menu
│ │ ├── extra.yaml
│ │ └── more.yaml
├── hugo.toml
└── static
│ ├── AllTheBacteria-v0.2.url.txt
│ ├── GCF_000017205.1.png
│ ├── GCF_000017205.1.seed_number.png
│ ├── GCF_000392875.1.png
│ ├── GCF_000392875.1.seed_number.png
│ ├── GCF_002949675.1.png
│ ├── GCF_002949675.1.seed_number.png
│ ├── custom.css
│ ├── favicon
│ ├── android-chrome-192x192.png
│ ├── android-chrome-512x512.png
│ ├── apple-touch-icon.png
│ ├── browserconfig.xml
│ ├── favicon-16x16.png
│ ├── favicon-32x32.png
│ ├── favicon-48x48.png
│ ├── favicon.ico
│ ├── favicon.svg
│ ├── mstile-144x144.png
│ ├── mstile-150x150.png
│ ├── mstile-310x150.png
│ ├── mstile-310x310.png
│ ├── mstile-70x70.png
│ ├── safari-pinned-tab.svg
│ └── site.webmanifest
│ ├── indexing.svg
│ ├── logo.svg
│ ├── overview.svg
│ ├── prefix.hist.png
│ └── searching.svg
├── go.mod
├── go.sum
├── lexicmap
├── .gitignore
├── build.sh
├── cmd
│ ├── 2blast.go
│ ├── autocomplete.go
│ ├── genome
│ │ ├── genome.go
│ │ └── genome_test.go
│ ├── genomes.go
│ ├── index.go
│ ├── kmers.go
│ ├── kv
│ │ ├── kv-data.go
│ │ ├── kv-data_test.go
│ │ ├── kv-encoding.go
│ │ ├── kv-reader.go
│ │ ├── kv-searcher.go
│ │ └── kv-searcher2.go
│ ├── lib-chaining.go
│ ├── lib-chaining2.go
│ ├── lib-chaining3.go
│ ├── lib-chaining_test.go
│ ├── lib-index-build.go
│ ├── lib-index-merge.go
│ ├── lib-index-search-util.go
│ ├── lib-index-search.go
│ ├── lib-seq_compare.go
│ ├── lib-seq_compare_test.go
│ ├── masks.go
│ ├── re-merge.go
│ ├── recount-bases.go
│ ├── reindex-seeds.go
│ ├── root.go
│ ├── search.go
│ ├── seed-pos.go
│ ├── seedposition
│ │ ├── seed_position.go
│ │ └── seed_position_test.go
│ ├── subseq.go
│ ├── tree
│ │ ├── tree.go
│ │ └── tree_test.go
│ ├── util-cli.go
│ ├── util-io.go
│ ├── util-logging.go
│ ├── util.go
│ ├── util
│ │ ├── kmers.go
│ │ ├── kmers_test.go
│ │ ├── util.go
│ │ ├── varint-GB.go
│ │ └── varint-GB_test.go
│ ├── utils.go
│ └── version.go
├── main.go
└── packaging.sh
├── logo.svg
└── overview.svg
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 |
15 | *.directory
16 | .Rhistory
17 |
18 | *ssshtest
19 | *.nextflow.log*
20 | *.brename_detail.txt
21 | */Rplots.pdf
22 | *.pprof
23 |
24 | docs/public
25 | docs/themes
26 | .hugo_build.lock
27 |
28 | demo/demo.lmi
29 | demo/demo.lmi-no-df
30 | demo/*.lexicmap.tsv.gz
31 | demo/seed_distance.tsv
32 | demo/seed_distance
33 | demo/seed_distance-no-df
34 | demo/seed-pos.tsv.gz
35 | demo/t.txt
36 | demo/kmers.tsv*
37 |
38 |
39 | lexicmap/binaries/*
40 | lexicmap/lexicmap*
41 | lexicmap/*.fasta
42 | lexicmap/indexes
43 |
--------------------------------------------------------------------------------
/INDEX_FORMAT_CHANGELOG.tsv:
--------------------------------------------------------------------------------
1 | Index version LexicMap version Supported LexicMap versions Date Changes
2 | 3.4 0.7.0 0.6.0 + 2025-04-11 Fix filling the seed desert region behind the last seed of a genome.
3 | 3.3 0.6.0 0.6.0 + 2025-03-25 Reduce index size for batches <= 512. Add the total bases of index to info.toml for computing the Evalue. Denser seeds.
4 | 3.1 0.5.0 0.4.0 + 2024-12-18 Change the default partitions of seed data index.
5 | 3.0 0.4.0 0.4.0 + 2024-08-15 Support suffix matching of seeds. Better seed desert filling for highly-repetitive regions. Denser seeds.
6 | 1.1 0.3.0 0.3.0 2024-05-14 Change the format of seed data index. Use longer contig intervals.
7 | 0.1 0.1.0 0.1.0 - 0.2.0 2024-01-25 First version.
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2023 - 2024 Wei Shen (shenwei356@gmail.com)
2 |
3 | The MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
--------------------------------------------------------------------------------
/demo/ass2species.map:
--------------------------------------------------------------------------------
1 | GCF_000742135.1 Klebsiella pneumoniae
2 | GCF_003697165.2 Escherichia coli
3 | GCF_002949675.1 Shigella dysenteriae
4 | GCF_002950215.1 Shigella flexneri
5 | GCF_000006945.2 Salmonella enterica
6 | GCF_001544255.1 Enterococcus faecium
7 | GCF_000392875.1 Enterococcus faecalis
8 | GCF_001457655.1 Haemophilus influenzae
9 | GCF_900638025.1 Haemophilus parainfluenzae
10 | GCF_001027105.1 Staphylococcus aureus
11 | GCF_006742205.1 Staphylococcus epidermidis
12 | GCF_001096185.1 Streptococcus pneumoniae
13 | GCF_000148585.2 Streptococcus mitis
14 | GCF_009759685.1 Acinetobacter baumannii
15 | GCF_000017205.1 Pseudomonas aeruginosa
16 |
--------------------------------------------------------------------------------
/demo/bench/b.gene_E_coli_16S.fasta:
--------------------------------------------------------------------------------
1 | >NC_000913.3:4166659-4168200 rrsB [organism=Escherichia coli str. K-12 substr. MG1655] [GeneID=948466] [chromosome=]
2 | AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGT
3 | AACAGGAAGAAGCTTGCTTCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGAT
4 | GGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTC
5 | GGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGAC
6 | GATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAG
7 | GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCT
8 | TCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCC
9 | GCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAA
10 | TTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAA
11 | CTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCG
12 | TAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGC
13 | GTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCC
14 | CTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAAC
15 | TCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACC
16 | TTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTG
17 | CTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCC
18 | TTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATG
19 | ACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCG
20 | ACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCAT
21 | GAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACC
22 | GCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACT
23 | TTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCT
24 | TA
25 |
--------------------------------------------------------------------------------
/demo/bench/b.gene_E_faecalis_SecY.fasta:
--------------------------------------------------------------------------------
1 | >lcl|NZ_CP064374.1_cds_WP_002359350.1_906 [gene=secY] [locus_tag=IUJ47_RS04625] [protein=preprotein translocase subunit SecY] [protein_id=WP_002359350.1] [location=960938..962236] [gbkey=CDS]
2 | TTGTTCAAGCTATTAAAGAACGCCTTTAAAGTCAAAGACATTAGATCAAAAATCTTATTTACAGTTTTAA
3 | TCTTGTTTGTATTTCGCCTAGGTGCGCACATTACTGTGCCCGGGGTGAATGCAAAGGGATTGTCTGATTT
4 | AAGTAGCTTACCCTTTTTGAATATGTTGAATATGGTGAGTGGTAGTGCCATGCAAAACTTCTCTATCTTC
5 | TCGATGGGGGTTTCGCCATACATTACAGCCTCTATTATTATTCAACTATTGCAAATGGATATTGTACCTA
6 | GATTTGTAGAATGGTCAAAACAAGGGGAAGTTGGGCGTAAGAAATTAAATCAAGCTACAAGATATCTAAC
7 | GATTGTCTTGGGTGTGGCTCAGTCAATGGGGATCACTGCTGGTTTTAATAGCTTAAGTCAAACTGGGATT
8 | GTAAACAATCCAACCTTAGGTACCTTTGTGATGATTGCAGTTATTTTAACTGCTGGGACGATGTTTGTGA
9 | CTTGGATGGGTGAACAAATTACAGAAAAAGGAATCGGAAATGGTGTTTCAATGATTATCTTTGCCGGGAT
10 | TATTTCTCGTTTGCCAGGAGCAGTCAAAGAAATCTATGAAGATTACTTCGTCAATATCGAGTCTTCTCGT
11 | ATTTGGCAATCTGTTATTTTCATTGCAATCTTAGTTATTGCTATTTTGGTGATTGTTACAGTCGTAACGT
12 | TCTTCCAACAAGCAGAGCGTAAGATTCCAATCCAATATACAAAACGTGTTTCTGGTGCACCAACAAGTAG
13 | TTATTTACCGTTAAAAGTAAATGCTGCTGGGGTTATTCCAGTTATCTTTGCCAGCTCGTTAATTGCAACA
14 | CCAAATGCCATTTTACAAGCTTTCTCATCAAAATTCGCTGGTGAAAATTGGTATGACATTATGACAAAAA
15 | TCTTCAGTTATAACACAGTTCCAGGGGCAATCATCTATACTGTCCTAATCGTTGCGTTTACGTTCTTCTA
16 | TGCATTTGTTCAAGTAAACCCTGAGAAATTAGCGGAAAACTTACAAAAACAAGGAAGCTACATTCCAAGC
17 | GTGCGACCAGGTAAAGGTACAGAAGAATATGTATCTGGCGTGTTAATGAGATTAAGTGTTGTCGGCTCAA
18 | TTTTCCTAGGACTTGTTGCTTTACTTCCAATCATTGCGCAAATGGTTTGGAACTTACCTCAATCAATCGG
19 | TTTAGGTGGAACAAGTTTACTAATCGTTATCGGGGTTGCATTAGAAACAACGAAACAATTAGAAGGATTA
20 | ATGATGAAACGTCAATATGTCGGCTTTATCAATAAGTAA
21 |
--------------------------------------------------------------------------------
/demo/files.txt:
--------------------------------------------------------------------------------
1 | refs/GCF_000006945.2.fa.gz
2 | refs/GCF_000017205.1.fa.gz
3 | refs/GCF_000148585.2.fa.gz
4 | refs/GCF_000392875.1.fa.gz
5 | refs/GCF_000742135.1.fa.gz
6 | refs/GCF_001027105.1.fa.gz
7 | refs/GCF_001096185.1.fa.gz
8 | refs/GCF_001457655.1.fa.gz
9 | refs/GCF_001544255.1.fa.gz
10 | refs/GCF_002949675.1.fa.gz
11 | refs/GCF_002950215.1.fa.gz
12 | refs/GCF_003697165.2.fa.gz
13 | refs/GCF_006742205.1.fa.gz
14 | refs/GCF_009759685.1.fa.gz
15 | refs/GCF_900638025.1.fa.gz
16 |
--------------------------------------------------------------------------------
/demo/prefix.hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/prefix.hist.png
--------------------------------------------------------------------------------
/demo/q.gene.fasta:
--------------------------------------------------------------------------------
1 | >NC_000913.3:4166659-4168200 rrsB [organism=Escherichia coli str. K-12 substr. MG1655] [GeneID=948466] [chromosome=]
2 | AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGT
3 | AACAGGAAGAAGCTTGCTTCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGAT
4 | GGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTC
5 | GGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGAC
6 | GATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAG
7 | GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCT
8 | TCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCC
9 | GCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAA
10 | TTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAA
11 | CTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCG
12 | TAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGC
13 | GTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCC
14 | CTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAAC
15 | TCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACC
16 | TTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTG
17 | CTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCC
18 | TTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATG
19 | ACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCG
20 | ACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCAT
21 | GAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACC
22 | GCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACT
23 | TTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCT
24 | TA
25 |
--------------------------------------------------------------------------------
/demo/q.long-reads.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/q.long-reads.fasta.gz
--------------------------------------------------------------------------------
/demo/q.prophage.fasta.lexicmap.tsv:
--------------------------------------------------------------------------------
1 | query qlen hits sgenome sseqid qcovGnm cls hsp qcovHSP alenHSP pident gaps qstart qend sstart send sstr slen evalue bitscore
2 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 1 1 27.890 9371 97.716 2 1 9369 1864411 1873781 + 4903501 0.00e+00 15953
3 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 1 2 0.301 101 98.020 0 10308 10408 1873846 1873946 + 4903501 1.72e-43 174
4 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 2 3 20.665 6942 96.528 4 17441 24382 1882011 1888948 + 4903501 0.00e+00 11459
5 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 3 4 17.685 5941 97.980 0 24355 30295 1853098 1859038 + 4903501 0.00e+00 10174
6 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 4 5 8.993 3021 91.526 0 10308 13328 1873846 1876866 + 4903501 0.00e+00 4295
7 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 5 6 2.438 820 84.390 1 14540 15358 1878798 1879617 + 4903501 1.29e-264 911
8 | NC_001895.1 33593 2 GCF_002949675.1 NZ_CP026774.1 0.976 1 1 0.976 331 85.801 3 13919 14246 3704319 3704649 - 4395762 6.35e-112 403
9 |
--------------------------------------------------------------------------------
/demo/refs/GCF_000006945.2.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000006945.2.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_000017205.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000017205.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_000148585.2.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000148585.2.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_000392875.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000392875.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_000742135.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000742135.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_001027105.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001027105.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_001096185.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001096185.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_001457655.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001457655.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_001544255.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001544255.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_002949675.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_002949675.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_002950215.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_002950215.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_003697165.2.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_003697165.2.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_006742205.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_006742205.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_009759685.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_009759685.1.fa.gz
--------------------------------------------------------------------------------
/demo/refs/GCF_900638025.1.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_900638025.1.fa.gz
--------------------------------------------------------------------------------
/demo/taxid.map:
--------------------------------------------------------------------------------
1 | GCF_000742135.1 573
2 | GCF_003697165.2 562
3 | GCF_002949675.1 622
4 | GCF_002950215.1 623
5 | GCF_000006945.2 28901
6 | GCF_001544255.1 1352
7 | GCF_000392875.1 1351
8 | GCF_001457655.1 727
9 | GCF_900638025.1 729
10 | GCF_001027105.1 1280
11 | GCF_006742205.1 1282
12 | GCF_001096185.1 1313
13 | GCF_000148585.2 28037
14 | GCF_009759685.1 470
15 | GCF_000017205.1 287
16 |
--------------------------------------------------------------------------------
/demo/taxonomy.tsv:
--------------------------------------------------------------------------------
1 | id superkingdom phylum class order family genus species
2 | GCF_000742135.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Klebsiella Klebsiella pneumoniae
3 | GCF_003697165.2 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia Escherichia coli
4 | GCF_002949675.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Shigella Shigella dysenteriae
5 | GCF_002950215.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Shigella Shigella flexneri
6 | GCF_000006945.2 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Salmonella Salmonella enterica
7 | GCF_001544255.1 Bacteria Bacillota Bacilli Lactobacillales Enterococcaceae Enterococcus Enterococcus faecium
8 | GCF_000392875.1 Bacteria Bacillota Bacilli Lactobacillales Enterococcaceae Enterococcus Enterococcus faecalis
9 | GCF_001457655.1 Bacteria Pseudomonadota Gammaproteobacteria Pasteurellales Pasteurellaceae Haemophilus Haemophilus influenzae
10 | GCF_900638025.1 Bacteria Pseudomonadota Gammaproteobacteria Pasteurellales Pasteurellaceae Haemophilus Haemophilus parainfluenzae
11 | GCF_001027105.1 Bacteria Bacillota Bacilli Bacillales Staphylococcaceae Staphylococcus Staphylococcus aureus
12 | GCF_006742205.1 Bacteria Bacillota Bacilli Bacillales Staphylococcaceae Staphylococcus Staphylococcus epidermidis
13 | GCF_001096185.1 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus Streptococcus pneumoniae
14 | GCF_000148585.2 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus Streptococcus mitis
15 | GCF_009759685.1 Bacteria Pseudomonadota Gammaproteobacteria Moraxellales Moraxellaceae Acinetobacter Acinetobacter baumannii
16 | GCF_000017205.1 Bacteria Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas aeruginosa
17 |
--------------------------------------------------------------------------------
/docs/archetypes/default.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = '{{ replace .File.ContentBaseName "-" " " | title }}'
3 | date = {{ .Date }}
4 | draft = true
5 | +++
6 |
--------------------------------------------------------------------------------
/docs/content/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title:
3 | geekdocNav: false
4 | geekdocAlign: center
5 | geekdocAnchor: false
6 | ---
7 | # LexicMap
8 |
9 |
10 |
11 |
12 | [](https://github.com/shenwei356/LexicMap/releases)
13 | [](https://anaconda.org/bioconda/lexicmap)
14 | [](http://bioinf.shenwei.me/LexicMap/installation/)
15 | [](https://github.com/shenwei356/taxonkit/blob/master/LICENSE)
16 |
17 |
18 |
19 | LexicMap is a **nucleotide sequence alignment** tool for efficiently querying **gene, plasmid, virus, or long-read sequences (>100 bp)** against up to **millions** of **prokaryotic genomes**.
20 |
21 |
22 | {{< button size="medium" relref="introduction" >}}Introduction{{< /button >}}
23 |
24 |
25 |
26 | ## Feature overview
27 |
28 | {{< columns >}}
29 |
30 | ### Easy to install
31 |
32 | Linux, Windows, MacOS and more OS are supported.
33 |
34 | Both x86 and ARM CPUs are supported.
35 |
36 | Just [download](https://github.com/shenwei356/lexicmap/releases) the binary files and run!
37 |
38 |
39 | Or install it by
40 |
41 | conda install -c bioconda lexicmap
42 |
43 |
44 | {{< button size="small" relref="installation" >}}Installation{{< /button >}}
45 | {{< button size="small" relref="releases" >}}Releases{{< /button >}}
46 |
47 | <--->
48 |
49 | ### Easy to use
50 |
51 | Step 1: indexing
52 |
53 | lexicmap index -I genomes/ -O db.lmi
54 |
55 | Step 2: searching
56 |
57 | lexicmap search -d db.lmi q.fasta -o r.tsv
58 |
59 | {{< button size="small" relref="tutorials/index" >}}Tutorials{{< /button >}}
60 | {{< button size="small" relref="usage/lexicmap" >}}Usages{{< /button >}}
61 | {{< button size="small" relref="faqs" >}}FAQs{{< /button >}}
62 |
63 | <--->
64 |
65 | ### Accurate and efficient alignment
66 |
67 | Using LexicMap to align in the whole **2,340,672** Genbank+Refseq prokaryotic genomes with 48 CPUs.
68 |
69 | |Query |Genome hits|Time |RAM(GB)|
70 | |:----------------|----------:|------:|------:|
71 | |A 1.3-kb gene |41,718 |3m:06s |3.97 |
72 | |A 1.5-kb 16S rRNA|1,955,167 |32m:59s|11.09 |
73 | |A 52.8-kb plasmid|560,330 |52m:22s|14.48 |
74 | |1003 AMR genes |30,967,882 |15h:52m|24.86 |
75 |
76 |
77 | ***Blastn** is unable to run with the same dataset on common servers as it requires >2000 GB RAM*.
78 |
79 |
80 | {{< /columns >}}
81 |
82 |
--------------------------------------------------------------------------------
/docs/content/faqs/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: FAQs
3 | weight: 60
4 | ---
5 | ## Table of contents
6 |
7 | {{< toc format=html >}}
8 |
9 | ## Does LexicMap support short reads?
10 |
11 | LexicMap is mainly designed for sequence alignment with a small number of queries (gene/plasmid/virus/phage sequences) longer than 100 bp by default.
12 |
13 | If you just want to search long (>1kb) queries for highly similar (>95%) targets, you can build an index with a bigger `-D/--seed-max-desert` (default 100) and `-d/--seed-in-desert-dist` (default 50), e.g.,
14 |
15 | --seed-max-desert 300 --seed-in-desert-dist 150
16 |
17 | Bigger values decrease the search sensitivity for distant targets, speed up the
18 | indexing, decrease the indexing memory occupation and decrease the index size. While the
19 | alignment speed is almost not affected.
20 |
21 |
22 | ## Does LexicMap support fungi genomes?
23 |
24 | Yes. LexicMap mainly supports small genomes including prokaryotic, viral, and plasmid genomes.
25 | **Fungi can also be supported, just remember to increase the value of `-g/--max-genome` when running `lexicmap index`,
26 | which is used to skip genomes larger than 15Mb by default**.
27 |
28 | ```
29 | -g, --max-genome int ► Maximum genome size. Extremely large genomes (e.g., non-isolate
30 | assemblies from Genbank) will be skipped. (default 15000000)
31 | ```
32 |
33 | Maximum genome size is about 268 Mb (268,435,456). More precisely:
34 |
35 | $total_bases + ($num_contigs - 1) * 1000 <= 268,435,456
36 |
37 | as we concatenate contigs with 1000-bp intervals of N’s to reduce the sequence scale to index.
38 |
39 | For big and complex genomes, like the human genome (chr1 is ~248 Mb) which has many repetitive sequences, LexicMap would be slow to align.
40 |
41 |
42 | ## How's the hardware requirement?
43 |
44 | - For index building. See details [hardware requirement](https://bioinf.shenwei.me/LexicMap/tutorials/index/#hardware-requirements).
45 | - For searching. See details [hardware requirement](https://bioinf.shenwei.me/LexicMap/tutorials/search/#hardware-requirements).
46 |
47 |
48 |
49 | ## How to resume the indexing as Slurm job time limit is almost reached while lexicmap index is still in the merging step?
50 |
51 | Use [lexicmap utils remerge](https://bioinf.shenwei.me/LexicMap/usage/utils/remerge/) (available since v0.5.0), which reruns the merging step for an unfinished index.
52 |
53 | > When to use this command?
54 | > - Only one thread is used for merging indexes, which happens when there are
55 | > a lot (>200) of batches (`$input_files / --batch-size`) and the value
56 | > of `--max-open-files` is not big enough.
57 | > - The Slurm/PBS job time limit is almost reached and the merging step won't be finished before that.
58 | > - Disk quota is reached in the merging step.
59 |
60 | So you can stop the indexing command by pressing `Ctrl` + `C` (**make sure it is in the merging step**, see example below), and run `lexicmap utils remerge -d index.lmi`,
61 | where `index.lmi` is the output index directory in `lexicmap index`.
62 |
63 | Optionally, you might set bigger values of
64 | flag `--max-open-files` and `-J/--seed-data-threads` if you have hundreds of thousands of input genomes or have set
65 | a small batch size with `-b/--batch-size`. E.g.,
66 |
67 | 22:54:24.420 [INFO] merging 297 indexes...
68 | 22:54:24.455 [INFO] [round 1]
69 | 22:54:24.455 [INFO] batch 1/1, merging 297 indexes to xxx.lmi.tmp/r1_b1 with 1 threads...
70 |
71 | Since only one thread was used for seed data merging, it would take a long time.
72 | So we can set a larger `--max-open-files`, e.g., `4096`,
73 | and it would allow `4096 / (297+2) = 13.7` threads for merging, let's set `--seed-data-threads 12`.
74 |
75 | # specify the maximum open files per process
76 | ulimit -n 4096
77 |
78 | lexicmap utils remerge -d index.lmi --max-open-files 4096 --seed-data-threads 12
79 |
80 |
81 | ## Can I extract the matched sequences?
82 |
83 | Yes, `lexicmap search` has a flag
84 |
85 | ```
86 | -a, --all ► Output more columns, e.g., matched sequences. Use this if you
87 | want to output blast-style format with "lexicmap utils 2blast".
88 | ```
89 |
90 | to output CIGAR string, aligned query and subject sequences.
91 |
92 | 21. cigar, CIGAR string of the alignment. (optional with -a/--all)
93 | 22. qseq, Aligned part of query sequence. (optional with -a/--all)
94 | 23. sseq, Aligned part of subject sequence. (optional with -a/--all)
95 | 24. align, Alignment text ("|" and " ") between qseq and sseq. (optional with -a/--all)
96 |
97 |
98 | An example:
99 |
100 | # Extracting similar sequences for a query gene.
101 |
102 | # search matches with query coverage >= 90%
103 | lexicmap search -d gtdb_complete.lmi/ b.gene_E_faecalis_SecY.fasta -o results.tsv \
104 | --min-qcov-per-hsp 90 --all
105 |
106 | # extract matched sequences as FASTA format
107 | sed 1d results.tsv | awk -F'\t' '{print ">"$5":"$15"-"$16":"$17"\n"$23;}' \
108 | | seqkit seq -g > results.fasta
109 |
110 | seqkit head -n 1 results.fasta | head -n 3
111 | >NZ_JALSCK010000007.1:39224-40522:-
112 | TTGTTCAAGCTATTAAAGAACGCCTTTAAAGTCAAAGACATTAGATCAAAAATCTTATTT
113 | ACAGTTTTAATCTTGTTTGTATTTCGCCTAGGTGCGCACATTACTGTGCCCGGGGTGAAT
114 |
115 |
116 | And `lexicmap utils 2blast` can help to convert the tabular format to Blast-style format,
117 | see [examples](https://bioinf.shenwei.me/LexicMap/usage/utils/2blast/#examples).
118 |
119 | ## How can I extract the upstream and downstream flanking sequences of matched regions?
120 |
121 | [lexicmap utils subseq](https://bioinf.shenwei.me/LexicMap/usage/utils/subseq/)
122 | can extract subsequences via genome ID, sequence ID and positions.
123 | So you can use this information from the search result and expand the region positions to extract flanking sequences.
124 |
125 |
126 |
127 | ## Why isn't the pident 100% when aligning with a sequence from the reference genomes?
128 |
129 | It happens if there are some degenerate bases (e.g., `N`) in the query sequence.
130 | In the indexing step, all degenerate bases are converted to their lexicographic first bases. E.g., `N` is converted to `A`.
131 | While for the query sequences, we don't convert them.
132 |
133 |
134 | ## Why is LexicMap slow for batch searching?
135 |
136 | LexicMap is mainly designed for sequence alignment with a small number of queries against a database with a huge number (millions) of genomes.
137 |
138 | There are some ways to improve the search speed of `lexicmap search`:
139 | http://bioinf.shenwei.me/LexicMap/tutorials/search/#improving-searching-speed
140 |
141 | {{< button relref="/usage/search" >}}Click{{< /button >}} to read more detail of the usage.
142 |
143 | ## How can I know if an index is compatible with a LexicMap version? Should I rebuild an existing index?
144 |
145 | LexicMap is under active development, but we are striving to preserve index compatibility as we implement new features and improvements.
146 | The change history and compatibility information are available [here](https://bioinf.shenwei.me/LexicMap/tutorials/index/#index-format-changelog).
147 |
--------------------------------------------------------------------------------
/docs/content/installation/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Installation
3 | weight: 20
4 | ---
5 |
6 | LexicMap can be installed via [conda](#conda), downloading [executable binary files](#binary-files),
7 | or [compiling from the source](#compile-from-the-source).
8 |
9 | Besides, it supports [shell completion](#shell-completion), which could help accelerate typing.
10 |
11 | ## Conda/Pixi
12 |
13 | [Install conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html), then run
14 |
15 | conda install -c bioconda lexicmap
16 |
17 | Or use [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html), which is faster.
18 |
19 | conda install -c conda-forge mamba
20 | mamba install -c bioconda lexicmap
21 |
22 | Or use [pixi](https://pixi.sh/), which is even faster.
23 |
24 | pixi config channels add bioconda
25 | pixi add lexicmap
26 |
27 | Linux and MacOS (both x86 and arm CPUs) are supported.
28 |
29 | ## Binary files
30 |
31 | {{< tabs "uniqueid" >}}
32 |
33 | {{< tab "Linux" >}}
34 |
35 | 1. Download the binary file.
36 |
37 | |OS |Arch |File, 中国镜像 |
38 | |:------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
39 | |Linux |**64-bit**|[**lexicmap_linux_amd64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_linux_amd64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_linux_amd64.tar.gz) |
40 | |Linux |arm64 |[**lexicmap_linux_arm64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_linux_arm64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_linux_arm64.tar.gz) |
41 |
42 | 2. Decompress it:
43 |
44 | tar -zxvf lexicmap_linux_amd64.tar.gz
45 |
46 | 3. If you have the root privilege, simply copy it to `/usr/local/bin`:
47 |
48 | sudo cp lexicmap /usr/local/bin/
49 |
50 | 4. If you don't have the root privilege, copy it to any directory in the environment variable `PATH`:
51 |
52 | mkdir -p $HOME/bin/; cp lexicmap $HOME/bin/
53 |
54 | And optionally add the directory into the environment variable `PATH` if it's not in.
55 |
56 | # bash
57 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.bashrc
58 | source $HOME/.bashrc # apply the configuration
59 |
60 | # zsh
61 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.zshrc
62 | source $HOME/.zshrc # apply the configuration
63 |
64 |
65 | {{< /tab >}}
66 |
67 | {{< tab "MacOS" >}}
68 |
69 | 1. Download the binary file.
70 |
71 | |OS |Arch |File, 中国镜像 |
72 | |:------|:---------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
73 | |macOS |64-bit|[**lexicmap_darwin_amd64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_darwin_amd64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_darwin_amd64.tar.gz) |
74 | |macOS |**arm64** |[**lexicmap_darwin_arm64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_darwin_arm64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_darwin_arm64.tar.gz) |
75 |
76 | 2. Copy it to any directory in the environment variable `PATH`:
77 |
78 | mkdir -p $HOME/bin/; cp lexicmap $HOME/bin/
79 |
80 | And optionally add the directory into the environment variable `PATH` if it's not in.
81 |
82 | # bash
83 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.bashrc
84 | source $HOME/.bashrc # apply the configuration
85 |
86 | # zsh
87 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.zshrc
88 | source $HOME/.zshrc # apply the configuration
89 |
90 |
91 | {{< /tab >}}
92 |
93 | {{< tab "FreeBSD" >}}
94 |
95 | 1. Download the binary file.
96 |
97 | |OS |Arch |File, 中国镜像 |
98 | |:------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
99 | |FreeBSD|**64-bit**|[**lexicmap_freebsd_amd64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_freebsd_amd64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_freebsd_amd64.tar.gz) |
100 |
101 | {{< /tab >}}
102 |
103 |
104 | {{< tab "Windows" >}}
105 |
106 | 1. Download the binary file.
107 |
108 |
109 | |OS |Arch |File, 中国镜像 |
110 | |:------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
111 | |Windows|**64-bit**|[**lexicmap_windows_amd64.exe.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_windows_amd64.exe.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_windows_amd64.exe.tar.gz)|
112 |
113 |
114 | 2. Decompress it.
115 |
116 | 3. Copy `lexicmap.exe` to `C:\WINDOWS\system32`.
117 |
118 | {{< /tab >}}
119 |
120 | {{< tab "Others" >}}
121 |
122 | - Please [open an issue](https://github.com/shenwei356/LexicMap/issues) to request binaries for other platforms.
123 | - Or [compiling from the source](#compile-from-the-source).
124 |
125 | {{< /tab>}}
126 |
127 |
128 | {{< /tabs >}}
129 |
130 |
131 |
132 | ## Compile from the source
133 |
134 |
135 | 1. [Install go](https://go.dev/doc/install) (go 1.22 or later versions).
136 |
137 | wget https://go.dev/dl/go1.24.1.linux-amd64.tar.gz
138 |
139 | tar -zxf go1.24.1.linux-amd64.tar.gz -C $HOME/
140 |
141 | # or
142 | # echo "export PATH=$PATH:$HOME/go/bin" >> ~/.bashrc
143 | # source ~/.bashrc
144 | export PATH=$PATH:$HOME/go/bin
145 |
146 | 2. Compile LexicMap.
147 |
148 | # ------------- the latest stable version -------------
149 |
150 | go install -v github.com/shenwei356/LexicMap@latest
151 |
152 | # The executable binary file is located in:
153 | # ~/go/bin/lexicmap
154 | # You can also move it to anywhere in the $PATH
155 | mkdir -p $HOME/bin
156 | cp ~/go/bin/lexicmap $HOME/bin/
157 |
158 |
159 | # --------------- the development version --------------
160 |
161 | git clone https://github.com/shenwei356/LexicMap
162 | cd LexicMap/lexicmap/
163 | go build
164 |
165 | # The executable binary file is located in:
166 | # ./lexicmap
167 | # You can also move it to anywhere in the $PATH
168 | mkdir -p $HOME/bin
169 | cp ./lexicmap $HOME/bin/
170 |
171 |
172 | ## Shell-completion
173 |
174 | Supported shell: bash|zsh|fish|powershell
175 |
176 | Bash:
177 |
178 | # generate completion shell
179 | lexicmap autocompletion --shell bash
180 |
181 | # configure if never did.
182 | # install bash-completion if the "complete" command is not found.
183 | echo "for bcfile in ~/.bash_completion.d/* ; do source \$bcfile; done" >> ~/.bash_completion
184 | echo "source ~/.bash_completion" >> ~/.bashrc
185 |
186 | Zsh:
187 |
188 | # generate completion shell
189 | lexicmap autocompletion --shell zsh --file ~/.zfunc/_lexicmap
190 |
191 | # configure if never did
192 | echo 'fpath=( ~/.zfunc "${fpath[@]}" )' >> ~/.zshrc
193 | echo "autoload -U compinit; compinit" >> ~/.zshrc
194 |
195 | fish:
196 |
197 | lexicmap autocompletion --shell fish --file ~/.config/fish/completions/lexicmap.fish
198 |
--------------------------------------------------------------------------------
/docs/content/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
64 |
--------------------------------------------------------------------------------
/docs/content/notes/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Notes
3 | weight: 100
4 | ---
5 |
--------------------------------------------------------------------------------
/docs/content/notes/motivation.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Motivation
3 | weight: 0
4 | ---
5 |
6 | 1. BLASTN is not able to scale to millions of bacterial genomes; it's slow and has a high memory occupation.
7 | For example, it requires >2000 GB to align a 2-kb gene sequence against all 2.34 million prokaryotic genomes in GenBank and RefSeq.
8 |
9 | 2. [Large-scale sequence searching tools](https://kamimrcht.github.io/webpage/set_kmer_sets2.html) only return which genomes a query matches (color), but they can't return positional information.
10 |
--------------------------------------------------------------------------------
/docs/content/performance@genbank.tsv:
--------------------------------------------------------------------------------
1 | Query Genome hits Genome hits
(high-similarity) Genome hits
(medium-similarity) Genome hits
(low-similarity) Time RAM
2 | A 1.3-kb marker gene 41718 11746 115 29857 3m:06s 3.97 GB
3 | A 1.5-kb 16S rRNA 1955167 245884 501691 1207592 32m:59s 11.09 GB
4 | A 52.8-kb plasmid 560330 96 15370 544864 52m:22s 14.48 GB
5 | 1003 AMR genes 30967882 7636386 4858063 18473433 15h:52m:08s 24.86 GB
6 |
--------------------------------------------------------------------------------
/docs/content/performance@genbank.tsv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | cat performance@genbank.tsv \
4 | | csvtk replace -t -f RAM -p ' .+' \
5 | | csvtk rename -t -f RAM -n 'RAM(GB)' \
6 | | csvtk replace -t -f Query -p 'marker ' \
7 | | csvtk replace -t -f Time -p '(\d+h:\d+m):\d+s' -r '$1' \
8 | | csvtk cut -t -f 1,2,6,7 \
9 | | csvtk comma -t -f 2 \
10 | | csvtk csv2md -t -a l,r,r,r
11 |
12 | echo
13 |
14 | cat performance@genbank.tsv \
15 | | csvtk comma -t -f 2-5 \
16 | | csvtk csv2md -t -a l,r,r,r,r,r,r
17 |
--------------------------------------------------------------------------------
/docs/content/tutorials/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Tutorials
3 | weight: 40
4 | ---
5 |
--------------------------------------------------------------------------------
/docs/content/tutorials/index/parameters-batches.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | **`-b/--batch-size`** Max: 131072, default: 5000 Maximum number of genomes in each batch If the number of input files exceeds this number, input files are split into multiple batches and indexes are built for all batches. In the end, seed files are merged, while genome data files are kept unchanged and collected. ■ Bigger values increase indexing memory occupation and increase batch searching speed, while single query searching speed is not affected.
3 |
--------------------------------------------------------------------------------
/docs/content/tutorials/index/parameters-general.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | **`-j/--threads`** Default: all available CPUs Number of CPU cores to use. ► If the value is smaller than the number of available CPUs, make sure set the same value to `-c/--chunks`.
3 |
--------------------------------------------------------------------------------
/docs/content/tutorials/index/parameters-masks.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | `-M/--mask-file` A file File with custom masks "File with custom masks, which could be exported from an existing index or newly generated by ""lexicmap utils masks"". This flag oversides `-k/--kmer`, `-m/--masks`, `-s/--rand-seed`, etc."
3 | **`-k/--kmer`** Max: 32, default: 31 K-mer size ■ Bigger values improve the search specificity and do not increase the index size.
4 | **`-m/--masks`** Default: 20,000 Number of masks ■ Bigger values improve the search sensitivity slightly, increase the index size, and slow down the search speed. For smaller genomes like phages/viruses, m=5,000 is high enough.
5 |
--------------------------------------------------------------------------------
/docs/content/tutorials/index/parameters-seeds.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | **`--seed-max-desert`** Default: 100 Maximum length of distances between seeds The default value of 100 guarantees queries >=200 bp would match at least two seeds. ► Large regions with no seeds are called sketching deserts. Deserts with seed distance larger than this value will be filled by choosing k-mers roughly every --seed-in-desert-dist (50 by default) bases. ■ Bigger values decrease the search sensitivity for distant targets, speed up the indexing speed, decrease the indexing memory occupation and decrease the index size. While the alignment speed is almost not affected.
3 | **`-c/--chunks`** Maximum: 128, default: value of -j/--threads Number of seed file chunks Bigger values accelerate the search speed at the cost of a high disk reading load. ► The value should not exceed the maximum number of open files set by the operating systems. ► Make sure the value of `-j/--threads` in `lexicmap search` is >= this value.
4 | **`-J/--seed-data-threads`** Maximum: -c/--chunks, default: 8 Number of threads for writing seed data and merging seed chunks from all batches The actual value is min(--seed-data-threads, max(1, --max-open-files/($batches_1_round + 2))), where $batches_1_round = min(int($input_files / --batch-size), --max-open-files). ■ Bigger values increase indexing speed at the cost of slightly higher memory occupation.
5 | `-p/--partitions` Default: 4096 Number of partitions for indexing each seed file Bigger values bring a little higher memory occupation. ► After indexing, `lexicmap utils reindex-seeds` can be used to reindex the seeds data with another value of this flag.
6 | **`--max-open-files`** Default: 1024 Maximum number of open files It's only used in merging indexes of multiple genome batches. If there are >100 batches, i.e., ($input_files / --batch-size), please increase this value and set a bigger `ulimit -n` in shell.
7 |
--------------------------------------------------------------------------------
/docs/content/tutorials/misc/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: More
3 | weight: 40
4 |
5 | geekdocCollapseSection: true
6 | ---
7 |
--------------------------------------------------------------------------------
/docs/content/tutorials/misc/index-genbank.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Indexing GenBank+RefSeq
3 | weight: 10
4 | ---
5 |
6 | **Make sure you have enough disk space, >10 TB is preferred.**
7 |
8 | Tools:
9 |
10 | - https://github.com/pirovc/genome_updater, for downloading genomes
11 | - https://github.com/shenwei356/seqkit, for checking sequence files
12 | - https://github.com/shenwei356/rush, for running jobs
13 |
14 | Data:
15 |
16 | time genome_updater.sh -d "refseq,genbank" -g "archaea,bacteria" \
17 | -f "genomic.fna.gz" -o "genbank" -M "ncbi" -t 12 -m -L curl
18 |
19 | cd genbank/2024-02-15_11-00-51/
20 |
21 |
22 | # ----------------- check the file integrity -----------------
23 |
24 | genomes=files
25 |
26 | # corrupted files
27 | # find $genomes -name "*.gz" \
28 | fd ".gz$" $genomes \
29 | | rush --eta 'seqkit seq -w 0 {} > /dev/null; if [ $? -ne 0 ]; then echo {}; fi' \
30 | > failed.txt
31 |
32 | # empty files
33 | find $genomes -name "*.gz" -size 0 >> failed.txt
34 |
35 | # delete these files
36 | cat failed.txt | rush '/bin/rm {}'
37 |
38 | # redownload them:
39 | # run the genome_updater command again, with the flag -i
40 |
41 | Indexing. On a 48-CPU machine, time: 56 h, ram: 181 GB, index size: 4.96 TiB.
42 | If you don't have enough memory, please decrease the value of `-b`.
43 |
44 | lexicmap index \
45 | -I files/ \
46 | --ref-name-regexp '^(\w{3}_\d{9}\.\d+)' \
47 | -O genbank_refseq.lmi --log genbank_refseq.lmi.log \
48 | -b 25000
49 |
50 | # dirsize genbank_refseq.lmi
51 | genbank_refseq.lmi: 4.96 TiB (5,454,659,703,138)
52 | 2.79 TiB seeds
53 | 2.17 TiB genomes
54 | 55.81 MiB genomes.map.bin
55 | 156.28 KiB masks.bin
56 | 3.59 KiB genomes.chunks.bin
57 | 619 B info.toml
58 |
--------------------------------------------------------------------------------
/docs/content/tutorials/misc/index-globdb.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Indexing GlobDB
3 | weight: 20
4 | ---
5 |
6 |
7 | Info:
8 |
9 | - [GlobDB](https://globdb.org/), a dereplicated dataset of the species representatives of the GTDB, GEM, SPIRE and SMAG datasets.
10 | - https://x.com/daanspeth/status/1822964436950192218
11 |
12 |
13 | Steps:
14 |
15 | # download data
16 | wget https://fileshare.lisc.univie.ac.at/globdb/globdb_r220/globdb_r220_genome_fasta.tar.gz
17 |
18 | tar -zxf globdb_r220_genome_fasta.tar.gz
19 |
20 | # file list
21 | find globdb_r220_genome_fasta/ -name "*.fa.gz" > files.txt
22 |
23 | # index with lexicmap
24 | # elapsed time: 3h:40m:38s
25 | # peak rss: 87.15 GB
26 | lexicmap index -S -X files.txt -O globdb_r220.lmi --log globdb_r220.lmi.log -g 50000000
27 |
28 |
--------------------------------------------------------------------------------
/docs/content/tutorials/misc/index-gtdb.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Indexing GTDB
3 | weight: 5
4 | ---
5 |
6 | Info:
7 |
8 | - https://gtdb.ecogenomic.org/
9 |
10 | Tools:
11 |
12 | - https://github.com/pirovc/genome_updater, for downloading genomes
13 | - https://github.com/shenwei356/seqkit, for checking sequence files
14 | - https://github.com/shenwei356/rush, for running jobs
15 |
16 | Data:
17 |
18 | time genome_updater.sh -d "refseq,genbank" -g "archaea,bacteria" \
19 | -f "genomic.fna.gz" -o "GTDB_complete" -M "gtdb" -t 12 -m -L curl
20 |
21 | cd GTDB_complete/2024-01-30_19-34-40/
22 |
23 |
24 | # ----------------- check the file integrity -----------------
25 |
26 | genomes=files
27 |
28 | # corrupted files
29 | # find $genomes -name "*.gz" \
30 | fd ".gz$" $genomes \
31 | | rush --eta 'seqkit seq -w 0 {} > /dev/null; if [ $? -ne 0 ]; then echo {}; fi' \
32 | > failed.txt
33 |
34 | # empty files
35 | find $genomes -name "*.gz" -size 0 >> failed.txt
36 |
37 | # delete these files
38 | cat failed.txt | rush '/bin/rm {}'
39 |
40 | # redownload them:
41 | # run the genome_updater command again, with the flag -i
42 |
43 | Indexing. On a 48-CPU machine, time: 8h:19m:28s, ram: 73 GB, index size: 906 GB.
44 | If you don't have enough memory, please decrease the value of `-b`.
45 |
46 | lexicmap index \
47 | -I files/ \
48 | --ref-name-regexp '^(\w{3}_\d{9}\.\d+)' \
49 | -O gtdb_complete.lmi --log gtdb_complete.lmi.log \
50 | -b 5000
51 |
52 | Files:
53 |
54 | $ du -sh files gtdb_complete.lmi --apparent-size
55 | 413G files
56 | 907G gtdb_complete.lmi
57 |
58 | $ dirsize gtdb_complete.lmi
59 | gtdb_complete.lmi: 905.34 GiB (972,098,200,328)
60 | 542.34 GiB seeds
61 | 362.99 GiB genomes
62 | 9.60 MiB genomes.map.bin
63 | 156.28 KiB masks.bin
64 | 616 B info.toml
65 | 168 B genomes.chunks.bin
66 |
--------------------------------------------------------------------------------
/docs/content/tutorials/misc/index-uhgg.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Indexing UHGG
3 | weight: 25
4 | ---
5 |
6 | Info:
7 |
8 | - [Unified Human Gastrointestinal Genome (UHGG) v2.0.2](https://www.ebi.ac.uk/metagenomics/genome-catalogues/human-gut-v2-0-2)
9 | - [A unified catalog of 204,938 reference genomes from the human gut microbiome](https://www.nature.com/articles/s41587-020-0603-3)
10 | - Number of Genomes: 289,232
11 |
12 | Tools:
13 |
14 | - https://github.com/shenwei356/seqkit, for checking sequence files
15 | - https://github.com/shenwei356/rush, for running jobs
16 |
17 | Data:
18 |
19 | # meta data
20 | wget https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.2/genomes-all_metadata.tsv
21 |
22 | # gff url
23 | sed 1d genomes-all_metadata.tsv | cut -f 20 | sed 's/v2.0/v2.0.2/' | sed -E 's/^ftp/https/' > url.txt
24 |
25 | # download gff files
26 | mkdir -p files; cd files
27 |
28 | time cat ../url.txt \
29 | | rush --eta -v 'dir={///%}/{//%}' \
30 | 'mkdir -p {dir}; curl -s -o {dir}/{%} {}' \
31 | -c -C download.rush -j 12
32 | cd ..
33 |
34 | # extract sequences from gff files
35 | find files/ -name "*.gff.gz" \
36 | | rush --eta \
37 | 'zcat {} | perl -ne "print if \$s; \$s=true if /^##FASTA/" | seqkit seq -w 0 -o {/}/{%:}.fna.gz' \
38 | -c -C extract.rush
39 |
40 |
41 | Indexing. On a 48-CPU machine, time: 3 h, ram: 41 GB, index size: 426 GB.
42 | If you don't have enough memory, please decrease the value of `-b`.
43 |
44 | lexicmap index \
45 | -I files/ \
46 | -O uhgg.lmi --log uhgg.lmi.log \
47 | -b 5000
48 |
49 | File sizes:
50 |
51 | $ du -sh files/ uhgg.lmi
52 | 658G files/
53 | 509G uhgg.lmi
54 |
55 | $ du -sh files/ uhgg.lmi --apparent-size
56 | 425G files/
57 | 426G uhgg.lmi
58 |
59 | $ dirsize uhgg.lmi
60 | uhgg.lmi: 425.15 GiB (456,497,171,291)
61 | 243.47 GiB seeds
62 | 181.67 GiB genomes
63 | 6.34 MiB genomes.map.bin
64 | 312.53 KiB masks.bin
65 | 330 B info.toml
66 |
--------------------------------------------------------------------------------
/docs/content/tutorials/parameters-align.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | **`-Q/--min-qcov-per-genome`** Default 0 Minimum query coverage (percentage) per genome.
3 | **`-q/--min-qcov-per-hsp`** Default 0 Minimum query coverage (percentage) per HSP.
4 | **`-l/--align-min-match-len`** Default 50 Minimum aligned length in a HSP segment.
5 | **`-i/--align-min-match-pident`** Default 70 Minimum base identity (percentage) in a HSP segment.
6 | `--align-band` Default 100 Band size in backtracking the score matrix.
7 | `--align-ext-len` Default 1000 Extend length of upstream and downstream of seed regions, for extracting query and target sequences for alignment. It should be <= contig interval length in database.
8 | `--align-max-gap` Default 20 Maximum gap in a HSP segment.
9 |
--------------------------------------------------------------------------------
/docs/content/tutorials/parameters-general.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | **`-j/--threads`** Default: all available cpus Number of CPU cores to use. The value should be >= the number of seed chunk files (“chunks” in info.toml, set by `-c/--chunks` in `lexicmap index`).
3 | **`-w/--load-whole-seeds`** Load the whole seed data into memory for faster search Use this if the index is not big and many queries are needed to search.
4 | **`-n/--top-n-genomes`** Default 0, 0 for all Keep top N genome matches for a query in the chaining phase Value 1 is not recommended as the best chaining result does not always bring the best alignment, so it better be >= 5. The final number of genome hits might be smaller than this number as some chaining results might fail to pass the criteria in the alignment step.
5 | **`-a/--all`** Output more columns, e.g., matched sequences. "Use this if you want to output blast-style format with ""lexicmap utils 2blast"""
6 | `-J/--max-query-conc` Default 12, 0 for all Maximum number of concurrent queries Bigger values do not improve the batch searching speed and consume much memory.
7 | `--max-open-files` Default: 1024 Maximum number of open files It mainly affects candidate subsequence extraction. Increase this value if you have hundreds of genome batches or have multiple queries, and do not forgot to set a bigger `ulimit -n` in shell if the value is > 1024.
8 |
--------------------------------------------------------------------------------
/docs/content/tutorials/parameters-seeding.tsv:
--------------------------------------------------------------------------------
1 | Flag Value Function Comment
2 | **`-p, --seed-min-prefix`** Default 15 Minimum (prefix) length of matched seeds. Smaller values produce more results at the cost of slow speed.
3 | **`-P, --seed-min-single-prefix`** Default 17 Minimum (prefix) length of matched seeds if there's only one pair of seeds matched. Smaller values produce more results at the cost of slow speed.
4 | `--seed-max-dist` Default 1000 Max distance between seeds in seed chaining. It should be <= contig interval length in database.
5 | `--seed-max-gap` Default 50 Max gap in seed chaining.
6 |
--------------------------------------------------------------------------------
/docs/content/usage/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Usage
3 | weight: 50
4 | ---
5 |
--------------------------------------------------------------------------------
/docs/content/usage/lexicmap.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: lexicmap
3 | weight: 0
4 | ---
5 |
6 | ```plain
7 | $ lexicmap -h
8 |
9 | LexicMap: efficient sequence alignment against millions of prokaryotic genomes
10 |
11 | Version: v0.7.0
12 | Documents: https://bioinf.shenwei.me/LexicMap
13 | Source code: https://github.com/shenwei356/LexicMap
14 |
15 | Usage:
16 | lexicmap [command]
17 |
18 | Available Commands:
19 | autocompletion Generate shell autocompletion scripts
20 | index Generate an index from FASTA/Q sequences
21 | search Search sequences against an index
22 | utils Some utilities
23 | version Print version information and check for update
24 |
25 | Flags:
26 | -h, --help help for lexicmap
27 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
28 | appended to files from CLI arguments.
29 | --log string ► Log file.
30 | --quiet ► Do not print any verbose information. But you can write them to a file
31 | with --log.
32 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
33 | (default 16)
34 |
35 | Use "lexicmap [command] --help" for more information about a command.
36 | ```
37 |
--------------------------------------------------------------------------------
/docs/content/usage/search.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: search
3 | weight: 20
4 | ---
5 |
6 | ```plain
7 | $ lexicmap search -h
8 | Search sequences against an index
9 |
10 | Attention:
11 | 1. Input should be (gzipped) FASTA or FASTQ records from files or stdin.
12 | 2. For multiple queries, the order of queries in output might be different from the input.
13 |
14 | Tips:
15 | 1. When using -a/--all, the search result would be formatted to Blast-style format
16 | with 'lexicmap utils 2blast'. And the search speed would be slightly slowed down.
17 | 2. Alignment result filtering is performed in the final phase, so stricter filtering criteria,
18 | including -q/--min-qcov-per-hsp, -Q/--min-qcov-per-genome, and -i/--align-min-match-pident,
19 | do not significantly accelerate the search speed. Hence, you can search with default
20 | parameters and then filter the result with tools like awk or csvtk.
21 |
22 | Alignment result relationship:
23 |
24 | Query
25 | ├── Subject genome
26 | ├── Subject sequence
27 | ├── HSP cluster (a cluster of neighboring HSPs)
28 | ├── High-Scoring segment Pair (HSP)
29 |
30 | Here, the definition of HSP is similar to that in BLAST. Actually there are small gaps in HSPs.
31 |
32 | > A High-scoring Segment Pair (HSP) is a local alignment with no gaps that achieves one of the
33 | > highest alignment scores in a given search. https://www.ncbi.nlm.nih.gov/books/NBK62051/
34 |
35 | Output format:
36 | Tab-delimited format with 20+ columns, with 1-based positions.
37 |
38 | 1. query, Query sequence ID.
39 | 2. qlen, Query sequence length.
40 | 3. hits, Number of subject genomes.
41 | 4. sgenome, Subject genome ID.
42 | 5. sseqid, Subject sequence ID.
43 | 6. qcovGnm, Query coverage (percentage) per genome: $(aligned bases in the genome)/$qlen.
44 | 7. cls, Nth HSP cluster in the genome. (just for improving readability)
45 | It's useful to show if multiple adjacent HSPs are collinear.
46 | 8. hsp, Nth HSP in the genome. (just for improving readability)
47 | 9. qcovHSP Query coverage (percentage) per HSP: $(aligned bases in a HSP)/$qlen.
48 | 10. alenHSP, Aligned length in the current HSP.
49 | 11. pident, Percentage of identical matches in the current HSP.
50 | 12. gaps, Gaps in the current HSP.
51 | 13. qstart, Start of alignment in query sequence.
52 | 14. qend, End of alignment in query sequence.
53 | 15. sstart, Start of alignment in subject sequence.
54 | 16. send, End of alignment in subject sequence.
55 | 17. sstr, Subject strand.
56 | 18. slen, Subject sequence length.
57 | 19. evalue, Expect value.
58 | 20. bitscore, Bit score.
59 | 21. cigar, CIGAR string of the alignment. (optional with -a/--all)
60 | 22. qseq, Aligned part of query sequence. (optional with -a/--all)
61 | 23. sseq, Aligned part of subject sequence. (optional with -a/--all)
62 | 24. align, Alignment text ("|" and " ") between qseq and sseq. (optional with -a/--all)
63 |
64 | Result ordering:
65 | For a HSP cluster, SimilarityScore = max(bitscore*pident)
66 | 1. Within each HSP cluster, HSPs are sorted by sstart.
67 | 2. Within each subject genome, HSP clusters are sorted in descending order by SimilarityScore.
68 | 3. Results of multiple subject genomes are sorted by the highest SimilarityScore of HSP clusters.
69 |
70 | Usage:
71 | lexicmap search [flags] -d [query.fasta.gz ...] [-o query.tsv.gz]
72 |
73 | Flags:
74 | --align-band int ► Band size in backtracking the score matrix (pseudo alignment
75 | phase). (default 100)
76 | --align-ext-len int ► Extend length of upstream and downstream of seed regions, for
77 | extracting query and target sequences for alignment. It should be
78 | <= contig interval length in database. (default 1000)
79 | --align-max-gap int ► Maximum gap in a HSP segment. (default 20)
80 | -l, --align-min-match-len int ► Minimum aligned length in a HSP segment. (default 50)
81 | -i, --align-min-match-pident float ► Minimum base identity (percentage) in a HSP segment. (default 70)
82 | -a, --all ► Output more columns, e.g., matched sequences. Use this if you
83 | want to output blast-style format with "lexicmap utils 2blast".
84 | --debug ► Print debug information, including a progress bar.
85 | (recommended when searching with one query).
86 | -h, --help help for search
87 | -d, --index string ► Index directory created by "lexicmap index".
88 | -w, --load-whole-seeds ► Load the whole seed data into memory for faster seed
89 | matching. It will consume a lot of RAM.
90 | -e, --max-evalue float ► Maximum evalue of a HSP segment. (default 10)
91 | --max-open-files int ► Maximum opened files. It mainly affects candidate subsequence
92 | extraction. Increase this value if you have hundreds of genome
93 | batches or have multiple queries, and do not forget to set a
94 | bigger "ulimit -n" in shell if the value is > 1024. (default 1024)
95 | -J, --max-query-conc int ► Maximum number of concurrent queries. Bigger values do not
96 | improve the batch searching speed and consume much memory.
97 | (default 12)
98 | -Q, --min-qcov-per-genome float ► Minimum query coverage (percentage) per genome.
99 | -q, --min-qcov-per-hsp float ► Minimum query coverage (percentage) per HSP.
100 | -o, --out-file string ► Out file, supports a ".gz" suffix ("-" for stdout). (default "-")
101 | --seed-max-dist int ► Maximum distance between seeds in seed chaining. It should be
102 | <= contig interval length in database. (default 1000)
103 | --seed-max-gap int ► Maximum gap in seed chaining. (default 50)
104 | -p, --seed-min-prefix int ► Minimum (prefix/suffix) length of matched seeds (anchors).
105 | (default 15)
106 | -P, --seed-min-single-prefix int ► Minimum (prefix/suffix) length of matched seeds (anchors) if
107 | there's only one pair of seeds matched. (default 17)
108 | -n, --top-n-genomes int ► Keep top N genome matches for a query (0 for all) in chaining
109 | phase. Value 1 is not recommended as the best chaining result
110 | does not always bring the best alignment, so it better be >= 100.
111 | (default 0)
112 |
113 | Global Flags:
114 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
115 | appended to files from CLI arguments.
116 | --log string ► Log file.
117 | --quiet ► Do not print any verbose information. But you can write them to a file
118 | with --log.
119 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
120 | (default 16)
121 | ```
122 |
123 |
124 | ## Examples
125 |
126 | See {{< button size="small" relref="tutorials/search" >}}Searching{{< /button >}}
127 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/2blast.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 2blast
3 | weight: 0
4 | ---
5 |
6 | ## Usage
7 |
8 | ```plain
9 | $ lexicmap utils 2blast -h
10 | Convert the default search output to blast-style format
11 |
12 | LexicMap only stores genome IDs and sequence IDs, without description information.
13 | But the option -g/--kv-file-genome enables adding description data after the genome ID
14 | with a tabular key-value mapping file.
15 |
16 | Input:
17 | - Output of 'lexicmap search' with the flag -a/--all.
18 |
19 | Usage:
20 | lexicmap utils 2blast [flags]
21 |
22 | Flags:
23 | -b, --buffer-size string ► Size of buffer, supported unit: K, M, G. You need increase the value
24 | when "bufio.Scanner: token too long" error reported (default "20M")
25 | -h, --help help for 2blast
26 | -i, --ignore-case ► Ignore cases of sgenome and sseqid
27 | -g, --kv-file-genome string ► Two-column tabular file for mapping the target genome ID (sgenome)
28 | to the corresponding value
29 | -s, --kv-file-seq string ► Two-column tabular file for mapping the target sequence ID (sseqid)
30 | to the corresponding value
31 | -o, --out-file string ► Out file, supports and recommends a ".gz" suffix ("-" for stdout).
32 | (default "-")
33 |
34 | Global Flags:
35 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
36 | appended to files from CLI arguments.
37 | --log string ► Log file.
38 | --quiet ► Do not print any verbose information. But you can write them to a file
39 | with --log.
40 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
41 | (default 16)
42 | ```
43 |
44 | ## Examples
45 |
46 |
47 | From stdin.
48 |
49 | ```text
50 | $ seqkit seq -M 500 q.long-reads.fasta.gz \
51 | | seqkit head -n 2 \
52 | | lexicmap search -d demo.lmi/ -a \
53 | | lexicmap utils 2blast --kv-file-genome ass2species.map
54 |
55 | Query = GCF_000017205.1_r160
56 | Length = 478
57 |
58 | [Subject genome #1/1] = GCF_000017205.1 Pseudomonas aeruginosa
59 | Query coverage per genome = 98.536%
60 |
61 | >NC_009656.1
62 | Length = 6588339
63 |
64 | HSP cluster #1, HSP #1
65 | Score = 883 bits, Expect = 3.60e-256
66 | Query coverage per seq = 98.536%, Aligned length = 479, Identities = 94.990%, Gaps = 15
67 | Query range = 7-477, Subject range = 4866857-4867328, Strand = Plus/Plus
68 |
69 | Query 7 GGTGGCCCTCAAACGAGTCC-AACAGGCCAACGCCTAGCAATCCCTCCCCTGTGGGGCAG 65
70 | ||||||| |||||||||||| |||||||| |||||| | ||||||||||||| ||||||
71 | Sbjct 4866857 GGTGGCC-TCAAACGAGTCCGAACAGGCCCACGCCTCACGATCCCTCCCCTGTCGGGCAG 4866915
72 |
73 | Query 66 GGAAAATCGTCCTTTATGGTCCGTTCCGGGCACGCACCGGAACGGCGGTCATCTTCCACG 125
74 | |||||||||||||||||||||||||||||||||||||||||||||||||||| |||||||
75 | Sbjct 4866916 GGAAAATCGTCCTTTATGGTCCGTTCCGGGCACGCACCGGAACGGCGGTCAT-TTCCACG 4866974
76 |
77 | Query 126 GTGCCCGCCCACGGCGGACCCGCGGAAACCGACCCGGGCGCCAAGGCGCCCGGGAACGGA 185
78 | ||||||||| ||||||||||| ||||||||||||||||||||||||||||||||||||||
79 | Sbjct 4866975 GTGCCCGCC-ACGGCGGACCC-CGGAAACCGACCCGGGCGCCAAGGCGCCCGGGAACGGA 4867032
80 |
81 | Query 186 GTA-CACTCGGCGTTCGGCCAGCGACAGC---GACGCGTTGCCGCCCACCGCGGTGGTGT 241
82 | ||| |||||||||| |||||||||||||| ||||||||||||||||||||||||||||
83 | Sbjct 4867033 GTATCACTCGGCGT-CGGCCAGCGACAGCAGCGACGCGTTGCCGCCCACCGCGGTGGTGT 4867091
84 |
85 | Query 242 TCACCGAGGTGGTGCGCTCGCTGAC-AAACGCAGCAGGTAGTTCGGCCCGCCGGCCTTGG 300
86 | ||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||
87 | Sbjct 4867092 TCACCGAGGTGGTGCGCTCGCTGACGAAACGCAGCAGGTAGTTCGGCCCGCCGGCCTTGG 4867151
88 |
89 | Query 301 GACCG-TGCCGGACAGCCCGTGGCCGCCGAACAGTTGCACGCCCACCACCGCGCCGAT-T 358
90 | ||||| |||||||||||||||||||||||||| ||||||||||||||||||||||||| |
91 | Sbjct 4867152 GACCGGTGCCGGACAGCCCGTGGCCGCCGAACGGTTGCACGCCCACCACCGCGCCGATCT 4867211
92 |
93 | Query 359 GGTTTCGGTTGACGTAGAGGTTGCCGACCCGCGCCAGCTCTTGGATGCGGCGGGCGGTTT 418
94 | |||| ||||||||||||||||||||||||||||||||||||| |||||||||||||||||
95 | Sbjct 4867212 GGTTGCGGTTGACGTAGAGGTTGCCGACCCGCGCCAGCTCTTCGATGCGGCGGGCGGTTT 4867271
96 |
97 | Query 419 CCTCGTTGCGGCTGTGGACCCCCATGGTCAGGCCGAAACCGGTGGCGTTTGATGGCCCT 477
98 | ||||||||||||||||||||||||||||||||||||||||||||||||| ||| ||| |
99 | Sbjct 4867272 CCTCGTTGCGGCTGTGGACCCCCATGGTCAGGCCGAAACCGGTGGCGTT-GATCGCC-T 4867328
100 |
101 |
102 | Query = GCF_006742205.1_r100
103 | Length = 431
104 |
105 | [Subject genome #1/1] = GCF_006742205.1 Staphylococcus epidermidis
106 | Query coverage per genome = 93.968%
107 |
108 | >NZ_AP019721.1
109 | Length = 2422602
110 |
111 | HSP cluster #1, HSP #1
112 | Score = 740 bits, Expect = 2.39e-213
113 | Query coverage per seq = 93.968%, Aligned length = 408, Identities = 98.284%, Gaps = 4
114 | Query range = 27-431, Subject range = 1321677-1322083, Strand = Plus/Minus
115 |
116 | Query 27 TTCATTTAAAACGATTGCTAATGAGTCACGTATTTCATCTGGTTCGGTAACTATACCGTC 86
117 | ||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||
118 | Sbjct 1322083 TTCATCTAAAACGATTGCTAATGAGTCACGTATTTCATCTGGTTCGGTAACTATACCGTC 1322024
119 |
120 | Query 87 TACTATGGACTCAGTGTAACCCTGTAATAAAGAGATTGGCGTACGTAATTCATGTG-TAC 145
121 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||| |||
122 | Sbjct 1322023 TACTATGGACTCAGTGTAACCCTGTAATAAAGAGATTGGCGTACGTAATTCATGTGATAC 1321964
123 |
124 | Query 146 ATTTGCTATAAAATCTTTTTTCATTTGATCAAGATTATGTTCATTTGTCATATCACAGGA 205
125 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||| |||
126 | Sbjct 1321963 ATTTGCTATAAAATCTTTTTTCATTTGATCAAGATTATGTTCATTTGTCATATCAC-GGA 1321905
127 |
128 | Query 206 TGACCATGACAATACCACTTCTACCATTTGTTTGAATTCTATCTATATAACTGGAGATAA 265
129 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
130 | Sbjct 1321904 TGACCATGACAATACCACTTCTACCATTTGTTTGAATTCTATCTATATAACTGGAGATAA 1321845
131 |
132 | Query 266 ATACATAGTACCTTGTATTAATTTCTAATTCTAA-TACTCATTCTGTTGTGATTCAAATG 324
133 | |||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||
134 | Sbjct 1321844 ATACATAGTACCTTGTATTAATTTCTAATTCTAAATACTCATTCTGTTGTGATTCAAATG 1321785
135 |
136 | Query 325 GTGCTTCAATTTGCTGTTCAATAGATTCTTTTGAAAAATCATCAATGTGACGCATAATAT 384
137 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
138 | Sbjct 1321784 TTGCTTCAATTTGCTGTTCAATAGATTCTTTTGAAAAATCATCAATGTGACGCATAATAT 1321725
139 |
140 | Query 385 AATCAGCCATCTTGTT-GACAATATGATTTCACGTTGATTATTAATGC 431
141 | ||||||||||||||| |||||||||||||||||||||||||||||||
142 | Sbjct 1321724 CATCAGCCATCTTGTTTGACAATATGATTTCACGTTGATTATTAATGC 1321677
143 |
144 | ```
145 |
146 |
147 | From file.
148 |
149 | $ lexicmap utils 2blast r.lexicmap.tsv -o r.lexicmap.txt
150 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: utils
3 | weight: 40
4 | geekdocCollapseSection: true
5 | ---
6 |
7 | ```plain
8 | $ lexicmap utils
9 | Some utilities
10 |
11 | Usage:
12 | lexicmap utils [command]
13 |
14 | Available Commands:
15 | 2blast Convert the default search output to blast-style format
16 | genomes View genome IDs in the index
17 | kmers View k-mers captured by the masks
18 | masks View masks of the index or generate new masks randomly
19 | reindex-seeds Recreate indexes of k-mer-value (seeds) data
20 | remerge Rerun the merging step for an unfinished index
21 | seed-pos Extract and plot seed positions via reference name(s)
22 | subseq Extract subsequence via reference name, sequence ID, position and strand
23 |
24 | Flags:
25 | -h, --help help for utils
26 |
27 | Global Flags:
28 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
29 | appended to files from CLI arguments.
30 | --log string ► Log file.
31 | --quiet ► Do not print any verbose information. But you can write them to a file
32 | with --log.
33 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
34 | (default 16)
35 | ```
36 |
37 |
38 | Subcommands:
39 |
40 | - [2blast](2blast/)
41 | - [masks](masks/)
42 | - [kmers](kmers/)
43 | - [genomes](genomes/)
44 | - [subseq](subseq/)
45 | - [seed-pos](seed-pos/)
46 | - [reindex-seeds](reindex-seeds/)
47 | - [remerge](remerge/)
48 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/genomes.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: genomes
3 | weight: 20
4 | ---
5 |
6 | ## Usage
7 |
8 | ```plain
9 | $ lexicmap utils genomes -h
10 | View genome IDs in the index
11 |
12 | Usage:
13 | lexicmap utils genomes [flags]
14 |
15 | Flags:
16 | -h, --help help for genomes
17 | -d, --index string ► Index directory created by "lexicmap index".
18 | -o, --out-file string ► Out file, supports the ".gz" suffix ("-" for stdout). (default "-")
19 |
20 | Global Flags:
21 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
22 | appended to files from CLI arguments.
23 | --log string ► Log file.
24 | --quiet ► Do not print any verbose information. But you can write them to a file
25 | with --log.
26 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
27 | (default 8)
28 | ```
29 |
30 | ## Examples
31 |
32 |
33 | ```
34 | $ lexicmap utils genomes -d demo.lmi/
35 | GCF_000148585.2
36 | GCF_001457655.1
37 | GCF_900638025.1
38 | GCF_001096185.1
39 | GCF_006742205.1
40 | GCF_001544255.1
41 | GCF_000392875.1
42 | GCF_001027105.1
43 | GCF_009759685.1
44 | GCF_002949675.1
45 | GCF_002950215.1
46 | GCF_000006945.2
47 | GCF_003697165.2
48 | GCF_000742135.1
49 | GCF_000017205.1
50 | ```
51 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/masks.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: masks
3 | weight: 5
4 | ---
5 |
6 | ```plain
7 | $ lexicmap utils masks -h
8 | View masks of the index or generate new masks randomly
9 |
10 | Usage:
11 | lexicmap utils masks [flags] { -d <index path> | [-k <k>] [-n <masks>] [-s <seed>] } [-o out.tsv.gz]
12 |
13 | Flags:
14 | -h, --help help for masks
15 | -d, --index string ► Index directory created by "lexicmap index".
16 | -k, --kmer int ► Maximum k-mer size. K needs to be <= 32. (default 31)
17 | -m, --masks int ► Number of masks. (default 40000)
18 | -o, --out-file string ► Out file, supports and recommends a ".gz" suffix ("-" for stdout).
19 | (default "-")
20 | -p, --prefix int ► Length of mask k-mer prefix for checking low-complexity (0 for no
21 | checking). (default 15)
22 | -s, --seed int ► The seed for generating random masks. (default 1)
23 |
24 | Global Flags:
25 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
26 | appended to files from CLI arguments.
27 | --log string ► Log file.
28 | --quiet ► Do not print any verbose information. But you can write them to a file
29 | with --log.
30 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
31 | (default 16)
32 | ```
33 |
34 | ## Examples
35 |
36 | ```plain
37 | $ lexicmap utils masks --quiet -d demo.lmi/ | head -n 10
38 | 1 AAAAAAATTCTCGGCGGTGTTTCCAGGCGCA
39 | 2 AAAAAACGTGGCGTCCCCTGTATAACGGCTA
40 | 3 AAAAAAGAGGGGAAGCAAGCTGAAGGATATG
41 | 4 AAAAAATACAGGCTGGCATCTTTAACCCACC
42 | 5 AAAAAATCCAGGGTTCCGTTAAGGATCTGTC
43 | 6 AAAAACATTCATGCTAGCATACCTTGGCAAC
44 | 7 AAAAACCACAATGTGGAAGCACGAGAGGATT
45 | 8 AAAAACCTGTACCCACCCGACGTGGATCCTC
46 | 9 AAAAACGTAGGCGTACCTCTCATAGCTTGTA
47 | 10 AAAAACTATGGATACTTGCCGTAAATCACCT
48 |
49 | $ lexicmap utils masks --quiet -d demo.lmi/ | tail -n 10
50 | 19991 TTTTTGAACTTGTGAAAAAGGCAGATGTGTG
51 | 19992 TTTTTGCGTTTATGCTGCCCTCAAACCATCT
52 | 19993 TTTTTGGATCCACTGTACGAGCACACTACCC
53 | 19994 TTTTTGTGGCTCATCGGGATCGGGAGCAGTC
54 | 19995 TTTTTTACATGTTGGGCTAGGGGCGGTTCAC
55 | 19996 TTTTTTATCGGACGCCAAGTTTGTAATCGTC
56 | 19997 TTTTTTCTTGCATCGTATTCAGCACGTTCCT
57 | 19998 TTTTTTGCCGAGTGACCCCGAAAAGCTCACA
58 | 19999 TTTTTTTATCGAGGCATGGTTGAAGACGGGT
59 | 20000 TTTTTTTCCGTAACTAGGTTCTGGCGATTCC
60 |
61 | # check a specific mask
62 |
63 | $ lexicmap utils masks --quiet -d demo.lmi/ -m 12345
64 | 12345 GCTGCACACGCAAAGACTCACGTCTTCAACG
65 | ```
66 |
67 | Frequency of prefixes.
68 |
69 | ```
70 | $ lexicmap utils masks --quiet -d demo.lmi/ \
71 | | csvtk mutate -Ht -f 2 -p '^(.{7})' \
72 | | csvtk freq -Ht -f 3 -nr \
73 | | head -n 10
74 | AAAAAAT 2
75 | AAAAACC 2
76 | AAAAACT 2
77 | AAAAAGG 2
78 | AAAAAGT 2
79 | AAAAATT 2
80 | AAAACCA 2
81 | AAAACCC 2
82 | AAAACGA 2
83 | AAAACTA 2
84 |
85 | $ lexicmap utils masks --quiet -d demo.lmi/ \
86 | | csvtk mutate -Ht -f 2 -p '^(.{7})' \
87 | | csvtk freq -Ht -f 3 -n \
88 | | head -n 10
89 | AAAAAAA 1
90 | AAAAAAC 1
91 | AAAAAAG 1
92 | AAAAACA 1
93 | AAAAACG 1
94 | AAAAAGA 1
95 | AAAAAGC 1
96 | AAAAATA 1
97 | AAAAATC 1
98 | AAAAATG 1
99 | ```
100 |
101 | Frequency of frequencies. I.e., for 20,000 masks with 7-bp prefixes, there are 4^7 = 16384 possible prefixes.
102 | Among them, 3,616 prefixes occur 2 times each: 12768 + 2 * 3616 = 20000.
103 |
104 | ```
105 | $ lexicmap utils masks --quiet -d demo.lmi/ \
106 | | csvtk mutate -Ht -f 2 -p '^(.{7})' \
107 | | csvtk freq -Ht -f 3 -n \
108 | | csvtk freq -Ht -f 2 -k
109 | 1 12768
110 | 2 3616
111 | ```
112 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/reindex-seeds.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: reindex-seeds
3 | weight: 50
4 | ---
5 |
6 | ## Usage
7 |
8 | ```plain
9 | $ lexicmap utils reindex-seeds -h
10 | Recreate indexes of k-mer-value (seeds) data
11 |
12 | Usage:
13 | lexicmap utils reindex-seeds [flags]
14 |
15 | Flags:
16 | -h, --help help for reindex-seeds
17 | -d, --index string ► Index directory created by "lexicmap index".
18 | --partitions int ► Number of partitions for re-indexing seeds (k-mer-value data) files. The
19 | value needs to be the power of 4. (default 4096)
20 |
21 | Global Flags:
22 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
23 | appended to files from CLI arguments.
24 | --log string ► Log file.
25 | --quiet ► Do not print any verbose information. But you can write them to a file
26 | with --log.
27 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
28 | (default 16)
29 | ```
30 |
31 | ## Examples
32 |
33 |
34 | $ lexicmap utils reindex-seeds -d demo.lmi/ --partitions 1024
35 | 10:20:29.150 [INFO] recreating seed indexes with 1024 partitions for: demo.lmi/
36 | processed files: 16 / 16 [======================================] ETA: 0s. done
37 | 10:20:29.166 [INFO] update index information file: demo.lmi/info.toml
38 | 10:20:29.166 [INFO] finished updating the index information file: demo.lmi/info.toml
39 | 10:20:29.166 [INFO]
40 | 10:20:29.166 [INFO] elapsed time: 15.981266ms
41 | 10:20:29.166 [INFO]
42 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/remerge.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: remerge
3 | weight: 60
4 | ---
5 |
6 | ```plain
7 | $ lexicmap utils remerge -h
8 | Rerun the merging step for an unfinished index
9 |
10 | When to use this command?
11 |
12 | - Only one thread is used for merging indexes, which happens when there are
13 | a lot (>200 batches) of batches ($input_files / --batch-size) and the value
14 | of --max-open-files is not big enough. E.g.,
15 |
16 | 22:54:24.420 [INFO] merging 297 indexes...
17 | 22:54:24.455 [INFO] [round 1]
18 | 22:54:24.455 [INFO] batch 1/1, merging 297 indexes to xxx.lmi.tmp/r1_b1 with 1 threads...
19 |
20 | ► Then you can run this command with a bigger --max-open-files (e.g., 4096) and
21 | -J/--seed-data-threads (e.g., 12. 12 needs to be <= 4096/(297+2)=13.7).
22 | And you need to set a bigger 'ulimit -n' if the value of --max-open-files is bigger than 1024.
23 |
24 | - The Slurm/PBS job time limit is almost reached and the merging step won't be finished before that.
25 |
26 | - Disk quota is reached in the merging step.
27 |
28 | Usage:
29 | lexicmap utils remerge [flags] -d <index path>
30 |
31 | Flags:
32 | -h, --help help for remerge
33 | -d, --index string ► Index directory created by "lexicmap index".
34 | --max-open-files int ► Maximum opened files, used in merging indexes. If there are >100
35 | batches, please increase this value and set a bigger "ulimit -n" in
36 | shell. (default 1024)
37 | -J, --seed-data-threads int ► Number of threads for writing seed data and merging seed chunks from
38 | all batches, the value should be in range of [1, -c/--chunks]. If there
39 | are >100 batches, please also increase the value of --max-open-files and
40 | set a bigger "ulimit -n" in shell. (default 8)
41 |
42 | Global Flags:
43 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
44 | appended to files from CLI arguments.
45 | --log string ► Log file.
46 | --quiet ► Do not print any verbose information. But you can write them to a file
47 | with --log.
48 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
49 | (default 16)
50 | ```
51 |
--------------------------------------------------------------------------------
/docs/content/usage/utils/subseq.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: subseq
3 | weight: 20
4 | ---
5 |
6 | ## Usage
7 |
8 | ```plain
9 | $ lexicmap utils subseq -h
10 | Extract subsequence via reference name, sequence ID, position and strand
11 |
12 | Attention:
13 | 1. The option -s/--seq-id is optional.
14 | 1) If given, the positions are these in the original sequence.
15 | 2) If not given, the positions are these in the concatenated sequence.
16 | 2. All degenerate bases in reference genomes were converted to the lexicographic first bases.
17 | E.g., N was converted to A. Therefore, consecutive A's in output might be N's in the genomes.
18 |
19 | Usage:
20 | lexicmap utils subseq [flags]
21 |
22 | Flags:
23 | -h, --help help for subseq
24 | -d, --index string ► Index directory created by "lexicmap index".
25 | -w, --line-width int ► Line width of sequence (0 for no wrap). (default 60)
26 | -o, --out-file string ► Out file, supports the ".gz" suffix ("-" for stdout).
27 | (default "-")
28 | -n, --ref-name string ► Reference name.
29 | -r, --region string ► Region of the subsequence (1-based).
30 | -R, --revcom ► Extract subsequence on the negative strand.
31 | -s, --seq-id string ► Sequence ID. If the value is empty, the positions in the region are
32 | treated as that in the concatenated sequence.
33 |
34 | Global Flags:
35 | -X, --infile-list string ► File of input file list (one file per line). If given, they are
36 | appended to files from CLI arguments.
37 | --log string ► Log file.
38 | --quiet ► Do not print any verbose information. But you can write them to a file
39 | with --log.
40 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores.
41 | (default 16)
42 | ```
43 |
44 | ## Examples
45 |
46 | 1. Extracting subsequence with genome ID, sequence ID, position range and strand information.
47 |
48 |
49 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -s NZ_CP033092.2 -r 4591684:4593225 -R
50 | >NZ_CP033092.2:4591684-4593225:-
51 | AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA
52 | GTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAA
53 | TGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCAT
54 | AACGTCGCAAGACCAAAGAGGGGGACCTTAGGGCCTCTTGCCATCGGATGTGCCCAGATG
55 | GGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGA
56 | GGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGG
57 | GGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCT
58 | TCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATT
59 | GACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAG
60 | GGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCA
61 | GATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTC
62 | GTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACC
63 | GGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCA
64 | AACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCC
65 | CTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCA
66 | AGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAAT
67 | TCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAG
68 | AATGTGCCTTCGGGAACCGTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGA
69 | AATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTCCGGC
70 | CGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTC
71 | ATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCG
72 | ACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAAC
73 | TCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGT
74 | TCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGT
75 | AGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAA
76 | CAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA
77 |
78 | 1. If the sequence ID (`-s/--seq-id`) is not given, the positions are those in the concatenated sequence.
79 |
80 | Checking sequence lengths of a genome with [seqkit](https://github.com/shenwei356/seqkit).
81 |
82 | $ seqkit fx2tab -nil refs/GCF_003697165.2.fa.gz
83 | NZ_CP033092.2 4903501
84 | NZ_CP033091.2 131333
85 |
86 | Extracting the 1000-bp interval sequence inserted by `lexicmap index`.
87 |
88 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -r 4903502:4904501
89 | >GCF_003697165.2:4903502-4904501:+
90 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
91 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
92 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
93 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
94 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
95 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
96 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
97 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
98 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
99 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
100 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
101 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
102 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
103 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
104 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
105 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
106 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
107 |
108 | 1. It detects if the end position is larger than the sequence length.
109 |
110 | # the length of NZ_CP033092.2 is 4903501
111 |
112 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -s NZ_CP033092.2 -r 4903501:1000000000
113 | >NZ_CP033092.2:4903501-4903501:+
114 | C
115 |
116 |
117 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -s NZ_CP033092.2 -r 4903502:1000000000
118 | >NZ_CP033092.2:4903502-4903501:+
119 |
120 |
--------------------------------------------------------------------------------
/docs/data/menu/extra.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | header:
3 | - name: GitHub
4 | ref: https://github.com/shenwei356/LexicMap
5 | icon: gdoc_github
6 | external: true
7 |
--------------------------------------------------------------------------------
/docs/data/menu/more.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | more:
3 | - name: More tools
4 | ref: "https://github.com/shenwei356"
5 | external: true
6 | icon: "gdoc_github"
7 |
--------------------------------------------------------------------------------
/docs/hugo.toml:
--------------------------------------------------------------------------------
1 | baseURL = 'https://bioinf.shenwei.me/LexicMap'
2 | languageCode = 'en-us'
3 | title = 'LexicMap: efficient sequence alignment against millions of prokaryotic genomes'
4 | theme = 'hugo-geekdoc'
5 |
6 | defaultContentLanguage = 'en'
7 |
8 | pluralizeListTitles = false
9 |
10 | # Geekdoc required configuration
11 | pygmentsUseClasses = true
12 | pygmentsCodeFences = true
13 | disablePathToLower = true
14 |
15 | # Required if you want to render robots.txt template
16 | enableRobotsTXT = true
17 |
18 | # Needed for mermaid shortcodes
19 | [markup]
20 | [markup.goldmark.renderer]
21 | # Needed for mermaid shortcode or when nesting shortcodes (e.g. img within
22 | # columns or tabs)
23 | unsafe = true
24 | [markup.tableOfContents]
25 | startLevel = 1
26 | endLevel = 3
27 |
28 | [taxonomies]
29 | tag = "tags"
30 |
31 | [params]
32 | geekdocToC = 3
33 |
34 | geekdocLogo = "logo.svg"
35 |
--------------------------------------------------------------------------------
/docs/static/GCF_000017205.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000017205.1.png
--------------------------------------------------------------------------------
/docs/static/GCF_000017205.1.seed_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000017205.1.seed_number.png
--------------------------------------------------------------------------------
/docs/static/GCF_000392875.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000392875.1.png
--------------------------------------------------------------------------------
/docs/static/GCF_000392875.1.seed_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000392875.1.seed_number.png
--------------------------------------------------------------------------------
/docs/static/GCF_002949675.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_002949675.1.png
--------------------------------------------------------------------------------
/docs/static/GCF_002949675.1.seed_number.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_002949675.1.seed_number.png
--------------------------------------------------------------------------------
/docs/static/custom.css:
--------------------------------------------------------------------------------
1 | .gdoc-nav nav {
2 | position: fixed;
3 | }
4 |
--------------------------------------------------------------------------------
/docs/static/favicon/android-chrome-192x192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/android-chrome-192x192.png
--------------------------------------------------------------------------------
/docs/static/favicon/android-chrome-512x512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/android-chrome-512x512.png
--------------------------------------------------------------------------------
/docs/static/favicon/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/apple-touch-icon.png
--------------------------------------------------------------------------------
/docs/static/favicon/browserconfig.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | #da532c
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/docs/static/favicon/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon-16x16.png
--------------------------------------------------------------------------------
/docs/static/favicon/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon-32x32.png
--------------------------------------------------------------------------------
/docs/static/favicon/favicon-48x48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon-48x48.png
--------------------------------------------------------------------------------
/docs/static/favicon/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon.ico
--------------------------------------------------------------------------------
/docs/static/favicon/favicon.svg:
--------------------------------------------------------------------------------
1 |
2 |
64 |
--------------------------------------------------------------------------------
/docs/static/favicon/mstile-144x144.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-144x144.png
--------------------------------------------------------------------------------
/docs/static/favicon/mstile-150x150.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-150x150.png
--------------------------------------------------------------------------------
/docs/static/favicon/mstile-310x150.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-310x150.png
--------------------------------------------------------------------------------
/docs/static/favicon/mstile-310x310.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-310x310.png
--------------------------------------------------------------------------------
/docs/static/favicon/mstile-70x70.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-70x70.png
--------------------------------------------------------------------------------
/docs/static/favicon/safari-pinned-tab.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
31 |
--------------------------------------------------------------------------------
/docs/static/favicon/site.webmanifest:
--------------------------------------------------------------------------------
1 | {
2 | "name": "",
3 | "short_name": "",
4 | "icons": [
5 | {
6 | "src": "/android-chrome-192x192.png",
7 | "sizes": "192x192",
8 | "type": "image/png"
9 | },
10 | {
11 | "src": "/android-chrome-512x512.png",
12 | "sizes": "512x512",
13 | "type": "image/png"
14 | }
15 | ],
16 | "theme_color": "#ffffff",
17 | "background_color": "#ffffff",
18 | "display": "standalone"
19 | }
20 |
--------------------------------------------------------------------------------
/docs/static/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
64 |
--------------------------------------------------------------------------------
/docs/static/prefix.hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/prefix.hist.png
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/shenwei356/LexicMap
2 |
3 | go 1.24
4 |
5 | // replace github.com/shenwei356/lexichash => /home/shenwei/go/src/github.com/shenwei356/lexichash/
6 | // replace github.com/shenwei356/wfa => /home/shenwei/go/src/github.com/shenwei356/wfa/
7 |
8 | require (
9 | github.com/dustin/go-humanize v1.0.1
10 | github.com/iafan/cwalk v0.0.0-20210125030640-586a8832a711
11 | github.com/klauspost/pgzip v1.2.6
12 | github.com/mattn/go-colorable v0.1.13
13 | github.com/mitchellh/go-homedir v1.1.0
14 | github.com/pelletier/go-toml/v2 v2.1.1
15 | github.com/pkg/errors v0.9.1
16 | github.com/rdleal/intervalst v1.3.0
17 | github.com/shenwei356/bio v0.13.6
18 | github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba
19 | github.com/shenwei356/kmers v0.1.0
20 | github.com/shenwei356/lexichash v0.5.0
21 | github.com/shenwei356/util v0.5.2
22 | github.com/shenwei356/wfa v0.4.0
23 | github.com/shenwei356/xopen v0.3.2
24 | github.com/spf13/cobra v1.8.0
25 | github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553
26 | github.com/vbauerster/mpb/v8 v8.7.2
27 | github.com/zeebo/wyhash v0.0.1
28 | gonum.org/v1/gonum v0.14.0
29 | gonum.org/v1/plot v0.14.0
30 | )
31 |
32 | require (
33 | git.sr.ht/~sbinet/gg v0.5.0 // indirect
34 | github.com/VividCortex/ewma v1.2.0 // indirect
35 | github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect
36 | github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b // indirect
37 | github.com/campoy/embedmd v1.0.0 // indirect
38 | github.com/dsnet/compress v0.0.1 // indirect
39 | github.com/elliotwutingfeng/asciiset v0.0.0-20230602022725-51bbb787efab // indirect
40 | github.com/go-fonts/liberation v0.3.1 // indirect
41 | github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9 // indirect
42 | github.com/go-pdf/fpdf v0.8.0 // indirect
43 | github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
44 | github.com/inconshreveable/mousetrap v1.1.0 // indirect
45 | github.com/klauspost/compress v1.18.0 // indirect
46 | github.com/kr/text v0.2.0 // indirect
47 | github.com/mattn/go-isatty v0.0.16 // indirect
48 | github.com/mattn/go-runewidth v0.0.15 // indirect
49 | github.com/pmezard/go-difflib v1.0.0 // indirect
50 | github.com/rivo/uniseg v0.4.4 // indirect
51 | github.com/rogpeppe/go-internal v1.12.0 // indirect
52 | github.com/shenwei356/breader v0.3.2 // indirect
53 | github.com/shenwei356/natsort v0.0.0-20190418160752-600d539c017d // indirect
54 | github.com/spf13/pflag v1.0.5 // indirect
55 | github.com/ulikunitz/xz v0.5.12 // indirect
56 | golang.org/x/image v0.18.0 // indirect
57 | golang.org/x/sys v0.16.0 // indirect
58 | golang.org/x/text v0.16.0 // indirect
59 | )
60 |
--------------------------------------------------------------------------------
/lexicmap/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # data
15 | t_*
16 | t.gz
17 | t.*
18 | *.fna.gz
19 | *.xz
20 | *.zst
21 | *.bz2
22 |
23 | # lexicmap index
24 | *.bin
25 | *.idx
26 | info.toml
27 | *.png
28 |
29 | name.map
30 | taxid.map
31 | masks.txt
32 |
33 | ont-*
34 | hifi-*
35 |
--------------------------------------------------------------------------------
/lexicmap/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | commit=$(git rev-parse --short HEAD)
4 |
5 | CGO_ENABLED=0 go build -trimpath -o=lexicmap -ldflags="-s -w -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$commit" -tags netgo
6 |
7 | ./lexicmap version
8 |
--------------------------------------------------------------------------------
/lexicmap/cmd/autocomplete.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "os"
26 | "path/filepath"
27 |
28 | homedir "github.com/mitchellh/go-homedir"
29 | "github.com/shenwei356/util/pathutil"
30 | "github.com/spf13/cobra"
31 | )
32 |
33 | // autocompletionCmd represents the fq2fa command
34 | var autocompletionCmd = &cobra.Command{
35 | Use: "autocompletion",
36 | Short: "Generate shell autocompletion scripts",
37 | Long: `Generate shell autocompletion scripts
38 |
39 | Supported shell: bash|zsh|fish|powershell
40 |
41 | Bash:
42 |
43 | # generate completion shell
44 | lexicmap autocompletion --shell bash
45 |
46 | # configure if never did.
47 | # install bash-completion if the "complete" command is not found.
48 | echo "for bcfile in ~/.bash_completion.d/* ; do source \$bcfile; done" >> ~/.bash_completion
49 | echo "source ~/.bash_completion" >> ~/.bashrc
50 |
51 | Zsh:
52 |
53 | # generate completion shell
54 | lexicmap autocompletion --shell zsh --file ~/.zfunc/_lexicmap
55 |
56 | # configure if never did
57 | echo 'fpath=( ~/.zfunc "${fpath[@]}" )' >> ~/.zshrc
58 | echo "autoload -U compinit; compinit" >> ~/.zshrc
59 |
60 | fish:
61 |
62 | lexicmap autocompletion --shell fish --file ~/.config/fish/completions/lexicmap.fish
63 |
64 | `,
65 | Run: func(cmd *cobra.Command, args []string) {
66 | outfile := getFlagString(cmd, "file")
67 | shell := getFlagString(cmd, "shell")
68 |
69 | dir := filepath.Dir(outfile)
70 | ok, err := pathutil.DirExists(dir)
71 | checkError(err)
72 | if !ok {
73 | os.MkdirAll(dir, 0744)
74 | }
75 |
76 | switch shell {
77 | case "bash":
78 | checkError(cmd.Root().GenBashCompletionFile(outfile))
79 | case "zsh":
80 | checkError(cmd.Root().GenZshCompletionFile(outfile))
81 | case "fish":
82 | checkError(cmd.Root().GenFishCompletionFile(outfile, true))
83 | case "powershell":
84 | checkError(cmd.Root().GenPowerShellCompletionFile(outfile))
85 | default:
86 | checkError(fmt.Errorf("unsupported shell: %s", shell))
87 | }
88 |
89 | log.Infof("%s completion file for lexicmap saved to %s", shell, outfile)
90 | },
91 | }
92 |
93 | func init() {
94 | RootCmd.AddCommand(autocompletionCmd)
95 | defaultCompletionFile, err := homedir.Expand("~/.bash_completion.d/lexicmap.sh")
96 | checkError(err)
97 | autocompletionCmd.Flags().StringP("file", "", defaultCompletionFile, "autocompletion file")
98 | autocompletionCmd.Flags().StringP("shell", "", "bash", "autocompletion type (bash|zsh|fish|powershell)")
99 | }
100 |
--------------------------------------------------------------------------------
/lexicmap/cmd/genome/genome_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package genome
22 |
23 | import (
24 | "bytes"
25 | "fmt"
26 | "os"
27 | "testing"
28 | )
29 |
30 | func TestGenomeWritingAndSeqExtraction(t *testing.T) {
31 | _seq := []byte("ACTAGACGACGTACGCGTACGTAGTACGATGCTCGA")
32 | var s, s2 []byte
33 | var b2 *[]byte
34 | var err error
35 | for n := 1; n < len(_seq); n++ {
36 | s = _seq[:n]
37 | b2 = Seq2TwoBit(s)
38 | s2, err = TwoBit2Seq(*b2, n)
39 | if err != nil {
40 | t.Error(err)
41 | return
42 | }
43 | if !bytes.Equal(s, s2) {
44 | t.Errorf("expected: %s, results: %s\n", s, s2)
45 | return
46 | }
47 | RecycleTwoBit(b2)
48 | }
49 | }
50 |
51 | func TestReadAndWrite(t *testing.T) {
52 | file := "t.2bit"
53 |
54 | // ----------------------- write --------------
55 |
56 | w, err := NewWriter(file, 1)
57 | if err != nil {
58 | t.Error(err)
59 | return
60 | }
61 |
62 | _seqs := [][]byte{
63 | []byte("A"),
64 | []byte("C"),
65 | []byte("CA"),
66 | []byte("CAT"),
67 | []byte("CATG"),
68 | []byte("CATGC"),
69 | []byte("CATGCC"),
70 | []byte("CATGCCA"),
71 | []byte("CATGCCAC"),
72 | []byte("CATGCCACG"),
73 | []byte("ACCCTCGAGCGACTAG"),
74 | []byte("ACTAGACGACGTACGCGTACGTAGTACGATGCTCGA"),
75 | []byte("ACGCAGTCGTCATCATGCGTGTCGCATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACATGCTGCATGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGCTGTGATGCGTCTCAGTAGATGAT"),
76 | }
77 |
78 | for i, s := range _seqs {
79 | id := []byte(fmt.Sprintf("seq_%d", i+1))
80 |
81 | g := PoolGenome.Get().(*Genome)
82 | g.Reset()
83 | g.ID = append(g.ID, id...)
84 | g.Seq = append(g.Seq, s...)
85 | g.GenomeSize = len(s)
86 | g.Len = len(s)
87 | g.NumSeqs = 1
88 | g.SeqSizes = append(g.SeqSizes, len(s))
89 | seqid := []byte("test")
90 | g.SeqIDs = append(g.SeqIDs, &seqid)
91 |
92 | err = w.Write(g)
93 | if err != nil {
94 | t.Error(err)
95 | return
96 | }
97 |
98 | RecycleGenome(g)
99 | }
100 |
101 | err = w.Close()
102 | if err != nil {
103 | t.Error(err)
104 | return
105 | }
106 |
107 | // ----------------------- read --------------
108 |
109 | r, err := NewReader(file)
110 | if err != nil {
111 | t.Error(err)
112 | return
113 | }
114 |
115 | var start, end int
116 | var s1 []byte
117 | var s2 *Genome
118 | for i, s := range _seqs {
119 | // subseq
120 | for start = 0; start < len(s); start++ {
121 | for end = start; end < len(s); end++ {
122 | s2, err = r.SubSeq(i, start, end)
123 | if err != nil {
124 | t.Error(err)
125 | return
126 | }
127 | s1 = s[start : end+1]
128 | if !bytes.Equal(s1, s2.Seq) {
129 | t.Errorf("idx: %d:%d-%d, expected: %s, results: %s",
130 | i, start, end, s1, s2.Seq)
131 | return
132 | }
133 | RecycleGenome(s2)
134 | }
135 | }
136 |
137 | // whole seq
138 | s2, err = r.Seq(i)
139 | if err != nil {
140 | t.Error(err)
141 | return
142 | }
143 | if !bytes.Equal(s, s2.Seq) {
144 | t.Errorf("idx: %d not matched", i)
145 | }
146 | RecycleGenome(s2)
147 | }
148 |
149 | r.Close()
150 |
151 | // clean up
152 |
153 | err = os.RemoveAll(file)
154 | if err != nil {
155 | t.Error(err)
156 | return
157 | }
158 |
159 | err = os.RemoveAll(file + GenomeIndexFileExt)
160 | if err != nil {
161 | t.Error(err)
162 | return
163 | }
164 | }
165 |
--------------------------------------------------------------------------------
/lexicmap/cmd/genomes.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "bufio"
25 | "fmt"
26 | "io"
27 | "os"
28 | "path/filepath"
29 | "strings"
30 |
31 | "github.com/shenwei356/bio/seq"
32 | "github.com/spf13/cobra"
33 | )
34 |
35 | var genomesCmd = &cobra.Command{
36 | Use: "genomes",
37 | Short: "View genome IDs in the index",
38 | Long: `View genome IDs in the index
39 |
40 | `,
41 | Run: func(cmd *cobra.Command, args []string) {
42 | opt := getOptions(cmd)
43 | seq.ValidateSeq = false
44 |
45 | // ------------------------------
46 |
47 | dbDir := getFlagString(cmd, "index")
48 | if dbDir == "" {
49 | checkError(fmt.Errorf("flag -d/--index needed"))
50 | }
51 |
52 | outFile := getFlagString(cmd, "out-file")
53 |
54 | // output file handler
55 | outfh, gw, w, err := outStream(outFile, strings.HasSuffix(outFile, ".gz"), opt.CompressionLevel)
56 | checkError(err)
57 | defer func() {
58 | outfh.Flush()
59 | if gw != nil {
60 | gw.Close()
61 | }
62 | w.Close()
63 | }()
64 |
65 | // -----------------------------------------------------
66 | // read genome chunks data if existed
67 | genomeChunks, err := readGenomeChunksMap(filepath.Join(dbDir, FileGenomeChunks))
68 | if err != nil {
69 | checkError(fmt.Errorf("failed to read genome chunk file: %s", err))
70 | }
71 | var hasGenomeChunks bool
72 | if len(genomeChunks) > 0 {
73 | hasGenomeChunks = true
74 | }
75 |
76 | // ---------------------------------------------------------------
77 |
78 | // genomes.map file for mapping index to genome id
79 | fh, err := os.Open(filepath.Join(dbDir, FileGenomeIndex))
80 | if err != nil {
81 | checkError(fmt.Errorf("failed to read genome index mapping file: %s", err))
82 | }
83 | defer fh.Close()
84 |
85 | r := bufio.NewReader(fh)
86 |
87 | buf := make([]byte, 8)
88 | var n, lenID int
89 | var batchIDAndRefID uint64
90 | var ok bool
91 |
92 | outfh.WriteString("ref\tchunked\n")
93 | for {
94 | n, err = io.ReadFull(r, buf[:2])
95 | if err != nil {
96 | if err == io.EOF {
97 | break
98 | }
99 | checkError(fmt.Errorf("failed to read genome index mapping file: %s", err))
100 | }
101 | if n < 2 {
102 | checkError(fmt.Errorf("broken genome map file"))
103 | }
104 | lenID = int(be.Uint16(buf[:2]))
105 | id := make([]byte, lenID)
106 |
107 | n, err = io.ReadFull(r, id)
108 | if err != nil {
109 | checkError(fmt.Errorf("broken genome map file"))
110 | }
111 | if n < lenID {
112 | checkError(fmt.Errorf("broken genome map file"))
113 | }
114 |
115 | n, err = io.ReadFull(r, buf)
116 | if err != nil {
117 | checkError(fmt.Errorf("broken genome map file"))
118 | }
119 | if n < 8 {
120 | checkError(fmt.Errorf("broken genome map file"))
121 | }
122 |
123 | batchIDAndRefID = be.Uint64(buf)
124 |
125 | if hasGenomeChunks {
126 | if _, ok = genomeChunks[batchIDAndRefID]; ok {
127 | fmt.Fprintf(outfh, "%s\t%s\n", id, "yes")
128 | } else {
129 | fmt.Fprintf(outfh, "%s\t\n", id)
130 | }
131 | } else {
132 | fmt.Fprintf(outfh, "%s\t\n", id)
133 | }
134 |
135 | }
136 | },
137 | }
138 |
139 | func init() {
140 | utilsCmd.AddCommand(genomesCmd)
141 |
142 | genomesCmd.Flags().StringP("index", "d", "",
143 | formatFlagUsage(`Index directory created by "lexicmap index".`))
144 |
145 | genomesCmd.Flags().StringP("out-file", "o", "-",
146 | formatFlagUsage(`Out file, supports the ".gz" suffix ("-" for stdout).`))
147 |
148 | genomesCmd.SetUsageTemplate(usageTemplate(""))
149 | }
150 |
--------------------------------------------------------------------------------
/lexicmap/cmd/kv/kv-encoding.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package kv
22 |
23 | import "encoding/binary"
24 |
25 | var be = binary.BigEndian
26 |
27 | // var PutUint64FourBytes func([]byte, uint64) = be.PutUint64
28 |
29 | // PutUint64ThreeBytes puts uint64 to 7 low bytes.
30 | func PutUint64ThreeBytes(b []byte, v uint64) {
31 | _ = b[6] // early bounds check to guarantee safety of writes below
32 | b[0] = byte(v >> 48)
33 | b[1] = byte(v >> 40)
34 | b[2] = byte(v >> 32)
35 | b[3] = byte(v >> 24)
36 | b[4] = byte(v >> 16)
37 | b[5] = byte(v >> 8)
38 | b[6] = byte(v)
39 | }
40 |
41 | // Uint64ThreeBytes returns an uint64 from 7 bytes
42 | func Uint64ThreeBytes(b []byte) uint64 {
43 | _ = b[6] // bounds check hint to compiler
44 | return uint64(b[6]) | uint64(b[5])<<8 | uint64(b[4])<<16 | uint64(b[3])<<24 |
45 | uint64(b[2])<<32 | uint64(b[1])<<40 | uint64(b[0])<<48
46 | }
47 |
--------------------------------------------------------------------------------
/lexicmap/cmd/lib-chaining_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "testing"
25 | )
26 |
27 | func TestChaining(t *testing.T) {
28 | /* command to prepare seeds from a certain query
29 | cat t.txt | csvtk grep -t -f target -p GCA_013693855.1 \
30 | | csvtk cut -t -f qstart,tstart | sed 1d \
31 | | awk '{print "{QBegin: "$1", TBegin: "$2", Len: 31},"}'
32 | */
33 | subs := []*SubstrPair{
34 | // two sequences on different strands
35 | // {QBegin: 18, TBegin: 3453, Len: 31},
36 | // {QBegin: 18, TBegin: 3640464, Len: 31},
37 | // {QBegin: 1924, TBegin: 1547, Len: 31},
38 | // {QBegin: 1924, TBegin: 3638544, Len: 31},
39 |
40 | // not perfect in this case, there are two chains: 0,1 and 2., while it should be one.
41 | {QBegin: 552, TBegin: 3798905, Len: 17},
42 | {QBegin: 667, TBegin: 3799019, Len: 15},
43 | {QBegin: 1332, TBegin: 3799686, Len: 31},
44 |
45 | // a kmer has multiple matches
46 | {QBegin: 1384, TBegin: 628584, Len: 31},
47 | {QBegin: 1490, TBegin: 628690, Len: 31},
48 | {QBegin: 1879, TBegin: 900465, Len: 31},
49 | {QBegin: 1879, TBegin: 629079, Len: 31},
50 | {QBegin: 1879, TBegin: 627005, Len: 31},
51 | {QBegin: 1910, TBegin: 6123921, Len: 23},
52 |
53 | // same strands
54 |
55 | {QBegin: 182, TBegin: 1282695, Len: 26},
56 | {QBegin: 182, TBegin: 1769573, Len: 26},
57 | {QBegin: 315, TBegin: 1282830, Len: 15},
58 | {QBegin: 315, TBegin: 1769708, Len: 15},
59 | {QBegin: 343, TBegin: 1769724, Len: 27},
60 |
61 | {QBegin: 10, TBegin: 314159, Len: 20},
62 |
63 | // this case is kept in the chainning step,
64 | // because we can not simply limit
65 | // the minimum distance between two anchors.
66 | {QBegin: 60, TBegin: 14234, Len: 15},
67 | {QBegin: 61, TBegin: 14235, Len: 15},
68 |
69 | {QBegin: 60, TBegin: 3395374, Len: 15},
70 | {QBegin: 70, TBegin: 3395384, Len: 15},
71 |
72 | {QBegin: 50, TBegin: 950, Len: 31},
73 | {QBegin: 79, TBegin: 3637976, Len: 31},
74 | {QBegin: 100, TBegin: 3637997, Len: 31},
75 | {QBegin: 519, TBegin: 1419, Len: 31},
76 | {QBegin: 550, TBegin: 3638447, Len: 31},
77 | {QBegin: 647, TBegin: 3638544, Len: 31},
78 |
79 | {QBegin: 111, TBegin: 1146311, Len: 31},
80 | {QBegin: 136, TBegin: 1146336, Len: 31},
81 | {QBegin: 138, TBegin: 1146338, Len: 31},
82 | {QBegin: 139, TBegin: 1146339, Len: 31},
83 | {QBegin: 264, TBegin: 1146464, Len: 31},
84 | {QBegin: 1479, TBegin: 1147679, Len: 31},
85 | {QBegin: 1484, TBegin: 1147684, Len: 31},
86 | {QBegin: 1543, TBegin: 1147743, Len: 31},
87 | {QBegin: 1566, TBegin: 1147766, Len: 31},
88 | {QBegin: 1919, TBegin: 1148119, Len: 31},
89 | }
90 | tmp := []*SearchResult{
91 | {
92 | Subs: &subs,
93 | },
94 | }
95 | rs := &tmp
96 |
97 | cf := &DefaultChainingOptions
98 |
99 | chainer := NewChainer(cf)
100 | for _, r := range *rs {
101 | paths, sumMaxScore := chainer.Chain(r.Subs)
102 |
103 | t.Logf("sum score: %f, paths:\n", sumMaxScore)
104 | for _, p := range *paths {
105 | t.Logf(" %d\n", *p)
106 | }
107 |
108 | RecycleChainingResult(paths)
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/lexicmap/cmd/lib-index-search-util.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "math"
25 | "slices"
26 | "sync"
27 |
28 | "github.com/shenwei356/LexicMap/lexicmap/cmd/tree"
29 | "github.com/shenwei356/lexichash/iterator"
30 | "github.com/shenwei356/wfa"
31 | )
32 |
33 | // extendMatch an alignment region using a chaining algorithm.
34 | func extendMatch(seq1, seq2 []byte, start1, end1, start2, end2 int, extLen int, tBegin, maxExtLen int, rc bool) ([]byte, []byte, int, int, int, int, error) {
35 | var m uint8 = 2
36 |
37 | // fmt.Println("before:", start1, end1, start2, end2)
38 |
39 | var _s1, _e1, _s2, _e2 int // extend length
40 | var _extLen int
41 |
42 | // 3', right
43 | if end1+int(m) < len(seq1) && end2+int(m) < len(seq2) {
44 | if rc {
45 | _extLen = min(extLen, tBegin)
46 | } else {
47 | _extLen = min(extLen, maxExtLen)
48 | }
49 |
50 | if _extLen > 2 {
51 | e1, e2 := min(end1+_extLen, len(seq1)), min(end2+_extLen, len(seq2))
52 | _seq1, _seq2 := seq1[end1:e1], seq2[end2:e2]
53 | // fmt.Printf("seq1: %s\nseq2: %s\n", _seq1, _seq2)
54 |
55 | _e1, _e2 = _extendRight(_seq1, _seq2)
56 | if _e1 > 0 || _e2 > 0 {
57 | end1 += _e1
58 | end2 += _e2
59 | }
60 | }
61 | }
62 |
63 | // 5', left
64 | if start1 > int(m) && start2 > int(m) {
65 | if rc {
66 | _extLen = min(extLen, maxExtLen) // tBegin is 0-based
67 | } else {
68 | _extLen = min(extLen, tBegin) // tBegin is 0-based
69 | }
70 |
71 | if _extLen > 2 {
72 | s1, s2 := max(start1-_extLen, 0), max(start2-_extLen, 0)
73 | _seq1, _seq2 := reverseBytes(seq1[s1:start1]), reverseBytes(seq2[s2:start2])
74 | // fmt.Printf("seq1: %s\nseq2: %s\n", _seq1, _seq2)
75 |
76 | _s1, _s2 = _extendRight(*_seq1, *_seq2)
77 | if _s1 > 0 || _s2 > 0 {
78 | start1 -= _s1
79 | start2 -= _s2
80 | }
81 | poolRevBytes.Put(_seq1)
82 | poolRevBytes.Put(_seq2)
83 | }
84 | }
85 |
86 | // fmt.Println("after:", start1, end1, start2, end2)
87 | return seq1[start1:end1], seq2[start2:end2], _s1, _e1, _s2, _e2, nil
88 | }
89 |
// _extendRight computes how far the alignment can be extended between two
// sequence tails s1 and s2 (both given in the extension direction; callers
// pass reversed copies for 5'-end extension). It seeds 2-bp anchors between
// the sequences, chains them, and returns the 0-based end positions of the
// best chain plus one, i.e., the numbers of extended bases in s1 and s2.
// It returns (0, 0) if either sequence is too short or no chain is found.
func _extendRight(s1, s2 []byte) (int, int) {
	_k := 2         // k-mer (anchor) size
	var m uint8 = 2 // minimum shared-prefix length for tree search; equals _k here

	// k-mer iterator over s1
	iter, err := iterator.NewKmerIterator(s1, _k)
	if err != nil { // s1 is shorter than _k
		return 0, 0
	}

	// index all positive-strand k-mers of s1 in a prefix tree,
	// recording their start positions as values
	t := tree.NewTree(uint8(_k))
	var kmer uint64
	var ok bool
	for {
		kmer, ok, _ = iter.NextPositiveKmer()
		if !ok {
			break
		}
		t.Insert(kmer, uint32(iter.Index()))
	}

	// match k-mers of s2 against the tree
	iter, err = iterator.NewKmerIterator(s2, _k)
	if err != nil { // s2 is shorter than _k
		return 0, 0
	}

	// anchors are collected as SubstrPairs from an object pool;
	// the deferred call recycles both the pairs and the slice
	subs := poolSubs.Get().(*[]*SubstrPair)
	*subs = (*subs)[:0]
	defer RecycleSubstrPairs(poolSub, poolSubs, subs)

	var v, p uint32
	var srs *[]*tree.SearchResult
	var sr *tree.SearchResult

	for {
		kmer, ok, _ = iter.NextPositiveKmer()
		if !ok {
			break
		}

		srs, ok = t.Search(kmer, m)
		if !ok { // no k-mer in s1 shares a prefix of >= m bases
			continue
		}

		// record every matched position pair as an anchor
		for _, sr = range *srs {
			// fmt.Printf("%s vs %s, len:%d\n", kmers.MustDecode(kmer, _k), kmers.MustDecode(sr.Kmer, _k), sr.LenPrefix)
			for _, v = range sr.Values {
				p = v

				_sub := poolSub.Get().(*SubstrPair)
				_sub.QBegin = int32(p)            // position in s1 (from the tree values)
				_sub.TBegin = int32(iter.Index()) // position in s2 (current iterator index)
				_sub.Len = uint8(sr.LenPrefix)
				_sub.QRC = false
				_sub.TRC = false

				*subs = append(*subs, _sub)
			}
		}
		t.RecycleSearchResult(srs) // recycle before the next search
	}
	tree.RecycleTree(t)

	if len(*subs) == 0 { // no anchors at all
		return 0, 0
	}

	if len(*subs) > 1 {
		// no need to clean as k == min_len
		// ClearSubstrPairs(poolSub, subs, _k)

		// sort anchors: QBegin ascending, then query end descending,
		// then TBegin ascending — the order the chainer expects
		slices.SortFunc(*subs, func(a, b *SubstrPair) int {
			if a.QBegin == b.QBegin {
				if a.QBegin+int32(a.Len) == b.QBegin+int32(b.Len) {
					return int(a.TBegin - b.TBegin)
				}
				return int(b.QBegin) + int(b.Len) - (int(a.QBegin) + int(a.Len))
			}
			return int(a.QBegin - b.QBegin)
		})
	}

	// for _, s := range *subs {
	// 	fmt.Println(s)
	// }

	// chaining the anchors with a pooled Chainer3
	chainer := poolChainers3.Get().(*Chainer3)
	chain := chainer.Chain(subs)

	poolChainers3.Put(chainer)

	if chain != nil {
		// fmt.Printf("q: %d-%d, t: %d-%d\n", chain.QBegin, chain.QEnd, chain.TBegin, chain.TEnd)
		poolChain3.Put(chain)
		// QEnd/TEnd appear to be 0-based end positions, so +1 gives lengths
		return chain.QEnd + 1, chain.TEnd + 1
	}

	return 0, 0
}
193 |
// reverseBytes returns a reversed copy of s in a buffer taken from
// poolRevBytes. Remember to recycle the result with poolRevBytes.Put().
//
// Improvements over the previous version: the empty "equal length" branch is
// gone, and growing the buffer is done in a single allocation instead of
// appending zero bytes one at a time.
func reverseBytes(s []byte) *[]byte {
	t := poolRevBytes.Get().(*[]byte)

	// resize the pooled buffer to exactly len(s)
	if cap(*t) >= len(s) {
		*t = (*t)[:len(s)]
	} else {
		*t = make([]byte, len(s))
	}

	copy(*t, s)

	// reverse in place
	for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 {
		(*t)[i], (*t)[j] = (*t)[j], (*t)[i]
	}

	return t
}

// poolRevBytes is the pool of byte buffers used by reverseBytes.
var poolRevBytes = &sync.Pool{New: func() interface{} {
	tmp := make([]byte, 128)
	return &tmp
}}
220 |
221 | // ------------------------------------------------------------------------------------------
222 |
// Alignment operation codes. An op word stores the operation in the high
// 32 bits and its count in the low 32 bits.
const OpM = uint64('M') // match
const OpD = uint64('D') // deletion
const OpI = uint64('I') // insertion
const OpX = uint64('X') // mismatch
const OpH = uint64('H') // hard clip

// trimOps trims ops to keep only the aligned region, i.e., the span from the
// first to the last match ('M') operation, inclusive.
// It returns an empty slice when there's no match operation at all
// (the previous version paniced with ops[-1:0] in that case).
func trimOps(ops []uint64) []uint64 {
	start, end := -1, -1
	for i, op := range ops {
		if op>>32 == OpM {
			start = i
			break
		}
	}
	if start < 0 { // no match operation, including empty input
		return ops[:0]
	}
	// a match exists, so this loop always finds end >= start
	for i := len(ops) - 1; i >= start; i-- {
		if ops[i]>>32 == OpM {
			end = i
			break
		}
	}
	return ops[start : end+1]
}
247 |
248 | func scoreAndEvalue(match, mismatch, gapOpen, gapExt int, totalBase int, lambda, k float64) func(qlen int, cigar *wfa.AlignmentResult) (int, int, float64) {
249 | // var Kn float64 = float64(k) * float64(totalBase)
250 | lnK := math.Log(k)
251 | ftotalBase := float64(totalBase)
252 |
253 | return func(qlen int, cigar *wfa.AlignmentResult) (int, int, float64) {
254 | ops := trimOps(cigar.Ops)
255 | var score, n int
256 | for _, op := range ops {
257 | n = int(op & 4294967295)
258 |
259 | // switch op.Op {
260 | switch op >> 32 {
261 | // match:
262 | case OpM:
263 | score += n * match
264 | // mismatch
265 | case OpX:
266 | score += n * mismatch
267 | // gap
268 | case OpI:
269 | score += gapOpen + n*gapExt
270 | // case 'D', 'H':
271 | case OpD, OpH:
272 | score += gapOpen + n*gapExt
273 | }
274 | }
275 |
276 | _score := score
277 |
278 | // from blastn_values_2_3 in ncbi-blast-2.15.0+-src/c++/src/algo/blast/core/blast_stat.c
279 | // Any odd score must be rounded down to the nearest even number before calculating the e-value
280 | if _score&1 == 1 {
281 | _score--
282 | }
283 |
284 | bitScore := (lambda*float64(_score) - lnK) / math.Ln2
285 |
286 | // evalue := Kn * float64(qlen) * math.Pow(math.E, -lambda*float64(_score))
287 |
288 | evalue := ftotalBase * math.Pow(2, -bitScore) * float64(qlen)
289 |
290 | return score, int(bitScore), evalue
291 | }
292 | }
293 |
--------------------------------------------------------------------------------
/lexicmap/cmd/lib-seq_compare_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "sync"
25 | "testing"
26 | )
27 |
// TestSeqCompare indexes one sequence with a SeqComparator and compares a
// similar sequence against it. The two sequences differ by substitutions and
// small indels (~94% identity, see the BLAST-style alignment below).
// It only logs the number of aligned bases; no value is asserted.
func TestSeqCompare(t *testing.T) {
	// Identities: 271/288(94%)

	// Query 8 AGGTCCTGCCCCGCGACCTGCACGCCGAATACGTAGCGGCGATCGCCTTAGTCGGTACAG 67
	// |||||||||||||||||||||||||| |||||||| ||||||||||||||| |||||
	// Sbjct 15 AGGTCCTGCCCCGCGACCTGCACGCC-AATACGTA-TAGCGATCGCCTTAGTC--TACAG 70

	// Query 68 CCCTGGAAAACATGGCCACCGAAGTTCGTTCCCTGCAACGGACCGAAATCCACGAAGTCG 127
	// |||||||||||||||||||||||||||||| |||||||||||||||||||||| |||||
	// Sbjct 71 CCCTGGAAAACATGGCCACCGAAGTTCGTT-CCTGCAACGGACCGAAATCCACTGAGTCG 129

	// Query 128 AAGAACACTTTGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAA 187
	// || |||| |||||||||||||||||||||||||||||||||||||||||||||||||
	// Sbjct 130 --CAATACTTCGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAA 187

	// Query 188 TTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGCAACGTGGTGACCGCCT 247
	// ||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||
	// Sbjct 188 TTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGG-AACGTGGTGACCGCCT 246

	// Query 248 ACGAAGACGTGACCCTCTGGCACGAACGCGACATCTCCCACTCCAGTG 295
	// |||||||||||||||| ||||||||||||||||||||||||||||||
	// Sbjct 247 ACGAAGACGTGACCCTTCGGCACGAACGCGACATCTCCCACTCCAGTG 294

	// s1 is the subject (indexed), s2 is the query (compared)
	s1 := []byte("GGTTACGTATTGCTAGGTCCTGCCCCGCGACCTGCACGCCAATACGTATAGCGATCGCCTTAGTCTACAGCCCTGGAAAACATGGCCACCGAAGTTCGTTCCTGCAACGGACCGAAATCCACTGAGTCGCAATACTTCGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGAACGTGGTGACCGCCTACGAAGACGTGACCCTTCGGCACGAACGCGACATCTCCCACTCCAGTGAGCAATACGTAACTGAACGAAGAACATCCGCAAAAAAAA")
	s2 := []byte("TCCACCCAGGTCCTGCCCCGCGACCTGCACGCCGAATACGTAGCGGCGATCGCCTTAGTCGGTACAGCCCTGGAAAACATGGCCACCGAAGTTCGTTCCCTGCAACGGACCGAAATCCACGAAGTCGAAGAACACTTTGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGCAACGTGGTGACCGCCTACGAAGACGTGACCCTCTGGCACGAACGCGACATCTCCCACTCCAGTGCCGAACGGATGATTCTGCCGGACTCCACGGCGCTGTTG")

	// alternative test pair with long low-complexity insertions (kept for manual use):
	// s1 := []byte("GGTTACGTATTGCTAGGTCCTGCCCCGCGACCTGCACGCCAATACGTATAGCGATCGCCTTAGTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTGCAACGGACCGAAATCCACTGAGTCGCAATACTTCGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGcccccccccccccccccccccccccccccccTCGGCACGAACGCGACATCTCCCACTCCAGTGAGCAATACGTAACTGAACGAAGAACATCCGCAAAAAAAA")
	// s2 := []byte("TCCACCCAGGTCCTGCCCCGCGACCTGCACGCCGAATACGTAGCGGCGATCGCCTTAGTCGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCTGCAACGGACCGAAATCCACGAAGTCGAAGAACACTTTGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaTCTGGCACGAACGCGACATCTCCCACTCCAGTGCCGAACGGATGATTCTGCCGGACTCCACGGCGCTGTTG")

	// alignment
	// alg := align.NewAligner(&align.AlignOptions{
	// MatchScore: 1,
	// MisMatchScore: -1,
	// GapScore: -1,
	// SaveAlignments: true,
	// SaveMatrix: false,
	// })
	// r := alg.Global(s2, s1)

	// t.Logf("matches: %d, gaps: %d, len: %d, identity: %.2f%%\n",
	// r.Matches, r.Gaps, r.Len, float64(r.Matches)/float64(r.Len)*100)

	// compare

	// comparator with default options; Chainer2 objects are pooled for reuse
	cpr := NewSeqComparator(&DefaultSeqComparatorOptions, &sync.Pool{New: func() interface{} {
		return NewChainer2(&DefaultChaining2Options)
	}})

	// index the subject sequence
	err := cpr.Index(s1)
	if err != nil {
		t.Logf("%s", err)
		return
	}

	// compare the query against the indexed subject
	cr, err := cpr.Compare(0, uint32(len(s2)-1), s2, len(s2))
	if err != nil {
		t.Logf("%s", err)
		return
	}
	if cr != nil {
		t.Logf("aligned bases: %d\n", cr.AlignedBases)

		RecycleSeqComparatorResult(cr)
	}
}
93 |
94 | //
95 |
--------------------------------------------------------------------------------
/lexicmap/cmd/masks.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "os"
26 | "path/filepath"
27 | "strings"
28 | "time"
29 |
30 | "github.com/shenwei356/bio/seq"
31 | "github.com/shenwei356/lexichash"
32 | "github.com/shenwei356/util/pathutil"
33 | "github.com/spf13/cobra"
34 | )
35 |
36 | var masksCmd = &cobra.Command{
37 | Use: "masks",
38 | Short: "View masks of the index or generate new masks randomly",
39 | Long: `View masks of the index or generate new masks randomly
40 |
41 | `,
42 | Run: func(cmd *cobra.Command, args []string) {
43 | opt := getOptions(cmd)
44 | seq.ValidateSeq = false
45 |
46 | var fhLog *os.File
47 | if opt.Log2File {
48 | fhLog = addLog(opt.LogFile, opt.Verbose)
49 | }
50 |
51 | outputLog := opt.Verbose || opt.Log2File
52 |
53 | timeStart := time.Now()
54 | defer func() {
55 | if outputLog {
56 | log.Info()
57 | log.Infof("elapsed time: %s", time.Since(timeStart))
58 | log.Info()
59 | }
60 | if opt.Log2File {
61 | fhLog.Close()
62 | }
63 | }()
64 |
65 | var err error
66 |
67 | // ---------------------------------------------------------------
68 | dbDir := getFlagString(cmd, "index")
69 |
70 | outFile := getFlagString(cmd, "out-file")
71 |
72 | k := getFlagPositiveInt(cmd, "kmer")
73 | if k < minK || k > 32 {
74 | checkError(fmt.Errorf("the value of flag -k/--kmer should be in range of [%d, 32]", minK))
75 | }
76 |
77 | nMasks := getFlagPositiveInt(cmd, "masks")
78 | lcPrefix := getFlagNonNegativeInt(cmd, "prefix")
79 | seed := getFlagPositiveInt(cmd, "seed")
80 |
81 | // ---------------------------------------------------------------
82 | // output file handler
83 | outfh, gw, w, err := outStream(outFile, strings.HasSuffix(outFile, ".gz"), opt.CompressionLevel)
84 | checkError(err)
85 | defer func() {
86 | outfh.Flush()
87 | if gw != nil {
88 | gw.Close()
89 | }
90 | w.Close()
91 | }()
92 |
93 | // ---------------------------------------------------------------
94 |
95 | var lh *lexichash.LexicHash
96 |
97 | decoder := lexichash.MustDecoder()
98 |
99 | if dbDir != "" { // from the index
100 | if outputLog {
101 | log.Info()
102 | log.Infof("checking index: %s", dbDir)
103 | }
104 |
105 | // Mask file
106 | fileMask := filepath.Join(dbDir, FileMasks)
107 | ok, err := pathutil.Exists(fileMask)
108 | if err != nil || !ok {
109 | checkError(fmt.Errorf("mask file not found: %s", fileMask))
110 | }
111 |
112 | lh, err = lexichash.NewFromFile(fileMask)
113 | if err != nil {
114 | checkError(fmt.Errorf("%s", err))
115 | }
116 |
117 | if outputLog {
118 | log.Infof(" checking passed")
119 | log.Infof("reading masks...")
120 | }
121 |
122 | _k := uint8(lh.K)
123 |
124 | maskChanged := cmd.Flags().Lookup("masks").Changed
125 | if maskChanged {
126 | fmt.Fprintf(outfh, "%d\t%s\n", nMasks, decoder(lh.Masks[nMasks-1], _k))
127 | } else {
128 | for i, code := range lh.Masks {
129 | fmt.Fprintf(outfh, "%d\t%s\n", i+1, decoder(code, _k))
130 | }
131 | }
132 | } else { // re generate
133 | if outputLog {
134 | log.Infof("generating new mask...")
135 | }
136 | lh, err = lexichash.NewWithSeed(k, nMasks, int64(seed), lcPrefix)
137 | checkError(err)
138 |
139 | _k := uint8(lh.K)
140 |
141 | for i, code := range lh.Masks {
142 | fmt.Fprintf(outfh, "%d\t%s\n", i+1, decoder(code, _k))
143 | }
144 | }
145 | },
146 | }
147 |
148 | func init() {
149 | utilsCmd.AddCommand(masksCmd)
150 |
151 | masksCmd.Flags().StringP("index", "d", "",
152 | formatFlagUsage(`Index directory created by "lexicmap index".`))
153 |
154 | masksCmd.Flags().StringP("out-file", "o", "-",
155 | formatFlagUsage(`Out file, supports and recommends a ".gz" suffix ("-" for stdout).`))
156 |
157 | masksCmd.Flags().IntP("kmer", "k", 31,
158 | formatFlagUsage(`Maximum k-mer size. K needs to be <= 32.`))
159 |
160 | masksCmd.Flags().IntP("masks", "m", 40000,
161 | formatFlagUsage(`Number of masks.`))
162 |
163 | masksCmd.Flags().IntP("seed", "s", 1,
164 | formatFlagUsage(`The seed for generating random masks.`))
165 |
166 | masksCmd.Flags().IntP("prefix", "p", 15,
167 | formatFlagUsage(`Length of mask k-mer prefix for checking low-complexity (0 for no checking).`))
168 |
169 | masksCmd.SetUsageTemplate(usageTemplate("{ -d | [-k ] [-n ] [-s ] } [-o out.tsv.gz]"))
170 | }
171 |
--------------------------------------------------------------------------------
/lexicmap/cmd/re-merge.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "os"
26 | "path/filepath"
27 | "regexp"
28 | "sort"
29 | "time"
30 |
31 | "github.com/pkg/errors"
32 | "github.com/shenwei356/LexicMap/lexicmap/cmd/kv"
33 | "github.com/shenwei356/bio/seq"
34 | "github.com/shenwei356/lexichash"
35 | "github.com/shenwei356/util/pathutil"
36 | "github.com/spf13/cobra"
37 | )
38 |
39 | var remergeCmd = &cobra.Command{
40 | Use: "remerge",
41 | Short: "Rerun the merging step for an unfinished index",
42 | Long: `Rerun the merging step for an unfinished index
43 |
44 | When to use this command?
45 |
46 | - Only one thread is used for merging indexes, which happens when there are
47 | a lot (>200 batches) of batches ($inpu_files / --batch-size) and the value
48 | of --max-open-files is not big enough. E.g.,
49 |
50 | 22:54:24.420 [INFO] merging 297 indexes...
51 | 22:54:24.455 [INFO] [round 1]
52 | 22:54:24.455 [INFO] batch 1/1, merging 297 indexes to xxx.lmi.tmp/r1_b1 with 1 threads...
53 |
54 | ► Then you can run this command with a bigger --max-open-files (e.g., 4096) and
55 | -J/--seed-data-threads (e.g., 12. 12 needs be <= 4096/(297+2)=13.7).
56 | And you need to set a bigger 'ulimit -n' if the value of --max-open-files is bigger than 1024.
57 |
58 | - The Slurm/PBS job time limit is almost reached and the merging step won't be finished before that.
59 |
60 | - Disk quota is reached in the merging step.
61 |
62 | `,
63 | Run: func(cmd *cobra.Command, args []string) {
64 | opt := getOptions(cmd)
65 | seq.ValidateSeq = false
66 |
67 | var fhLog *os.File
68 | if opt.Log2File {
69 | fhLog = addLog(opt.LogFile, opt.Verbose)
70 | }
71 |
72 | outputLog := opt.Verbose || opt.Log2File
73 |
74 | timeStart := time.Now()
75 | defer func() {
76 | if outputLog {
77 | log.Info()
78 | log.Infof("elapsed time: %s", time.Since(timeStart))
79 | log.Info()
80 | }
81 | if opt.Log2File {
82 | fhLog.Close()
83 | }
84 | }()
85 |
86 | // ---------------------------------------------------------------
87 |
88 | dbDir := getFlagString(cmd, "index")
89 | if dbDir == "" {
90 | checkError(fmt.Errorf("index directory is need"))
91 | }
92 |
93 | tmpDir := filepath.Clean(dbDir) + ExtTmpDir
94 | ok, err := pathutil.DirExists((tmpDir))
95 | if err != nil {
96 | checkError(fmt.Errorf("index directory is need"))
97 | }
98 | if !ok {
99 | checkError(fmt.Errorf("tmp directory is not found: %s", tmpDir))
100 | }
101 |
102 | mergeThreads := getFlagPositiveInt(cmd, "seed-data-threads")
103 |
104 | maxOpenFiles := getFlagPositiveInt(cmd, "max-open-files")
105 |
106 | // ---------------------------------------------------------------
107 | // check indexes of all batches
108 |
109 | if opt.Verbose || opt.Log2File {
110 | log.Infof("checking indexes ...")
111 | }
112 |
113 | // batch dirs
114 | batchDirs := make([]string, 0, 512)
115 | pattern := regexp.MustCompile(`^batch_\d+$`)
116 | files, err := os.ReadDir(tmpDir)
117 | if err != nil {
118 | checkError(errors.Errorf("failed to read dir: %s", err))
119 | }
120 | for _, file := range files {
121 | if file.Name() == "." || file.Name() == ".." {
122 | continue
123 | }
124 | if file.IsDir() && pattern.MatchString(file.Name()) {
125 | batchDirs = append(batchDirs, filepath.Join(tmpDir, file.Name()))
126 | }
127 | }
128 |
129 | if len(batchDirs) == 0 {
130 | checkError(fmt.Errorf("no indexes found in %s", tmpDir))
131 | } else if opt.Verbose || opt.Log2File {
132 | log.Infof(" %d index directries found in %s", len(batchDirs), tmpDir)
133 | }
134 |
135 | // ---------------------------------------------------------------
136 | // prepare arguments for mergeIndexes
137 |
138 | sort.Strings(batchDirs)
139 | OneIndex := batchDirs[0]
140 |
141 | // lh *lexichash.LexicHash, read from one batch
142 | fileMask := filepath.Join(OneIndex, FileMasks)
143 | ok, err = pathutil.Exists(fileMask)
144 | if err != nil || !ok {
145 | checkError(fmt.Errorf("mask file not found: %s. Was the index merged?", fileMask))
146 | }
147 | var lh *lexichash.LexicHash
148 | lh, err = lexichash.NewFromFile(fileMask)
149 | if err != nil {
150 | checkError(fmt.Errorf("checking mask file: %s", err))
151 | }
152 | // fmt.Println(len(lh.Masks))
153 |
154 | // maskPrefix uint8, anchorPrefix uint8, read from one batch with ReadKVIndexInfo
155 | var maskPrefix, anchorPrefix uint8
156 | fileSeedChunk := filepath.Join(OneIndex, DirSeeds, chunkFile(0))
157 | _, _, _, maskPrefix, anchorPrefix, err = kv.ReadKVIndexInfo(filepath.Clean(fileSeedChunk) + kv.KVIndexFileExt)
158 | if err != nil {
159 | checkError(fmt.Errorf("checking seed information: %s", err))
160 | }
161 | // fmt.Println(maskPrefix, anchorPrefix)
162 |
163 | // kvChunks int, read from one batch, info file
164 | var info *IndexInfo
165 | info, err = readIndexInfo(filepath.Join(OneIndex, FileInfo))
166 | if err != nil {
167 | checkError(fmt.Errorf("failed to open info file: %s", err))
168 | }
169 | kvChunks := info.Chunks
170 | if mergeThreads > kvChunks {
171 | mergeThreads = kvChunks
172 | }
173 |
174 | // opt *IndexBuildingOptions, create one, used: opt.Verbose, opt.Log2File, opt.MaxOpenFiles, opt.MergeThreads
175 | bopt := &IndexBuildingOptions{
176 | // general
177 | NumCPUs: opt.NumCPUs,
178 | Verbose: opt.Verbose,
179 | Log2File: opt.Log2File,
180 | MaxOpenFiles: maxOpenFiles,
181 | MergeThreads: mergeThreads,
182 | }
183 |
184 | // outdir string, dbDir
185 | // paths []string, batchDirs
186 | // tmpDir string, tmpDir
187 | // round int 1
188 |
189 | err = mergeIndexes(lh, maskPrefix, anchorPrefix, bopt, kvChunks, dbDir, batchDirs, tmpDir, 1)
190 | if err != nil {
191 | checkError(fmt.Errorf("failed to merge indexes: %s", err))
192 | }
193 |
194 | // clean tmp dir
195 | err = os.RemoveAll(tmpDir)
196 | if err != nil {
197 | checkError(fmt.Errorf("failed to remove tmp directory: %s", err))
198 | }
199 | },
200 | }
201 |
// init registers the remerge subcommand and its flags.
func init() {
	utilsCmd.AddCommand(remergeCmd)

	remergeCmd.Flags().StringP("index", "d", "",
		formatFlagUsage(`Index directory created by "lexicmap index".`))

	// -J: parallelism of the merge step itself
	remergeCmd.Flags().IntP("seed-data-threads", "J", 8,
		formatFlagUsage(`Number of threads for writing seed data and merging seed chunks from all batches, the value should be in range of [1, -c/--chunks]. If there are >100 batches, please also increase the value of --max-open-files and set a bigger "ulimit -n" in shell.`))

	// file-descriptor budget for the merge; must stay below 'ulimit -n'
	remergeCmd.Flags().IntP("max-open-files", "", 1024,
		formatFlagUsage(`Maximum opened files, used in merging indexes. If there are >100 batches, please increase this value and set a bigger "ulimit -n" in shell.`))

	remergeCmd.SetUsageTemplate(usageTemplate("[flags] -d "))
}
216 |
--------------------------------------------------------------------------------
/lexicmap/cmd/recount-bases.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "path/filepath"
26 | "sync"
27 | "time"
28 |
29 | "github.com/dustin/go-humanize"
30 | "github.com/shenwei356/LexicMap/lexicmap/cmd/genome"
31 | "github.com/shenwei356/bio/seq"
32 | "github.com/spf13/cobra"
33 | )
34 |
35 | var countbasesCmd = &cobra.Command{
36 | Use: "recount-bases",
37 | Short: "Recount bases for index version <=3.2",
38 | Long: `Recount bases for index version <=3.2
39 |
40 | This command is only needed for indexes created by LexicMap v0.6.0 (3c257ca) or before versions.
41 |
42 | `,
43 | Run: func(cmd *cobra.Command, args []string) {
44 | opt := getOptions(cmd)
45 | seq.ValidateSeq = false
46 |
47 | // ------------------------------
48 |
49 | dbDir := getFlagString(cmd, "index")
50 | if dbDir == "" {
51 | checkError(fmt.Errorf("flag -d/--index needed"))
52 | }
53 |
54 | // info file
55 | fileInfo := filepath.Join(dbDir, FileInfo)
56 | info, err := readIndexInfo(fileInfo)
57 | if err != nil {
58 | checkError(fmt.Errorf("failed to read info file: %s", err))
59 | }
60 | if info.MainVersion != MainVersion {
61 | checkError(fmt.Errorf("index main versions do not match: %d (index) != %d (tool). please re-create the index", info.MainVersion, MainVersion))
62 | }
63 |
64 | var startTime time.Time
65 |
66 | old := info.InputBases
67 | totalBases, err := updateInputBases(info, dbDir, opt.NumCPUs)
68 | checkError(err)
69 |
70 | if opt.Verbose {
71 | fmt.Printf("update input bases from %d to %s in %s\n", old, humanize.Comma(totalBases), startTime)
72 | }
73 | },
74 | }
75 |
// init would register the recount-bases subcommand.
// NOTE(review): the AddCommand call is commented out, so this subcommand is
// currently not exposed — confirm whether this is intentional.
func init() {
	// utilsCmd.AddCommand(countbasesCmd)

	countbasesCmd.Flags().StringP("index", "d", "",
		formatFlagUsage(`Index directory created by "lexicmap index".`))

	countbasesCmd.SetUsageTemplate(usageTemplate(""))
}
84 |
// updateInputBases recounts the total number of bases over all genome batches
// of the index, writes the new value into the index information file, and
// returns it. At most 'threads' batches are read concurrently.
func updateInputBases(info *IndexInfo, dbDir string, threads int) (int64, error) {
	// collector goroutine: sums per-batch base counts sent on ch,
	// then signals on done after ch is closed and drained
	var totalBases int64
	ch := make(chan int64, threads)
	done := make(chan int)
	go func() {
		for b := range ch {
			totalBases += b
		}
		done <- 1
	}()

	// one worker per genome batch; tokens limits concurrency to 'threads'
	var wg sync.WaitGroup
	tokens := make(chan int, threads)
	for i := 0; i < info.GenomeBatches; i++ {
		wg.Add(1)
		tokens <- 1
		go func(i int) {
			fileGenomes := filepath.Join(dbDir, DirGenomes, batchDir(i), FileGenomes)
			rdr, err := genome.NewReader(fileGenomes)
			if err != nil {
				// checkError exits the program, so no value is sent on failure
				checkError(fmt.Errorf("failed to create genome reader: %s", err))
			}

			_totalBases, err := rdr.TotalBases()
			if err != nil {
				checkError(fmt.Errorf("failed to check total bases for %s: %s", fileGenomes, err))
			}

			ch <- _totalBases

			wg.Done()
			<-tokens
		}(i)
	}
	wg.Wait()
	close(ch) // all workers finished; let the collector drain and exit
	<-done    // wait for the final sum

	// update info file with the recounted value
	info.InputBases = totalBases

	err := writeIndexInfo(filepath.Join(dbDir, FileInfo), info)
	if err != nil {
		return 0, (fmt.Errorf("failed to write info file: %s", err))
	}

	return totalBases, err
}
135 |
--------------------------------------------------------------------------------
/lexicmap/cmd/reindex-seeds.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "os"
26 | "path/filepath"
27 | "sync"
28 | "time"
29 |
30 | "github.com/shenwei356/LexicMap/lexicmap/cmd/kv"
31 | "github.com/shenwei356/bio/seq"
32 | "github.com/spf13/cobra"
33 | "github.com/vbauerster/mpb/v8"
34 | "github.com/vbauerster/mpb/v8/decor"
35 | )
36 |
// reindexSeedsCmd re-creates the indexes of all seed (k-mer-value) data chunk
// files of an existing index, with a configurable number of partitions, and
// updates the Partitions field of the index information file.
var reindexSeedsCmd = &cobra.Command{
	Use:   "reindex-seeds",
	Short: "Recreate indexes of k-mer-value (seeds) data",
	Long: `Recreate indexes of k-mer-value (seeds) data

`,
	Run: func(cmd *cobra.Command, args []string) {
		opt := getOptions(cmd)
		seq.ValidateSeq = false

		// ------------------------------

		dbDir := getFlagString(cmd, "index")
		if dbDir == "" {
			checkError(fmt.Errorf("flag -d/--index needed"))
		}

		partitions := getFlagPositiveInt(cmd, "partitions")

		// ---------------------------------------------------------------

		if opt.Verbose {
			log.Infof("recreating seed indexes with %d partitions for: %s", partitions, dbDir)
		}

		// info file, for the number of seed-data chunks
		fileInfo := filepath.Join(dbDir, FileInfo)
		info, err := readIndexInfo(fileInfo)
		if err != nil {
			checkError(fmt.Errorf("failed to read info file: %s", err))
		}

		// ---------------------------------------------------------------

		timeStart := time.Now()
		defer func() {
			if opt.Verbose {
				log.Info()
				log.Infof("elapsed time: %s", time.Since(timeStart))
				log.Info()
			}
		}()

		showProgressBar := opt.Verbose

		// progress bar: workers send per-job durations to chDuration for ETA
		// estimation; doneDuration signals when all durations are consumed
		var pbs *mpb.Progress
		var bar *mpb.Bar
		var chDuration chan time.Duration
		var doneDuration chan int
		if showProgressBar {
			pbs = mpb.New(mpb.WithWidth(40), mpb.WithOutput(os.Stderr))
			bar = pbs.AddBar(int64(info.Chunks),
				mpb.PrependDecorators(
					decor.Name("processed files: ", decor.WC{W: len("processed files: "), C: decor.DindentRight}),
					decor.Name("", decor.WCSyncSpaceR),
					decor.CountersNoUnit("%d / %d", decor.WCSyncWidth),
				),
				mpb.AppendDecorators(
					decor.Name("ETA: ", decor.WC{W: len("ETA: ")}),
					decor.EwmaETA(decor.ET_STYLE_GO, 3),
					decor.OnComplete(decor.Name(""), ". done"),
				),
			)

			chDuration = make(chan time.Duration, opt.NumCPUs)
			doneDuration = make(chan int)
			go func() {
				for t := range chDuration {
					bar.EwmaIncrBy(1, t)
				}
				doneDuration <- 1
			}()
		}

		// re-index every seed chunk file concurrently,
		// with at most NumCPUs jobs running at the same time
		var wg sync.WaitGroup
		tokens := make(chan int, opt.NumCPUs)
		threadsFloat := float64(opt.NumCPUs)
		for chunk := 0; chunk < info.Chunks; chunk++ {
			file := filepath.Join(dbDir, DirSeeds, chunkFile(chunk))
			wg.Add(1)
			tokens <- 1

			go func(file string) {
				timeStart := time.Now()
				err := kv.CreateKVIndex(file, partitions)
				checkError(err)
				if showProgressBar {
					// divide by thread count so the EWMA reflects wall-clock progress
					chDuration <- time.Duration(float64(time.Since(timeStart)) / threadsFloat)
				}
				<-tokens
				wg.Done()
			}(file)
		}
		wg.Wait()

		if showProgressBar {
			close(chDuration)
			<-doneDuration
			pbs.Wait()
		}

		// persist the new partition count in the index information file
		if opt.Verbose {
			log.Infof("update index information file: %s", fileInfo)
		}
		info.Partitions = partitions
		err = writeIndexInfo(fileInfo, info)
		if err != nil {
			checkError(fmt.Errorf("failed to write info file: %s", err))
		}
		if opt.Verbose {
			log.Infof("  finished updating the index information file: %s", fileInfo)
		}
	},
}
152 |
153 | func init() {
154 | utilsCmd.AddCommand(reindexSeedsCmd)
155 |
156 | reindexSeedsCmd.Flags().StringP("index", "d", "",
157 | formatFlagUsage(`Index directory created by "lexicmap index".`))
158 | reindexSeedsCmd.Flags().IntP("partitions", "", 4096,
159 | formatFlagUsage(`Number of partitions for re-indexing seeds (k-mer-value data) files. The value needs to be the power of 4.`))
160 |
161 | reindexSeedsCmd.SetUsageTemplate(usageTemplate(""))
162 | }
163 |
--------------------------------------------------------------------------------
/lexicmap/cmd/root.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "os"
26 | "runtime"
27 |
28 | "github.com/spf13/cobra"
29 | )
30 |
// RootCmd represents the base command when called without any subcommands.
// Subcommands (index, search, utils, ...) attach themselves to it in their
// own init() functions elsewhere in this package.
var RootCmd = &cobra.Command{
	Use:   "lexicmap",
	Short: "efficient sequence alignment against millions of prokaryotic genomes",
	// Long is the text shown by `lexicmap --help`; VERSION is declared
	// elsewhere in this package.
	Long: fmt.Sprintf(`
LexicMap: efficient sequence alignment against millions of prokaryotic genomes

 Version: v%s
 Documents: https://bioinf.shenwei.me/LexicMap
 Source code: https://github.com/shenwei356/LexicMap

`, VERSION),
}
44 |
45 | // Execute adds all child commands to the root command sets flags appropriately.
46 | // This is called by main.main(). It only needs to happen once to the rootCmd.
47 | func Execute() {
48 | if err := RootCmd.Execute(); err != nil {
49 | fmt.Println(err)
50 | os.Exit(-1)
51 | }
52 | }
53 |
54 | func init() {
55 |
56 | defaultThreads := runtime.NumCPU()
57 |
58 | RootCmd.PersistentFlags().IntP("threads", "j", defaultThreads,
59 | formatFlagUsage("Number of CPU cores to use. By default, it uses all available cores."))
60 |
61 | // RootCmd.PersistentFlags().BoolP("verbose", "", false, "print verbose information (recommended)")
62 |
63 | RootCmd.PersistentFlags().BoolP("quiet", "", false,
64 | formatFlagUsage("Do not print any verbose information. But you can write them to a file with --log."))
65 |
66 | RootCmd.PersistentFlags().StringP("infile-list", "X", "",
67 | formatFlagUsage("File of input file list (one file per line). If given, they are appended to files from CLI arguments."))
68 |
69 | RootCmd.PersistentFlags().StringP("log", "", "", formatFlagUsage("Log file."))
70 |
71 | RootCmd.CompletionOptions.DisableDefaultCmd = true
72 |
73 | RootCmd.SetHelpCommand(&cobra.Command{Hidden: true})
74 |
75 | RootCmd.SetUsageTemplate(usageTemplate(""))
76 | }
77 |
// formatFlagUsage prefixes a flag description with a "►" marker so that
// flag descriptions stand out in the generated help text.
func formatFlagUsage(s string) string {
	return fmt.Sprintf("► %s", s)
}
81 |
82 | func usageTemplate(s string) string {
83 | return fmt.Sprintf(`Usage:{{if .Runnable}}
84 | {{.UseLine}}{{end}}{{if .HasAvailableSubCommands}}
85 | {{.CommandPath}} [command]{{end}} %s{{if gt (len .Aliases) 0}}
86 |
87 | Aliases:
88 | {{.NameAndAliases}}{{end}}{{if .HasExample}}
89 |
90 | Examples:
91 | {{.Example}}{{end}}{{if .HasAvailableSubCommands}}{{$cmds := .Commands}}{{if eq (len .Groups) 0}}
92 |
93 | Available Commands:{{range $cmds}}{{if (or .IsAvailableCommand (eq .Name "help"))}}
94 | {{rpad .Name .NamePadding }} {{.Short}}{{end}}{{end}}{{else}}{{range $group := .Groups}}
95 |
96 | {{.Title}}{{range $cmds}}{{if (and (eq .GroupID $group.ID) (or .IsAvailableCommand (eq .Name "help")))}}
97 | {{rpad .Name .NamePadding }} {{.Short}}{{end}}{{end}}{{end}}{{if not .AllChildCommandsHaveGroup}}
98 |
99 | Additional Commands:{{range $cmds}}{{if (and (eq .GroupID "") (or .IsAvailableCommand (eq .Name "help")))}}
100 | {{rpad .Name .NamePadding }} {{.Short}}{{end}}{{end}}{{end}}{{end}}{{end}}{{if .HasAvailableLocalFlags}}
101 |
102 | Flags:
103 | {{.LocalFlags.FlagUsagesWrapped 110 | trimTrailingWhitespaces}}{{end}}{{if .HasAvailableInheritedFlags}}
104 |
105 | Global Flags:
106 | {{.InheritedFlags.FlagUsagesWrapped 110 | trimTrailingWhitespaces}}{{end}}{{if .HasHelpSubCommands}}
107 |
108 | Additional help topics:{{range .Commands}}{{if .IsAdditionalHelpTopicCommand}}
109 | {{rpad .CommandPath .CommandPathPadding}} {{.Short}}{{end}}{{end}}{{end}}{{if .HasAvailableSubCommands}}
110 |
111 | Use "{{.CommandPath}} [command] --help" for more information about a command.{{end}}
112 | `, s)
113 | }
114 |
--------------------------------------------------------------------------------
/lexicmap/cmd/seedposition/seed_position_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package seedposition
22 |
23 | import (
24 | "math/rand"
25 | "os"
26 | "testing"
27 | )
28 |
29 | func TestSeedPositions(t *testing.T) {
30 | tests := [][]uint32{
31 | {},
32 | {1},
33 | {1, 15},
34 | {1, 15, 300},
35 | {1, 15, 300, 301},
36 | {1, 15, 300, 301, 2500},
37 | {1, 15, 300, 301, 2500, 3100},
38 | {1, 15, 300, 301, 2500, 3100, 3111},
39 | {1, 15, 300, 301, 2500, 3100, 3111, 5000},
40 | {1, 15, 300, 301, 2500, 3100, 3111, 5000, 10000},
41 | }
42 |
43 | file := "test.bin"
44 |
45 | // ---------------------------------------
46 |
47 | wtr, err := NewWriter(file, 0)
48 | if err != nil {
49 | t.Error(err)
50 | return
51 | }
52 | for i, test := range tests {
53 | err = wtr.Write(test)
54 | if err != nil {
55 | t.Errorf("write #%d data: %s", i+1, err)
56 | return
57 | }
58 | }
59 | err = wtr.Close()
60 | if err != nil {
61 | t.Error(err)
62 | return
63 | }
64 |
65 | idxs := make([]int, len(tests))
66 | for i := range tests {
67 | idxs[i] = i
68 | }
69 | rand.Shuffle(len(tests), func(i, j int) { idxs[i], idxs[j] = idxs[j], idxs[i] })
70 |
71 | // ---------------------------------------
72 |
73 | rdr, err := NewReader(file)
74 | if err != nil {
75 | t.Error(err)
76 | return
77 | }
78 |
79 | locs := make([]uint32, 64)
80 | var test []uint32
81 | var v uint32
82 | var j int
83 |
84 | for _, i := range idxs {
85 | test = tests[i]
86 | err = rdr.SeedPositions(i, &locs)
87 | if err != nil {
88 | t.Errorf("read #%d data: %s", i, err)
89 | return
90 | }
91 |
92 | if len(locs) != len(test) {
93 | t.Errorf("[#%d] unequal of position numbers, expected: %d, returned %d",
94 | i, len(test), len(locs))
95 | return
96 | }
97 |
98 | for j, v = range locs {
99 | if v != test[j] {
100 | t.Errorf("[#%d] unequal of positions, expected: %d, returned %d", i, test[j], v)
101 | return
102 | }
103 | }
104 | }
105 |
106 | // clean up
107 |
108 | err = os.RemoveAll(file)
109 | if err != nil {
110 | t.Error(err)
111 | return
112 | }
113 |
114 | err = os.RemoveAll(file + PositionsIndexFileExt)
115 | if err != nil {
116 | t.Error(err)
117 | return
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/lexicmap/cmd/subseq.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "path/filepath"
26 | "regexp"
27 | "strconv"
28 | "strings"
29 |
30 | "github.com/shenwei356/LexicMap/lexicmap/cmd/genome"
31 | "github.com/shenwei356/bio/seq"
32 | "github.com/spf13/cobra"
33 | )
34 |
35 | var subseqCmd = &cobra.Command{
36 | Use: "subseq",
37 | Short: "Extract subsequence via reference name, sequence ID, position and strand",
38 | Long: `Exextract subsequence via reference name, sequence ID, position and strand
39 |
40 | Attention:
41 | 1. The option -s/--seq-id is optional.
42 | 1) If given, the positions are these in the original sequence.
43 | 2) If not given, the positions are these in the concatenated sequence.
44 | 2. All degenerate bases in reference genomes were converted to the lexicographic first bases.
45 | E.g., N was converted to A. Therefore, consecutive A's in output might be N's in the genomes.
46 |
47 | `,
48 | Run: func(cmd *cobra.Command, args []string) {
49 | opt := getOptions(cmd)
50 | seq.ValidateSeq = false
51 |
52 | // ------------------------------
53 |
54 | dbDir := getFlagString(cmd, "index")
55 | if dbDir == "" {
56 | checkError(fmt.Errorf("flag -d/--index needed"))
57 | }
58 |
59 | refname := getFlagString(cmd, "ref-name")
60 | if refname == "" {
61 | checkError(fmt.Errorf("flag -n/--ref-name needed"))
62 | }
63 |
64 | seqid := getFlagString(cmd, "seq-id")
65 | var concatenatedPositions bool
66 | if seqid == "" {
67 | concatenatedPositions = true
68 | }
69 |
70 | var reRegion = regexp.MustCompile(`\-?\d+:\-?\d+`)
71 |
72 | region := getFlagString(cmd, "region")
73 | if region == "" {
74 | checkError(fmt.Errorf("flag -r/--region needed"))
75 | }
76 | revcom := getFlagBool(cmd, "revcom")
77 |
78 | lineWidth := getFlagNonNegativeInt(cmd, "line-width")
79 |
80 | if !reRegion.MatchString(region) {
81 | checkError(fmt.Errorf(`invalid region: %s. type "lexicmap utils subseq -h" for more examples`, region))
82 | }
83 | var start, end int
84 | var err error
85 |
86 | r := strings.Split(region, ":")
87 | start, err = strconv.Atoi(r[0])
88 | checkError(err)
89 | end, err = strconv.Atoi(r[1])
90 | checkError(err)
91 | if start <= 0 || end <= 0 {
92 | checkError(fmt.Errorf("both begin and end position should not be <= 0"))
93 | }
94 | if start > end {
95 | checkError(fmt.Errorf("begin position should be < end position"))
96 | }
97 |
98 | outFile := getFlagString(cmd, "out-file")
99 |
100 | // ---------------------------------------------------------------
101 |
102 | // genomes.map file for mapping index to genome id
103 | m, err := readGenomeMapName2Idx(filepath.Join(dbDir, FileGenomeIndex))
104 | if err != nil {
105 | checkError(fmt.Errorf("failed to read genomes index mapping file: %s", err))
106 | }
107 |
108 | var batchIDAndRefIDs *[]uint64
109 |
110 | var ok bool
111 | if batchIDAndRefIDs, ok = m[refname]; !ok {
112 | checkError(fmt.Errorf("reference name not found: %s", refname))
113 | }
114 |
115 | var tSeq *genome.Genome
116 | var genomeBatch, genomeIdx int
117 | var rdr *genome.Reader
118 |
119 | var _end int
120 |
121 | for _, batchIDAndRefID := range *batchIDAndRefIDs {
122 | genomeBatch = int(batchIDAndRefID >> BITS_GENOME_IDX)
123 | genomeIdx = int(batchIDAndRefID & MASK_GENOME_IDX)
124 |
125 | fileGenome := filepath.Join(dbDir, DirGenomes, batchDir(genomeBatch), FileGenomes)
126 | rdr, err = genome.NewReader(fileGenome)
127 | if err != nil {
128 | checkError(fmt.Errorf("failed to read genome data file: %s", err))
129 | }
130 |
131 | if concatenatedPositions {
132 | tSeq, err = rdr.SubSeq(genomeIdx, start-1, end-1)
133 | } else {
134 | tSeq, _end, err = rdr.SubSeq2(genomeIdx, []byte(seqid), start-1, end-1)
135 | _end++ // returned end is 0-based.
136 | }
137 | if err == nil {
138 | break
139 | // checkError(fmt.Errorf("failed to read subsequence: %s", err))
140 | }
141 | }
142 | if err != nil {
143 | checkError(fmt.Errorf("failed to read subsequence: %s", err))
144 | }
145 |
146 | end = _end // update end
147 |
148 | // output file handler
149 | outfh, gw, w, err := outStream(outFile, strings.HasSuffix(outFile, ".gz"), opt.CompressionLevel)
150 | checkError(err)
151 | defer func() {
152 | outfh.Flush()
153 | if gw != nil {
154 | gw.Close()
155 | }
156 | w.Close()
157 | }()
158 |
159 | s, err := seq.NewSeq(seq.DNAredundant, tSeq.Seq)
160 | checkError(err)
161 |
162 | strand := "+"
163 | if revcom {
164 | strand = "-"
165 | s.RevComInplace()
166 | }
167 |
168 | if concatenatedPositions {
169 | fmt.Fprintf(outfh, ">%s:%d-%d:%s\n", refname, start, end, strand)
170 | } else {
171 | fmt.Fprintf(outfh, ">%s:%d-%d:%s\n", seqid, start, end, strand)
172 | }
173 | outfh.Write(s.FormatSeq(lineWidth))
174 | outfh.WriteByte('\n')
175 |
176 | genome.RecycleGenome(tSeq)
177 | checkError(rdr.Close())
178 | },
179 | }
180 |
181 | func init() {
182 | utilsCmd.AddCommand(subseqCmd)
183 |
184 | subseqCmd.Flags().StringP("index", "d", "",
185 | formatFlagUsage(`Index directory created by "lexicmap index".`))
186 |
187 | subseqCmd.Flags().StringP("ref-name", "n", "",
188 | formatFlagUsage(`Reference name.`))
189 |
190 | subseqCmd.Flags().StringP("seq-id", "s", "",
191 | formatFlagUsage(`Sequence ID. If the value is empty, the positions in the region are treated as that in the concatenated sequence.`))
192 |
193 | subseqCmd.Flags().StringP("out-file", "o", "-",
194 | formatFlagUsage(`Out file, supports the ".gz" suffix ("-" for stdout).`))
195 |
196 | subseqCmd.Flags().StringP("region", "r", "",
197 | formatFlagUsage(`Region of the subsequence (1-based).`))
198 |
199 | subseqCmd.Flags().BoolP("revcom", "R", false,
200 | formatFlagUsage("Extract subsequence on the negative strand."))
201 |
202 | subseqCmd.Flags().IntP("line-width", "w", 60,
203 | formatFlagUsage("Line width of sequence (0 for no wrap)."))
204 |
205 | subseqCmd.SetUsageTemplate(usageTemplate(""))
206 | }
207 |
--------------------------------------------------------------------------------
/lexicmap/cmd/tree/tree_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package tree
22 |
23 | import (
24 | "testing"
25 |
26 | "github.com/shenwei356/kmers"
27 | "github.com/shenwei356/lexichash"
28 | )
29 |
30 | func TestTree(t *testing.T) {
31 | var k uint8
32 | var n uint64
33 | var i uint64
34 | var v uint32
35 | var query string
36 | var code uint64
37 | var srs *[]*SearchResult
38 |
39 | for j := 0; j < 1000; j++ {
40 | k = 6
41 | n = uint64(1 << (k * 2))
42 |
43 | _t := NewTree(uint8(k))
44 |
45 | for i = 0; i < n; i++ {
46 | v = uint32(i & 3)
47 | if v == 3 || v == 0 {
48 | continue
49 | }
50 | _t.Insert(i, v)
51 | }
52 |
53 | query = "ACTGAC"
54 | code, _ = kmers.Encode([]byte(query))
55 | // srs, _ := tree.Search(code, uint8(len(query)), 4)
56 | srs, _ = _t.Search(code, 5)
57 | t.Logf("query: %s\n", query)
58 | for _, sr := range *srs {
59 | t.Logf(" %s, len(prefix): %d, %v\n",
60 | lexichash.MustDecode(sr.Kmer, k), sr.LenPrefix, sr.Values)
61 | }
62 | _t.RecycleSearchResult(srs)
63 |
64 | RecycleTree(_t)
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util-cli.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "bufio"
25 | "fmt"
26 | "os"
27 | "strconv"
28 | "strings"
29 |
30 | "github.com/pkg/errors"
31 | "github.com/shenwei356/util/stringutil"
32 | "github.com/spf13/cobra"
33 | )
34 |
35 | func checkError(err error) {
36 | if err != nil {
37 | log.Error(err)
38 | os.Exit(-1)
39 | }
40 | }
41 |
// isStdin reports whether file is the conventional "-" placeholder
// for standard input.
func isStdin(file string) bool {
	const placeholder = "-"
	return file == placeholder
}
45 |
// isStdout reports whether file is the conventional "-" placeholder
// for standard output.
func isStdout(file string) bool {
	const placeholder = "-"
	return file == placeholder
}
49 |
50 | func getFlagInt(cmd *cobra.Command, flag string) int {
51 | value, err := cmd.Flags().GetInt(flag)
52 | checkError(err)
53 | return value
54 | }
55 |
56 | func getFlagIntSlice(cmd *cobra.Command, flag string) []int {
57 | value, err := cmd.Flags().GetIntSlice(flag)
58 | checkError(err)
59 | return value
60 | }
61 |
62 | func getFlagUint8(cmd *cobra.Command, flag string) uint8 {
63 | value, err := cmd.Flags().GetUint8(flag)
64 | checkError(err)
65 | return value
66 | }
67 |
68 | func getFlagUint32(cmd *cobra.Command, flag string) uint32 {
69 | value, err := cmd.Flags().GetUint32(flag)
70 | checkError(err)
71 | return value
72 | }
73 |
74 | func getFlagUint64(cmd *cobra.Command, flag string) uint64 {
75 | value, err := cmd.Flags().GetUint64(flag)
76 | checkError(err)
77 | return value
78 | }
79 |
80 | func getFlagPositiveInt(cmd *cobra.Command, flag string) int {
81 | value, err := cmd.Flags().GetInt(flag)
82 | checkError(err)
83 | if value <= 0 {
84 | checkError(fmt.Errorf("value of flag --%s should be greater than 0", flag))
85 | }
86 | return value
87 | }
88 |
89 | func getFlagPositiveFloat64(cmd *cobra.Command, flag string) float64 {
90 | value, err := cmd.Flags().GetFloat64(flag)
91 | checkError(err)
92 | if value <= 0 {
93 | checkError(fmt.Errorf("value of flag --%s should be greater than 0", flag))
94 | }
95 | return value
96 | }
97 |
98 | func getFlagNonNegativeInt(cmd *cobra.Command, flag string) int {
99 | value, err := cmd.Flags().GetInt(flag)
100 | checkError(err)
101 | if value < 0 {
102 | checkError(fmt.Errorf("value of flag --%s should be greater than or equal to 0", flag))
103 | }
104 | return value
105 | }
106 |
107 | func getFlagNonNegativeFloat64(cmd *cobra.Command, flag string) float64 {
108 | value, err := cmd.Flags().GetFloat64(flag)
109 | checkError(err)
110 | if value < 0 {
111 | checkError(fmt.Errorf("value of flag --%s should be greater than or equal to ", flag))
112 | }
113 | return value
114 | }
115 |
116 | func getFlagBool(cmd *cobra.Command, flag string) bool {
117 | value, err := cmd.Flags().GetBool(flag)
118 | checkError(err)
119 | return value
120 | }
121 |
122 | func getFlagString(cmd *cobra.Command, flag string) string {
123 | value, err := cmd.Flags().GetString(flag)
124 | checkError(err)
125 | return value
126 | }
127 |
128 | func getFlagNonEmptyString(cmd *cobra.Command, flag string) string {
129 | value, err := cmd.Flags().GetString(flag)
130 | checkError(err)
131 | if value == "" {
132 | checkError(fmt.Errorf("flag --%s needed", flag))
133 | }
134 | return value
135 | }
136 |
137 | func getFlagCommaSeparatedStrings(cmd *cobra.Command, flag string) []string {
138 | value, err := cmd.Flags().GetString(flag)
139 | checkError(err)
140 | return stringutil.Split(value, ",")
141 | }
142 |
143 | func getFlagSemicolonSeparatedStrings(cmd *cobra.Command, flag string) []string {
144 | value, err := cmd.Flags().GetString(flag)
145 | checkError(err)
146 | return stringutil.Split(value, ";")
147 | }
148 |
149 | func getFlagCommaSeparatedInts(cmd *cobra.Command, flag string) []int {
150 | filedsStrList := getFlagCommaSeparatedStrings(cmd, flag)
151 | fields := make([]int, len(filedsStrList))
152 | for i, value := range filedsStrList {
153 | v, err := strconv.Atoi(value)
154 | if err != nil {
155 | checkError(fmt.Errorf("value of flag --%s should be comma separated integers", flag))
156 | }
157 | fields[i] = v
158 | }
159 | return fields
160 | }
161 |
162 | func getFlagRune(cmd *cobra.Command, flag string) rune {
163 | value, err := cmd.Flags().GetString(flag)
164 | checkError(err)
165 | if len(value) > 1 {
166 | checkError(fmt.Errorf("value of flag --%s should has length of 1", flag))
167 | }
168 | var v rune
169 | for _, r := range value {
170 | v = r
171 | break
172 | }
173 | return v
174 | }
175 |
176 | func getFlagFloat64(cmd *cobra.Command, flag string) float64 {
177 | value, err := cmd.Flags().GetFloat64(flag)
178 | checkError(err)
179 | return value
180 | }
181 |
182 | func getFlagInt64(cmd *cobra.Command, flag string) int64 {
183 | value, err := cmd.Flags().GetInt64(flag)
184 | checkError(err)
185 | return value
186 | }
187 |
188 | func getFlagStringSlice(cmd *cobra.Command, flag string) []string {
189 | value, err := cmd.Flags().GetStringSlice(flag)
190 | checkError(err)
191 | return value
192 | }
193 |
194 | func getFileList(args []string, checkFile bool) []string {
195 | files := make([]string, 0, 1024)
196 | if len(args) == 0 {
197 | files = append(files, "-")
198 | } else {
199 | for _, file := range args {
200 | if isStdin(file) {
201 | continue
202 | }
203 | if !checkFile {
204 | continue
205 | }
206 | if _, err := os.Stat(file); os.IsNotExist(err) {
207 | checkError(errors.Wrap(err, file))
208 | }
209 | }
210 | files = args
211 | }
212 | return files
213 | }
214 |
215 | func getFileListFromFile(file string, checkFile bool) ([]string, error) {
216 | var fh *os.File
217 | var err error
218 | if file == "-" {
219 | fh = os.Stdin
220 | } else {
221 | fh, err = os.Open(file)
222 | if err != nil {
223 | return nil, fmt.Errorf("read file list from '%s': %s", file, err)
224 | }
225 | }
226 |
227 | var _file string
228 | lists := make([]string, 0, 1024)
229 | scanner := bufio.NewScanner(fh)
230 | for scanner.Scan() {
231 | _file = scanner.Text()
232 | if strings.TrimSpace(_file) == "" {
233 | continue
234 | }
235 | lists = append(lists, _file)
236 | }
237 | if err = scanner.Err(); err != nil {
238 | return nil, fmt.Errorf("read file list from '%s': %s", file, err)
239 | }
240 |
241 | if !checkFile {
242 | return lists, fh.Close()
243 | }
244 |
245 | for _, _file = range lists {
246 | if !isStdin(_file) {
247 | if _, err = os.Stat(_file); os.IsNotExist(err) {
248 | return lists, fmt.Errorf("check file '%s': %s", _file, err)
249 | }
250 | }
251 | }
252 |
253 | return lists, fh.Close()
254 | }
255 |
256 | func getFileListFromArgsAndFile(cmd *cobra.Command, args []string, checkFileFromArgs bool, flag string, checkFileFromFile bool) []string {
257 | infileList := getFlagString(cmd, flag)
258 | files := getFileList(args, checkFileFromArgs)
259 | if infileList != "" {
260 | _files, err := getFileListFromFile(infileList, checkFileFromFile)
261 | checkError(err)
262 | if len(_files) == 0 {
263 | log.Warningf("no files found in file list: %s", infileList)
264 | return files
265 | }
266 |
267 | if len(files) == 1 && isStdin(files[0]) {
268 | return _files
269 | }
270 | files = append(files, _files...)
271 | }
272 | return files
273 | }
274 |
// ParseByteSize parses a byte size from a string like "100", "1K", "1.5M",
// "2G" or "1T" (unit letters are case-insensitive; "B"/"b" means bytes).
// Units are binary: K = 1024, M = 1024², etc. Blank input and a bare unit
// letter yield 0; negative sizes are clamped to 0.
func ParseByteSize(val string) (int64, error) {
	val = strings.Trim(val, " \t\r\n")
	if val == "" {
		return 0, nil
	}

	// determine the multiplier from the trailing unit character
	var u int64
	var noUnit bool
	switch val[len(val)-1] {
	case 'B', 'b':
		u = 1
	case 'K', 'k':
		u = 1 << 10
	case 'M', 'm':
		u = 1 << 20
	case 'G', 'g':
		u = 1 << 30
	case 'T', 't':
		u = 1 << 40
	default:
		noUnit = true
		u = 1
	}

	digits := val
	if !noUnit {
		if len(val) == 1 { // a bare unit letter with no numeric part
			return 0, nil
		}
		digits = strings.Trim(val[0:len(val)-1], " \t\r\n")
	}

	// strconv.ParseFloat requires bitSize 32 or 64; the previous value of 10
	// was silently treated as 64.
	size, err := strconv.ParseFloat(digits, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid byte size: %s", val)
	}
	if size < 0 {
		size = 0
	}
	return int64(size * float64(u)), nil
}
324 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util-io.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "bufio"
25 | "errors"
26 | "fmt"
27 | "io"
28 | "os"
29 | "path/filepath"
30 |
31 | gzip "github.com/klauspost/pgzip"
32 | )
33 |
// BufferSize is the size, in bytes, of the bufio buffers created by
// outStream and inStream.
var BufferSize = 65536 // os.Getpagesize()
36 |
37 | func outStream(file string, gzipped bool, level int) (*bufio.Writer, io.WriteCloser, *os.File, error) {
38 | var w *os.File
39 | if file == "-" {
40 | w = os.Stdout
41 | } else {
42 | dir := filepath.Dir(file)
43 | fi, err := os.Stat(dir)
44 | if err == nil && !fi.IsDir() {
45 | return nil, nil, nil, fmt.Errorf("can not write file into a non-directory path: %s", dir)
46 | }
47 | if os.IsNotExist(err) {
48 | os.MkdirAll(dir, 0755)
49 | }
50 |
51 | w, err = os.Create(file)
52 | if err != nil {
53 | return nil, nil, nil, fmt.Errorf("fail to write %s: %s", file, err)
54 | }
55 | }
56 |
57 | if gzipped {
58 | // gw := gzip.NewWriter(w)
59 | gw, err := gzip.NewWriterLevel(w, level)
60 | if err != nil {
61 | return nil, nil, nil, fmt.Errorf("fail to write %s: %s", file, err)
62 | }
63 | return bufio.NewWriterSize(gw, BufferSize), gw, w, nil
64 | }
65 | return bufio.NewWriterSize(w, BufferSize), nil, w, nil
66 | }
67 |
// inStream opens file for reading ("-" means stdin) and wraps it in a
// buffered reader, transparently decompressing gzip input. It returns the
// buffered reader, the underlying *os.File (which the caller must close),
// whether the input was gzip-compressed, and any error.
func inStream(file string) (*bufio.Reader, *os.File, bool, error) {
	var err error
	var r *os.File
	var gzipped bool
	if file == "-" {
		// refuse to block forever when nothing is piped to stdin
		if !detectStdin() {
			return nil, nil, gzipped, errors.New("stdin not detected")
		}
		r = os.Stdin
	} else {
		r, err = os.Open(file)
		if err != nil {
			return nil, nil, gzipped, fmt.Errorf("fail to read %s: %s", file, err)
		}
	}

	br := bufio.NewReaderSize(r, BufferSize)

	// sniff the gzip magic bytes without consuming them
	if gzipped, err = isGzip(br); err != nil {
		return nil, nil, gzipped, fmt.Errorf("fail to check is file (%s) gzipped: %s", file, err)
	} else if gzipped {
		// gr, err := gzip.NewReader(br)
		gr, err := gzip.NewReaderN(br, 65536, 8)
		if err != nil {
			return nil, r, gzipped, fmt.Errorf("fail to create gzip reader for %s: %s", file, err)
		}
		// re-buffer on top of the decompressing reader
		br = bufio.NewReaderSize(gr, BufferSize)
	}
	return br, r, gzipped, nil
}
98 |
99 | func isGzip(b *bufio.Reader) (bool, error) {
100 | return checkBytes(b, []byte{0x1f, 0x8b})
101 | }
102 |
103 | func checkBytes(b *bufio.Reader, buf []byte) (bool, error) {
104 | m, err := b.Peek(len(buf))
105 | if err != nil {
106 | return false, fmt.Errorf("no content")
107 | }
108 | for i := range buf {
109 | if m[i] != buf[i] {
110 | return false, nil
111 | }
112 | }
113 | return true, nil
114 | }
115 |
116 | func detectStdin() bool {
117 | // http://stackoverflow.com/a/26567513
118 | stat, err := os.Stdin.Stat()
119 | if err != nil {
120 | return false
121 | }
122 | return (stat.Mode() & os.ModeCharDevice) == 0
123 | }
124 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util-logging.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "io"
26 | "os"
27 | "runtime"
28 |
29 | "github.com/mattn/go-colorable"
30 | "github.com/shenwei356/go-logging"
31 | )
32 |
// log is the package-wide logger, configured in init and reconfigured
// by addLog.
var log *logging.Logger

// logFormat is the colored message format used for stderr output.
var logFormat = logging.MustStringFormatter(
	`%{time:15:04:05.000} %{color}[%{level:.4s}]%{color:reset} %{message}`,
)

// backendFormatter is the stderr backend; addLog reuses it when verbose.
var backendFormatter logging.Backend
40 |
41 | func init() {
42 | var stderr io.Writer = os.Stderr
43 | if runtime.GOOS == "windows" {
44 | stderr = colorable.NewColorableStderr()
45 | }
46 | backend := logging.NewLogBackend(stderr, "", 0)
47 | backendFormatter = logging.NewBackendFormatter(backend, logFormat)
48 |
49 | logging.SetBackend(backendFormatter)
50 |
51 | log = logging.MustGetLogger("lexicmap")
52 | }
53 |
54 | func addLog(file string, verbose bool) *os.File {
55 | w, err := os.Create(file)
56 | if err != nil {
57 | checkError(fmt.Errorf("failed to write log file %s: %s", file, err))
58 | }
59 |
60 | var logFormat2 = logging.MustStringFormatter(
61 | `%{time:15:04:05.000} [%{level:.4s}] %{message}`,
62 | )
63 | backend := logging.NewLogBackend(w, "", 0)
64 | backendFormatter2 := logging.NewBackendFormatter(backend, logFormat2)
65 |
66 | if !verbose {
67 | logging.SetBackend(backendFormatter2)
68 | } else {
69 | logging.SetBackend(backendFormatter, backendFormatter2)
70 | }
71 |
72 | log = logging.MustGetLogger("lexicmap")
73 |
74 | return w
75 | }
76 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util/kmers_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package util
22 |
23 | import (
24 | "fmt"
25 | "testing"
26 |
27 | "github.com/shenwei356/kmers"
28 | )
29 |
30 | func TestIsLowComplexityDust(t *testing.T) {
31 | mer := []byte("TAAAAATACCTCAAAAAGAATAAAAATCCCG")
32 | k := len(mer)
33 |
34 | code, err := kmers.Encode(mer)
35 | if err != nil {
36 | t.Error(err)
37 | return
38 | }
39 |
40 | fmt.Printf("%s, low-complexity: %v\n", mer, IsLowComplexityDust(code, uint8(k)))
41 | }
42 |
43 | func TestNs(t *testing.T) {
44 | var k uint8 = 5
45 | values := []uint64{
46 | Ns(0b00, k), // A
47 | Ns(0b01, k), // C
48 | Ns(0b10, k), // G
49 | Ns(0b11, k), // T
50 | }
51 | for _, v := range values {
52 | fmt.Printf("%s, %064b\n", kmers.MustDecode(v, int(k)), v)
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util/util.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package util
22 |
23 | import "github.com/twotwotwo/sorts/sortutil"
24 |
25 | // https://gist.github.com/badboy/6267743 .
26 | // version with mask: https://gist.github.com/lh3/974ced188be2f90422cc .
27 | func Hash64(key uint64) uint64 {
28 | key = (^key) + (key << 21) // key = (key << 21) - key - 1
29 | key = key ^ (key >> 24)
30 | key = (key + (key << 3)) + (key << 8) // key * 265
31 | key = key ^ (key >> 14)
32 | key = (key + (key << 2)) + (key << 4) // key * 21
33 | key = key ^ (key >> 28)
34 | key = key + (key << 31)
35 | return key
36 | }
37 |
38 | // UniqUint64s removes duplicates in a uint64 list
39 | func UniqUint64s(list *[]uint64) {
40 | if len(*list) == 0 || len(*list) == 1 {
41 | return
42 | }
43 |
44 | sortutil.Uint64s(*list)
45 |
46 | var i, j int
47 | var p, v uint64
48 | var flag bool
49 | p = (*list)[0]
50 | for i = 1; i < len(*list); i++ {
51 | v = (*list)[i]
52 | if v == p {
53 | if !flag {
54 | j = i // mark insertion position
55 | flag = true
56 | }
57 | continue
58 | }
59 |
60 | if flag { // need to insert to previous position
61 | (*list)[j] = v
62 | j++
63 | }
64 | p = v
65 | }
66 | if j > 0 {
67 | *list = (*list)[:j]
68 | }
69 | }
70 |
// ReverseInts reverses a slice of ints in place.
func ReverseInts(s []int) {
	n := len(s)
	for i := 0; i < n/2; i++ {
		s[i], s[n-1-i] = s[n-1-i], s[i]
	}
}
77 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util/varint-GB.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2018-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package util
22 |
// offsetsUint64 lists the bit offset of every byte of a uint64,
// from most significant to least significant.
var offsetsUint64 = []uint8{56, 48, 40, 32, 24, 16, 8, 0}

// offsetsUint32 lists the bit offset of every byte of a uint32.
var offsetsUint32 = []uint8{24, 16, 8, 0}
25 |
26 | // PutUint64s encodes two uint64s into 2-16 bytes, and returns control byte
27 | // and encoded byte length.
28 | func PutUint64s(buf []byte, v1, v2 uint64) (ctrl byte, n int) {
29 | blen := ByteLengthUint64(v1)
30 | ctrl |= byte(blen - 1)
31 | for _, offset := range offsetsUint64[8-blen:] {
32 | buf[n] = byte((v1 >> offset) & 0xff)
33 | n++
34 | }
35 |
36 | ctrl <<= 3
37 | blen = ByteLengthUint64(v2)
38 | ctrl |= byte(blen - 1)
39 | for _, offset := range offsetsUint64[8-blen:] {
40 | buf[n] = byte((v2 >> offset) & 0xff)
41 | n++
42 | }
43 | return
44 | }
45 |
46 | // PutUint32s encodes four uint32s into 4-16 bytes, and returns control byte
47 | // and encoded byte length.
48 | func PutUint32s(buf []byte, v1, v2, v3, v4 uint32) (ctrl byte, n int) {
49 | blen := ByteLengthUint32(v1)
50 | ctrl |= byte(blen - 1)
51 | for _, offset := range offsetsUint32[4-blen:] {
52 | buf[n] = byte((v1 >> offset) & 0xff)
53 | n++
54 | }
55 |
56 | ctrl <<= 2
57 | blen = ByteLengthUint32(v2)
58 | ctrl |= byte(blen - 1)
59 | for _, offset := range offsetsUint32[4-blen:] {
60 | buf[n] = byte((v2 >> offset) & 0xff)
61 | n++
62 | }
63 |
64 | ctrl <<= 2
65 | blen = ByteLengthUint32(v3)
66 | ctrl |= byte(blen - 1)
67 | for _, offset := range offsetsUint32[4-blen:] {
68 | buf[n] = byte((v3 >> offset) & 0xff)
69 | n++
70 | }
71 |
72 | ctrl <<= 2
73 | blen = ByteLengthUint32(v4)
74 | ctrl |= byte(blen - 1)
75 | for _, offset := range offsetsUint32[4-blen:] {
76 | buf[n] = byte((v4 >> offset) & 0xff)
77 | n++
78 | }
79 |
80 | return
81 | }
82 |
// Uint64s decodes two uint64s from bytes produced by PutUint64s.
// n is the number of bytes consumed; n == 0 signals that buf is too
// short for the lengths recorded in ctrl.
func Uint64s(ctrl byte, buf []byte) (v1, v2 uint64, n int) {
	blen1 := int(ctrl>>3&7) + 1
	blen2 := int(ctrl&7) + 1
	if len(buf) < blen1+blen2 {
		return 0, 0, 0
	}

	for _, b := range buf[:blen1] {
		v1 = v1<<8 | uint64(b)
	}
	for _, b := range buf[blen1 : blen1+blen2] {
		v2 = v2<<8 | uint64(b)
	}

	n = blen1 + blen2
	return
}
107 |
// Uint32s decodes four uint32s from bytes produced by PutUint32s.
// n is the number of bytes consumed; n == 0 signals that buf is too
// short for the lengths recorded in ctrl.
func Uint32s(ctrl byte, buf []byte) (v1, v2, v3, v4 uint32, n int) {
	blen1 := int(ctrl>>6&3) + 1
	blen2 := int(ctrl>>4&3) + 1
	blen3 := int(ctrl>>2&3) + 1
	blen4 := int(ctrl&3) + 1
	if len(buf) < blen1+blen2+blen3+blen4 {
		return 0, 0, 0, 0, 0
	}

	// read one big-endian value of the given width, advancing n
	decode := func(width int) (v uint32) {
		for _, b := range buf[n : n+width] {
			v = v<<8 | uint32(b)
		}
		n += width
		return
	}

	v1 = decode(blen1)
	v2 = decode(blen2)
	v3 = decode(blen3)
	v4 = decode(blen4)
	return
}
146 |
// ByteLengthUint64 returns the minimum number of bytes (1-8) needed to
// store the integer n.
func ByteLengthUint64(n uint64) uint8 {
	switch {
	case n < 1<<8:
		return 1
	case n < 1<<16:
		return 2
	case n < 1<<24:
		return 3
	case n < 1<<32:
		return 4
	case n < 1<<40:
		return 5
	case n < 1<<48:
		return 6
	case n < 1<<56:
		return 7
	default:
		return 8
	}
}
172 |
// ByteLengthUint32 returns the minimum number of bytes (1-4) needed to
// store the integer n.
func ByteLengthUint32(n uint32) uint8 {
	switch {
	case n < 1<<8:
		return 1
	case n < 1<<16:
		return 2
	case n < 1<<24:
		return 3
	default:
		return 4
	}
}
186 |
// CtrlByte2ByteLengthsUint64 returns the total encoded byte length
// recorded in a control byte produced by PutUint64s.
func CtrlByte2ByteLengthsUint64(ctrl byte) int {
	blen1 := int(ctrl>>3&7) + 1
	blen2 := int(ctrl&7) + 1
	return blen1 + blen2
}
191 |
// CtrlByte2ByteLengthsUint32 returns the total encoded byte length
// recorded in a control byte produced by PutUint32s.
func CtrlByte2ByteLengthsUint32(ctrl byte) int {
	return int(ctrl>>6&3) + int(ctrl>>4&3) + int(ctrl>>2&3) + int(ctrl&3) + 4
}
196 |
--------------------------------------------------------------------------------
/lexicmap/cmd/util/varint-GB_test.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2018-2021 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package util
22 |
23 | import (
24 | "math/rand"
25 | "testing"
26 | )
27 |
// testsUint64 and testsUint32 are shared random fixtures for the
// round-trip tests and benchmarks below.
var testsUint64 [][2]uint64
var testsUint32 [][4]uint32

// init fills the fixtures in four quarters covering different value
// magnitudes: full 64/32-bit, 32-bit, 1-2 bytes, and single byte.
func init() {
	const ntests = 10000
	testsUint64 = make([][2]uint64, ntests)
	testsUint32 = make([][4]uint32, ntests)

	i := 0
	for ; i < ntests/4; i++ { // full-width values
		testsUint64[i] = [2]uint64{rand.Uint64(), rand.Uint64()}
		testsUint32[i] = [4]uint32{rand.Uint32(), rand.Uint32(), rand.Uint32(), rand.Uint32()}
	}
	for ; i < ntests/2; i++ { // 32-bit values
		testsUint64[i] = [2]uint64{uint64(rand.Uint32()), uint64(rand.Uint32())}
		testsUint32[i] = [4]uint32{rand.Uint32(), rand.Uint32(), rand.Uint32(), rand.Uint32()}
	}
	for ; i < ntests*3/4; i++ { // 1-2 byte values
		testsUint64[i] = [2]uint64{uint64(rand.Intn(65536)), uint64(rand.Intn(256))}
		testsUint32[i] = [4]uint32{uint32(rand.Intn(65536)), uint32(rand.Intn(256)), uint32(rand.Intn(65536)), uint32(rand.Intn(256))}
	}
	for ; i < ntests; i++ { // single-byte values
		testsUint64[i] = [2]uint64{uint64(rand.Intn(256)), uint64(rand.Intn(256))}
		testsUint32[i] = [4]uint32{uint32(rand.Intn(256)), uint32(rand.Intn(256)), uint32(rand.Intn(256)), uint32(rand.Intn(256))}
	}
}
53 |
54 | func TestStreamVByte64(t *testing.T) {
55 | buf := make([]byte, 16)
56 | var ctrl byte
57 | var n, n2 int
58 | var v1, v2 uint64
59 | for i, test := range testsUint64 {
60 | ctrl, n = PutUint64s(buf, test[0], test[1])
61 | if CtrlByte2ByteLengthsUint64(ctrl) != n {
62 | t.Errorf("#%d, wrong byte length", i)
63 | }
64 |
65 | v1, v2, n2 = Uint64s(ctrl, buf[0:n])
66 | if n2 == 0 {
67 | t.Errorf("#%d, wrong decoded number", i)
68 | }
69 |
70 | if v1 != test[0] || v2 != test[1] {
71 | t.Errorf("#%d, wrong decoded result: %d, %d, answer: %d, %d", i, v1, v2, test[0], test[1])
72 | }
73 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n])
74 | }
75 | }
76 |
77 | func TestStreamVByte32(t *testing.T) {
78 | buf := make([]byte, 16)
79 | var ctrl byte
80 | var n, n2 int
81 | var v1, v2, v3, v4 uint32
82 | for i, test := range testsUint32 {
83 | ctrl, n = PutUint32s(buf, test[0], test[1], test[2], test[3])
84 | if CtrlByte2ByteLengthsUint32(ctrl) != n {
85 | t.Errorf("#%d, wrong byte length", i)
86 | }
87 |
88 | v1, v2, v3, v4, n2 = Uint32s(ctrl, buf[0:n])
89 | if n2 == 0 {
90 | t.Errorf("#%d, wrong decoded number", i)
91 | }
92 |
93 | if v1 != test[0] || v2 != test[1] || v3 != test[2] || v4 != test[3] {
94 | t.Errorf("#%d, wrong decoded result: %d, %d, %d, %d, answer: %d, %d, %d, %d", i, v1, v2, v3, v4, test[0], test[1], test[2], test[3])
95 | }
96 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n])
97 | }
98 | }
99 |
100 | var _v1, _v2 uint64
101 |
102 | func BenchmarkUint64s(b *testing.B) {
103 | buf := make([]byte, 16)
104 | var ctrl byte
105 | var n, n2 int
106 | var v1, v2 uint64
107 | for i := 0; i < b.N; i++ {
108 | for i, test := range testsUint64 {
109 | ctrl, n = PutUint64s(buf, test[0], test[1])
110 |
111 | v1, v2, n2 = Uint64s(ctrl, buf[0:n])
112 | if n2 == 0 {
113 | b.Errorf("#%d, wrong decoded number", i)
114 | }
115 |
116 | if v1 != test[0] || v2 != test[1] {
117 | b.Errorf("#%d, wrong decoded result: %d, %d, answer: %d, %d", i, v1, v2, test[0], test[1])
118 | }
119 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n])
120 | }
121 | }
122 | _v1, _v2 = v1, v2
123 | }
124 |
125 | var __v1, __v2, __v3, __v4 uint32
126 |
127 | func BenchmarkUint32s(b *testing.B) {
128 | buf := make([]byte, 16)
129 | var ctrl byte
130 | var n, n2 int
131 | var v1, v2, v3, v4 uint32
132 | for i := 0; i < b.N; i++ {
133 | for i, test := range testsUint32 {
134 | ctrl, n = PutUint32s(buf, test[0], test[1], test[2], test[3])
135 |
136 | v1, v2, v3, v4, n2 = Uint32s(ctrl, buf[0:n])
137 | if n2 == 0 {
138 | b.Errorf("#%d, wrong decoded number", i)
139 | }
140 |
141 | if v1 != test[0] || v2 != test[1] || v3 != test[2] || v4 != test[3] {
142 | b.Errorf("#%d, wrong decoded result: %d, %d, %d, %d, answer: %d, %d, %d, %d", i, v1, v2, v3, v4, test[0], test[1], test[2], test[3])
143 | }
144 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n])
145 | }
146 | }
147 | __v1, __v2, __v3, __v4 = v1, v2, v3, v4
148 | }
149 |
--------------------------------------------------------------------------------
/lexicmap/cmd/utils.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "github.com/spf13/cobra"
25 | )
26 |
// utilsCmd is the parent command that groups the utility subcommands;
// it performs no work itself (no Run function).
var utilsCmd = &cobra.Command{
	Use:   "utils",
	Short: "Some utilities",
	Long: `Some utilities
`,
}

// init registers the utils command on the root command; subcommands
// attach themselves to utilsCmd in their own init functions.
func init() {
	RootCmd.AddCommand(utilsCmd)

}
38 |
--------------------------------------------------------------------------------
/lexicmap/cmd/version.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2023-2024 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package cmd
22 |
23 | import (
24 | "fmt"
25 | "net/http"
26 | "strings"
27 |
28 | "github.com/shenwei356/util/cliutil"
29 | "github.com/spf13/cobra"
30 | )
31 |
// VERSION is the current version of LexicMap.
var VERSION = "0.7.1"

// COMMIT is the short hash of the last git commit.
// It could instead be read at runtime from build info:
// var COMMIT = func() string {
// 	if info, ok := debug.ReadBuildInfo(); ok {
// 		for _, setting := range info.Settings {
// 			if setting.Key == "vcs.revision" {
// 				return setting.Value[:7]
// 			}
// 		}
// 	}
// 	return ""
// }()

// COMMIT can be passed from the command line at build time:
// commit=$(git rev-parse --short HEAD)
// go build -trimpath -o=lexicmap -ldflags="-s -w -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$commit" -tags netgo
var COMMIT = ""
51 |
52 | // versionCmd represents the version command
53 | var versionCmd = &cobra.Command{
54 | Use: "version",
55 | Short: "Print version information and check for update",
56 | Long: `Print version information and check for update
57 |
58 | `,
59 | Run: func(cmd *cobra.Command, args []string) {
60 | app := "LexicMap"
61 | if COMMIT == "" {
62 | fmt.Printf("%s v%s\n", app, VERSION)
63 | } else {
64 | fmt.Printf("%s v%s (%s)\n", app, VERSION, COMMIT)
65 | }
66 |
67 | if !cliutil.GetFlagBool(cmd, "check-update") {
68 | return
69 | }
70 |
71 | fmt.Println("\nChecking new version...")
72 |
73 | resp, err := http.Get(fmt.Sprintf("https://github.com/shenwei356/%s/releases/latest", app))
74 | if err != nil {
75 | checkError(fmt.Errorf("network error"))
76 | }
77 | items := strings.Split(resp.Request.URL.String(), "/")
78 | version := ""
79 | if items[len(items)-1] == "" {
80 | version = items[len(items)-2]
81 | } else {
82 | version = items[len(items)-1]
83 | }
84 | if version == "v"+VERSION {
85 | fmt.Printf("You are using the latest version of %s\n", app)
86 | } else {
87 | fmt.Printf("New version available: %s %s at %s\n", app, version, resp.Request.URL.String())
88 | }
89 | },
90 | }
91 |
// init registers the version command and its -u/--check-update flag
// on the root command.
func init() {
	RootCmd.AddCommand(versionCmd)

	versionCmd.Flags().BoolP("check-update", "u", false, `check update`)
}
97 |
--------------------------------------------------------------------------------
/lexicmap/main.go:
--------------------------------------------------------------------------------
1 | // Copyright © 2018-2020 Wei Shen
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 |
21 | package main
22 |
23 | import (
24 | "github.com/shenwei356/LexicMap/lexicmap/cmd"
25 | )
26 |
// main is the program entry point; all functionality lives in the cmd
// package. The commented snippets below are profiling hooks kept for
// development.
func main() {
	// go tool pprof -http=:8080 cpu.pprof
	// defer profile.Start(profile.CPUProfile, profile.ProfilePath(".")).Stop()

	// go tool trace -http=:8080 trace.out
	// defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop()

	// go tool pprof -http=:8080 mem.pprof
	// defer profile.Start(profile.MemProfile, profile.MemProfileRate(1), profile.ProfilePath(".")).Stop()
	// defer profile.Start(profile.MemProfile, profile.ProfilePath(".")).Stop()

	cmd.Execute()
}
40 |
--------------------------------------------------------------------------------
/lexicmap/packaging.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh

# Cross-compile lexicmap for all release platforms and package each
# binary into its own tarball under ./binaries, with md5 checksums.
#
# Usage: ./packaging.sh [any-arg]
#   With any argument, the current git commit hash is embedded into
#   the binaries via -ldflags.

commit=""

if [ $# -gt 0 ]; then
    commit=" -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$(git rev-parse --short HEAD)"
fi

CGO_ENABLED=0 gox -os="windows darwin linux freebsd" -arch="amd64 arm64" -tags netgo -ldflags "-w -s $commit" -asmflags '-trimpath' \
    -output "lexicmap_{{.OS}}_{{.Arch}}"

dir=binaries
# start from a clean output directory.
# fix: the original ran `rm -rf $dir/$f` with $f still undefined, which
# silently expanded to removing the whole directory; do that explicitly.
rm -rf "$dir"
mkdir -p "$dir"

for f in lexicmap_*; do
    mkdir -p "$dir/$f"
    mv "$f" "$dir/$f"
    cd "$dir/$f"
    # strip the _os_arch suffix to recover the plain binary name
    mv "$f" "$(echo "$f" | perl -pe 's/_[^\.]+//g')"
    tar -zcf "$f.tar.gz" lexicmap*
    mv ./*.tar.gz ../
    cd ..
    rm -rf "$f"
    cd ..
done

ls binaries/*.tar.gz | rush 'cd {/}; md5sum {%} > {%}.md5.txt'
29 |
--------------------------------------------------------------------------------
/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
64 |
--------------------------------------------------------------------------------