├── .gitignore ├── CHANGELOG.md ├── INDEX_FORMAT_CHANGELOG.tsv ├── LICENSE ├── README.md ├── demo ├── README.md ├── ass2species.map ├── bench │ ├── b.amr.fasta │ ├── b.gene_E_coli_16S.fasta │ ├── b.gene_E_faecalis_SecY.fasta │ └── b.plasmid_pCUVET18-1784.4.fasta ├── files.txt ├── prefix.hist.png ├── q.gene.fasta ├── q.gene.fasta.lexicmap.tsv ├── q.long-reads.fasta.gz ├── q.prophage.fasta ├── q.prophage.fasta.lexicmap.tsv ├── refs │ ├── GCF_000006945.2.fa.gz │ ├── GCF_000017205.1.fa.gz │ ├── GCF_000148585.2.fa.gz │ ├── GCF_000392875.1.fa.gz │ ├── GCF_000742135.1.fa.gz │ ├── GCF_001027105.1.fa.gz │ ├── GCF_001096185.1.fa.gz │ ├── GCF_001457655.1.fa.gz │ ├── GCF_001544255.1.fa.gz │ ├── GCF_002949675.1.fa.gz │ ├── GCF_002950215.1.fa.gz │ ├── GCF_003697165.2.fa.gz │ ├── GCF_006742205.1.fa.gz │ ├── GCF_009759685.1.fa.gz │ └── GCF_900638025.1.fa.gz ├── taxid.map └── taxonomy.tsv ├── docs ├── archetypes │ └── default.md ├── content │ ├── _index.md │ ├── faqs │ │ └── _index.md │ ├── installation │ │ └── _index.md │ ├── introduction │ │ └── _index.md │ ├── logo.svg │ ├── notes │ │ ├── _index.md │ │ └── motivation.md │ ├── performance@genbank.tsv │ ├── performance@genbank.tsv.sh │ ├── releases │ │ └── _index.md │ ├── tutorials │ │ ├── _index.md │ │ ├── index │ │ │ ├── _index.md │ │ │ ├── parameters-batches.tsv │ │ │ ├── parameters-general.tsv │ │ │ ├── parameters-masks.tsv │ │ │ └── parameters-seeds.tsv │ │ ├── misc │ │ │ ├── _index.md │ │ │ ├── index-allthebacteria.md │ │ │ ├── index-genbank.md │ │ │ ├── index-globdb.md │ │ │ ├── index-gtdb.md │ │ │ └── index-uhgg.md │ │ ├── parameters-align.tsv │ │ ├── parameters-general.tsv │ │ ├── parameters-seeding.tsv │ │ └── search.md │ └── usage │ │ ├── _index.md │ │ ├── index │ │ └── _index.md │ │ ├── lexicmap.md │ │ ├── search.md │ │ └── utils │ │ ├── 2blast.md │ │ ├── _index.md │ │ ├── genomes.md │ │ ├── kmers.md │ │ ├── masks.md │ │ ├── reindex-seeds.md │ │ ├── remerge.md │ │ ├── seed-pos.md │ │ └── subseq.md ├── data │ └── menu │ 
│ ├── extra.yaml │ │ └── more.yaml ├── hugo.toml └── static │ ├── AllTheBacteria-v0.2.url.txt │ ├── GCF_000017205.1.png │ ├── GCF_000017205.1.seed_number.png │ ├── GCF_000392875.1.png │ ├── GCF_000392875.1.seed_number.png │ ├── GCF_002949675.1.png │ ├── GCF_002949675.1.seed_number.png │ ├── custom.css │ ├── favicon │ ├── android-chrome-192x192.png │ ├── android-chrome-512x512.png │ ├── apple-touch-icon.png │ ├── browserconfig.xml │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon-48x48.png │ ├── favicon.ico │ ├── favicon.svg │ ├── mstile-144x144.png │ ├── mstile-150x150.png │ ├── mstile-310x150.png │ ├── mstile-310x310.png │ ├── mstile-70x70.png │ ├── safari-pinned-tab.svg │ └── site.webmanifest │ ├── indexing.svg │ ├── logo.svg │ ├── overview.svg │ ├── prefix.hist.png │ └── searching.svg ├── go.mod ├── go.sum ├── lexicmap ├── .gitignore ├── build.sh ├── cmd │ ├── 2blast.go │ ├── autocomplete.go │ ├── genome │ │ ├── genome.go │ │ └── genome_test.go │ ├── genomes.go │ ├── index.go │ ├── kmers.go │ ├── kv │ │ ├── kv-data.go │ │ ├── kv-data_test.go │ │ ├── kv-encoding.go │ │ ├── kv-reader.go │ │ ├── kv-searcher.go │ │ └── kv-searcher2.go │ ├── lib-chaining.go │ ├── lib-chaining2.go │ ├── lib-chaining3.go │ ├── lib-chaining_test.go │ ├── lib-index-build.go │ ├── lib-index-merge.go │ ├── lib-index-search-util.go │ ├── lib-index-search.go │ ├── lib-seq_compare.go │ ├── lib-seq_compare_test.go │ ├── masks.go │ ├── re-merge.go │ ├── recount-bases.go │ ├── reindex-seeds.go │ ├── root.go │ ├── search.go │ ├── seed-pos.go │ ├── seedposition │ │ ├── seed_position.go │ │ └── seed_position_test.go │ ├── subseq.go │ ├── tree │ │ ├── tree.go │ │ └── tree_test.go │ ├── util-cli.go │ ├── util-io.go │ ├── util-logging.go │ ├── util.go │ ├── util │ │ ├── kmers.go │ │ ├── kmers_test.go │ │ ├── util.go │ │ ├── varint-GB.go │ │ └── varint-GB_test.go │ ├── utils.go │ └── version.go ├── main.go └── packaging.sh ├── logo.svg └── overview.svg /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | 15 | *.directory 16 | .Rhistory 17 | 18 | *ssshtest 19 | *.nextflow.log* 20 | *.brename_detail.txt 21 | */Rplots.pdf 22 | *.pprof 23 | 24 | docs/public 25 | docs/themes 26 | .hugo_build.lock 27 | 28 | demo/demo.lmi 29 | demo/demo.lmi-no-df 30 | demo/*.lexicmap.tsv.gz 31 | demo/seed_distance.tsv 32 | demo/seed_distance 33 | demo/seed_distance-no-df 34 | demo/seed-pos.tsv.gz 35 | demo/t.txt 36 | demo/kmers.tsv* 37 | 38 | 39 | lexicmap/binaries/* 40 | lexicmap/lexicmap* 41 | lexicmap/*.fasta 42 | lexicmap/indexes 43 | -------------------------------------------------------------------------------- /INDEX_FORMAT_CHANGELOG.tsv: -------------------------------------------------------------------------------- 1 | Index version LexicMap version Supported LexicMap versions Date Changes 2 | 3.4 0.7.0 0.6.0 + 2025-04-11 Fix filling the seed desert region behind the last seed of a genome. 3 | 3.3 0.6.0 0.6.0 + 2025-03-25 Reduce index size for batches <= 512. Add the total bases of index to info.toml for computing the Evalue. Denser seeds. 4 | 3.1 0.5.0 0.4.0 + 2024-12-18 Change the default partitions of seed data index. 5 | 3.0 0.4.0 0.4.0 + 2024-08-15 Support suffix matching of seeds. Better seed desert filling for highly-repetitive regions. Denser seeds. 6 | 1.1 0.3.0 0.3.0 2024-05-14 Change the format of seed data index. Use longer contig intervals. 7 | 0.1 0.1.0 0.1.0 - 0.2.0 2024-01-25 First version. 
8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 - 2024 Wei Shen (shenwei356@gmail.com) 2 | 3 | The MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /demo/ass2species.map: -------------------------------------------------------------------------------- 1 | GCF_000742135.1 Klebsiella pneumoniae 2 | GCF_003697165.2 Escherichia coli 3 | GCF_002949675.1 Shigella dysenteriae 4 | GCF_002950215.1 Shigella flexneri 5 | GCF_000006945.2 Salmonella enterica 6 | GCF_001544255.1 Enterococcus faecium 7 | GCF_000392875.1 Enterococcus faecalis 8 | GCF_001457655.1 Haemophilus influenzae 9 | GCF_900638025.1 Haemophilus parainfluenzae 10 | GCF_001027105.1 Staphylococcus aureus 11 | GCF_006742205.1 Staphylococcus epidermidis 12 | GCF_001096185.1 Streptococcus pneumoniae 13 | GCF_000148585.2 Streptococcus mitis 14 | GCF_009759685.1 Acinetobacter baumannii 15 | GCF_000017205.1 Pseudomonas aeruginosa 16 | -------------------------------------------------------------------------------- /demo/bench/b.gene_E_coli_16S.fasta: -------------------------------------------------------------------------------- 1 | >NC_000913.3:4166659-4168200 rrsB [organism=Escherichia coli str. K-12 substr. 
MG1655] [GeneID=948466] [chromosome=] 2 | AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGT 3 | AACAGGAAGAAGCTTGCTTCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGAT 4 | GGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTC 5 | GGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGAC 6 | GATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAG 7 | GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCT 8 | TCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCC 9 | GCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAA 10 | TTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAA 11 | CTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCG 12 | TAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGC 13 | GTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCC 14 | CTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAAC 15 | TCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACC 16 | TTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTG 17 | CTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCC 18 | TTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATG 19 | ACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCG 20 | ACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCAT 21 | GAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACC 22 | GCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACT 23 | TTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCT 24 | TA 25 | -------------------------------------------------------------------------------- /demo/bench/b.gene_E_faecalis_SecY.fasta: -------------------------------------------------------------------------------- 1 | >lcl|NZ_CP064374.1_cds_WP_002359350.1_906 [gene=secY] 
[locus_tag=IUJ47_RS04625] [protein=preprotein translocase subunit SecY] [protein_id=WP_002359350.1] [location=960938..962236] [gbkey=CDS] 2 | TTGTTCAAGCTATTAAAGAACGCCTTTAAAGTCAAAGACATTAGATCAAAAATCTTATTTACAGTTTTAA 3 | TCTTGTTTGTATTTCGCCTAGGTGCGCACATTACTGTGCCCGGGGTGAATGCAAAGGGATTGTCTGATTT 4 | AAGTAGCTTACCCTTTTTGAATATGTTGAATATGGTGAGTGGTAGTGCCATGCAAAACTTCTCTATCTTC 5 | TCGATGGGGGTTTCGCCATACATTACAGCCTCTATTATTATTCAACTATTGCAAATGGATATTGTACCTA 6 | GATTTGTAGAATGGTCAAAACAAGGGGAAGTTGGGCGTAAGAAATTAAATCAAGCTACAAGATATCTAAC 7 | GATTGTCTTGGGTGTGGCTCAGTCAATGGGGATCACTGCTGGTTTTAATAGCTTAAGTCAAACTGGGATT 8 | GTAAACAATCCAACCTTAGGTACCTTTGTGATGATTGCAGTTATTTTAACTGCTGGGACGATGTTTGTGA 9 | CTTGGATGGGTGAACAAATTACAGAAAAAGGAATCGGAAATGGTGTTTCAATGATTATCTTTGCCGGGAT 10 | TATTTCTCGTTTGCCAGGAGCAGTCAAAGAAATCTATGAAGATTACTTCGTCAATATCGAGTCTTCTCGT 11 | ATTTGGCAATCTGTTATTTTCATTGCAATCTTAGTTATTGCTATTTTGGTGATTGTTACAGTCGTAACGT 12 | TCTTCCAACAAGCAGAGCGTAAGATTCCAATCCAATATACAAAACGTGTTTCTGGTGCACCAACAAGTAG 13 | TTATTTACCGTTAAAAGTAAATGCTGCTGGGGTTATTCCAGTTATCTTTGCCAGCTCGTTAATTGCAACA 14 | CCAAATGCCATTTTACAAGCTTTCTCATCAAAATTCGCTGGTGAAAATTGGTATGACATTATGACAAAAA 15 | TCTTCAGTTATAACACAGTTCCAGGGGCAATCATCTATACTGTCCTAATCGTTGCGTTTACGTTCTTCTA 16 | TGCATTTGTTCAAGTAAACCCTGAGAAATTAGCGGAAAACTTACAAAAACAAGGAAGCTACATTCCAAGC 17 | GTGCGACCAGGTAAAGGTACAGAAGAATATGTATCTGGCGTGTTAATGAGATTAAGTGTTGTCGGCTCAA 18 | TTTTCCTAGGACTTGTTGCTTTACTTCCAATCATTGCGCAAATGGTTTGGAACTTACCTCAATCAATCGG 19 | TTTAGGTGGAACAAGTTTACTAATCGTTATCGGGGTTGCATTAGAAACAACGAAACAATTAGAAGGATTA 20 | ATGATGAAACGTCAATATGTCGGCTTTATCAATAAGTAA 21 | -------------------------------------------------------------------------------- /demo/files.txt: -------------------------------------------------------------------------------- 1 | refs/GCF_000006945.2.fa.gz 2 | refs/GCF_000017205.1.fa.gz 3 | refs/GCF_000148585.2.fa.gz 4 | refs/GCF_000392875.1.fa.gz 5 | refs/GCF_000742135.1.fa.gz 6 | refs/GCF_001027105.1.fa.gz 7 | refs/GCF_001096185.1.fa.gz 8 | refs/GCF_001457655.1.fa.gz 9 | 
refs/GCF_001544255.1.fa.gz 10 | refs/GCF_002949675.1.fa.gz 11 | refs/GCF_002950215.1.fa.gz 12 | refs/GCF_003697165.2.fa.gz 13 | refs/GCF_006742205.1.fa.gz 14 | refs/GCF_009759685.1.fa.gz 15 | refs/GCF_900638025.1.fa.gz 16 | -------------------------------------------------------------------------------- /demo/prefix.hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/prefix.hist.png -------------------------------------------------------------------------------- /demo/q.gene.fasta: -------------------------------------------------------------------------------- 1 | >NC_000913.3:4166659-4168200 rrsB [organism=Escherichia coli str. K-12 substr. MG1655] [GeneID=948466] [chromosome=] 2 | AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGT 3 | AACAGGAAGAAGCTTGCTTCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGAT 4 | GGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTC 5 | GGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGAC 6 | GATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAG 7 | GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCT 8 | TCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCC 9 | GCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAA 10 | TTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAA 11 | CTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCG 12 | TAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGC 13 | GTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCC 14 | CTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAAC 15 | TCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACC 16 | TTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTG 17 | 
CTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCC 18 | TTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATG 19 | ACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCG 20 | ACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCAT 21 | GAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACC 22 | GCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACT 23 | TTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCT 24 | TA 25 | -------------------------------------------------------------------------------- /demo/q.long-reads.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/q.long-reads.fasta.gz -------------------------------------------------------------------------------- /demo/q.prophage.fasta.lexicmap.tsv: -------------------------------------------------------------------------------- 1 | query qlen hits sgenome sseqid qcovGnm cls hsp qcovHSP alenHSP pident gaps qstart qend sstart send sstr slen evalue bitscore 2 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 1 1 27.890 9371 97.716 2 1 9369 1864411 1873781 + 4903501 0.00e+00 15953 3 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 1 2 0.301 101 98.020 0 10308 10408 1873846 1873946 + 4903501 1.72e-43 174 4 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 2 3 20.665 6942 96.528 4 17441 24382 1882011 1888948 + 4903501 0.00e+00 11459 5 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 3 4 17.685 5941 97.980 0 24355 30295 1853098 1859038 + 4903501 0.00e+00 10174 6 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 4 5 8.993 3021 91.526 0 10308 13328 1873846 1876866 + 4903501 0.00e+00 4295 7 | NC_001895.1 33593 2 GCF_003697165.2 NZ_CP033092.2 77.588 5 6 2.438 820 84.390 1 14540 15358 1878798 1879617 + 4903501 1.29e-264 
911 8 | NC_001895.1 33593 2 GCF_002949675.1 NZ_CP026774.1 0.976 1 1 0.976 331 85.801 3 13919 14246 3704319 3704649 - 4395762 6.35e-112 403 9 | -------------------------------------------------------------------------------- /demo/refs/GCF_000006945.2.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000006945.2.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_000017205.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000017205.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_000148585.2.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000148585.2.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_000392875.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000392875.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_000742135.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_000742135.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_001027105.1.fa.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001027105.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_001096185.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001096185.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_001457655.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001457655.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_001544255.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_001544255.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_002949675.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_002949675.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_002950215.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_002950215.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_003697165.2.fa.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_003697165.2.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_006742205.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_006742205.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_009759685.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_009759685.1.fa.gz -------------------------------------------------------------------------------- /demo/refs/GCF_900638025.1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/demo/refs/GCF_900638025.1.fa.gz -------------------------------------------------------------------------------- /demo/taxid.map: -------------------------------------------------------------------------------- 1 | GCF_000742135.1 573 2 | GCF_003697165.2 562 3 | GCF_002949675.1 622 4 | GCF_002950215.1 623 5 | GCF_000006945.2 28901 6 | GCF_001544255.1 1352 7 | GCF_000392875.1 1351 8 | GCF_001457655.1 727 9 | GCF_900638025.1 729 10 | GCF_001027105.1 1280 11 | GCF_006742205.1 1282 12 | GCF_001096185.1 1313 13 | GCF_000148585.2 28037 14 | GCF_009759685.1 470 15 | GCF_000017205.1 287 16 | -------------------------------------------------------------------------------- /demo/taxonomy.tsv: -------------------------------------------------------------------------------- 1 | id superkingdom phylum class order family genus species 2 | GCF_000742135.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae 
Klebsiella Klebsiella pneumoniae 3 | GCF_003697165.2 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia Escherichia coli 4 | GCF_002949675.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Shigella Shigella dysenteriae 5 | GCF_002950215.1 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Shigella Shigella flexneri 6 | GCF_000006945.2 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Salmonella Salmonella enterica 7 | GCF_001544255.1 Bacteria Bacillota Bacilli Lactobacillales Enterococcaceae Enterococcus Enterococcus faecium 8 | GCF_000392875.1 Bacteria Bacillota Bacilli Lactobacillales Enterococcaceae Enterococcus Enterococcus faecalis 9 | GCF_001457655.1 Bacteria Pseudomonadota Gammaproteobacteria Pasteurellales Pasteurellaceae Haemophilus Haemophilus influenzae 10 | GCF_900638025.1 Bacteria Pseudomonadota Gammaproteobacteria Pasteurellales Pasteurellaceae Haemophilus Haemophilus parainfluenzae 11 | GCF_001027105.1 Bacteria Bacillota Bacilli Bacillales Staphylococcaceae Staphylococcus Staphylococcus aureus 12 | GCF_006742205.1 Bacteria Bacillota Bacilli Bacillales Staphylococcaceae Staphylococcus Staphylococcus epidermidis 13 | GCF_001096185.1 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus Streptococcus pneumoniae 14 | GCF_000148585.2 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus Streptococcus mitis 15 | GCF_009759685.1 Bacteria Pseudomonadota Gammaproteobacteria Moraxellales Moraxellaceae Acinetobacter Acinetobacter baumannii 16 | GCF_000017205.1 Bacteria Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas aeruginosa 17 | -------------------------------------------------------------------------------- /docs/archetypes/default.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = '{{ 
replace .File.ContentBaseName "-" " " | title }}' 3 | date = {{ .Date }} 4 | draft = true 5 | +++ 6 | -------------------------------------------------------------------------------- /docs/content/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 3 | geekdocNav: false 4 | geekdocAlign: center 5 | geekdocAnchor: false 6 | --- 7 | # LexicMap 8 | 9 | 10 | 11 | 12 | [![Latest Version](https://img.shields.io/github/release/shenwei356/LexicMap.svg?style=flat?maxAge=86400)](https://github.com/shenwei356/LexicMap/releases) 13 | [![Anaconda Cloud](https://anaconda.org/bioconda/lexicmap/badges/version.svg)](https://anaconda.org/bioconda/lexicmap) 14 | [![Cross-platform](https://img.shields.io/badge/platform-any-ec2eb4.svg?style=flat)](http://bioinf.shenwei.me/LexicMap/installation/) 15 | [![license](https://img.shields.io/github/license/shenwei356/taxonkit.svg?maxAge=2592000)](https://github.com/shenwei356/taxonkit/blob/master/LICENSE) 16 | 17 | 18 | 19 | LexicMap is a **nucleotide sequence alignment** tool for efficiently querying **gene, plasmid, virus, or long-read sequences (>100 bp)** against up to **millions** of **prokaryotic genomes**. 20 | 21 | 22 | {{< button size="medium" relref="introduction" >}}Introduction{{< /button >}} 23 | 24 | 25 | 26 | ## Feature overview 27 | 28 | {{< columns >}} 29 | 30 | ### Easy to install 31 | 32 | Linux, Windows, MacOS and more OS are supported. 33 | 34 | Both x86 and ARM CPUs are supported. 35 | 36 | Just [download](https://github.com/shenwei356/lexicmap/releases) the binary files and run! 
37 | 38 | 39 | Or install it by 40 | 41 | conda install -c bioconda lexicmap 42 | 43 | 44 | {{< button size="small" relref="installation" >}}Installation{{< /button >}} 45 | {{< button size="small" relref="releases" >}}Releases{{< /button >}} 46 | 47 | <---> 48 | 49 | ### Easy to use 50 | 51 | Step 1: indexing 52 | 53 | lexicmap index -I genomes/ -O db.lmi 54 | 55 | Step 2: searching 56 | 57 | lexicmap search -d db.lmi q.fasta -o r.tsv 58 | 59 | {{< button size="small" relref="tutorials/index" >}}Tutorials{{< /button >}} 60 | {{< button size="small" relref="usage/lexicmap" >}}Usages{{< /button >}} 61 | {{< button size="small" relref="faqs" >}}FAQs{{< /button >}} 62 | 63 | <---> 64 | 65 | ### Accurate and efficient alignment 66 | 67 | Using LexicMap to align in the whole **2,340,672** Genbank+Refseq prokaryotic genomes with 48 CPUs. 68 | 69 | |Query |Genome hits|Time |RAM(GB)| 70 | |:----------------|----------:|------:|------:| 71 | |A 1.3-kb gene |41,718 |3m:06s |3.97 | 72 | |A 1.5-kb 16S rRNA|1,955,167 |32m:59s|11.09 | 73 | |A 52.8-kb plasmid|560,330 |52m:22s|14.48 | 74 | |1003 AMR genes |30,967,882 |15h:52m|24.86 | 75 | 76 | 77 | ***Blastn** is unable to run with the same dataset on common servers as it requires >2000 GB RAM*. 78 | 79 | 80 | {{< /columns >}} 81 | 82 | -------------------------------------------------------------------------------- /docs/content/faqs/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: FAQs 3 | weight: 60 4 | --- 5 | ## Table of contents 6 | 7 | {{< toc format=html >}} 8 | 9 | ## Does LexicMap support short reads? 10 | 11 | LexicMap is mainly designed for sequence alignment with a small number of queries (gene/plasmid/virus/phage sequences) longer than 100 bp by default. 
12 | 13 | If you just want to search long (>1kb) queries for highly similar (>95%) targets, you can build an index with a bigger `-D/--seed-max-desert` (default 100) and `-d/--seed-in-desert-dist` (default 50), e.g., 14 | 15 | --seed-max-desert 300 --seed-in-desert-dist 150 16 | 17 | Bigger values decrease the search sensitivity for distant targets, speed up the indexing 18 | speed, decrease the indexing memory occupation and decrease the index size. While the 19 | alignment speed is almost not affected. 20 | 21 | 22 | ## Does LexicMap support fungi genomes? 23 | 24 | Yes. LexicMap mainly supports small genomes including prokaryotic, viral, and plasmid genomes. 25 | **Fungi can also be supported, just remember to increase the value of `-g/--max-genome` when running `lexicmap index`, 26 | which is used to skip genomes larger than 15Mb by default**. 27 | 28 | ``` 29 | -g, --max-genome int ► Maximum genome size. Extremely large genomes (e.g., non-isolate 30 | assemblies from Genbank) will be skipped. (default 15000000) 31 | ``` 32 | 33 | Maximum genome size is about 268 Mb (268,435,456). More precisely: 34 | 35 | $total_bases + ($num_contigs - 1) * 1000 <= 268,435,456 36 | 37 | as we concatenate contigs with 1000-bp intervals of N’s to reduce the sequence scale to index. 38 | 39 | For big and complex genomes, like the human genome (chr1 is ~248 Mb) which has many repetitive sequences, LexicMap would be slow to align. 40 | 41 | 42 | ## How's the hardware requirement? 43 | 44 | - For index building. See details [hardware requirement](https://bioinf.shenwei.me/LexicMap/tutorials/index/#hardware-requirements). 45 | - For searching. See details [hardware requirement](https://bioinf.shenwei.me/LexicMap/tutorials/search/#hardware-requirements). 46 | 47 | 48 | 49 | ## How to resume the indexing as Slurm job time limit is almost reached while lexicmap index is still in the merging step? 
50 | 51 | Use [lexicmap utils remerge](https://bioinf.shenwei.me/LexicMap/usage/utils/remerge/) (available since v0.5.0), which reruns the merging step for an unfinished index. 52 | 53 | > When to use this command? 54 | > - Only one thread is used for merging indexes, which happens when there are 55 | > a lot (>200 batches) of batches (`$input_files / --batch-size`) and the value 56 | > of `--max-open-files` is not big enough. 57 | > - The Slurm/PBS job time limit is almost reached and the merging step won't be finished before that. 58 | > - Disk quota is reached in the merging step. 59 | 60 | So you can stop the indexing command by pressing `Ctrl` + `C` (**make sure it is in the merging step**, see example below), and run `lexicmap utils remerge -d index.lmi`, 61 | where `index.lmi` is the output index directory in `lexicmap index`. 62 | 63 | Optionally, you might set bigger values of 64 | flag `--max-open-files` and `-J/--seed-data-threads` if you have hundreds of thousands of input genomes or have set 65 | a small batch size with `-b/--batch-size`. E.g., 66 | 67 | 22:54:24.420 [INFO] merging 297 indexes... 68 | 22:54:24.455 [INFO] [round 1] 69 | 22:54:24.455 [INFO] batch 1/1, merging 297 indexes to xxx.lmi.tmp/r1_b1 with 1 threads... 70 | 71 | There was only one thread used for seed data merging, so it would take a long time. 72 | So we can set a larger `--max-open-files`, e.g., `4096`, 73 | and it would allow `4096 / (297+2) = 13.7` threads for merging, let's set `--seed-data-threads 12`. 74 | 75 | # specify the maximum open files per process 76 | ulimit -n 4096 77 | 78 | lexicmap utils remerge -d index.lmi --max-open-files 4096 --seed-data-threads 12 79 | 80 | 81 | ## Can I extract the matched sequences? 82 | 83 | Yes, `lexicmap search` has a flag 84 | 85 | ``` 86 | -a, --all ► Output more columns, e.g., matched sequences. Use this if you 87 | want to output blast-style format with "lexicmap utils 2blast". 
88 | ``` 89 | 90 | to output CIGAR string, aligned query and subject sequences. 91 | 92 | 21. cigar, CIGAR string of the alignment. (optional with -a/--all) 93 | 22. qseq, Aligned part of query sequence. (optional with -a/--all) 94 | 23. sseq, Aligned part of subject sequence. (optional with -a/--all) 95 | 24. align, Alignment text ("|" and " ") between qseq and sseq. (optional with -a/--all) 96 | 97 | 98 | An example: 99 | 100 | # Extracting similar sequences for a query gene. 101 | 102 | # search matches with query coverage >= 90% 103 | lexicmap search -d gtdb_complete.lmi/ b.gene_E_faecalis_SecY.fasta -o results.tsv \ 104 | --min-qcov-per-hsp 90 --all 105 | 106 | # extract matched sequences as FASTA format 107 | sed 1d results.tsv | awk -F'\t' '{print ">"$5":"$15"-"$16":"$17"\n"$23;}' \ 108 | | seqkit seq -g > results.fasta 109 | 110 | seqkit head -n 1 results.fasta | head -n 3 111 | >NZ_JALSCK010000007.1:39224-40522:- 112 | TTGTTCAAGCTATTAAAGAACGCCTTTAAAGTCAAAGACATTAGATCAAAAATCTTATTT 113 | ACAGTTTTAATCTTGTTTGTATTTCGCCTAGGTGCGCACATTACTGTGCCCGGGGTGAAT 114 | 115 | 116 | And `lexicmap util 2blast` can help to convert the tabular format to Blast-style format, 117 | see [examples](https://bioinf.shenwei.me/LexicMap/usage/utils/2blast/#examples). 118 | 119 | ## How can I extract the upstream and downstream flanking sequences of matched regions? 120 | 121 | [lexicmap utils subseq](https://bioinf.shenwei.me/LexicMap/usage/utils/subseq/) 122 | can extract subsequences via genome ID, sequence ID and positions. 123 | So you can use this information from the search result and expand the region positions to extract flanking sequences. 124 | 125 | 126 | 127 | ## Why isn't the pident 100% when aligning with a sequence from the reference genomes? 128 | 129 | It happens if there are some degenerate bases (e.g., `N`) in the query sequence. 130 | In the indexing step, all degenerate bases are converted to their lexicographic first bases. E.g., `N` is converted to `A`. 
131 | While for the query sequences, we don't convert them. 132 | 133 | 134 | ## Why is LexicMap slow for batch searching? 135 | 136 | LexicMap is mainly designed for sequence alignment with a small number of queries against a database with a huge number (millions) of genomes. 137 | 138 | There are some ways to improve the search speed of `lexicmap search`: 139 | http://bioinf.shenwei.me/LexicMap/tutorials/search/#improving-searching-speed 140 | 141 | {{< button relref="/usage/search" >}}Click{{< /button >}} to read more detail of the usage. 142 | 143 | ## How can I know if an index is compatible with a LexicMap version? Should I rebuild an existing index? 144 | 145 | LexicMap is under active development, but we are striving to preserve index compatibility as we implement new features and improvements. 146 | The change history and compatibility information are available [here](https://bioinf.shenwei.me/LexicMap/tutorials/index/#index-format-changelog). 147 | -------------------------------------------------------------------------------- /docs/content/installation/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Installation 3 | weight: 20 4 | --- 5 | 6 | LexicMap can be installed via [conda](#conda), downloading [executable binary files](#binary-files), 7 | or [compiling from the source](#compile-from-the-source). 8 | 9 | Besides, it supports [shell completion](#shell-completion), which could help accelerate typing. 10 | 11 | ## Conda/Pixi 12 | 13 | [Install conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html), then run 14 | 15 | conda install -c bioconda lexicmap 16 | 17 | Or use [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html), which is faster. 18 | 19 | conda install -c conda-forge mamba 20 | mamba install -c bioconda lexicmap 21 | 22 | Or use [pixi](https://pixi.sh/), which is even faster. 
23 | 24 | pixi config channels add bioconda 25 | pixi add lexicmap 26 | 27 | Linux and MacOS (both x86 and arm CPUs) are supported. 28 | 29 | ## Binary files 30 | 31 | {{< tabs "uniqueid" >}} 32 | 33 | {{< tab "Linux" >}} 34 | 35 | 1. Download the binary file. 36 | 37 | |OS |Arch |File, 中国镜像 | 38 | |:------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 39 | |Linux |**64-bit**|[**lexicmap_linux_amd64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_linux_amd64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_linux_amd64.tar.gz) | 40 | |Linux |arm64 |[**lexicmap_linux_arm64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_linux_arm64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_linux_arm64.tar.gz) | 41 | 42 | 2. Decompress it: 43 | 44 | tar -zxvf lexicmap_linux_amd64.tar.gz 45 | 46 | 3. If you have the root privilege, simply copy it to `/usr/local/bin`: 47 | 48 | sudo cp lexicmap /usr/local/bin/ 49 | 50 | 4. If you don't have the root privilege, copy it to any directory in the environment variable `PATH`: 51 | 52 | mkdir -p $HOME/bin/; cp lexicmap $HOME/bin/ 53 | 54 | And optionally add the directory into the environment variable `PATH` if it's not in. 55 | 56 | # bash 57 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.bashrc 58 | source $HOME/.bashrc # apply the configuration 59 | 60 | # zsh 61 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.zshrc 62 | source $HOME/.zshrc # apply the configuration 63 | 64 | 65 | {{< /tab >}} 66 | 67 | {{< tab "MacOS" >}} 68 | 69 | 1. Download the binary file. 
70 | 71 | |OS |Arch |File, 中国镜像 | 72 | |:------|:---------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 73 | |macOS |64-bit|[**lexicmap_darwin_amd64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_darwin_amd64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_darwin_amd64.tar.gz) | 74 | |macOS |**arm64** |[**lexicmap_darwin_arm64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_darwin_arm64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_darwin_arm64.tar.gz) | 75 | 76 | 2. Copy it to any directory in the environment variable `PATH`: 77 | 78 | mkdir -p $HOME/bin/; cp lexicmap $HOME/bin/ 79 | 80 | And optionally add the directory into the environment variable `PATH` if it's not in. 81 | 82 | # bash 83 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.bashrc 84 | source $HOME/.bashrc # apply the configuration 85 | 86 | # zsh 87 | echo export PATH=\$PATH:\$HOME/bin/ >> $HOME/.zshrc 88 | source $HOME/.zshrc # apply the configuration 89 | 90 | 91 | {{< /tab >}} 92 | 93 | {{< tab "FreeBSD" >}} 94 | 95 | 1. Download the binary file. 96 | 97 | |OS |Arch |File, 中国镜像 | 98 | |:------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 99 | |FreeBSD|**64-bit**|[**lexicmap_freebsd_amd64.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_freebsd_amd64.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_freebsd_amd64.tar.gz) | 100 | 101 | {{< /tab >}} 102 | 103 | 104 | {{< tab "Windows" >}} 105 | 106 | 1. Download the binary file. 
107 | 108 | 109 | |OS |Arch |File, 中国镜像 | 110 | |:------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 111 | |Windows|**64-bit**|[**lexicmap_windows_amd64.exe.tar.gz**](https://github.com/shenwei356/LexicMap/releases/download/v0.7.0/lexicmap_windows_amd64.exe.tar.gz), [中国镜像](http://app.shenwei.me/data/lexicmap/lexicmap_windows_amd64.exe.tar.gz)| 112 | 113 | 114 | 2. Decompress it. 115 | 116 | 2. Copy `lexicmap.exe` to `C:\WINDOWS\system32`. 117 | 118 | {{< /tab >}} 119 | 120 | {{< tab "Others" >}} 121 | 122 | - Please [open an issue](https://github.com/shenwei356/LexicMap/issues) to request binaries for other platforms. 123 | - Or [compiling from the source](#compile-from-the-source). 124 | 125 | {{< /tab>}} 126 | 127 | 128 | {{< /tabs >}} 129 | 130 | 131 | 132 | ## Compile from the source 133 | 134 | 135 | 1. [Install go](https://go.dev/doc/install) (go 1.22 or later versions). 136 | 137 | wget https://go.dev/dl/go1.24.1.linux-amd64.tar.gz 138 | 139 | tar -zxf go1.24.1.linux-amd64.tar.gz -C $HOME/ 140 | 141 | # or 142 | # echo "export PATH=$PATH:$HOME/go/bin" >> ~/.bashrc 143 | # source ~/.bashrc 144 | export PATH=$PATH:$HOME/go/bin 145 | 146 | 2. Compile LexicMap. 
147 | 148 | # ------------- the latest stable version ------------- 149 | 150 | go install -v github.com/shenwei356/LexicMap@latest 151 | 152 | # The executable binary file is located in: 153 | # ~/go/bin/lexicmap 154 | # You can also move it to anywhere in the $PATH 155 | mkdir -p $HOME/bin 156 | cp ~/go/bin/lexicmap $HOME/bin/ 157 | 158 | 159 | # --------------- the development version -------------- 160 | 161 | git clone https://github.com/shenwei356/LexicMap 162 | cd LexicMap/lexicmap/ 163 | go build 164 | 165 | # The executable binary file is located in: 166 | # ./lexicmap 167 | # You can also move it to anywhere in the $PATH 168 | mkdir -p $HOME/bin 169 | cp ./lexicmap $HOME/bin/ 170 | 171 | 172 | ## Shell-completion 173 | 174 | Supported shell: bash|zsh|fish|powershell 175 | 176 | Bash: 177 | 178 | # generate completion shell 179 | lexicmap autocompletion --shell bash 180 | 181 | # configure if never did. 182 | # install bash-completion if the "complete" command is not found. 183 | echo "for bcfile in ~/.bash_completion.d/* ; do source \$bcfile; done" >> ~/.bash_completion 184 | echo "source ~/.bash_completion" >> ~/.bashrc 185 | 186 | Zsh: 187 | 188 | # generate completion shell 189 | lexicmap autocompletion --shell zsh --file ~/.zfunc/_lexicmap 190 | 191 | # configure if never did 192 | echo 'fpath=( ~/.zfunc "${fpath[@]}" )' >> ~/.zshrc 193 | echo "autoload -U compinit; compinit" >> ~/.zshrc 194 | 195 | fish: 196 | 197 | lexicmap autocompletion --shell fish --file ~/.config/fish/completions/lexicmap.fish 198 | -------------------------------------------------------------------------------- /docs/content/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | MapexicL 64 | -------------------------------------------------------------------------------- /docs/content/notes/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Notes 3 | weight: 100 4 | --- 5 | 
-------------------------------------------------------------------------------- /docs/content/notes/motivation.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Motivation 3 | weight: 0 4 | --- 5 | 6 | 1. BLASTN is not able to scale to millions of bacterial genomes; it's slow and has a high memory occupation. 7 | For example, it requires >2000 GB for aligning a 2-kb gene sequence against all the 2.34 million prokaryotic genomes in Genbank and RefSeq. 8 | 9 | 2. [Large-scale sequence searching tools](https://kamimrcht.github.io/webpage/set_kmer_sets2.html) only return which genomes a query matches (color), but they can't return positional information. 10 | -------------------------------------------------------------------------------- /docs/content/performance@genbank.tsv: -------------------------------------------------------------------------------- 1 | Query	Genome hits	Genome hits
(high-similarity) Genome hits
(medium-similarity) Genome hits
(low-similarity) Time RAM 2 | A 1.3-kb marker gene 41718 11746 115 29857 3m:06s 3.97 GB 3 | A 1.5-kb 16S rRNA 1955167 245884 501691 1207592 32m:59s 11.09 GB 4 | A 52.8-kb plasmid 560330 96 15370 544864 52m:22s 14.48 GB 5 | 1003 AMR genes 30967882 7636386 4858063 18473433 15h:52m:08s 24.86 GB 6 | -------------------------------------------------------------------------------- /docs/content/performance@genbank.tsv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cat performance@genbank.tsv \ 4 | | csvtk replace -t -f RAM -p ' .+' \ 5 | | csvtk rename -t -f RAM -n 'RAM(GB)' \ 6 | | csvtk replace -t -f Query -p 'marker ' \ 7 | | csvtk replace -t -f Time -p '(\d+h:\d+m):\d+s' -r '$1' \ 8 | | csvtk cut -t -f 1,2,6,7 \ 9 | | csvtk comma -t -f 2 \ 10 | | csvtk csv2md -t -a l,r,r,r 11 | 12 | echo 13 | 14 | cat performance@genbank.tsv \ 15 | | csvtk comma -t -f 2-5 \ 16 | | csvtk csv2md -t -a l,r,r,r,r,r,r 17 | -------------------------------------------------------------------------------- /docs/content/tutorials/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Tutorials 3 | weight: 40 4 | --- 5 | -------------------------------------------------------------------------------- /docs/content/tutorials/index/parameters-batches.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | **`-b/--batch-size`** Max: 131072, default: 5000 Maximum number of genomes in each batch If the number of input files exceeds this number, input files are split into multiple batches and indexes are built for all batches. In the end, seed files are merged, while genome data files are kept unchanged and collected. ■ Bigger values increase indexing memory occupation and increase batch searching speed, while single query searching speed is not affected. 
3 | -------------------------------------------------------------------------------- /docs/content/tutorials/index/parameters-general.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | **`-j/--threads`** Default: all available CPUs Number of CPU cores to use. ► If the value is smaller than the number of available CPUs, make sure set the same value to `-c/--chunks`. 3 | -------------------------------------------------------------------------------- /docs/content/tutorials/index/parameters-masks.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | `-M/--mask-file` A file File with custom masks "File with custom masks, which could be exported from an existing index or newly generated by ""lexicmap utils masks"". This flag oversides `-k/--kmer`, `-m/--masks`, `-s/--rand-seed`, etc." 3 | **`-k/--kmer`** Max: 32, default: 31 K-mer size ■ Bigger values improve the search specificity and do not increase the index size. 4 | **`-m/--masks`** Default: 20,000 Number of masks ■ Bigger values improve the search sensitivity slightly, increase the index size, and slow down the search speed. For smaller genomes like phages/viruses, m=5,000 is high enough. 5 | -------------------------------------------------------------------------------- /docs/content/tutorials/index/parameters-seeds.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | **`--seed-max-desert`** Default: 100 Maximum length of distances between seeds The default value of 100 guarantees queries >=200 bp would match at least two seeds. ► Large regions with no seeds are called sketching deserts. Deserts with seed distance larger than this value will be filled by choosing k-mers roughly every --seed-in-desert-dist (50 by default) bases. 
■ Bigger values decrease the search sensitivity for distant targets, speed up the indexing speed, decrease the indexing memory occupation and decrease the index size. While the alignment speed is almost not affected. 3 | **`-c/--chunks`** Maximum: 128, default: value of -j/--threads Number of seed file chunks Bigger values accelerate the search speed at the cost of a high disk reading load. ► The value should not exceed the maximum number of open files set by the operating systems. ► Make sure the value of `-j/--threads` in `lexicmap search` is >= this value. 4 | **`-J/--seed-data-threads`** Maximum: -c/--chunks, default: 8 Number of threads for writing seed data and merging seed chunks from all batches The actual value is min(--seed-data-threads, max(1, --max-open-files/($batches_1_round + 2))), where $batches_1_round = min(int($input_files / --batch-size), --max-open-files). ■ Bigger values increase indexing speed at the cost of slightly higher memory occupation. 5 | `-p/--partitions` Default: 4096 Number of partitions for indexing each seed file Bigger values bring a little higher memory occupation. ► After indexing, `lexicmap utils reindex-seeds` can be used to reindex the seeds data with another value of this flag. 6 | **`--max-open-files`** Default: 1024 Maximum number of open files It's only used in merging indexes of multiple genome batches. If there are >100 batches, i.e., ($input_files / --batch-size), please increase this value and set a bigger `ulimit -n` in shell. 
7 | -------------------------------------------------------------------------------- /docs/content/tutorials/misc/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: More 3 | weight: 40 4 | 5 | geekdocCollapseSection: true 6 | --- 7 | -------------------------------------------------------------------------------- /docs/content/tutorials/misc/index-genbank.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Indexing GenBank+RefSeq 3 | weight: 10 4 | --- 5 | 6 | **Make sure you have enough disk space, >10 TB is preferred.** 7 | 8 | Tools: 9 | 10 | - https://github.com/pirovc/genome_updater, for downloading genomes 11 | - https://github.com/shenwei356/seqkit, for checking sequence files 12 | - https://github.com/shenwei356/rush, for running jobs 13 | 14 | Data: 15 | 16 | time genome_updater.sh -d "refseq,genbank" -g "archaea,bacteria" \ 17 | -f "genomic.fna.gz" -o "genbank" -M "ncbi" -t 12 -m -L curl 18 | 19 | cd genbank/2024-02-15_11-00-51/ 20 | 21 | 22 | # ----------------- check the file integrity ----------------- 23 | 24 | genomes=files 25 | 26 | # corrupted files 27 | # find $genomes -name "*.gz" \ 28 | fd ".gz$" $genomes \ 29 | | rush --eta 'seqkit seq -w 0 {} > /dev/null; if [ $? -ne 0 ]; then echo {}; fi' \ 30 | > failed.txt 31 | 32 | # empty files 33 | find $genomes -name "*.gz" -size 0 >> failed.txt 34 | 35 | # delete these files 36 | cat failed.txt | rush '/bin/rm {}' 37 | 38 | # redownload them: 39 | # run the genome_updater command again, with the flag -i 40 | 41 | Indexing. On a 48-CPU machine, time: 56 h, ram: 181 GB, index size: 4.96 TiB. 42 | If you don't have enough memory, please decrease the value of `-b`. 
43 | 44 | lexicmap index \ 45 | -I files/ \ 46 | --ref-name-regexp '^(\w{3}_\d{9}\.\d+)' \ 47 | -O genbank_refseq.lmi --log genbank_refseq.lmi.log \ 48 | -b 25000 49 | 50 | # dirsize genbank_refseq.lmi 51 | genbank_refseq.lmi: 4.96 TiB (5,454,659,703,138) 52 | 2.79 TiB seeds 53 | 2.17 TiB genomes 54 | 55.81 MiB genomes.map.bin 55 | 156.28 KiB masks.bin 56 | 3.59 KiB genomes.chunks.bin 57 | 619 B info.toml 58 | -------------------------------------------------------------------------------- /docs/content/tutorials/misc/index-globdb.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Indexing GlobDB 3 | weight: 20 4 | --- 5 | 6 | 7 | Info: 8 | 9 | - [GlobDB](https://globdb.org/) , a dereplicated dataset of the species reps of the GTDB, GEM, SPIRE and SMAG datasets a lot. 10 | - https://x.com/daanspeth/status/1822964436950192218 11 | 12 | 13 | Steps: 14 | 15 | # download data 16 | wget https://fileshare.lisc.univie.ac.at/globdb/globdb_r220/globdb_r220_genome_fasta.tar.gz 17 | 18 | tar -zxf globdb_r220_genome_fasta.tar.gz 19 | 20 | # file list 21 | find globdb_r220_genome_fasta/ -name "*.fa.gz" > files.txt 22 | 23 | # index with lexicmap 24 | # elapsed time: 3h:40m:38s 25 | # peak rss: 87.15 GB 26 | lexicmap index -S -X files.txt -O globdb_r220.lmi --log globdb_r220.lmi -g 50000000 27 | 28 | -------------------------------------------------------------------------------- /docs/content/tutorials/misc/index-gtdb.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Indexing GTDB 3 | weight: 5 4 | --- 5 | 6 | Info: 7 | 8 | - https://gtdb.ecogenomic.org/ 9 | 10 | Tools: 11 | 12 | - https://github.com/pirovc/genome_updater, for downloading genomes 13 | - https://github.com/shenwei356/seqkit, for checking sequence files 14 | - https://github.com/shenwei356/rush, for running jobs 15 | 16 | Data: 17 | 18 | time genome_updater.sh -d "refseq,genbank" -g "archaea,bacteria" \ 19 | 
-f "genomic.fna.gz" -o "GTDB_complete" -M "gtdb" -t 12 -m -L curl 20 | 21 | cd GTDB_complete/2024-01-30_19-34-40/ 22 | 23 | 24 | # ----------------- check the file integrity ----------------- 25 | 26 | genomes=files 27 | 28 | # corrupted files 29 | # find $genomes -name "*.gz" \ 30 | fd ".gz$" $genomes \ 31 | | rush --eta 'seqkit seq -w 0 {} > /dev/null; if [ $? -ne 0 ]; then echo {}; fi' \ 32 | > failed.txt 33 | 34 | # empty files 35 | find $genomes -name "*.gz" -size 0 >> failed.txt 36 | 37 | # delete these files 38 | cat failed.txt | rush '/bin/rm {}' 39 | 40 | # redownload them: 41 | # run the genome_updater command again, with the flag -i 42 | 43 | Indexing. On a 48-CPU machine, time: 8h:19m:28s, ram: 73 GB, index size: 906 GB. 44 | If you don't have enough memory, please decrease the value of `-b`. 45 | 46 | lexicmap index \ 47 | -I files/ \ 48 | --ref-name-regexp '^(\w{3}_\d{9}\.\d+)' \ 49 | -O gtdb_complete.lmi --log gtdb_complete.lmi.log \ 50 | -b 5000 51 | 52 | Files: 53 | 54 | $ du -sh files gtdb_complete.lmi --apparent-size 55 | 413G files 56 | 907G gtdb_complete.lmi 57 | 58 | $ dirsize gtdb_complete.lmi 59 | gtdb_complete.lmi: 905.34 GiB (972,098,200,328) 60 | 542.34 GiB seeds 61 | 362.99 GiB genomes 62 | 9.60 MiB genomes.map.bin 63 | 156.28 KiB masks.bin 64 | 616 B info.toml 65 | 168 B genomes.chunks.bin 66 | -------------------------------------------------------------------------------- /docs/content/tutorials/misc/index-uhgg.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Indexing UHGG 3 | weight: 25 4 | --- 5 | 6 | Info: 7 | 8 | - [Unified Human Gastrointestinal Genome (UHGG) v2.0.2](https://www.ebi.ac.uk/metagenomics/genome-catalogues/human-gut-v2-0-2) 9 | - [A unified catalog of 204,938 reference genomes from the human gut microbiome](https://www.nature.com/articles/s41587-020-0603-3) 10 | - Number of Genomes: 289,232 11 | 12 | Tools: 13 | 14 | - https://github.com/shenwei356/seqkit, for 
checking sequence files 15 | - https://github.com/shenwei356/rush, for running jobs 16 | 17 | Data: 18 | 19 | # meta data 20 | wget https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.2/genomes-all_metadata.tsv 21 | 22 | # gff url 23 | sed 1d genomes-all_metadata.tsv | cut -f 20 | sed 's/v2.0/v2.0.2/' | sed -E 's/^ftp/https/' > url.txt 24 | 25 | # download gff files 26 | mkdir -p files; cd files 27 | 28 | time cat ../url.txt \ 29 | | rush --eta -v 'dir={///%}/{//%}' \ 30 | 'mkdir -p {dir}; curl -s -o {dir}/{%} {}' \ 31 | -c -C download.rush -j 12 32 | cd .. 33 | 34 | # extract sequences from gff files 35 | find files/ -name "*.gff.gz" \ 36 | | rush --eta \ 37 | 'zcat {} | perl -ne "print if \$s; \$s=true if /^##FASTA/" | seqkit seq -w 0 -o {/}/{%:}.fna.gz' \ 38 | -c -C extract.rush 39 | 40 | 41 | Indexing. On a 48-CPU machine, time: 3 h, ram: 41 GB, index size: 426 GB. 42 | If you don't have enough memory, please decrease the value of `-b`. 43 | 44 | lexicmap index \ 45 | -I files/ \ 46 | -O uhgg.lmi --log uhgg.lmi.log \ 47 | -b 5000 48 | 49 | File sizes: 50 | 51 | $ du -sh files/ uhgg.lmi 52 | 658G files/ 53 | 509G uhgg.lmi 54 | 55 | $ du -sh files/ uhgg.lmi --apparent-size 56 | 425G files/ 57 | 426G uhgg.lmi 58 | 59 | $ dirsize uhgg.lmi 60 | uhgg.lmi: 425.15 GiB (456,497,171,291) 61 | 243.47 GiB seeds 62 | 181.67 GiB genomes 63 | 6.34 MiB genomes.map.bin 64 | 312.53 KiB masks.bin 65 | 330 B info.toml 66 | -------------------------------------------------------------------------------- /docs/content/tutorials/parameters-align.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | **`-Q/--min-qcov-per-genome`** Default 0 Minimum query coverage (percentage) per genome. 3 | **`-q/--min-qcov-per-hsp`** Default 0 Minimum query coverage (percentage) per HSP. 4 | **`-l/--align-min-match-len`** Default 50 Minimum aligned length in a HSP segment. 
5 | **`-i/--align-min-match-pident`** Default 70 Minimum base identity (percentage) in a HSP segment. 6 | `--align-band` Default 100 Band size in backtracking the score matrix. 7 | `--align-ext-len` Default 1000 Extend length of upstream and downstream of seed regions, for extracting query and target sequences for alignment. It should be <= contig interval length in database. 8 | `--align-max-gap` Default 20 Maximum gap in a HSP segment. 9 | -------------------------------------------------------------------------------- /docs/content/tutorials/parameters-general.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | **`-j/--threads`** Default: all available cpus Number of CPU cores to use. The value should be >= the number of seed chunk files (“chunks” in info.toml, set by `-c/--chunks` in `lexicmap index`). 3 | **`-w/--load-whole-seeds`** Load the whole seed data into memory for faster search Use this if the index is not big and many queries are needed to search. 4 | **`-n/--top-n-genomes`** Default 0, 0 for all Keep top N genome matches for a query in the chaining phase Value 1 is not recommended as the best chaining result does not always bring the best alignment, so it better be >= 5. The final number of genome hits might be smaller than this number as some chaining results might fail to pass the criteria in the alignment step. 5 | **`-a/--all`** Output more columns, e.g., matched sequences. "Use this if you want to output blast-style format with ""lexicmap utils 2blast""" 6 | `-J/--max-query-conc` Default 12, 0 for all Maximum number of concurrent queries Bigger values do not improve the batch searching speed and consume much memory. 7 | `--max-open-files` Default: 1024 Maximum number of open files It mainly affects candidate subsequence extraction. 
Increase this value if you have hundreds of genome batches or have multiple queries, and do not forgot to set a bigger `ulimit -n` in shell if the value is > 1024. 8 | -------------------------------------------------------------------------------- /docs/content/tutorials/parameters-seeding.tsv: -------------------------------------------------------------------------------- 1 | Flag Value Function Comment 2 | **`-p, --seed-min-prefix`** Default 15 Minimum (prefix) length of matched seeds. Smaller values produce more results at the cost of slow speed. 3 | **`-P, --seed-min-single-prefix`** Default 17 Minimum (prefix) length of matched seeds if there's only one pair of seeds matched. Smaller values produce more results at the cost of slow speed. 4 | `--seed-max-dist` Default 1000 Max distance between seeds in seed chaining. It should be <= contig interval length in database. 5 | `--seed-max-gap` Default 50 Max gap in seed chaining. 6 | -------------------------------------------------------------------------------- /docs/content/usage/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Usage 3 | weight: 50 4 | --- 5 | -------------------------------------------------------------------------------- /docs/content/usage/lexicmap.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: lexicmap 3 | weight: 0 4 | --- 5 | 6 | ```plain 7 | $ lexicmap -h 8 | 9 | LexicMap: efficient sequence alignment against millions of prokaryotic genomes 10 | 11 | Version: v0.7.0 12 | Documents: https://bioinf.shenwei.me/LexicMap 13 | Source code: https://github.com/shenwei356/LexicMap 14 | 15 | Usage: 16 | lexicmap [command] 17 | 18 | Available Commands: 19 | autocompletion Generate shell autocompletion scripts 20 | index Generate an index from FASTA/Q sequences 21 | search Search sequences against an index 22 | utils Some utilities 23 | version Print version information and check 
for update 24 | 25 | Flags: 26 | -h, --help help for lexicmap 27 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 28 | appended to files from CLI arguments. 29 | --log string ► Log file. 30 | --quiet ► Do not print any verbose information. But you can write them to a file 31 | with --log. 32 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 33 | (default 16) 34 | 35 | Use "lexicmap [command] --help" for more information about a command. 36 | ``` 37 | -------------------------------------------------------------------------------- /docs/content/usage/search.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: search 3 | weight: 20 4 | --- 5 | 6 | ```plain 7 | $ lexicmap search -h 8 | Search sequences against an index 9 | 10 | Attention: 11 | 1. Input should be (gzipped) FASTA or FASTQ records from files or stdin. 12 | 2. For multiple queries, the order of queries in output might be different from the input. 13 | 14 | Tips: 15 | 1. When using -a/--all, the search result would be formatted to Blast-style format 16 | with 'lexicmap utils 2blast'. And the search speed would be slightly slowed down. 17 | 2. Alignment result filtering is performed in the final phase, so stricter filtering criteria, 18 | including -q/--min-qcov-per-hsp, -Q/--min-qcov-per-genome, and -i/--align-min-match-pident, 19 | do not significantly accelerate the search speed. Hence, you can search with default 20 | parameters and then filter the result with tools like awk or csvtk. 21 | 22 | Alignment result relationship: 23 | 24 | Query 25 | ├── Subject genome 26 | ├── Subject sequence 27 | ├── HSP cluster (a cluster of neighboring HSPs) 28 | ├── High-Scoring segment Pair (HSP) 29 | 30 | Here, the defination of HSP is similar with that in BLAST. Actually there are small gaps in HSPs. 
31 | 32 | > A High-scoring Segment Pair (HSP) is a local alignment with no gaps that achieves one of the 33 | > highest alignment scores in a given search. https://www.ncbi.nlm.nih.gov/books/NBK62051/ 34 | 35 | Output format: 36 | Tab-delimited format with 20+ columns, with 1-based positions. 37 | 38 | 1. query, Query sequence ID. 39 | 2. qlen, Query sequence length. 40 | 3. hits, Number of subject genomes. 41 | 4. sgenome, Subject genome ID. 42 | 5. sseqid, Subject sequence ID. 43 | 6. qcovGnm, Query coverage (percentage) per genome: $(aligned bases in the genome)/$qlen. 44 | 7. cls, Nth HSP cluster in the genome. (just for improving readability) 45 | It's useful to show if multiple adjacent HSPs are collinear. 46 | 8. hsp, Nth HSP in the genome. (just for improving readability) 47 | 9. qcovHSP Query coverage (percentage) per HSP: $(aligned bases in a HSP)/$qlen. 48 | 10. alenHSP, Aligned length in the current HSP. 49 | 11. pident, Percentage of identical matches in the current HSP. 50 | 12. gaps, Gaps in the current HSP. 51 | 13. qstart, Start of alignment in query sequence. 52 | 14. qend, End of alignment in query sequence. 53 | 15. sstart, Start of alignment in subject sequence. 54 | 16. send, End of alignment in subject sequence. 55 | 17. sstr, Subject strand. 56 | 18. slen, Subject sequence length. 57 | 19. evalue, Expect value. 58 | 20. bitscore, Bit score. 59 | 21. cigar, CIGAR string of the alignment. (optional with -a/--all) 60 | 22. qseq, Aligned part of query sequence. (optional with -a/--all) 61 | 23. sseq, Aligned part of subject sequence. (optional with -a/--all) 62 | 24. align, Alignment text ("|" and " ") between qseq and sseq. (optional with -a/--all) 63 | 64 | Result ordering: 65 | For a HSP cluster, SimilarityScore = max(bitscore*pident) 66 | 1. Within each HSP cluster, HSPs are sorted by sstart. 67 | 2. Within each subject genome, HSP clusters are sorted in descending order by SimilarityScore. 68 | 3. 
Results of multiple subject genomes are sorted by the highest SimilarityScore of HSP clusters. 69 | 70 | Usage: 71 | lexicmap search [flags] -d [query.fasta.gz ...] [-o query.tsv.gz] 72 | 73 | Flags: 74 | --align-band int ► Band size in backtracking the score matrix (pseudo alignment 75 | phase). (default 100) 76 | --align-ext-len int ► Extend length of upstream and downstream of seed regions, for 77 | extracting query and target sequences for alignment. It should be 78 | <= contig interval length in database. (default 1000) 79 | --align-max-gap int ► Maximum gap in a HSP segment. (default 20) 80 | -l, --align-min-match-len int ► Minimum aligned length in a HSP segment. (default 50) 81 | -i, --align-min-match-pident float ► Minimum base identity (percentage) in a HSP segment. (default 70) 82 | -a, --all ► Output more columns, e.g., matched sequences. Use this if you 83 | want to output blast-style format with "lexicmap utils 2blast". 84 | --debug ► Print debug information, including a progress bar. 85 | (recommended when searching with one query). 86 | -h, --help help for search 87 | -d, --index string ► Index directory created by "lexicmap index". 88 | -w, --load-whole-seeds ► Load the whole seed data into memory for faster seed 89 | matching. It will consume a lot of RAM. 90 | -e, --max-evalue float ► Maximum evalue of a HSP segment. (default 10) 91 | --max-open-files int ► Maximum opened files. It mainly affects candidate subsequence 92 | extraction. Increase this value if you have hundreds of genome 93 | batches or have multiple queries, and do not forgot to set a 94 | bigger "ulimit -n" in shell if the value is > 1024. (default 1024) 95 | -J, --max-query-conc int ► Maximum number of concurrent queries. Bigger values do not 96 | improve the batch searching speed and consume much memory. 97 | (default 12) 98 | -Q, --min-qcov-per-genome float ► Minimum query coverage (percentage) per genome. 
99 | -q, --min-qcov-per-hsp float ► Minimum query coverage (percentage) per HSP. 100 | -o, --out-file string ► Out file, supports a ".gz" suffix ("-" for stdout). (default "-") 101 | --seed-max-dist int ► Maximum distance between seeds in seed chaining. It should be 102 | <= contig interval length in database. (default 1000) 103 | --seed-max-gap int ► Maximum gap in seed chaining. (default 50) 104 | -p, --seed-min-prefix int ► Minimum (prefix/suffix) length of matched seeds (anchors). 105 | (default 15) 106 | -P, --seed-min-single-prefix int ► Minimum (prefix/suffix) length of matched seeds (anchors) if 107 | there's only one pair of seeds matched. (default 17) 108 | -n, --top-n-genomes int ► Keep top N genome matches for a query (0 for all) in chaining 109 | phase. Value 1 is not recommended as the best chaining result 110 | does not always bring the best alignment, so it better be >= 100. 111 | (default 0) 112 | 113 | Global Flags: 114 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 115 | appended to files from CLI arguments. 116 | --log string ► Log file. 117 | --quiet ► Do not print any verbose information. But you can write them to a file 118 | with --log. 119 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 120 | (default 16) 121 | ``` 122 | 123 | 124 | ## Examples 125 | 126 | See {{< button size="small" relref="tutorials/search" >}}Searching{{< /button >}} 127 | -------------------------------------------------------------------------------- /docs/content/usage/utils/2blast.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 2blast 3 | weight: 0 4 | --- 5 | 6 | ## Usage 7 | 8 | ```plain 9 | $ lexicmap utils 2blast -h 10 | Convert the default search output to blast-style format 11 | 12 | LexicMap only stores genome IDs and sequence IDs, without description information. 
13 | But the option -g/--kv-file-genome enables adding description data after the genome ID 14 | with a tabular key-value mapping file. 15 | 16 | Input: 17 | - Output of 'lexicmap search' with the flag -a/--all. 18 | 19 | Usage: 20 | lexicmap utils 2blast [flags] 21 | 22 | Flags: 23 | -b, --buffer-size string ► Size of buffer, supported unit: K, M, G. You need increase the value 24 | when "bufio.Scanner: token too long" error reported (default "20M") 25 | -h, --help help for 2blast 26 | -i, --ignore-case ► Ignore cases of sgenome and sseqid 27 | -g, --kv-file-genome string ► Two-column tabular file for mapping the target genome ID (sgenome) 28 | to the corresponding value 29 | -s, --kv-file-seq string ► Two-column tabular file for mapping the target sequence ID (sseqid) 30 | to the corresponding value 31 | -o, --out-file string ► Out file, supports and recommends a ".gz" suffix ("-" for stdout). 32 | (default "-") 33 | 34 | Global Flags: 35 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 36 | appended to files from CLI arguments. 37 | --log string ► Log file. 38 | --quiet ► Do not print any verbose information. But you can write them to a file 39 | with --log. 40 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 41 | (default 16) 42 | ``` 43 | 44 | ## Examples 45 | 46 | 47 | From stdin. 
48 | 49 | ```text 50 | $ seqkit seq -M 500 q.long-reads.fasta.gz \ 51 | | seqkit head -n 2 \ 52 | | lexicmap search -d demo.lmi/ -a \ 53 | | lexicmap utils 2blast --kv-file-genome ass2species.map 54 | 55 | Query = GCF_000017205.1_r160 56 | Length = 478 57 | 58 | [Subject genome #1/1] = GCF_000017205.1 Pseudomonas aeruginosa 59 | Query coverage per genome = 98.536% 60 | 61 | >NC_009656.1 62 | Length = 6588339 63 | 64 | HSP cluster #1, HSP #1 65 | Score = 883 bits, Expect = 3.60e-256 66 | Query coverage per seq = 98.536%, Aligned length = 479, Identities = 94.990%, Gaps = 15 67 | Query range = 7-477, Subject range = 4866857-4867328, Strand = Plus/Plus 68 | 69 | Query 7 GGTGGCCCTCAAACGAGTCC-AACAGGCCAACGCCTAGCAATCCCTCCCCTGTGGGGCAG 65 70 | ||||||| |||||||||||| |||||||| |||||| | ||||||||||||| |||||| 71 | Sbjct 4866857 GGTGGCC-TCAAACGAGTCCGAACAGGCCCACGCCTCACGATCCCTCCCCTGTCGGGCAG 4866915 72 | 73 | Query 66 GGAAAATCGTCCTTTATGGTCCGTTCCGGGCACGCACCGGAACGGCGGTCATCTTCCACG 125 74 | |||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||| 75 | Sbjct 4866916 GGAAAATCGTCCTTTATGGTCCGTTCCGGGCACGCACCGGAACGGCGGTCAT-TTCCACG 4866974 76 | 77 | Query 126 GTGCCCGCCCACGGCGGACCCGCGGAAACCGACCCGGGCGCCAAGGCGCCCGGGAACGGA 185 78 | ||||||||| ||||||||||| |||||||||||||||||||||||||||||||||||||| 79 | Sbjct 4866975 GTGCCCGCC-ACGGCGGACCC-CGGAAACCGACCCGGGCGCCAAGGCGCCCGGGAACGGA 4867032 80 | 81 | Query 186 GTA-CACTCGGCGTTCGGCCAGCGACAGC---GACGCGTTGCCGCCCACCGCGGTGGTGT 241 82 | ||| |||||||||| |||||||||||||| |||||||||||||||||||||||||||| 83 | Sbjct 4867033 GTATCACTCGGCGT-CGGCCAGCGACAGCAGCGACGCGTTGCCGCCCACCGCGGTGGTGT 4867091 84 | 85 | Query 242 TCACCGAGGTGGTGCGCTCGCTGAC-AAACGCAGCAGGTAGTTCGGCCCGCCGGCCTTGG 300 86 | ||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||| 87 | Sbjct 4867092 TCACCGAGGTGGTGCGCTCGCTGACGAAACGCAGCAGGTAGTTCGGCCCGCCGGCCTTGG 4867151 88 | 89 | Query 301 GACCG-TGCCGGACAGCCCGTGGCCGCCGAACAGTTGCACGCCCACCACCGCGCCGAT-T 358 90 | ||||| |||||||||||||||||||||||||| ||||||||||||||||||||||||| | 91 
| Sbjct 4867152 GACCGGTGCCGGACAGCCCGTGGCCGCCGAACGGTTGCACGCCCACCACCGCGCCGATCT 4867211 92 | 93 | Query 359 GGTTTCGGTTGACGTAGAGGTTGCCGACCCGCGCCAGCTCTTGGATGCGGCGGGCGGTTT 418 94 | |||| ||||||||||||||||||||||||||||||||||||| ||||||||||||||||| 95 | Sbjct 4867212 GGTTGCGGTTGACGTAGAGGTTGCCGACCCGCGCCAGCTCTTCGATGCGGCGGGCGGTTT 4867271 96 | 97 | Query 419 CCTCGTTGCGGCTGTGGACCCCCATGGTCAGGCCGAAACCGGTGGCGTTTGATGGCCCT 477 98 | ||||||||||||||||||||||||||||||||||||||||||||||||| ||| ||| | 99 | Sbjct 4867272 CCTCGTTGCGGCTGTGGACCCCCATGGTCAGGCCGAAACCGGTGGCGTT-GATCGCC-T 4867328 100 | 101 | 102 | Query = GCF_006742205.1_r100 103 | Length = 431 104 | 105 | [Subject genome #1/1] = GCF_006742205.1 Staphylococcus epidermidis 106 | Query coverage per genome = 93.968% 107 | 108 | >NZ_AP019721.1 109 | Length = 2422602 110 | 111 | HSP cluster #1, HSP #1 112 | Score = 740 bits, Expect = 2.39e-213 113 | Query coverage per seq = 93.968%, Aligned length = 408, Identities = 98.284%, Gaps = 4 114 | Query range = 27-431, Subject range = 1321677-1322083, Strand = Plus/Minus 115 | 116 | Query 27 TTCATTTAAAACGATTGCTAATGAGTCACGTATTTCATCTGGTTCGGTAACTATACCGTC 86 117 | ||||| |||||||||||||||||||||||||||||||||||||||||||||||||||||| 118 | Sbjct 1322083 TTCATCTAAAACGATTGCTAATGAGTCACGTATTTCATCTGGTTCGGTAACTATACCGTC 1322024 119 | 120 | Query 87 TACTATGGACTCAGTGTAACCCTGTAATAAAGAGATTGGCGTACGTAATTCATGTG-TAC 145 121 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||| 122 | Sbjct 1322023 TACTATGGACTCAGTGTAACCCTGTAATAAAGAGATTGGCGTACGTAATTCATGTGATAC 1321964 123 | 124 | Query 146 ATTTGCTATAAAATCTTTTTTCATTTGATCAAGATTATGTTCATTTGTCATATCACAGGA 205 125 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||| 126 | Sbjct 1321963 ATTTGCTATAAAATCTTTTTTCATTTGATCAAGATTATGTTCATTTGTCATATCAC-GGA 1321905 127 | 128 | Query 206 TGACCATGACAATACCACTTCTACCATTTGTTTGAATTCTATCTATATAACTGGAGATAA 265 129 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 130 | Sbjct 1321904 
TGACCATGACAATACCACTTCTACCATTTGTTTGAATTCTATCTATATAACTGGAGATAA 1321845 131 | 132 | Query 266 ATACATAGTACCTTGTATTAATTTCTAATTCTAA-TACTCATTCTGTTGTGATTCAAATG 324 133 | |||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||| 134 | Sbjct 1321844 ATACATAGTACCTTGTATTAATTTCTAATTCTAAATACTCATTCTGTTGTGATTCAAATG 1321785 135 | 136 | Query 325 GTGCTTCAATTTGCTGTTCAATAGATTCTTTTGAAAAATCATCAATGTGACGCATAATAT 384 137 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 138 | Sbjct 1321784 TTGCTTCAATTTGCTGTTCAATAGATTCTTTTGAAAAATCATCAATGTGACGCATAATAT 1321725 139 | 140 | Query 385 AATCAGCCATCTTGTT-GACAATATGATTTCACGTTGATTATTAATGC 431 141 | ||||||||||||||| ||||||||||||||||||||||||||||||| 142 | Sbjct 1321724 CATCAGCCATCTTGTTTGACAATATGATTTCACGTTGATTATTAATGC 1321677 143 | 144 | ``` 145 | 146 | 147 | From file. 148 | 149 | $ lexicmap utils 2blast r.lexicmap.tsv -o r.lexicmap.txt 150 | -------------------------------------------------------------------------------- /docs/content/usage/utils/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: utils 3 | weight: 40 4 | geekdocCollapseSection: true 5 | --- 6 | 7 | ```plain 8 | $ lexicmap utils 9 | Some utilities 10 | 11 | Usage: 12 | lexicmap utils [command] 13 | 14 | Available Commands: 15 | 2blast Convert the default search output to blast-style format 16 | genomes View genome IDs in the index 17 | kmers View k-mers captured by the masks 18 | masks View masks of the index or generate new masks randomly 19 | reindex-seeds Recreate indexes of k-mer-value (seeds) data 20 | remerge Rerun the merging step for an unfinished index 21 | seed-pos Extract and plot seed positions via reference name(s) 22 | subseq Extract subsequence via reference name, sequence ID, position and strand 23 | 24 | Flags: 25 | -h, --help help for utils 26 | 27 | Global Flags: 28 | -X, --infile-list string ► File of input file list (one file per line). 
If given, they are 29 | appended to files from CLI arguments. 30 | --log string ► Log file. 31 | --quiet ► Do not print any verbose information. But you can write them to a file 32 | with --log. 33 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 34 | (default 16) 35 | ``` 36 | 37 | 38 | Subcommands: 39 | 40 | - [2blast](2blast/) 41 | - [masks](masks/) 42 | - [kmers](kmers/) 43 | - [genomes](genomes/) 44 | - [subseq](subseq/) 45 | - [seed-pos](seed-pos/) 46 | - [reindex-seeds](reindex-seeds/) 47 | - [remerge](remerge/) 48 | -------------------------------------------------------------------------------- /docs/content/usage/utils/genomes.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: genomes 3 | weight: 20 4 | --- 5 | 6 | ## Usage 7 | 8 | ```plain 9 | $ lexicmap utils genomes -h 10 | View genome IDs in the index 11 | 12 | Usage: 13 | lexicmap utils genomes [flags] 14 | 15 | Flags: 16 | -h, --help help for genomes 17 | -d, --index string ► Index directory created by "lexicmap index". 18 | -o, --out-file string ► Out file, supports the ".gz" suffix ("-" for stdout). (default "-") 19 | 20 | Global Flags: 21 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 22 | appended to files from CLI arguments. 23 | --log string ► Log file. 24 | --quiet ► Do not print any verbose information. But you can write them to a file 25 | with --log. 26 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 
27 | (default 8) 28 | ``` 29 | 30 | ## Examples 31 | 32 | 33 | ``` 34 | $ lexicmap utils genomes -d demo.lmi/ 35 | GCF_000148585.2 36 | GCF_001457655.1 37 | GCF_900638025.1 38 | GCF_001096185.1 39 | GCF_006742205.1 40 | GCF_001544255.1 41 | GCF_000392875.1 42 | GCF_001027105.1 43 | GCF_009759685.1 44 | GCF_002949675.1 45 | GCF_002950215.1 46 | GCF_000006945.2 47 | GCF_003697165.2 48 | GCF_000742135.1 49 | GCF_000017205.1 50 | ``` 51 | -------------------------------------------------------------------------------- /docs/content/usage/utils/masks.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: masks 3 | weight: 5 4 | --- 5 | 6 | ```plain 7 | $ lexicmap utils masks -h 8 | View masks of the index or generate new masks randomly 9 | 10 | Usage: 11 | lexicmap utils masks [flags] { -d | [-k ] [-n ] [-s ] } [-o out.tsv.gz] 12 | 13 | Flags: 14 | -h, --help help for masks 15 | -d, --index string ► Index directory created by "lexicmap index". 16 | -k, --kmer int ► Maximum k-mer size. K needs to be <= 32. (default 31) 17 | -m, --masks int ► Number of masks. (default 40000) 18 | -o, --out-file string ► Out file, supports and recommends a ".gz" suffix ("-" for stdout). 19 | (default "-") 20 | -p, --prefix int ► Length of mask k-mer prefix for checking low-complexity (0 for no 21 | checking). (default 15) 22 | -s, --seed int ► The seed for generating random masks. (default 1) 23 | 24 | Global Flags: 25 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 26 | appended to files from CLI arguments. 27 | --log string ► Log file. 28 | --quiet ► Do not print any verbose information. But you can write them to a file 29 | with --log. 30 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 
31 | (default 16) 32 | ``` 33 | 34 | ## Examples 35 | 36 | ```plain 37 | $ lexicmap utils masks --quiet -d demo.lmi/ | head -n 10 38 | 1 AAAAAAATTCTCGGCGGTGTTTCCAGGCGCA 39 | 2 AAAAAACGTGGCGTCCCCTGTATAACGGCTA 40 | 3 AAAAAAGAGGGGAAGCAAGCTGAAGGATATG 41 | 4 AAAAAATACAGGCTGGCATCTTTAACCCACC 42 | 5 AAAAAATCCAGGGTTCCGTTAAGGATCTGTC 43 | 6 AAAAACATTCATGCTAGCATACCTTGGCAAC 44 | 7 AAAAACCACAATGTGGAAGCACGAGAGGATT 45 | 8 AAAAACCTGTACCCACCCGACGTGGATCCTC 46 | 9 AAAAACGTAGGCGTACCTCTCATAGCTTGTA 47 | 10 AAAAACTATGGATACTTGCCGTAAATCACCT 48 | 49 | $ lexicmap utils masks --quiet -d demo.lmi/ | tail -n 10 50 | 19991 TTTTTGAACTTGTGAAAAAGGCAGATGTGTG 51 | 19992 TTTTTGCGTTTATGCTGCCCTCAAACCATCT 52 | 19993 TTTTTGGATCCACTGTACGAGCACACTACCC 53 | 19994 TTTTTGTGGCTCATCGGGATCGGGAGCAGTC 54 | 19995 TTTTTTACATGTTGGGCTAGGGGCGGTTCAC 55 | 19996 TTTTTTATCGGACGCCAAGTTTGTAATCGTC 56 | 19997 TTTTTTCTTGCATCGTATTCAGCACGTTCCT 57 | 19998 TTTTTTGCCGAGTGACCCCGAAAAGCTCACA 58 | 19999 TTTTTTTATCGAGGCATGGTTGAAGACGGGT 59 | 20000 TTTTTTTCCGTAACTAGGTTCTGGCGATTCC 60 | 61 | # check a specific mask 62 | 63 | $ lexicmap utils masks --quiet -d demo.lmi/ -m 12345 64 | 12345 GCTGCACACGCAAAGACTCACGTCTTCAACG 65 | ``` 66 | 67 | Frequency of prefixes. 68 | 69 | ``` 70 | $ lexicmap utils masks --quiet -d demo.lmi/ \ 71 | | csvtk mutate -Ht -f 2 -p '^(.{7})' \ 72 | | csvtk freq -Ht -f 3 -nr \ 73 | | head -n 10 74 | AAAAAAT 2 75 | AAAAACC 2 76 | AAAAACT 2 77 | AAAAAGG 2 78 | AAAAAGT 2 79 | AAAAATT 2 80 | AAAACCA 2 81 | AAAACCC 2 82 | AAAACGA 2 83 | AAAACTA 2 84 | 85 | $ lexicmap utils masks --quiet -d demo.lmi/ \ 86 | | csvtk mutate -Ht -f 2 -p '^(.{7})' \ 87 | | csvtk freq -Ht -f 3 -n \ 88 | | head -n 10 89 | AAAAAAA 1 90 | AAAAAAC 1 91 | AAAAAAG 1 92 | AAAAACA 1 93 | AAAAACG 1 94 | AAAAAGA 1 95 | AAAAAGC 1 96 | AAAAATA 1 97 | AAAAATC 1 98 | AAAAATG 1 99 | ``` 100 | 101 | Frequency of frequencies. i.e., for 20,000 masks, there are 4^7 = 16384 possible 7-bp prefixes. 102 | Among them, 3,616 are duplicated 2 times. 12768 + 2 * 3616 = 20000. 
103 | 104 | ``` 105 | $ lexicmap utils masks --quiet -d demo.lmi/ \ 106 | | csvtk mutate -Ht -f 2 -p '^(.{7})' \ 107 | | csvtk freq -Ht -f 3 -n \ 108 | | csvtk freq -Ht -f 2 -k 109 | 1 12768 110 | 2 3616 111 | ``` 112 | -------------------------------------------------------------------------------- /docs/content/usage/utils/reindex-seeds.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: reindex-seeds 3 | weight: 50 4 | --- 5 | 6 | ## Usage 7 | 8 | ```plain 9 | $ lexicmap utils reindex-seeds -h 10 | Recreate indexes of k-mer-value (seeds) data 11 | 12 | Usage: 13 | lexicmap utils reindex-seeds [flags] 14 | 15 | Flags: 16 | -h, --help help for reindex-seeds 17 | -d, --index string ► Index directory created by "lexicmap index". 18 | --partitions int ► Number of partitions for re-indexing seeds (k-mer-value data) files. The 19 | value needs to be the power of 4. (default 4096) 20 | 21 | Global Flags: 22 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 23 | appended to files from CLI arguments. 24 | --log string ► Log file. 25 | --quiet ► Do not print any verbose information. But you can write them to a file 26 | with --log. 27 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 28 | (default 16) 29 | ``` 30 | 31 | ## Examples 32 | 33 | 34 | $ lexicmap utils reindex-seeds -d demo.lmi/ --partitions 1024 35 | 10:20:29.150 [INFO] recreating seed indexes with 1024 partitions for: demo.lmi/ 36 | processed files: 16 / 16 [======================================] ETA: 0s. 
done 37 | 10:20:29.166 [INFO] update index information file: demo.lmi/info.toml 38 | 10:20:29.166 [INFO] finished updating the index information file: demo.lmi/info.toml 39 | 10:20:29.166 [INFO] 40 | 10:20:29.166 [INFO] elapsed time: 15.981266ms 41 | 10:20:29.166 [INFO] 42 | -------------------------------------------------------------------------------- /docs/content/usage/utils/remerge.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: remerge 3 | weight: 60 4 | --- 5 | 6 | ```plain 7 | $ lexicmap utils remerge -h 8 | Rerun the merging step for an unfinished index 9 | 10 | When to use this command? 11 | 12 | - Only one thread is used for merging indexes, which happens when there are 13 | a lot (>200 batches) of batches ($input_files / --batch-size) and the value 14 | of --max-open-files is not big enough. E.g., 15 | 16 | 22:54:24.420 [INFO] merging 297 indexes... 17 | 22:54:24.455 [INFO] [round 1] 18 | 22:54:24.455 [INFO] batch 1/1, merging 297 indexes to xxx.lmi.tmp/r1_b1 with 1 threads... 19 | 20 | ► Then you can run this command with a bigger --max-open-files (e.g., 4096) and 21 | -J/--seed-data-threads (e.g., 12. 12 needs to be <= 4096/(297+2)=13.7). 22 | And you need to set a bigger 'ulimit -n' if the value of --max-open-files is bigger than 1024. 23 | 24 | - The Slurm/PBS job time limit is almost reached and the merging step won't be finished before that. 25 | 26 | - Disk quota is reached in the merging step. 27 | 28 | Usage: 29 | lexicmap utils remerge [flags] -d 30 | 31 | Flags: 32 | -h, --help help for remerge 33 | -d, --index string ► Index directory created by "lexicmap index". 34 | --max-open-files int ► Maximum opened files, used in merging indexes. If there are >100 35 | batches, please increase this value and set a bigger "ulimit -n" in 36 | shell. 
(default 1024) 37 | -J, --seed-data-threads int ► Number of threads for writing seed data and merging seed chunks from 38 | all batches, the value should be in range of [1, -c/--chunks]. If there 39 | are >100 batches, please also increase the value of --max-open-files and 40 | set a bigger "ulimit -n" in shell. (default 8) 41 | 42 | Global Flags: 43 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 44 | appended to files from CLI arguments. 45 | --log string ► Log file. 46 | --quiet ► Do not print any verbose information. But you can write them to a file 47 | with --log. 48 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 49 | (default 16) 50 | ``` 51 | -------------------------------------------------------------------------------- /docs/content/usage/utils/subseq.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: subseq 3 | weight: 20 4 | --- 5 | 6 | ## Usage 7 | 8 | ```plain 9 | $ lexicmap utils subseq -h 10 | Extract subsequence via reference name, sequence ID, position and strand 11 | 12 | Attention: 13 | 1. The option -s/--seq-id is optional. 14 | 1) If given, the positions are those in the original sequence. 15 | 2) If not given, the positions are those in the concatenated sequence. 16 | 2. All degenerate bases in reference genomes were converted to the lexicographic first bases. 17 | E.g., N was converted to A. Therefore, consecutive A's in output might be N's in the genomes. 18 | 19 | Usage: 20 | lexicmap utils subseq [flags] 21 | 22 | Flags: 23 | -h, --help help for subseq 24 | -d, --index string ► Index directory created by "lexicmap index". 25 | -w, --line-width int ► Line width of sequence (0 for no wrap). (default 60) 26 | -o, --out-file string ► Out file, supports the ".gz" suffix ("-" for stdout). 27 | (default "-") 28 | -n, --ref-name string ► Reference name. 
29 | -r, --region string ► Region of the subsequence (1-based). 30 | -R, --revcom ► Extract subsequence on the negative strand. 31 | -s, --seq-id string ► Sequence ID. If the value is empty, the positions in the region are 32 | treated as that in the concatenated sequence. 33 | 34 | Global Flags: 35 | -X, --infile-list string ► File of input file list (one file per line). If given, they are 36 | appended to files from CLI arguments. 37 | --log string ► Log file. 38 | --quiet ► Do not print any verbose information. But you can write them to a file 39 | with --log. 40 | -j, --threads int ► Number of CPU cores to use. By default, it uses all available cores. 41 | (default 16) 42 | ``` 43 | 44 | ## Examples 45 | 46 | 1. Extracting subsequence with genome ID, sequence ID, position range and strand information. 47 | 48 | 49 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -s NZ_CP033092.2 -r 4591684:4593225 -R 50 | >NZ_CP033092.2:4591684-4593225:- 51 | AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAA 52 | GTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAA 53 | TGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCAT 54 | AACGTCGCAAGACCAAAGAGGGGGACCTTAGGGCCTCTTGCCATCGGATGTGCCCAGATG 55 | GGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGA 56 | GGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGG 57 | GGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCT 58 | TCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATT 59 | GACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAG 60 | GGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCA 61 | GATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTC 62 | GTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACC 63 | GGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCA 64 | AACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCC 65 | CTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCA 66 | 
AGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAAT 67 | TCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAG 68 | AATGTGCCTTCGGGAACCGTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGA 69 | AATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTCCGGC 70 | CGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTC 71 | ATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCG 72 | ACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAAC 73 | TCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGT 74 | TCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGT 75 | AGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAA 76 | CAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA 77 | 78 | 1. If the sequence ID (`-s/--seq-id`) is not given, the positions are these in the concatenated sequence. 79 | 80 | Checking sequence lengths of a genome with [seqkit](https://github.com/shenwei356/seqkit). 81 | 82 | $ seqkit fx2tab -nil refs/GCF_003697165.2.fa.gz 83 | NZ_CP033092.2 4903501 84 | NZ_CP033091.2 131333 85 | 86 | Extracting the 1000-bp interval sequence inserted by `lexicmap index`. 
87 | 88 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -r 4903502:4904501 89 | >GCF_003697165.2:4903502-4904501:+ 90 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 91 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 92 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 93 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 94 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 95 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 96 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 97 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 98 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 99 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 100 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 101 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 102 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 103 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 104 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 105 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 106 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 107 | 108 | 1. It detects if the end position is larger than the sequence length. 
109 | 110 | # the length of NZ_CP033092.2 is 4903501 111 | 112 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -s NZ_CP033092.2 -r 4903501:1000000000 113 | >NZ_CP033092.2:4903501-4903501:+ 114 | C 115 | 116 | 117 | $ lexicmap utils subseq -d demo.lmi/ -n GCF_003697165.2 -s NZ_CP033092.2 -r 4903502:1000000000 118 | >NZ_CP033092.2:4903502-4903501:+ 119 | 120 | -------------------------------------------------------------------------------- /docs/data/menu/extra.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | header: 3 | - name: GitHub 4 | ref: https://github.com/shenwei356/LexicMap 5 | icon: gdoc_github 6 | external: true 7 | -------------------------------------------------------------------------------- /docs/data/menu/more.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | more: 3 | - name: More tools 4 | ref: "https://github.com/shenwei356" 5 | external: true 6 | icon: "gdoc_github" 7 | -------------------------------------------------------------------------------- /docs/hugo.toml: -------------------------------------------------------------------------------- 1 | baseURL = 'https://bioinf.shenwei.me/LexicMap' 2 | languageCode = 'en-us' 3 | title = 'LexicMap: efficient sequence alignment against millions of prokaryotic genomes​' 4 | theme = 'hugo-geekdoc' 5 | 6 | defaultContentLanguage = 'en' 7 | 8 | pluralizeListTitles = false 9 | 10 | # Geekdoc required configuration 11 | pygmentsUseClasses = true 12 | pygmentsCodeFences = true 13 | disablePathToLower = true 14 | 15 | # Required if you want to render robots.txt template 16 | enableRobotsTXT = true 17 | 18 | # Needed for mermaid shortcodes 19 | [markup] 20 | [markup.goldmark.renderer] 21 | # Needed for mermaid shortcode or when nesting shortcodes (e.g. 
img within 22 | # columns or tabs) 23 | unsafe = true 24 | [markup.tableOfContents] 25 | startLevel = 1 26 | endLevel = 3 27 | 28 | [taxonomies] 29 | tag = "tags" 30 | 31 | [params] 32 | geekdocToC = 3 33 | 34 | geekdocLogo = "logo.svg" 35 | -------------------------------------------------------------------------------- /docs/static/GCF_000017205.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000017205.1.png -------------------------------------------------------------------------------- /docs/static/GCF_000017205.1.seed_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000017205.1.seed_number.png -------------------------------------------------------------------------------- /docs/static/GCF_000392875.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000392875.1.png -------------------------------------------------------------------------------- /docs/static/GCF_000392875.1.seed_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_000392875.1.seed_number.png -------------------------------------------------------------------------------- /docs/static/GCF_002949675.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_002949675.1.png -------------------------------------------------------------------------------- 
/docs/static/GCF_002949675.1.seed_number.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/GCF_002949675.1.seed_number.png -------------------------------------------------------------------------------- /docs/static/custom.css: -------------------------------------------------------------------------------- 1 | .gdoc-nav nav { 2 | position: fixed; 3 | } 4 | -------------------------------------------------------------------------------- /docs/static/favicon/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/android-chrome-192x192.png -------------------------------------------------------------------------------- /docs/static/favicon/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/android-chrome-512x512.png -------------------------------------------------------------------------------- /docs/static/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /docs/static/favicon/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #da532c 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/static/favicon/favicon-16x16.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /docs/static/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /docs/static/favicon/favicon-48x48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon-48x48.png -------------------------------------------------------------------------------- /docs/static/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/favicon.ico -------------------------------------------------------------------------------- /docs/static/favicon/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | MapexicL 64 | -------------------------------------------------------------------------------- /docs/static/favicon/mstile-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-144x144.png -------------------------------------------------------------------------------- /docs/static/favicon/mstile-150x150.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-150x150.png -------------------------------------------------------------------------------- /docs/static/favicon/mstile-310x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-310x150.png -------------------------------------------------------------------------------- /docs/static/favicon/mstile-310x310.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-310x310.png -------------------------------------------------------------------------------- /docs/static/favicon/mstile-70x70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/favicon/mstile-70x70.png -------------------------------------------------------------------------------- /docs/static/favicon/safari-pinned-tab.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | Created by potrace 1.14, written by Peter Selinger 2001-2017 9 | 10 | 12 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /docs/static/favicon/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "short_name": "", 4 | "icons": [ 5 | { 6 | "src": "/android-chrome-192x192.png", 7 | "sizes": "192x192", 8 | "type": "image/png" 9 | }, 10 | { 11 | "src": "/android-chrome-512x512.png", 12 | "sizes": "512x512", 13 | "type": "image/png" 14 | } 15 | ], 16 | "theme_color": "#ffffff", 17 | "background_color": 
"#ffffff", 18 | "display": "standalone" 19 | } 20 | -------------------------------------------------------------------------------- /docs/static/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | MapexicL 64 | -------------------------------------------------------------------------------- /docs/static/prefix.hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/LexicMap/56421de38339588edecf74bd1955dee109ea9ced/docs/static/prefix.hist.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/shenwei356/LexicMap 2 | 3 | go 1.24 4 | 5 | // replace github.com/shenwei356/lexichash => /home/shenwei/go/src/github.com/shenwei356/lexichash/ 6 | // replace github.com/shenwei356/wfa => /home/shenwei/go/src/github.com/shenwei356/wfa/ 7 | 8 | require ( 9 | github.com/dustin/go-humanize v1.0.1 10 | github.com/iafan/cwalk v0.0.0-20210125030640-586a8832a711 11 | github.com/klauspost/pgzip v1.2.6 12 | github.com/mattn/go-colorable v0.1.13 13 | github.com/mitchellh/go-homedir v1.1.0 14 | github.com/pelletier/go-toml/v2 v2.1.1 15 | github.com/pkg/errors v0.9.1 16 | github.com/rdleal/intervalst v1.3.0 17 | github.com/shenwei356/bio v0.13.6 18 | github.com/shenwei356/go-logging v0.0.0-20171012171522-c6b9702d88ba 19 | github.com/shenwei356/kmers v0.1.0 20 | github.com/shenwei356/lexichash v0.5.0 21 | github.com/shenwei356/util v0.5.2 22 | github.com/shenwei356/wfa v0.4.0 23 | github.com/shenwei356/xopen v0.3.2 24 | github.com/spf13/cobra v1.8.0 25 | github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553 26 | github.com/vbauerster/mpb/v8 v8.7.2 27 | github.com/zeebo/wyhash v0.0.1 28 | gonum.org/v1/gonum v0.14.0 29 | gonum.org/v1/plot v0.14.0 30 | ) 31 | 32 | require ( 33 | git.sr.ht/~sbinet/gg v0.5.0 // indirect 34 
| github.com/VividCortex/ewma v1.2.0 // indirect 35 | github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect 36 | github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b // indirect 37 | github.com/campoy/embedmd v1.0.0 // indirect 38 | github.com/dsnet/compress v0.0.1 // indirect 39 | github.com/elliotwutingfeng/asciiset v0.0.0-20230602022725-51bbb787efab // indirect 40 | github.com/go-fonts/liberation v0.3.1 // indirect 41 | github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9 // indirect 42 | github.com/go-pdf/fpdf v0.8.0 // indirect 43 | github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect 44 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 45 | github.com/klauspost/compress v1.18.0 // indirect 46 | github.com/kr/text v0.2.0 // indirect 47 | github.com/mattn/go-isatty v0.0.16 // indirect 48 | github.com/mattn/go-runewidth v0.0.15 // indirect 49 | github.com/pmezard/go-difflib v1.0.0 // indirect 50 | github.com/rivo/uniseg v0.4.4 // indirect 51 | github.com/rogpeppe/go-internal v1.12.0 // indirect 52 | github.com/shenwei356/breader v0.3.2 // indirect 53 | github.com/shenwei356/natsort v0.0.0-20190418160752-600d539c017d // indirect 54 | github.com/spf13/pflag v1.0.5 // indirect 55 | github.com/ulikunitz/xz v0.5.12 // indirect 56 | golang.org/x/image v0.18.0 // indirect 57 | golang.org/x/sys v0.16.0 // indirect 58 | golang.org/x/text v0.16.0 // indirect 59 | ) 60 | -------------------------------------------------------------------------------- /lexicmap/.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # data 15 | t_* 16 | t.gz 17 | t.* 18 | *.fna.gz 19 | *.xz 20 | *.zst 21 | *.bz2 22 | 23 | # lexicmap index 24 
| *.bin 25 | *.idx 26 | info.toml 27 | *.png 28 | 29 | name.map 30 | taxid.map 31 | masks.txt 32 | 33 | ont-* 34 | hifi-* 35 | -------------------------------------------------------------------------------- /lexicmap/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | commit=$(git rev-parse --short HEAD) 4 | 5 | CGO_ENABLED=0 go build -trimpath -o=lexicmap -ldflags="-s -w -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$commit" -tags netgo 6 | 7 | ./lexicmap version 8 | -------------------------------------------------------------------------------- /lexicmap/cmd/autocomplete.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package cmd 22 | 23 | import ( 24 | "fmt" 25 | "os" 26 | "path/filepath" 27 | 28 | homedir "github.com/mitchellh/go-homedir" 29 | "github.com/shenwei356/util/pathutil" 30 | "github.com/spf13/cobra" 31 | ) 32 | 33 | // autocompletionCmd represents the fq2fa command 34 | var autocompletionCmd = &cobra.Command{ 35 | Use: "autocompletion", 36 | Short: "Generate shell autocompletion scripts", 37 | Long: `Generate shell autocompletion scripts 38 | 39 | Supported shell: bash|zsh|fish|powershell 40 | 41 | Bash: 42 | 43 | # generate completion shell 44 | lexicmap autocompletion --shell bash 45 | 46 | # configure if never did. 47 | # install bash-completion if the "complete" command is not found. 48 | echo "for bcfile in ~/.bash_completion.d/* ; do source \$bcfile; done" >> ~/.bash_completion 49 | echo "source ~/.bash_completion" >> ~/.bashrc 50 | 51 | Zsh: 52 | 53 | # generate completion shell 54 | lexicmap autocompletion --shell zsh --file ~/.zfunc/_lexicmap 55 | 56 | # configure if never did 57 | echo 'fpath=( ~/.zfunc "${fpath[@]}" )' >> ~/.zshrc 58 | echo "autoload -U compinit; compinit" >> ~/.zshrc 59 | 60 | fish: 61 | 62 | lexicmap autocompletion --shell fish --file ~/.config/fish/completions/lexicmap.fish 63 | 64 | `, 65 | Run: func(cmd *cobra.Command, args []string) { 66 | outfile := getFlagString(cmd, "file") 67 | shell := getFlagString(cmd, "shell") 68 | 69 | dir := filepath.Dir(outfile) 70 | ok, err := pathutil.DirExists(dir) 71 | checkError(err) 72 | if !ok { 73 | os.MkdirAll(dir, 0744) 74 | } 75 | 76 | switch shell { 77 | case "bash": 78 | checkError(cmd.Root().GenBashCompletionFile(outfile)) 79 | case "zsh": 80 | checkError(cmd.Root().GenZshCompletionFile(outfile)) 81 | case "fish": 82 | checkError(cmd.Root().GenFishCompletionFile(outfile, true)) 83 | case "powershell": 84 | checkError(cmd.Root().GenPowerShellCompletionFile(outfile)) 85 | default: 86 | checkError(fmt.Errorf("unsupported shell: %s", shell)) 87 | } 88 | 89 | log.Infof("%s completion 
file for lexicmap saved to %s", shell, outfile) 90 | }, 91 | } 92 | 93 | func init() { 94 | RootCmd.AddCommand(autocompletionCmd) 95 | defaultCompletionFile, err := homedir.Expand("~/.bash_completion.d/lexicmap.sh") 96 | checkError(err) 97 | autocompletionCmd.Flags().StringP("file", "", defaultCompletionFile, "autocompletion file") 98 | autocompletionCmd.Flags().StringP("shell", "", "bash", "autocompletion type (bash|zsh|fish|powershell)") 99 | } 100 | -------------------------------------------------------------------------------- /lexicmap/cmd/genome/genome_test.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package genome 22 | 23 | import ( 24 | "bytes" 25 | "fmt" 26 | "os" 27 | "testing" 28 | ) 29 | 30 | func TestGenomeWritingAndSeqExtraction(t *testing.T) { 31 | _seq := []byte("ACTAGACGACGTACGCGTACGTAGTACGATGCTCGA") 32 | var s, s2 []byte 33 | var b2 *[]byte 34 | var err error 35 | for n := 1; n < len(_seq); n++ { 36 | s = _seq[:n] 37 | b2 = Seq2TwoBit(s) 38 | s2, err = TwoBit2Seq(*b2, n) 39 | if err != nil { 40 | t.Error(err) 41 | return 42 | } 43 | if !bytes.Equal(s, s2) { 44 | t.Errorf("expected: %s, results: %s\n", s, s2) 45 | return 46 | } 47 | RecycleTwoBit(b2) 48 | } 49 | } 50 | 51 | func TestReadAndWrite(t *testing.T) { 52 | file := "t.2bit" 53 | 54 | // ----------------------- write -------------- 55 | 56 | w, err := NewWriter(file, 1) 57 | if err != nil { 58 | t.Error(err) 59 | return 60 | } 61 | 62 | _seqs := [][]byte{ 63 | []byte("A"), 64 | []byte("C"), 65 | []byte("CA"), 66 | []byte("CAT"), 67 | []byte("CATG"), 68 | []byte("CATGC"), 69 | []byte("CATGCC"), 70 | []byte("CATGCCA"), 71 | []byte("CATGCCAC"), 72 | []byte("CATGCCACG"), 73 | []byte("ACCCTCGAGCGACTAG"), 74 | []byte("ACTAGACGACGTACGCGTACGTAGTACGATGCTCGA"), 75 | []byte("ACGCAGTCGTCATCATGCGTGTCGCATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACATGCTGCATGCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGCTGTGATGCGTCTCAGTAGATGAT"), 76 | } 77 | 78 | for i, s := range _seqs { 79 | id := []byte(fmt.Sprintf("seq_%d", i+1)) 80 | 81 | g := PoolGenome.Get().(*Genome) 82 | g.Reset() 83 | g.ID = append(g.ID, id...) 84 | g.Seq = append(g.Seq, s...) 
85 | g.GenomeSize = len(s) 86 | g.Len = len(s) 87 | g.NumSeqs = 1 88 | g.SeqSizes = append(g.SeqSizes, len(s)) 89 | seqid := []byte("test") 90 | g.SeqIDs = append(g.SeqIDs, &seqid) 91 | 92 | err = w.Write(g) 93 | if err != nil { 94 | t.Error(err) 95 | return 96 | } 97 | 98 | RecycleGenome(g) 99 | } 100 | 101 | err = w.Close() 102 | if err != nil { 103 | t.Error(err) 104 | return 105 | } 106 | 107 | // ----------------------- read -------------- 108 | 109 | r, err := NewReader(file) 110 | if err != nil { 111 | t.Error(err) 112 | return 113 | } 114 | 115 | var start, end int 116 | var s1 []byte 117 | var s2 *Genome 118 | for i, s := range _seqs { 119 | // subseq 120 | for start = 0; start < len(s); start++ { 121 | for end = start; end < len(s); end++ { 122 | s2, err = r.SubSeq(i, start, end) 123 | if err != nil { 124 | t.Error(err) 125 | return 126 | } 127 | s1 = s[start : end+1] 128 | if !bytes.Equal(s1, s2.Seq) { 129 | t.Errorf("idx: %d:%d-%d, expected: %s, results: %s", 130 | i, start, end, s1, s2.Seq) 131 | return 132 | } 133 | RecycleGenome(s2) 134 | } 135 | } 136 | 137 | // whole seq 138 | s2, err = r.Seq(i) 139 | if err != nil { 140 | t.Error(err) 141 | return 142 | } 143 | if !bytes.Equal(s, s2.Seq) { 144 | t.Errorf("idx: %d not matched", i) 145 | } 146 | RecycleGenome(s2) 147 | } 148 | 149 | r.Close() 150 | 151 | // clean up 152 | 153 | err = os.RemoveAll(file) 154 | if err != nil { 155 | t.Error(err) 156 | return 157 | } 158 | 159 | err = os.RemoveAll(file + GenomeIndexFileExt) 160 | if err != nil { 161 | t.Error(err) 162 | return 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /lexicmap/cmd/genomes.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in 
the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package cmd 22 | 23 | import ( 24 | "bufio" 25 | "fmt" 26 | "io" 27 | "os" 28 | "path/filepath" 29 | "strings" 30 | 31 | "github.com/shenwei356/bio/seq" 32 | "github.com/spf13/cobra" 33 | ) 34 | 35 | var genomesCmd = &cobra.Command{ 36 | Use: "genomes", 37 | Short: "View genome IDs in the index", 38 | Long: `View genome IDs in the index 39 | 40 | `, 41 | Run: func(cmd *cobra.Command, args []string) { 42 | opt := getOptions(cmd) 43 | seq.ValidateSeq = false 44 | 45 | // ------------------------------ 46 | 47 | dbDir := getFlagString(cmd, "index") 48 | if dbDir == "" { 49 | checkError(fmt.Errorf("flag -d/--index needed")) 50 | } 51 | 52 | outFile := getFlagString(cmd, "out-file") 53 | 54 | // output file handler 55 | outfh, gw, w, err := outStream(outFile, strings.HasSuffix(outFile, ".gz"), opt.CompressionLevel) 56 | checkError(err) 57 | defer func() { 58 | outfh.Flush() 59 | if gw != nil { 60 | gw.Close() 61 | } 62 | w.Close() 63 | }() 64 | 65 | // ----------------------------------------------------- 66 
| // read genome chunks data if existed 67 | genomeChunks, err := readGenomeChunksMap(filepath.Join(dbDir, FileGenomeChunks)) 68 | if err != nil { 69 | checkError(fmt.Errorf("failed to read genome chunk file: %s", err)) 70 | } 71 | var hasGenomeChunks bool 72 | if len(genomeChunks) > 0 { 73 | hasGenomeChunks = true 74 | } 75 | 76 | // --------------------------------------------------------------- 77 | 78 | // genomes.map file for mapping index to genome id 79 | fh, err := os.Open(filepath.Join(dbDir, FileGenomeIndex)) 80 | if err != nil { 81 | checkError(fmt.Errorf("failed to read genome index mapping file: %s", err)) 82 | } 83 | defer fh.Close() 84 | 85 | r := bufio.NewReader(fh) 86 | 87 | buf := make([]byte, 8) 88 | var n, lenID int 89 | var batchIDAndRefID uint64 90 | var ok bool 91 | 92 | outfh.WriteString("ref\tchunked\n") 93 | for { 94 | n, err = io.ReadFull(r, buf[:2]) 95 | if err != nil { 96 | if err == io.EOF { 97 | break 98 | } 99 | checkError(fmt.Errorf("failed to read genome index mapping file: %s", err)) 100 | } 101 | if n < 2 { 102 | checkError(fmt.Errorf("broken genome map file")) 103 | } 104 | lenID = int(be.Uint16(buf[:2])) 105 | id := make([]byte, lenID) 106 | 107 | n, err = io.ReadFull(r, id) 108 | if err != nil { 109 | checkError(fmt.Errorf("broken genome map file")) 110 | } 111 | if n < lenID { 112 | checkError(fmt.Errorf("broken genome map file")) 113 | } 114 | 115 | n, err = io.ReadFull(r, buf) 116 | if err != nil { 117 | checkError(fmt.Errorf("broken genome map file")) 118 | } 119 | if n < 8 { 120 | checkError(fmt.Errorf("broken genome map file")) 121 | } 122 | 123 | batchIDAndRefID = be.Uint64(buf) 124 | 125 | if hasGenomeChunks { 126 | if _, ok = genomeChunks[batchIDAndRefID]; ok { 127 | fmt.Fprintf(outfh, "%s\t%s\n", id, "yes") 128 | } else { 129 | fmt.Fprintf(outfh, "%s\t\n", id) 130 | } 131 | } else { 132 | fmt.Fprintf(outfh, "%s\t\n", id) 133 | } 134 | 135 | } 136 | }, 137 | } 138 | 139 | func init() { 140 | 
utilsCmd.AddCommand(genomesCmd) 141 | 142 | genomesCmd.Flags().StringP("index", "d", "", 143 | formatFlagUsage(`Index directory created by "lexicmap index".`)) 144 | 145 | genomesCmd.Flags().StringP("out-file", "o", "-", 146 | formatFlagUsage(`Out file, supports the ".gz" suffix ("-" for stdout).`)) 147 | 148 | genomesCmd.SetUsageTemplate(usageTemplate("")) 149 | } 150 | -------------------------------------------------------------------------------- /lexicmap/cmd/kv/kv-encoding.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package kv 22 | 23 | import "encoding/binary" 24 | 25 | var be = binary.BigEndian 26 | 27 | // var PutUint64FourBytes func([]byte, uint64) = be.PutUint64 28 | 29 | // PutUint64ThreeBytes puts uint64 to 7 low bytes. 
30 | func PutUint64ThreeBytes(b []byte, v uint64) { 31 | _ = b[6] // early bounds check to guarantee safety of writes below 32 | b[0] = byte(v >> 48) 33 | b[1] = byte(v >> 40) 34 | b[2] = byte(v >> 32) 35 | b[3] = byte(v >> 24) 36 | b[4] = byte(v >> 16) 37 | b[5] = byte(v >> 8) 38 | b[6] = byte(v) 39 | } 40 | 41 | // Uint64ThreeBytes returns an uint64 from 7 bytes 42 | func Uint64ThreeBytes(b []byte) uint64 { 43 | _ = b[6] // bounds check hint to compiler 44 | return uint64(b[6]) | uint64(b[5])<<8 | uint64(b[4])<<16 | uint64(b[3])<<24 | 45 | uint64(b[2])<<32 | uint64(b[1])<<40 | uint64(b[0])<<48 46 | } 47 | -------------------------------------------------------------------------------- /lexicmap/cmd/lib-chaining_test.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package cmd 22 | 23 | import ( 24 | "testing" 25 | ) 26 | 27 | func TestChaining(t *testing.T) { 28 | /* command to prepare seeds from a certain query 29 | cat t.txt | csvtk grep -t -f target -p GCA_013693855.1 \ 30 | | csvtk cut -t -f qstart,tstart | sed 1d \ 31 | | awk '{print "{QBegin: "$1", TBegin: "$2", Len: 31},"}' 32 | */ 33 | subs := []*SubstrPair{ 34 | // two sequences on different strands 35 | // {QBegin: 18, TBegin: 3453, Len: 31}, 36 | // {QBegin: 18, TBegin: 3640464, Len: 31}, 37 | // {QBegin: 1924, TBegin: 1547, Len: 31}, 38 | // {QBegin: 1924, TBegin: 3638544, Len: 31}, 39 | 40 | // not perfect in this case, there are two chains: 0,1 and 2., while it should be one. 41 | {QBegin: 552, TBegin: 3798905, Len: 17}, 42 | {QBegin: 667, TBegin: 3799019, Len: 15}, 43 | {QBegin: 1332, TBegin: 3799686, Len: 31}, 44 | 45 | // a kmer has multiple matches 46 | {QBegin: 1384, TBegin: 628584, Len: 31}, 47 | {QBegin: 1490, TBegin: 628690, Len: 31}, 48 | {QBegin: 1879, TBegin: 900465, Len: 31}, 49 | {QBegin: 1879, TBegin: 629079, Len: 31}, 50 | {QBegin: 1879, TBegin: 627005, Len: 31}, 51 | {QBegin: 1910, TBegin: 6123921, Len: 23}, 52 | 53 | // same strands 54 | 55 | {QBegin: 182, TBegin: 1282695, Len: 26}, 56 | {QBegin: 182, TBegin: 1769573, Len: 26}, 57 | {QBegin: 315, TBegin: 1282830, Len: 15}, 58 | {QBegin: 315, TBegin: 1769708, Len: 15}, 59 | {QBegin: 343, TBegin: 1769724, Len: 27}, 60 | 61 | {QBegin: 10, TBegin: 314159, Len: 20}, 62 | 63 | // this case is kept in the chainning step, 64 | // because we can not simply limit 65 | // the minimum distance between two anchors. 
66 | {QBegin: 60, TBegin: 14234, Len: 15}, 67 | {QBegin: 61, TBegin: 14235, Len: 15}, 68 | 69 | {QBegin: 60, TBegin: 3395374, Len: 15}, 70 | {QBegin: 70, TBegin: 3395384, Len: 15}, 71 | 72 | {QBegin: 50, TBegin: 950, Len: 31}, 73 | {QBegin: 79, TBegin: 3637976, Len: 31}, 74 | {QBegin: 100, TBegin: 3637997, Len: 31}, 75 | {QBegin: 519, TBegin: 1419, Len: 31}, 76 | {QBegin: 550, TBegin: 3638447, Len: 31}, 77 | {QBegin: 647, TBegin: 3638544, Len: 31}, 78 | 79 | {QBegin: 111, TBegin: 1146311, Len: 31}, 80 | {QBegin: 136, TBegin: 1146336, Len: 31}, 81 | {QBegin: 138, TBegin: 1146338, Len: 31}, 82 | {QBegin: 139, TBegin: 1146339, Len: 31}, 83 | {QBegin: 264, TBegin: 1146464, Len: 31}, 84 | {QBegin: 1479, TBegin: 1147679, Len: 31}, 85 | {QBegin: 1484, TBegin: 1147684, Len: 31}, 86 | {QBegin: 1543, TBegin: 1147743, Len: 31}, 87 | {QBegin: 1566, TBegin: 1147766, Len: 31}, 88 | {QBegin: 1919, TBegin: 1148119, Len: 31}, 89 | } 90 | tmp := []*SearchResult{ 91 | { 92 | Subs: &subs, 93 | }, 94 | } 95 | rs := &tmp 96 | 97 | cf := &DefaultChainingOptions 98 | 99 | chainer := NewChainer(cf) 100 | for _, r := range *rs { 101 | paths, sumMaxScore := chainer.Chain(r.Subs) 102 | 103 | t.Logf("sum score: %f, paths:\n", sumMaxScore) 104 | for _, p := range *paths { 105 | t.Logf(" %d\n", *p) 106 | } 107 | 108 | RecycleChainingResult(paths) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /lexicmap/cmd/lib-index-search-util.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to 
whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package cmd 22 | 23 | import ( 24 | "math" 25 | "slices" 26 | "sync" 27 | 28 | "github.com/shenwei356/LexicMap/lexicmap/cmd/tree" 29 | "github.com/shenwei356/lexichash/iterator" 30 | "github.com/shenwei356/wfa" 31 | ) 32 | 33 | // extendMatch an alignment region using a chaining algorithm. 
34 | func extendMatch(seq1, seq2 []byte, start1, end1, start2, end2 int, extLen int, tBegin, maxExtLen int, rc bool) ([]byte, []byte, int, int, int, int, error) { 35 | var m uint8 = 2 36 | 37 | // fmt.Println("before:", start1, end1, start2, end2) 38 | 39 | var _s1, _e1, _s2, _e2 int // extend length 40 | var _extLen int 41 | 42 | // 3', right 43 | if end1+int(m) < len(seq1) && end2+int(m) < len(seq2) { 44 | if rc { 45 | _extLen = min(extLen, tBegin) 46 | } else { 47 | _extLen = min(extLen, maxExtLen) 48 | } 49 | 50 | if _extLen > 2 { 51 | e1, e2 := min(end1+_extLen, len(seq1)), min(end2+_extLen, len(seq2)) 52 | _seq1, _seq2 := seq1[end1:e1], seq2[end2:e2] 53 | // fmt.Printf("seq1: %s\nseq2: %s\n", _seq1, _seq2) 54 | 55 | _e1, _e2 = _extendRight(_seq1, _seq2) 56 | if _e1 > 0 || _e2 > 0 { 57 | end1 += _e1 58 | end2 += _e2 59 | } 60 | } 61 | } 62 | 63 | // 5', left 64 | if start1 > int(m) && start2 > int(m) { 65 | if rc { 66 | _extLen = min(extLen, maxExtLen) // tBegin is 0-based 67 | } else { 68 | _extLen = min(extLen, tBegin) // tBegin is 0-based 69 | } 70 | 71 | if _extLen > 2 { 72 | s1, s2 := max(start1-_extLen, 0), max(start2-_extLen, 0) 73 | _seq1, _seq2 := reverseBytes(seq1[s1:start1]), reverseBytes(seq2[s2:start2]) 74 | // fmt.Printf("seq1: %s\nseq2: %s\n", _seq1, _seq2) 75 | 76 | _s1, _s2 = _extendRight(*_seq1, *_seq2) 77 | if _s1 > 0 || _s2 > 0 { 78 | start1 -= _s1 79 | start2 -= _s2 80 | } 81 | poolRevBytes.Put(_seq1) 82 | poolRevBytes.Put(_seq2) 83 | } 84 | } 85 | 86 | // fmt.Println("after:", start1, end1, start2, end2) 87 | return seq1[start1:end1], seq2[start2:end2], _s1, _e1, _s2, _e2, nil 88 | } 89 | 90 | func _extendRight(s1, s2 []byte) (int, int) { 91 | _k := 2 92 | var m uint8 = 2 93 | 94 | // k-mer iterator 95 | iter, err := iterator.NewKmerIterator(s1, _k) 96 | if err != nil { 97 | return 0, 0 98 | } 99 | 100 | // index 101 | t := tree.NewTree(uint8(_k)) 102 | var kmer uint64 103 | var ok bool 104 | for { 105 | kmer, ok, _ = 
iter.NextPositiveKmer() 106 | if !ok { 107 | break 108 | } 109 | t.Insert(kmer, uint32(iter.Index())) 110 | } 111 | 112 | // match 113 | iter, err = iterator.NewKmerIterator(s2, _k) 114 | if err != nil { 115 | return 0, 0 116 | } 117 | 118 | subs := poolSubs.Get().(*[]*SubstrPair) 119 | *subs = (*subs)[:0] 120 | defer RecycleSubstrPairs(poolSub, poolSubs, subs) 121 | 122 | var v, p uint32 123 | var srs *[]*tree.SearchResult 124 | var sr *tree.SearchResult 125 | 126 | for { 127 | kmer, ok, _ = iter.NextPositiveKmer() 128 | if !ok { 129 | break 130 | } 131 | 132 | srs, ok = t.Search(kmer, m) 133 | if !ok { 134 | continue 135 | } 136 | 137 | for _, sr = range *srs { 138 | // fmt.Printf("%s vs %s, len:%d\n", kmers.MustDecode(kmer, _k), kmers.MustDecode(sr.Kmer, _k), sr.LenPrefix) 139 | for _, v = range sr.Values { 140 | p = v 141 | 142 | _sub := poolSub.Get().(*SubstrPair) 143 | _sub.QBegin = int32(p) 144 | _sub.TBegin = int32(iter.Index()) 145 | _sub.Len = uint8(sr.LenPrefix) 146 | _sub.QRC = false 147 | _sub.TRC = false 148 | 149 | *subs = append(*subs, _sub) 150 | } 151 | } 152 | t.RecycleSearchResult(srs) 153 | } 154 | tree.RecycleTree(t) 155 | 156 | if len(*subs) == 0 { 157 | return 0, 0 158 | } 159 | 160 | if len(*subs) > 1 { 161 | // no need to clean as k == min_len 162 | // ClearSubstrPairs(poolSub, subs, _k) 163 | 164 | slices.SortFunc(*subs, func(a, b *SubstrPair) int { 165 | if a.QBegin == b.QBegin { 166 | if a.QBegin+int32(a.Len) == b.QBegin+int32(b.Len) { 167 | return int(a.TBegin - b.TBegin) 168 | } 169 | return int(b.QBegin) + int(b.Len) - (int(a.QBegin) + int(a.Len)) 170 | } 171 | return int(a.QBegin - b.QBegin) 172 | }) 173 | } 174 | 175 | // for _, s := range *subs { 176 | // fmt.Println(s) 177 | // } 178 | 179 | // chaining 180 | chainer := poolChainers3.Get().(*Chainer3) 181 | chain := chainer.Chain(subs) 182 | 183 | poolChainers3.Put(chainer) 184 | 185 | if chain != nil { 186 | // fmt.Printf("q: %d-%d, t: %d-%d\n", chain.QBegin, chain.QEnd, 
chain.TBegin, chain.TEnd) 187 | poolChain3.Put(chain) 188 | return chain.QEnd + 1, chain.TEnd + 1 189 | } 190 | 191 | return 0, 0 192 | } 193 | 194 | // remember to recycle the result 195 | func reverseBytes(s []byte) *[]byte { 196 | t := poolRevBytes.Get().(*[]byte) 197 | if len(s) == len(*t) { 198 | 199 | } else if len(s) < len(*t) { 200 | *t = (*t)[:len(s)] 201 | } else { 202 | n := len(s) - len(*t) 203 | for i := 0; i < n; i++ { 204 | *t = append(*t, 0) 205 | } 206 | } 207 | copy(*t, s) 208 | 209 | for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 { 210 | (*t)[i], (*t)[j] = (*t)[j], (*t)[i] 211 | } 212 | 213 | return t 214 | } 215 | 216 | var poolRevBytes = &sync.Pool{New: func() interface{} { 217 | tmp := make([]byte, 128) 218 | return &tmp 219 | }} 220 | 221 | // ------------------------------------------------------------------------------------------ 222 | 223 | const OpM = uint64('M') 224 | const OpD = uint64('D') 225 | const OpI = uint64('I') 226 | const OpX = uint64('X') 227 | const OpH = uint64('H') 228 | 229 | // trimOps trim ops to keep only aligned region 230 | func trimOps(ops []uint64) []uint64 { 231 | var start, end int 232 | start, end = -1, -1 233 | for i, op := range ops { 234 | if op>>32 == OpM { 235 | start = i 236 | break 237 | } 238 | } 239 | for i := len(ops) - 1; i >= 0; i-- { 240 | if ops[i]>>32 == OpM { 241 | end = i 242 | break 243 | } 244 | } 245 | return ops[start : end+1] 246 | } 247 | 248 | func scoreAndEvalue(match, mismatch, gapOpen, gapExt int, totalBase int, lambda, k float64) func(qlen int, cigar *wfa.AlignmentResult) (int, int, float64) { 249 | // var Kn float64 = float64(k) * float64(totalBase) 250 | lnK := math.Log(k) 251 | ftotalBase := float64(totalBase) 252 | 253 | return func(qlen int, cigar *wfa.AlignmentResult) (int, int, float64) { 254 | ops := trimOps(cigar.Ops) 255 | var score, n int 256 | for _, op := range ops { 257 | n = int(op & 4294967295) 258 | 259 | // switch op.Op { 260 | switch op >> 32 { 261 | // match: 262 
| case OpM: 263 | score += n * match 264 | // mismatch 265 | case OpX: 266 | score += n * mismatch 267 | // gap 268 | case OpI: 269 | score += gapOpen + n*gapExt 270 | // case 'D', 'H': 271 | case OpD, OpH: 272 | score += gapOpen + n*gapExt 273 | } 274 | } 275 | 276 | _score := score 277 | 278 | // from blastn_values_2_3 in ncbi-blast-2.15.0+-src/c++/src/algo/blast/core/blast_stat.c 279 | // Any odd score must be rounded down to the nearest even number before calculating the e-value 280 | if _score&1 == 1 { 281 | _score-- 282 | } 283 | 284 | bitScore := (lambda*float64(_score) - lnK) / math.Ln2 285 | 286 | // evalue := Kn * float64(qlen) * math.Pow(math.E, -lambda*float64(_score)) 287 | 288 | evalue := ftotalBase * math.Pow(2, -bitScore) * float64(qlen) 289 | 290 | return score, int(bitScore), evalue 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /lexicmap/cmd/lib-seq_compare_test.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package cmd

import (
	"sync"
	"testing"
)

// TestSeqCompare indexes one sequence (s1) with a SeqComparator and compares
// a related sequence (s2) against it, logging the number of aligned bases.
// The two sequences differ by scattered substitutions and small gaps
// (see the BLAST-style alignment below); expected identities: 271/288 (94%).
func TestSeqCompare(t *testing.T) {
	// Identities: 271/288(94%)

	// Query  8    AGGTCCTGCCCCGCGACCTGCACGCCGAATACGTAGCGGCGATCGCCTTAGTCGGTACAG  67
	//             ||||||||||||||||||||||||||  |||||||| ||||||||||||||||  |||||
	// Sbjct  15   AGGTCCTGCCCCGCGACCTGCACGCC-AATACGTA-TAGCGATCGCCTTAGTC--TACAG  70

	// Query  68   CCCTGGAAAACATGGCCACCGAAGTTCGTTCCCTGCAACGGACCGAAATCCACGAAGTCG  127
	//             |||||||||||||||||||||||||||||| |||||||||||||||||||||| |||||
	// Sbjct  71   CCCTGGAAAACATGGCCACCGAAGTTCGTT-CCTGCAACGGACCGAAATCCACTGAGTCG  129

	// Query  128  AAGAACACTTTGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAA  187
	//             || |||| |||||||||||||||||||||||||||||||||||||||||||||||||
	// Sbjct  130  --CAATACTTCGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAA  187

	// Query  188  TTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGCAACGTGGTGACCGCCT  247
	//             ||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||
	// Sbjct  188  TTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGG-AACGTGGTGACCGCCT  246

	// Query  248  ACGAAGACGTGACCCTCTGGCACGAACGCGACATCTCCCACTCCAGTG  295
	//             |||||||||||||||| |||||||||||||||||||||||||||||||
	// Sbjct  247  ACGAAGACGTGACCCTTCGGCACGAACGCGACATCTCCCACTCCAGTG  294

	// s1 is indexed (subject), s2 is the query.
	s1 := []byte("GGTTACGTATTGCTAGGTCCTGCCCCGCGACCTGCACGCCAATACGTATAGCGATCGCCTTAGTCTACAGCCCTGGAAAACATGGCCACCGAAGTTCGTTCCTGCAACGGACCGAAATCCACTGAGTCGCAATACTTCGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGAACGTGGTGACCGCCTACGAAGACGTGACCCTTCGGCACGAACGCGACATCTCCCACTCCAGTGAGCAATACGTAACTGAACGAAGAACATCCGCAAAAAAAA")
	s2 := []byte("TCCACCCAGGTCCTGCCCCGCGACCTGCACGCCGAATACGTAGCGGCGATCGCCTTAGTCGGTACAGCCCTGGAAAACATGGCCACCGAAGTTCGTTCCCTGCAACGGACCGAAATCCACGAAGTCGAAGAACACTTTGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGCAACGTGGTGACCGCCTACGAAGACGTGACCCTCTGGCACGAACGCGACATCTCCCACTCCAGTGCCGAACGGATGATTCTGCCGGACTCCACGGCGCTGTTG")

	// alternative test pair with long homopolymer gaps, kept for manual debugging:
	// s1 := []byte("GGTTACGTATTGCTAGGTCCTGCCCCGCGACCTGCACGCCAATACGTATAGCGATCGCCTTAGTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTGCAACGGACCGAAATCCACTGAGTCGCAATACTTCGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGcccccccccccccccccccccccccccccccTCGGCACGAACGCGACATCTCCCACTCCAGTGAGCAATACGTAACTGAACGAAGAACATCCGCAAAAAAAA")
	// s2 := []byte("TCCACCCAGGTCCTGCCCCGCGACCTGCACGCCGAATACGTAGCGGCGATCGCCTTAGTCGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCTGCAACGGACCGAAATCCACGAAGTCGAAGAACACTTTGCTAAGGGCCAAAAGGGCTCGTCAGCCATGCCGCACAAGCGGAACCCAATTGGCTCCGAAAACATCTGCGGCTGTGCCCGGGTCCTGCGGGGCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaTCTGGCACGAACGCGACATCTCCCACTCCAGTGCCGAACGGATGATTCTGCCGGACTCCACGGCGCTGTTG")

	// alignment via the generic aligner, kept for reference/debugging:
	// alg := align.NewAligner(&align.AlignOptions{
	// 	MatchScore:     1,
	// 	MisMatchScore:  -1,
	// 	GapScore:       -1,
	// 	SaveAlignments: true,
	// 	SaveMatrix:     false,
	// })
	// r := alg.Global(s2, s1)

	// t.Logf("matches: %d, gaps: %d, len: %d, identity: %.2f%%\n",
	// 	r.Matches, r.Gaps, r.Len, float64(r.Matches)/float64(r.Len)*100)

	// compare

	// the comparator borrows Chainer2 objects from a pool
	cpr := NewSeqComparator(&DefaultSeqComparatorOptions, &sync.Pool{New: func() interface{} {
		return NewChainer2(&DefaultChaining2Options)
	}})

	err := cpr.Index(s1)
	if err != nil {
		t.Logf("%s", err)
		return
	}

	cr, err := cpr.Compare(0, uint32(len(s2)-1), s2, len(s2))
	if err != nil {
		t.Logf("%s", err)
		return
	}
	if cr != nil {
		t.Logf("aligned bases: %d\n", cr.AlignedBases)

		RecycleSeqComparatorResult(cr)
	}
}

//

--------------------------------------------------------------------------------
/lexicmap/cmd/masks.go:
--------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20 | 21 | package cmd 22 | 23 | import ( 24 | "fmt" 25 | "os" 26 | "path/filepath" 27 | "strings" 28 | "time" 29 | 30 | "github.com/shenwei356/bio/seq" 31 | "github.com/shenwei356/lexichash" 32 | "github.com/shenwei356/util/pathutil" 33 | "github.com/spf13/cobra" 34 | ) 35 | 36 | var masksCmd = &cobra.Command{ 37 | Use: "masks", 38 | Short: "View masks of the index or generate new masks randomly", 39 | Long: `View masks of the index or generate new masks randomly 40 | 41 | `, 42 | Run: func(cmd *cobra.Command, args []string) { 43 | opt := getOptions(cmd) 44 | seq.ValidateSeq = false 45 | 46 | var fhLog *os.File 47 | if opt.Log2File { 48 | fhLog = addLog(opt.LogFile, opt.Verbose) 49 | } 50 | 51 | outputLog := opt.Verbose || opt.Log2File 52 | 53 | timeStart := time.Now() 54 | defer func() { 55 | if outputLog { 56 | log.Info() 57 | log.Infof("elapsed time: %s", time.Since(timeStart)) 58 | log.Info() 59 | } 60 | if opt.Log2File { 61 | fhLog.Close() 62 | } 63 | }() 64 | 65 | var err error 66 | 67 | // --------------------------------------------------------------- 68 | dbDir := getFlagString(cmd, "index") 69 | 70 | outFile := getFlagString(cmd, "out-file") 71 | 72 | k := getFlagPositiveInt(cmd, "kmer") 73 | if k < minK || k > 32 { 74 | checkError(fmt.Errorf("the value of flag -k/--kmer should be in range of [%d, 32]", minK)) 75 | } 76 | 77 | nMasks := getFlagPositiveInt(cmd, "masks") 78 | lcPrefix := getFlagNonNegativeInt(cmd, "prefix") 79 | seed := getFlagPositiveInt(cmd, "seed") 80 | 81 | // --------------------------------------------------------------- 82 | // output file handler 83 | outfh, gw, w, err := outStream(outFile, strings.HasSuffix(outFile, ".gz"), opt.CompressionLevel) 84 | checkError(err) 85 | defer func() { 86 | outfh.Flush() 87 | if gw != nil { 88 | gw.Close() 89 | } 90 | w.Close() 91 | }() 92 | 93 | // --------------------------------------------------------------- 94 | 95 | var lh *lexichash.LexicHash 96 | 97 | decoder := lexichash.MustDecoder() 98 | 
99 | if dbDir != "" { // from the index 100 | if outputLog { 101 | log.Info() 102 | log.Infof("checking index: %s", dbDir) 103 | } 104 | 105 | // Mask file 106 | fileMask := filepath.Join(dbDir, FileMasks) 107 | ok, err := pathutil.Exists(fileMask) 108 | if err != nil || !ok { 109 | checkError(fmt.Errorf("mask file not found: %s", fileMask)) 110 | } 111 | 112 | lh, err = lexichash.NewFromFile(fileMask) 113 | if err != nil { 114 | checkError(fmt.Errorf("%s", err)) 115 | } 116 | 117 | if outputLog { 118 | log.Infof(" checking passed") 119 | log.Infof("reading masks...") 120 | } 121 | 122 | _k := uint8(lh.K) 123 | 124 | maskChanged := cmd.Flags().Lookup("masks").Changed 125 | if maskChanged { 126 | fmt.Fprintf(outfh, "%d\t%s\n", nMasks, decoder(lh.Masks[nMasks-1], _k)) 127 | } else { 128 | for i, code := range lh.Masks { 129 | fmt.Fprintf(outfh, "%d\t%s\n", i+1, decoder(code, _k)) 130 | } 131 | } 132 | } else { // re generate 133 | if outputLog { 134 | log.Infof("generating new mask...") 135 | } 136 | lh, err = lexichash.NewWithSeed(k, nMasks, int64(seed), lcPrefix) 137 | checkError(err) 138 | 139 | _k := uint8(lh.K) 140 | 141 | for i, code := range lh.Masks { 142 | fmt.Fprintf(outfh, "%d\t%s\n", i+1, decoder(code, _k)) 143 | } 144 | } 145 | }, 146 | } 147 | 148 | func init() { 149 | utilsCmd.AddCommand(masksCmd) 150 | 151 | masksCmd.Flags().StringP("index", "d", "", 152 | formatFlagUsage(`Index directory created by "lexicmap index".`)) 153 | 154 | masksCmd.Flags().StringP("out-file", "o", "-", 155 | formatFlagUsage(`Out file, supports and recommends a ".gz" suffix ("-" for stdout).`)) 156 | 157 | masksCmd.Flags().IntP("kmer", "k", 31, 158 | formatFlagUsage(`Maximum k-mer size. 
K needs to be <= 32.`)) 159 | 160 | masksCmd.Flags().IntP("masks", "m", 40000, 161 | formatFlagUsage(`Number of masks.`)) 162 | 163 | masksCmd.Flags().IntP("seed", "s", 1, 164 | formatFlagUsage(`The seed for generating random masks.`)) 165 | 166 | masksCmd.Flags().IntP("prefix", "p", 15, 167 | formatFlagUsage(`Length of mask k-mer prefix for checking low-complexity (0 for no checking).`)) 168 | 169 | masksCmd.SetUsageTemplate(usageTemplate("{ -d | [-k ] [-n ] [-s ] } [-o out.tsv.gz]")) 170 | } 171 | -------------------------------------------------------------------------------- /lexicmap/cmd/re-merge.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package cmd 22 | 23 | import ( 24 | "fmt" 25 | "os" 26 | "path/filepath" 27 | "regexp" 28 | "sort" 29 | "time" 30 | 31 | "github.com/pkg/errors" 32 | "github.com/shenwei356/LexicMap/lexicmap/cmd/kv" 33 | "github.com/shenwei356/bio/seq" 34 | "github.com/shenwei356/lexichash" 35 | "github.com/shenwei356/util/pathutil" 36 | "github.com/spf13/cobra" 37 | ) 38 | 39 | var remergeCmd = &cobra.Command{ 40 | Use: "remerge", 41 | Short: "Rerun the merging step for an unfinished index", 42 | Long: `Rerun the merging step for an unfinished index 43 | 44 | When to use this command? 45 | 46 | - Only one thread is used for merging indexes, which happens when there are 47 | a lot (>200 batches) of batches ($inpu_files / --batch-size) and the value 48 | of --max-open-files is not big enough. E.g., 49 | 50 | 22:54:24.420 [INFO] merging 297 indexes... 51 | 22:54:24.455 [INFO] [round 1] 52 | 22:54:24.455 [INFO] batch 1/1, merging 297 indexes to xxx.lmi.tmp/r1_b1 with 1 threads... 53 | 54 | ► Then you can run this command with a bigger --max-open-files (e.g., 4096) and 55 | -J/--seed-data-threads (e.g., 12. 12 needs be <= 4096/(297+2)=13.7). 56 | And you need to set a bigger 'ulimit -n' if the value of --max-open-files is bigger than 1024. 57 | 58 | - The Slurm/PBS job time limit is almost reached and the merging step won't be finished before that. 59 | 60 | - Disk quota is reached in the merging step. 
61 | 62 | `, 63 | Run: func(cmd *cobra.Command, args []string) { 64 | opt := getOptions(cmd) 65 | seq.ValidateSeq = false 66 | 67 | var fhLog *os.File 68 | if opt.Log2File { 69 | fhLog = addLog(opt.LogFile, opt.Verbose) 70 | } 71 | 72 | outputLog := opt.Verbose || opt.Log2File 73 | 74 | timeStart := time.Now() 75 | defer func() { 76 | if outputLog { 77 | log.Info() 78 | log.Infof("elapsed time: %s", time.Since(timeStart)) 79 | log.Info() 80 | } 81 | if opt.Log2File { 82 | fhLog.Close() 83 | } 84 | }() 85 | 86 | // --------------------------------------------------------------- 87 | 88 | dbDir := getFlagString(cmd, "index") 89 | if dbDir == "" { 90 | checkError(fmt.Errorf("index directory is need")) 91 | } 92 | 93 | tmpDir := filepath.Clean(dbDir) + ExtTmpDir 94 | ok, err := pathutil.DirExists((tmpDir)) 95 | if err != nil { 96 | checkError(fmt.Errorf("index directory is need")) 97 | } 98 | if !ok { 99 | checkError(fmt.Errorf("tmp directory is not found: %s", tmpDir)) 100 | } 101 | 102 | mergeThreads := getFlagPositiveInt(cmd, "seed-data-threads") 103 | 104 | maxOpenFiles := getFlagPositiveInt(cmd, "max-open-files") 105 | 106 | // --------------------------------------------------------------- 107 | // check indexes of all batches 108 | 109 | if opt.Verbose || opt.Log2File { 110 | log.Infof("checking indexes ...") 111 | } 112 | 113 | // batch dirs 114 | batchDirs := make([]string, 0, 512) 115 | pattern := regexp.MustCompile(`^batch_\d+$`) 116 | files, err := os.ReadDir(tmpDir) 117 | if err != nil { 118 | checkError(errors.Errorf("failed to read dir: %s", err)) 119 | } 120 | for _, file := range files { 121 | if file.Name() == "." || file.Name() == ".." 
{ 122 | continue 123 | } 124 | if file.IsDir() && pattern.MatchString(file.Name()) { 125 | batchDirs = append(batchDirs, filepath.Join(tmpDir, file.Name())) 126 | } 127 | } 128 | 129 | if len(batchDirs) == 0 { 130 | checkError(fmt.Errorf("no indexes found in %s", tmpDir)) 131 | } else if opt.Verbose || opt.Log2File { 132 | log.Infof(" %d index directries found in %s", len(batchDirs), tmpDir) 133 | } 134 | 135 | // --------------------------------------------------------------- 136 | // prepare arguments for mergeIndexes 137 | 138 | sort.Strings(batchDirs) 139 | OneIndex := batchDirs[0] 140 | 141 | // lh *lexichash.LexicHash, read from one batch 142 | fileMask := filepath.Join(OneIndex, FileMasks) 143 | ok, err = pathutil.Exists(fileMask) 144 | if err != nil || !ok { 145 | checkError(fmt.Errorf("mask file not found: %s. Was the index merged?", fileMask)) 146 | } 147 | var lh *lexichash.LexicHash 148 | lh, err = lexichash.NewFromFile(fileMask) 149 | if err != nil { 150 | checkError(fmt.Errorf("checking mask file: %s", err)) 151 | } 152 | // fmt.Println(len(lh.Masks)) 153 | 154 | // maskPrefix uint8, anchorPrefix uint8, read from one batch with ReadKVIndexInfo 155 | var maskPrefix, anchorPrefix uint8 156 | fileSeedChunk := filepath.Join(OneIndex, DirSeeds, chunkFile(0)) 157 | _, _, _, maskPrefix, anchorPrefix, err = kv.ReadKVIndexInfo(filepath.Clean(fileSeedChunk) + kv.KVIndexFileExt) 158 | if err != nil { 159 | checkError(fmt.Errorf("checking seed information: %s", err)) 160 | } 161 | // fmt.Println(maskPrefix, anchorPrefix) 162 | 163 | // kvChunks int, read from one batch, info file 164 | var info *IndexInfo 165 | info, err = readIndexInfo(filepath.Join(OneIndex, FileInfo)) 166 | if err != nil { 167 | checkError(fmt.Errorf("failed to open info file: %s", err)) 168 | } 169 | kvChunks := info.Chunks 170 | if mergeThreads > kvChunks { 171 | mergeThreads = kvChunks 172 | } 173 | 174 | // opt *IndexBuildingOptions, create one, used: opt.Verbose, opt.Log2File, 
opt.MaxOpenFiles, opt.MergeThreads 175 | bopt := &IndexBuildingOptions{ 176 | // general 177 | NumCPUs: opt.NumCPUs, 178 | Verbose: opt.Verbose, 179 | Log2File: opt.Log2File, 180 | MaxOpenFiles: maxOpenFiles, 181 | MergeThreads: mergeThreads, 182 | } 183 | 184 | // outdir string, dbDir 185 | // paths []string, batchDirs 186 | // tmpDir string, tmpDir 187 | // round int 1 188 | 189 | err = mergeIndexes(lh, maskPrefix, anchorPrefix, bopt, kvChunks, dbDir, batchDirs, tmpDir, 1) 190 | if err != nil { 191 | checkError(fmt.Errorf("failed to merge indexes: %s", err)) 192 | } 193 | 194 | // clean tmp dir 195 | err = os.RemoveAll(tmpDir) 196 | if err != nil { 197 | checkError(fmt.Errorf("failed to remove tmp directory: %s", err)) 198 | } 199 | }, 200 | } 201 | 202 | func init() { 203 | utilsCmd.AddCommand(remergeCmd) 204 | 205 | remergeCmd.Flags().StringP("index", "d", "", 206 | formatFlagUsage(`Index directory created by "lexicmap index".`)) 207 | 208 | remergeCmd.Flags().IntP("seed-data-threads", "J", 8, 209 | formatFlagUsage(`Number of threads for writing seed data and merging seed chunks from all batches, the value should be in range of [1, -c/--chunks]. If there are >100 batches, please also increase the value of --max-open-files and set a bigger "ulimit -n" in shell.`)) 210 | 211 | remergeCmd.Flags().IntP("max-open-files", "", 1024, 212 | formatFlagUsage(`Maximum opened files, used in merging indexes. 
If there are >100 batches, please increase this value and set a bigger "ulimit -n" in shell.`)) 213 | 214 | remergeCmd.SetUsageTemplate(usageTemplate("[flags] -d ")) 215 | } 216 | -------------------------------------------------------------------------------- /lexicmap/cmd/recount-bases.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package cmd 22 | 23 | import ( 24 | "fmt" 25 | "path/filepath" 26 | "sync" 27 | "time" 28 | 29 | "github.com/dustin/go-humanize" 30 | "github.com/shenwei356/LexicMap/lexicmap/cmd/genome" 31 | "github.com/shenwei356/bio/seq" 32 | "github.com/spf13/cobra" 33 | ) 34 | 35 | var countbasesCmd = &cobra.Command{ 36 | Use: "recount-bases", 37 | Short: "Recount bases for index version <=3.2", 38 | Long: `Recount bases for index version <=3.2 39 | 40 | This command is only needed for indexes created by LexicMap v0.6.0 (3c257ca) or before versions. 41 | 42 | `, 43 | Run: func(cmd *cobra.Command, args []string) { 44 | opt := getOptions(cmd) 45 | seq.ValidateSeq = false 46 | 47 | // ------------------------------ 48 | 49 | dbDir := getFlagString(cmd, "index") 50 | if dbDir == "" { 51 | checkError(fmt.Errorf("flag -d/--index needed")) 52 | } 53 | 54 | // info file 55 | fileInfo := filepath.Join(dbDir, FileInfo) 56 | info, err := readIndexInfo(fileInfo) 57 | if err != nil { 58 | checkError(fmt.Errorf("failed to read info file: %s", err)) 59 | } 60 | if info.MainVersion != MainVersion { 61 | checkError(fmt.Errorf("index main versions do not match: %d (index) != %d (tool). 
please re-create the index", info.MainVersion, MainVersion)) 62 | } 63 | 64 | var startTime time.Time 65 | 66 | old := info.InputBases 67 | totalBases, err := updateInputBases(info, dbDir, opt.NumCPUs) 68 | checkError(err) 69 | 70 | if opt.Verbose { 71 | fmt.Printf("update input bases from %d to %s in %s\n", old, humanize.Comma(totalBases), startTime) 72 | } 73 | }, 74 | } 75 | 76 | func init() { 77 | // utilsCmd.AddCommand(countbasesCmd) 78 | 79 | countbasesCmd.Flags().StringP("index", "d", "", 80 | formatFlagUsage(`Index directory created by "lexicmap index".`)) 81 | 82 | countbasesCmd.SetUsageTemplate(usageTemplate("")) 83 | } 84 | 85 | func updateInputBases(info *IndexInfo, dbDir string, threads int) (int64, error) { 86 | // sum bases 87 | var totalBases int64 88 | ch := make(chan int64, threads) 89 | done := make(chan int) 90 | go func() { 91 | for b := range ch { 92 | totalBases += b 93 | } 94 | done <- 1 95 | }() 96 | 97 | // extract genome sizes 98 | var wg sync.WaitGroup 99 | tokens := make(chan int, threads) 100 | for i := 0; i < info.GenomeBatches; i++ { 101 | wg.Add(1) 102 | tokens <- 1 103 | go func(i int) { 104 | fileGenomes := filepath.Join(dbDir, DirGenomes, batchDir(i), FileGenomes) 105 | rdr, err := genome.NewReader(fileGenomes) 106 | if err != nil { 107 | checkError(fmt.Errorf("failed to create genome reader: %s", err)) 108 | } 109 | 110 | _totalBases, err := rdr.TotalBases() 111 | if err != nil { 112 | checkError(fmt.Errorf("failed to check total bases for %s: %s", fileGenomes, err)) 113 | } 114 | 115 | ch <- _totalBases 116 | 117 | wg.Done() 118 | <-tokens 119 | }(i) 120 | } 121 | wg.Wait() 122 | close(ch) 123 | <-done 124 | 125 | // update info file 126 | info.InputBases = totalBases 127 | 128 | err := writeIndexInfo(filepath.Join(dbDir, FileInfo), info) 129 | if err != nil { 130 | return 0, (fmt.Errorf("failed to write info file: %s", err)) 131 | } 132 | 133 | return totalBases, err 134 | } 135 | 
--------------------------------------------------------------------------------
/lexicmap/cmd/reindex-seeds.go:
--------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package cmd

import (
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/shenwei356/LexicMap/lexicmap/cmd/kv"
	"github.com/shenwei356/bio/seq"
	"github.com/spf13/cobra"
	"github.com/vbauerster/mpb/v8"
	"github.com/vbauerster/mpb/v8/decor"
)

// reindexSeedsCmd implements "lexicmap utils reindex-seeds":
// it recreates the index of every seed (k-mer-value) data chunk with a new
// number of partitions, then updates the index info file.
var reindexSeedsCmd = &cobra.Command{
	Use:   "reindex-seeds",
	Short: "Recreate indexes of k-mer-value (seeds) data",
	Long: `Recreate indexes of k-mer-value (seeds) data

`,
	Run: func(cmd *cobra.Command, args []string) {
		opt := getOptions(cmd)
		seq.ValidateSeq = false

		// ------------------------------

		dbDir := getFlagString(cmd, "index")
		if dbDir == "" {
			checkError(fmt.Errorf("flag -d/--index needed"))
		}

		partitions := getFlagPositiveInt(cmd, "partitions")

		// ---------------------------------------------------------------

		if opt.Verbose {
			log.Infof("recreating seed indexes with %d partitions for: %s", partitions, dbDir)
		}

		// info file for the number of genome batches
		fileInfo := filepath.Join(dbDir, FileInfo)
		info, err := readIndexInfo(fileInfo)
		if err != nil {
			checkError(fmt.Errorf("failed to read info file: %s", err))
		}

		// ---------------------------------------------------------------

		timeStart := time.Now()
		defer func() {
			if opt.Verbose {
				log.Info()
				log.Infof("elapsed time: %s", time.Since(timeStart))
				log.Info()
			}
		}()

		showProgressBar := opt.Verbose

		// process bar: one tick per re-indexed seed chunk, with an EWMA ETA
		var pbs *mpb.Progress
		var bar *mpb.Bar
		var chDuration chan time.Duration
		var doneDuration chan int
		if showProgressBar {
			pbs = mpb.New(mpb.WithWidth(40), mpb.WithOutput(os.Stderr))
			bar = pbs.AddBar(int64(info.Chunks),
				mpb.PrependDecorators(
					decor.Name("processed files: ", decor.WC{W: len("processed files: "), C: decor.DindentRight}),
					decor.Name("", decor.WCSyncSpaceR),
					decor.CountersNoUnit("%d / %d", decor.WCSyncWidth),
				),
				mpb.AppendDecorators(
					decor.Name("ETA: ", decor.WC{W: len("ETA: ")}),
					decor.EwmaETA(decor.ET_STYLE_GO, 3),
					decor.OnComplete(decor.Name(""), ". done"),
				),
			)

			// workers report per-chunk durations here; a dedicated goroutine
			// feeds them into the bar's EWMA estimator
			chDuration = make(chan time.Duration, opt.NumCPUs)
			doneDuration = make(chan int)
			go func() {
				for t := range chDuration {
					bar.EwmaIncrBy(1, t)
				}
				doneDuration <- 1
			}()
		}

		// re-index every seed chunk in parallel,
		// limited to opt.NumCPUs concurrent workers by the tokens channel
		var wg sync.WaitGroup
		tokens := make(chan int, opt.NumCPUs)
		threadsFloat := float64(opt.NumCPUs)
		for chunk := 0; chunk < info.Chunks; chunk++ {
			file := filepath.Join(dbDir, DirSeeds, chunkFile(chunk))
			wg.Add(1)
			tokens <- 1

			go func(file string) {
				timeStart := time.Now()
				err := kv.CreateKVIndex(file, partitions)
				checkError(err)
				if showProgressBar {
					// scale the measured duration by the worker count so the
					// ETA reflects wall-clock time, not summed CPU time
					chDuration <- time.Duration(float64(time.Since(timeStart)) / threadsFloat)
				}
				<-tokens
				wg.Done()
			}(file)
		}
		wg.Wait()

		if showProgressBar {
			close(chDuration)
			<-doneDuration
			pbs.Wait()
		}

		// persist the new partition count in the index info file
		if opt.Verbose {
			log.Infof("update index information file: %s", fileInfo)
		}
		info.Partitions = partitions
		err = writeIndexInfo(fileInfo, info)
		if err != nil {
			checkError(fmt.Errorf("failed to write info file: %s", err))
		}
		if opt.Verbose {
			log.Infof("  finished updating the index information file: %s", fileInfo)
		}
	},
}

func init() {
	utilsCmd.AddCommand(reindexSeedsCmd)

	reindexSeedsCmd.Flags().StringP("index", "d", "",
		formatFlagUsage(`Index directory created by "lexicmap index".`))
	reindexSeedsCmd.Flags().IntP("partitions", "", 4096,
		formatFlagUsage(`Number of partitions for re-indexing seeds (k-mer-value data) files. The value needs to be the power of 4.`))

	reindexSeedsCmd.SetUsageTemplate(usageTemplate(""))
}

--------------------------------------------------------------------------------
/lexicmap/cmd/root.go:
--------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
package cmd

import (
	"fmt"
	"os"
	"runtime"

	"github.com/spf13/cobra"
)

// RootCmd represents the base command when called without any subcommands
var RootCmd = &cobra.Command{
	Use:   "lexicmap",
	Short: "efficient sequence alignment against millions of prokaryotic genomes",
	Long: fmt.Sprintf(`
LexicMap: efficient sequence alignment against millions of prokaryotic genomes

Version: v%s
Documents: https://bioinf.shenwei.me/LexicMap
Source code: https://github.com/shenwei356/LexicMap

`, VERSION),
}

// Execute adds all child commands to the root command sets flags appropriately.
// This is called by main.main(). It only needs to happen once to the rootCmd.
func Execute() {
	if err := RootCmd.Execute(); err != nil {
		fmt.Println(err)
		os.Exit(-1)
	}
}

// init registers the global (persistent) flags shared by all subcommands
// and installs the custom usage template.
func init() {

	// default to all available CPU cores; individual commands may override.
	defaultThreads := runtime.NumCPU()

	RootCmd.PersistentFlags().IntP("threads", "j", defaultThreads,
		formatFlagUsage("Number of CPU cores to use. By default, it uses all available cores."))

	// RootCmd.PersistentFlags().BoolP("verbose", "", false, "print verbose information (recommended)")

	RootCmd.PersistentFlags().BoolP("quiet", "", false,
		formatFlagUsage("Do not print any verbose information. But you can write them to a file with --log."))

	RootCmd.PersistentFlags().StringP("infile-list", "X", "",
		formatFlagUsage("File of input file list (one file per line). If given, they are appended to files from CLI arguments."))

	RootCmd.PersistentFlags().StringP("log", "", "", formatFlagUsage("Log file."))

	// hide cobra's auto-generated "completion" and "help" commands.
	RootCmd.CompletionOptions.DisableDefaultCmd = true

	RootCmd.SetHelpCommand(&cobra.Command{Hidden: true})

	RootCmd.SetUsageTemplate(usageTemplate(""))
}

// formatFlagUsage prefixes a flag description with a "►" marker for nicer
// help output.
func formatFlagUsage(s string) string {
	return "► " + s
}

// usageTemplate returns the cobra usage template, with s inserted after the
// "[command]" placeholder line. Flag usage texts are wrapped at 110 columns.
func usageTemplate(s string) string {
	return fmt.Sprintf(`Usage:{{if .Runnable}}
  {{.UseLine}}{{end}}{{if .HasAvailableSubCommands}}
  {{.CommandPath}} [command]{{end}} %s{{if gt (len .Aliases) 0}}

Aliases:
  {{.NameAndAliases}}{{end}}{{if .HasExample}}

Examples:
{{.Example}}{{end}}{{if .HasAvailableSubCommands}}{{$cmds := .Commands}}{{if eq (len .Groups) 0}}

Available Commands:{{range $cmds}}{{if (or .IsAvailableCommand (eq .Name "help"))}}
  {{rpad .Name .NamePadding }} {{.Short}}{{end}}{{end}}{{else}}{{range $group := .Groups}}

{{.Title}}{{range $cmds}}{{if (and (eq .GroupID $group.ID) (or .IsAvailableCommand (eq .Name "help")))}}
  {{rpad .Name .NamePadding }} {{.Short}}{{end}}{{end}}{{end}}{{if not .AllChildCommandsHaveGroup}}

Additional Commands:{{range $cmds}}{{if (and (eq .GroupID "") (or .IsAvailableCommand (eq .Name "help")))}}
  {{rpad .Name .NamePadding }} {{.Short}}{{end}}{{end}}{{end}}{{end}}{{end}}{{if .HasAvailableLocalFlags}}

Flags:
{{.LocalFlags.FlagUsagesWrapped 110 | trimTrailingWhitespaces}}{{end}}{{if .HasAvailableInheritedFlags}}

Global Flags:
{{.InheritedFlags.FlagUsagesWrapped 110 | trimTrailingWhitespaces}}{{end}}{{if .HasHelpSubCommands}}

Additional help topics:{{range .Commands}}{{if .IsAdditionalHelpTopicCommand}}
  {{rpad .CommandPath .CommandPathPadding}} {{.Short}}{{end}}{{end}}{{end}}{{if .HasAvailableSubCommands}}

Use "{{.CommandPath}} [command] --help" for more information about a command.{{end}}
`, s)
}
-------------------------------------------------------------------------------- /lexicmap/cmd/seedposition/seed_position_test.go: --------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
package seedposition

import (
	"math/rand"
	"os"
	"testing"
)

// TestSeedPositions writes several position lists with Writer, reads them
// back in a shuffled (random-access) order with Reader, and checks that the
// round trip is lossless. Temporary files are removed at the end.
func TestSeedPositions(t *testing.T) {
	// position lists of increasing length, including the empty list.
	tests := [][]uint32{
		{},
		{1},
		{1, 15},
		{1, 15, 300},
		{1, 15, 300, 301},
		{1, 15, 300, 301, 2500},
		{1, 15, 300, 301, 2500, 3100},
		{1, 15, 300, 301, 2500, 3100, 3111},
		{1, 15, 300, 301, 2500, 3100, 3111, 5000},
		{1, 15, 300, 301, 2500, 3100, 3111, 5000, 10000},
	}

	file := "test.bin"

	// --------------------------------------- write all lists

	wtr, err := NewWriter(file, 0)
	if err != nil {
		t.Error(err)
		return
	}
	for i, test := range tests {
		err = wtr.Write(test)
		if err != nil {
			t.Errorf("write #%d data: %s", i+1, err)
			return
		}
	}
	err = wtr.Close()
	if err != nil {
		t.Error(err)
		return
	}

	// shuffle record indices so reads exercise random access, not just
	// the sequential order the records were written in.
	idxs := make([]int, len(tests))
	for i := range tests {
		idxs[i] = i
	}
	rand.Shuffle(len(tests), func(i, j int) { idxs[i], idxs[j] = idxs[j], idxs[i] })

	// --------------------------------------- read back and compare

	rdr, err := NewReader(file)
	if err != nil {
		t.Error(err)
		return
	}

	// locs is reused across reads; SeedPositions fills it in place.
	locs := make([]uint32, 64)
	var test []uint32
	var v uint32
	var j int

	for _, i := range idxs {
		test = tests[i]
		err = rdr.SeedPositions(i, &locs)
		if err != nil {
			t.Errorf("read #%d data: %s", i, err)
			return
		}

		if len(locs) != len(test) {
			t.Errorf("[#%d] unequal of position numbers, expected: %d, returned %d",
				i, len(test), len(locs))
			return
		}

		for j, v = range locs {
			if v != test[j] {
				t.Errorf("[#%d] unequal of positions, expected: %d, returned %d", i, test[j], v)
				return
			}
		}
	}

	// clean up

	err = os.RemoveAll(file)
	if err != nil {
		t.Error(err)
		return
	}

	err = os.RemoveAll(file + PositionsIndexFileExt)
	if err != nil {
		t.Error(err)
		return
	}
}
-------------------------------------------------------------------------------- /lexicmap/cmd/subseq.go: --------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20 | 21 | package cmd 22 | 23 | import ( 24 | "fmt" 25 | "path/filepath" 26 | "regexp" 27 | "strconv" 28 | "strings" 29 | 30 | "github.com/shenwei356/LexicMap/lexicmap/cmd/genome" 31 | "github.com/shenwei356/bio/seq" 32 | "github.com/spf13/cobra" 33 | ) 34 | 35 | var subseqCmd = &cobra.Command{ 36 | Use: "subseq", 37 | Short: "Extract subsequence via reference name, sequence ID, position and strand", 38 | Long: `Exextract subsequence via reference name, sequence ID, position and strand 39 | 40 | Attention: 41 | 1. The option -s/--seq-id is optional. 42 | 1) If given, the positions are these in the original sequence. 43 | 2) If not given, the positions are these in the concatenated sequence. 44 | 2. All degenerate bases in reference genomes were converted to the lexicographic first bases. 45 | E.g., N was converted to A. Therefore, consecutive A's in output might be N's in the genomes. 46 | 47 | `, 48 | Run: func(cmd *cobra.Command, args []string) { 49 | opt := getOptions(cmd) 50 | seq.ValidateSeq = false 51 | 52 | // ------------------------------ 53 | 54 | dbDir := getFlagString(cmd, "index") 55 | if dbDir == "" { 56 | checkError(fmt.Errorf("flag -d/--index needed")) 57 | } 58 | 59 | refname := getFlagString(cmd, "ref-name") 60 | if refname == "" { 61 | checkError(fmt.Errorf("flag -n/--ref-name needed")) 62 | } 63 | 64 | seqid := getFlagString(cmd, "seq-id") 65 | var concatenatedPositions bool 66 | if seqid == "" { 67 | concatenatedPositions = true 68 | } 69 | 70 | var reRegion = regexp.MustCompile(`\-?\d+:\-?\d+`) 71 | 72 | region := getFlagString(cmd, "region") 73 | if region == "" { 74 | checkError(fmt.Errorf("flag -r/--region needed")) 75 | } 76 | revcom := getFlagBool(cmd, "revcom") 77 | 78 | lineWidth := getFlagNonNegativeInt(cmd, "line-width") 79 | 80 | if !reRegion.MatchString(region) { 81 | checkError(fmt.Errorf(`invalid region: %s. 
type "lexicmap utils subseq -h" for more examples`, region)) 82 | } 83 | var start, end int 84 | var err error 85 | 86 | r := strings.Split(region, ":") 87 | start, err = strconv.Atoi(r[0]) 88 | checkError(err) 89 | end, err = strconv.Atoi(r[1]) 90 | checkError(err) 91 | if start <= 0 || end <= 0 { 92 | checkError(fmt.Errorf("both begin and end position should not be <= 0")) 93 | } 94 | if start > end { 95 | checkError(fmt.Errorf("begin position should be < end position")) 96 | } 97 | 98 | outFile := getFlagString(cmd, "out-file") 99 | 100 | // --------------------------------------------------------------- 101 | 102 | // genomes.map file for mapping index to genome id 103 | m, err := readGenomeMapName2Idx(filepath.Join(dbDir, FileGenomeIndex)) 104 | if err != nil { 105 | checkError(fmt.Errorf("failed to read genomes index mapping file: %s", err)) 106 | } 107 | 108 | var batchIDAndRefIDs *[]uint64 109 | 110 | var ok bool 111 | if batchIDAndRefIDs, ok = m[refname]; !ok { 112 | checkError(fmt.Errorf("reference name not found: %s", refname)) 113 | } 114 | 115 | var tSeq *genome.Genome 116 | var genomeBatch, genomeIdx int 117 | var rdr *genome.Reader 118 | 119 | var _end int 120 | 121 | for _, batchIDAndRefID := range *batchIDAndRefIDs { 122 | genomeBatch = int(batchIDAndRefID >> BITS_GENOME_IDX) 123 | genomeIdx = int(batchIDAndRefID & MASK_GENOME_IDX) 124 | 125 | fileGenome := filepath.Join(dbDir, DirGenomes, batchDir(genomeBatch), FileGenomes) 126 | rdr, err = genome.NewReader(fileGenome) 127 | if err != nil { 128 | checkError(fmt.Errorf("failed to read genome data file: %s", err)) 129 | } 130 | 131 | if concatenatedPositions { 132 | tSeq, err = rdr.SubSeq(genomeIdx, start-1, end-1) 133 | } else { 134 | tSeq, _end, err = rdr.SubSeq2(genomeIdx, []byte(seqid), start-1, end-1) 135 | _end++ // returned end is 0-based. 
136 | } 137 | if err == nil { 138 | break 139 | // checkError(fmt.Errorf("failed to read subsequence: %s", err)) 140 | } 141 | } 142 | if err != nil { 143 | checkError(fmt.Errorf("failed to read subsequence: %s", err)) 144 | } 145 | 146 | end = _end // update end 147 | 148 | // output file handler 149 | outfh, gw, w, err := outStream(outFile, strings.HasSuffix(outFile, ".gz"), opt.CompressionLevel) 150 | checkError(err) 151 | defer func() { 152 | outfh.Flush() 153 | if gw != nil { 154 | gw.Close() 155 | } 156 | w.Close() 157 | }() 158 | 159 | s, err := seq.NewSeq(seq.DNAredundant, tSeq.Seq) 160 | checkError(err) 161 | 162 | strand := "+" 163 | if revcom { 164 | strand = "-" 165 | s.RevComInplace() 166 | } 167 | 168 | if concatenatedPositions { 169 | fmt.Fprintf(outfh, ">%s:%d-%d:%s\n", refname, start, end, strand) 170 | } else { 171 | fmt.Fprintf(outfh, ">%s:%d-%d:%s\n", seqid, start, end, strand) 172 | } 173 | outfh.Write(s.FormatSeq(lineWidth)) 174 | outfh.WriteByte('\n') 175 | 176 | genome.RecycleGenome(tSeq) 177 | checkError(rdr.Close()) 178 | }, 179 | } 180 | 181 | func init() { 182 | utilsCmd.AddCommand(subseqCmd) 183 | 184 | subseqCmd.Flags().StringP("index", "d", "", 185 | formatFlagUsage(`Index directory created by "lexicmap index".`)) 186 | 187 | subseqCmd.Flags().StringP("ref-name", "n", "", 188 | formatFlagUsage(`Reference name.`)) 189 | 190 | subseqCmd.Flags().StringP("seq-id", "s", "", 191 | formatFlagUsage(`Sequence ID. 
If the value is empty, the positions in the region are treated as that in the concatenated sequence.`)) 192 | 193 | subseqCmd.Flags().StringP("out-file", "o", "-", 194 | formatFlagUsage(`Out file, supports the ".gz" suffix ("-" for stdout).`)) 195 | 196 | subseqCmd.Flags().StringP("region", "r", "", 197 | formatFlagUsage(`Region of the subsequence (1-based).`)) 198 | 199 | subseqCmd.Flags().BoolP("revcom", "R", false, 200 | formatFlagUsage("Extract subsequence on the negative strand.")) 201 | 202 | subseqCmd.Flags().IntP("line-width", "w", 60, 203 | formatFlagUsage("Line width of sequence (0 for no wrap).")) 204 | 205 | subseqCmd.SetUsageTemplate(usageTemplate("")) 206 | } 207 | -------------------------------------------------------------------------------- /lexicmap/cmd/tree/tree_test.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package tree

import (
	"testing"

	"github.com/shenwei356/kmers"
	"github.com/shenwei356/lexichash"
)

// TestTree repeatedly builds a partially filled 6-mer tree, runs one prefix
// search on it, and recycles both the search results and the tree — mainly
// exercising the recycle/reuse paths across many rounds.
func TestTree(t *testing.T) {
	var k uint8
	var n uint64
	var i uint64
	var v uint32
	var query string
	var code uint64
	var srs *[]*SearchResult

	// 1000 rounds so recycled objects get reused across iterations.
	for j := 0; j < 1000; j++ {
		k = 6
		n = uint64(1 << (k * 2)) // number of all possible k-mers of size k

		_t := NewTree(uint8(k))

		for i = 0; i < n; i++ {
			v = uint32(i & 3)
			// skip k-mers whose low two bits are 0b00 or 0b11,
			// so only part of the k-mer space is inserted.
			if v == 3 || v == 0 {
				continue
			}
			_t.Insert(i, v)
		}

		query = "ACTGAC"
		code, _ = kmers.Encode([]byte(query))
		// srs, _ := tree.Search(code, uint8(len(query)), 4)
		srs, _ = _t.Search(code, 5)
		t.Logf("query: %s\n", query)
		for _, sr := range *srs {
			t.Logf("  %s, len(prefix): %d, %v\n",
				lexichash.MustDecode(sr.Kmer, k), sr.LenPrefix, sr.Values)
		}
		_t.RecycleSearchResult(srs)

		RecycleTree(_t)
	}

}
-------------------------------------------------------------------------------- /lexicmap/cmd/util-cli.go: --------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package cmd 22 | 23 | import ( 24 | "bufio" 25 | "fmt" 26 | "os" 27 | "strconv" 28 | "strings" 29 | 30 | "github.com/pkg/errors" 31 | "github.com/shenwei356/util/stringutil" 32 | "github.com/spf13/cobra" 33 | ) 34 | 35 | func checkError(err error) { 36 | if err != nil { 37 | log.Error(err) 38 | os.Exit(-1) 39 | } 40 | } 41 | 42 | func isStdin(file string) bool { 43 | return file == "-" 44 | } 45 | 46 | func isStdout(file string) bool { 47 | return file == "-" 48 | } 49 | 50 | func getFlagInt(cmd *cobra.Command, flag string) int { 51 | value, err := cmd.Flags().GetInt(flag) 52 | checkError(err) 53 | return value 54 | } 55 | 56 | func getFlagIntSlice(cmd *cobra.Command, flag string) []int { 57 | value, err := cmd.Flags().GetIntSlice(flag) 58 | checkError(err) 59 | return value 60 | } 61 | 62 | func getFlagUint8(cmd *cobra.Command, flag string) uint8 { 63 | value, err := cmd.Flags().GetUint8(flag) 64 | checkError(err) 65 | return value 66 | } 67 | 68 | func getFlagUint32(cmd *cobra.Command, flag string) uint32 { 69 | value, err := cmd.Flags().GetUint32(flag) 70 | checkError(err) 71 | return value 72 | } 73 | 74 | func getFlagUint64(cmd *cobra.Command, flag string) uint64 { 75 | value, err := cmd.Flags().GetUint64(flag) 76 | checkError(err) 77 | return value 78 | } 79 | 80 | func 
getFlagPositiveInt(cmd *cobra.Command, flag string) int { 81 | value, err := cmd.Flags().GetInt(flag) 82 | checkError(err) 83 | if value <= 0 { 84 | checkError(fmt.Errorf("value of flag --%s should be greater than 0", flag)) 85 | } 86 | return value 87 | } 88 | 89 | func getFlagPositiveFloat64(cmd *cobra.Command, flag string) float64 { 90 | value, err := cmd.Flags().GetFloat64(flag) 91 | checkError(err) 92 | if value <= 0 { 93 | checkError(fmt.Errorf("value of flag --%s should be greater than 0", flag)) 94 | } 95 | return value 96 | } 97 | 98 | func getFlagNonNegativeInt(cmd *cobra.Command, flag string) int { 99 | value, err := cmd.Flags().GetInt(flag) 100 | checkError(err) 101 | if value < 0 { 102 | checkError(fmt.Errorf("value of flag --%s should be greater than or equal to 0", flag)) 103 | } 104 | return value 105 | } 106 | 107 | func getFlagNonNegativeFloat64(cmd *cobra.Command, flag string) float64 { 108 | value, err := cmd.Flags().GetFloat64(flag) 109 | checkError(err) 110 | if value < 0 { 111 | checkError(fmt.Errorf("value of flag --%s should be greater than or equal to ", flag)) 112 | } 113 | return value 114 | } 115 | 116 | func getFlagBool(cmd *cobra.Command, flag string) bool { 117 | value, err := cmd.Flags().GetBool(flag) 118 | checkError(err) 119 | return value 120 | } 121 | 122 | func getFlagString(cmd *cobra.Command, flag string) string { 123 | value, err := cmd.Flags().GetString(flag) 124 | checkError(err) 125 | return value 126 | } 127 | 128 | func getFlagNonEmptyString(cmd *cobra.Command, flag string) string { 129 | value, err := cmd.Flags().GetString(flag) 130 | checkError(err) 131 | if value == "" { 132 | checkError(fmt.Errorf("flag --%s needed", flag)) 133 | } 134 | return value 135 | } 136 | 137 | func getFlagCommaSeparatedStrings(cmd *cobra.Command, flag string) []string { 138 | value, err := cmd.Flags().GetString(flag) 139 | checkError(err) 140 | return stringutil.Split(value, ",") 141 | } 142 | 143 | func 
getFlagSemicolonSeparatedStrings(cmd *cobra.Command, flag string) []string { 144 | value, err := cmd.Flags().GetString(flag) 145 | checkError(err) 146 | return stringutil.Split(value, ";") 147 | } 148 | 149 | func getFlagCommaSeparatedInts(cmd *cobra.Command, flag string) []int { 150 | filedsStrList := getFlagCommaSeparatedStrings(cmd, flag) 151 | fields := make([]int, len(filedsStrList)) 152 | for i, value := range filedsStrList { 153 | v, err := strconv.Atoi(value) 154 | if err != nil { 155 | checkError(fmt.Errorf("value of flag --%s should be comma separated integers", flag)) 156 | } 157 | fields[i] = v 158 | } 159 | return fields 160 | } 161 | 162 | func getFlagRune(cmd *cobra.Command, flag string) rune { 163 | value, err := cmd.Flags().GetString(flag) 164 | checkError(err) 165 | if len(value) > 1 { 166 | checkError(fmt.Errorf("value of flag --%s should has length of 1", flag)) 167 | } 168 | var v rune 169 | for _, r := range value { 170 | v = r 171 | break 172 | } 173 | return v 174 | } 175 | 176 | func getFlagFloat64(cmd *cobra.Command, flag string) float64 { 177 | value, err := cmd.Flags().GetFloat64(flag) 178 | checkError(err) 179 | return value 180 | } 181 | 182 | func getFlagInt64(cmd *cobra.Command, flag string) int64 { 183 | value, err := cmd.Flags().GetInt64(flag) 184 | checkError(err) 185 | return value 186 | } 187 | 188 | func getFlagStringSlice(cmd *cobra.Command, flag string) []string { 189 | value, err := cmd.Flags().GetStringSlice(flag) 190 | checkError(err) 191 | return value 192 | } 193 | 194 | func getFileList(args []string, checkFile bool) []string { 195 | files := make([]string, 0, 1024) 196 | if len(args) == 0 { 197 | files = append(files, "-") 198 | } else { 199 | for _, file := range args { 200 | if isStdin(file) { 201 | continue 202 | } 203 | if !checkFile { 204 | continue 205 | } 206 | if _, err := os.Stat(file); os.IsNotExist(err) { 207 | checkError(errors.Wrap(err, file)) 208 | } 209 | } 210 | files = args 211 | } 212 | return files 
213 | } 214 | 215 | func getFileListFromFile(file string, checkFile bool) ([]string, error) { 216 | var fh *os.File 217 | var err error 218 | if file == "-" { 219 | fh = os.Stdin 220 | } else { 221 | fh, err = os.Open(file) 222 | if err != nil { 223 | return nil, fmt.Errorf("read file list from '%s': %s", file, err) 224 | } 225 | } 226 | 227 | var _file string 228 | lists := make([]string, 0, 1024) 229 | scanner := bufio.NewScanner(fh) 230 | for scanner.Scan() { 231 | _file = scanner.Text() 232 | if strings.TrimSpace(_file) == "" { 233 | continue 234 | } 235 | lists = append(lists, _file) 236 | } 237 | if err = scanner.Err(); err != nil { 238 | return nil, fmt.Errorf("read file list from '%s': %s", file, err) 239 | } 240 | 241 | if !checkFile { 242 | return lists, fh.Close() 243 | } 244 | 245 | for _, _file = range lists { 246 | if !isStdin(_file) { 247 | if _, err = os.Stat(_file); os.IsNotExist(err) { 248 | return lists, fmt.Errorf("check file '%s': %s", _file, err) 249 | } 250 | } 251 | } 252 | 253 | return lists, fh.Close() 254 | } 255 | 256 | func getFileListFromArgsAndFile(cmd *cobra.Command, args []string, checkFileFromArgs bool, flag string, checkFileFromFile bool) []string { 257 | infileList := getFlagString(cmd, flag) 258 | files := getFileList(args, checkFileFromArgs) 259 | if infileList != "" { 260 | _files, err := getFileListFromFile(infileList, checkFileFromFile) 261 | checkError(err) 262 | if len(_files) == 0 { 263 | log.Warningf("no files found in file list: %s", infileList) 264 | return files 265 | } 266 | 267 | if len(files) == 1 && isStdin(files[0]) { 268 | return _files 269 | } 270 | files = append(files, _files...) 
271 | } 272 | return files 273 | } 274 | 275 | // ParseByteSize parses byte size from string 276 | func ParseByteSize(val string) (int64, error) { 277 | val = strings.Trim(val, " \t\r\n") 278 | if val == "" { 279 | return 0, nil 280 | } 281 | var u int64 282 | var noUnit bool 283 | switch val[len(val)-1] { 284 | case 'B', 'b': 285 | u = 1 286 | case 'K', 'k': 287 | u = 1 << 10 288 | case 'M', 'm': 289 | u = 1 << 20 290 | case 'G', 'g': 291 | u = 1 << 30 292 | case 'T', 't': 293 | u = 1 << 40 294 | default: 295 | noUnit = true 296 | u = 1 297 | } 298 | var size float64 299 | var err error 300 | if noUnit { 301 | size, err = strconv.ParseFloat(val, 10) 302 | if err != nil { 303 | return 0, fmt.Errorf("invalid byte size: %s", val) 304 | } 305 | if size < 0 { 306 | size = 0 307 | } 308 | return int64(size), nil 309 | } 310 | 311 | if len(val) == 1 { // no value 312 | return 0, nil 313 | } 314 | 315 | size, err = strconv.ParseFloat(strings.Trim(val[0:len(val)-1], " \t\r\n"), 10) 316 | if err != nil { 317 | return 0, fmt.Errorf("invalid byte size: %s", val) 318 | } 319 | if size < 0 { 320 | size = 0 321 | } 322 | return int64(size * float64(u)), nil 323 | } 324 | -------------------------------------------------------------------------------- /lexicmap/cmd/util-io.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or 
substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package cmd 22 | 23 | import ( 24 | "bufio" 25 | "errors" 26 | "fmt" 27 | "io" 28 | "os" 29 | "path/filepath" 30 | 31 | gzip "github.com/klauspost/pgzip" 32 | ) 33 | 34 | // BufferSize is size of buffer 35 | var BufferSize = 65536 // os.Getpagesize() 36 | 37 | func outStream(file string, gzipped bool, level int) (*bufio.Writer, io.WriteCloser, *os.File, error) { 38 | var w *os.File 39 | if file == "-" { 40 | w = os.Stdout 41 | } else { 42 | dir := filepath.Dir(file) 43 | fi, err := os.Stat(dir) 44 | if err == nil && !fi.IsDir() { 45 | return nil, nil, nil, fmt.Errorf("can not write file into a non-directory path: %s", dir) 46 | } 47 | if os.IsNotExist(err) { 48 | os.MkdirAll(dir, 0755) 49 | } 50 | 51 | w, err = os.Create(file) 52 | if err != nil { 53 | return nil, nil, nil, fmt.Errorf("fail to write %s: %s", file, err) 54 | } 55 | } 56 | 57 | if gzipped { 58 | // gw := gzip.NewWriter(w) 59 | gw, err := gzip.NewWriterLevel(w, level) 60 | if err != nil { 61 | return nil, nil, nil, fmt.Errorf("fail to write %s: %s", file, err) 62 | } 63 | return bufio.NewWriterSize(gw, BufferSize), gw, w, nil 64 | } 65 | return bufio.NewWriterSize(w, BufferSize), nil, w, nil 66 | } 67 | 68 | func inStream(file string) (*bufio.Reader, *os.File, bool, error) { 69 | var err error 70 | var r *os.File 71 | var gzipped bool 72 | if file == "-" { 73 | if !detectStdin() { 74 | return nil, nil, gzipped, errors.New("stdin not 
detected") 75 | } 76 | r = os.Stdin 77 | } else { 78 | r, err = os.Open(file) 79 | if err != nil { 80 | return nil, nil, gzipped, fmt.Errorf("fail to read %s: %s", file, err) 81 | } 82 | } 83 | 84 | br := bufio.NewReaderSize(r, BufferSize) 85 | 86 | if gzipped, err = isGzip(br); err != nil { 87 | return nil, nil, gzipped, fmt.Errorf("fail to check is file (%s) gzipped: %s", file, err) 88 | } else if gzipped { 89 | // gr, err := gzip.NewReader(br) 90 | gr, err := gzip.NewReaderN(br, 65536, 8) 91 | if err != nil { 92 | return nil, r, gzipped, fmt.Errorf("fail to create gzip reader for %s: %s", file, err) 93 | } 94 | br = bufio.NewReaderSize(gr, BufferSize) 95 | } 96 | return br, r, gzipped, nil 97 | } 98 | 99 | func isGzip(b *bufio.Reader) (bool, error) { 100 | return checkBytes(b, []byte{0x1f, 0x8b}) 101 | } 102 | 103 | func checkBytes(b *bufio.Reader, buf []byte) (bool, error) { 104 | m, err := b.Peek(len(buf)) 105 | if err != nil { 106 | return false, fmt.Errorf("no content") 107 | } 108 | for i := range buf { 109 | if m[i] != buf[i] { 110 | return false, nil 111 | } 112 | } 113 | return true, nil 114 | } 115 | 116 | func detectStdin() bool { 117 | // http://stackoverflow.com/a/26567513 118 | stat, err := os.Stdin.Stat() 119 | if err != nil { 120 | return false 121 | } 122 | return (stat.Mode() & os.ModeCharDevice) == 0 123 | } 124 | -------------------------------------------------------------------------------- /lexicmap/cmd/util-logging.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the 
// Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package cmd

import (
	"fmt"
	"io"
	"os"
	"runtime"

	"github.com/mattn/go-colorable"
	"github.com/shenwei356/go-logging"
)

// log is the package-wide logger, configured in init and reconfigured by addLog.
var log *logging.Logger

// logFormat is the colored format used when logging to stderr.
var logFormat = logging.MustStringFormatter(
	`%{time:15:04:05.000} %{color}[%{level:.4s}]%{color:reset} %{message}`,
)

// backendFormatter is the stderr backend, kept so addLog can combine it
// with a file backend.
var backendFormatter logging.Backend

func init() {
	var stderr io.Writer = os.Stderr
	// on Windows, wrap stderr so ANSI color escapes are rendered correctly.
	if runtime.GOOS == "windows" {
		stderr = colorable.NewColorableStderr()
	}
	backend := logging.NewLogBackend(stderr, "", 0)
	backendFormatter = logging.NewBackendFormatter(backend, logFormat)

	logging.SetBackend(backendFormatter)

	log = logging.MustGetLogger("lexicmap")
}

// addLog additionally writes log messages to the given file (without color
// codes). When verbose is false, messages go to the file only; otherwise to
// both stderr and the file. The returned *os.File must be closed by the caller.
func addLog(file string, verbose bool) *os.File {
	w, err := os.Create(file)
	if err != nil {
		checkError(fmt.Errorf("failed to write log file %s: %s", file, err))
	}

	// plain (uncolored) format for the log file.
	var logFormat2 = logging.MustStringFormatter(
		`%{time:15:04:05.000} [%{level:.4s}] %{message}`,
	)
	backend := logging.NewLogBackend(w, "", 0)
	backendFormatter2 := logging.NewBackendFormatter(backend, logFormat2)

	if !verbose {
		logging.SetBackend(backendFormatter2)
	} else {
		logging.SetBackend(backendFormatter, backendFormatter2)
	}

	log = logging.MustGetLogger("lexicmap")

	return w
}
-------------------------------------------------------------------------------- /lexicmap/cmd/util/kmers_test.go: --------------------------------------------------------------------------------
// Copyright © 2023-2024 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20 | 21 | package util 22 | 23 | import ( 24 | "fmt" 25 | "testing" 26 | 27 | "github.com/shenwei356/kmers" 28 | ) 29 | 30 | func TestIsLowComplexityDust(t *testing.T) { 31 | mer := []byte("TAAAAATACCTCAAAAAGAATAAAAATCCCG") 32 | k := len(mer) 33 | 34 | code, err := kmers.Encode(mer) 35 | if err != nil { 36 | t.Error(err) 37 | return 38 | } 39 | 40 | fmt.Printf("%s, low-complexity: %v\n", mer, IsLowComplexityDust(code, uint8(k))) 41 | } 42 | 43 | func TestNs(t *testing.T) { 44 | var k uint8 = 5 45 | values := []uint64{ 46 | Ns(0b00, k), // A 47 | Ns(0b01, k), // C 48 | Ns(0b10, k), // G 49 | Ns(0b11, k), // T 50 | } 51 | for _, v := range values { 52 | fmt.Printf("%s, %064b\n", kmers.MustDecode(v, int(k)), v) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /lexicmap/cmd/util/util.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package util 22 | 23 | import "github.com/twotwotwo/sorts/sortutil" 24 | 25 | // https://gist.github.com/badboy/6267743 . 26 | // version with mask: https://gist.github.com/lh3/974ced188be2f90422cc . 27 | func Hash64(key uint64) uint64 { 28 | key = (^key) + (key << 21) // key = (key << 21) - key - 1 29 | key = key ^ (key >> 24) 30 | key = (key + (key << 3)) + (key << 8) // key * 265 31 | key = key ^ (key >> 14) 32 | key = (key + (key << 2)) + (key << 4) // key * 21 33 | key = key ^ (key >> 28) 34 | key = key + (key << 31) 35 | return key 36 | } 37 | 38 | // UniqUint64s removes duplicates in a uint64 list 39 | func UniqUint64s(list *[]uint64) { 40 | if len(*list) == 0 || len(*list) == 1 { 41 | return 42 | } 43 | 44 | sortutil.Uint64s(*list) 45 | 46 | var i, j int 47 | var p, v uint64 48 | var flag bool 49 | p = (*list)[0] 50 | for i = 1; i < len(*list); i++ { 51 | v = (*list)[i] 52 | if v == p { 53 | if !flag { 54 | j = i // mark insertion position 55 | flag = true 56 | } 57 | continue 58 | } 59 | 60 | if flag { // need to insert to previous position 61 | (*list)[j] = v 62 | j++ 63 | } 64 | p = v 65 | } 66 | if j > 0 { 67 | *list = (*list)[:j] 68 | } 69 | } 70 | 71 | // ReverseInts reverses a list of ints 72 | func ReverseInts(s []int) { 73 | for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 { 74 | s[i], s[j] = s[j], s[i] 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /lexicmap/cmd/util/varint-GB.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2018-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this 
software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 20 | 21 | package util 22 | 23 | var offsetsUint64 = []uint8{56, 48, 40, 32, 24, 16, 8, 0} 24 | var offsetsUint32 = []uint8{24, 16, 8, 0} 25 | 26 | // PutUint64s encodes two uint64s into 2-16 bytes, and returns control byte 27 | // and encoded byte length. 28 | func PutUint64s(buf []byte, v1, v2 uint64) (ctrl byte, n int) { 29 | blen := ByteLengthUint64(v1) 30 | ctrl |= byte(blen - 1) 31 | for _, offset := range offsetsUint64[8-blen:] { 32 | buf[n] = byte((v1 >> offset) & 0xff) 33 | n++ 34 | } 35 | 36 | ctrl <<= 3 37 | blen = ByteLengthUint64(v2) 38 | ctrl |= byte(blen - 1) 39 | for _, offset := range offsetsUint64[8-blen:] { 40 | buf[n] = byte((v2 >> offset) & 0xff) 41 | n++ 42 | } 43 | return 44 | } 45 | 46 | // PutUint32s encodes four uint32s into 4-16 bytes, and returns control byte 47 | // and encoded byte length. 
48 | func PutUint32s(buf []byte, v1, v2, v3, v4 uint32) (ctrl byte, n int) { 49 | blen := ByteLengthUint32(v1) 50 | ctrl |= byte(blen - 1) 51 | for _, offset := range offsetsUint32[4-blen:] { 52 | buf[n] = byte((v1 >> offset) & 0xff) 53 | n++ 54 | } 55 | 56 | ctrl <<= 2 57 | blen = ByteLengthUint32(v2) 58 | ctrl |= byte(blen - 1) 59 | for _, offset := range offsetsUint32[4-blen:] { 60 | buf[n] = byte((v2 >> offset) & 0xff) 61 | n++ 62 | } 63 | 64 | ctrl <<= 2 65 | blen = ByteLengthUint32(v3) 66 | ctrl |= byte(blen - 1) 67 | for _, offset := range offsetsUint32[4-blen:] { 68 | buf[n] = byte((v3 >> offset) & 0xff) 69 | n++ 70 | } 71 | 72 | ctrl <<= 2 73 | blen = ByteLengthUint32(v4) 74 | ctrl |= byte(blen - 1) 75 | for _, offset := range offsetsUint32[4-blen:] { 76 | buf[n] = byte((v4 >> offset) & 0xff) 77 | n++ 78 | } 79 | 80 | return 81 | } 82 | 83 | // Uint64s decodes encoded bytes. 84 | func Uint64s(ctrl byte, buf []byte) (v1, v2 uint64, n int) { 85 | blen1 := int((ctrl>>3)&7) + 1 86 | blen2 := int(ctrl&7) + 1 87 | if len(buf) < blen1+blen2 { 88 | return 0, 0, 0 89 | } 90 | 91 | var j int 92 | 93 | for j = 0; j < blen1; j++ { 94 | v1 <<= 8 95 | v1 |= uint64(buf[n]) 96 | n++ 97 | } 98 | 99 | for j = 0; j < blen2; j++ { 100 | v2 <<= 8 101 | v2 |= uint64(buf[n]) 102 | n++ 103 | } 104 | 105 | return 106 | } 107 | 108 | // Uint32s decodes encoded bytes. 
109 | func Uint32s(ctrl byte, buf []byte) (v1, v2, v3, v4 uint32, n int) { 110 | blen1 := int((ctrl>>6)&3) + 1 111 | blen2 := int((ctrl>>4)&3) + 1 112 | blen3 := int((ctrl>>2)&3) + 1 113 | blen4 := int(ctrl&3) + 1 114 | if len(buf) < blen1+blen2+blen3+blen4 { 115 | return 0, 0, 0, 0, 0 116 | } 117 | 118 | var j int 119 | 120 | for j = 0; j < blen1; j++ { 121 | v1 <<= 8 122 | v1 |= uint32(buf[n]) 123 | n++ 124 | } 125 | 126 | for j = 0; j < blen2; j++ { 127 | v2 <<= 8 128 | v2 |= uint32(buf[n]) 129 | n++ 130 | } 131 | 132 | for j = 0; j < blen3; j++ { 133 | v3 <<= 8 134 | v3 |= uint32(buf[n]) 135 | n++ 136 | } 137 | 138 | for j = 0; j < blen4; j++ { 139 | v4 <<= 8 140 | v4 |= uint32(buf[n]) 141 | n++ 142 | } 143 | 144 | return 145 | } 146 | 147 | // ByteLengthUint64 returns the minimum number of bytes to store a integer. 148 | func ByteLengthUint64(n uint64) uint8 { 149 | if n < 256 { 150 | return 1 151 | } 152 | if n < 65536 { 153 | return 2 154 | } 155 | if n < 16777216 { 156 | return 3 157 | } 158 | if n < 4294967296 { 159 | return 4 160 | } 161 | if n < 1099511627776 { 162 | return 5 163 | } 164 | if n < 281474976710656 { 165 | return 6 166 | } 167 | if n < 72057594037927936 { 168 | return 7 169 | } 170 | return 8 171 | } 172 | 173 | // ByteLengthUint32 returns the minimum number of bytes to store a integer. 174 | func ByteLengthUint32(n uint32) uint8 { 175 | if n < 256 { 176 | return 1 177 | } 178 | if n < 65536 { 179 | return 2 180 | } 181 | if n < 16777216 { 182 | return 3 183 | } 184 | return 4 185 | } 186 | 187 | // CtrlByte2ByteLengthsUint64 returns the byte length for a given control byte. 188 | func CtrlByte2ByteLengthsUint64(ctrl byte) int { 189 | return int(ctrl>>3&7+ctrl&7) + 2 190 | } 191 | 192 | // CtrlByte2ByteLengthsUint32 returns the byte length for a given control byte. 
func CtrlByte2ByteLengthsUint32(ctrl byte) int {
	// Four 2-bit fields, each storing byteLen-1, plus the 4 implied bytes.
	total := 4
	for shift := 0; shift <= 6; shift += 2 {
		total += int(ctrl >> shift & 3)
	}
	return total
}

// -------- /lexicmap/cmd/util/varint-GB_test.go --------

// Copyright © 2018-2021 Wei Shen
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20 | 21 | package util 22 | 23 | import ( 24 | "math/rand" 25 | "testing" 26 | ) 27 | 28 | var testsUint64 [][2]uint64 29 | var testsUint32 [][4]uint32 30 | 31 | func init() { 32 | ntests := 10000 33 | testsUint64 = make([][2]uint64, ntests) 34 | testsUint32 = make([][4]uint32, ntests) 35 | var i int 36 | for ; i < ntests/4; i++ { 37 | testsUint64[i] = [2]uint64{rand.Uint64(), rand.Uint64()} 38 | testsUint32[i] = [4]uint32{rand.Uint32(), rand.Uint32(), rand.Uint32(), rand.Uint32()} 39 | } 40 | for ; i < ntests/2; i++ { 41 | testsUint64[i] = [2]uint64{uint64(rand.Uint32()), uint64(rand.Uint32())} 42 | testsUint32[i] = [4]uint32{rand.Uint32(), rand.Uint32(), rand.Uint32(), rand.Uint32()} 43 | } 44 | for ; i < ntests*3/4; i++ { 45 | testsUint64[i] = [2]uint64{uint64(rand.Intn(65536)), uint64(rand.Intn(256))} 46 | testsUint32[i] = [4]uint32{uint32(rand.Intn(65536)), uint32(rand.Intn(256)), uint32(rand.Intn(65536)), uint32(rand.Intn(256))} 47 | } 48 | for ; i < ntests; i++ { 49 | testsUint64[i] = [2]uint64{uint64(rand.Intn(256)), uint64(rand.Intn(256))} 50 | testsUint32[i] = [4]uint32{uint32(rand.Intn(256)), uint32(rand.Intn(256)), uint32(rand.Intn(256)), uint32(rand.Intn(256))} 51 | } 52 | } 53 | 54 | func TestStreamVByte64(t *testing.T) { 55 | buf := make([]byte, 16) 56 | var ctrl byte 57 | var n, n2 int 58 | var v1, v2 uint64 59 | for i, test := range testsUint64 { 60 | ctrl, n = PutUint64s(buf, test[0], test[1]) 61 | if CtrlByte2ByteLengthsUint64(ctrl) != n { 62 | t.Errorf("#%d, wrong byte length", i) 63 | } 64 | 65 | v1, v2, n2 = Uint64s(ctrl, buf[0:n]) 66 | if n2 == 0 { 67 | t.Errorf("#%d, wrong decoded number", i) 68 | } 69 | 70 | if v1 != test[0] || v2 != test[1] { 71 | t.Errorf("#%d, wrong decoded result: %d, %d, answer: %d, %d", i, v1, v2, test[0], test[1]) 72 | } 73 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n]) 74 | } 75 | } 76 | 77 | func TestStreamVByte32(t *testing.T) { 78 | buf := make([]byte, 16) 79 | var ctrl byte 80 | var 
n, n2 int 81 | var v1, v2, v3, v4 uint32 82 | for i, test := range testsUint32 { 83 | ctrl, n = PutUint32s(buf, test[0], test[1], test[2], test[3]) 84 | if CtrlByte2ByteLengthsUint32(ctrl) != n { 85 | t.Errorf("#%d, wrong byte length", i) 86 | } 87 | 88 | v1, v2, v3, v4, n2 = Uint32s(ctrl, buf[0:n]) 89 | if n2 == 0 { 90 | t.Errorf("#%d, wrong decoded number", i) 91 | } 92 | 93 | if v1 != test[0] || v2 != test[1] || v3 != test[2] || v4 != test[3] { 94 | t.Errorf("#%d, wrong decoded result: %d, %d, %d, %d, answer: %d, %d, %d, %d", i, v1, v2, v3, v4, test[0], test[1], test[2], test[3]) 95 | } 96 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n]) 97 | } 98 | } 99 | 100 | var _v1, _v2 uint64 101 | 102 | func BenchmarkUint64s(b *testing.B) { 103 | buf := make([]byte, 16) 104 | var ctrl byte 105 | var n, n2 int 106 | var v1, v2 uint64 107 | for i := 0; i < b.N; i++ { 108 | for i, test := range testsUint64 { 109 | ctrl, n = PutUint64s(buf, test[0], test[1]) 110 | 111 | v1, v2, n2 = Uint64s(ctrl, buf[0:n]) 112 | if n2 == 0 { 113 | b.Errorf("#%d, wrong decoded number", i) 114 | } 115 | 116 | if v1 != test[0] || v2 != test[1] { 117 | b.Errorf("#%d, wrong decoded result: %d, %d, answer: %d, %d", i, v1, v2, test[0], test[1]) 118 | } 119 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n]) 120 | } 121 | } 122 | _v1, _v2 = v1, v2 123 | } 124 | 125 | var __v1, __v2, __v3, __v4 uint32 126 | 127 | func BenchmarkUint32s(b *testing.B) { 128 | buf := make([]byte, 16) 129 | var ctrl byte 130 | var n, n2 int 131 | var v1, v2, v3, v4 uint32 132 | for i := 0; i < b.N; i++ { 133 | for i, test := range testsUint32 { 134 | ctrl, n = PutUint32s(buf, test[0], test[1], test[2], test[3]) 135 | 136 | v1, v2, v3, v4, n2 = Uint32s(ctrl, buf[0:n]) 137 | if n2 == 0 { 138 | b.Errorf("#%d, wrong decoded number", i) 139 | } 140 | 141 | if v1 != test[0] || v2 != test[1] || v3 != test[2] || v4 != test[3] { 142 | b.Errorf("#%d, wrong decoded result: %d, %d, %d, 
%d, answer: %d, %d, %d, %d", i, v1, v2, v3, v4, test[0], test[1], test[2], test[3]) 143 | } 144 | // fmt.Printf("%d, %d => n=%d, buf=%v\n", test[0], test[1], n, buf[0:n]) 145 | } 146 | } 147 | __v1, __v2, __v3, __v4 = v1, v2, v3, v4 148 | } 149 | -------------------------------------------------------------------------------- /lexicmap/cmd/utils.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package cmd 22 | 23 | import ( 24 | "github.com/spf13/cobra" 25 | ) 26 | 27 | var utilsCmd = &cobra.Command{ 28 | Use: "utils", 29 | Short: "Some utilities", 30 | Long: `Some utilities 31 | `, 32 | } 33 | 34 | func init() { 35 | RootCmd.AddCommand(utilsCmd) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /lexicmap/cmd/version.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2023-2024 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package cmd 22 | 23 | import ( 24 | "fmt" 25 | "net/http" 26 | "strings" 27 | 28 | "github.com/shenwei356/util/cliutil" 29 | "github.com/spf13/cobra" 30 | ) 31 | 32 | // VERSION is the version 33 | var VERSION = "0.7.1" 34 | 35 | // COMMIT is the last commit 36 | // var COMMIT = func() string { 37 | // if info, ok := debug.ReadBuildInfo(); ok { 38 | // for _, setting := range info.Settings { 39 | // if setting.Key == "vcs.revision" { 40 | // return setting.Value[:7] 41 | // } 42 | // } 43 | // } 44 | // return "" 45 | // }() 46 | 47 | // can pass from from command line: 48 | // commit=$(git rev-parse --short HEAD) 49 | // go build -trimpath -o=lexicmap -ldflags="-s -w -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$commit" -tags netgo 50 | var COMMIT = "" 51 | 52 | // versionCmd represents the version command 53 | var versionCmd = &cobra.Command{ 54 | Use: "version", 55 | Short: "Print version information and check for update", 56 | Long: `Print version information and check for update 57 | 58 | `, 59 | Run: func(cmd *cobra.Command, args []string) { 60 | app := "LexicMap" 61 | if COMMIT == "" { 62 | fmt.Printf("%s v%s\n", app, VERSION) 63 | } else { 64 | fmt.Printf("%s v%s (%s)\n", app, VERSION, COMMIT) 65 | } 66 | 67 | if !cliutil.GetFlagBool(cmd, "check-update") { 68 | return 69 | } 70 | 71 | fmt.Println("\nChecking new version...") 72 | 73 | resp, err := http.Get(fmt.Sprintf("https://github.com/shenwei356/%s/releases/latest", app)) 74 | if err != nil { 75 | checkError(fmt.Errorf("network error")) 76 | } 77 | items := strings.Split(resp.Request.URL.String(), "/") 78 | version := "" 79 | if items[len(items)-1] == "" { 80 | version = items[len(items)-2] 81 | } else { 82 | version = items[len(items)-1] 83 | } 84 | if version == "v"+VERSION { 85 | fmt.Printf("You are using the latest version of %s\n", app) 86 | } else { 87 | fmt.Printf("New version available: %s %s at %s\n", app, version, resp.Request.URL.String()) 88 | } 89 | }, 90 | } 91 | 92 | 
func init() { 93 | RootCmd.AddCommand(versionCmd) 94 | 95 | versionCmd.Flags().BoolP("check-update", "u", false, `check update`) 96 | } 97 | -------------------------------------------------------------------------------- /lexicmap/main.go: -------------------------------------------------------------------------------- 1 | // Copyright © 2018-2020 Wei Shen 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy 4 | // of this software and associated documentation files (the "Software"), to deal 5 | // in the Software without restriction, including without limitation the rights 6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | // copies of the Software, and to permit persons to whom the Software is 8 | // furnished to do so, subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in 11 | // all copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | // THE SOFTWARE. 
20 | 21 | package main 22 | 23 | import ( 24 | "github.com/shenwei356/LexicMap/lexicmap/cmd" 25 | ) 26 | 27 | func main() { 28 | // go tool pprof -http=:8080 cpu.pprof 29 | // defer profile.Start(profile.CPUProfile, profile.ProfilePath(".")).Stop() 30 | 31 | // go tool trace -http=:8080 trace.out 32 | // defer profile.Start(profile.TraceProfile, profile.ProfilePath(".")).Stop() 33 | 34 | // go tool pprof -http=:8080 mem.pprof 35 | // defer profile.Start(profile.MemProfile, profile.MemProfileRate(1), profile.ProfilePath(".")).Stop() 36 | // defer profile.Start(profile.MemProfile, profile.ProfilePath(".")).Stop() 37 | 38 | cmd.Execute() 39 | } 40 | -------------------------------------------------------------------------------- /lexicmap/packaging.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | commit="" 4 | 5 | if [ $# -gt 0 ]; then 6 | commit=" -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$(git rev-parse --short HEAD)" 7 | fi 8 | 9 | CGO_ENABLED=0 gox -os="windows darwin linux freebsd" -arch="amd64 arm64" -tags netgo -ldflags "-w -s $commit" -asmflags '-trimpath' \ 10 | -output "lexicmap_{{.OS}}_{{.Arch}}" 11 | 12 | dir=binaries 13 | mkdir -p $dir; 14 | rm -rf $dir/$f; 15 | 16 | for f in lexicmap_*; do 17 | mkdir -p $dir/$f; 18 | mv $f $dir/$f; 19 | cd $dir/$f; 20 | mv $f $(echo $f | perl -pe 's/_[^\.]+//g'); 21 | tar -zcf $f.tar.gz lexicmap*; 22 | mv *.tar.gz ../; 23 | cd ..; 24 | rm -rf $f; 25 | cd ..; 26 | done; 27 | 28 | ls binaries/*.tar.gz | rush 'cd {/}; md5sum {%} > {%}.md5.txt' 29 | -------------------------------------------------------------------------------- /logo.svg: -------------------------------------------------------------------------------- 1 | 2 | MapexicL 64 | --------------------------------------------------------------------------------