├── test ├── data │ ├── MT-human.fa.fai │ ├── MT-orang.fa.fai │ ├── seqs2.fasta.fai │ ├── sample_fasta_for_fai.fasta.fai │ ├── toy.bam │ ├── toy.bed │ ├── toy.cram │ ├── seqs.fasta.gz │ ├── seqs.fastq.gz │ ├── seqs.txt.gz │ ├── toy.bam.bai │ ├── toy.cram.crai │ ├── seqs.fasta.fai │ ├── invalid │ │ ├── seqs_bad_base.fasta.fai │ │ ├── invalid_fai_float.fai │ │ ├── invalid_fai_missing_col.fai │ │ ├── seqs_bad_base.txt │ │ ├── invalid_with_header.bed │ │ ├── seqs_bad_base.fastq │ │ ├── seqs_bad_name.fastq │ │ ├── seqs_bad_qual.fastq │ │ ├── seqs_bad_qual_len.fastq │ │ └── seqs_bad_base.fasta │ ├── seqs2.fasta │ ├── valid_fai.fai │ ├── sample_fasta_for_fai.fasta │ ├── seqs.txt │ ├── toy.sam │ ├── valid_without_header.bed │ ├── valid_with_header.bed │ ├── seqs.fastq │ ├── seqs.fasta │ └── toy.vcf ├── bench │ ├── revcomp.codon │ ├── fqcnt.codon │ ├── kmercnt.codon │ ├── hash.codon │ ├── fmindex.codon │ ├── hamming.codon │ ├── knucleotide.codon │ ├── cpg.codon │ ├── sw.codon │ ├── 16mer.codon │ ├── bedcov.codon │ ├── fasta.codon │ ├── rc.codon │ ├── fastx.codon │ └── match.codon ├── CMakeLists.txt.in ├── core │ ├── containers.codon │ ├── serialization.codon │ ├── match.codon │ ├── bltin.codon │ ├── bwtsa.codon │ ├── align.codon │ ├── proteins.codon │ └── kmers.codon ├── apps │ ├── snap │ │ ├── test.seq │ │ ├── genomeindex.seq │ │ └── hashtable.seq │ ├── minimap2 │ │ ├── sw_simple.codon │ │ └── sw.codon │ ├── mrsfast │ │ └── exact.codon │ ├── umi │ │ └── whitelist.codon │ ├── bwa │ │ ├── fastmap.codon │ │ └── fastmap_build.codon │ ├── cora │ │ ├── hom_exact.codon │ │ └── hom_inexact.codon │ └── avid │ │ └── avid.codon ├── pipeline │ ├── prefetch.codon │ ├── interalign.codon │ └── canonical_opt.codon └── main.cpp ├── .clang-format ├── logo └── logo.png ├── seq.h ├── ir ├── seq.h ├── seq.cpp ├── pipeline.h └── revcomp.h ├── plugin.toml ├── stdlib └── bio │ ├── __init__.codon │ ├── types.codon │ ├── prefetch.codon │ ├── block.codon │ ├── iter.codon │ ├── locus.codon │ ├── c_htslib.codon │ ├── fai.codon │ ├── fastq.codon │ ├── pseq.codon │ ├── builtin.codon │ └── bwa.codon ├── .gitignore ├── .github ├── build-linux │ ├── entrypoint.sh │ ├── Dockerfile.manylinux2014-x86_64 │ ├── Dockerfile.linux-x86_64 │ ├── Dockerfile.linux-aarch64 │ └── Dockerfile.manylinux2014-aarch64 └── workflows │ └── ci.yml ├── README.md ├── htslib-config.h.cmake └── sw └── ksw2_gg2_sse.cpp /test/data/MT-human.fa.fai: -------------------------------------------------------------------------------- 1 | MT_human 16569 10 60 61 2 | -------------------------------------------------------------------------------- /test/data/MT-orang.fa.fai: -------------------------------------------------------------------------------- 1 | MT_orang 16499 10 60 61 2 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | ColumnLimit: 88 4 | -------------------------------------------------------------------------------- /logo/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/logo/logo.png -------------------------------------------------------------------------------- /test/data/seqs2.fasta.fai: -------------------------------------------------------------------------------- 1 | chrA 10 6 10 11 2 | chrC 120 23 120 121 3 | -------------------------------------------------------------------------------- 
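The *.fa.fai and *.fasta.fai files above are FASTA index files in the samtools faidx layout: one tab-separated record per sequence with the columns NAME, LENGTH, OFFSET (byte offset of the first base), LINEBASES (bases per line) and LINEWIDTH (bytes per line, newline included); the six-column variant seen in valid_fai.fai further below adds a QUALOFFSET column for FASTQ. As a minimal sketch only (not the repo's bio.fai implementation, and the helper name is hypothetical), random access with such a record works as follows:

def fai_offset(length: int, offset: int, linebases: int, linewidth: int, pos: int) -> int:
    # Byte offset of 0-based base `pos`: every full line before it costs
    # `linewidth` bytes, the remainder costs one byte per base.
    assert 0 <= pos < length
    return offset + (pos // linebases) * linewidth + pos % linebases

# With the MT-human.fa.fai record "MT_human 16569 10 60 61":
print fai_offset(16569, 10, 60, 61, 0)    # 10 (first base, right after ">MT_human\n")
print fai_offset(16569, 10, 60, 61, 60)   # 71 (first base of the second line)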
/test/data/sample_fasta_for_fai.fasta.fai: -------------------------------------------------------------------------------- 1 | one 66 5 30 31 2 | two 28 98 14 15 3 | -------------------------------------------------------------------------------- /test/data/toy.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.bam -------------------------------------------------------------------------------- /test/data/toy.bed: -------------------------------------------------------------------------------- 1 | chr1 11 22 2 | chr2 33 44 3 | chr3 55 66 4 | -------------------------------------------------------------------------------- /test/data/toy.cram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.cram -------------------------------------------------------------------------------- /test/data/seqs.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/seqs.fasta.gz -------------------------------------------------------------------------------- /test/data/seqs.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/seqs.fastq.gz -------------------------------------------------------------------------------- /test/data/seqs.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/seqs.txt.gz -------------------------------------------------------------------------------- /test/data/toy.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.bam.bai -------------------------------------------------------------------------------- /test/data/toy.cram.crai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.cram.crai -------------------------------------------------------------------------------- /test/data/seqs.fasta.fai: -------------------------------------------------------------------------------- 1 | chrA 460 27 50 51 2 | chrB 489 503 50 51 3 | chrC 500 1008 50 51 4 | chrD 49 1530 49 50 5 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.fasta.fai: -------------------------------------------------------------------------------- 1 | chrA 460 6 50 51 2 | chrB 489 482 50 51 3 | chrC 500 987 50 51 4 | chrD 49 1503 49 50 5 | -------------------------------------------------------------------------------- /test/data/seqs2.fasta: -------------------------------------------------------------------------------- 1 | >chrA 2 | GCCTTAACAT 3 | >chrC 4 | GGGGGGGGGGGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAGGTATTCATCCTATGTGGGTAATTGAGGAGTATGCTAAGATTTTGCGTAGCGGGGGGGGGG 5 | -------------------------------------------------------------------------------- /test/data/valid_fai.fai: -------------------------------------------------------------------------------- 1 | fastq1 66 8 30 31 79 2 | fastq2 28 156 14 15 188 3 | fastq2 84 12 566 988 91 4 | fastq3 29 72 146 19 134 5 | fastq4 66 8 30 31 79 6 | fastq5 28 156 14 15 188 7 | 
-------------------------------------------------------------------------------- /test/data/sample_fasta_for_fai.fasta: -------------------------------------------------------------------------------- 1 | >one 2 | ATGCATGCATGCATGCATGCATGCATGCAT 3 | GCATGCATGCATGCATGCATGCATGCATGC 4 | ATGCAT 5 | >two another chromosome 6 | ATGCATGCATGCAT 7 | GCATGCATGCATGC 8 | -------------------------------------------------------------------------------- /test/data/invalid/invalid_fai_float.fai: -------------------------------------------------------------------------------- 1 | fastq1 66 8 30 31 79.2 2 | fastq2 28 156 14 15 188 3 | fastq2 84 12 566 988 91 4 | fastq3 29 72 146 19 134 5 | fastq4 66 8 30 31 79 6 | fastq5 28 156 14 15 188 7 | -------------------------------------------------------------------------------- /test/data/invalid/invalid_fai_missing_col.fai: -------------------------------------------------------------------------------- 1 | fastq2 28 156 14 15 188 2 | fastq1 66 8 30 31 3 | fastq2 84 12 566 988 91 4 | fastq3 29 72 146 19 134 5 | fastq4 66 8 30 31 79 6 | fastq5 28 156 14 15 188 7 | -------------------------------------------------------------------------------- /seq.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQ_H 2 | #define SEQ_H 3 | 4 | #include 5 | 6 | #define SEQ_FUNC extern "C" 7 | 8 | typedef int64_t seq_int_t; 9 | 10 | struct seq_t { 11 | seq_int_t len; 12 | char *seq; 13 | }; 14 | 15 | #endif /* SEQ_H */ 16 | -------------------------------------------------------------------------------- /ir/seq.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "codon/dsl/dsl.h" 4 | 5 | namespace seq { 6 | 7 | class Seq : public codon::DSL { 8 | public: 9 | void addIRPasses(codon::ir::transform::PassManager *pm, bool debug) override; 10 | }; 11 | 12 | } // namespace seq 13 | -------------------------------------------------------------------------------- /plugin.toml: -------------------------------------------------------------------------------- 1 | [about] 2 | name = "Seq" 3 | description = "a high-performance language for bioinformatics" 4 | version = "0.11.4" 5 | url = "https://seq-lang.org" 6 | supported = ">=0.18.2" 7 | 8 | [library] 9 | cpp = "build/libseq" 10 | codon = "stdlib" 11 | link = ["{root}/build/libseq_static.a"] 12 | -------------------------------------------------------------------------------- /test/bench/revcomp.codon: -------------------------------------------------------------------------------- 1 | import sys, bio 2 | 3 | def process(l): 4 | w = 60 5 | rc = ~bio.seq(''.join(l)) 6 | for i in range(0, len(rc), w): 7 | print rc[i:i + w] 8 | 9 | l = list[str]() 10 | for line in sys.stdin: 11 | if line[0] == '>': 12 | process(l) 13 | l.clear() 14 | print line 15 | else: 16 | l.append(line) 17 | process(l) 18 | -------------------------------------------------------------------------------- /test/bench/fqcnt.codon: -------------------------------------------------------------------------------- 1 | # FASTQ counter benchmark from https://github.com/lh3/biofast 2 | from sys import argv, exit, stderr 3 | from bio import * 4 | 5 | if len(argv) != 2: 6 | stderr.write("Usage: fqcnt.py \n") 7 | exit(1) 8 | 9 | n, slen, qlen = 0, 0, 0 10 | for r in FASTQ(argv[1], validate=False): 11 | n += 1 12 | slen += len(r.read) 13 | qlen += len(r.qual) 14 | 15 | print n, slen, qlen 16 | -------------------------------------------------------------------------------- 
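The revcomp.codon benchmark above relies on the bio module's unary ~ operator, which yields the reverse complement of a seq value (the same operator is exercised by rc.codon later in this dump). A small hedged sketch, separate from the benchmark, using the s'...' sequence literal that also appears in the repo's tests:

from bio import *

x = s'AAACCG'
rc = ~x                  # reverse complement
print x, rc              # AAACCG CGGTTT
assert rc == s'CGGTTT'
assert ~rc == x          # applying ~ twice returns the original sequence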
/test/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.2) 2 | 3 | project(googletest-download NONE) 4 | 5 | include(ExternalProject) 6 | ExternalProject_Add(googletest 7 | GIT_REPOSITORY https://github.com/google/googletest.git 8 | GIT_TAG release-1.10.0 9 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" 10 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /test/core/containers.codon: -------------------------------------------------------------------------------- 1 | @test 2 | def test_interval_tree(): 3 | from bio.intervals import IntervalTree 4 | t = IntervalTree() 5 | t.add("chr1", 20, 30, "a") 6 | t.add("chr2", 10, 30, "b") 7 | t.add("chr1", 10, 25, "c") 8 | t.index() 9 | overlaps = {(a.start, a.end) for a in t.overlap("chr1", 15, 22)} 10 | assert overlaps == {(20, 30), (10, 25)} 11 | assert "chr1" in t 12 | assert "chr2" in t 13 | assert "chr3" not in t 14 | assert {(a.start, a.end, a.data) for a in t} == {(20, 30, "a"), (10, 30, "b"), (10, 25, "c")} 15 | assert len(t) == 3 16 | test_interval_tree() 17 | -------------------------------------------------------------------------------- /test/data/seqs.txt: -------------------------------------------------------------------------------- 1 | GTCCTAAATTGTTGTACGAAAGAACGTGACAGAGGGAAGGCACTCGGCGTGGCTGAGAGTTGCGGCTACCGCACTGTTACACGGTATGCTAGTTAAAACTTGGAAGAGGGCAAAGCGACTATGCACTGTGGCTGGATCGCTATGACCCCTG 2 | GACGTGTGGAGTAAGCATAAGTCACTATATCAACAAGCCCGCAACAATACTTGTAGAGAATCTGAACCGGCTAGGCGCTCAACGCTACAGGGTGTCATTTCGTACCCCTAACACTGCTATTCGTTTCGGAAGAGGCACCTCGGTGAAGAAA 3 | GAAGCTGGAGCGAAGTCGATGTTTTTGCTGTACCAGGCGTGAGTTTGTAGATAAGCGGTCTGATACCGCAGAAGCAGGGTACAGTATAGACACGGGTTAAGTCGAGAGACAGGTCAAACAATTAACGCCAAAGAGGTCCCAGTCAGGAGCT 4 | AATGAAGTGGGGTAATTATGATTCTATAAATTTGTAGGGAAATGGGTTTAGCGCCTGAACGACAAGCGATAGATTATGGGCTGAGGAATAGTAGTTACTCCGCGGGCGGCCGCATTCGATATTTTGCGTCATCATAGATCAAGTTTCCGGG 5 | -------------------------------------------------------------------------------- /test/bench/kmercnt.codon: -------------------------------------------------------------------------------- 1 | # Implementation of benchmark at https://github.com/lh3/kmer-cnt 2 | # Usage: seqc kmercnt.seq 3 | from sys import argv 4 | from time import timing 5 | from bio import * 6 | 7 | def print_hist(h, N = 256): 8 | cnt = [0 for _ in range(N)] 9 | for v in h.values(): 10 | cnt[min(v, N - 1)] += 1 11 | for i in range(1, N): 12 | print f'{i}\t{cnt[i]}' 13 | 14 | with timing('k-mer counting'), FASTQ(argv[1], copy=False, validate=False) as fastq: 15 | h: Dict[Kmer[31], int] = {} 16 | fastq |> seqs |> kmers(step=1, k=31) |> canonical |> h.increment 17 | print_hist(h) 18 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.txt: -------------------------------------------------------------------------------- 1 | GTCCTAAATTGTTGTACGAAAGAACGTGACAGAGGGAAGGCACTCGGCGTGGCTGAGAGTTGCGGCTACCGCACTGTTACACGGTATGCTAGTTAAAACTTGGAAGAGGGCAAAGCGACTATGCACTGTGGCTGGATCGCTATGACCCCTG 2 | GACGTGTGGAGTAAGCATAAGTCACTATATCAACAAGCCCGCAACAATACTTGTAGAGAATCTGAACCGGCTAGZCGCTCAACGCTACAGGGTGTCATTTCGTACCCCTAACACTGCTATTCGTTTCGGAAGAGGCACCTCGGTGAAGAAA 3 | 
GAAGCTGGAGCGAAGTCGATGTTTTTGCTGTACCAGGCGTGAGTTTGTAGATAAGCGGTCTGATACCGCAGAAGCAGGGTACAGTATAGACACGGGTTAAGTCGAGAGACAGGTCAAACAATTAACGCCAAAGAGGTCCCAGTCAGGAGCT 4 | AATGAAGTGGGGTAATTATGATTCTATAAATTTGTAGGGAAATGGGTTTAGCGCCTGAACGACAAGCGATAGATTATGGGCTGAGGAATAGTAGTTACTCCGCGGGCGGCCGCATTCGATATTTTGCGTCATCATAGATCAAGTTTCCGGG 5 | -------------------------------------------------------------------------------- /stdlib/bio/__init__.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import * 2 | from bio.kmer import * 3 | from bio.pseq import * 4 | 5 | from bio.builtin import * 6 | 7 | from bio.block import Block, blocks 8 | from bio.locus import Locus 9 | from bio.iter import Seqs 10 | 11 | from bio.align import SubMat, CIGAR, Alignment 12 | from bio.pseq import pseq, translate 13 | from bio.bwt import _saisxx, _saisxx_bwt 14 | 15 | from bio.fasta import FASTARecord, FASTA, pFASTARecord, pFASTA 16 | from bio.fastq import FASTQRecord, FASTQ 17 | from bio.fai import FAIRecord, FAI 18 | from bio.bam import SAMRecord, SAM, BAM, CRAM 19 | from bio.bed import BEDRecord, BED 20 | from bio.vcf import VCFRecord, VCF, BCF 21 | 22 | from bio.prefetch import * 23 | from bio.types import * 24 | -------------------------------------------------------------------------------- /ir/seq.cpp: -------------------------------------------------------------------------------- 1 | #include "seq.h" 2 | #include "pipeline.h" 3 | #include "revcomp.h" 4 | 5 | #include "codon/cir/transform/lowering/pipeline.h" 6 | 7 | namespace seq { 8 | 9 | void Seq::addIRPasses(codon::ir::transform::PassManager *pm, bool debug) { 10 | pm->registerPass(std::make_unique()); 11 | if (debug) 12 | return; 13 | auto dep = codon::ir::transform::lowering::PipelineLowering::KEY; 14 | pm->registerPass(std::make_unique(), dep); 15 | pm->registerPass(std::make_unique(), dep); 16 | pm->registerPass(std::make_unique(), dep); 17 | } 18 | 19 | } // namespace seq 20 | 21 | extern "C" std::unique_ptr load() { return std::make_unique(); } 22 | -------------------------------------------------------------------------------- /test/data/toy.sam: -------------------------------------------------------------------------------- 1 | @SQ SN:ref LN:45 2 | @SQ SN:ref2 LN:40 3 | r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 4 | r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * 5 | r003 0 ref 9 30 5H6M * 0 0 AGCTAA * 6 | r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * 7 | r003 16 ref 29 30 6H5M * 0 0 TAGGC * 8 | r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * 9 | x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? 10 | x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? 11 | x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? 12 | x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? 13 | x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? 14 | x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? 
15 | -------------------------------------------------------------------------------- /test/bench/hash.codon: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Test k-mer hash collisions # 3 | ############################## 4 | from sys import argv 5 | from bio import * 6 | d = {} 7 | #d.resize(1 << 32) 8 | 9 | def test(use_bad_hash: bool, K: Static[int]): 10 | def update(kmer, use_bad_hash, seen): 11 | if kmer not in seen: 12 | h = int(kmer.as_int()) if use_bad_hash else hash(kmer) 13 | d[h] = d.get(h, 0) + 1 14 | seen.add(kmer) 15 | 16 | seen: Set[Kmer[K]] = set() 17 | #seen.resize(1 << 32) 18 | FASTA(argv[1]) |> seqs |> kmers(1, K) |> update(use_bad_hash, seen) 19 | m = max((v, k) for k,v in d.items())[0] 20 | a = sum(v for v in d.values()) / len(d) 21 | print f'{K}-mer ({use_bad_hash=}):\tmax={m}, avg={a}' 22 | d.clear() 23 | 24 | print 'start' 25 | test(False, 64) 26 | test(True, 64) 27 | -------------------------------------------------------------------------------- /test/bench/fmindex.codon: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Prefetch benchmark # 3 | ###################### 4 | from sys import argv 5 | from bio import * 6 | from bio.fmindex import FMIndex 7 | from time import timing 8 | from pickle import load 9 | import gzip 10 | 11 | fmi = None 12 | with gzip.open(argv[1], 'rb') as jar: 13 | fmi = load(jar, FMIndex) 14 | 15 | step = 20 16 | n = 0 17 | 18 | def update(count): 19 | global n 20 | n += count 21 | 22 | @prefetch 23 | def find(s, fmi): 24 | intv = fmi.interval(s[-1]) 25 | s = s[:-1] 26 | while s and intv: 27 | intv = fmi[intv, s[-1]] 28 | s = s[:-1] 29 | return len(intv) 30 | 31 | for k in (10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32): 32 | n = 0 33 | with timing(f'{k=}'): 34 | FASTQ(argv[2]) |> seqs |> split(k, step=step) |> find(fmi) |> update 35 | print n 36 | -------------------------------------------------------------------------------- /test/data/valid_without_header.bed: -------------------------------------------------------------------------------- 1 | chr7 127471196 127472363 Pos1 882.13435 + 127471196 127472363 255,0,0 4 11,12,13,14 14,55,66,99 2 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 3 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 4 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 5 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 6 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 7 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 8 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 9 | chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Generic .gitignore # 3 | ###################### 4 | 5 | # Compiled source # 6 | ################### 7 | *.com 8 | *.class 9 | *.dll 10 | *.exe 11 | *.o 12 | *.a 13 | *.obj 14 | *.so 15 | *.dylib 16 | *.pyc 17 | build/ 18 | build_*/ 19 | extra/jupyter/build/ 20 | 21 | # Packages # 22 | ############ 23 | # it's better to unpack these files and commit the raw source 24 | # git has its own built-in compression methods 25 | *.7z 26 | *.dmg 27 | *.iso 28 | *.jar 29 | *.rar 30 | *.tar 31 | *.zip 32 | 33 | # Logs and databases # 34 | 
###################### 35 | *.log 36 | *.sql 37 | *.sqlite 38 | 39 | # OS generated files # 40 | ###################### 41 | .DS_Store 42 | .DS_Store? 43 | ._* 44 | .Spotlight-V100 45 | .Trashes 46 | ehthumbs.db 47 | Thumbs.db 48 | 49 | # IDE generated files # 50 | ####################### 51 | .idea 52 | .mypy_cache 53 | .vscode 54 | 55 | extra/jupyter/share/jupyter/kernels/codon/kernel.json 56 | scratch.* 57 | -------------------------------------------------------------------------------- /stdlib/bio/types.codon: -------------------------------------------------------------------------------- 1 | @extend 2 | class byte: 3 | def comp(self) -> byte: 4 | _byte_comp_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-.NNNNNNNNNNNNNNNNNNTVGHNNCDNNMNKNNNNYSAABWNRNNNNNNNtvghNNcdNNmNknNNNysaabwNrNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" 5 | return _byte_comp_table.ptr[int(self)] 6 | 7 | @extend 8 | class Dict: 9 | def prefetch(self, key: K): 10 | if self._n_buckets: 11 | from internal.types.collections.dict import _dict_hash 12 | mask = self._n_buckets - 1 13 | k = _dict_hash(key) 14 | i = k & mask 15 | (self._keys + i).__prefetch_r1__() 16 | (self._vals + i).__prefetch_r1__() 17 | (self._flags + (i >> 4)).__prefetch_r1__() 18 | 19 | @extend 20 | class List: 21 | def prefetch(self, idx: int): 22 | (self.arr.ptr + idx).__prefetch_r3__() 23 | -------------------------------------------------------------------------------- /ir/pipeline.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "codon/cir/cir.h" 4 | #include "codon/cir/transform/pass.h" 5 | 6 | namespace seq { 7 | 8 | class PipelineSubstitutionOptimization : public codon::ir::transform::OperatorPass { 9 | static const std::string KEY; 10 | std::string getKey() const override { return KEY; } 11 | void handle(codon::ir::PipelineFlow *) override; 12 | }; 13 | 14 | class PipelinePrefetchOptimization : public codon::ir::transform::OperatorPass { 15 | const unsigned SCHED_WIDTH_PREFETCH = 16; 16 | static const std::string KEY; 17 | std::string getKey() const override { return KEY; } 18 | void handle(codon::ir::PipelineFlow *) override; 19 | }; 20 | 21 | class PipelineInterAlignOptimization : public codon::ir::transform::OperatorPass { 22 | const unsigned SCHED_WIDTH_INTERALIGN = 2048; 23 | static const std::string KEY; 24 | std::string getKey() const override { return KEY; } 25 | void handle(codon::ir::PipelineFlow *) override; 26 | }; 27 | 28 | } // namespace seq 29 | -------------------------------------------------------------------------------- /test/data/invalid/invalid_with_header.bed: -------------------------------------------------------------------------------- 1 | browser position chr7:127471196-127495720 2 | browser hide all 3 | track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" 4 | chr7 127471196a 127472363 Pos1 0 + 127471196 127472363 255,0,0 4 11,12,13,14 14,55,66,99 5 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 6 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 7 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 8 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 9 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 10 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 11 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 12 | chr7 
127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 13 | -------------------------------------------------------------------------------- /test/data/valid_with_header.bed: -------------------------------------------------------------------------------- 1 | browser position chr7:127471196-127495720 2 | browser hide all 3 | track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" 4 | chr7 127471196 127472363 Pos1 882.13435 + 127471196 127472363 255,0,0 4 11,12,13,14 14,55,66,99 5 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 6 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 7 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 8 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 9 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 10 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 11 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 12 | chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 13 | -------------------------------------------------------------------------------- /stdlib/bio/prefetch.codon: -------------------------------------------------------------------------------- 1 | @inline 2 | def _dynamic_coroutine_scheduler[A,B,T,C](value: A, coro: B, states: Array[Generator[T]], I: Ptr[int], N: Ptr[int], M: int, args: C): 3 | n = N[0] 4 | if n < M: 5 | states[n] = coro(value, *args) 6 | N[0] = n + 1 7 | else: 8 | i = I[0] 9 | while True: 10 | g = states[i] 11 | if g.done(): 12 | if not isinstance(T, None): 13 | yield g.next() 14 | g.destroy() 15 | states[i] = coro(value, *args) 16 | break 17 | i = (i + 1) & (M - 1) 18 | I[0] = i 19 | 20 | @inline 21 | def _dynamic_coroutine_scheduler_drain[T](states: Array[Generator[T]], N: int): 22 | i = 0 23 | while i < N: 24 | g = states[i] 25 | while not g.done(): 26 | g.next() 27 | if not isinstance(T, None): 28 | yield g.next() 29 | g.destroy() 30 | i += 1 31 | 32 | @inline 33 | def _dummy_prefetch_terminal_stage(x): 34 | pass 35 | -------------------------------------------------------------------------------- /test/core/serialization.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | import pickle 3 | 4 | @test 5 | def test_pickle[T](x: T): 6 | import gzip 7 | path = 'testjar.bin' 8 | jar = gzip.open(path, 'wb') 9 | pickle.dump(x, jar) 10 | jar.close() 11 | 12 | jar = gzopen(path, 'rb') 13 | y = pickle.load(jar, T) 14 | jar.close() 15 | 16 | assert x == y 17 | 18 | K = Kmer[8] 19 | test_pickle(s'ACGTAAGG') 20 | test_pickle(~s'ACGTAAGG') 21 | test_pickle(K(s'ACGTAAGG')) 22 | test_pickle([K(s'ACGTAAGG'), K(s'TATCTGTT')]) 23 | test_pickle(list[K]()) 24 | test_pickle({K(s'ACGTAAGG'), K(s'CATTTTTA')}) 25 | test_pickle({~s'ACGTAAGG'}) 26 | test_pickle({K(s'ACGTAAGG'), K(s'TTTTGGTT')}) 27 | test_pickle(set[K]()) 28 | test_pickle({K(s'ACGTAAGG'): 99, K(s'TTATTCTT'): 42}) 29 | test_pickle(dict[K,K]()) 30 | test_pickle({~s'ACGTAAGG': ~s'ACGTAAGG'}) 31 | test_pickle((42, 3.14, True, byte(90), s'ACGTAAGG', K(s'ACGTAAGG'))) 32 | test_pickle({i32(42): [[{s'ACG', s'ACGTAGCG', ~s'ACGTAGCG'}, {s'ACG', s'ACGTAGCG', ~s'ACGTAGCG'}], list[set[seq]](), [set[seq]()], [{~s''}, {s'', s'GCGC'}]]}) 33 | -------------------------------------------------------------------------------- /test/apps/snap/test.seq: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | from bio import FASTQ 3 | from genomeindex import 
* 4 | K: Static[int] = 20 5 | 6 | def update(counts, pos, max_pos, max_count): 7 | count = counts.get(pos, 0) + 1 8 | counts[pos] = count 9 | return (pos, count) if count > max_count else (max_pos, max_count) 10 | 11 | def main(args): 12 | index = GenomeIndex[Kmer[K]](args[0]) 13 | for read in FASTQ(args[1]): 14 | counts: Dict[int, int] = {} 15 | max_pos, max_count = 0, 0 16 | 17 | for i,kmer in enumerate(read.seq.kmers(K, K)): 18 | offset = i * K 19 | hits = index[kmer] 20 | hits_rev = index[~kmer] 21 | 22 | for i in range(len(hits)): 23 | pos = int(hits[i]) - offset 24 | max_pos, max_count = update(counts, pos, max_pos, max_count) 25 | 26 | for i in range(len(hits_rev)): 27 | pos = int(hits_rev[i]) - offset 28 | max_pos, max_count = update(counts, pos, max_pos, max_count) 29 | 30 | print read, max_pos 31 | 32 | if len(argv) > 0: 33 | main(argv) 34 | -------------------------------------------------------------------------------- /.github/build-linux/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -l 2 | set -e 3 | set -x 4 | 5 | WORKSPACE="${1:-/github/workspace}" 6 | 7 | export ARCHDEFAULT="$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m)" 8 | ARCH=${2:-$ARCHDEFAULT} 9 | 10 | TEST=${3:-no} 11 | CODON_VERSION=${4:-0.19.2} 12 | 13 | echo "Workspace: ${WORKSPACE}; arch: ${ARCH}" 14 | cd "$WORKSPACE" 15 | 16 | curl -L https://github.com/exaloop/codon/releases/download/v${CODON_VERSION}/codon-${ARCH}.tar.gz | tar zxvf - 17 | export CODON_DIR=$(pwd)/codon-deploy-${ARCH} 18 | 19 | # Build Seq 20 | cmake -S . -B build \ 21 | -G Ninja \ 22 | -DCMAKE_BUILD_TYPE=Release \ 23 | -DCODON_PATH=${CODON_DIR} \ 24 | -DCMAKE_C_COMPILER=/opt/llvm-codon/bin/clang \ 25 | -DCMAKE_CXX_COMPILER=/opt/llvm-codon/bin/clang++ 26 | cmake --build build 27 | cmake --install build --prefix=${CODON_DIR}/lib/codon/plugins/seq 28 | 29 | # Test 30 | if [ "$TEST" = "yes" ]; then 31 | CODON_PATH=${CODON_DIR}/lib/codon/stdlib build/seqtest 32 | fi 33 | 34 | # Package 35 | export BUILD_ARCHIVE=seq-${ARCH}.tar.gz 36 | tar czf ${BUILD_ARCHIVE} -C ${CODON_DIR}/lib/codon/plugins seq/ 37 | du -sh ${BUILD_ARCHIVE} 38 | -------------------------------------------------------------------------------- /ir/revcomp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "codon/cir/cir.h" 4 | #include "codon/cir/dsl/nodes.h" 5 | #include "codon/cir/transform/pass.h" 6 | 7 | namespace seq { 8 | 9 | class KmerRevcomp 10 | : public codon::ir::AcceptorExtend { 11 | private: 12 | codon::ir::Value *kmer; 13 | 14 | public: 15 | static const char NodeId; 16 | 17 | using AcceptorExtend::AcceptorExtend; 18 | 19 | explicit KmerRevcomp(codon::ir::Value *kmer) : AcceptorExtend(), kmer(kmer) {} 20 | 21 | std::unique_ptr getBuilder() const override; 22 | std::unique_ptr getCFBuilder() const override; 23 | 24 | bool match(const codon::ir::Value *v) const override; 25 | codon::ir::Value *doClone(codon::ir::util::CloneVisitor &cv) const override; 26 | std::ostream &doFormat(std::ostream &os) const override; 27 | }; 28 | 29 | class KmerRevcompInterceptor : public codon::ir::transform::Pass { 30 | static const std::string KEY; 31 | std::string getKey() const override { return KEY; } 32 | void run(codon::ir::Module *) override; 33 | }; 34 | 35 | } // namespace seq 36 | -------------------------------------------------------------------------------- /test/bench/hamming.codon: 
-------------------------------------------------------------------------------- 1 | ############################### 2 | # Hammming distance benchmark # 3 | ############################### 4 | from sys import argv 5 | from time import timing 6 | from bio import * 7 | 8 | def dist_fast(k1, k2): 9 | return abs(k1 - k2) 10 | 11 | def dist_slow(k1, k2): 12 | d = 0 13 | for i in range(type(k1).len()): 14 | if k1[i] != k2[i]: 15 | d += 1 16 | return d 17 | 18 | def test(use_slow_dist, K: Static[int]): 19 | n = 0 20 | with timing(f'{K}-mer ({use_slow_dist=})'): 21 | for s in FASTA(argv[1]) |> seqs: 22 | for kmer in s |> kmers(1, K): 23 | d = 0 24 | rckmer = ~kmer 25 | if use_slow_dist: 26 | d = dist_slow(rckmer, kmer) 27 | else: 28 | d = dist_fast(rckmer, kmer) 29 | n ^= d 30 | print n 31 | 32 | print 'start' 33 | test(False, 4) 34 | test(True, 4) 35 | test(False, 8) 36 | test(True, 8) 37 | test(False, 16) 38 | test(True, 16) 39 | test(False, 32) 40 | test(True, 32) 41 | test(False, 64) 42 | test(True, 64) 43 | test(False, 128) 44 | test(True, 128) 45 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.manylinux2014-x86_64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2014_x86_64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Python needs OpenSSL >= 1.1. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget openssl11-devel \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install ninja-build && \ 13 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 14 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 15 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-manylinux2014-x86_64.tar.bz2 | tar jxf - -C /opt && \ 16 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig 17 | 18 | COPY entrypoint.sh /entrypoint.sh 19 | ENTRYPOINT ["/entrypoint.sh"] 20 | -------------------------------------------------------------------------------- /test/bench/knucleotide.codon: -------------------------------------------------------------------------------- 1 | import sys, time 2 | from bio import * 3 | 4 | def pad(x, n, w): 5 | s = str(x) 6 | if len(s) < n: 7 | return s + (w * (n - len(s))) 8 | return s 9 | 10 | def hashcnt(s: seq, K: Static[int]): 11 | d = {} 12 | for k in s.kmers(1, k=K): 13 | d[k] = d.get(k, 0) + 1 14 | return d 15 | 16 | def cnt(s: seq, q: Kmer[K], K: Static[int]): 17 | d = hashcnt(s, K=K) 18 | print str(d.get(q, 0)) + "\t" + str(q) 19 | 20 | def freq(s: seq): 21 | d1 = hashcnt(s, K=1) 22 | for k, v in sorted(d1.items(), lambda a: -a[1]): 23 | print k, pad(round((100.0 * v) / len(s), 3), 6, '0') 24 | print 25 | 26 | d2 = hashcnt(s, K=2) 27 | for k, v in sorted(d2.items(), lambda a: -a[1]): 28 | print k, pad(round((100.0 * v) / (len(s)-1), 3), 5, '0') 29 | print 30 | 31 | def process(sq): 32 | freq(sq) 33 | cnt(sq, Kmer[3](s'GGT')) 34 | cnt(sq, Kmer[4](s'GGTA')) 35 | cnt(sq, Kmer[6](s'GGTATT')) 36 | cnt(sq, Kmer[12](s'GGTATTTTAATT')) 37 | cnt(sq, 
Kmer[18](s'GGTATTTTAATTTATAGT')) 38 | 39 | t = time.time() 40 | for line in sys.stdin: 41 | if line[:6] == '>THREE': 42 | l = list[str]() 43 | for line in sys.stdin: 44 | if line[0] == '>': break 45 | l.append(line) 46 | s = seq(str.cat(l)) 47 | process(s) 48 | break 49 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.fastq: -------------------------------------------------------------------------------- 1 | @SL-HXF:348:HKLFWCCXX:1:2101:15676:57231:CACCAAAAGTACATGA 2 | GTGCACAGAAAAAAAGGTTAAATTGAAAAGTAAATATGATAGAAATGATTGCAAATGTTGGCAAACCACTAAATCGACTAAAACTTGAATAAAAGTAAAAATCATCCATGTCATTTATAAAGCGACTCAACTAAAGCATAAGGATATAAGA 3 | + 4 | AAFFFKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKFKKKKKKKKKKKKKFKKKKKKKFKKFKKKKFKKKFK<,AAAFKFKKFAFKKA,,,A tuple[int, int]: 39 | match s: 40 | case 'C*': return is_cpg_i(s[1:], c + 1, g) 41 | case 'G*': return is_cpg_i(s[1:], c, g + 1) 42 | case _: return c, g 43 | 44 | def cpgs(s): 45 | i = 0 46 | while i < len(s): 47 | c, g = is_cpg_i(s[i:], 0, 0) 48 | if c and g: 49 | yield c + g 50 | i += c + g + 1 51 | 52 | cnt = 0 53 | def collect(c): 54 | global cnt, m, M 55 | cnt += 1 56 | m = min(m, c) 57 | M = max(M, c) 58 | 59 | def idiomatic(): 60 | FASTA(sys.argv[1], fai=False) |> seqs |> cpgs |> collect 61 | print cnt, m, M 62 | 63 | with time.timing("naive"): 64 | naive() 65 | 66 | m, M = 99999, 0 67 | with time.timing("idiomatic"): 68 | idiomatic() 69 | -------------------------------------------------------------------------------- /test/data/seqs.fastq: -------------------------------------------------------------------------------- 1 | @SL-HXF:348:HKLFWCCXX:1:2101:15676:57231:CACCAAAAGTACATGA comment A B C 2 | GTGCACAGAAAAAAAGGTTAAATTGAAAAGTAAATATGATAGAAATGATTGCAAATGTTGGCAAACCACTAAATCGACTAAAACTTGAATAAAAGTAAAAATCATCCATGTCATTTATAAAGCGACTCAACTAAAGCATAAGGATATAAGA 3 | + 4 | AAFFFKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKFKKKKKKKKKKKKKFKKKKKKKFKKFKKKKFKKKFK<,AAAFKFKKFAFKKA,,,A process_intra 47 | print checksum 48 | 49 | checksum = 0 50 | with timing(f'inter ({m=})'): 51 | zip(seqs(in1), seqs(in2)) |> process_inter 52 | print checksum 53 | -------------------------------------------------------------------------------- /test/bench/16mer.codon: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from bio import * 4 | 5 | def revcomp(c): 6 | return ('A' if c == 'T' else \ 7 | ('C' if c == 'G' else \ 8 | ('G' if c == 'C' else \ 9 | ('T' if c == 'A' else c)))) 10 | 11 | def process(k): 12 | return len(k) 13 | 14 | def ksplit(s, k, step): 15 | i = 0 16 | while i + k <= len(s): 17 | yield s[i:i + k] 18 | i += step 19 | 20 | def sym(s, k): 21 | return 1 if all(s[i] == revcomp(s[-i-1]) for i in range(k//2)) else 0 22 | 23 | def naive(): 24 | f = FASTA(sys.argv[1], fai=False) 25 | total, t2, t3 = 0, 0, 0 26 | k = 16 27 | for l in f: 28 | for s in ksplit(str(l.seq), k, 1): 29 | total += process(s) 30 | t2 += 1 31 | t3 += sym(s, k) 32 | print total, t2, t3 33 | 34 | t1 = 0 35 | def process_i(k): 36 | global t1 37 | t1 += len(k) 38 | return k 39 | 40 | t2 = 0 41 | def count(k): 42 | global t2 43 | t2 += 1 44 | return k 45 | 46 | t3 = 0 47 | def sym_i(k): 48 | global t3 49 | def is_sym(s) -> bool: 50 | match s: 51 | case 'A*T' | 'T*A' | 'C*G' | 'G*C' | 'N*N': 52 | return is_sym(s[1:-1]) 53 | case '': return True 54 | case _: return False 55 | t3 += 1 if is_sym(k) else 0 56 | 57 | def idiomatic(): 58 | 
(FASTA(sys.argv[1], fai=False) 59 | |> seqs 60 | |> split(16, 1) 61 | |> process_i 62 | |> count 63 | |> sym_i) 64 | print t1, t2, t3 65 | 66 | with time.timing("naive"): 67 | naive() 68 | with time.timing("idiomatic"): 69 | idiomatic() 70 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.linux-x86_64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux_2_28_x86_64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Codon's clang must be forced to use the correct gcc-14 toolset for C++20 support via clang.cfg. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget openssl-devel \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install https://dl.fedoraproject.org/pub/archive/epel/7/x86_64/Packages/n/ninja-build-1.10.2-3.el7.x86_64.rpm && \ 13 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 14 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 15 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-linux-x86_64.tar.bz2 | tar jxf - -C /opt && \ 16 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig && \ 17 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14" > /opt/llvm-codon/bin/clang.cfg && \ 18 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14" > /opt/llvm-codon/bin/clang++.cfg 19 | 20 | COPY entrypoint.sh /entrypoint.sh 21 | ENTRYPOINT ["/entrypoint.sh"] 22 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.linux-aarch64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux_2_28_aarch64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Codon's clang must be forced to use the correct gcc-14 toolset for C++20 support via clang.cfg. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 
8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget openssl-devel \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/ninja-build-1.10.2-3.el7.aarch64.rpm && \ 13 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 14 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 15 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-linux-aarch64.tar.bz2 | tar jxf - -C /opt && \ 16 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig && \ 17 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/aarch64-redhat-linux/14" > /opt/llvm-codon/bin/clang.cfg && \ 18 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/aarch64-redhat-linux/14" > /opt/llvm-codon/bin/clang++.cfg 19 | 20 | COPY entrypoint.sh /entrypoint.sh 21 | ENTRYPOINT ["/entrypoint.sh"] 22 | -------------------------------------------------------------------------------- /stdlib/bio/block.codon: -------------------------------------------------------------------------------- 1 | @tuple 2 | class Block[T]: 3 | ''' 4 | Represents a block of data; useful in parallelization to batch data 5 | ''' 6 | _data: Ptr[T] 7 | _size: int 8 | 9 | def __new__(size: int): 10 | return Block[T](Ptr[T](size), 0) 11 | 12 | def __iter__(self): 13 | data = self._data 14 | size = self._size 15 | i = 0 16 | while i < size: 17 | yield data[i] 18 | i += 1 19 | 20 | def __getitem__(self, idx: int): 21 | if not (0 <= idx < len(self)): 22 | raise ValueError("block index out of range") 23 | return self._data[idx] 24 | 25 | def __len__(self): 26 | return self._size 27 | 28 | def __bool__(self): 29 | return len(self) != 0 30 | 31 | def __repr__(self): 32 | return f'' 33 | 34 | def _add(self, elem: T): 35 | self._data[self._size] = elem 36 | return Block[T](self._data, self._size + 1) 37 | 38 | def _blocks[T](g: Generator[T], size: int): 39 | b = Block[T](size) 40 | for a in g: 41 | if len(b) == size: 42 | yield b 43 | b = Block[T](size) 44 | b = b._add(a) 45 | if b: 46 | yield b 47 | 48 | def blocks(x, size: int): 49 | ''' 50 | Partitions the given object into blocks of the specified size 51 | by calling the `__blocks__` magic method. 
52 | ''' 53 | if size <= 0: 54 | raise ValueError(f"invalid block size: {size}") 55 | if isinstance(x, Generator): 56 | return _blocks(x, size) 57 | else: 58 | return x.__blocks__(size) 59 | -------------------------------------------------------------------------------- /test/bench/bedcov.codon: -------------------------------------------------------------------------------- 1 | # BED coverage benchmark from https://github.com/lh3/biofast 2 | from sys import argv 3 | from time import timing 4 | from bio import * 5 | from bio.intervals import IntervalTree 6 | 7 | with timing('bed coverage (total)'): 8 | interval_tree = IntervalTree() 9 | 10 | with timing('reading first BED file'): 11 | for record in BED(argv[1], copy=True, validate=False): 12 | interval_tree.add(record.chrom, record.chrom_start, record.chrom_end, None) 13 | 14 | with timing('indexing'): 15 | interval_tree.index() 16 | 17 | with timing('querying second BED file'): 18 | for record in BED(argv[2], copy=False, validate=False): 19 | cov, cov_st, cov_en, n = 0, 0, 0, 0 20 | st1, en1 = record.chrom_start, record.chrom_end 21 | for item in interval_tree.overlap(record.chrom, st1, en1): 22 | n += 1 23 | # calcualte overlap length/coverage 24 | st0, en0 = item.start, item.end 25 | if st0 < st1: st0 = st1 26 | if en0 > en1: en0 = en1 27 | if st0 > cov_en: # no overlap with previous found intervals 28 | # set coverage to current interval 29 | cov += cov_en - cov_st 30 | cov_st, cov_en = st0, en0 31 | elif cov_en < en0: 32 | cov_en = en0 # overlap with previous found intervals 33 | cov += cov_en - cov_st 34 | # print chrom, start, end, count, # of coverage nt 35 | print f'{record.chrom}\t{record.chrom_start}\t{record.chrom_end}\t{n}\t{cov}' 36 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.manylinux2014-aarch64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2014_aarch64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Python needs OpenSSL >= 1.1. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 
8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/openssl11-devel-1.1.1k-5.el7.aarch64.rpm \ 13 | https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/openssl11-1.1.1k-5.el7.aarch64.rpm \ 14 | https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/openssl11-libs-1.1.1k-5.el7.aarch64.rpm && \ 15 | yum -y install https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/ninja-build-1.10.2-3.el7.aarch64.rpm && \ 16 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 17 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 18 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-manylinux2014-aarch64.tar.bz2 | tar jxf - -C /opt && \ 19 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig 20 | 21 | COPY entrypoint.sh /entrypoint.sh 22 | ENTRYPOINT ["/entrypoint.sh"] 23 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.fasta: -------------------------------------------------------------------------------- 1 | >chrA 2 | TCCTCCCCGTTCGCTGGACCCTTGGTCTATCTAGTCAAGAATTAACTCCC 3 | ATTTAGTTGGCTGTTCGTCGATTACCCCTACTGGACCGTCGCAACGGCTC 4 | ACGTGGAGGTCTTAGACCAAGAAAGCTACTGTATGCGGGGATATCACATC 5 | AGATTGCCAGGCGAGCAGCTCTAGCGTGACACGCCTAGACTCATTCGTTG 6 | TTCCTTGTCAATCCCAGGGGTCTCCACAGGGAGTGGATCGAGCTAATCAC 7 | CGTTTCGAGTCCGTCAGGCGGAGAGTAGCAGTAAGTACAAACTTCTGCTA 8 | GTCGCTCTGCCACAACGTAGCCACCTAAGATTAACCCTGGAATTGTCCGG 9 | GCGGCATGATCCATCGAGGAGTTAGCGGGGACAGGGAGTTACCAGTCGAG 10 | ACGTCCATGGTGGTGCTGCAATCCATGGATACCATCTCCTTGCCATTCCT 11 | AGGGACATCG 12 | >chrB 13 | TATGGGGTAGCATCATTAAGTGGGGAGGTAGACCAGGAGTTCGGTTCCCG 14 | GAGTTTCGTTAGTTCAGGTAGCGTGACCTCGTCTTAGTATGCAGTCGTGA 15 | AATAATAGACATTTCTGCCTGTCAGGTTGCACTAATCACACCCAGGCTGT 16 | TAACGAGGCGGCTCGTAGTATAAACGCTTTGGACTAGACTCGATACCTAG 17 | CGGCGCGCATTGATAAATGGTGCATCTATAGTAAAAGGCGTCCCAACCTG 18 | GGGACTTAGCGTTGAATACCCGTGCAAGGATCTCTAATCGGTTCTCAATG 19 | GCTGCCTGCTCTTTCTCTAAACGAGACTCTAATATCATGTGTGGTCCTTG 20 | CTTTCGGTCGAGAAAAAGCCTTTGATCGCATCCCAAACCGACATCTAAAA 21 | GCTTCATGTATGTCGGCAGCGAAAAAACGACCAAATAGAAGTCCCCTAAC 22 | GGAGAATAGGCCGCCCACGACAGAACACCGCTTCGTCCT 23 | >chrC 24 | CCCGCTAGCCGTGCCTGATCCTCAACCAAGCTGGGTAAAGACAACCGTCT 25 | AATCATTAACTTACGTTGTTACGTCATTTTGCGCTTAAAATTGTCGCACC 26 | GGAATCCGTCGAGACTTCCCGAGACATGTCCCCTTAATAAATGTACGGTG 27 | TGACCTAACGATCGGATCACCGTCCGTGCTAAAACACACAACCGCTGCGT 28 | GACACCGACCGAACGTTACCGAAGGCTGTCCGCCTAACATCTATATTTGG 29 | CGGTAAAGAGGCGGTTCCGGCGGACTATAAAGTCACAGGCCACTGTTTCT 30 | TTGCAAGATATGGCTCTCTGTCAGGACCGCCCCCTAGGGTCAGCTCAAAT 31 | AAGCTTGTCCCGGACTCCGTACTTCCAACAGAAAGGTGACCGCTACATTC 32 | TGCTATTGACCCCTCACACAACGTTCCCCCGCATGGCGTACGTGTTACCA 33 | GGGCGGTTGCGGCCTTACGTCGCCATAAGCACGTATATAAGTCACCCACT 34 | >chrD 35 | TGCCGTGACCACCCCGCGAGAATCTCATAATGATATCTCCAATCGAGT 36 | -------------------------------------------------------------------------------- /test/data/seqs.fasta: -------------------------------------------------------------------------------- 1 | >chrA my random comment 2 | TCCTCCCCGTTCGCTGGACCCTTGGTCTATCTAGTCAAGAATTAACTCCC 3 | ATTTAGTTGGCTGTTCGTCGATTACCCCTACTGGACCGTCGCAACGGCTC 4 | ACGTGGAGGTCTTAGACCAAGAAAGCTACTGTATGCGGGGATATCACATC 5 | AGATTGCCAGGCGAGCAGCTCTAGCGTGACACGCCTAGACTCATTCGTTG 6 | 
TTCCTTGTCAATCCCAGGGGTCTCCACAGGGAGTGGATCGAGCTAATCAC 7 | CGTTTCGAGTCCGTCAGGCGGAGAGTAGCAGTAAGTACAAACTTCTGCTA 8 | GTCGCTCTGCCACAACGTAGCCACCTAAGATTAACCCTGGAATTGTCCGG 9 | GCGGCATGATCCATCGAGGAGTTAGCGGGGACAGGGAGTTACCAGTCGAG 10 | ACGTCCATGGTGGTGCTGCAATCCATGGATACCATCTCCTTGCCATTCCT 11 | AGGGACATCG 12 | >chrB 13 | TATGGGGTAGCATCATTAAGTGGGGAGGTAGACCAGGAGTTCGGTTCCCG 14 | GAGTTTCGTTAGTTCAGGTAGCGTGACCTCGTCTTAGTATGCAGTCGTGA 15 | AATAATAGACATTTCTGCCTGTCAGGTTGCACTAATCACACCCAGGCTGT 16 | TAACGAGGCGGCTCGTAGTATAAACGCTTTGGACTAGACTCGATACCTAG 17 | CGGCGCGCATTGATAAATGGTGCATCTATAGTAAAAGGCGTCCCAACCTG 18 | GGGACTTAGCGTTGAATACCCGTGCAAGGATCTCTAATCGGTTCTCAATG 19 | GCTGCCTGCTCTTTCTCTAAACGAGACTCTAATATCATGTGTGGTCCTTG 20 | CTTTCGGTCGAGAAAAAGCCTTTGATCGCATCCCAAACCGACATCTAAAA 21 | GCTTCATGTATGTCGGCAGCGAAAAAACGACCAAATAGAAGTCCCCTAAC 22 | GGAGAATAGGCCGCCCACGACAGAACACCGCTTCGTCCT 23 | >chrC 24 | CCCGCTAGCCGTGCCTGATCCTCAACCAAGCTGGGTAAAGACAACCGTCT 25 | AATCATTAACTTACGTTGTTACGTCATTTTGCGCTTAAAATTGTCGCACC 26 | GGAATCCGTCGAGACTTCCCGAGACATGTCCCCTTAATAAATGTACGGTG 27 | TGACCTAACGATCGGATCACCGTCCGTGCTAAAACACACAACCGCTGCGT 28 | GACACCGACCGAACGTTACCGAAGGCTGTCCGCCTAACATCTATATTTGG 29 | CGGTAAAGAGGCGGTTCCGGCGGACTATAAAGTCACAGGCCACTGTTTCT 30 | TTGCAAGATATGGCTCTCTGTCAGGACCGCCCCCTAGGGTCAGCTCAAAT 31 | AAGCTTGTCCCGGACTCCGTACTTCCAACAGAAAGGTGACCGCTACATTC 32 | TGCTATTGACCCCTCACACAACGTTCCCCCGCATGGCGTACGTGTTACCA 33 | GGGCGGTTGCGGCCTTACGTCGCCATAAGCACGTATATAAGTCACCCACT 34 | >chrD hello 35 | TGCCGTGACCACCCCGCGAGAATCTCATAATGATATCTCCAATCGAGTA 36 | -------------------------------------------------------------------------------- /test/apps/minimap2/sw_simple.codon: -------------------------------------------------------------------------------- 1 | # Smith-Waterman alignment from minimap2 implemented using Seq's inter-sequence alignment 2 | # https://github.com/lh3/minimap2 3 | # https://github.com/lh3/ksw2 4 | # Usage: seqc sw.seq 5 | # and are text files of the same length with one sequence per line. 
6 | from time import TimeInterval 7 | from sys import argv 8 | from bio import seqs, inter_align 9 | from math import sqrt 10 | from statistics import mean, stdev 11 | queries = argv[1] 12 | targets = argv[2] 13 | 14 | total, num = 0, 0 15 | score = True # must be global 16 | 17 | @inter_align 18 | def process_inter(t): 19 | global total, num 20 | query, target = t 21 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, score_only=score).score 22 | total += inter_score 23 | num += 1 24 | 25 | def process_intra(t): 26 | global total, num 27 | query, target = t 28 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, score_only=score).score 29 | total += inter_score 30 | num += 1 31 | 32 | def run(queries, targets, msg, f): 33 | global num, total, score 34 | for s in [False, True]: 35 | times = [] 36 | for i in range(3): 37 | num, total = 0, 0 38 | t = TimeInterval() 39 | score = s 40 | zip(seqs(queries), seqs(targets)) |> f 41 | times.append(t.elapsed()) 42 | # print '-', i, num, total, times[-1] 43 | m = mean(times) 44 | print f'[sw-time] seq-{msg} {int(score)} {m} {sqrt(sum((i - m)**2 for i in times) / len(times))}' 45 | # print stdev(times) # broken 46 | 47 | run(queries, targets, 'intra', process_intra) 48 | run(queries, targets, 'inter', process_inter) 49 | -------------------------------------------------------------------------------- /test/bench/fasta.codon: -------------------------------------------------------------------------------- 1 | import sys, bisect, time 2 | 3 | alu = 'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA' 4 | iub = list(zip('acgtBDHKMNRSVWY', [0.27, 0.12, 0.12, 0.27] + [0.02]*11)) 5 | 6 | homosapiens = [ 7 | ('a', 0.3029549426680), 8 | ('c', 0.1979883004921), 9 | ('g', 0.1975473066391), 10 | ('t', 0.3015094502008) 11 | ] 12 | 13 | def random_generator(ia, ic, im): 14 | seed = 42 15 | imf = float(im) 16 | while 1: 17 | seed = (seed * ia + ic) % im 18 | yield seed / imf 19 | 20 | def make_cumulative(table): 21 | P = list[float]() 22 | C = list[str]() 23 | prob = 0. 
24 | for char, p in table: 25 | prob += p 26 | P += [prob] 27 | C += [char] 28 | return (P, C) 29 | 30 | def repeat_fasta(src, n): 31 | width = 60 32 | r = len(src) 33 | s = src + src + src[:n % r] 34 | for i in range(0, n // width): 35 | j = i * width % r 36 | print s[j:j + width] 37 | if n % width: print s[- (n % width):] 38 | 39 | def random_fasta(table, n, rand): 40 | width = 60 41 | probs, chars = make_cumulative(table) 42 | s = str.cat(chars[bisect.bisect(probs, next(rand))] for i in range(n)) 43 | for i in range(0, n, width): 44 | print s[i:i + width] 45 | 46 | n = int(sys.argv[1]) # 1000, 25000000 47 | 48 | rand = random_generator(3877, 29573, 139968) 49 | 50 | print '>ONE Homo sapiens alu' 51 | repeat_fasta(alu, n*2) 52 | 53 | print '>TWO IUB ambiguity codes' 54 | random_fasta(iub, n*3, rand) 55 | 56 | print '>THREE Homo sapiens frequency' 57 | random_fasta(homosapiens, n*5, rand) 58 | -------------------------------------------------------------------------------- /test/data/toy.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=1000GenomesPilot-NCBI36 5 | ##contig= 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##FILTER= 16 | ##FILTER= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 20 | ##FORMAT= 21 | ##ALT= 22 | ##ALT= 23 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 24 | 20 14370 rs6054257 G A 29.2 q10;s50 NS=3;DP=14,22;AF=0.5;AA=String,Multi,Val;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 25 | 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 26 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=FA,DADA;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. 27 | 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. 28 | 20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 29 | 20 1235237 . T . . . . GT 0/0 0|0 ./. 
30 | -------------------------------------------------------------------------------- /test/bench/rc.codon: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from bio import * 4 | 5 | def revcomp1(c): 6 | return ('A' if c == 'T' else \ 7 | ('C' if c == 'G' else \ 8 | ('G' if c == 'C' else \ 9 | ('T' if c == 'A' else c)))) 10 | 11 | def rc_copy(s): 12 | rc = str.cat(revcomp1(s[len(s) - i - 1]) for i in range(len(s))) 13 | # print rc 14 | return len(rc) 15 | 16 | def naive(): 17 | f = FASTA(sys.argv[1], fai=False) 18 | total = 0 19 | for l in f: 20 | total += rc_copy(str(l.seq)) 21 | print total 22 | 23 | total = 0 24 | def rc(s): 25 | s = ~s 26 | # print s 27 | global total 28 | total += len(s) 29 | 30 | def idiomatic(): 31 | FASTA(sys.argv[1], fai=False) |> seqs |> rc 32 | print total 33 | 34 | n = 0 35 | def update(kmer): 36 | global n 37 | x = type(kmer)() |> base(len(kmer) // 2, k'T') 38 | if kmer > x: 39 | n += 1 40 | 41 | def test_fast(K: Static[int]): 42 | global n 43 | n = 0 44 | with time.timing(f'{K}-mer (fast)'): 45 | FASTA(sys.argv[1]) |> seqs |> kmers(step=1, k=K) |> revcomp |> update 46 | print n 47 | 48 | 49 | def test_slow(K: Static[int]): 50 | global n 51 | n = 0 52 | with time.timing(f'{K}-mer (slow)'): 53 | for a in seqs(FASTA(sys.argv[1])): 54 | for b in kmers(a, step=1, k=K): 55 | c = revcomp(b) 56 | update(c) 57 | print n 58 | 59 | def test_super_slow(K: Static[int]): 60 | global n 61 | n = 0 62 | with time.timing(f'{K}-mer (super slow)'): 63 | for a in seqs(FASTA(sys.argv[1])): 64 | for b in split(a, K, step=1): 65 | if not b.N(): 66 | c = Kmer[K](b) 67 | d = revcomp(c) 68 | update(d) 69 | print n 70 | 71 | with time.timing("naive"): 72 | naive() 73 | with time.timing("idiomatic"): 74 | idiomatic() 75 | with time.timing("pipeline"): 76 | test_fast(4) 77 | test_slow(4) 78 | test_super_slow(4) 79 | 80 | test_fast(8) 81 | test_slow(8) 82 | test_super_slow(8) 83 | 84 | test_fast(16) 85 | test_slow(16) 86 | test_super_slow(16) 87 | 88 | test_fast(32) 89 | test_slow(32) 90 | test_super_slow(32) 91 | 92 | test_fast(64) 93 | test_slow(64) 94 | test_super_slow(64) 95 | 96 | test_fast(128) 97 | test_slow(128) 98 | test_super_slow(128) 99 | -------------------------------------------------------------------------------- /test/apps/minimap2/sw.codon: -------------------------------------------------------------------------------- 1 | # Smith-Waterman alignment from minimap2 implemented using Seq's inter-sequence alignment 2 | # https://github.com/lh3/minimap2 3 | # https://github.com/lh3/ksw2 4 | # Usage: seqc sw.seq 5 | # and are text files of the same length with one sequence per line. 
6 | from time import TimeInterval 7 | from sys import argv 8 | from bio import seqs, inter_align 9 | from math import sqrt 10 | from statistics import mean, stdev 11 | queries = argv[1] 12 | targets = argv[2] 13 | 14 | total, num = 0, 0 15 | score = True # must be global 16 | 17 | @inter_align 18 | def process_inter(t): 19 | global total, num 20 | query, target = t 21 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, zdrop=400, bandwidth=751, ext_only=False, score_only=score).score 22 | total += inter_score 23 | num += 1 24 | 25 | def process_intra(t): 26 | global total, num 27 | query, target = t 28 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, zdrop=400, bandwidth=751, ext_only=False, score_only=score).score 29 | total += inter_score 30 | num += 1 31 | 32 | def run(queries, targets, msg, f, filter=False): 33 | global num, total, score 34 | def filter_len(t): 35 | query, target = t 36 | if len(query) <= 512 and len(target) <= 512: 37 | yield query, target 38 | 39 | for s in [False, True]: 40 | times = [] 41 | for i in range(3): 42 | num, total = 0, 0 43 | t = TimeInterval() 44 | score = s 45 | if filter: 46 | zip(seqs(queries), seqs(targets)) |> filter_len |> f 47 | else: 48 | zip(seqs(queries), seqs(targets)) |> f 49 | times.append(t.elapsed()) 50 | # print '-', i, num, total, times[-1] 51 | m = mean(times) 52 | print f'{msg}: seq-{msg} {int(score)} {m} {sqrt(sum((i - m)**2 for i in times) / len(times))}' 53 | # print stdev(times) # broken 54 | 55 | from C import seq_get_interaln_simd() -> str 56 | from C import seq_set_sw_maxsimd(int) 57 | 58 | run(queries, targets, 'intra', process_intra) 59 | run(queries, targets, 'intra-512', process_intra, True) 60 | 61 | for simd in [0x20, 0x80, 0x100]: 62 | seq_set_sw_maxsimd(simd) 63 | print seq_get_interaln_simd() 64 | run(queries, targets, 'inter', process_inter) 65 | run(queries, targets, 'inter-512', process_inter, True) 66 | 67 | -------------------------------------------------------------------------------- /test/bench/fastx.codon: -------------------------------------------------------------------------------- 1 | ############################ 2 | # Format parsing benchmark # 3 | ############################ 4 | from sys import argv 5 | from time import timing 6 | from bio import * 7 | n, m = 0, 0 8 | 9 | def test_fasta_options(path): 10 | def process(rec): 11 | global n 12 | n += len(rec.name) + len(rec.seq) 13 | global n, m 14 | m = 0 15 | opts4 = [(a,b,c,d) for a in (True, False) 16 | for b in (True, False) 17 | for c in (True, False) 18 | for d in (True, False)] 19 | for validate, gzip, copy, fai in opts4: 20 | n = 0 21 | with timing(f'validate={validate} gzip={gzip} copy={copy} fai={fai}'): 22 | FASTA(path, validate=validate, gzip=gzip, copy=copy, fai=fai) |> iter |> process 23 | if m == 0: 24 | m = n 25 | else: 26 | assert m == n 27 | 28 | def test_fastq_options(path): 29 | def process(rec): 30 | global n 31 | n += len(rec.name) + len(rec.seq) + len(rec.qual) 32 | global n, m 33 | m = 0 34 | opts2 = [(a,b) for a in (True, False) 35 | for b in (True, False)] 36 | for validate, gzip in opts2: 37 | n = 0 38 | with timing(f'validate={validate} gzip={gzip}'): 39 | FASTQ(path, validate=validate, gzip=gzip, copy=True) |> iter |> process 40 | if m == 0: 41 | m = n 42 | else: 43 | assert m == n 44 | 45 | def test_seqs_options(path): 46 | def process(rec): 47 | global n 48 | n += len(rec) 49 | global n, m 50 | m = 0 51 | opts3 = [(a,b,c) for a in (True, False) 52 | for b in (True, False) 53 | for c 
in (True, False)] 54 | for validate, gzip, copy in opts3: 55 | n = 0 56 | with timing(f'validate={validate} gzip={gzip} copy={copy}'): 57 | Seqs(path, validate=validate, gzip=gzip, copy=copy) |> iter |> process 58 | if m == 0: 59 | m = n 60 | print m 61 | else: 62 | print m, n 63 | assert m == n 64 | 65 | for path in argv[1:]: 66 | if path.endswith('.fa') or path.endswith('.fasta'): 67 | print 'Testing as FASTA' 68 | test_fasta_options(path) 69 | elif path.endswith('.fq') or path.endswith('.fastq'): 70 | print 'Testing as FASTQ' 71 | test_fastq_options(path) 72 | else: 73 | print 'Testing as TXT' 74 | test_seqs_options(path) 75 | -------------------------------------------------------------------------------- /stdlib/bio/iter.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import seq 2 | from copy import copy 3 | 4 | # Sequence reader in text, line-by-line format. 5 | @tuple 6 | class SeqReader: 7 | ''' 8 | Parser for a plain txt-based sequence format, with one sequence per line. 9 | 10 | ''' 11 | _file: cobj 12 | validate: bool 13 | gzip: bool 14 | copy: bool 15 | 16 | def __new__(path: str, validate: bool, gzip: bool, copy: bool) -> SeqReader: 17 | return SeqReader(gzopen(path, "r").__raw__() if gzip else open(path, "r").__raw__(), validate, gzip, copy) 18 | 19 | @property 20 | def file(self): 21 | assert not self.gzip 22 | p = __array__[cobj](1) 23 | p.ptr[0] = self._file 24 | return Ptr[File](p.ptr.as_byte())[0] 25 | 26 | @property 27 | def gzfile(self): 28 | assert self.gzip 29 | p = __array__[cobj](1) 30 | p.ptr[0] = self._file 31 | return Ptr[gzFile](p.ptr.as_byte())[0] 32 | 33 | def _preprocess(self, a: str): 34 | from bio.builtin import _validate_str_as_seq 35 | if self.validate: 36 | return _validate_str_as_seq(a, self.copy) 37 | else: 38 | return copy(seq(a.ptr, a.len)) if self.copy else seq(a.ptr, a.len) 39 | 40 | def __seqs__(self): 41 | return self.__iter__() 42 | 43 | def __iter__(self): 44 | if self.gzip: 45 | for a in self.gzfile._iter_trim_newline(): 46 | s = self._preprocess(a) 47 | assert s.len >= 0 48 | yield s 49 | else: 50 | for a in self.file._iter_trim_newline(): 51 | s = self._preprocess(a) 52 | assert s.len >= 0 53 | yield s 54 | self.close() 55 | 56 | def __blocks__(self, size: int): 57 | from bio.block import _blocks 58 | if not self.copy: 59 | raise ValueError("cannot read sequences in blocks with copy=False") 60 | return _blocks(self.__iter__(), size) 61 | 62 | def close(self): 63 | if self.gzip: 64 | self.gzfile.close() 65 | else: 66 | self.file.close() 67 | 68 | def __enter__(self): 69 | pass 70 | 71 | def __exit__(self): 72 | self.close() 73 | 74 | def Seqs(path: str, validate: bool = True, gzip: bool = True, copy: bool = True): 75 | return SeqReader(path=path, validate=validate, gzip=gzip, copy=copy) 76 | 77 | @extend 78 | class str: 79 | def __seqs__(self): 80 | return iter(Seqs(self)) 81 | 82 | def __blocks__(self, size: int): 83 | from bio.block import _blocks 84 | return _blocks(self.__seqs__(), size) 85 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Seq CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - develop 8 | tags: 9 | - '*' 10 | pull_request: 11 | branches: 12 | - develop 13 | 14 | jobs: 15 | create_release: 16 | name: GitHub Release 17 | runs-on: ubuntu-latest 18 | outputs: 19 | upload_url: ${{ 
steps.create_release.outputs.upload_url }} 20 | permissions: 21 | contents: write 22 | steps: 23 | - name: Create Release 24 | if: contains(github.ref, 'tags/v') 25 | id: create_release 26 | uses: ncipollo/release-action@v1 27 | 28 | build: 29 | strategy: 30 | matrix: 31 | include: 32 | - os: ubuntu-latest 33 | arch: linux-x86_64 34 | - os: ubuntu-latest 35 | arch: manylinux2014-x86_64 36 | - os: ubuntu-24.04-arm 37 | arch: linux-aarch64 38 | - os: ubuntu-24.04-arm 39 | arch: manylinux2014-aarch64 40 | - os: macos-13 41 | arch: darwin-x86_64 42 | - os: macos-14 43 | arch: darwin-arm64 44 | runs-on: ${{ matrix.os }} 45 | name: Build Seq 46 | needs: create_release 47 | permissions: 48 | contents: write 49 | steps: 50 | - uses: actions/checkout@v4 51 | 52 | - name: Build (Ubuntu) 53 | if: startsWith(matrix.os, 'ubuntu') 54 | run: | 55 | (cd .github/build-linux && docker build -t local -f Dockerfile.${{ matrix.arch }} .) 56 | docker run -v $(pwd):/github/workspace local /github/workspace ${{ matrix.arch }} yes 0.19.2 57 | 58 | - name: Build (macOS) 59 | if: startsWith(matrix.os, 'macos') 60 | run: | 61 | sudo mkdir -p /opt/llvm-codon 62 | sudo chown -R $(whoami) /opt/llvm-codon 63 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-${{ matrix.arch }}.tar.bz2 | tar jxf - -C /opt 64 | bash .github/build-linux/entrypoint.sh ${{ github.workspace }} ${{ matrix.arch }} yes 0.19.2 65 | 66 | - name: Upload Artifacts 67 | uses: actions/upload-artifact@v4 68 | with: 69 | name: seq-${{ matrix.arch }}.tar.gz 70 | path: seq-${{ matrix.arch }}.tar.gz 71 | 72 | - name: Upload Release Asset 73 | if: contains(github.ref, 'tags/v') 74 | uses: actions/upload-release-asset@v1.0.2 75 | env: 76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 77 | with: 78 | upload_url: ${{ needs.create_release.outputs.upload_url }} 79 | asset_path: ./seq-${{ matrix.arch }}.tar.gz 80 | asset_name: seq-${{ matrix.arch }}.tar.gz 81 | asset_content_type: application/gzip 82 | -------------------------------------------------------------------------------- /stdlib/bio/locus.codon: -------------------------------------------------------------------------------- 1 | @tuple 2 | class Contig: 3 | ''' 4 | Representation of a contig, consisting of an ID, name and length. 5 | ''' 6 | 7 | _tid: u32 8 | _name: str 9 | _len: int 10 | 11 | def __new__(tid: int, name: str, len: int) -> Contig: 12 | return Contig(u32(tid), name, len) 13 | 14 | @property 15 | def tid(self): 16 | return int(self._tid) 17 | 18 | @property 19 | def name(self): 20 | return self._name 21 | 22 | @property 23 | def len(self): 24 | return self._len 25 | 26 | def __str__(self): 27 | return self.name 28 | 29 | def __len__(self): 30 | return self.len 31 | 32 | def __eq__(self, other: Contig): 33 | return self.tid == other.tid 34 | 35 | def __ne__(self, other: Contig): 36 | return self.tid != other.tid 37 | 38 | def __lt__(self, other: Contig): 39 | return self.tid < other.tid 40 | 41 | def __gt__(self, other: Contig): 42 | return self.tid > other.tid 43 | 44 | def __le__(self, other: Contig): 45 | return self.tid <= other.tid 46 | 47 | def __ge__(self, other: Contig): 48 | return self.tid >= other.tid 49 | 50 | def __hash__(self): 51 | return self.tid 52 | 53 | @tuple 54 | class Locus: 55 | ''' 56 | Representation of a locus, consisting of a contig ID and 0-based position. 
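    The position is stored in 32 bits with the strand folded into the sign:
    a stored negative value marks a locus on the reverse strand (see `reversed`
    and `__invert__`).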
57 | ''' 58 | 59 | _tid: u32 60 | _pos: u32 61 | 62 | def __new__(tid: int, pos: int) -> Locus: 63 | ''' 64 | Constructs a `Locus` with specified contig ID and 0-based position. 65 | Negative positions indicate loci on the reverse strand. 66 | ''' 67 | return Locus(u32(tid), u32(pos)) 68 | 69 | def __lt__(self, other: Locus): 70 | return (self.tid, self.pos) < (other.tid, other.pos) 71 | 72 | def __gt__(self, other: Locus): 73 | return (self.tid, self.pos) > (other.tid, other.pos) 74 | 75 | def __le__(self, other: Locus): 76 | return (self.tid, self.pos) <= (other.tid, other.pos) 77 | 78 | def __ge__(self, other: Locus): 79 | return (self.tid, self.pos) >= (other.tid, other.pos) 80 | 81 | @property 82 | def tid(self): 83 | return int(self._tid) 84 | 85 | @property 86 | def pos(self): 87 | return abs(int(i32(int(self._pos)))) 88 | 89 | @property 90 | def reversed(self): 91 | ''' 92 | Whether this locus is on the reverse strand 93 | ''' 94 | return i32(int(self._pos)) < i32(0) 95 | 96 | def __invert__(self): 97 | ''' 98 | Returns the corresponding `Locus` on the reverse strand 99 | ''' 100 | return Locus(self.tid, self.pos if self.reversed else -self.pos) 101 | 102 | def __repr__(self): 103 | return f"Locus(tid={self.tid}, pos={self.pos}, reversed={self.reversed})" 104 | -------------------------------------------------------------------------------- /test/apps/mrsfast/exact.codon: -------------------------------------------------------------------------------- 1 | # Implementation of exact mapping using FM-index 2 | # Usage: 3 | # Index: seqc exact.seq index reference.fa 4 | # Search: seqc exact.seq search reference.fa reads.fq output.sam 5 | 6 | from sys import argv, stderr, exit 7 | from bio.fmindex import FMIndex 8 | from bio import FASTARecord, FASTQRecord, FASTQ, FASTA 9 | 10 | class GenomeIndex: 11 | ref: List[FASTARecord] 12 | fmi: FMIndex 13 | 14 | def open_index_file(basename, mode): 15 | import gzip 16 | return gzip.open(f'{basename}.exact.idx', mode) 17 | 18 | def index_load(basename): 19 | from pickle import load 20 | with open_index_file(basename, 'rb') as jar: 21 | return load(jar, GenomeIndex) 22 | 23 | def main_index(basename): 24 | from pickle import dump 25 | ref = [rec for rec in FASTA(basename)] 26 | print('making FM-index...', file=stderr) 27 | fmi = FMIndex(basename) 28 | index = GenomeIndex(ref, fmi) 29 | print('writing to disk...', file=stderr) 30 | with open_index_file(basename, 'wb0') as jar: 31 | dump(index, jar) 32 | 33 | num_aligned = 0 34 | 35 | #@prefetch 36 | def align(read: FASTQRecord, genome_index: GenomeIndex, out: File): 37 | global num_aligned 38 | for rc in (False, True): 39 | read_seq = read.seq 40 | read_len = len(read_seq) 41 | if rc: 42 | read_seq = ~read_seq 43 | 44 | s = read_seq 45 | intv = genome_index.fmi.interval(s[-1]) 46 | s = s[:-1] 47 | while s and intv: 48 | intv = genome_index.fmi[intv, s[-1]] 49 | s = s[:-1] 50 | 51 | for rid, name, ref_pos in genome_index.fmi.results(intv): 52 | ref_len = len(genome_index.ref[rid].seq) 53 | if not (0 <= ref_pos <= ref_len - read_len): 54 | continue 55 | if genome_index.ref[rid].seq[ref_pos:ref_pos + read_len].N(): 56 | continue 57 | print( 58 | read.name, 16 if rc else 0, name, ref_pos + 1, 255, f'{read_len}M', 59 | '*', 0, 0, read_seq, read.qual if not rc else read.qual[::-1], 'NM:i:0', 60 | sep='\t', file=out 61 | ) 62 | 63 | num_aligned += 1 64 | if num_aligned % 10000000 == 0: 65 | print 'aligned 10M reads' 66 | 67 | def main_search(ref_path, fastq_path, out_path): 68 | from time import timing 69 
| print('loading index...', file=stderr) 70 | genome_index = index_load(ref_path) 71 | print('running alignment pipeline...', file=stderr) 72 | with open(out_path, 'w') as out, timing('alignment pipeline'): 73 | FASTQ(fastq_path) |> iter |> align(genome_index, out) 74 | 75 | match argv[1:]: 76 | case ['index', basename]: 77 | main_index(basename) 78 | case ['search', ref_path, fastq_path, out_path]: 79 | main_search(ref_path, fastq_path, out_path) 80 | case _: 81 | print("error: unknown mode: valid modes are 'index' and 'search'", file=stderr) 82 | exit(1) 83 | -------------------------------------------------------------------------------- /test/apps/snap/genomeindex.seq: -------------------------------------------------------------------------------- 1 | # Implementation of SNAP aligner's genome index 2 | # https://github.com/amplab/snap/blob/master/SNAPLib/GenomeIndex.{cpp,h} 3 | 4 | # Need the following hooks linked to convert C++ GenomeIndex to Seq object: 5 | # snap_index_from_dir(Ptr[byte]) -> Ptr[byte] -- read object from specified directory 6 | # snap_index_ht_count(Ptr[byte]) -> int -- extract hash table count 7 | # snap_index_ht_get(Ptr[byte], int) -> Ptr[byte] -- extract specified (0-indexed) hash table 8 | # snap_index_overflow_ptr(Ptr[byte]) -> Ptr[u32] -- extract overflow table pointer 9 | # snap_index_overflow_len(Ptr[byte]) -> int -- extract overflow table length 10 | # snap_index_count_of_bases(Ptr[byte]) -> int -- extract count of genome bases 11 | 12 | from hashtable import SNAPHashTable 13 | from bio import Kmer 14 | 15 | class GenomeIndex[K]: 16 | hash_tables: Array[SNAPHashTable[Kmer[16],u32]] 17 | overflow_table: Array[u32] 18 | count_of_bases: int 19 | 20 | def _partition(k: K): 21 | n = int(k.as_int()) 22 | return (Kmer[16](n & ((1 << 32) - 1)), n >> 32) 23 | 24 | def __init__(self, dir: str): 25 | assert Kmer[16].len() <= K.len() <= Kmer[32].len() 26 | from C import snap_index_from_dir(Ptr[byte]) -> Ptr[byte] 27 | from C import snap_index_ht_count(Ptr[byte]) -> int 28 | from C import snap_index_ht_get(Ptr[byte], int) -> Ptr[byte] 29 | from C import snap_index_overflow_ptr(Ptr[byte]) -> Ptr[u32] 30 | from C import snap_index_overflow_len(Ptr[byte]) -> int 31 | from C import snap_index_count_of_bases(Ptr[byte]) -> int 32 | 33 | p = snap_index_from_dir(dir.c_str()) 34 | assert p 35 | hash_tables = Array[SNAPHashTable[Kmer[16],u32]](snap_index_ht_count(p)) 36 | for i in range(len(hash_tables)): 37 | hash_tables[i] = SNAPHashTable[Kmer[16],u32](snap_index_ht_get(p, i)) 38 | 39 | self.hash_tables = hash_tables 40 | self.overflow_table = Array[u32](snap_index_overflow_ptr(p), snap_index_overflow_len(p)) 41 | self.count_of_bases = snap_index_count_of_bases(p) 42 | 43 | def __getitem__(self, seed: K): 44 | kmer, which = GenomeIndex[K]._partition(seed) 45 | table = self.hash_tables[which] 46 | value_ptr = table.get_value_ptr_for_key(kmer) 47 | 48 | if not value_ptr or value_ptr[0] == table.invalid_val: 49 | return Array[u32](value_ptr, 0) 50 | 51 | value = value_ptr[0] 52 | 53 | if int(value) < self.count_of_bases: 54 | return Array[u32](value_ptr, 1) 55 | else: 56 | overflow_table_offset = int(value) - self.count_of_bases 57 | hit_count = int(self.overflow_table[overflow_table_offset]) 58 | return Array[u32](self.overflow_table.ptr + overflow_table_offset + 1, hit_count) 59 | 60 | def __prefetch__(self, seed: K): 61 | kmer, which = GenomeIndex[K]._partition(seed) 62 | table = self.hash_tables[which] 63 | table.__prefetch__(kmer) 64 | 65 | 
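# Usage sketch (illustration only, not part of the SNAP port above). It assumes an
# index directory produced by the C++ SNAP indexer, with the C hooks listed at the
# top of this file linked in; 'index_dir' and 'read' are supplied by the caller.
from bio.seq import seq

def _example_lookup(index_dir: str, read: seq):
    gi = GenomeIndex[Kmer[20]](index_dir)   # Kmer[16]..Kmer[32] accepted, per the assert in __init__
    for seed in read.kmers(step=1, k=20):   # 20-mer seeds drawn from the read
        gi.__prefetch__(seed)               # warm the relevant hash-table bucket
        for pos in gi[seed]:                # Array[u32] of hit positions for this seed
            print(int(pos))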
-------------------------------------------------------------------------------- /test/core/match.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | def f(k): 4 | match k: 5 | case '': 6 | yield 1 7 | case _: 8 | yield 0 9 | 10 | match k: 11 | case 'GCGT': 12 | yield 1 13 | case _: 14 | yield 0 15 | 16 | match k: 17 | case 'GCGTC': 18 | yield 1 19 | case _: 20 | yield 0 21 | 22 | match k: 23 | case 'GCATC': 24 | yield 1 25 | case _: 26 | yield 0 27 | 28 | match k: 29 | case 'G_GT_': 30 | yield 1 31 | case _: 32 | yield 0 33 | 34 | match k: 35 | case '_TG__': 36 | yield 1 37 | case _: 38 | yield 0 39 | 40 | match k: 41 | case 'GC*ATC': 42 | yield 1 43 | case _: 44 | yield 0 45 | 46 | match k: 47 | case 'GC*TC': 48 | yield 1 49 | case _: 50 | yield 0 51 | 52 | match k: 53 | case 'GC*A': 54 | yield 1 55 | case _: 56 | yield 0 57 | 58 | match k: 59 | case 'GCG*': 60 | yield 1 61 | case _: 62 | yield 0 63 | 64 | match k: 65 | case 'GGG*': 66 | yield 1 67 | case _: 68 | yield 0 69 | 70 | match k: 71 | case '*TC': 72 | yield 1 73 | case _: 74 | yield 0 75 | 76 | match k: 77 | case '*T': 78 | yield 1 79 | case _: 80 | yield 0 81 | 82 | match k: 83 | case '*': 84 | yield 1 85 | case _: 86 | yield 0 87 | 88 | match k: 89 | case '_C*G_C': 90 | yield 1 91 | case _: 92 | yield 0 93 | 94 | match k: 95 | case '*G_C': 96 | yield 1 97 | case _: 98 | yield 0 99 | 100 | match k: 101 | case '_C*': 102 | yield 1 103 | case _: 104 | yield 0 105 | 106 | match k: 107 | case 'A_*G_C': 108 | yield 1 109 | case _: 110 | yield 0 111 | 112 | match k: 113 | case '*C_C': 114 | yield 1 115 | case _: 116 | yield 0 117 | 118 | match k: 119 | case '_T*': 120 | yield 1 121 | case _: 122 | yield 0 123 | 124 | match k: 125 | case '__*__': 126 | yield 1 127 | case _: 128 | yield 0 129 | 130 | @test 131 | def test_seq_match(): 132 | s = s'GCGTC' 133 | t = ~s'GACGC' # == ~s 134 | assert list(f(s)) == [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] 135 | assert list(f(t)) == [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] 136 | assert list(f(Kmer[5](s))) == [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] 137 | test_seq_match() 138 | -------------------------------------------------------------------------------- /test/core/bltin.codon: -------------------------------------------------------------------------------- 1 | @test 2 | def test_min_max(): 3 | assert max(2, 1, 1, 1, 1) == 2 4 | assert max(1, 2, 1, 1, 1) == 2 5 | assert max(1, 1, 2, 1, 1) == 2 6 | assert max(1, 1, 1, 2, 1) == 2 7 | assert max(1, 1, 1, 1, 2) == 2 8 | assert max(2, 1, 1, 1) == 2 9 | assert max(1, 2, 1, 1) == 2 10 | assert max(1, 1, 2, 1) == 2 11 | assert max(1, 1, 1, 2) == 2 12 | assert max(2, 1, 1) == 2 13 | assert max(1, 2, 1) == 2 14 | assert max(1, 1, 2) == 2 15 | assert max(2, 1) == 2 16 | assert max(1, 2) == 2 17 | 18 | assert min(2, 1, 1, 1, 1) == 1 19 | assert min(1, 2, 1, 1, 1) == 1 20 | assert min(1, 1, 2, 1, 1) == 1 21 | assert min(1, 1, 1, 2, 1) == 1 22 | assert min(1, 1, 1, 1, 2) == 1 23 | assert min(2, 1, 1, 1) == 1 24 | assert min(1, 2, 1, 1) == 1 25 | assert min(1, 1, 2, 1) == 1 26 | assert min(1, 1, 1, 2) == 1 27 | assert min(2, 1, 1) == 1 28 | assert min(1, 2, 1) == 1 29 | assert min(1, 1, 2) == 1 30 | assert min(2, 1) == 1 31 | assert min(1, 2) == 1 32 | 33 | assert max(0, 1, 1, 1, 1) == 1 34 | assert max(1, 0, 1, 1, 1) == 1 35 | assert max(1, 1, 0, 1, 1) == 1 36 | assert max(1, 1, 1, 0, 1) == 1 37 | assert max(1, 1, 1, 1, 0) 
== 1 38 | assert max(0, 1, 1, 1) == 1 39 | assert max(1, 0, 1, 1) == 1 40 | assert max(1, 1, 0, 1) == 1 41 | assert max(1, 1, 1, 0) == 1 42 | assert max(0, 1, 1) == 1 43 | assert max(1, 0, 1) == 1 44 | assert max(1, 1, 0) == 1 45 | assert max(0, 1) == 1 46 | assert max(1, 0) == 1 47 | 48 | assert min(0, 1, 1, 1, 1) == 0 49 | assert min(1, 0, 1, 1, 1) == 0 50 | assert min(1, 1, 0, 1, 1) == 0 51 | assert min(1, 1, 1, 0, 1) == 0 52 | assert min(1, 1, 1, 1, 0) == 0 53 | assert min(0, 1, 1, 1) == 0 54 | assert min(1, 0, 1, 1) == 0 55 | assert min(1, 1, 0, 1) == 0 56 | assert min(1, 1, 1, 0) == 0 57 | assert min(0, 1, 1) == 0 58 | assert min(1, 0, 1) == 0 59 | assert min(1, 1, 0) == 0 60 | assert min(0, 1) == 0 61 | assert min(1, 0) == 0 62 | 63 | assert min(a*a for a in range(3)) == 0 64 | assert max(a*a for a in range(3)) == 4 65 | assert min([0, 2, -1]) == -1 66 | assert max([0, 2, -1]) == 2 67 | 68 | @test 69 | def test_map_filter(): 70 | assert list(map(lambda i: i+1, (i*2 for i in range(5)))) == [1, 3, 5, 7, 9] 71 | assert list(map(lambda i: i+1, (i*2 for i in range(0)))) == [] 72 | assert list(map(lambda i: i//2, map(lambda i: i-1, map(lambda i: i+1, (i*2 for i in range(5)))))) == [0, 1, 2, 3, 4] 73 | def f(x: int) -> int: 74 | return x - 1 75 | def g(x: int) -> int: 76 | return x + 1 77 | assert list(map(f, map(g, (i*2 for i in range(5))))) == [0, 2, 4, 6, 8] 78 | 79 | def h(x: list[int]): 80 | return x 81 | assert h(list(map(lambda i: i-1, map(lambda i: i+1, range(5))))) == [0, 1, 2, 3, 4] 82 | 83 | assert list(filter(lambda i: i % 2 == 0, range(5))) == [0, 2, 4] 84 | assert list(filter(lambda i: i % 2 == 1, filter(lambda i: i % 2 == 0, range(5)))) == [] 85 | 86 | assert list(filter(lambda i: i%2 == 0, map(lambda i: i*i, range(10)))) == [0, 4, 16, 36, 64] 87 | 88 | test_min_max() 89 | test_map_filter() 90 | 91 | -------------------------------------------------------------------------------- /stdlib/bio/c_htslib.codon: -------------------------------------------------------------------------------- 1 | # 2 | from C import hts_open(cobj, cobj) -> cobj 3 | from C import hts_set_threads(cobj, i32) -> i32 4 | from C import hts_close(cobj) 5 | from C import hts_idx_destroy(cobj) 6 | from C import hts_itr_destroy(cobj) 7 | from C import hts_itr_destroy(cobj) 8 | from C import hts_itr_next(cobj, cobj, cobj, cobj) -> i32 9 | from C import sam_index_load(cobj, cobj) -> cobj 10 | from C import sam_hdr_read(cobj) -> cobj 11 | from C import sam_itr_querys(cobj, cobj, cobj) -> cobj 12 | from C import sam_read1(cobj, cobj, cobj) -> i32 13 | from C import bam_read1(cobj, cobj) -> i32 14 | from C import bam_init1() -> cobj 15 | @pure 16 | @C 17 | def bam_cigar2qlen(a: int, b: Ptr[u32]) -> int: pass 18 | @pure 19 | @C 20 | def bam_cigar2rlen(a: int, b: Ptr[u32]) -> int: pass 21 | @pure 22 | @C 23 | def bam_aux_get(a: cobj, b: Ptr[byte]) -> Ptr[u8]: pass 24 | @pure 25 | @C 26 | def bam_aux2i(a: Ptr[u8]) -> int: pass 27 | @pure 28 | @C 29 | def bam_aux2f(a: Ptr[u8]) -> float: pass 30 | @pure 31 | @C 32 | def bam_aux2A(a: Ptr[u8]) -> byte: pass 33 | @pure 34 | @C 35 | def bam_aux2Z(a: Ptr[u8]) -> Ptr[byte]: pass 36 | @pure 37 | @C 38 | def bam_auxB_len(a: Ptr[u8]) -> u32: pass 39 | @pure 40 | @C 41 | def bam_auxB2i(a: Ptr[u8], b: u32) -> int: pass 42 | @pure 43 | @C 44 | def bam_auxB2f(a: Ptr[u8], b: u32) -> float: pass 45 | @pure 46 | @C 47 | def bam_endpos(a: cobj) -> int: pass 48 | from C import sam_hdr_destroy(cobj) 49 | from C import bam_destroy1(cobj) 50 | 51 | from C import bcf_hdr_read(cobj) -> 
cobj 52 | from C import bcf_read(cobj, cobj, cobj) -> i32 53 | from C import bcf_unpack(cobj, i32) -> i32 54 | from C import bcf_hdr_destroy(cobj) 55 | from C import bcf_init() -> cobj 56 | from C import bcf_destroy(cobj) 57 | from C import bcf_get_variant_types(cobj) -> i32 58 | from C import bcf_get_variant_type(cobj, i32) -> i32 59 | from C import bcf_is_snp(cobj) -> i32 60 | from C import bcf_has_filter(cobj, cobj, cobj) -> i32 61 | from C import bcf_get_fmt(cobj, cobj, cobj) -> cobj 62 | from C import bcf_get_info(cobj, cobj, cobj) -> cobj 63 | from C import bcf_get_fmt_id(cobj, i32) -> cobj 64 | from C import bcf_get_info_id(cobj, i32) -> cobj 65 | from C import bcf_get_format_values(cobj, cobj, cobj, cobj, cobj, i32) -> i32 66 | from C import bcf_hdr_id2int(cobj, i32, cobj) -> i32 67 | from C import bcf_has_filter(cobj, cobj, cobj) -> i32 68 | from C import bcf_hrec_format(cobj, cobj) 69 | from C import bcf_clear(cobj) 70 | from C import bcf_empty(cobj) 71 | from C import bcf_dup(cobj) -> cobj 72 | from C import bcf_get_info_values(cobj, cobj, Ptr[byte], Ptr[cobj], Ptr[i32], i32) -> i32 73 | 74 | @pure 75 | @C 76 | def hts_version() -> cobj: pass 77 | 78 | @pure 79 | @C 80 | def seq_get_htsfile_fp(a: cobj) -> cobj: pass 81 | @pure 82 | @C 83 | def seq_is_htsfile_cram(a: cobj) -> bool: pass 84 | @pure 85 | @C 86 | def seq_is_htsfile_bgzf(a: cobj) -> bool: pass 87 | 88 | # Seq HTSlib 89 | def sam_itr_next(file: cobj, itr: cobj, r: cobj) -> int: 90 | is_cram = seq_is_htsfile_cram(file) 91 | is_bgzf = seq_is_htsfile_bgzf(file) 92 | if not is_cram and not is_bgzf: 93 | raise ValueError('not BGZF compressed') 94 | if not itr: 95 | raise ValueError('null iterator') 96 | return int(hts_itr_next( 97 | seq_get_htsfile_fp(file) if is_bgzf else cobj(), 98 | itr, r, file)) 99 | -------------------------------------------------------------------------------- /test/bench/match.codon: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Matching benchmark # 3 | ###################### 4 | from sys import argv 5 | from time import timing 6 | from bio import * 7 | 8 | def match_fast1(k): 9 | match k: 10 | case 'T_T_T_T_T_T_T_T_T_T_T_T_T_T_T_T_..._A_A_A_A_A_A_A_A_A_A_A_A_A_A_A_A': 11 | return True 12 | case _: 13 | return False 14 | 15 | def match_slow1(k): 16 | for i in range(0, 32, 2): 17 | if k[i] != k'T': 18 | return False 19 | for i in range(0, 32, 2): 20 | if k[len(k) - 1 - i] != k'A': 21 | return False 22 | return True 23 | 24 | def test1(use_slow_match, K: Static[int]): 25 | n = 0 26 | with timing(f'{K}-mer ({use_slow_match=})'): 27 | for s in FASTA(argv[1]) |> seqs: 28 | for kmer in s |> kmers(1, K): 29 | b = False 30 | if use_slow_match: 31 | b = match_slow1(kmer) 32 | else: 33 | b = match_fast1(kmer) 34 | n += 1 if b else 0 35 | print n 36 | 37 | def match_fast2(k): 38 | match k: 39 | case 'T_T_..._A_A': 40 | return True 41 | case _: 42 | return False 43 | 44 | def match_slow2(k): 45 | for i in range(0, 4, 2): 46 | if k[i] != k'T': 47 | return False 48 | for i in range(0, 4, 2): 49 | if k[len(k) - 1 - i] != k'A': 50 | return False 51 | return True 52 | 53 | def test2(use_slow_match, K: Static[int]): 54 | n = 0 55 | with timing(f'{K}-mer ({use_slow_match=})'): 56 | for s in FASTA(argv[1]) |> seqs: 57 | for kmer in s |> kmers(1, K): 58 | b = False 59 | if use_slow_match: 60 | b = match_slow2(kmer) 61 | else: 62 | b = match_fast2(kmer) 63 | n += 1 if b else 0 64 | print n 65 | 66 | print 'TEST1:' 67 | test1(False, 64) 68 | test1(True, 
64) 69 | test1(False, 96) 70 | test1(True, 96) 71 | test1(False, 128) 72 | test1(True, 128) 73 | test1(False, 160) 74 | test1(True, 160) 75 | test1(False, 192) 76 | test1(True, 192) 77 | test1(False, 224) 78 | test1(True, 224) 79 | test1(False, 256) 80 | test1(True, 256) 81 | test1(False, 288) 82 | test1(True, 288) 83 | test1(False, 320) 84 | test1(True, 320) 85 | test1(False, 352) 86 | test1(True, 352) 87 | test1(False, 384) 88 | test1(True, 384) 89 | test1(False, 416) 90 | test1(True, 416) 91 | test1(False, 448) 92 | test1(True, 448) 93 | test1(False, 480) 94 | test1(True, 480) 95 | test1(False, 512) 96 | test1(True, 512) 97 | 98 | print 'TEST2:' 99 | test2(False, 64) 100 | test2(True, 64) 101 | test2(False, 96) 102 | test2(True, 96) 103 | test2(False, 128) 104 | test2(True, 128) 105 | test2(False, 160) 106 | test2(True, 160) 107 | test2(False, 192) 108 | test2(True, 192) 109 | test2(False, 224) 110 | test2(True, 224) 111 | test2(False, 256) 112 | test2(True, 256) 113 | test2(False, 288) 114 | test2(True, 288) 115 | test2(False, 320) 116 | test2(True, 320) 117 | test2(False, 352) 118 | test2(True, 352) 119 | test2(False, 384) 120 | test2(True, 384) 121 | test2(False, 416) 122 | test2(True, 416) 123 | test2(False, 448) 124 | test2(True, 448) 125 | test2(False, 480) 126 | test2(True, 480) 127 | test2(False, 512) 128 | test2(True, 512) 129 | -------------------------------------------------------------------------------- /stdlib/bio/fai.codon: -------------------------------------------------------------------------------- 1 | @tuple 2 | class FAIRecord: 3 | _name: str 4 | _length: int 5 | _offset: int 6 | _linebases: int 7 | _linewidth: int 8 | _qualoffset: int 9 | 10 | @property 11 | def name(self): 12 | return self._name 13 | 14 | @property 15 | def length(self): 16 | return self._length 17 | 18 | @property 19 | def offset(self): 20 | return self._offset 21 | 22 | @property 23 | def line_bases(self): 24 | return self._linebases 25 | 26 | @property 27 | def line_width(self): 28 | return self._linewidth 29 | 30 | @property 31 | def qual_offset(self): 32 | return self._qualoffset 33 | 34 | FAI_COL_NAMES = ["name", "length", "offset", "line_bases", "line_width", "qual_offset"] 35 | 36 | class FAIReader: 37 | fastq: bool 38 | validate: bool 39 | copy: bool 40 | _file: gzFile 41 | 42 | def __init__(self, path: str, fastq: bool, validate: bool, copy: bool): 43 | self.validate = validate 44 | self.copy = copy 45 | self._file = gzopen(path, "r") 46 | self.fastq = fastq 47 | 48 | @property 49 | def file(self): 50 | return self._file 51 | 52 | @property 53 | def num_necessary_cols(self): 54 | return 6 if self.fastq else 5 55 | 56 | def __iter__(self): 57 | for lnum, l in enumerate(self.file._iter_trim_newline()): 58 | line = l.__ptrcopy__() if self.copy else l 59 | rec: FAIRecord = self._FAIRecord_from_str(line, lnum + 1) 60 | yield rec 61 | self.close() 62 | 63 | def _FAIRecord_from_str(self, s: str, lnum: int): 64 | col_strs = s.split("\t") 65 | 66 | if self.validate and len(col_strs) < self.num_necessary_cols: 67 | raise ValueError(f"Line {lnum} does not have the required number of columns, {self.num_necessary_cols}") 68 | 69 | name, length, offset = "", 0, 0 70 | line_bases, line_width, qual_offset = 0, 0, 0 71 | 72 | val_ptrs: List[Ptr[byte]] = [__ptr__(name).as_byte(), __ptr__(length).as_byte(), 73 | __ptr__(offset).as_byte(), __ptr__(line_bases).as_byte(), 74 | __ptr__(line_width).as_byte(), __ptr__(qual_offset).as_byte()] 75 | 76 | for i in range(self.num_necessary_cols): 77 | col_name = 
FAI_COL_NAMES[i] 78 | val_ptr = val_ptrs[i] 79 | val_str = col_strs[i] 80 | if col_name == "name": 81 | val_str_ptr = Ptr[str](val_ptr) 82 | val_str_ptr[0] = val_str 83 | else: 84 | val_num_ptr = Ptr[int](val_ptr) 85 | val_num_ptr[0] = self._get_int_from_fai(val_str, col_name, lnum) 86 | 87 | return FAIRecord(name, length, offset, line_bases, line_width, qual_offset) 88 | 89 | def _get_int_from_fai(self, val_str: str, col_name: str, lnum: int): 90 | if self.validate: 91 | try: 92 | return int(val_str) 93 | except: 94 | raise ValueError(f"{col_name}, must be integer, line: {lnum}") 95 | return int(val_str) 96 | 97 | def close(self): 98 | self.file.close() 99 | 100 | def __enter__(self): 101 | pass 102 | 103 | def __exit__(self): 104 | self.close() 105 | 106 | def FAI(path: str, fastq: bool = False, validate: bool = True, copy: bool = True) -> FAIReader: 107 | return FAIReader(path=path, fastq=fastq, validate=validate, copy=copy) 108 | -------------------------------------------------------------------------------- /test/apps/umi/whitelist.codon: -------------------------------------------------------------------------------- 1 | # Implementation of barcode whitelisting from UMI tools 2 | # https://github.com/CGATOxford/UMI-tools/blob/master/umi_tools/whitelist.py 3 | # Assumes 16bp barcode at the start of the read 4 | # Usage: seqc whitelist.seq 5 | 6 | from sys import argv 7 | from time import timing 8 | from bio import * 9 | 10 | BARCODE_LEN = 16 11 | 12 | def get_knee_estimate_distance(cell_barcode_counts): 13 | def get_knee_distance(values): 14 | from math import sqrt 15 | a = (values[-1] - values[0]) / (len(values) - 1) 16 | b = -1 17 | c = values[0] 18 | h = sqrt(a**2 + b**2) 19 | dist = lambda x,y: abs(a*x + b*y + c)/h 20 | dist_to_line = [dist(x, y) for x, y in enumerate(values)] 21 | best_idx = max((y, x) for x, y in enumerate(dist_to_line))[1] 22 | return (dist_to_line, best_idx) 23 | 24 | def cumsum(values): 25 | total = values[0] 26 | for i in range(1, len(values)): 27 | total += values[i] 28 | values[i] = total 29 | return values 30 | 31 | bc = [(count, barcode) for barcode,count in cell_barcode_counts.items()] 32 | bc.sort(reverse=True) 33 | counts = [count for count,barcode in bc] 34 | values = cumsum(counts) 35 | prev_idx_of_best_point = 0 36 | dist_to_line, idx_of_best_point = get_knee_distance(values) 37 | 38 | max_iterations = 100 39 | iterations = 0 40 | while idx_of_best_point - prev_idx_of_best_point != 0: 41 | prev_idx_of_best_point = idx_of_best_point 42 | iterations += 1 43 | if iterations >= max_iterations: 44 | break 45 | dist_to_line, idx_of_best_point = get_knee_distance(values[:idx_of_best_point*3]) 46 | 47 | knee_final_barcodes = [x[1] for x in bc[:idx_of_best_point+1]] 48 | return knee_final_barcodes 49 | 50 | def get_error_correct_mapping(cell_barcodes, whitelist): 51 | def neighbors(barcode): 52 | for i in range(len(barcode)): 53 | for b in (s'A', s'C', s'G', s'T'): 54 | if barcode[i] != b: 55 | s = str(barcode) 56 | yield seq(''.join((s[:i], str(b), s[i+1:]))) 57 | 58 | true_to_false = {} 59 | whitelist_set = set(whitelist) 60 | for cell_barcode in cell_barcodes: 61 | if cell_barcode in whitelist_set: 62 | continue 63 | candidates = [neighbor for neighbor in neighbors(cell_barcode) if neighbor in whitelist_set] 64 | if len(candidates) == 1: 65 | true_to_false.setdefault(candidates[0], []).append(cell_barcode) 66 | return true_to_false 67 | 68 | def get_cell_whitelist(cell_barcode_counts): 69 | cell_whitelist = 
get_knee_estimate_distance(cell_barcode_counts) 70 | true_to_false_map = get_error_correct_mapping(cell_barcode_counts.keys(), cell_whitelist) 71 | return cell_whitelist, true_to_false_map 72 | 73 | with timing('whitelist construction'): 74 | cell_barcode_counts = Dict[seq, int]() 75 | FASTQ(argv[1]) |> seqs |> (lambda read: read[:BARCODE_LEN]) |> cell_barcode_counts.increment 76 | cell_whitelist, true_to_false_map = get_cell_whitelist(cell_barcode_counts) 77 | 78 | for barcode in sorted(cell_whitelist): 79 | corrected_barcodes, corrected_barcode_counts = "", "" 80 | if barcode in true_to_false_map: 81 | corrected_barcodes = ",".join([str(k) for k in sorted(true_to_false_map[barcode])]) 82 | corrected_barcode_counts = ",".join([str(cell_barcode_counts[x]) for x in sorted(true_to_false_map[barcode])]) 83 | 84 | print(f'{barcode}\t{corrected_barcodes}\t{cell_barcode_counts[barcode]}\t{corrected_barcode_counts}') 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [Seq logo]
3 | 
4 | 
5 | # Seq — the bioinformatics module for Codon
6 | 7 | ## Introduction 8 | 9 | Seq is a programming language for computational genomics and bioinformatics. With a Python-compatible syntax and a host of domain-specific features and optimizations, Seq makes writing high-performance genomics software as easy as writing Python code, and achieves performance comparable to (and in many cases better than) C/C++. 10 | 11 | Seq is able to outperform Python code by up to 160x. Seq can further beat equivalent C/C++ code by up to 2x without any manual interventions, and also natively supports parallelism out of the box. Implementation details and benchmarks are discussed [in our paper](https://dl.acm.org/citation.cfm?id=3360551). 12 | 13 | Learn more by following the [tutorial](https://docs.seq-lang.org/tutorial) or from the [cookbook](https://docs.seq-lang.org/cookbook). 14 | 15 | ## Examples 16 | Here is an example showcasing some of Seq's bioinformatics features, which include native sequence and k-mer types. 17 | 18 | ```python 19 | from bio import * 20 | s = s'ACGTACGT' # sequence literal 21 | print(s[2:5]) # subsequence 22 | print(~s) # reverse complement 23 | kmer = Kmer[8](s) # convert to k-mer 24 | 25 | # iterate over length-3 subsequences 26 | # with step 2 27 | for sub in s.split(3, step=2): 28 | print(sub[-1]) # last base 29 | 30 | # iterate over 2-mers with step 1 31 | for kmer in sub.kmers(step=1, k=2): 32 | print(~kmer) # '~' also works on k-mers 33 | ``` 34 | 35 | ## Installation 36 | 37 | You need [Codon](https://github.com/exaloop/codon) for Seq to work. Assuming that Codon is installed in `$HOME/.codon`, run: 38 | ```bash 39 | # Download the latest release for your platform from https://github.com/exaloop/seq/releases 40 | tar zxvf seq-[OS]-[ARCH].tar.gz -C ${HOME}/.codon/lib/codon/plugins 41 | ``` 42 | 43 | Afterwards, you can use Seq with `-plugin seq`; for example: 44 | ```bash 45 | codon run -plugin seq test.codon 46 | ``` 47 | 48 | > **Note:** The default aarch64 Linux binaries use NEON emulation. Other builds might also be suboptimal (e.g., using SSE2 instead of SSE4 or AVX). 49 | > You should compile Seq locally to ensure the optimal alignment performance on your hardware. 50 | > 51 | > Please consult [our build file](.github/build-linux/entrypoint.sh) for the build instructions. 52 | 53 | ## Documentation 54 | 55 | Please check [docs.seq-lang.org](https://docs.seq-lang.org) for in-depth documentation. 56 | 57 | ## Citing Seq 58 | 59 | If you use Seq in your research, please cite: 60 | 61 | > Ariya Shajii, Ibrahim Numanagić, Riyadh Baghdadi, Bonnie Berger, and Saman Amarasinghe. 2019. Seq: a high-performance language for bioinformatics. *Proc. ACM Program. Lang.* 3, OOPSLA, Article 125 (October 2019), 29 pages. DOI: https://doi.org/10.1145/3360551 62 | 63 | BibTeX: 64 | 65 | ``` 66 | @article{Shajii:2019:SHL:3366395.3360551, 67 | author = {Shajii, Ariya and Numanagi\'{c}, Ibrahim and Baghdadi, Riyadh and Berger, Bonnie and Amarasinghe, Saman}, 68 | title = {Seq: A High-performance Language for Bioinformatics}, 69 | journal = {Proc. ACM Program. 
Lang.}, 70 | issue_date = {October 2019}, 71 | volume = {3}, 72 | number = {OOPSLA}, 73 | month = oct, 74 | year = {2019}, 75 | issn = {2475-1421}, 76 | pages = {125:1--125:29}, 77 | articleno = {125}, 78 | numpages = {29}, 79 | url = {http://doi.acm.org/10.1145/3360551}, 80 | doi = {10.1145/3360551}, 81 | acmid = {3360551}, 82 | publisher = {ACM}, 83 | address = {New York, NY, USA}, 84 | keywords = {Python, bioinformatics, computational biology, domain-specific language, optimization, programming language}, 85 | } 86 | ``` 87 | -------------------------------------------------------------------------------- /test/pipeline/prefetch.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | class MyIndex[K]: 4 | special: K 5 | getitem_calls: int 6 | prefetch_calls: int 7 | 8 | def __init__(self, special: K): 9 | self.special = special 10 | self.getitem_calls = 0 11 | self.prefetch_calls = 0 12 | 13 | def __getitem__(self, k: K): 14 | self.getitem_calls += 1 15 | return 1 if k == self.special else 0 16 | 17 | def __prefetch__(self, k: K): 18 | self.prefetch_calls += 1 19 | 20 | def lookup1[K](kmer: K, idx: MyIndex[K]): 21 | return (kmer, idx[kmer]) 22 | 23 | @prefetch 24 | def lookup2[K](kmer: K, idx: MyIndex[K]): 25 | return (kmer, idx[kmer]) 26 | 27 | @prefetch 28 | def lookup3[K](kmer: K, idx: MyIndex[K]): 29 | try: 30 | idx[kmer] # issues prefetch 31 | idx[~kmer] # issues prefetch 32 | return (kmer, -99999999) 33 | finally: 34 | idx[kmer] # issues prefetch 35 | idx[~kmer] # issues prefetch 36 | return (kmer, idx[kmer]) 37 | 38 | @prefetch 39 | def lookup4(kmer, idx, v): 40 | v.append((kmer, idx[kmer])) 41 | 42 | @prefetch 43 | def lookup5(kmer, idx, v): 44 | v.append((kmer, idx[kmer])) 45 | return 0 46 | 47 | K: Static[int] = 3 48 | 49 | @test 50 | def test_prefetch_transformation(s): 51 | idx1 = MyIndex(Kmer[K]()) 52 | idx2 = MyIndex(Kmer[K]()) 53 | idx3 = MyIndex(Kmer[K]()) 54 | idx4 = MyIndex(Kmer[K]()) 55 | idx5 = MyIndex(Kmer[K]()) 56 | idx6 = MyIndex(Kmer[K]()) 57 | v1 = [] 58 | v2 = [] 59 | v3 = [] 60 | v4: List[Tuple[Kmer[K], int]] = [] 61 | v5: List[Tuple[Kmer[K], int]] = [] 62 | v6: List[Tuple[Kmer[K], int]] = [] 63 | s = s'ACGTACGTAAAACGTACGTAAAACGTACGT' 64 | 65 | def my_kmers(s): 66 | return s.kmers(1, K) 67 | 68 | s |> kmers(1, K) |> lookup1(idx1) |> v1.append 69 | s |> kmers(1, K) |> lookup2(idx2) |> v2.append 70 | s |> kmers(1, K) |> lookup3(idx3) |> v3.append 71 | my_kmers(s) |> lookup4(idx4, v4) 72 | s |> kmers(1, K) |> lookup5(idx5, v5) 73 | 74 | def pass_prefetch_func_by_arg(s, lookup, idx, v): 75 | s |> kmers(1, K) |> lookup(idx) |> v.append 76 | 77 | pass_prefetch_func_by_arg(s, lookup3, idx6, v6) 78 | 79 | assert set(v1) == set(v2) 80 | assert set(v1) == set(v3) 81 | assert set(v1) == set(v4) 82 | assert set(v1) == set(v5) 83 | assert set(v1) == set(v6) 84 | assert idx1.getitem_calls == len(s) - (K - 1) 85 | assert idx1.prefetch_calls == 0 86 | assert idx1.getitem_calls == idx2.getitem_calls 87 | assert idx3.getitem_calls == 5 * idx1.getitem_calls 88 | assert idx2.prefetch_calls == idx2.getitem_calls 89 | assert idx3.prefetch_calls == 5 * idx2.prefetch_calls 90 | assert idx4.getitem_calls == idx2.getitem_calls 91 | assert idx4.prefetch_calls == idx2.prefetch_calls 92 | assert idx5.getitem_calls == idx4.getitem_calls 93 | assert idx5.prefetch_calls == idx4.prefetch_calls 94 | assert idx6.getitem_calls == idx3.getitem_calls 95 | assert idx6.prefetch_calls == idx3.prefetch_calls 96 | 97 | 
test_prefetch_transformation(s'') 98 | test_prefetch_transformation(s'A') 99 | test_prefetch_transformation(s'ACG') 100 | test_prefetch_transformation(s'ACGTA') 101 | test_prefetch_transformation(s'ACGTACGTAAAACGTACGTAAAACGTACGT') 102 | test_prefetch_transformation(s'ACGTACGTAAAACGTACGTAAAACGTACGTACGTACGTAAAACGTACGTAAAACGTACGTACGTACGTAAAACGTACGTAAAACGTACGTACGTACGTAAAACGTACGTAAAACGTACGT') 103 | 104 | @test 105 | def test_list_prefetch(): 106 | v = [0] 107 | v.prefetch(0) 108 | test_list_prefetch() 109 | 110 | @test 111 | def test_dict_prefetch(): 112 | d = {0:1} 113 | d.prefetch(0) 114 | d.prefetch(42) 115 | test_dict_prefetch() 116 | -------------------------------------------------------------------------------- /test/apps/bwa/fastmap.codon: -------------------------------------------------------------------------------- 1 | from sys import argv, stderr, exit 2 | from time import timing 3 | from bio import FASTQ, FASTA 4 | from bio.fmindex import FMIndex, SMEM 5 | 6 | min_seed = 17 7 | min_iwidth = 20 8 | min_intv = 1 9 | 10 | def open_index_file(basename, mode): 11 | import gzip 12 | return gzip.open(f'{basename}.fastmap.idx', mode) 13 | 14 | def index_load(basename): 15 | from pickle import load 16 | with open_index_file(basename, 'rb') as jar: 17 | return load(jar, FMIndex) 18 | 19 | #@prefetch 20 | def fastmap(rec, fmi, out): 21 | prev, curr, mems = [], [], [] 22 | q = rec.seq 23 | l = len(q) 24 | 25 | start = 0 26 | while True: 27 | while start < l and q[start].N(): 28 | start += 1 29 | if start >= l: 30 | break 31 | 32 | mems.clear() 33 | prev.clear() 34 | curr.clear() 35 | x = start 36 | 37 | if q[x].N(): 38 | return 39 | 40 | ik = SMEM(fmi.biinterval(q[x]), start=x, stop=x+1) 41 | 42 | # forward search 43 | i = x + 1 44 | while i < l: 45 | if not q[i].N(): # an A/C/G/T base 46 | ok = ~fmi[~(ik.interval), ~q[i]] 47 | if len(ok) != len(ik.interval): # change of the interval size 48 | curr.append(ik) 49 | if len(ok) < min_intv: 50 | break # the interval size is too small to be extended further 51 | ik = SMEM(ok, start=x, stop=i+1) 52 | else: # an ambiguous base 53 | curr.append(ik) 54 | break 55 | i += 1 56 | 57 | if i == l: 58 | curr.append(ik) 59 | curr.reverse() 60 | ret = curr[0].stop 61 | prev, curr = curr, prev 62 | 63 | # backward search for MEMs 64 | i = x - 1 65 | while i >= -1: 66 | c = i >= 0 and not q[i].N() 67 | curr.clear() 68 | for p in prev: 69 | ok = None 70 | if c: 71 | ok = fmi[p.interval, q[i]] 72 | if not c or len(ok) < min_intv: 73 | if len(curr) == 0: 74 | if len(mems) == 0 or i + 1 < mems[-1].start: 75 | if len(ik := SMEM(p.interval, start=i+1, stop=p.stop)) >= min_seed: 76 | mems.append(ik) 77 | elif len(curr) == 0 or len(ok) != len(curr[-1].interval): 78 | curr.append(SMEM(ok, start=p.start, stop=p.stop)) 79 | if len(curr) == 0: 80 | break 81 | prev, curr = curr, prev 82 | i -= 1 83 | 84 | mems.reverse() # s.t. 
sorted by the start coordinate 85 | start = ret 86 | 87 | for mem in mems: 88 | intv = mem.interval 89 | offset = mem.start 90 | match_size = len(mem) 91 | 92 | print(f'{rec.name}\tEM\t{offset}\t{offset+match_size}\t{len(intv)}', end='', file=out) 93 | if len(intv) <= min_iwidth: 94 | for rid, name, pos, is_rev in fmi.biresults(mem): 95 | print(f"\t{name}:{'-' if is_rev else '+'}{pos+1}", end='', file=out) 96 | else: 97 | print("\t*", file=out) 98 | print("", file=out) 99 | 100 | def main_index(path): 101 | from pickle import dump 102 | print('building FM-index...', file=stderr) 103 | fmi = FMIndex(path, FMD=True) 104 | print('writing to disk...', file=stderr) 105 | with open_index_file(path, 'wb0') as jar: 106 | dump(fmi, jar) 107 | 108 | def main_search(index, fastq, result): 109 | print('loading index...', file=stderr) 110 | fmi = None 111 | with timing('load'): 112 | fmi = index_load(index) 113 | print('running alignment pipeline...', file=stderr) 114 | with open(result, 'w') as out, timing('fastmap'): 115 | FASTQ(fastq) |> iter |> fastmap(fmi, out) 116 | 117 | match argv[1:]: 118 | case ['index', path]: 119 | main_index(path) 120 | case ['search', index, fastq, result]: 121 | main_search(index, fastq, result) 122 | case _: 123 | print("error: unknown mode: valid modes are 'index' and 'search'", file=stderr) 124 | exit(1) 125 | -------------------------------------------------------------------------------- /test/apps/bwa/fastmap_build.codon: -------------------------------------------------------------------------------- 1 | from sys import argv, stderr, exit 2 | from time import timing 3 | from bio import FASTQ, FASTA 4 | from bio.fmindex import FMIndex, SMEM 5 | 6 | min_seed = 17 7 | min_iwidth = 20 8 | min_intv = 1 9 | 10 | def open_index_file(basename, mode): 11 | import gzip 12 | return gzip.open(f'{basename}.fastmap.idx', mode) 13 | 14 | def index_load(basename): 15 | from pickle import load 16 | with open_index_file(basename, 'rb') as jar: 17 | return load(jar, FMIndex) 18 | 19 | #@prefetch 20 | def fastmap(rec, fmi, out): 21 | prev, curr, mems = [], [], [] 22 | q = rec.seq 23 | l = len(q) 24 | 25 | start = 0 26 | while True: 27 | while start < l and q[start].N(): 28 | start += 1 29 | if start >= l: 30 | break 31 | 32 | mems.clear() 33 | prev.clear() 34 | curr.clear() 35 | x = start 36 | 37 | if q[x].N(): 38 | return 39 | 40 | ik = SMEM(fmi.biinterval(q[x]), start=x, stop=x+1) 41 | 42 | # forward search 43 | i = x + 1 44 | while i < l: 45 | if not q[i].N(): # an A/C/G/T base 46 | ok = ~fmi[~(ik.interval), ~q[i]] 47 | if len(ok) != len(ik.interval): # change of the interval size 48 | curr.append(ik) 49 | if len(ok) < min_intv: 50 | break # the interval size is too small to be extended further 51 | ik = SMEM(ok, start=x, stop=i+1) 52 | else: # an ambiguous base 53 | curr.append(ik) 54 | break 55 | i += 1 56 | 57 | if i == l: 58 | curr.append(ik) 59 | curr.reverse() 60 | ret = curr[0].stop 61 | prev, curr = curr, prev 62 | 63 | # backward search for MEMs 64 | i = x - 1 65 | while i >= -1: 66 | c = i >= 0 and not q[i].N() 67 | curr.clear() 68 | for p in prev: 69 | ok = None 70 | if c: 71 | ok = fmi[p.interval, q[i]] 72 | if not c or len(ok) < min_intv: 73 | if len(curr) == 0: 74 | if len(mems) == 0 or i + 1 < mems[-1].start: 75 | if len(ik := SMEM(p.interval, start=i+1, stop=p.stop)) >= min_seed: 76 | mems.append(ik) 77 | elif len(curr) == 0 or len(ok) != len(curr[-1].interval): 78 | curr.append(SMEM(ok, start=p.start, stop=p.stop)) 79 | if len(curr) == 0: 80 | break 81 | prev, 
curr = curr, prev 82 | i -= 1 83 | 84 | mems.reverse() # s.t. sorted by the start coordinate 85 | start = ret 86 | 87 | for mem in mems: 88 | intv = mem.interval 89 | offset = mem.start 90 | match_size = len(mem) 91 | 92 | print(f'{rec.name}\tEM\t{offset}\t{offset+match_size}\t{len(intv)}', end='', file=out) 93 | if len(intv) <= min_iwidth: 94 | for rid, name, pos, is_rev in fmi.biresults(mem): 95 | print(f"\t{name}:{'-' if is_rev else '+'}{pos+1}", end='', file=out) 96 | else: 97 | print("\t*", file=out) 98 | print("", file=out) 99 | 100 | def main_index(path): 101 | from pickle import dump 102 | print('building FM-index...', file=stderr) 103 | fmi = FMIndex(path, FMD=True) 104 | print('writing to disk...', file=stderr) 105 | with open_index_file(path, 'wb0') as jar: 106 | dump(fmi, jar) 107 | 108 | def main_search(index, fastq, result): 109 | print('loading index...', file=stderr) 110 | fmi = None 111 | with timing('load'): 112 | fmi = FMIndex(index, FMD=True) 113 | print('running alignment pipeline...', file=stderr) 114 | with open(result, 'w') as out, timing('fastmap'): 115 | FASTQ(fastq) |> iter |> fastmap(fmi, out) 116 | 117 | match argv[1:]: 118 | case ['index', path]: 119 | main_index(path) 120 | case ['search', index, fastq, result]: 121 | main_search(index, fastq, result) 122 | case _: 123 | print("error: unknown mode: valid modes are 'index' and 'search'", file=stderr) 124 | exit(1) 125 | -------------------------------------------------------------------------------- /test/apps/snap/hashtable.seq: -------------------------------------------------------------------------------- 1 | # Implementation of SNAP aligner's hash table 2 | # https://github.com/amplab/snap/blob/master/SNAPLib/HashTable.{cpp,h} 3 | 4 | # Need the following hooks linked to convert C++ SNAPHashTable to Seq object: 5 | # snap_hashtable_ptr(Ptr[byte]) -> Ptr[tuple[K,V]] -- extract table pointer 6 | # snap_hashtable_len(Ptr[byte]) -> int -- extract table length 7 | # snap_hashtable_invalid_val(Ptr[byte]) -> V -- extract "invalid" value 8 | 9 | QUADRATIC_CHAINING_DEPTH = 5 10 | 11 | class SNAPHashTable[K,V]: 12 | table: Array[Tuple[V,K]] # this order is consistent with SNAP 13 | invalid_val: V 14 | 15 | def _hash(k): 16 | key = hash(k) 17 | key ^= int(UInt[64](key) >> UInt[64](33)) 18 | key *= 0xff51afd7ed558ccd 19 | key ^= int(UInt[64](key) >> UInt[64](33)) 20 | key *= 0xc4ceb9fe1a85ec53 21 | key ^= int(UInt[64](key) >> UInt[64](33)) 22 | return key 23 | 24 | def __init__(self, size: int, invalid_val: V): 25 | self.table = Array(size) 26 | self.invalid_val = invalid_val 27 | 28 | for i in range(size): 29 | self.table[i] = (invalid_val, K()) 30 | 31 | def __init__(self, p: Ptr[byte]): 32 | from C import snap_hashtable_ptr(Ptr[byte]) -> Ptr[byte] 33 | from C import snap_hashtable_len(Ptr[byte]) -> int 34 | from C import snap_hashtable_invalid_val(Ptr[byte]) -> int 35 | self.table = Array[Tuple[V,K]](Ptr[Tuple[V,K]](snap_hashtable_ptr(p)), snap_hashtable_len(p)) 36 | self.invalid_val = V(snap_hashtable_invalid_val(p)) 37 | 38 | def _get_index(self, where: int): 39 | return int(UInt[64](where) % UInt[64](len(self.table))) 40 | 41 | def _get_entry_index_for_key(self, k: K): 42 | table = self.table 43 | table_size = table.len 44 | table_index = self._get_index(SNAPHashTable[K,V]._hash(k)) 45 | wrapped = False 46 | n_probes = 1 47 | invalid_val = self.invalid_val 48 | 49 | while table[table_index][1] != k and table[table_index][0] != invalid_val: 50 | table_index += (n_probes ** 2) if n_probes < 
QUADRATIC_CHAINING_DEPTH else 1 51 | n_probes += 1 52 | 53 | if table_index >=table_size: 54 | if wrapped: 55 | return -1 56 | 57 | wrapped = True 58 | table_index %= table_size 59 | 60 | return table_index 61 | 62 | def get_value_ptr_for_key(self, k: K): 63 | table = self.table 64 | table_size = table.len 65 | table_index = self._get_index(SNAPHashTable[K,V]._hash(k)) 66 | invalid_val = self.invalid_val 67 | entry = table[table_index] 68 | 69 | if entry[1] == k and entry[0] != invalid_val: 70 | return Ptr[V]((table.ptr + table_index).as_byte()) 71 | else: 72 | n_probes = 0 73 | while True: 74 | n_probes += 1 75 | 76 | if n_probes > table_size + QUADRATIC_CHAINING_DEPTH: 77 | return Ptr[V]() 78 | 79 | diff = (n_probes**2) if n_probes < QUADRATIC_CHAINING_DEPTH else 1 80 | table_index = (table_index + diff) % table_size 81 | 82 | entry = table[table_index] 83 | if not (entry[1] != k and entry[0] != invalid_val): 84 | break 85 | 86 | return Ptr[V]((table.ptr + table_index).as_byte()) 87 | 88 | def __prefetch__(self, k: K): 89 | table = self.table 90 | table_index = self._get_index(SNAPHashTable[K,V]._hash(k)) 91 | (self.table.ptr + table_index).__prefetch_r3__() 92 | 93 | def lookup_slow(self, k: K): 94 | entry = self._get_entry_index_for_key(k) 95 | return self.table[entry][0] if entry >= 0 else self.invalid_val 96 | 97 | def __getitem__(self, k: K): 98 | p = self.get_value_ptr_for_key(k) 99 | return p[0] if p else self.invalid_val 100 | 101 | def __setitem__(self, k: K, v: V): 102 | entry = self._get_entry_index_for_key(k) 103 | if entry >= 0: 104 | self.table[entry] = (v, k) 105 | 106 | def _test(): 107 | h = SNAPHashTable[i32,i32](100, i32(-1)) 108 | for i in range(10): 109 | h[i32(42 + i*100)] = i32(i) 110 | 111 | for i in range(100): 112 | print int(h[i32(42 + i*100)]) 113 | -------------------------------------------------------------------------------- /test/pipeline/interalign.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | Q,T = ['test/data/' + a for a in ('MT-orang.fa','MT-human.fa')] 4 | 5 | def normal_align(query: seq, 6 | target: seq, 7 | a: int = 2, 8 | b: int = 4, 9 | ambig: int = 0, 10 | gapo: int = 4, 11 | gape: int = 2, 12 | bandwidth: int = -1, 13 | zdrop: int = -1, 14 | end_bonus: int = 0, 15 | ext_only: bool = False, 16 | score_only: bool = False, 17 | rev_cigar: bool = False): 18 | return query.align(target, a=a, b=b, ambig=ambig, gapo=gapo, gape=gape, bandwidth=bandwidth, zdrop=zdrop, 19 | end_bonus=end_bonus, ext_only=ext_only, score_only=score_only, rev_cigar=rev_cigar) 20 | 21 | @inter_align 22 | @test 23 | def aln1(t): 24 | query, target = t 25 | score = query.align(target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5, score_only=True).score 26 | score_exp = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5, score_only=True).score 27 | assert score == score_exp 28 | 29 | rev_cigar = True 30 | ext_only = True 31 | 32 | def walk_cigar(query, target, cigar): 33 | a=1 34 | b=2 35 | ambig=0 36 | gapo=2 37 | gape=1 38 | 39 | i, j = 0, 0 40 | score = 0 41 | for n, op in cigar: 42 | match op: 43 | case 'M': 44 | for k in range(n): 45 | q = query[i + k] 46 | t = target[j + k] 47 | if q.N() or t.N(): 48 | score += ambig 49 | elif q.__int__() == t.__int__(): 50 | score += a 51 | else: 52 | score -= b 53 | i += n 54 | j += n 55 | case 'I': 56 | score -= (gapo + n*gape) 57 | i += n 58 | case 'D': 59 | score -= (gapo + n*gape) 60 | 
j += n 61 | case _: 62 | assert False 63 | return score 64 | 65 | @inter_align 66 | @test 67 | def aln2(t): 68 | query, target = t 69 | my_ext_only = ext_only 70 | inter = query.align(target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=0, ext_only=my_ext_only, score_only=False) 71 | intra = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=0, ext_only=my_ext_only, score_only=False) 72 | assert inter.score == intra.score 73 | assert walk_cigar(query, target, inter.cigar) == inter.score 74 | assert walk_cigar(query, target, intra.cigar) == intra.score 75 | assert inter.score == query.align(target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=0, ext_only=my_ext_only).score 76 | 77 | a = 1 78 | b = 2 79 | ambig = 0 80 | gapo = 2 81 | gape = 1 82 | zdrop = 100 83 | bandwidth = 100 84 | end_bonus = 5 85 | 86 | @inter_align 87 | @test 88 | def aln3(t): 89 | query, target = t 90 | if query[0] == target[0]: 91 | score = query.align(target, a=a, b=b, ambig=ambig, gapo=gapo, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 92 | score_exp = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 93 | assert score == score_exp 94 | score2 = query.align(target, a=1, b=2, ambig=0, gapo=2, gape=gape, zdrop=zdrop, bandwidth=bandwidth, end_bonus=end_bonus).score 95 | assert score == score2 96 | 97 | @inter_align 98 | @test 99 | def aln4(t): 100 | # tests intra-alignment demotion 101 | for i in range(2): 102 | query, target = t 103 | query = ~query 104 | target = ~target 105 | score = query.align(target, a=a, b=b, ambig=ambig, gapo=gapo, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 106 | score_exp = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 107 | assert score == score_exp 108 | query = query[:len(query)//2] 109 | target = target[:len(target)//2] 110 | 111 | def subs(path: str, n: int = 20): 112 | for a in seqs(FASTA(path)): 113 | for b in a.split(n, 1): 114 | yield b 115 | 116 | zip(subs(Q), subs(T)) |> aln1 117 | zip(subs(Q), subs(T)) |> aln2 118 | zip(subs(Q), subs(T)) |> aln3 119 | zip(subs(Q, 1024), subs(T, 1024)) |> aln4 120 | -------------------------------------------------------------------------------- /htslib-config.h.cmake: -------------------------------------------------------------------------------- 1 | /* config.h. Generated from config.h.in by configure. */ 2 | /* config.h.in. Generated from configure.ac by autoheader. */ 3 | 4 | /* If you use configure, this file provides #defines reflecting your 5 | configuration choices. If you have not run configure, suitable 6 | conservative defaults will be used. 7 | 8 | Autoheader adds a number of items to this template file that are not 9 | used by HTSlib: STDC_HEADERS and most HAVE_*_H header file defines 10 | are immaterial, as we assume standard ISO C headers and facilities; 11 | the PACKAGE_* defines are unused and are overridden by the more 12 | accurate PACKAGE_VERSION as computed by the Makefile. */ 13 | 14 | /* Define if HTSlib should enable GCS support. */ 15 | /* #undef ENABLE_GCS */ 16 | 17 | /* Define if HTSlib should enable plugins. */ 18 | /* #undef ENABLE_PLUGINS */ 19 | 20 | /* Define if HTSlib should enable S3 support. */ 21 | /* #undef ENABLE_S3 */ 22 | 23 | /* Define if you have the Common Crypto library. */ 24 | /* #undef HAVE_COMMONCRYPTO */ 25 | 26 | /* Define to 1 if you have the `drand48' function. 
*/ 27 | #define HAVE_DRAND48 1 28 | 29 | /* Define if using an external libhtscodecs */ 30 | /* #undef HAVE_EXTERNAL_LIBHTSCODECS */ 31 | 32 | /* Define to 1 if you have the `fdatasync' function. */ 33 | /* #undef HAVE_FDATASYNC */ 34 | 35 | /* Define to 1 if you have the `fsync' function. */ 36 | #define HAVE_FSYNC 1 37 | 38 | /* Define to 1 if you have the `getpagesize' function. */ 39 | #define HAVE_GETPAGESIZE 1 40 | 41 | /* Define to 1 if you have the `gmtime_r' function. */ 42 | #define HAVE_GMTIME_R 1 43 | 44 | /* Define if you have libcrypto-style HMAC(). */ 45 | /* #undef HAVE_HMAC */ 46 | 47 | /* Define to 1 if you have the header file. */ 48 | #define HAVE_INTTYPES_H 1 49 | 50 | /* Define to 1 if you have the `bz2' library (-lbz2). */ 51 | #define HAVE_LIBBZ2 1 52 | 53 | /* Define if libcurl file access is enabled. */ 54 | /* #undef HAVE_LIBCURL */ 55 | 56 | /* Define if libdeflate is available. */ 57 | /* #undef HAVE_LIBDEFLATE */ 58 | 59 | /* Define to 1 if you have the `lzma' library (-llzma). */ 60 | #define HAVE_LIBLZMA 1 61 | 62 | /* Define to 1 if you have the `z' library (-lz). */ 63 | #define HAVE_LIBZ 1 64 | 65 | /* Define to 1 if you have the header file. */ 66 | #define HAVE_LZMA_H 1 67 | 68 | /* Define to 1 if you have a working `mmap' system call. */ 69 | #define HAVE_MMAP 1 70 | 71 | /* Define to 1 if you have the `srand48_deterministic' function. */ 72 | /* #undef HAVE_SRAND48_DETERMINISTIC */ 73 | 74 | /* Define to 1 if you have the header file. */ 75 | #define HAVE_STDINT_H 1 76 | 77 | /* Define to 1 if you have the header file. */ 78 | #define HAVE_STDIO_H 1 79 | 80 | /* Define to 1 if you have the header file. */ 81 | #define HAVE_STDLIB_H 1 82 | 83 | /* Define to 1 if you have the header file. */ 84 | #define HAVE_STRINGS_H 1 85 | 86 | /* Define to 1 if you have the header file. */ 87 | #define HAVE_STRING_H 1 88 | 89 | /* Define to 1 if you have the header file. */ 90 | #define HAVE_SYS_PARAM_H 1 91 | 92 | /* Define to 1 if you have the header file. */ 93 | #define HAVE_SYS_STAT_H 1 94 | 95 | /* Define to 1 if you have the header file. */ 96 | #define HAVE_SYS_TYPES_H 1 97 | 98 | /* Define to 1 if you have the header file. */ 99 | #define HAVE_UNISTD_H 1 100 | 101 | /* Define to the address where bug reports for this package should be sent. */ 102 | #define PACKAGE_BUGREPORT "samtools-help@lists.sourceforge.net" 103 | 104 | /* Define to the full name of this package. */ 105 | #define PACKAGE_NAME "HTSlib" 106 | 107 | /* Define to the full name and version of this package. */ 108 | #define PACKAGE_STRING "HTSlib 1.13-23-g3eada2f" 109 | 110 | /* Define to the one symbol short name of this package. */ 111 | #define PACKAGE_TARNAME "htslib" 112 | 113 | /* Define to the home page for this package. */ 114 | #define PACKAGE_URL "http://www.htslib.org/" 115 | 116 | /* Define to the version of this package. */ 117 | #define PACKAGE_VERSION "1.13-23-g3eada2f" 118 | 119 | /* Platform-dependent plugin filename extension. */ 120 | /* #undef PLUGIN_EXT */ 121 | 122 | /* Define to 1 if all of the C90 standard headers exist (not just the ones 123 | required in a freestanding environment). This macro is provided for 124 | backward compatibility; new code need not use it. */ 125 | #define STDC_HEADERS 1 126 | 127 | /* Number of bits in a file offset, on hosts where this is settable. */ 128 | /* #undef _FILE_OFFSET_BITS */ 129 | 130 | /* Define for large files, on AIX-style hosts. 
*/ 131 | /* #undef _LARGE_FILES */ 132 | 133 | /* Needed for PTHREAD_MUTEX_RECURSIVE */ 134 | /* #undef _XOPEN_SOURCE */ 135 | -------------------------------------------------------------------------------- /stdlib/bio/fastq.codon: -------------------------------------------------------------------------------- 1 | # FASTQ format parser 2 | # https://en.wikipedia.org/wiki/FASTQ_format 3 | from bio.seq import seq 4 | from copy import copy 5 | 6 | @tuple 7 | class FASTQRecord: 8 | _header: str 9 | _read: seq 10 | _qual: str 11 | 12 | @property 13 | def header(self): 14 | return self._header 15 | 16 | @property 17 | def name(self): 18 | from bio.builtin import _split_header_on_space 19 | return _split_header_on_space(self.header)[0] 20 | 21 | @property 22 | def comment(self): 23 | from bio.builtin import _split_header_on_space 24 | return _split_header_on_space(self.header)[1] 25 | 26 | @property 27 | def read(self): 28 | return self._read 29 | 30 | # FASTA compatibility 31 | @property 32 | def seq(self): 33 | return self._read 34 | 35 | @property 36 | def qual(self): 37 | return self._qual 38 | 39 | @tuple 40 | class FASTQReader: 41 | _file: cobj 42 | validate: bool 43 | gzip: bool 44 | copy: bool 45 | 46 | def __new__(path: str, validate: bool, gzip: bool, copy: bool) -> FASTQReader: 47 | return FASTQReader(gzopen(path, "r").__raw__() if gzip else open(path, "r").__raw__(), validate, gzip, copy) 48 | 49 | @property 50 | def file(self): 51 | assert not self.gzip 52 | p = __array__[cobj](1) 53 | p.ptr[0] = self._file 54 | return Ptr[File](p.ptr.as_byte())[0] 55 | 56 | @property 57 | def gzfile(self): 58 | assert self.gzip 59 | p = __array__[cobj](1) 60 | p.ptr[0] = self._file 61 | return Ptr[gzFile](p.ptr.as_byte())[0] 62 | 63 | def _preprocess_read(self, a: str): 64 | from bio.builtin import _validate_str_as_seq 65 | if self.validate: 66 | return _validate_str_as_seq(a, self.copy) 67 | else: 68 | return copy(seq(a.ptr, a.len)) if self.copy else seq(a.ptr, a.len) 69 | 70 | def _preprocess_qual(self, a: str): 71 | from bio.builtin import _validate_str_as_qual 72 | if self.validate: 73 | return _validate_str_as_qual(a, self.copy) 74 | else: 75 | return a.__ptrcopy__() if self.copy else a 76 | 77 | def _iter_core(self, file, seqs: bool) -> Generator[FASTQRecord]: 78 | line = 0 79 | name, read, qual = "", s"", "" 80 | for a in file._iter_trim_newline(): 81 | x = line % 4 82 | if x == 0: 83 | if self.validate and a[0] != "@": 84 | raise ValueError(f"sequence name on line {line + 1} of FASTQ does not begin with '@'") 85 | name = a[1:].__ptrcopy__() if self.copy else a[1:] 86 | elif x == 1: 87 | read = self._preprocess_read(a) 88 | if seqs: 89 | yield FASTQRecord("", read, "") 90 | elif x == 2: 91 | if self.validate and a[0] != "+": 92 | raise ValueError(f"invalid separator on line {line + 1} of FASTQ") 93 | elif x == 3: 94 | if self.validate and len(a) != len(read): 95 | raise ValueError(f"quality and sequence length mismatch on line {line + 1} of FASTQ") 96 | qual = self._preprocess_qual(a) 97 | assert read.len >= 0 98 | if not seqs: 99 | yield FASTQRecord(name, read, qual) 100 | else: 101 | assert False 102 | line += 1 103 | 104 | def __seqs__(self): 105 | if self.gzip: 106 | for rec in self._iter_core(self.gzfile, seqs=True): 107 | yield rec.seq 108 | else: 109 | for rec in self._iter_core(self.file, seqs=True): 110 | yield rec.seq 111 | self.close() 112 | 113 | def __iter__(self): 114 | if not self.copy: 115 | raise ValueError("cannot iterate over FASTQ records with copy=False") 116 | if 
self.gzip: 117 | yield from self._iter_core(self.gzfile, seqs=False) 118 | else: 119 | yield from self._iter_core(self.file, seqs=False) 120 | self.close() 121 | 122 | def __blocks__(self, size: int): 123 | from bio.block import _blocks 124 | if not self.copy: 125 | raise ValueError("cannot read sequences in blocks with copy=False") 126 | return _blocks(self.__iter__(), size) 127 | 128 | def close(self): 129 | if self.gzip: 130 | self.gzfile.close() 131 | else: 132 | self.file.close() 133 | 134 | def __enter__(self): 135 | pass 136 | 137 | def __exit__(self): 138 | self.close() 139 | 140 | def FASTQ(path: str, validate: bool = True, gzip: bool = True, copy: bool = True): 141 | return FASTQReader(path=path, validate=validate, gzip=gzip, copy=copy) 142 | -------------------------------------------------------------------------------- /test/apps/cora/hom_exact.codon: -------------------------------------------------------------------------------- 1 | # Implementation of CORA's exact homology table construction 2 | # https://github.com/denizy/cora/blob/master/homTable_setup.cpp 3 | 4 | # Usage: seqc hom_exact.seq 5 | # Output format (gzip'd): 6 | # - N = total records [i64] 7 | # - N times: 8 | # - block size [i64] 9 | # - C = count [i64] 10 | # - C times: 11 | # - Chromosome index [i64] 12 | # - Chromosome position (0-based) [i64] 13 | # - Reverse complemented? [i8] 14 | 15 | from sys import argv, stderr, exit 16 | from pickle import dump 17 | from bio import FASTA, Locus 18 | import gzip 19 | 20 | K: Static[int] = 64 21 | 22 | @tuple 23 | class BitSet: 24 | v: List[int] 25 | 26 | def __new__(n: int) -> BitSet: 27 | return BitSet([0 for _ in range((n // 64) + 1)],) 28 | def __getitem__(self, idx: int): 29 | return (self.v[idx // 64] & (1 << (idx % 64))) != 0 30 | def __setitem__(self, idx: int, b: bool): 31 | if b: 32 | self.v[idx // 64] |= (1 << (idx % 64)) 33 | else: 34 | self.v[idx // 64] &= ~(1 << (idx % 64)) 35 | 36 | if len(argv) != 3: 37 | print(f'usage: {argv[0]} ', file=stderr) 38 | exit(1) 39 | 40 | path = argv[1] 41 | num_kmers = sum(2 if kmer == ~kmer else 1 # palindromes added twice 42 | for rec in FASTA(path) for kmer in rec.seq.kmers(1, K)) 43 | print 'num_kmers:', num_kmers 44 | kmer_list = List(num_kmers) 45 | 46 | @tuple 47 | class EqClass: 48 | idx: int 49 | count: int 50 | 51 | def __getitem__(self, idx: int): 52 | return kmer_list[self.idx + idx][1] 53 | 54 | bitsets = [] # markers for equivalence class representatives 55 | for tid, rec in enumerate(FASTA(path)): 56 | print 'processing', rec.name 57 | bitsets.append(BitSet(len(rec.seq))) 58 | for pos, kmer in rec.seq.kmers_with_pos(1, K): 59 | kmer_rev = ~kmer 60 | add_pal = (kmer == kmer_rev) 61 | if kmer_rev < kmer: 62 | kmer = kmer_rev 63 | pos = -pos 64 | kmer_list.append((kmer, Locus(tid, pos))) 65 | if add_pal: # add palindrome again 66 | kmer_list.append((kmer, Locus(tid, -pos))) 67 | 68 | print 'sorting kmer_list...' 
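# Sorting on the tuple's first element (the canonical k-mer) brings every
# occurrence of the same k-mer next to each other, so the two scans below can
# first count runs longer than one (num_classes) and then record each run as
# an EqClass(start_index, count). Illustrative example only: after sorting, a
# run like
#   (k'AACG...T', Locus(0, 12)), (k'AACG...T', Locus(2, -7))
# collapses into a single equivalence class with count 2, while singleton
# k-mers are skipped.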
69 | kmer_list.sort() 70 | print 'done' 71 | 72 | num_classes = 0 73 | i = 0 74 | while i < len(kmer_list): 75 | j = i + 1 76 | while j < len(kmer_list) and kmer_list[i][0] == kmer_list[j][0]: 77 | j += 1 78 | if j - i > 1: 79 | num_classes += 1 80 | i = j 81 | 82 | print 'num_classes:', num_classes 83 | eq_set = List(num_classes) 84 | i = 0 85 | while i < len(kmer_list): 86 | j = i + 1 87 | while j < len(kmer_list) and kmer_list[i][0] == kmer_list[j][0]: 88 | j += 1 89 | count = j - i 90 | if count > 1: 91 | eq_set.append(EqClass(i, count)) 92 | # make sure representative is always forward-facing 93 | if kmer_list[i][1].reversed: 94 | for k in range(count): 95 | kmer_list[i + k] = (kmer_list[i + k][0], ~kmer_list[i + k][1]) 96 | bitsets[kmer_list[i][1].tid][kmer_list[i][1].pos] = True 97 | i = j 98 | 99 | print 'sorting eq_set...' 100 | eq_set.sort(key=lambda x: (x[0].tid, x[0].pos)) 101 | print 'done' 102 | 103 | def find_block_size(start: int, eq_set: list[EqClass]): 104 | base_idx = eq_set[start].idx 105 | base_len = eq_set[start].count 106 | dist = 1 107 | while (start + dist < len(eq_set) and 108 | eq_set[start][0].tid == eq_set[start + dist][0].tid and 109 | eq_set[start][0].pos + dist == eq_set[start + dist][0].pos): 110 | comp_len = eq_set[start + dist].count 111 | 112 | if comp_len != base_len: 113 | return dist 114 | 115 | for k in range(1, base_len): 116 | if (eq_set[start][k].reversed != eq_set[start + dist][k].reversed or 117 | eq_set[start][k].tid != eq_set[start + dist][k].tid): 118 | return dist 119 | 120 | offset = -dist if eq_set[start][k].reversed else dist 121 | if eq_set[start][k].pos + offset != eq_set[start + dist][k].pos: 122 | return dist 123 | 124 | dist += 1 125 | return dist 126 | 127 | total = 0 128 | i = 0 129 | while i < len(eq_set): 130 | total += 1 131 | i += find_block_size(i, eq_set) 132 | 133 | with gzip.open(argv[2] + '.hom_exact', 'wb') as out, gzip.open(argv[2] + '.reps_bitsets', 'wb') as reps: 134 | dump(total, out) 135 | i = 0 136 | while i < len(eq_set): 137 | block_size = find_block_size(i, eq_set) 138 | count = eq_set[i].count 139 | dump(block_size, out) 140 | dump(count, out) 141 | for k in range(count): 142 | dump(eq_set[i][k].tid, out) 143 | dump(eq_set[i][k].pos, out) 144 | dump(eq_set[i][k].reversed, out) 145 | i += block_size 146 | dump(bitsets, reps) 147 | -------------------------------------------------------------------------------- /test/core/bwtsa.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | from bio.fmindex import FMIndex, FMDIndex 3 | import gzip 4 | import pickle 5 | 6 | Q,T = ['test/data/' + a for a in ('MT-orang.fa','MT-human.fa')] 7 | 8 | def suffix_array_slow[T](s: T): 9 | return [i for _, i in sorted([(s, i) for i in range(len(s))], key=lambda t: t[0][t[1]:])] 10 | 11 | def bwt_slow[T](s: T): 12 | t = str(s) + '$' 13 | n = len(t) 14 | m = sorted([t[i:n]+t[0:i] for i in range(n)]) 15 | return ''.join([q[-1] for q in m]) 16 | 17 | @test 18 | def test_suffix_array(): 19 | assert len(s''.suffix_array()) == 0 20 | assert s'A'.suffix_array() == [0] 21 | 22 | for s in list(seqs(FASTA(Q))) + list(seqs(FASTA(T))): 23 | SA = s.suffix_array() 24 | assert SA == suffix_array_slow(s) 25 | s = ~s 26 | SA = s.suffix_array() 27 | assert SA == suffix_array_slow(s) 28 | 29 | @test 30 | def test_bwt(): 31 | assert s''.bwt() == s'$' 32 | assert s'A'.bwt() == s'A$' 33 | 34 | for s in list(seqs(FASTA(Q))) + list(seqs(FASTA(T))): 35 | b = str(s.bwt()) 36 | assert b == bwt_slow(s) 
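        # repeat the BWT check on the reverse-complemented sequence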
37 | s = ~s 38 | b = str(s.bwt()) 39 | assert b == bwt_slow(s) 40 | 41 | @test 42 | def test_fmindex(FMD: bool): 43 | # sequence-based 44 | if not FMD: 45 | fmi = FMIndex(s'TAACGAGGCGGCTCGTAGTATAAACGCTTTGGACTAGACTCGATACCTAG') 46 | assert fmi.count(s'TA') == 7 47 | assert fmi.count(s'TAA') == 2 48 | assert fmi.count(s'TATT') == 0 49 | assert sorted(list(fmi[s'TAA'])) == sorted(list(fmi[~s'TTA'])) == [0, 20] 50 | assert len(list(fmi[s'TATT'])) == 0 51 | 52 | # FASTA-based 53 | fmi = FMIndex('test/data/seqs.fasta', FMD=FMD) 54 | with gzip.open('fmi.bin', 'wb') as jar: 55 | pickle.dump(fmi, jar) 56 | 57 | with gzip.open('fmi.bin', 'rb') as jar: 58 | fmi = pickle.load(jar, FMIndex) 59 | 60 | assert fmi.sequence(1, 20, rid=0) == fmi.sequence(1, 20, name='chrA') == s'CCTCCCCGTTCGCTGGACC' 61 | assert fmi.sequence(1, 20, rid=3) == fmi.sequence(1, 20, name='chrD') == s'GCCGTGACCACCCCGCGAG' 62 | assert [(a.tid, a.name, a.len) for a in fmi.contigs()] == [(0, 'chrA', 460), (1, 'chrB', 489), (2, 'chrC', 500), (3, 'chrD', 49)] 63 | if not FMD: 64 | assert fmi.count(s'TATA') == 6 # note TATATA in chrC 65 | assert fmi.count(s'TATAC') == 0 66 | assert sorted(list(fmi.locate(s'TATAA'))) == [(1, 'chrB', 168), (2, 'chrC', 275), (2, 'chrC', 485)] 67 | assert sorted(list(fmi.loci(fmi._get_interval(s'TATAA')))) == [Locus(tid=1, pos=168), Locus(tid=2, pos=275), Locus(tid=2, pos=485)] 68 | 69 | @test 70 | def test_fmdindex(): 71 | # FASTA-based 72 | fmi = FMDIndex('test/data/seqs.fasta') 73 | with gzip.open('fmi.bin', 'wb') as jar: 74 | pickle.dump(fmi, jar) 75 | 76 | with gzip.open('fmi.bin', 'rb') as jar: 77 | fmi = pickle.load(jar, FMDIndex) 78 | 79 | assert fmi.sequence(1, 20, rid=0) == fmi.sequence(1, 20, name='chrA') == s'CCTCCCCGTTCGCTGGACC' 80 | assert fmi.sequence(1, 20, rid=3) == fmi.sequence(1, 20, name='chrD') == s'GCCGTGACCACCCCGCGAG' 81 | assert [(a.tid, a.name, a.len) for a in fmi.contigs()] == [(0, 'chrA', 460), (1, 'chrB', 489), (2, 'chrC', 500), (3, 'chrD', 49)] 82 | assert sorted(list(fmi.locate(s'TATAA'))) == [(1, 'chrB', 168, False), (2, 'chrC', 275, False), (2, 'chrC', 485, False)] 83 | assert sorted(list(fmi.locate(s'CAGGG', both_strands=True))) == [(0, 'chrA', 214, False), (0, 'chrA', 226, False), (0, 'chrA', 338, True), (0, 'chrA', 381, False), (2, 'chrC', 448, False)] 84 | assert sorted(list(fmi.loci(fmi._get_interval(s'CAGGG')))) == [Locus(tid=0, pos=214), Locus(tid=0, pos=226), Locus(tid=0, pos=-338), Locus(tid=0, pos=381), Locus(tid=2, pos=448)] 85 | 86 | @test 87 | def test_smems[FM](fmi: FM, path: str): 88 | # FASTA-based 89 | ref = [rec for rec in FASTA(path)] 90 | with gzip.open('fmi.bin', 'wb') as jar: 91 | pickle.dump(fmi, jar) 92 | 93 | with gzip.open('fmi.bin', 'rb') as jar: 94 | fmi = pickle.load(jar, FM) 95 | 96 | q = s'ACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAA' 97 | v = [[(name, pos, is_rev, ref[rid].seq[pos:pos + len(smem)]) for rid, name, pos, is_rev in fmi.biresults(smem)] for smem in fmi.smems(q, x=20, min_intv=1)[1]] 98 | assert v == [[('chrC', 61, True, s'TATTCATCCTATGTGGGTAATTGAGGAGTATGCTAAGATTTTGCGTAGC'), ('chrC', 10, False, s'GCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATA')]] 99 | 100 | q = s'CTTAA' 101 | v = [[(name, pos, is_rev, ref[rid].seq[pos:pos + len(smem)]) for rid, name, pos, is_rev in fmi.biresults(smem)] for smem in fmi.smems(q, x=1, min_intv=1)[1]] 102 | assert v == [[('chrA', 2, False, s'CTTAA')]] 103 | 104 | test_suffix_array() 105 | test_bwt() 106 | test_fmindex(FMD=True) 107 | test_fmindex(FMD=False) 108 | test_fmdindex() 109 | 
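# The SMEM checks below feed the same FASTA and queries through both index
# flavors: FMIndex(path, FMD=True) and FMDIndex(path) are expected to report
# identical super-maximal exact matches. A minimal usage sketch, mirroring the
# calls made in test_smems above:
#   fmi = FMDIndex('test/data/seqs2.fasta')
#   smems = fmi.smems(s'CTTAA', x=1, min_intv=1)[1]
#   hits = [list(fmi.biresults(m)) for m in smems]  # (rid, name, pos, is_rev) per hit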
110 | path = 'test/data/seqs2.fasta' 111 | test_smems(FMIndex(path, FMD=True), path) 112 | test_smems(FMDIndex(path), path) 113 | -------------------------------------------------------------------------------- /test/core/align.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | Q,T = ['test/data/' + a for a in ('MT-orang.fa','MT-human.fa')] 3 | 4 | @test 5 | def align_test(): 6 | for target in FASTA(Q) |> seqs: 7 | for query in FASTA(T) |> seqs: 8 | a = query @ target 9 | assert abs(a.score) == 3315 # edit distance 10 | assert str(a.cigar) == '8I1M10I1M1I2M4I1M1I1M7I1M3I1M2I1M1I2M3I1M2I2M3I1M2I2M7I3M3I1M1I1M1I10M3I1M2I2M1I2M5I1M2I1M4I7M1I1M1I1M1I1M1I1M5I1M6I3M1I2M2I1M8I1M1I2M2I2M2I1M3I1M3I1M1I1M2I1M4I4M1I1M3I1M1I1M2I1M4I1M13I1M1I2M1I1M1I2M9I1M2I1M10I2M4I2M1I1M1I1M2I1M11I1M2I1M2I1M8I1M7I1M16I1M1I1M6I2M15I2M24I2M10I1M4I2M18I1M4I1M1I1M1I1M11I1M55I1M3I3M1I1M8I2M3I1M5I1M6I2M1I1M5I1M7I3M17I4M3I1M6I3M1I3M4I2M1I2M2I1M2I1M1I2M1I1M4I1M2I1M1I1M1I1M11I1M1I1M3I1M6I3M3I1M1I1M3I1M9I1M1I1M4I1M6I3M2I2M5I2M3I1M7I1M1I7M7I1M1I2M3I2M2I2M2I3M3I1M2I1M1I2M1I1M2I1M5I1M2I1M1I3M1I1M4I2M3I2M2I1M2I1M1I3M7I1M2I163M1D559M1I6M1D550M1I2M1I148M1I3M1D134M1I3M1D47M1D696M1I52M1D7M1D61M1D592M1I3M1D485M1I5M1D1211M1I59M1I156M1I31M1D98M1D18M6D7M7D1M1D1542M1I4M1D70M1I345M1I9M1D397M1D1M6D3M4D1M1D1M2D3M1D1M1D8M2D6M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I1423M1D2M1I46M1I5M1D2137M1I3M1D50M1I3M1D40M1D3M4D57M1D53M3I1M1I19M1D39M15D1M1D1M2D1M5D5M1D1M1D5M1D6M1D7M1D2M1D1M3D1M4D1M4D1M2D3M1D1M1D3M2D1M1D8M4D5M1D5M1D2M3D1M1D2M3D3M1D2M1D3M4D3M1D2M3D6M1D1M1D6M2D5M1D4M8D1M1D1M4D2M1D5M1D1M2D1M3D5M2D2M1D1M3D1M2D5M2D2M1D1M1D2M5D1M3D1M1D10M8D1M1D3M2D3M4D1M6D1M1D1M6D3M2D2M1D5M1D1M2D5M1D4M1D1M1D2M9D1M3D1M9D1M1D1M3D3M1D1M1D1M3D2M5D1M4D1M8D1M1D1M4D3M7D2M1D1M2D1M2D1M1D2M2D12M1D2M4D3M1D2M3D1M1D10M2D3M3D1M2D1M2D2M2D1M4D1M3D3M3D1M2D1M4D1M5D1M7D4M6D1M2D1M1D1M2D1M1D1M3D2M1D2M5D1M2D2M1D1M3D2M1D1M2D1M2D2M1D2M3D6M2D2M1D1M1D1M1D3M1D2M8D2M1D2M2D6M1D6M1D1M3D5M4D2M5D2M1D2M1D3M9D2M2D2M6D1M7D1M1D2M9D1M3D2M4D2M7D10M11D2M2D1M7D1M1D1M1D2M1D2M4D1M1D1M4D2M4D1M1D1M5D2M2D1M2D7M4D1M1D1M1D4M4D4M1D1M1D1M8D10M3D2M2D1M1D1M1D6M3D6M1D3M1D6M1D7M3D1M1D4M2D3M1D17M1D3M' 11 | 12 | # ./ksw2-test test/MT-orang.fa test/MT-human.fa 13 | for target in FASTA(Q) |> seqs: 14 | for query in FASTA(T) |> seqs: 15 | a = query.align(target, a=2, b=4, gapo=4, gape=2, gapo2=13, gape2=1) 16 | assert a.score == 17127 17 | assert str(a.cigar) == '576I14M2I4M3D37M1I85M1I232M1D559M1I6M1D550M1I2M1I146M2D3M1I3M1I132M1I3M1D40M3D13M1I1M1I335M3D4M1I3M2I342M1I52M1D13M3D1M2I52M1D592M1I3M1D485M1I5M1D974M3D4M3I230M1I59M1I156M1I31M1D98M1D26M14D329M3D7M3I1203M1I4M1D70M1I345M1I9M1D398M7D8M8D1M1D9M3D2M1I2M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I157M3I5M3D48M2D1M1D3M3I1203M1D2M2I1M1D44M2I2M1D2M1D38M2I16M2D2081M1I3M1D50M1I3M1D43M5D57M1D54M4I19M1D39M2I8M1D7M1D22M1D5M1D4M1I5M1D2M2I29M2D20M1I13M1I1M2D8M1I45M1I15M3I4M2D17M1I56M1I2M1D131M1D37M474D1M' 18 | 19 | # ./ksw2-test -t exts2_sse test/MT-orang.fa test/MT-human.fa 20 | for target in FASTA(Q) |> seqs: 21 | for query in FASTA(T) |> seqs: 22 | a = query.align(target, a=1, b=2, gapo=2, gape=1, gapo2=32, gape2=4, splice=True, splice_fwd=True) 23 | assert a.score == 9027 24 | assert str(a.cigar) == 
'576I14M2I4M3D37M1I85M1I232M1D559M1I6M1D550M1I2M1I146M2D3M1I3M1I132M1I3M1D40M3D13M1I1M1I335M3D4M1I3M2I342M1I52M1D13M3D1M2I52M1D592M1I3M1D485M1I5M1D974M3D4M3I230M1I59M1I156M1I31M1D98M1D26M14D329M3D7M3I1203M1I4M1D70M1I345M1I9M1D398M7D8M8D1M1D9M3D2M1I2M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I157M3I5M3D48M2D1M1D3M3I1203M1D2M2I1M1D44M2I2M1D2M1D38M2I16M2D2081M1I3M1D50M1I3M1D43M5D57M1D54M4I19M1D39M2I8M1D7M1D22M1D5M1D4M1I5M1D2M2I29M2D20M1I13M1I1M2D8M1I45M1I15M3I4M2D17M1I56M1I2M1D131M1D37M474N1M' 25 | 26 | # ./ksw2-test -t gg2_sse test/MT-orang.fa test/MT-human.fa 27 | for target in FASTA(Q) |> seqs: 28 | for query in FASTA(T) |> seqs: 29 | a = query.align(target, a=2, b=4, gapo=4, gape=2) 30 | assert a.score == 16102 31 | assert str(a.cigar) == '1M155I4M63I5M103I4M56I3M6I4M192I37M1I85M1I232M1D559M1I6M1D550M1I2M1I146M2D3M1I3M1I132M1I3M1D40M3D13M1I1M1I335M3D4M1I3M2I342M1I52M1D13M3D1M2I52M1D592M1I3M1D485M1I5M1D974M3D4M3I230M1I59M1I156M1I31M1D98M1D26M14D329M3D7M3I1203M1I4M1D70M1I345M1I9M1D398M7D8M8D1M1D9M3D2M1I2M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I157M3I5M3D48M2D1M1D3M3I1203M1D2M2I1M1D44M2I2M1D2M1D38M2I16M2D2081M1I3M1D50M1I3M1D43M5D57M1D54M4I19M1D39M2I8M1D7M1D22M1D5M1D4M1I5M1D2M2I29M2D20M1I13M1I1M2D8M1I45M1I15M3I4M2D17M1I56M1I2M1D131M1D37M474D1M' 32 | 33 | @test 34 | def cigar_test(): 35 | def check_cigar(s: str): 36 | return str(CIGAR(s)) == s 37 | 38 | def check_cigar_fail(s: str): 39 | try: 40 | CIGAR(s) 41 | return False 42 | except ValueError: 43 | return True 44 | 45 | assert check_cigar('') 46 | assert check_cigar('1M') 47 | assert check_cigar('11M') 48 | assert check_cigar('3M11I12H111D') 49 | assert check_cigar_fail('3M11I12q111D') 50 | assert check_cigar_fail('3M11I12H111') 51 | assert check_cigar_fail(' ') 52 | assert check_cigar_fail('M') 53 | assert check_cigar_fail('1') 54 | assert check_cigar_fail('M1') 55 | assert check_cigar_fail('MMM') 56 | assert check_cigar_fail('MM1') 57 | assert check_cigar_fail('M1M') 58 | assert check_cigar_fail('M11') 59 | assert check_cigar_fail('1MM') 60 | assert check_cigar_fail('1M1') 61 | assert check_cigar_fail('111') 62 | 63 | assert bool(CIGAR('')) == False 64 | assert bool(CIGAR('1M')) == True 65 | 66 | align_test() 67 | cigar_test() 68 | -------------------------------------------------------------------------------- /test/apps/cora/hom_inexact.codon: -------------------------------------------------------------------------------- 1 | # Implementation of CORA's inexact homology table construction 2 | # https://github.com/denizy/cora/blob/master/homTable_setup.cpp 3 | 4 | # Usage: seqc hom_inexact.seq 5 | # Output format (gzip'd; 100 output files): 6 | # Repeated: 7 | # - Locus 1 [Locus] 8 | # - Locus 2 [Locus] 9 | # - MAX_MISMATCHES times: 10 | # - 0-based error offset, or 255 if none [byte] 11 | 12 | from sys import argv, stderr, exit 13 | from pickle import dump, load 14 | from bio import FASTA, Locus, Kmer 15 | import gzip, itertools 16 | 17 | MAX_MISMATCHES = 3 18 | 19 | K: Static[int] = 64 20 | 21 | @tuple 22 | class BitSet: 23 | v: List[int] 24 | 25 | def __new__(n: int) -> BitSet: 26 | return BitSet([0 for _ in range((n // 64) + 1)],) 27 | def __getitem__(self, idx: int): 28 | return (self.v[idx // 64] & (1 << (idx % 64))) != 0 29 | def __setitem__(self, idx: int, b: bool): 30 | if b: 31 | self.v[idx // 64] |= (1 << (idx % 64)) 32 | else: 33 | self.v[idx // 64] &= ~(1 << (idx % 64)) 34 | 35 | if len(argv) != 4: 36 | print(f'usage: {argv[0]} ', file=stderr) 37 | exit(1) 38 | 39 | ref = [rec for rec in 
FASTA(argv[1])] 40 | num_mismatches = int(argv[2]) 41 | 42 | if not (1 <= num_mismatches <= MAX_MISMATCHES): 43 | print(f'error: num_mismatches argument must be between 1 and {MAX_MISMATCHES}', file=stderr) 44 | exit(1) 45 | 46 | num_seeds = num_mismatches + 1 47 | bitsets = None 48 | with gzip.open(argv[3] + '.reps_bitsets', 'rb') as reps: 49 | bitsets = load(reps, List[BitSet]) 50 | 51 | class MapSegment[K]: 52 | locus: Locus 53 | next: Optional[MapSegment[K]] = None 54 | 55 | @property 56 | def kmer(self): 57 | k = K(ref[self.locus.tid].seq[self.locus.pos:self.locus.pos + K.len()]) 58 | return ~k if self.locus.reversed else k 59 | 60 | def segment_code(seed_no, kmer): 61 | assert 0 <= seed_no < num_seeds 62 | k = len(kmer) 63 | h = 0 64 | for i in range(0, k // 2 - num_seeds + 1, num_seeds): 65 | h <<= 2 66 | h |= int(kmer[i + seed_no].as_int()) 67 | h <<= 2 68 | h |= int(kmer[k - 1 - i - seed_no].as_int()) 69 | return h 70 | 71 | def gen_signals(): 72 | t = (k'A', k'C', k'G', k'T') 73 | for a, b in itertools.product(t, t): 74 | x, y = (a, b), (~b, ~a) 75 | if x <= y: 76 | yield (x, y) 77 | 78 | def extract_signal(seed_no, kmer): 79 | assert 0 <= seed_no < num_seeds 80 | k = len(kmer) 81 | a1 = kmer[seed_no] 82 | a2 = kmer[seed_no + num_seeds] 83 | a3 = kmer[seed_no + 2*num_seeds] 84 | b1 = kmer[k - 1 - 2*num_seeds - seed_no] 85 | b2 = kmer[k - 1 - num_seeds - seed_no] 86 | b3 = kmer[k - 1 - seed_no] 87 | return ((a1, b3), (a2, b2), (a3, b1)) 88 | 89 | def signal_match(signal, target1, target2): 90 | return signal[0] in target1[0] and signal[1] in target1[1] and signal[2] in target2 91 | 92 | def compare_and_report(seg1, seg2, seed_no, precompact): 93 | k1, k2 = seg1.kmer, seg2.kmer 94 | if any(segment_code(i, k1) == segment_code(i, k2) for i in range(seed_no)): 95 | return # homology was reported by an earlier seed match 96 | 97 | hamming_dist = abs(k1 - k2) 98 | if hamming_dist <= num_mismatches: 99 | dump(seg1.locus, precompact) 100 | dump(seg2.locus, precompact) 101 | for i in range(len(k1)): 102 | if k1[i] != k2[i]: 103 | dump(byte(i), precompact) 104 | for _ in range(MAX_MISMATCHES - hamming_dist): 105 | dump(byte(-1), precompact) # pad to MAX_MISMATCHES 106 | 107 | signals1 = [(a,b) for a in gen_signals() for b in gen_signals()] 108 | signals2 = [a for a in gen_signals()] 109 | basename = f'{argv[3]}.hom_inexact.precompact' 110 | 111 | def process_signal(t): 112 | idx, signal1 = t 113 | d = {} 114 | d.resize(10000000) 115 | with gzip.open(basename + '.' 
+ str(idx), 'wb0') as precompact: 116 | for signal2 in signals2: 117 | for seed_no in range(num_seeds): 118 | d.clear() 119 | for tid, rec in enumerate(ref): 120 | for pos, kmer in rec.seq.kmers_with_pos(1, K): 121 | if not (bitsets[tid][pos] and 122 | signal_match(extract_signal(seed_no, kmer), signal1, signal2)): 123 | continue 124 | 125 | kmer_rev = ~kmer 126 | add_pal = (kmer == kmer_rev) 127 | if kmer_rev < kmer: 128 | kmer = kmer_rev 129 | pos = -pos 130 | 131 | s = segment_code(seed_no, kmer) 132 | m: Optional = MapSegment[Kmer[K]](Locus(tid, pos)) 133 | p = d.setdefault(s, m) 134 | 135 | if m is not p: 136 | target = p 137 | while target is not None: 138 | compare_and_report(m, target, seed_no, precompact) 139 | target = target.next 140 | m.next = p 141 | d[s] = m 142 | 143 | if add_pal: 144 | m_rev = MapSegment[Kmer[K]](Locus(tid, -pos)) 145 | target = m.next # don't compare with m, since m == ~m_rev 146 | while target is not None: 147 | compare_and_report(m_rev, target, seed_no, precompact) 148 | target = target.next 149 | m_rev.next = m 150 | d[s] = m_rev 151 | 152 | signals1 |> enumerate ||> process_signal 153 | -------------------------------------------------------------------------------- /test/apps/avid/avid.codon: -------------------------------------------------------------------------------- 1 | import sys 2 | import bio 3 | import bio.fmindex 4 | import itertools 5 | import time 6 | 7 | @tuple 8 | class Segment: 9 | xs: int 10 | ys: int 11 | xe: int 12 | ye: int 13 | aln: bio.Alignment 14 | anchor: bool 15 | 16 | @dataclass(init=False) 17 | class LCPNode: 18 | lcp: int 19 | start: int 20 | end: int 21 | children: List[LCPNode] 22 | data: List[List[int]] 23 | 24 | def __init__(self, lcp, start, end): 25 | self.lcp, self.start, self.end = lcp, start, end 26 | self.children = [] 27 | self.data = [[] for i in range(5)] 28 | 29 | def get_mems(n, s, sa, lcp, min_size, anchors): 30 | """ Find MEMs """ 31 | i, ci = n.start, 0 32 | while i < n.end: 33 | d = [[] for i in range(5)] 34 | if ci < len(n.children) and i == n.children[ci].start: 35 | i = n.children[ci].end 36 | d = n.children[ci].data 37 | # TODO: clear d 38 | ci += 1 39 | else: 40 | d[bio.fmindex._enc(s._at(sa[i] - 1)) if sa[i] else 4].append(i) 41 | i += 1 42 | if n.lcp >= min_size: 43 | for a, ap in itertools.product(range(5), range(5)): 44 | if a == ap: 45 | continue 46 | for posp, pos in itertools.product(d[a], n.data[ap]): 47 | a, b = sa[posp], sa[pos] 48 | if a > b: a, b = b, a 49 | anchors.append((a, b, n.lcp)) 50 | for a in range(5): 51 | n.data[a].extend(d[a]) 52 | 53 | def lcp_bottom_up(self, sa, lcp, min_size, anchors: List[Tuple[int,int,int]]): 54 | """ Reconstruct suffix tree from SA and find all MEMs whose length is >= min_size """ 55 | interval = None 56 | stack = [LCPNode(0, 0, -1)] 57 | for i in range(1, len(lcp)): 58 | start = i - 1 59 | # assert len(stack)>0 60 | while lcp[i] < stack[-1].lcp: 61 | interval = stack.pop() 62 | # assert len(stack)>0 63 | interval.end = i # [start, end) 64 | get_mems(interval, self, sa, lcp, min_size, anchors) 65 | start = interval.start 66 | if lcp[i] <= stack[-1].lcp: 67 | stack[-1].children.append(interval) 68 | interval = None 69 | if lcp[i] > stack[-1].lcp: 70 | stack.append(LCPNode(lcp[i], start, -1)) 71 | if interval: 72 | stack[-1].children.append(interval) 73 | interval = None 74 | 75 | def anchor(x, y, sfxa, lcp, anchors, xs, xe, ys, ye, depth = 0): 76 | # AVID only uses MEMs for sequences >= 4 KB 77 | if xe - xs <= 4 * 1024 and ye - ys <= 4 * 1024: 78 | yield 
Segment(xs, ys, xe, ye, x[xs:xe] @ y[ys:ye], False) 79 | return 80 | 81 | an = [] 82 | max_l = 2 83 | # Find anchor matches 84 | for sa, sb, l in anchors: 85 | sb -= len(x) + 1 86 | if l >= max_l // 2 and xs <= sa and sa + l < xe and ys <= sb and sb + l < ye: 87 | max_l = max(max_l, l) 88 | # TODO SEQ BUG: seq negative index out of range works ... s[-5:1] 89 | aln1 = x[max(0, sa - 10):sa] @ y[max(0, sb - 10):sb] 90 | aln2 = x[sa + l:sa + l + 10] @ y[sb + l:sb + l + 10] 91 | an.append(Segment(sa, sb, sa + l, sb + l, bio.Alignment(f'{l}M', l*10 + aln1.score + aln2.score), True)) 92 | # Use only large anchors 93 | an = [a for a in an if a.xe - a.xs >= max_l // 2] 94 | 95 | # No anchors: low-quality alignment, use gaps to cover it 96 | if not an: 97 | c = (f'{xe - xs}I' if xs < xe else '') + (f'{ye - ys}D' if ys < ye else '') 98 | yield Segment(xs, ys, xe, ye, bio.Alignment(c, -1), False) 99 | return 100 | 101 | # Run LIS on anchor list 102 | an.sort() 103 | best = 0 104 | scores = [(an[0].aln.score, -1)] 105 | for i in range(1, len(an)): 106 | scores.append((-100000000, -1)) 107 | for j in range(0, i): 108 | if an[j].xe <= an[i].xs and an[j].ye <= an[i].ys and scores[j][0] + an[j].aln.score > scores[i][0]: 109 | scores[i] = (scores[j][0] + an[j].aln.score, j) 110 | if scores[i] > scores[best]: 111 | best = i 112 | selected = [] 113 | while best != -1: 114 | selected.append(best) 115 | best = scores[best][1] 116 | 117 | # Recursively align the remaining gaps 118 | px, py = xs, ys 119 | for si in range(len(selected) - 1, -1, -1): 120 | i = selected[si] 121 | if (px, py) != (an[i].xs, an[i].ys): 122 | yield from anchor(x, y, sfxa, lcp, anchors, px, an[i].xs, py, an[i].ys, depth+1) 123 | yield an[i] 124 | # TODO SEQ BUG: px, py = 1, py, 2 works! and gives wrong number! 125 | px, py = an[i].xe, an[i].ye 126 | if (px, py) != (xe, ye): 127 | yield from anchor(x, y, sfxa, lcp, anchors, px, xe, py, ye, depth+1) 128 | 129 | def avid(x, y): 130 | # Construct SA & LCP 131 | t = time.time() 132 | s = bio.seq(f'{x}X{y}') 133 | sfxa = s.suffix_array() 134 | lcp = s.lcp(sfxa) 135 | # print(f'LCP & SA done... {time.time()-t}') 136 | 137 | # Get all MEMs 138 | t = time.time() 139 | anchors = [] 140 | lcp_bottom_up(s, sfxa, lcp, 10, anchors) 141 | # print(f'MEMs done, found {len(anchors)} MEMs... 
{time.time()-t}') 142 | 143 | # Get all anchors 144 | yield from anchor(x, y, sfxa, lcp, anchors, 0, len(x), 0, len(y)) 145 | 146 | # Read sequences 147 | t = time.time() 148 | with open(sys.argv[1]) as fi: 149 | for li, l in enumerate(fi): 150 | print f'{li}', 151 | x, y = l.split() 152 | x, y = bio.seq(x), bio.seq(y) 153 | 154 | # Run AVID & print alignment 155 | anchors = avid(x, y) 156 | mat, mis, ind, nind = 0, 0, 0, 0 157 | xi, yi = 0, 0 158 | for a in anchors: 159 | print a.aln.cigar, 160 | for sz, op in a.aln.cigar: 161 | if op == 'I': 162 | ind += sz; xi += sz; nind += 1 163 | elif op == 'D': 164 | ind += sz; yi += sz; nind += 1 165 | elif op == 'M': 166 | for i in range(sz): 167 | if x[xi + i] == y[yi + i]: 168 | mat += 1 169 | else: 170 | mis += 1 171 | xi += sz; yi += sz 172 | else: 173 | assert False 174 | print f'{mat} {mis} {ind} {nind}' 175 | 176 | print 'AVID is done.', time.time() - t 177 | -------------------------------------------------------------------------------- /stdlib/bio/pseq.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import seq 2 | from bio.kmer import Kmer 3 | 4 | @tuple 5 | class pseq: 6 | ''' 7 | Amino acid sequence 8 | ''' 9 | len: int 10 | ptr: cobj 11 | 12 | def __new__(p: cobj, n: int) -> pseq: 13 | return pseq(n, p) 14 | 15 | def __new__(s: str) -> pseq: 16 | return pseq(s.len, s.ptr) 17 | 18 | def __eq__(self, other: pseq): 19 | n = len(self) 20 | if n != len(other): 21 | return False 22 | i = 0 23 | while i < n: 24 | if self._at(i) != other._at(i): 25 | return False 26 | i += 1 27 | return True 28 | 29 | def __ne__(self, other: pseq): 30 | return not (self == other) 31 | 32 | def _cmp(self, other: pseq): 33 | self_len = len(self) 34 | other_len = len(other) 35 | n = min(self_len, other_len) 36 | i = 0 37 | while i < n: 38 | c1 = self._at(i) 39 | c2 = other._at(i) 40 | if c1 != c2: 41 | return int(c1) - int(c2) 42 | i += 1 43 | return self_len - other_len 44 | 45 | def __lt__(self, other: pseq): 46 | return self._cmp(other) < 0 47 | 48 | def __le__(self, other: pseq): 49 | return self._cmp(other) <= 0 50 | 51 | def __gt__(self, other: pseq): 52 | return self._cmp(other) > 0 53 | 54 | def __ge__(self, other: pseq): 55 | return self._cmp(other) >= 0 56 | 57 | def __str__(self): 58 | return str(self.ptr, self.len) 59 | 60 | def __repr__(self): 61 | return f"p'{self.__str__()}'" 62 | 63 | def __len__(self): 64 | return self.len 65 | 66 | def __bool__(self): 67 | return self.len != 0 68 | 69 | def __hash__(self): 70 | h = 0 71 | for i in range(len(self)): 72 | h = 31*h + int(self._at(i)) 73 | return h 74 | 75 | def __getitem__(self, idx: int): 76 | n = len(self) 77 | if idx < 0: 78 | idx += n 79 | if not (0 <= idx < n): 80 | raise IndexError("pseq index out of range") 81 | return pseq(self.ptr + idx, 1) 82 | 83 | def _at(self, idx: int): 84 | return self.ptr[idx] 85 | 86 | def _slice_direct(self, a: int, b: int): 87 | return pseq(self.ptr + a, b - a) 88 | 89 | def __getitem__(self, s: Slice): 90 | assert s.step is None 91 | if s.start is None and s.stop is None and s.step is None: 92 | return self.__copy__() 93 | elif s.start is None: 94 | b = s.stop.__val__() 95 | n = len(self) 96 | if b < 0: b += n 97 | if b > n: b = n 98 | return pseq(self.ptr, b) 99 | elif s.stop is None: 100 | a = s.start.__val__() 101 | n = len(self) 102 | if a < 0: a += n 103 | if a > n: a = n 104 | return pseq(self.ptr + a, n - a) 105 | else: 106 | a, b = s.start.__val__(), s.stop.__val__() 107 | n = len(self) 108 | if a 
< 0: a += n 109 | if b < 0: b += n 110 | if a > n: a = n 111 | if b > n: b = n 112 | return self._slice_direct(a, b) 113 | 114 | def _copy_to(self, p: cobj): 115 | str.memcpy(p, self.ptr, self.len) 116 | 117 | def __copy__(self): 118 | n = len(self) 119 | p = cobj(n) 120 | self._copy_to(p) 121 | return pseq(p, n) 122 | 123 | def split(self, k: int, step: int): 124 | ''' 125 | Iterator over length-`k` subsequences of the given sequence 126 | with the specified step size. 127 | ''' 128 | i = 0 129 | while i + k <= len(self): 130 | yield self._slice_direct(i,i+k) 131 | i += step 132 | 133 | def __iter__(self): 134 | return self.split(1, 1) 135 | 136 | def __reversed__(self): 137 | i = len(self) - 1 138 | while i >= 0: 139 | yield self._slice_direct(i,i+1) 140 | i -= 1 141 | 142 | def translate(s: seq, table: Optional[Dict[seq, pseq]] = None): 143 | ''' 144 | Performs DNA to amino acid translation. An optional mapping from 145 | length-3 DNA sequences to amino acids can be given via `table`, 146 | otherwise the standard mapping is assumed. 147 | ''' 148 | def encode_triple(s: seq): 149 | if s.N(): 150 | raise ValueError("codon '{s}' contains an ambiguous base") 151 | K1 = Kmer[1] 152 | a, b, c = K1(s[0]), K1(s[1]), K1(s[2]) 153 | n = (int(c.as_int()) | 154 | (int(b.as_int()) << 2) | 155 | (int(a.as_int()) << 4)) 156 | return n 157 | 158 | def translate_encoded(n: int): 159 | # Note(!) this table must be consistent with k-mer encoding 160 | table = 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF' 161 | return table.ptr[n] 162 | 163 | if table is not None: 164 | for k,v in table.items(): 165 | if len(k) != 3: 166 | raise ValueError("translation table key does not have length 3") 167 | if k.N(): 168 | raise ValueError("ambiguous base in translation table key '{k}'") 169 | if len(v) != 1: 170 | raise ValueError("translation table value does not have length 1") 171 | 172 | n = len(s) 173 | m = n // 3 174 | p = cobj(m) 175 | i = 0 176 | j = 0 177 | while i < n: 178 | codon = s._slice_direct(i, i + 3) 179 | if table is None: 180 | p[j] = translate_encoded(encode_triple(codon)) 181 | else: 182 | p[j] = table.get(codon, p'X').ptr[0] 183 | i += 3 184 | j += 1 185 | return pseq(p, m) 186 | 187 | @extend 188 | class seq: 189 | def translate(self): 190 | return translate(self) 191 | 192 | @extend 193 | class str: 194 | def __prefix_p__(s: str, N: Static[int] = 0) -> pseq: 195 | return pseq(s) 196 | -------------------------------------------------------------------------------- /sw/ksw2_gg2_sse.cpp: -------------------------------------------------------------------------------- 1 | #include "ksw2.h" 2 | #include 3 | 4 | #if defined(__ARM_NEON__) 5 | #define __SSE2__ 6 | #define __SSE4_1__ 7 | #elif defined(__aarch64__) 8 | #define __SSE2__ // SIMDE emulation 9 | #endif 10 | 11 | #define SIMDE_ENABLE_NATIVE_ALIASES 12 | 13 | #ifdef __SSE2__ 14 | #include 15 | 16 | #ifdef KSW_SSE2_ONLY 17 | #undef __SSE4_1__ 18 | #endif 19 | 20 | #ifdef __SSE4_1__ 21 | #include 22 | #endif 23 | 24 | int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, 25 | const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, 26 | int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_) { 27 | int r, t, n_col, n_col_, *off, tlen_, last_st, last_en, H0 = 0, last_H0_t = 0; 28 | uint8_t *qr, *mem, *mem2; 29 | __m128i *u, *v, *x, *y, *s, *p; 30 | __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_; 31 | 32 | zero_ = _mm_set1_epi8(0); 33 | q_ = _mm_set1_epi8(q); 34 | qe2_ = _mm_set1_epi8((q 
+ e) * 2); 35 | flag1_ = _mm_set1_epi8(1); 36 | flag2_ = _mm_set1_epi8(2); 37 | flag8_ = _mm_set1_epi8(0x08); 38 | flag16_ = _mm_set1_epi8(0x10); 39 | 40 | if (w < 0) 41 | w = tlen > qlen ? tlen : qlen; 42 | n_col = w + 1 < tlen ? w + 1 : tlen; // number of columns in the backtrack matrix 43 | tlen_ = (tlen + 15) / 16; 44 | n_col_ = (n_col + 15) / 16 + 1; 45 | n_col = n_col_ * 16; 46 | 47 | mem = (uint8_t *)kcalloc(km, tlen_ * 5 + 1, 16); 48 | u = (__m128i *)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned 49 | v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_; 50 | qr = (uint8_t *)kcalloc(km, qlen, 1); 51 | mem2 = (uint8_t *)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16); 52 | p = (__m128i *)(((size_t)mem2 + 15) >> 4 << 4); 53 | off = (int *)kmalloc(km, (qlen + tlen - 1) * sizeof(int)); 54 | 55 | for (t = 0; t < qlen; ++t) 56 | qr[t] = query[qlen - 1 - t]; 57 | 58 | for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { 59 | int st = 0, en = tlen - 1, st0, en0, st_, en_; 60 | int8_t x1, v1; 61 | __m128i x1_, v1_, *pr; 62 | // find the boundaries 63 | if (st < r - qlen + 1) 64 | st = r - qlen + 1; 65 | if (en > r) 66 | en = r; 67 | if (st < ((r - w + 1) >> 1)) 68 | st = (r - w + 1) >> 1; // take the ceil 69 | if (en > (r + w) >> 1) 70 | en = (r + w) >> 1; // take the floor 71 | st0 = st, en0 = en; 72 | st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; 73 | off[r] = st; 74 | // set boundary conditions 75 | if (st > 0) { 76 | if (st - 1 >= last_st && st - 1 <= last_en) 77 | x1 = ((uint8_t *)x)[st - 1], 78 | v1 = ((uint8_t *)v)[st - 1]; // (r-1,s-1) calculated in the last round 79 | else 80 | x1 = v1 = 0; // not calculated; set to zeros 81 | } else 82 | x1 = 0, v1 = r ? q : 0; 83 | if (en >= r) 84 | ((uint8_t *)y)[r] = 0, ((uint8_t *)u)[r] = r ? q : 0; 85 | // loop fission: set scores first 86 | for (t = st0; t <= en0; ++t) 87 | ((uint8_t *)s)[t] = mat[target[t] * m + qr[t + qlen - 1 - r]]; 88 | // core loop 89 | x1_ = _mm_cvtsi32_si128(x1); 90 | v1_ = _mm_cvtsi32_si128(v1); 91 | st_ = st >> 4, en_ = en >> 4; 92 | pr = p + r * n_col_ - st_; 93 | for (t = st_; t <= en_; ++t) { 94 | __m128i d, z, a, b, xt1, vt1, ut, tmp; 95 | 96 | z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); 97 | 98 | xt1 = _mm_load_si128(&x[t]); // xt1 <- x[r-1][t..t+15] 99 | tmp = _mm_srli_si128(xt1, 15); // tmp <- x[r-1][t+15] 100 | xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), 101 | x1_); // xt1 <- x[r-1][t-1..t+14] 102 | x1_ = tmp; 103 | vt1 = _mm_load_si128(&v[t]); // vt1 <- v[r-1][t..t+15] 104 | tmp = _mm_srli_si128(vt1, 15); // tmp <- v[r-1][t+15] 105 | vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), 106 | v1_); // vt1 <- v[r-1][t-1..t+14] 107 | v1_ = tmp; 108 | a = _mm_add_epi8(xt1, 109 | vt1); // a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] 110 | 111 | ut = _mm_load_si128(&u[t]); // ut <- u[t..t+15] 112 | b = _mm_add_epi8(_mm_load_si128(&y[t]), 113 | ut); // b <- y[r-1][t..t+15] + u[r-1][t..t+15] 114 | 115 | d = _mm_and_si128(_mm_cmpgt_epi8(a, z), 116 | flag1_); // d = a > z? 1 : 0 117 | #ifdef __SSE4_1__ 118 | z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) 119 | tmp = _mm_cmpgt_epi8(b, z); 120 | d = _mm_blendv_epi8(d, flag2_, 121 | tmp); // d = b > z? 2 : d 122 | #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and 123 | // _mm_blendv_epi8() 124 | z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? 
z : 0; 125 | z = _mm_max_epu8(z, 126 | a); // z = max(z, a); this works because both are non-negative 127 | tmp = _mm_cmpgt_epi8(b, z); 128 | d = _mm_or_si128(_mm_andnot_si128(tmp, d), 129 | _mm_and_si128(tmp, 130 | flag2_)); // d = b > z? 2 : d; emulating blendv 131 | #endif 132 | z = _mm_max_epu8(z, 133 | b); // z = max(z, b); this works because both are non-negative 134 | _mm_store_si128(&u[t], 135 | _mm_sub_epi8(z, 136 | vt1)); // u[r][t..t+15] <- z - v[r-1][t-1..t+14] 137 | _mm_store_si128(&v[t], _mm_sub_epi8(z, 138 | ut)); // v[r][t..t+15] <- z - u[r-1][t..t+15] 139 | 140 | z = _mm_sub_epi8(z, q_); 141 | a = _mm_sub_epi8(a, z); 142 | b = _mm_sub_epi8(b, z); 143 | tmp = _mm_cmpgt_epi8(a, zero_); 144 | d = _mm_or_si128(d, _mm_and_si128(flag8_, tmp)); 145 | _mm_store_si128(&x[t], _mm_and_si128(a, tmp)); 146 | tmp = _mm_cmpgt_epi8(b, zero_); 147 | d = _mm_or_si128(d, _mm_and_si128(flag16_, tmp)); 148 | _mm_store_si128(&y[t], _mm_and_si128(b, tmp)); 149 | _mm_store_si128(&pr[t], d); 150 | } 151 | if (r > 0) { 152 | if (last_H0_t >= st0 && last_H0_t <= en0) 153 | H0 += ((uint8_t *)v)[last_H0_t] - (q + e); 154 | else 155 | ++last_H0_t, H0 += ((uint8_t *)u)[last_H0_t] - (q + e); 156 | } else 157 | H0 = ((uint8_t *)v)[0] - 2 * (q + e), last_H0_t = 0; 158 | last_st = st, last_en = en; 159 | } 160 | kfree(km, mem); 161 | kfree(km, qr); 162 | ksw_backtrack(km, 1, 0, 0, (uint8_t *)p, off, 0, n_col, tlen - 1, qlen - 1, m_cigar_, 163 | n_cigar_, cigar_); 164 | kfree(km, mem2); 165 | kfree(km, off); 166 | return H0; 167 | } 168 | #endif // __SSE2__ 169 | -------------------------------------------------------------------------------- /test/core/proteins.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | table = {s'ATA': p'I', s'ATC': p'I', s'ATT': p'I', s'ATG': p'M', 4 | s'ACA': p'T', s'ACC': p'T', s'ACG': p'T', s'ACT': p'T', 5 | s'AAC': p'N', s'AAT': p'N', s'AAA': p'K', s'AAG': p'K', 6 | s'AGC': p'S', s'AGT': p'S', s'AGA': p'R', s'AGG': p'R', 7 | s'CTA': p'L', s'CTC': p'L', s'CTG': p'L', s'CTT': p'L', 8 | s'CCA': p'P', s'CCC': p'P', s'CCG': p'P', s'CCT': p'P', 9 | s'CAC': p'H', s'CAT': p'H', s'CAA': p'Q', s'CAG': p'Q', 10 | s'CGA': p'R', s'CGC': p'R', s'CGG': p'R', s'CGT': p'R', 11 | s'GTA': p'V', s'GTC': p'V', s'GTG': p'V', s'GTT': p'V', 12 | s'GCA': p'A', s'GCC': p'A', s'GCG': p'A', s'GCT': p'A', 13 | s'GAC': p'D', s'GAT': p'D', s'GAA': p'E', s'GAG': p'E', 14 | s'GGA': p'G', s'GGC': p'G', s'GGG': p'G', s'GGT': p'G', 15 | s'TCA': p'S', s'TCC': p'S', s'TCG': p'S', s'TCT': p'S', 16 | s'TTC': p'F', s'TTT': p'F', s'TTA': p'L', s'TTG': p'L', 17 | s'TAC': p'Y', s'TAT': p'Y', s'TAA': p'X', s'TGC': p'C', 18 | s'TGT': p'C', s'TGA': p'X', s'TGG': p'W'} 19 | dna = s'ACCATGACAACGATCAACATAAGGCCTACTAGCAAGAGACATAATATTCTGCTACTCCACAAACCGAGTCCACAACCCTATGGTTGTCGACAGCGCGATCGGCTTTGCGGGTAGGGATAAGGCTACGAGTCGTTTGACCGTGAATCAGCAGTAGCCGTCGCGGTGTTCGTTGCTTTATGATTGTCCTGGTCT' 20 | print dna |> translate # EXPECT: TMTTINIRPTSKRHNILLLHKPSPQPYGCRQRDRLCG*G*GYESFDRESAVAVAVFVAL*LSWS 21 | protein = dna |> translate(table=table) 22 | print protein # EXPECT: TMTTINIRPTSKRHNILLLHKPSPQPYGCRQRDRLCGXGXGYESFDRESAVAVAVFVALXLSWS 23 | protein = ~dna |> translate(table=table) 24 | print protein # EXPECT: RPGQSXSNEHRDGYCXFTVKRLVALSLPAKPIALSTTIGLWTRFVEXQNIMSLASRPYVDRCHG 25 | 26 | print protein[0], protein[-1] # EXPECT: R G 27 | print protein[1:5] # EXPECT: PGQS 28 | print protein[:-20] # EXPECT: RPGQSXSNEHRDGYCXFTVKRLVALSLPAKPIALSTTIGLWTRF 29 | print protein[12:] # EXPECT: 
GYCXFTVKRLVALSLPAKPIALSTTIGLWTRFVEXQNIMSLASRPYVDRCHG 30 | print protein[:] # EXPECT: RPGQSXSNEHRDGYCXFTVKRLVALSLPAKPIALSTTIGLWTRFVEXQNIMSLASRPYVDRCHG 31 | 32 | p1 = p'HEAGAWGHEE' 33 | p2 = p'HPAWHEAE' 34 | 35 | print p1 @ p2 # EXPECT: Alignment('1M2I7M', -5) 36 | 37 | pam90 = {('B', 'N'): 4, ('G', 'G'): 5, ('K', 'G'): -4, ('S', 'E'): -2, ('Y', 'E'): -6, 38 | ('W', 'R'): 0, ('V', 'M'): 1, ('N', 'R'): -2, ('W', 'Q'): -7, ('L', 'Q'): -3, 39 | ('V', 'N'): -4, ('F', 'K'): -8, ('G', 'E'): -1, ('S', 'L'): -5, ('P', 'R'): -1, 40 | ('E', 'D'): 4, ('Y', 'G'): -8, ('W', 'P'): -8, ('Q', 'A'): -2, ('G', 'D'): -1, 41 | ('K', 'D'): -2, ('T', 'N'): 0, ('W', 'W'): 13, ('L', 'D'): -7, ('S', 'S'): 4, 42 | ('K', 'C'): -8, ('S', 'A'): 1, ('Y', 'I'): -3, ('V', 'I'): 3, ('Q', 'C'): -8, 43 | ('Z', 'P'): -2, ('T', 'G'): -2, ('B', 'P'): -3, ('T', 'L'): -3, ('Z', 'F'): -8, 44 | ('F', 'G'): -6, ('Z', 'Q'): 5, ('V', 'T'): -1, ('S', 'H'): -3, ('B', 'Q'): 0, 45 | ('I', 'Q'): -4, ('Y', 'K'): -6, ('W', 'T'): -7, ('P', 'D'): -4, ('I', 'C'): -3, 46 | ('K', 'R'): 2, ('Z', 'R'): -1, ('T', 'E'): -2, ('B', 'R'): -3, ('Q', 'R'): 0, 47 | ('K', 'Q'): -1, ('Z', 'S'): -2, ('B', 'S'): 0, ('Y', 'M'): -6, ('V', 'E'): -3, 48 | ('Z', 'T'): -2, ('Y', 'D'): -6, ('V', 'W'): -9, ('T', 'C'): -4, ('B', 'T'): -1, 49 | ('T', 'H'): -3, ('F', 'Q'): -7, ('L', 'I'): 1, ('M', 'Q'): -2, ('R', 'A'): -4, 50 | ('C', 'D'): -8, ('V', 'F'): -4, ('F', 'C'): -7, ('C', 'R'): -5, ('D', 'D'): 6, 51 | ('V', 'P'): -3, ('S', 'D'): -1, ('P', 'C'): -5, ('F', 'R'): -6, ('C', 'C'): 9, 52 | ('I', 'G'): -5, ('W', 'K'): -6, ('I', 'N'): -3, ('Z', 'V'): -3, ('T', 'A'): 1, 53 | ('B', 'V'): -4, ('K', 'L'): -5, ('L', 'G'): -6, ('F', 'A'): -5, ('Z', 'W'): -8, 54 | ('S', 'K'): -1, ('B', 'W'): -7, ('K', 'K'): 5, ('E', 'N'): 0, ('Y', 'Q'): -6, 55 | ('V', 'A'): 0, ('W', 'I'): -8, ('V', 'S'): -3, ('T', 'T'): 5, ('F', 'M'): -1, 56 | ('L', 'E'): -5, ('M', 'M'): 9, ('W', 'H'): -4, ('S', 'R'): -1, ('P', 'Q'): -1, 57 | ('P', 'N'): -2, ('B', 'Y'): -4, ('H', 'A'): -4, ('P', 'G'): -3, ('F', 'N'): -5, 58 | ('H', 'N'): 2, ('P', 'K'): -3, ('T', 'M'): -2, ('K', 'H'): -2, ('T', 'R'): -3, 59 | ('L', 'C'): -9, ('W', 'N'): -5, ('E', 'Q'): 2, ('S', 'G'): 0, ('Z', 'H'): 1, 60 | ('Y', 'S'): -4, ('G', 'R'): -5, ('W', 'M'): -7, ('F', 'D'): -8, ('T', 'K'): -1, 61 | ('C', 'N'): -6, ('T', 'P'): -1, ('V', 'L'): 0, ('F', 'I'): 0, ('G', 'Q'): -3, 62 | ('L', 'A'): -3, ('M', 'I'): 1, ('W', 'L'): -3, ('S', 'N'): 1, ('I', 'R'): -3, 63 | ('H', 'E'): -1, ('Y', 'W'): -2, ('I', 'D'): -4, ('W', 'C'): -10, ('N', 'A'): -1, 64 | ('T', 'I'): 0, ('Z', 'K'): -1, ('Q', 'N'): -1, ('M', 'K'): 0, ('K', 'E'): -2, 65 | ('S', 'C'): -1, ('Z', 'L'): -4, ('Y', 'Y'): 9, ('V', 'Y'): -4, ('W', 'A'): -8, 66 | ('Y', 'F'): 4, ('Z', 'M'): -3, ('M', 'R'): -2, ('V', 'H'): -4, ('F', 'E'): -8, 67 | ('M', 'E'): -4, ('H', 'R'): 1, ('P', 'P'): 7, ('P', 'I'): -4, ('Q', 'Q'): 6, 68 | ('P', 'F'): -6, ('B', 'A'): -1, ('Z', 'N'): 0, ('I', 'A'): -2, ('F', 'F'): 8, 69 | ('I', 'H'): -5, ('W', 'G'): -9, ('Y', 'H'): -1, ('B', 'B'): 4, ('M', 'L'): 2, 70 | ('M', 'G'): -5, ('S', 'Q'): -2, ('W', 'F'): -2, ('D', 'A'): -1, ('K', 'A'): -3, 71 | ('N', 'N'): 5, ('B', 'C'): -7, ('V', 'K'): -5, ('W', 'E'): -10, ('L', 'R'): -5, 72 | ('T', 'S'): 2, ('B', 'D'): 5, ('Z', 'A'): -1, ('M', 'N'): -4, ('V', 'D'): -4, 73 | ('Q', 'D'): 0, ('M', 'A'): -2, ('V', 'V'): 6, ('W', 'D'): -9, ('S', 'F'): -4, 74 | ('D', 'N'): 3, ('P', 'M'): -4, ('H', 'D'): -1, ('B', 'E'): 2, ('Z', 'B'): 2, 75 | ('I', 'E'): -3, ('R', 'R'): 7, ('K', 'N'): 1, ('Y', 'L'): -3, ('T', 'Q'): -3, 76 | ('E', 'C'): 
-8, ('B', 'F'): -6, ('Z', 'C'): -8, ('M', 'H'): -5, ('M', 'C'): -8, 77 | ('S', 'M'): -3, ('E', 'R'): -4, ('E', 'E'): 6, ('B', 'G'): -1, ('Z', 'D'): 3, 78 | ('V', 'G'): -3, ('G', 'N'): -1, ('A', 'A'): 4, ('V', 'Q'): -4, ('L', 'N'): -4, 79 | ('Y', 'N'): -2, ('B', 'H'): 1, ('Z', 'E'): 5, ('V', 'R'): -4, ('P', 'H'): -2, 80 | ('H', 'C'): -5, ('P', 'A'): 0, ('F', 'L'): 0, ('H', 'H'): 8, ('B', 'I'): -3, 81 | ('C', 'A'): -3, ('I', 'I'): 6, ('T', 'F'): -5, ('L', 'L'): 6, ('Y', 'P'): -8, 82 | ('Z', 'G'): -2, ('D', 'R'): -5, ('M', 'D'): -5, ('G', 'C'): -5, ('S', 'I'): -3, 83 | ('Y', 'A'): -5, ('E', 'A'): 0, ('K', 'I'): -3, ('B', 'K'): 0, ('V', 'C'): -3, 84 | ('T', 'D'): -2, ('Y', 'R'): -6, ('B', 'L'): -5, ('Z', 'Y'): -6, ('G', 'A'): 0, 85 | ('S', 'P'): 0, ('Z', 'I'): -3, ('H', 'Q'): 2, ('Y', 'C'): -1, ('P', 'L'): -4, 86 | ('H', 'G'): -5, ('P', 'E'): -2, ('F', 'H'): -3, ('B', 'M'): -5, ('Z', 'Z'): 5, 87 | ('W', 'S'): -3, ('L', 'H'): -3, ('Y', 'T'): -4} 88 | 89 | print p1.align(p2, mat=SubMat(pam90)) # EXPECT: Alignment('1M2I7M', 4) 90 | print p1.align(p2, gapo=4, gape=2, gapo2=13, gape2=1, mat=SubMat(pam90)) # EXPECT: Alignment('1M2I7M', 4) 91 | print p1.align(p2, gapo=4, gape=2, ext_only=True, mat=SubMat(pam90)) # EXPECT: Alignment('3M', 7) 92 | print p1, p2 # EXPECT: HEAGAWGHEE HPAWHEAE 93 | -------------------------------------------------------------------------------- /stdlib/bio/builtin.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import seq 2 | from bio.kmer import Kmer 3 | 4 | @__attribute__ 5 | def prefetch(): 6 | pass 7 | 8 | @__attribute__ 9 | def inter_align(): 10 | pass 11 | 12 | def seqs(x): 13 | ''' 14 | Returns an iterator over sequences from the specified 15 | object by invoking the `__seqs__` magic method. 16 | 17 | `__seqs__` is defined for most common formats, like 18 | FASTA, FASTQ, SAM and BAM. 19 | ''' 20 | return x.__seqs__() 21 | 22 | def split(self: seq, k: int, step: int): 23 | ''' 24 | Iterator over length-`k` subsequences of the given sequence 25 | with the specified step size. 26 | ''' 27 | return self.split(k, step) 28 | 29 | def kmers(self: seq, step: int, k: Static[int]): 30 | ''' 31 | Iterator over k-mers (size `k`) of the given sequence 32 | with the specified step size. Note that k-mers spanning 33 | ambiguous bases will be skipped. 34 | ''' 35 | return self.kmers(step, k) 36 | 37 | def kmers_with_pos(self: seq, step: int, k: Static[int]): 38 | ''' 39 | Iterator over (0-based index, k-mer) tuples of the given 40 | sequence with the specified step size. Note that k-mers 41 | spanning ambiguous bases will be skipped. 42 | ''' 43 | return self.kmers_with_pos(step, k) 44 | 45 | def revcomp(s): 46 | ''' 47 | Returns the reverse complement of the argument sequence or k-mer. 48 | ''' 49 | return ~s 50 | 51 | def revcomp_with_pos(t): 52 | ''' 53 | Returns the reverse complement of the argument sequence or k-mer, 54 | where the argument also contains a position (e.g. as yielded by 55 | `kmers_with_pos`). 56 | ''' 57 | return (t[0], ~t[1]) 58 | 59 | def _kmers_revcomp_with_pos[K](self: seq, step: int): 60 | return self._kmers_revcomp_with_pos(step, K.k) 61 | 62 | def _kmers_revcomp[K](self: seq, step: int): 63 | return self._kmers_revcomp(step, K.k) 64 | 65 | def canonical(k): 66 | ''' 67 | Returns the minimum of a sequence / k-mer and its reverse complement. 
    '''
68 | ''' 69 | kr = ~k 70 | return k if k < kr else kr 71 | 72 | def canonical_with_pos(t): 73 | ''' 74 | Returns the minimum of a sequence / k-mer and its reverse complement, 75 | where the argument also contains a position (e.g. as yielded by 76 | `kmers_with_pos`). 77 | ''' 78 | return (t[0], canonical(t[1])) 79 | 80 | def _kmers_canonical[K](self: seq): 81 | return self.kmers_canonical(K.k) 82 | 83 | def _kmers_canonical_with_pos[K](self: seq): 84 | return self.kmers_canonical_with_pos(K.k) 85 | 86 | def base[K,T](kmer: K, idx: int, b: T): 87 | ''' 88 | Returns a new k-mer equal to `K` but with the base at index `idx` set to `b` 89 | ''' 90 | U = type(kmer.as_int()) 91 | if idx < 0: 92 | idx += len(kmer) 93 | idx = K.len() - idx - 1 94 | n = U(int(Kmer[1](b).as_int())) 95 | k = kmer.as_int() & ~(U(3) << U(2*idx)) 96 | k |= n << U(2*idx) 97 | return K(k) 98 | 99 | @__force__ 100 | def _is_iupac_nt(b: byte) -> bool: 101 | iupac = ('\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 102 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 103 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01\x00' 104 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 105 | '\x00\x01\x01\x01\x01\x00\x00\x01\x01\x00\x00\x01\x00\x01\x01\x00' 106 | '\x00\x00\x01\x01\x01\x01\x01\x01\x00\x01\x00\x00\x00\x00\x00\x00' 107 | '\x00\x01\x01\x01\x01\x00\x00\x01\x01\x00\x00\x01\x00\x01\x01\x00' 108 | '\x00\x00\x01\x01\x01\x01\x01\x01\x00\x01\x00\x00\x00\x00\x00\x00' 109 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 110 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 111 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 112 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 113 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 114 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 115 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 116 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') 117 | return bool(iupac.ptr[int(b)]) 118 | 119 | @__force__ 120 | def _is_iupac_aa(b: byte) -> bool: 121 | iupac = ('\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 122 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 123 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 124 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 125 | '\x00\x01\x00\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x00' 126 | '\x01\x01\x01\x01\x01\x00\x01\x01\x00\x01\x00\x00\x00\x00\x00\x00' 127 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 128 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 129 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 130 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 131 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 132 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 133 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 134 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 135 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 136 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') 137 | return bool(iupac.ptr[int(b)]) 138 | 139 | @__force__ 140 | def _validate_str_as_seq(s: str, copy: bool = False): 141 | def ensure_valid(b: byte, i: int): 142 | 
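        # reject any byte that is not a valid IUPAC nucleotide code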
if not _is_iupac_nt(b): 143 | raise ValueError(f"invalid base '{str(b)}' at position {i} of sequence") 144 | p = s.ptr 145 | n = s.len 146 | i = 0 147 | if copy: 148 | q = Ptr[byte](n) 149 | while i < n: 150 | b = p[i] 151 | ensure_valid(b, i) 152 | q[i] = b 153 | i += 1 154 | return seq(q, n) 155 | else: 156 | while i < n: 157 | ensure_valid(p[i], i) 158 | i += 1 159 | return seq(p, n) 160 | 161 | @__force__ 162 | def _validate_str_as_qual(s: str, copy: bool = False): 163 | def ensure_valid(b: byte, i: int): 164 | if not (byte(0x21) <= b <= byte(0x7e)): 165 | raise ValueError(f"invalid quality score '{str(b)}' at position {i} of quality score string") 166 | p = s.ptr 167 | n = s.len 168 | i = 0 169 | if copy: 170 | q = Ptr[byte](n) 171 | while i < n: 172 | b = p[i] 173 | ensure_valid(b, i) 174 | q[i] = b 175 | i += 1 176 | return str(q, n) 177 | else: 178 | while i < n: 179 | ensure_valid(p[i], i) 180 | i += 1 181 | return str(p, n) 182 | 183 | @__force__ 184 | def _split_header_on_space(s: str): 185 | a = 0 186 | while a < len(s) and not s[a].isspace(): 187 | a += 1 188 | 189 | b = a 190 | while b < len(s) and s[b].isspace(): 191 | b += 1 192 | 193 | return s[:a], s[b:] 194 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <cassert> 3 | #include <cstdio> 4 | #include <cstdlib> 5 | #include <fstream> 6 | #include <iostream> 7 | #include <sstream> 8 | #include <string> 9 | #include <sys/wait.h> 10 | #include <tuple> 11 | #include <unistd.h> 12 | #include <vector> 13 | 14 | #include "codon/cir/llvm/llvisitor.h" 15 | #include "codon/cir/transform/manager.h" 16 | #include "codon/cir/transform/pass.h" 17 | #include "codon/cir/util/inlining.h" 18 | #include "codon/cir/util/irtools.h" 19 | #include "codon/cir/util/outlining.h" 20 | #include "codon/compiler/compiler.h" 21 | #include "codon/compiler/error.h" 22 | #include "codon/util/common.h" 23 | 24 | #include "gtest/gtest.h" 25 | 26 | using namespace codon; 27 | using namespace std; 28 | 
29 | vector<string> splitLines(const string &output) { 30 | vector<string> result; 31 | string line; 32 | istringstream stream(output); 33 | const char delim = '\n'; 34 | 35 | while (getline(stream, line, delim)) 36 | result.push_back(line); 37 | 38 | return result; 39 | } 40 | 
41 | static pair<bool, string> findExpectOnLine(const string &line) { 42 | for (auto EXPECT_STR : vector<pair<bool, string>>{ 43 | {false, "# EXPECT: "}, {false, "#: "}, {true, "#! "}}) { 44 | size_t pos = line.find(EXPECT_STR.second); 45 | if (pos != string::npos) 46 | return {EXPECT_STR.first, line.substr(pos + EXPECT_STR.second.length())}; 47 | } 48 | return {false, ""}; 49 | } 50 | 
51 | static pair<vector<string>, bool> findExpects(const string &filename, bool isCode) { 52 | vector<string> result; 53 | bool isError = false; 54 | string line; 55 | if (!isCode) { 56 | ifstream file(filename); 57 | if (!file.good()) { 58 | cerr << "error: could not open " << filename << endl; 59 | exit(EXIT_FAILURE); 60 | } 61 | 62 | while (getline(file, line)) { 63 | auto expect = findExpectOnLine(line); 64 | if (!expect.second.empty()) { 65 | result.push_back(expect.second); 66 | isError |= expect.first; 67 | } 68 | } 69 | file.close(); 70 | } else { 71 | istringstream file(filename); 72 | while (getline(file, line)) { 73 | auto expect = findExpectOnLine(line); 74 | if (!expect.second.empty()) { 75 | result.push_back(expect.second); 76 | isError |= expect.first; 77 | } 78 | } 79 | } 80 | return {result, isError}; 81 | } 82 | 
83 | string argv0; 84 | extern "C" void GC_atfork_prepare(); 85 | extern "C" void GC_atfork_parent(); 86 | extern "C" void GC_atfork_child(); 87 | 
88 | class SeqTest 89 | : public testing::TestWithParam<tuple< 90 | string /*filename*/, bool /*debug*/, string /*case name*/, 91 | string /*case code*/, int /*case line*/, bool /*extra test flag*/>> { 92 | vector<char> buf; 93 | int out_pipe[2]; 94 | pid_t pid; 95 | 96 | public: 97 | SeqTest() : buf(65536), out_pipe(), pid() {} 98 | string getFilename(const string &basename) { 99 | return string(TEST_DIR) + "/" + basename; 100 | } 101 | int runInChildProcess() { 102 | assert(pipe(out_pipe) != -1); 103 | pid = fork(); 104 | GC_atfork_prepare(); 105 | assert(pid != -1); 106 | 107 | if (pid == 0) { 108 | GC_atfork_child(); 109 | dup2(out_pipe[1], STDOUT_FILENO); 110 | close(out_pipe[0]); 111 | close(out_pipe[1]); 112 | 113 | auto file = getFilename(get<0>(GetParam())); 114 | bool debug = get<1>(GetParam()); 115 | auto code = get<3>(GetParam()); 116 | auto startLine = get<4>(GetParam()); 117 | int testFlags = 1 + get<5>(GetParam()); 118 | 119 | auto compiler = std::make_unique<Compiler>( 120 | argv0, debug, /*disabledPasses=*/std::vector<std::string>{}, /*isTest=*/true); 121 | llvm::cantFail(compiler->load(".")); 122 | llvm::handleAllErrors(code.empty() 123 | ? compiler->parseFile(file, testFlags) 124 | : compiler->parseCode(file, code, startLine, testFlags), 125 | [](const error::ParserErrorInfo &e) { 126 | for (auto &group : e.getErrors()) { 127 | for (auto &msg : group) { 128 | getLogger().level = 0; 129 | printf("%s\n", msg.getMessage().c_str()); 130 | } 131 | } 132 | fflush(stdout); 133 | exit(EXIT_FAILURE); 134 | }); 135 | 136 | llvm::cantFail(compiler->compile()); 137 | compiler->getLLVMVisitor()->run({file}); 138 | fflush(stdout); 139 | exit(EXIT_SUCCESS); 140 | } else { 141 | GC_atfork_parent(); 142 | int status = -1; 143 | close(out_pipe[1]); 144 | assert(waitpid(pid, &status, 0) == pid); 145 | read(out_pipe[0], buf.data(), buf.size() - 1); 146 | close(out_pipe[0]); 147 | return status; 148 | } 149 | return -1; 150 | } 151 | string result() { return string(buf.data()); } 152 | }; 
153 | static string 154 | getTestNameFromParam(const testing::TestParamInfo<SeqTest::ParamType> &info) { 155 | const string basename = get<0>(info.param); 156 | const bool debug = get<1>(info.param); 157 | string normname = basename; 158 | replace(normname.begin(), normname.end(), '/', '_'); 159 | replace(normname.begin(), normname.end(), '.', '_'); 160 | return normname + (debug ? "_debug" : ""); 161 | } 
162 | TEST_P(SeqTest, Run) { 163 | const string file = get<0>(GetParam()); 164 | int status; 165 | bool isCase = !get<2>(GetParam()).empty(); 166 | if (!isCase) 167 | status = runInChildProcess(); 168 | else 169 | status = runInChildProcess(); 170 | ASSERT_TRUE(WIFEXITED(status)); 171 | 172 | string output = result(); 173 | 174 | auto expects = findExpects(!isCase ? getFilename(file) : get<3>(GetParam()), isCase); 175 | if (WEXITSTATUS(status) != int(expects.second)) 176 | fprintf(stderr, "%s\n", output.c_str()); 177 | ASSERT_EQ(WEXITSTATUS(status), int(expects.second)); 178 | const bool assertsFailed = output.find("TEST FAILED") != string::npos; 179 | EXPECT_FALSE(assertsFailed); 180 | if (assertsFailed) 181 | std::cerr << output << std::endl; 182 | 183 | if (!expects.first.empty()) { 184 | vector<string> results = splitLines(output); 185 | for (unsigned i = 0; i < min(results.size(), expects.first.size()); i++) 186 | if (expects.second) 187 | EXPECT_EQ(results[i].substr(0, expects.first[i].size()), expects.first[i]); 188 | else 189 | EXPECT_EQ(results[i], expects.first[i]); 190 | EXPECT_EQ(results.size(), expects.first.size()); 191 | } 192 | } 193 | 
194 | // clang-format off 195 | INSTANTIATE_TEST_SUITE_P( 196 | CoreTests, SeqTest, 197 | testing::Combine( 198 | testing::Values( 199 | "core/align.codon", 200 | "core/big.codon", 201 | "core/bwtsa.codon", 202 | "core/containers.codon", 203 | "core/formats.codon", 204 | "core/kmers.codon", 205 | "core/match.codon", 206 | "core/proteins.codon", 207 | "core/serialization.codon" 208 | ), 209 | testing::Values(true, false), 210 | testing::Values(""), 211 | testing::Values(""), 212 | testing::Values(0), 213 | testing::Values(false) 214 | ), 215 | getTestNameFromParam); 216 | 217 | INSTANTIATE_TEST_SUITE_P( 218 | PipelineTests, SeqTest, 219 | testing::Combine( 220 | testing::Values( 221 | "pipeline/canonical_opt.codon", 222 | "pipeline/interalign.codon", 223 | "pipeline/prefetch.codon", 224 | "pipeline/revcomp_opt.codon" 225 | ), 226 | testing::Values(false), 227 | testing::Values(""), 228 | testing::Values(""), 229 | testing::Values(0), 230 | testing::Values(false) 231 | ), 232 | getTestNameFromParam); 233 | // clang-format on 234 | 235 | int main(int argc, char *argv[]) { 236 | argv0 = ast::Filesystem::executable_path(argv[0]); 237 | testing::InitGoogleTest(&argc, argv); 238 | return RUN_ALL_TESTS(); 239 | } 240 | -------------------------------------------------------------------------------- /test/pipeline/canonical_opt.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | # test kmers |> canonical optimization 4 | @test 5 | def test(s: seq, K: Static[int]): 6 | got1 = list[Kmer[K]]() 7 | s |> kmers(1, k=K) |> canonical |> got1.append 8 | exp1 = [min(k, ~k) for k in s.kmers(step=1, k=K)] 9 | assert got1 == exp1 10 | 11 | got2 = list[tuple[int,Kmer[K]]]() 12 | s |> kmers_with_pos(step=1, k=K) |> canonical_with_pos |> got2.append 13 | exp2 = [(i, min(k, ~k)) for i,k in s.kmers_with_pos(step=1, k=K)] 14 | assert got2 == exp2 15 | 16 | # test revcomp'd seq 17 | s = ~s 18 | 19 | got1 = list[Kmer[K]]() 20 | s |> kmers(step=1, k=K) |> canonical |> got1.append 21 | exp1 = [min(k, ~k) for k in s.kmers(step=1, k=K)] 22 | assert got1 == exp1 23 | 24 | got2 = list[tuple[int,Kmer[K]]]() 25 | s |> kmers_with_pos(1, k=K) |> canonical_with_pos |> got2.append 26 | exp2 = [(i, min(k, ~k)) for i,k in s.kmers_with_pos(step=1, k=K)] 27 | assert got2 == exp2 28 | 29 | def test_all(s: 
list[seq], K: Static[int]): 30 | for a in s: 31 | test(a, K) 32 | 33 | v = [ s'C', 34 | s'GA', 35 | s'CTC', 36 | s'TATG', 37 | s'GTAGC', 38 | s'TGCAGT', 39 | s'TACTAGG', 40 | s'GTTAGAAA', 41 | s'TGGATCATA', 42 | s'TTGGCCACGG', 43 | s'CCTCTGCGGGA', 44 | s'ATGATAGGAAAG', 45 | s'CTCTTTGGGGTTA', 46 | s'GCAGTTTTTCGCTT', 47 | s'GCTTTGTGAAATGGT', 48 | s'CATGGAGCGGCCGTGT', 49 | s'TTAACGGGCCCTTGCCG', 50 | s'CCTTGTTTGCTCGCGAGA', 51 | s'GGCCAAGCTTATCTGTGTT', 52 | s'CCGCTTGCCTTACATTGGCC', 53 | s'GAGTGTGTAATAAATAATCGT', 54 | s'GCGACAATATACTGCGTGGTCA', 55 | s'TTCATATATGACCCCGTACAGGG', 56 | s'GCTTTGATGAATTAAGGATAGCTT', 57 | s'GTTATATCCCCCCGCGCTTTAACCG', 58 | s'AAACCTGACCGTTAAGACTTTAAGAC', 59 | s'GACCTTGCCCTTGAAGGAACATCTGGG', 60 | s'CGTCCATGTTTGCATACGACGTTGTGTT', 61 | s'TAAACGAGACCGTCTCTGCTCATAGCCGT', 62 | s'CAGTACGTTAGTGTGACCGCAAAAGGTGGT', 63 | s'GAGGATATATAACAACCTCAGGAGTCTGTTC', 64 | s'AGGATAAGGTTCACAGAACTATTGTTATGATT', 65 | s'CACAGAGAGCGGGCTCTATTGTTGTCGTGGGGA', 66 | s'TACTCGTGGCGAGGAGGAGTTTTCTTAAATCGGA', 67 | s'GGGCTTTTCGCCTGGTTATAACTTGGTATGGAGCT', 68 | s'AGGACATTTAAGATTTAAAAGCACCCTCAGCAACAT', 69 | s'CATGAAGGATCAGTGACTTCCTTACCTGTGCTGCCGG', 70 | s'CGTATTGGTTCGTGACTTCCACGGGGTGTCACGGCGGA', 71 | s'GCGAAACGCATCATCCGGGACAATCAACTCATCAGGCGG', 72 | s'CACAACCGGGGCTCGATCCCAAGCACCATTACACGATGAC', 73 | s'TCACGAAGACACAACGGTCCACACCTACTTTCCCCCCAACG', 74 | s'ATCTACTGAAGCGCTACAGTCATCTACTGCCACTGAACGGAC', 75 | s'GTGTCCATGCTCCGTTTAAAATAGAGAAGCCAAAGGAGTACCA', 76 | s'AGTAGGCTGCAAATCGATCGCCACGCAAATGCTAAAAGTTGTGC', 77 | s'GTGATTAGAGGAGGTAACAGCCCAAACGCTCTCTTCCTCGCTATA', 78 | s'TGTTGTGTCCGATAAGATTGCGTCCTGGTGGAGTGGTTGCGGGTTA', 79 | s'GCTTAACTAACCGAGCCGGTGCGGAGATTCCGTGTGTGGCTAGTTAT', 80 | s'CTCGCGTGCGGAGACCTCTCCCCCAGGAATAACCGATTGACCGTGGAT', 81 | s'ACATTCGCAGATAGGGCAACGGTTTAGTGCGCACCATGCTAAGCTAAGG', 82 | s'TTTGCACCGGTTAGCTGTCGTTAATCCGACGGGCCTTATCTCGGTACATT', 83 | s'TACTCGGACATCCATCCAAAGTTGGAGATAGCTTTGTATGAAAATGATGTA', 84 | s'CAGACTTTCTTATGTATCCCTCTCTTGCACAGATTGTTCATGTCATTTGCTG', 85 | s'CAGCGGGCCCTCTGCTTATAGCCGGGTAGTCCGTATTACGACCGAGGTCCCGT', 86 | s'GGGTAACAACAAAGGATGAGGTTAATTCGGAGGGAAAACGTCGCGAACGATAGT', 87 | s'GCTGAGCCCAGGAGGGGAGCTCTCGGGAGTCGCAATGAGATCCGTGGGCCCCCAA', 88 | s'CGATTAGCGACCAAATGGTATACATTACGTATCGGGTGTCGAACCCCCTACCACTT', 89 | s'CCGAACCATTCCTAGACTTGCATCGAAGACGCCGGAGACATAGAGTATACTATTTCG', 90 | s'CAATAAAGGAAGATCGACCCAGTTGCTAGGCCCGGCGCTTCCCTGGAGCCTTGGTGAA', 91 | s'AAATGGAGCCAGACAGCGTGACGCCCTGCGCGCGACAAGCTACGTCGAAGGCACATTCT', 92 | s'CAATCTATGATTTTTGTCGGGTTGCGAGCCTAAGTTAACTTGATCTCCGTACGTGCCACA', 93 | s'CAACTACTGAGCTCCGTTAGGTCCCTTTCATTTCGCTGCGTATGTGTAACCAACCTCGTAC', 94 | s'TGGAGAGTCCCTTCCCACGCGCAACCTGCGGTTAGCATCGCAGATCTTTCTACAGCCCTATG', 95 | s'GATCCTAACCTTCTACACTCATCGAACACGAACATGATGCTGTGCGGTGTCATTTGGATTGTG', 96 | s'GTCATTACATAACCATGAGCGCCCCCGTTCAAACTTCAGCGTTTTTGTAAAGGGCATCATTGCC', 97 | s'GGGGCAAGCAGACGGATTACACACTCTTTTAACCTCGTTTCTACACTCCTAATCAGTCCGTACAG', 98 | s'CATGCGGTCCCCGCAGCCCAGATCCAATCCGGCAGCAGAAGGCATTATCTGCTGGCGTTGCCTTAC', 99 | s'TAAGTCGACCGAAGGATGCAAAAGGAATCCCCGACAGTATCTGTCACATCCCTGCAGCCGTCTATTT', 100 | s'GATGATCACCTAATGACCGTCGCGGGACCAATGGTATCCGAGTGATGGAATCCTACGATTGATCAACC', 101 | s'AAAGAATTTCCCAAAGTCCCACTTGTTAATACCGTGACGCAGCCGGACTAACACAGTCCCAGGTATTGA', 102 | s'ATCGTTTATGTAGATCACGGCATATCTCTAACTAACGGCATAGTACCGGCATGCAGTTGAGCAGACTGGG', 103 | s'AGACTCTGGCATATCTGTCCATACCCATAAGCCGGCCCACGCAGGATCAGGAGTGAATTGCCGCGCAGATC', 104 | s'CTGGAGCCATGAATATAGCATCCCAACCCAACGTTTGAGTGTCTCCAGCGGAATAAAACGCCTAATTTTTGG', 105 | s'TTTAAGCCACCTGTCATTAGATATATCGCGCCCGCCACTTACATCTACCTGTTCATAGAGACTATGCGTACTA', 106 | 
s'CTCACATCTCTATCGATTTAGGTAGAGGCTAGCGGTCATAAATCTAAGTCTAGTCAGCCCGCAACGCCACATCG', 107 | s'TACAGCCCGTGCAGTGTACAAGCGCACACGGGAGCATCCTGGGTTATGAGCCCCCCGAGGCCAGTGCAAGACCAG', 108 | s'GTCTCTTGGCAGTGTCCAGACAGACGCGGTGCCTGGGAAGACGTTAATTGGTGCTACTTCAACCACGCCTTGCTTG', 109 | s'GGGGCTATTTGGGATTAGAGAAGTGCCTACACTCGGTGGCCCAGCTGGGCTGTCCATTGCAGACACGCGTCGCGTGT', 110 | s'CAGGAGTATGGACCCAACATGACTTTGGCGGCAGCAGTATCCATGCCCGTTGCCTGTAGTCTTTAGGAGGATCGTAAT', 111 | s'CGCGGTGTACCGTTTCAGATTCTATCTACTACGACTAGGTGCTTAGCAATTACCAATCGTACGCGACCGATATAAAACT', 112 | s'CGCCAGGGGTCATCCCGGGGATTCCATCAGTTAAACGTCTTGAAACCAGTTAGAGAACAGAGACCAGTTAGGCAACTCGG', 113 | s'TTCCCGCTTCTCTGCCCGATACCCAATTCTTGATACGTAGTCTCCCTACCGCAGGATATGACGCACGCCCCTATAACGAAC', 114 | s'AGCTACCACATACACAGGGAGACCGCGCGGTTTTGACCGTATGGTCAACCCATCTTCTAAGTTGCAGCGTCGAAGAGTCCCG', 115 | s'GGTAACGACGATGGGACAAAATGATAAATCACGTCGTACTTAGTTCACTTAGGTAAGGGTTTGGTGTGGCGTGAATAACTTGA', 116 | s'GTGCTCGAGGCAGCGACACGCGTTCATCAGATTGTGACAATGACCATATATTAGACGTCAGTTAAAGGAAGTTCACCGGTAACG', 117 | s'GTAAACTTCAGTTGCTAAAGTTAAATTCATCTGATGCTTACGTGGAAACTTGAGAAGGAGCCATTGAGACTTCGCTCCAGATAAC', 118 | s'CCGAATTCCGTTATCAACGTAAGCTAAGCGGCTTGGGCGTAGAGTTTCTAAGGGTGAACGTACACTTCTTTCGCTGCACCGATGCC', 119 | s'CCACTCCCTGATTAGCTTTGTTCTGTATTGCATGAATAAGGTTCAATTTGCGACCTTATGAAACAGGTAATCTGGGAAGCCTTAGTG', 120 | s'CATAAGTCCCAGATTCTCGCCGGATGGCAATTCTAGCGTCACGTTAGACAACGGTGAGAACGGAGGAGATTCAGAGCACAGGCTTGAA', 121 | s'AAGCTTTAGAGATCCAGGACCTTATGTCGGTACAAGAAACTAGAACTCGCTAAGTAGATCAGGTCCTGGCAGCATCCATGCCCCATTTG', 122 | s'TACTTATTACCACCTTTGTTGGGCTTTGACAGATATTACAGTGGTCTGATTCGTGGGGGCTTACTGCAACGCATACTATGGCGAAGGTCC', 123 | s'GGCAAGCTGAGGAGGGACGGTCACCCACTGAAAACATTTGAAACCGGGCGGGCTTGAACAGGGCCAATCAAGACCCCTCTCATAGGATGGC', 124 | s'CCATGCGAATTTTCCGGTCAAGGAACACCTGATTCAGAGCGGGCTACCAGAACAGGCAAGCAGCCCTACATCGCTTCTTAAAAAATATTAAG', 125 | s'TGGCTATGCATCTCACTTGGCTTTTCACGGGGGTGCCCAGAGGACATAGATACAGCACGGTCCCATGCTAGGATCCAACGAGTGCATTAGAAT', 126 | s'CTAAGCCTATGCCAGTCTTCCTTGTTTACTCGGTGGTCCTGTACGTCCATATCATTTACGTCCATGAAGCCAACCCCCGAGCAAATACCCGGTA', 127 | s'GATAAATTCCTCCCATATCAAAGTTCTTGCCCACGCGGGCTACCCAGCTAACGTAACTGTTTCGACACTAGAGATAACAACGCGTTGCGACTCTC', 128 | s'GGCGGCATGTAGGACGGCGTCAGTGGGTATACTATCGCTCTTAGGTCTCCAGTCAAAAAAATGTGGCATCCGGTAGTTGCTGGCAGATCTGCACTT', 129 | s'ATTTATTTGCCCCGCAGTGTCCTTTTTCTAGTCATAAATCCTCATACCGCGGGCCCTTCATCCGGTTTGATTCGAAGCATTGGTATGTTAGATACGT', 130 | s'ACACGCCATGAGGTAAATAACTCTGGAATTGTCAGTCAAGCACCGTGTGTTCAGTGTAAGTTTCTCGGACCAAGGCATATCGACGCTATGCGGTTTAT', 131 | s'AACCTCAGTCGGGCAGGCCATGGCGCGAAATGACTCGAGTAGACTCCATCTCTAAGGAGCGGAGCTGTTGCAACTAGGGTGACACACAGCTCGCCATGA', 132 | s'TATTGCAAGGCCCTACGCGGCTACGTCTCAATATATCCTATGGGCCGCAGCGTTCGGCCAATTCACATGGATGAGACATGGGTCCAAAATTTGCGGGATA' ] 133 | 134 | test_all(v, 1) 135 | test_all(v, 2) 136 | test_all(v, 3) 137 | test_all(v, 4) 138 | test_all(v, 5) 139 | test_all(v, 6) 140 | test_all(v, 7) 141 | test_all(v, 8) 142 | test_all(v, 9) 143 | test_all(v, 10) 144 | test_all(v, 11) 145 | test_all(v, 12) 146 | test_all(v, 13) 147 | test_all(v, 14) 148 | test_all(v, 15) 149 | test_all(v, 16) 150 | test_all(v, 17) 151 | test_all(v, 18) 152 | test_all(v, 19) 153 | test_all(v, 20) 154 | -------------------------------------------------------------------------------- /stdlib/bio/bwa.codon: -------------------------------------------------------------------------------- 1 | from bio import seq, CIGAR 2 | 3 | from os import getenv as _getenv 4 | BWA_LIB = _getenv('BWA_LIB') 5 | if not BWA_LIB: 6 | raise OSError("BWA error: 'BWA_LIB' environment variable not set") 7 | 8 | @tuple 9 | class bntann1_t: 10 | _offset: i64 11 | _len: i32 12 | _n_ambs: i32 13 
| _gi: u32 14 | _is_alt: i32 15 | _name: Ptr[byte] 16 | _anno: Ptr[byte] 17 | 18 | @tuple 19 | class bntamb1_t: 20 | _offset: i64 21 | _len: i32 22 | _amb: byte 23 | 24 | @tuple 25 | class bntseq_t: 26 | _l_pac: i64 27 | _n_seqs: i32 28 | _seed: u32 29 | _anns: Ptr[bntann1_t] 30 | _n_holes: i32 31 | _ambs: Ptr[bntamb1_t] 32 | _fp_pac: cobj 33 | 34 | @tuple 35 | class mem_alnreg_t: 36 | _rb: i64 37 | _re: i64 38 | _qb: i32 39 | _qe: i32 40 | _rid: i32 41 | _score: i32 42 | _truesc: i32 43 | _sub: i32 44 | _alt_sc: i32 45 | _csub: i32 46 | _sub_n: i32 47 | _w: i32 48 | _seedcov: i32 49 | _secondary: i32 50 | _secondary_all: i32 51 | _seedlen0: i32 52 | _bitfields: u32 53 | _frac_rep: u32 # really a 32-bit float 54 | _hash: u64 55 | 56 | @property 57 | def rb(self): 58 | return int(self._rb) 59 | 60 | @property 61 | def re(self): 62 | return int(self._re) 63 | 64 | @property 65 | def qb(self): 66 | return int(self._qb) 67 | 68 | @property 69 | def qe(self): 70 | return int(self._qe) 71 | 72 | @property 73 | def rid(self): 74 | return int(self._rid) 75 | 76 | @property 77 | def score(self): 78 | return int(self._score) 79 | 80 | @property 81 | def true_score(self): 82 | return int(self._truesc) 83 | 84 | @property 85 | def sub(self): 86 | return int(self._sub) 87 | 88 | @property 89 | def alt_score(self): 90 | return int(self._alt_sc) 91 | 92 | @property 93 | def csub(self): 94 | return int(self._csub) 95 | 96 | @property 97 | def sub_n(self): 98 | return int(self._sub_n) 99 | 100 | @property 101 | def w(self): 102 | return int(self._w) 103 | 104 | @property 105 | def seedcov(self): 106 | return int(self._seedcov) 107 | 108 | @property 109 | def secondary(self): 110 | return int(self._secondary) 111 | 112 | @property 113 | def secondary_all(self): 114 | return int(self._secondary_all) 115 | 116 | @tuple 117 | class mem_aln_t: 118 | _pos: i64 119 | _rid: i32 120 | _flag: i32 121 | _bitfields: u32 122 | _n_cigar: i32 123 | _cigar: Ptr[u32] 124 | _XA: Ptr[byte] 125 | _score: i32 126 | _sub: i32 127 | _alt_sc: i32 128 | 129 | @property 130 | def pos(self): 131 | return int(self._pos) 132 | 133 | @property 134 | def rid(self): 135 | return int(self._rid) 136 | 137 | @property 138 | def rev(self): 139 | return bool(self._bitfields & u32(0x1)) 140 | 141 | @property 142 | def alt(self): 143 | return bool(self._bitfields & u32(0x2)) 144 | 145 | @property 146 | def mapq(self): 147 | return int((self._bitfields & u32(0x3fc)) >> u32(2)) 148 | 149 | @property 150 | def NM(self): 151 | return int((self._bitfields & u32(0xfffffc00)) >> u32(10)) 152 | 153 | @property 154 | def cigar(self): 155 | return CIGAR(self._cigar, int(self._n_cigar)) 156 | 157 | @property 158 | def score(self): 159 | return int(self._score) 160 | 161 | @property 162 | def sub(self): 163 | return int(self._sub) 164 | 165 | @property 166 | def alt_score(self): 167 | return int(self._alt_sc) 168 | 169 | @tuple 170 | class mem_alnreg_v: 171 | n: int 172 | m: int 173 | a: Ptr[mem_alnreg_t] 174 | 175 | def __new__() -> mem_alnreg_v: 176 | return mem_alnreg_v(0, 0, Ptr[mem_alnreg_t]()) 177 | 178 | def __getitem__(self, idx: int): 179 | if not (0 <= idx < self.n): 180 | raise IndexError("alignment index out of range") 181 | return self.a[idx] 182 | 183 | def __len__(self): 184 | return self.n 185 | 186 | def __iter__(self): 187 | i = 0 188 | while i < self.n: 189 | yield self.a[i] 190 | i += 1 191 | 192 | @tuple 193 | class bwaidx_t: 194 | _bwt: cobj 195 | _bns: Ptr[bntseq_t] 196 | _pac: cobj 197 | _is_shm: i32 198 | _l_mem: i64 199 | _mem: 
cobj 200 | 201 | def name(self, aln: mem_aln_t): 202 | def _strlen(p: Ptr[byte]): 203 | n = 0 204 | while p[n]: n += 1 205 | return n 206 | p = self._bns[0]._anns[aln.rid]._name 207 | return str(p, _strlen(p)) 208 | 209 | _BWA_IDX_BWT = i32(0x1) 210 | _BWA_IDX_BNS = i32(0x2) 211 | _BWA_IDX_PAC = i32(0x4) 212 | _BWA_IDX_ALL = i32(0x7) 213 | 214 | from C import BWA_LIB.mem_opt_init() -> cobj 215 | from C import BWA_LIB.bwa_fill_scmat(i32, i32, Ptr[i8]) -> cobj 216 | from C import BWA_LIB.bwa_idx_load(cobj, i32) -> Ptr[bwaidx_t] 217 | from C import BWA_LIB.mem_align1(Ptr[mem_alnreg_v], cobj, cobj, Ptr[bntseq_t], cobj, i32, Ptr[byte]) 218 | from C import BWA_LIB.mem_reg2aln(Ptr[mem_aln_t], cobj, Ptr[bntseq_t], cobj, i32, Ptr[byte], Ptr[mem_alnreg_t]) 219 | 220 | def options(match_score: int = 1, 221 | mismatch_score: int = 4, 222 | open_del: int = 6, 223 | open_ins: int = 6, 224 | extend_del: int = 1, 225 | extend_ins: int = 1, 226 | bandwidth: int = 100, 227 | zdrop: int = 100, 228 | clip_penalty: Tuple[int,int] = (5,5), 229 | unpaired_penalty: int = 17): 230 | # offsets below are based on BWA mem_opt_t definition 231 | opt = mem_opt_init() 232 | p = Ptr[i32](opt + 0 * 4); p[0] = i32(match_score) 233 | p = Ptr[i32](opt + 1 * 4); p[0] = i32(mismatch_score) 234 | p = Ptr[i32](opt + 2 * 4); p[0] = i32(open_del) 235 | p = Ptr[i32](opt + 3 * 4); p[0] = i32(extend_del) 236 | p = Ptr[i32](opt + 4 * 4); p[0] = i32(open_ins) 237 | p = Ptr[i32](opt + 5 * 4); p[0] = i32(extend_ins) 238 | p = Ptr[i32](opt + 6 * 4); p[0] = i32(unpaired_penalty) 239 | p = Ptr[i32](opt + 7 * 4); p[0] = i32(clip_penalty[0]) 240 | p = Ptr[i32](opt + 8 * 4); p[0] = i32(clip_penalty[1]) 241 | p = Ptr[i32](opt + 9 * 4); p[0] = i32(bandwidth) 242 | p = Ptr[i32](opt + 10 * 4); p[0] = i32(zdrop) 243 | bwa_fill_scmat(i32(match_score), i32(mismatch_score), Ptr[i8](opt + 140)) 244 | return opt 245 | 246 | @tuple 247 | class BWA: 248 | opt: cobj 249 | p: Ptr[bwaidx_t] 250 | 251 | def __new__(hint: str) -> BWA: 252 | return BWA(mem_opt_init(), bwa_idx_load(hint.c_str(), _BWA_IDX_ALL)) 253 | 254 | def __new__(opt: cobj, hint: str) -> BWA: 255 | return BWA(opt, bwa_idx_load(hint.c_str(), _BWA_IDX_ALL)) 256 | 257 | def name(self, aln: mem_aln_t): 258 | return self.p[0].name(aln) 259 | 260 | def reg2aln(self, s: seq, reg: mem_alnreg_t): 261 | from C import free(Ptr[byte]) 262 | a = mem_aln_t() 263 | mem_reg2aln(__ptr__(a), self.opt, self.p[0]._bns, self.p[0]._pac, i32(len(s)), s.ptr, __ptr__(reg)) 264 | # fix CIGAR & marshal to GC: 265 | n_cigar = int(a._n_cigar) 266 | cigar = Ptr[u32](n_cigar) 267 | for i in range(n_cigar): 268 | c = a._cigar[i] 269 | n, op = c >> u32(4), c & u32(0xf) 270 | if u32(3) <= op < u32(0xf): 271 | op += u32(1) 272 | cigar[i] = (n << u32(4)) | op 273 | free(a._cigar.as_byte()) 274 | Ptr[Ptr[u32]](__ptr__(a).as_byte() + 24)[0] = cigar # note offset 275 | return a 276 | 277 | def align(self, s: seq): 278 | import internal.gc as gc 279 | from C import free(Ptr[byte]) 280 | p = s.ptr if s.len >= 0 else str(s).ptr 281 | ar = mem_alnreg_v() 282 | mem_align1(__ptr__(ar), self.opt, self.p[0]._bwt, self.p[0]._bns, self.p[0]._pac, i32(len(s)), p) 283 | # marshal to GC: 284 | n_bytes = ar.n * gc.sizeof(mem_alnreg_t) 285 | copy = gc.alloc_atomic(n_bytes) 286 | str.memcpy(copy.as_byte(), ar.a.as_byte(), n_bytes) 287 | free(ar.a.as_byte()) 288 | Ptr[Ptr[mem_alnreg_t]](__ptr__(ar).as_byte() + 16)[0] = Ptr[mem_alnreg_t](copy) # note offset 289 | return ar 290 | 
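# Usage sketch (illustrative; not part of this module): one plausible way to drive
# the wrapper above. The index prefix 'ref.fa' and the read are placeholders, and it
# assumes BWA_LIB points at a BWA shared library whose index was built with `bwa index`.
#
#   from bio import *
#   from bio.bwa import BWA, options
#
#   bwa = BWA(options(match_score=1, mismatch_score=4), 'ref.fa')
#   read = s'ACGTACGTACGTACGTACGTACGTACGT'
#   for reg in bwa.align(read):       # raw alignment regions (mem_alnreg_v)
#       aln = bwa.reg2aln(read, reg)  # finalize one region into a mem_aln_t with a CIGAR
#       print f'{bwa.name(aln)} {aln.pos} {aln.mapq}'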
-------------------------------------------------------------------------------- /test/core/kmers.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | K: Static[int] = 5 4 | 5 | s = s'ACGTAACGTA' 6 | print s # EXPECT: ACGTAACGTA 7 | print list(s.kmers(1, K)) # EXPECT: [k'ACGTA', k'CGTAA', k'GTAAC', k'TAACG', k'AACGT', k'ACGTA'] 8 | print list(s.split(5, 1)) # EXPECT: [s'ACGTA', s'CGTAA', s'GTAAC', s'TAACG', s'AACGT', s'ACGTA'] 9 | print ~s # EXPECT: TACGTTACGT 10 | print list((~s).kmers(1, K)) # EXPECT: [k'TACGT', k'ACGTT', k'CGTTA', k'GTTAC', k'TTACG', k'TACGT'] 11 | print list((~s).split(5, 1)) # EXPECT: [s'TACGT', s'ACGTT', s'CGTTA', s'GTTAC', s'TTACG', s'TACGT'] 12 | 13 | s = s'AANGGCCAGTC' 14 | print list(s.kmers_with_pos(1, 2)) # EXPECT: [(0, k'AA'), (3, k'GG'), (4, k'GC'), (5, k'CC'), (6, k'CA'), (7, k'AG'), (8, k'GT'), (9, k'TC')] 15 | print list(~s |> kmers_with_pos(1, 2)) # EXPECT: [(0, k'GA'), (1, k'AC'), (2, k'CT'), (3, k'TG'), (4, k'GG'), (5, k'GC'), (6, k'CC'), (9, k'TT')] 16 | 17 | s = s'AGACCTTAGC' 18 | print s # EXPECT: AGACCTTAGC 19 | print list(s.kmers(1, 3)) # EXPECT: [k'AGA', k'GAC', k'ACC', k'CCT', k'CTT', k'TTA', k'TAG', k'AGC'] 20 | print list(s.kmers(2, 3)) # EXPECT: [k'AGA', k'ACC', k'CTT', k'TAG'] 21 | print list(s.kmers(4, 3)) # EXPECT: [k'AGA', k'CTT'] 22 | print ~s # EXPECT: GCTAAGGTCT 23 | print list((~s).kmers(1, 3)) # EXPECT: [k'GCT', k'CTA', k'TAA', k'AAG', k'AGG', k'GGT', k'GTC', k'TCT'] 24 | print list((~s).kmers(2, 3)) # EXPECT: [k'GCT', k'TAA', k'AGG', k'GTC'] 25 | print list((~s).kmers(4, 3)) # EXPECT: [k'GCT', k'AGG'] 26 | 27 | s = s'AGACCTNTAGNC' 28 | print s # EXPECT: AGACCTNTAGNC 29 | print list(s.kmers_with_pos(1, 3)) # EXPECT: [(0, k'AGA'), (1, k'GAC'), (2, k'ACC'), (3, k'CCT'), (7, k'TAG')] 30 | print list(s.kmers_with_pos(2, 3)) # EXPECT: [(0, k'AGA'), (2, k'ACC')] 31 | print list(s.kmers_with_pos(4, 3)) # EXPECT: [(0, k'AGA')] 32 | print ~s # EXPECT: GNCTANAGGTCT 33 | print list((~s).kmers_with_pos(1, 3)) # EXPECT: [(2, k'CTA'), (6, k'AGG'), (7, k'GGT'), (8, k'GTC'), (9, k'TCT')] 34 | print list((~s).kmers_with_pos(2, 3)) # EXPECT: [(2, k'CTA'), (6, k'AGG'), (8, k'GTC')] 35 | print list((~s).kmers_with_pos(4, 3)) # EXPECT: [(8, k'GTC')] 36 | 37 | s = s'AGACCTNTAGC' 38 | print list(s.split(k=100, step=1)) # EXPECT: [] 39 | print list((~s).split(k=100, step=1)) # EXPECT: [] 40 | print list(s.kmers(step=1, k=100)) # EXPECT: [] 41 | print list((~s).kmers(step=1, k=100)) # EXPECT: [] 42 | 43 | s = s'TAGCC' 44 | print list(s.split(k=5, step=17)) # EXPECT: [s'TAGCC'] 45 | print list((~s).split(k=5, step=17)) # EXPECT: [s'GGCTA'] 46 | print list(s.kmers(step=17, k=5)) # EXPECT: [k'TAGCC'] 47 | print list((~s).kmers(step=17, k=5)) # EXPECT: [k'GGCTA'] 48 | 49 | k1 = Kmer[K](s'ACGTA') 50 | k2 = Kmer[K](s'ATGTT') 51 | 52 | print [k1[i] for i in range(len(k1))] # EXPECT: [k'A', k'C', k'G', k'T', k'A'] 53 | print [k2[-i - 1] for i in range(len(k2))] # EXPECT: [k'T', k'T', k'G', k'T', k'A'] 54 | 55 | print ~k1 # EXPECT: TACGT 56 | print ~k2 # EXPECT: AACAT 57 | 58 | print abs(k1 - k2) # EXPECT: 2 59 | print abs(k2 - k1) # EXPECT: 2 60 | 61 | if k1 > k2: 62 | print k2 - k1 # EXPECT: -2 63 | print k1 - k2 # EXPECT: 2 64 | else: 65 | print k1 - k2 66 | print k2 - k1 67 | 68 | k1, k2 = k2, k1 69 | if k1 > k2: 70 | print k2 - k1 # EXPECT: -2 71 | print k1 - k2 # EXPECT: 2 72 | else: 73 | print k1 - k2 74 | print k2 - k1 75 | 76 | k1 = Kmer[K](s'ACGTA') 77 | k2 = Kmer[K](s'ACGTA') 78 | print k1 - 
k2 # EXPECT: 0 79 | print k2 - k1 # EXPECT: 0 80 | 81 | k1long = Kmer[100]() |> base(0, k'T') |> base(42, k'C') |> base(77, k'G') 82 | k2long = Kmer[100]() |> base(0, k'T') |> base(43, k'C') |> base(77, k'T') 83 | print abs(k1long - k2long) # EXPECT: 3 84 | 85 | if k1long > k2long: 86 | print k2long - k1long # EXPECT: -3 87 | print k1long - k2long # EXPECT: 3 88 | else: 89 | print k1long - k2long 90 | print k2long - k1long 91 | 92 | k1long, k2long = k2long, k1long 93 | if k1long > k2long: 94 | print k2long - k1long # EXPECT: -3 95 | print k1long - k2long # EXPECT: 3 96 | else: 97 | print k1long - k2long 98 | print k2long - k1long 99 | 100 | print k1 << s'G' # EXPECT: CGTAG 101 | print k1 >> s'G' # EXPECT: GACGT 102 | print k1 << ~s'G' # EXPECT: CGTAC 103 | print k1 >> ~s'G' # EXPECT: CACGT 104 | 105 | K100 = Kmer[100] 106 | K1 = Kmer[1] 107 | print K100() |> base(-1, K1(s'C')) |> base(98, s'G') |> base(0, K1(s'T')) 108 | # EXPECT: TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGC 109 | 110 | k3 = K100() 111 | h1 = hash(k3) 112 | h2 = hash(k3 |> base(0, k'T')) 113 | h3 = hash(k3 |> base(99, k'T')) 114 | h4 = hash(k3 |> base(0, k'T') |> base(99, k'T')) 115 | # bases on both ends should be involved in k-mer hash: 116 | print h1 == h2 # EXPECT: False 117 | print h1 == h3 # EXPECT: False 118 | print h1 == h4 # EXPECT: False 119 | print h2 == h3 # EXPECT: False 120 | print h2 == h4 # EXPECT: False 121 | print h3 == h4 # EXPECT: False 122 | 123 | print k'ACGT' in s'GGACGTGG' # EXPECT: True 124 | print k'ACGT' in s'GGAGTGG' # EXPECT: False 125 | print s'ACGT' in k'GGACGTGG' # EXPECT: True 126 | print s'ACGT' in k'GGAGTGG' # EXPECT: False 127 | 128 | @test 129 | def test_N(): 130 | assert s''.N() == False 131 | assert s'ACGTacgt'.N() == False 132 | assert s'ACGTNacgt'.N() == True 133 | assert s'N'.N() == True 134 | assert s'AAN'.N() == True 135 | assert s'NAA'.N() == True 136 | assert s'ANA'.N() == True 137 | test_N() 138 | 139 | @test 140 | def test_base_counts(): 141 | assert s''.bases == BaseCounts(0,0,0,0,0) 142 | assert s'A'.bases == BaseCounts(1,0,0,0,0) 143 | assert s'C'.bases == BaseCounts(0,1,0,0,0) 144 | assert s'G'.bases == BaseCounts(0,0,1,0,0) 145 | assert s'T'.bases == BaseCounts(0,0,0,1,0) 146 | assert s'N'.bases == BaseCounts(0,0,0,0,1) 147 | assert s'AAGAGACTNTN'.bases == BaseCounts(4,1,2,2,2) 148 | assert (s'A'.bases + s'G'.bases) - s'A'.bases == s'G'.bases 149 | assert s'A'.bases.add(T=True) - s'A'.bases == s'T'.bases 150 | test_base_counts() 151 | 152 | @test 153 | def test_kmer_revcomp(path: str, K: Static[int]): 154 | v1 = [] 155 | v2 = [] 156 | 157 | for x in FASTA(path, fai=False): 158 | for y in x.seq.split(k=K, step=1): 159 | if not y.N(): 160 | v1.append(str(~y).upper()) 161 | 162 | for x in FASTA(path, fai=False): 163 | for y in x.seq.kmers(step=1, k=K): 164 | v2.append(str(~y).upper()) 165 | 166 | ''' 167 | if len(v1) == len(v2): 168 | for i in range(len(v1)): 169 | if v1[i] != v2[i]: 170 | print i, v1[i], v2[i] 171 | ''' 172 | 173 | assert v1 == v2 174 | 175 | testfile = 'test/data/MT-human.fa' 176 | test_kmer_revcomp(testfile, 1) 177 | test_kmer_revcomp(testfile, 2) 178 | test_kmer_revcomp(testfile, 3) 179 | test_kmer_revcomp(testfile, 4) 180 | test_kmer_revcomp(testfile, 5) 181 | test_kmer_revcomp(testfile, 25) 182 | test_kmer_revcomp(testfile, 32) 183 | test_kmer_revcomp(testfile, 31) 184 | test_kmer_revcomp(testfile, 33) 185 | test_kmer_revcomp(testfile, 64) 186 | test_kmer_revcomp(testfile, 65) 187 | 
test_kmer_revcomp(testfile, 129) 188 | test_kmer_revcomp(testfile, 1000) 189 | 190 | @test 191 | def test_kmer_iteration(path: str, K: Static[int]): 192 | for rc in (True, False): 193 | for step in range(1, K + 2): 194 | v1 = [] 195 | v2 = [] 196 | 197 | for x in FASTA(path, fai=False): 198 | for y in (~(x.seq) if rc else x.seq).split(k=K, step=step): 199 | if not y.N(): 200 | v1.append(str(y).upper()) 201 | 202 | for x in FASTA(path, fai=False): 203 | for y in (~(x.seq) if rc else x.seq).kmers(step=step, k=K): 204 | v2.append(str(y).upper()) 205 | 206 | assert v1 == v2 207 | 208 | testfile = 'test/data/MT-human.fa' 209 | test_kmer_iteration(testfile, 1) 210 | test_kmer_iteration(testfile, 2) 211 | test_kmer_iteration(testfile, 3) 212 | test_kmer_iteration(testfile, 4) 213 | test_kmer_iteration(testfile, 5) 214 | test_kmer_iteration(testfile, 25) 215 | test_kmer_iteration(testfile, 32) 216 | test_kmer_iteration(testfile, 31) 217 | test_kmer_iteration(testfile, 33) 218 | test_kmer_iteration(testfile, 64) 219 | test_kmer_iteration(testfile, 65) 220 | test_kmer_iteration(testfile, 129) 221 | --------------------------------------------------------------------------------
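The k-mer tests above exercise `kmers`, `kmers_with_pos`, `split`, and the canonical-form idiom `min(k, ~k)` checked by the pipeline tests. A minimal sketch of how these pieces combine outside the test suite — counting canonical 5-mers in the same FASTA file the tests read — is shown below; the counter itself is illustrative and is not a file in this repository.

    from bio import *

    counts = Dict[Kmer[5], int]()
    for rec in FASTA('test/data/MT-human.fa', fai=False):
        for k in rec.seq.kmers(step=1, k=5):
            c = min(k, ~k)  # canonical form, as in pipeline/canonical_opt.codon
            counts[c] = counts.get(c, 0) + 1
    print len(counts)  # number of distinct canonical 5-mers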