├── test ├── data │ ├── MT-human.fa.fai │ ├── MT-orang.fa.fai │ ├── seqs2.fasta.fai │ ├── sample_fasta_for_fai.fasta.fai │ ├── toy.bam │ ├── toy.bed │ ├── toy.cram │ ├── seqs.fasta.gz │ ├── seqs.fastq.gz │ ├── seqs.txt.gz │ ├── toy.bam.bai │ ├── toy.cram.crai │ ├── seqs.fasta.fai │ ├── invalid │ │ ├── seqs_bad_base.fasta.fai │ │ ├── invalid_fai_float.fai │ │ ├── invalid_fai_missing_col.fai │ │ ├── seqs_bad_base.txt │ │ ├── invalid_with_header.bed │ │ ├── seqs_bad_base.fastq │ │ ├── seqs_bad_name.fastq │ │ ├── seqs_bad_qual.fastq │ │ ├── seqs_bad_qual_len.fastq │ │ └── seqs_bad_base.fasta │ ├── seqs2.fasta │ ├── valid_fai.fai │ ├── sample_fasta_for_fai.fasta │ ├── seqs.txt │ ├── toy.sam │ ├── valid_without_header.bed │ ├── valid_with_header.bed │ ├── seqs.fastq │ ├── seqs.fasta │ └── toy.vcf ├── bench │ ├── revcomp.codon │ ├── fqcnt.codon │ ├── kmercnt.codon │ ├── hash.codon │ ├── fmindex.codon │ ├── hamming.codon │ ├── knucleotide.codon │ ├── cpg.codon │ ├── sw.codon │ ├── 16mer.codon │ ├── bedcov.codon │ ├── fasta.codon │ ├── rc.codon │ ├── fastx.codon │ └── match.codon ├── CMakeLists.txt.in ├── core │ ├── containers.codon │ ├── serialization.codon │ ├── match.codon │ ├── bltin.codon │ ├── bwtsa.codon │ ├── align.codon │ ├── proteins.codon │ └── kmers.codon ├── apps │ ├── snap │ │ ├── test.seq │ │ ├── genomeindex.seq │ │ └── hashtable.seq │ ├── minimap2 │ │ ├── sw_simple.codon │ │ └── sw.codon │ ├── mrsfast │ │ └── exact.codon │ ├── umi │ │ └── whitelist.codon │ ├── bwa │ │ ├── fastmap.codon │ │ └── fastmap_build.codon │ ├── cora │ │ ├── hom_exact.codon │ │ └── hom_inexact.codon │ └── avid │ │ └── avid.codon ├── pipeline │ ├── prefetch.codon │ ├── interalign.codon │ └── canonical_opt.codon └── main.cpp ├── .clang-format ├── logo └── logo.png ├── seq.h ├── ir ├── seq.h ├── seq.cpp ├── pipeline.h └── revcomp.h ├── plugin.toml ├── stdlib └── bio │ ├── __init__.codon │ ├── types.codon │ ├── prefetch.codon │ ├── block.codon │ ├── iter.codon │ ├── locus.codon │ ├── c_htslib.codon │ ├── fai.codon │ ├── fastq.codon │ ├── pseq.codon │ ├── builtin.codon │ └── bwa.codon ├── .gitignore ├── .github ├── build-linux │ ├── entrypoint.sh │ ├── Dockerfile.manylinux2014-x86_64 │ ├── Dockerfile.linux-x86_64 │ ├── Dockerfile.linux-aarch64 │ └── Dockerfile.manylinux2014-aarch64 └── workflows │ └── ci.yml ├── README.md ├── htslib-config.h.cmake └── sw └── ksw2_gg2_sse.cpp /test/data/MT-human.fa.fai: -------------------------------------------------------------------------------- 1 | MT_human 16569 10 60 61 2 | -------------------------------------------------------------------------------- /test/data/MT-orang.fa.fai: -------------------------------------------------------------------------------- 1 | MT_orang 16499 10 60 61 2 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | ColumnLimit: 88 4 | -------------------------------------------------------------------------------- /logo/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/logo/logo.png -------------------------------------------------------------------------------- /test/data/seqs2.fasta.fai: -------------------------------------------------------------------------------- 1 | chrA 10 6 10 11 2 | chrC 120 23 120 121 3 | -------------------------------------------------------------------------------- 
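The *.fa.fai and *.fasta.fai files above are FASTA index files in the samtools faidx layout: one tab-separated record per sequence with the columns NAME, LENGTH, OFFSET (byte offset of the first base), LINEBASES (bases per line) and LINEWIDTH (bytes per line, newline included); the six-column variant seen in valid_fai.fai further below adds a QUALOFFSET column for FASTQ. As a minimal sketch only (not the repo's bio.fai implementation, and the helper name is hypothetical), random access with such a record works as follows:

def fai_offset(length: int, offset: int, linebases: int, linewidth: int, pos: int) -> int:
    # Byte offset of 0-based base `pos`: every full line before it costs
    # `linewidth` bytes, the remainder costs one byte per base.
    assert 0 <= pos < length
    return offset + (pos // linebases) * linewidth + pos % linebases

# With the MT-human.fa.fai record "MT_human 16569 10 60 61":
print fai_offset(16569, 10, 60, 61, 0)    # 10 (first base, right after ">MT_human\n")
print fai_offset(16569, 10, 60, 61, 60)   # 71 (first base of the second line)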
/test/data/sample_fasta_for_fai.fasta.fai: -------------------------------------------------------------------------------- 1 | one 66 5 30 31 2 | two 28 98 14 15 3 | -------------------------------------------------------------------------------- /test/data/toy.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.bam -------------------------------------------------------------------------------- /test/data/toy.bed: -------------------------------------------------------------------------------- 1 | chr1 11 22 2 | chr2 33 44 3 | chr3 55 66 4 | -------------------------------------------------------------------------------- /test/data/toy.cram: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.cram -------------------------------------------------------------------------------- /test/data/seqs.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/seqs.fasta.gz -------------------------------------------------------------------------------- /test/data/seqs.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/seqs.fastq.gz -------------------------------------------------------------------------------- /test/data/seqs.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/seqs.txt.gz -------------------------------------------------------------------------------- /test/data/toy.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.bam.bai -------------------------------------------------------------------------------- /test/data/toy.cram.crai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exaloop/seq/HEAD/test/data/toy.cram.crai -------------------------------------------------------------------------------- /test/data/seqs.fasta.fai: -------------------------------------------------------------------------------- 1 | chrA 460 27 50 51 2 | chrB 489 503 50 51 3 | chrC 500 1008 50 51 4 | chrD 49 1530 49 50 5 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.fasta.fai: -------------------------------------------------------------------------------- 1 | chrA 460 6 50 51 2 | chrB 489 482 50 51 3 | chrC 500 987 50 51 4 | chrD 49 1503 49 50 5 | -------------------------------------------------------------------------------- /test/data/seqs2.fasta: -------------------------------------------------------------------------------- 1 | >chrA 2 | GCCTTAACAT 3 | >chrC 4 | GGGGGGGGGGGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAGGTATTCATCCTATGTGGGTAATTGAGGAGTATGCTAAGATTTTGCGTAGCGGGGGGGGGG 5 | -------------------------------------------------------------------------------- /test/data/valid_fai.fai: -------------------------------------------------------------------------------- 1 | fastq1 66 8 30 31 79 2 | fastq2 28 156 14 15 188 3 | fastq2 84 12 566 988 91 4 | fastq3 29 72 146 19 134 5 | fastq4 66 8 30 31 79 6 | fastq5 28 156 14 15 188 7 | 
-------------------------------------------------------------------------------- /test/data/sample_fasta_for_fai.fasta: -------------------------------------------------------------------------------- 1 | >one 2 | ATGCATGCATGCATGCATGCATGCATGCAT 3 | GCATGCATGCATGCATGCATGCATGCATGC 4 | ATGCAT 5 | >two another chromosome 6 | ATGCATGCATGCAT 7 | GCATGCATGCATGC 8 | -------------------------------------------------------------------------------- /test/data/invalid/invalid_fai_float.fai: -------------------------------------------------------------------------------- 1 | fastq1 66 8 30 31 79.2 2 | fastq2 28 156 14 15 188 3 | fastq2 84 12 566 988 91 4 | fastq3 29 72 146 19 134 5 | fastq4 66 8 30 31 79 6 | fastq5 28 156 14 15 188 7 | -------------------------------------------------------------------------------- /test/data/invalid/invalid_fai_missing_col.fai: -------------------------------------------------------------------------------- 1 | fastq2 28 156 14 15 188 2 | fastq1 66 8 30 31 3 | fastq2 84 12 566 988 91 4 | fastq3 29 72 146 19 134 5 | fastq4 66 8 30 31 79 6 | fastq5 28 156 14 15 188 7 | -------------------------------------------------------------------------------- /seq.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQ_H 2 | #define SEQ_H 3 | 4 | #include 5 | 6 | #define SEQ_FUNC extern "C" 7 | 8 | typedef int64_t seq_int_t; 9 | 10 | struct seq_t { 11 | seq_int_t len; 12 | char *seq; 13 | }; 14 | 15 | #endif /* SEQ_H */ 16 | -------------------------------------------------------------------------------- /ir/seq.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "codon/dsl/dsl.h" 4 | 5 | namespace seq { 6 | 7 | class Seq : public codon::DSL { 8 | public: 9 | void addIRPasses(codon::ir::transform::PassManager *pm, bool debug) override; 10 | }; 11 | 12 | } // namespace seq 13 | -------------------------------------------------------------------------------- /plugin.toml: -------------------------------------------------------------------------------- 1 | [about] 2 | name = "Seq" 3 | description = "a high-performance language for bioinformatics" 4 | version = "0.11.4" 5 | url = "https://seq-lang.org" 6 | supported = ">=0.18.2" 7 | 8 | [library] 9 | cpp = "build/libseq" 10 | codon = "stdlib" 11 | link = ["{root}/build/libseq_static.a"] 12 | -------------------------------------------------------------------------------- /test/bench/revcomp.codon: -------------------------------------------------------------------------------- 1 | import sys, bio 2 | 3 | def process(l): 4 | w = 60 5 | rc = ~bio.seq(''.join(l)) 6 | for i in range(0, len(rc), w): 7 | print rc[i:i + w] 8 | 9 | l = list[str]() 10 | for line in sys.stdin: 11 | if line[0] == '>': 12 | process(l) 13 | l.clear() 14 | print line 15 | else: 16 | l.append(line) 17 | process(l) 18 | -------------------------------------------------------------------------------- /test/bench/fqcnt.codon: -------------------------------------------------------------------------------- 1 | # FASTQ counter benchmark from https://github.com/lh3/biofast 2 | from sys import argv, exit, stderr 3 | from bio import * 4 | 5 | if len(argv) != 2: 6 | stderr.write("Usage: fqcnt.py \n") 7 | exit(1) 8 | 9 | n, slen, qlen = 0, 0, 0 10 | for r in FASTQ(argv[1], validate=False): 11 | n += 1 12 | slen += len(r.read) 13 | qlen += len(r.qual) 14 | 15 | print n, slen, qlen 16 | -------------------------------------------------------------------------------- 
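The revcomp.codon benchmark above relies on the bio module's unary ~ operator, which yields the reverse complement of a seq value (the same operator is exercised by rc.codon later in this dump). A small hedged sketch, separate from the benchmark, using the s'...' sequence literal that also appears in the repo's tests:

from bio import *

x = s'AAACCG'
rc = ~x                  # reverse complement
print x, rc              # AAACCG CGGTTT
assert rc == s'CGGTTT'
assert ~rc == x          # applying ~ twice returns the original sequence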
/test/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.2) 2 | 3 | project(googletest-download NONE) 4 | 5 | include(ExternalProject) 6 | ExternalProject_Add(googletest 7 | GIT_REPOSITORY https://github.com/google/googletest.git 8 | GIT_TAG release-1.10.0 9 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" 10 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /test/core/containers.codon: -------------------------------------------------------------------------------- 1 | @test 2 | def test_interval_tree(): 3 | from bio.intervals import IntervalTree 4 | t = IntervalTree() 5 | t.add("chr1", 20, 30, "a") 6 | t.add("chr2", 10, 30, "b") 7 | t.add("chr1", 10, 25, "c") 8 | t.index() 9 | overlaps = {(a.start, a.end) for a in t.overlap("chr1", 15, 22)} 10 | assert overlaps == {(20, 30), (10, 25)} 11 | assert "chr1" in t 12 | assert "chr2" in t 13 | assert "chr3" not in t 14 | assert {(a.start, a.end, a.data) for a in t} == {(20, 30, "a"), (10, 30, "b"), (10, 25, "c")} 15 | assert len(t) == 3 16 | test_interval_tree() 17 | -------------------------------------------------------------------------------- /test/data/seqs.txt: -------------------------------------------------------------------------------- 1 | GTCCTAAATTGTTGTACGAAAGAACGTGACAGAGGGAAGGCACTCGGCGTGGCTGAGAGTTGCGGCTACCGCACTGTTACACGGTATGCTAGTTAAAACTTGGAAGAGGGCAAAGCGACTATGCACTGTGGCTGGATCGCTATGACCCCTG 2 | GACGTGTGGAGTAAGCATAAGTCACTATATCAACAAGCCCGCAACAATACTTGTAGAGAATCTGAACCGGCTAGGCGCTCAACGCTACAGGGTGTCATTTCGTACCCCTAACACTGCTATTCGTTTCGGAAGAGGCACCTCGGTGAAGAAA 3 | GAAGCTGGAGCGAAGTCGATGTTTTTGCTGTACCAGGCGTGAGTTTGTAGATAAGCGGTCTGATACCGCAGAAGCAGGGTACAGTATAGACACGGGTTAAGTCGAGAGACAGGTCAAACAATTAACGCCAAAGAGGTCCCAGTCAGGAGCT 4 | AATGAAGTGGGGTAATTATGATTCTATAAATTTGTAGGGAAATGGGTTTAGCGCCTGAACGACAAGCGATAGATTATGGGCTGAGGAATAGTAGTTACTCCGCGGGCGGCCGCATTCGATATTTTGCGTCATCATAGATCAAGTTTCCGGG 5 | -------------------------------------------------------------------------------- /test/bench/kmercnt.codon: -------------------------------------------------------------------------------- 1 | # Implementation of benchmark at https://github.com/lh3/kmer-cnt 2 | # Usage: seqc kmercnt.seq 3 | from sys import argv 4 | from time import timing 5 | from bio import * 6 | 7 | def print_hist(h, N = 256): 8 | cnt = [0 for _ in range(N)] 9 | for v in h.values(): 10 | cnt[min(v, N - 1)] += 1 11 | for i in range(1, N): 12 | print f'{i}\t{cnt[i]}' 13 | 14 | with timing('k-mer counting'), FASTQ(argv[1], copy=False, validate=False) as fastq: 15 | h: Dict[Kmer[31], int] = {} 16 | fastq |> seqs |> kmers(step=1, k=31) |> canonical |> h.increment 17 | print_hist(h) 18 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.txt: -------------------------------------------------------------------------------- 1 | GTCCTAAATTGTTGTACGAAAGAACGTGACAGAGGGAAGGCACTCGGCGTGGCTGAGAGTTGCGGCTACCGCACTGTTACACGGTATGCTAGTTAAAACTTGGAAGAGGGCAAAGCGACTATGCACTGTGGCTGGATCGCTATGACCCCTG 2 | GACGTGTGGAGTAAGCATAAGTCACTATATCAACAAGCCCGCAACAATACTTGTAGAGAATCTGAACCGGCTAGZCGCTCAACGCTACAGGGTGTCATTTCGTACCCCTAACACTGCTATTCGTTTCGGAAGAGGCACCTCGGTGAAGAAA 3 | 
GAAGCTGGAGCGAAGTCGATGTTTTTGCTGTACCAGGCGTGAGTTTGTAGATAAGCGGTCTGATACCGCAGAAGCAGGGTACAGTATAGACACGGGTTAAGTCGAGAGACAGGTCAAACAATTAACGCCAAAGAGGTCCCAGTCAGGAGCT 4 | AATGAAGTGGGGTAATTATGATTCTATAAATTTGTAGGGAAATGGGTTTAGCGCCTGAACGACAAGCGATAGATTATGGGCTGAGGAATAGTAGTTACTCCGCGGGCGGCCGCATTCGATATTTTGCGTCATCATAGATCAAGTTTCCGGG 5 | -------------------------------------------------------------------------------- /stdlib/bio/__init__.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import * 2 | from bio.kmer import * 3 | from bio.pseq import * 4 | 5 | from bio.builtin import * 6 | 7 | from bio.block import Block, blocks 8 | from bio.locus import Locus 9 | from bio.iter import Seqs 10 | 11 | from bio.align import SubMat, CIGAR, Alignment 12 | from bio.pseq import pseq, translate 13 | from bio.bwt import _saisxx, _saisxx_bwt 14 | 15 | from bio.fasta import FASTARecord, FASTA, pFASTARecord, pFASTA 16 | from bio.fastq import FASTQRecord, FASTQ 17 | from bio.fai import FAIRecord, FAI 18 | from bio.bam import SAMRecord, SAM, BAM, CRAM 19 | from bio.bed import BEDRecord, BED 20 | from bio.vcf import VCFRecord, VCF, BCF 21 | 22 | from bio.prefetch import * 23 | from bio.types import * 24 | -------------------------------------------------------------------------------- /ir/seq.cpp: -------------------------------------------------------------------------------- 1 | #include "seq.h" 2 | #include "pipeline.h" 3 | #include "revcomp.h" 4 | 5 | #include "codon/cir/transform/lowering/pipeline.h" 6 | 7 | namespace seq { 8 | 9 | void Seq::addIRPasses(codon::ir::transform::PassManager *pm, bool debug) { 10 | pm->registerPass(std::make_unique()); 11 | if (debug) 12 | return; 13 | auto dep = codon::ir::transform::lowering::PipelineLowering::KEY; 14 | pm->registerPass(std::make_unique(), dep); 15 | pm->registerPass(std::make_unique(), dep); 16 | pm->registerPass(std::make_unique(), dep); 17 | } 18 | 19 | } // namespace seq 20 | 21 | extern "C" std::unique_ptr load() { return std::make_unique(); } 22 | -------------------------------------------------------------------------------- /test/data/toy.sam: -------------------------------------------------------------------------------- 1 | @SQ SN:ref LN:45 2 | @SQ SN:ref2 LN:40 3 | r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 4 | r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * 5 | r003 0 ref 9 30 5H6M * 0 0 AGCTAA * 6 | r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * 7 | r003 16 ref 29 30 6H5M * 0 0 TAGGC * 8 | r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * 9 | x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? 10 | x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? 11 | x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? 12 | x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? 13 | x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? 14 | x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? 
15 | -------------------------------------------------------------------------------- /test/bench/hash.codon: -------------------------------------------------------------------------------- 1 | ############################## 2 | # Test k-mer hash collisions # 3 | ############################## 4 | from sys import argv 5 | from bio import * 6 | d = {} 7 | #d.resize(1 << 32) 8 | 9 | def test(use_bad_hash: bool, K: Static[int]): 10 | def update(kmer, use_bad_hash, seen): 11 | if kmer not in seen: 12 | h = int(kmer.as_int()) if use_bad_hash else hash(kmer) 13 | d[h] = d.get(h, 0) + 1 14 | seen.add(kmer) 15 | 16 | seen: Set[Kmer[K]] = set() 17 | #seen.resize(1 << 32) 18 | FASTA(argv[1]) |> seqs |> kmers(1, K) |> update(use_bad_hash, seen) 19 | m = max((v, k) for k,v in d.items())[0] 20 | a = sum(v for v in d.values()) / len(d) 21 | print f'{K}-mer ({use_bad_hash=}):\tmax={m}, avg={a}' 22 | d.clear() 23 | 24 | print 'start' 25 | test(False, 64) 26 | test(True, 64) 27 | -------------------------------------------------------------------------------- /test/bench/fmindex.codon: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Prefetch benchmark # 3 | ###################### 4 | from sys import argv 5 | from bio import * 6 | from bio.fmindex import FMIndex 7 | from time import timing 8 | from pickle import load 9 | import gzip 10 | 11 | fmi = None 12 | with gzip.open(argv[1], 'rb') as jar: 13 | fmi = load(jar, FMIndex) 14 | 15 | step = 20 16 | n = 0 17 | 18 | def update(count): 19 | global n 20 | n += count 21 | 22 | @prefetch 23 | def find(s, fmi): 24 | intv = fmi.interval(s[-1]) 25 | s = s[:-1] 26 | while s and intv: 27 | intv = fmi[intv, s[-1]] 28 | s = s[:-1] 29 | return len(intv) 30 | 31 | for k in (10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32): 32 | n = 0 33 | with timing(f'{k=}'): 34 | FASTQ(argv[2]) |> seqs |> split(k, step=step) |> find(fmi) |> update 35 | print n 36 | -------------------------------------------------------------------------------- /test/data/valid_without_header.bed: -------------------------------------------------------------------------------- 1 | chr7 127471196 127472363 Pos1 882.13435 + 127471196 127472363 255,0,0 4 11,12,13,14 14,55,66,99 2 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 3 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 4 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 5 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 6 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 7 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 8 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 9 | chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Generic .gitignore # 3 | ###################### 4 | 5 | # Compiled source # 6 | ################### 7 | *.com 8 | *.class 9 | *.dll 10 | *.exe 11 | *.o 12 | *.a 13 | *.obj 14 | *.so 15 | *.dylib 16 | *.pyc 17 | build/ 18 | build_*/ 19 | extra/jupyter/build/ 20 | 21 | # Packages # 22 | ############ 23 | # it's better to unpack these files and commit the raw source 24 | # git has its own built-in compression methods 25 | *.7z 26 | *.dmg 27 | *.iso 28 | *.jar 29 | *.rar 30 | *.tar 31 | *.zip 32 | 33 | # Logs and databases # 34 | 
###################### 35 | *.log 36 | *.sql 37 | *.sqlite 38 | 39 | # OS generated files # 40 | ###################### 41 | .DS_Store 42 | .DS_Store? 43 | ._* 44 | .Spotlight-V100 45 | .Trashes 46 | ehthumbs.db 47 | Thumbs.db 48 | 49 | # IDE generated files # 50 | ####################### 51 | .idea 52 | .mypy_cache 53 | .vscode 54 | 55 | extra/jupyter/share/jupyter/kernels/codon/kernel.json 56 | scratch.* 57 | -------------------------------------------------------------------------------- /stdlib/bio/types.codon: -------------------------------------------------------------------------------- 1 | @extend 2 | class byte: 3 | def comp(self) -> byte: 4 | _byte_comp_table = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN-.NNNNNNNNNNNNNNNNNNTVGHNNCDNNMNKNNNNYSAABWNRNNNNNNNtvghNNcdNNmNknNNNysaabwNrNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" 5 | return _byte_comp_table.ptr[int(self)] 6 | 7 | @extend 8 | class Dict: 9 | def prefetch(self, key: K): 10 | if self._n_buckets: 11 | from internal.types.collections.dict import _dict_hash 12 | mask = self._n_buckets - 1 13 | k = _dict_hash(key) 14 | i = k & mask 15 | (self._keys + i).__prefetch_r1__() 16 | (self._vals + i).__prefetch_r1__() 17 | (self._flags + (i >> 4)).__prefetch_r1__() 18 | 19 | @extend 20 | class List: 21 | def prefetch(self, idx: int): 22 | (self.arr.ptr + idx).__prefetch_r3__() 23 | -------------------------------------------------------------------------------- /ir/pipeline.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "codon/cir/cir.h" 4 | #include "codon/cir/transform/pass.h" 5 | 6 | namespace seq { 7 | 8 | class PipelineSubstitutionOptimization : public codon::ir::transform::OperatorPass { 9 | static const std::string KEY; 10 | std::string getKey() const override { return KEY; } 11 | void handle(codon::ir::PipelineFlow *) override; 12 | }; 13 | 14 | class PipelinePrefetchOptimization : public codon::ir::transform::OperatorPass { 15 | const unsigned SCHED_WIDTH_PREFETCH = 16; 16 | static const std::string KEY; 17 | std::string getKey() const override { return KEY; } 18 | void handle(codon::ir::PipelineFlow *) override; 19 | }; 20 | 21 | class PipelineInterAlignOptimization : public codon::ir::transform::OperatorPass { 22 | const unsigned SCHED_WIDTH_INTERALIGN = 2048; 23 | static const std::string KEY; 24 | std::string getKey() const override { return KEY; } 25 | void handle(codon::ir::PipelineFlow *) override; 26 | }; 27 | 28 | } // namespace seq 29 | -------------------------------------------------------------------------------- /test/data/invalid/invalid_with_header.bed: -------------------------------------------------------------------------------- 1 | browser position chr7:127471196-127495720 2 | browser hide all 3 | track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" 4 | chr7 127471196a 127472363 Pos1 0 + 127471196 127472363 255,0,0 4 11,12,13,14 14,55,66,99 5 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 6 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 7 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 8 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 9 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 10 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 11 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 12 | chr7 
127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 13 | -------------------------------------------------------------------------------- /test/data/valid_with_header.bed: -------------------------------------------------------------------------------- 1 | browser position chr7:127471196-127495720 2 | browser hide all 3 | track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" 4 | chr7 127471196 127472363 Pos1 882.13435 + 127471196 127472363 255,0,0 4 11,12,13,14 14,55,66,99 5 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 6 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 7 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 8 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 9 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 10 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 11 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 12 | chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 13 | -------------------------------------------------------------------------------- /stdlib/bio/prefetch.codon: -------------------------------------------------------------------------------- 1 | @inline 2 | def _dynamic_coroutine_scheduler[A,B,T,C](value: A, coro: B, states: Array[Generator[T]], I: Ptr[int], N: Ptr[int], M: int, args: C): 3 | n = N[0] 4 | if n < M: 5 | states[n] = coro(value, *args) 6 | N[0] = n + 1 7 | else: 8 | i = I[0] 9 | while True: 10 | g = states[i] 11 | if g.done(): 12 | if not isinstance(T, None): 13 | yield g.next() 14 | g.destroy() 15 | states[i] = coro(value, *args) 16 | break 17 | i = (i + 1) & (M - 1) 18 | I[0] = i 19 | 20 | @inline 21 | def _dynamic_coroutine_scheduler_drain[T](states: Array[Generator[T]], N: int): 22 | i = 0 23 | while i < N: 24 | g = states[i] 25 | while not g.done(): 26 | g.next() 27 | if not isinstance(T, None): 28 | yield g.next() 29 | g.destroy() 30 | i += 1 31 | 32 | @inline 33 | def _dummy_prefetch_terminal_stage(x): 34 | pass 35 | -------------------------------------------------------------------------------- /test/core/serialization.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | import pickle 3 | 4 | @test 5 | def test_pickle[T](x: T): 6 | import gzip 7 | path = 'testjar.bin' 8 | jar = gzip.open(path, 'wb') 9 | pickle.dump(x, jar) 10 | jar.close() 11 | 12 | jar = gzopen(path, 'rb') 13 | y = pickle.load(jar, T) 14 | jar.close() 15 | 16 | assert x == y 17 | 18 | K = Kmer[8] 19 | test_pickle(s'ACGTAAGG') 20 | test_pickle(~s'ACGTAAGG') 21 | test_pickle(K(s'ACGTAAGG')) 22 | test_pickle([K(s'ACGTAAGG'), K(s'TATCTGTT')]) 23 | test_pickle(list[K]()) 24 | test_pickle({K(s'ACGTAAGG'), K(s'CATTTTTA')}) 25 | test_pickle({~s'ACGTAAGG'}) 26 | test_pickle({K(s'ACGTAAGG'), K(s'TTTTGGTT')}) 27 | test_pickle(set[K]()) 28 | test_pickle({K(s'ACGTAAGG'): 99, K(s'TTATTCTT'): 42}) 29 | test_pickle(dict[K,K]()) 30 | test_pickle({~s'ACGTAAGG': ~s'ACGTAAGG'}) 31 | test_pickle((42, 3.14, True, byte(90), s'ACGTAAGG', K(s'ACGTAAGG'))) 32 | test_pickle({i32(42): [[{s'ACG', s'ACGTAGCG', ~s'ACGTAGCG'}, {s'ACG', s'ACGTAGCG', ~s'ACGTAGCG'}], list[set[seq]](), [set[seq]()], [{~s''}, {s'', s'GCGC'}]]}) 33 | -------------------------------------------------------------------------------- /test/apps/snap/test.seq: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | from bio import FASTQ 3 | from genomeindex import 
* 4 | K: Static[int] = 20 5 | 6 | def update(counts, pos, max_pos, max_count): 7 | count = counts.get(pos, 0) + 1 8 | counts[pos] = count 9 | return (pos, count) if count > max_count else (max_pos, max_count) 10 | 11 | def main(args): 12 | index = GenomeIndex[Kmer[K]](args[0]) 13 | for read in FASTQ(args[1]): 14 | counts: Dict[int, int] = {} 15 | max_pos, max_count = 0, 0 16 | 17 | for i,kmer in enumerate(read.seq.kmers(K, K)): 18 | offset = i * K 19 | hits = index[kmer] 20 | hits_rev = index[~kmer] 21 | 22 | for i in range(len(hits)): 23 | pos = int(hits[i]) - offset 24 | max_pos, max_count = update(counts, pos, max_pos, max_count) 25 | 26 | for i in range(len(hits_rev)): 27 | pos = int(hits_rev[i]) - offset 28 | max_pos, max_count = update(counts, pos, max_pos, max_count) 29 | 30 | print read, max_pos 31 | 32 | if len(argv) > 0: 33 | main(argv) 34 | -------------------------------------------------------------------------------- /.github/build-linux/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -l 2 | set -e 3 | set -x 4 | 5 | WORKSPACE="${1:-/github/workspace}" 6 | 7 | export ARCHDEFAULT="$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m)" 8 | ARCH=${2:-$ARCHDEFAULT} 9 | 10 | TEST=${3:-no} 11 | CODON_VERSION=${4:-0.19.2} 12 | 13 | echo "Workspace: ${WORKSPACE}; arch: ${ARCH}" 14 | cd "$WORKSPACE" 15 | 16 | curl -L https://github.com/exaloop/codon/releases/download/v${CODON_VERSION}/codon-${ARCH}.tar.gz | tar zxvf - 17 | export CODON_DIR=$(pwd)/codon-deploy-${ARCH} 18 | 19 | # Build Seq 20 | cmake -S . -B build \ 21 | -G Ninja \ 22 | -DCMAKE_BUILD_TYPE=Release \ 23 | -DCODON_PATH=${CODON_DIR} \ 24 | -DCMAKE_C_COMPILER=/opt/llvm-codon/bin/clang \ 25 | -DCMAKE_CXX_COMPILER=/opt/llvm-codon/bin/clang++ 26 | cmake --build build 27 | cmake --install build --prefix=${CODON_DIR}/lib/codon/plugins/seq 28 | 29 | # Test 30 | if [ "$TEST" = "yes" ]; then 31 | CODON_PATH=${CODON_DIR}/lib/codon/stdlib build/seqtest 32 | fi 33 | 34 | # Package 35 | export BUILD_ARCHIVE=seq-${ARCH}.tar.gz 36 | tar czf ${BUILD_ARCHIVE} -C ${CODON_DIR}/lib/codon/plugins seq/ 37 | du -sh ${BUILD_ARCHIVE} 38 | -------------------------------------------------------------------------------- /ir/revcomp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "codon/cir/cir.h" 4 | #include "codon/cir/dsl/nodes.h" 5 | #include "codon/cir/transform/pass.h" 6 | 7 | namespace seq { 8 | 9 | class KmerRevcomp 10 | : public codon::ir::AcceptorExtend { 11 | private: 12 | codon::ir::Value *kmer; 13 | 14 | public: 15 | static const char NodeId; 16 | 17 | using AcceptorExtend::AcceptorExtend; 18 | 19 | explicit KmerRevcomp(codon::ir::Value *kmer) : AcceptorExtend(), kmer(kmer) {} 20 | 21 | std::unique_ptr getBuilder() const override; 22 | std::unique_ptr getCFBuilder() const override; 23 | 24 | bool match(const codon::ir::Value *v) const override; 25 | codon::ir::Value *doClone(codon::ir::util::CloneVisitor &cv) const override; 26 | std::ostream &doFormat(std::ostream &os) const override; 27 | }; 28 | 29 | class KmerRevcompInterceptor : public codon::ir::transform::Pass { 30 | static const std::string KEY; 31 | std::string getKey() const override { return KEY; } 32 | void run(codon::ir::Module *) override; 33 | }; 34 | 35 | } // namespace seq 36 | -------------------------------------------------------------------------------- /test/bench/hamming.codon: 
-------------------------------------------------------------------------------- 1 | ############################### 2 | # Hammming distance benchmark # 3 | ############################### 4 | from sys import argv 5 | from time import timing 6 | from bio import * 7 | 8 | def dist_fast(k1, k2): 9 | return abs(k1 - k2) 10 | 11 | def dist_slow(k1, k2): 12 | d = 0 13 | for i in range(type(k1).len()): 14 | if k1[i] != k2[i]: 15 | d += 1 16 | return d 17 | 18 | def test(use_slow_dist, K: Static[int]): 19 | n = 0 20 | with timing(f'{K}-mer ({use_slow_dist=})'): 21 | for s in FASTA(argv[1]) |> seqs: 22 | for kmer in s |> kmers(1, K): 23 | d = 0 24 | rckmer = ~kmer 25 | if use_slow_dist: 26 | d = dist_slow(rckmer, kmer) 27 | else: 28 | d = dist_fast(rckmer, kmer) 29 | n ^= d 30 | print n 31 | 32 | print 'start' 33 | test(False, 4) 34 | test(True, 4) 35 | test(False, 8) 36 | test(True, 8) 37 | test(False, 16) 38 | test(True, 16) 39 | test(False, 32) 40 | test(True, 32) 41 | test(False, 64) 42 | test(True, 64) 43 | test(False, 128) 44 | test(True, 128) 45 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.manylinux2014-x86_64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2014_x86_64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Python needs OpenSSL >= 1.1. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget openssl11-devel \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install ninja-build && \ 13 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 14 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 15 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-manylinux2014-x86_64.tar.bz2 | tar jxf - -C /opt && \ 16 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig 17 | 18 | COPY entrypoint.sh /entrypoint.sh 19 | ENTRYPOINT ["/entrypoint.sh"] 20 | -------------------------------------------------------------------------------- /test/bench/knucleotide.codon: -------------------------------------------------------------------------------- 1 | import sys, time 2 | from bio import * 3 | 4 | def pad(x, n, w): 5 | s = str(x) 6 | if len(s) < n: 7 | return s + (w * (n - len(s))) 8 | return s 9 | 10 | def hashcnt(s: seq, K: Static[int]): 11 | d = {} 12 | for k in s.kmers(1, k=K): 13 | d[k] = d.get(k, 0) + 1 14 | return d 15 | 16 | def cnt(s: seq, q: Kmer[K], K: Static[int]): 17 | d = hashcnt(s, K=K) 18 | print str(d.get(q, 0)) + "\t" + str(q) 19 | 20 | def freq(s: seq): 21 | d1 = hashcnt(s, K=1) 22 | for k, v in sorted(d1.items(), lambda a: -a[1]): 23 | print k, pad(round((100.0 * v) / len(s), 3), 6, '0') 24 | print 25 | 26 | d2 = hashcnt(s, K=2) 27 | for k, v in sorted(d2.items(), lambda a: -a[1]): 28 | print k, pad(round((100.0 * v) / (len(s)-1), 3), 5, '0') 29 | print 30 | 31 | def process(sq): 32 | freq(sq) 33 | cnt(sq, Kmer[3](s'GGT')) 34 | cnt(sq, Kmer[4](s'GGTA')) 35 | cnt(sq, Kmer[6](s'GGTATT')) 36 | cnt(sq, Kmer[12](s'GGTATTTTAATT')) 37 | cnt(sq, 
Kmer[18](s'GGTATTTTAATTTATAGT')) 38 | 39 | t = time.time() 40 | for line in sys.stdin: 41 | if line[:6] == '>THREE': 42 | l = list[str]() 43 | for line in sys.stdin: 44 | if line[0] == '>': break 45 | l.append(line) 46 | s = seq(str.cat(l)) 47 | process(s) 48 | break 49 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.fastq: -------------------------------------------------------------------------------- 1 | @SL-HXF:348:HKLFWCCXX:1:2101:15676:57231:CACCAAAAGTACATGA 2 | GTGCACAGAAAAAAAGGTTAAATTGAAAAGTAAATATGATAGAAATGATTGCAAATGTTGGCAAACCACTAAATCGACTAAAACTTGAATAAAAGTAAAAATCATCCATGTCATTTATAAAGCGACTCAACTAAAGCATAAGGATATAAGA 3 | + 4 | AAFFFKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKFKKKKKKKKKKKKKFKKKKKKKFKKFKKKKFKKKFK<,AAAFKFKKFAFKKA,,,A tuple[int, int]: 39 | match s: 40 | case 'C*': return is_cpg_i(s[1:], c + 1, g) 41 | case 'G*': return is_cpg_i(s[1:], c, g + 1) 42 | case _: return c, g 43 | 44 | def cpgs(s): 45 | i = 0 46 | while i < len(s): 47 | c, g = is_cpg_i(s[i:], 0, 0) 48 | if c and g: 49 | yield c + g 50 | i += c + g + 1 51 | 52 | cnt = 0 53 | def collect(c): 54 | global cnt, m, M 55 | cnt += 1 56 | m = min(m, c) 57 | M = max(M, c) 58 | 59 | def idiomatic(): 60 | FASTA(sys.argv[1], fai=False) |> seqs |> cpgs |> collect 61 | print cnt, m, M 62 | 63 | with time.timing("naive"): 64 | naive() 65 | 66 | m, M = 99999, 0 67 | with time.timing("idiomatic"): 68 | idiomatic() 69 | -------------------------------------------------------------------------------- /test/data/seqs.fastq: -------------------------------------------------------------------------------- 1 | @SL-HXF:348:HKLFWCCXX:1:2101:15676:57231:CACCAAAAGTACATGA comment A B C 2 | GTGCACAGAAAAAAAGGTTAAATTGAAAAGTAAATATGATAGAAATGATTGCAAATGTTGGCAAACCACTAAATCGACTAAAACTTGAATAAAAGTAAAAATCATCCATGTCATTTATAAAGCGACTCAACTAAAGCATAAGGATATAAGA 3 | + 4 | AAFFFKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKFKKKKKKKKKKKKKFKKKKKKKFKKFKKKKFKKKFK<,AAAFKFKKFAFKKA,,,A process_intra 47 | print checksum 48 | 49 | checksum = 0 50 | with timing(f'inter ({m=})'): 51 | zip(seqs(in1), seqs(in2)) |> process_inter 52 | print checksum 53 | -------------------------------------------------------------------------------- /test/bench/16mer.codon: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from bio import * 4 | 5 | def revcomp(c): 6 | return ('A' if c == 'T' else \ 7 | ('C' if c == 'G' else \ 8 | ('G' if c == 'C' else \ 9 | ('T' if c == 'A' else c)))) 10 | 11 | def process(k): 12 | return len(k) 13 | 14 | def ksplit(s, k, step): 15 | i = 0 16 | while i + k <= len(s): 17 | yield s[i:i + k] 18 | i += step 19 | 20 | def sym(s, k): 21 | return 1 if all(s[i] == revcomp(s[-i-1]) for i in range(k//2)) else 0 22 | 23 | def naive(): 24 | f = FASTA(sys.argv[1], fai=False) 25 | total, t2, t3 = 0, 0, 0 26 | k = 16 27 | for l in f: 28 | for s in ksplit(str(l.seq), k, 1): 29 | total += process(s) 30 | t2 += 1 31 | t3 += sym(s, k) 32 | print total, t2, t3 33 | 34 | t1 = 0 35 | def process_i(k): 36 | global t1 37 | t1 += len(k) 38 | return k 39 | 40 | t2 = 0 41 | def count(k): 42 | global t2 43 | t2 += 1 44 | return k 45 | 46 | t3 = 0 47 | def sym_i(k): 48 | global t3 49 | def is_sym(s) -> bool: 50 | match s: 51 | case 'A*T' | 'T*A' | 'C*G' | 'G*C' | 'N*N': 52 | return is_sym(s[1:-1]) 53 | case '': return True 54 | case _: return False 55 | t3 += 1 if is_sym(k) else 0 56 | 57 | def idiomatic(): 58 | 
(FASTA(sys.argv[1], fai=False) 59 | |> seqs 60 | |> split(16, 1) 61 | |> process_i 62 | |> count 63 | |> sym_i) 64 | print t1, t2, t3 65 | 66 | with time.timing("naive"): 67 | naive() 68 | with time.timing("idiomatic"): 69 | idiomatic() 70 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.linux-x86_64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux_2_28_x86_64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Codon's clang must be forced to use the correct gcc-14 toolset for C++20 support via clang.cfg. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget openssl-devel \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install https://dl.fedoraproject.org/pub/archive/epel/7/x86_64/Packages/n/ninja-build-1.10.2-3.el7.x86_64.rpm && \ 13 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 14 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 15 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-linux-x86_64.tar.bz2 | tar jxf - -C /opt && \ 16 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig && \ 17 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14" > /opt/llvm-codon/bin/clang.cfg && \ 18 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/x86_64-redhat-linux/14" > /opt/llvm-codon/bin/clang++.cfg 19 | 20 | COPY entrypoint.sh /entrypoint.sh 21 | ENTRYPOINT ["/entrypoint.sh"] 22 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.linux-aarch64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux_2_28_aarch64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Codon's clang must be forced to use the correct gcc-14 toolset for C++20 support via clang.cfg. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 
8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget openssl-devel \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/ninja-build-1.10.2-3.el7.aarch64.rpm && \ 13 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 14 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 15 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-linux-aarch64.tar.bz2 | tar jxf - -C /opt && \ 16 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig && \ 17 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/aarch64-redhat-linux/14" > /opt/llvm-codon/bin/clang.cfg && \ 18 | echo "--gcc-install-dir=/opt/rh/gcc-toolset-14/root/usr/lib/gcc/aarch64-redhat-linux/14" > /opt/llvm-codon/bin/clang++.cfg 19 | 20 | COPY entrypoint.sh /entrypoint.sh 21 | ENTRYPOINT ["/entrypoint.sh"] 22 | -------------------------------------------------------------------------------- /stdlib/bio/block.codon: -------------------------------------------------------------------------------- 1 | @tuple 2 | class Block[T]: 3 | ''' 4 | Represents a block of data; useful in parallelization to batch data 5 | ''' 6 | _data: Ptr[T] 7 | _size: int 8 | 9 | def __new__(size: int): 10 | return Block[T](Ptr[T](size), 0) 11 | 12 | def __iter__(self): 13 | data = self._data 14 | size = self._size 15 | i = 0 16 | while i < size: 17 | yield data[i] 18 | i += 1 19 | 20 | def __getitem__(self, idx: int): 21 | if not (0 <= idx < len(self)): 22 | raise ValueError("block index out of range") 23 | return self._data[idx] 24 | 25 | def __len__(self): 26 | return self._size 27 | 28 | def __bool__(self): 29 | return len(self) != 0 30 | 31 | def __repr__(self): 32 | return f'' 33 | 34 | def _add(self, elem: T): 35 | self._data[self._size] = elem 36 | return Block[T](self._data, self._size + 1) 37 | 38 | def _blocks[T](g: Generator[T], size: int): 39 | b = Block[T](size) 40 | for a in g: 41 | if len(b) == size: 42 | yield b 43 | b = Block[T](size) 44 | b = b._add(a) 45 | if b: 46 | yield b 47 | 48 | def blocks(x, size: int): 49 | ''' 50 | Partitions the given object into blocks of the specified size 51 | by calling the `__blocks__` magic method. 
52 | ''' 53 | if size <= 0: 54 | raise ValueError(f"invalid block size: {size}") 55 | if isinstance(x, Generator): 56 | return _blocks(x, size) 57 | else: 58 | return x.__blocks__(size) 59 | -------------------------------------------------------------------------------- /test/bench/bedcov.codon: -------------------------------------------------------------------------------- 1 | # BED coverage benchmark from https://github.com/lh3/biofast 2 | from sys import argv 3 | from time import timing 4 | from bio import * 5 | from bio.intervals import IntervalTree 6 | 7 | with timing('bed coverage (total)'): 8 | interval_tree = IntervalTree() 9 | 10 | with timing('reading first BED file'): 11 | for record in BED(argv[1], copy=True, validate=False): 12 | interval_tree.add(record.chrom, record.chrom_start, record.chrom_end, None) 13 | 14 | with timing('indexing'): 15 | interval_tree.index() 16 | 17 | with timing('querying second BED file'): 18 | for record in BED(argv[2], copy=False, validate=False): 19 | cov, cov_st, cov_en, n = 0, 0, 0, 0 20 | st1, en1 = record.chrom_start, record.chrom_end 21 | for item in interval_tree.overlap(record.chrom, st1, en1): 22 | n += 1 23 | # calcualte overlap length/coverage 24 | st0, en0 = item.start, item.end 25 | if st0 < st1: st0 = st1 26 | if en0 > en1: en0 = en1 27 | if st0 > cov_en: # no overlap with previous found intervals 28 | # set coverage to current interval 29 | cov += cov_en - cov_st 30 | cov_st, cov_en = st0, en0 31 | elif cov_en < en0: 32 | cov_en = en0 # overlap with previous found intervals 33 | cov += cov_en - cov_st 34 | # print chrom, start, end, count, # of coverage nt 35 | print f'{record.chrom}\t{record.chrom_start}\t{record.chrom_end}\t{n}\t{cov}' 36 | -------------------------------------------------------------------------------- /.github/build-linux/Dockerfile.manylinux2014-aarch64: -------------------------------------------------------------------------------- 1 | FROM quay.io/pypa/manylinux2014_aarch64 2 | 3 | # - Codon needs ninja >= 1.10. 4 | # - Codon needs Python 3 with shared libraries. 5 | # Since manylinux ships with static libraries, pyenv is used to build shared libraries. 6 | # - Python needs OpenSSL >= 1.1. 7 | # - The recent git security checks are disabled by marking everything "safe" until all dependencies get updated. 
8 | RUN yum -y update && \ 9 | yum -y install gcc gcc-c++ gcc-gfortran make wget \ 10 | patch bzip2 readline-devel sqlite sqlite-devel \ 11 | bzip2-devel tk-devel libffi-devel xz-devel zlib-devel && \ 12 | yum -y install https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/openssl11-devel-1.1.1k-5.el7.aarch64.rpm \ 13 | https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/openssl11-1.1.1k-5.el7.aarch64.rpm \ 14 | https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/openssl11-libs-1.1.1k-5.el7.aarch64.rpm && \ 15 | yum -y install https://linux.mellanox.com/public/repo/doca/2.2.0/centos7.6/aarch64/ninja-build-1.10.2-3.el7.aarch64.rpm && \ 16 | git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ 17 | CPPFLAGS="-I/usr/include/openssl11" LDFLAGS="-L/usr/lib64/openssl11 -lssl -lcrypto" ~/.pyenv/bin/pyenv install -s 3.11 && \ 18 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-manylinux2014-aarch64.tar.bz2 | tar jxf - -C /opt && \ 19 | echo -ne "[safe]\ndirectory = *" > /root/.gitconfig 20 | 21 | COPY entrypoint.sh /entrypoint.sh 22 | ENTRYPOINT ["/entrypoint.sh"] 23 | -------------------------------------------------------------------------------- /test/data/invalid/seqs_bad_base.fasta: -------------------------------------------------------------------------------- 1 | >chrA 2 | TCCTCCCCGTTCGCTGGACCCTTGGTCTATCTAGTCAAGAATTAACTCCC 3 | ATTTAGTTGGCTGTTCGTCGATTACCCCTACTGGACCGTCGCAACGGCTC 4 | ACGTGGAGGTCTTAGACCAAGAAAGCTACTGTATGCGGGGATATCACATC 5 | AGATTGCCAGGCGAGCAGCTCTAGCGTGACACGCCTAGACTCATTCGTTG 6 | TTCCTTGTCAATCCCAGGGGTCTCCACAGGGAGTGGATCGAGCTAATCAC 7 | CGTTTCGAGTCCGTCAGGCGGAGAGTAGCAGTAAGTACAAACTTCTGCTA 8 | GTCGCTCTGCCACAACGTAGCCACCTAAGATTAACCCTGGAATTGTCCGG 9 | GCGGCATGATCCATCGAGGAGTTAGCGGGGACAGGGAGTTACCAGTCGAG 10 | ACGTCCATGGTGGTGCTGCAATCCATGGATACCATCTCCTTGCCATTCCT 11 | AGGGACATCG 12 | >chrB 13 | TATGGGGTAGCATCATTAAGTGGGGAGGTAGACCAGGAGTTCGGTTCCCG 14 | GAGTTTCGTTAGTTCAGGTAGCGTGACCTCGTCTTAGTATGCAGTCGTGA 15 | AATAATAGACATTTCTGCCTGTCAGGTTGCACTAATCACACCCAGGCTGT 16 | TAACGAGGCGGCTCGTAGTATAAACGCTTTGGACTAGACTCGATACCTAG 17 | CGGCGCGCATTGATAAATGGTGCATCTATAGTAAAAGGCGTCCCAACCTG 18 | GGGACTTAGCGTTGAATACCCGTGCAAGGATCTCTAATCGGTTCTCAATG 19 | GCTGCCTGCTCTTTCTCTAAACGAGACTCTAATATCATGTGTGGTCCTTG 20 | CTTTCGGTCGAGAAAAAGCCTTTGATCGCATCCCAAACCGACATCTAAAA 21 | GCTTCATGTATGTCGGCAGCGAAAAAACGACCAAATAGAAGTCCCCTAAC 22 | GGAGAATAGGCCGCCCACGACAGAACACCGCTTCGTCCT 23 | >chrC 24 | CCCGCTAGCCGTGCCTGATCCTCAACCAAGCTGGGTAAAGACAACCGTCT 25 | AATCATTAACTTACGTTGTTACGTCATTTTGCGCTTAAAATTGTCGCACC 26 | GGAATCCGTCGAGACTTCCCGAGACATGTCCCCTTAATAAATGTACGGTG 27 | TGACCTAACGATCGGATCACCGTCCGTGCTAAAACACACAACCGCTGCGT 28 | GACACCGACCGAACGTTACCGAAGGCTGTCCGCCTAACATCTATATTTGG 29 | CGGTAAAGAGGCGGTTCCGGCGGACTATAAAGTCACAGGCCACTGTTTCT 30 | TTGCAAGATATGGCTCTCTGTCAGGACCGCCCCCTAGGGTCAGCTCAAAT 31 | AAGCTTGTCCCGGACTCCGTACTTCCAACAGAAAGGTGACCGCTACATTC 32 | TGCTATTGACCCCTCACACAACGTTCCCCCGCATGGCGTACGTGTTACCA 33 | GGGCGGTTGCGGCCTTACGTCGCCATAAGCACGTATATAAGTCACCCACT 34 | >chrD 35 | TGCCGTGACCACCCCGCGAGAATCTCATAATGATATCTCCAATCGAGT 36 | -------------------------------------------------------------------------------- /test/data/seqs.fasta: -------------------------------------------------------------------------------- 1 | >chrA my random comment 2 | TCCTCCCCGTTCGCTGGACCCTTGGTCTATCTAGTCAAGAATTAACTCCC 3 | ATTTAGTTGGCTGTTCGTCGATTACCCCTACTGGACCGTCGCAACGGCTC 4 | ACGTGGAGGTCTTAGACCAAGAAAGCTACTGTATGCGGGGATATCACATC 5 | AGATTGCCAGGCGAGCAGCTCTAGCGTGACACGCCTAGACTCATTCGTTG 6 | 
TTCCTTGTCAATCCCAGGGGTCTCCACAGGGAGTGGATCGAGCTAATCAC 7 | CGTTTCGAGTCCGTCAGGCGGAGAGTAGCAGTAAGTACAAACTTCTGCTA 8 | GTCGCTCTGCCACAACGTAGCCACCTAAGATTAACCCTGGAATTGTCCGG 9 | GCGGCATGATCCATCGAGGAGTTAGCGGGGACAGGGAGTTACCAGTCGAG 10 | ACGTCCATGGTGGTGCTGCAATCCATGGATACCATCTCCTTGCCATTCCT 11 | AGGGACATCG 12 | >chrB 13 | TATGGGGTAGCATCATTAAGTGGGGAGGTAGACCAGGAGTTCGGTTCCCG 14 | GAGTTTCGTTAGTTCAGGTAGCGTGACCTCGTCTTAGTATGCAGTCGTGA 15 | AATAATAGACATTTCTGCCTGTCAGGTTGCACTAATCACACCCAGGCTGT 16 | TAACGAGGCGGCTCGTAGTATAAACGCTTTGGACTAGACTCGATACCTAG 17 | CGGCGCGCATTGATAAATGGTGCATCTATAGTAAAAGGCGTCCCAACCTG 18 | GGGACTTAGCGTTGAATACCCGTGCAAGGATCTCTAATCGGTTCTCAATG 19 | GCTGCCTGCTCTTTCTCTAAACGAGACTCTAATATCATGTGTGGTCCTTG 20 | CTTTCGGTCGAGAAAAAGCCTTTGATCGCATCCCAAACCGACATCTAAAA 21 | GCTTCATGTATGTCGGCAGCGAAAAAACGACCAAATAGAAGTCCCCTAAC 22 | GGAGAATAGGCCGCCCACGACAGAACACCGCTTCGTCCT 23 | >chrC 24 | CCCGCTAGCCGTGCCTGATCCTCAACCAAGCTGGGTAAAGACAACCGTCT 25 | AATCATTAACTTACGTTGTTACGTCATTTTGCGCTTAAAATTGTCGCACC 26 | GGAATCCGTCGAGACTTCCCGAGACATGTCCCCTTAATAAATGTACGGTG 27 | TGACCTAACGATCGGATCACCGTCCGTGCTAAAACACACAACCGCTGCGT 28 | GACACCGACCGAACGTTACCGAAGGCTGTCCGCCTAACATCTATATTTGG 29 | CGGTAAAGAGGCGGTTCCGGCGGACTATAAAGTCACAGGCCACTGTTTCT 30 | TTGCAAGATATGGCTCTCTGTCAGGACCGCCCCCTAGGGTCAGCTCAAAT 31 | AAGCTTGTCCCGGACTCCGTACTTCCAACAGAAAGGTGACCGCTACATTC 32 | TGCTATTGACCCCTCACACAACGTTCCCCCGCATGGCGTACGTGTTACCA 33 | GGGCGGTTGCGGCCTTACGTCGCCATAAGCACGTATATAAGTCACCCACT 34 | >chrD hello 35 | TGCCGTGACCACCCCGCGAGAATCTCATAATGATATCTCCAATCGAGTA 36 | -------------------------------------------------------------------------------- /test/apps/minimap2/sw_simple.codon: -------------------------------------------------------------------------------- 1 | # Smith-Waterman alignment from minimap2 implemented using Seq's inter-sequence alignment 2 | # https://github.com/lh3/minimap2 3 | # https://github.com/lh3/ksw2 4 | # Usage: seqc sw.seq 5 | # and are text files of the same length with one sequence per line. 
6 | from time import TimeInterval 7 | from sys import argv 8 | from bio import seqs, inter_align 9 | from math import sqrt 10 | from statistics import mean, stdev 11 | queries = argv[1] 12 | targets = argv[2] 13 | 14 | total, num = 0, 0 15 | score = True # must be global 16 | 17 | @inter_align 18 | def process_inter(t): 19 | global total, num 20 | query, target = t 21 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, score_only=score).score 22 | total += inter_score 23 | num += 1 24 | 25 | def process_intra(t): 26 | global total, num 27 | query, target = t 28 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, score_only=score).score 29 | total += inter_score 30 | num += 1 31 | 32 | def run(queries, targets, msg, f): 33 | global num, total, score 34 | for s in [False, True]: 35 | times = [] 36 | for i in range(3): 37 | num, total = 0, 0 38 | t = TimeInterval() 39 | score = s 40 | zip(seqs(queries), seqs(targets)) |> f 41 | times.append(t.elapsed()) 42 | # print '-', i, num, total, times[-1] 43 | m = mean(times) 44 | print f'[sw-time] seq-{msg} {int(score)} {m} {sqrt(sum((i - m)**2 for i in times) / len(times))}' 45 | # print stdev(times) # broken 46 | 47 | run(queries, targets, 'intra', process_intra) 48 | run(queries, targets, 'inter', process_inter) 49 | -------------------------------------------------------------------------------- /test/bench/fasta.codon: -------------------------------------------------------------------------------- 1 | import sys, bisect, time 2 | 3 | alu = 'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA' 4 | iub = list(zip('acgtBDHKMNRSVWY', [0.27, 0.12, 0.12, 0.27] + [0.02]*11)) 5 | 6 | homosapiens = [ 7 | ('a', 0.3029549426680), 8 | ('c', 0.1979883004921), 9 | ('g', 0.1975473066391), 10 | ('t', 0.3015094502008) 11 | ] 12 | 13 | def random_generator(ia, ic, im): 14 | seed = 42 15 | imf = float(im) 16 | while 1: 17 | seed = (seed * ia + ic) % im 18 | yield seed / imf 19 | 20 | def make_cumulative(table): 21 | P = list[float]() 22 | C = list[str]() 23 | prob = 0. 
24 | for char, p in table: 25 | prob += p 26 | P += [prob] 27 | C += [char] 28 | return (P, C) 29 | 30 | def repeat_fasta(src, n): 31 | width = 60 32 | r = len(src) 33 | s = src + src + src[:n % r] 34 | for i in range(0, n // width): 35 | j = i * width % r 36 | print s[j:j + width] 37 | if n % width: print s[- (n % width):] 38 | 39 | def random_fasta(table, n, rand): 40 | width = 60 41 | probs, chars = make_cumulative(table) 42 | s = str.cat(chars[bisect.bisect(probs, next(rand))] for i in range(n)) 43 | for i in range(0, n, width): 44 | print s[i:i + width] 45 | 46 | n = int(sys.argv[1]) # 1000, 25000000 47 | 48 | rand = random_generator(3877, 29573, 139968) 49 | 50 | print '>ONE Homo sapiens alu' 51 | repeat_fasta(alu, n*2) 52 | 53 | print '>TWO IUB ambiguity codes' 54 | random_fasta(iub, n*3, rand) 55 | 56 | print '>THREE Homo sapiens frequency' 57 | random_fasta(homosapiens, n*5, rand) 58 | -------------------------------------------------------------------------------- /test/data/toy.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=1000GenomesPilot-NCBI36 5 | ##contig= 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##FILTER= 16 | ##FILTER= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 20 | ##FORMAT= 21 | ##ALT= 22 | ##ALT= 23 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 24 | 20 14370 rs6054257 G A 29.2 q10;s50 NS=3;DP=14,22;AF=0.5;AA=String,Multi,Val;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 25 | 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 26 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=FA,DADA;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. 27 | 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. 28 | 20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 29 | 20 1235237 . T . . . . GT 0/0 0|0 ./. 
30 | -------------------------------------------------------------------------------- /test/bench/rc.codon: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from bio import * 4 | 5 | def revcomp1(c): 6 | return ('A' if c == 'T' else \ 7 | ('C' if c == 'G' else \ 8 | ('G' if c == 'C' else \ 9 | ('T' if c == 'A' else c)))) 10 | 11 | def rc_copy(s): 12 | rc = str.cat(revcomp1(s[len(s) - i - 1]) for i in range(len(s))) 13 | # print rc 14 | return len(rc) 15 | 16 | def naive(): 17 | f = FASTA(sys.argv[1], fai=False) 18 | total = 0 19 | for l in f: 20 | total += rc_copy(str(l.seq)) 21 | print total 22 | 23 | total = 0 24 | def rc(s): 25 | s = ~s 26 | # print s 27 | global total 28 | total += len(s) 29 | 30 | def idiomatic(): 31 | FASTA(sys.argv[1], fai=False) |> seqs |> rc 32 | print total 33 | 34 | n = 0 35 | def update(kmer): 36 | global n 37 | x = type(kmer)() |> base(len(kmer) // 2, k'T') 38 | if kmer > x: 39 | n += 1 40 | 41 | def test_fast(K: Static[int]): 42 | global n 43 | n = 0 44 | with time.timing(f'{K}-mer (fast)'): 45 | FASTA(sys.argv[1]) |> seqs |> kmers(step=1, k=K) |> revcomp |> update 46 | print n 47 | 48 | 49 | def test_slow(K: Static[int]): 50 | global n 51 | n = 0 52 | with time.timing(f'{K}-mer (slow)'): 53 | for a in seqs(FASTA(sys.argv[1])): 54 | for b in kmers(a, step=1, k=K): 55 | c = revcomp(b) 56 | update(c) 57 | print n 58 | 59 | def test_super_slow(K: Static[int]): 60 | global n 61 | n = 0 62 | with time.timing(f'{K}-mer (super slow)'): 63 | for a in seqs(FASTA(sys.argv[1])): 64 | for b in split(a, K, step=1): 65 | if not b.N(): 66 | c = Kmer[K](b) 67 | d = revcomp(c) 68 | update(d) 69 | print n 70 | 71 | with time.timing("naive"): 72 | naive() 73 | with time.timing("idiomatic"): 74 | idiomatic() 75 | with time.timing("pipeline"): 76 | test_fast(4) 77 | test_slow(4) 78 | test_super_slow(4) 79 | 80 | test_fast(8) 81 | test_slow(8) 82 | test_super_slow(8) 83 | 84 | test_fast(16) 85 | test_slow(16) 86 | test_super_slow(16) 87 | 88 | test_fast(32) 89 | test_slow(32) 90 | test_super_slow(32) 91 | 92 | test_fast(64) 93 | test_slow(64) 94 | test_super_slow(64) 95 | 96 | test_fast(128) 97 | test_slow(128) 98 | test_super_slow(128) 99 | -------------------------------------------------------------------------------- /test/apps/minimap2/sw.codon: -------------------------------------------------------------------------------- 1 | # Smith-Waterman alignment from minimap2 implemented using Seq's inter-sequence alignment 2 | # https://github.com/lh3/minimap2 3 | # https://github.com/lh3/ksw2 4 | # Usage: seqc sw.seq 5 | # and are text files of the same length with one sequence per line. 
6 | from time import TimeInterval 7 | from sys import argv 8 | from bio import seqs, inter_align 9 | from math import sqrt 10 | from statistics import mean, stdev 11 | queries = argv[1] 12 | targets = argv[2] 13 | 14 | total, num = 0, 0 15 | score = True # must be global 16 | 17 | @inter_align 18 | def process_inter(t): 19 | global total, num 20 | query, target = t 21 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, zdrop=400, bandwidth=751, ext_only=False, score_only=score).score 22 | total += inter_score 23 | num += 1 24 | 25 | def process_intra(t): 26 | global total, num 27 | query, target = t 28 | inter_score = query.align(target, a=1, b=2, ambig=0, gapo=4, gape=2, zdrop=400, bandwidth=751, ext_only=False, score_only=score).score 29 | total += inter_score 30 | num += 1 31 | 32 | def run(queries, targets, msg, f, filter=False): 33 | global num, total, score 34 | def filter_len(t): 35 | query, target = t 36 | if len(query) <= 512 and len(target) <= 512: 37 | yield query, target 38 | 39 | for s in [False, True]: 40 | times = [] 41 | for i in range(3): 42 | num, total = 0, 0 43 | t = TimeInterval() 44 | score = s 45 | if filter: 46 | zip(seqs(queries), seqs(targets)) |> filter_len |> f 47 | else: 48 | zip(seqs(queries), seqs(targets)) |> f 49 | times.append(t.elapsed()) 50 | # print '-', i, num, total, times[-1] 51 | m = mean(times) 52 | print f'{msg}: seq-{msg} {int(score)} {m} {sqrt(sum((i - m)**2 for i in times) / len(times))}' 53 | # print stdev(times) # broken 54 | 55 | from C import seq_get_interaln_simd() -> str 56 | from C import seq_set_sw_maxsimd(int) 57 | 58 | run(queries, targets, 'intra', process_intra) 59 | run(queries, targets, 'intra-512', process_intra, True) 60 | 61 | for simd in [0x20, 0x80, 0x100]: 62 | seq_set_sw_maxsimd(simd) 63 | print seq_get_interaln_simd() 64 | run(queries, targets, 'inter', process_inter) 65 | run(queries, targets, 'inter-512', process_inter, True) 66 | 67 | -------------------------------------------------------------------------------- /test/bench/fastx.codon: -------------------------------------------------------------------------------- 1 | ############################ 2 | # Format parsing benchmark # 3 | ############################ 4 | from sys import argv 5 | from time import timing 6 | from bio import * 7 | n, m = 0, 0 8 | 9 | def test_fasta_options(path): 10 | def process(rec): 11 | global n 12 | n += len(rec.name) + len(rec.seq) 13 | global n, m 14 | m = 0 15 | opts4 = [(a,b,c,d) for a in (True, False) 16 | for b in (True, False) 17 | for c in (True, False) 18 | for d in (True, False)] 19 | for validate, gzip, copy, fai in opts4: 20 | n = 0 21 | with timing(f'validate={validate} gzip={gzip} copy={copy} fai={fai}'): 22 | FASTA(path, validate=validate, gzip=gzip, copy=copy, fai=fai) |> iter |> process 23 | if m == 0: 24 | m = n 25 | else: 26 | assert m == n 27 | 28 | def test_fastq_options(path): 29 | def process(rec): 30 | global n 31 | n += len(rec.name) + len(rec.seq) + len(rec.qual) 32 | global n, m 33 | m = 0 34 | opts2 = [(a,b) for a in (True, False) 35 | for b in (True, False)] 36 | for validate, gzip in opts2: 37 | n = 0 38 | with timing(f'validate={validate} gzip={gzip}'): 39 | FASTQ(path, validate=validate, gzip=gzip, copy=True) |> iter |> process 40 | if m == 0: 41 | m = n 42 | else: 43 | assert m == n 44 | 45 | def test_seqs_options(path): 46 | def process(rec): 47 | global n 48 | n += len(rec) 49 | global n, m 50 | m = 0 51 | opts3 = [(a,b,c) for a in (True, False) 52 | for b in (True, False) 53 | for c 
in (True, False)] 54 | for validate, gzip, copy in opts3: 55 | n = 0 56 | with timing(f'validate={validate} gzip={gzip} copy={copy}'): 57 | Seqs(path, validate=validate, gzip=gzip, copy=copy) |> iter |> process 58 | if m == 0: 59 | m = n 60 | print m 61 | else: 62 | print m, n 63 | assert m == n 64 | 65 | for path in argv[1:]: 66 | if path.endswith('.fa') or path.endswith('.fasta'): 67 | print 'Testing as FASTA' 68 | test_fasta_options(path) 69 | elif path.endswith('.fq') or path.endswith('.fastq'): 70 | print 'Testing as FASTQ' 71 | test_fastq_options(path) 72 | else: 73 | print 'Testing as TXT' 74 | test_seqs_options(path) 75 | -------------------------------------------------------------------------------- /stdlib/bio/iter.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import seq 2 | from copy import copy 3 | 4 | # Sequence reader in text, line-by-line format. 5 | @tuple 6 | class SeqReader: 7 | ''' 8 | Parser for a plain txt-based sequence format, with one sequence per line. 9 | 10 | ''' 11 | _file: cobj 12 | validate: bool 13 | gzip: bool 14 | copy: bool 15 | 16 | def __new__(path: str, validate: bool, gzip: bool, copy: bool) -> SeqReader: 17 | return SeqReader(gzopen(path, "r").__raw__() if gzip else open(path, "r").__raw__(), validate, gzip, copy) 18 | 19 | @property 20 | def file(self): 21 | assert not self.gzip 22 | p = __array__[cobj](1) 23 | p.ptr[0] = self._file 24 | return Ptr[File](p.ptr.as_byte())[0] 25 | 26 | @property 27 | def gzfile(self): 28 | assert self.gzip 29 | p = __array__[cobj](1) 30 | p.ptr[0] = self._file 31 | return Ptr[gzFile](p.ptr.as_byte())[0] 32 | 33 | def _preprocess(self, a: str): 34 | from bio.builtin import _validate_str_as_seq 35 | if self.validate: 36 | return _validate_str_as_seq(a, self.copy) 37 | else: 38 | return copy(seq(a.ptr, a.len)) if self.copy else seq(a.ptr, a.len) 39 | 40 | def __seqs__(self): 41 | return self.__iter__() 42 | 43 | def __iter__(self): 44 | if self.gzip: 45 | for a in self.gzfile._iter_trim_newline(): 46 | s = self._preprocess(a) 47 | assert s.len >= 0 48 | yield s 49 | else: 50 | for a in self.file._iter_trim_newline(): 51 | s = self._preprocess(a) 52 | assert s.len >= 0 53 | yield s 54 | self.close() 55 | 56 | def __blocks__(self, size: int): 57 | from bio.block import _blocks 58 | if not self.copy: 59 | raise ValueError("cannot read sequences in blocks with copy=False") 60 | return _blocks(self.__iter__(), size) 61 | 62 | def close(self): 63 | if self.gzip: 64 | self.gzfile.close() 65 | else: 66 | self.file.close() 67 | 68 | def __enter__(self): 69 | pass 70 | 71 | def __exit__(self): 72 | self.close() 73 | 74 | def Seqs(path: str, validate: bool = True, gzip: bool = True, copy: bool = True): 75 | return SeqReader(path=path, validate=validate, gzip=gzip, copy=copy) 76 | 77 | @extend 78 | class str: 79 | def __seqs__(self): 80 | return iter(Seqs(self)) 81 | 82 | def __blocks__(self, size: int): 83 | from bio.block import _blocks 84 | return _blocks(self.__seqs__(), size) 85 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Seq CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - develop 8 | tags: 9 | - '*' 10 | pull_request: 11 | branches: 12 | - develop 13 | 14 | jobs: 15 | create_release: 16 | name: GitHub Release 17 | runs-on: ubuntu-latest 18 | outputs: 19 | upload_url: ${{ 
steps.create_release.outputs.upload_url }} 20 | permissions: 21 | contents: write 22 | steps: 23 | - name: Create Release 24 | if: contains(github.ref, 'tags/v') 25 | id: create_release 26 | uses: ncipollo/release-action@v1 27 | 28 | build: 29 | strategy: 30 | matrix: 31 | include: 32 | - os: ubuntu-latest 33 | arch: linux-x86_64 34 | - os: ubuntu-latest 35 | arch: manylinux2014-x86_64 36 | - os: ubuntu-24.04-arm 37 | arch: linux-aarch64 38 | - os: ubuntu-24.04-arm 39 | arch: manylinux2014-aarch64 40 | - os: macos-13 41 | arch: darwin-x86_64 42 | - os: macos-14 43 | arch: darwin-arm64 44 | runs-on: ${{ matrix.os }} 45 | name: Build Seq 46 | needs: create_release 47 | permissions: 48 | contents: write 49 | steps: 50 | - uses: actions/checkout@v4 51 | 52 | - name: Build (Ubuntu) 53 | if: startsWith(matrix.os, 'ubuntu') 54 | run: | 55 | (cd .github/build-linux && docker build -t local -f Dockerfile.${{ matrix.arch }} .) 56 | docker run -v $(pwd):/github/workspace local /github/workspace ${{ matrix.arch }} yes 0.19.2 57 | 58 | - name: Build (macOS) 59 | if: startsWith(matrix.os, 'macos') 60 | run: | 61 | sudo mkdir -p /opt/llvm-codon 62 | sudo chown -R $(whoami) /opt/llvm-codon 63 | curl -L https://github.com/exaloop/llvm-project/releases/download/codon-20.1.7/llvm-codon-20.1.7-${{ matrix.arch }}.tar.bz2 | tar jxf - -C /opt 64 | bash .github/build-linux/entrypoint.sh ${{ github.workspace }} ${{ matrix.arch }} yes 0.19.2 65 | 66 | - name: Upload Artifacts 67 | uses: actions/upload-artifact@v4 68 | with: 69 | name: seq-${{ matrix.arch }}.tar.gz 70 | path: seq-${{ matrix.arch }}.tar.gz 71 | 72 | - name: Upload Release Asset 73 | if: contains(github.ref, 'tags/v') 74 | uses: actions/upload-release-asset@v1.0.2 75 | env: 76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 77 | with: 78 | upload_url: ${{ needs.create_release.outputs.upload_url }} 79 | asset_path: ./seq-${{ matrix.arch }}.tar.gz 80 | asset_name: seq-${{ matrix.arch }}.tar.gz 81 | asset_content_type: application/gzip 82 | -------------------------------------------------------------------------------- /stdlib/bio/locus.codon: -------------------------------------------------------------------------------- 1 | @tuple 2 | class Contig: 3 | ''' 4 | Representation of a contig, consisting of an ID, name and length. 5 | ''' 6 | 7 | _tid: u32 8 | _name: str 9 | _len: int 10 | 11 | def __new__(tid: int, name: str, len: int) -> Contig: 12 | return Contig(u32(tid), name, len) 13 | 14 | @property 15 | def tid(self): 16 | return int(self._tid) 17 | 18 | @property 19 | def name(self): 20 | return self._name 21 | 22 | @property 23 | def len(self): 24 | return self._len 25 | 26 | def __str__(self): 27 | return self.name 28 | 29 | def __len__(self): 30 | return self.len 31 | 32 | def __eq__(self, other: Contig): 33 | return self.tid == other.tid 34 | 35 | def __ne__(self, other: Contig): 36 | return self.tid != other.tid 37 | 38 | def __lt__(self, other: Contig): 39 | return self.tid < other.tid 40 | 41 | def __gt__(self, other: Contig): 42 | return self.tid > other.tid 43 | 44 | def __le__(self, other: Contig): 45 | return self.tid <= other.tid 46 | 47 | def __ge__(self, other: Contig): 48 | return self.tid >= other.tid 49 | 50 | def __hash__(self): 51 | return self.tid 52 | 53 | @tuple 54 | class Locus: 55 | ''' 56 | Representation of a locus, consisting of a contig ID and 0-based position. 
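    The position is stored in 32 bits with the strand folded into the sign:
    a stored negative value marks a locus on the reverse strand (see `reversed`
    and `__invert__`).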
57 | ''' 58 | 59 | _tid: u32 60 | _pos: u32 61 | 62 | def __new__(tid: int, pos: int) -> Locus: 63 | ''' 64 | Constructs a `Locus` with specified contig ID and 0-based position. 65 | Negative positions indicate loci on the reverse strand. 66 | ''' 67 | return Locus(u32(tid), u32(pos)) 68 | 69 | def __lt__(self, other: Locus): 70 | return (self.tid, self.pos) < (other.tid, other.pos) 71 | 72 | def __gt__(self, other: Locus): 73 | return (self.tid, self.pos) > (other.tid, other.pos) 74 | 75 | def __le__(self, other: Locus): 76 | return (self.tid, self.pos) <= (other.tid, other.pos) 77 | 78 | def __ge__(self, other: Locus): 79 | return (self.tid, self.pos) >= (other.tid, other.pos) 80 | 81 | @property 82 | def tid(self): 83 | return int(self._tid) 84 | 85 | @property 86 | def pos(self): 87 | return abs(int(i32(int(self._pos)))) 88 | 89 | @property 90 | def reversed(self): 91 | ''' 92 | Whether this locus is on the reverse strand 93 | ''' 94 | return i32(int(self._pos)) < i32(0) 95 | 96 | def __invert__(self): 97 | ''' 98 | Returns the corresponding `Locus` on the reverse strand 99 | ''' 100 | return Locus(self.tid, self.pos if self.reversed else -self.pos) 101 | 102 | def __repr__(self): 103 | return f"Locus(tid={self.tid}, pos={self.pos}, reversed={self.reversed})" 104 | -------------------------------------------------------------------------------- /test/apps/mrsfast/exact.codon: -------------------------------------------------------------------------------- 1 | # Implementation of exact mapping using FM-index 2 | # Usage: 3 | # Index: seqc exact.seq index reference.fa 4 | # Search: seqc exact.seq search reference.fa reads.fq output.sam 5 | 6 | from sys import argv, stderr, exit 7 | from bio.fmindex import FMIndex 8 | from bio import FASTARecord, FASTQRecord, FASTQ, FASTA 9 | 10 | class GenomeIndex: 11 | ref: List[FASTARecord] 12 | fmi: FMIndex 13 | 14 | def open_index_file(basename, mode): 15 | import gzip 16 | return gzip.open(f'{basename}.exact.idx', mode) 17 | 18 | def index_load(basename): 19 | from pickle import load 20 | with open_index_file(basename, 'rb') as jar: 21 | return load(jar, GenomeIndex) 22 | 23 | def main_index(basename): 24 | from pickle import dump 25 | ref = [rec for rec in FASTA(basename)] 26 | print('making FM-index...', file=stderr) 27 | fmi = FMIndex(basename) 28 | index = GenomeIndex(ref, fmi) 29 | print('writing to disk...', file=stderr) 30 | with open_index_file(basename, 'wb0') as jar: 31 | dump(index, jar) 32 | 33 | num_aligned = 0 34 | 35 | #@prefetch 36 | def align(read: FASTQRecord, genome_index: GenomeIndex, out: File): 37 | global num_aligned 38 | for rc in (False, True): 39 | read_seq = read.seq 40 | read_len = len(read_seq) 41 | if rc: 42 | read_seq = ~read_seq 43 | 44 | s = read_seq 45 | intv = genome_index.fmi.interval(s[-1]) 46 | s = s[:-1] 47 | while s and intv: 48 | intv = genome_index.fmi[intv, s[-1]] 49 | s = s[:-1] 50 | 51 | for rid, name, ref_pos in genome_index.fmi.results(intv): 52 | ref_len = len(genome_index.ref[rid].seq) 53 | if not (0 <= ref_pos <= ref_len - read_len): 54 | continue 55 | if genome_index.ref[rid].seq[ref_pos:ref_pos + read_len].N(): 56 | continue 57 | print( 58 | read.name, 16 if rc else 0, name, ref_pos + 1, 255, f'{read_len}M', 59 | '*', 0, 0, read_seq, read.qual if not rc else read.qual[::-1], 'NM:i:0', 60 | sep='\t', file=out 61 | ) 62 | 63 | num_aligned += 1 64 | if num_aligned % 10000000 == 0: 65 | print 'aligned 10M reads' 66 | 67 | def main_search(ref_path, fastq_path, out_path): 68 | from time import timing 69 
| print('loading index...', file=stderr) 70 | genome_index = index_load(ref_path) 71 | print('running alignment pipeline...', file=stderr) 72 | with open(out_path, 'w') as out, timing('alignment pipeline'): 73 | FASTQ(fastq_path) |> iter |> align(genome_index, out) 74 | 75 | match argv[1:]: 76 | case ['index', basename]: 77 | main_index(basename) 78 | case ['search', ref_path, fastq_path, out_path]: 79 | main_search(ref_path, fastq_path, out_path) 80 | case _: 81 | print("error: unknown mode: valid modes are 'index' and 'search'", file=stderr) 82 | exit(1) 83 | -------------------------------------------------------------------------------- /test/apps/snap/genomeindex.seq: -------------------------------------------------------------------------------- 1 | # Implementation of SNAP aligner's genome index 2 | # https://github.com/amplab/snap/blob/master/SNAPLib/GenomeIndex.{cpp,h} 3 | 4 | # Need the following hooks linked to convert C++ GenomeIndex to Seq object: 5 | # snap_index_from_dir(Ptr[byte]) -> Ptr[byte] -- read object from specified directory 6 | # snap_index_ht_count(Ptr[byte]) -> int -- extract hash table count 7 | # snap_index_ht_get(Ptr[byte], int) -> Ptr[byte] -- extract specified (0-indexed) hash table 8 | # snap_index_overflow_ptr(Ptr[byte]) -> Ptr[u32] -- extract overflow table pointer 9 | # snap_index_overflow_len(Ptr[byte]) -> int -- extract overflow table length 10 | # snap_index_count_of_bases(Ptr[byte]) -> int -- extract count of genome bases 11 | 12 | from hashtable import SNAPHashTable 13 | from bio import Kmer 14 | 15 | class GenomeIndex[K]: 16 | hash_tables: Array[SNAPHashTable[Kmer[16],u32]] 17 | overflow_table: Array[u32] 18 | count_of_bases: int 19 | 20 | def _partition(k: K): 21 | n = int(k.as_int()) 22 | return (Kmer[16](n & ((1 << 32) - 1)), n >> 32) 23 | 24 | def __init__(self, dir: str): 25 | assert Kmer[16].len() <= K.len() <= Kmer[32].len() 26 | from C import snap_index_from_dir(Ptr[byte]) -> Ptr[byte] 27 | from C import snap_index_ht_count(Ptr[byte]) -> int 28 | from C import snap_index_ht_get(Ptr[byte], int) -> Ptr[byte] 29 | from C import snap_index_overflow_ptr(Ptr[byte]) -> Ptr[u32] 30 | from C import snap_index_overflow_len(Ptr[byte]) -> int 31 | from C import snap_index_count_of_bases(Ptr[byte]) -> int 32 | 33 | p = snap_index_from_dir(dir.c_str()) 34 | assert p 35 | hash_tables = Array[SNAPHashTable[Kmer[16],u32]](snap_index_ht_count(p)) 36 | for i in range(len(hash_tables)): 37 | hash_tables[i] = SNAPHashTable[Kmer[16],u32](snap_index_ht_get(p, i)) 38 | 39 | self.hash_tables = hash_tables 40 | self.overflow_table = Array[u32](snap_index_overflow_ptr(p), snap_index_overflow_len(p)) 41 | self.count_of_bases = snap_index_count_of_bases(p) 42 | 43 | def __getitem__(self, seed: K): 44 | kmer, which = GenomeIndex[K]._partition(seed) 45 | table = self.hash_tables[which] 46 | value_ptr = table.get_value_ptr_for_key(kmer) 47 | 48 | if not value_ptr or value_ptr[0] == table.invalid_val: 49 | return Array[u32](value_ptr, 0) 50 | 51 | value = value_ptr[0] 52 | 53 | if int(value) < self.count_of_bases: 54 | return Array[u32](value_ptr, 1) 55 | else: 56 | overflow_table_offset = int(value) - self.count_of_bases 57 | hit_count = int(self.overflow_table[overflow_table_offset]) 58 | return Array[u32](self.overflow_table.ptr + overflow_table_offset + 1, hit_count) 59 | 60 | def __prefetch__(self, seed: K): 61 | kmer, which = GenomeIndex[K]._partition(seed) 62 | table = self.hash_tables[which] 63 | table.__prefetch__(kmer) 64 | 65 | 
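# Usage sketch (illustration only, not part of the SNAP port above). It assumes an
# index directory produced by the C++ SNAP indexer, with the C hooks listed at the
# top of this file linked in; 'index_dir' and 'read' are supplied by the caller.
from bio.seq import seq

def _example_lookup(index_dir: str, read: seq):
    gi = GenomeIndex[Kmer[20]](index_dir)   # Kmer[16]..Kmer[32] accepted, per the assert in __init__
    for seed in read.kmers(step=1, k=20):   # 20-mer seeds drawn from the read
        gi.__prefetch__(seed)               # warm the relevant hash-table bucket
        for pos in gi[seed]:                # Array[u32] of hit positions for this seed
            print(int(pos))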
-------------------------------------------------------------------------------- /test/core/match.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | def f(k): 4 | match k: 5 | case '': 6 | yield 1 7 | case _: 8 | yield 0 9 | 10 | match k: 11 | case 'GCGT': 12 | yield 1 13 | case _: 14 | yield 0 15 | 16 | match k: 17 | case 'GCGTC': 18 | yield 1 19 | case _: 20 | yield 0 21 | 22 | match k: 23 | case 'GCATC': 24 | yield 1 25 | case _: 26 | yield 0 27 | 28 | match k: 29 | case 'G_GT_': 30 | yield 1 31 | case _: 32 | yield 0 33 | 34 | match k: 35 | case '_TG__': 36 | yield 1 37 | case _: 38 | yield 0 39 | 40 | match k: 41 | case 'GC*ATC': 42 | yield 1 43 | case _: 44 | yield 0 45 | 46 | match k: 47 | case 'GC*TC': 48 | yield 1 49 | case _: 50 | yield 0 51 | 52 | match k: 53 | case 'GC*A': 54 | yield 1 55 | case _: 56 | yield 0 57 | 58 | match k: 59 | case 'GCG*': 60 | yield 1 61 | case _: 62 | yield 0 63 | 64 | match k: 65 | case 'GGG*': 66 | yield 1 67 | case _: 68 | yield 0 69 | 70 | match k: 71 | case '*TC': 72 | yield 1 73 | case _: 74 | yield 0 75 | 76 | match k: 77 | case '*T': 78 | yield 1 79 | case _: 80 | yield 0 81 | 82 | match k: 83 | case '*': 84 | yield 1 85 | case _: 86 | yield 0 87 | 88 | match k: 89 | case '_C*G_C': 90 | yield 1 91 | case _: 92 | yield 0 93 | 94 | match k: 95 | case '*G_C': 96 | yield 1 97 | case _: 98 | yield 0 99 | 100 | match k: 101 | case '_C*': 102 | yield 1 103 | case _: 104 | yield 0 105 | 106 | match k: 107 | case 'A_*G_C': 108 | yield 1 109 | case _: 110 | yield 0 111 | 112 | match k: 113 | case '*C_C': 114 | yield 1 115 | case _: 116 | yield 0 117 | 118 | match k: 119 | case '_T*': 120 | yield 1 121 | case _: 122 | yield 0 123 | 124 | match k: 125 | case '__*__': 126 | yield 1 127 | case _: 128 | yield 0 129 | 130 | @test 131 | def test_seq_match(): 132 | s = s'GCGTC' 133 | t = ~s'GACGC' # == ~s 134 | assert list(f(s)) == [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] 135 | assert list(f(t)) == [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] 136 | assert list(f(Kmer[5](s))) == [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1] 137 | test_seq_match() 138 | -------------------------------------------------------------------------------- /test/core/bltin.codon: -------------------------------------------------------------------------------- 1 | @test 2 | def test_min_max(): 3 | assert max(2, 1, 1, 1, 1) == 2 4 | assert max(1, 2, 1, 1, 1) == 2 5 | assert max(1, 1, 2, 1, 1) == 2 6 | assert max(1, 1, 1, 2, 1) == 2 7 | assert max(1, 1, 1, 1, 2) == 2 8 | assert max(2, 1, 1, 1) == 2 9 | assert max(1, 2, 1, 1) == 2 10 | assert max(1, 1, 2, 1) == 2 11 | assert max(1, 1, 1, 2) == 2 12 | assert max(2, 1, 1) == 2 13 | assert max(1, 2, 1) == 2 14 | assert max(1, 1, 2) == 2 15 | assert max(2, 1) == 2 16 | assert max(1, 2) == 2 17 | 18 | assert min(2, 1, 1, 1, 1) == 1 19 | assert min(1, 2, 1, 1, 1) == 1 20 | assert min(1, 1, 2, 1, 1) == 1 21 | assert min(1, 1, 1, 2, 1) == 1 22 | assert min(1, 1, 1, 1, 2) == 1 23 | assert min(2, 1, 1, 1) == 1 24 | assert min(1, 2, 1, 1) == 1 25 | assert min(1, 1, 2, 1) == 1 26 | assert min(1, 1, 1, 2) == 1 27 | assert min(2, 1, 1) == 1 28 | assert min(1, 2, 1) == 1 29 | assert min(1, 1, 2) == 1 30 | assert min(2, 1) == 1 31 | assert min(1, 2) == 1 32 | 33 | assert max(0, 1, 1, 1, 1) == 1 34 | assert max(1, 0, 1, 1, 1) == 1 35 | assert max(1, 1, 0, 1, 1) == 1 36 | assert max(1, 1, 1, 0, 1) == 1 37 | assert max(1, 1, 1, 1, 0) 
== 1 38 | assert max(0, 1, 1, 1) == 1 39 | assert max(1, 0, 1, 1) == 1 40 | assert max(1, 1, 0, 1) == 1 41 | assert max(1, 1, 1, 0) == 1 42 | assert max(0, 1, 1) == 1 43 | assert max(1, 0, 1) == 1 44 | assert max(1, 1, 0) == 1 45 | assert max(0, 1) == 1 46 | assert max(1, 0) == 1 47 | 48 | assert min(0, 1, 1, 1, 1) == 0 49 | assert min(1, 0, 1, 1, 1) == 0 50 | assert min(1, 1, 0, 1, 1) == 0 51 | assert min(1, 1, 1, 0, 1) == 0 52 | assert min(1, 1, 1, 1, 0) == 0 53 | assert min(0, 1, 1, 1) == 0 54 | assert min(1, 0, 1, 1) == 0 55 | assert min(1, 1, 0, 1) == 0 56 | assert min(1, 1, 1, 0) == 0 57 | assert min(0, 1, 1) == 0 58 | assert min(1, 0, 1) == 0 59 | assert min(1, 1, 0) == 0 60 | assert min(0, 1) == 0 61 | assert min(1, 0) == 0 62 | 63 | assert min(a*a for a in range(3)) == 0 64 | assert max(a*a for a in range(3)) == 4 65 | assert min([0, 2, -1]) == -1 66 | assert max([0, 2, -1]) == 2 67 | 68 | @test 69 | def test_map_filter(): 70 | assert list(map(lambda i: i+1, (i*2 for i in range(5)))) == [1, 3, 5, 7, 9] 71 | assert list(map(lambda i: i+1, (i*2 for i in range(0)))) == [] 72 | assert list(map(lambda i: i//2, map(lambda i: i-1, map(lambda i: i+1, (i*2 for i in range(5)))))) == [0, 1, 2, 3, 4] 73 | def f(x: int) -> int: 74 | return x - 1 75 | def g(x: int) -> int: 76 | return x + 1 77 | assert list(map(f, map(g, (i*2 for i in range(5))))) == [0, 2, 4, 6, 8] 78 | 79 | def h(x: list[int]): 80 | return x 81 | assert h(list(map(lambda i: i-1, map(lambda i: i+1, range(5))))) == [0, 1, 2, 3, 4] 82 | 83 | assert list(filter(lambda i: i % 2 == 0, range(5))) == [0, 2, 4] 84 | assert list(filter(lambda i: i % 2 == 1, filter(lambda i: i % 2 == 0, range(5)))) == [] 85 | 86 | assert list(filter(lambda i: i%2 == 0, map(lambda i: i*i, range(10)))) == [0, 4, 16, 36, 64] 87 | 88 | test_min_max() 89 | test_map_filter() 90 | 91 | -------------------------------------------------------------------------------- /stdlib/bio/c_htslib.codon: -------------------------------------------------------------------------------- 1 | # 2 | from C import hts_open(cobj, cobj) -> cobj 3 | from C import hts_set_threads(cobj, i32) -> i32 4 | from C import hts_close(cobj) 5 | from C import hts_idx_destroy(cobj) 6 | from C import hts_itr_destroy(cobj) 7 | from C import hts_itr_destroy(cobj) 8 | from C import hts_itr_next(cobj, cobj, cobj, cobj) -> i32 9 | from C import sam_index_load(cobj, cobj) -> cobj 10 | from C import sam_hdr_read(cobj) -> cobj 11 | from C import sam_itr_querys(cobj, cobj, cobj) -> cobj 12 | from C import sam_read1(cobj, cobj, cobj) -> i32 13 | from C import bam_read1(cobj, cobj) -> i32 14 | from C import bam_init1() -> cobj 15 | @pure 16 | @C 17 | def bam_cigar2qlen(a: int, b: Ptr[u32]) -> int: pass 18 | @pure 19 | @C 20 | def bam_cigar2rlen(a: int, b: Ptr[u32]) -> int: pass 21 | @pure 22 | @C 23 | def bam_aux_get(a: cobj, b: Ptr[byte]) -> Ptr[u8]: pass 24 | @pure 25 | @C 26 | def bam_aux2i(a: Ptr[u8]) -> int: pass 27 | @pure 28 | @C 29 | def bam_aux2f(a: Ptr[u8]) -> float: pass 30 | @pure 31 | @C 32 | def bam_aux2A(a: Ptr[u8]) -> byte: pass 33 | @pure 34 | @C 35 | def bam_aux2Z(a: Ptr[u8]) -> Ptr[byte]: pass 36 | @pure 37 | @C 38 | def bam_auxB_len(a: Ptr[u8]) -> u32: pass 39 | @pure 40 | @C 41 | def bam_auxB2i(a: Ptr[u8], b: u32) -> int: pass 42 | @pure 43 | @C 44 | def bam_auxB2f(a: Ptr[u8], b: u32) -> float: pass 45 | @pure 46 | @C 47 | def bam_endpos(a: cobj) -> int: pass 48 | from C import sam_hdr_destroy(cobj) 49 | from C import bam_destroy1(cobj) 50 | 51 | from C import bcf_hdr_read(cobj) -> 
cobj 52 | from C import bcf_read(cobj, cobj, cobj) -> i32 53 | from C import bcf_unpack(cobj, i32) -> i32 54 | from C import bcf_hdr_destroy(cobj) 55 | from C import bcf_init() -> cobj 56 | from C import bcf_destroy(cobj) 57 | from C import bcf_get_variant_types(cobj) -> i32 58 | from C import bcf_get_variant_type(cobj, i32) -> i32 59 | from C import bcf_is_snp(cobj) -> i32 60 | from C import bcf_has_filter(cobj, cobj, cobj) -> i32 61 | from C import bcf_get_fmt(cobj, cobj, cobj) -> cobj 62 | from C import bcf_get_info(cobj, cobj, cobj) -> cobj 63 | from C import bcf_get_fmt_id(cobj, i32) -> cobj 64 | from C import bcf_get_info_id(cobj, i32) -> cobj 65 | from C import bcf_get_format_values(cobj, cobj, cobj, cobj, cobj, i32) -> i32 66 | from C import bcf_hdr_id2int(cobj, i32, cobj) -> i32 67 | from C import bcf_has_filter(cobj, cobj, cobj) -> i32 68 | from C import bcf_hrec_format(cobj, cobj) 69 | from C import bcf_clear(cobj) 70 | from C import bcf_empty(cobj) 71 | from C import bcf_dup(cobj) -> cobj 72 | from C import bcf_get_info_values(cobj, cobj, Ptr[byte], Ptr[cobj], Ptr[i32], i32) -> i32 73 | 74 | @pure 75 | @C 76 | def hts_version() -> cobj: pass 77 | 78 | @pure 79 | @C 80 | def seq_get_htsfile_fp(a: cobj) -> cobj: pass 81 | @pure 82 | @C 83 | def seq_is_htsfile_cram(a: cobj) -> bool: pass 84 | @pure 85 | @C 86 | def seq_is_htsfile_bgzf(a: cobj) -> bool: pass 87 | 88 | # Seq HTSlib 89 | def sam_itr_next(file: cobj, itr: cobj, r: cobj) -> int: 90 | is_cram = seq_is_htsfile_cram(file) 91 | is_bgzf = seq_is_htsfile_bgzf(file) 92 | if not is_cram and not is_bgzf: 93 | raise ValueError('not BGZF compressed') 94 | if not itr: 95 | raise ValueError('null iterator') 96 | return int(hts_itr_next( 97 | seq_get_htsfile_fp(file) if is_bgzf else cobj(), 98 | itr, r, file)) 99 | -------------------------------------------------------------------------------- /test/bench/match.codon: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Matching benchmark # 3 | ###################### 4 | from sys import argv 5 | from time import timing 6 | from bio import * 7 | 8 | def match_fast1(k): 9 | match k: 10 | case 'T_T_T_T_T_T_T_T_T_T_T_T_T_T_T_T_..._A_A_A_A_A_A_A_A_A_A_A_A_A_A_A_A': 11 | return True 12 | case _: 13 | return False 14 | 15 | def match_slow1(k): 16 | for i in range(0, 32, 2): 17 | if k[i] != k'T': 18 | return False 19 | for i in range(0, 32, 2): 20 | if k[len(k) - 1 - i] != k'A': 21 | return False 22 | return True 23 | 24 | def test1(use_slow_match, K: Static[int]): 25 | n = 0 26 | with timing(f'{K}-mer ({use_slow_match=})'): 27 | for s in FASTA(argv[1]) |> seqs: 28 | for kmer in s |> kmers(1, K): 29 | b = False 30 | if use_slow_match: 31 | b = match_slow1(kmer) 32 | else: 33 | b = match_fast1(kmer) 34 | n += 1 if b else 0 35 | print n 36 | 37 | def match_fast2(k): 38 | match k: 39 | case 'T_T_..._A_A': 40 | return True 41 | case _: 42 | return False 43 | 44 | def match_slow2(k): 45 | for i in range(0, 4, 2): 46 | if k[i] != k'T': 47 | return False 48 | for i in range(0, 4, 2): 49 | if k[len(k) - 1 - i] != k'A': 50 | return False 51 | return True 52 | 53 | def test2(use_slow_match, K: Static[int]): 54 | n = 0 55 | with timing(f'{K}-mer ({use_slow_match=})'): 56 | for s in FASTA(argv[1]) |> seqs: 57 | for kmer in s |> kmers(1, K): 58 | b = False 59 | if use_slow_match: 60 | b = match_slow2(kmer) 61 | else: 62 | b = match_fast2(kmer) 63 | n += 1 if b else 0 64 | print n 65 | 66 | print 'TEST1:' 67 | test1(False, 64) 68 | test1(True, 
64) 69 | test1(False, 96) 70 | test1(True, 96) 71 | test1(False, 128) 72 | test1(True, 128) 73 | test1(False, 160) 74 | test1(True, 160) 75 | test1(False, 192) 76 | test1(True, 192) 77 | test1(False, 224) 78 | test1(True, 224) 79 | test1(False, 256) 80 | test1(True, 256) 81 | test1(False, 288) 82 | test1(True, 288) 83 | test1(False, 320) 84 | test1(True, 320) 85 | test1(False, 352) 86 | test1(True, 352) 87 | test1(False, 384) 88 | test1(True, 384) 89 | test1(False, 416) 90 | test1(True, 416) 91 | test1(False, 448) 92 | test1(True, 448) 93 | test1(False, 480) 94 | test1(True, 480) 95 | test1(False, 512) 96 | test1(True, 512) 97 | 98 | print 'TEST2:' 99 | test2(False, 64) 100 | test2(True, 64) 101 | test2(False, 96) 102 | test2(True, 96) 103 | test2(False, 128) 104 | test2(True, 128) 105 | test2(False, 160) 106 | test2(True, 160) 107 | test2(False, 192) 108 | test2(True, 192) 109 | test2(False, 224) 110 | test2(True, 224) 111 | test2(False, 256) 112 | test2(True, 256) 113 | test2(False, 288) 114 | test2(True, 288) 115 | test2(False, 320) 116 | test2(True, 320) 117 | test2(False, 352) 118 | test2(True, 352) 119 | test2(False, 384) 120 | test2(True, 384) 121 | test2(False, 416) 122 | test2(True, 416) 123 | test2(False, 448) 124 | test2(True, 448) 125 | test2(False, 480) 126 | test2(True, 480) 127 | test2(False, 512) 128 | test2(True, 512) 129 | -------------------------------------------------------------------------------- /stdlib/bio/fai.codon: -------------------------------------------------------------------------------- 1 | @tuple 2 | class FAIRecord: 3 | _name: str 4 | _length: int 5 | _offset: int 6 | _linebases: int 7 | _linewidth: int 8 | _qualoffset: int 9 | 10 | @property 11 | def name(self): 12 | return self._name 13 | 14 | @property 15 | def length(self): 16 | return self._length 17 | 18 | @property 19 | def offset(self): 20 | return self._offset 21 | 22 | @property 23 | def line_bases(self): 24 | return self._linebases 25 | 26 | @property 27 | def line_width(self): 28 | return self._linewidth 29 | 30 | @property 31 | def qual_offset(self): 32 | return self._qualoffset 33 | 34 | FAI_COL_NAMES = ["name", "length", "offset", "line_bases", "line_width", "qual_offset"] 35 | 36 | class FAIReader: 37 | fastq: bool 38 | validate: bool 39 | copy: bool 40 | _file: gzFile 41 | 42 | def __init__(self, path: str, fastq: bool, validate: bool, copy: bool): 43 | self.validate = validate 44 | self.copy = copy 45 | self._file = gzopen(path, "r") 46 | self.fastq = fastq 47 | 48 | @property 49 | def file(self): 50 | return self._file 51 | 52 | @property 53 | def num_necessary_cols(self): 54 | return 6 if self.fastq else 5 55 | 56 | def __iter__(self): 57 | for lnum, l in enumerate(self.file._iter_trim_newline()): 58 | line = l.__ptrcopy__() if self.copy else l 59 | rec: FAIRecord = self._FAIRecord_from_str(line, lnum + 1) 60 | yield rec 61 | self.close() 62 | 63 | def _FAIRecord_from_str(self, s: str, lnum: int): 64 | col_strs = s.split("\t") 65 | 66 | if self.validate and len(col_strs) < self.num_necessary_cols: 67 | raise ValueError(f"Line {lnum} does not have the required number of columns, {self.num_necessary_cols}") 68 | 69 | name, length, offset = "", 0, 0 70 | line_bases, line_width, qual_offset = 0, 0, 0 71 | 72 | val_ptrs: List[Ptr[byte]] = [__ptr__(name).as_byte(), __ptr__(length).as_byte(), 73 | __ptr__(offset).as_byte(), __ptr__(line_bases).as_byte(), 74 | __ptr__(line_width).as_byte(), __ptr__(qual_offset).as_byte()] 75 | 76 | for i in range(self.num_necessary_cols): 77 | col_name = 
FAI_COL_NAMES[i] 78 | val_ptr = val_ptrs[i] 79 | val_str = col_strs[i] 80 | if col_name == "name": 81 | val_str_ptr = Ptr[str](val_ptr) 82 | val_str_ptr[0] = val_str 83 | else: 84 | val_num_ptr = Ptr[int](val_ptr) 85 | val_num_ptr[0] = self._get_int_from_fai(val_str, col_name, lnum) 86 | 87 | return FAIRecord(name, length, offset, line_bases, line_width, qual_offset) 88 | 89 | def _get_int_from_fai(self, val_str: str, col_name: str, lnum: int): 90 | if self.validate: 91 | try: 92 | return int(val_str) 93 | except: 94 | raise ValueError(f"{col_name}, must be integer, line: {lnum}") 95 | return int(val_str) 96 | 97 | def close(self): 98 | self.file.close() 99 | 100 | def __enter__(self): 101 | pass 102 | 103 | def __exit__(self): 104 | self.close() 105 | 106 | def FAI(path: str, fastq: bool = False, validate: bool = True, copy: bool = True) -> FAIReader: 107 | return FAIReader(path=path, fastq=fastq, validate=validate, copy=copy) 108 | -------------------------------------------------------------------------------- /test/apps/umi/whitelist.codon: -------------------------------------------------------------------------------- 1 | # Implementation of barcode whitelisting from UMI tools 2 | # https://github.com/CGATOxford/UMI-tools/blob/master/umi_tools/whitelist.py 3 | # Assumes 16bp barcode at the start of the read 4 | # Usage: seqc whitelist.seq 5 | 6 | from sys import argv 7 | from time import timing 8 | from bio import * 9 | 10 | BARCODE_LEN = 16 11 | 12 | def get_knee_estimate_distance(cell_barcode_counts): 13 | def get_knee_distance(values): 14 | from math import sqrt 15 | a = (values[-1] - values[0]) / (len(values) - 1) 16 | b = -1 17 | c = values[0] 18 | h = sqrt(a**2 + b**2) 19 | dist = lambda x,y: abs(a*x + b*y + c)/h 20 | dist_to_line = [dist(x, y) for x, y in enumerate(values)] 21 | best_idx = max((y, x) for x, y in enumerate(dist_to_line))[1] 22 | return (dist_to_line, best_idx) 23 | 24 | def cumsum(values): 25 | total = values[0] 26 | for i in range(1, len(values)): 27 | total += values[i] 28 | values[i] = total 29 | return values 30 | 31 | bc = [(count, barcode) for barcode,count in cell_barcode_counts.items()] 32 | bc.sort(reverse=True) 33 | counts = [count for count,barcode in bc] 34 | values = cumsum(counts) 35 | prev_idx_of_best_point = 0 36 | dist_to_line, idx_of_best_point = get_knee_distance(values) 37 | 38 | max_iterations = 100 39 | iterations = 0 40 | while idx_of_best_point - prev_idx_of_best_point != 0: 41 | prev_idx_of_best_point = idx_of_best_point 42 | iterations += 1 43 | if iterations >= max_iterations: 44 | break 45 | dist_to_line, idx_of_best_point = get_knee_distance(values[:idx_of_best_point*3]) 46 | 47 | knee_final_barcodes = [x[1] for x in bc[:idx_of_best_point+1]] 48 | return knee_final_barcodes 49 | 50 | def get_error_correct_mapping(cell_barcodes, whitelist): 51 | def neighbors(barcode): 52 | for i in range(len(barcode)): 53 | for b in (s'A', s'C', s'G', s'T'): 54 | if barcode[i] != b: 55 | s = str(barcode) 56 | yield seq(''.join((s[:i], str(b), s[i+1:]))) 57 | 58 | true_to_false = {} 59 | whitelist_set = set(whitelist) 60 | for cell_barcode in cell_barcodes: 61 | if cell_barcode in whitelist_set: 62 | continue 63 | candidates = [neighbor for neighbor in neighbors(cell_barcode) if neighbor in whitelist_set] 64 | if len(candidates) == 1: 65 | true_to_false.setdefault(candidates[0], []).append(cell_barcode) 66 | return true_to_false 67 | 68 | def get_cell_whitelist(cell_barcode_counts): 69 | cell_whitelist = 
get_knee_estimate_distance(cell_barcode_counts) 70 | true_to_false_map = get_error_correct_mapping(cell_barcode_counts.keys(), cell_whitelist) 71 | return cell_whitelist, true_to_false_map 72 | 73 | with timing('whitelist construction'): 74 | cell_barcode_counts = Dict[seq, int]() 75 | FASTQ(argv[1]) |> seqs |> (lambda read: read[:BARCODE_LEN]) |> cell_barcode_counts.increment 76 | cell_whitelist, true_to_false_map = get_cell_whitelist(cell_barcode_counts) 77 | 78 | for barcode in sorted(cell_whitelist): 79 | corrected_barcodes, corrected_barcode_counts = "", "" 80 | if barcode in true_to_false_map: 81 | corrected_barcodes = ",".join([str(k) for k in sorted(true_to_false_map[barcode])]) 82 | corrected_barcode_counts = ",".join([str(cell_barcode_counts[x]) for x in sorted(true_to_false_map[barcode])]) 83 | 84 | print(f'{barcode}\t{corrected_barcodes}\t{cell_barcode_counts[barcode]}\t{corrected_barcode_counts}') 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [Seq logo]
3 | 
4 | 
5 | # Seq — the bioinformatics module for Codon
6 | 7 | ## Introduction 8 | 9 | Seq is a programming language for computational genomics and bioinformatics. With a Python-compatible syntax and a host of domain-specific features and optimizations, Seq makes writing high-performance genomics software as easy as writing Python code, and achieves performance comparable to (and in many cases better than) C/C++. 10 | 11 | Seq is able to outperform Python code by up to 160x. Seq can further beat equivalent C/C++ code by up to 2x without any manual interventions, and also natively supports parallelism out of the box. Implementation details and benchmarks are discussed [in our paper](https://dl.acm.org/citation.cfm?id=3360551). 12 | 13 | Learn more by following the [tutorial](https://docs.seq-lang.org/tutorial) or from the [cookbook](https://docs.seq-lang.org/cookbook). 14 | 15 | ## Examples 16 | Here is an example showcasing some of Seq's bioinformatics features, which include native sequence and k-mer types. 17 | 18 | ```python 19 | from bio import * 20 | s = s'ACGTACGT' # sequence literal 21 | print(s[2:5]) # subsequence 22 | print(~s) # reverse complement 23 | kmer = Kmer[8](s) # convert to k-mer 24 | 25 | # iterate over length-3 subsequences 26 | # with step 2 27 | for sub in s.split(3, step=2): 28 | print(sub[-1]) # last base 29 | 30 | # iterate over 2-mers with step 1 31 | for kmer in sub.kmers(step=1, k=2): 32 | print(~kmer) # '~' also works on k-mers 33 | ``` 34 | 35 | ## Installation 36 | 37 | You need [Codon](https://github.com/exaloop/codon) for Seq to work. Assuming that Codon is installed in `$HOME/.codon`, run: 38 | ```bash 39 | # Download the latest release for your platform from https://github.com/exaloop/seq/releases 40 | tar zxvf seq-[OS]-[ARCH].tar.gz -C ${HOME}/.codon/lib/codon/plugins 41 | ``` 42 | 43 | Afterwards, you can use Seq with `-plugin seq`; for example: 44 | ```bash 45 | codon run -plugin seq test.codon 46 | ``` 47 | 48 | > **Note:** The default aarch64 Linux binaries use NEON emulation. Other builds might also be suboptimal (e.g., using SSE2 instead of SSE4 or AVX). 49 | > You should compile Seq locally to ensure the optimal alignment performance on your hardware. 50 | > 51 | > Please consult [our build file](.github/build-linux/entrypoint.sh) for the build instructions. 52 | 53 | ## Documentation 54 | 55 | Please check [docs.seq-lang.org](https://docs.seq-lang.org) for in-depth documentation. 56 | 57 | ## Citing Seq 58 | 59 | If you use Seq in your research, please cite: 60 | 61 | > Ariya Shajii, Ibrahim Numanagić, Riyadh Baghdadi, Bonnie Berger, and Saman Amarasinghe. 2019. Seq: a high-performance language for bioinformatics. *Proc. ACM Program. Lang.* 3, OOPSLA, Article 125 (October 2019), 29 pages. DOI: https://doi.org/10.1145/3360551 62 | 63 | BibTeX: 64 | 65 | ``` 66 | @article{Shajii:2019:SHL:3366395.3360551, 67 | author = {Shajii, Ariya and Numanagi\'{c}, Ibrahim and Baghdadi, Riyadh and Berger, Bonnie and Amarasinghe, Saman}, 68 | title = {Seq: A High-performance Language for Bioinformatics}, 69 | journal = {Proc. ACM Program. 
Lang.}, 70 | issue_date = {October 2019}, 71 | volume = {3}, 72 | number = {OOPSLA}, 73 | month = oct, 74 | year = {2019}, 75 | issn = {2475-1421}, 76 | pages = {125:1--125:29}, 77 | articleno = {125}, 78 | numpages = {29}, 79 | url = {http://doi.acm.org/10.1145/3360551}, 80 | doi = {10.1145/3360551}, 81 | acmid = {3360551}, 82 | publisher = {ACM}, 83 | address = {New York, NY, USA}, 84 | keywords = {Python, bioinformatics, computational biology, domain-specific language, optimization, programming language}, 85 | } 86 | ``` 87 | -------------------------------------------------------------------------------- /test/pipeline/prefetch.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | class MyIndex[K]: 4 | special: K 5 | getitem_calls: int 6 | prefetch_calls: int 7 | 8 | def __init__(self, special: K): 9 | self.special = special 10 | self.getitem_calls = 0 11 | self.prefetch_calls = 0 12 | 13 | def __getitem__(self, k: K): 14 | self.getitem_calls += 1 15 | return 1 if k == self.special else 0 16 | 17 | def __prefetch__(self, k: K): 18 | self.prefetch_calls += 1 19 | 20 | def lookup1[K](kmer: K, idx: MyIndex[K]): 21 | return (kmer, idx[kmer]) 22 | 23 | @prefetch 24 | def lookup2[K](kmer: K, idx: MyIndex[K]): 25 | return (kmer, idx[kmer]) 26 | 27 | @prefetch 28 | def lookup3[K](kmer: K, idx: MyIndex[K]): 29 | try: 30 | idx[kmer] # issues prefetch 31 | idx[~kmer] # issues prefetch 32 | return (kmer, -99999999) 33 | finally: 34 | idx[kmer] # issues prefetch 35 | idx[~kmer] # issues prefetch 36 | return (kmer, idx[kmer]) 37 | 38 | @prefetch 39 | def lookup4(kmer, idx, v): 40 | v.append((kmer, idx[kmer])) 41 | 42 | @prefetch 43 | def lookup5(kmer, idx, v): 44 | v.append((kmer, idx[kmer])) 45 | return 0 46 | 47 | K: Static[int] = 3 48 | 49 | @test 50 | def test_prefetch_transformation(s): 51 | idx1 = MyIndex(Kmer[K]()) 52 | idx2 = MyIndex(Kmer[K]()) 53 | idx3 = MyIndex(Kmer[K]()) 54 | idx4 = MyIndex(Kmer[K]()) 55 | idx5 = MyIndex(Kmer[K]()) 56 | idx6 = MyIndex(Kmer[K]()) 57 | v1 = [] 58 | v2 = [] 59 | v3 = [] 60 | v4: List[Tuple[Kmer[K], int]] = [] 61 | v5: List[Tuple[Kmer[K], int]] = [] 62 | v6: List[Tuple[Kmer[K], int]] = [] 63 | s = s'ACGTACGTAAAACGTACGTAAAACGTACGT' 64 | 65 | def my_kmers(s): 66 | return s.kmers(1, K) 67 | 68 | s |> kmers(1, K) |> lookup1(idx1) |> v1.append 69 | s |> kmers(1, K) |> lookup2(idx2) |> v2.append 70 | s |> kmers(1, K) |> lookup3(idx3) |> v3.append 71 | my_kmers(s) |> lookup4(idx4, v4) 72 | s |> kmers(1, K) |> lookup5(idx5, v5) 73 | 74 | def pass_prefetch_func_by_arg(s, lookup, idx, v): 75 | s |> kmers(1, K) |> lookup(idx) |> v.append 76 | 77 | pass_prefetch_func_by_arg(s, lookup3, idx6, v6) 78 | 79 | assert set(v1) == set(v2) 80 | assert set(v1) == set(v3) 81 | assert set(v1) == set(v4) 82 | assert set(v1) == set(v5) 83 | assert set(v1) == set(v6) 84 | assert idx1.getitem_calls == len(s) - (K - 1) 85 | assert idx1.prefetch_calls == 0 86 | assert idx1.getitem_calls == idx2.getitem_calls 87 | assert idx3.getitem_calls == 5 * idx1.getitem_calls 88 | assert idx2.prefetch_calls == idx2.getitem_calls 89 | assert idx3.prefetch_calls == 5 * idx2.prefetch_calls 90 | assert idx4.getitem_calls == idx2.getitem_calls 91 | assert idx4.prefetch_calls == idx2.prefetch_calls 92 | assert idx5.getitem_calls == idx4.getitem_calls 93 | assert idx5.prefetch_calls == idx4.prefetch_calls 94 | assert idx6.getitem_calls == idx3.getitem_calls 95 | assert idx6.prefetch_calls == idx3.prefetch_calls 96 | 97 | 
test_prefetch_transformation(s'') 98 | test_prefetch_transformation(s'A') 99 | test_prefetch_transformation(s'ACG') 100 | test_prefetch_transformation(s'ACGTA') 101 | test_prefetch_transformation(s'ACGTACGTAAAACGTACGTAAAACGTACGT') 102 | test_prefetch_transformation(s'ACGTACGTAAAACGTACGTAAAACGTACGTACGTACGTAAAACGTACGTAAAACGTACGTACGTACGTAAAACGTACGTAAAACGTACGTACGTACGTAAAACGTACGTAAAACGTACGT') 103 | 104 | @test 105 | def test_list_prefetch(): 106 | v = [0] 107 | v.prefetch(0) 108 | test_list_prefetch() 109 | 110 | @test 111 | def test_dict_prefetch(): 112 | d = {0:1} 113 | d.prefetch(0) 114 | d.prefetch(42) 115 | test_dict_prefetch() 116 | -------------------------------------------------------------------------------- /test/apps/bwa/fastmap.codon: -------------------------------------------------------------------------------- 1 | from sys import argv, stderr, exit 2 | from time import timing 3 | from bio import FASTQ, FASTA 4 | from bio.fmindex import FMIndex, SMEM 5 | 6 | min_seed = 17 7 | min_iwidth = 20 8 | min_intv = 1 9 | 10 | def open_index_file(basename, mode): 11 | import gzip 12 | return gzip.open(f'{basename}.fastmap.idx', mode) 13 | 14 | def index_load(basename): 15 | from pickle import load 16 | with open_index_file(basename, 'rb') as jar: 17 | return load(jar, FMIndex) 18 | 19 | #@prefetch 20 | def fastmap(rec, fmi, out): 21 | prev, curr, mems = [], [], [] 22 | q = rec.seq 23 | l = len(q) 24 | 25 | start = 0 26 | while True: 27 | while start < l and q[start].N(): 28 | start += 1 29 | if start >= l: 30 | break 31 | 32 | mems.clear() 33 | prev.clear() 34 | curr.clear() 35 | x = start 36 | 37 | if q[x].N(): 38 | return 39 | 40 | ik = SMEM(fmi.biinterval(q[x]), start=x, stop=x+1) 41 | 42 | # forward search 43 | i = x + 1 44 | while i < l: 45 | if not q[i].N(): # an A/C/G/T base 46 | ok = ~fmi[~(ik.interval), ~q[i]] 47 | if len(ok) != len(ik.interval): # change of the interval size 48 | curr.append(ik) 49 | if len(ok) < min_intv: 50 | break # the interval size is too small to be extended further 51 | ik = SMEM(ok, start=x, stop=i+1) 52 | else: # an ambiguous base 53 | curr.append(ik) 54 | break 55 | i += 1 56 | 57 | if i == l: 58 | curr.append(ik) 59 | curr.reverse() 60 | ret = curr[0].stop 61 | prev, curr = curr, prev 62 | 63 | # backward search for MEMs 64 | i = x - 1 65 | while i >= -1: 66 | c = i >= 0 and not q[i].N() 67 | curr.clear() 68 | for p in prev: 69 | ok = None 70 | if c: 71 | ok = fmi[p.interval, q[i]] 72 | if not c or len(ok) < min_intv: 73 | if len(curr) == 0: 74 | if len(mems) == 0 or i + 1 < mems[-1].start: 75 | if len(ik := SMEM(p.interval, start=i+1, stop=p.stop)) >= min_seed: 76 | mems.append(ik) 77 | elif len(curr) == 0 or len(ok) != len(curr[-1].interval): 78 | curr.append(SMEM(ok, start=p.start, stop=p.stop)) 79 | if len(curr) == 0: 80 | break 81 | prev, curr = curr, prev 82 | i -= 1 83 | 84 | mems.reverse() # s.t. 
sorted by the start coordinate 85 | start = ret 86 | 87 | for mem in mems: 88 | intv = mem.interval 89 | offset = mem.start 90 | match_size = len(mem) 91 | 92 | print(f'{rec.name}\tEM\t{offset}\t{offset+match_size}\t{len(intv)}', end='', file=out) 93 | if len(intv) <= min_iwidth: 94 | for rid, name, pos, is_rev in fmi.biresults(mem): 95 | print(f"\t{name}:{'-' if is_rev else '+'}{pos+1}", end='', file=out) 96 | else: 97 | print("\t*", file=out) 98 | print("", file=out) 99 | 100 | def main_index(path): 101 | from pickle import dump 102 | print('building FM-index...', file=stderr) 103 | fmi = FMIndex(path, FMD=True) 104 | print('writing to disk...', file=stderr) 105 | with open_index_file(path, 'wb0') as jar: 106 | dump(fmi, jar) 107 | 108 | def main_search(index, fastq, result): 109 | print('loading index...', file=stderr) 110 | fmi = None 111 | with timing('load'): 112 | fmi = index_load(index) 113 | print('running alignment pipeline...', file=stderr) 114 | with open(result, 'w') as out, timing('fastmap'): 115 | FASTQ(fastq) |> iter |> fastmap(fmi, out) 116 | 117 | match argv[1:]: 118 | case ['index', path]: 119 | main_index(path) 120 | case ['search', index, fastq, result]: 121 | main_search(index, fastq, result) 122 | case _: 123 | print("error: unknown mode: valid modes are 'index' and 'search'", file=stderr) 124 | exit(1) 125 | -------------------------------------------------------------------------------- /test/apps/bwa/fastmap_build.codon: -------------------------------------------------------------------------------- 1 | from sys import argv, stderr, exit 2 | from time import timing 3 | from bio import FASTQ, FASTA 4 | from bio.fmindex import FMIndex, SMEM 5 | 6 | min_seed = 17 7 | min_iwidth = 20 8 | min_intv = 1 9 | 10 | def open_index_file(basename, mode): 11 | import gzip 12 | return gzip.open(f'{basename}.fastmap.idx', mode) 13 | 14 | def index_load(basename): 15 | from pickle import load 16 | with open_index_file(basename, 'rb') as jar: 17 | return load(jar, FMIndex) 18 | 19 | #@prefetch 20 | def fastmap(rec, fmi, out): 21 | prev, curr, mems = [], [], [] 22 | q = rec.seq 23 | l = len(q) 24 | 25 | start = 0 26 | while True: 27 | while start < l and q[start].N(): 28 | start += 1 29 | if start >= l: 30 | break 31 | 32 | mems.clear() 33 | prev.clear() 34 | curr.clear() 35 | x = start 36 | 37 | if q[x].N(): 38 | return 39 | 40 | ik = SMEM(fmi.biinterval(q[x]), start=x, stop=x+1) 41 | 42 | # forward search 43 | i = x + 1 44 | while i < l: 45 | if not q[i].N(): # an A/C/G/T base 46 | ok = ~fmi[~(ik.interval), ~q[i]] 47 | if len(ok) != len(ik.interval): # change of the interval size 48 | curr.append(ik) 49 | if len(ok) < min_intv: 50 | break # the interval size is too small to be extended further 51 | ik = SMEM(ok, start=x, stop=i+1) 52 | else: # an ambiguous base 53 | curr.append(ik) 54 | break 55 | i += 1 56 | 57 | if i == l: 58 | curr.append(ik) 59 | curr.reverse() 60 | ret = curr[0].stop 61 | prev, curr = curr, prev 62 | 63 | # backward search for MEMs 64 | i = x - 1 65 | while i >= -1: 66 | c = i >= 0 and not q[i].N() 67 | curr.clear() 68 | for p in prev: 69 | ok = None 70 | if c: 71 | ok = fmi[p.interval, q[i]] 72 | if not c or len(ok) < min_intv: 73 | if len(curr) == 0: 74 | if len(mems) == 0 or i + 1 < mems[-1].start: 75 | if len(ik := SMEM(p.interval, start=i+1, stop=p.stop)) >= min_seed: 76 | mems.append(ik) 77 | elif len(curr) == 0 or len(ok) != len(curr[-1].interval): 78 | curr.append(SMEM(ok, start=p.start, stop=p.stop)) 79 | if len(curr) == 0: 80 | break 81 | prev, 
curr = curr, prev 82 | i -= 1 83 | 84 | mems.reverse() # s.t. sorted by the start coordinate 85 | start = ret 86 | 87 | for mem in mems: 88 | intv = mem.interval 89 | offset = mem.start 90 | match_size = len(mem) 91 | 92 | print(f'{rec.name}\tEM\t{offset}\t{offset+match_size}\t{len(intv)}', end='', file=out) 93 | if len(intv) <= min_iwidth: 94 | for rid, name, pos, is_rev in fmi.biresults(mem): 95 | print(f"\t{name}:{'-' if is_rev else '+'}{pos+1}", end='', file=out) 96 | else: 97 | print("\t*", file=out) 98 | print("", file=out) 99 | 100 | def main_index(path): 101 | from pickle import dump 102 | print('building FM-index...', file=stderr) 103 | fmi = FMIndex(path, FMD=True) 104 | print('writing to disk...', file=stderr) 105 | with open_index_file(path, 'wb0') as jar: 106 | dump(fmi, jar) 107 | 108 | def main_search(index, fastq, result): 109 | print('loading index...', file=stderr) 110 | fmi = None 111 | with timing('load'): 112 | fmi = FMIndex(index, FMD=True) 113 | print('running alignment pipeline...', file=stderr) 114 | with open(result, 'w') as out, timing('fastmap'): 115 | FASTQ(fastq) |> iter |> fastmap(fmi, out) 116 | 117 | match argv[1:]: 118 | case ['index', path]: 119 | main_index(path) 120 | case ['search', index, fastq, result]: 121 | main_search(index, fastq, result) 122 | case _: 123 | print("error: unknown mode: valid modes are 'index' and 'search'", file=stderr) 124 | exit(1) 125 | -------------------------------------------------------------------------------- /test/apps/snap/hashtable.seq: -------------------------------------------------------------------------------- 1 | # Implementation of SNAP aligner's hash table 2 | # https://github.com/amplab/snap/blob/master/SNAPLib/HashTable.{cpp,h} 3 | 4 | # Need the following hooks linked to convert C++ SNAPHashTable to Seq object: 5 | # snap_hashtable_ptr(Ptr[byte]) -> Ptr[tuple[K,V]] -- extract table pointer 6 | # snap_hashtable_len(Ptr[byte]) -> int -- extract table length 7 | # snap_hashtable_invalid_val(Ptr[byte]) -> V -- extract "invalid" value 8 | 9 | QUADRATIC_CHAINING_DEPTH = 5 10 | 11 | class SNAPHashTable[K,V]: 12 | table: Array[Tuple[V,K]] # this order is consistent with SNAP 13 | invalid_val: V 14 | 15 | def _hash(k): 16 | key = hash(k) 17 | key ^= int(UInt[64](key) >> UInt[64](33)) 18 | key *= 0xff51afd7ed558ccd 19 | key ^= int(UInt[64](key) >> UInt[64](33)) 20 | key *= 0xc4ceb9fe1a85ec53 21 | key ^= int(UInt[64](key) >> UInt[64](33)) 22 | return key 23 | 24 | def __init__(self, size: int, invalid_val: V): 25 | self.table = Array(size) 26 | self.invalid_val = invalid_val 27 | 28 | for i in range(size): 29 | self.table[i] = (invalid_val, K()) 30 | 31 | def __init__(self, p: Ptr[byte]): 32 | from C import snap_hashtable_ptr(Ptr[byte]) -> Ptr[byte] 33 | from C import snap_hashtable_len(Ptr[byte]) -> int 34 | from C import snap_hashtable_invalid_val(Ptr[byte]) -> int 35 | self.table = Array[Tuple[V,K]](Ptr[Tuple[V,K]](snap_hashtable_ptr(p)), snap_hashtable_len(p)) 36 | self.invalid_val = V(snap_hashtable_invalid_val(p)) 37 | 38 | def _get_index(self, where: int): 39 | return int(UInt[64](where) % UInt[64](len(self.table))) 40 | 41 | def _get_entry_index_for_key(self, k: K): 42 | table = self.table 43 | table_size = table.len 44 | table_index = self._get_index(SNAPHashTable[K,V]._hash(k)) 45 | wrapped = False 46 | n_probes = 1 47 | invalid_val = self.invalid_val 48 | 49 | while table[table_index][1] != k and table[table_index][0] != invalid_val: 50 | table_index += (n_probes ** 2) if n_probes < 
QUADRATIC_CHAINING_DEPTH else 1 51 | n_probes += 1 52 | 53 | if table_index >=table_size: 54 | if wrapped: 55 | return -1 56 | 57 | wrapped = True 58 | table_index %= table_size 59 | 60 | return table_index 61 | 62 | def get_value_ptr_for_key(self, k: K): 63 | table = self.table 64 | table_size = table.len 65 | table_index = self._get_index(SNAPHashTable[K,V]._hash(k)) 66 | invalid_val = self.invalid_val 67 | entry = table[table_index] 68 | 69 | if entry[1] == k and entry[0] != invalid_val: 70 | return Ptr[V]((table.ptr + table_index).as_byte()) 71 | else: 72 | n_probes = 0 73 | while True: 74 | n_probes += 1 75 | 76 | if n_probes > table_size + QUADRATIC_CHAINING_DEPTH: 77 | return Ptr[V]() 78 | 79 | diff = (n_probes**2) if n_probes < QUADRATIC_CHAINING_DEPTH else 1 80 | table_index = (table_index + diff) % table_size 81 | 82 | entry = table[table_index] 83 | if not (entry[1] != k and entry[0] != invalid_val): 84 | break 85 | 86 | return Ptr[V]((table.ptr + table_index).as_byte()) 87 | 88 | def __prefetch__(self, k: K): 89 | table = self.table 90 | table_index = self._get_index(SNAPHashTable[K,V]._hash(k)) 91 | (self.table.ptr + table_index).__prefetch_r3__() 92 | 93 | def lookup_slow(self, k: K): 94 | entry = self._get_entry_index_for_key(k) 95 | return self.table[entry][0] if entry >= 0 else self.invalid_val 96 | 97 | def __getitem__(self, k: K): 98 | p = self.get_value_ptr_for_key(k) 99 | return p[0] if p else self.invalid_val 100 | 101 | def __setitem__(self, k: K, v: V): 102 | entry = self._get_entry_index_for_key(k) 103 | if entry >= 0: 104 | self.table[entry] = (v, k) 105 | 106 | def _test(): 107 | h = SNAPHashTable[i32,i32](100, i32(-1)) 108 | for i in range(10): 109 | h[i32(42 + i*100)] = i32(i) 110 | 111 | for i in range(100): 112 | print int(h[i32(42 + i*100)]) 113 | -------------------------------------------------------------------------------- /test/pipeline/interalign.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | Q,T = ['test/data/' + a for a in ('MT-orang.fa','MT-human.fa')] 4 | 5 | def normal_align(query: seq, 6 | target: seq, 7 | a: int = 2, 8 | b: int = 4, 9 | ambig: int = 0, 10 | gapo: int = 4, 11 | gape: int = 2, 12 | bandwidth: int = -1, 13 | zdrop: int = -1, 14 | end_bonus: int = 0, 15 | ext_only: bool = False, 16 | score_only: bool = False, 17 | rev_cigar: bool = False): 18 | return query.align(target, a=a, b=b, ambig=ambig, gapo=gapo, gape=gape, bandwidth=bandwidth, zdrop=zdrop, 19 | end_bonus=end_bonus, ext_only=ext_only, score_only=score_only, rev_cigar=rev_cigar) 20 | 21 | @inter_align 22 | @test 23 | def aln1(t): 24 | query, target = t 25 | score = query.align(target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5, score_only=True).score 26 | score_exp = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5, score_only=True).score 27 | assert score == score_exp 28 | 29 | rev_cigar = True 30 | ext_only = True 31 | 32 | def walk_cigar(query, target, cigar): 33 | a=1 34 | b=2 35 | ambig=0 36 | gapo=2 37 | gape=1 38 | 39 | i, j = 0, 0 40 | score = 0 41 | for n, op in cigar: 42 | match op: 43 | case 'M': 44 | for k in range(n): 45 | q = query[i + k] 46 | t = target[j + k] 47 | if q.N() or t.N(): 48 | score += ambig 49 | elif q.__int__() == t.__int__(): 50 | score += a 51 | else: 52 | score -= b 53 | i += n 54 | j += n 55 | case 'I': 56 | score -= (gapo + n*gape) 57 | i += n 58 | case 'D': 59 | score -= (gapo + n*gape) 60 | 
j += n 61 | case _: 62 | assert False 63 | return score 64 | 65 | @inter_align 66 | @test 67 | def aln2(t): 68 | query, target = t 69 | my_ext_only = ext_only 70 | inter = query.align(target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=0, ext_only=my_ext_only, score_only=False) 71 | intra = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=0, ext_only=my_ext_only, score_only=False) 72 | assert inter.score == intra.score 73 | assert walk_cigar(query, target, inter.cigar) == inter.score 74 | assert walk_cigar(query, target, intra.cigar) == intra.score 75 | assert inter.score == query.align(target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=0, ext_only=my_ext_only).score 76 | 77 | a = 1 78 | b = 2 79 | ambig = 0 80 | gapo = 2 81 | gape = 1 82 | zdrop = 100 83 | bandwidth = 100 84 | end_bonus = 5 85 | 86 | @inter_align 87 | @test 88 | def aln3(t): 89 | query, target = t 90 | if query[0] == target[0]: 91 | score = query.align(target, a=a, b=b, ambig=ambig, gapo=gapo, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 92 | score_exp = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 93 | assert score == score_exp 94 | score2 = query.align(target, a=1, b=2, ambig=0, gapo=2, gape=gape, zdrop=zdrop, bandwidth=bandwidth, end_bonus=end_bonus).score 95 | assert score == score2 96 | 97 | @inter_align 98 | @test 99 | def aln4(t): 100 | # tests intra-alignment demotion 101 | for i in range(2): 102 | query, target = t 103 | query = ~query 104 | target = ~target 105 | score = query.align(target, a=a, b=b, ambig=ambig, gapo=gapo, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 106 | score_exp = normal_align(query, target, a=1, b=2, ambig=0, gapo=2, gape=1, zdrop=100, bandwidth=100, end_bonus=5).score 107 | assert score == score_exp 108 | query = query[:len(query)//2] 109 | target = target[:len(target)//2] 110 | 111 | def subs(path: str, n: int = 20): 112 | for a in seqs(FASTA(path)): 113 | for b in a.split(n, 1): 114 | yield b 115 | 116 | zip(subs(Q), subs(T)) |> aln1 117 | zip(subs(Q), subs(T)) |> aln2 118 | zip(subs(Q), subs(T)) |> aln3 119 | zip(subs(Q, 1024), subs(T, 1024)) |> aln4 120 | -------------------------------------------------------------------------------- /htslib-config.h.cmake: -------------------------------------------------------------------------------- 1 | /* config.h. Generated from config.h.in by configure. */ 2 | /* config.h.in. Generated from configure.ac by autoheader. */ 3 | 4 | /* If you use configure, this file provides #defines reflecting your 5 | configuration choices. If you have not run configure, suitable 6 | conservative defaults will be used. 7 | 8 | Autoheader adds a number of items to this template file that are not 9 | used by HTSlib: STDC_HEADERS and most HAVE_*_H header file defines 10 | are immaterial, as we assume standard ISO C headers and facilities; 11 | the PACKAGE_* defines are unused and are overridden by the more 12 | accurate PACKAGE_VERSION as computed by the Makefile. */ 13 | 14 | /* Define if HTSlib should enable GCS support. */ 15 | /* #undef ENABLE_GCS */ 16 | 17 | /* Define if HTSlib should enable plugins. */ 18 | /* #undef ENABLE_PLUGINS */ 19 | 20 | /* Define if HTSlib should enable S3 support. */ 21 | /* #undef ENABLE_S3 */ 22 | 23 | /* Define if you have the Common Crypto library. */ 24 | /* #undef HAVE_COMMONCRYPTO */ 25 | 26 | /* Define to 1 if you have the `drand48' function. 
*/ 27 | #define HAVE_DRAND48 1 28 | 29 | /* Define if using an external libhtscodecs */ 30 | /* #undef HAVE_EXTERNAL_LIBHTSCODECS */ 31 | 32 | /* Define to 1 if you have the `fdatasync' function. */ 33 | /* #undef HAVE_FDATASYNC */ 34 | 35 | /* Define to 1 if you have the `fsync' function. */ 36 | #define HAVE_FSYNC 1 37 | 38 | /* Define to 1 if you have the `getpagesize' function. */ 39 | #define HAVE_GETPAGESIZE 1 40 | 41 | /* Define to 1 if you have the `gmtime_r' function. */ 42 | #define HAVE_GMTIME_R 1 43 | 44 | /* Define if you have libcrypto-style HMAC(). */ 45 | /* #undef HAVE_HMAC */ 46 | 47 | /* Define to 1 if you have the header file. */ 48 | #define HAVE_INTTYPES_H 1 49 | 50 | /* Define to 1 if you have the `bz2' library (-lbz2). */ 51 | #define HAVE_LIBBZ2 1 52 | 53 | /* Define if libcurl file access is enabled. */ 54 | /* #undef HAVE_LIBCURL */ 55 | 56 | /* Define if libdeflate is available. */ 57 | /* #undef HAVE_LIBDEFLATE */ 58 | 59 | /* Define to 1 if you have the `lzma' library (-llzma). */ 60 | #define HAVE_LIBLZMA 1 61 | 62 | /* Define to 1 if you have the `z' library (-lz). */ 63 | #define HAVE_LIBZ 1 64 | 65 | /* Define to 1 if you have the header file. */ 66 | #define HAVE_LZMA_H 1 67 | 68 | /* Define to 1 if you have a working `mmap' system call. */ 69 | #define HAVE_MMAP 1 70 | 71 | /* Define to 1 if you have the `srand48_deterministic' function. */ 72 | /* #undef HAVE_SRAND48_DETERMINISTIC */ 73 | 74 | /* Define to 1 if you have the header file. */ 75 | #define HAVE_STDINT_H 1 76 | 77 | /* Define to 1 if you have the header file. */ 78 | #define HAVE_STDIO_H 1 79 | 80 | /* Define to 1 if you have the header file. */ 81 | #define HAVE_STDLIB_H 1 82 | 83 | /* Define to 1 if you have the header file. */ 84 | #define HAVE_STRINGS_H 1 85 | 86 | /* Define to 1 if you have the header file. */ 87 | #define HAVE_STRING_H 1 88 | 89 | /* Define to 1 if you have the header file. */ 90 | #define HAVE_SYS_PARAM_H 1 91 | 92 | /* Define to 1 if you have the header file. */ 93 | #define HAVE_SYS_STAT_H 1 94 | 95 | /* Define to 1 if you have the header file. */ 96 | #define HAVE_SYS_TYPES_H 1 97 | 98 | /* Define to 1 if you have the header file. */ 99 | #define HAVE_UNISTD_H 1 100 | 101 | /* Define to the address where bug reports for this package should be sent. */ 102 | #define PACKAGE_BUGREPORT "samtools-help@lists.sourceforge.net" 103 | 104 | /* Define to the full name of this package. */ 105 | #define PACKAGE_NAME "HTSlib" 106 | 107 | /* Define to the full name and version of this package. */ 108 | #define PACKAGE_STRING "HTSlib 1.13-23-g3eada2f" 109 | 110 | /* Define to the one symbol short name of this package. */ 111 | #define PACKAGE_TARNAME "htslib" 112 | 113 | /* Define to the home page for this package. */ 114 | #define PACKAGE_URL "http://www.htslib.org/" 115 | 116 | /* Define to the version of this package. */ 117 | #define PACKAGE_VERSION "1.13-23-g3eada2f" 118 | 119 | /* Platform-dependent plugin filename extension. */ 120 | /* #undef PLUGIN_EXT */ 121 | 122 | /* Define to 1 if all of the C90 standard headers exist (not just the ones 123 | required in a freestanding environment). This macro is provided for 124 | backward compatibility; new code need not use it. */ 125 | #define STDC_HEADERS 1 126 | 127 | /* Number of bits in a file offset, on hosts where this is settable. */ 128 | /* #undef _FILE_OFFSET_BITS */ 129 | 130 | /* Define for large files, on AIX-style hosts. 
*/ 131 | /* #undef _LARGE_FILES */ 132 | 133 | /* Needed for PTHREAD_MUTEX_RECURSIVE */ 134 | /* #undef _XOPEN_SOURCE */ 135 | -------------------------------------------------------------------------------- /stdlib/bio/fastq.codon: -------------------------------------------------------------------------------- 1 | # FASTQ format parser 2 | # https://en.wikipedia.org/wiki/FASTQ_format 3 | from bio.seq import seq 4 | from copy import copy 5 | 6 | @tuple 7 | class FASTQRecord: 8 | _header: str 9 | _read: seq 10 | _qual: str 11 | 12 | @property 13 | def header(self): 14 | return self._header 15 | 16 | @property 17 | def name(self): 18 | from bio.builtin import _split_header_on_space 19 | return _split_header_on_space(self.header)[0] 20 | 21 | @property 22 | def comment(self): 23 | from bio.builtin import _split_header_on_space 24 | return _split_header_on_space(self.header)[1] 25 | 26 | @property 27 | def read(self): 28 | return self._read 29 | 30 | # FASTA compatibility 31 | @property 32 | def seq(self): 33 | return self._read 34 | 35 | @property 36 | def qual(self): 37 | return self._qual 38 | 39 | @tuple 40 | class FASTQReader: 41 | _file: cobj 42 | validate: bool 43 | gzip: bool 44 | copy: bool 45 | 46 | def __new__(path: str, validate: bool, gzip: bool, copy: bool) -> FASTQReader: 47 | return FASTQReader(gzopen(path, "r").__raw__() if gzip else open(path, "r").__raw__(), validate, gzip, copy) 48 | 49 | @property 50 | def file(self): 51 | assert not self.gzip 52 | p = __array__[cobj](1) 53 | p.ptr[0] = self._file 54 | return Ptr[File](p.ptr.as_byte())[0] 55 | 56 | @property 57 | def gzfile(self): 58 | assert self.gzip 59 | p = __array__[cobj](1) 60 | p.ptr[0] = self._file 61 | return Ptr[gzFile](p.ptr.as_byte())[0] 62 | 63 | def _preprocess_read(self, a: str): 64 | from bio.builtin import _validate_str_as_seq 65 | if self.validate: 66 | return _validate_str_as_seq(a, self.copy) 67 | else: 68 | return copy(seq(a.ptr, a.len)) if self.copy else seq(a.ptr, a.len) 69 | 70 | def _preprocess_qual(self, a: str): 71 | from bio.builtin import _validate_str_as_qual 72 | if self.validate: 73 | return _validate_str_as_qual(a, self.copy) 74 | else: 75 | return a.__ptrcopy__() if self.copy else a 76 | 77 | def _iter_core(self, file, seqs: bool) -> Generator[FASTQRecord]: 78 | line = 0 79 | name, read, qual = "", s"", "" 80 | for a in file._iter_trim_newline(): 81 | x = line % 4 82 | if x == 0: 83 | if self.validate and a[0] != "@": 84 | raise ValueError(f"sequence name on line {line + 1} of FASTQ does not begin with '@'") 85 | name = a[1:].__ptrcopy__() if self.copy else a[1:] 86 | elif x == 1: 87 | read = self._preprocess_read(a) 88 | if seqs: 89 | yield FASTQRecord("", read, "") 90 | elif x == 2: 91 | if self.validate and a[0] != "+": 92 | raise ValueError(f"invalid separator on line {line + 1} of FASTQ") 93 | elif x == 3: 94 | if self.validate and len(a) != len(read): 95 | raise ValueError(f"quality and sequence length mismatch on line {line + 1} of FASTQ") 96 | qual = self._preprocess_qual(a) 97 | assert read.len >= 0 98 | if not seqs: 99 | yield FASTQRecord(name, read, qual) 100 | else: 101 | assert False 102 | line += 1 103 | 104 | def __seqs__(self): 105 | if self.gzip: 106 | for rec in self._iter_core(self.gzfile, seqs=True): 107 | yield rec.seq 108 | else: 109 | for rec in self._iter_core(self.file, seqs=True): 110 | yield rec.seq 111 | self.close() 112 | 113 | def __iter__(self): 114 | if not self.copy: 115 | raise ValueError("cannot iterate over FASTQ records with copy=False") 116 | if 
self.gzip: 117 | yield from self._iter_core(self.gzfile, seqs=False) 118 | else: 119 | yield from self._iter_core(self.file, seqs=False) 120 | self.close() 121 | 122 | def __blocks__(self, size: int): 123 | from bio.block import _blocks 124 | if not self.copy: 125 | raise ValueError("cannot read sequences in blocks with copy=False") 126 | return _blocks(self.__iter__(), size) 127 | 128 | def close(self): 129 | if self.gzip: 130 | self.gzfile.close() 131 | else: 132 | self.file.close() 133 | 134 | def __enter__(self): 135 | pass 136 | 137 | def __exit__(self): 138 | self.close() 139 | 140 | def FASTQ(path: str, validate: bool = True, gzip: bool = True, copy: bool = True): 141 | return FASTQReader(path=path, validate=validate, gzip=gzip, copy=copy) 142 | -------------------------------------------------------------------------------- /test/apps/cora/hom_exact.codon: -------------------------------------------------------------------------------- 1 | # Implementation of CORA's exact homology table construction 2 | # https://github.com/denizy/cora/blob/master/homTable_setup.cpp 3 | 4 | # Usage: seqc hom_exact.seq 5 | # Output format (gzip'd): 6 | # - N = total records [i64] 7 | # - N times: 8 | # - block size [i64] 9 | # - C = count [i64] 10 | # - C times: 11 | # - Chromosome index [i64] 12 | # - Chromosome position (0-based) [i64] 13 | # - Reverse complemented? [i8] 14 | 15 | from sys import argv, stderr, exit 16 | from pickle import dump 17 | from bio import FASTA, Locus 18 | import gzip 19 | 20 | K: Static[int] = 64 21 | 22 | @tuple 23 | class BitSet: 24 | v: List[int] 25 | 26 | def __new__(n: int) -> BitSet: 27 | return BitSet([0 for _ in range((n // 64) + 1)],) 28 | def __getitem__(self, idx: int): 29 | return (self.v[idx // 64] & (1 << (idx % 64))) != 0 30 | def __setitem__(self, idx: int, b: bool): 31 | if b: 32 | self.v[idx // 64] |= (1 << (idx % 64)) 33 | else: 34 | self.v[idx // 64] &= ~(1 << (idx % 64)) 35 | 36 | if len(argv) != 3: 37 | print(f'usage: {argv[0]} ', file=stderr) 38 | exit(1) 39 | 40 | path = argv[1] 41 | num_kmers = sum(2 if kmer == ~kmer else 1 # palindromes added twice 42 | for rec in FASTA(path) for kmer in rec.seq.kmers(1, K)) 43 | print 'num_kmers:', num_kmers 44 | kmer_list = List(num_kmers) 45 | 46 | @tuple 47 | class EqClass: 48 | idx: int 49 | count: int 50 | 51 | def __getitem__(self, idx: int): 52 | return kmer_list[self.idx + idx][1] 53 | 54 | bitsets = [] # markers for equivalence class representatives 55 | for tid, rec in enumerate(FASTA(path)): 56 | print 'processing', rec.name 57 | bitsets.append(BitSet(len(rec.seq))) 58 | for pos, kmer in rec.seq.kmers_with_pos(1, K): 59 | kmer_rev = ~kmer 60 | add_pal = (kmer == kmer_rev) 61 | if kmer_rev < kmer: 62 | kmer = kmer_rev 63 | pos = -pos 64 | kmer_list.append((kmer, Locus(tid, pos))) 65 | if add_pal: # add palindrome again 66 | kmer_list.append((kmer, Locus(tid, -pos))) 67 | 68 | print 'sorting kmer_list...' 
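# Sorting on the tuple's first element (the canonical k-mer) brings every
# occurrence of the same k-mer next to each other, so the two scans below can
# first count runs longer than one (num_classes) and then record each run as
# an EqClass(start_index, count). Illustrative example only: after sorting, a
# run like
#   (k'AACG...T', Locus(0, 12)), (k'AACG...T', Locus(2, -7))
# collapses into a single equivalence class with count 2, while singleton
# k-mers are skipped.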
69 | kmer_list.sort() 70 | print 'done' 71 | 72 | num_classes = 0 73 | i = 0 74 | while i < len(kmer_list): 75 | j = i + 1 76 | while j < len(kmer_list) and kmer_list[i][0] == kmer_list[j][0]: 77 | j += 1 78 | if j - i > 1: 79 | num_classes += 1 80 | i = j 81 | 82 | print 'num_classes:', num_classes 83 | eq_set = List(num_classes) 84 | i = 0 85 | while i < len(kmer_list): 86 | j = i + 1 87 | while j < len(kmer_list) and kmer_list[i][0] == kmer_list[j][0]: 88 | j += 1 89 | count = j - i 90 | if count > 1: 91 | eq_set.append(EqClass(i, count)) 92 | # make sure representative is always forward-facing 93 | if kmer_list[i][1].reversed: 94 | for k in range(count): 95 | kmer_list[i + k] = (kmer_list[i + k][0], ~kmer_list[i + k][1]) 96 | bitsets[kmer_list[i][1].tid][kmer_list[i][1].pos] = True 97 | i = j 98 | 99 | print 'sorting eq_set...' 100 | eq_set.sort(key=lambda x: (x[0].tid, x[0].pos)) 101 | print 'done' 102 | 103 | def find_block_size(start: int, eq_set: list[EqClass]): 104 | base_idx = eq_set[start].idx 105 | base_len = eq_set[start].count 106 | dist = 1 107 | while (start + dist < len(eq_set) and 108 | eq_set[start][0].tid == eq_set[start + dist][0].tid and 109 | eq_set[start][0].pos + dist == eq_set[start + dist][0].pos): 110 | comp_len = eq_set[start + dist].count 111 | 112 | if comp_len != base_len: 113 | return dist 114 | 115 | for k in range(1, base_len): 116 | if (eq_set[start][k].reversed != eq_set[start + dist][k].reversed or 117 | eq_set[start][k].tid != eq_set[start + dist][k].tid): 118 | return dist 119 | 120 | offset = -dist if eq_set[start][k].reversed else dist 121 | if eq_set[start][k].pos + offset != eq_set[start + dist][k].pos: 122 | return dist 123 | 124 | dist += 1 125 | return dist 126 | 127 | total = 0 128 | i = 0 129 | while i < len(eq_set): 130 | total += 1 131 | i += find_block_size(i, eq_set) 132 | 133 | with gzip.open(argv[2] + '.hom_exact', 'wb') as out, gzip.open(argv[2] + '.reps_bitsets', 'wb') as reps: 134 | dump(total, out) 135 | i = 0 136 | while i < len(eq_set): 137 | block_size = find_block_size(i, eq_set) 138 | count = eq_set[i].count 139 | dump(block_size, out) 140 | dump(count, out) 141 | for k in range(count): 142 | dump(eq_set[i][k].tid, out) 143 | dump(eq_set[i][k].pos, out) 144 | dump(eq_set[i][k].reversed, out) 145 | i += block_size 146 | dump(bitsets, reps) 147 | -------------------------------------------------------------------------------- /test/core/bwtsa.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | from bio.fmindex import FMIndex, FMDIndex 3 | import gzip 4 | import pickle 5 | 6 | Q,T = ['test/data/' + a for a in ('MT-orang.fa','MT-human.fa')] 7 | 8 | def suffix_array_slow[T](s: T): 9 | return [i for _, i in sorted([(s, i) for i in range(len(s))], key=lambda t: t[0][t[1]:])] 10 | 11 | def bwt_slow[T](s: T): 12 | t = str(s) + '$' 13 | n = len(t) 14 | m = sorted([t[i:n]+t[0:i] for i in range(n)]) 15 | return ''.join([q[-1] for q in m]) 16 | 17 | @test 18 | def test_suffix_array(): 19 | assert len(s''.suffix_array()) == 0 20 | assert s'A'.suffix_array() == [0] 21 | 22 | for s in list(seqs(FASTA(Q))) + list(seqs(FASTA(T))): 23 | SA = s.suffix_array() 24 | assert SA == suffix_array_slow(s) 25 | s = ~s 26 | SA = s.suffix_array() 27 | assert SA == suffix_array_slow(s) 28 | 29 | @test 30 | def test_bwt(): 31 | assert s''.bwt() == s'$' 32 | assert s'A'.bwt() == s'A$' 33 | 34 | for s in list(seqs(FASTA(Q))) + list(seqs(FASTA(T))): 35 | b = str(s.bwt()) 36 | assert b == bwt_slow(s) 
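        # repeat the BWT check on the reverse-complemented sequence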
37 | s = ~s 38 | b = str(s.bwt()) 39 | assert b == bwt_slow(s) 40 | 41 | @test 42 | def test_fmindex(FMD: bool): 43 | # sequence-based 44 | if not FMD: 45 | fmi = FMIndex(s'TAACGAGGCGGCTCGTAGTATAAACGCTTTGGACTAGACTCGATACCTAG') 46 | assert fmi.count(s'TA') == 7 47 | assert fmi.count(s'TAA') == 2 48 | assert fmi.count(s'TATT') == 0 49 | assert sorted(list(fmi[s'TAA'])) == sorted(list(fmi[~s'TTA'])) == [0, 20] 50 | assert len(list(fmi[s'TATT'])) == 0 51 | 52 | # FASTA-based 53 | fmi = FMIndex('test/data/seqs.fasta', FMD=FMD) 54 | with gzip.open('fmi.bin', 'wb') as jar: 55 | pickle.dump(fmi, jar) 56 | 57 | with gzip.open('fmi.bin', 'rb') as jar: 58 | fmi = pickle.load(jar, FMIndex) 59 | 60 | assert fmi.sequence(1, 20, rid=0) == fmi.sequence(1, 20, name='chrA') == s'CCTCCCCGTTCGCTGGACC' 61 | assert fmi.sequence(1, 20, rid=3) == fmi.sequence(1, 20, name='chrD') == s'GCCGTGACCACCCCGCGAG' 62 | assert [(a.tid, a.name, a.len) for a in fmi.contigs()] == [(0, 'chrA', 460), (1, 'chrB', 489), (2, 'chrC', 500), (3, 'chrD', 49)] 63 | if not FMD: 64 | assert fmi.count(s'TATA') == 6 # note TATATA in chrC 65 | assert fmi.count(s'TATAC') == 0 66 | assert sorted(list(fmi.locate(s'TATAA'))) == [(1, 'chrB', 168), (2, 'chrC', 275), (2, 'chrC', 485)] 67 | assert sorted(list(fmi.loci(fmi._get_interval(s'TATAA')))) == [Locus(tid=1, pos=168), Locus(tid=2, pos=275), Locus(tid=2, pos=485)] 68 | 69 | @test 70 | def test_fmdindex(): 71 | # FASTA-based 72 | fmi = FMDIndex('test/data/seqs.fasta') 73 | with gzip.open('fmi.bin', 'wb') as jar: 74 | pickle.dump(fmi, jar) 75 | 76 | with gzip.open('fmi.bin', 'rb') as jar: 77 | fmi = pickle.load(jar, FMDIndex) 78 | 79 | assert fmi.sequence(1, 20, rid=0) == fmi.sequence(1, 20, name='chrA') == s'CCTCCCCGTTCGCTGGACC' 80 | assert fmi.sequence(1, 20, rid=3) == fmi.sequence(1, 20, name='chrD') == s'GCCGTGACCACCCCGCGAG' 81 | assert [(a.tid, a.name, a.len) for a in fmi.contigs()] == [(0, 'chrA', 460), (1, 'chrB', 489), (2, 'chrC', 500), (3, 'chrD', 49)] 82 | assert sorted(list(fmi.locate(s'TATAA'))) == [(1, 'chrB', 168, False), (2, 'chrC', 275, False), (2, 'chrC', 485, False)] 83 | assert sorted(list(fmi.locate(s'CAGGG', both_strands=True))) == [(0, 'chrA', 214, False), (0, 'chrA', 226, False), (0, 'chrA', 338, True), (0, 'chrA', 381, False), (2, 'chrC', 448, False)] 84 | assert sorted(list(fmi.loci(fmi._get_interval(s'CAGGG')))) == [Locus(tid=0, pos=214), Locus(tid=0, pos=226), Locus(tid=0, pos=-338), Locus(tid=0, pos=381), Locus(tid=2, pos=448)] 85 | 86 | @test 87 | def test_smems[FM](fmi: FM, path: str): 88 | # FASTA-based 89 | ref = [rec for rec in FASTA(path)] 90 | with gzip.open('fmi.bin', 'wb') as jar: 91 | pickle.dump(fmi, jar) 92 | 93 | with gzip.open('fmi.bin', 'rb') as jar: 94 | fmi = pickle.load(jar, FM) 95 | 96 | q = s'ACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAA' 97 | v = [[(name, pos, is_rev, ref[rid].seq[pos:pos + len(smem)]) for rid, name, pos, is_rev in fmi.biresults(smem)] for smem in fmi.smems(q, x=20, min_intv=1)[1]] 98 | assert v == [[('chrC', 61, True, s'TATTCATCCTATGTGGGTAATTGAGGAGTATGCTAAGATTTTGCGTAGC'), ('chrC', 10, False, s'GCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATA')]] 99 | 100 | q = s'CTTAA' 101 | v = [[(name, pos, is_rev, ref[rid].seq[pos:pos + len(smem)]) for rid, name, pos, is_rev in fmi.biresults(smem)] for smem in fmi.smems(q, x=1, min_intv=1)[1]] 102 | assert v == [[('chrA', 2, False, s'CTTAA')]] 103 | 104 | test_suffix_array() 105 | test_bwt() 106 | test_fmindex(FMD=True) 107 | test_fmindex(FMD=False) 108 | test_fmdindex() 109 | 
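# The SMEM checks below feed the same FASTA and queries through both index
# flavors: FMIndex(path, FMD=True) and FMDIndex(path) are expected to report
# identical super-maximal exact matches. A minimal usage sketch, mirroring the
# calls made in test_smems above:
#   fmi = FMDIndex('test/data/seqs2.fasta')
#   smems = fmi.smems(s'CTTAA', x=1, min_intv=1)[1]
#   hits = [list(fmi.biresults(m)) for m in smems]  # (rid, name, pos, is_rev) per hit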
110 | path = 'test/data/seqs2.fasta' 111 | test_smems(FMIndex(path, FMD=True), path) 112 | test_smems(FMDIndex(path), path) 113 | -------------------------------------------------------------------------------- /test/core/align.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | Q,T = ['test/data/' + a for a in ('MT-orang.fa','MT-human.fa')] 3 | 4 | @test 5 | def align_test(): 6 | for target in FASTA(Q) |> seqs: 7 | for query in FASTA(T) |> seqs: 8 | a = query @ target 9 | assert abs(a.score) == 3315 # edit distance 10 | assert str(a.cigar) == '8I1M10I1M1I2M4I1M1I1M7I1M3I1M2I1M1I2M3I1M2I2M3I1M2I2M7I3M3I1M1I1M1I10M3I1M2I2M1I2M5I1M2I1M4I7M1I1M1I1M1I1M1I1M5I1M6I3M1I2M2I1M8I1M1I2M2I2M2I1M3I1M3I1M1I1M2I1M4I4M1I1M3I1M1I1M2I1M4I1M13I1M1I2M1I1M1I2M9I1M2I1M10I2M4I2M1I1M1I1M2I1M11I1M2I1M2I1M8I1M7I1M16I1M1I1M6I2M15I2M24I2M10I1M4I2M18I1M4I1M1I1M1I1M11I1M55I1M3I3M1I1M8I2M3I1M5I1M6I2M1I1M5I1M7I3M17I4M3I1M6I3M1I3M4I2M1I2M2I1M2I1M1I2M1I1M4I1M2I1M1I1M1I1M11I1M1I1M3I1M6I3M3I1M1I1M3I1M9I1M1I1M4I1M6I3M2I2M5I2M3I1M7I1M1I7M7I1M1I2M3I2M2I2M2I3M3I1M2I1M1I2M1I1M2I1M5I1M2I1M1I3M1I1M4I2M3I2M2I1M2I1M1I3M7I1M2I163M1D559M1I6M1D550M1I2M1I148M1I3M1D134M1I3M1D47M1D696M1I52M1D7M1D61M1D592M1I3M1D485M1I5M1D1211M1I59M1I156M1I31M1D98M1D18M6D7M7D1M1D1542M1I4M1D70M1I345M1I9M1D397M1D1M6D3M4D1M1D1M2D3M1D1M1D8M2D6M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I1423M1D2M1I46M1I5M1D2137M1I3M1D50M1I3M1D40M1D3M4D57M1D53M3I1M1I19M1D39M15D1M1D1M2D1M5D5M1D1M1D5M1D6M1D7M1D2M1D1M3D1M4D1M4D1M2D3M1D1M1D3M2D1M1D8M4D5M1D5M1D2M3D1M1D2M3D3M1D2M1D3M4D3M1D2M3D6M1D1M1D6M2D5M1D4M8D1M1D1M4D2M1D5M1D1M2D1M3D5M2D2M1D1M3D1M2D5M2D2M1D1M1D2M5D1M3D1M1D10M8D1M1D3M2D3M4D1M6D1M1D1M6D3M2D2M1D5M1D1M2D5M1D4M1D1M1D2M9D1M3D1M9D1M1D1M3D3M1D1M1D1M3D2M5D1M4D1M8D1M1D1M4D3M7D2M1D1M2D1M2D1M1D2M2D12M1D2M4D3M1D2M3D1M1D10M2D3M3D1M2D1M2D2M2D1M4D1M3D3M3D1M2D1M4D1M5D1M7D4M6D1M2D1M1D1M2D1M1D1M3D2M1D2M5D1M2D2M1D1M3D2M1D1M2D1M2D2M1D2M3D6M2D2M1D1M1D1M1D3M1D2M8D2M1D2M2D6M1D6M1D1M3D5M4D2M5D2M1D2M1D3M9D2M2D2M6D1M7D1M1D2M9D1M3D2M4D2M7D10M11D2M2D1M7D1M1D1M1D2M1D2M4D1M1D1M4D2M4D1M1D1M5D2M2D1M2D7M4D1M1D1M1D4M4D4M1D1M1D1M8D10M3D2M2D1M1D1M1D6M3D6M1D3M1D6M1D7M3D1M1D4M2D3M1D17M1D3M' 11 | 12 | # ./ksw2-test test/MT-orang.fa test/MT-human.fa 13 | for target in FASTA(Q) |> seqs: 14 | for query in FASTA(T) |> seqs: 15 | a = query.align(target, a=2, b=4, gapo=4, gape=2, gapo2=13, gape2=1) 16 | assert a.score == 17127 17 | assert str(a.cigar) == '576I14M2I4M3D37M1I85M1I232M1D559M1I6M1D550M1I2M1I146M2D3M1I3M1I132M1I3M1D40M3D13M1I1M1I335M3D4M1I3M2I342M1I52M1D13M3D1M2I52M1D592M1I3M1D485M1I5M1D974M3D4M3I230M1I59M1I156M1I31M1D98M1D26M14D329M3D7M3I1203M1I4M1D70M1I345M1I9M1D398M7D8M8D1M1D9M3D2M1I2M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I157M3I5M3D48M2D1M1D3M3I1203M1D2M2I1M1D44M2I2M1D2M1D38M2I16M2D2081M1I3M1D50M1I3M1D43M5D57M1D54M4I19M1D39M2I8M1D7M1D22M1D5M1D4M1I5M1D2M2I29M2D20M1I13M1I1M2D8M1I45M1I15M3I4M2D17M1I56M1I2M1D131M1D37M474D1M' 18 | 19 | # ./ksw2-test -t exts2_sse test/MT-orang.fa test/MT-human.fa 20 | for target in FASTA(Q) |> seqs: 21 | for query in FASTA(T) |> seqs: 22 | a = query.align(target, a=1, b=2, gapo=2, gape=1, gapo2=32, gape2=4, splice=True, splice_fwd=True) 23 | assert a.score == 9027 24 | assert str(a.cigar) == 
'576I14M2I4M3D37M1I85M1I232M1D559M1I6M1D550M1I2M1I146M2D3M1I3M1I132M1I3M1D40M3D13M1I1M1I335M3D4M1I3M2I342M1I52M1D13M3D1M2I52M1D592M1I3M1D485M1I5M1D974M3D4M3I230M1I59M1I156M1I31M1D98M1D26M14D329M3D7M3I1203M1I4M1D70M1I345M1I9M1D398M7D8M8D1M1D9M3D2M1I2M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I157M3I5M3D48M2D1M1D3M3I1203M1D2M2I1M1D44M2I2M1D2M1D38M2I16M2D2081M1I3M1D50M1I3M1D43M5D57M1D54M4I19M1D39M2I8M1D7M1D22M1D5M1D4M1I5M1D2M2I29M2D20M1I13M1I1M2D8M1I45M1I15M3I4M2D17M1I56M1I2M1D131M1D37M474N1M' 25 | 26 | # ./ksw2-test -t gg2_sse test/MT-orang.fa test/MT-human.fa 27 | for target in FASTA(Q) |> seqs: 28 | for query in FASTA(T) |> seqs: 29 | a = query.align(target, a=2, b=4, gapo=4, gape=2) 30 | assert a.score == 16102 31 | assert str(a.cigar) == '1M155I4M63I5M103I4M56I3M6I4M192I37M1I85M1I232M1D559M1I6M1D550M1I2M1I146M2D3M1I3M1I132M1I3M1D40M3D13M1I1M1I335M3D4M1I3M2I342M1I52M1D13M3D1M2I52M1D592M1I3M1D485M1I5M1D974M3D4M3I230M1I59M1I156M1I31M1D98M1D26M14D329M3D7M3I1203M1I4M1D70M1I345M1I9M1D398M7D8M8D1M1D9M3D2M1I2M1D390M1D5M1I193M1D6M1I195M1I7M1D1826M1I10M1D1256M1I49M1I157M3I5M3D48M2D1M1D3M3I1203M1D2M2I1M1D44M2I2M1D2M1D38M2I16M2D2081M1I3M1D50M1I3M1D43M5D57M1D54M4I19M1D39M2I8M1D7M1D22M1D5M1D4M1I5M1D2M2I29M2D20M1I13M1I1M2D8M1I45M1I15M3I4M2D17M1I56M1I2M1D131M1D37M474D1M' 32 | 33 | @test 34 | def cigar_test(): 35 | def check_cigar(s: str): 36 | return str(CIGAR(s)) == s 37 | 38 | def check_cigar_fail(s: str): 39 | try: 40 | CIGAR(s) 41 | return False 42 | except ValueError: 43 | return True 44 | 45 | assert check_cigar('') 46 | assert check_cigar('1M') 47 | assert check_cigar('11M') 48 | assert check_cigar('3M11I12H111D') 49 | assert check_cigar_fail('3M11I12q111D') 50 | assert check_cigar_fail('3M11I12H111') 51 | assert check_cigar_fail(' ') 52 | assert check_cigar_fail('M') 53 | assert check_cigar_fail('1') 54 | assert check_cigar_fail('M1') 55 | assert check_cigar_fail('MMM') 56 | assert check_cigar_fail('MM1') 57 | assert check_cigar_fail('M1M') 58 | assert check_cigar_fail('M11') 59 | assert check_cigar_fail('1MM') 60 | assert check_cigar_fail('1M1') 61 | assert check_cigar_fail('111') 62 | 63 | assert bool(CIGAR('')) == False 64 | assert bool(CIGAR('1M')) == True 65 | 66 | align_test() 67 | cigar_test() 68 | -------------------------------------------------------------------------------- /test/apps/cora/hom_inexact.codon: -------------------------------------------------------------------------------- 1 | # Implementation of CORA's inexact homology table construction 2 | # https://github.com/denizy/cora/blob/master/homTable_setup.cpp 3 | 4 | # Usage: seqc hom_inexact.seq 5 | # Output format (gzip'd; 100 output files): 6 | # Repeated: 7 | # - Locus 1 [Locus] 8 | # - Locus 2 [Locus] 9 | # - MAX_MISMATCHES times: 10 | # - 0-based error offset, or 255 if none [byte] 11 | 12 | from sys import argv, stderr, exit 13 | from pickle import dump, load 14 | from bio import FASTA, Locus, Kmer 15 | import gzip, itertools 16 | 17 | MAX_MISMATCHES = 3 18 | 19 | K: Static[int] = 64 20 | 21 | @tuple 22 | class BitSet: 23 | v: List[int] 24 | 25 | def __new__(n: int) -> BitSet: 26 | return BitSet([0 for _ in range((n // 64) + 1)],) 27 | def __getitem__(self, idx: int): 28 | return (self.v[idx // 64] & (1 << (idx % 64))) != 0 29 | def __setitem__(self, idx: int, b: bool): 30 | if b: 31 | self.v[idx // 64] |= (1 << (idx % 64)) 32 | else: 33 | self.v[idx // 64] &= ~(1 << (idx % 64)) 34 | 35 | if len(argv) != 4: 36 | print(f'usage: {argv[0]} ', file=stderr) 37 | exit(1) 38 | 39 | ref = [rec for rec in 
FASTA(argv[1])] 40 | num_mismatches = int(argv[2]) 41 | 42 | if not (1 <= num_mismatches <= MAX_MISMATCHES): 43 | print(f'error: num_mismatches argument must be between 1 and {MAX_MISMATCHES}', file=stderr) 44 | exit(1) 45 | 46 | num_seeds = num_mismatches + 1 47 | bitsets = None 48 | with gzip.open(argv[3] + '.reps_bitsets', 'rb') as reps: 49 | bitsets = load(reps, List[BitSet]) 50 | 51 | class MapSegment[K]: 52 | locus: Locus 53 | next: Optional[MapSegment[K]] = None 54 | 55 | @property 56 | def kmer(self): 57 | k = K(ref[self.locus.tid].seq[self.locus.pos:self.locus.pos + K.len()]) 58 | return ~k if self.locus.reversed else k 59 | 60 | def segment_code(seed_no, kmer): 61 | assert 0 <= seed_no < num_seeds 62 | k = len(kmer) 63 | h = 0 64 | for i in range(0, k // 2 - num_seeds + 1, num_seeds): 65 | h <<= 2 66 | h |= int(kmer[i + seed_no].as_int()) 67 | h <<= 2 68 | h |= int(kmer[k - 1 - i - seed_no].as_int()) 69 | return h 70 | 71 | def gen_signals(): 72 | t = (k'A', k'C', k'G', k'T') 73 | for a, b in itertools.product(t, t): 74 | x, y = (a, b), (~b, ~a) 75 | if x <= y: 76 | yield (x, y) 77 | 78 | def extract_signal(seed_no, kmer): 79 | assert 0 <= seed_no < num_seeds 80 | k = len(kmer) 81 | a1 = kmer[seed_no] 82 | a2 = kmer[seed_no + num_seeds] 83 | a3 = kmer[seed_no + 2*num_seeds] 84 | b1 = kmer[k - 1 - 2*num_seeds - seed_no] 85 | b2 = kmer[k - 1 - num_seeds - seed_no] 86 | b3 = kmer[k - 1 - seed_no] 87 | return ((a1, b3), (a2, b2), (a3, b1)) 88 | 89 | def signal_match(signal, target1, target2): 90 | return signal[0] in target1[0] and signal[1] in target1[1] and signal[2] in target2 91 | 92 | def compare_and_report(seg1, seg2, seed_no, precompact): 93 | k1, k2 = seg1.kmer, seg2.kmer 94 | if any(segment_code(i, k1) == segment_code(i, k2) for i in range(seed_no)): 95 | return # homology was reported by an earlier seed match 96 | 97 | hamming_dist = abs(k1 - k2) 98 | if hamming_dist <= num_mismatches: 99 | dump(seg1.locus, precompact) 100 | dump(seg2.locus, precompact) 101 | for i in range(len(k1)): 102 | if k1[i] != k2[i]: 103 | dump(byte(i), precompact) 104 | for _ in range(MAX_MISMATCHES - hamming_dist): 105 | dump(byte(-1), precompact) # pad to MAX_MISMATCHES 106 | 107 | signals1 = [(a,b) for a in gen_signals() for b in gen_signals()] 108 | signals2 = [a for a in gen_signals()] 109 | basename = f'{argv[3]}.hom_inexact.precompact' 110 | 111 | def process_signal(t): 112 | idx, signal1 = t 113 | d = {} 114 | d.resize(10000000) 115 | with gzip.open(basename + '.' 
+ str(idx), 'wb0') as precompact: 116 | for signal2 in signals2: 117 | for seed_no in range(num_seeds): 118 | d.clear() 119 | for tid, rec in enumerate(ref): 120 | for pos, kmer in rec.seq.kmers_with_pos(1, K): 121 | if not (bitsets[tid][pos] and 122 | signal_match(extract_signal(seed_no, kmer), signal1, signal2)): 123 | continue 124 | 125 | kmer_rev = ~kmer 126 | add_pal = (kmer == kmer_rev) 127 | if kmer_rev < kmer: 128 | kmer = kmer_rev 129 | pos = -pos 130 | 131 | s = segment_code(seed_no, kmer) 132 | m: Optional = MapSegment[Kmer[K]](Locus(tid, pos)) 133 | p = d.setdefault(s, m) 134 | 135 | if m is not p: 136 | target = p 137 | while target is not None: 138 | compare_and_report(m, target, seed_no, precompact) 139 | target = target.next 140 | m.next = p 141 | d[s] = m 142 | 143 | if add_pal: 144 | m_rev = MapSegment[Kmer[K]](Locus(tid, -pos)) 145 | target = m.next # don't compare with m, since m == ~m_rev 146 | while target is not None: 147 | compare_and_report(m_rev, target, seed_no, precompact) 148 | target = target.next 149 | m_rev.next = m 150 | d[s] = m_rev 151 | 152 | signals1 |> enumerate ||> process_signal 153 | -------------------------------------------------------------------------------- /test/apps/avid/avid.codon: -------------------------------------------------------------------------------- 1 | import sys 2 | import bio 3 | import bio.fmindex 4 | import itertools 5 | import time 6 | 7 | @tuple 8 | class Segment: 9 | xs: int 10 | ys: int 11 | xe: int 12 | ye: int 13 | aln: bio.Alignment 14 | anchor: bool 15 | 16 | @dataclass(init=False) 17 | class LCPNode: 18 | lcp: int 19 | start: int 20 | end: int 21 | children: List[LCPNode] 22 | data: List[List[int]] 23 | 24 | def __init__(self, lcp, start, end): 25 | self.lcp, self.start, self.end = lcp, start, end 26 | self.children = [] 27 | self.data = [[] for i in range(5)] 28 | 29 | def get_mems(n, s, sa, lcp, min_size, anchors): 30 | """ Find MEMs """ 31 | i, ci = n.start, 0 32 | while i < n.end: 33 | d = [[] for i in range(5)] 34 | if ci < len(n.children) and i == n.children[ci].start: 35 | i = n.children[ci].end 36 | d = n.children[ci].data 37 | # TODO: clear d 38 | ci += 1 39 | else: 40 | d[bio.fmindex._enc(s._at(sa[i] - 1)) if sa[i] else 4].append(i) 41 | i += 1 42 | if n.lcp >= min_size: 43 | for a, ap in itertools.product(range(5), range(5)): 44 | if a == ap: 45 | continue 46 | for posp, pos in itertools.product(d[a], n.data[ap]): 47 | a, b = sa[posp], sa[pos] 48 | if a > b: a, b = b, a 49 | anchors.append((a, b, n.lcp)) 50 | for a in range(5): 51 | n.data[a].extend(d[a]) 52 | 53 | def lcp_bottom_up(self, sa, lcp, min_size, anchors: List[Tuple[int,int,int]]): 54 | """ Reconstruct suffix tree from SA and find all MEMs whose length is >= min_size """ 55 | interval = None 56 | stack = [LCPNode(0, 0, -1)] 57 | for i in range(1, len(lcp)): 58 | start = i - 1 59 | # assert len(stack)>0 60 | while lcp[i] < stack[-1].lcp: 61 | interval = stack.pop() 62 | # assert len(stack)>0 63 | interval.end = i # [start, end) 64 | get_mems(interval, self, sa, lcp, min_size, anchors) 65 | start = interval.start 66 | if lcp[i] <= stack[-1].lcp: 67 | stack[-1].children.append(interval) 68 | interval = None 69 | if lcp[i] > stack[-1].lcp: 70 | stack.append(LCPNode(lcp[i], start, -1)) 71 | if interval: 72 | stack[-1].children.append(interval) 73 | interval = None 74 | 75 | def anchor(x, y, sfxa, lcp, anchors, xs, xe, ys, ye, depth = 0): 76 | # AVID only uses MEMs for sequences >= 4 KB 77 | if xe - xs <= 4 * 1024 and ye - ys <= 4 * 1024: 78 | yield 
Segment(xs, ys, xe, ye, x[xs:xe] @ y[ys:ye], False) 79 | return 80 | 81 | an = [] 82 | max_l = 2 83 | # Find anchor matches 84 | for sa, sb, l in anchors: 85 | sb -= len(x) + 1 86 | if l >= max_l // 2 and xs <= sa and sa + l < xe and ys <= sb and sb + l < ye: 87 | max_l = max(max_l, l) 88 | # TODO SEQ BUG: seq negative index out of range works ... s[-5:1] 89 | aln1 = x[max(0, sa - 10):sa] @ y[max(0, sb - 10):sb] 90 | aln2 = x[sa + l:sa + l + 10] @ y[sb + l:sb + l + 10] 91 | an.append(Segment(sa, sb, sa + l, sb + l, bio.Alignment(f'{l}M', l*10 + aln1.score + aln2.score), True)) 92 | # Use only large anchors 93 | an = [a for a in an if a.xe - a.xs >= max_l // 2] 94 | 95 | # No anchors: low-quality alignment, use gaps to cover it 96 | if not an: 97 | c = (f'{xe - xs}I' if xs < xe else '') + (f'{ye - ys}D' if ys < ye else '') 98 | yield Segment(xs, ys, xe, ye, bio.Alignment(c, -1), False) 99 | return 100 | 101 | # Run LIS on anchor list 102 | an.sort() 103 | best = 0 104 | scores = [(an[0].aln.score, -1)] 105 | for i in range(1, len(an)): 106 | scores.append((-100000000, -1)) 107 | for j in range(0, i): 108 | if an[j].xe <= an[i].xs and an[j].ye <= an[i].ys and scores[j][0] + an[j].aln.score > scores[i][0]: 109 | scores[i] = (scores[j][0] + an[j].aln.score, j) 110 | if scores[i] > scores[best]: 111 | best = i 112 | selected = [] 113 | while best != -1: 114 | selected.append(best) 115 | best = scores[best][1] 116 | 117 | # Recursively align the remaining gaps 118 | px, py = xs, ys 119 | for si in range(len(selected) - 1, -1, -1): 120 | i = selected[si] 121 | if (px, py) != (an[i].xs, an[i].ys): 122 | yield from anchor(x, y, sfxa, lcp, anchors, px, an[i].xs, py, an[i].ys, depth+1) 123 | yield an[i] 124 | # TODO SEQ BUG: px, py = 1, py, 2 works! and gives wrong number! 125 | px, py = an[i].xe, an[i].ye 126 | if (px, py) != (xe, ye): 127 | yield from anchor(x, y, sfxa, lcp, anchors, px, xe, py, ye, depth+1) 128 | 129 | def avid(x, y): 130 | # Construct SA & LCP 131 | t = time.time() 132 | s = bio.seq(f'{x}X{y}') 133 | sfxa = s.suffix_array() 134 | lcp = s.lcp(sfxa) 135 | # print(f'LCP & SA done... {time.time()-t}') 136 | 137 | # Get all MEMs 138 | t = time.time() 139 | anchors = [] 140 | lcp_bottom_up(s, sfxa, lcp, 10, anchors) 141 | # print(f'MEMs done, found {len(anchors)} MEMs... 
{time.time()-t}') 142 | 143 | # Get all anchors 144 | yield from anchor(x, y, sfxa, lcp, anchors, 0, len(x), 0, len(y)) 145 | 146 | # Read sequences 147 | t = time.time() 148 | with open(sys.argv[1]) as fi: 149 | for li, l in enumerate(fi): 150 | print f'{li}', 151 | x, y = l.split() 152 | x, y = bio.seq(x), bio.seq(y) 153 | 154 | # Run AVID & print alignment 155 | anchors = avid(x, y) 156 | mat, mis, ind, nind = 0, 0, 0, 0 157 | xi, yi = 0, 0 158 | for a in anchors: 159 | print a.aln.cigar, 160 | for sz, op in a.aln.cigar: 161 | if op == 'I': 162 | ind += sz; xi += sz; nind += 1 163 | elif op == 'D': 164 | ind += sz; yi += sz; nind += 1 165 | elif op == 'M': 166 | for i in range(sz): 167 | if x[xi + i] == y[yi + i]: 168 | mat += 1 169 | else: 170 | mis += 1 171 | xi += sz; yi += sz 172 | else: 173 | assert False 174 | print f'{mat} {mis} {ind} {nind}' 175 | 176 | print 'AVID is done.', time.time() - t 177 | -------------------------------------------------------------------------------- /stdlib/bio/pseq.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import seq 2 | from bio.kmer import Kmer 3 | 4 | @tuple 5 | class pseq: 6 | ''' 7 | Amino acid sequence 8 | ''' 9 | len: int 10 | ptr: cobj 11 | 12 | def __new__(p: cobj, n: int) -> pseq: 13 | return pseq(n, p) 14 | 15 | def __new__(s: str) -> pseq: 16 | return pseq(s.len, s.ptr) 17 | 18 | def __eq__(self, other: pseq): 19 | n = len(self) 20 | if n != len(other): 21 | return False 22 | i = 0 23 | while i < n: 24 | if self._at(i) != other._at(i): 25 | return False 26 | i += 1 27 | return True 28 | 29 | def __ne__(self, other: pseq): 30 | return not (self == other) 31 | 32 | def _cmp(self, other: pseq): 33 | self_len = len(self) 34 | other_len = len(other) 35 | n = min(self_len, other_len) 36 | i = 0 37 | while i < n: 38 | c1 = self._at(i) 39 | c2 = other._at(i) 40 | if c1 != c2: 41 | return int(c1) - int(c2) 42 | i += 1 43 | return self_len - other_len 44 | 45 | def __lt__(self, other: pseq): 46 | return self._cmp(other) < 0 47 | 48 | def __le__(self, other: pseq): 49 | return self._cmp(other) <= 0 50 | 51 | def __gt__(self, other: pseq): 52 | return self._cmp(other) > 0 53 | 54 | def __ge__(self, other: pseq): 55 | return self._cmp(other) >= 0 56 | 57 | def __str__(self): 58 | return str(self.ptr, self.len) 59 | 60 | def __repr__(self): 61 | return f"p'{self.__str__()}'" 62 | 63 | def __len__(self): 64 | return self.len 65 | 66 | def __bool__(self): 67 | return self.len != 0 68 | 69 | def __hash__(self): 70 | h = 0 71 | for i in range(len(self)): 72 | h = 31*h + int(self._at(i)) 73 | return h 74 | 75 | def __getitem__(self, idx: int): 76 | n = len(self) 77 | if idx < 0: 78 | idx += n 79 | if not (0 <= idx < n): 80 | raise IndexError("pseq index out of range") 81 | return pseq(self.ptr + idx, 1) 82 | 83 | def _at(self, idx: int): 84 | return self.ptr[idx] 85 | 86 | def _slice_direct(self, a: int, b: int): 87 | return pseq(self.ptr + a, b - a) 88 | 89 | def __getitem__(self, s: Slice): 90 | assert s.step is None 91 | if s.start is None and s.stop is None and s.step is None: 92 | return self.__copy__() 93 | elif s.start is None: 94 | b = s.stop.__val__() 95 | n = len(self) 96 | if b < 0: b += n 97 | if b > n: b = n 98 | return pseq(self.ptr, b) 99 | elif s.stop is None: 100 | a = s.start.__val__() 101 | n = len(self) 102 | if a < 0: a += n 103 | if a > n: a = n 104 | return pseq(self.ptr + a, n - a) 105 | else: 106 | a, b = s.start.__val__(), s.stop.__val__() 107 | n = len(self) 108 | if a 
< 0: a += n 109 | if b < 0: b += n 110 | if a > n: a = n 111 | if b > n: b = n 112 | return self._slice_direct(a, b) 113 | 114 | def _copy_to(self, p: cobj): 115 | str.memcpy(p, self.ptr, self.len) 116 | 117 | def __copy__(self): 118 | n = len(self) 119 | p = cobj(n) 120 | self._copy_to(p) 121 | return pseq(p, n) 122 | 123 | def split(self, k: int, step: int): 124 | ''' 125 | Iterator over length-`k` subsequences of the given sequence 126 | with the specified step size. 127 | ''' 128 | i = 0 129 | while i + k <= len(self): 130 | yield self._slice_direct(i,i+k) 131 | i += step 132 | 133 | def __iter__(self): 134 | return self.split(1, 1) 135 | 136 | def __reversed__(self): 137 | i = len(self) - 1 138 | while i >= 0: 139 | yield self._slice_direct(i,i+1) 140 | i -= 1 141 | 142 | def translate(s: seq, table: Optional[Dict[seq, pseq]] = None): 143 | ''' 144 | Performs DNA to amino acid translation. An optional mapping from 145 | length-3 DNA sequences to amino acids can be given via `table`, 146 | otherwise the standard mapping is assumed. 147 | ''' 148 | def encode_triple(s: seq): 149 | if s.N(): 150 | raise ValueError("codon '{s}' contains an ambiguous base") 151 | K1 = Kmer[1] 152 | a, b, c = K1(s[0]), K1(s[1]), K1(s[2]) 153 | n = (int(c.as_int()) | 154 | (int(b.as_int()) << 2) | 155 | (int(a.as_int()) << 4)) 156 | return n 157 | 158 | def translate_encoded(n: int): 159 | # Note(!) this table must be consistent with k-mer encoding 160 | table = 'KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF' 161 | return table.ptr[n] 162 | 163 | if table is not None: 164 | for k,v in table.items(): 165 | if len(k) != 3: 166 | raise ValueError("translation table key does not have length 3") 167 | if k.N(): 168 | raise ValueError("ambiguous base in translation table key '{k}'") 169 | if len(v) != 1: 170 | raise ValueError("translation table value does not have length 1") 171 | 172 | n = len(s) 173 | m = n // 3 174 | p = cobj(m) 175 | i = 0 176 | j = 0 177 | while i < n: 178 | codon = s._slice_direct(i, i + 3) 179 | if table is None: 180 | p[j] = translate_encoded(encode_triple(codon)) 181 | else: 182 | p[j] = table.get(codon, p'X').ptr[0] 183 | i += 3 184 | j += 1 185 | return pseq(p, m) 186 | 187 | @extend 188 | class seq: 189 | def translate(self): 190 | return translate(self) 191 | 192 | @extend 193 | class str: 194 | def __prefix_p__(s: str, N: Static[int] = 0) -> pseq: 195 | return pseq(s) 196 | -------------------------------------------------------------------------------- /sw/ksw2_gg2_sse.cpp: -------------------------------------------------------------------------------- 1 | #include "ksw2.h" 2 | #include 3 | 4 | #if defined(__ARM_NEON__) 5 | #define __SSE2__ 6 | #define __SSE4_1__ 7 | #elif defined(__aarch64__) 8 | #define __SSE2__ // SIMDE emulation 9 | #endif 10 | 11 | #define SIMDE_ENABLE_NATIVE_ALIASES 12 | 13 | #ifdef __SSE2__ 14 | #include 15 | 16 | #ifdef KSW_SSE2_ONLY 17 | #undef __SSE4_1__ 18 | #endif 19 | 20 | #ifdef __SSE4_1__ 21 | #include 22 | #endif 23 | 24 | int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, 25 | const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, 26 | int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_) { 27 | int r, t, n_col, n_col_, *off, tlen_, last_st, last_en, H0 = 0, last_H0_t = 0; 28 | uint8_t *qr, *mem, *mem2; 29 | __m128i *u, *v, *x, *y, *s, *p; 30 | __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_; 31 | 32 | zero_ = _mm_set1_epi8(0); 33 | q_ = _mm_set1_epi8(q); 34 | qe2_ = _mm_set1_epi8((q 
+ e) * 2); 35 | flag1_ = _mm_set1_epi8(1); 36 | flag2_ = _mm_set1_epi8(2); 37 | flag8_ = _mm_set1_epi8(0x08); 38 | flag16_ = _mm_set1_epi8(0x10); 39 | 40 | if (w < 0) 41 | w = tlen > qlen ? tlen : qlen; 42 | n_col = w + 1 < tlen ? w + 1 : tlen; // number of columns in the backtrack matrix 43 | tlen_ = (tlen + 15) / 16; 44 | n_col_ = (n_col + 15) / 16 + 1; 45 | n_col = n_col_ * 16; 46 | 47 | mem = (uint8_t *)kcalloc(km, tlen_ * 5 + 1, 16); 48 | u = (__m128i *)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned 49 | v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_; 50 | qr = (uint8_t *)kcalloc(km, qlen, 1); 51 | mem2 = (uint8_t *)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16); 52 | p = (__m128i *)(((size_t)mem2 + 15) >> 4 << 4); 53 | off = (int *)kmalloc(km, (qlen + tlen - 1) * sizeof(int)); 54 | 55 | for (t = 0; t < qlen; ++t) 56 | qr[t] = query[qlen - 1 - t]; 57 | 58 | for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { 59 | int st = 0, en = tlen - 1, st0, en0, st_, en_; 60 | int8_t x1, v1; 61 | __m128i x1_, v1_, *pr; 62 | // find the boundaries 63 | if (st < r - qlen + 1) 64 | st = r - qlen + 1; 65 | if (en > r) 66 | en = r; 67 | if (st < ((r - w + 1) >> 1)) 68 | st = (r - w + 1) >> 1; // take the ceil 69 | if (en > (r + w) >> 1) 70 | en = (r + w) >> 1; // take the floor 71 | st0 = st, en0 = en; 72 | st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; 73 | off[r] = st; 74 | // set boundary conditions 75 | if (st > 0) { 76 | if (st - 1 >= last_st && st - 1 <= last_en) 77 | x1 = ((uint8_t *)x)[st - 1], 78 | v1 = ((uint8_t *)v)[st - 1]; // (r-1,s-1) calculated in the last round 79 | else 80 | x1 = v1 = 0; // not calculated; set to zeros 81 | } else 82 | x1 = 0, v1 = r ? q : 0; 83 | if (en >= r) 84 | ((uint8_t *)y)[r] = 0, ((uint8_t *)u)[r] = r ? q : 0; 85 | // loop fission: set scores first 86 | for (t = st0; t <= en0; ++t) 87 | ((uint8_t *)s)[t] = mat[target[t] * m + qr[t + qlen - 1 - r]]; 88 | // core loop 89 | x1_ = _mm_cvtsi32_si128(x1); 90 | v1_ = _mm_cvtsi32_si128(v1); 91 | st_ = st >> 4, en_ = en >> 4; 92 | pr = p + r * n_col_ - st_; 93 | for (t = st_; t <= en_; ++t) { 94 | __m128i d, z, a, b, xt1, vt1, ut, tmp; 95 | 96 | z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); 97 | 98 | xt1 = _mm_load_si128(&x[t]); // xt1 <- x[r-1][t..t+15] 99 | tmp = _mm_srli_si128(xt1, 15); // tmp <- x[r-1][t+15] 100 | xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), 101 | x1_); // xt1 <- x[r-1][t-1..t+14] 102 | x1_ = tmp; 103 | vt1 = _mm_load_si128(&v[t]); // vt1 <- v[r-1][t..t+15] 104 | tmp = _mm_srli_si128(vt1, 15); // tmp <- v[r-1][t+15] 105 | vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), 106 | v1_); // vt1 <- v[r-1][t-1..t+14] 107 | v1_ = tmp; 108 | a = _mm_add_epi8(xt1, 109 | vt1); // a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] 110 | 111 | ut = _mm_load_si128(&u[t]); // ut <- u[t..t+15] 112 | b = _mm_add_epi8(_mm_load_si128(&y[t]), 113 | ut); // b <- y[r-1][t..t+15] + u[r-1][t..t+15] 114 | 115 | d = _mm_and_si128(_mm_cmpgt_epi8(a, z), 116 | flag1_); // d = a > z? 1 : 0 117 | #ifdef __SSE4_1__ 118 | z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) 119 | tmp = _mm_cmpgt_epi8(b, z); 120 | d = _mm_blendv_epi8(d, flag2_, 121 | tmp); // d = b > z? 2 : d 122 | #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and 123 | // _mm_blendv_epi8() 124 | z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? 
z : 0; 125 | z = _mm_max_epu8(z, 126 | a); // z = max(z, a); this works because both are non-negative 127 | tmp = _mm_cmpgt_epi8(b, z); 128 | d = _mm_or_si128(_mm_andnot_si128(tmp, d), 129 | _mm_and_si128(tmp, 130 | flag2_)); // d = b > z? 2 : d; emulating blendv 131 | #endif 132 | z = _mm_max_epu8(z, 133 | b); // z = max(z, b); this works because both are non-negative 134 | _mm_store_si128(&u[t], 135 | _mm_sub_epi8(z, 136 | vt1)); // u[r][t..t+15] <- z - v[r-1][t-1..t+14] 137 | _mm_store_si128(&v[t], _mm_sub_epi8(z, 138 | ut)); // v[r][t..t+15] <- z - u[r-1][t..t+15] 139 | 140 | z = _mm_sub_epi8(z, q_); 141 | a = _mm_sub_epi8(a, z); 142 | b = _mm_sub_epi8(b, z); 143 | tmp = _mm_cmpgt_epi8(a, zero_); 144 | d = _mm_or_si128(d, _mm_and_si128(flag8_, tmp)); 145 | _mm_store_si128(&x[t], _mm_and_si128(a, tmp)); 146 | tmp = _mm_cmpgt_epi8(b, zero_); 147 | d = _mm_or_si128(d, _mm_and_si128(flag16_, tmp)); 148 | _mm_store_si128(&y[t], _mm_and_si128(b, tmp)); 149 | _mm_store_si128(&pr[t], d); 150 | } 151 | if (r > 0) { 152 | if (last_H0_t >= st0 && last_H0_t <= en0) 153 | H0 += ((uint8_t *)v)[last_H0_t] - (q + e); 154 | else 155 | ++last_H0_t, H0 += ((uint8_t *)u)[last_H0_t] - (q + e); 156 | } else 157 | H0 = ((uint8_t *)v)[0] - 2 * (q + e), last_H0_t = 0; 158 | last_st = st, last_en = en; 159 | } 160 | kfree(km, mem); 161 | kfree(km, qr); 162 | ksw_backtrack(km, 1, 0, 0, (uint8_t *)p, off, 0, n_col, tlen - 1, qlen - 1, m_cigar_, 163 | n_cigar_, cigar_); 164 | kfree(km, mem2); 165 | kfree(km, off); 166 | return H0; 167 | } 168 | #endif // __SSE2__ 169 | -------------------------------------------------------------------------------- /test/core/proteins.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | table = {s'ATA': p'I', s'ATC': p'I', s'ATT': p'I', s'ATG': p'M', 4 | s'ACA': p'T', s'ACC': p'T', s'ACG': p'T', s'ACT': p'T', 5 | s'AAC': p'N', s'AAT': p'N', s'AAA': p'K', s'AAG': p'K', 6 | s'AGC': p'S', s'AGT': p'S', s'AGA': p'R', s'AGG': p'R', 7 | s'CTA': p'L', s'CTC': p'L', s'CTG': p'L', s'CTT': p'L', 8 | s'CCA': p'P', s'CCC': p'P', s'CCG': p'P', s'CCT': p'P', 9 | s'CAC': p'H', s'CAT': p'H', s'CAA': p'Q', s'CAG': p'Q', 10 | s'CGA': p'R', s'CGC': p'R', s'CGG': p'R', s'CGT': p'R', 11 | s'GTA': p'V', s'GTC': p'V', s'GTG': p'V', s'GTT': p'V', 12 | s'GCA': p'A', s'GCC': p'A', s'GCG': p'A', s'GCT': p'A', 13 | s'GAC': p'D', s'GAT': p'D', s'GAA': p'E', s'GAG': p'E', 14 | s'GGA': p'G', s'GGC': p'G', s'GGG': p'G', s'GGT': p'G', 15 | s'TCA': p'S', s'TCC': p'S', s'TCG': p'S', s'TCT': p'S', 16 | s'TTC': p'F', s'TTT': p'F', s'TTA': p'L', s'TTG': p'L', 17 | s'TAC': p'Y', s'TAT': p'Y', s'TAA': p'X', s'TGC': p'C', 18 | s'TGT': p'C', s'TGA': p'X', s'TGG': p'W'} 19 | dna = s'ACCATGACAACGATCAACATAAGGCCTACTAGCAAGAGACATAATATTCTGCTACTCCACAAACCGAGTCCACAACCCTATGGTTGTCGACAGCGCGATCGGCTTTGCGGGTAGGGATAAGGCTACGAGTCGTTTGACCGTGAATCAGCAGTAGCCGTCGCGGTGTTCGTTGCTTTATGATTGTCCTGGTCT' 20 | print dna |> translate # EXPECT: TMTTINIRPTSKRHNILLLHKPSPQPYGCRQRDRLCG*G*GYESFDRESAVAVAVFVAL*LSWS 21 | protein = dna |> translate(table=table) 22 | print protein # EXPECT: TMTTINIRPTSKRHNILLLHKPSPQPYGCRQRDRLCGXGXGYESFDRESAVAVAVFVALXLSWS 23 | protein = ~dna |> translate(table=table) 24 | print protein # EXPECT: RPGQSXSNEHRDGYCXFTVKRLVALSLPAKPIALSTTIGLWTRFVEXQNIMSLASRPYVDRCHG 25 | 26 | print protein[0], protein[-1] # EXPECT: R G 27 | print protein[1:5] # EXPECT: PGQS 28 | print protein[:-20] # EXPECT: RPGQSXSNEHRDGYCXFTVKRLVALSLPAKPIALSTTIGLWTRF 29 | print protein[12:] # EXPECT: 
GYCXFTVKRLVALSLPAKPIALSTTIGLWTRFVEXQNIMSLASRPYVDRCHG 30 | print protein[:] # EXPECT: RPGQSXSNEHRDGYCXFTVKRLVALSLPAKPIALSTTIGLWTRFVEXQNIMSLASRPYVDRCHG 31 | 32 | p1 = p'HEAGAWGHEE' 33 | p2 = p'HPAWHEAE' 34 | 35 | print p1 @ p2 # EXPECT: Alignment('1M2I7M', -5) 36 | 37 | pam90 = {('B', 'N'): 4, ('G', 'G'): 5, ('K', 'G'): -4, ('S', 'E'): -2, ('Y', 'E'): -6, 38 | ('W', 'R'): 0, ('V', 'M'): 1, ('N', 'R'): -2, ('W', 'Q'): -7, ('L', 'Q'): -3, 39 | ('V', 'N'): -4, ('F', 'K'): -8, ('G', 'E'): -1, ('S', 'L'): -5, ('P', 'R'): -1, 40 | ('E', 'D'): 4, ('Y', 'G'): -8, ('W', 'P'): -8, ('Q', 'A'): -2, ('G', 'D'): -1, 41 | ('K', 'D'): -2, ('T', 'N'): 0, ('W', 'W'): 13, ('L', 'D'): -7, ('S', 'S'): 4, 42 | ('K', 'C'): -8, ('S', 'A'): 1, ('Y', 'I'): -3, ('V', 'I'): 3, ('Q', 'C'): -8, 43 | ('Z', 'P'): -2, ('T', 'G'): -2, ('B', 'P'): -3, ('T', 'L'): -3, ('Z', 'F'): -8, 44 | ('F', 'G'): -6, ('Z', 'Q'): 5, ('V', 'T'): -1, ('S', 'H'): -3, ('B', 'Q'): 0, 45 | ('I', 'Q'): -4, ('Y', 'K'): -6, ('W', 'T'): -7, ('P', 'D'): -4, ('I', 'C'): -3, 46 | ('K', 'R'): 2, ('Z', 'R'): -1, ('T', 'E'): -2, ('B', 'R'): -3, ('Q', 'R'): 0, 47 | ('K', 'Q'): -1, ('Z', 'S'): -2, ('B', 'S'): 0, ('Y', 'M'): -6, ('V', 'E'): -3, 48 | ('Z', 'T'): -2, ('Y', 'D'): -6, ('V', 'W'): -9, ('T', 'C'): -4, ('B', 'T'): -1, 49 | ('T', 'H'): -3, ('F', 'Q'): -7, ('L', 'I'): 1, ('M', 'Q'): -2, ('R', 'A'): -4, 50 | ('C', 'D'): -8, ('V', 'F'): -4, ('F', 'C'): -7, ('C', 'R'): -5, ('D', 'D'): 6, 51 | ('V', 'P'): -3, ('S', 'D'): -1, ('P', 'C'): -5, ('F', 'R'): -6, ('C', 'C'): 9, 52 | ('I', 'G'): -5, ('W', 'K'): -6, ('I', 'N'): -3, ('Z', 'V'): -3, ('T', 'A'): 1, 53 | ('B', 'V'): -4, ('K', 'L'): -5, ('L', 'G'): -6, ('F', 'A'): -5, ('Z', 'W'): -8, 54 | ('S', 'K'): -1, ('B', 'W'): -7, ('K', 'K'): 5, ('E', 'N'): 0, ('Y', 'Q'): -6, 55 | ('V', 'A'): 0, ('W', 'I'): -8, ('V', 'S'): -3, ('T', 'T'): 5, ('F', 'M'): -1, 56 | ('L', 'E'): -5, ('M', 'M'): 9, ('W', 'H'): -4, ('S', 'R'): -1, ('P', 'Q'): -1, 57 | ('P', 'N'): -2, ('B', 'Y'): -4, ('H', 'A'): -4, ('P', 'G'): -3, ('F', 'N'): -5, 58 | ('H', 'N'): 2, ('P', 'K'): -3, ('T', 'M'): -2, ('K', 'H'): -2, ('T', 'R'): -3, 59 | ('L', 'C'): -9, ('W', 'N'): -5, ('E', 'Q'): 2, ('S', 'G'): 0, ('Z', 'H'): 1, 60 | ('Y', 'S'): -4, ('G', 'R'): -5, ('W', 'M'): -7, ('F', 'D'): -8, ('T', 'K'): -1, 61 | ('C', 'N'): -6, ('T', 'P'): -1, ('V', 'L'): 0, ('F', 'I'): 0, ('G', 'Q'): -3, 62 | ('L', 'A'): -3, ('M', 'I'): 1, ('W', 'L'): -3, ('S', 'N'): 1, ('I', 'R'): -3, 63 | ('H', 'E'): -1, ('Y', 'W'): -2, ('I', 'D'): -4, ('W', 'C'): -10, ('N', 'A'): -1, 64 | ('T', 'I'): 0, ('Z', 'K'): -1, ('Q', 'N'): -1, ('M', 'K'): 0, ('K', 'E'): -2, 65 | ('S', 'C'): -1, ('Z', 'L'): -4, ('Y', 'Y'): 9, ('V', 'Y'): -4, ('W', 'A'): -8, 66 | ('Y', 'F'): 4, ('Z', 'M'): -3, ('M', 'R'): -2, ('V', 'H'): -4, ('F', 'E'): -8, 67 | ('M', 'E'): -4, ('H', 'R'): 1, ('P', 'P'): 7, ('P', 'I'): -4, ('Q', 'Q'): 6, 68 | ('P', 'F'): -6, ('B', 'A'): -1, ('Z', 'N'): 0, ('I', 'A'): -2, ('F', 'F'): 8, 69 | ('I', 'H'): -5, ('W', 'G'): -9, ('Y', 'H'): -1, ('B', 'B'): 4, ('M', 'L'): 2, 70 | ('M', 'G'): -5, ('S', 'Q'): -2, ('W', 'F'): -2, ('D', 'A'): -1, ('K', 'A'): -3, 71 | ('N', 'N'): 5, ('B', 'C'): -7, ('V', 'K'): -5, ('W', 'E'): -10, ('L', 'R'): -5, 72 | ('T', 'S'): 2, ('B', 'D'): 5, ('Z', 'A'): -1, ('M', 'N'): -4, ('V', 'D'): -4, 73 | ('Q', 'D'): 0, ('M', 'A'): -2, ('V', 'V'): 6, ('W', 'D'): -9, ('S', 'F'): -4, 74 | ('D', 'N'): 3, ('P', 'M'): -4, ('H', 'D'): -1, ('B', 'E'): 2, ('Z', 'B'): 2, 75 | ('I', 'E'): -3, ('R', 'R'): 7, ('K', 'N'): 1, ('Y', 'L'): -3, ('T', 'Q'): -3, 76 | ('E', 'C'): 
-8, ('B', 'F'): -6, ('Z', 'C'): -8, ('M', 'H'): -5, ('M', 'C'): -8, 77 | ('S', 'M'): -3, ('E', 'R'): -4, ('E', 'E'): 6, ('B', 'G'): -1, ('Z', 'D'): 3, 78 | ('V', 'G'): -3, ('G', 'N'): -1, ('A', 'A'): 4, ('V', 'Q'): -4, ('L', 'N'): -4, 79 | ('Y', 'N'): -2, ('B', 'H'): 1, ('Z', 'E'): 5, ('V', 'R'): -4, ('P', 'H'): -2, 80 | ('H', 'C'): -5, ('P', 'A'): 0, ('F', 'L'): 0, ('H', 'H'): 8, ('B', 'I'): -3, 81 | ('C', 'A'): -3, ('I', 'I'): 6, ('T', 'F'): -5, ('L', 'L'): 6, ('Y', 'P'): -8, 82 | ('Z', 'G'): -2, ('D', 'R'): -5, ('M', 'D'): -5, ('G', 'C'): -5, ('S', 'I'): -3, 83 | ('Y', 'A'): -5, ('E', 'A'): 0, ('K', 'I'): -3, ('B', 'K'): 0, ('V', 'C'): -3, 84 | ('T', 'D'): -2, ('Y', 'R'): -6, ('B', 'L'): -5, ('Z', 'Y'): -6, ('G', 'A'): 0, 85 | ('S', 'P'): 0, ('Z', 'I'): -3, ('H', 'Q'): 2, ('Y', 'C'): -1, ('P', 'L'): -4, 86 | ('H', 'G'): -5, ('P', 'E'): -2, ('F', 'H'): -3, ('B', 'M'): -5, ('Z', 'Z'): 5, 87 | ('W', 'S'): -3, ('L', 'H'): -3, ('Y', 'T'): -4} 88 | 89 | print p1.align(p2, mat=SubMat(pam90)) # EXPECT: Alignment('1M2I7M', 4) 90 | print p1.align(p2, gapo=4, gape=2, gapo2=13, gape2=1, mat=SubMat(pam90)) # EXPECT: Alignment('1M2I7M', 4) 91 | print p1.align(p2, gapo=4, gape=2, ext_only=True, mat=SubMat(pam90)) # EXPECT: Alignment('3M', 7) 92 | print p1, p2 # EXPECT: HEAGAWGHEE HPAWHEAE 93 | -------------------------------------------------------------------------------- /stdlib/bio/builtin.codon: -------------------------------------------------------------------------------- 1 | from bio.seq import seq 2 | from bio.kmer import Kmer 3 | 4 | @__attribute__ 5 | def prefetch(): 6 | pass 7 | 8 | @__attribute__ 9 | def inter_align(): 10 | pass 11 | 12 | def seqs(x): 13 | ''' 14 | Returns an iterator over sequences from the specified 15 | object by invoking the `__seqs__` magic method. 16 | 17 | `__seqs__` is defined for most common formats, like 18 | FASTA, FASTQ, SAM and BAM. 19 | ''' 20 | return x.__seqs__() 21 | 22 | def split(self: seq, k: int, step: int): 23 | ''' 24 | Iterator over length-`k` subsequences of the given sequence 25 | with the specified step size. 26 | ''' 27 | return self.split(k, step) 28 | 29 | def kmers(self: seq, step: int, k: Static[int]): 30 | ''' 31 | Iterator over k-mers (size `k`) of the given sequence 32 | with the specified step size. Note that k-mers spanning 33 | ambiguous bases will be skipped. 34 | ''' 35 | return self.kmers(step, k) 36 | 37 | def kmers_with_pos(self: seq, step: int, k: Static[int]): 38 | ''' 39 | Iterator over (0-based index, k-mer) tuples of the given 40 | sequence with the specified step size. Note that k-mers 41 | spanning ambiguous bases will be skipped. 42 | ''' 43 | return self.kmers_with_pos(step, k) 44 | 45 | def revcomp(s): 46 | ''' 47 | Returns the reverse complement of the argument sequence or k-mer. 48 | ''' 49 | return ~s 50 | 51 | def revcomp_with_pos(t): 52 | ''' 53 | Returns the reverse complement of the argument sequence or k-mer, 54 | where the argument also contains a position (e.g. as yielded by 55 | `kmers_with_pos`). 56 | ''' 57 | return (t[0], ~t[1]) 58 | 59 | def _kmers_revcomp_with_pos[K](self: seq, step: int): 60 | return self._kmers_revcomp_with_pos(step, K.k) 61 | 62 | def _kmers_revcomp[K](self: seq, step: int): 63 | return self._kmers_revcomp(step, K.k) 64 | 65 | def canonical(k): 66 | ''' 67 | Returns the minimum of a sequence / k-mer and its reverse complement. 
    '''
68 | ''' 69 | kr = ~k 70 | return k if k < kr else kr 71 | 72 | def canonical_with_pos(t): 73 | ''' 74 | Returns the minimum of a sequence / k-mer and its reverse complement, 75 | where the argument also contains a position (e.g. as yielded by 76 | `kmers_with_pos`). 77 | ''' 78 | return (t[0], canonical(t[1])) 79 | 80 | def _kmers_canonical[K](self: seq): 81 | return self.kmers_canonical(K.k) 82 | 83 | def _kmers_canonical_with_pos[K](self: seq): 84 | return self.kmers_canonical_with_pos(K.k) 85 | 86 | def base[K,T](kmer: K, idx: int, b: T): 87 | ''' 88 | Returns a new k-mer equal to `K` but with the base at index `idx` set to `b` 89 | ''' 90 | U = type(kmer.as_int()) 91 | if idx < 0: 92 | idx += len(kmer) 93 | idx = K.len() - idx - 1 94 | n = U(int(Kmer[1](b).as_int())) 95 | k = kmer.as_int() & ~(U(3) << U(2*idx)) 96 | k |= n << U(2*idx) 97 | return K(k) 98 | 99 | @__force__ 100 | def _is_iupac_nt(b: byte) -> bool: 101 | iupac = ('\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 102 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 103 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01\x00' 104 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 105 | '\x00\x01\x01\x01\x01\x00\x00\x01\x01\x00\x00\x01\x00\x01\x01\x00' 106 | '\x00\x00\x01\x01\x01\x01\x01\x01\x00\x01\x00\x00\x00\x00\x00\x00' 107 | '\x00\x01\x01\x01\x01\x00\x00\x01\x01\x00\x00\x01\x00\x01\x01\x00' 108 | '\x00\x00\x01\x01\x01\x01\x01\x01\x00\x01\x00\x00\x00\x00\x00\x00' 109 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 110 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 111 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 112 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 113 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 114 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 115 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 116 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') 117 | return bool(iupac.ptr[int(b)]) 118 | 119 | @__force__ 120 | def _is_iupac_aa(b: byte) -> bool: 121 | iupac = ('\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 122 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 123 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 124 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 125 | '\x00\x01\x00\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x00' 126 | '\x01\x01\x01\x01\x01\x00\x01\x01\x00\x01\x00\x00\x00\x00\x00\x00' 127 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 128 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 129 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 130 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 131 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 132 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 133 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 134 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 135 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 136 | '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') 137 | return bool(iupac.ptr[int(b)]) 138 | 139 | @__force__ 140 | def _validate_str_as_seq(s: str, copy: bool = False): 141 | def ensure_valid(b: byte, i: int): 142 | 
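        # reject any byte that is not a valid IUPAC nucleotide code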
if not _is_iupac_nt(b): 143 | raise ValueError(f"invalid base '{str(b)}' at position {i} of sequence") 144 | p = s.ptr 145 | n = s.len 146 | i = 0 147 | if copy: 148 | q = Ptr[byte](n) 149 | while i < n: 150 | b = p[i] 151 | ensure_valid(b, i) 152 | q[i] = b 153 | i += 1 154 | return seq(q, n) 155 | else: 156 | while i < n: 157 | ensure_valid(p[i], i) 158 | i += 1 159 | return seq(p, n) 160 | 161 | @__force__ 162 | def _validate_str_as_qual(s: str, copy: bool = False): 163 | def ensure_valid(b: byte, i: int): 164 | if not (byte(0x21) <= b <= byte(0x7e)): 165 | raise ValueError(f"invalid quality score '{str(b)}' at position {i} of quality score string") 166 | p = s.ptr 167 | n = s.len 168 | i = 0 169 | if copy: 170 | q = Ptr[byte](n) 171 | while i < n: 172 | b = p[i] 173 | ensure_valid(b, i) 174 | q[i] = b 175 | i += 1 176 | return str(q, n) 177 | else: 178 | while i < n: 179 | ensure_valid(p[i], i) 180 | i += 1 181 | return str(p, n) 182 | 183 | @__force__ 184 | def _split_header_on_space(s: str): 185 | a = 0 186 | while a < len(s) and not s[a].isspace(): 187 | a += 1 188 | 189 | b = a 190 | while b < len(s) and s[b].isspace(): 191 | b += 1 192 | 193 | return s[:a], s[b:] 194 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <cassert> 3 | #include <cstdio> 4 | #include <cstdlib> 5 | #include <fstream> 6 | #include <iostream> 7 | #include <sstream> 8 | #include <string> 9 | #include <sys/wait.h> 10 | #include <tuple> 11 | #include <unistd.h> 12 | #include <vector> 13 | 14 | #include "codon/cir/llvm/llvisitor.h" 15 | #include "codon/cir/transform/manager.h" 16 | #include "codon/cir/transform/pass.h" 17 | #include "codon/cir/util/inlining.h" 18 | #include "codon/cir/util/irtools.h" 19 | #include "codon/cir/util/outlining.h" 20 | #include "codon/compiler/compiler.h" 21 | #include "codon/compiler/error.h" 22 | #include "codon/util/common.h" 23 | 24 | #include "gtest/gtest.h" 25 | 26 | using namespace codon; 27 | using namespace std; 28 | 
29 | vector<string> splitLines(const string &output) { 30 | vector<string> result; 31 | string line; 32 | istringstream stream(output); 33 | const char delim = '\n'; 34 | 35 | while (getline(stream, line, delim)) 36 | result.push_back(line); 37 | 38 | return result; 39 | } 40 | 
41 | static pair<bool, string> findExpectOnLine(const string &line) { 42 | for (auto EXPECT_STR : vector<pair<bool, string>>{ 43 | {false, "# EXPECT: "}, {false, "#: "}, {true, "#! "}}) { 44 | size_t pos = line.find(EXPECT_STR.second); 45 | if (pos != string::npos) 46 | return {EXPECT_STR.first, line.substr(pos + EXPECT_STR.second.length())}; 47 | } 48 | return {false, ""}; 49 | } 50 | 
51 | static pair<vector<string>, bool> findExpects(const string &filename, bool isCode) { 52 | vector<string> result; 53 | bool isError = false; 54 | string line; 55 | if (!isCode) { 56 | ifstream file(filename); 57 | if (!file.good()) { 58 | cerr << "error: could not open " << filename << endl; 59 | exit(EXIT_FAILURE); 60 | } 61 | 62 | while (getline(file, line)) { 63 | auto expect = findExpectOnLine(line); 64 | if (!expect.second.empty()) { 65 | result.push_back(expect.second); 66 | isError |= expect.first; 67 | } 68 | } 69 | file.close(); 70 | } else { 71 | istringstream file(filename); 72 | while (getline(file, line)) { 73 | auto expect = findExpectOnLine(line); 74 | if (!expect.second.empty()) { 75 | result.push_back(expect.second); 76 | isError |= expect.first; 77 | } 78 | } 79 | } 80 | return {result, isError}; 81 | } 82 | 
83 | string argv0; 84 | extern "C" void GC_atfork_prepare(); 85 | extern "C" void GC_atfork_parent(); 86 | extern "C" void GC_atfork_child(); 87 | 
88 | class SeqTest 89 | : public testing::TestWithParam<tuple< 90 | string /*filename*/, bool /*debug*/, string /*case name*/, 91 | string /*case code*/, int /*case line*/, bool /*extra test flag*/>> { 92 | vector<char> buf; 93 | int out_pipe[2]; 94 | pid_t pid; 95 | 96 | public: 97 | SeqTest() : buf(65536), out_pipe(), pid() {} 98 | string getFilename(const string &basename) { 99 | return string(TEST_DIR) + "/" + basename; 100 | } 101 | int runInChildProcess() { 102 | assert(pipe(out_pipe) != -1); 103 | pid = fork(); 104 | GC_atfork_prepare(); 105 | assert(pid != -1); 106 | 107 | if (pid == 0) { 108 | GC_atfork_child(); 109 | dup2(out_pipe[1], STDOUT_FILENO); 110 | close(out_pipe[0]); 111 | close(out_pipe[1]); 112 | 113 | auto file = getFilename(get<0>(GetParam())); 114 | bool debug = get<1>(GetParam()); 115 | auto code = get<3>(GetParam()); 116 | auto startLine = get<4>(GetParam()); 117 | int testFlags = 1 + get<5>(GetParam()); 118 | 119 | auto compiler = std::make_unique<Compiler>( 120 | argv0, debug, /*disabledPasses=*/std::vector<std::string>{}, /*isTest=*/true); 121 | llvm::cantFail(compiler->load(".")); 122 | llvm::handleAllErrors(code.empty() 123 | ? compiler->parseFile(file, testFlags) 124 | : compiler->parseCode(file, code, startLine, testFlags), 125 | [](const error::ParserErrorInfo &e) { 126 | for (auto &group : e.getErrors()) { 127 | for (auto &msg : group) { 128 | getLogger().level = 0; 129 | printf("%s\n", msg.getMessage().c_str()); 130 | } 131 | } 132 | fflush(stdout); 133 | exit(EXIT_FAILURE); 134 | }); 135 | 136 | llvm::cantFail(compiler->compile()); 137 | compiler->getLLVMVisitor()->run({file}); 138 | fflush(stdout); 139 | exit(EXIT_SUCCESS); 140 | } else { 141 | GC_atfork_parent(); 142 | int status = -1; 143 | close(out_pipe[1]); 144 | assert(waitpid(pid, &status, 0) == pid); 145 | read(out_pipe[0], buf.data(), buf.size() - 1); 146 | close(out_pipe[0]); 147 | return status; 148 | } 149 | return -1; 150 | } 151 | string result() { return string(buf.data()); } 152 | }; 
153 | static string 154 | getTestNameFromParam(const testing::TestParamInfo<SeqTest::ParamType> &info) { 155 | const string basename = get<0>(info.param); 156 | const bool debug = get<1>(info.param); 157 | string normname = basename; 158 | replace(normname.begin(), normname.end(), '/', '_'); 159 | replace(normname.begin(), normname.end(), '.', '_'); 160 | return normname + (debug ? "_debug" : ""); 161 | } 
162 | TEST_P(SeqTest, Run) { 163 | const string file = get<0>(GetParam()); 164 | int status; 165 | bool isCase = !get<2>(GetParam()).empty(); 166 | if (!isCase) 167 | status = runInChildProcess(); 168 | else 169 | status = runInChildProcess(); 170 | ASSERT_TRUE(WIFEXITED(status)); 171 | 172 | string output = result(); 173 | 174 | auto expects = findExpects(!isCase ? getFilename(file) : get<3>(GetParam()), isCase); 175 | if (WEXITSTATUS(status) != int(expects.second)) 176 | fprintf(stderr, "%s\n", output.c_str()); 177 | ASSERT_EQ(WEXITSTATUS(status), int(expects.second)); 178 | const bool assertsFailed = output.find("TEST FAILED") != string::npos; 179 | EXPECT_FALSE(assertsFailed); 180 | if (assertsFailed) 181 | std::cerr << output << std::endl; 182 | 183 | if (!expects.first.empty()) { 184 | vector<string> results = splitLines(output); 185 | for (unsigned i = 0; i < min(results.size(), expects.first.size()); i++) 186 | if (expects.second) 187 | EXPECT_EQ(results[i].substr(0, expects.first[i].size()), expects.first[i]); 188 | else 189 | EXPECT_EQ(results[i], expects.first[i]); 190 | EXPECT_EQ(results.size(), expects.first.size()); 191 | } 192 | } 193 | 
194 | // clang-format off 195 | INSTANTIATE_TEST_SUITE_P( 196 | CoreTests, SeqTest, 197 | testing::Combine( 198 | testing::Values( 199 | "core/align.codon", 200 | "core/big.codon", 201 | "core/bwtsa.codon", 202 | "core/containers.codon", 203 | "core/formats.codon", 204 | "core/kmers.codon", 205 | "core/match.codon", 206 | "core/proteins.codon", 207 | "core/serialization.codon" 208 | ), 209 | testing::Values(true, false), 210 | testing::Values(""), 211 | testing::Values(""), 212 | testing::Values(0), 213 | testing::Values(false) 214 | ), 215 | getTestNameFromParam); 216 | 217 | INSTANTIATE_TEST_SUITE_P( 218 | PipelineTests, SeqTest, 219 | testing::Combine( 220 | testing::Values( 221 | "pipeline/canonical_opt.codon", 222 | "pipeline/interalign.codon", 223 | "pipeline/prefetch.codon", 224 | "pipeline/revcomp_opt.codon" 225 | ), 226 | testing::Values(false), 227 | testing::Values(""), 228 | testing::Values(""), 229 | testing::Values(0), 230 | testing::Values(false) 231 | ), 232 | getTestNameFromParam); 233 | // clang-format on 234 | 235 | int main(int argc, char *argv[]) { 236 | argv0 = ast::Filesystem::executable_path(argv[0]); 237 | testing::InitGoogleTest(&argc, argv); 238 | return RUN_ALL_TESTS(); 239 | } 240 | -------------------------------------------------------------------------------- /test/pipeline/canonical_opt.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | # test kmers |> canonical optimization 4 | @test 5 | def test(s: seq, K: Static[int]): 6 | got1 = list[Kmer[K]]() 7 | s |> kmers(1, k=K) |> canonical |> got1.append 8 | exp1 = [min(k, ~k) for k in s.kmers(step=1, k=K)] 9 | assert got1 == exp1 10 | 11 | got2 = list[tuple[int,Kmer[K]]]() 12 | s |> kmers_with_pos(step=1, k=K) |> canonical_with_pos |> got2.append 13 | exp2 = [(i, min(k, ~k)) for i,k in s.kmers_with_pos(step=1, k=K)] 14 | assert got2 == exp2 15 | 16 | # test revcomp'd seq 17 | s = ~s 18 | 19 | got1 = list[Kmer[K]]() 20 | s |> kmers(step=1, k=K) |> canonical |> got1.append 21 | exp1 = [min(k, ~k) for k in s.kmers(step=1, k=K)] 22 | assert got1 == exp1 23 | 24 | got2 = list[tuple[int,Kmer[K]]]() 25 | s |> kmers_with_pos(1, k=K) |> canonical_with_pos |> got2.append 26 | exp2 = [(i, min(k, ~k)) for i,k in s.kmers_with_pos(step=1, k=K)] 27 | assert got2 == exp2 28 | 29 | def test_all(s: 
list[seq], K: Static[int]): 30 | for a in s: 31 | test(a, K) 32 | 33 | v = [ s'C', 34 | s'GA', 35 | s'CTC', 36 | s'TATG', 37 | s'GTAGC', 38 | s'TGCAGT', 39 | s'TACTAGG', 40 | s'GTTAGAAA', 41 | s'TGGATCATA', 42 | s'TTGGCCACGG', 43 | s'CCTCTGCGGGA', 44 | s'ATGATAGGAAAG', 45 | s'CTCTTTGGGGTTA', 46 | s'GCAGTTTTTCGCTT', 47 | s'GCTTTGTGAAATGGT', 48 | s'CATGGAGCGGCCGTGT', 49 | s'TTAACGGGCCCTTGCCG', 50 | s'CCTTGTTTGCTCGCGAGA', 51 | s'GGCCAAGCTTATCTGTGTT', 52 | s'CCGCTTGCCTTACATTGGCC', 53 | s'GAGTGTGTAATAAATAATCGT', 54 | s'GCGACAATATACTGCGTGGTCA', 55 | s'TTCATATATGACCCCGTACAGGG', 56 | s'GCTTTGATGAATTAAGGATAGCTT', 57 | s'GTTATATCCCCCCGCGCTTTAACCG', 58 | s'AAACCTGACCGTTAAGACTTTAAGAC', 59 | s'GACCTTGCCCTTGAAGGAACATCTGGG', 60 | s'CGTCCATGTTTGCATACGACGTTGTGTT', 61 | s'TAAACGAGACCGTCTCTGCTCATAGCCGT', 62 | s'CAGTACGTTAGTGTGACCGCAAAAGGTGGT', 63 | s'GAGGATATATAACAACCTCAGGAGTCTGTTC', 64 | s'AGGATAAGGTTCACAGAACTATTGTTATGATT', 65 | s'CACAGAGAGCGGGCTCTATTGTTGTCGTGGGGA', 66 | s'TACTCGTGGCGAGGAGGAGTTTTCTTAAATCGGA', 67 | s'GGGCTTTTCGCCTGGTTATAACTTGGTATGGAGCT', 68 | s'AGGACATTTAAGATTTAAAAGCACCCTCAGCAACAT', 69 | s'CATGAAGGATCAGTGACTTCCTTACCTGTGCTGCCGG', 70 | s'CGTATTGGTTCGTGACTTCCACGGGGTGTCACGGCGGA', 71 | s'GCGAAACGCATCATCCGGGACAATCAACTCATCAGGCGG', 72 | s'CACAACCGGGGCTCGATCCCAAGCACCATTACACGATGAC', 73 | s'TCACGAAGACACAACGGTCCACACCTACTTTCCCCCCAACG', 74 | s'ATCTACTGAAGCGCTACAGTCATCTACTGCCACTGAACGGAC', 75 | s'GTGTCCATGCTCCGTTTAAAATAGAGAAGCCAAAGGAGTACCA', 76 | s'AGTAGGCTGCAAATCGATCGCCACGCAAATGCTAAAAGTTGTGC', 77 | s'GTGATTAGAGGAGGTAACAGCCCAAACGCTCTCTTCCTCGCTATA', 78 | s'TGTTGTGTCCGATAAGATTGCGTCCTGGTGGAGTGGTTGCGGGTTA', 79 | s'GCTTAACTAACCGAGCCGGTGCGGAGATTCCGTGTGTGGCTAGTTAT', 80 | s'CTCGCGTGCGGAGACCTCTCCCCCAGGAATAACCGATTGACCGTGGAT', 81 | s'ACATTCGCAGATAGGGCAACGGTTTAGTGCGCACCATGCTAAGCTAAGG', 82 | s'TTTGCACCGGTTAGCTGTCGTTAATCCGACGGGCCTTATCTCGGTACATT', 83 | s'TACTCGGACATCCATCCAAAGTTGGAGATAGCTTTGTATGAAAATGATGTA', 84 | s'CAGACTTTCTTATGTATCCCTCTCTTGCACAGATTGTTCATGTCATTTGCTG', 85 | s'CAGCGGGCCCTCTGCTTATAGCCGGGTAGTCCGTATTACGACCGAGGTCCCGT', 86 | s'GGGTAACAACAAAGGATGAGGTTAATTCGGAGGGAAAACGTCGCGAACGATAGT', 87 | s'GCTGAGCCCAGGAGGGGAGCTCTCGGGAGTCGCAATGAGATCCGTGGGCCCCCAA', 88 | s'CGATTAGCGACCAAATGGTATACATTACGTATCGGGTGTCGAACCCCCTACCACTT', 89 | s'CCGAACCATTCCTAGACTTGCATCGAAGACGCCGGAGACATAGAGTATACTATTTCG', 90 | s'CAATAAAGGAAGATCGACCCAGTTGCTAGGCCCGGCGCTTCCCTGGAGCCTTGGTGAA', 91 | s'AAATGGAGCCAGACAGCGTGACGCCCTGCGCGCGACAAGCTACGTCGAAGGCACATTCT', 92 | s'CAATCTATGATTTTTGTCGGGTTGCGAGCCTAAGTTAACTTGATCTCCGTACGTGCCACA', 93 | s'CAACTACTGAGCTCCGTTAGGTCCCTTTCATTTCGCTGCGTATGTGTAACCAACCTCGTAC', 94 | s'TGGAGAGTCCCTTCCCACGCGCAACCTGCGGTTAGCATCGCAGATCTTTCTACAGCCCTATG', 95 | s'GATCCTAACCTTCTACACTCATCGAACACGAACATGATGCTGTGCGGTGTCATTTGGATTGTG', 96 | s'GTCATTACATAACCATGAGCGCCCCCGTTCAAACTTCAGCGTTTTTGTAAAGGGCATCATTGCC', 97 | s'GGGGCAAGCAGACGGATTACACACTCTTTTAACCTCGTTTCTACACTCCTAATCAGTCCGTACAG', 98 | s'CATGCGGTCCCCGCAGCCCAGATCCAATCCGGCAGCAGAAGGCATTATCTGCTGGCGTTGCCTTAC', 99 | s'TAAGTCGACCGAAGGATGCAAAAGGAATCCCCGACAGTATCTGTCACATCCCTGCAGCCGTCTATTT', 100 | s'GATGATCACCTAATGACCGTCGCGGGACCAATGGTATCCGAGTGATGGAATCCTACGATTGATCAACC', 101 | s'AAAGAATTTCCCAAAGTCCCACTTGTTAATACCGTGACGCAGCCGGACTAACACAGTCCCAGGTATTGA', 102 | s'ATCGTTTATGTAGATCACGGCATATCTCTAACTAACGGCATAGTACCGGCATGCAGTTGAGCAGACTGGG', 103 | s'AGACTCTGGCATATCTGTCCATACCCATAAGCCGGCCCACGCAGGATCAGGAGTGAATTGCCGCGCAGATC', 104 | s'CTGGAGCCATGAATATAGCATCCCAACCCAACGTTTGAGTGTCTCCAGCGGAATAAAACGCCTAATTTTTGG', 105 | s'TTTAAGCCACCTGTCATTAGATATATCGCGCCCGCCACTTACATCTACCTGTTCATAGAGACTATGCGTACTA', 106 | 
s'CTCACATCTCTATCGATTTAGGTAGAGGCTAGCGGTCATAAATCTAAGTCTAGTCAGCCCGCAACGCCACATCG', 107 | s'TACAGCCCGTGCAGTGTACAAGCGCACACGGGAGCATCCTGGGTTATGAGCCCCCCGAGGCCAGTGCAAGACCAG', 108 | s'GTCTCTTGGCAGTGTCCAGACAGACGCGGTGCCTGGGAAGACGTTAATTGGTGCTACTTCAACCACGCCTTGCTTG', 109 | s'GGGGCTATTTGGGATTAGAGAAGTGCCTACACTCGGTGGCCCAGCTGGGCTGTCCATTGCAGACACGCGTCGCGTGT', 110 | s'CAGGAGTATGGACCCAACATGACTTTGGCGGCAGCAGTATCCATGCCCGTTGCCTGTAGTCTTTAGGAGGATCGTAAT', 111 | s'CGCGGTGTACCGTTTCAGATTCTATCTACTACGACTAGGTGCTTAGCAATTACCAATCGTACGCGACCGATATAAAACT', 112 | s'CGCCAGGGGTCATCCCGGGGATTCCATCAGTTAAACGTCTTGAAACCAGTTAGAGAACAGAGACCAGTTAGGCAACTCGG', 113 | s'TTCCCGCTTCTCTGCCCGATACCCAATTCTTGATACGTAGTCTCCCTACCGCAGGATATGACGCACGCCCCTATAACGAAC', 114 | s'AGCTACCACATACACAGGGAGACCGCGCGGTTTTGACCGTATGGTCAACCCATCTTCTAAGTTGCAGCGTCGAAGAGTCCCG', 115 | s'GGTAACGACGATGGGACAAAATGATAAATCACGTCGTACTTAGTTCACTTAGGTAAGGGTTTGGTGTGGCGTGAATAACTTGA', 116 | s'GTGCTCGAGGCAGCGACACGCGTTCATCAGATTGTGACAATGACCATATATTAGACGTCAGTTAAAGGAAGTTCACCGGTAACG', 117 | s'GTAAACTTCAGTTGCTAAAGTTAAATTCATCTGATGCTTACGTGGAAACTTGAGAAGGAGCCATTGAGACTTCGCTCCAGATAAC', 118 | s'CCGAATTCCGTTATCAACGTAAGCTAAGCGGCTTGGGCGTAGAGTTTCTAAGGGTGAACGTACACTTCTTTCGCTGCACCGATGCC', 119 | s'CCACTCCCTGATTAGCTTTGTTCTGTATTGCATGAATAAGGTTCAATTTGCGACCTTATGAAACAGGTAATCTGGGAAGCCTTAGTG', 120 | s'CATAAGTCCCAGATTCTCGCCGGATGGCAATTCTAGCGTCACGTTAGACAACGGTGAGAACGGAGGAGATTCAGAGCACAGGCTTGAA', 121 | s'AAGCTTTAGAGATCCAGGACCTTATGTCGGTACAAGAAACTAGAACTCGCTAAGTAGATCAGGTCCTGGCAGCATCCATGCCCCATTTG', 122 | s'TACTTATTACCACCTTTGTTGGGCTTTGACAGATATTACAGTGGTCTGATTCGTGGGGGCTTACTGCAACGCATACTATGGCGAAGGTCC', 123 | s'GGCAAGCTGAGGAGGGACGGTCACCCACTGAAAACATTTGAAACCGGGCGGGCTTGAACAGGGCCAATCAAGACCCCTCTCATAGGATGGC', 124 | s'CCATGCGAATTTTCCGGTCAAGGAACACCTGATTCAGAGCGGGCTACCAGAACAGGCAAGCAGCCCTACATCGCTTCTTAAAAAATATTAAG', 125 | s'TGGCTATGCATCTCACTTGGCTTTTCACGGGGGTGCCCAGAGGACATAGATACAGCACGGTCCCATGCTAGGATCCAACGAGTGCATTAGAAT', 126 | s'CTAAGCCTATGCCAGTCTTCCTTGTTTACTCGGTGGTCCTGTACGTCCATATCATTTACGTCCATGAAGCCAACCCCCGAGCAAATACCCGGTA', 127 | s'GATAAATTCCTCCCATATCAAAGTTCTTGCCCACGCGGGCTACCCAGCTAACGTAACTGTTTCGACACTAGAGATAACAACGCGTTGCGACTCTC', 128 | s'GGCGGCATGTAGGACGGCGTCAGTGGGTATACTATCGCTCTTAGGTCTCCAGTCAAAAAAATGTGGCATCCGGTAGTTGCTGGCAGATCTGCACTT', 129 | s'ATTTATTTGCCCCGCAGTGTCCTTTTTCTAGTCATAAATCCTCATACCGCGGGCCCTTCATCCGGTTTGATTCGAAGCATTGGTATGTTAGATACGT', 130 | s'ACACGCCATGAGGTAAATAACTCTGGAATTGTCAGTCAAGCACCGTGTGTTCAGTGTAAGTTTCTCGGACCAAGGCATATCGACGCTATGCGGTTTAT', 131 | s'AACCTCAGTCGGGCAGGCCATGGCGCGAAATGACTCGAGTAGACTCCATCTCTAAGGAGCGGAGCTGTTGCAACTAGGGTGACACACAGCTCGCCATGA', 132 | s'TATTGCAAGGCCCTACGCGGCTACGTCTCAATATATCCTATGGGCCGCAGCGTTCGGCCAATTCACATGGATGAGACATGGGTCCAAAATTTGCGGGATA' ] 133 | 134 | test_all(v, 1) 135 | test_all(v, 2) 136 | test_all(v, 3) 137 | test_all(v, 4) 138 | test_all(v, 5) 139 | test_all(v, 6) 140 | test_all(v, 7) 141 | test_all(v, 8) 142 | test_all(v, 9) 143 | test_all(v, 10) 144 | test_all(v, 11) 145 | test_all(v, 12) 146 | test_all(v, 13) 147 | test_all(v, 14) 148 | test_all(v, 15) 149 | test_all(v, 16) 150 | test_all(v, 17) 151 | test_all(v, 18) 152 | test_all(v, 19) 153 | test_all(v, 20) 154 | -------------------------------------------------------------------------------- /stdlib/bio/bwa.codon: -------------------------------------------------------------------------------- 1 | from bio import seq, CIGAR 2 | 3 | from os import getenv as _getenv 4 | BWA_LIB = _getenv('BWA_LIB') 5 | if not BWA_LIB: 6 | raise OSError("BWA error: 'BWA_LIB' environment variable not set") 7 | 8 | @tuple 9 | class bntann1_t: 10 | _offset: i64 11 | _len: i32 12 | _n_ambs: i32 13 
| _gi: u32 14 | _is_alt: i32 15 | _name: Ptr[byte] 16 | _anno: Ptr[byte] 17 | 18 | @tuple 19 | class bntamb1_t: 20 | _offset: i64 21 | _len: i32 22 | _amb: byte 23 | 24 | @tuple 25 | class bntseq_t: 26 | _l_pac: i64 27 | _n_seqs: i32 28 | _seed: u32 29 | _anns: Ptr[bntann1_t] 30 | _n_holes: i32 31 | _ambs: Ptr[bntamb1_t] 32 | _fp_pac: cobj 33 | 34 | @tuple 35 | class mem_alnreg_t: 36 | _rb: i64 37 | _re: i64 38 | _qb: i32 39 | _qe: i32 40 | _rid: i32 41 | _score: i32 42 | _truesc: i32 43 | _sub: i32 44 | _alt_sc: i32 45 | _csub: i32 46 | _sub_n: i32 47 | _w: i32 48 | _seedcov: i32 49 | _secondary: i32 50 | _secondary_all: i32 51 | _seedlen0: i32 52 | _bitfields: u32 53 | _frac_rep: u32 # really a 32-bit float 54 | _hash: u64 55 | 56 | @property 57 | def rb(self): 58 | return int(self._rb) 59 | 60 | @property 61 | def re(self): 62 | return int(self._re) 63 | 64 | @property 65 | def qb(self): 66 | return int(self._qb) 67 | 68 | @property 69 | def qe(self): 70 | return int(self._qe) 71 | 72 | @property 73 | def rid(self): 74 | return int(self._rid) 75 | 76 | @property 77 | def score(self): 78 | return int(self._score) 79 | 80 | @property 81 | def true_score(self): 82 | return int(self._truesc) 83 | 84 | @property 85 | def sub(self): 86 | return int(self._sub) 87 | 88 | @property 89 | def alt_score(self): 90 | return int(self._alt_sc) 91 | 92 | @property 93 | def csub(self): 94 | return int(self._csub) 95 | 96 | @property 97 | def sub_n(self): 98 | return int(self._sub_n) 99 | 100 | @property 101 | def w(self): 102 | return int(self._w) 103 | 104 | @property 105 | def seedcov(self): 106 | return int(self._seedcov) 107 | 108 | @property 109 | def secondary(self): 110 | return int(self._secondary) 111 | 112 | @property 113 | def secondary_all(self): 114 | return int(self._secondary_all) 115 | 116 | @tuple 117 | class mem_aln_t: 118 | _pos: i64 119 | _rid: i32 120 | _flag: i32 121 | _bitfields: u32 122 | _n_cigar: i32 123 | _cigar: Ptr[u32] 124 | _XA: Ptr[byte] 125 | _score: i32 126 | _sub: i32 127 | _alt_sc: i32 128 | 129 | @property 130 | def pos(self): 131 | return int(self._pos) 132 | 133 | @property 134 | def rid(self): 135 | return int(self._rid) 136 | 137 | @property 138 | def rev(self): 139 | return bool(self._bitfields & u32(0x1)) 140 | 141 | @property 142 | def alt(self): 143 | return bool(self._bitfields & u32(0x2)) 144 | 145 | @property 146 | def mapq(self): 147 | return int((self._bitfields & u32(0x3fc)) >> u32(2)) 148 | 149 | @property 150 | def NM(self): 151 | return int((self._bitfields & u32(0xfffffc00)) >> u32(10)) 152 | 153 | @property 154 | def cigar(self): 155 | return CIGAR(self._cigar, int(self._n_cigar)) 156 | 157 | @property 158 | def score(self): 159 | return int(self._score) 160 | 161 | @property 162 | def sub(self): 163 | return int(self._sub) 164 | 165 | @property 166 | def alt_score(self): 167 | return int(self._alt_sc) 168 | 169 | @tuple 170 | class mem_alnreg_v: 171 | n: int 172 | m: int 173 | a: Ptr[mem_alnreg_t] 174 | 175 | def __new__() -> mem_alnreg_v: 176 | return mem_alnreg_v(0, 0, Ptr[mem_alnreg_t]()) 177 | 178 | def __getitem__(self, idx: int): 179 | if not (0 <= idx < self.n): 180 | raise IndexError("alignment index out of range") 181 | return self.a[idx] 182 | 183 | def __len__(self): 184 | return self.n 185 | 186 | def __iter__(self): 187 | i = 0 188 | while i < self.n: 189 | yield self.a[i] 190 | i += 1 191 | 192 | @tuple 193 | class bwaidx_t: 194 | _bwt: cobj 195 | _bns: Ptr[bntseq_t] 196 | _pac: cobj 197 | _is_shm: i32 198 | _l_mem: i64 199 | _mem: 
cobj 200 | 201 | def name(self, aln: mem_aln_t): 202 | def _strlen(p: Ptr[byte]): 203 | n = 0 204 | while p[n]: n += 1 205 | return n 206 | p = self._bns[0]._anns[aln.rid]._name 207 | return str(p, _strlen(p)) 208 | 209 | _BWA_IDX_BWT = i32(0x1) 210 | _BWA_IDX_BNS = i32(0x2) 211 | _BWA_IDX_PAC = i32(0x4) 212 | _BWA_IDX_ALL = i32(0x7) 213 | 214 | from C import BWA_LIB.mem_opt_init() -> cobj 215 | from C import BWA_LIB.bwa_fill_scmat(i32, i32, Ptr[i8]) -> cobj 216 | from C import BWA_LIB.bwa_idx_load(cobj, i32) -> Ptr[bwaidx_t] 217 | from C import BWA_LIB.mem_align1(Ptr[mem_alnreg_v], cobj, cobj, Ptr[bntseq_t], cobj, i32, Ptr[byte]) 218 | from C import BWA_LIB.mem_reg2aln(Ptr[mem_aln_t], cobj, Ptr[bntseq_t], cobj, i32, Ptr[byte], Ptr[mem_alnreg_t]) 219 | 220 | def options(match_score: int = 1, 221 | mismatch_score: int = 4, 222 | open_del: int = 6, 223 | open_ins: int = 6, 224 | extend_del: int = 1, 225 | extend_ins: int = 1, 226 | bandwidth: int = 100, 227 | zdrop: int = 100, 228 | clip_penalty: Tuple[int,int] = (5,5), 229 | unpaired_penalty: int = 17): 230 | # offsets below are based on BWA mem_opt_t definition 231 | opt = mem_opt_init() 232 | p = Ptr[i32](opt + 0 * 4); p[0] = i32(match_score) 233 | p = Ptr[i32](opt + 1 * 4); p[0] = i32(mismatch_score) 234 | p = Ptr[i32](opt + 2 * 4); p[0] = i32(open_del) 235 | p = Ptr[i32](opt + 3 * 4); p[0] = i32(extend_del) 236 | p = Ptr[i32](opt + 4 * 4); p[0] = i32(open_ins) 237 | p = Ptr[i32](opt + 5 * 4); p[0] = i32(extend_ins) 238 | p = Ptr[i32](opt + 6 * 4); p[0] = i32(unpaired_penalty) 239 | p = Ptr[i32](opt + 7 * 4); p[0] = i32(clip_penalty[0]) 240 | p = Ptr[i32](opt + 8 * 4); p[0] = i32(clip_penalty[1]) 241 | p = Ptr[i32](opt + 9 * 4); p[0] = i32(bandwidth) 242 | p = Ptr[i32](opt + 10 * 4); p[0] = i32(zdrop) 243 | bwa_fill_scmat(i32(match_score), i32(mismatch_score), Ptr[i8](opt + 140)) 244 | return opt 245 | 246 | @tuple 247 | class BWA: 248 | opt: cobj 249 | p: Ptr[bwaidx_t] 250 | 251 | def __new__(hint: str) -> BWA: 252 | return BWA(mem_opt_init(), bwa_idx_load(hint.c_str(), _BWA_IDX_ALL)) 253 | 254 | def __new__(opt: cobj, hint: str) -> BWA: 255 | return BWA(opt, bwa_idx_load(hint.c_str(), _BWA_IDX_ALL)) 256 | 257 | def name(self, aln: mem_aln_t): 258 | return self.p[0].name(aln) 259 | 260 | def reg2aln(self, s: seq, reg: mem_alnreg_t): 261 | from C import free(Ptr[byte]) 262 | a = mem_aln_t() 263 | mem_reg2aln(__ptr__(a), self.opt, self.p[0]._bns, self.p[0]._pac, i32(len(s)), s.ptr, __ptr__(reg)) 264 | # fix CIGAR & marshal to GC: 265 | n_cigar = int(a._n_cigar) 266 | cigar = Ptr[u32](n_cigar) 267 | for i in range(n_cigar): 268 | c = a._cigar[i] 269 | n, op = c >> u32(4), c & u32(0xf) 270 | if u32(3) <= op < u32(0xf): 271 | op += u32(1) 272 | cigar[i] = (n << u32(4)) | op 273 | free(a._cigar.as_byte()) 274 | Ptr[Ptr[u32]](__ptr__(a).as_byte() + 24)[0] = cigar # note offset 275 | return a 276 | 277 | def align(self, s: seq): 278 | import internal.gc as gc 279 | from C import free(Ptr[byte]) 280 | p = s.ptr if s.len >= 0 else str(s).ptr 281 | ar = mem_alnreg_v() 282 | mem_align1(__ptr__(ar), self.opt, self.p[0]._bwt, self.p[0]._bns, self.p[0]._pac, i32(len(s)), p) 283 | # marshal to GC: 284 | n_bytes = ar.n * gc.sizeof(mem_alnreg_t) 285 | copy = gc.alloc_atomic(n_bytes) 286 | str.memcpy(copy.as_byte(), ar.a.as_byte(), n_bytes) 287 | free(ar.a.as_byte()) 288 | Ptr[Ptr[mem_alnreg_t]](__ptr__(ar).as_byte() + 16)[0] = Ptr[mem_alnreg_t](copy) # note offset 289 | return ar 290 | 
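# Usage sketch (illustrative; not part of this module): one plausible way to drive
# the wrapper above. The index prefix 'ref.fa' and the read are placeholders, and it
# assumes BWA_LIB points at a BWA shared library whose index was built with `bwa index`.
#
#   from bio import *
#   from bio.bwa import BWA, options
#
#   bwa = BWA(options(match_score=1, mismatch_score=4), 'ref.fa')
#   read = s'ACGTACGTACGTACGTACGTACGTACGT'
#   for reg in bwa.align(read):       # raw alignment regions (mem_alnreg_v)
#       aln = bwa.reg2aln(read, reg)  # finalize one region into a mem_aln_t with a CIGAR
#       print f'{bwa.name(aln)} {aln.pos} {aln.mapq}'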
-------------------------------------------------------------------------------- /test/core/kmers.codon: -------------------------------------------------------------------------------- 1 | from bio import * 2 | 3 | K: Static[int] = 5 4 | 5 | s = s'ACGTAACGTA' 6 | print s # EXPECT: ACGTAACGTA 7 | print list(s.kmers(1, K)) # EXPECT: [k'ACGTA', k'CGTAA', k'GTAAC', k'TAACG', k'AACGT', k'ACGTA'] 8 | print list(s.split(5, 1)) # EXPECT: [s'ACGTA', s'CGTAA', s'GTAAC', s'TAACG', s'AACGT', s'ACGTA'] 9 | print ~s # EXPECT: TACGTTACGT 10 | print list((~s).kmers(1, K)) # EXPECT: [k'TACGT', k'ACGTT', k'CGTTA', k'GTTAC', k'TTACG', k'TACGT'] 11 | print list((~s).split(5, 1)) # EXPECT: [s'TACGT', s'ACGTT', s'CGTTA', s'GTTAC', s'TTACG', s'TACGT'] 12 | 13 | s = s'AANGGCCAGTC' 14 | print list(s.kmers_with_pos(1, 2)) # EXPECT: [(0, k'AA'), (3, k'GG'), (4, k'GC'), (5, k'CC'), (6, k'CA'), (7, k'AG'), (8, k'GT'), (9, k'TC')] 15 | print list(~s |> kmers_with_pos(1, 2)) # EXPECT: [(0, k'GA'), (1, k'AC'), (2, k'CT'), (3, k'TG'), (4, k'GG'), (5, k'GC'), (6, k'CC'), (9, k'TT')] 16 | 17 | s = s'AGACCTTAGC' 18 | print s # EXPECT: AGACCTTAGC 19 | print list(s.kmers(1, 3)) # EXPECT: [k'AGA', k'GAC', k'ACC', k'CCT', k'CTT', k'TTA', k'TAG', k'AGC'] 20 | print list(s.kmers(2, 3)) # EXPECT: [k'AGA', k'ACC', k'CTT', k'TAG'] 21 | print list(s.kmers(4, 3)) # EXPECT: [k'AGA', k'CTT'] 22 | print ~s # EXPECT: GCTAAGGTCT 23 | print list((~s).kmers(1, 3)) # EXPECT: [k'GCT', k'CTA', k'TAA', k'AAG', k'AGG', k'GGT', k'GTC', k'TCT'] 24 | print list((~s).kmers(2, 3)) # EXPECT: [k'GCT', k'TAA', k'AGG', k'GTC'] 25 | print list((~s).kmers(4, 3)) # EXPECT: [k'GCT', k'AGG'] 26 | 27 | s = s'AGACCTNTAGNC' 28 | print s # EXPECT: AGACCTNTAGNC 29 | print list(s.kmers_with_pos(1, 3)) # EXPECT: [(0, k'AGA'), (1, k'GAC'), (2, k'ACC'), (3, k'CCT'), (7, k'TAG')] 30 | print list(s.kmers_with_pos(2, 3)) # EXPECT: [(0, k'AGA'), (2, k'ACC')] 31 | print list(s.kmers_with_pos(4, 3)) # EXPECT: [(0, k'AGA')] 32 | print ~s # EXPECT: GNCTANAGGTCT 33 | print list((~s).kmers_with_pos(1, 3)) # EXPECT: [(2, k'CTA'), (6, k'AGG'), (7, k'GGT'), (8, k'GTC'), (9, k'TCT')] 34 | print list((~s).kmers_with_pos(2, 3)) # EXPECT: [(2, k'CTA'), (6, k'AGG'), (8, k'GTC')] 35 | print list((~s).kmers_with_pos(4, 3)) # EXPECT: [(8, k'GTC')] 36 | 37 | s = s'AGACCTNTAGC' 38 | print list(s.split(k=100, step=1)) # EXPECT: [] 39 | print list((~s).split(k=100, step=1)) # EXPECT: [] 40 | print list(s.kmers(step=1, k=100)) # EXPECT: [] 41 | print list((~s).kmers(step=1, k=100)) # EXPECT: [] 42 | 43 | s = s'TAGCC' 44 | print list(s.split(k=5, step=17)) # EXPECT: [s'TAGCC'] 45 | print list((~s).split(k=5, step=17)) # EXPECT: [s'GGCTA'] 46 | print list(s.kmers(step=17, k=5)) # EXPECT: [k'TAGCC'] 47 | print list((~s).kmers(step=17, k=5)) # EXPECT: [k'GGCTA'] 48 | 49 | k1 = Kmer[K](s'ACGTA') 50 | k2 = Kmer[K](s'ATGTT') 51 | 52 | print [k1[i] for i in range(len(k1))] # EXPECT: [k'A', k'C', k'G', k'T', k'A'] 53 | print [k2[-i - 1] for i in range(len(k2))] # EXPECT: [k'T', k'T', k'G', k'T', k'A'] 54 | 55 | print ~k1 # EXPECT: TACGT 56 | print ~k2 # EXPECT: AACAT 57 | 58 | print abs(k1 - k2) # EXPECT: 2 59 | print abs(k2 - k1) # EXPECT: 2 60 | 61 | if k1 > k2: 62 | print k2 - k1 # EXPECT: -2 63 | print k1 - k2 # EXPECT: 2 64 | else: 65 | print k1 - k2 66 | print k2 - k1 67 | 68 | k1, k2 = k2, k1 69 | if k1 > k2: 70 | print k2 - k1 # EXPECT: -2 71 | print k1 - k2 # EXPECT: 2 72 | else: 73 | print k1 - k2 74 | print k2 - k1 75 | 76 | k1 = Kmer[K](s'ACGTA') 77 | k2 = Kmer[K](s'ACGTA') 78 | print k1 - 
k2 # EXPECT: 0 79 | print k2 - k1 # EXPECT: 0 80 | 81 | k1long = Kmer[100]() |> base(0, k'T') |> base(42, k'C') |> base(77, k'G') 82 | k2long = Kmer[100]() |> base(0, k'T') |> base(43, k'C') |> base(77, k'T') 83 | print abs(k1long - k2long) # EXPECT: 3 84 | 85 | if k1long > k2long: 86 | print k2long - k1long # EXPECT: -3 87 | print k1long - k2long # EXPECT: 3 88 | else: 89 | print k1long - k2long 90 | print k2long - k1long 91 | 92 | k1long, k2long = k2long, k1long 93 | if k1long > k2long: 94 | print k2long - k1long # EXPECT: -3 95 | print k1long - k2long # EXPECT: 3 96 | else: 97 | print k1long - k2long 98 | print k2long - k1long 99 | 100 | print k1 << s'G' # EXPECT: CGTAG 101 | print k1 >> s'G' # EXPECT: GACGT 102 | print k1 << ~s'G' # EXPECT: CGTAC 103 | print k1 >> ~s'G' # EXPECT: CACGT 104 | 105 | K100 = Kmer[100] 106 | K1 = Kmer[1] 107 | print K100() |> base(-1, K1(s'C')) |> base(98, s'G') |> base(0, K1(s'T')) 108 | # EXPECT: TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGC 109 | 110 | k3 = K100() 111 | h1 = hash(k3) 112 | h2 = hash(k3 |> base(0, k'T')) 113 | h3 = hash(k3 |> base(99, k'T')) 114 | h4 = hash(k3 |> base(0, k'T') |> base(99, k'T')) 115 | # bases on both ends should be involved in k-mer hash: 116 | print h1 == h2 # EXPECT: False 117 | print h1 == h3 # EXPECT: False 118 | print h1 == h4 # EXPECT: False 119 | print h2 == h3 # EXPECT: False 120 | print h2 == h4 # EXPECT: False 121 | print h3 == h4 # EXPECT: False 122 | 123 | print k'ACGT' in s'GGACGTGG' # EXPECT: True 124 | print k'ACGT' in s'GGAGTGG' # EXPECT: False 125 | print s'ACGT' in k'GGACGTGG' # EXPECT: True 126 | print s'ACGT' in k'GGAGTGG' # EXPECT: False 127 | 128 | @test 129 | def test_N(): 130 | assert s''.N() == False 131 | assert s'ACGTacgt'.N() == False 132 | assert s'ACGTNacgt'.N() == True 133 | assert s'N'.N() == True 134 | assert s'AAN'.N() == True 135 | assert s'NAA'.N() == True 136 | assert s'ANA'.N() == True 137 | test_N() 138 | 139 | @test 140 | def test_base_counts(): 141 | assert s''.bases == BaseCounts(0,0,0,0,0) 142 | assert s'A'.bases == BaseCounts(1,0,0,0,0) 143 | assert s'C'.bases == BaseCounts(0,1,0,0,0) 144 | assert s'G'.bases == BaseCounts(0,0,1,0,0) 145 | assert s'T'.bases == BaseCounts(0,0,0,1,0) 146 | assert s'N'.bases == BaseCounts(0,0,0,0,1) 147 | assert s'AAGAGACTNTN'.bases == BaseCounts(4,1,2,2,2) 148 | assert (s'A'.bases + s'G'.bases) - s'A'.bases == s'G'.bases 149 | assert s'A'.bases.add(T=True) - s'A'.bases == s'T'.bases 150 | test_base_counts() 151 | 152 | @test 153 | def test_kmer_revcomp(path: str, K: Static[int]): 154 | v1 = [] 155 | v2 = [] 156 | 157 | for x in FASTA(path, fai=False): 158 | for y in x.seq.split(k=K, step=1): 159 | if not y.N(): 160 | v1.append(str(~y).upper()) 161 | 162 | for x in FASTA(path, fai=False): 163 | for y in x.seq.kmers(step=1, k=K): 164 | v2.append(str(~y).upper()) 165 | 166 | ''' 167 | if len(v1) == len(v2): 168 | for i in range(len(v1)): 169 | if v1[i] != v2[i]: 170 | print i, v1[i], v2[i] 171 | ''' 172 | 173 | assert v1 == v2 174 | 175 | testfile = 'test/data/MT-human.fa' 176 | test_kmer_revcomp(testfile, 1) 177 | test_kmer_revcomp(testfile, 2) 178 | test_kmer_revcomp(testfile, 3) 179 | test_kmer_revcomp(testfile, 4) 180 | test_kmer_revcomp(testfile, 5) 181 | test_kmer_revcomp(testfile, 25) 182 | test_kmer_revcomp(testfile, 32) 183 | test_kmer_revcomp(testfile, 31) 184 | test_kmer_revcomp(testfile, 33) 185 | test_kmer_revcomp(testfile, 64) 186 | test_kmer_revcomp(testfile, 65) 187 | 
test_kmer_revcomp(testfile, 129) 188 | test_kmer_revcomp(testfile, 1000) 189 | 190 | @test 191 | def test_kmer_iteration(path: str, K: Static[int]): 192 | for rc in (True, False): 193 | for step in range(1, K + 2): 194 | v1 = [] 195 | v2 = [] 196 | 197 | for x in FASTA(path, fai=False): 198 | for y in (~(x.seq) if rc else x.seq).split(k=K, step=step): 199 | if not y.N(): 200 | v1.append(str(y).upper()) 201 | 202 | for x in FASTA(path, fai=False): 203 | for y in (~(x.seq) if rc else x.seq).kmers(step=step, k=K): 204 | v2.append(str(y).upper()) 205 | 206 | assert v1 == v2 207 | 208 | testfile = 'test/data/MT-human.fa' 209 | test_kmer_iteration(testfile, 1) 210 | test_kmer_iteration(testfile, 2) 211 | test_kmer_iteration(testfile, 3) 212 | test_kmer_iteration(testfile, 4) 213 | test_kmer_iteration(testfile, 5) 214 | test_kmer_iteration(testfile, 25) 215 | test_kmer_iteration(testfile, 32) 216 | test_kmer_iteration(testfile, 31) 217 | test_kmer_iteration(testfile, 33) 218 | test_kmer_iteration(testfile, 64) 219 | test_kmer_iteration(testfile, 65) 220 | test_kmer_iteration(testfile, 129) 221 | --------------------------------------------------------------------------------
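The k-mer tests above exercise `kmers`, `kmers_with_pos`, `split`, and the canonical-form idiom `min(k, ~k)` checked by the pipeline tests. A minimal sketch of how these pieces combine outside the test suite — counting canonical 5-mers in the same FASTA file the tests read — is shown below; the counter itself is illustrative and is not a file in this repository.

    from bio import *

    counts = Dict[Kmer[5], int]()
    for rec in FASTA('test/data/MT-human.fa', fai=False):
        for k in rec.seq.kmers(step=1, k=5):
            c = min(k, ~k)  # canonical form, as in pipeline/canonical_opt.codon
            counts[c] = counts.get(c, 0) + 1
    print len(counts)  # number of distinct canonical 5-mers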