├── test ├── sql │ ├── exondb-release-with-deb-info │ │ ├── bed │ │ │ ├── test3.bed │ │ │ ├── test3.bed.gz │ │ │ ├── test3.bed.zst │ │ │ └── hg38.head.bed │ │ ├── test.mixed-desc.fasta │ │ ├── test.fasta │ │ ├── bam │ │ │ ├── test.bam │ │ │ └── example1.bam │ │ ├── test.gff.gz │ │ ├── test.gff.zst │ │ ├── test.fasta.gz │ │ ├── test.fasta.gzip │ │ ├── test.fasta.zst │ │ ├── test.fasta.zstd │ │ ├── test.fastq.gz │ │ ├── test.fastq.gzip │ │ ├── test.fastq.zst │ │ ├── test.fastq.zstd │ │ ├── test.gff.gzip │ │ ├── test.gff.zstd │ │ ├── vcf │ │ │ ├── index.bcf │ │ │ ├── index.vcf.gz │ │ │ ├── vcf_meta_meta.vcf │ │ │ └── vcf_file.vcf │ │ ├── test.gff │ │ ├── bam-index │ │ │ ├── test.bam │ │ │ └── test.bam.bai │ │ ├── bcf-index │ │ │ ├── index.bcf │ │ │ └── index.bcf.csi │ │ ├── fasta │ │ │ ├── copy-a.fasta.gz │ │ │ └── copy-b.fasta.gz │ │ ├── vcf-index │ │ │ ├── index.vcf.gz │ │ │ └── index.vcf.gz.tbi │ │ ├── gb │ │ │ ├── Alouatta_caraya_NC_021938.1.gb.gz │ │ │ ├── Alouatta_caraya_NC_021938.1.gb.zst │ │ │ └── Alouatta_caraya_NC_021938.1.gb.gzip │ │ ├── test_hmm_scan.test │ │ ├── sam │ │ │ ├── little.sam │ │ │ ├── example1.sam │ │ │ └── comment1.sam │ │ ├── test.fastq │ │ ├── test2.fastq │ │ ├── fastq │ │ │ ├── copy-a.fastq │ │ │ └── copy-b.fastq │ │ ├── test_bed_io.test │ │ ├── test_acknowledgements_function.test │ │ ├── test_mzml_scan.test │ │ ├── test_gtf_scan.test │ │ ├── test_sam_record_scan.test │ │ ├── test_sam_flags.test │ │ ├── test_bam_record_scan.test │ │ ├── gff │ │ │ └── raw-test.gff │ │ ├── test_genbank_scan.test │ │ ├── test_gff_copy.test │ │ ├── test_vcf_record_scan.test │ │ ├── test_fasta_scan.test │ │ ├── test_fastq_copy.test │ │ ├── test_fastq_scan.test │ │ ├── test_fasta_copy.test │ │ ├── test_gff_scan.test │ │ ├── test_scalar_functions.test │ │ └── test.pfam.hmmout │ └── exondb-align │ │ └── test_align.test └── README.md ├── .gitmodules ├── environment.yml ├── exon ├── src │ ├── CMakeLists.txt │ ├── exon │ │ ├── bcf_query_function │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── core │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── sam_functions │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── vcf_query_function │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── arrow_table_function │ │ │ └── CMakeLists.txt │ │ ├── bam_query_function │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── gff_functions │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── alignment_functions │ │ │ └── CMakeLists.txt │ │ ├── fastq_functions │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ ├── sequence_functions │ │ │ ├── CMakeLists.txt │ │ │ └── module.cpp │ │ └── CMakeLists.txt │ └── exon_extension.cpp └── include │ ├── exon_extension.hpp │ ├── exon │ ├── gff_functions │ │ └── module.hpp │ ├── core │ │ └── module.hpp │ ├── fastq_functions │ │ └── module.hpp │ ├── sam_functions │ │ └── module.hpp │ ├── sequence_functions │ │ └── module.hpp │ ├── bam_query_function │ │ └── module.hpp │ ├── bcf_query_function │ │ └── module.hpp │ ├── vcf_query_function │ │ └── module.hpp │ ├── arrow_table_function │ │ └── module.hpp │ └── alignment_functions │ │ └── module.hpp │ └── rust.hpp ├── cz.json ├── README.md ├── LICENSE ├── CHANGELOG.md ├── rust ├── Cargo.toml ├── src │ ├── lib.rs │ ├── bam_query_reader.rs │ ├── bcf_query_reader.rs │ ├── vcf_query_reader.rs │ ├── sam_functions.rs │ └── arrow_reader.rs └── build.rs ├── .github ├── workflows │ ├── MacOS.yml │ ├── TestInsalls.yml │ ├── Windows.yml │ └── Linux.yml └── Duckdb+Exon.svg ├── Makefile ├── bin └── upload-artifacts.py ├── 
CMakeLists.txt └── .gitignore /test/sql/exondb-release-with-deb-info/bed/test3.bed: -------------------------------------------------------------------------------- 1 | sq0 7 13 . 0 . 7 13 0 2 2,1 0,3 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "duckdb"] 2 | path = duckdb 3 | url = https://github.com/duckdb/duckdb.git 4 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.mixed-desc.fasta: -------------------------------------------------------------------------------- 1 | >a description 2 | ATCG 3 | >b 4 | ATCG 5 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fasta: -------------------------------------------------------------------------------- 1 | >a description 2 | ATCG 3 | >b description2 4 | ATCG 5 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bam/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bam/test.bam -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.gff.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.gff.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.gff.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.gff.zst -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: exondb 3 | channels: 4 | - conda-forge 5 | - bioconda 6 | dependencies: 7 | - pip 8 | - pip: 9 | - boto3 10 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fasta.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fasta.gzip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fasta.gzip -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fasta.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fasta.zst -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fasta.zstd: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fasta.zstd -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fastq.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fastq.gzip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fastq.gzip -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fastq.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fastq.zst -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fastq.zstd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.fastq.zstd -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.gff.gzip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.gff.gzip -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.gff.zstd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/test.gff.zstd -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/vcf/index.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/vcf/index.bcf -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bam/example1.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bam/example1.bam -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bed/test3.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bed/test3.bed.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bed/test3.bed.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bed/test3.bed.zst -------------------------------------------------------------------------------- 
/test/sql/exondb-release-with-deb-info/test.gff: -------------------------------------------------------------------------------- 1 | sq0 caat gene 8 13 . + . gene_id=caat1;gene_name=gene0 2 | sq1 caat gene 8 14 0.1 + 0 gene_id=caat2;gene_name=gene0 3 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/vcf/index.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/vcf/index.vcf.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bam-index/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bam-index/test.bam -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bcf-index/index.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bcf-index/index.bcf -------------------------------------------------------------------------------- /exon/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(exon) 2 | 3 | set(EXTENSION_SOURCES 4 | ${EXTENSION_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/exon_extension.cpp 5 | PARENT_SCOPE) -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bam-index/test.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bam-index/test.bam.bai -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/fasta/copy-a.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/fasta/copy-a.fasta.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/fasta/copy-b.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/fasta/copy-b.fasta.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/vcf-index/index.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/vcf-index/index.vcf.gz -------------------------------------------------------------------------------- /exon/src/exon/bcf_query_function/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) -------------------------------------------------------------------------------- /exon/src/exon/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | 
${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | 7 | -------------------------------------------------------------------------------- /exon/src/exon/sam_functions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | -------------------------------------------------------------------------------- /exon/src/exon/vcf_query_function/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bcf-index/index.bcf.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/bcf-index/index.bcf.csi -------------------------------------------------------------------------------- /exon/src/exon/arrow_table_function/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) -------------------------------------------------------------------------------- /exon/src/exon/bam_query_function/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | -------------------------------------------------------------------------------- /exon/src/exon/gff_functions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | 7 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/vcf-index/index.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/vcf-index/index.vcf.gz.tbi -------------------------------------------------------------------------------- /exon/src/exon/alignment_functions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | -------------------------------------------------------------------------------- /exon/src/exon/fastq_functions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | 7 | -------------------------------------------------------------------------------- /exon/src/exon/sequence_functions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(EXTENSION_SOURCES 2 | ${EXTENSION_SOURCES} 3 | ${CMAKE_CURRENT_SOURCE_DIR}/module.cpp 4 | PARENT_SCOPE 5 | ) 6 | 7 | -------------------------------------------------------------------------------- 
/test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.gz -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.zst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.zst -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.gzip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wheretrue/exon-duckdb/HEAD/test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.gzip -------------------------------------------------------------------------------- /exon/include/exon_extension.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "duckdb.hpp" 4 | 5 | namespace duckdb { 6 | 7 | class ExonExtension : public Extension { 8 | public: 9 | void Load(DuckDB &db) override; 10 | std::string Name() override; 11 | }; 12 | 13 | } // namespace duckdb -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_hmm_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test count 5 | query I 6 | SELECT COUNT(*) FROM read_hmm_dom_tbl_out('./test/sql/exondb-release-with-deb-info/test.pfam.hmmout') 7 | ---- 8 | 100 9 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/bed/hg38.head.bed: -------------------------------------------------------------------------------- 1 | chr1 69090 70008 2 | chr1 450739 451678 3 | chr1 685715 686654 4 | chr1 925941 926013 5 | chr1 930154 930336 6 | chr1 931038 931089 7 | chr1 935771 935896 8 | chr1 939039 939129 9 | chr1 939274 939460 10 | chr1 941143 941306 11 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/sam/little.sam: -------------------------------------------------------------------------------- 1 | 8 16 chrX 121893 0 75M * 0 0 GACCTAGGCCCAATGCAGACTCTAAAGGTTGCACAGTCTGCCCTCTATCTGTCCTCAATGAGACCTAGGCCCAGT * NM:i:0 MD:Z:75 AS:i:75 XS:i:75 2 | 12 16 chrX 124324 0 75M * 0 0 CCTCAATGAGACCTAGGCCCAATGCAGACTCTAAAGGTTGCACAGTCTGCCCTCTATCTGTCCTCAATGAGACCT * NM:i:0 MD:Z:75 AS:i:75 XS:i:75 3 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID This is a description 2 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID2 6 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | -------------------------------------------------------------------------------- 
/test/sql/exondb-release-with-deb-info/test2.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID 2 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 3 | +This is a description 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID2 6 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/fastq/copy-a.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID This is a description 2 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID2 6 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/fastq/copy-b.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID This is a description 2 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID2 6 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_bed_io.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | query IIIIIIIIIIII 5 | SELECT * FROM read_bed_file('test/sql/exondb-release-with-deb-info/bed/test3.bed') LIMIT 1; 6 | ---- 7 | sq0 8 | 8 9 | 13 10 | NULL 11 | NULL 12 | NULL 13 | 8 14 | 13 15 | NULL 16 | 2 17 | 2,1 18 | 0,3 19 | -------------------------------------------------------------------------------- /cz.json: -------------------------------------------------------------------------------- 1 | { 2 | "commitizen": { 3 | "name": "cz_conventional_commits", 4 | "version": "0.8.0", 5 | "tag_format": "v$version", 6 | "update_changelog_on_bump": true, 7 | "version_files": [ 8 | "./CMakeLists.txt", 9 | "./Makefile", 10 | "./bin/upload-artifacts.py", 11 | "./exon/src/exon/core/module.cpp" 12 | ] 13 | } 14 | } -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_acknowledgements_function.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # query I 5 | # SELECT name FROM exondb_third_party_acknowledgements() ORDER BY name; 6 | # ---- 7 | # DuckDB 8 | # WFA2-lib 9 | # brotli 10 | # cpp-httplib 11 | # gb-io 12 | # nlohmann/json 13 | # noodles 14 | # openssl 15 | 16 | statement ok 17 | SELECT exondb_version(); 18 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/sam/example1.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.4 SO:queryname 2 | @CO The MIT License 3 | @SQ SN:ref1 LN:56 M5:08c04d512d4797d9ba2a156c1daba468 4 | ref1_grp1_p001 99 ref1 1 0 10M = 25 34 
CGAGCTCGGT !!!!!!!!!! MD:Z:10 NM:i:0 RG:Z:grp1 BC:Z:ACGT H0:i:1 aa:A:! ab:A:~ fa:f:3.14159 za:Z:Hello world! ha:H:DEADBEEF ba:B:c,-128,0,127 bb:B:C,0,127,255 bc:B:s,-32768,0,32767 bd:B:S,0,32768,65535 be:B:i,-2147483648,0,2147483647 bf:B:I,0,2147483648,4294967295 bg:B:f,2.71828,6.626e-34,2.9979e+09 5 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/vcf/vcf_meta_meta.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.3 2 | ##FILTER= 3 | ##META= 4 | ##META= 5 | ##META= 6 | ##META= 7 | ##contig= 8 | #CHROM POS ID REF ALT QUAL FILTER INFO 9 | 1 123 test TC T . . . 10 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_mzml_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test counting from a GFF file 5 | query III 6 | SELECT id, intensity, wavelength FROM read_mzml('./test/sql/exondb-release-with-deb-info/test.mzml') LIMIT 1; 7 | ---- 8 | declaration=0 collection=0 scan=0 9 | {'intensity': [15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]} 10 | {'wavelength': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]} 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | exon-duckdb 3 |

4 | 5 | Exon-DuckDB is a DuckDB extension for Exon that makes Exon functionality available through DuckDB. 6 | 7 | For example, you can use the following query to count the sequences in a FASTA file: 8 | 9 | ```sql 10 | LOAD exon; 11 | 12 | SELECT COUNT(*) 13 | FROM read_fasta('file.fasta'); 14 | ``` 15 | 16 | You can read more about how to use Exon-DuckDB in the [documentation](https://www.wheretrue.dev/docs/exon/exondb/). 17 | -------------------------------------------------------------------------------- /exon/src/exon/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(arrow_table_function) 2 | add_subdirectory(sam_functions) 3 | add_subdirectory(sequence_functions) 4 | add_subdirectory(gff_functions) 5 | add_subdirectory(vcf_query_function) 6 | add_subdirectory(bam_query_function) 7 | add_subdirectory(bcf_query_function) 8 | add_subdirectory(fastq_functions) 9 | add_subdirectory(core) 10 | 11 | if(WFA2_ENABLED) 12 | add_subdirectory(alignment_functions) 13 | endif() 14 | 15 | set(EXTENSION_SOURCES 16 | ${EXTENSION_SOURCES} 17 | PARENT_SCOPE 18 | ) 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2023 WHERE TRUE Technologies 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v0.8.0 (2023-08-01) 2 | 3 | ### Feat 4 | 5 | - updating exon (#16) 6 | - add mzml function (#13) 7 | - add vcf/bam/bcf scan (#11) 8 | - add gtf (#10) 9 | 10 | ## v0.7.2 (2023-06-20) 11 | 12 | ### Fix 13 | 14 | - refactor arrow to support filters (#8) 15 | 16 | ## v0.7.1 (2023-06-17) 17 | 18 | ## v0.7.0 (2023-06-13) 19 | 20 | ### Feat 21 | 22 | - add gcs (#5) 23 | 24 | ## v0.6.0 (2023-06-13) 25 | 26 | ### Feat 27 | 28 | - lock on v0.8.0 29 | 30 | ## v0.5.0 (2023-06-12) 31 | 32 | ### Feat 33 | 34 | - add alignment functions (#4) 35 | - add tests 36 | - compiling on windows 37 | - diff repo start 38 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | build = "build.rs" 3 | edition = "2021" 4 | name = "rust" 5 | version = "1.0.0" 6 | 7 | [features] 8 | all = [] 9 | statically_linked = [] 10 | 11 | [lib] 12 | crate-type = ["staticlib"] 13 | name = "rust" 14 | 15 | [dependencies] 16 | arrow = {version = "43", default-features = false, features = ["ffi"]} 17 | datafusion = {version = "28.0.0", features = ["default"]} 18 | exon = {version = "0.2.6", features = ["all"]} 19 | noodles = {version = "0.46.0", features = ["sam", "fasta", "fastq", "gff"]} 20 | tokio = {version = "1", features = ["rt-multi-thread"]} 21 | 22 | [build-dependencies] 23 | cbindgen = "0.24.5" 24 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_gtf_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test reading a record from a GTF file 5 | query IIIIIIIII 6 | SELECT seqname, source, type, start, "end", score, strand, frame, attributes['gene_id'][1] FROM read_gtf('./test/sql/exondb-release-with-deb-info/gtf/test.gtf') LIMIT 1; 7 | ---- 8 | chr1 9 | processed_transcript 10 | exon 11 | 11869 12 | 12227 13 | NULL 14 | + 15 | NULL 16 | ENSG00000223972 17 | 18 | # Test counting from a GTF file 19 | query I 20 | SELECT count(*) FROM read_gtf('./test/sql/exondb-release-with-deb-info/gtf/test.gtf'); 21 | ---- 22 | 77 23 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_sam_record_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Example File 5 | query IIIIIIIIII 6 | SELECT name, flag, reference, start, "end", mapping_quality, cigar, mate_reference, sequence, quality_score FROM read_sam_file_records('./test/sql/exondb-release-with-deb-info/sam/example1.sam') 7 | ---- 8 | ref1_grp1_p001 9 | 99 10 | ref1 11 | 1 12 | 10 13 | 0 14 | 10M 15 | ref1 16 | CGAGCTCGGT 17 | !!!!!!!!!!
18 | 19 | # Missing file throws an error 20 | statement error 21 | SELECT * FROM read_sam_file_records('./test/sql/exondb-release-with-deb-info/sam/missing.sam') 22 | ---- 23 | -------------------------------------------------------------------------------- /test/sql/exondb-align/test_align.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | query I 5 | SELECT alignment_string_wfa_gap_affine('AACC', 'AAACC') 6 | ---- 7 | 2M1D2M 8 | 9 | statement error 10 | SELECT alignment_string_wfa_gap_affine('AACC', 'AAACC', 1, 1, 1, 1, 'memory_low') 11 | ---- 12 | 13 | query I 14 | SELECT alignment_string_wfa_gap_affine('AACC', 'AAACC', -1, 1, 2, 3, 'memory_low') 15 | ---- 16 | 2M1D2M 17 | 18 | query I 19 | SELECT alignment_string_wfa_gap_affine('AACC', 'AAACC', 1, 1, 1, 'memory_low') 20 | ---- 21 | 2M1D2M 22 | 23 | query I 24 | SELECT alignment_score_wfa_gap_affine('AACC', 'AACC') 25 | ---- 26 | 0.0 27 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing the exon extension 2 | This directory contains all the tests for the exon extension. The `sql` directory holds tests that are written as [SQLLogicTests](https://duckdb.org/dev/sqllogictest/intro.html). DuckDB aims to have most of its tests in this format as SQL statements, so for the exon extension, this should probably be the goal too. However, client-specific testing is also available. 3 | 4 | The root makefile contains targets to build and run all of these tests. To run the SQLLogicTests: 5 | ```bash 6 | make test 7 | ``` 8 | 9 | To run the Python tests: 10 | ```bash 11 | make test_python 12 | ``` 13 | 14 | For other client tests, check the makefile in the root of this repository. -------------------------------------------------------------------------------- /rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod arrow_reader; 16 | pub mod bam_query_reader; 17 | pub mod bcf_query_reader; 18 | pub mod vcf_query_reader; 19 | 20 | pub mod sam_functions; 21 | -------------------------------------------------------------------------------- /exon/include/exon/gff_functions/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | 20 | namespace exon 21 | { 22 | 23 | class GFFunctions 24 | { 25 | public: 26 | static duckdb::CreateScalarFunctionInfo GetGFFParseAttributesFunction(); 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_sam_flags.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | query I 5 | SELECT is_segmented(1); 6 | ---- 7 | true 8 | 9 | query I 10 | SELECT is_unmapped(7); 11 | ---- 12 | true 13 | 14 | query I 15 | SELECT is_properly_aligned(7); 16 | ---- 17 | true 18 | 19 | query I 20 | SELECT is_mate_unmapped(15); 21 | ---- 22 | true 23 | 24 | query I 25 | SELECT is_reverse_complemented(31); 26 | ---- 27 | true 28 | 29 | query I 30 | SELECT is_mate_reverse_complemented(63); 31 | ---- 32 | true 33 | 34 | query I 35 | SELECT is_first_segment(127); 36 | ---- 37 | true 38 | 39 | query I 40 | SELECT is_last_segment(255); 41 | ---- 42 | true 43 | 44 | query I 45 | SELECT is_secondary(511); 46 | ---- 47 | true 48 | 49 | query I 50 | SELECT is_secondary(255); 51 | ---- 52 | false 53 | 54 | query I 55 | SELECT is_quality_control_failed(2815); 56 | ---- 57 | true 58 | 59 | query I 60 | SELECT is_duplicate(4095); 61 | ---- 62 | true 63 | 64 | query I 65 | SELECT is_supplementary(8191); 66 | ---- 67 | true 68 | -------------------------------------------------------------------------------- /rust/build.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | extern crate cbindgen; 16 | 17 | use std::env; 18 | 19 | fn main() { 20 | let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); 21 | let out_dir = "./../exon/include/"; 22 | 23 | let dest_path = std::path::Path::new(&out_dir).join("rust.hpp"); 24 | 25 | cbindgen::Builder::new() 26 | .with_crate(crate_dir) 27 | // .with_header("#include ") 28 | .generate() 29 | .expect("Unable to generate bindings") 30 | .write_to_file(dest_path); 31 | } 32 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_bam_record_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Example File 5 | query IIIIIIIIII 6 | SELECT * FROM read_bam_file_records('./test/sql/exondb-release-with-deb-info/bam/example1.bam') 7 | ---- 8 | ref1_grp1_p001 9 | 99 10 | ref1 11 | 1 12 | 10 13 | 0 14 | 10M 15 | ref1 16 | CGAGCTCGGT 17 | !!!!!!!!!! 18 | 19 | # Missing file throws an error 20 | statement error 21 | SELECT * FROM read_bam_file_records('./test/sql/exondb-release-with-deb-info/bam/missing.bam') 22 | ---- 23 | 24 | query I 25 | SELECT COUNT(*) FROM bam_query('./test/sql/exondb-release-with-deb-info/bam-index/test.bam', 'chr1'); 26 | ---- 27 | 61 28 | 29 | query IIIIIIIIII 30 | SELECT name, flag, reference, start, "end", mapping_quality, cigar, mate_reference, sequence, quality_score FROM bam_query('./test/sql/exondb-release-with-deb-info/bam-index/test.bam', 'chr1') LIMIT 1; 31 | ---- 32 | READ_ID 33 | 83 34 | chr1 35 | 12203704 36 | 12217173 37 | NULL 38 | 55M13394N21M 39 | chr1 40 | AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 41 | 8DCCCC?::>CDDB<<>@3CCDBD@DBDFHHHFEIIGCAIIHIF@@DDGGEBEGEIHGGGDGG?BHHHDFFFF@@< 42 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/gff/raw-test.gff: -------------------------------------------------------------------------------- 1 | KanNP_rdsDRAFT_3085441342 FGMP CDS 2 151 . 1 0 ID=KanNP_rdsDRAFT_3085441342.1;locus_tag=KanNP_rdsDRAFT_30854413421; 2 | KanNP_rdsDRAFT_3097038968 FGMP CDS 2 151 . 1 0 ID=KanNP_rdsDRAFT_3097038968.1;locus_tag=KanNP_rdsDRAFT_30970389681; 3 | KanNP_rdsDRAFT_3014572339 FP CDS 1 150 . -1 0 ID=KanNP_rdsDRAFT_3014572339.1;locus_tag=KanNP_rdsDRAFT_30145723391; 4 | KanNP_rdsDRAFT_3099557266 FP CDS 1 84 . -1 0 ID=KanNP_rdsDRAFT_3099557266.1;locus_tag=KanNP_rdsDRAFT_30995572661; 5 | KanNP_rdsDRAFT_3065939465 F CDS 1 149 . 1 0 ID=KanNP_rdsDRAFT_3065939465.1;locus_tag=KanNP_rdsDRAFT_30659394651; 6 | KanNP_rdsDRAFT_3065939465 F exon 1 71 . 1 . ID=KanNP_rdsDRAFT_3065939465.1.1; Parent=KanNP_rdsDRAFT_3065939465.1 7 | KanNP_rdsDRAFT_3065939465 F exon 74 149 . 1 . ID=KanNP_rdsDRAFT_3065939465.1.2; Parent=KanNP_rdsDRAFT_3065939465.1 8 | KanNP_rdsDRAFT_3043564758 MP CDS 2 151 . -1 0 ID=KanNP_rdsDRAFT_3043564758.1;locus_tag=KanNP_rdsDRAFT_30435647581; 9 | KanNP_rdsDRAFT_3070203790 FGMP CDS 2 91 . -1 0 ID=KanNP_rdsDRAFT_3070203790.1;locus_tag=KanNP_rdsDRAFT_30702037901; 10 | KanNP_rdsDRAFT_3092558462 FGMP CDS 3 149 . -1 0 ID=KanNP_rdsDRAFT_3092558462.1;locus_tag=KanNP_rdsDRAFT_30925584621; 11 | -------------------------------------------------------------------------------- /exon/include/exon/core/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace exon 24 | { 25 | class ExonDbFunctions 26 | { 27 | public: 28 | static duckdb::CreateScalarFunctionInfo GetExonDbVersionFunction(); 29 | static duckdb::CreateTableFunctionInfo GetThirdPartyAcknowledgementTable(); 30 | }; 31 | }; -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/sam/comment1.sam: -------------------------------------------------------------------------------- 1 | @CO Validation of AUX 2 | @CO Type A 3 | z1 4 * 0 0 * * 0 0 CAT QQQ Z0:Z:Simple string Z1:Z:1 Z2:Z:22 Z3:Z:333 4 | z2 4 * 0 0 * * 0 0 CAT QQQ ZZ:Z: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ 5 | z3 4 * 0 0 * * 0 0 CAT QQQ Z0:Z: Z1:Z:empty-test Z2:Z: 6 | z4 4 * 0 0 * * 0 0 CAT QQQ Z0:Z:x Z1:Z:long################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################long Z2:Z:x 7 | -------------------------------------------------------------------------------- /exon/include/exon/fastq_functions/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace exon 24 | { 25 | 26 | class FastqFunctions 27 | { 28 | public: 29 | static duckdb::unique_ptr GetFastqTableFunction(); 30 | static duckdb::unique_ptr GetFastqCopyFunction(); 31 | static duckdb::unique_ptr GetQualityScoreStringToList(); 32 | }; 33 | 34 | } // namespace exondb -------------------------------------------------------------------------------- /exon/include/exon/sam_functions/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace exon 24 | { 25 | 26 | class SamFunctions 27 | { 28 | public: 29 | static duckdb::unique_ptr GetParseCIGARStringFunction(); 30 | static duckdb::unique_ptr GetExtractFromCIGARFunction(); 31 | static std::vector> GetSamFunctions(); 32 | }; 33 | 34 | } // namespace wtt01 -------------------------------------------------------------------------------- /exon/src/exon/core/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "exon/core/module.hpp" 20 | #include "rust.hpp" 21 | 22 | namespace exon 23 | { 24 | 25 | const std::string EXON_VERSION = "0.3.9"; 26 | 27 | duckdb::CreateScalarFunctionInfo ExonDbFunctions::GetExonDbVersionFunction() 28 | { 29 | duckdb::ScalarFunctionSet set("exondb_version"); 30 | 31 | auto duckdb_function = [](duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result) 32 | { 33 | result.SetValue(0, duckdb::Value(EXON_VERSION)); 34 | }; 35 | 36 | set.AddFunction(duckdb::ScalarFunction({}, duckdb::LogicalType::VARCHAR, duckdb_function)); 37 | 38 | return duckdb::CreateScalarFunctionInfo(std::move(set)); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /.github/workflows/MacOS.yml: -------------------------------------------------------------------------------- 1 | name: MacOS 2 | on: [workflow_dispatch] 3 | 4 | concurrency: 5 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 6 | cancel-in-progress: true 7 | 8 | defaults: 9 | run: 10 | shell: bash 11 | 12 | jobs: 13 | macos: 14 | name: MacOS Release (Universal) 15 | runs-on: macos-latest 16 | strategy: 17 | matrix: 18 | # Add commits/tags to build against other DuckDB versions 19 | duckdb_version: ["v0.8.1"] 20 | 21 | env: 22 | OSX_BUILD_UNIVERSAL: 0 23 | GEN: ninja 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | with: 28 | fetch-depth: 0 29 | submodules: "true" 30 | 31 | - name: Install Ninja 32 | run: brew install ninja 33 | 34 | - uses: actions/setup-python@v2 35 | with: 36 | python-version: "3.7" 37 | 38 | - name: Checkout DuckDB to version 39 | if: ${{ matrix.duckdb_version != ''}} 40 | run: | 41 | cd duckdb 42 | git checkout ${{ matrix.duckdb_version }} 43 | 44 | # Build extension 45 | - name: Build extension 46 | shell: bash 47 | run: | 48 | make release 49 | make test 50 | 51 | - name: Upload extension 52 | uses: actions/upload-artifact@v2 53 | with: 54 | name: duckdb-extension 55 | path: build/release/extension/exon/exon.duckdb_extension 56 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_genbank_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | query III 5 | SELECT sequence[:5] AS seq, accession, topology FROM read_genbank('./test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb') LIMIT 1; 6 | ---- 7 | gttaa 8 | NC_021938 9 | circular 10 | 11 | 12 | query I 13 | SELECT COUNT(*) FROM read_genbank('./test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb'); 14 | ---- 15 | 2 16 | 17 | query I 18 | SELECT COUNT(*) FROM read_genbank('./test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.zst'); 19 | ---- 20 | 2 21 | 22 | query I 23 | SELECT COUNT(*) FROM read_genbank('./test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.gzip', compression='gzip'); 24 | ---- 25 | 2 26 | 27 | query I 28 | SELECT COUNT(*) FROM read_genbank('./test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb.gz'); 29 | ---- 30 | 2 31 | 32 | # query I 33 | # SELECT COUNT(*) FROM './test/sql/exondb-release-with-deb-info/gb/Alouatta_caraya_NC_021938.1.gb'; 34 | # ---- 35 | # 2 36 | 37 | query I 38 | SELECT COUNT(*) 
FROM read_genbank('./test/sql/exondb-release-with-deb-info/gbk/'); 39 | ---- 40 | 3 41 | 42 | # query II 43 | # SELECT accession, feature['kind'] FROM (SELECT accession, UNNEST(features) AS feature FROM read_genbank('./test/sql/exondb-release-with-deb-info/gb/BGC0002747.gbk')) WHERE feature.kind = 'CDS' LIMIT 1; 44 | # ---- 45 | # BGC0002747 46 | # CDS 47 | -------------------------------------------------------------------------------- /exon/include/exon/sequence_functions/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | 20 | namespace exon 21 | { 22 | class SequenceFunctions 23 | { 24 | public: 25 | static duckdb::CreateScalarFunctionInfo GetReverseComplementFunction(); 26 | static duckdb::CreateScalarFunctionInfo GetComplementFunction(); 27 | static duckdb::CreateScalarFunctionInfo GetGCContentFunction(); 28 | static duckdb::CreateScalarFunctionInfo GetReverseTranscribeRnaToDnaFunction(); 29 | static duckdb::CreateScalarFunctionInfo GetTranslateDnaToAminoAcidFunction(); 30 | 31 | static duckdb::CreateScalarFunctionInfo GetAlignFunction(); 32 | 33 | static duckdb::CreateScalarFunctionInfo GetTranscribeDnaToRnaFunction(); 34 | 35 | static std::vector GetSequenceFunctions(); 36 | }; 37 | } -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_gff_copy.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test writing to a GFF file 5 | # query I 6 | # COPY (SELECT * FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff')) TO './test/sql/tmp/test.gff' (FORMAT 'gff'); 7 | # ---- 8 | # 2 9 | 10 | # Test reading that file returns the proper structure 11 | # query IIIIIIIII 12 | # SELECT * FROM read_gff('./test/sql/tmp/test.gff'); 13 | # ---- 14 | # sq0 15 | # caat 16 | # gene 17 | # 8 18 | # 13 19 | # NULL 20 | # + 21 | # NULL 22 | # gene_id=caat1;gene_name=gene0; 23 | # sq1 24 | # caat 25 | # gene 26 | # 8 27 | # 14 28 | # 0.1 29 | # + 30 | # 0 31 | # gene_id=caat2;gene_name=gene0; 32 | 33 | # Test writing to a GFF file, gzipped 34 | # query I 35 | # COPY (SELECT * FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff')) TO './test/sql/tmp/test.gff.gz' (FORMAT 'gff'); 36 | # ---- 37 | # 2 38 | 39 | # Test writing to a GFF file, error because it exists 40 | # statement error 41 | # COPY (SELECT * FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff')) TO './test/sql/tmp/test.gff.gz' (FORMAT 'gff'); 42 | 43 | # Now try again, but set force true 44 | # query I 45 | # COPY (SELECT * FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff')) TO './test/sql/tmp/test.gff.gz' (FORMAT 'gff', FORCE true); 46 | # ---- 47 | # 2 48 | 
49 | # Test writing to a GFF file, zstd 50 | # query I 51 | # COPY (SELECT * FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff')) TO './test/sql/tmp/test.gff.zst' (FORMAT 'gff'); 52 | # ---- 53 | # 2 54 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_vcf_record_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | query I 5 | SELECT COUNT(*) FROM read_vcf_file_records('./test/sql/exondb-release-with-deb-info/vcf/index.vcf'); 6 | ---- 7 | 621 8 | 9 | 10 | query IIIIIII 11 | SELECT chrom, pos, ref, alt, qual, info.indel, info.dp FROM read_vcf_file_records('./test/sql/exondb-release-with-deb-info/vcf/index.vcf') LIMIT 1; 12 | ---- 13 | 1 14 | 9999919 15 | G 16 | [<*>] 17 | 0.0 18 | NULL 19 | 1 20 | 21 | query IIIIIII 22 | SELECT chrom, pos, ref, alt, qual, info.indel, info.dp FROM read_bcf_file_records('./test/sql/exondb-release-with-deb-info/vcf/index.bcf') LIMIT 1; 23 | ---- 24 | 1 25 | 9999919 26 | G 27 | [<*>] 28 | 0.0 29 | NULL 30 | 1 31 | 32 | query IIIIIII 33 | SELECT chrom, pos, ref, alt, qual, info.indel, info.dp FROM read_vcf_file_records('./test/sql/exondb-release-with-deb-info/vcf/index.vcf.gz') LIMIT 1; 34 | ---- 35 | 1 36 | 9999919 37 | G 38 | [<*>] 39 | 0.0 40 | NULL 41 | 1 42 | 43 | query I 44 | SELECT COUNT(*) FROM vcf_query('./test/sql/exondb-release-with-deb-info/vcf-index/index.vcf.gz', '1'); 45 | ---- 46 | 191 47 | 48 | query IIIIIII 49 | SELECT chrom, pos, ref, alt, qual, info.indel, info.dp FROM vcf_query('./test/sql/exondb-release-with-deb-info/vcf-index/index.vcf.gz', '1') LIMIT 1; 50 | ---- 51 | 1 52 | 9999919 53 | G 54 | [<*>] 55 | 0.0 56 | NULL 57 | 1 58 | 59 | query I 60 | SELECT COUNT(*) FROM bcf_query('./test/sql/exondb-release-with-deb-info/bcf-index/index.bcf', '1'); 61 | ---- 62 | 191 63 | 64 | query IIIIIII 65 | SELECT chrom, pos, ref, alt, qual, info.indel, info.dp FROM bcf_query('./test/sql/exondb-release-with-deb-info/bcf-index/index.bcf', '1') LIMIT 1; 66 | ---- 67 | 1 68 | 9999919 69 | G 70 | [<*>] 71 | 0.0 72 | NULL 73 | 1 74 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_fasta_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test counting from a FASTA file 5 | query I 6 | SELECT count(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta'); 7 | ---- 8 | 2 9 | 10 | # Test counting from a FASTA file, gzip compression option 11 | query I 12 | SELECT count(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta.gzip', compression='gzip'); 13 | ---- 14 | 2 15 | 16 | # Test counting from a FASTA file, gzip compression auto_detect 17 | query I 18 | SELECT count(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta.gz'); 19 | ---- 20 | 2 21 | 22 | # Test counting from a FASTA file, zstd compression option 23 | query I 24 | SELECT count(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta.zstd', compression='zstd'); 25 | ---- 26 | 2 27 | 28 | # Test counting from a FASTA file 29 | query I 30 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.fasta'; 31 | ---- 32 | 2 33 | 34 | query I 35 | SELECT count(*) FROM 
'./test/sql/exondb-release-with-deb-info/test.fasta' WHERE id = 'a'; 36 | ---- 37 | 1 38 | 39 | # Test counting from a FASTA file, gzip compression auto_detect 40 | query I 41 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.fasta.gz'; 42 | ---- 43 | 2 44 | 45 | # Test counting from a FASTA file, zstd compression auto_detect 46 | query I 47 | SELECT count(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta.zst'); 48 | ---- 49 | 2 50 | 51 | # Test that an error is thrown if the file does not exist 52 | statement error 53 | SELECT count(*) FROM read_fasta(''); 54 | 55 | # Test glob 56 | query I 57 | SELECT COUNT(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/fasta/', compression='gzip'); 58 | ---- 59 | 4 60 | 61 | # Test glob 62 | # query I 63 | # SELECT COUNT(*) FROM read_fasta('./test/sql/exondb-release-with-deb-info/*.fasta.gz'); 64 | # ---- 65 | # 2 66 | -------------------------------------------------------------------------------- /.github/workflows/TestInsalls.yml: -------------------------------------------------------------------------------- 1 | name: Test Installs 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | 8 | permissions: 9 | contents: read 10 | id-token: write 11 | 12 | jobs: 13 | test-cli: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | steps: 19 | - name: Install DuckDB on Mac 20 | if: matrix.os == 'macos-latest' 21 | run: | 22 | brew install unzip 23 | wget -O duckdb.zip https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-osx-universal.zip 24 | unzip duckdb.zip -d duckdb 25 | sudo mv duckdb/duckdb /usr/local/bin 26 | - name: Install DuckDB on Linux 27 | if: matrix.os == 'ubuntu-latest' 28 | run: | 29 | sudo apt-get install wget unzip 30 | wget -O duckdb.zip https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-linux-amd64.zip 31 | unzip duckdb.zip -d duckdb 32 | sudo mv duckdb/duckdb /usr/local/bin 33 | - name: Install DuckDB on Windows 34 | if: matrix.os == 'windows-latest' 35 | run: | 36 | choco install wget unzip 37 | wget -O duckdb.zip https://github.com/duckdb/duckdb/releases/download/v0.8.1/duckdb_cli-windows-amd64.zip 38 | unzip duckdb.zip 39 | - name: Create Script 40 | shell: bash 41 | run: | 42 | echo "SET custom_extension_repository='dbe.wheretrue.com/exon/latest';" > test.sql 43 | echo "INSTALL exon;" >> test.sql 44 | echo "LOAD exon;" >> test.sql 45 | echo "SELECT gc_content('ATCG');" >> test.sql 46 | cat test.sql 47 | - name: Run DuckDB (Unix) 48 | shell: bash 49 | if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' 50 | run: | 51 | duckdb -unsigned < test.sql 52 | - name: Run DuckDB (Windows) 53 | shell: bash 54 | if: matrix.os == 'windows-latest' 55 | run: | 56 | ./duckdb.exe -unsigned < test.sql 57 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_fastq_copy.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test writing to a FASTQ file 5 | # query I 6 | # COPY (SELECT * FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq')) TO './test/sql/tmp/test.fastq' (FORMAT 'fastq'); 7 | # ---- 8 | # 2 9 | 10 | # Test writing to a FASTQ file, gzipped 11 | # query I 12 | # COPY (FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq')) TO 
'./test/sql/tmp/test.fastq.gz' (FORMAT 'fastq'); 13 | # ---- 14 | # 2 15 | 16 | # Test writing to a FASTQ file, zstd 17 | # query I 18 | # COPY (SELECT * FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq')) TO './test/sql/tmp/test.fastq.zst' (FORMAT 'fastq'); 19 | # ---- 20 | # 2 21 | 22 | # Test writing to a FASTQ file, gzipped 23 | # query I 24 | # COPY (SELECT * FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq')) TO './test/sql/tmp/test.fastq.gzip' (FORMAT 'fastq', COMPRESSION 'gzip'); 25 | # ---- 26 | # 2 27 | 28 | # Test writing to a FASTQ gzipped fastq file to stdout 29 | # query I 30 | # COPY (FROM './test/sql/exondb-release-with-deb-info/test.fastq' LIMIT 1) TO STDOUT (FORMAT 'fastq'); 31 | # ---- 32 | # 1 33 | 34 | # Test writing to a FASTQ file, force its creation 35 | # query I 36 | # COPY (SELECT * FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq')) TO './test/sql/tmp/test.fastq.gzip' (FORMAT 'fastq', COMPRESSION 'gzip', FORCE true); 37 | # ---- 38 | # 2 39 | 40 | # Test we can read back out the FASTQ file 41 | # query I 42 | # SELECT COUNT(*) FROM read_fastq('./test/sql/tmp/test.fastq.gzip', compression='gzip'); 43 | # ---- 44 | # 2 45 | 46 | # Test writing to a FASTQ file, zstd 47 | # query I 48 | # COPY (SELECT * FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq')) TO './test/sql/tmp/test.fastq.zstd' (FORMAT 'fastq', COMPRESSION 'zstd'); 49 | # ---- 50 | # 2 51 | 52 | # Test we can read back out the FASTQ file 53 | # query I 54 | # SELECT COUNT(*) FROM read_fastq('./test/sql/tmp/test.fastq.zstd', compression='zstd'); 55 | # ---- 56 | # 2 57 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_fastq_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test counting from a FASTQ file 5 | query I 6 | SELECT count(*) FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq'); 7 | ---- 8 | 2 9 | 10 | # Test counting from a FASTQ gzipped file, autodetect extension 11 | query I 12 | SELECT count(*) FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq.gz'); 13 | ---- 14 | 2 15 | 16 | # Test counting from a FASTQ gzipped file, with option 17 | query I 18 | SELECT count(*) FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq.gzip', compression='gzip'); 19 | ---- 20 | 2 21 | 22 | # Test counting from a FASTQ zstd file, autodetect extension 23 | query I 24 | SELECT count(*) FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq.zst'); 25 | ---- 26 | 2 27 | 28 | # Test counting from a FASTQ zstd file, with option 29 | query I 30 | SELECT count(*) FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq.zstd', compression='zstd'); 31 | ---- 32 | 2 33 | 34 | # Test table structure 35 | query IIII 36 | SELECT * FROM read_fastq('./test/sql/exondb-release-with-deb-info/test.fastq') LIMIT 1; 37 | ---- 38 | SEQ_ID 39 | This is a description 40 | GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT 41 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 42 | 43 | # Test counting from a FASTQ file 44 | query I 45 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.fastq'; 46 | ---- 47 | 2 48 | 49 | # Test counting from a FASTQ gzipped file, autodetect extension 50 | query I 51 | SELECT count(*) FROM 
'./test/sql/exondb-release-with-deb-info/test.fastq.gz'; 52 | ---- 53 | 2 54 | 55 | # Test counting from a FASTQ zstd file, autodetect extension 56 | query I 57 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.fastq.zst'; 58 | ---- 59 | 2 60 | 61 | statement error 62 | SELECT count(*) FROM read_fastq(''); 63 | 64 | # Test list functionality 65 | query I 66 | SELECT COUNT(*) FROM read_fastq('./test/sql/exondb-release-with-deb-info/fastq/') LIMIT 1; 67 | ---- 68 | 4 69 | -------------------------------------------------------------------------------- /exon/src/exon/fastq_functions/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "exon/fastq_functions/module.hpp" 23 | #include "rust.hpp" 24 | 25 | namespace exon 26 | { 27 | 28 | duckdb::unique_ptr FastqFunctions::GetQualityScoreStringToList() 29 | { 30 | duckdb::ScalarFunctionSet set("quality_score_string_to_list"); 31 | 32 | auto duckdb_function = [](duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result) 33 | { 34 | result.SetVectorType(duckdb::VectorType::FLAT_VECTOR); 35 | 36 | for (duckdb::idx_t i = 0; i < args.size(); i++) 37 | { 38 | auto value = args.data[0].GetValue(i); 39 | auto string_value = duckdb::StringValue::Get(value); 40 | 41 | duckdb::vector quality_scores; 42 | 43 | for (auto c : string_value) 44 | { 45 | quality_scores.push_back(duckdb::Value::INTEGER(c - 33)); 46 | } 47 | 48 | result.SetValue(i, duckdb::Value::LIST(quality_scores)); 49 | } 50 | }; 51 | 52 | set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::LIST(duckdb::LogicalType::INTEGER), duckdb_function)); 53 | return duckdb::make_uniq(set); 54 | } 55 | } -------------------------------------------------------------------------------- /.github/workflows/Windows.yml: -------------------------------------------------------------------------------- 1 | name: Windows 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | tags: 7 | - "v*" 8 | branches: 9 | - main 10 | pull_request: 11 | 12 | 13 | permissions: 14 | contents: read 15 | id-token: write 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 19 | cancel-in-progress: true 20 | defaults: 21 | run: 22 | shell: bash 23 | 24 | jobs: 25 | windows: 26 | name: Release 27 | runs-on: windows-latest 28 | 29 | strategy: 30 | matrix: 31 | # Add commits/tags to build against other DuckDB versions 32 | duckdb_version: ["v0.8.1"] 33 | 34 | steps: 35 | - uses: actions/checkout@v3 36 | with: 37 | fetch-depth: 0 38 | submodules: "true" 39 | 40 | - name: Install OpenSSL 41 | shell: bash 42 | run: | 43 | choco feature disable 
-n=showDownloadProgress 44 | choco install openssl -y --force --params "/logLevel=quiet" 45 | 46 | - name: Install latest nightly 47 | uses: actions-rs/toolchain@v1 48 | with: 49 | toolchain: stable 50 | 51 | - uses: actions/setup-python@v2 52 | with: 53 | python-version: "3.7" 54 | 55 | - name: Checkout DuckDB to version 56 | # Add commits/tags to build against other DuckDB versions 57 | if: ${{ matrix.duckdb_version != ''}} 58 | run: | 59 | cd duckdb 60 | git checkout ${{ matrix.duckdb_version }} 61 | 62 | - name: Build extension 63 | run: | 64 | make release_windows 65 | 66 | - name: Test 67 | run: | 68 | make test_windows 69 | 70 | - uses: actions/upload-artifact@v2 71 | with: 72 | name: windows-extension 73 | path: | 74 | build/release/extension/exon/exon.duckdb_extension 75 | 76 | - name: Configure AWS Credentials 77 | uses: aws-actions/configure-aws-credentials@v1 78 | with: 79 | aws-region: us-west-2 80 | role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/github-deploy-role 81 | 82 | - name: install boto3 83 | shell: bash 84 | run: | 85 | pip3 install boto3 86 | 87 | - name: upload to s3 88 | shell: bash 89 | run: | 90 | python3 bin/upload-artifacts.py 91 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: release 2 | 3 | MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) 4 | PROJ_DIR := $(dir $(MKFILE_PATH)) 5 | 6 | OSX_BUILD_UNIVERSAL_FLAG= 7 | ifeq (${OSX_BUILD_UNIVERSAL}, 1) 8 | OSX_BUILD_UNIVERSAL_FLAG=-DOSX_BUILD_UNIVERSAL=1 9 | endif 10 | ifeq (${STATIC_LIBCPP}, 1) 11 | STATIC_LIBCPP=-DSTATIC_LIBCPP=TRUE 12 | endif 13 | 14 | ifeq ($(GEN),ninja) 15 | GENERATOR=-G "Ninja" 16 | FORCE_COLOR=-DFORCE_COLORED_OUTPUT=1 17 | endif 18 | 19 | BUILD_FLAGS=-DEXTENSION_STATIC_BUILD=1 ${OSX_BUILD_UNIVERSAL_FLAG} ${STATIC_LIBCPP} 20 | ifeq (${BUILD_SHELL}, 0) 21 | BUILD_FLAGS += -DBUILD_SHELL=0 22 | endif 23 | 24 | CLIENT_FLAGS := 25 | 26 | # These flags will make DuckDB build the extension 27 | EXTENSION_FLAGS=-DENABLE_SANITIZER=OFF -DDUCKDB_OOT_EXTENSION_NAMES="exon" -DDUCKDB_OOT_EXTENSION_EXON_PATH="$(PROJ_DIR)" -DDUCKDB_OOT_EXTENSION_EXON_SHOULD_LINK="TRUE" -DDUCKDB_OOT_EXTENSION_EXON_INCLUDE_PATH="$(PROJ_DIR)exon/include" 28 | 29 | release: 30 | mkdir -p build/release && \ 31 | cmake $(GENERATOR) $(FORCE_COLOR) $(EXTENSION_FLAGS) ${CLIENT_FLAGS} -DEXTENSION_STATIC_BUILD=1 -DCMAKE_BUILD_TYPE=Release ${BUILD_FLAGS} -S ./duckdb/ -B build/release && \ 32 | cmake --build build/release --config Release -j 8 --target 'cargo-build_rust' && \ 33 | cmake --build build/release --config Release -j 8 --target wfa2cpp && \ 34 | cmake --build build/release --config Release -j 8 --target wfa2cpp_static && \ 35 | cmake --build build/release --config Release 36 | 37 | release_windows: 38 | mkdir -p build/release && \ 39 | cmake $(GENERATOR) $(FORCE_COLOR) $(EXTENSION_FLAGS) ${CLIENT_FLAGS} -DEXTENSION_STATIC_BUILD=1 -DCMAKE_BUILD_TYPE=Release ${BUILD_FLAGS} -S ./duckdb/ -B build/release && \ 40 | cmake --build build/release --config Release -j 8 --target 'cargo-build_rust' && \ 41 | cmake --build build/release --config Release 42 | 43 | test: release 44 | mkdir -p ./test/sql/tmp/ && \ 45 | rm -rf ./test/sql/tmp/* && \ 46 | ./build/release/test/unittest --test-dir . 
"[exondb-release-with-deb-info]" && \ 47 | rm -rf ./test/sql/tmp 48 | 49 | test_windows: release_windows 50 | mkdir -p ./test/sql/tmp/ && \ 51 | rm -rf ./test/sql/tmp/* && \ 52 | ./build/release/test/Release/unittest.exe --test-dir . "[exondb-release-with-deb-info]" && \ 53 | rm -rf ./test/sql/tmp 54 | 55 | test_align: 56 | ./build/release/test/unittest --test-dir . "[exondb-align]" 57 | 58 | extension_release: 59 | python bin/upload-artifacts.py 60 | 61 | pull: 62 | git submodule init 63 | git submodule update --recursive --remote 64 | -------------------------------------------------------------------------------- /exon/include/rust.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | struct ReaderResult { 8 | const char *error; 9 | }; 10 | 11 | struct ReplacementScanResult { 12 | const char *file_type; 13 | }; 14 | 15 | struct BAMReaderResult { 16 | const char *error; 17 | }; 18 | 19 | struct BCFReaderResult { 20 | const char *error; 21 | }; 22 | 23 | struct VCFReaderResult { 24 | const char *error; 25 | }; 26 | 27 | struct CResult { 28 | const char *value; 29 | const char *error; 30 | }; 31 | 32 | struct CExtractResponse { 33 | uintptr_t sequence_start; 34 | uintptr_t sequence_len; 35 | const char *extracted_sequence; 36 | const char *error; 37 | }; 38 | 39 | extern "C" { 40 | 41 | ReaderResult new_reader(ArrowArrayStream *stream_ptr, 42 | const char *uri, 43 | uintptr_t batch_size, 44 | const char *compression, 45 | const char *file_format, 46 | const char *filters); 47 | 48 | ReplacementScanResult replacement_scan(const char *uri); 49 | 50 | BAMReaderResult bam_query_reader(ArrowArrayStream *stream_ptr, 51 | const char *uri, 52 | const char *query, 53 | uintptr_t batch_size); 54 | 55 | BCFReaderResult bcf_query_reader(ArrowArrayStream *stream_ptr, 56 | const char *uri, 57 | const char *query, 58 | uintptr_t batch_size); 59 | 60 | VCFReaderResult vcf_query_reader(ArrowArrayStream *stream_ptr, 61 | const char *uri, 62 | const char *query, 63 | uintptr_t batch_size); 64 | 65 | bool is_segmented(uint16_t flag); 66 | 67 | bool is_unmapped(uint16_t flag); 68 | 69 | bool is_properly_aligned(uint16_t flag); 70 | 71 | bool is_mate_unmapped(uint16_t flag); 72 | 73 | bool is_reverse_complemented(uint16_t flag); 74 | 75 | bool is_mate_reverse_complemented(uint16_t flag); 76 | 77 | bool is_first_segment(uint16_t flag); 78 | 79 | bool is_last_segment(uint16_t flag); 80 | 81 | bool is_secondary(uint16_t flag); 82 | 83 | bool is_quality_control_failed(uint16_t flag); 84 | 85 | bool is_duplicate(uint16_t flag); 86 | 87 | bool is_supplementary(uint16_t flag); 88 | 89 | CResult parse_cigar(const char *cigar); 90 | 91 | CExtractResponse extract_from_cigar(const char *sequence_str, const char *cigar_str); 92 | 93 | } // extern "C" 94 | -------------------------------------------------------------------------------- /exon/include/exon/bam_query_function/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "duckdb/function/table/arrow.hpp" 23 | 24 | using namespace duckdb; 25 | 26 | namespace exon 27 | { 28 | 29 | struct BAMQueryTableScanInfo : public TableFunctionInfo 30 | { 31 | }; 32 | 33 | struct BAMQueryTableFunction : duckdb::ArrowTableFunction 34 | { 35 | private: 36 | static duckdb::unique_ptr TableBind(ClientContext &context, TableFunctionBindInput &input, 37 | vector &return_types, vector &names); 38 | 39 | static duckdb::unique_ptr InitGlobal(duckdb::ClientContext &context, 40 | duckdb::TableFunctionInitInput &input); 41 | 42 | static void Scan(duckdb::ClientContext &context, duckdb::TableFunctionInput &input, duckdb::DataChunk &output); 43 | 44 | static unique_ptr BAMQueryScanInitLocalInternal(ClientContext &context, 45 | TableFunctionInitInput &input, 46 | GlobalTableFunctionState *global_state); 47 | 48 | static unique_ptr BAMQueryScanInitLocal(ExecutionContext &context, 49 | TableFunctionInitInput &input, 50 | GlobalTableFunctionState *global_state); 51 | 52 | public: 53 | static void Register(duckdb::ClientContext &context); 54 | }; 55 | } 56 | -------------------------------------------------------------------------------- /exon/include/exon/bcf_query_function/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
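// Illustrative usage (a sketch based on the test suite, not a definitive reference): the
// table function registered in this header backs region queries such as
//   SELECT * FROM bcf_query('path/to/index.bcf', '1');
// where the second argument is the indexed region to scan from the .csi-indexed BCF file.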
14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "duckdb/function/table/arrow.hpp" 23 | 24 | using namespace duckdb; 25 | 26 | namespace exon 27 | { 28 | 29 | struct BCFQueryTableScanInfo : public TableFunctionInfo 30 | { 31 | }; 32 | 33 | struct BCFQueryTableFunction : duckdb::ArrowTableFunction 34 | { 35 | private: 36 | static duckdb::unique_ptr TableBind(ClientContext &context, TableFunctionBindInput &input, 37 | vector &return_types, vector &names); 38 | 39 | static duckdb::unique_ptr InitGlobal(duckdb::ClientContext &context, 40 | duckdb::TableFunctionInitInput &input); 41 | 42 | static void Scan(duckdb::ClientContext &context, duckdb::TableFunctionInput &input, duckdb::DataChunk &output); 43 | 44 | static unique_ptr BCFQueryScanInitLocalInternal(ClientContext &context, 45 | TableFunctionInitInput &input, 46 | GlobalTableFunctionState *global_state); 47 | 48 | static unique_ptr BCFQueryScanInitLocal(ExecutionContext &context, 49 | TableFunctionInitInput &input, 50 | GlobalTableFunctionState *global_state); 51 | 52 | public: 53 | static void Register(duckdb::ClientContext &context); 54 | }; 55 | } 56 | -------------------------------------------------------------------------------- /exon/include/exon/vcf_query_function/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "duckdb/function/table/arrow.hpp" 23 | 24 | using namespace duckdb; 25 | 26 | namespace exon 27 | { 28 | 29 | struct VCFQueryTableScanInfo : public TableFunctionInfo 30 | { 31 | }; 32 | 33 | struct VCFQueryTableFunction : duckdb::ArrowTableFunction 34 | { 35 | private: 36 | static duckdb::unique_ptr TableBind(ClientContext &context, TableFunctionBindInput &input, 37 | vector &return_types, vector &names); 38 | 39 | static duckdb::unique_ptr InitGlobal(duckdb::ClientContext &context, 40 | duckdb::TableFunctionInitInput &input); 41 | 42 | static void Scan(duckdb::ClientContext &context, duckdb::TableFunctionInput &input, duckdb::DataChunk &output); 43 | 44 | static unique_ptr VCFQueryScanInitLocalInternal(ClientContext &context, 45 | TableFunctionInitInput &input, 46 | GlobalTableFunctionState *global_state); 47 | 48 | static unique_ptr VCFQueryScanInitLocal(ExecutionContext &context, 49 | TableFunctionInitInput &input, 50 | GlobalTableFunctionState *global_state); 51 | 52 | public: 53 | static void Register(duckdb::ClientContext &context); 54 | }; 55 | } 56 | -------------------------------------------------------------------------------- /exon/include/exon/arrow_table_function/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "duckdb/function/table/arrow.hpp" 23 | 24 | using namespace duckdb; 25 | 26 | namespace exon 27 | { 28 | 29 | struct WTArrowTableScanInfo : public TableFunctionInfo 30 | { 31 | public: 32 | WTArrowTableScanInfo(std::string file_type_p) : file_type(file_type_p) {} 33 | 34 | std::string file_type; 35 | }; 36 | 37 | struct WTArrowTableFunction : duckdb::ArrowTableFunction 38 | { 39 | private: 40 | static duckdb::unique_ptr FileTypeBind(ClientContext &context, TableFunctionBindInput &input, 41 | vector &return_types, vector &names); 42 | 43 | static duckdb::unique_ptr InitGlobal(duckdb::ClientContext &context, 44 | duckdb::TableFunctionInitInput &input); 45 | 46 | static void Scan(duckdb::ClientContext &context, duckdb::TableFunctionInput &input, duckdb::DataChunk &output); 47 | 48 | static unique_ptr ArrowScanInitLocalInternal(ClientContext &context, 49 | TableFunctionInitInput &input, 50 | GlobalTableFunctionState *global_state); 51 | 52 | static unique_ptr ArrowScanInitLocal(ExecutionContext &context, 53 | TableFunctionInitInput &input, 54 | GlobalTableFunctionState *global_state); 55 | 56 | public: 57 | static void Register(std::string name, std::string file_type, duckdb::ClientContext &context); 58 | static unique_ptr ReplacementScan(ClientContext &context, const string &table_name, 59 | ReplacementScanData *data); 60 | }; 61 | } 62 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_fasta_copy.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test writing to a FASTA file 5 | # query I 6 | # COPY (SELECT * FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta' (FORMAT 'fasta'); 7 | # ---- 8 | # 2 9 | 10 | # Test that we can re-read what we write_to_file 11 | # query I 12 | # SELECT COUNT(*) FROM read_fasta('./test/sql/tmp/test.fasta'); 13 | # ---- 14 | # 2 15 | 16 | # Test writing to a FASTA file in gzip format 17 | # query I 18 | # COPY (SELECT * FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta.gz' (FORMAT 'fasta'); 19 | # ---- 20 | # 2 21 | 22 | # Test that we can re-read what we write_to_file 23 | # query I 24 | # SELECT COUNT(*) FROM read_fasta('./test/sql/tmp/test.fasta.gz'); 25 | # ---- 26 | # 2 27 | 28 | # Test writing to a FASTA file in zstd format 29 | # query I 30 | # COPY (SELECT * FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta.zst' (FORMAT 'fasta'); 31 | # ---- 32 | # 2 33 | 34 | # Test writing to a FASTA file in zstd format, force its creation 35 | # query I 36 | # COPY (SELECT * FROM 
read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta.zst' (FORMAT 'fasta', FORCE true); 37 | # ---- 38 | # 2 39 | 40 | # Now don't force it, and expect an error 41 | # statement error 42 | # COPY (SELECT * FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta.zst' (FORMAT 'fasta'); 43 | 44 | # Test that we can re-read what we write_to_file 45 | # query I 46 | # SELECT COUNT(*) FROM read_fasta('./test/sql/tmp/test.fasta.zst'); 47 | # ---- 48 | # 2 49 | 50 | # Test writing to a FASTA file in gzip format 51 | # query I 52 | # COPY (SELECT * FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta.gzip' (FORMAT 'fasta', COMPRESSION 'gzip'); 53 | # ---- 54 | # 2 55 | 56 | # Test reading back from that file 57 | # query I 58 | # SELECT COUNT(*) FROM read_fasta('./test/sql/tmp/test.fasta.gzip', compression='gzip'); 59 | # ---- 60 | # 2 61 | 62 | # Test writing to a FASTA file in zstd format 63 | # query I 64 | # COPY (SELECT * FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.fasta')) TO './test/sql/tmp/test.fasta.zstd' (FORMAT 'fasta', COMPRESSION 'zstd'); 65 | # ---- 66 | # 2 67 | 68 | # Test round trip for mixed description null. 69 | # query I 70 | # COPY (FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.mixed-desc.fasta')) TO './test/sql/tmp/test.mixed-desc.fasta' (FORMAT 'fasta'); 71 | # ---- 72 | # 2 73 | 74 | # Test the output of the prior job can be read back correctly 75 | # query III 76 | # FROM read_fasta('./test/sql/tmp/test.mixed-desc.fasta') WHERE description IS NULL; 77 | # ---- 78 | # b 79 | # NULL 80 | # ATCG 81 | 82 | # query I 83 | # COPY (FROM read_fasta('./test/sql/exondb-release-with-deb-info/test.mixed-desc.fasta') LIMIT 1) TO STDOUT (FORMAT 'fasta'); 84 | # ---- 85 | # 1 86 | -------------------------------------------------------------------------------- /exon/include/exon/alignment_functions/module.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
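// Descriptive note, based on the constructors below: the default bind data builds a WFA2
// gap-affine aligner with mismatch = 4, gap opening = 6, gap extension = 2 and the
// high-memory model; the overloads additionally accept an explicit match score and memory
// model. The module is compiled only when WFA2_ENABLED is defined (see CMakeLists.txt).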
14 | 15 | #ifdef WFA2_ENABLED 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include "bindings/cpp/WFAligner.hpp" 23 | 24 | namespace exondb 25 | { 26 | class AlignmentFunctions 27 | { 28 | public: 29 | static duckdb::CreateScalarFunctionInfo GetAlignmentStringFunction(std::string name); 30 | 31 | struct AlignmentStringBindData : public duckdb::FunctionData 32 | { 33 | AlignmentStringBindData() : aligner(4, 6, 2, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryHigh) {} 34 | 35 | AlignmentStringBindData(int32_t mismatch, int32_t gap_opening, int32_t gap_extension, wfa::WFAligner::MemoryModel memory_model) : aligner(mismatch, gap_opening, gap_extension, wfa::WFAligner::Alignment, memory_model) {} 36 | AlignmentStringBindData(int32_t match, int32_t mismatch, int32_t gap_opening, int32_t gap_extension, wfa::WFAligner::MemoryModel memory_model) : aligner(match, mismatch, gap_opening, gap_extension, wfa::WFAligner::Alignment, memory_model) {} 37 | 38 | wfa::WFAlignerGapAffine aligner; 39 | 40 | virtual bool Equals(const duckdb::FunctionData &other_p) const override; 41 | duckdb::unique_ptr Copy() const override; 42 | }; 43 | 44 | static duckdb::CreateScalarFunctionInfo GetAlignmentScoreFunction(std::string name); 45 | 46 | struct AlignmentScoreBindData : public duckdb::FunctionData 47 | { 48 | AlignmentScoreBindData() : aligner(4, 6, 2, wfa::WFAligner::Alignment, wfa::WFAligner::MemoryHigh) {} 49 | 50 | AlignmentScoreBindData(int32_t mismatch, int32_t gap_opening, int32_t gap_extension, wfa::WFAligner::MemoryModel memory_model) : aligner(mismatch, gap_opening, gap_extension, wfa::WFAligner::Alignment, memory_model) {} 51 | AlignmentScoreBindData(int32_t match, int32_t mismatch, int32_t gap_opening, int32_t gap_extension, wfa::WFAligner::MemoryModel memory_model) : aligner(match, mismatch, gap_opening, gap_extension, wfa::WFAligner::Alignment, memory_model) {} 52 | 53 | wfa::WFAlignerGapAffine aligner; 54 | 55 | virtual bool Equals(const duckdb::FunctionData &other_p) const override; 56 | duckdb::unique_ptr Copy() const override; 57 | }; 58 | }; 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /rust/src/bam_query_reader.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
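// Descriptive note for the FFI entry point below: `bam_query_reader` receives C-string
// URI and region-query arguments plus a batch size, runs the query on a Tokio runtime via
// Exon's DataFusion session, and streams the result into the caller-provided Arrow
// ArrayStream pointer. On success the returned struct carries a null `error`; on failure
// it carries a heap-allocated C string, which the C++ caller is expected to inspect.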
14 | 15 | use std::{ 16 | ffi::{c_char, CStr, CString}, 17 | sync::Arc, 18 | }; 19 | 20 | use arrow::ffi_stream::FFI_ArrowArrayStream as ArrowArrayStream; 21 | use datafusion::prelude::{SessionConfig, SessionContext}; 22 | use exon::{ffi::create_dataset_stream_from_table_provider, new_exon_config, ExonSessionExt}; 23 | use tokio::runtime::Runtime; 24 | 25 | #[repr(C)] 26 | pub struct BAMReaderResult { 27 | error: *const c_char, 28 | } 29 | 30 | #[no_mangle] 31 | pub unsafe extern "C" fn bam_query_reader( 32 | stream_ptr: *mut ArrowArrayStream, 33 | uri: *const c_char, 34 | query: *const c_char, 35 | batch_size: usize, 36 | ) -> BAMReaderResult { 37 | let uri = match CStr::from_ptr(uri).to_str() { 38 | Ok(uri) => uri, 39 | Err(e) => { 40 | let error = CString::new(format!("could not parse uri: {}", e)).unwrap(); 41 | return BAMReaderResult { 42 | error: error.into_raw(), 43 | }; 44 | } 45 | }; 46 | 47 | let rt = Arc::new(Runtime::new().unwrap()); 48 | 49 | let config = new_exon_config().with_batch_size(batch_size); 50 | let ctx = SessionContext::with_config_exon(config); 51 | 52 | let query = match CStr::from_ptr(query).to_str() { 53 | Ok(query) => query, 54 | Err(e) => { 55 | let error = CString::new(format!("could not parse query: {}", e)).unwrap(); 56 | return BAMReaderResult { 57 | error: error.into_raw(), 58 | }; 59 | } 60 | }; 61 | 62 | rt.block_on(async { 63 | let df = match ctx.query_bam_file(uri, query).await { 64 | Ok(df) => df, 65 | Err(e) => { 66 | let error = CString::new(format!("could not read BAM file: {}", e)).unwrap(); 67 | return BAMReaderResult { 68 | error: error.into_raw(), 69 | }; 70 | } 71 | }; 72 | 73 | match create_dataset_stream_from_table_provider(df, rt.clone(), stream_ptr).await { 74 | Ok(_) => BAMReaderResult { 75 | error: std::ptr::null(), 76 | }, 77 | Err(e) => { 78 | let error = 79 | CString::new(format!("could not create dataset stream: {}", e)).unwrap(); 80 | return BAMReaderResult { 81 | error: error.into_raw(), 82 | }; 83 | } 84 | } 85 | }) 86 | } 87 | -------------------------------------------------------------------------------- /rust/src/bcf_query_reader.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::{ 16 | ffi::{c_char, CStr, CString}, 17 | sync::Arc, 18 | }; 19 | 20 | use arrow::ffi_stream::FFI_ArrowArrayStream as ArrowArrayStream; 21 | use datafusion::prelude::{SessionConfig, SessionContext}; 22 | use exon::{ffi::create_dataset_stream_from_table_provider, new_exon_config, ExonSessionExt}; 23 | use tokio::runtime::Runtime; 24 | 25 | #[repr(C)] 26 | pub struct BCFReaderResult { 27 | error: *const c_char, 28 | } 29 | 30 | #[no_mangle] 31 | pub unsafe extern "C" fn bcf_query_reader( 32 | stream_ptr: *mut ArrowArrayStream, 33 | uri: *const c_char, 34 | query: *const c_char, 35 | batch_size: usize, 36 | ) -> BCFReaderResult { 37 | let uri = match CStr::from_ptr(uri).to_str() { 38 | Ok(uri) => uri, 39 | Err(e) => { 40 | let error = CString::new(format!("could not parse uri: {}", e)).unwrap(); 41 | return BCFReaderResult { 42 | error: error.into_raw(), 43 | }; 44 | } 45 | }; 46 | 47 | let rt = Arc::new(Runtime::new().unwrap()); 48 | 49 | let config = new_exon_config().with_batch_size(batch_size); 50 | let ctx = SessionContext::with_config_exon(config); 51 | 52 | let query = match CStr::from_ptr(query).to_str() { 53 | Ok(query) => query, 54 | Err(e) => { 55 | let error = CString::new(format!("could not parse query: {}", e)).unwrap(); 56 | return BCFReaderResult { 57 | error: error.into_raw(), 58 | }; 59 | } 60 | }; 61 | 62 | rt.block_on(async { 63 | let df = match ctx.query_bcf_file(uri, query).await { 64 | Ok(df) => df, 65 | Err(e) => { 66 | let error = CString::new(format!("could not read BCF file: {}", e)).unwrap(); 67 | return BCFReaderResult { 68 | error: error.into_raw(), 69 | }; 70 | } 71 | }; 72 | 73 | match create_dataset_stream_from_table_provider(df, rt.clone(), stream_ptr).await { 74 | Ok(_) => BCFReaderResult { 75 | error: std::ptr::null(), 76 | }, 77 | Err(e) => { 78 | let error = 79 | CString::new(format!("could not create dataset stream: {}", e)).unwrap(); 80 | return BCFReaderResult { 81 | error: error.into_raw(), 82 | }; 83 | } 84 | } 85 | }) 86 | } 87 | -------------------------------------------------------------------------------- /rust/src/vcf_query_reader.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::{ 16 | ffi::{c_char, CStr, CString}, 17 | sync::Arc, 18 | }; 19 | 20 | use arrow::ffi_stream::FFI_ArrowArrayStream as ArrowArrayStream; 21 | use datafusion::prelude::{SessionConfig, SessionContext}; 22 | use exon::{ffi::create_dataset_stream_from_table_provider, new_exon_config, ExonSessionExt}; 23 | use tokio::runtime::Runtime; 24 | 25 | #[repr(C)] 26 | pub struct VCFReaderResult { 27 | error: *const c_char, 28 | } 29 | 30 | #[no_mangle] 31 | pub unsafe extern "C" fn vcf_query_reader( 32 | stream_ptr: *mut ArrowArrayStream, 33 | uri: *const c_char, 34 | query: *const c_char, 35 | batch_size: usize, 36 | ) -> VCFReaderResult { 37 | let uri = match CStr::from_ptr(uri).to_str() { 38 | Ok(uri) => uri, 39 | Err(e) => { 40 | let error = CString::new(format!("could not parse uri: {}", e)).unwrap(); 41 | return VCFReaderResult { 42 | error: error.into_raw(), 43 | }; 44 | } 45 | }; 46 | 47 | let rt = Arc::new(Runtime::new().unwrap()); 48 | 49 | let config = new_exon_config().with_batch_size(batch_size); 50 | let ctx = SessionContext::with_config_exon(config); 51 | 52 | let query = match CStr::from_ptr(query).to_str() { 53 | Ok(query) => query, 54 | Err(e) => { 55 | let error = CString::new(format!("could not parse query: {}", e)).unwrap(); 56 | return VCFReaderResult { 57 | error: error.into_raw(), 58 | }; 59 | } 60 | }; 61 | 62 | rt.block_on(async { 63 | let df = match ctx.query_vcf_file(uri, query).await { 64 | Ok(df) => df, 65 | Err(e) => { 66 | let error = CString::new(format!("could not read VCF file: {}", e)).unwrap(); 67 | return VCFReaderResult { 68 | error: error.into_raw(), 69 | }; 70 | } 71 | }; 72 | 73 | match create_dataset_stream_from_table_provider(df, rt.clone(), stream_ptr).await { 74 | Ok(_) => VCFReaderResult { 75 | error: std::ptr::null(), 76 | }, 77 | Err(e) => { 78 | let error = 79 | CString::new(format!("could not create dataset stream: {}", e)).unwrap(); 80 | return VCFReaderResult { 81 | error: error.into_raw(), 82 | }; 83 | } 84 | } 85 | }) 86 | } 87 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_gff_scan.test: -------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Test counting from a GFF file 5 | query IIIIIIIII 6 | SELECT seqname, source, type, start, "end", score, strand, phase, attributes FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff'); 7 | ---- 8 | sq0 9 | caat 10 | gene 11 | 8 12 | 13 13 | NULL 14 | + 15 | NULL 16 | {gene_id=[caat1], gene_name=[gene0]} 17 | sq1 18 | caat 19 | gene 20 | 8 21 | 14 22 | 0.1 23 | + 24 | 0 25 | {gene_id=[caat2], gene_name=[gene0]} 26 | 27 | # Test counting from a GFF file 28 | query I 29 | SELECT count(*) FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff'); 30 | ---- 31 | 2 32 | 33 | # Test counting from a GFF gzipped file, autodetect extension 34 | query I 35 | SELECT count(*) FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff.gz'); 36 | ---- 37 | 2 38 | 39 | # Test counting from a GFF gzipped file, with option 40 | query I 41 | SELECT count(*) FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff.gzip', compression='gzip'); 42 | ---- 43 | 2 44 | 45 | # Test counting from a GFF zstd file, autodetect extension 46 | query I 47 | SELECT count(*) FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff.zst'); 48 | ---- 49 | 2 50 | 51 | # Test counting from a GFF zstd 
file, with option 52 | query I 53 | SELECT count(*) FROM read_gff('./test/sql/exondb-release-with-deb-info/test.gff.zstd', compression='zstd'); 54 | ---- 55 | 2 56 | 57 | # Test counting from a GFF file 58 | query I 59 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.gff'; 60 | ---- 61 | 2 62 | 63 | # Test counting from a GFF gzipped file, autodetect extension 64 | query I 65 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.gff.gz'; 66 | ---- 67 | 2 68 | 69 | # Test counting from a GFF gzipped file, autodetect extension 70 | query I 71 | SELECT count(*) FROM './test/sql/exondb-release-with-deb-info/test.gff.zst'; 72 | ---- 73 | 2 74 | 75 | # Test missing file throws an error 76 | statement error 77 | SELECT count(*) FROM read_gff(''); 78 | 79 | # Test attribute parsing works 80 | query I 81 | SELECT gff_parse_attributes('ID=KanNP_rdsDRAFT_3085441342.1;locus_tag=KanNP_rdsDRAFT_30854413421;'); 82 | ---- 83 | {ID=KanNP_rdsDRAFT_3085441342.1, locus_tag=KanNP_rdsDRAFT_30854413421} 84 | 85 | # Test attribute parsing works 86 | query I 87 | SELECT gff_parse_attributes('ID=KanNP_rdsDRAFT_3085441342.1;locus_tag=KanNP_rdsDRAFT_30854413421'); 88 | ---- 89 | {ID=KanNP_rdsDRAFT_3085441342.1, locus_tag=KanNP_rdsDRAFT_30854413421} 90 | 91 | # Test attribute parsing works 92 | statement error 93 | SELECT gff_parse_attributes('ID'); 94 | 95 | # Test attribute parsing works 96 | query I 97 | SELECT element_at(gff_parse_attributes('key=value'), 'key')[1]; 98 | ---- 99 | value 100 | 101 | # Test that a "raw" gff can be read 102 | # query IIIIIIIII 103 | # SELECT seqid, source, type, start, "end", score, strand, phase, attributes FROM read_gff_raw('./test/sql/exondb-release-with-deb-info/gff/raw-test.gff') LIMIT 1; 104 | # ---- 105 | # KanNP_rdsDRAFT_3085441342 106 | # FGMP 107 | # CDS 108 | # 2 109 | # 151 110 | # . 
111 | # 1 112 | # 0 113 | # ID=KanNP_rdsDRAFT_3085441342.1;locus_tag=KanNP_rdsDRAFT_30854413421; 114 | -------------------------------------------------------------------------------- /bin/upload-artifacts.py: -------------------------------------------------------------------------------- 1 | """Helper script to upload artifacts""" 2 | import argparse 3 | import gzip 4 | import platform 5 | import shutil 6 | from pathlib import Path 7 | 8 | import boto3 9 | 10 | client = boto3.client("s3") 11 | lambda_client = boto3.client("lambda") 12 | 13 | if __name__ == "__main__": 14 | # Setup argparse to parse the arguments 15 | parser = argparse.ArgumentParser( 16 | description="Upload artifacts to S3.", 17 | ) 18 | 19 | parser.add_argument("--name", default="exon") 20 | parser.add_argument("--version", default="v0.8.0") 21 | parser.add_argument("--duckdb_version", default="v0.8.1") 22 | parser.add_argument("--gcc4", action="store_true") 23 | 24 | args = parser.parse_args() 25 | 26 | name = args.name 27 | version = args.version 28 | duckdb_version = args.duckdb_version 29 | gcc4 = args.gcc4 30 | 31 | bucket = "wtt-01-dist-prd" 32 | 33 | print(f"Uploading artifacts to {bucket}.") 34 | 35 | platform_uname = platform.uname() 36 | operating_system = platform_uname.system 37 | architecture = platform_uname.machine 38 | 39 | print(f"Operating system: {operating_system}") 40 | print(f"Architecture: {architecture}") 41 | 42 | if operating_system.lower() == "windows": 43 | arch = "windows_amd64" 44 | elif operating_system.lower() == "darwin" and architecture.lower() == "x86_64": 45 | arch = "osx_amd64" 46 | elif operating_system.lower() == "linux" and architecture.lower() == "x86_64": 47 | arch = "linux_amd64" 48 | 49 | if gcc4: 50 | arch = "linux_amd64_gcc4" 51 | 52 | elif operating_system.lower() == "darwin" and architecture.lower() == "arm64": 53 | arch = "osx_arm64" 54 | else: 55 | raise Exception(f"Unsupported platform: {operating_system} {architecture}") 56 | 57 | filename = f"{name}-{version}-{operating_system}-{architecture}.zip" 58 | full_s3_path = f"s3://{bucket}/extension/{name}/{filename}" 59 | 60 | local_file = Path(f"{name}.duckdb_extension.zip") 61 | 62 | if operating_system.lower() == "windows": 63 | build_target = ( 64 | Path("build") 65 | / "release" 66 | / "extension" 67 | / name 68 | / f"{name}.duckdb_extension" 69 | ) 70 | else: 71 | build_target = ( 72 | Path("build") / "release" / "extension" / name / f"{name}.duckdb_extension" 73 | ) 74 | 75 | # gzip the build_target with python 76 | gzip_build_target = f"{name}.duckdb_extension.gz" 77 | with open(build_target, "rb") as f_in: 78 | with gzip.open(gzip_build_target, "wb") as f_out: 79 | shutil.copyfileobj(f_in, f_out) 80 | 81 | # aws s3 cp $1.duckdb_extension.gz s3://$5/$1/$2/$3/$4/$1.duckdb_extension.gz --acl public-read 82 | duckdb_path = f"{name}/{version}/{duckdb_version}/{arch}/{name}.duckdb_extension.gz" 83 | client.upload_file(gzip_build_target, bucket, duckdb_path) 84 | client.put_object_acl(ACL="public-read", Bucket=bucket, Key=duckdb_path) 85 | 86 | latest_duckdb_path = ( 87 | f"{name}/latest/{duckdb_version}/{arch}/{name}.duckdb_extension.gz" 88 | ) 89 | 90 | client.copy_object( 91 | ACL="public-read", 92 | Bucket=bucket, 93 | CopySource=f"{bucket}/{duckdb_path}", 94 | Key=latest_duckdb_path, 95 | ) 96 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test_scalar_functions.test: 
-------------------------------------------------------------------------------- 1 | statement ok 2 | LOAD 'build/release/extension/exon/exon.duckdb_extension'; 3 | 4 | # Very basic 5 | query I 6 | SELECT gc_content(seq) FROM (SELECT 'ATGC' AS seq UNION ALL SELECT 'ATGCGC' AS seq); 7 | ---- 8 | 0.5 9 | 0.66 10 | 11 | # Test the gc_content function returns 0 for an empty string 12 | query I 13 | SELECT gc_content('') 14 | ---- 15 | 0 16 | 17 | # Test the gc_content functions returns NULL for a NULL input 18 | query I 19 | SELECT gc_content(NULL) IS NULL 20 | ---- 21 | true 22 | 23 | # Two columns 24 | query I 25 | WITH two_seqs AS (SELECT 'ATCG' AS sequence UNION ALL SELECT 'GGGG') SELECT gc_content(sequence) FROM two_seqs 26 | ---- 27 | 0.5 28 | 1.0 29 | 30 | # Test complement 31 | query I 32 | SELECT complement(seq) FROM (SELECT 'ATGC' AS seq UNION ALL SELECT 'ATGCGC' AS seq); 33 | ---- 34 | TACG 35 | TACGCG 36 | 37 | # Test bad sequence in complement 38 | statement error 39 | SELECT complement('ATCGQ') 40 | 41 | # Test reverse complement 42 | query I 43 | SELECT reverse_complement(seq) FROM (SELECT 'ATCG' AS seq UNION ALL SELECT 'GGGG' AS seq); 44 | ---- 45 | CGAT 46 | TTTT 47 | 48 | # Test transcribe function works with ATCG 49 | query I 50 | SELECT transcribe(t) FROM (SELECT 'ATCG' AS t UNION ALL SELECT 'ATCGATCG' AS t) 51 | ---- 52 | AUCG 53 | AUCGAUCG 54 | 55 | # Test a bad sequence of ATNN throws an InvalidInputException error 56 | statement error 57 | SELECT transcribe('ATNN') 58 | 59 | # Test a bad sequence of ATTT (4 characters) throws an InvalidInputException error 60 | statement error 61 | SELECT translate('ATTT') 62 | 63 | # Test a simple sequence works 64 | query I 65 | SELECT translate_dna_to_aa(seq) FROM (SELECT 'ATGCGC' AS seq UNION ALL SELECT 'ATGCGC' AS seq); 66 | ---- 67 | MR 68 | MR 69 | 70 | # Test a sequence with all codons 71 | query I 72 | SELECT translate_dna_to_aa('AAAAATAACAAGATAATTATCATGACAACTACCACGAGAAGTAGCAGGTAATATTACTAGTTATTTTTCTTGTCATCTTCCTCGTGATGTTGCTGGCAACATCACCAGCTACTTCTCCTGCCACCTCCCCCGCGACGTCGCCGGGAAGATGACGAGGTAGTTGTCGTGGCAGCTGCCGCGGGAGGTGGCGGG') 73 | ---- 74 | KNNKIIIMTTTTRSSR*YY*LFFLSSSS*CCWQHHQLLLLPPPPRRRREDDEVVVVAAAAGGGG 75 | 76 | # Test an unknown codon returns an InvalidInputException error 77 | statement error 78 | SELECT translate_dna_to_aa('NNN') 79 | 80 | # Test reverse transcribe function converts AUCG to ATCG 81 | query I 82 | SELECT reverse_transcribe(seq) FROM (SELECT 'AUCG' AS seq UNION ALL SELECT 'AUCU' AS seq); 83 | ---- 84 | ATCG 85 | ATCT 86 | 87 | # Test a bad sequence of AUNN throws an InvalidInputException error for reverse transcribe 88 | statement error 89 | SELECT reverse_transcribe('AUNN') 90 | 91 | # Test a sam string can be read 92 | query I 93 | SELECT parse_cigar('1M2M123S') 94 | ---- 95 | [{'op': M, 'len': 1}, {'op': M, 'len': 2}, {'op': S, 'len': 123}] 96 | 97 | # Test a CIGAR string parse error is 98 | statement error 99 | SELECT parse_cigar('MMM') as cigar; 100 | ---- 101 | Invalid Error: Invalid CIGAR string: MMM 102 | 103 | statement ok 104 | SELECT parse_cigar('100M') as cigar; 105 | 106 | # Test extracting from a sequence 107 | query I 108 | SELECT extract_from_cigar('AACCAA', '2I2M2I') 109 | ---- 110 | {'sequence_start': 2, 'sequence_end': 4, 'sequence': CC} 111 | 112 | # Test extracting from a sequence 113 | query I 114 | SELECT extract_from_cigar('AACCAAC', '2I2M2I1M') 115 | ---- 116 | {'sequence_start': 2, 'sequence_end': 7, 'sequence': CCAAC} 117 | 
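# Hypothetical example, kept commented: derived from the quality_score_string_to_list
# implementation in fastq_functions/module.cpp, which maps each Phred+33 character to its
# ASCII value minus 33, so '!' -> 0 and 'I' -> 40.
# query I
# SELECT quality_score_string_to_list('!I');
# ----
# [0, 40]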
-------------------------------------------------------------------------------- /exon/src/exon/gff_functions/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "exon/gff_functions/module.hpp" 25 | #include "rust.hpp" 26 | 27 | namespace exon 28 | { 29 | duckdb::CreateScalarFunctionInfo GFFunctions::GetGFFParseAttributesFunction() 30 | { 31 | duckdb::ScalarFunctionSet set("gff_parse_attributes"); 32 | 33 | auto duckdb_function = [](duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result) 34 | { 35 | result.SetVectorType(duckdb::VectorType::FLAT_VECTOR); 36 | 37 | // TODO: add soft failure 38 | for (duckdb::idx_t i = 0; i < args.size(); i++) 39 | { 40 | auto value = args.data[0].GetValue(i); 41 | auto string_value = duckdb::StringValue::Get(value); 42 | 43 | auto last_semicolon_pos = string_value.find_last_of(";"); 44 | auto string_len = string_value.size(); 45 | if (last_semicolon_pos == string_len) 46 | { 47 | string_value = string_value.substr(0, string_len - 1); 48 | } 49 | 50 | auto attributes = duckdb::StringUtil::Split(string_value, ";"); 51 | 52 | duckdb::vector items; 53 | 54 | for (auto &attribute : attributes) 55 | { 56 | duckdb::StringUtil::Trim(attribute); 57 | auto key_value = duckdb::StringUtil::Split(attribute, "="); 58 | 59 | if (key_value.size() != 2) 60 | { 61 | throw duckdb::Exception("Invalid attribute: '" + attribute + "' expected 'key=value;key2=value2'"); 62 | } 63 | 64 | duckdb::child_list_t map_struct; 65 | 66 | auto new_key = duckdb::Value(key_value[0]); 67 | auto new_value = duckdb::Value(key_value[1]); 68 | 69 | map_struct.emplace_back(std::make_pair("key", std::move(new_key))); 70 | map_struct.emplace_back(std::make_pair("value", std::move(new_value))); 71 | 72 | items.push_back(duckdb::Value::STRUCT(std::move(map_struct))); 73 | } 74 | 75 | duckdb::LogicalType map_type = duckdb::LogicalType::MAP(duckdb::LogicalType::VARCHAR, duckdb::LogicalType::VARCHAR); 76 | result.SetValue(i, duckdb::Value::MAP(duckdb::ListType::GetChildType(map_type), std::move(items))); 77 | } 78 | }; 79 | 80 | auto return_type = duckdb::LogicalType::MAP(duckdb::LogicalType::VARCHAR, duckdb::LogicalType::VARCHAR); 81 | set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, return_type, duckdb_function)); 82 | 83 | return duckdb::CreateScalarFunctionInfo(set); 84 | } 85 | 86 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.2) 2 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 3 | 4 | # Set extension name here 5 | set(TARGET_NAME exon) 6 | set(EXTENSION_NAME ${TARGET_NAME}_extension) 7 
| set(CMAKE_CXX_STANDARD 11) 8 | 9 | project(${TARGET_NAME}) 10 | 11 | if (APPLE) 12 | execute_process(COMMAND brew --prefix openssl@3 OUTPUT_VARIABLE OPENSSL_PREFIX) 13 | string(STRIP ${OPENSSL_PREFIX} OPENSSL_ROOT_DIR) 14 | 15 | include_directories(${OPENSSL_ROOT_DIR}/include) 16 | link_directories(${OPENSSL_ROOT_DIR}/lib) 17 | 18 | execute_process(COMMAND brew --prefix xz OUTPUT_VARIABLE XZ_PREFIX) 19 | string(STRIP ${XZ_PREFIX} XZ_ROOT_DIR) 20 | 21 | set(WFA2_ENABLED ON) 22 | add_compile_definitions(WFA2_ENABLED) 23 | 24 | elseif(UNIX) 25 | set(OPENSSL_ROOT_DIR /usr/local/ssl) 26 | 27 | set(WFA2_ENABLED ON) 28 | add_compile_definitions(WFA2_ENABLED) 29 | 30 | include_directories(${OPENSSL_ROOT_DIR}/include) 31 | link_directories(${OPENSSL_ROOT_DIR}/lib) 32 | 33 | elseif(WIN32) 34 | set(WFA2_ENABLED OFF) 35 | endif() 36 | 37 | Include(FetchContent) 38 | 39 | FetchContent_Declare( 40 | arrow 41 | GIT_REPOSITORY https://github.com/apache/arrow.git 42 | GIT_TAG apache-arrow-11.0.0 43 | ) 44 | 45 | FetchContent_Declare( 46 | httplib 47 | GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git 48 | GIT_TAG v0.12.0 49 | ) 50 | 51 | FetchContent_Declare( 52 | json 53 | URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz 54 | ) 55 | 56 | FetchContent_Declare( 57 | Corrosion 58 | GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git 59 | GIT_TAG v0.3.5 60 | ) 61 | 62 | FetchContent_Declare( 63 | spdlog 64 | GIT_REPOSITORY https://github.com/gabime/spdlog.git 65 | GIT_TAG v1.11.0 66 | ) 67 | 68 | list(APPEND available_contents httplib json Corrosion spdlog arrow) 69 | 70 | FetchContent_MakeAvailable(${available_contents}) 71 | 72 | if(WFA2_ENABLED) 73 | FetchContent_Declare(wfa2 74 | GIT_REPOSITORY https://github.com/tshauck/WFA2-lib.git 75 | GIT_TAG 4784d5892c25f1967174b4deb7a3a7f1f34a9cb8 76 | ) 77 | 78 | FetchContent_MakeAvailable(wfa2) 79 | include_directories(${wfa2_SOURCE_DIR}) 80 | endif() 81 | 82 | corrosion_import_crate(MANIFEST_PATH rust/Cargo.toml 83 | PROFILE release 84 | ) 85 | 86 | include_directories(exon/include) 87 | add_subdirectory(exon/src) 88 | 89 | add_library(${EXTENSION_NAME} STATIC ${EXTENSION_SOURCES}) 90 | 91 | # Build extensions 92 | set(PARAMETERS "-warnings") 93 | build_loadable_extension(${TARGET_NAME} ${PARAMETERS} ${EXTENSION_SOURCES}) 94 | 95 | find_package(OpenSSL REQUIRED) 96 | message("-- wtt: Found openssl ${OPENSSL_VERSION}") 97 | 98 | if(WIN32) 99 | target_link_libraries(${EXTENSION_NAME} 100 | PUBLIC 101 | "${CMAKE_CURRENT_BINARY_DIR}/Release/rust.lib" 102 | ntdll 103 | Secur32 104 | OpenSSL::SSL 105 | OpenSSL::Crypto 106 | bcrypt 107 | ncrypt 108 | Userenv 109 | ) 110 | elseif(APPLE) 111 | target_link_libraries(${EXTENSION_NAME} 112 | PUBLIC 113 | "${CMAKE_CURRENT_BINARY_DIR}/librust.a" 114 | 115 | ${WFA2_LIBRARIES} 116 | 117 | "${wfa2_BINARY_DIR}/libwfa2cpp.a" 118 | "${wfa2_BINARY_DIR}/libwfa2.a" 119 | 120 | -lz 121 | -lm 122 | -lbz2 123 | ${XZ_ROOT_DIR}/lib/liblzma.a 124 | -lcurl 125 | ${OPENSSL_LIBRARIES} 126 | 127 | "-framework CoreFoundation" 128 | "-framework Security" 129 | "-framework Accelerate" 130 | ) 131 | elseif(UNIX) 132 | target_link_libraries(${EXTENSION_NAME} 133 | PUBLIC 134 | "${CMAKE_CURRENT_BINARY_DIR}/librust.a" 135 | 136 | "${wfa2_BINARY_DIR}/libwfa2cpp.a" 137 | "${wfa2_BINARY_DIR}/libwfa2.a" 138 | 139 | -lz 140 | -lm 141 | -lbz2 142 | -llzma 143 | -lcurl 144 | -lpthread 145 | ${OPENSSL_LIBRARIES} 146 | ) 147 | endif() 148 | 149 | install( 150 | TARGETS ${EXTENSION_NAME} 151 | EXPORT 
"${DUCKDB_EXPORT_SET}" 152 | LIBRARY DESTINATION "${INSTALL_LIB_DIR}" 153 | ARCHIVE DESTINATION "${INSTALL_LIB_DIR}") 154 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/vcf/vcf_file.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##FILTER= 3 | ##INFO= 4 | ##FORMAT= 5 | ##INFO= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FILTER= 11 | ##FILTER= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | ##contig= 16 | ##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta 17 | ##readme=AAAAAA 18 | ##readme=BBBBBB 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B 24 | 1 3000150 . C T 59.2 PASS AN=4;AC=2 GT:GQ 0/1:245 0/1:245 25 | 1 3000151 . C T 59.2 PASS AN=4;AC=2 GT:DP:GQ 0/1:32:245 0/1:32:245 26 | 1 3062915 id3D GTTT G 12.9 q10 DP4=1,2,3,4;AN=4;AC=2;INDEL;STR=test GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 0/1:409:35:-20,-5,-20 27 | 1 3062915 idSNP G T,C 12.6 test TEST=5;DP4=1,2,3,4;AN=3;AC=1,1 GT:TT:GQ:DP:GL 0/1:0,1:409:35:-20,-5,-20,-20,-5,-20 2:0,1:409:35:-20,-5,-20 28 | 1 3106154 . CAAA C 342 PASS AN=4;AC=2 GT:GQ:DP 0/1:245:32 0/1:245:32 29 | 1 3106154 . C CT 59.2 PASS AN=4;AC=2 GT:GQ:DP 0/1:245:32 0/1:245:32 30 | 1 3157410 . GA G 90.6 q10 AN=4;AC=4 GT:GQ:DP 1/1:21:21 1/1:21:21 31 | 1 3162006 . GAA G 60.2 PASS AN=4;AC=2 GT:GQ:DP 0/1:212:22 0/1:212:22 32 | 1 3177144 . G T 45 PASS AN=4;AC=2 GT:GQ:DP 0/0:150:30 1/1:150:30 33 | 1 3177144 . G . 45 PASS AN=4;AC=0 GT:GQ:DP 0/0:150:30 0/0:150:30 34 | 1 3184885 . TAAAA TA,T 61.5 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:12:10 1/2:12:10 35 | 2 3199812 . G GTT,GT 82.7 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:322:26 1/2:322:26 36 | 3 3212016 . CTT C,CT 79 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:91:26 1/2:91:26 37 | 4 3258448 . TACACACAC T . PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31 38 | 4 3258501 . 
C A,T,G,CA,CT,CG,CC,CAA,CAT,CAG,CAC,CTA,CTT,CTG,CTC,CGA,CGT,CGG,CGC,CCA,CCT,CCG,CCC,CAAA,CAAT,CAAG,CAAC,CATA,CATT,CATG,CATC,CAGA,CAGT,CAGG,CAGC,CACA,CACT,CACG,CACC,CTAA,CTAT,CTAG,CTAC,CTTA,CTTT,CTTG,CTTC,CTGA,CTGT,CTGG,CTGC,CTCA,CTCT,CTCG,CTCC,CGAA,CGAT,CGAG,CGAC,CGTA,CGTT,CGTG,CGTC,CGGA,CGGT,CGGG,CGGC,CGCA,CGCT,CGCG,CGCC,CCAA,CCAT,CCAG,CCAC,CCTA,CCTT,CCTG,CCTC,CCGA,CCGT,CCGG,CCGC,CCCA,CCCT,CCCG,CCCC,CAAAA,CAAAT,CAAAG,CAAAC,CAATA,CAATT,CAATG,CAATC,CAAGA,CAAGT,CAAGG,CAAGC,CAACA,CAACT,CAACG,CAACC,CATAA,CATAT,CATAG,CATAC,CATTA,CATTT,CATTG,CATTC,CATGA,CATGT,CATGG,CATGC,CATCA,CATCT,CATCG,CATCC,CAGAA,CAGAT,CAGAG,CAGAC,CAGTA,CAGTT,CAGTG,CAGTC,CAGGA,CAGGT,CAGGG,CAGGC,CAGCA,CAGCT,CAGCG,CAGCC,CACAA,CACAT,CACAG,CACAC,CACTA,CACTT,CACTG,CACTC,CACGA,CACGT,CACGG,CACGC,CACCA,CACCT,CACCG,CACCC,CTAAA,CTAAT,CTAAG,CTAAC,CTATA,CTATT,CTATG,CTATC,CTAGA,CTAGT,CTAGG,CTAGC,CTACA,CTACT,CTACG,CTACC,CTTAA,CTTAT,CTTAG,CTTAC,CTTTA,CTTTT,CTTTG,CTTTC,CTTGA,CTTGT,CTTGG,CTTGC,CTTCA,CTTCT,CTTCG,CTTCC,CTGAA,CTGAT,CTGAG,CTGAC,CTGTA,CTGTT,CTGTG,CTGTC,CTGGA,CTGGT,CTGGG,CTGGC,CTGCA,CTGCT,CTGCG,CTGCC,CTCAA,CTCAT,CTCAG,CTCAC,CTCTA,CTCTT,CTCTG,CTCTC,CTCGA,CTCGT,CTCGG,CTCGC,CTCCA,CTCCT,CTCCG,CTCCC,CGAAA,CGAAT,CGAAG,CGAAC,CGATA,CGATT,CGATG,CGATC,CGAGA,CGAGT,CGAGG,CGAGC,CGACA,CGACT,CGACG,CGACC,CGTAA,CGTAT,CGTAG,CGTAC,CGTTA,CGTTT,CGTTG,CGTTC,CGTGA,CGTGT,CGTGG,CGTGC,CGTCA,CGTCT,CGTCG,CGTCC,CGGAA,CGGAT,CGGAG,CGGAC,CGGTA,CGGTT,CGGTG,CGGTC,CGGGA,CGGGT,CGGGG,CGGGC,CGGCA,CGGCT,CGGCG,CGGCC,CGCAA,CGCAT,CGCAG,CGCAC,CGCTA,CGCTT,CGCTG,CGCTC,CGCGA,CGCGT,CGCGG,CGCGC,CGCCA,CGCCT,CGCCG,CGCCC,CCAAA,CCAAT,CCAAG,CCAAC,CCATA,CCATT,CCATG,CCATC,CCAGA,CCAGT,CCAGG,CCAGC,CCACA,CCACT,CCACG,CCACC,CCTAA,CCTAT,CCTAG,CCTAC,CCTTA,CCTTT,CCTTG,CCTTC,CCTGA,CCTGT 45 PASS AN=4;AC=2 GT 0/300 240/260 39 | -------------------------------------------------------------------------------- /exon/src/exon_extension.cpp: -------------------------------------------------------------------------------- 1 | #define DUCKDB_EXTENSION_MAIN 2 | 3 | #include "exon_extension.hpp" 4 | #include "exon/sam_functions/module.hpp" 5 | #include "exon/arrow_table_function/module.hpp" 6 | #include "exon/sequence_functions/module.hpp" 7 | #include "exon/gff_functions/module.hpp" 8 | #include "exon/fastq_functions/module.hpp" 9 | #include "exon/vcf_query_function/module.hpp" 10 | #include "exon/bcf_query_function/module.hpp" 11 | #include "exon/bam_query_function/module.hpp" 12 | #include "exon/core/module.hpp" 13 | 14 | #if defined(WFA2_ENABLED) 15 | #include "exon/alignment_functions/module.hpp" 16 | #endif 17 | 18 | #include "duckdb.hpp" 19 | 20 | using namespace duckdb; 21 | 22 | namespace duckdb 23 | { 24 | 25 | static void LoadInternal(DatabaseInstance &instance) 26 | { 27 | Connection con(instance); 28 | con.BeginTransaction(); 29 | 30 | auto &context = *con.context; 31 | auto &catalog = Catalog::GetSystemCatalog(context); 32 | 33 | auto &config = DBConfig::GetConfig(context); 34 | 35 | auto get_sam_functions = exon::SamFunctions::GetSamFunctions(); 36 | for (auto &func : get_sam_functions) 37 | { 38 | catalog.CreateFunction(context, *func); 39 | } 40 | 41 | auto sequence_functions = exon::SequenceFunctions::GetSequenceFunctions(); 42 | for (auto &fun : sequence_functions) 43 | { 44 | catalog.CreateFunction(context, fun); 45 | } 46 | 47 | exon::WTArrowTableFunction::Register("read_gff", "gff", context); 48 | exon::WTArrowTableFunction::Register("read_mzml", "mzml", context); 49 | exon::WTArrowTableFunction::Register("read_gtf", "gtf", context); 50 | 
exon::WTArrowTableFunction::Register("read_fasta", "fasta", context); 51 | exon::WTArrowTableFunction::Register("read_fastq", "fastq", context); 52 | exon::WTArrowTableFunction::Register("read_sam_file_records", "sam", context); 53 | exon::WTArrowTableFunction::Register("read_bam_file_records", "bam", context); 54 | exon::WTArrowTableFunction::Register("read_bed_file", "bed", context); 55 | exon::WTArrowTableFunction::Register("read_vcf_file_records", "vcf", context); 56 | exon::WTArrowTableFunction::Register("read_bcf_file_records", "bcf", context); 57 | exon::WTArrowTableFunction::Register("read_genbank", "genbank", context); 58 | exon::WTArrowTableFunction::Register("read_hmm_dom_tbl_out", "hmmdomtab", context); 59 | 60 | auto get_quality_scores_string_to_list = exon::FastqFunctions::GetQualityScoreStringToList(); 61 | catalog.CreateFunction(context, *get_quality_scores_string_to_list); 62 | 63 | auto gff_parse_attributes = exon::GFFunctions::GetGFFParseAttributesFunction(); 64 | catalog.CreateFunction(context, gff_parse_attributes); 65 | 66 | auto parse_cigar_string = exon::SamFunctions::GetParseCIGARStringFunction(); 67 | catalog.CreateFunction(context, *parse_cigar_string); 68 | 69 | auto extract_sequence_from_cigar = exon::SamFunctions::GetExtractFromCIGARFunction(); 70 | catalog.CreateFunction(context, *extract_sequence_from_cigar); 71 | 72 | auto get_wtt01_version_function = exon::ExonDbFunctions::GetExonDbVersionFunction(); 73 | catalog.CreateFunction(context, get_wtt01_version_function); 74 | 75 | exon::VCFQueryTableFunction::Register(context); 76 | exon::BCFQueryTableFunction::Register(context); 77 | exon::BAMQueryTableFunction::Register(context); 78 | 79 | config.replacement_scans.emplace_back(exon::WTArrowTableFunction::ReplacementScan); 80 | 81 | #if defined(WFA2_ENABLED) 82 | auto get_align_function = exondb::AlignmentFunctions::GetAlignmentStringFunction("alignment_string_wfa_gap_affine"); 83 | catalog.CreateFunction(context, get_align_function); 84 | 85 | auto get_align_function_default = exondb::AlignmentFunctions::GetAlignmentStringFunction("alignment_string"); 86 | catalog.CreateFunction(context, get_align_function_default); 87 | 88 | auto get_align_score_function = exondb::AlignmentFunctions::GetAlignmentScoreFunction("alignment_score_wfa_gap_affine"); 89 | catalog.CreateFunction(context, get_align_score_function); 90 | 91 | auto get_align_score_function_default = exondb::AlignmentFunctions::GetAlignmentScoreFunction("alignment_score"); 92 | catalog.CreateFunction(context, get_align_score_function_default); 93 | #endif 94 | 95 | con.Commit(); 96 | } 97 | 98 | void ExonExtension::Load(DuckDB &db) 99 | { 100 | LoadInternal(*db.instance); 101 | } 102 | 103 | std::string ExonExtension::Name() 104 | { 105 | return "exon"; 106 | } 107 | 108 | } // namespace duckdb 109 | 110 | extern "C" 111 | { 112 | 113 | DUCKDB_EXTENSION_API void exon_init(duckdb::DatabaseInstance &db) 114 | { 115 | LoadInternal(db); 116 | } 117 | 118 | DUCKDB_EXTENSION_API const char *exon_version() 119 | { 120 | return duckdb::DuckDB::LibraryVersion(); 121 | } 122 | } 123 | 124 | #ifndef DUCKDB_EXTENSION_MAIN 125 | #error DUCKDB_EXTENSION_MAIN not defined 126 | #endif 127 | -------------------------------------------------------------------------------- /.github/workflows/Linux.yml: -------------------------------------------------------------------------------- 1 | name: Linux 2 | on: 3 | push: 4 | tags: 5 | - "v*" 6 | branches: 7 | - main 8 | pull_request: 9 | 10 | concurrency: 11 | group: ${{ 
github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} 12 | cancel-in-progress: true 13 | 14 | defaults: 15 | run: 16 | shell: bash 17 | 18 | permissions: 19 | contents: read 20 | id-token: write 21 | 22 | jobs: 23 | linux: 24 | name: Linux Release 25 | runs-on: ubuntu-latest 26 | container: ${{ matrix.container }} 27 | strategy: 28 | matrix: 29 | duckdb_version: ["v0.8.1"] 30 | arch: ["linux_amd64_gcc4", "linux_amd64"] 31 | include: 32 | - arch: "linux_amd64" 33 | container: "ubuntu:22.04" 34 | - arch: "linux_amd64_gcc4" 35 | container: "quay.io/pypa/manylinux2014_x86_64" 36 | env: 37 | GEN: ninja 38 | 39 | steps: 40 | - name: Set Up Env Vars for Dev Override if Applicable 41 | if: startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'dev') 42 | run: | 43 | echo "Overriding Env Vars for Dev Build" 44 | echo "ENVIRONMENT=dev" >> $GITHUB_ENV 45 | 46 | - name: Set Up Env Vars for Dev Override if Applicable 47 | if: github.ref == 'refs/heads/main' || github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' || github.event_name == 'push' && github.event.ref != 'refs/heads/main' 48 | run: | 49 | echo "Overriding Env Vars for Dev Build" 50 | echo "ENVIRONMENT=dev" >> $GITHUB_ENV 51 | 52 | - name: Set Up Env Vars for Prod 53 | if: startsWith(github.ref, 'refs/tags/v') && !contains(github.ref, 'dev') 54 | run: | 55 | echo "Setting Prod Build Env Vars" 56 | echo "ENVIRONMENT=prd" >> $GITHUB_ENV 57 | 58 | - name: Install required ubuntu packages 59 | if: ${{ matrix.arch == 'linux_amd64' || matrix.arch == 'linux_arm64' }} 60 | run: | 61 | apt-get update -y -qq 62 | apt-get install -y -qq software-properties-common 63 | add-apt-repository ppa:git-core/ppa 64 | apt-get update -y -qq 65 | apt-get install -y -qq ninja-build make libssl-dev zip unzip checkinstall libffi-dev curl libz-dev ccache git wget pkg-config build-essential autoconf libbz2-dev liblzma-dev libcurl4-openssl-dev 66 | 67 | - name: Additional Yum Packages 68 | if: ${{ matrix.arch == 'linux_amd64_gcc4' }} 69 | run: | 70 | yum install -y autoconf bzip2-devel xz-devel curl-devel openssl-devel 71 | 72 | - name: Install CMake 3.21 73 | shell: bash 74 | if: ${{ matrix.arch == 'linux_amd64' }} 75 | run: | 76 | wget https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.sh 77 | chmod +x cmake-3.21.3-linux-x86_64.sh 78 | ./cmake-3.21.3-linux-x86_64.sh --skip-license --prefix=/usr/local 79 | cmake --version 80 | 81 | - uses: actions/checkout@v3 82 | with: 83 | fetch-depth: 0 84 | submodules: "true" 85 | 86 | - name: Setup mamba 87 | uses: mamba-org/setup-micromamba@main 88 | with: 89 | environment-file: environment.yml 90 | cache-downloads: true 91 | cache-environment: true 92 | 93 | - name: Install latest nightly 94 | uses: actions-rs/toolchain@v1 95 | with: 96 | toolchain: stable 97 | 98 | - name: Checkout DuckDB to version 99 | if: ${{ matrix.duckdb_version != ''}} 100 | run: | 101 | cd duckdb 102 | git checkout ${{ matrix.duckdb_version }} 103 | 104 | - if: ${{ matrix.arch == 'linux_amd64_gcc4' }} 105 | uses: ./duckdb/.github/actions/centos_7_setup 106 | with: 107 | openssl: 1 108 | 109 | # Build extension 110 | - name: Build extension 111 | shell: bash -el {0} 112 | env: 113 | GEN: ninja 114 | STATIC_LIBCPP: 1 115 | CC: ${{ matrix.arch == 'linux_arm64' && 'aarch64-linux-gnu-gcc' || '' }} 116 | CXX: ${{ matrix.arch == 'linux_arm64' && 'aarch64-linux-gnu-g++' || '' }} 117 | run: | 118 | 
make release 119 | 120 | - name: Test extension 121 | shell: bash -el {0} 122 | continue-on-error: true 123 | run: | 124 | make test 125 | 126 | - uses: actions/upload-artifact@v2 127 | with: 128 | name: ${{matrix.arch}}-extensions 129 | path: | 130 | build/release/extension/exon/exon.duckdb_extension 131 | 132 | - name: Configure AWS Credentials 133 | uses: aws-actions/configure-aws-credentials@v1 134 | with: 135 | aws-region: us-west-2 136 | role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/github-deploy-role 137 | 138 | - name: upload to s3 no gcc 139 | shell: bash -el {0} 140 | if: ${{ matrix.arch == 'linux_amd64' }} 141 | env: 142 | ENVIRONMENT: ${{ env.ENVIRONMENT }} 143 | run: | 144 | python bin/upload-artifacts.py 145 | 146 | - name: upload to s3 gcc4 147 | shell: bash -el {0} 148 | if: ${{ matrix.arch == 'linux_amd64_gcc4' }} 149 | env: 150 | ENVIRONMENT: ${{ env.ENVIRONMENT }} 151 | run: | 152 | python bin/upload-artifacts.py --gcc4 153 | -------------------------------------------------------------------------------- /rust/src/sam_functions.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
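// C ABI surface for SAM record helpers, called from the C++ side of the extension.
// The is_* functions decode a raw SAM FLAG integer with noodles' `Flags` bitflags;
// `parse_cigar` and `extract_from_cigar` return their results as `CResult` /
// `CExtractResponse`, handing string ownership to the caller via `CString::into_raw`.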
14 | 15 | use std::ffi::{c_char, CString}; 16 | 17 | use noodles::sam::record::{cigar::Cigar, Flags}; 18 | 19 | #[no_mangle] 20 | pub extern "C" fn is_segmented(flag: u16) -> bool { 21 | let flag = Flags::from_bits_truncate(flag); 22 | flag.contains(Flags::SEGMENTED) 23 | } 24 | 25 | #[no_mangle] 26 | pub extern "C" fn is_unmapped(flag: u16) -> bool { 27 | let flag = Flags::from_bits_truncate(flag); 28 | flag.contains(Flags::UNMAPPED) 29 | } 30 | 31 | #[no_mangle] 32 | pub extern "C" fn is_properly_aligned(flag: u16) -> bool { 33 | let flag = Flags::from_bits_truncate(flag); 34 | flag.contains(Flags::PROPERLY_ALIGNED) 35 | } 36 | 37 | #[no_mangle] 38 | pub extern "C" fn is_mate_unmapped(flag: u16) -> bool { 39 | let flag = Flags::from_bits_truncate(flag); 40 | flag.contains(Flags::MATE_UNMAPPED) 41 | } 42 | 43 | #[no_mangle] 44 | pub extern "C" fn is_reverse_complemented(flag: u16) -> bool { 45 | let flag = Flags::from_bits_truncate(flag); 46 | flag.contains(Flags::REVERSE_COMPLEMENTED) 47 | } 48 | 49 | #[no_mangle] 50 | pub extern "C" fn is_mate_reverse_complemented(flag: u16) -> bool { 51 | let flag = Flags::from_bits_truncate(flag); 52 | flag.contains(Flags::MATE_REVERSE_COMPLEMENTED) 53 | } 54 | 55 | #[no_mangle] 56 | pub extern "C" fn is_first_segment(flag: u16) -> bool { 57 | let flag = Flags::from_bits_truncate(flag); 58 | 59 | flag.contains(Flags::FIRST_SEGMENT) 60 | } 61 | 62 | #[no_mangle] 63 | pub extern "C" fn is_last_segment(flag: u16) -> bool { 64 | let flag = Flags::from_bits_truncate(flag); 65 | flag.contains(Flags::LAST_SEGMENT) 66 | } 67 | 68 | #[no_mangle] 69 | pub extern "C" fn is_secondary(flag: u16) -> bool { 70 | let flag = Flags::from_bits_truncate(flag); 71 | flag.contains(Flags::SECONDARY) 72 | } 73 | 74 | #[no_mangle] 75 | pub extern "C" fn is_quality_control_failed(flag: u16) -> bool { 76 | let flag = Flags::from_bits_truncate(flag); 77 | flag.contains(Flags::QC_FAIL) 78 | } 79 | 80 | #[no_mangle] 81 | pub extern "C" fn is_duplicate(flag: u16) -> bool { 82 | let flag = Flags::from_bits_truncate(flag); 83 | flag.contains(Flags::DUPLICATE) 84 | } 85 | 86 | #[no_mangle] 87 | pub extern "C" fn is_supplementary(flag: u16) -> bool { 88 | let flag = Flags::from_bits_truncate(flag); 89 | flag.contains(Flags::SUPPLEMENTARY) 90 | } 91 | 92 | #[repr(C)] 93 | pub struct CResult { 94 | value: *const c_char, 95 | error: *const c_char, 96 | } 97 | 98 | impl CResult { 99 | fn new(value: &str) -> Self { 100 | Self { 101 | value: CString::new(value).unwrap().into_raw(), 102 | error: std::ptr::null(), 103 | } 104 | } 105 | 106 | fn error(error: &str) -> Self { 107 | Self { 108 | value: std::ptr::null(), 109 | error: CString::new(error).unwrap().into_raw(), 110 | } 111 | } 112 | } 113 | 114 | #[no_mangle] 115 | pub extern "C" fn parse_cigar(cigar: *const c_char) -> CResult { 116 | let cigar = unsafe { std::ffi::CStr::from_ptr(cigar) }; 117 | let cigar = cigar.to_str().unwrap(); 118 | 119 | let cigar_obj: Cigar = match cigar.parse() { 120 | Ok(cigar) => cigar, 121 | Err(e) => return CResult::error(&e.to_string()), 122 | }; 123 | 124 | let serialized_obj = cigar_obj 125 | .iter() 126 | .map(|op| format!("{}={}", op.kind(), op.len())) 127 | .collect::>() 128 | .join(";"); 129 | 130 | CResult::new(serialized_obj.as_str()) 131 | } 132 | 133 | #[repr(C)] 134 | pub struct CExtractResponse { 135 | sequence_start: usize, 136 | sequence_len: usize, 137 | extracted_sequence: *const c_char, 138 | error: *const c_char, 139 | } 140 | 141 | impl CExtractResponse { 142 | fn new(sequence_start: 
usize, sequence_len: usize, extracted_sequence: &str) -> Self { 143 | Self { 144 | sequence_start, 145 | sequence_len, 146 | extracted_sequence: CString::new(extracted_sequence).unwrap().into_raw(), 147 | error: std::ptr::null(), 148 | } 149 | } 150 | 151 | fn error(error: &str) -> Self { 152 | Self { 153 | sequence_start: 0, 154 | sequence_len: 0, 155 | extracted_sequence: std::ptr::null(), 156 | error: CString::new(error).unwrap().into_raw(), 157 | } 158 | } 159 | } 160 | 161 | #[no_mangle] 162 | pub extern "C" fn extract_from_cigar( 163 | sequence_str: *const c_char, 164 | cigar_str: *const c_char, 165 | ) -> CExtractResponse { 166 | let cigar = unsafe { std::ffi::CStr::from_ptr(cigar_str) }; 167 | let cigar = match cigar.to_str() { 168 | Ok(cigar) => cigar, 169 | Err(e) => return CExtractResponse::error(&e.to_string()), 170 | }; 171 | 172 | let cigar_obj: Cigar = match cigar.parse() { 173 | Ok(cigar) => cigar, 174 | Err(e) => return CExtractResponse::error(&e.to_string()), 175 | }; 176 | 177 | let total_ops = cigar_obj.len(); 178 | let first_ops = cigar_obj[0]; 179 | let last_ops = cigar_obj[total_ops - 1]; 180 | 181 | let sequence = unsafe { std::ffi::CStr::from_ptr(sequence_str) }; 182 | let sequence = match sequence.to_str() { 183 | Ok(sequence) => sequence, 184 | Err(e) => return CExtractResponse::error(&e.to_string()), 185 | }; 186 | 187 | let sequence_start = match first_ops.kind() { 188 | noodles::sam::record::cigar::op::Kind::Insertion => first_ops.len(), 189 | _ => 0, 190 | }; 191 | 192 | let sequence_len = match last_ops.kind() { 193 | noodles::sam::record::cigar::op::Kind::Insertion => sequence.len() - last_ops.len(), 194 | _ => sequence.len(), 195 | }; 196 | 197 | let sequence = &sequence[sequence_start..sequence_len]; 198 | 199 | CExtractResponse::new(sequence_start, sequence_len, sequence) 200 | } 201 | -------------------------------------------------------------------------------- /rust/src/arrow_reader.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
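// FFI entry points backing the DuckDB table functions. `new_reader` builds a Tokio
// runtime plus an Exon/DataFusion session, infers the compression type from the file
// extension when none is supplied, registers the URI as `exon_table`, optionally appends
// a WHERE clause built from `filters`, and exposes the result through the Arrow C
// stream interface. `replacement_scan` maps a path's extension (skipping a trailing
// compression suffix) to an Exon file type so bare file paths can be scanned directly.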
14 | 15 | use std::{ 16 | ffi::{c_char, CStr, CString}, 17 | ptr::null, 18 | str::FromStr, 19 | sync::Arc, 20 | }; 21 | 22 | use arrow::ffi_stream::FFI_ArrowArrayStream as ArrowArrayStream; 23 | use datafusion::{ 24 | datasource::file_format::file_type::FileCompressionType, prelude::SessionContext, 25 | }; 26 | use exon::{ 27 | datasources::{ExonFileType, ExonReadOptions}, 28 | ffi::create_dataset_stream_from_table_provider, 29 | new_exon_config, ExonRuntimeEnvExt, ExonSessionExt, 30 | }; 31 | use tokio::runtime::Runtime; 32 | 33 | #[repr(C)] 34 | pub struct ReaderResult { 35 | error: *const c_char, 36 | } 37 | 38 | #[no_mangle] 39 | pub unsafe extern "C" fn new_reader( 40 | stream_ptr: *mut ArrowArrayStream, 41 | uri: *const c_char, 42 | batch_size: usize, 43 | compression: *const c_char, 44 | file_format: *const c_char, 45 | filters: *const c_char, 46 | ) -> ReaderResult { 47 | let uri = match CStr::from_ptr(uri).to_str() { 48 | Ok(uri) => uri, 49 | Err(e) => { 50 | let error = CString::new(format!("could not parse uri: {}", e)).unwrap(); 51 | return ReaderResult { 52 | error: error.into_raw(), 53 | }; 54 | } 55 | }; 56 | 57 | let rt = Arc::new(Runtime::new().unwrap()); 58 | 59 | // if compression is null, try to infer from file extension 60 | let compression_type = if compression.is_null() { 61 | let extension = match uri.split('.').last() { 62 | Some(extension) => extension, 63 | None => { 64 | let error = CString::new("could not parse extension").unwrap(); 65 | return ReaderResult { 66 | error: error.into_raw(), 67 | }; 68 | } 69 | }; 70 | 71 | match extension { 72 | "gz" => FileCompressionType::GZIP, 73 | "zst" => FileCompressionType::ZSTD, 74 | _ => FileCompressionType::UNCOMPRESSED, 75 | } 76 | } else { 77 | let compression = match CStr::from_ptr(compression).to_str() { 78 | Ok(compression) => compression, 79 | Err(e) => { 80 | let error = CString::new(format!("could not parse compression: {}", e)).unwrap(); 81 | return ReaderResult { 82 | error: error.into_raw(), 83 | }; 84 | } 85 | }; 86 | 87 | let compression = 88 | FileCompressionType::from_str(compression).unwrap_or(FileCompressionType::UNCOMPRESSED); 89 | 90 | compression 91 | }; 92 | 93 | let file_type = CStr::from_ptr(file_format).to_str().unwrap(); 94 | let file_type = match ExonFileType::from_str(file_type) { 95 | Ok(file_type) => file_type, 96 | Err(_) => { 97 | let error = CString::new(format!("could not parse file_format {}", file_type)).unwrap(); 98 | return ReaderResult { 99 | error: error.into_raw(), 100 | }; 101 | } 102 | }; 103 | 104 | let config = new_exon_config().with_batch_size(batch_size); 105 | let ctx = SessionContext::with_config_exon(config); 106 | 107 | rt.block_on(async { 108 | if let Err(e) = ctx.runtime_env().exon_register_object_store_uri(uri).await { 109 | return ReaderResult { 110 | error: CString::new(format!("could not register object store: {}", e)) 111 | .unwrap() 112 | .into_raw(), 113 | }; 114 | } 115 | 116 | let options = ExonReadOptions::new(file_type).with_compression(compression_type); 117 | 118 | if let Err(e) = ctx.register_exon_table("exon_table", uri, options).await { 119 | let error = CString::new(format!("could not register table: {}", e)).unwrap(); 120 | return ReaderResult { 121 | error: error.into_raw(), 122 | }; 123 | } 124 | 125 | let mut select_string = format!("SELECT * FROM exon_table"); 126 | 127 | if !filters.is_null() { 128 | let filters_str = match CStr::from_ptr(filters).to_str() { 129 | Ok(filters_str) => filters_str, 130 | Err(e) => { 131 | let error = 
CString::new(format!("could not parse filters: {}", e)).unwrap(); 132 | return ReaderResult { 133 | error: error.into_raw(), 134 | }; 135 | } 136 | }; 137 | 138 | if filters_str != "" { 139 | select_string.push_str(format!(" WHERE {}", filters_str).as_str()); 140 | } 141 | } 142 | 143 | let df = match ctx.sql(&select_string).await { 144 | Ok(df) => df, 145 | Err(e) => { 146 | let error = CString::new(format!("could not execute sql: {}", e)).unwrap(); 147 | return ReaderResult { 148 | error: error.into_raw(), 149 | }; 150 | } 151 | }; 152 | 153 | match create_dataset_stream_from_table_provider(df, rt.clone(), stream_ptr).await { 154 | Ok(_) => ReaderResult { 155 | error: std::ptr::null(), 156 | }, 157 | Err(e) => { 158 | let error = 159 | CString::new(format!("could not create dataset stream: {}", e)).unwrap(); 160 | return ReaderResult { 161 | error: error.into_raw(), 162 | }; 163 | } 164 | } 165 | }) 166 | } 167 | 168 | #[repr(C)] 169 | pub struct ReplacementScanResult { 170 | file_type: *const c_char, 171 | } 172 | 173 | #[no_mangle] 174 | pub unsafe extern "C" fn replacement_scan(uri: *const c_char) -> ReplacementScanResult { 175 | let uri = CStr::from_ptr(uri).to_str().unwrap(); 176 | let mut exts = uri.rsplit('.'); 177 | let mut splitted = exts.next().unwrap_or(""); 178 | 179 | let file_compression_type = 180 | FileCompressionType::from_str(splitted).unwrap_or(FileCompressionType::UNCOMPRESSED); 181 | 182 | if file_compression_type.is_compressed() { 183 | splitted = exts.next().unwrap_or(""); 184 | } 185 | 186 | match ExonFileType::from_str(splitted) { 187 | Ok(file_type) => { 188 | let ft_string = file_type.to_string(); 189 | return ReplacementScanResult { 190 | file_type: CString::new(ft_string).unwrap().into_raw(), 191 | }; 192 | } 193 | Err(_) => { 194 | return ReplacementScanResult { file_type: null() }; 195 | } 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /exon/src/exon/bam_query_function/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
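// Table function `bam_query(file, region)`: TableBind calls the Rust `bam_query_reader`
// to obtain an Arrow C stream and derives the DuckDB return types from its schema;
// InitGlobal re-opens the stream for scanning, and Scan converts each Arrow chunk into a
// DataChunk via ArrowToDuckDB.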
14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "exon/arrow_table_function/module.hpp" 24 | #include "exon/bam_query_function/module.hpp" 25 | #include "rust.hpp" 26 | 27 | namespace exon 28 | { 29 | struct BAMQueryScanFunctionData : public TableFunctionData 30 | { 31 | string file_name; 32 | string query; 33 | 34 | unordered_map> arrow_convert_data; 35 | idx_t max_threads = 6; 36 | 37 | vector all_names; 38 | 39 | atomic lines_read; 40 | }; 41 | 42 | duckdb::unique_ptr BAMQueryTableFunction::TableBind(ClientContext &context, 43 | TableFunctionBindInput &input, 44 | vector &return_types, 45 | vector &names) 46 | { 47 | auto result = make_uniq(); 48 | 49 | auto file_name = input.inputs[0].GetValue(); 50 | auto query = input.inputs[1].GetValue(); 51 | 52 | struct ArrowArrayStream stream; 53 | auto vector_size = STANDARD_VECTOR_SIZE; 54 | 55 | // BAMQueryReaderResult bam_query_reader_result 56 | auto bam_query_reader_result = bam_query_reader(&stream, file_name.c_str(), query.c_str(), vector_size); 57 | 58 | if (bam_query_reader_result.error != NULL) 59 | { 60 | throw std::runtime_error(bam_query_reader_result.error); 61 | } 62 | 63 | struct ArrowSchema arrow_schema; 64 | 65 | if (stream.get_schema(&stream, &arrow_schema) != 0) 66 | { 67 | if (stream.release) 68 | { 69 | stream.release(&stream); 70 | } 71 | throw std::runtime_error("Failed to get schema"); 72 | } 73 | 74 | result->all_names.reserve(arrow_schema.n_children); 75 | 76 | auto n_children = arrow_schema.n_children; 77 | for (idx_t col_idx = 0; col_idx < n_children; col_idx++) 78 | { 79 | auto &schema = *arrow_schema.children[col_idx]; 80 | 81 | if (!schema.release) 82 | { 83 | throw InvalidInputException("arrow_scan: released schema passed"); 84 | } 85 | 86 | // TODO: handle dictionary 87 | return_types.emplace_back(GetArrowLogicalType(schema, result->arrow_convert_data, col_idx)); 88 | 89 | auto format = string(schema.format); 90 | auto name = string(schema.name); 91 | if (name.empty()) 92 | { 93 | name = string("v") + to_string(col_idx); 94 | } 95 | names.push_back(name); 96 | 97 | result->all_names.push_back(name); 98 | } 99 | 100 | RenameArrowColumns(names); 101 | 102 | result->file_name = file_name; 103 | result->query = query; 104 | 105 | return std::move(result); 106 | }; 107 | 108 | unique_ptr BAMQueryTableFunction::InitGlobal(ClientContext &context, 109 | TableFunctionInitInput &input) 110 | { 111 | auto &data = (BAMQueryScanFunctionData &)*input.bind_data; 112 | 113 | auto global_state = make_uniq(); 114 | 115 | struct ArrowArrayStream stream; 116 | 117 | auto file_name = data.file_name; 118 | auto query = data.query; 119 | auto vector_size = STANDARD_VECTOR_SIZE; 120 | 121 | auto bam_query_reader_result = bam_query_reader(&stream, file_name.c_str(), query.c_str(), vector_size); 122 | if (bam_query_reader_result.error != NULL) 123 | { 124 | throw std::runtime_error(bam_query_reader_result.error); 125 | } 126 | 127 | global_state->stream = make_uniq(); 128 | global_state->stream->arrow_array_stream = std::move(stream); 129 | 130 | return std::move(global_state); 131 | } 132 | 133 | void BAMQueryTableFunction::Scan(ClientContext &context, TableFunctionInput &input, DataChunk &output) 134 | { 135 | if (!input.local_state) 136 | { 137 | return; 138 | } 139 | auto &data = (BAMQueryScanFunctionData &)*input.bind_data; 140 | auto &state = (ArrowScanLocalState &)*input.local_state; 141 | auto &global_state = (ArrowScanGlobalState &)*input.global_state; 142 | 143 | 
//! Out of tuples in this chunk 144 | if (state.chunk_offset >= (idx_t)state.chunk->arrow_array.length) 145 | { 146 | if (!ArrowScanParallelStateNext(context, input.bind_data.get(), state, global_state)) 147 | { 148 | return; 149 | } 150 | } 151 | auto output_size = MinValue(STANDARD_VECTOR_SIZE, state.chunk->arrow_array.length - state.chunk_offset); 152 | data.lines_read += output_size; 153 | 154 | if (global_state.CanRemoveFilterColumns()) 155 | { 156 | state.all_columns.Reset(); 157 | state.all_columns.SetCardinality(output_size); 158 | ArrowToDuckDB(state, data.arrow_convert_data, state.all_columns, data.lines_read - output_size, false); 159 | output.ReferenceColumns(state.all_columns, global_state.projection_ids); 160 | } 161 | else 162 | { 163 | output.SetCardinality(output_size); 164 | 165 | ArrowToDuckDB(state, data.arrow_convert_data, output, data.lines_read - output_size, false); 166 | } 167 | 168 | output.Verify(); 169 | state.chunk_offset += output.size(); 170 | } 171 | 172 | void BAMQueryTableFunction::Register(duckdb::ClientContext &context) 173 | { 174 | TableFunction scan; 175 | scan = TableFunction("bam_query", {LogicalType::VARCHAR, LogicalType::VARCHAR}, 176 | BAMQueryTableFunction::Scan, 177 | BAMQueryTableFunction::TableBind, 178 | BAMQueryTableFunction::InitGlobal, 179 | ArrowTableFunction::ArrowScanInitLocal); 180 | 181 | scan.cardinality = ArrowTableFunction::ArrowScanCardinality; 182 | scan.get_batch_index = ArrowTableFunction::ArrowGetBatchIndex; 183 | 184 | scan.projection_pushdown = true; 185 | scan.filter_pushdown = true; 186 | 187 | auto &catalog = Catalog::GetSystemCatalog(context); 188 | 189 | CreateTableFunctionInfo info(scan); 190 | 191 | catalog.CreateTableFunction(context, &info); 192 | }; 193 | 194 | } 195 | -------------------------------------------------------------------------------- /exon/src/exon/bcf_query_function/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
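// BCF counterpart of the BAM query table function above: same bind/init/scan flow,
// registered as `bcf_query` and backed by the Rust `bcf_query_reader`.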
14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "exon/arrow_table_function/module.hpp" 24 | #include "exon/bcf_query_function/module.hpp" 25 | #include "rust.hpp" 26 | 27 | namespace exon 28 | { 29 | struct BCFQueryScanFunctionData : public TableFunctionData 30 | { 31 | string file_name; 32 | string query; 33 | 34 | unordered_map> arrow_convert_data; 35 | idx_t max_threads = 6; 36 | 37 | vector all_names; 38 | 39 | atomic lines_read; 40 | }; 41 | 42 | duckdb::unique_ptr BCFQueryTableFunction::TableBind(ClientContext &context, 43 | TableFunctionBindInput &input, 44 | vector &return_types, 45 | vector &names) 46 | { 47 | auto result = make_uniq(); 48 | 49 | auto file_name = input.inputs[0].GetValue(); 50 | auto query = input.inputs[1].GetValue(); 51 | 52 | struct ArrowArrayStream stream; 53 | auto vector_size = STANDARD_VECTOR_SIZE; 54 | 55 | // BCFQueryReaderResult bcf_query_reader_result 56 | auto bcf_query_reader_result = bcf_query_reader(&stream, file_name.c_str(), query.c_str(), vector_size); 57 | 58 | if (bcf_query_reader_result.error != NULL) 59 | { 60 | throw std::runtime_error(bcf_query_reader_result.error); 61 | } 62 | 63 | struct ArrowSchema arrow_schema; 64 | 65 | if (stream.get_schema(&stream, &arrow_schema) != 0) 66 | { 67 | if (stream.release) 68 | { 69 | stream.release(&stream); 70 | } 71 | throw std::runtime_error("Failed to get schema"); 72 | } 73 | 74 | result->all_names.reserve(arrow_schema.n_children); 75 | 76 | auto n_children = arrow_schema.n_children; 77 | for (idx_t col_idx = 0; col_idx < n_children; col_idx++) 78 | { 79 | auto &schema = *arrow_schema.children[col_idx]; 80 | 81 | if (!schema.release) 82 | { 83 | throw InvalidInputException("arrow_scan: released schema passed"); 84 | } 85 | 86 | // TODO: handle dictionary 87 | return_types.emplace_back(GetArrowLogicalType(schema, result->arrow_convert_data, col_idx)); 88 | 89 | auto format = string(schema.format); 90 | auto name = string(schema.name); 91 | if (name.empty()) 92 | { 93 | name = string("v") + to_string(col_idx); 94 | } 95 | names.push_back(name); 96 | 97 | result->all_names.push_back(name); 98 | } 99 | 100 | RenameArrowColumns(names); 101 | 102 | result->file_name = file_name; 103 | result->query = query; 104 | 105 | return std::move(result); 106 | }; 107 | 108 | unique_ptr BCFQueryTableFunction::InitGlobal(ClientContext &context, 109 | TableFunctionInitInput &input) 110 | { 111 | auto &data = (BCFQueryScanFunctionData &)*input.bind_data; 112 | 113 | auto global_state = make_uniq(); 114 | 115 | struct ArrowArrayStream stream; 116 | 117 | auto file_name = data.file_name; 118 | auto query = data.query; 119 | auto vector_size = STANDARD_VECTOR_SIZE; 120 | 121 | auto bcf_query_reader_result = bcf_query_reader(&stream, file_name.c_str(), query.c_str(), vector_size); 122 | if (bcf_query_reader_result.error != NULL) 123 | { 124 | throw std::runtime_error(bcf_query_reader_result.error); 125 | } 126 | 127 | global_state->stream = make_uniq(); 128 | global_state->stream->arrow_array_stream = std::move(stream); 129 | 130 | return std::move(global_state); 131 | } 132 | 133 | void BCFQueryTableFunction::Scan(ClientContext &context, TableFunctionInput &input, DataChunk &output) 134 | { 135 | if (!input.local_state) 136 | { 137 | return; 138 | } 139 | auto &data = (BCFQueryScanFunctionData &)*input.bind_data; 140 | auto &state = (ArrowScanLocalState &)*input.local_state; 141 | auto &global_state = (ArrowScanGlobalState &)*input.global_state; 142 | 143 | 
//! Out of tuples in this chunk 144 | if (state.chunk_offset >= (idx_t)state.chunk->arrow_array.length) 145 | { 146 | if (!ArrowScanParallelStateNext(context, input.bind_data.get(), state, global_state)) 147 | { 148 | return; 149 | } 150 | } 151 | auto output_size = MinValue(STANDARD_VECTOR_SIZE, state.chunk->arrow_array.length - state.chunk_offset); 152 | data.lines_read += output_size; 153 | 154 | if (global_state.CanRemoveFilterColumns()) 155 | { 156 | state.all_columns.Reset(); 157 | state.all_columns.SetCardinality(output_size); 158 | ArrowToDuckDB(state, data.arrow_convert_data, state.all_columns, data.lines_read - output_size, false); 159 | output.ReferenceColumns(state.all_columns, global_state.projection_ids); 160 | } 161 | else 162 | { 163 | output.SetCardinality(output_size); 164 | 165 | ArrowToDuckDB(state, data.arrow_convert_data, output, data.lines_read - output_size, false); 166 | } 167 | 168 | output.Verify(); 169 | state.chunk_offset += output.size(); 170 | } 171 | 172 | void BCFQueryTableFunction::Register(duckdb::ClientContext &context) 173 | { 174 | 175 | TableFunction scan; 176 | scan = TableFunction("bcf_query", {LogicalType::VARCHAR, LogicalType::VARCHAR}, 177 | BCFQueryTableFunction::Scan, 178 | BCFQueryTableFunction::TableBind, 179 | BCFQueryTableFunction::InitGlobal, 180 | ArrowTableFunction::ArrowScanInitLocal); 181 | 182 | scan.cardinality = ArrowTableFunction::ArrowScanCardinality; 183 | scan.get_batch_index = ArrowTableFunction::ArrowGetBatchIndex; 184 | 185 | scan.projection_pushdown = true; 186 | scan.filter_pushdown = true; 187 | 188 | auto &catalog = Catalog::GetSystemCatalog(context); 189 | 190 | CreateTableFunctionInfo info(scan); 191 | 192 | catalog.CreateTableFunction(context, &info); 193 | }; 194 | 195 | } 196 | -------------------------------------------------------------------------------- /exon/src/exon/vcf_query_function/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
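// VCF counterpart of the BAM/BCF query table functions: same flow, registered as
// `vcf_query` and backed by the Rust `vcf_query_reader`.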
14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "exon/arrow_table_function/module.hpp" 24 | #include "exon/vcf_query_function/module.hpp" 25 | #include "rust.hpp" 26 | 27 | namespace exon 28 | { 29 | struct VCFQueryScanFunctionData : public TableFunctionData 30 | { 31 | string file_name; 32 | string query; 33 | 34 | unordered_map> arrow_convert_data; 35 | idx_t max_threads = 6; 36 | 37 | vector all_names; 38 | 39 | atomic lines_read; 40 | }; 41 | 42 | duckdb::unique_ptr VCFQueryTableFunction::TableBind(ClientContext &context, 43 | TableFunctionBindInput &input, 44 | vector &return_types, 45 | vector &names) 46 | { 47 | auto result = make_uniq(); 48 | 49 | auto file_name = input.inputs[0].GetValue(); 50 | auto query = input.inputs[1].GetValue(); 51 | 52 | struct ArrowArrayStream stream; 53 | auto vector_size = STANDARD_VECTOR_SIZE; 54 | 55 | // VCFQueryReaderResult vcf_query_reader_result 56 | auto vcf_query_reader_result = vcf_query_reader(&stream, file_name.c_str(), query.c_str(), vector_size); 57 | 58 | if (vcf_query_reader_result.error != NULL) 59 | { 60 | throw std::runtime_error(vcf_query_reader_result.error); 61 | } 62 | 63 | struct ArrowSchema arrow_schema; 64 | 65 | if (stream.get_schema(&stream, &arrow_schema) != 0) 66 | { 67 | if (stream.release) 68 | { 69 | stream.release(&stream); 70 | } 71 | throw std::runtime_error("Failed to get schema"); 72 | } 73 | 74 | result->all_names.reserve(arrow_schema.n_children); 75 | 76 | auto n_children = arrow_schema.n_children; 77 | for (idx_t col_idx = 0; col_idx < n_children; col_idx++) 78 | { 79 | auto &schema = *arrow_schema.children[col_idx]; 80 | 81 | if (!schema.release) 82 | { 83 | throw InvalidInputException("arrow_scan: released schema passed"); 84 | } 85 | 86 | // TODO: handle dictionary 87 | return_types.emplace_back(GetArrowLogicalType(schema, result->arrow_convert_data, col_idx)); 88 | 89 | auto format = string(schema.format); 90 | auto name = string(schema.name); 91 | if (name.empty()) 92 | { 93 | name = string("v") + to_string(col_idx); 94 | } 95 | names.push_back(name); 96 | 97 | result->all_names.push_back(name); 98 | } 99 | 100 | RenameArrowColumns(names); 101 | 102 | result->file_name = file_name; 103 | result->query = query; 104 | 105 | return std::move(result); 106 | }; 107 | 108 | unique_ptr VCFQueryTableFunction::InitGlobal(ClientContext &context, 109 | TableFunctionInitInput &input) 110 | { 111 | auto &data = (VCFQueryScanFunctionData &)*input.bind_data; 112 | 113 | auto global_state = make_uniq(); 114 | 115 | struct ArrowArrayStream stream; 116 | 117 | auto file_name = data.file_name; 118 | auto query = data.query; 119 | auto vector_size = STANDARD_VECTOR_SIZE; 120 | 121 | auto vcf_query_reader_result = vcf_query_reader(&stream, file_name.c_str(), query.c_str(), vector_size); 122 | if (vcf_query_reader_result.error != NULL) 123 | { 124 | throw std::runtime_error(vcf_query_reader_result.error); 125 | } 126 | 127 | global_state->stream = make_uniq(); 128 | global_state->stream->arrow_array_stream = std::move(stream); 129 | 130 | return std::move(global_state); 131 | } 132 | 133 | void VCFQueryTableFunction::Scan(ClientContext &context, TableFunctionInput &input, DataChunk &output) 134 | { 135 | if (!input.local_state) 136 | { 137 | return; 138 | } 139 | auto &data = (VCFQueryScanFunctionData &)*input.bind_data; 140 | auto &state = (ArrowScanLocalState &)*input.local_state; 141 | auto &global_state = (ArrowScanGlobalState &)*input.global_state; 142 | 143 | 
//! Out of tuples in this chunk 144 | if (state.chunk_offset >= (idx_t)state.chunk->arrow_array.length) 145 | { 146 | if (!ArrowScanParallelStateNext(context, input.bind_data.get(), state, global_state)) 147 | { 148 | return; 149 | } 150 | } 151 | auto output_size = MinValue(STANDARD_VECTOR_SIZE, state.chunk->arrow_array.length - state.chunk_offset); 152 | data.lines_read += output_size; 153 | 154 | if (global_state.CanRemoveFilterColumns()) 155 | { 156 | state.all_columns.Reset(); 157 | state.all_columns.SetCardinality(output_size); 158 | ArrowToDuckDB(state, data.arrow_convert_data, state.all_columns, data.lines_read - output_size, false); 159 | output.ReferenceColumns(state.all_columns, global_state.projection_ids); 160 | } 161 | else 162 | { 163 | output.SetCardinality(output_size); 164 | 165 | ArrowToDuckDB(state, data.arrow_convert_data, output, data.lines_read - output_size, false); 166 | } 167 | 168 | output.Verify(); 169 | state.chunk_offset += output.size(); 170 | } 171 | 172 | void VCFQueryTableFunction::Register(duckdb::ClientContext &context) 173 | { 174 | 175 | TableFunction scan; 176 | scan = TableFunction("vcf_query", {LogicalType::VARCHAR, LogicalType::VARCHAR}, 177 | VCFQueryTableFunction::Scan, 178 | VCFQueryTableFunction::TableBind, 179 | VCFQueryTableFunction::InitGlobal, 180 | ArrowTableFunction::ArrowScanInitLocal); 181 | 182 | scan.cardinality = ArrowTableFunction::ArrowScanCardinality; 183 | scan.get_batch_index = ArrowTableFunction::ArrowGetBatchIndex; 184 | 185 | scan.projection_pushdown = true; 186 | scan.filter_pushdown = true; 187 | 188 | auto &catalog = Catalog::GetSystemCatalog(context); 189 | 190 | CreateTableFunctionInfo info(scan); 191 | 192 | catalog.CreateTableFunction(context, &info); 193 | }; 194 | 195 | } 196 | -------------------------------------------------------------------------------- /exon/src/exon/sam_functions/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
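// Scalar SQL functions over SAM fields: ParseCIGARString and ExtractSequence wrap the
// Rust `parse_cigar` and `extract_from_cigar` helpers, and GetSamFunctions() emits one
// INTEGER -> BOOLEAN flag predicate (is_segmented, is_unmapped, ...) per entry in the
// sam_functions list below.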
14 | 15 | #include "exon/sam_functions/module.hpp" 16 | 17 | #include "rust.hpp" 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | namespace exon 30 | { 31 | 32 | void ParseCIGARString(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result) 33 | { 34 | result.SetVectorType(duckdb::VectorType::FLAT_VECTOR); 35 | 36 | for (duckdb::idx_t i = 0; i < args.size(); i++) 37 | { 38 | auto string_value = args.data[0].GetValue(i); 39 | auto ss = string_value.ToString(); 40 | 41 | CResult cigar = parse_cigar(ss.c_str()); 42 | if (cigar.error) 43 | { 44 | throw std::runtime_error("Invalid CIGAR string: " + ss); 45 | } 46 | 47 | auto ops = duckdb::StringUtil::Split(cigar.value, ';'); 48 | 49 | duckdb::vector op_values; 50 | 51 | for (auto op : ops) 52 | { 53 | duckdb::child_list_t struct_values; 54 | auto op_parts = duckdb::StringUtil::Split(op, '='); 55 | 56 | if (op_parts.size() != 2) 57 | { 58 | throw std::runtime_error("Invalid CIGAR string"); 59 | } 60 | 61 | auto op_type = op_parts[0]; 62 | auto op_length = op_parts[1]; 63 | 64 | auto op_type_value = duckdb::Value(op_type); 65 | auto op_length_value = duckdb::Value::INTEGER(std::atoi(op_length.c_str())); 66 | 67 | struct_values.push_back(std::make_pair("op", op_type_value)); 68 | struct_values.push_back(std::make_pair("len", op_length_value)); 69 | 70 | op_values.push_back(duckdb::Value::STRUCT(struct_values)); 71 | } 72 | 73 | result.SetValue(i, duckdb::Value::LIST(op_values)); 74 | } 75 | } 76 | 77 | void ExtractSequence(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result) 78 | { 79 | for (duckdb::idx_t i = 0; i < args.size(); i++) 80 | { 81 | auto sequence = args.data[0].GetValue(i).ToString(); 82 | auto cigar = args.data[1].GetValue(i).ToString(); 83 | 84 | auto extract_result = extract_from_cigar(sequence.c_str(), cigar.c_str()); 85 | if (extract_result.error) 86 | { 87 | throw std::runtime_error("Invalid CIGAR string"); 88 | } 89 | 90 | duckdb::child_list_t struct_values; 91 | struct_values.push_back(std::make_pair("sequence_start", duckdb::Value::INTEGER(extract_result.sequence_start))); 92 | struct_values.push_back(std::make_pair("sequence_end", duckdb::Value::INTEGER(extract_result.sequence_len))); 93 | struct_values.push_back(std::make_pair("sequence", duckdb::Value(extract_result.extracted_sequence))); 94 | 95 | auto struct_value = duckdb::Value::STRUCT(struct_values); 96 | 97 | result.SetValue(i, struct_value); 98 | } 99 | } 100 | 101 | duckdb::unique_ptr SamFunctions::GetExtractFromCIGARFunction() 102 | { 103 | duckdb::ScalarFunctionSet set("extract_from_cigar"); 104 | 105 | duckdb::child_list_t struct_children; 106 | struct_children.push_back(std::make_pair("sequence_start", duckdb::LogicalType::INTEGER)); 107 | struct_children.push_back(std::make_pair("sequence_end", duckdb::LogicalType::INTEGER)); 108 | struct_children.push_back(std::make_pair("sequence", duckdb::LogicalType::VARCHAR)); 109 | 110 | auto record_type = duckdb::LogicalType::STRUCT(std::move(struct_children)); 111 | 112 | set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR, duckdb::LogicalType::VARCHAR}, record_type, ExtractSequence)); 113 | 114 | return duckdb::make_uniq(set); 115 | } 116 | 117 | duckdb::unique_ptr SamFunctions::GetParseCIGARStringFunction() 118 | { 119 | duckdb::ScalarFunctionSet set("parse_cigar"); 120 | 121 | duckdb::child_list_t struct_children; 122 | struct_children.push_back(std::make_pair("op", 
duckdb::LogicalType::VARCHAR)); 123 | struct_children.push_back(std::make_pair("len", duckdb::LogicalType::INTEGER)); 124 | 125 | auto record_type = duckdb::LogicalType::STRUCT(std::move(struct_children)); 126 | auto row_type = duckdb::LogicalType::LIST(std::move(record_type)); 127 | 128 | set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, row_type, ParseCIGARString)); 129 | 130 | return duckdb::make_uniq(set); 131 | } 132 | 133 | std::vector> SamFunctions::GetSamFunctions() 134 | { 135 | 136 | struct SamFunction 137 | { 138 | std::string name; 139 | std::function func; 140 | }; 141 | 142 | std::vector sam_functions = { 143 | {"is_segmented", is_segmented}, 144 | {"is_unmapped", is_unmapped}, 145 | {"is_properly_aligned", is_properly_aligned}, 146 | {"is_mate_unmapped", is_mate_unmapped}, 147 | {"is_reverse_complemented", is_reverse_complemented}, 148 | {"is_mate_reverse_complemented", is_mate_reverse_complemented}, 149 | {"is_first_segment", is_first_segment}, 150 | {"is_last_segment", is_last_segment}, 151 | {"is_secondary", is_secondary}, 152 | {"is_quality_control_failed", is_quality_control_failed}, 153 | {"is_duplicate", is_duplicate}, 154 | {"is_supplementary", is_supplementary}}; 155 | 156 | std::vector> sam_scalar_functions; 157 | 158 | for (auto &sam_function : sam_functions) 159 | { 160 | duckdb::ScalarFunctionSet set(sam_function.name); 161 | 162 | auto duckdb_function = [sam_function](duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result) 163 | { 164 | result.SetVectorType(duckdb::VectorType::FLAT_VECTOR); 165 | for (duckdb::idx_t i = 0; i < args.size(); i++) 166 | { 167 | auto value = args.data[0].GetValue(i); 168 | auto int_value = duckdb::IntegerValue::Get(value); 169 | 170 | auto bool_value = sam_function.func(int_value); 171 | 172 | result.SetValue(i, duckdb::Value::BOOLEAN(bool_value)); 173 | } 174 | }; 175 | 176 | set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::INTEGER}, duckdb::LogicalType::BOOLEAN, duckdb_function)); 177 | 178 | sam_scalar_functions.emplace_back(duckdb::make_uniq(set)); 179 | } 180 | 181 | return sam_scalar_functions; 182 | } 183 | 184 | } -------------------------------------------------------------------------------- /.github/Duckdb+Exon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 41 | 47 | 58 | Exon& 153 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .idea 3 | cmake-build-debug 4 | duckdb_unittest_tempdir/ 5 | .DS_Store 6 | testext 7 | test/python/__pycache__/ 8 | .Rhistory 9 | rust/target 10 | # Created by https://www.toptal.com/developers/gitignore/api/python 11 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 12 | 13 | ### Python ### 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | cover/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | db.sqlite3 75 | db.sqlite3-journal 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | .pybuilder/ 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | # For a library or package, you might want to ignore these files since the code is 100 | # intended to run in multiple environments; otherwise, check them in: 101 | # .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # poetry 111 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 112 | # This is especially recommended for binary packages to ensure reproducibility, and is more 113 | # commonly ignored for libraries. 114 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 115 | #poetry.lock 116 | 117 | # pdm 118 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 119 | #pdm.lock 120 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 121 | # in version control. 122 | # https://pdm.fming.dev/#use-with-ide 123 | .pdm.toml 124 | 125 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 126 | __pypackages__/ 127 | 128 | # Celery stuff 129 | celerybeat-schedule 130 | celerybeat.pid 131 | 132 | # SageMath parsed files 133 | *.sage.py 134 | 135 | # Environments 136 | .env 137 | .venv 138 | env/ 139 | venv/ 140 | ENV/ 141 | env.bak/ 142 | venv.bak/ 143 | 144 | # Spyder project settings 145 | .spyderproject 146 | .spyproject 147 | 148 | # Rope project settings 149 | .ropeproject 150 | 151 | # mkdocs documentation 152 | /site 153 | 154 | # mypy 155 | .mypy_cache/ 156 | .dmypy.json 157 | dmypy.json 158 | 159 | # Pyre type checker 160 | .pyre/ 161 | 162 | # pytype static type analyzer 163 | .pytype/ 164 | 165 | # Cython debug symbols 166 | cython_debug/ 167 | 168 | # PyCharm 169 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 170 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 171 | # and can be added to the global gitignore or merged into this file. For a more nuclear 172 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
173 | #.idea/ 174 | 175 | ### Python Patch ### 176 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 177 | poetry.toml 178 | 179 | # ruff 180 | .ruff_cache/ 181 | 182 | # End of https://www.toptal.com/developers/gitignore/api/python 183 | __pycache__ 184 | # Created by https://www.toptal.com/developers/gitignore/api/javascript 185 | # Edit at https://www.toptal.com/developers/gitignore?templates=javascript 186 | 187 | #!! ERROR: javascript is undefined. Use list command to see defined gitignore types !!# 188 | 189 | # End of https://www.toptal.com/developers/gitignore/api/javascript 190 | n# Created by https://www.toptal.com/developers/gitignore/api/js 191 | # Edit at https://www.toptal.com/developers/gitignore?templates=js 192 | 193 | #!! ERROR: js is undefined. Use list command to see defined gitignore types !!# 194 | 195 | # End of https://www.toptal.com/developers/gitignore/api/js 196 | n# Created by https://www.toptal.com/developers/gitignore/api/node 197 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 198 | 199 | ### Node ### 200 | # Logs 201 | logs 202 | *.log 203 | npm-debug.log* 204 | yarn-debug.log* 205 | yarn-error.log* 206 | lerna-debug.log* 207 | .pnpm-debug.log* 208 | 209 | # Diagnostic reports (https://nodejs.org/api/report.html) 210 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 211 | 212 | # Runtime data 213 | pids 214 | *.pid 215 | *.seed 216 | *.pid.lock 217 | 218 | # Directory for instrumented libs generated by jscoverage/JSCover 219 | lib-cov 220 | 221 | # Coverage directory used by tools like istanbul 222 | coverage 223 | *.lcov 224 | 225 | # nyc test coverage 226 | .nyc_output 227 | 228 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 229 | .grunt 230 | 231 | # Bower dependency directory (https://bower.io/) 232 | bower_components 233 | 234 | # node-waf configuration 235 | .lock-wscript 236 | 237 | # Compiled binary addons (https://nodejs.org/api/addons.html) 238 | build/Release 239 | 240 | # Dependency directories 241 | node_modules/ 242 | jspm_packages/ 243 | 244 | # Snowpack dependency directory (https://snowpack.dev/) 245 | web_modules/ 246 | 247 | # TypeScript cache 248 | *.tsbuildinfo 249 | 250 | # Optional npm cache directory 251 | .npm 252 | 253 | # Optional eslint cache 254 | .eslintcache 255 | 256 | # Optional stylelint cache 257 | .stylelintcache 258 | 259 | # Microbundle cache 260 | .rpt2_cache/ 261 | .rts2_cache_cjs/ 262 | .rts2_cache_es/ 263 | .rts2_cache_umd/ 264 | 265 | # Optional REPL history 266 | .node_repl_history 267 | 268 | # Output of 'npm pack' 269 | *.tgz 270 | 271 | # Yarn Integrity file 272 | .yarn-integrity 273 | 274 | # dotenv environment variable files 275 | .env 276 | .env.development.local 277 | .env.test.local 278 | .env.production.local 279 | .env.local 280 | 281 | # parcel-bundler cache (https://parceljs.org/) 282 | .cache 283 | .parcel-cache 284 | 285 | # Next.js build output 286 | .next 287 | out 288 | 289 | # Nuxt.js build / generate output 290 | .nuxt 291 | dist 292 | 293 | # Gatsby files 294 | .cache/ 295 | # Comment in the public line in if your project uses Gatsby and not Next.js 296 | # https://nextjs.org/blog/next-9-1#public-directory-support 297 | # public 298 | 299 | # vuepress build output 300 | .vuepress/dist 301 | 302 | # vuepress v2.x temp and cache directory 303 | .temp 304 | 305 | # Docusaurus cache and generated files 306 | .docusaurus 307 | 308 | # Serverless directories 309 | .serverless/ 310 | 311 | # 
FuseBox cache 312 | .fusebox/ 313 | 314 | # DynamoDB Local files 315 | .dynamodb/ 316 | 317 | # TernJS port file 318 | .tern-port 319 | 320 | # Stores VSCode versions used for testing VSCode extensions 321 | .vscode-test 322 | 323 | # yarn v2 324 | .yarn/cache 325 | .yarn/unplugged 326 | .yarn/build-state.yml 327 | .yarn/install-state.gz 328 | .pnp.* 329 | 330 | ### Node Patch ### 331 | # Serverless Webpack directories 332 | .webpack/ 333 | 334 | # Optional stylelint cache 335 | 336 | # SvelteKit build / generate output 337 | .svelte-kit 338 | 339 | # End of https://www.toptal.com/developers/gitignore/api/node 340 | n 341 | .Rproj.user 342 | # Created by https://www.toptal.com/developers/gitignore/api/r 343 | # Edit at https://www.toptal.com/developers/gitignore?templates=r 344 | 345 | ### R ### 346 | # History files 347 | .Rhistory 348 | .Rapp.history 349 | 350 | # Session Data files 351 | .RData 352 | .RDataTmp 353 | 354 | # User-specific files 355 | .Ruserdata 356 | 357 | # Example code in package build process 358 | *-Ex.R 359 | 360 | # Output files from R CMD build 361 | /*.tar.gz 362 | 363 | # Output files from R CMD check 364 | /*.Rcheck/ 365 | 366 | # RStudio files 367 | .Rproj.user/ 368 | 369 | # produced vignettes 370 | vignettes/*.html 371 | vignettes/*.pdf 372 | 373 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 374 | .httr-oauth 375 | 376 | # knitr and R markdown default cache directories 377 | *_cache/ 378 | /cache/ 379 | 380 | # Temporary files created by R markdown 381 | *.utf8.md 382 | *.knit.md 383 | 384 | # R Environment Variables 385 | .Renviron 386 | 387 | # pkgdown site 388 | docs/ 389 | 390 | # translation temp files 391 | po/*~ 392 | 393 | # RStudio Connect folder 394 | rsconnect/ 395 | 396 | ### R.Bookdown Stack ### 397 | # R package: bookdown caching files 398 | /*_files/ 399 | 400 | # End of https://www.toptal.com/developers/gitignore/api/r 401 | n# Created by https://www.toptal.com/developers/gitignore/api/r 402 | # Edit at https://www.toptal.com/developers/gitignore?templates=r 403 | 404 | ### R ### 405 | # History files 406 | .Rhistory 407 | .Rapp.history 408 | 409 | # Session Data files 410 | .RData 411 | .RDataTmp 412 | 413 | # User-specific files 414 | .Ruserdata 415 | 416 | # Example code in package build process 417 | *-Ex.R 418 | 419 | # Output files from R CMD build 420 | /*.tar.gz 421 | 422 | # Output files from R CMD check 423 | /*.Rcheck/ 424 | 425 | # RStudio files 426 | .Rproj.user/ 427 | 428 | # produced vignettes 429 | vignettes/*.html 430 | vignettes/*.pdf 431 | 432 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 433 | .httr-oauth 434 | 435 | # knitr and R markdown default cache directories 436 | *_cache/ 437 | /cache/ 438 | 439 | # Temporary files created by R markdown 440 | *.utf8.md 441 | *.knit.md 442 | 443 | # R Environment Variables 444 | .Renviron 445 | 446 | # pkgdown site 447 | docs/ 448 | 449 | # translation temp files 450 | po/*~ 451 | 452 | # RStudio Connect folder 453 | rsconnect/ 454 | 455 | ### R.Bookdown Stack ### 456 | # R package: bookdown caching files 457 | /*_files/ 458 | 459 | # End of https://www.toptal.com/developers/gitignore/api/r 460 | n 461 | -------------------------------------------------------------------------------- /test/sql/exondb-release-with-deb-info/test.pfam.hmmout: -------------------------------------------------------------------------------- 1 | KanNP_rdsDRAFT_30000008001 - 40 BPD_transp_2 PF02653.11 267 1.1e-09 33.7 1.2 1 1 1.1e-11 1.2e-09 33.7 0.8 135 
173 2 40 1 40 0.97 - 2 | KanNP_rdsDRAFT_30000008401 - 30 SAC3 PF12209.3 79 0.0042 13.4 0.0 1 1 4.5e-06 0.0045 13.3 0.0 29 49 5 25 1 26 0.85 - 3 | KanNP_rdsDRAFT_30000014401 - 39 Pyr_redox_3 PF13738.1 203 0.0011 15.1 0.0 1 1 6.8e-06 0.0011 15.1 0.0 78 109 5 36 1 39 0.89 - 4 | KanNP_rdsDRAFT_30000015201 - 31 Peptidase_M4_C PF02868.10 160 0.0002 17.3 0.1 1 1 4e-07 0.0002 17.3 0.1 83 100 1 25 1 30 0.78 - 5 | KanNP_rdsDRAFT_30000016001 - 40 DUF885 PF05960.6 549 0.0061 11.9 0.0 1 1 3e-05 0.0061 11.9 0.0 448 484 2 38 1 40 0.93 - 6 | KanNP_rdsDRAFT_30000016401 - 36 TPP_enzyme_M PF00205.17 137 1.4e-06 24.3 0.4 1 1 5.6e-09 1.4e-06 24.2 0.3 94 128 2 36 1 36 0.89 - 7 | KanNP_rdsDRAFT_30000017601 - 37 Glyco_hydro_2_N PF02837.13 167 8.7e-07 24.8 0.1 1 1 1.8e-09 9.1e-07 24.7 0.1 36 82 2 36 1 37 0.88 - 8 | KanNP_rdsDRAFT_30000018801 - 27 ACR_tran PF00873.14 1021 0.00028 14.6 2.1 1 1 5.8e-06 0.00028 14.6 1.5 870 895 1 26 1 27 0.94 - 9 | KanNP_rdsDRAFT_30000021601 - 31 K_trans PF02705.11 534 1.6e-08 29.5 0.3 1 1 9.6e-11 1.6e-08 29.5 0.2 291 320 1 30 1 31 0.97 - 10 | KanNP_rdsDRAFT_30000025201 - 32 Sulfatase PF00884.18 379 2e-06 23.2 0.1 1 1 2e-08 2e-06 23.2 0.1 155 184 2 31 1 32 0.86 - 11 | KanNP_rdsDRAFT_30000026401 - 40 FGGY_N PF00370.16 245 1e-07 27.6 0.0 1 1 4.1e-10 1e-07 27.6 0.0 66 100 6 40 1 40 0.93 - 12 | KanNP_rdsDRAFT_30000034401 - 35 CarboxypepD_reg PF13620.1 82 1.3e-05 21.4 0.3 1 1 3.5e-07 1.3e-05 21.4 0.2 2 30 2 29 1 35 0.78 - 13 | KanNP_rdsDRAFT_30000041601 - 34 RHS_repeat PF05593.9 38 1.1e-06 24.8 0.1 1 1 9.8e-09 1.2e-06 24.7 0.0 5 38 1 34 1 34 0.96 - 14 | KanNP_rdsDRAFT_30000042001 - 37 Aldedh PF00171.17 462 1.5e-12 42.7 0.1 1 1 5.3e-14 1.6e-12 42.7 0.0 370 405 1 36 1 37 0.97 - 15 | KanNP_rdsDRAFT_30000042401 - 40 DUF939 PF06081.6 141 0.0011 15.0 0.3 1 1 2.2e-06 0.0011 15.0 0.2 3 34 5 36 3 40 0.85 - 16 | KanNP_rdsDRAFT_30000044001 - 33 Topoisom_I PF01028.15 235 1.3e-07 27.0 0.0 1 1 2.7e-10 1.4e-07 27.0 0.0 132 157 7 33 2 33 0.94 - 17 | KanNP_rdsDRAFT_30000046001 - 26 Epimerase PF01370.16 236 0.0011 14.5 0.1 1 1 2.6e-05 0.0011 14.5 0.1 5 28 1 24 1 26 0.92 - 18 | KanNP_rdsDRAFT_30000048801 - 39 tRNA-synt_1 PF00133.17 601 4.8e-10 34.2 0.0 1 1 4.8e-12 4.8e-10 34.2 0.0 560 587 3 30 1 37 0.92 - 19 | KanNP_rdsDRAFT_30000049601 - 27 DUF2855 PF11017.3 314 0.00014 17.2 0.3 1 1 1.4e-07 0.00014 17.2 0.2 84 109 1 26 1 27 0.95 - 20 | KanNP_rdsDRAFT_30000060801 - 29 CHAT PF12770.2 287 9.9e-08 27.5 0.1 1 1 3e-10 1e-07 27.5 0.1 174 201 2 28 1 29 0.95 - 21 | KanNP_rdsDRAFT_30000061201 - 32 GGDEF PF00990.16 161 3.3e-07 26.2 0.0 1 1 4.6e-09 3.3e-07 26.2 0.0 70 99 1 30 1 32 0.90 - 22 | KanNP_rdsDRAFT_30000063601 - 31 Chlor_dismutase PF06778.7 193 6.3e-08 28.7 0.3 1 1 1.9e-10 6.3e-08 28.7 0.2 101 124 2 29 1 31 0.91 - 23 | KanNP_rdsDRAFT_30000064001 - 32 TPR_16 PF13432.1 65 9.3e-05 19.2 0.0 1 1 6.5e-07 9.3e-05 19.2 0.0 9 36 4 31 1 32 0.90 - 24 | KanNP_rdsDRAFT_30000065201 - 34 PPK2 PF03976.9 229 5.9e-09 31.7 0.0 1 1 4.7e-11 5.9e-09 31.7 0.0 27 57 4 34 1 34 0.91 - 25 | KanNP_rdsDRAFT_30000073601 - 27 Iso_dh PF00180.15 348 7.5e-11 37.7 0.1 1 1 5.3e-13 7.5e-11 37.7 0.0 272 297 1 26 1 27 0.96 - 26 | KanNP_rdsDRAFT_30000074001 - 27 NAD_Gly3P_dh_C PF07479.9 149 5.3e-06 22.4 0.3 1 1 1.1e-08 5.4e-06 22.4 0.2 44 71 1 27 1 27 0.96 - 27 | KanNP_rdsDRAFT_30000076801 - 32 Ribul_P_3_epim PF00834.14 201 4.3e-12 41.7 0.0 1 1 4.3e-15 4.3e-12 41.7 0.0 121 150 2 31 1 32 0.93 - 28 | KanNP_rdsDRAFT_30000081201 - 29 2-oxoacid_dh PF00198.18 231 7.2e-06 21.5 0.0 1 1 6.6e-08 7.3e-06 21.5 0.0 86 112 2 29 1 29 0.89 - 29 | 
KanNP_rdsDRAFT_30000088401 - 38 DUF21 PF01595.15 183 0.0028 13.1 0.0 1 1 8.6e-06 0.0029 13.1 0.0 17 52 2 37 1 38 0.95 - 30 | KanNP_rdsDRAFT_30000090001 - 30 PSD1 PF07587.6 266 1.1e-08 30.9 0.3 1 1 2.3e-11 1.1e-08 30.9 0.2 2 25 7 30 6 30 0.97 - 31 | KanNP_rdsDRAFT_30000091201 - 43 Lyase_1 PF00206.15 312 2.9e-10 35.9 0.0 1 1 6.2e-13 3.1e-10 35.9 0.0 112 154 1 43 1 43 0.99 - 32 | KanNP_rdsDRAFT_30000093601 - 35 Bac_luciferase PF00296.15 307 0.00092 14.5 0.0 1 1 1.8e-05 0.00092 14.5 0.0 52 71 15 35 2 35 0.80 - 33 | KanNP_rdsDRAFT_30000095201 - 28 Virulence_RhuM PF13310.1 260 2.9e-11 39.0 0.5 1 1 3e-14 3e-11 39.0 0.4 118 145 1 28 1 28 0.97 - 34 | KanNP_rdsDRAFT_30000097601 - 36 PHO4 PF01384.15 326 0.0037 12.1 0.0 1 1 1.1e-05 0.0037 12.1 0.0 19 41 4 26 2 35 0.83 - 35 | KanNP_rdsDRAFT_30000099201 - 34 DUF2127 PF09900.4 141 3.1e-06 23.4 0.6 1 1 6.3e-09 3.1e-06 23.4 0.4 104 136 2 34 1 34 0.95 - 36 | KanNP_rdsDRAFT_30000100401 - 30 HMGL-like PF00682.14 237 0.0077 12.0 0.0 1 1 1.6e-05 0.0079 12.0 0.0 100 129 1 30 1 30 0.97 - 37 | KanNP_rdsDRAFT_30000103601 - 30 RHH_1 PF01402.16 39 0.0025 13.9 0.2 1 1 2.9e-06 0.0029 13.7 0.1 5 22 9 26 1 30 0.88 - 38 | KanNP_rdsDRAFT_30000109201 - 33 Transpeptidase PF00905.17 304 6.7e-06 21.4 1.1 1 1 4.7e-08 6.7e-06 21.4 0.8 225 250 2 28 1 33 0.86 - 39 | KanNP_rdsDRAFT_30000112001 - 34 Response_reg PF00072.19 112 0.00038 16.6 0.0 1 1 1.9e-05 0.00038 16.6 0.0 55 84 2 31 1 34 0.90 - 40 | KanNP_rdsDRAFT_30000113601 - 34 NHase_alpha PF02979.11 189 4.8e-11 38.4 0.1 1 1 4.9e-14 4.9e-11 38.4 0.0 23 56 1 34 1 34 0.95 - 41 | KanNP_rdsDRAFT_30000114401 - 34 DUF982 PF06169.7 76 0.0025 13.7 0.0 1 1 4.9e-06 0.0025 13.7 0.0 20 42 10 33 2 34 0.85 - 42 | KanNP_rdsDRAFT_30000115201 - 34 ketoacyl-synt PF00109.21 255 0.0031 13.2 0.0 1 1 1.5e-05 0.0031 13.2 0.0 13 37 1 25 1 33 0.85 - 43 | KanNP_rdsDRAFT_30000116001 - 38 SBP56 PF05694.6 461 1e-11 40.2 0.0 1 1 1e-14 1e-11 40.1 0.0 157 193 2 38 1 38 0.97 - 44 | KanNP_rdsDRAFT_30000118801 - 30 DUF1501 PF07394.7 392 6.6e-06 21.3 0.1 1 1 8.6e-08 6.6e-06 21.3 0.0 231 255 1 25 1 30 0.86 - 45 | KanNP_rdsDRAFT_30000119601 - 36 Transgly PF00912.17 178 8.9e-09 31.0 0.0 1 1 3.7e-11 9.3e-09 31.0 0.0 43 77 2 36 1 36 0.96 - 46 | KanNP_rdsDRAFT_30000123601 - 36 Sulfatase PF00884.18 379 4.2e-07 25.4 0.0 1 1 6.4e-09 4.3e-07 25.4 0.0 228 258 5 33 2 36 0.84 - 47 | KanNP_rdsDRAFT_30000124801 - 29 N6_Mtase PF02384.11 311 7.1e-07 24.7 0.0 1 1 7.3e-10 7.3e-07 24.7 0.0 167 194 1 28 1 29 0.96 - 48 | KanNP_rdsDRAFT_30000125601 - 29 Transaldolase PF00923.14 287 0.0093 11.2 0.0 1 1 2.8e-05 0.0093 11.2 0.0 195 217 6 29 2 29 0.88 - 49 | KanNP_rdsDRAFT_30000129601 - 32 adh_short PF00106.20 167 5.2e-07 25.9 0.0 1 1 1.3e-08 5.3e-07 25.8 0.0 80 110 1 31 1 32 0.96 - 50 | KanNP_rdsDRAFT_30000130801 - 28 TPR_1 PF00515.23 34 1.6e-08 30.0 0.2 1 1 7e-10 1.8e-08 29.8 0.2 12 34 1 23 1 23 0.95 - 51 | KanNP_rdsDRAFT_30000136001 - 35 PQQ PF01011.16 38 0.00012 17.7 0.1 1 1 1.6e-06 0.00012 17.7 0.1 6 36 3 33 1 35 0.86 - 52 | KanNP_rdsDRAFT_30000139601 - 35 SBP_bac_3 PF00497.15 225 8.9e-08 27.8 0.0 1 1 4.5e-10 9e-08 27.7 0.0 185 218 1 34 1 35 0.96 - 53 | KanNP_rdsDRAFT_30000141601 - 42 HATPase_c PF02518.21 111 8.2e-06 21.7 0.0 1 1 2.6e-07 8.6e-06 21.6 0.0 80 109 2 31 1 33 0.94 - 54 | KanNP_rdsDRAFT_30000146001 - 28 tRNA_m1G_MT PF01746.16 186 0.0019 13.9 0.0 1 1 9.6e-06 0.0019 13.9 0.0 31 52 2 23 1 27 0.88 - 55 | KanNP_rdsDRAFT_30000146801 - 37 PBP PF01161.15 145 5.3e-12 42.0 0.2 1 1 2.7e-14 5.3e-12 42.0 0.1 89 122 1 34 1 37 0.81 - 56 | KanNP_rdsDRAFT_30000147601 - 28 RNA_pol_Rpb2_45 
PF10385.4 66 9.7e-07 24.7 0.0 1 1 1.9e-09 9.7e-07 24.7 0.0 24 49 2 27 1 28 0.93 - 57 | KanNP_rdsDRAFT_30000148401 - 34 B PF02216.11 54 0.0024 13.8 0.0 1 1 2.5e-06 0.0025 13.7 0.0 18 41 5 29 3 34 0.83 - 58 | KanNP_rdsDRAFT_30000149201 - 30 MFS_2 PF13347.1 427 0.0014 13.2 0.4 1 1 4.2e-06 0.0014 13.2 0.3 396 425 1 30 1 30 0.95 - 59 | KanNP_rdsDRAFT_30000153201 - 36 DUF2636 PF11120.3 62 0.012 11.3 1.4 1 1 1.3e-05 0.013 11.2 1.0 39 59 5 24 1 28 0.72 - 60 | KanNP_rdsDRAFT_30000154001 - 36 CheR PF01739.13 196 3.1e-11 39.1 0.1 1 1 1.6e-13 3.2e-11 39.0 0.1 120 154 2 36 1 36 0.95 - 61 | KanNP_rdsDRAFT_30000154401 - 30 Cupin_2 PF07883.6 71 6.3e-08 28.2 0.2 1 1 5.7e-10 6.3e-08 28.2 0.2 13 40 2 29 1 30 0.93 - 62 | KanNP_rdsDRAFT_30000160801 - 32 HemolysinCabind PF00353.14 18 0.0017 14.8 3.1 2 3 6.9e-06 0.0017 14.8 2.1 4 18 14 28 14 28 0.93 - 63 | KanNP_rdsDRAFT_30000165201 - 36 DNA_pol3_alpha PF07733.7 426 8.7e-18 60.0 1.1 1 1 1.8e-19 9.2e-18 60.0 0.8 78 113 1 36 1 36 0.99 - 64 | KanNP_rdsDRAFT_30000169201 - 28 His_biosynth PF00977.16 228 3.7e-08 29.1 0.0 1 1 1.5e-10 3.7e-08 29.0 0.0 73 100 1 28 1 28 0.97 - 65 | KanNP_rdsDRAFT_30000172001 - 27 HyaE PF07449.6 107 0.0067 12.5 0.0 1 1 6.7e-06 0.0067 12.5 0.0 36 56 1 21 1 26 0.88 - 66 | KanNP_rdsDRAFT_30000176401 - 37 SAP30_Sin3_bdg PF13867.1 53 0.0032 13.5 0.1 1 1 6.5e-06 0.0033 13.5 0.0 26 48 11 33 6 37 0.91 - 67 | KanNP_rdsDRAFT_30000176801 - 40 Iso_dh PF00180.15 348 5.7e-06 21.6 0.0 1 1 3e-08 6e-06 21.5 0.0 181 219 2 40 1 40 0.94 - 68 | KanNP_rdsDRAFT_30000179601 - 33 EAL PF00563.15 236 0.0033 13.1 0.0 1 1 3.3e-05 0.0033 13.1 0.0 192 223 1 32 1 33 0.90 - 69 | KanNP_rdsDRAFT_30000184001 - 32 HisKA PF00512.20 68 5e-12 41.8 0.1 1 1 1.1e-13 5.1e-12 41.8 0.1 4 35 1 31 1 32 0.93 - 70 | KanNP_rdsDRAFT_30000184401 - 32 Response_reg PF00072.19 112 0.00013 18.2 0.0 1 1 6.7e-06 0.00013 18.2 0.0 89 111 4 26 1 27 0.89 - 71 | KanNP_rdsDRAFT_30000185601 - 30 Glyco_hydro_18 PF00704.23 343 0.00034 16.2 0.0 1 1 3.4e-07 0.00034 16.2 0.0 54 81 3 30 1 30 0.92 - 72 | KanNP_rdsDRAFT_30000186001 - 50 Na_sulph_symp PF00939.14 471 4e-08 28.7 0.2 1 1 8.2e-11 4.1e-08 28.6 0.1 251 299 1 49 1 50 0.96 - 73 | KanNP_rdsDRAFT_30000186401 - 32 HATPase_c_2 PF13581.1 125 5.2e-06 22.4 0.0 1 1 2.6e-08 5.2e-06 22.4 0.0 92 113 6 27 1 32 0.84 - 74 | KanNP_rdsDRAFT_30000188801 - 33 ECH PF00378.15 245 3.2e-05 19.3 0.4 1 1 3.9e-07 3.2e-05 19.3 0.3 90 117 4 31 1 33 0.88 - 75 | KanNP_rdsDRAFT_30000189601 - 44 Response_reg PF00072.19 112 4e-05 19.8 0.0 1 1 2.4e-06 4.2e-05 19.7 0.0 64 106 3 44 1 44 0.85 - 76 | KanNP_rdsDRAFT_30000193201 - 31 DUF2133 PF09976.4 43 0.0025 13.6 1.4 1 1 1.6e-05 0.0027 13.4 1.0 8 31 1 24 1 26 0.93 - 77 | KanNP_rdsDRAFT_30000193601 - 28 Biotin_lipoyl PF00364.17 74 6.3e-09 31.6 0.0 1 1 1.9e-11 6.4e-09 31.5 0.0 34 61 1 28 1 28 0.95 - 78 | KanNP_rdsDRAFT_30000194801 - 33 SH3_3 PF08239.6 52 3.8e-05 19.4 0.0 1 1 4e-08 4e-05 19.4 0.0 1 29 3 31 3 32 0.94 - 79 | KanNP_rdsDRAFT_30000195601 - 50 S4 PF01479.20 48 0.0068 12.0 0.0 1 1 7.6e-05 0.013 11.2 0.0 5 27 28 50 26 50 0.92 - 80 | KanNP_rdsDRAFT_30000196001 - 29 DAHP_synth_1 PF00793.15 272 0.0087 10.9 0.0 1 1 5.4e-05 0.009 10.8 0.0 203 228 1 26 1 28 0.96 - 81 | KanNP_rdsDRAFT_30000197201 - 36 Cu2_monoox_C PF03712.10 157 0.012 11.1 0.0 1 1 2.5e-05 0.013 11.0 0.0 94 120 2 36 1 36 0.93 - 82 | KanNP_rdsDRAFT_30000203201 - 33 Ank_5 PF13857.1 56 3.9e-05 20.0 0.1 1 1 2.1e-07 4.2e-05 19.9 0.0 19 53 1 33 1 33 0.97 - 83 | KanNP_rdsDRAFT_30000205601 - 36 Peptidase_A8 PF01252.13 150 1.9e-12 43.2 0.4 1 1 5.8e-15 1.9e-12 43.2 0.3 104 141 1 35 1 
36 0.94 - 84 | KanNP_rdsDRAFT_30000206401 - 37 Peptidase_M50 PF02163.17 192 1.8e-10 36.2 2.7 1 1 1.2e-12 1.9e-10 36.1 1.9 1 34 4 37 4 37 0.96 - 85 | KanNP_rdsDRAFT_30000208401 - 39 PP-binding PF00550.20 67 1.6e-06 24.5 0.1 1 1 1.8e-09 1.8e-06 24.3 0.1 2 36 7 39 6 39 0.89 - 86 | KanNP_rdsDRAFT_30000208801 - 28 Hexapep PF00132.19 36 0.0028 13.3 0.0 1 1 2.2e-05 0.0032 13.2 0.0 1 13 16 28 16 28 0.91 - 87 | KanNP_rdsDRAFT_30000209201 - 31 RmuC PF02646.11 305 6e-07 24.7 0.0 1 1 1.2e-09 6.2e-07 24.7 0.0 207 237 1 31 1 31 0.97 - 88 | KanNP_rdsDRAFT_30000210001 - 32 HTH_IclR PF09339.5 52 5.7e-09 31.6 0.1 1 1 4.1e-11 5.8e-09 31.6 0.0 3 33 1 31 1 32 0.96 - 89 | KanNP_rdsDRAFT_30000211201 - 30 EFP_N PF08207.7 58 8.8e-11 37.7 0.0 1 1 2.7e-13 9.1e-11 37.7 0.0 28 56 1 29 1 30 0.98 - 90 | KanNP_rdsDRAFT_30000215201 - 38 SGL PF08450.7 246 7.9e-05 18.3 0.0 1 1 4.7e-07 7.9e-05 18.3 0.0 118 151 1 35 1 38 0.82 - 91 | KanNP_rdsDRAFT_30000216001 - 35 Carboxyl_trans PF01039.17 493 2.9e-05 18.7 0.0 1 1 3.8e-07 2.9e-05 18.7 0.0 15 41 1 27 1 35 0.87 - 92 | KanNP_rdsDRAFT_30000216401 - 45 HTH_psq PF05225.11 45 0.00095 14.9 0.0 1 1 2.6e-06 0.0013 14.4 0.0 26 40 28 43 26 45 0.90 - 93 | KanNP_rdsDRAFT_30000219601 - 34 GGDEF PF00990.16 161 5.9e-06 22.1 0.0 1 1 4.9e-08 6.1e-06 22.0 0.0 12 44 1 33 1 34 0.97 - 94 | KanNP_rdsDRAFT_30000220001 - 37 HWE_HK PF07536.9 83 1.2e-11 41.1 0.2 1 1 1.2e-14 1.2e-11 41.0 0.1 4 40 1 37 1 37 0.99 - 95 | KanNP_rdsDRAFT_30000222001 - 27 DUF4198 PF10670.4 215 0.00018 17.7 0.1 1 1 1.8e-07 0.00018 17.7 0.0 190 213 1 24 1 26 0.94 - 96 | KanNP_rdsDRAFT_30000222401 - 49 GCV_T PF01571.16 211 1.3e-10 37.1 0.1 1 1 6.9e-13 1.4e-10 37.1 0.1 47 95 1 49 1 49 0.95 - 97 | KanNP_rdsDRAFT_30000226001 - 32 DNA_ligase_aden PF01653.13 315 2.1e-07 26.3 0.0 1 1 1.1e-09 2.1e-07 26.3 0.0 165 195 2 32 1 32 0.97 - 98 | KanNP_rdsDRAFT_30000231601 - 41 Pkinase PF00069.20 260 5.2e-05 18.7 0.0 1 1 3.1e-06 5.4e-05 18.6 0.0 1 32 5 36 5 40 0.92 - 99 | KanNP_rdsDRAFT_30000232001 - 42 ETF_QO PF05187.8 110 1.9e-08 30.5 0.6 1 1 4.9e-11 2.4e-08 30.1 0.4 91 109 1 19 1 20 0.95 - 100 | KanNP_rdsDRAFT_30000232001 - 42 Fer4_2 PF12797.2 22 0.0011 15.0 0.6 2 2 1e-05 0.0011 15.0 0.4 2 16 24 38 23 41 0.90 - 101 | -------------------------------------------------------------------------------- /exon/src/exon/sequence_functions/module.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 WHERE TRUE Technologies. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
#include <string>

#include <map>
#include <vector>

#include "duckdb/execution/expression_executor.hpp"
#include "duckdb/function/function_binder.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"

#include "exon/sequence_functions/module.hpp"

using duckdb::LogicalType;

namespace exon
{
    void ReverseComplementFunction(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
    {
        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);

        for (duckdb::idx_t row_i = 0; row_i < args.size(); row_i++)
        {
            auto string_value = args.data[0].GetValue(row_i);
            auto sequence = string_value.ToString();

            std::string reverse_complement;

            int size = sequence.size();

            // Walk the sequence from the 3' end so the complement is emitted in reverse order.
            for (int i = size - 1; i >= 0; i--)
            {
                if (sequence[i] == 'A')
                {
                    reverse_complement += 'T';
                }
                else if (sequence[i] == 'T')
                {
                    reverse_complement += 'A';
                }
                else if (sequence[i] == 'C')
                {
                    reverse_complement += 'G';
                }
                else if (sequence[i] == 'G')
                {
                    reverse_complement += 'C';
                }
                else
                {
                    throw duckdb::InvalidInputException(std::string("Invalid character in sequence: ") + sequence[i]);
                }
            }
            result.SetValue(row_i, duckdb::Value(reverse_complement));
        }
    }

    duckdb::CreateScalarFunctionInfo SequenceFunctions::GetReverseComplementFunction()
    {
        duckdb::ScalarFunctionSet set("reverse_complement");

        auto reverse_complement_scalar_function = duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::VARCHAR, ReverseComplementFunction);
        set.AddFunction(reverse_complement_scalar_function);

        return duckdb::CreateScalarFunctionInfo(set);
    }

    void ComplementFunction(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
    {
        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);

        for (duckdb::idx_t row_i = 0; row_i < args.size(); row_i++)
        {
            auto string_value = args.data[0].GetValue(row_i);
            auto sequence = string_value.ToString();

            std::string complement;

            int size = sequence.size();

            for (int i = 0; i < size; i++)
            {
                if (sequence[i] == 'A')
                {
                    complement += 'T';
                }
                else if (sequence[i] == 'T')
                {
                    complement += 'A';
                }
                else if (sequence[i] == 'C')
                {
                    complement += 'G';
                }
                else if (sequence[i] == 'G')
                {
                    complement += 'C';
                }
                else
                {
                    throw duckdb::InvalidInputException("Invalid character in sequence: %c", sequence[i]);
                }
            }

            result.SetValue(row_i, duckdb::Value(complement));
        }
    }

    duckdb::CreateScalarFunctionInfo SequenceFunctions::GetComplementFunction()
    {
        duckdb::ScalarFunctionSet set("complement");
        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::VARCHAR, ComplementFunction));

        return duckdb::CreateScalarFunctionInfo(set);
    }

    void GcContent(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
    {
        // Emit one value per input row, like the other scalar functions.
        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);

        for (duckdb::idx_t row_i = 0; row_i < args.size(); row_i++)
        {
            auto string_value = args.data[0].GetValue(row_i);
            auto ss = string_value.ToString();

            int size = ss.size();
            if (size == 0)
            {
                result.SetValue(row_i, duckdb::Value::FLOAT(0));
                continue;
            }

            int gc_count = 0;
            for (int i = 0; i < size; i++)
            {
                if (ss[i] == 'G' || ss[i] == 'C')
                {
                    gc_count++;
                }
            }
            result.SetValue(row_i, duckdb::Value::FLOAT((float)gc_count / (float)size));
        }
    }

    duckdb::CreateScalarFunctionInfo SequenceFunctions::GetGCContentFunction()
    {
        duckdb::ScalarFunctionSet set("gc_content");
        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::FLOAT, GcContent));

        return duckdb::CreateScalarFunctionInfo(set);
    }

    void ReverseTranscribeRnaToDna(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
    {
        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);

        for (duckdb::idx_t row_i = 0; row_i < args.size(); row_i++)
        {
            auto string_value = args.data[0].GetValue(row_i);
            auto sequence = string_value.ToString();
            int size = sequence.size();

            std::string reversed_transcribed_sequence;
            for (int i = 0; i < size; i++)
            {
                if (sequence[i] == 'U')
                {
                    reversed_transcribed_sequence += 'T';
                }
                else if (sequence[i] == 'A')
                {
                    reversed_transcribed_sequence += 'A';
                }
                else if (sequence[i] == 'C')
                {
                    reversed_transcribed_sequence += 'C';
                }
                else if (sequence[i] == 'G')
                {
                    reversed_transcribed_sequence += 'G';
                }
                else
                {
                    throw duckdb::InvalidInputException(std::string("Invalid character in sequence: ") + sequence[i]);
                }
            }
            result.SetValue(row_i, duckdb::Value(reversed_transcribed_sequence));
        }
    }

    duckdb::CreateScalarFunctionInfo SequenceFunctions::GetReverseTranscribeRnaToDnaFunction()
    {
        duckdb::ScalarFunctionSet set("reverse_transcribe");
        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::VARCHAR, ReverseTranscribeRnaToDna));

        return duckdb::CreateScalarFunctionInfo(set);
    }

    void TranscribeDnaToRnaFunction(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
    {
        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);

        for (duckdb::idx_t row_i = 0; row_i < args.size(); row_i++)
        {
            auto string_value = args.data[0].GetValue(row_i);
            auto sequence = string_value.ToString();

            std::string transcribed_sequence;

            int size = sequence.size();

            for (int i = 0; i < size; i++)
            {
                if (sequence[i] == 'T')
                {
                    transcribed_sequence += 'U';
                }
                else if (sequence[i] == 'A')
                {
                    transcribed_sequence += 'A';
                }
                else if (sequence[i] == 'C')
                {
                    transcribed_sequence += 'C';
                }
                else if (sequence[i] == 'G')
                {
                    transcribed_sequence += 'G';
                }
                else
                {
                    throw duckdb::InvalidInputException(std::string("Invalid character in sequence: ") + sequence[i]);
                }
            }
            result.SetValue(row_i, duckdb::Value(transcribed_sequence));
        }
    }

    duckdb::CreateScalarFunctionInfo SequenceFunctions::GetTranscribeDnaToRnaFunction()
    {
        duckdb::ScalarFunctionSet set("transcribe");
        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::VARCHAR, TranscribeDnaToRnaFunction));

        return duckdb::CreateScalarFunctionInfo(set);
    }

    void TranslateDnaToAminoAcid(duckdb::DataChunk &args, duckdb::ExpressionState &state, duckdb::Vector &result)
    {
        result.SetVectorType(duckdb::VectorType::FLAT_VECTOR);

        // Standard genetic code, keyed by DNA codon.
        std::map<std::string, std::string> standard_dna_codon_table = {
            {"AAA", "K"}, {"AAT", "N"}, {"AAC", "N"}, {"AAG", "K"},
            {"ATA", "I"}, {"ATT", "I"}, {"ATC", "I"}, {"ATG", "M"},
            {"ACA", "T"}, {"ACT", "T"}, {"ACC", "T"}, {"ACG", "T"},
            {"AGA", "R"}, {"AGT", "S"}, {"AGC", "S"}, {"AGG", "R"},
            {"TAA", "*"}, {"TAT", "Y"}, {"TAC", "Y"}, {"TAG", "*"},
            {"TTA", "L"}, {"TTT", "F"}, {"TTC", "F"}, {"TTG", "L"},
            {"TCA", "S"}, {"TCT", "S"}, {"TCC", "S"}, {"TCG", "S"},
            {"TGA", "*"}, {"TGT", "C"}, {"TGC", "C"}, {"TGG", "W"},
            {"CAA", "Q"}, {"CAT", "H"}, {"CAC", "H"}, {"CAG", "Q"},
            {"CTA", "L"}, {"CTT", "L"}, {"CTC", "L"}, {"CTG", "L"},
            {"CCA", "P"}, {"CCT", "P"}, {"CCC", "P"}, {"CCG", "P"},
            {"CGA", "R"}, {"CGT", "R"}, {"CGC", "R"}, {"CGG", "R"},
            {"GAA", "E"}, {"GAT", "D"}, {"GAC", "D"}, {"GAG", "E"},
            {"GTA", "V"}, {"GTT", "V"}, {"GTC", "V"}, {"GTG", "V"},
            {"GCA", "A"}, {"GCT", "A"}, {"GCC", "A"}, {"GCG", "A"},
            {"GGA", "G"}, {"GGT", "G"}, {"GGC", "G"}, {"GGG", "G"}};

        for (duckdb::idx_t row_i = 0; row_i < args.size(); row_i++)
        {
            auto string_value = args.data[0].GetValue(row_i);
            auto sequence = string_value.ToString();

            std::string amino_acid_sequence;

            int size = sequence.size();

            if (size % 3 != 0)
            {
                throw duckdb::InvalidInputException(std::string("Invalid sequence length: ") + std::to_string(size));
            }

            int aa_size = size / 3;

            for (int i = 0; i < aa_size; i++)
            {
                std::string codon = sequence.substr(i * 3, 3);

                if (standard_dna_codon_table.find(codon) == standard_dna_codon_table.end())
                {
                    throw duckdb::InvalidInputException(std::string("Invalid codon: ") + codon);
                }
                amino_acid_sequence += standard_dna_codon_table[codon];
            }

            result.SetValue(row_i, duckdb::Value(amino_acid_sequence));
        }
    }

    duckdb::CreateScalarFunctionInfo SequenceFunctions::GetTranslateDnaToAminoAcidFunction()
    {
        duckdb::ScalarFunctionSet set("translate_dna_to_aa");
        set.AddFunction(duckdb::ScalarFunction({duckdb::LogicalType::VARCHAR}, duckdb::LogicalType::VARCHAR, TranslateDnaToAminoAcid));

        return duckdb::CreateScalarFunctionInfo(set);
    }

    std::vector<duckdb::CreateScalarFunctionInfo> SequenceFunctions::GetSequenceFunctions()
    {
        std::vector<duckdb::CreateScalarFunctionInfo> functions;
        functions.push_back(GetTranscribeDnaToRnaFunction());
        functions.push_back(GetTranslateDnaToAminoAcidFunction());
        functions.push_back(GetGCContentFunction());
        functions.push_back(GetReverseComplementFunction());
        functions.push_back(GetComplementFunction());
        functions.push_back(GetReverseTranscribeRnaToDnaFunction());

        return functions;
    }
}
--------------------------------------------------------------------------------
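The snippet below is a minimal, self-contained C++ sketch of the base-pairing and GC-content rules that module.cpp registers as DuckDB scalar functions (reverse_complement, complement, gc_content, transcribe, reverse_transcribe, translate_dna_to_aa). It is illustrative only and is not part of the extension: the helper name, the example sequence, and the expected outputs in the comments are assumptions for demonstration.

// Standalone illustration; compiles with any C++11 compiler, no DuckDB required.
#include <iostream>
#include <stdexcept>
#include <string>

// Complement a single DNA base using standard Watson-Crick pairing (A<->T, C<->G).
static char complement_base(char b)
{
    switch (b)
    {
    case 'A': return 'T';
    case 'T': return 'A';
    case 'C': return 'G';
    case 'G': return 'C';
    default: throw std::invalid_argument(std::string("Invalid character in sequence: ") + b);
    }
}

int main()
{
    std::string sequence = "ATCG"; // illustrative input

    // Reverse complement: complement each base while reading from the 3' end.
    std::string reverse_complement;
    for (auto it = sequence.rbegin(); it != sequence.rend(); ++it)
    {
        reverse_complement += complement_base(*it);
    }

    // GC content: fraction of bases that are G or C.
    int gc_count = 0;
    for (char b : sequence)
    {
        if (b == 'G' || b == 'C')
        {
            gc_count++;
        }
    }
    float gc_content = sequence.empty() ? 0.0f : (float)gc_count / (float)sequence.size();

    std::cout << "reverse_complement(" << sequence << ") = " << reverse_complement << std::endl; // CGAT
    std::cout << "gc_content(" << sequence << ") = " << gc_content << std::endl;                 // 0.5
    return 0;
}

Within the extension itself, the same transforms are exposed through the scalar function set returned by SequenceFunctions::GetSequenceFunctions(), so in practice they would be invoked from SQL once the extension is loaded rather than called directly from C++.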