├── .gitmodules ├── testdata ├── test_singletons.intervals ├── test.g.bcf ├── complete_same_seq.fa ├── test_paired.bam ├── test_simple.bam ├── test_simple.bam.bai ├── test_variants.bcf ├── test_variants.vcf.gz ├── synced_sparse_1.vcf.gz ├── synced_sparse_2.vcf.gz ├── synced_sparse_1.vcf.gz.csi ├── synced_sparse_2.vcf.gz.csi ├── var_idx │ ├── test_variants.bcf │ ├── test_variants.bcf.csi │ ├── test_variants_csi.vcf.gz │ ├── test_variants_tabix.vcf.gz │ ├── test_variants_csi.vcf.gz.csi │ └── test_variants_tabix.vcf.gz.tbi ├── unindexed │ ├── test_unindexed.bam │ └── test_unindexed.vcf ├── test_reference2.fa ├── test_clean.fq ├── complete_same_seq.fq ├── var_hdr_merge │ ├── test1.vcf │ ├── test2.vcf │ ├── test3.vcf │ └── empty_vcf.vcf ├── test_bed.bed ├── test_gatk.intervals ├── test_mixed.loc ├── test_reference.fa ├── ref_block │ ├── problem3_file2.vcf │ ├── problem2_file2.vcf │ ├── test1.vcf │ ├── test2.vcf │ ├── test5.vcf │ ├── test4.vcf │ ├── problem1.vcf │ ├── problem3_file1.vcf │ ├── test3.vcf │ └── problem2_file1.vcf ├── test_variants_mixed_ploidy.vcf ├── mvr_hdr │ ├── test2.vcf │ └── test1.vcf ├── missing_header.vcf ├── test_variants_alternate_ploidy.vcf ├── test_variants_missing_data.vcf ├── test_variants_02.vcf ├── test_variants_multiple_alt.vcf ├── extra_header.vcf ├── test.g.vcf ├── test_variants_for_variantbuilder.vcf ├── test_variants.vcf └── test_picard.interval_list ├── test ├── main.cpp ├── missing_test.cpp ├── read_group_test.cpp ├── CMakeLists.txt ├── sam_header_test.cpp ├── cigar_test.cpp ├── test_utils.h ├── fastq_reader_test.cpp ├── utils_test.cpp ├── select_if_test.cpp ├── sam_reader_test.cpp ├── fastq_test.cpp └── reference_test.cpp ├── .travis_scripts ├── boost.sh ├── cmake.sh ├── coveralls.sh ├── clang.sh ├── gcc.sh └── update_website_dox.sh ├── gamgee ├── utils │ ├── file_utils.cpp │ ├── file_utils.h │ ├── variant_utils.h │ ├── utils.cpp │ ├── variant_utils.cpp │ ├── variant_field_type.h │ └── merged_vcf_lut.cpp ├── reference_iterator.cpp 
├── reference_map.cpp ├── fastq.cpp ├── sam │ ├── sam_writer.cpp │ ├── sam_tag.h │ ├── sam_iterator.cpp │ ├── read_group.h │ ├── base_quals.h │ ├── indexed_sam_iterator.cpp │ ├── sam_header.cpp │ ├── read_bases.h │ ├── sam_header.h │ ├── sam_writer.h │ ├── read_group.cpp │ ├── sam_iterator.h │ ├── sam_builder_data_field.h │ ├── sam_builder_data_field.cpp │ ├── base_quals.cpp │ └── sam_pair_iterator.cpp ├── reference_iterator.h ├── zip.h ├── fastq_reader.cpp ├── reference_map.h ├── variant │ ├── variant_iterator.cpp │ ├── variant_writer.cpp │ ├── indexed_variant_iterator.cpp │ ├── variant_filters.h │ ├── synced_variant_iterator.cpp │ ├── multiple_variant_iterator.cpp │ ├── synced_variant_iterator.h │ ├── indexed_variant_iterator.h │ ├── variant_writer.h │ ├── multiple_variant_iterator.h │ ├── variant_iterator.h │ ├── variant_header_builder.cpp │ └── reference_block_splitting_variant_iterator.h ├── gamgee.h ├── fastq_iterator.cpp ├── fastq_iterator.h ├── missing.h ├── exceptions.h ├── fastq.h └── fastq_reader.h ├── .gitignore ├── .travis.yml ├── LICENSE ├── CMakeLists.txt └── contrib └── htslib.cmake /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /testdata/test_singletons.intervals: -------------------------------------------------------------------------------- 1 | 1:200-300 2 | 1:500 3 | 1:500-500 4 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE unit_tests 2 | #include 3 | -------------------------------------------------------------------------------- /testdata/test.g.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/test.g.bcf 
-------------------------------------------------------------------------------- /testdata/complete_same_seq.fa: -------------------------------------------------------------------------------- 1 | >test1 example 2 | ACAAGAGATTTAAGAC 3 | >test2 4 | ACAAGAGATTTAAGAC 5 | -------------------------------------------------------------------------------- /testdata/test_paired.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/test_paired.bam -------------------------------------------------------------------------------- /testdata/test_simple.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/test_simple.bam -------------------------------------------------------------------------------- /testdata/test_simple.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/test_simple.bam.bai -------------------------------------------------------------------------------- /testdata/test_variants.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/test_variants.bcf -------------------------------------------------------------------------------- /testdata/test_variants.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/test_variants.vcf.gz -------------------------------------------------------------------------------- /.travis_scripts/boost.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install boost libraries 4 | sudo apt-get install -qq boost1.55 5 | 6 | 
-------------------------------------------------------------------------------- /testdata/synced_sparse_1.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/synced_sparse_1.vcf.gz -------------------------------------------------------------------------------- /testdata/synced_sparse_2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/synced_sparse_2.vcf.gz -------------------------------------------------------------------------------- /testdata/synced_sparse_1.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/synced_sparse_1.vcf.gz.csi -------------------------------------------------------------------------------- /testdata/synced_sparse_2.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/synced_sparse_2.vcf.gz.csi -------------------------------------------------------------------------------- /testdata/var_idx/test_variants.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/var_idx/test_variants.bcf -------------------------------------------------------------------------------- /testdata/unindexed/test_unindexed.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/unindexed/test_unindexed.bam -------------------------------------------------------------------------------- /testdata/var_idx/test_variants.bcf.csi: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/var_idx/test_variants.bcf.csi -------------------------------------------------------------------------------- /testdata/var_idx/test_variants_csi.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/var_idx/test_variants_csi.vcf.gz -------------------------------------------------------------------------------- /testdata/var_idx/test_variants_tabix.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/var_idx/test_variants_tabix.vcf.gz -------------------------------------------------------------------------------- /testdata/var_idx/test_variants_csi.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/var_idx/test_variants_csi.vcf.gz.csi -------------------------------------------------------------------------------- /testdata/var_idx/test_variants_tabix.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gamgee/HEAD/testdata/var_idx/test_variants_tabix.vcf.gz.tbi -------------------------------------------------------------------------------- /testdata/test_reference2.fa: -------------------------------------------------------------------------------- 1 | >chr1 text comments 1 2 3 4 5 2 | AGGGATCCCCCCCCCCAGTACCNNNNAGTT 3 | >chr2 multiple line test 4 | NNNN 5 | NNAGGGATCCC 6 | NCCCCCCCA 7 | G 8 | TACCNNNNAG 9 | TT 10 | -------------------------------------------------------------------------------- /.travis_scripts/cmake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget 
https://www.broadinstitute.org/gatk/eng/travis/cmake_3.1.0-rc1-1_amd64.deb 4 | sudo apt-get remove cmake cmake-data 5 | sudo dpkg --install cmake_3.1.0-rc1-1_amd64.deb 6 | -------------------------------------------------------------------------------- /testdata/test_clean.fq: -------------------------------------------------------------------------------- 1 | @test1 example 2 | ACAAGAGATTTAAGAC 3 | + 4 | !@!#!#!#!@@@!@@! 5 | @test2 6 | ACAAGAGATTTAAGAC 7 | + 8 | !@!#!#!#!@@@!@@! 9 | @test3 10 | ACAAGAGATTTAAGAC 11 | + 12 | !@!#!#!#!@@@!@@! 13 | 14 | -------------------------------------------------------------------------------- /testdata/complete_same_seq.fq: -------------------------------------------------------------------------------- 1 | @test1 example 2 | ACAAGAGATTTAAGAC 3 | +test1 4 | !@!#!#!#!@@@!@@! 5 | @test2 6 | ACAAGAGATTTAAGAC 7 | +test2 8 | !@!#!#!#!@@@!@@! 9 | @test3 10 | ACAAGAGATTTAAGAC 11 | + 12 | !@!#!#!#!@@@!@@! 13 | 14 | -------------------------------------------------------------------------------- /.travis_scripts/coveralls.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note that this only works if the tests were built using --coverage for 4 | # compile and link flags! 
5 | if [ "$CXX" == "g++" ]; 6 | then 7 | sudo pip install cpp-coveralls 8 | cd build 9 | coveralls -r ../ -e CMakeFiles -e contrib -e test -t ${COVERALLS_TOKEN} 10 | fi 11 | -------------------------------------------------------------------------------- /testdata/var_hdr_merge/test1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##FILTER= 4 | ##contig= 5 | ##FORMAT= 6 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 7 | -------------------------------------------------------------------------------- /testdata/test_bed.bed: -------------------------------------------------------------------------------- 1 | 20 132342 4832374 2 | 20 132342 4832374 3 | 20 132342 4832374 4 | 20 132342 4832374 5 | 20 132342 4832374 6 | 20 132342 4832374 7 | 20 132342 4832374 8 | 20 132342 4832374 9 | 20 132342 4832374 10 | 20 132342 4832374 11 | 20 132342 4832374 12 | 20 132342 4832374 13 | 20 132342 4832374 14 | 20 132342 4832374 15 | 20 132342 4832374 16 | 20 132342 4832374 17 | 20 132342 4832374 18 | 20 132342 4832374 19 | -------------------------------------------------------------------------------- /testdata/test_gatk.intervals: -------------------------------------------------------------------------------- 1 | 20:132342-4832374 2 | 20:132342-4832374 3 | 20:132342-4832374 4 | 20:132342-4832374 5 | 20:132342-4832374 6 | 20:132342-4832374 7 | 20:132342-4832374 8 | 20:132342-4832374 9 | 20:132342-4832374 10 | 20:132342-4832374 11 | 20:132342-4832374 12 | 20:132342-4832374 13 | 20:132342-4832374 14 | 20:132342-4832374 15 | 20:132342-4832374 16 | 20:132342-4832374 17 | 20:132342-4832374 18 | 20:132342-4832374 19 | -------------------------------------------------------------------------------- /testdata/test_mixed.loc: -------------------------------------------------------------------------------- 1 | 20:132342-4832374 2 | 20:132342-4832374 3 | 20 132342 4832374 4 | 20 132342 4832374 5 | 20 
132342 4832374 6 | 20:132342-4832374 7 | 20:132342-4832374 8 | 20 132342 4832374 9 | 20 132342 4832374 + 10 | 20 132342 4832374 + 11 | 20 132342 4832374 + 12 | 20:132342-4832374 13 | 20:132342-4832374 14 | 20:132342-4832374 15 | 20:132342-4832374 16 | 20 132342 4832374 + 17 | 20 132342 4832374 + 18 | 20 132342 4832374 + 19 | -------------------------------------------------------------------------------- /.travis_scripts/clang.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add - 4 | sudo apt-add-repository 'deb http://llvm.org/apt/precise/ llvm-toolchain-precise-3.5 main' 5 | sudo apt-get -qq update 6 | sudo apt-get -qq --force-yes install clang-3.5 clang-modernize-3.5 # clang-format-3.5 7 | sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-3.5 1 8 | sudo rm /usr/local/clang-3.4/bin/clang++ 9 | -------------------------------------------------------------------------------- /gamgee/utils/file_utils.cpp: -------------------------------------------------------------------------------- 1 | #include "file_utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | 9 | namespace gamgee { 10 | namespace utils { 11 | 12 | std::shared_ptr make_shared_ifstream(std::ifstream* ifstream_ptr) { 13 | return shared_ptr(ifstream_ptr, IFStreamDeleter()); 14 | } 15 | 16 | std::shared_ptr make_shared_ifstream(std::string filename) { 17 | return make_shared_ifstream(new std::ifstream{filename}); 18 | } 19 | 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | 6 | # Compiled Dynamic libraries 7 | *.so 8 | *.dylib 9 | 10 | # Compiled Static libraries 11 | *.lai 12 | *.la 13 | *.a 14 | 15 | # Boost build directory 
16 | bin/ 17 | 18 | # swap and temporary files 19 | .swp 20 | .ycm_extra_conf.py 21 | .ycm_extra_conf.pyc 22 | .DS_Store 23 | 24 | # doxygen directory 25 | dox/ 26 | 27 | # vim temporaries 28 | *.swp 29 | .vimbup/ 30 | 31 | #emacs temporaries 32 | *.*~ 33 | 34 | # Haroopad temporaries 35 | *.bak 36 | 37 | # Eclipse project files 38 | .settings/ 39 | .cproject 40 | .project 41 | 42 | # Jekyll site cache 43 | _site/ 44 | 45 | # CLion project files 46 | .idea/ -------------------------------------------------------------------------------- /testdata/var_hdr_merge/test2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##FILTER= 5 | ##FILTER= 6 | ##contig= 7 | ##contig= 8 | ##FORMAT= 9 | ##FORMAT= 10 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE2 SAMPLE3 11 | -------------------------------------------------------------------------------- /gamgee/reference_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "reference_iterator.h" 2 | 3 | #include "exceptions.h" 4 | #include "fastq_iterator.h" 5 | 6 | namespace gamgee { 7 | /* Advances the underlying iterator until the current sequence's name equals `chromosome`, then returns the base at the one-based position. Throws ChromosomeNotFoundException when the iterator is exhausted first, and ChromosomeSizeException when the position is past the end of the sequence. Both are thrown by value (not via `new`) so that handlers catching by reference match and no exception object is leaked. */ 8 | const char ReferenceIterator::ref_base(const std::string& chromosome, const int one_based_location) { 9 | while (m_iterator != FastqIterator{} && chromosome != m_sequence.name()) 10 | m_sequence = ++m_iterator; 11 | 12 | if (m_iterator == FastqIterator{}) 13 | throw ChromosomeNotFoundException{chromosome}; 14 | 15 | if (one_based_location > static_cast<int>(m_sequence.sequence().size())) 16 | throw ChromosomeSizeException{chromosome, m_sequence.sequence().size(), one_based_location}; 17 | 18 | return m_sequence.sequence()[one_based_location-1]; 19 | } 20 | 21 | } // namespace gamgee 22 | 23 | -------------------------------------------------------------------------------- /test/missing_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 
#include 3 | #include 4 | 5 | #include "missing.h" 6 | #include "htslib/vcf.h" 7 | 8 | using namespace std; 9 | using namespace gamgee; 10 | 11 | BOOST_AUTO_TEST_CASE( detect_missing_float ) { 12 | float missing_value; 13 | bcf_float_set_missing(missing_value); 14 | BOOST_CHECK(missing(missing_value)); 15 | } 16 | 17 | BOOST_AUTO_TEST_CASE( distinguish_missing_float_from_other_nan_values ) { 18 | // end of vector should not be confused with missing 19 | float end_of_vector_value; 20 | bcf_float_set_vector_end(end_of_vector_value); 21 | BOOST_CHECK(! missing(end_of_vector_value)); 22 | 23 | // quiet NaN should not be confused with missing 24 | float quiet_nan = std::numeric_limits::quiet_NaN(); 25 | BOOST_CHECK(std::isnan(quiet_nan)); 26 | BOOST_CHECK(! missing(quiet_nan)); 27 | } 28 | -------------------------------------------------------------------------------- /.travis_scripts/gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/travis/gcc_4.9.1-1_amd64.deb 4 | sudo apt-get remove cpp libffi-dev 5 | sudo dpkg --install gcc_4.9.1-1_amd64.deb 6 | 7 | echo "BEGIN Eliminating old libstdc++" 8 | sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++.a 9 | sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++.la 10 | sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++_s.a 11 | sudo rm /usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/libstdc++_sjlj_6.dll 12 | sudo rm /usr/lib/gcc/x86_64-linux-gnu/4.6/libstdc++.a 13 | sudo rm /usr/lib/gcc/x86_64-linux-gnu/4.6/libstdc++.so 14 | sudo rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6 15 | sudo rm /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.16 16 | echo "END Eliminating old libstdc++" 17 | 18 | export LD_LIBRARY_PATH=/usr/lib64 19 | sudo ln -s /usr/lib64/libstd* /usr/lib/x86_64-linux-gnu/ 20 | -------------------------------------------------------------------------------- /gamgee/reference_map.cpp: 
-------------------------------------------------------------------------------- 1 | #include "reference_map.h" 2 | 3 | #include "fastq_reader.h" 4 | #include "interval.h" 5 | #include "utils/utils.h" 6 | 7 | #include 8 | #include 9 | 10 | using namespace gamgee; 11 | using namespace std; 12 | 13 | namespace gamgee { 14 | 15 | ReferenceMap::ReferenceMap(const std::string& filename) { 16 | FastqReader reader{filename}; 17 | read_fastq(reader); 18 | } 19 | 20 | void ReferenceMap::read_fastq(FastqReader& reader) { 21 | for (auto& fq : reader) 22 | this->insert({fq.name(), fq.sequence()}); 23 | } 24 | 25 | string ReferenceMap::get_sequence(const Interval& interval, const bool reverse_strand) const { 26 | const auto& seq = this->at(interval.chr()); 27 | auto result = seq.substr(interval.start()-1, interval.size()); 28 | return reverse_strand ? utils::complement(result) : result; 29 | } 30 | 31 | } // namespace gamgee 32 | -------------------------------------------------------------------------------- /testdata/var_hdr_merge/test3.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##FILTER= 6 | ##FILTER= 7 | ##FILTER= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##FORMAT= 12 | ##FORMAT= 13 | ##FORMAT= 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 -------------------------------------------------------------------------------- /test/read_group_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include "sam/read_group.h" 6 | 7 | 8 | using namespace std; 9 | using namespace gamgee; 10 | using namespace boost; 11 | 12 | BOOST_AUTO_TEST_CASE( simple_record_test ) { 13 | auto record = string{"@RG\tID:12345\tPL:Illumina\tSM:#$%^"}; 14 | auto rg = ReadGroup(record); 15 | BOOST_CHECK_EQUAL(rg.id, "12345"); 16 | BOOST_CHECK_EQUAL(rg.sample, "#$%^"); 17 | 
BOOST_CHECK_EQUAL(rg.platform, "Illumina"); 18 | BOOST_CHECK_EQUAL(rg.center, ""); 19 | } 20 | 21 | BOOST_AUTO_TEST_CASE( strange_punctuation_test) { 22 | auto record = string{"@RG\tID:lalala:lalala\tPL:Spiffy new platform"}; 23 | auto rg = ReadGroup(record); 24 | BOOST_CHECK_EQUAL(rg.id, "lalala:lalala"); 25 | BOOST_CHECK_EQUAL(rg.platform, "Spiffy new platform"); 26 | } 27 | 28 | BOOST_AUTO_TEST_CASE(user_defined_field_test) { 29 | auto record = string{"@RG\txy:xxxx\tab:aaaa"}; 30 | auto rg = ReadGroup(record); 31 | } 32 | -------------------------------------------------------------------------------- /.travis_scripts/update_website_dox.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$CXX" == "clang++" ] && [ "$TRAVIS_BRANCH" == "master" ] && [ "$TRAVIS_PULL_REQUEST" == "false" ]; 4 | then 5 | echo -e "Downloading latest Doxygen..."; 6 | cd ${HOME}; 7 | wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/travis/doxygen_1.8.8-1_amd64.deb 8 | sudo dpkg --install doxygen_1.8.8-1_amd64.deb 9 | cd ${HOME}/build/broadinstitute/gamgee; 10 | doxygen 11 | 12 | echo -e "Publishing doxygen...\n"; 13 | git config --global user.email "travis@travis-ci.org"; 14 | git config --global user.name "travis-ci"; 15 | git clone --branch=gh-pages https://${GH_TOKEN}@github.com/broadinstitute/gamgee gh-pages; 16 | cd gh-pages; 17 | rm -rf doxygen/; 18 | mv ../dox/html doxygen/; 19 | git add doxygen/; 20 | git commit -am "Latest doxygen documentation on successful travis build ${TRAVIS_BUILD_NUMBER} auto-pushed"; 21 | git push origin gh-pages 22 | 23 | echo -e "Published doxygen.\n" 24 | 25 | fi 26 | 27 | -------------------------------------------------------------------------------- /gamgee/utils/file_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__file_utils__guard 2 | #define gamgee__file_utils__guard 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 
namespace gamgee { 9 | namespace utils { 10 | 11 | /** 12 | * @brief a functor object to delete an ifstream 13 | */ 14 | struct IFStreamDeleter { 15 | void operator()(std::ifstream* p) const { 16 | if ( p != nullptr ) { 17 | p->close(); 18 | delete p; 19 | } 20 | } 21 | }; 22 | 23 | /** 24 | * @brief wraps a pre-allocated ifstream in a shared_ptr with correct deleter 25 | * @param ifstream_ptr an ifstream raw file pointer 26 | */ 27 | std::shared_ptr make_shared_ifstream(std::ifstream* ifstream_ptr); 28 | 29 | /** 30 | * @brief wraps an input file in a shared_ptr to an ifstream with correct deleter 31 | * @param filename the input filename 32 | */ 33 | std::shared_ptr make_shared_ifstream(std::string filename); 34 | 35 | } 36 | } 37 | 38 | #endif /* gamgee__file_utils__guard */ 39 | -------------------------------------------------------------------------------- /gamgee/fastq.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq.h" 2 | #include "utils/utils.h" 3 | 4 | #include 5 | 6 | using namespace std; 7 | 8 | namespace gamgee { 9 | 10 | bool Fastq::is_fastq() const { 11 | return !m_quals.empty(); 12 | } 13 | 14 | void Fastq::chop(const int nBases) { 15 | m_sequence.erase(0, nBases); 16 | if (is_fastq()) 17 | m_quals.erase(0, nBases); 18 | } 19 | 20 | void Fastq::reverse_complement() { 21 | m_sequence = utils::reverse_complement(m_sequence); 22 | } 23 | 24 | } // end of namespace 25 | 26 | static string print_fasta_record(const gamgee::Fastq& fq) { 27 | return ">" + fq.name() + " " + fq.comment() + "\n" + fq.sequence() + "\n"; 28 | } 29 | 30 | static string print_fastq_record(const gamgee::Fastq& fq) { 31 | return "@" + fq.name() + " " + fq.comment() + "\n" + fq.sequence() + "\n+\n" + fq.quals() + "\n"; 32 | } 33 | 34 | std::ostream& operator<< (std::ostream& os, const gamgee::Fastq& fq) { 35 | return os << (fq.is_fastq() ? 
print_fastq_record(fq) : print_fasta_record(fq)); 36 | } 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | env: 4 | global: 5 | - secure: gFJsmma5duxLv+bcn5Qvq6M9r3w0XZZ/GVlolSX3t3rlyqyGwEiN71F7+8Rtv+pzgUaMELKuA8JO+x6ZpOCnMCa/RB/TV6KJrZ31UPfV7BJjt6QzWWPMlkT+3i+pM3PUVoN1zTojz+SE9pMhyILeIx+23S/8mHGXmmIW1Es001I= 6 | - secure: Sb3uSRvYdqEnh9sQngr5W+WbrodlZjW7cIg2ZAECj1IabZc4rn4Xp4+vmLiCjRCl5p6idL515ELLG4OYv5joqM/XtY7mIzCbHkaxs81+3PBroGezgW6gxWqLDtwoI76aonk3QHkD38L2uMKhZADUxK3K4l9fmauSrivz2TPU6Rw= 7 | 8 | compiler: 9 | - clang 10 | - gcc 11 | 12 | before_install: 13 | - sudo add-apt-repository -y ppa:boost-latest/ppa 14 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 15 | - sudo apt-get -qq update; 16 | 17 | install: 18 | - .travis_scripts/cmake.sh 19 | - .travis_scripts/boost.sh 20 | - if [ "$CXX" == "clang++" ]; then .travis_scripts/clang.sh; fi 21 | - if [ "$CXX" == "g++" ]; then .travis_scripts/gcc.sh; fi 22 | 23 | script: 24 | - mkdir build; pushd build; 25 | - cmake -D CMAKE_CXX_FLAGS=--coverage .. 
&& make debug && make -j 2 run_test 26 | - popd; 27 | 28 | after_success: 29 | - .travis_scripts/update_website_dox.sh 30 | - .travis_scripts/coveralls.sh 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 MauricioCarneiro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | project(gamgee) 3 | 4 | include(ExternalProject) 5 | 6 | # This is a C++14 library 7 | add_compile_options("-std=c++1y") 8 | 9 | # Dependency: Boost Unit Test Framework (find in the system) 10 | find_package(Boost 1.55 COMPONENTS unit_test_framework REQUIRED) 11 | include_directories(${Boost_INCLUDE_DIRS}) 12 | 13 | # enable installing dependencies 14 | option(INSTALL_DEPENDENCIES 15 | "Install project dependencies" 16 | OFF) 17 | 18 | # Dependency: htslib (download and build) 19 | include("contrib/htslib.cmake") 20 | 21 | include_directories(gamgee) 22 | 23 | add_subdirectory(gamgee) 24 | add_subdirectory(test) 25 | 26 | ADD_CUSTOM_TARGET(debug 27 | COMMAND ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=Debug ${CMAKE_SOURCE_DIR} 28 | COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target all 29 | COMMENT "Switch CMAKE_BUILD_TYPE to Debug" 30 | ) 31 | 32 | ADD_CUSTOM_TARGET(release 33 | COMMAND ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=Release ${CMAKE_SOURCE_DIR} 34 | COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target all 35 | COMMENT "Switch CMAKE_BUILD_TYPE to Release" 36 | ) 37 | -------------------------------------------------------------------------------- /gamgee/sam/sam_writer.cpp: -------------------------------------------------------------------------------- 1 | #include "sam_writer.h" 2 | 3 | #include "../utils/hts_memory.h" 4 | 5 | namespace gamgee { 6 | 7 | SamWriter::SamWriter(const std::string& output_fname, const bool binary) : 8 | m_out_file {utils::make_unique_hts_file(open_file(output_fname, binary ? 
"wb" : "w"))}, 9 | m_header {nullptr} 10 | {} 11 | 12 | SamWriter::SamWriter(const SamHeader& header, const std::string& output_fname, const bool binary) : 13 | m_out_file {utils::make_unique_hts_file(open_file(output_fname, binary ? "wb" : "w"))}, 14 | m_header{header} 15 | { 16 | write_header(); 17 | } 18 | 19 | void SamWriter::add_header(const SamHeader& header) { 20 | m_header = header; 21 | write_header(); 22 | } 23 | 24 | void SamWriter::add_record(const Sam& body) { 25 | sam_write1(m_out_file.get(), m_header.m_header.get(), body.m_body.get()); 26 | } 27 | 28 | htsFile* SamWriter::open_file(const std::string& output_fname, const std::string& mode) { 29 | return hts_open(output_fname.empty() ? "-" : output_fname.c_str(), mode.c_str()); 30 | } 31 | 32 | void SamWriter::write_header() const { 33 | sam_hdr_write(m_out_file.get(), m_header.m_header.get()); 34 | } 35 | 36 | 37 | } 38 | -------------------------------------------------------------------------------- /gamgee/sam/sam_tag.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__sam_tag__guard 2 | #define gamgee__sam_tag__guard 3 | 4 | #include 5 | 6 | namespace gamgee { 7 | 8 | /** 9 | * @brief class to represent a Sam TAG:TYPE:VALUE entry 10 | */ 11 | template 12 | class SamTag { 13 | public: 14 | explicit SamTag(const std::string& name, const TAG_TYPE& value, const bool missing = false) : 15 | m_name { name }, 16 | m_value { value }, 17 | m_missing { missing } 18 | {} 19 | 20 | explicit SamTag(const std::string& name, TAG_TYPE&& value, const bool missing = false) : 21 | m_name { name }, 22 | m_value { std::move(value) }, 23 | m_missing { missing } 24 | {} 25 | 26 | SamTag(const SamTag& other) = default; 27 | SamTag(SamTag&& other) = default; 28 | SamTag& operator=(const SamTag& other) = default; 29 | SamTag& operator=(SamTag&& other) = default; 30 | ~SamTag() = default; 31 | 32 | std::string name() const { return m_name; } 33 | TAG_TYPE value() const { 
return m_value; } 34 | bool missing() const { return m_missing; } 35 | 36 | private: 37 | std::string m_name; 38 | TAG_TYPE m_value; 39 | bool m_missing; 40 | }; 41 | 42 | } 43 | 44 | #endif // gamgee__sam_tag__guard 45 | -------------------------------------------------------------------------------- /testdata/test_reference.fa: -------------------------------------------------------------------------------- 1 | >chrA comment has free form text 2 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 3 | >chrB 4 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 5 | >chrC 6 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 7 | >chrD 8 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 9 | >chrE 10 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 11 | >chrF 12 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 13 | >chrG 14 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 15 | >chrH 16 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 17 | >chrI 18 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 19 | >chrJ 20 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 21 | >chrK 22 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 23 | >chrL 24 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 25 | >chrM 26 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 27 | >chrN 28 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 29 | >chrO 30 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 31 | >chrQ 32 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 33 | >chrR 34 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 35 | >chrS 36 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 37 | >chrT 38 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 39 | >chrU 40 | AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT 41 | -------------------------------------------------------------------------------- /gamgee/reference_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__reference_iterator__guard 2 | #define gamgee__reference_iterator__guard 3 | 4 | #include "fastq_iterator.h" 5 | #include "fastq_reader.h" 6 | 7 | 
/**
 * @brief Utility class to access reference bases in a FastA-formatted reference genome
 *
 * The object owns an iterator over the input file and keeps a reference to
 * the record that iterator currently points at.
 *
 * @warning the chromosomes in this reference must be accessed in order, or an exception will be thrown
 */
class ReferenceIterator {
 public:
  /**
   * @brief opens the reference and positions the iterator on its first record
   *
   * The iterator is obtained from a temporary FastqReader built on @p filename.
   *
   * @param filename path of the reference file handed to FastqReader
   */
  ReferenceIterator(const std::string& filename) :
    m_iterator {FastqReader{filename}.begin()},
    m_sequence {*m_iterator}
  {}

  /**
    * @brief return the reference base character at the desired location
    * @param chromosome the chromosome of the desired base
    * @param one_based_location the one-based genomic location of the base
    * @return the base as a char (defined outside this header)
    */
  const char ref_base(const std::string& chromosome, const int one_based_location);

 private:
  // Declaration order matters: members are initialized in the order they are
  // declared, and m_sequence is bound to *m_iterator in the constructor.
  // NOTE(review): m_sequence is a reference member, so a compiler-generated copy
  // would presumably keep referring to the source object's record -- confirm
  // whether copies of this class are expected to be safe.
  FastqIterator m_iterator;  ///< @brief the current state of the iterator through the FastA input file
  Fastq& m_sequence;         ///< @brief a reference to the iterator's current sequence
};
# Sources compiled into the unit-test executable (test_utils.h is listed
# alongside the .cpp files).
set(SOURCE_FILES
    cigar_test.cpp
    fastq_reader_test.cpp
    fastq_test.cpp
    genotypes_test.cpp
    indexed_sam_reader_test.cpp
    indexed_variant_reader_test.cpp
    interval_test.cpp
    main.cpp
    missing_test.cpp
    multiple_variant_reader_test.cpp
    read_group_test.cpp
    reference_block_splitting_variant_reader_test.cpp
    reference_test.cpp
    sam_builder_test.cpp
    sam_header_test.cpp
    sam_reader_test.cpp
    sam_test.cpp
    select_if_test.cpp
    short_value_optimized_storage_test.cpp
    synced_variant_reader_test.cpp
    test_utils.h
    utils_test.cpp
    variant_builder_multi_sample_vector_test.cpp
    variant_builder_test.cpp
    variant_header_test.cpp
    variant_reader_test.cpp
    variant_test.cpp)

# EXCLUDE_FROM_ALL keeps the test binary out of the default build; it is only
# built when requested, e.g. through the run_test target below.
add_executable(gamgee_test EXCLUDE_FROM_ALL ${SOURCE_FILES})

# BOOST_TEST_DYN_LINK: the tests link against the shared Boost.Test library.
target_compile_definitions(gamgee_test PUBLIC -DBOOST_TEST_DYN_LINK)
# Link the gamgee library, the htslib archive (${htslib_LIB}), Boost, pthread and zlib.
target_link_libraries(gamgee_test gamgee ${htslib_LIB} ${Boost_LIBRARIES} pthread z)
# Make sure the htslib target has been built before the tests are.
add_dependencies(gamgee_test htslib)

# run_test builds gamgee_test and runs it with the source tree root as the
# working directory.
add_custom_target(run_test COMMAND ${CMAKE_BINARY_DIR}/test/gamgee_test DEPENDS gamgee_test WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
-------------------------------------------------------------------------------- 1 | #ifndef gamgee__variant_utils__guard 2 | #define gamgee__variant_utils__guard 3 | 4 | #include "hts_memory.h" 5 | 6 | #include "htslib/vcf.h" 7 | 8 | #include 9 | #include 10 | 11 | namespace gamgee { 12 | 13 | /** 14 | * @brief allows the caller to include only selected samples in a Variant Reader. To create a sites only file, 15 | * simply pass an empty vector of samples. 16 | * 17 | * @param samples the list of samples you want included/excluded from your iteration 18 | * @param whether you want these samples to be included or excluded from your iteration. 19 | */ 20 | void subset_variant_samples(bcf_hdr_t* hdr_ptr, const std::vector& samples, const bool include); 21 | 22 | enum class AlleleType { REFERENCE, SNP, INSERTION, DELETION }; 23 | 24 | using AlleleMask = std::vector; 25 | 26 | /** 27 | * @brief merges a variant header into another 28 | * 29 | * @param dest_hdr_ptr a shared pointer to a bcf_hdr_t containing the header to be merged into 30 | * @param src_hdr_ptr a shared pointer to a bcf_hdr_t containing the header to be merged from 31 | */ 32 | void merge_variant_headers(const std::shared_ptr& dest_hdr_ptr, const std::shared_ptr& src_hdr_ptr); 33 | } 34 | 35 | #endif /* gamgee__variant_utils__guard */ 36 | -------------------------------------------------------------------------------- /gamgee/zip.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__zip__guard 2 | #define gamgee__zip__guard 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace gamgee { 9 | namespace utils { 10 | 11 | /** 12 | * @brief utility method to zip iterators together with simpler syntax than boost 13 | * 14 | * This is a wrapper over boost's zip_iterator interface to simplify the usage of zip 15 | * iterators especially in for each loops. 
This function enables the following syntax: 16 | * 17 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | * for (const auto tup : zip(a, b, c, d) { 19 | * ... // use tup values as a boost::tuple 20 | * } 21 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 22 | * 23 | * for more details look at boost's zip_iterator documentation. 24 | */ 25 | template 26 | auto zip(const T&... containers) -> boost::iterator_range> 27 | { 28 | auto zip_begin = boost::make_zip_iterator(boost::make_tuple(std::begin(containers)...)); 29 | auto zip_end = boost::make_zip_iterator(boost::make_tuple(std::end(containers)...)); 30 | return boost::make_iterator_range(zip_begin, zip_end); 31 | } 32 | 33 | } // namespace utils 34 | } // namespace gamgee 35 | 36 | #endif /* gamgee__zip__guard */ 37 | -------------------------------------------------------------------------------- /gamgee/fastq_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq_reader.h" 2 | #include "fastq_iterator.h" 3 | 4 | #include "exceptions.h" 5 | #include "utils/file_utils.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | namespace gamgee { 16 | 17 | FastqReader::FastqReader(const std::string& filename) : 18 | m_input_stream {} 19 | { 20 | if (!filename.empty()) { 21 | init_reader(filename); 22 | } 23 | } 24 | 25 | FastqReader::FastqReader(const std::vector& filenames) : 26 | m_input_stream {} 27 | { 28 | if (filenames.size() > 1) 29 | throw SingleInputException{"filenames", filenames.size()}; 30 | if (!filenames.empty()) { 31 | init_reader(filenames.front()); 32 | } 33 | } 34 | 35 | FastqReader::FastqReader(std::istream* const input) : 36 | m_input_stream{shared_ptr(input)} 37 | {} 38 | 39 | FastqIterator FastqReader::begin() { 40 | return FastqIterator{m_input_stream}; 41 | } 42 | 43 | FastqIterator FastqReader::end() { 44 | return FastqIterator{}; 45 | } 46 | 47 | void FastqReader::init_reader(const 
std::string& filename) { 48 | m_input_stream = utils::make_shared_ifstream(filename); 49 | if ( m_input_stream->fail() ) { 50 | throw FileOpenException{filename}; 51 | } 52 | } 53 | 54 | 55 | } // end of namespace 56 | -------------------------------------------------------------------------------- /testdata/ref_block/problem3_file2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##ALT= 3 | ##FILTER= 4 | ##INFO= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##contig= 13 | ##reference=Homo_sapiens_assembly19.fasta 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE2 15 | 1 3319304 . C . . END=3319598 GT:DP:GQ:MIN_DP:PL 0/0:151:99:66:0,63,1800 16 | 1 3319599 . G . . END=3319599 GT:DP:GQ:MIN_DP:PL 0/0:101:52:101:0,52,2707 17 | -------------------------------------------------------------------------------- /gamgee/reference_map.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__reference_map__guard 2 | #define gamgee__reference_map__guard 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "fastq_reader.h" 9 | #include "interval.h" 10 | 11 | namespace gamgee { 12 | 13 | /** 14 | * @brief Utility class to create a reference object for all reference operations in Foghorn. 15 | * 16 | * It hash table stores chr -> sequence as strings internally. 
17 | */ 18 | class ReferenceMap : public std::unordered_map { 19 | public: 20 | 21 | /** 22 | * @brief opens a fasta/fastq reference file and creates a ReferenceMap from it 23 | * 24 | * @param filename reference genome fasta or fastq file 25 | */ 26 | ReferenceMap(const std::string& filename); 27 | 28 | /** 29 | * @brief locates the DNA sequence for a given Interval 30 | * 31 | * @return DNA sequence for the requested Intervals 32 | */ 33 | std::string get_sequence(const Interval& interval, ///< location in the genome 34 | const bool reverse_strand = false ///< which strand, releative to the reference genome, to produce the sequence for 35 | )const; 36 | 37 | private: 38 | void load_reference_from_fastq_reader(gamgee::FastqReader& reader); 39 | void read_fastq(gamgee::FastqReader& reader); 40 | }; 41 | 42 | } // namespace gamgee 43 | 44 | #endif /* gamgee__reference_map__guard */ 45 | -------------------------------------------------------------------------------- /testdata/test_variants_mixed_ploidy.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 19 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA 0:12:0,10,1000:3.1,2.2:CA 1:650:10,100,0:3.1,2.2:XISPAFORRO 20 | -------------------------------------------------------------------------------- /testdata/mvr_hdr/test2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##FILTER= 5 | ##FILTER= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##FORMAT= 13 | #CHROM POS ID 
// Checks the sequence information exposed by the header of testdata/test_simple.bam.
BOOST_AUTO_TEST_CASE( sam_header ) {
  auto reader = SingleSamReader{"testdata/test_simple.bam"};
  auto header = reader.header();
  // the file is expected to declare exactly one sequence, "chr1", of length 100000
  BOOST_CHECK_EQUAL("chr1", header.sequence_name(0));
  BOOST_CHECK_EQUAL(100000u, header.sequence_length("chr1"));   // lookup by name
  BOOST_CHECK_EQUAL(100000u, header.sequence_length(0));        // lookup by index
  BOOST_CHECK_EQUAL(0u, header.sequence_length("foo"));         // a name not in the header is expected to yield 0
  BOOST_CHECK_EQUAL(1u, header.n_sequences());
}

// Checks the read groups exposed by the header of testdata/test_simple.bam.
BOOST_AUTO_TEST_CASE( sam_header_read_groups ) {
  auto reader = SingleSamReader{"testdata/test_simple.bam"};
  auto header = reader.header();
  auto rgs = header.read_groups();
  // a single read group is expected; its id, platform and library fields are checked
  BOOST_CHECK_EQUAL(1u, rgs.size());
  BOOST_CHECK_EQUAL(rgs[0].id, "exampleBAM.bam");
  BOOST_CHECK_EQUAL(rgs[0].platform, "illumina");
  BOOST_CHECK_EQUAL(rgs[0].library, "exampleBAM.bam");
}
// Smoke test for the header's special member functions: the results are not
// inspected, the test only has to run without blowing up.
BOOST_AUTO_TEST_CASE( sam_header_constructors ) {
  auto reader = SingleSamReader{"testdata/test_simple.bam"};
  auto h0 = reader.header();
  auto copies = check_copy_constructor(h0);
  // NOTE(review): `moves` is also produced by check_copy_constructor, so as written
  // this test only exercises the copy helper twice. Presumably a move-checking helper
  // from test_utils.h was intended here -- confirm and switch if it exists.
  auto moves = check_copy_constructor(h0);
  // need builder to be able to modify the header and check. At least this test will blow up if something is not functional.
}
END=3319614 GT:DP:GQ:MIN_DP:PL 0/0:84:99:71:0,120,1800 19 | -------------------------------------------------------------------------------- /gamgee/sam/sam_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "sam_iterator.h" 2 | #include "sam.h" 3 | 4 | #include "../utils/hts_memory.h" 5 | 6 | using namespace std; 7 | 8 | namespace gamgee { 9 | 10 | SamIterator::SamIterator() : 11 | m_sam_file_ptr {nullptr}, 12 | m_sam_header_ptr {nullptr}, 13 | m_sam_record_ptr {nullptr} 14 | {} 15 | 16 | SamIterator::SamIterator(const std::shared_ptr& sam_file_ptr, const std::shared_ptr& sam_header_ptr) : 17 | m_sam_file_ptr {sam_file_ptr}, 18 | m_sam_header_ptr {sam_header_ptr}, 19 | m_sam_record_ptr {utils::make_shared_sam(bam_init1())}, ///< important to initialize the record buffer in the constructor so we can reuse it across the iterator 20 | m_sam_record {m_sam_header_ptr, m_sam_record_ptr} 21 | { 22 | fetch_next_record(); 23 | } 24 | 25 | Sam& SamIterator::operator*() { 26 | return m_sam_record; 27 | } 28 | 29 | Sam& SamIterator::operator++() { 30 | fetch_next_record(); 31 | return m_sam_record; 32 | } 33 | 34 | bool SamIterator::operator!=(const SamIterator& rhs) { 35 | return m_sam_file_ptr != rhs.m_sam_file_ptr; 36 | } 37 | /** 38 | * @brief pre-fetches the next sam record 39 | * @warning we're reusing the existing htslib memory, so users should be aware that all objects from the previous iteration are now stale unless a deep copy has been performed 40 | */ 41 | void SamIterator::fetch_next_record() { 42 | if (sam_read1(m_sam_file_ptr.get(), m_sam_header_ptr.get(), m_sam_record_ptr.get()) < 0) { 43 | m_sam_file_ptr = nullptr; 44 | m_sam_record = Sam{}; 45 | } 46 | } 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /testdata/mvr_hdr/test1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 
// Pins, for every CigarOperator, whether Cigar::consumes_read_bases reports it
// as consuming bases of the read.
BOOST_AUTO_TEST_CASE( cigar_consumes_read_bases ) {
  // expected to consume read bases: M, EQ, X, I, S
  // expected not to consume read bases: D, H, N, P, B
  BOOST_CHECK(Cigar::consumes_read_bases(CigarOperator::M));
  BOOST_CHECK(Cigar::consumes_read_bases(CigarOperator::EQ));
  BOOST_CHECK(Cigar::consumes_read_bases(CigarOperator::X));
  BOOST_CHECK(Cigar::consumes_read_bases(CigarOperator::I));
  BOOST_CHECK(!Cigar::consumes_read_bases(CigarOperator::D));
  BOOST_CHECK(Cigar::consumes_read_bases(CigarOperator::S));
  BOOST_CHECK(!Cigar::consumes_read_bases(CigarOperator::H));
  BOOST_CHECK(!Cigar::consumes_read_bases(CigarOperator::N));
  BOOST_CHECK(!Cigar::consumes_read_bases(CigarOperator::P));
  BOOST_CHECK(!Cigar::consumes_read_bases(CigarOperator::B));
}
BOOST_CHECK(Cigar::consumes_reference_bases(CigarOperator::M)); 24 | BOOST_CHECK(Cigar::consumes_reference_bases(CigarOperator::EQ)); 25 | BOOST_CHECK(Cigar::consumes_reference_bases(CigarOperator::X)); 26 | BOOST_CHECK(!Cigar::consumes_reference_bases(CigarOperator::I)); 27 | BOOST_CHECK(Cigar::consumes_reference_bases(CigarOperator::D)); 28 | BOOST_CHECK(!Cigar::consumes_reference_bases(CigarOperator::S)); 29 | BOOST_CHECK(!Cigar::consumes_reference_bases(CigarOperator::H)); 30 | BOOST_CHECK(Cigar::consumes_reference_bases(CigarOperator::N)); 31 | BOOST_CHECK(!Cigar::consumes_reference_bases(CigarOperator::P)); 32 | BOOST_CHECK(!Cigar::consumes_reference_bases(CigarOperator::B)); 33 | } 34 | -------------------------------------------------------------------------------- /gamgee/variant/variant_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "variant_iterator.h" 2 | #include "variant.h" 3 | 4 | #include "../utils/hts_memory.h" 5 | 6 | using namespace std; 7 | 8 | namespace gamgee { 9 | 10 | VariantIterator::VariantIterator(const std::shared_ptr& variant_file_ptr, const std::shared_ptr& variant_header_ptr) : 11 | m_variant_file_ptr {variant_file_ptr}, 12 | m_variant_header_ptr {variant_header_ptr}, 13 | m_variant_record_ptr {utils::make_shared_variant(bcf_init1())}, ///< important to initialize the record buffer in the constructor so we can reuse it across the iterator 14 | m_variant_record {m_variant_header_ptr, m_variant_record_ptr} 15 | { 16 | fetch_next_record(); 17 | } 18 | 19 | Variant& VariantIterator::operator*() { 20 | return m_variant_record; 21 | } 22 | 23 | Variant& VariantIterator::operator++() { 24 | fetch_next_record(); 25 | return m_variant_record; 26 | } 27 | 28 | bool VariantIterator::operator!=(const VariantIterator& rhs) const { 29 | return m_variant_file_ptr != rhs.m_variant_file_ptr; 30 | } 31 | 32 | bool VariantIterator::empty() const { 33 | return !m_variant_file_ptr; 34 | } 35 | 
/**
 * @brief complement of a single base
 *
 * A/C/G/T are swapped with T/G/C/A, keeping the letter case; any other
 * character is returned unchanged.
 */
char complement_base (const char base) {
  static constexpr char bases[]    = {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'};
  static constexpr char partners[] = {'T', 'G', 'C', 'A', 't', 'g', 'c', 'a'};
  const auto* const last = bases + sizeof(bases);
  const auto* const hit = std::find(bases, last, base);
  return hit == last ? base : partners[hit - bases];
}

/**
 * @brief complements a sequence in place and also returns a copy of the result
 */
std::string complement(std::string& sequence) {
  for (auto& base : sequence)
    base = complement_base(base);
  return sequence;
}

/**
 * @brief returns the complement of a sequence, leaving the input untouched
 */
std::string complement(const std::string& sequence) {
  auto complemented = std::string{};
  complemented.reserve(sequence.length());
  for (const auto base : sequence)
    complemented.push_back(complement_base(base));
  return complemented;
}

/**
 * @brief returns the reverse complement of a sequence, leaving the input untouched
 */
std::string reverse_complement(const std::string& sequence) {
  auto reversed = std::string(sequence.crbegin(), sequence.crend());
  for (auto& base : reversed)
    base = complement_base(base);
  return reversed;
}
array_size) { 54 | auto result = std::vector{}; 55 | result.reserve(array_size); 56 | for (auto i = 0u; i != array_size; ++i) 57 | result.emplace_back(string_array[i]); 58 | return result; 59 | } 60 | 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /gamgee/variant/variant_writer.cpp: -------------------------------------------------------------------------------- 1 | #include "variant_writer.h" 2 | 3 | #include "../utils/hts_memory.h" 4 | 5 | #include 6 | 7 | namespace gamgee { 8 | 9 | VariantWriter::VariantWriter(const std::string& output_fname, const bool binary, const int compression_level) : 10 | m_out_file {utils::make_unique_hts_file(open_file(output_fname, write_mode(binary, compression_level)))}, 11 | m_header {nullptr} 12 | {} 13 | 14 | VariantWriter::VariantWriter(const VariantHeader& header, const std::string& output_fname, const bool binary, const int compression_level) : 15 | m_out_file {utils::make_unique_hts_file(open_file(output_fname, write_mode(binary, compression_level)))}, 16 | m_header{header} 17 | { 18 | write_header(); 19 | } 20 | 21 | std::string VariantWriter::write_mode(const bool binary, const int compression_level) const { 22 | if (compression_level != Z_DEFAULT_COMPRESSION) { 23 | if (!binary) 24 | throw new std::runtime_error{"Cannot specify compression level for VCF files"}; 25 | return "wb" + std::to_string(compression_level); 26 | } 27 | else 28 | return binary ? "wb" : "w"; 29 | } 30 | 31 | void VariantWriter::add_header(const VariantHeader& header) { 32 | m_header = header; 33 | write_header(); 34 | } 35 | 36 | void VariantWriter::add_record(const Variant& body) { 37 | bcf_write1(m_out_file.get(), m_header.m_header.get(), body.m_body.get()); 38 | } 39 | 40 | htsFile* VariantWriter::open_file(const std::string& output_fname, const std::string& mode) { 41 | return hts_open(output_fname.empty() ? 
"-" : output_fname.c_str(), mode.c_str()); 42 | } 43 | 44 | void VariantWriter::write_header() const { 45 | bcf_hdr_write(m_out_file.get(), m_header.m_header.get()); 46 | } 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /gamgee/utils/variant_utils.cpp: -------------------------------------------------------------------------------- 1 | #include "variant_utils.h" 2 | 3 | #include "hts_memory.h" 4 | 5 | #include "../exceptions.h" 6 | 7 | #include "htslib/vcf.h" 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace gamgee { 14 | 15 | void subset_variant_samples(bcf_hdr_t* hdr_ptr, const std::vector& samples, const bool include) { 16 | if (samples.empty() && include) // exclude all samples 17 | bcf_hdr_set_samples(hdr_ptr, NULL, false); 18 | 19 | else if (samples.empty() && !include) // keep all samples 20 | bcf_hdr_set_samples(hdr_ptr, "-", false); 21 | 22 | else { // select some samples 23 | auto sample_list = include ? std::string{} : std::string{"^"}; 24 | std::for_each(samples.begin(), samples.end(), [&sample_list](const auto& s) { sample_list += s + ","; }); 25 | sample_list.erase(sample_list.size() - 1); 26 | bcf_hdr_set_samples(hdr_ptr, sample_list.c_str(), false); 27 | } 28 | 29 | // NOTE: must NOT call bcf_hdr_sync() here, since htslib calls it for us in bcf_hdr_set_samples() 30 | } 31 | 32 | void merge_variant_headers(const std::shared_ptr& dest_hdr_ptr, const std::shared_ptr& src_hdr_ptr) { 33 | auto success = bcf_hdr_combine(dest_hdr_ptr.get(), src_hdr_ptr.get()); 34 | if (success != 0) 35 | throw HtslibException(success); 36 | 37 | // TODO: there is probably a more efficient way 38 | for (auto sample_counter = 0; sample_counter < bcf_hdr_nsamples(src_hdr_ptr.get()); ++sample_counter) { 39 | // don't check for error code because the only "error" is ignoring a duplicate sample, not an error for us 40 | bcf_hdr_add_sample(dest_hdr_ptr.get(), src_hdr_ptr->samples[sample_counter]); 41 | } 42 | 
43 | bcf_hdr_sync(dest_hdr_ptr.get()); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /testdata/var_hdr_merge/empty_vcf.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##INFO= 19 | ##INFO= 20 | ##FORMAT= 21 | ##FORMAT= 22 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 23 | -------------------------------------------------------------------------------- /testdata/missing_header.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##FORMAT= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 18 | 20 10001000 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 19 | 20 10002000 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 20 | 20 10003000 . A AGCT . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 22 10004000 . GAT G,GATAT . 
PASS;MISSED AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100,2,4,8:3.1,2.2:ABA 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA 22 | -------------------------------------------------------------------------------- /testdata/test_variants_alternate_ploidy.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 19 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 1:25:10,100:3.1,2.2:ABA 0:12:10,1000:3.1,2.2:CA 1:650:10,0:3.1,2.2:XISPAFORRO 20 | 20 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA 0/0:12:0,10,1000:3.1,2.2:CA 1/1:650:10,100,0:3.1,2.2:XISPAFORRO 21 | 22 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/0/1:25:10,0,32,100:3.1,2.2:ABA 0/0/0:12:0,10,30,1000:3.1,2.2:CA 1/1/1:650:10,30,100,0:3.1,2.2:XISPAFORRO 22 | -------------------------------------------------------------------------------- /testdata/test_variants_missing_data.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 20 | ##FORMAT= 21 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 0001A 0002A 0003A 0004A 0005A 0006A 0007A 0007B 0008A 0009A 0009B 22 | 1 13417 . C CGAGA 1058.79 PASS AC=18;AF=5.325e-03;AN=3380 GT:AD:DP:GQ:PL ./. ./. ./. ./. ./. ./. ./. ./. ./. ./. ./. 
23 | -------------------------------------------------------------------------------- /testdata/ref_block/test1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 20 | 1 1 . A . . END=100;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 1 101 . C . . END=200;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 | 20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | -------------------------------------------------------------------------------- /testdata/ref_block/test2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 20 | 1 50 . A . . END=150;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 1 300 . A . . END=400;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 | 20 10 . 
GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | -------------------------------------------------------------------------------- /testdata/ref_block/test5.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 20 | 1 20 . C . . END=80;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 1 600 . C . . END=700;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 | 20 10 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | -------------------------------------------------------------------------------- /testdata/test_variants_02.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##FORMAT= 13 | ##FORMAT= 14 | ##FORMAT= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 16 | 1 10000000 . T C . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,0,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 17 | 20 10001000 . GG AA . . AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:34:10,0,100:3.1,2.2:ABA 0/0:35:1,0,100:3.1,2.2:ABA 1/1:36:10,100,0:3.1,2.2:ABA 18 | 20 10002000 . TAGTGQA T . . 
AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:0:0,20,100:3.1,2.2:ABA 0/0:1:10,0,2000000000:3.1,2.2:ABA 1/1:100:10,0,0:3.1,2.2:ABA 19 | 20 10003000 . A AGCT . . AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:0:0,20,100:3.1,2.2:ABA 0/0:36:0,1,100:3.1,2.2:ABA 1/1:34:0,0,20:3.1,2.2:ABA 20 | 22 10004000 . GAT G,GATAT . . AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 0/1:350:40,10,100,2,4,8:3.1,2.2:ABA 0/0:35:10000,10,100,2,4,8:3.1,2.2:ABA 1/1:2:00,100,0,2,4,8:3.1,2.2:ABA 21 | -------------------------------------------------------------------------------- /gamgee/sam/read_group.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__read_group__guard 2 | #define gamgee__read_group__guard 3 | 4 | #include 5 | 6 | namespace gamgee { 7 | 8 | 9 | /** 10 | * @brief Helper struct to hold one read group record from a sam file header 11 | * 12 | * A read group, which is contained in a line starting with "@RG" in a sam file header, 13 | * contains information about the sequencing process: type of machine, time, center etc. 14 | * Most important for variant calling and quality control is the sample ID -- each sample 15 | * may be split into several read groups. 
class ReadGroup {

 public:
  // One string per read-group attribute. The defaulted constructor leaves
  // all of them empty.
  std::string id;
  std::string center;
  std::string description;
  std::string date_time;
  std::string flow_order;
  std::string key_sequence;
  std::string library;
  std::string programs;
  std::string median_insert_size;
  std::string platform;
  std::string platform_unit;
  std::string sample;

  ReadGroup() = default;                      ///< every attribute starts out as an empty string
  ReadGroup(const std::string& header_line);  ///< defined out of line; presumably fills the attributes from one "@RG" header line -- confirm in read_group.cpp

 private:
  // Tag constants, one per attribute above. Only declared here; their values
  // live in the implementation file.
  static const char ID_TAG [];
  static const char CENTER_TAG [];
  static const char DESCRIPTION_TAG [];
  static const char DATE_TIME_TAG [];
  static const char FLOW_ORDER_TAG [];
  static const char KEY_SEQUENCE_TAG [];
  static const char LIBRARY_TAG [];
  static const char PROGRAMS_TAG [];
  static const char MEDIAN_INSERT_SIZE_TAG [];
  static const char PLATFORM_TAG [];
  static const char PLATFORM_UNIT_TAG [];
  static const char SAMPLE_TAG [];

  /**
   * @brief tells whether the first two characters of token equal the two characters of tag
   * @param token text to inspect
   * @param tag   two-character tag to compare against
   */
  static bool starts_with(const std::string& token, const char tag[2]) {
    if (token[0] != tag[0])
      return false;
    return token[1] == tag[1];
  }
};
///< Default destruction is sufficient, since our shared_ptr will handle deallocation 21 | 22 | uint8_t operator[](const uint32_t index) const; ///< use freely as you would an array. 23 | uint8_t& operator[](const uint32_t index); ///< use freely as you would an array 24 | uint32_t size() const { return m_num_quals; } ///< number of base qualities in the container 25 | bool operator==(const BaseQuals& other) const; ///< check for equality with another BaseQuals object 26 | bool operator!=(const BaseQuals& other) const; ///< check for inequality with another BaseQuals object 27 | std::string to_string() const; ///< produce a string representation of the base qualities in this object 28 | 29 | private: 30 | std::shared_ptr m_sam_record; ///< sam record containing our base qualities, potentially co-owned by multiple other objects 31 | uint8_t* m_quals; ///< Pointer to the start of the base qualities in m_sam_record, cached for efficiency 32 | uint32_t m_num_quals; ///< Number of quality scores in our sam record 33 | 34 | friend class SamBuilder; ///< builder needs access to the internals in order to build efficiently 35 | }; 36 | 37 | } 38 | 39 | #endif /* gamgee__base_quals__guard */ 40 | -------------------------------------------------------------------------------- /testdata/test_variants_multiple_alt.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 19 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA ./. ./. 20 | 1 10000000 db2342 T G 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS ./. 0/0:12:0,10,1000:3.1,2.2:CA ./. 
21 | 1 10000000 db2342 T A 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS ./. ./. 1/1:650:10,100,0:3.1,2.2:XISPAFORRO 22 | 20 10001999 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6;VALIDATED GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | 20 10003000 . A G . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 24 | -------------------------------------------------------------------------------- /testdata/ref_block/test4.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 20 | 1 1 . A . . END=101;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 1 102 . G C, . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 | 1 103 . T . . END=500;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | 20 10 . 
GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 24 | -------------------------------------------------------------------------------- /gamgee/sam/indexed_sam_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "indexed_sam_iterator.h" 2 | #include "sam.h" 3 | 4 | #include "../utils/hts_memory.h" 5 | 6 | using namespace std; 7 | 8 | namespace gamgee { 9 | 10 | IndexedSamIterator::IndexedSamIterator() : 11 | m_sam_file_ptr {nullptr}, 12 | m_sam_index_ptr {nullptr}, 13 | m_sam_header_ptr {nullptr}, 14 | m_sam_itr_ptr {nullptr}, 15 | m_sam_record_ptr {nullptr} { 16 | } 17 | 18 | IndexedSamIterator::IndexedSamIterator(const std::shared_ptr& sam_file_ptr, const std::shared_ptr& sam_index_ptr, 19 | const std::shared_ptr& sam_header_ptr, const std::vector& interval_list) : 20 | m_sam_file_ptr {sam_file_ptr}, 21 | m_sam_index_ptr {sam_index_ptr}, 22 | m_sam_header_ptr {sam_header_ptr}, 23 | m_interval_list {interval_list}, 24 | m_interval_iterator {m_interval_list.begin()}, 25 | m_sam_itr_ptr {utils::make_unique_hts_itr(sam_itr_querys(m_sam_index_ptr.get(), m_sam_header_ptr.get(), (*m_interval_iterator).c_str()))}, 26 | m_sam_record_ptr {utils::make_shared_sam(bam_init1())}, 27 | m_sam_record {m_sam_header_ptr, m_sam_record_ptr} { 28 | fetch_next_record(); 29 | } 30 | 31 | Sam& IndexedSamIterator::operator*() { 32 | return m_sam_record; 33 | } 34 | 35 | Sam& IndexedSamIterator::operator++() { 36 | fetch_next_record(); 37 | return m_sam_record; 38 | } 39 | 40 | bool IndexedSamIterator::operator!=(const IndexedSamIterator& rhs) { 41 | return m_sam_file_ptr != rhs.m_sam_file_ptr; 42 | } 43 | 44 | void IndexedSamIterator::fetch_next_record() { 45 | while (sam_itr_next(m_sam_file_ptr.get(), m_sam_itr_ptr.get(), m_sam_record_ptr.get()) < 0) { 46 | ++m_interval_iterator; 47 | if (m_interval_list.end() == m_interval_iterator) { 48 | m_sam_file_ptr = 
nullptr; 49 | return; 50 | } 51 | m_sam_itr_ptr.reset(sam_itr_querys(m_sam_index_ptr.get(), m_sam_header_ptr.get(), (*m_interval_iterator).c_str())); 52 | } 53 | } 54 | 55 | const std::string& IndexedSamIterator::current_interval() const{ 56 | return *m_interval_iterator; 57 | } 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /testdata/ref_block/problem1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##ALT= 3 | ##FILTER= 4 | ##INFO= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##contig= 13 | ##reference=Homo_sapiens_assembly19.fasta 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE 15 | 1 11140505 . C . . END=11140609 GT:DP:GQ:MIN_DP:PL 0/0:102:99:64:0,117,1755 16 | 1 11140610 . G . . END=11140610 GT:DP:GQ:MIN_DP:PL 0/0:65:0:65:0,0,1851 17 | 1 11140611 . A . . END=11140619 GT:DP:GQ:MIN_DP:PL 0/0:83:69:82:0,60,900 18 | 1 11140620 . AACACACAC A,AAC,AACACACACAC, 0.01 . . GT:AD:DP:GQ:PL:SB 0/3:20,0,0,5,0:25:9:198,444,4343,424,3388,3263,0,1715,1379,2412,9,694,634,502,457:6,14,3,2 19 | 1 11140621 . ACACACAC A,AACACAC,AAC, 142.73 . . GT:AD:DP:GQ:PL:SB 0/2:41,0,10,2,0:53:99:204,420,4129,0,2639,2527,217,3164,2429,3061,180,2736,2400,2563,2525:18,23,2,8 20 | 1 11140623 . ACACAC A,AAC, 0 . . GT:AD:DP:GQ:PL:SB 0/0:46,3,7,0:56:58:0,127,3660,58,2726,2688,145,2832,2670,2756:0,0,0,0 21 | 1 11140629 . A . . END=11140658 GT:DP:GQ:MIN_DP:PL 0/0:76:99:73:0,120,1800 22 | 1 11140768 . A . . 
END=11141021 GT:DP:GQ:MIN_DP:PL 0/0:127:99:93:0,120,1800 23 | -------------------------------------------------------------------------------- /testdata/ref_block/problem3_file1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##ALT= 3 | ##FILTER= 4 | ##INFO= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##contig= 13 | ##reference=Homo_sapiens_assembly19.fasta 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE1 15 | 1 3319304 . C . . END=3319318 GT:DP:GQ:MIN_DP:PL 0/0:19:51:17:0,42,618 16 | 1 3319319 . A . . END=3319403 GT:DP:GQ:MIN_DP:PL 0/0:29:75:21:0,63,758 17 | 1 3319404 . G . . END=3319406 GT:DP:GQ:MIN_DP:PL 0/0:24:57:24:0,57,855 18 | 1 3319407 . G . . END=3319409 GT:DP:GQ:MIN_DP:PL 0/0:24:63:24:0,60,900 19 | 1 3319410 . C . . END=3319412 GT:DP:GQ:MIN_DP:PL 0/0:24:54:24:0,54,810 20 | 1 3319413 . G . . END=3319432 GT:DP:GQ:MIN_DP:PL 0/0:24:63:22:0,60,900 21 | 1 3319433 . C . . END=3319446 GT:DP:GQ:MIN_DP:PL 0/0:21:57:20:0,45,810 22 | 1 3319447 . G . . END=3319557 GT:DP:GQ:MIN_DP:PL 0/0:29:72:21:0,60,715 23 | 1 3319558 . T . . END=3319564 GT:DP:GQ:MIN_DP:PL 0/0:24:57:23:0,54,810 24 | 1 3319565 . G . . END=3319576 GT:DP:GQ:MIN_DP:PL 0/0:22:60:21:0,60,900 25 | 1 3319577 . T . . END=3319600 GT:DP:GQ:MIN_DP:PL 0/0:17:42:13:0,23,495 26 | -------------------------------------------------------------------------------- /testdata/extra_header.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 20 | 1 10000000 . 
T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 20 10001000 . GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 | 20 10002000 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | 20 10003000 . A AGCT . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 24 | 22 10004000 . GAT G,GATAT . PASS;MISSED AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100,2,4,8:3.1,2.2:ABA 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA 25 | -------------------------------------------------------------------------------- /testdata/test.g.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 20 | ##FORMAT= 21 | ##FORMAT= 22 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 23 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA 0/0:12:0,10,1000:3.1,2.2:CA 1/1:650:10,100,0:3.1,2.2:XISPAFORRO 24 | 1 20000000 . C . . 
END=20000123 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0 0/0:0:0:0:0,0,0 0/0:0:0:0:0,0,0 25 | 20 10001000 rs837472 GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 26 | -------------------------------------------------------------------------------- /testdata/ref_block/test3.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##FILTER= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 20 | 1 1 . A . . END=40;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 21 | 1 41 . G C, . . AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 22 | 1 42 . T . . END=60;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 23 | 1 650 . T . . END=750;AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,1000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 24 | 20 10 . 
GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 25 | -------------------------------------------------------------------------------- /testdata/ref_block/problem2_file1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##ALT= 3 | ##FILTER= 4 | ##INFO= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##contig= 13 | ##reference=Homo_sapiens_assembly19.fasta 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PROBLEM_SAMPLE1 15 | 1 3319304 . C . . END=3319318 GT:DP:GQ:MIN_DP:PL 0/0:19:51:17:0,42,618 16 | 1 3319319 . A . . END=3319403 GT:DP:GQ:MIN_DP:PL 0/0:29:75:21:0,63,758 17 | 1 3319404 . G . . END=3319406 GT:DP:GQ:MIN_DP:PL 0/0:24:57:24:0,57,855 18 | 1 3319407 . G . . END=3319409 GT:DP:GQ:MIN_DP:PL 0/0:24:63:24:0,60,900 19 | 1 3319410 . C . . END=3319412 GT:DP:GQ:MIN_DP:PL 0/0:24:54:24:0,54,810 20 | 1 3319413 . G . . END=3319432 GT:DP:GQ:MIN_DP:PL 0/0:24:63:22:0,60,900 21 | 1 3319433 . C . . END=3319446 GT:DP:GQ:MIN_DP:PL 0/0:21:57:20:0,45,810 22 | 1 3319447 . G . . END=3319557 GT:DP:GQ:MIN_DP:PL 0/0:29:72:21:0,60,715 23 | 1 3319558 . T . . END=3319564 GT:DP:GQ:MIN_DP:PL 0/0:24:57:23:0,54,810 24 | 1 3319565 . G . . END=3319576 GT:DP:GQ:MIN_DP:PL 0/0:22:60:21:0,60,900 25 | 1 3319577 . T . . END=3319600 GT:DP:GQ:MIN_DP:PL 0/0:17:42:13:0,23,495 26 | 1 3319601 . ACCCTCCTCTGAGTCTTCCTCCCCTTCCCGTG A, 155.74 . 
DP=13;MLEAC=2,0;MLEAF=1.00,0.00;MQ=61.16;MQ0=0 GT:AD:DP:GQ:PL:SB 1/1:0,4,0:4:32:198,32,0,198,32,198:0,0,0,0 27 | -------------------------------------------------------------------------------- /gamgee/variant/indexed_variant_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "indexed_variant_iterator.h" 2 | #include "variant_iterator.h" 3 | 4 | #include "htslib/vcf.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | namespace gamgee { 13 | const std::vector IndexedVariantIterator::all_intervals = {"."}; 14 | 15 | IndexedVariantIterator::IndexedVariantIterator() : 16 | VariantIterator {}, 17 | m_variant_index_ptr {}, 18 | m_interval_list {}, 19 | m_interval_iter {}, 20 | m_index_iter_ptr {} 21 | {} 22 | 23 | IndexedVariantIterator::IndexedVariantIterator(const std::shared_ptr& file_ptr, 24 | const std::shared_ptr& index_ptr, 25 | const std::shared_ptr& header_ptr, 26 | const std::vector& interval_list) : 27 | VariantIterator { file_ptr, header_ptr }, 28 | m_variant_index_ptr { index_ptr }, 29 | m_interval_list { interval_list.empty() ? 
all_intervals : interval_list }, 30 | m_interval_iter { m_interval_list.begin() }, 31 | m_index_iter_ptr { utils::make_unique_hts_itr(bcf_itr_querys(m_variant_index_ptr.get(), m_variant_header_ptr.get(), m_interval_iter->c_str())) } 32 | { 33 | fetch_next_record(); 34 | } 35 | 36 | bool IndexedVariantIterator::operator!=(const IndexedVariantIterator& rhs) { 37 | return m_variant_file_ptr != rhs.m_variant_file_ptr && 38 | m_index_iter_ptr != rhs.m_index_iter_ptr; 39 | } 40 | 41 | /** 42 | * @brief pre-fetches the next variant record 43 | * @warning we're reusing the existing htslib memory, so users should be aware that all objects from the previous iteration are now stale unless a deep copy has been performed 44 | */ 45 | void IndexedVariantIterator::fetch_next_record() { 46 | while (bcf_itr_next(m_variant_file_ptr, m_index_iter_ptr.get(), m_variant_record_ptr.get()) < 0) { 47 | ++m_interval_iter; 48 | if (m_interval_list.end() == m_interval_iter) { 49 | m_variant_file_ptr.reset(); 50 | m_variant_record = Variant{}; 51 | return; 52 | } 53 | m_index_iter_ptr.reset(bcf_itr_querys(m_variant_index_ptr.get(), m_variant_header_ptr.get(), m_interval_iter->c_str())); 54 | } 55 | } 56 | 57 | } 58 | 59 | -------------------------------------------------------------------------------- /test/test_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee_test_utils__guard 2 | #define gamgee_test_utils__guard 3 | 4 | #include 5 | 6 | /** 7 | * @brief test code for copy construction and copy assignment for any copy enabled object 8 | * 9 | * This tests copy construction, copy assignment and checks for self copy 10 | * assignment. The starting object is copied to three different objects using 11 | * all the above procedures. It returns a tuple with all three copied objects 12 | * for you to verify that the copy occurred and that it didn't affect the 13 | * original object. 
template <class T>
std::tuple<T, T, T> check_copy_constructor (T& original) {
  auto first_copy = original;     // copy construction
  auto second_copy = original;    // a second, independent copy construction
  first_copy = first_copy;        // self copy-assignment has to leave the object intact
  second_copy = first_copy;       // regular copy assignment
  return std::make_tuple(original, first_copy, second_copy);
}
template <class T>
T check_move_constructor (T& original) {
  auto holder = std::move(original);   // move construction; `original` is left moved-from
  auto relay = std::move(holder);      // second move construction (T need not be default constructible)
  holder = std::move(relay);           // move assignment back into the first object; no self-move is performed here
  return holder;
}
The iterator is 20 | * implemented as a random iterator for maximum flexibility with standard 21 | * library functions 22 | */ 23 | class VariantFilters { 24 | public: 25 | /** 26 | * @brief standard constructor used by the Variant API 27 | * @param header an htslib variant header to keep shared ownership of the memory 28 | * @param body an htslib variant body to keep shared ownership of the memory 29 | */ 30 | explicit VariantFilters(const std::shared_ptr& header, const std::shared_ptr& body) : m_header {header}, m_body {body} {} 31 | 32 | /** 33 | * @brief random access operator 34 | * @param index the filter index 35 | */ 36 | std::string operator[](int index) const { return utils::htslib_filter_name(m_header.get(), m_body.get(), index); } 37 | 38 | /** 39 | * @brief returns the number of filters in the filter field 40 | */ 41 | uint32_t size() const { return uint32_t(m_body->d.n_flt); } 42 | 43 | /** 44 | * @brief returns true if the filters field is missing 45 | */ 46 | bool missing() const { return size() == 0; } 47 | 48 | /** 49 | * @brief returns an iterator pointing to the first element in the list of filters. 50 | */ 51 | VariantFiltersIterator begin() const {return VariantFiltersIterator{m_header, m_body, 0}; } 52 | 53 | /** 54 | * @brief Returns an iterator referring to one-past-the-last element in the list of filters. 
55 | */ 56 | VariantFiltersIterator end() const {return VariantFiltersIterator{m_header, m_body, uint32_t(m_body->d.n_flt)};} 57 | 58 | private: 59 | std::shared_ptr m_header; ///< shared ownership of the VariantHeader record memory so it stays alive while this object is in scope 60 | std::shared_ptr m_body; ///< shared ownership of the Variant record memory so it stays alive while this object is in scope 61 | 62 | }; 63 | 64 | 65 | } 66 | 67 | #endif // gamgee__variant_filters__guard 68 | -------------------------------------------------------------------------------- /gamgee/gamgee.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__gamgee__guard 2 | #define gamgee__gamgee__guard 3 | 4 | #include "exceptions.h" 5 | #include "fastq.h" 6 | #include "fastq_iterator.h" 7 | #include "fastq_reader.h" 8 | #include "interval.h" 9 | #include "missing.h" 10 | #include "reference_iterator.h" 11 | #include "reference_map.h" 12 | #include "zip.h" 13 | 14 | #include "utils/file_utils.h" 15 | #include "utils/genotype_utils.h" 16 | #include "utils/hts_memory.h" 17 | #include "utils/merged_vcf_lut.h" 18 | #include "utils/short_value_optimized_storage.h" 19 | #include "utils/utils.h" 20 | #include "utils/variant_field_type.h" 21 | #include "utils/variant_utils.h" 22 | 23 | #include "sam/base_quals.h" 24 | #include "sam/cigar.h" 25 | #include "sam/indexed_sam_iterator.h" 26 | #include "sam/indexed_sam_reader.h" 27 | #include "sam/read_bases.h" 28 | #include "sam/sam.h" 29 | #include "sam/sam_builder.h" 30 | #include "sam/sam_builder_data_field.h" 31 | #include "sam/sam_header.h" 32 | #include "sam/sam_iterator.h" 33 | #include "sam/sam_pair_iterator.h" 34 | #include "sam/sam_reader.h" 35 | #include "sam/sam_tag.h" 36 | #include "sam/sam_writer.h" 37 | 38 | #include "variant/genotype.h" 39 | #include "variant/indexed_variant_iterator.h" 40 | #include "variant/indexed_variant_reader.h" 41 | #include "variant/individual_field.h" 42 | 
#include "variant/individual_field_iterator.h" 43 | #include "variant/individual_field_value.h" 44 | #include "variant/individual_field_value_iterator.h" 45 | #include "variant/multiple_variant_iterator.h" 46 | #include "variant/multiple_variant_reader.h" 47 | #include "variant/reference_block_splitting_variant_iterator.h" 48 | #include "variant/shared_field.h" 49 | #include "variant/shared_field_iterator.h" 50 | #include "variant/synced_variant_iterator.h" 51 | #include "variant/synced_variant_reader.h" 52 | #include "variant/variant.h" 53 | #include "variant/variant_builder.h" 54 | #include "variant/variant_builder_individual_field.h" 55 | #include "variant/variant_builder_individual_region.h" 56 | #include "variant/variant_builder_multi_sample_vector.h" 57 | #include "variant/variant_builder_shared_region.h" 58 | #include "variant/variant_filters.h" 59 | #include "variant/variant_filters_iterator.h" 60 | #include "variant/variant_header.h" 61 | #include "variant/variant_header_builder.h" 62 | #include "variant/variant_iterator.h" 63 | #include "variant/variant_reader.h" 64 | #include "variant/variant_writer.h" 65 | #include "variant/variant_header_merger.h" 66 | 67 | #endif /* gamgee__gamgee__guard */ 68 | -------------------------------------------------------------------------------- /test/fastq_reader_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "fastq_reader.h" 4 | #include "test_utils.h" 5 | #include "exceptions.h" 6 | 7 | using namespace std; 8 | using namespace gamgee; 9 | 10 | int count_records(const string& filename) { 11 | auto counter = 0; 12 | for (auto& f : FastqReader{filename}) { 13 | BOOST_CHECK_EQUAL(f.sequence(), "ACAAGAGATTTAAGAC"); 14 | ++counter; 15 | } 16 | return counter; 17 | } 18 | 19 | int count_records_vec(const string& filename) { 20 | auto counter = 0; 21 | for (auto& f : FastqReader{vector{filename}}) { 22 | BOOST_CHECK_EQUAL(f.sequence(), 
"ACAAGAGATTTAAGAC"); 23 | ++counter; 24 | } 25 | return counter; 26 | } 27 | 28 | void vector_too_large(const string& filename) { 29 | for (auto& f : FastqReader{vector{filename, filename}}) { 30 | BOOST_CHECK_EQUAL(f.sequence(), "ACAAGAGATTTAAGAC"); 31 | } 32 | } 33 | 34 | BOOST_AUTO_TEST_CASE( read_fastq ) 35 | { 36 | BOOST_CHECK_EQUAL(count_records("testdata/complete_same_seq.fa"), 2); 37 | BOOST_CHECK_EQUAL(count_records("testdata/complete_same_seq.fq"), 3); 38 | } 39 | 40 | BOOST_AUTO_TEST_CASE( read_fastq_vector ) 41 | { 42 | BOOST_CHECK_EQUAL(count_records_vec("testdata/complete_same_seq.fa"), 2); 43 | BOOST_CHECK_EQUAL(count_records_vec("testdata/complete_same_seq.fq"), 3); 44 | } 45 | 46 | BOOST_AUTO_TEST_CASE( read_fastq_vector_too_large ) 47 | { 48 | BOOST_CHECK_THROW(vector_too_large("testdata/complete_same_seq.fa"), std::runtime_error); 49 | } 50 | 51 | BOOST_AUTO_TEST_CASE( fastq_reader_move_constructor ) { 52 | auto reader0 = FastqReader{"testdata/complete_same_seq.fa"}; 53 | auto reader1 = FastqReader{"testdata/complete_same_seq.fa"}; 54 | auto moved = check_move_constructor(reader1); 55 | 56 | auto record0 = reader0.begin().operator*(); 57 | auto moved_record = moved.begin().operator*(); 58 | BOOST_CHECK(record0 == moved_record); 59 | } 60 | 61 | BOOST_AUTO_TEST_CASE( fastq_iterator_move_constructor ) { 62 | auto reader0 = FastqReader{"testdata/complete_same_seq.fa"}; 63 | auto iter0 = reader0.begin(); 64 | 65 | auto reader1 = FastqReader{"testdata/complete_same_seq.fa"}; 66 | auto iter1 = reader1.begin(); 67 | auto moved = check_move_constructor(iter1); 68 | 69 | auto record0 = *iter0; 70 | auto moved_record = *moved; 71 | BOOST_CHECK(record0 == moved_record); 72 | } 73 | 74 | BOOST_AUTO_TEST_CASE( fastq_reader_nonexistent_file ) { 75 | BOOST_CHECK_THROW(FastqReader{"foo/bar/nonexistent.fa"}, FileOpenException); 76 | } 77 | -------------------------------------------------------------------------------- /gamgee/sam/sam_header.cpp: 
-------------------------------------------------------------------------------- 1 | #include "sam_header.h" 2 | 3 | #include "../utils/hts_memory.h" 4 | 5 | using namespace std; 6 | 7 | namespace gamgee { 8 | 9 | /** 10 | * @brief creates a SamHeader object that points to htslib memory already allocated 11 | * 12 | * @note the resulting SamHeader object shares ownership of the pre-allocated memory via 13 | * shared_ptr reference counting 14 | */ 15 | SamHeader::SamHeader(const std::shared_ptr& header) : 16 | m_header { header } 17 | {} 18 | 19 | /** 20 | * @brief creates a deep copy of a SamHeader object 21 | * 22 | * @note the copy will have exclusive ownership over the newly-allocated htslib memory 23 | */ 24 | SamHeader::SamHeader(const SamHeader& other) : 25 | m_header { utils::make_shared_sam_header(utils::sam_header_deep_copy(other.m_header.get())) } 26 | {} 27 | 28 | /** 29 | * @brief creates a deep copy of a SamHeader object 30 | * 31 | * @note the copy will have exclusive ownership over the newly-allocated htslib memory 32 | */ 33 | SamHeader& SamHeader::operator=(const SamHeader& other) { 34 | if ( &other == this ) 35 | return *this; 36 | m_header = utils::make_shared_sam_header(utils::sam_header_deep_copy(other.m_header.get())); ///< shared_ptr assignment will take care of deallocating old sam record if necessary 37 | return *this; 38 | } 39 | 40 | /** 41 | * @brief Returns the length of the given sequence as stored in the \@SQ tag in the BAM header, or 0 if the sequence 42 | * name is not found. 
43 | */ 44 | uint32_t SamHeader::sequence_length(const std::string& sequence_name) const { 45 | auto c = sequence_name.c_str(); 46 | for (int i = 0; i < m_header->n_targets; i++) { 47 | if (strcmp(c,m_header->target_name[i]) == 0) { 48 | return m_header->target_len[i]; 49 | } 50 | } 51 | return 0; 52 | } 53 | 54 | /** 55 | * @brief extracts read group objects from a SAM header 56 | */ 57 | vector SamHeader::read_groups() const { 58 | const static auto RG_TAG = "@RG"; 59 | const static auto NOT_FOUND = string::npos; 60 | auto result = vector(); 61 | auto text = header_text(); 62 | 63 | for (auto rg_start=text.find(RG_TAG), rg_end=rg_start; rg_start!=NOT_FOUND; rg_start=text.find(RG_TAG,rg_end+1) ) { 64 | rg_end = text.find('\n', rg_start+1); 65 | auto rg_record = text.substr(rg_start, rg_end-rg_start); 66 | result.push_back(ReadGroup(rg_record)); 67 | } 68 | return result; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /gamgee/sam/read_bases.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__read_bases__guard 2 | #define gamgee__read_bases__guard 3 | 4 | #include "htslib/sam.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace gamgee { 10 | 11 | /** 12 | * @brief simple enum to hold all valid bases in the SAM format 13 | * @note enum values used here correspond to the 4-bit base encodings in htslib 14 | * so that we can cast directly to Base 15 | */ 16 | enum class Base { A = 1, C = 2, G = 4, T = 8, N = 15 }; 17 | 18 | /** 19 | * @brief Utility class to handle the memory management of the sam record object for read bases 20 | * 21 | * This class uses Base to represent the bases A,C,G,T,N so we can get byte by byte correspondence 22 | * with the underlying compressed memory model. 23 | * 24 | * @note Any functionality lost because of this should be made available by the ReadBases class. 
25 | */ 26 | class ReadBases { 27 | public: 28 | explicit ReadBases(const std::shared_ptr& sam_record); 29 | ReadBases(const ReadBases& other); 30 | ReadBases(ReadBases&& other) = default; 31 | ReadBases& operator=(const ReadBases& other); 32 | ReadBases& operator=(ReadBases&& other) = default; 33 | ~ReadBases() = default; ///< default destruction is sufficient, since our shared_ptr will handle deallocation 34 | 35 | Base operator[](const uint32_t index) const; ///< use freely as you would an array. @note currently implemented as read only 36 | void set_base(const uint32_t index, const Base base); ///< modify a base at a specific index 37 | uint32_t size() const { return m_num_bases; }; ///< number of base qualities in the container 38 | bool operator==(const ReadBases& other) const; ///< check for equality with another ReadBases object 39 | bool operator!=(const ReadBases& other) const; ///< check for inequality with another ReadBases object 40 | std::string to_string() const; ///< produce a string representation of the bases in this object 41 | 42 | private: 43 | std::shared_ptr m_sam_record; ///< sam record containing our bases, potentially co-owned by multiple other objects 44 | uint8_t* m_bases; ///< pointer to the start of the bases in m_sam_record, cached for efficiency 45 | uint32_t m_num_bases; ///< number of bases in our sam record 46 | 47 | static const std::map base_to_string_map; ///< @brief simple lookup table to convert Base enum values to chars. 
48 | 49 | friend class SamBuilder; ///< builder needs access to the internals in order to build efficiently 50 | }; 51 | 52 | } 53 | 54 | #endif /* gamgee__read_bases__guard */ 55 | -------------------------------------------------------------------------------- /gamgee/variant/synced_variant_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "htslib/vcf.h" 2 | #include "htslib/synced_bcf_reader.h" 3 | 4 | #include "synced_variant_iterator.h" 5 | #include "variant.h" 6 | 7 | using namespace std; 8 | 9 | namespace gamgee { 10 | 11 | SyncedVariantIterator::SyncedVariantIterator() : 12 | m_synced_readers {}, 13 | m_variant_vector {} 14 | {} 15 | 16 | SyncedVariantIterator::SyncedVariantIterator(const std::shared_ptr& synced_readers) : 17 | m_synced_readers {synced_readers}, 18 | m_variant_vector {} 19 | { 20 | m_variant_vector.reserve(m_synced_readers->nreaders); 21 | fetch_next_record(); 22 | } 23 | 24 | std::vector& SyncedVariantIterator::operator*() { 25 | return m_variant_vector; 26 | } 27 | 28 | std::vector& SyncedVariantIterator::operator++() { 29 | fetch_next_record(); 30 | return m_variant_vector; 31 | } 32 | 33 | // NOTE: this method does the minimal work necessary to determine that we have reached the end of iteration 34 | // it is NOT a valid general-purpose inequality method 35 | bool SyncedVariantIterator::operator!=(const SyncedVariantIterator& rhs) { 36 | return !(m_variant_vector.empty() && rhs.m_variant_vector.empty()); 37 | } 38 | 39 | /** 40 | * @brief initializes m_headers_vector 41 | */ 42 | void SyncedVariantIterator::init_headers_vector() { 43 | m_headers_vector.reserve(m_synced_readers->nreaders); 44 | for (int idx = 0; idx < m_synced_readers->nreaders; idx++) { 45 | auto* hdr_ptr = utils::variant_header_deep_copy(bcf_sr_get_header(m_synced_readers.get(), idx)); 46 | m_headers_vector.emplace_back(utils::make_shared_variant_header(hdr_ptr)); 47 | } 48 | } 49 | 50 | /** 51 | * @brief 
pre-fetches the next variant record 52 | */ 53 | void SyncedVariantIterator::fetch_next_record() { 54 | if (!bcf_sr_next_line(m_synced_readers.get())) { 55 | m_variant_vector.clear(); 56 | return; 57 | } 58 | else { 59 | m_variant_vector.clear(); 60 | m_variant_vector.resize(m_synced_readers->nreaders); 61 | } 62 | 63 | // can't initialize until a line has been read 64 | if (m_headers_vector.empty()) 65 | init_headers_vector(); 66 | 67 | for (int idx = 0; idx < m_synced_readers->nreaders; idx++) { 68 | if (bcf_sr_has_line(m_synced_readers.get(), idx)) { 69 | // can't cache variant bodies because they may change location in the synced reader 70 | auto* body_ptr = utils::variant_deep_copy(bcf_sr_get_line(m_synced_readers.get(), idx)); 71 | m_variant_vector[idx] = Variant{m_headers_vector[idx], utils::make_shared_variant(body_ptr)}; 72 | } 73 | } 74 | } 75 | 76 | } 77 | 78 | -------------------------------------------------------------------------------- /gamgee/sam/sam_header.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__sam_header__guard 2 | #define gamgee__sam_header__guard 3 | 4 | #include "htslib/sam.h" 5 | #include "read_group.h" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace gamgee { 12 | 13 | /** 14 | * @brief Utility class to hold the header of a sam file 15 | */ 16 | class SamHeader { 17 | public: 18 | explicit SamHeader() = default; ///< @brief initializes a null SamHeader @warning if you need to create a SamHeader from scratch, use the builder instead 19 | explicit SamHeader(const std::shared_ptr& header); ///< @brief creates a SamHeader given htslib object. @note used by all iterators 20 | SamHeader(const SamHeader& other); ///< @brief makes a deep copy of a SamHeader. Shared pointers maintain state to all other associated objects correctly. 21 | SamHeader& operator=(const SamHeader& other); ///< @brief deep copy assignment of a SamHeader. 
Shared pointers maintain state to all other associated objects correctly. 22 | SamHeader(SamHeader&& other) = default; ///< @brief moves SamHeader accordingly. Shared pointers maintain state to all other associated objects correctly. 23 | SamHeader& operator=(SamHeader&& other) = default; ///< @brief move assignment of a SamHeader. Shared pointers maintain state to all other associated objects correctly. 24 | uint32_t n_sequences() const {return m_header->n_targets;} ///< @brief Returns the number of reference sequences in the header 25 | uint32_t sequence_length(const std::string& sequence_name) const; ///< @brief Returns the length of the given reference sequence as stored in the \@SQ tag in the BAM header. 26 | uint32_t sequence_length(const uint32_t sequence_index) const { return m_header->target_len[sequence_index]; } ///< @brief Returns the length of the given reference sequence as stored in the \@SQ tag in the BAM header. 27 | std::string sequence_name(const uint32_t sequence_index) const { return std::string(m_header->target_name[sequence_index]); } ///< @brief Returns the sequence name for the sequence with the given zero-based index 28 | std::vector read_groups() const; 29 | 30 | private: 31 | std::string header_text() const { return std::string(m_header->text, m_header->l_text); } ///< @brief Returns the text of the SAM header for parsing as an std::string. @note htslib only parses @SQ records, so gamgee is not simply a wrapper for C code. 
32 | std::shared_ptr m_header; 33 | 34 | friend class SamWriter; 35 | friend class SamBuilder; 36 | }; 37 | 38 | } 39 | #endif // gamgee__sam_header__guard 40 | -------------------------------------------------------------------------------- /gamgee/variant/multiple_variant_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "multiple_variant_iterator.h" 2 | 3 | namespace gamgee { 4 | 5 | MultipleVariantIterator::MultipleVariantIterator(const std::vector>& variant_files, const std::vector>& variant_headers) : 6 | m_queue {}, 7 | m_variant_vector {} 8 | { 9 | m_variant_vector.reserve(variant_files.size()); 10 | for (auto i = 0u; i < variant_files.size(); i++) { 11 | m_queue.push(VariantIteratorIndexPair{std::make_shared(variant_files[i], variant_headers[i]), i}); 12 | } 13 | fetch_next_vector(); 14 | } 15 | 16 | std::vector& MultipleVariantIterator::operator*() { 17 | return m_variant_vector; 18 | } 19 | 20 | std::vector& MultipleVariantIterator::operator++() { 21 | fetch_next_vector(); 22 | return m_variant_vector; 23 | } 24 | 25 | // NOTE: this method does the minimal work necessary to determine that we have reached the end of iteration 26 | // it is NOT a valid general-purpose inequality method 27 | bool MultipleVariantIterator::operator!=(const MultipleVariantIterator& rhs) { 28 | return !(m_variant_vector.empty() && rhs.m_variant_vector.empty()); 29 | } 30 | 31 | bool MultipleVariantIterator::Comparator::operator()(const VariantIteratorIndexPair& left, const VariantIteratorIndexPair& right) { 32 | if ((**(left.first)).chromosome() > (**(right.first)).chromosome()) 33 | return true; 34 | 35 | if ((**(left.first)).chromosome() < (**(right.first)).chromosome()) 36 | return false; 37 | 38 | return (**(left.first)).alignment_start() > (**(right.first)).alignment_start(); 39 | } 40 | 41 | void MultipleVariantIterator::fetch_next_vector() { 42 | m_variant_vector.clear(); 43 | auto current_chrom = 0u; 44 | auto 
current_start = 0u; 45 | 46 | while (!m_queue.empty()) { 47 | const auto top_queue_elem = m_queue.top(); 48 | const auto top_iterator = top_queue_elem.first; 49 | const auto& variant = **top_iterator; 50 | 51 | //top VariantIterator returns 'empty' Variant - no more variants in this VCF 52 | if(variant.missing()) 53 | { 54 | m_queue.pop(); 55 | continue; 56 | } 57 | 58 | if (!m_variant_vector.empty() && !(variant.chromosome() == current_chrom && variant.alignment_start() == current_start)) 59 | break; 60 | else { 61 | current_chrom = variant.chromosome(); 62 | current_start = variant.alignment_start(); 63 | m_variant_vector.push_back(VariantIndexPair{variant, top_queue_elem.second}); 64 | 65 | m_queue.pop(); 66 | top_iterator->operator++(); 67 | if (! top_iterator->empty()) 68 | m_queue.push(std::move(top_queue_elem)); 69 | } 70 | } 71 | } 72 | 73 | } 74 | 75 | -------------------------------------------------------------------------------- /gamgee/sam/sam_writer.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__sam_writer__guard 2 | #define gamgee__sam_writer__guard 3 | 4 | #include 5 | #include 6 | 7 | #include "sam.h" 8 | #include "sam_header.h" 9 | 10 | #include "../utils/hts_memory.h" 11 | 12 | #include "htslib/sam.h" 13 | 14 | namespace gamgee { 15 | 16 | /** 17 | * @brief utility class to write out a SAM/BAM/CRAM file to any stream 18 | * @todo add serialization option 19 | */ 20 | class SamWriter { 21 | 22 | public: 23 | 24 | /** 25 | * @brief Creates a new SamWriter using the specified output file name 26 | * @param output_fname file to write to. 
The default is stdout (as defined by htslib) 27 | * @param binary whether the output should be in BAM (true) or SAM format (false) 28 | * @note the header is copied and managed internally 29 | */ 30 | explicit SamWriter(const std::string& output_fname = "-", const bool binary = true); 31 | 32 | /** 33 | * @brief Creates a new SamWriter with the header extracted from a Sam record and using the specified output file name 34 | * @param header SamHeader object to make a copy from 35 | * @param output_fname file to write to. The default is stdout (as defined by htslib) 36 | * @param binary whether the output should be in BAM (true) or SAM format (false) 37 | * @note the header is copied and managed internally 38 | */ 39 | explicit SamWriter(const SamHeader& header, const std::string& output_fname = "-", const bool binary = true); 40 | 41 | /** 42 | * @brief a SamWriter cannot be copied safely, as it is iterating over a stream. 43 | */ 44 | 45 | SamWriter(const SamWriter& other) = delete; 46 | SamWriter& operator=(const SamWriter& other) = delete; 47 | 48 | /** 49 | * @brief a SamWriter can be moved 50 | */ 51 | 52 | SamWriter(SamWriter&& other) = default; 53 | SamWriter& operator=(SamWriter&& other) = default; 54 | 55 | /** 56 | * @brief Adds a record to the file stream 57 | * @param body the record 58 | */ 59 | void add_record(const Sam& body); 60 | 61 | /** 62 | * @brief Adds a header to the file stream. 
63 | * @param header the header 64 | * @note the header is a requirement to add records 65 | */ 66 | void add_header(const SamHeader& header); 67 | 68 | private: 69 | std::unique_ptr m_out_file; ///< the file or stream to write out to ("-" means stdout) 70 | SamHeader m_header; ///< holds a copy of the header throughout the production of the output (necessary for every record that gets added) 71 | 72 | static htsFile* open_file(const std::string& output_fname, const std::string& binary); 73 | void write_header() const; 74 | 75 | }; 76 | 77 | } 78 | 79 | #endif // gamgee__sam_writer__guard 80 | -------------------------------------------------------------------------------- /gamgee/utils/variant_field_type.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__variant_field_utils__guard 2 | #define gamgee__variant_field_utils__guard 3 | 4 | #include "htslib/vcf.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace gamgee { 10 | namespace utils { 11 | 12 | /** 13 | * @brief an enumeration of the types in htslib for the format field values 14 | * @note these must match the order in the htslib defines in htslib/vcf.h 15 | */ 16 | enum class VariantFieldType {NIL = 0, INT8 = 1, INT16 = 2, INT32 = 3, FLOAT = 5, STRING = 7}; 17 | 18 | 19 | /** 20 | * @brief converts the value in an index from the byte array into int32_t 21 | * 22 | * The byte array's underlying data representation is record specific, meaning that even numbers (like Integer) 23 | * can be represented multiple ways across registers (some with uint8_t others with uint32_t...) dictated by 24 | * the the maximum value in the field. 25 | * 26 | * This member function provides correct index location and appropriately creates a new value of 27 | * VALUE_TYPE to return to the user. 28 | * 29 | * @exception Will throw std::invalid_argument exception if trying to create a string out of a numeric format or 30 | * vice-versa. 
All numeric type conversions are internally truncated or expanded accordingly. 31 | */ 32 | int32_t convert_data_to_integer(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type); 33 | 34 | /** 35 | * @brief converts the value in an index from the byte array into float 36 | * @copydetails convert_data_to_integer(int, const uint8_t, const VariantFieldType&) 37 | */ 38 | float convert_data_to_float(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type); 39 | 40 | /** 41 | * @brief converts the value in an index from the byte array into string 42 | * @copydetails convert_data_to_integer(int, const uint8_t, const VariantFieldType&) 43 | */ 44 | std::string convert_data_to_string(const uint8_t* data_ptr, const int index, const uint8_t num_bytes_per_value, const VariantFieldType& type); 45 | 46 | /** 47 | * @brief returns the number of bytes for a given VariantFieldType 48 | */ 49 | uint8_t size_for_type(const VariantFieldType& type, const bcf_fmt_t* const format_ptr); 50 | 51 | /** 52 | * @brief returns the number of bytes for a given VariantFieldType 53 | */ 54 | uint8_t size_for_type(const VariantFieldType& type, const bcf_info_t* const info_ptr); 55 | 56 | /** 57 | * @brief - check if type is of type string 58 | */ 59 | inline bool is_string_type(const int32_t& type) { 60 | return (static_cast(type) == VariantFieldType::STRING); 61 | } 62 | 63 | } // end namespace utils 64 | } // end namespace gamgee 65 | 66 | #endif // gamgee__variant_field_utils__guard 67 | -------------------------------------------------------------------------------- /testdata/test_variants_for_variantbuilder.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 
15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##FORMAT= 23 | ##FORMAT= 24 | ##FORMAT= 25 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 26 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA 0/0:12:0,10,1000:3.1,2.2:CA 1/1:650:10,100,0:3.1,2.2:XISPAFORRO 27 | 20 10001000 rs837472 GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 28 | 20 10002000 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6;VALIDATED GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 29 | 20 10003000 . A AGCT . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 30 | 22 10004000 . GAT G,GATAT . PASS;MISSED AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 1/2:35:10,0,100,2,4,8:3.1,2.2:ABA 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA 31 | 22 10005000 . GAT G,GATAT . PASS AF=0.5,.;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100,.,4,.:3.1,.:ABA 0/0:.:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:. 
32 | -------------------------------------------------------------------------------- /test/utils_test.cpp: -------------------------------------------------------------------------------- 1 | #include "../gamgee/utils/utils.h" 2 | #include "../gamgee/zip.h" 3 | 4 | #include 5 | 6 | using namespace gamgee::utils; 7 | 8 | BOOST_AUTO_TEST_CASE( sequence_utils_reverse_complement_test ) 9 | { 10 | const auto seq = std::string{"TTGATCTCCGAT"}; 11 | const auto rev = std::string{"ATCGGAGATCAA"}; 12 | BOOST_CHECK_EQUAL(reverse_complement(seq), rev); // check that reversing only reverses the sequence (correctly) 13 | BOOST_CHECK_EQUAL(reverse_complement(reverse_complement(seq)), seq); // re-reverse should go back to original 14 | BOOST_CHECK_EQUAL(reverse_complement(reverse_complement("AGGTCGAGT")), "AGGTCGAGT"); 15 | BOOST_CHECK_EQUAL(reverse_complement(reverse_complement("taaacgttaaaatatccctag")), "taaacgttaaaatatccctag"); 16 | BOOST_CHECK_EQUAL(reverse_complement(reverse_complement("taNACTTTTTTTNNtccctag")), "taNACTTTTTTTNNtccctag"); 17 | } 18 | 19 | BOOST_AUTO_TEST_CASE( sequence_utils_complement_test ) 20 | { 21 | auto inplace_seq = std::string{"AGCT"}; 22 | const auto copy_seq = std::string{"AGCT"}; 23 | const auto result = std::string{"TCGA"}; 24 | BOOST_CHECK_EQUAL(complement(inplace_seq), result); 25 | BOOST_CHECK_EQUAL(inplace_seq, result); // after taking the complement in place (above), this should be true. 
26 | BOOST_CHECK_EQUAL(complement(copy_seq), result); 27 | BOOST_CHECK_NE(copy_seq, result); // after taking the complement on const (above), the value of copy_seq shouldn't be changed 28 | BOOST_CHECK_EQUAL(complement("TTGATCTCCGAT"), "AACTAGAGGCTA"); 29 | BOOST_CHECK_EQUAL(complement("AAAAAAAAAAAA"), "TTTTTTTTTTTT"); 30 | BOOST_CHECK_EQUAL(complement("G"), "C"); 31 | } 32 | 33 | BOOST_AUTO_TEST_CASE( zip_iterators_test ) { 34 | const auto v_odd = std::vector{1, 3, 5, 7}; 35 | const auto v_even = std::vector{2, 4, 6, 8}; 36 | const auto v_alpha = std::vector{'A', 'B', 'C', 'D'}; 37 | 38 | // simple test with two vectors of the same type 39 | auto i = 0u; 40 | for (const auto tup : zip(v_odd, v_even)) { 41 | BOOST_CHECK_EQUAL(tup.get<0>(), v_odd[i]); 42 | BOOST_CHECK_EQUAL(tup.get<1>(), v_even[i]); 43 | ++i; 44 | } 45 | 46 | // testing different types 47 | auto j = 0u; 48 | for (const auto tup : zip(v_odd, v_even, v_alpha)) { 49 | BOOST_CHECK_EQUAL(tup.get<0>(), v_odd[j]); 50 | BOOST_CHECK_EQUAL(tup.get<1>(), v_even[j]); 51 | BOOST_CHECK_EQUAL(tup.get<2>(), v_alpha[j]); 52 | ++j; 53 | } 54 | 55 | // testing boost::tie 56 | auto k = 0u; 57 | for (const auto tup : zip(v_odd, v_even, v_alpha)) { 58 | auto odd = 0u; 59 | auto even = 0u; 60 | auto alpha = 'Z'; 61 | boost::tie(odd, even, alpha) = tup; 62 | BOOST_CHECK_EQUAL(odd, v_odd[k]); 63 | BOOST_CHECK_EQUAL(even, v_even[k]); 64 | BOOST_CHECK_EQUAL(alpha, v_alpha[k]); 65 | ++k; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /gamgee/sam/read_group.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include "read_group.h" 6 | 7 | using namespace std; 8 | using namespace boost; 9 | 10 | namespace gamgee { 11 | 12 | 13 | /** 14 | * @brief creates and populates a read group object from a sam header line 15 | * 16 | * @note A read group line in a SAM header starts with "@RG" and then has tag/value 
17 | * pairs separated by whitespace. The tags are all length-2 strings and the tags are 18 | * separated from values by colons. Ex: @RG ID:12345.67 SM:11111 PL:Illumina 19 | */ 20 | ReadGroup::ReadGroup(const std::string& header_line) { 21 | const static auto CHARACTERS_PER_TAG = 2; 22 | const static auto RG_TAG_LENGTH = 3; 23 | const static auto TAB_SEPARATOR = char_separator("\t"); 24 | 25 | auto fields = header_line.substr(RG_TAG_LENGTH+1); //skip the @RG tag 26 | auto tokens = tokenizer>(fields, TAB_SEPARATOR); 27 | for ( const auto& token : tokens) { 28 | //we skip 3 characters -- 2 for the tag, 1 for the colon 29 | auto value = token.substr(CHARACTERS_PER_TAG+1, token.length() - CHARACTERS_PER_TAG - 1); 30 | if ( starts_with(token, ID_TAG) ) id = value; 31 | else if ( starts_with(token, SAMPLE_TAG) ) sample = value; 32 | else if ( starts_with(token, CENTER_TAG) ) center = value; 33 | else if ( starts_with(token, DESCRIPTION_TAG) ) description = value; 34 | else if ( starts_with(token, DATE_TIME_TAG) ) date_time = value; 35 | else if ( starts_with(token, CENTER_TAG) ) center = value; 36 | else if ( starts_with(token, FLOW_ORDER_TAG) ) flow_order = value; 37 | else if ( starts_with(token, KEY_SEQUENCE_TAG) ) key_sequence = value; 38 | else if ( starts_with(token, LIBRARY_TAG) ) library = value; 39 | else if ( starts_with(token, PROGRAMS_TAG) ) programs = value; 40 | else if ( starts_with(token, MEDIAN_INSERT_SIZE_TAG) ) median_insert_size = value; 41 | else if ( starts_with(token, PLATFORM_TAG) ) platform = value; 42 | else if ( starts_with(token, PLATFORM_UNIT_TAG) ) platform_unit = value; 43 | } 44 | } 45 | 46 | const char ReadGroup::ID_TAG [3] = "ID"; 47 | const char ReadGroup::CENTER_TAG [3] = "CN"; 48 | const char ReadGroup::DESCRIPTION_TAG [3] = "DS"; 49 | const char ReadGroup::DATE_TIME_TAG [3] = "DT"; 50 | const char ReadGroup::FLOW_ORDER_TAG [3] = "FO"; 51 | const char ReadGroup::KEY_SEQUENCE_TAG [3] = "KS"; 52 | const char ReadGroup::LIBRARY_TAG [3] 
= "LB"; 53 | const char ReadGroup::PROGRAMS_TAG [3] = "PG"; 54 | const char ReadGroup::MEDIAN_INSERT_SIZE_TAG [3] = "PI"; 55 | const char ReadGroup::PLATFORM_TAG [3] = "PL"; 56 | const char ReadGroup::PLATFORM_UNIT_TAG [3] = "PU"; 57 | const char ReadGroup::SAMPLE_TAG [3] = "SM"; 58 | } 59 | -------------------------------------------------------------------------------- /test/select_if_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "variant/variant_reader.h" 4 | #include "variant/variant.h" 5 | 6 | 7 | using namespace std; 8 | using namespace gamgee; 9 | using namespace boost; 10 | 11 | constexpr auto GQ_THRESH = 35; 12 | 13 | BOOST_AUTO_TEST_CASE( select_if_individual_fields ) { 14 | const auto actual_gq_selects = vector>{ 15 | dynamic_bitset<>(string("111")), //Beware that in setting dynamic_bitset that way 16 | dynamic_bitset<>(string("110")), //the bit order is inversed: Here it is actually 011 17 | dynamic_bitset<>(string("100")), 18 | dynamic_bitset<>(string("010")), 19 | dynamic_bitset<>(string("011"))}; 20 | const auto actual_pl_selects = vector>{ 21 | dynamic_bitset<>(string("010")), 22 | dynamic_bitset<>(string("000")), 23 | dynamic_bitset<>(string("001")), 24 | dynamic_bitset<>(string("111")), 25 | dynamic_bitset<>(string("100"))}; 26 | for (const auto& filename : {"testdata/test_variants_02.vcf"}) { 27 | const auto reader = SingleVariantReader{filename}; 28 | auto record_idx = 0u; 29 | for (const auto& record : SingleVariantReader{filename}) { 30 | const auto g_quals = record.integer_individual_field("GQ"); 31 | const auto p_likes = record.integer_individual_field("PL"); 32 | const auto high_gq = [](const IndividualFieldValue& x) { return x[0] >= GQ_THRESH; }; 33 | const auto hom_ref = [](const IndividualFieldValue& x) { return x[0] == 0; }; 34 | const auto comput_gq_select = Variant::select_if(g_quals.begin(), g_quals.end(), high_gq); 35 | const auto comput_pl_select = 
Variant::select_if(p_likes.begin(), p_likes.end(), hom_ref); 36 | BOOST_CHECK_EQUAL(comput_gq_select.size(), 3u); 37 | BOOST_CHECK_EQUAL(comput_pl_select.size(), 3u); 38 | const auto actual_gq_select = actual_gq_selects[record_idx]; 39 | const auto actual_pl_select = actual_pl_selects[record_idx]; 40 | BOOST_CHECK(comput_pl_select == actual_pl_select); 41 | ++record_idx; 42 | } 43 | BOOST_CHECK_EQUAL(record_idx, 5u); 44 | } 45 | } 46 | 47 | BOOST_AUTO_TEST_CASE( select_if_shared_field ) { 48 | const auto truth_an_counts = std::vector{1,1,1,1,1,1,1}; 49 | const auto truth_af_counts = std::vector{0,0,0,0,1,0,1}; 50 | auto truth_index = 0u; 51 | for (const auto& record : SingleVariantReader{"testdata/test_variants.vcf"}) { 52 | const auto an = record.integer_shared_field("AN"); 53 | const auto r1 = Variant::select_if(an.begin(), an.end(), [](const auto& v) { return v == 6; }); 54 | BOOST_CHECK_EQUAL(r1.count(), truth_an_counts[truth_index]); 55 | const auto af = record.float_shared_field("AF"); 56 | const auto r2 = Variant::select_if(af.begin(), af.end(), [](const auto& v) { return v < 0.5; }); 57 | BOOST_CHECK_EQUAL(r2.count(), truth_af_counts[truth_index++]); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /gamgee/sam/sam_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__sam_iterator__guard 2 | #define gamgee__sam_iterator__guard 3 | 4 | #include "sam.h" 5 | 6 | #include "htslib/sam.h" 7 | 8 | #include 9 | 10 | namespace gamgee { 11 | 12 | /** 13 | * @brief Utility class to enable for-each style iteration in the SamReader class 14 | */ 15 | class SamIterator { 16 | public: 17 | 18 | /** 19 | * @brief creates an empty iterator (used for the end() method) 20 | */ 21 | SamIterator(); 22 | 23 | /** 24 | * @brief initializes a new iterator based on an input stream (e.g. sam/a file, stdin, ...) 
25 | * 26 | * @param sam_file_ptr pointer to a sam file opened via the sam_open() macro from htslib 27 | * @param sam_header_ptr pointer to a sam file header created with the sam_hdr_read() macro from htslib 28 | */ 29 | SamIterator(const std::shared_ptr& sam_file_ptr, const std::shared_ptr& sam_header_ptr); 30 | 31 | /** 32 | * @brief no copy construction/assignment allowed for readers and iterators 33 | */ 34 | SamIterator(const SamIterator&) = delete; 35 | SamIterator& operator=(const SamIterator&) = delete; 36 | 37 | /** 38 | * @brief a SamIterator move constructor guarantees all objects will have the same state. 39 | */ 40 | SamIterator(SamIterator&&) = default; 41 | SamIterator& operator=(SamIterator&&) = default; 42 | 43 | /** 44 | * @brief inequality operator (needed by for-each loop) 45 | * 46 | * @param rhs the other SamIterator to compare to 47 | * 48 | * @return whether or not the two iterators are the same (e.g. have the same input stream on the same 49 | * status) 50 | */ 51 | bool operator!=(const SamIterator& rhs); 52 | 53 | /** 54 | * @brief dereference operator (needed by for-each loop) 55 | * 56 | * @return a Sam object by reference, valid until the next record is fetched (the iterator re-uses memory at each iteration) 57 | */ 58 | Sam& operator*(); 59 | 60 | /** 61 | * @brief pre-fetches the next record and tests for end of file 62 | * 63 | * @return a reference to the object (it can be const& because this return value should only be used 64 | * by the for-each loop to check for the eof) 65 | */ 66 | Sam& operator++(); 67 | 68 | private: 69 | std::shared_ptr m_sam_file_ptr; ///< pointer to the sam file 70 | std::shared_ptr m_sam_header_ptr; ///< pointer to the sam header 71 | std::shared_ptr m_sam_record_ptr; ///< pointer to the internal structure of the sam record. Useful to only allocate it once. 
72 | Sam m_sam_record; ///< temporary record to hold between fetch (operator++) and serve (operator*) 73 | 74 | void fetch_next_record(); ///< fetches next Sam record into existing htslib memory without making a copy 75 | }; 76 | 77 | } // end namespace gamgee 78 | 79 | #endif // gamgee__sam_iterator__guard 80 | -------------------------------------------------------------------------------- /gamgee/variant/synced_variant_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__synced_variant_iterator__guard 2 | #define gamgee__synced_variant_iterator__guard 3 | 4 | #include "htslib/vcf.h" 5 | #include "htslib/synced_bcf_reader.h" 6 | 7 | #include "variant.h" 8 | 9 | #include 10 | 11 | namespace gamgee { 12 | 13 | /** 14 | * @brief Utility class to enable for-each style iteration in the SyncedVariantReader class 15 | */ 16 | class SyncedVariantIterator { 17 | public: 18 | 19 | /** 20 | * @brief creates an empty iterator (used for the end() method) 21 | */ 22 | SyncedVariantIterator(); 23 | 24 | /** 25 | * @brief initializes a new iterator based on a structure of synced vcf/bcf file readers 26 | * 27 | * @param synced_readers an htslib structure of synced vcf/bcf file readers created using the bcf_sr_init() function 28 | */ 29 | SyncedVariantIterator(const std::shared_ptr& synced_readers); 30 | 31 | /** 32 | * @brief a SyncedVariantIterator should never be copied, but it can be moved around 33 | */ 34 | SyncedVariantIterator(SyncedVariantIterator&& other) = default; 35 | SyncedVariantIterator& operator=(SyncedVariantIterator&& other) = default; 36 | 37 | /** 38 | * @brief a SyncedVariantIterator cannot be copied safely, as it is iterating over streams. 
39 | */ 40 | SyncedVariantIterator(const SyncedVariantIterator&) = delete; 41 | SyncedVariantIterator& operator=(const SyncedVariantIterator&) = delete; 42 | 43 | /** 44 | * @brief pseudo-inequality operator (needed by for-each loop) 45 | * 46 | * @warning this method does the minimal work necessary to determine that we have reached the end of iteration. 47 | * it is NOT a valid general-purpose inequality method. 48 | * 49 | * @param rhs the other SyncedVariantIterator to compare to 50 | * 51 | * @return whether both iterators have entered their end states 52 | */ 53 | bool operator!=(const SyncedVariantIterator& rhs); 54 | 55 | /** 56 | * @brief dereference operator (needed by for-each loop) 57 | * 58 | * @return a persistent Variant vector independent from the iterator (a copy of the iterator's vector) 59 | */ 60 | std::vector& operator*(); 61 | 62 | /** 63 | * @brief pre-fetches the next vector and tests for end of file 64 | * 65 | * @return a reference to the vector 66 | */ 67 | std::vector& operator++(); 68 | 69 | private: 70 | std::shared_ptr m_synced_readers; ///< pointer to the synced readers of the variant files 71 | std::vector m_variant_vector; ///< caches next Variant vector 72 | std::vector> m_headers_vector; ///< caches each reader's htslib header 73 | 74 | void init_headers_vector(); ///< initializes m_variant_headers 75 | void fetch_next_record(); ///< fetches next Variant vector 76 | }; 77 | 78 | } // end namespace gamgee 79 | 80 | #endif /* gamgee__synced_variant_iterator__guard */ 81 | -------------------------------------------------------------------------------- /gamgee/fastq_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "fastq_iterator.h" 6 | 7 | using namespace std; 8 | 9 | namespace gamgee { 10 | 11 | FastqIterator::FastqIterator() : 12 | m_input_stream {} 13 | {} 14 | 15 | FastqIterator::FastqIterator(std::shared_ptr& in) : 16 | 
m_input_stream {in} 17 | { 18 | m_is_fastq = false; 19 | if (m_input_stream->get() == '@') { 20 | m_is_fastq = true; 21 | m_bor_delim = '@'; 22 | m_eos_delim = '+'; 23 | } else { 24 | m_is_fastq = false; 25 | m_bor_delim = '>'; 26 | m_eos_delim = '>'; 27 | } 28 | m_element = fetch_next_element(); 29 | } 30 | 31 | Fastq& FastqIterator::operator*() { 32 | return m_element; 33 | } 34 | 35 | Fastq& FastqIterator::operator++() { 36 | m_element = fetch_next_element(); 37 | return m_element; 38 | } 39 | 40 | bool FastqIterator::operator==(const FastqIterator& rhs) const { 41 | return m_input_stream == rhs.m_input_stream; 42 | } 43 | 44 | bool FastqIterator::operator!=(const FastqIterator& rhs) const { 45 | return !operator==(rhs); 46 | } 47 | 48 | const string FastqIterator::parse_multiline() { 49 | auto s = string{}; 50 | *m_input_stream >> s; 51 | skip_new_lines(); 52 | return s; 53 | } 54 | 55 | string FastqIterator::parse_comment() { 56 | auto comment = string{}; 57 | while (m_input_stream->peek() == ' ') 58 | m_input_stream->ignore(); 59 | getline(*m_input_stream, comment); 60 | return comment; 61 | } 62 | 63 | string FastqIterator::parse_seq() { 64 | auto seq = string{}; 65 | while(m_input_stream->good() && m_input_stream->peek() != m_eos_delim) { 66 | seq += parse_multiline(); 67 | } 68 | return seq; 69 | } 70 | 71 | string FastqIterator::parse_quals(uint32_t seq_length) { 72 | auto qual = string{}; 73 | if (m_is_fastq) { 74 | m_input_stream->ignore(numeric_limits::max(), '\n'); // skip the line with the second record name 75 | while (m_input_stream->good() && qual.length() < seq_length ) { 76 | qual += parse_multiline(); 77 | } 78 | } 79 | m_input_stream->ignore(numeric_limits::max(), m_bor_delim); //skip everything (even if fastq is malformed and has extra quals) until the next record. It's essential that we remove the '@' here. 
80 | return qual; 81 | } 82 | 83 | Fastq FastqIterator::fetch_next_element() { 84 | auto name = string{}; 85 | if (!(*m_input_stream >> name)) { // parse name (first string) and abort if we reached the end of the file 86 | m_input_stream.reset(); 87 | return Fastq{}; 88 | } // the locals below are deliberately non-const: move() on a const string silently falls back to a copy 89 | auto comment = parse_comment(); 90 | auto seq = parse_seq(); 91 | auto quals = parse_quals(seq.length()); 92 | return Fastq {move(name), move(comment), move(seq), move(quals)}; 93 | } 94 | 95 | void FastqIterator::skip_new_lines() { 96 | while (m_input_stream->peek() == '\n') 97 | m_input_stream->ignore(numeric_limits::max(), '\n'); 98 | } 99 | 100 | } 101 | 102 | -------------------------------------------------------------------------------- /testdata/test_variants.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##INFO= 19 | ##INFO= 20 | ##FORMAT= 21 | ##FORMAT= 22 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 23 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA 0/0:12:0,10,1000:3.1,2.2:CA 1/1:650:10,100,0:3.1,2.2:XISPAFORRO 24 | 20 10001000 rs837472 GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 25 | 20 10002000 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6;VALIDATED GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 26 | 20 10003000 . A AGCT . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 27 | 22 10004000 . GAT G,GATAT . 
PASS;MISSED AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 1/2:35:10,0,100,2,4,8:3.1,2.2:ABA 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA 28 | 22 10005000 . GAT G,GATAT . PASS AF=0.5,.;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100,.,4,.:3.1,.:ABA 0/0:.:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:. 29 | 22 10006000 . GAT G,GATAT . PASS AF=0.5,0;AN=6;VLINT=27,57,122;VLFLOAT=5.3,-127.65,24245.9,435.78 GT:GQ:PL:AF:AS:VLINT:VLFLOAT 0/1:35:10,0,100,2,4,8:3.1,2.2:ABA:4,56,21:4.5 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA:36,.,1024,5196:9.25,-15.125 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA:2,1:100.5,.,-92.75,-16345.25 30 | -------------------------------------------------------------------------------- /gamgee/fastq_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__fastq_iterator__guard 2 | #define gamgee__fastq_iterator__guard 3 | 4 | #include "fastq.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace gamgee { 10 | 11 | /** 12 | * @brief Utility class to enable for-each style iteration in the FastqReader class 13 | */ 14 | class FastqIterator { 15 | public: 16 | 17 | /** 18 | * @brief creates an empty iterator (used for the end() method) 19 | */ 20 | FastqIterator(); 21 | 22 | /** 23 | * @brief initializes a new iterator based on an input stream (e.g. fastq/a file, stdin, ...) 24 | * 25 | * @param in input stream 26 | */ 27 | explicit FastqIterator(std::shared_ptr& in); 28 | 29 | /** 30 | * @brief a FastqIterator should never be copied as the underlying stream can only be 31 | * manipulated by one object. 32 | */ 33 | FastqIterator(const FastqIterator&) = delete; 34 | FastqIterator& operator=(const FastqIterator&) = delete; 35 | 36 | /** 37 | * @brief a FastqIterator move constructor guarantees all objects will have the same state. 
38 | */ 39 | FastqIterator(FastqIterator&&) = default; 40 | FastqIterator& operator=(FastqIterator&&) = default; 41 | 42 | /** 43 | * @brief equality operator 44 | * 45 | * @param rhs the other FastqIterator to compare to 46 | * 47 | * @return whether or not the two iterators are the same (e.g. have the same input stream on the same 48 | * status) 49 | */ 50 | bool operator==(const FastqIterator& rhs) const; 51 | 52 | /** 53 | * @brief inequality operator (needed by for-each loop) 54 | * 55 | * @param rhs the other FastqIterator to compare to 56 | * 57 | * @return whether or not the two iterators differ, i.e. the negation of the equality operator (true when 58 | * they do not share the same input stream) 59 | */ 60 | bool operator!=(const FastqIterator& rhs) const; 61 | 62 | /** 63 | * @brief dereference operator (needed by for-each loop) 64 | * 65 | * @return a reference to a parsed Fastq object (current in the stream) 66 | */ 67 | Fastq& operator*(); 68 | 69 | /** 70 | * @brief increment operator (needed by for-each loop) 71 | * 72 | * @return the next parsed Fastq object (next in the stream) 73 | */ 74 | Fastq& operator++(); 75 | 76 | private: 77 | std::shared_ptr m_input_stream; ///< a pointer to the input stream 78 | Fastq m_element; ///< the current parsed fastq/fasta element 79 | bool m_is_fastq; ///< whether we are parsing fastq's or fasta's from the input stream 80 | char m_eos_delim; ///< delimits the end of the sequence field in the fastq/fasta file 81 | char m_bor_delim; ///< delimits the beginning of the record in the fastq/fasta file 82 | 83 | Fastq fetch_next_element(); 84 | std::string parse_comment(); 85 | std::string parse_seq(); 86 | std::string parse_quals(uint32_t seq_length); 87 | const std::string parse_multiline(); 88 | void skip_new_lines(); 89 | }; 90 | 91 | } // end namespace gamgee 92 | 93 | #endif // gamgee__fastq_iterator__guard 94 | -------------------------------------------------------------------------------- /testdata/unindexed/test_unindexed.vcf: 
-------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##INFO= 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##FILTER= 7 | ##FILTER= 8 | ##FILTER= 9 | ##FILTER= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##INFO= 19 | ##INFO= 20 | ##FORMAT= 21 | ##FORMAT= 22 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA12891 NA12892 23 | 1 10000000 db2342 T C 80 PASS AF=0.5;AN=6;VALIDATED;DESC=Test1,Test2 GT:GQ:PL:AF:AS 0/1:25:10,0,100:3.1,2.2:ABA 0/0:12:0,10,1000:3.1,2.2:CA 1/1:650:10,100,0:3.1,2.2:XISPAFORRO 24 | 20 10001000 rs837472 GG AA 8.4 PASS AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 25 | 20 10002000 . TAGTGQA T . LOW_QUAL AF=0.5;AN=6;VALIDATED GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,2000000000:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 26 | 20 10003000 . A AGCT . NOT_DEFINED AF=0.5;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100:3.1,2.2:ABA 0/0:35:0,10,100:3.1,2.2:ABA 1/1:35:10,100,0:3.1,2.2:ABA 27 | 22 10004000 . GAT G,GATAT . PASS;MISSED AF=0.5,0;AN=6 GT:GQ:PL:AF:AS 1/2:35:10,0,100,2,4,8:3.1,2.2:ABA 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA 28 | 22 10005000 . GAT G,GATAT . PASS AF=0.5,.;AN=6 GT:GQ:PL:AF:AS 0/1:35:10,0,100,.,4,.:3.1,.:ABA 0/0:.:0,10,100,2,4,8:3.1,2.2:ABA 1/1:35:10,100,0,2,4,8:3.1,2.2:. 29 | 22 10006000 . GAT G,GATAT . 
PASS AF=0.5,0;AN=6;VLINT=27,57,122;VLFLOAT=5.3,-127.65,24245.9,435.78 GT:GQ:PL:AF:AS:VLINT:VLFLOAT 0/1:35:10,0,100,2,4,8:3.1,2.2:ABA:4,56,21:4.5 0/0:35:0,10,100,2,4,8:3.1,2.2:ABA:36,.,1024,5196:9.25,-15.125 1/1:35:10,100,0,2,4,8:3.1,2.2:ABA:2,1:100.5,.,-92.75,-16345.25 30 | -------------------------------------------------------------------------------- /gamgee/missing.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__missing__guard 2 | #define gamgee__missing__guard 3 | 4 | #include "sam/sam_tag.h" 5 | #include "utils/utils.h" 6 | 7 | #include "htslib/vcf.h" 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace gamgee { 14 | 15 | namespace missing_values { 16 | constexpr auto int8 = bcf_int8_missing; ///< missing value for an int8 17 | constexpr auto int16 = bcf_int16_missing; ///< missing value for an int16 18 | constexpr auto int32 = bcf_int32_missing; ///< missing value for an int32 19 | constexpr auto string_empty = ""; ///< empty string is a missing string 20 | constexpr auto string_dot = "."; ///< "dot" is a missing string in the VCF spec. 21 | } 22 | 23 | inline bool missing (const bool value) { return !value; } ///< Returns true if bool is false (missing). 24 | inline bool missing (const float value) { return bcf_float_is_missing(value); } ///< Returns true if float is missing. 25 | inline bool missing (const int8_t value) { return value == missing_values::int8; } ///< Returns true if int8_t is missing. 26 | inline bool missing (const int16_t value) { return value == missing_values::int16; } ///< Returns true if int16_t is missing. 27 | inline bool missing (const int32_t value) { return value == missing_values::int32; } ///< Returns true if int32_t is missing. 28 | inline bool missing (const std::string& value) { return value.empty() || value == missing_values::string_dot;} ///< Returns true if string is missing. 
29 | inline bool missing (const char* value) { return value == nullptr || value[0] == '\0' || (value[0] == '.' && value[1] == '\0'); } ///< Returns true if char* is missing: a null pointer, an empty string, or exactly "." (contents are compared; comparing the pointers themselves only matched the very same literal object). 30 | 31 | /** 32 | * Returns true if value is missing. 33 | * @tparam MISSING_TYPE any class that implements the missing() as a public member function. 34 | * @return True if value is missing. 35 | */ 36 | template 37 | inline bool missing(const MISSING_TYPE& value) { 38 | return value.missing(); 39 | } 40 | 41 | /** 42 | * Missing overload for functions that return a vector of values. It only applies if the entire vector is missing. 43 | * @tparam VALUE any type that can be fit into a container. Any type, really. 44 | * @param v any vector 45 | * @return true if the vector is empty (therefore the value that was returned is missing) 46 | */ 47 | template 48 | inline bool missing(const std::vector& v) { return v.empty(); } 49 | 50 | } 51 | 52 | #endif // gamgee__missing__guard 53 | -------------------------------------------------------------------------------- /test/sam_reader_test.cpp: -------------------------------------------------------------------------------- 1 | #include "sam/sam_reader.h" 2 | #include "sam/indexed_sam_reader.h" 3 | #include "exceptions.h" 4 | 5 | #include "test_utils.h" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | using namespace gamgee; 13 | 14 | BOOST_AUTO_TEST_CASE( single_readers ) 15 | { 16 | for (const auto& filename : {"testdata/test_simple.bam", "testdata/test_simple.sam"}) { 17 | auto read_counter = 0u; 18 | for (const auto& sam : SingleSamReader{filename}) { 19 | BOOST_CHECK_EQUAL(sam.name().substr(0, 15), "30PPJAAXX090125"); 20 | BOOST_CHECK_EQUAL(sam.chromosome(), 0u); 21 | ++read_counter; 22 | } 23 | BOOST_CHECK_EQUAL(read_counter, 33u); 24 | } 25 | } 26 | 27 | BOOST_AUTO_TEST_CASE( paired_readers ) { 28 | for (const auto& filename : {"testdata/test_paired.bam", "testdata/test_paired.sam"}) { 29 | auto read_counter = 0u; 30 
| auto secondary_alignments = 0u; 31 | for (const auto& p : PairSamReader{filename}) { 32 | if (p.second.empty()) { 33 | BOOST_CHECK(p.first.secondary() || p.first.supplementary()); 34 | ++secondary_alignments; 35 | } 36 | else { 37 | BOOST_CHECK(p.first.name() == p.second.name()); 38 | read_counter += 2; 39 | } 40 | } 41 | BOOST_CHECK_EQUAL(secondary_alignments, 7u); 42 | BOOST_CHECK_EQUAL(read_counter, 44u); 43 | } 44 | } 45 | 46 | BOOST_AUTO_TEST_CASE( single_sam_reader_move_test ) { 47 | auto reader0 = SingleSamReader{"testdata/test_simple.bam"}; 48 | auto reader1 = SingleSamReader{"testdata/test_simple.bam"}; 49 | auto moved = check_move_constructor(reader1); 50 | 51 | auto record0 = reader0.begin().operator*(); 52 | auto moved_record = moved.begin().operator*(); 53 | BOOST_CHECK_EQUAL(record0.name(), moved_record.name()); 54 | BOOST_CHECK_EQUAL(record0.chromosome(), moved_record.chromosome()); 55 | } 56 | 57 | BOOST_AUTO_TEST_CASE( sam_iterator_move_test ) { 58 | auto reader0 = SingleSamReader{"testdata/test_simple.bam"}; 59 | auto iter0 = reader0.begin(); 60 | 61 | auto reader1 = SingleSamReader{"testdata/test_simple.bam"}; 62 | auto iter1 = reader1.begin(); 63 | auto moved = check_move_constructor(iter1); 64 | 65 | auto record0 = *iter0; 66 | auto moved_record = *moved; 67 | BOOST_CHECK_EQUAL(record0.name(), moved_record.name()); 68 | BOOST_CHECK_EQUAL(record0.chromosome(), moved_record.chromosome()); 69 | } 70 | 71 | BOOST_AUTO_TEST_CASE( single_sam_reader_nonexistent_file ) { 72 | BOOST_CHECK_THROW(SingleSamReader{"foo/bar/nonexistent.bam"}, FileOpenException); 73 | } 74 | 75 | BOOST_AUTO_TEST_CASE( pair_sam_reader_nonexistent_file ) { 76 | BOOST_CHECK_THROW(PairSamReader{"foo/bar/nonexistent.bam"}, FileOpenException); 77 | } 78 | 79 | BOOST_AUTO_TEST_CASE( indexed_sam_reader_nonexistent_file ) { 80 | BOOST_CHECK_THROW(IndexedSamReader("foo/bar/nonexistent.bam", vector{}), FileOpenException); 81 | } 82 | 83 | BOOST_AUTO_TEST_CASE( 
indexed_sam_reader_nonexistent_index ) { 84 | BOOST_CHECK_THROW(IndexedSamReader("testdata/unindexed/test_unindexed.bam", vector{}), IndexLoadException); 85 | } 86 | -------------------------------------------------------------------------------- /gamgee/variant/indexed_variant_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__indexed_variant_iterator__guard 2 | #define gamgee__indexed_variant_iterator__guard 3 | 4 | #include "variant_iterator.h" 5 | 6 | #include "../utils/hts_memory.h" 7 | 8 | #include "htslib/vcf.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace gamgee { 15 | 16 | class IndexedVariantIterator : public VariantIterator { 17 | public: 18 | 19 | static const std::vector all_intervals; 20 | 21 | /** 22 | * @brief creates an empty iterator (used for the end() method) 23 | */ 24 | IndexedVariantIterator(); 25 | 26 | /** 27 | * @brief initializes a new iterator based on a file, an index, a header, and a vector of intervals 28 | * 29 | * @param file_ptr shared pointer to a BCF file opened via the bcf_open() macro from htslib 30 | * @param index_ptr shared pointer to a BCF file index (CSI) created with the bcf_index_load() macro from htslib 31 | * @param header_ptr shared pointer to a BCF file header created with the bcf_hdr_read() macro from htslib 32 | * @param interval_list vector of intervals represented by strings 33 | */ 34 | IndexedVariantIterator(const std::shared_ptr& file_ptr, 35 | const std::shared_ptr& index_ptr, 36 | const std::shared_ptr& header_ptr, 37 | const std::vector& interval_list = all_intervals); 38 | 39 | /** 40 | * @brief an IndexedVariantIterator cannot be copied safely, as it is iterating over a stream. 
41 | */ 42 | 43 | IndexedVariantIterator(const IndexedVariantIterator& other) = delete; 44 | IndexedVariantIterator& operator=(const IndexedVariantIterator& other) = delete; 45 | 46 | /** 47 | * @brief an IndexedVariantIterator can be moved 48 | */ 49 | 50 | IndexedVariantIterator(IndexedVariantIterator&& other) = default; 51 | IndexedVariantIterator& operator=(IndexedVariantIterator&& other) = default; 52 | 53 | /** 54 | * @brief inequality operator (needed by for-each loop) 55 | * 56 | * @param rhs the other IndexedVariantIterator to compare to 57 | * 58 | * @return whether or not the two iterators are the same (e.g. have the same input file on the same 59 | * status and the same intervals) 60 | */ 61 | bool operator!=(const IndexedVariantIterator& rhs); 62 | 63 | protected: 64 | void fetch_next_record() override; ///< fetches next Variant record into existing htslib memory without making a copy 65 | 66 | private: 67 | std::shared_ptr m_variant_index_ptr; ///< pointer to the internal structure of the index file 68 | std::vector m_interval_list; ///< vector of intervals represented by strings 69 | std::vector::const_iterator m_interval_iter; ///< iterator for the interval list 70 | std::unique_ptr m_index_iter_ptr; ///< pointer to the htslib BCF index iterator 71 | }; 72 | 73 | } 74 | 75 | #endif /* defined(gamgee__indexed_variant_iterator__guard) */ 76 | -------------------------------------------------------------------------------- /gamgee/exceptions.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__exceptions__guard 2 | #define gamgee__exceptions__guard 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace gamgee { 10 | 11 | /** 12 | * @brief Exception for the case where there is an error opening a file for reading/writing 13 | */ 14 | class FileOpenException : public std::runtime_error { 15 | public: 16 | FileOpenException(const std::string& filename) : 17 | std::runtime_error{std::string{"Could 
not open file "} + filename} {} 18 | }; 19 | 20 | /** 21 | * @brief Exception for the case where an index file cannot be opened for a particular file (eg., bam/vcf/bcf) 22 | */ 23 | class IndexLoadException : public std::runtime_error { 24 | public: 25 | IndexLoadException(const std::string& filename) : 26 | std::runtime_error{std::string{"Could not open index for file "} + filename} {} 27 | }; 28 | 29 | /** 30 | * @brief Exception for the case where a file header could not be read 31 | */ 32 | class HeaderReadException : public std::runtime_error { 33 | public: 34 | HeaderReadException(const std::string& filename) : 35 | std::runtime_error{std::string{"Could not read header for file "} + filename} {} 36 | }; 37 | 38 | /** 39 | * @brief Exception for the case where multiple headers are incompatible in some way 40 | */ 41 | class HeaderCompatibilityException : public std::runtime_error { 42 | public: 43 | HeaderCompatibilityException(const std::string& reason) : 44 | std::runtime_error(std::string{"Incompatible headers: "} + reason) { } 45 | }; 46 | 47 | /** 48 | * @brief an exception class for the case where a single input is required, but more is provided 49 | */ 50 | class SingleInputException : public std::runtime_error { 51 | public: 52 | SingleInputException(const std::string& vector_name, const size_t size) : 53 | std::runtime_error{(boost::format("Error: single input required, but vector %s has size %d") % vector_name % size).str()} { } 54 | }; 55 | 56 | /** 57 | * @brief a catchall exception class for htslib errors 58 | */ 59 | class HtslibException : public std::runtime_error { 60 | public: 61 | HtslibException(const int error_code) : 62 | std::runtime_error{(boost::format("Error: htslib failed with error code %d. 
See stderr for details.") % error_code).str()} { } 63 | }; 64 | 65 | /** 66 | * @brief an exception class for the case where a chromosome is not found in the reference 67 | */ 68 | class ChromosomeNotFoundException : public std::runtime_error { 69 | public: 70 | ChromosomeNotFoundException(const std::string& chrom_name) : 71 | std::runtime_error{(boost::format("Error: chromosome %s was not found in the given reference") % chrom_name).str()} { } 72 | }; 73 | 74 | /** 75 | * @brief an exception class for the case where a requested location conflicts with the size of a chromosome (the message reports the chromosome name, its size and the requested location) 76 | */ 77 | class ChromosomeSizeException : public std::runtime_error { 78 | public: 79 | ChromosomeSizeException(const std::string& chrom_name, const size_t chrom_size, const int desired_location) : 80 | std::runtime_error{(boost::format("Error: chromosome %s is of size %d but location %d was requested") % chrom_name % chrom_size % desired_location).str()} { } 81 | }; 82 | 83 | } // end of namespace gamgee 84 | 85 | #endif // end of gamgee__exceptions__guard 86 | 87 | -------------------------------------------------------------------------------- /gamgee/variant/variant_writer.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__variant_writer__guard 2 | #define gamgee__variant_writer__guard 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "variant.h" 9 | #include "variant_header.h" 10 | 11 | #include "../utils/hts_memory.h" 12 | 13 | #include "htslib/vcf.h" 14 | 15 | namespace gamgee { 16 | 17 | /** 18 | * @brief utility class to write out a VCF/BCF file to any stream 19 | * @todo add serialization option 20 | */ 21 | class VariantWriter { 22 | 23 | public: 24 | 25 | /** 26 | * @brief Creates a new VariantWriter using the specified output file name 27 | * @param output_fname file to write to. 
The default is stdout (as defined by htslib) 28 | * @param binary whether the output should be in BCF (true) or VCF format (false) 29 | * @param compression_level optional zlib compression level. 0 for none, 1 for best speed, 9 for best compression 30 | * @note the header is copied and managed internally 31 | */ 32 | explicit VariantWriter(const std::string& output_fname = "-", const bool binary = true, const int compression_level = Z_DEFAULT_COMPRESSION); 33 | 34 | /** 35 | * @brief Creates a new VariantWriter with the header extracted from a Variant record and using the specified output file name 36 | * @param header a VariantHeader object to make a copy from 37 | * @param output_fname file to write to. The default is stdout (as defined by htslib) 38 | * @param binary whether the output should be in BCF (true) or VCF format (false) 39 | * @param compression_level optional zlib compression level. 0 for none, 1 for best speed, 9 for best compression 40 | * @note the header is copied and managed internally 41 | */ 42 | explicit VariantWriter(const VariantHeader& header, const std::string& output_fname = "-", const bool binary = true, const int compression_level = Z_DEFAULT_COMPRESSION); 43 | 44 | /** 45 | * @brief a VariantWriter cannot be copied safely, as it is iterating over a stream. 46 | */ 47 | 48 | VariantWriter(const VariantWriter& other) = delete; 49 | VariantWriter& operator=(const VariantWriter& other) = delete; 50 | 51 | /** 52 | * @brief a VariantWriter can be moved 53 | */ 54 | 55 | VariantWriter(VariantWriter&& other) = default; 56 | VariantWriter& operator=(VariantWriter&& other) = default; 57 | 58 | /** 59 | * @brief Adds a record to the file stream 60 | * @param body the record 61 | */ 62 | void add_record(const Variant& body); 63 | 64 | /** 65 | * @brief Adds a header to the file stream. 
66 | * @param header the header 67 | * @note the header is a requirement to add records 68 | */ 69 | void add_header(const VariantHeader& header); 70 | 71 | private: 72 | std::unique_ptr m_out_file; ///< the file or stream to write out to ("-" means stdout) 73 | VariantHeader m_header; ///< holds a copy of the header throughout the production of the output (necessary for every record that gets added) 74 | 75 | static htsFile* open_file(const std::string& output_fname, const std::string& binary); 76 | void write_header() const; 77 | std::string write_mode(const bool binary, const int compression_level) const; 78 | }; 79 | 80 | } 81 | 82 | #endif // gamgee__variant_writer__guard 83 | -------------------------------------------------------------------------------- /gamgee/fastq.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__fastq__guard 2 | #define gamgee__fastq__guard 3 | 4 | #include 5 | 6 | namespace gamgee { 7 | 8 | /** 9 | * @brief Utility class to hold one FastA or FastQ record. 10 | * 11 | * Will automatically output a FastA or FastQ based on the presence of quality scores. 
12 | */ 13 | class Fastq { 14 | 15 | public: 16 | 17 | /** @brief creates an empty record */ 18 | Fastq() : 19 | m_name {}, m_comment {}, m_sequence{}, m_quals{} {} 20 | 21 | /** @brief creates a full object by assigning all fields */ 22 | Fastq(std::string name, ///< sequence name 23 | std::string comment, ///< optional comment 24 | std::string sequence, ///< sequence bases 25 | std::string quals = "" ///< optional quality scores (leave it out for FastA) 26 | ) : 27 | m_name {name}, m_comment {comment}, m_sequence{sequence}, m_quals{quals} 28 | {} 29 | 30 | Fastq(const Fastq&) = default; 31 | Fastq& operator=(const Fastq&) = default; 32 | Fastq(Fastq&&) = default; 33 | Fastq& operator=(Fastq&&) = default; 34 | 35 | 36 | /** 37 | * @brief inequality comparison of all fields in the record 38 | * 39 | * @return true if any field differs (string comparison) 40 | */ 41 | bool operator!=(const Fastq& other) const { 42 | return !(*this == other); 43 | } 44 | 45 | /** 46 | * @brief equality comparison of all fields in the record 47 | * 48 | * @return true only if every field is the same (string comparison) 49 | */ 50 | bool operator==(const Fastq& other) const { 51 | return m_name == other.m_name && 52 | m_comment == other.m_comment && 53 | m_sequence == other.m_sequence && 54 | m_quals == other.m_quals; 55 | } 56 | 57 | 58 | std::string name() const { return m_name; } 59 | std::string comment() const { return m_comment; } 60 | std::string sequence() const { return m_sequence; } 61 | std::string quals() const { return m_quals; } 62 | void set_name(const std::string& name) { m_name = name; } 63 | void set_comment(const std::string& comment) { m_comment = comment; } 64 | void set_sequence(const std::string& sequence) { m_sequence = sequence; } 65 | void set_quals(const std::string& quals) { m_quals = quals; } 66 | 67 | void chop(const int nBases); ///< @brief hard clips the first n bases of the read. 
68 | void reverse_complement(); ///< @brief transform the sequence into its reverse complement. 69 | bool is_fastq() const; ///< @brief true if the record has quals in its qual field 70 | 71 | private: 72 | 73 | std::string m_name; ///< sequence name 74 | std::string m_comment; ///< optional comment 75 | std::string m_sequence; ///< sequence bases 76 | std::string m_quals; ///< optional quality scores 77 | 78 | }; 79 | 80 | } // end of namespace 81 | 82 | /** 83 | * @brief outputs the record in fastq or fasta format. 84 | * 85 | * The output checks whether the record has quality scores. If it does, it outputs a fastq record, 86 | * otherwise it outputs a fasta record. 87 | */ 88 | std::ostream& operator<< (std::ostream& os, const gamgee::Fastq& fq); 89 | 90 | #endif // gamgee__fastq__guard 91 | -------------------------------------------------------------------------------- /gamgee/sam/sam_builder_data_field.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__sam_builder_data_field__guard 2 | #define gamgee__sam_builder_data_field__guard 3 | 4 | #include 5 | 6 | namespace gamgee { 7 | 8 | /** 9 | * @brief class to hold encoded byte arrays for individual data fields (cigar, bases, etc.) during building of a Sam 10 | * 11 | * Fields can be created/updated either by copying data from raw pointers, or by moving managed pointers 12 | * into them without copying. 
Eg., 13 | * 14 | * auto field = SamBuilderDataField{raw_pointer, num_bytes, num_elements}; // does a copy 15 | * auto field = SamBuilderDataField{move(unique_ptr), num_bytes, num_elements}; // no copy 16 | * 17 | * After construction, a field's value can be altered via the update() functions 18 | */ 19 | class SamBuilderDataField { 20 | public: 21 | SamBuilderDataField(); ///< @brief initialize a SamBuilderDataField to an empty value 22 | explicit SamBuilderDataField(const void* copy_source, const uint32_t bytes_to_copy, const uint32_t num_elements); ///< @brief initialize a SamBuilderDataField by copying data from a raw pointer (takes no ownership of copy_source) 23 | explicit SamBuilderDataField(std::unique_ptr&& move_source, const uint32_t source_bytes, const uint32_t num_elements); ///< @brief initialize a SamBuilderDataField by moving an existing unique_ptr into it and taking ownership (without copying the existing data) 24 | SamBuilderDataField(SamBuilderDataField&& other); 25 | SamBuilderDataField& operator=(SamBuilderDataField&& other); 26 | 27 | // Fields cannot be copied (only moved), and use default destruction 28 | SamBuilderDataField(const SamBuilderDataField& other) = delete; 29 | SamBuilderDataField& operator=(const SamBuilderDataField& other) = delete; 30 | ~SamBuilderDataField() = default; 31 | 32 | const uint8_t* raw_data_ptr() const { return m_data.get(); } ///< gets a raw pointer to the data buffer 33 | uint32_t num_bytes() const { return m_num_bytes; } ///< number of bytes in the data buffer 34 | uint32_t num_elements() const { return m_num_elements; } ///< number of elements (cigar operations, bases, etc.) in the data buffer 35 | bool empty() const { return m_num_bytes == 0; } ///< does this field have any data? 
36 | 37 | void update(const void* copy_source, const uint32_t bytes_to_copy, const uint32_t num_elements); ///< @brief update the field by copying data from a raw pointer (takes no ownership of copy_source) 38 | void update(std::unique_ptr&& move_source, const uint32_t source_bytes, const uint32_t num_elements); ///< @brief update the field by moving an existing unique_ptr into it and taking ownership (without copying the existing data) 39 | uint8_t* copy_into(uint8_t* destination) const; ///< @brief copy this field's byte array into an arbitrary location 40 | 41 | private: 42 | std::unique_ptr m_data; ///< buffer containing encoded data for the field, managed exclusively by us 43 | uint32_t m_num_bytes; ///< number of bytes in m_data 44 | uint32_t m_num_elements; ///< number of elements (cigar operations, bases, etc.) in m_data 45 | }; 46 | 47 | } 48 | 49 | #endif /* gamgee__sam_builder_data_field__guard */ 50 | -------------------------------------------------------------------------------- /gamgee/variant/multiple_variant_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__multiple_variant_iterator__guard 2 | #define gamgee__multiple_variant_iterator__guard 3 | 4 | #include "htslib/vcf.h" 5 | 6 | #include "variant.h" 7 | #include "variant_iterator.h" 8 | 9 | #include 10 | #include 11 | 12 | namespace gamgee { 13 | 14 | using VariantIteratorIndexPair = std::pair, uint32_t>; 15 | using VariantIndexPair = std::pair; 16 | 17 | /** 18 | * @brief Utility class to enable for-each style iteration in the MultipleVariantReader class 19 | */ 20 | class MultipleVariantIterator { 21 | public: 22 | 23 | /** 24 | * @brief creates an empty iterator (used for the end() method) 25 | */ 26 | MultipleVariantIterator() = default; 27 | 28 | /** 29 | * @brief initializes a new iterator based on a vector of input files (vcf or bcf) 30 | * 31 | * @param variant_files vector of vcf/bcf files opened via the bcf_open() macro from 
htslib 32 | * @param variant_headers vector of headers corresponding to the files 33 | */ 34 | MultipleVariantIterator(const std::vector>& variant_files, const std::vector>& variant_headers); 35 | 36 | /** 37 | * @brief a MultipleVariantIterator move constructor guarantees all objects will have the same state. 38 | */ 39 | MultipleVariantIterator(MultipleVariantIterator&&) = default; 40 | MultipleVariantIterator& operator=(MultipleVariantIterator&& other) = default; 41 | 42 | /** 43 | * @brief a MultipleVariantIterator cannot be copied safely, as it is iterating over streams. 44 | */ 45 | MultipleVariantIterator(const MultipleVariantIterator&) = delete; 46 | MultipleVariantIterator& operator=(const MultipleVariantIterator& other) = delete; 47 | 48 | /** 49 | * @brief pseudo-inequality operator (needed by for-each loop) 50 | * 51 | * @warning this method does the minimal work necessary to determine that we have reached the end of iteration. 52 | * it is NOT a valid general-purpose inequality method. 
53 | * 54 | * @param rhs the other MultipleVariantIterator to compare to 55 | * 56 | * @return whether both iterators have entered their end states 57 | */ 58 | bool operator!=(const MultipleVariantIterator& rhs); 59 | 60 | /** 61 | * @brief dereference operator (needed by for-each loop) 62 | * 63 | * @return a reference to the iterator's Variant vector 64 | */ 65 | std::vector& operator*(); 66 | 67 | /** 68 | * @brief advances the iterator, fetching the next vector 69 | * 70 | * @return a reference to the iterator's Variant vector 71 | */ 72 | std::vector& operator++(); 73 | 74 | private: 75 | // fetches the next Variant vector 76 | void fetch_next_vector(); 77 | 78 | // comparison class for genomic locations in the priority queue 79 | class Comparator { 80 | public: 81 | bool operator()(const VariantIteratorIndexPair& left, const VariantIteratorIndexPair& right); 82 | }; 83 | 84 | // the individual file iterators 85 | std::priority_queue, Comparator> m_queue; 86 | 87 | // caches next Variant vector 88 | std::vector m_variant_vector; 89 | }; 90 | 91 | } // end namespace gamgee 92 | 93 | #endif // gamgee__multiple_variant_iterator__guard 94 | -------------------------------------------------------------------------------- /gamgee/fastq_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__fastq_reader__guard 2 | #define gamgee__fastq_reader__guard 3 | 4 | #include "fastq_iterator.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace gamgee { 13 | 14 | /** 15 | * @brief Utility class to read many Fastq records from a stream (e.g. Fastq file, stdin, ...) in a 16 | * for-each loop.
17 | * 18 | * This class is designed to parse fastq files in for-each loops with the following signature: 19 | * 20 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 21 | * for (auto& record : FastqReader(filename)) 22 | * do_something_with_fastq(record); 23 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 24 | * 25 | * You can also use it with the stdin or any other stream by simply giving the reference to the 26 | * stream in the constructor, like so: 27 | * 28 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | * for (auto& record : FastqReader(&std::cin)) 30 | * do_something_with_fastq(record); 31 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 32 | * 33 | * Although one could use it as an iterator, if your goal is to do so, you should use the FastqIterator 34 | * class 35 | */ 36 | class FastqReader { 37 | public: 38 | 39 | /** 40 | * @brief reads through all records in a file (fasta or fastq) parsing them into Fastq 41 | * objects 42 | * 43 | * @param filename the name of the fasta/fastq file 44 | */ 45 | explicit FastqReader(const std::string& filename); 46 | 47 | /** 48 | * @brief reads through all records in a file (fasta or fastq) parsing them into Fastq 49 | * objects 50 | * 51 | * @param filenames a vector containing a single element: the name of the fasta/fastq file 52 | */ 53 | explicit FastqReader(const std::vector& filenames); 54 | 55 | /** 56 | * @brief reads through all records in a stream (e.g. stdin) parsing them into Fastq 57 | * objects 58 | * 59 | * @param input a reference to the input stream (e.g. &std::cin) 60 | */ 61 | explicit FastqReader(std::istream* const input); 62 | 63 | /** 64 | * @brief move constructor for the FastqReader class simply transfers all objects with the state 65 | * maintained. 66 | */ 67 | FastqReader(FastqReader&&) = default; 68 | FastqReader& operator=(FastqReader&&) = default; 69 | 70 | /** 71 | * @brief a FastqReader cannot be copied safely, as it is iterating over a stream. 
72 | */ 73 | FastqReader(const FastqReader&) = delete; 74 | FastqReader& operator=(const FastqReader&) = delete; 75 | 76 | /** 77 | * @brief creates a FastqIterator pointing at the start of the input stream (needed by for-each 78 | * loop) 79 | * 80 | * @return a FastqIterator ready to start parsing the file 81 | */ 82 | FastqIterator begin(); 83 | 84 | /** 85 | * @brief creates a FastqIterator with a nullified input stream (needed by for-each loop) 86 | * 87 | * @return a FastqIterator that will match the end status of the iterator at the end of the stream 88 | */ 89 | FastqIterator end(); 90 | 91 | private: 92 | std::shared_ptr m_input_stream; ///< a pointer to the input stream 93 | 94 | void init_reader(const std::string& filename); 95 | }; 96 | 97 | } // end of namespace 98 | 99 | #endif // gamgee__fastq_reader__guard 100 | -------------------------------------------------------------------------------- /gamgee/sam/sam_builder_data_field.cpp: -------------------------------------------------------------------------------- 1 | #include "sam_builder_data_field.h" 2 | 3 | #include 4 | 5 | using namespace std; 6 | 7 | namespace gamgee { 8 | 9 | /** 10 | * @brief initialize a SamBuilderDataField to an empty value 11 | */ 12 | SamBuilderDataField::SamBuilderDataField() : 13 | m_data {}, 14 | m_num_bytes { 0 }, // Note: default no-arg constructor would NOT zero out the POD members 15 | m_num_elements { 0 } 16 | {} 17 | 18 | /** 19 | * @brief initialize a SamBuilderDataField by copying data from a raw pointer 20 | * 21 | * @note takes no ownership of copy_source 22 | */ 23 | SamBuilderDataField::SamBuilderDataField(const void* copy_source, const uint32_t bytes_to_copy, const uint32_t num_elements) : 24 | m_data { new uint8_t[bytes_to_copy] }, 25 | m_num_bytes { bytes_to_copy }, 26 | m_num_elements { num_elements } 27 | { 28 | memcpy(m_data.get(), copy_source, bytes_to_copy); 29 | } 30 | 31 | /** 32 | * @brief initialize a SamBuilderDataField by moving an existing 
unique_ptr into it (without copying the existing data) 33 | * 34 | * @note takes ownership of the memory managed by move_source 35 | */ 36 | SamBuilderDataField::SamBuilderDataField(std::unique_ptr&& move_source, const uint32_t source_bytes, const uint32_t num_elements) : 37 | m_data { move(move_source) }, 38 | m_num_bytes { source_bytes }, 39 | m_num_elements { num_elements } 40 | {} 41 | 42 | /** 43 | * @brief initialize a SamBuilderDataField via move from an existing field; the moved-from field is left empty (null data, zero bytes, zero elements) 44 | */ 45 | SamBuilderDataField::SamBuilderDataField(SamBuilderDataField&& other) : 46 | m_data { move(other.m_data) }, 47 | m_num_bytes { other.m_num_bytes }, 48 | m_num_elements { other.m_num_elements } 49 | { other.m_num_bytes = 0; other.m_num_elements = 0; } // other.m_data is null after the move, so its counts must not keep describing the buffer we took 50 | 51 | /** 52 | * @brief move an existing SamBuilderDataField into this one; the moved-from field is left empty (null data, zero bytes, zero elements) 53 | */ 54 | SamBuilderDataField& SamBuilderDataField::operator=(SamBuilderDataField&& other) { 55 | if ( &other == this ) 56 | return *this; 57 | 58 | m_data = move(other.m_data); 59 | m_num_bytes = other.m_num_bytes; other.m_num_bytes = 0; 60 | m_num_elements = other.m_num_elements; other.m_num_elements = 0; // zero the moved-from counts: other.m_data is null after the move above 61 | return *this; 62 | } 63 | 64 | /** 65 | * @brief update the field by copying data from a raw pointer (takes no ownership of copy_source) 66 | * 67 | * @note any previous value of m_data is destroyed via the unique_ptr assignment 68 | */ 69 | void SamBuilderDataField::update(const void* copy_source, const uint32_t bytes_to_copy, const uint32_t num_elements) { 70 | m_data = unique_ptr{ new uint8_t[bytes_to_copy] }; 71 | memcpy(m_data.get(), copy_source, bytes_to_copy); 72 | m_num_bytes = bytes_to_copy; 73 | m_num_elements = num_elements; 74 | } 75 | 76 | /** 77 | * @brief update the field by moving an existing unique_ptr into it and taking ownership (without copying the existing data) 78 | * 79 | * @note any previous value of m_data is destroyed via the unique_ptr assignment 80 | */ 81 | void SamBuilderDataField::update(std::unique_ptr&& move_source, const uint32_t source_bytes, const uint32_t num_elements) { 82 | m_data = move(move_source); 83 | 
m_num_bytes = source_bytes; 84 | m_num_elements = num_elements; 85 | } 86 | 87 | /** 88 | * @brief copy this field's byte array into an arbitrary location 89 | * 90 | * @return pointer to the byte just AFTER the end of the copied data 91 | */ 92 | uint8_t* SamBuilderDataField::copy_into(uint8_t* destination) const { 93 | memcpy(destination, m_data.get(), m_num_bytes); 94 | return destination + m_num_bytes; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /gamgee/variant/variant_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef gamgee__variant_iterator__guard 2 | #define gamgee__variant_iterator__guard 3 | 4 | #include "variant.h" 5 | 6 | #include "htslib/vcf.h" 7 | 8 | #include 9 | 10 | namespace gamgee { 11 | 12 | /** 13 | * @brief Utility class to enable for-each style iteration in the VariantReader class 14 | */ 15 | class VariantIterator { 16 | public: 17 | 18 | /** 19 | * @brief creates an empty iterator (used for the end() method) 20 | */ 21 | VariantIterator() = default; 22 | 23 | /** 24 | * @brief initializes a new iterator based on an input stream (e.g. a vcf/bcf file, stdin, ...) 25 | * 26 | * @param variant_file_ptr shared pointer to a vcf/bcf file opened via the bcf_open() macro from htslib 27 | * @param variant_header_ptr shared pointer to a vcf/bcf file header created with the bcf_hdr_read() macro from htslib 28 | */ 29 | VariantIterator(const std::shared_ptr& variant_file_ptr, const std::shared_ptr& variant_header_ptr); 30 | 31 | /** 32 | * @brief a VariantIterator move constructor guarantees all objects will have the same state. 33 | */ 34 | VariantIterator(VariantIterator&&) = default; 35 | 36 | /** 37 | * @brief a VariantIterator move assignment operator guarantees all objects will have the same state. 
38 | */ 39 | VariantIterator& operator= (VariantIterator&&) = default; 40 | 41 | /** 42 | * @brief a VariantIterator cannot be copy-constructed. 43 | */ 44 | VariantIterator(const VariantIterator&) = delete; 45 | 46 | /** 47 | * @brief a VariantIterator cannot be copy-assigned. 48 | */ 49 | VariantIterator& operator= (const VariantIterator&) = delete; 50 | 51 | /** 52 | * @brief inequality operator (needed by for-each loop) 53 | * 54 | * @param rhs the other VariantIterator to compare to 55 | * 56 | * @return whether the two iterators differ (e.g. do not have the same input stream on the same 57 | * status) 58 | */ 59 | bool operator!=(const VariantIterator& rhs) const; 60 | 61 | /** 62 | * @brief dereference operator (needed by for-each loop) 63 | * 64 | * @return a reference to a Variant; the declared return type is Variant&, so this call itself does not hand back a copy (NOTE(review): presumably the iterator's own record -- confirm against the implementation) 65 | */ 66 | Variant& operator*(); 67 | 68 | /** 69 | * @brief pre-fetches the next record and tests for end of file 70 | * 71 | * @return a reference to a Variant, declared as non-const Variant& (this return value should only be used 72 | * by the for-each loop to check for the eof) 73 | */ 74 | Variant& operator++(); 75 | 76 | /** 77 | * @brief returns whether the iterator has no additional records 78 | * 79 | * @return true if the iterator has no additional records 80 | */ 81 | bool empty() const; 82 | 83 | protected: 84 | std::shared_ptr m_variant_file_ptr; ///< pointer to the vcf/bcf file 85 | std::shared_ptr m_variant_header_ptr; ///< pointer to the variant header 86 | std::shared_ptr m_variant_record_ptr; ///< pointer to the internal structure of the variant record. Useful to only allocate it once.
87 | Variant m_variant_record; ///< temporary record to hold between fetch (operator++) and serve (operator*) 88 | 89 | virtual void fetch_next_record(); ///< fetches next Variant record into existing htslib memory without making a copy 90 | }; 91 | 92 | } // end namespace gamgee 93 | 94 | #endif // gamgee__variant_iterator__guard 95 | -------------------------------------------------------------------------------- /gamgee/sam/base_quals.cpp: -------------------------------------------------------------------------------- 1 | #include "base_quals.h" 2 | 3 | #include "../utils/hts_memory.h" 4 | #include "../utils/utils.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | namespace gamgee { 13 | 14 | /** 15 | * @brief creates a BaseQuals object that points to htslib memory already allocated 16 | * 17 | * @note the resulting BaseQuals object shares ownership of the pre-allocated memory via 18 | * shared_ptr reference counting 19 | */ 20 | BaseQuals::BaseQuals(const std::shared_ptr& sam_record) : 21 | m_sam_record { sam_record }, 22 | m_quals { bam_get_qual(sam_record.get()) }, 23 | m_num_quals { uint32_t((sam_record.get())->core.l_qseq) } 24 | {} 25 | 26 | /** 27 | * @brief creates a deep copy of a BaseQuals object 28 | * 29 | * @note the copy will have exclusive ownership over the newly-allocated htslib memory 30 | */ 31 | BaseQuals::BaseQuals(const BaseQuals& other) : 32 | m_sam_record { utils::make_shared_sam(utils::sam_deep_copy(other.m_sam_record.get())) }, 33 | m_quals { bam_get_qual(m_sam_record.get()) }, 34 | m_num_quals { other.m_num_quals } 35 | {} 36 | 37 | /** 38 | * @brief creates a deep copy of a BaseQuals object 39 | * 40 | * @note the copy will have exclusive ownership over the newly-allocated htslib memory 41 | */ 42 | BaseQuals& BaseQuals::operator=(const BaseQuals& other) { 43 | if ( &other == this ) 44 | return *this; 45 | m_sam_record = utils::make_shared_sam(utils::sam_deep_copy(other.m_sam_record.get())); ///< 
shared_ptr assignment will take care of deallocating old sam record if necessary 46 | m_quals = bam_get_qual(m_sam_record.get()); 47 | m_num_quals = other.m_num_quals; 48 | return *this; 49 | } 50 | 51 | /** 52 | * @brief access an individual base quality by index 53 | * 54 | * @return base quality at the specified index as an unsigned byte 55 | */ 56 | uint8_t BaseQuals::operator[](const uint32_t index) const { 57 | utils::check_max_boundary(index, m_num_quals); 58 | return m_quals[index]; 59 | } 60 | 61 | /** 62 | * @brief access and/or modify an individual base quality by index 63 | * 64 | * @return base quality at the specified index as an unsigned byte 65 | */ 66 | uint8_t& BaseQuals::operator[](const uint32_t index) { 67 | utils::check_max_boundary(index, m_num_quals); 68 | return m_quals[index]; 69 | } 70 | 71 | /** 72 | * @brief check whether this object contains the same base qualities as another BaseQuals object 73 | */ 74 | bool BaseQuals::operator==(const BaseQuals& other) const { 75 | if ( m_num_quals != other.m_num_quals ) 76 | return false; 77 | 78 | for ( auto i = 0u; i < m_num_quals; ++i ) { 79 | if ( m_quals[i] != other.m_quals[i] ) 80 | return false; 81 | } 82 | 83 | return true; 84 | } 85 | 86 | /** 87 | * @brief check whether this object does not contain the same base qualities as another BaseQuals object 88 | */ 89 | bool BaseQuals::operator!=(const BaseQuals& other) const { 90 | return !(*this == other); 91 | } 92 | 93 | /** 94 | * @brief produce a string representation of the base qualities in this object 95 | */ 96 | std::string BaseQuals::to_string() const { 97 | stringstream stream; 98 | 99 | for ( auto i = 0u; i < m_num_quals; ++i ) { 100 | stream << int(m_quals[i]); 101 | if ( i < m_num_quals - 1 ) 102 | stream << " "; 103 | } 104 | return stream.str(); 105 | } 106 | 107 | } // end of namespace gamgee 108 | -------------------------------------------------------------------------------- /testdata/test_picard.interval_list: 
-------------------------------------------------------------------------------- 1 | @HD VN:1.0 SO:coordinate 2 | @SQ SN:1 LN:249250621 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:1b22b98cdeb4a9304cb5d48026a85128 SP:Homo Sapiens 3 | @SQ SN:2 LN:243199373 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:a0d9851da00400dec1098a9255ac712e SP:Homo Sapiens 4 | @SQ SN:3 LN:198022430 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:fdfd811849cc2fadebc929bb925902e5 SP:Homo Sapiens 5 | @SQ SN:4 LN:191154276 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:23dccd106897542ad87d2765d28a19a1 SP:Homo Sapiens 6 | @SQ SN:5 LN:180915260 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:0740173db9ffd264d728f32784845cd7 SP:Homo Sapiens 7 | @SQ SN:6 LN:171115067 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:1d3a93a248d92a729ee764823acbbc6b SP:Homo Sapiens 8 | @SQ SN:7 LN:159138663 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:618366e953d6aaad97dbe4777c29375e SP:Homo Sapiens 9 | @SQ SN:8 LN:146364022 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:96f514a9929e410c6651697bded59aec SP:Homo Sapiens 10 | @SQ SN:9 LN:141213431 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:3e273117f15e0a400f01055d9f393768 SP:Homo Sapiens 11 | @SQ SN:10 LN:135534747 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:988c28e000e84c26d552359af1ea2e1d SP:Homo Sapiens 12 | @SQ SN:11 LN:135006516 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 
M5:98c59049a2df285c76ffb1c6db8f8b96 SP:Homo Sapiens 13 | @SQ SN:12 LN:133851895 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:51851ac0e1a115847ad36449b0015864 SP:Homo Sapiens 14 | @SQ SN:13 LN:115169878 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:283f8d7892baa81b510a015719ca7b0b SP:Homo Sapiens 15 | @SQ SN:14 LN:107349540 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:98f3cae32b2a2e9524bc19813927542e SP:Homo Sapiens 16 | @SQ SN:15 LN:102531392 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:e5645a794a8238215b2cd77acb95a078 SP:Homo Sapiens 17 | @SQ SN:16 LN:90354753 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:fc9b1a7b42b97a864f56b348b06095e6 SP:Homo Sapiens 18 | @SQ SN:17 LN:81195210 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:351f64d4f4f9ddd45b35336ad97aa6de SP:Homo Sapiens 19 | @SQ SN:18 LN:78077248 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:b15d4b2d29dde9d3e4f93d1d0f2cbc9c SP:Homo Sapiens 20 | @SQ SN:19 LN:59128983 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:1aacd71f30db8e561810913e0b72636d SP:Homo Sapiens 21 | 20 132342 4832374 + 22 | 20 132342 4832374 + 23 | 20 132342 4832374 + 24 | 20 132342 4832374 + 25 | 20 132342 4832374 + 26 | 20 132342 4832374 + 27 | 20 132342 4832374 + 28 | 20 132342 4832374 + 29 | 20 132342 4832374 + 30 | 20 132342 4832374 + 31 | 20 132342 4832374 + 32 | 20 132342 4832374 + 33 | 20 132342 4832374 + 34 | 20 132342 4832374 + 35 | 20 132342 4832374 + 36 | 20 132342 4832374 + 37 | 20 132342 4832374 + 38 | 20 132342 4832374 + 39 | -------------------------------------------------------------------------------- 
/gamgee/sam/sam_pair_iterator.cpp: -------------------------------------------------------------------------------- 1 | #include "sam_pair_iterator.h" 2 | #include "sam.h" 3 | 4 | #include "../utils/hts_memory.h" 5 | 6 | #include "htslib/sam.h" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | namespace gamgee { 15 | 16 | SamPairIterator::SamPairIterator() : 17 | m_sam_file_ptr {nullptr}, 18 | m_sam_header_ptr {nullptr}, 19 | m_sam_record_ptr1 {nullptr}, 20 | m_sam_record_ptr2 {nullptr} 21 | {} 22 | 23 | SamPairIterator::SamPairIterator(const std::shared_ptr& sam_file_ptr, const std::shared_ptr& sam_header_ptr) : 24 | m_sam_file_ptr {sam_file_ptr}, 25 | m_sam_header_ptr {sam_header_ptr}, 26 | m_sam_record_ptr1 {utils::make_shared_sam(bam_init1())}, ///< important to initialize the record buffer in the constructor so we can reuse it across the iterator 27 | m_sam_record_ptr2 {utils::make_shared_sam(bam_init1())}, ///< important to initialize the record buffer in the constructor so we can reuse it across the iterator 28 | m_sam_records {fetch_next_pair()} ///< important queue must be initialized *before* we call fetch_next_pair. 
Order matters 29 | {} 30 | 31 | pair SamPairIterator::operator*() { /* hands back the cached pair without reading from the file */ 32 | return m_sam_records; 33 | } 34 | 35 | pair SamPairIterator::operator++() { /* replaces the cached pair with the next one and returns it */ 36 | m_sam_records = fetch_next_pair(); 37 | return m_sam_records; 38 | } 39 | 40 | bool SamPairIterator::operator!=(const SamPairIterator& rhs) { 41 | return m_sam_file_ptr != rhs.m_sam_file_ptr; /* only the file pointers are compared; read_sam() nulls ours on a failed read */ 42 | } 43 | 44 | bool SamPairIterator::read_sam(shared_ptr& record_ptr) { /* reads one record into record_ptr; returns false when sam_read1 reports a negative result */ 45 | if (sam_read1(m_sam_file_ptr.get(), m_sam_header_ptr.get(), record_ptr.get()) < 0) { 46 | m_sam_file_ptr = nullptr; /* a failed read clears the file pointer, which is what operator!= looks at */ 47 | return false; 48 | } 49 | return true; 50 | } 51 | 52 | Sam SamPairIterator::make_sam(shared_ptr& record_ptr) { /* builds a Sam from this iterator's header pointer and the given record pointer */ 53 | return Sam {m_sam_header_ptr, record_ptr}; 54 | } 55 | 56 | static bool primary(shared_ptr& record_ptr) { /* true when neither the BAM_FSECONDARY nor the BAM_FSUPPLEMENTARY flag bit is set */ 57 | return !(record_ptr->core.flag & BAM_FSECONDARY) && !(record_ptr->core.flag & BAM_FSUPPLEMENTARY); 58 | } 59 | 60 | Sam SamPairIterator::next_primary_alignment(shared_ptr& record_ptr) { /* queues a deep copy of the current record, then keeps reading and queueing until a primary record is read or a read fails */ 61 | m_supp_alignments.push(utils::make_shared_sam(utils::sam_deep_copy(record_ptr.get()))); 62 | while (read_sam(record_ptr) && !primary(record_ptr)) 63 | m_supp_alignments.push(utils::make_shared_sam(utils::sam_deep_copy(record_ptr.get()))); 64 | return make_sam(record_ptr); 65 | } 66 | 67 | pair SamPairIterator::next_supplementary_alignment() { /* pops the oldest queued record and returns it paired with an empty Sam */ 68 | const auto read = Sam{m_sam_header_ptr, m_supp_alignments.front()}; 69 | m_supp_alignments.pop(); 70 | return make_pair(read, Sam{}); 71 | } 72 | 73 | pair SamPairIterator::fetch_next_pair() { 74 | if (!m_supp_alignments.empty()) // pending supplementary alignments have priority 75 | return next_supplementary_alignment(); 76 | if (!read_sam(m_sam_record_ptr1)) 77 | return make_pair(Sam{}, Sam{}); // we have reached the end of file 78 | const auto read1 = make_sam(m_sam_record_ptr1); 79 | if (!primary(m_sam_record_ptr1) || !read1.paired() || !read_sam(m_sam_record_ptr2)) // unpaired reads go in immediately and by themselves. NOTE(review): when read_sam(m_sam_record_ptr2) is the condition that fails, m_sam_file_ptr is already null by the time (read1, Sam{}) is stored, so an operator!= check against an iterator with a null file pointer would end the loop before this pair is dereferenced -- confirm read1 is not dropped at end of file 80 | return make_pair(read1, Sam{}); 81 | if 
(primary(m_sam_record_ptr2)) // proper paired alignments return here 82 | return make_pair(read1, make_sam(m_sam_record_ptr2)); 83 | return make_pair(read1, next_primary_alignment(m_sam_record_ptr2)); // still haven't found the second primary alignment so search for it while pushing all the secondary/supplementary alignments to the queue 84 | } 85 | 86 | } 87 | 88 | -------------------------------------------------------------------------------- /test/fastq_test.cpp: -------------------------------------------------------------------------------- 1 | #include "fastq.h" 2 | #include "fastq_reader.h" 3 | #include "test_utils.h" 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | using namespace gamgee; 13 | using boost::test_tools::output_test_stream; 14 | 15 | void check_fastq_output_with_file(const string& input, const string& truth) { 16 | output_test_stream fq_file{truth}; 17 | for (const auto& fq : FastqReader{input}) 18 | fq_file << fq; 19 | BOOST_CHECK(fq_file.match_pattern()); 20 | } 21 | 22 | void check_fastq_output(const string& name, const string& comment, const string& seq, const string& qual = "") { 23 | output_test_stream fq_output; 24 | auto fq = Fastq{name, comment, seq, qual}; 25 | fq_output << fq; 26 | const auto fasta_truth = ">" + name + " " + comment + "\n" + seq + "\n"; 27 | const auto fastq_truth = "@" + name + " " + comment + "\n" + seq + "\n+\n" + qual + "\n" ; 28 | const auto truth = qual.empty() ? 
move(fasta_truth) : move(fastq_truth); 29 | BOOST_CHECK(fq_output.is_equal(truth)); 30 | } 31 | 32 | void check_fastq_fields(const Fastq& record, const string& name, const string& comment, const string& sequence, const string& quals) { 33 | BOOST_CHECK_EQUAL(record.name() , name); 34 | BOOST_CHECK_EQUAL(record.comment() , comment); 35 | BOOST_CHECK_EQUAL(record.sequence(), sequence); 36 | BOOST_CHECK_EQUAL(record.quals() , quals); 37 | } 38 | 39 | BOOST_AUTO_TEST_CASE( fastq_chop_test ) 40 | { 41 | const auto name = string{"test"}; 42 | const auto comment = string{"comm"}; 43 | const auto seq = string{"ACGTACGTACGT"}; 44 | const auto qual = string{"@#@$%$#@#$@$$#@!"}; 45 | auto record = Fastq{name, comment, seq, qual}; 46 | for (auto i = 0u; i != seq.length(); ++i) { 47 | record.chop(i); 48 | check_fastq_fields(record, name, comment, seq.substr(i), qual.substr(i)); 49 | record = Fastq{name, comment, seq, qual}; 50 | } 51 | } 52 | 53 | BOOST_AUTO_TEST_CASE( fastq_reverse_complement_test ) 54 | { 55 | const auto name = string{"test"}; 56 | const auto comment = string{"comm"}; 57 | const auto seq = string{"TTGATCTCCGAT"}; 58 | const auto qual = string{"@#@$%$#@#$@$$#@!"}; 59 | const auto rev = string{"ATCGGAGATCAA"}; 60 | auto record = Fastq{name, comment, seq, qual}; // check that reversing only reverses the sequence (correctly) 61 | record.reverse_complement(); 62 | check_fastq_fields(record, name, comment, rev, qual); 63 | record.reverse_complement(); // re-reverse should go back to original 64 | BOOST_CHECK_EQUAL(record.sequence(), seq); 65 | } 66 | 67 | BOOST_AUTO_TEST_CASE( fastq_output ) 68 | { 69 | check_fastq_output("name", "comment", "ACAGAC", "!!$%#"); 70 | check_fastq_output("name", "comment", "ACAGAC"); 71 | check_fastq_output_with_file("testdata/test_clean.fq", "testdata/test_clean.fq"); // read a clean fastq 72 | check_fastq_output_with_file("testdata/complete_same_seq.fq", "testdata/test_clean.fq"); // read a dirty fastq 73 | 
check_fastq_output_with_file("testdata/complete_same_seq.fa", "testdata/complete_same_seq.fa"); // read a clean fasta 74 | } 75 | 76 | BOOST_AUTO_TEST_CASE( fastq_copy_and_move_constructor ) { 77 | auto it = FastqReader{"testdata/complete_same_seq.fa"}.begin(); 78 | auto c0 = *it; 79 | auto copies = check_copy_constructor(c0); 80 | auto c1 = get<0>(copies); 81 | auto c2 = get<1>(copies); 82 | auto c3 = get<2>(copies); 83 | BOOST_CHECK(c0 == c1); 84 | BOOST_CHECK(c0 == c2); 85 | BOOST_CHECK(c0 == c3); 86 | c1.set_name("modified"); 87 | BOOST_CHECK(c1 != c0); 88 | BOOST_CHECK(c1 != c2); 89 | auto m0 = *it; 90 | auto m1 = check_move_constructor(m0); 91 | auto m2 = *it; 92 | BOOST_CHECK(m1 == m2); 93 | } 94 | -------------------------------------------------------------------------------- /gamgee/variant/variant_header_builder.cpp: -------------------------------------------------------------------------------- 1 | #include "variant_header_builder.h" 2 | 3 | #include "../utils/hts_memory.h" 4 | #include "../utils/variant_utils.h" 5 | 6 | #include "htslib/vcf.h" 7 | 8 | #include 9 | #include 10 | 11 | namespace gamgee { 12 | 13 | using namespace std; 14 | 15 | static inline string required_parameter(const string& prefix, const string& parameter) { 16 | return string{","}.append(prefix).append(parameter); 17 | } 18 | 19 | static inline string optional_parameter(const string& prefix, const string& parameter) { 20 | return parameter.empty() ? 
"" : required_parameter(prefix, parameter); 21 | } 22 | 23 | VariantHeaderBuilder::VariantHeaderBuilder() noexcept : 24 | m_header {bcf_hdr_init("w"), utils::VariantHeaderDeleter()} 25 | {} 26 | 27 | VariantHeaderBuilder::VariantHeaderBuilder(const VariantHeader& header) : 28 | m_header {utils::make_shared_variant_header(utils::variant_header_deep_copy(header.m_header.get()))} 29 | {} 30 | 31 | VariantHeaderBuilder& VariantHeaderBuilder::add_chromosome(const string& id, const string& length, const string& url, const string& extra) { 32 | auto s = string{"##contig="); 37 | bcf_hdr_append(m_header.get(), s.c_str()); 38 | return *this; 39 | } 40 | 41 | VariantHeaderBuilder& VariantHeaderBuilder::add_filter(const string& id, const string& description, const string& extra) { 42 | auto s = string{"##FILTER="); 46 | bcf_hdr_append(m_header.get(), s.c_str()); 47 | return *this; 48 | } 49 | 50 | VariantHeaderBuilder& VariantHeaderBuilder::add_shared_field(const string& id, const string& number, const string& type, const string& description, const string& source, const string& version, const string& extra) { 51 | auto s = string{"##INFO="); 59 | bcf_hdr_append(m_header.get(), s.c_str()); 60 | return *this; 61 | } 62 | 63 | VariantHeaderBuilder& VariantHeaderBuilder::add_individual_field(const string& id, const string& number, const string& type, const string& description, const string& extra) { 64 | auto s = string{"##FORMAT="); 70 | bcf_hdr_append(m_header.get(), s.c_str()); 71 | return *this; 72 | } 73 | 74 | VariantHeaderBuilder& VariantHeaderBuilder::add_source(const string& source) { // appends a "##source=" meta-information line carrying `source` to the header; returns *this so calls can be chained 75 | auto s = string{"##source="}.append(source); // was "##FORMAT=", which declared a FORMAT field instead of recording the source 76 | bcf_hdr_append(m_header.get(), s.c_str()); 77 | return *this; 78 | } 79 | 80 | VariantHeaderBuilder& VariantHeaderBuilder::add_sample(const string& sample) { 81 | bcf_hdr_add_sample(m_header.get(), sample.c_str()); 82 | return *this; 83 | } 84 | 85 | VariantHeaderBuilder& VariantHeaderBuilder::advanced_add_arbitrary_line(const std::string& line) { 86 
| bcf_hdr_append(m_header.get(), line.c_str()); 87 | return *this; 88 | } 89 | 90 | VariantHeaderBuilder& VariantHeaderBuilder::merge(const VariantHeader& other_header) { 91 | merge_variant_headers(m_header, other_header.m_header); 92 | return *this; 93 | } 94 | 95 | 96 | } // end of namespace 97 | -------------------------------------------------------------------------------- /gamgee/variant/reference_block_splitting_variant_iterator.h: -------------------------------------------------------------------------------- 1 | #ifndef __gamgee__reference_block_splitting_variant_iterator__ 2 | #define __gamgee__reference_block_splitting_variant_iterator__ 3 | 4 | #include "variant.h" 5 | #include "multiple_variant_iterator.h" 6 | 7 | #include 8 | 9 | namespace gamgee { 10 | 11 | /** 12 | * @brief Utility class to handle reference blocks while iterating over multiple variant files 13 | * 14 | * @warn This class is experimental/WIP 15 | */ 16 | class ReferenceBlockSplittingVariantIterator : public MultipleVariantIterator { 17 | public: 18 | 19 | /** 20 | * @brief creates an empty iterator (used for the end() method) 21 | */ 22 | ReferenceBlockSplittingVariantIterator() = default; 23 | 24 | /** 25 | * @brief initializes a new iterator based on a vector of input files (vcf or bcf) 26 | * 27 | * @param variant_files vector of vcf/bcf files opened via the bcf_open() macro from htslib 28 | * @param variant_headers vector of variant headers corresponding to these files 29 | */ 30 | ReferenceBlockSplittingVariantIterator(const std::vector>& variant_files, const std::vector>& variant_headers); 31 | 32 | /** 33 | * @brief a ReferenceBlockSplittingVariantIterator move constructor guarantees all objects will have the same state. 
34 | */ 35 | ReferenceBlockSplittingVariantIterator(ReferenceBlockSplittingVariantIterator&&) = default; 36 | ReferenceBlockSplittingVariantIterator& operator=(ReferenceBlockSplittingVariantIterator&&) = default; 37 | 38 | /** 39 | * @brief a ReferenceBlockSplittingVariantIterator cannot be copied. 40 | */ 41 | ReferenceBlockSplittingVariantIterator(const ReferenceBlockSplittingVariantIterator&) = delete; 42 | ReferenceBlockSplittingVariantIterator& operator=(const ReferenceBlockSplittingVariantIterator&) = delete; 43 | 44 | /** 45 | * @brief pseudo-inequality operator (needed by for-each loop) 46 | * 47 | * @warning this method does the minimal work necessary to determine that we have reached the end of iteration. 48 | * it is NOT a valid general-purpose inequality method. 49 | * 50 | * @param rhs the other ReferenceBlockSplittingVariantIterator to compare to 51 | * 52 | * @return whether both iterators have entered their end states 53 | */ 54 | bool operator!=(const ReferenceBlockSplittingVariantIterator& rhs); 55 | 56 | /** 57 | * @brief dereference operator (needed by for-each loop) 58 | * 59 | * @return a reference to the iterator's Variant vector 60 | */ 61 | std::vector& operator*(); 62 | 63 | /** 64 | * @brief advances the iterator, fetching the next vector 65 | * 66 | * @return a reference to the iterator's Variant vector 67 | */ 68 | std::vector& operator++(); 69 | 70 | private: 71 | // fetches the next reference-block-split Variant vector 72 | // calls populate_pending() and populate_split_variants() as needed 73 | void fetch_next_split_vector(); 74 | 75 | // populates the list of pending variants from the incoming vector of pre-split reference-block variants 76 | inline void populate_pending(); 77 | 78 | // populates the vector of split variants from the list of pending variants, modifying the pending list as well 79 | inline void populate_split_variants(); 80 | 81 | // holds the incoming reference-block variants before and during split operations 82 | 
std::vector m_pending_variants; 83 | 84 | // caches next reference-block-split Variant vector 85 | std::vector m_split_variants; 86 | 87 | unsigned int m_pending_chrom = UINT_MAX; 88 | unsigned int m_pending_start = UINT_MAX; 89 | unsigned int m_pending_min_end = UINT_MAX; 90 | }; 91 | 92 | } // end namespace gamgee 93 | 94 | #endif // __gamgee__reference_block_splitting_variant_iterator__ 95 | -------------------------------------------------------------------------------- /test/reference_test.cpp: -------------------------------------------------------------------------------- 1 | #include "reference_map.h" 2 | #include "reference_iterator.h" 3 | 4 | #include "utils/utils.h" 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using namespace gamgee; 14 | 15 | // truth data for test reference (if file changes, this has to change too) 16 | 17 | const auto FILE1 = string{"testdata/test_reference.fa"}; 18 | const auto CHROMOSOMES1 = vector{ 19 | "chrA", "chrB", "chrC", "chrD", "chrE", "chrF", "chrG", "chrH", 20 | "chrI", "chrJ", "chrK", "chrL", "chrM", "chrN", "chrO", "chrQ", "chrR", "chrS", "chrT", "chrU" 21 | }; 22 | const auto SEQ1 = string{"AGGGTAGAGAGATAGAGATCCCCCCCCCCAGTACCNNNNAGTT"}; 23 | 24 | const auto FILE2 = string{"testdata/test_reference2.fa"}; 25 | const auto CHROMOSOMES2 = vector{ "chr1", "chr2" }; 26 | const auto CHR1_SEQ = string{"AGGGATCCCCCCCCCCAGTACCNNNNAGTT"}; 27 | const auto CHR2_SEQ = string{"NNNNNNAGGGATCCCNCCCCCCCAGTACCNNNNAGTT"}; 28 | 29 | BOOST_AUTO_TEST_CASE( reference_map_constructor_test ) 30 | { 31 | auto ref1 = ReferenceMap {FILE1}; 32 | for (const auto& chr_seq : ref1) { 33 | BOOST_CHECK_EQUAL(chr_seq.first.substr(0,3), "chr"); 34 | BOOST_CHECK_EQUAL(chr_seq.second, SEQ1); 35 | } 36 | 37 | auto ref2 = ReferenceMap {FILE2}; 38 | BOOST_CHECK_EQUAL(ref2["chr1"], CHR1_SEQ); 39 | BOOST_CHECK_EQUAL(ref2["chr2"], CHR2_SEQ); 40 | // verify that it can access in arbitrary order 41 | 
BOOST_CHECK_EQUAL(ref2["chr1"], CHR1_SEQ); 42 | } 43 | 44 | BOOST_AUTO_TEST_CASE( reference_map_get_sequence_test ) 45 | { 46 | auto reference_map1 = ReferenceMap{FILE1}; 47 | for (auto start = 1u; start != SEQ1.length(); ++start) { 48 | for (auto len = 1u; len <= SEQ1.length() - start; ++len) { 49 | const auto interval = Interval{"chrA", start, start+len-1}; 50 | BOOST_CHECK_EQUAL(reference_map1.get_sequence(interval), SEQ1.substr(start-1, len)); 51 | BOOST_CHECK_EQUAL(reference_map1.get_sequence(interval, true), gamgee::utils::complement(SEQ1.substr(start-1, len))); 52 | } 53 | } 54 | 55 | auto reference_map2 = ReferenceMap{FILE2}; 56 | for (auto start = 1u; start != CHR1_SEQ.length(); ++start) { 57 | for (auto len = 1u; len <= CHR1_SEQ.length() - start; ++len) { 58 | const auto interval = Interval{"chr1", start, start+len-1}; 59 | BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval), CHR1_SEQ.substr(start-1, len)); 60 | BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval, true), gamgee::utils::complement(CHR1_SEQ.substr(start-1, len))); 61 | } 62 | } 63 | for (auto start = 1u; start != CHR2_SEQ.length(); ++start) { 64 | for (auto len = 1u; len <= CHR2_SEQ.length() - start; ++len) { 65 | const auto interval = Interval{"chr2", start, start+len-1}; 66 | BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval), CHR2_SEQ.substr(start-1, len)); 67 | BOOST_CHECK_EQUAL(reference_map2.get_sequence(interval, true), gamgee::utils::complement(CHR2_SEQ.substr(start-1, len))); 68 | } 69 | } 70 | } 71 | 72 | BOOST_AUTO_TEST_CASE( reference_iterator_test ) { 73 | auto reference1 = ReferenceIterator{FILE1}; 74 | for (const auto chr : CHROMOSOMES1) { 75 | for (auto counter = 1u; counter <= SEQ1.size(); counter++) { 76 | const char truth = SEQ1[counter - 1]; 77 | BOOST_CHECK(truth == reference1.ref_base(chr, counter)); 78 | } 79 | } 80 | 81 | auto reference2 = ReferenceIterator{FILE2}; 82 | for (auto counter = 1u; counter <= CHR1_SEQ.size(); counter++) { 83 | const char truth 
= CHR1_SEQ[counter - 1]; 84 | BOOST_CHECK(truth == reference2.ref_base("chr1", counter)); 85 | } 86 | for (auto counter = 1u; counter <= CHR2_SEQ.size(); counter++) { 87 | const char truth = CHR2_SEQ[counter - 1]; 88 | BOOST_CHECK(truth == reference2.ref_base("chr2", counter)); 89 | } 90 | } 91 | 92 | -------------------------------------------------------------------------------- /gamgee/utils/merged_vcf_lut.cpp: -------------------------------------------------------------------------------- 1 | #include "merged_vcf_lut.h" 2 | 3 | using namespace std; 4 | namespace gamgee 5 | { 6 | namespace utils 7 | { 8 | template 9 | MergedVCFLUTBase::MergedVCFLUTBase() 10 | { 11 | m_num_input_vcfs = 0u; 12 | m_num_merged_fields = 0u; 13 | clear(); 14 | } 15 | 16 | template 17 | MergedVCFLUTBase::MergedVCFLUTBase(unsigned numInputGVCFs, unsigned numMergedFields) 18 | { 19 | m_num_input_vcfs = numInputGVCFs; 20 | m_num_merged_fields = numMergedFields; 21 | clear(); 22 | resize_inputs_2_merged_lut_if_needed(numInputGVCFs, numMergedFields); 23 | resize_merged_2_inputs_lut_if_needed(numInputGVCFs, numMergedFields); 24 | } 25 | 26 | template 27 | void MergedVCFLUTBase::clear() 28 | { 29 | for(auto& vec : m_inputs_2_merged_lut) 30 | vec.clear(); 31 | m_inputs_2_merged_lut.clear(); 32 | for(auto& vec : m_merged_2_inputs_lut) 33 | vec.clear(); 34 | m_merged_2_inputs_lut.clear(); 35 | } 36 | 37 | template 38 | void MergedVCFLUTBase::reset_vector(vector& vec, unsigned from) 39 | { 40 | for(auto i=from;i 45 | void MergedVCFLUTBase::resize_and_reset_vector(vector& vec, unsigned new_size) 46 | { 47 | auto old_size = vec.size(); 48 | if(new_size > old_size) 49 | { 50 | vec.resize(new_size); 51 | reset_vector(vec, old_size); 52 | } 53 | } 54 | 55 | template 56 | void MergedVCFLUTBase::resize_and_reset_lut 57 | (vector>& lut, unsigned new_lut_size, unsigned new_vector_size, unsigned& numRowsVar, unsigned& numColsVar) 58 | { 59 | auto old_lut_size = lut.size(); 60 | if(new_lut_size > 
old_lut_size) 61 | { 62 | lut.resize(new_lut_size); 63 | numRowsVar = new_lut_size; 64 | } 65 | auto old_vector_size = (lut.size() > 0u) ? lut[0].size() : 0u; 66 | //Begin resizing of vectors at start_idx 67 | auto start_idx = old_lut_size; 68 | if(new_vector_size > old_vector_size) //every vector needs to be resized 69 | { 70 | start_idx = 0u; 71 | numColsVar = new_vector_size; 72 | } 73 | else 74 | new_vector_size = old_vector_size; //new vector size is smaller, don't bother reducing the size of existing rows 75 | for(auto i=start_idx;i; 80 | template class MergedVCFLUTBase; 81 | template class MergedVCFLUTBase; 82 | template class MergedVCFLUTBase; 83 | 84 | //explicit initialization to avoid link errors 85 | template class MergedVCFAllelesIdxLUT; 86 | } 87 | } 88 | --------------------------------------------------------------------------------