├── test ├── truths │ ├── deletion.fasta │ ├── deletionfuzzy.fasta │ ├── n_after_gap.fasta │ ├── n_before_gap.fasta │ ├── n_in_stretch.fasta │ ├── insertion.fasta │ ├── sSNP.fasta │ ├── insertion_before_SNP.fasta │ ├── multiSNP.fasta │ ├── truth_snp.vcf │ ├── multiSNP.vcf │ └── insert_ref10K.fasta ├── full_test │ ├── gold_bed.info.txt │ ├── gold.bed │ ├── gold_bed.breakpoints │ ├── gold_bed.insertions.fasta │ ├── variants.txt │ ├── gold_bed.othervariants.vcf │ ├── gold.breakpoints │ ├── gold_bed.insertions.vcf │ ├── gold_fill.output │ ├── gold.insertions.fasta │ ├── gold_find.output │ ├── gold.othervariants.vcf │ ├── gold.insertions.vcf │ ├── reference.fasta │ ├── allele1.fasta │ └── allele2.fasta ├── references │ ├── deleted.fasta │ ├── master.fasta │ ├── sSNP.fasta │ ├── n_before_gap.fasta │ ├── n_in_stretch.fasta │ ├── deleted_before_SNP.fasta │ ├── n_after_gap.fasta │ ├── beginfuzzySNP.fasta │ ├── endfuzzySNP.fasta │ ├── deletionfuzzy.fasta │ └── multiSNP.fasta ├── contig_test │ ├── gold.info.txt │ ├── gold_seed_dictionary.fasta │ ├── gold.log │ ├── README │ └── genome-variant.fasta ├── compare_vcf.sh ├── scripts │ └── generate_read.py ├── simple_test.sh ├── vde.py └── simple_full_test.sh ├── .gitignore ├── data ├── regions.bed ├── contig-reads.fasta.gz ├── contigs.fasta └── reference.fasta ├── .gitmodules ├── scripts ├── jenkins │ ├── README │ ├── tool-mindthegap-build-macos-10.9.5-gcc-4.2.1.sh │ ├── tool-mindthegap-build-debian7-64bits-gcc-4.7.sh │ └── tool-mindthegap-release-debian.sh ├── python3 │ ├── README.txt │ └── Inser_snp_in_ref.py └── script_human_analysis │ ├── README.txt │ └── Inser_snp_in_ref.py ├── container ├── README.txt ├── Dockerfile └── MTG_recipes ├── INSTALL ├── .travis.yml ├── docker └── Dockerfile ├── src ├── nwAlign │ └── nwalign.cpp ├── GraphOutputDot.hpp ├── FindBackup.hpp ├── GraphAnalysis.hpp ├── CircularBuffer.hpp ├── Utils.hpp ├── IFindObserver.hpp ├── Finder.hpp ├── main.cpp ├── IGraphOutput.hpp ├── GraphOutputDot.cpp ├── FindInsertion.hpp ├── FindDeletion.hpp ├── IGraphOutput.cpp └── Utils.cpp ├── CMakeLists.txt ├── doc └── MindTheGap_assembly.md └── CHANGELOG.md /test/truths/deletion.fasta: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/truths/deletionfuzzy.fasta: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/truths/n_after_gap.fasta: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/truths/n_before_gap.fasta: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/truths/n_in_stretch.fasta: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build* 2 | test/output 3 | .vscode/settings.json 4 | *.pyc -------------------------------------------------------------------------------- /data/regions.bed: -------------------------------------------------------------------------------- 1 | Seq2 600 1000 2 | Seq4 20 85 3 | Seq4 150 400 4 | Seq4 450 700 5 | -------------------------------------------------------------------------------- /test/truths/insertion.fasta: -------------------------------------------------------------------------------- 1 | >deletion_1 : master n=200_52 2 | TCGGATGGAGGCAAACGCAG 3 | -------------------------------------------------------------------------------- /test/truths/sSNP.fasta: -------------------------------------------------------------------------------- 1 | > insertion 0 ( len= 1 ) for breakpoint "todo add header here" 2 | T 3 | -------------------------------------------------------------------------------- /data/contig-reads.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATB/MindTheGap/HEAD/data/contig-reads.fasta.gz -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/gatb-core"] 2 | path = thirdparty/gatb-core 3 | url = https://github.com/GATB/gatb-core.git 4 | -------------------------------------------------------------------------------- /test/truths/insertion_before_SNP.fasta: -------------------------------------------------------------------------------- 1 | > insertion 0 ( len= 20 ) for breakpoint "todo add header here" 2 | AAGACTTGAGAGTCTATCAC 3 | -------------------------------------------------------------------------------- /test/truths/multiSNP.fasta: -------------------------------------------------------------------------------- 1 | > insertion 0 ( len= 1 ) for breakpoint "todo add header here" 2 | A 3 | > insertion 0 ( len= 1 ) for breakpoint "todo add header here" 4 | A 5 | -------------------------------------------------------------------------------- /test/full_test/gold_bed.info.txt: -------------------------------------------------------------------------------- 1 | bkpt2_Seq0_pos_123_fuzzy_0_HET 5 1776 2 1 1 2 | bkpt3_Seq1_pos_342_fuzzy_0_HET 4 1318 2 1 1 3 | bkpt4_Seq2_pos_535_fuzzy_0_HOM 1 823 1 1 1 4 | -------------------------------------------------------------------------------- /test/references/deleted.fasta: -------------------------------------------------------------------------------- 1 | >deleted n=180 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACAACAATGGTTACTTTTTCGATACGTGAAACATGTCCCACGGTAGCCCAAAGACTTGAGAGTCTATCACCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG 3 | -------------------------------------------------------------------------------- /test/references/master.fasta: -------------------------------------------------------------------------------- 1 | >master n=200 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTTCGATACGTGAAACATGTCCCACGGTAGCCCAAAGACTTGAGAGTCTATCACCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG 3 | -------------------------------------------------------------------------------- /test/references/sSNP.fasta: -------------------------------------------------------------------------------- 1 | >sSNP pos 89 T->A n=200 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTACGATACGTGAAACATGTCCCACGGTAGCCCAAAGACTTGAGAGTCTATCACCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG -------------------------------------------------------------------------------- /scripts/jenkins/README: -------------------------------------------------------------------------------- 1 | These scripts are intended to be used with the Jenkins CI Platform available at Inria. 2 | 3 | They can be called from a Jenkins Task / Build / Execute script, as follows: 4 | 5 | /bin/bash -xv gatb-${TOOL_NAME}/scripts/jenkins/xxx.sh 6 | 7 | -------------------------------------------------------------------------------- /test/references/n_before_gap.fasta: -------------------------------------------------------------------------------- 1 | >deleted at pos 120, N at pos 89 T in N n=180 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTNCGATACGTGAAACATGTCCCACGGTAGCCCCCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG -------------------------------------------------------------------------------- /test/references/n_in_stretch.fasta: -------------------------------------------------------------------------------- 1 | >deleted at pos 120, N at pos 88 T in N n=180 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTNTCGATACGTGAAACATGTCCCACGGTAGCCCCCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG -------------------------------------------------------------------------------- /test/references/deleted_before_SNP.fasta: -------------------------------------------------------------------------------- 1 | >deleted at 120-140 SNP at 100 A->G n=180 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTTCGATACGTGAAGCATGTCCCACGGTAGCCCACCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG 3 | -------------------------------------------------------------------------------- /test/references/n_after_gap.fasta: -------------------------------------------------------------------------------- 1 | >deleted at pos 100, N at pos 130 C in N n=200 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTTCGATACGTGAAAAGACTTGAGAGTCTATCACCCCTAGGGCNCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG 3 | -------------------------------------------------------------------------------- /test/full_test/gold.bed: -------------------------------------------------------------------------------- 1 | Seq2 500 600 HOM clean 2 | Seq0 60 140 SNP T -> C 3 | Seq0 90 150 HET clean after SNP 4 | Seq0 200 450 DEL end=410 clean 5 | Seq1 300 400 HET clean 6 | Seq1 700 847 DEL end=847 fuzzy=2 #not found cause bed don't scan all the deletion sequence 7 | 8 | -------------------------------------------------------------------------------- /test/references/beginfuzzySNP.fasta: -------------------------------------------------------------------------------- 1 | >fuzzySNP pos 49 A->G n=100+31+69 2 | TGATTATCATGAGTGCCCCGTTATGGTCGTGTTCGATCAGAGCGCTCTTGCGAGCAGTCGTATGCTTTCTCGAATTCCGTGCGGTTAAGCGTGACAGTCCCGTTATGGTCGTGTTCGATCAGAGCGCTCTTGCAGTGAACCCACAAAACGTGATGGCAGTCCATGCGATCATACGCAAGAAGGATGGTCTCCAGACACCGG -------------------------------------------------------------------------------- /test/references/endfuzzySNP.fasta: -------------------------------------------------------------------------------- 1 | >endfuzzySNP pos 49 T->A n = 100+31+69 2 | CCCTAGACAGACAACAGTAAGCGCCTTTTGTAGGCAAGAGCTCCGCCTGAGACTAACTGCGCCAAAACGTCTTCCAATCCCCTTATCCAATTTAACTCACAGACTAACTGCGCCAAAACGTCTTCCAATCCGAATTCTTACAATTTAGACCCTAATATCACATCATTAGACACTAATTGCCTCTGCCAAAATTCTGTCC -------------------------------------------------------------------------------- /test/references/deletionfuzzy.fasta: -------------------------------------------------------------------------------- 1 | >ref_for_deletionfuzzy_n=200_del_between_110_130 2 | CATCGCATCCGGGCGTGCGCTCTATTTGACGATCCCTTGGCGCAGAGGTGCTGGCCACGTGCTAAATTAAAGCGGCTGCACTACTGTAAGGTCCGTCACGGAGACGGCGGAAACTGTTAGGACTTACCATAAATCCCCTATATCGTTCTCGGACGGACAGATTACTAGAGTGCCGCTTTCAGCCCAACTTGGGGTTACC -------------------------------------------------------------------------------- /container/README.txt: -------------------------------------------------------------------------------- 1 | MTG_recipes is a recipe to build MindTheGap master version using singularity. The recipe contains python2 and python3(pyGATB,biopython,pandas and numpy installed) 2 | 3 | To build MindTheGap image use the command: 4 | sudo singularity build Name_my_image.sif MTP_recipes 5 | 6 | -------------------------------------------------------------------------------- /test/references/multiSNP.fasta: -------------------------------------------------------------------------------- 1 | >dSNP pos 88 T->A, 108 C->G, 128 A->G, 288 T->A, 308 C->G, 328 A->G n=200 2 | ACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTACGATACGTGAAACATGTCCGACGGTAGCCCAAAGACTTGGGAGTCTATCACCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGGACGCGCCTGGTATGGCAGGATTAAGAAGCCAATACAAAGGCTACATCCTCACTCGGATGGAGGCAAACGCAGAACAATGGTTACTTTTACGATACGTGAAACATGTCCGACGGTAGCCCAAAGACTTGGGAGTCTATCACCCCTAGGGCCCTTTCCCGGATATAAACGCCAGGTTGAATCCGCATTTGGAGGTACGATGG 3 | -------------------------------------------------------------------------------- /test/contig_test/gold.info.txt: -------------------------------------------------------------------------------- 1 | contig1 5 7627 3 1 1 2 | contig1_Rc 1 55 0 3 | contig3 5 6033 3 1 1 4 | contig3_Rc 1 1611 1 1 1 5 | contig4 5 4998 3 1 1 6 | contig4_Rc 1 2723 1 1 1 7 | contig5 5 4428 2 2 2 8 | contig5_Rc 1 3175 1 1 1 9 | contig6 3 3233 2 2 2 10 | contig6_Rc 4 7062 2 2 2 11 | contig7 1 2258 1 1 1 12 | contig7_Rc 4 7862 3 1 1 13 | contig8 1 1375 1 1 1 14 | contig8_Rc 4 3160 2 2 2 15 | contig9 1 31 0 16 | contig9_Rc 1 31 0 17 | contig10 1 54 0 18 | contig10_Rc 1 31 0 19 | -------------------------------------------------------------------------------- /test/compare_vcf.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | ## Script to compare vcf, in a smarter way than a simple diff : comparing only the fields chrom position ref alt (must be identical) 4 | 5 | ## 2 arguments = 2 vcf files 6 | 7 | vcf1=$1 8 | vcf2=$2 9 | 10 | tmp1=$vcf1.temp 11 | tmp2=$vcf2.temp 12 | 13 | grep -v "^#" $vcf1 | cut -f1,2,4,5 | sort > $tmp1 14 | grep -v "^#" $vcf2 | cut -f1,2,4,5 | sort > $tmp2 15 | 16 | diff $tmp1 $tmp2 17 | 18 | RETVAR=$? 19 | 20 | rm -f $tmp1 21 | rm -f $tmp2 22 | 23 | exit $RETVAR 24 | -------------------------------------------------------------------------------- /test/full_test/gold_bed.breakpoints: -------------------------------------------------------------------------------- 1 | >bkpt2_Seq0_pos_123_fuzzy_0_HET left_kmer 2 | CTCCGGATCTCCGTGTTCTTCGGAAGCTTAG 3 | >bkpt2_Seq0_pos_123_fuzzy_0_HET right_kmer 4 | GTCACGCGCGTCATACTACAGTAAGTTACTG 5 | >bkpt4_Seq1_pos_342_fuzzy_0_HET left_kmer 6 | GCCGCGCAAAGCCGGTCAACAGCGTTAGTAT 7 | >bkpt4_Seq1_pos_342_fuzzy_0_HET right_kmer 8 | GTTGAAAGTTTACTCAGATCGCTTCTGTCGG 9 | >bkpt5_Seq2_pos_535_fuzzy_0_HOM left_kmer 10 | GGCATGCGTAAGTTATCGTGAAACCATGATG 11 | >bkpt5_Seq2_pos_535_fuzzy_0_HOM right_kmer 12 | GCCCCTTACTAGACCAAATGTACTGAATGCG 13 | -------------------------------------------------------------------------------- /test/truths/truth_snp.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##filedate 3 | ##source=MindTheGap find 4 | ##SAMPLE=file:reads/master.fasta 5 | ##REF=file:references/sSNP.fasta 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##FORMAT= 10 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G1 11 | sSNP 89 bkpt1 A T . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 12 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | # CMake is required to compile software (http://www.cmake.org/cmake/resources/software.html) 2 | # you can install software by executing this file: sh INSTALL 3 | # 4 | 5 | # Prepare GATB sub-module 6 | git submodule init 7 | git submodule update 8 | 9 | # Prepare directories: 10 | rm -rf build 11 | mkdir build 12 | 13 | # Go in the 'build' directory 14 | cd build 15 | 16 | # Prepare the makefile 17 | cmake .. 18 | 19 | # Run the newly created makefile: 20 | make 21 | # To compile faster, use: 22 | # make -j8 23 | 24 | # Go back at the installation root directory 25 | cd .. 26 | # run tests 27 | echo "Running simple test..." 28 | cd test 29 | . ./simple_test.sh 30 | cd .. 31 | -------------------------------------------------------------------------------- /test/full_test/gold_bed.insertions.fasta: -------------------------------------------------------------------------------- 1 | >bkpt2_Seq0_pos_123_fuzzy_0_HET_len_137_qual_50_avg_cov_8.38_median_cov_8.00 2 | ATCTAAGCTGTGACCTTGTGGCCGAGGCGCTTTTCACGCCTACATTAACTCCTGGGAAGCTCTCTGCTCTAGTTTCAGTGCACATCTCCAGGTGAGCAACCCTGGCAAGCAGCCCCTTCCTGTAGAAATTACTTAGC 3 | >bkpt3_Seq1_pos_342_fuzzy_0_HET_len_125_qual_50_avg_cov_9.91_median_cov_9.00 4 | ATGGTTTATAGAACCCGGGCGTTCATGTCCGTCAGAACGATCTTGGCACGGTAGCCCCTGGTCCAGAGAGCCAAGGTGACTCAGCCCCACGATGGTGGTCTAGAGCGAAATAACCCTCGCCGAGA 5 | >bkpt4_Seq2_pos_535_fuzzy_0_HOM_len_140_qual_50_avg_cov_21.63_median_cov_22.00 6 | TAACGTTCGCTGAACATCGACTCCGGTGACGACATACGATTCAAGAAGAGAGTGACTCTGTAGGATAACATCCCGCAACGCCTAATCCATCCAGCCTGGCACCATGTATAAAGGGCGTCAGGTATGTTAACGAGACTATT 7 | -------------------------------------------------------------------------------- /test/full_test/variants.txt: -------------------------------------------------------------------------------- 1 | Seq0 100 SNP T -> C 2 | Seq0 122 HET clean after SNP 3 | Seq0 296 DEL end=410 clean # not detected cause 2 consecutive k-mer are present within the deletion 4 | Seq0 815 SNP C -> A 5 | Seq1 205 MSNP G -> C # 2 close snps 6 | Seq1 218 MSNP T -> A # 2 close snps 7 | Seq1 341 HET clean 8 | Seq1 739 DEL end=847 fuzzy=2 9 | Seq2 319 MSNP T -> C # 2 close snp, note : due to bloom-FP can be missed 10 | Seq2 343 MSNP C -> A # 2 close snp 11 | Seq2 378 MSNP G -> C 12 | Seq2 535 HOM clean 13 | Seq2 834 HOM fuzzy=1 14 | Seq3 255 SNP A -> T 15 | Seq3 510 SNP C -> A 16 | Seq3 765 SNP G -> A 17 | Seq3 780 HOM clean after SNP 18 | Seq4 256 SNP C -> T 19 | Seq4 348 HET fuzzy=2 20 | Seq4 511 SNP A -> G 21 | Seq4 600 HOM fuzzy=3 22 | Seq4 820 HOM clean before SNP 23 | Seq4 840 SNP C -> T 24 | Seq4 884 DEL end=928 fuzzy=2 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | os: 3 | - linux 4 | - osx 5 | compiler: 6 | - clang 7 | - gcc 8 | addons: 9 | apt: 10 | sources: 11 | - ubuntu-toolchain-r-test 12 | - llvm-toolchain-precise-3.7 13 | - george-edison55-precise-backports # for cmake 3 14 | packages: 15 | - libcppunit-dev 16 | - g++-4.8 17 | - clang-3.7 18 | - cmake 19 | - cmake-data 20 | install: 21 | - if [ "`echo $CXX`" == "g++" ] && [ "$TRAVIS_OS_NAME" == "linux" ]; then export CXX=g++-4.8; fi 22 | - if [ "`echo $CXX`" == "clang++" ] && [ "$TRAVIS_OS_NAME" == "linux" ]; then export CXX=clang++-3.7; fi 23 | matrix: 24 | exclude: 25 | - os: osx 26 | compiler: gcc 27 | script: 28 | - mkdir build 29 | - cd build 30 | - cmake .. && make 31 | - cd ../test && ./simple_full_test.sh 32 | env: 33 | global: 34 | - MAKEFLAGS="-j 4" 35 | -------------------------------------------------------------------------------- /test/full_test/gold_bed.othervariants.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##filedate=Tue Feb 8 12:46:04 2022 3 | ##source=MindTheGap find version 2.2.3 4 | ##SAMPLE=file:../../data/reads_r1.fastq,../../data/reads_r2.fastq 5 | ##REF=file:../../data/reference.fasta 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##FORMAT= 10 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G1 11 | Seq0 101 bkpt1 T C . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 12 | Seq0 297 bkpt3 CTAGCTTGAGAGTGCGTATCTCACCGATCCCCTGGCTATGCTCCGCGATTCACTAGTAGTTTCACGCCGACAGAGCGAAACCGTGATAGGTCATCATGCCGGTCTGCAGTCACGT C . PASS TYPE=DEL;LEN=114;FUZZY=0 GT 1/1 13 | -------------------------------------------------------------------------------- /test/truths/multiSNP.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##filedate 3 | ##source 4 | ##SAMPLE=file:reads/master.fasta 5 | ##REF=file:references/multiSNP.fasta 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##FORMAT= 10 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G1 11 | dSNP 89 bkpt1 A T . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 12 | dSNP 109 bkpt2 G C . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 13 | dSNP 129 bkpt3 G A . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 14 | dSNP 289 bkpt4 A T . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 15 | dSNP 309 bkpt5 G C . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 16 | dSNP 329 bkpt6 G A . PASS TYPE=SNP;LEN=1;REP=0 GT 1/1 17 | -------------------------------------------------------------------------------- /test/contig_test/gold_seed_dictionary.fasta: -------------------------------------------------------------------------------- 1 | >contig1 2 | CAACGGGGGCTGTGGATACCCCATCTGTACC 3 | >contig1_Rc 4 | AAATGAGACGCACGAATACATTACGCTTCCA 5 | >contig3 6 | TAACCCGAGGGGAATGAGTTATCCTTACTGG 7 | >contig3_Rc 8 | CAGAACGGTTTGATTATCCTCGCTGGTTACG 9 | >contig4 10 | TAGCCCATATTAGTTATAGGATCTGTATTTA 11 | >contig4_Rc 12 | TTGAAAACGTGCTCACAGGAGGCTGCTCTAG 13 | >contig5 14 | GCGGCTCACCAAGTTAATTACTTGCCTTGTC 15 | >contig5_Rc 16 | GCGTGGAGGGGGCGTCCGTAATGCTGGCAGC 17 | >contig6 18 | ATATTCTGAGCTTCCACACCGCGGCATCTGA 19 | >contig6_Rc 20 | GAAGCGATTGTAGACGCGGGCAGCTGACTTC 21 | >contig7 22 | CTAGCATAACGCTTGTTCAACCTCAGGCTGG 23 | >contig7_Rc 24 | ATCCCCATTTGTTCAGAGGCAAATATATGCG 25 | >contig8 26 | ATCGGTGCATCGCTTTCAGGGTAGAACTTGT 27 | >contig8_Rc 28 | CCGGTAGTGCCCGTTTAAGATAAGAACATAA 29 | >contig9 30 | GCTCCGTCCGAGTGTACGTGCCGTCATTTTC 31 | >contig9_Rc 32 | AGTAGGCCATGAAGCGGCTCAATTCCTTGTG 33 | >contig10 34 | GTCATCTCATCTGTAGCTCCTTGCGCTGTAT 35 | >contig10_Rc 36 | GCACAGCTTGGAGGCGATTTCGCTTGCCAGC 37 | -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:wheezy 2 | 3 | MAINTAINER Claire Lemaitre claire.lemaitre@inria.fr 4 | 5 | # Set MindTheGap version 6 | ENV MTG_VERSION 2.1.0 7 | 8 | # Set noninteratve mode 9 | ENV DEBIAN_FRONTEND noninteractive 10 | ENV PACKAGES wget gcc g++ make cmake zlib1g-dev libboost-dev git 11 | 12 | ENV DIR /opt 13 | ENV SOURCE MindTheGap 14 | ENV BUILD build 15 | 16 | WORKDIR ${DIR} 17 | 18 | RUN apt-get update -y && \ 19 | apt-get install -y --no-install-recommends ${PACKAGES} 20 | 21 | RUN git config --global http.sslVerify false 22 | 23 | # clone the github repo 24 | RUN git clone --recursive https://github.com/GATB/MindTheGap.git 25 | 26 | WORKDIR ${DIR}/${SOURCE} 27 | RUN git submodule init 28 | 29 | # Using an official release 30 | RUN git checkout v${MTG_VERSION} 31 | RUN git submodule update 32 | 33 | RUN mkdir ${BUILD} 34 | WORKDIR ${DIR}/${SOURCE}/${BUILD} 35 | 36 | RUN cmake .. 37 | RUN make 38 | 39 | # symlink binary in /usr/local/bin 40 | RUN ln -s ${DIR}/${SOURCE}/${BUILD}/bin/MindTheGap /usr/local/bin 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:wheezy 2 | 3 | MAINTAINER Claire Lemaitre claire.lemaitre@inria.fr 4 | 5 | # Set MindTheGap version 6 | ENV MTG_VERSION 2.1.0 7 | 8 | # Set noninteratve mode 9 | ENV DEBIAN_FRONTEND noninteractive 10 | ENV PACKAGES wget gcc g++ make cmake zlib1g-dev libboost-dev git 11 | 12 | ENV DIR /opt 13 | ENV SOURCE MindTheGap 14 | ENV BUILD build 15 | 16 | WORKDIR ${DIR} 17 | 18 | RUN apt-get update -y && \ 19 | apt-get install -y --no-install-recommends ${PACKAGES} 20 | 21 | RUN git config --global http.sslVerify false 22 | 23 | # clone the github repo 24 | RUN git clone --recursive https://github.com/GATB/MindTheGap.git 25 | 26 | WORKDIR ${DIR}/${SOURCE} 27 | RUN git submodule init 28 | 29 | # Using an official release 30 | RUN git checkout v${MTG_VERSION} 31 | RUN git submodule update 32 | 33 | RUN mkdir ${BUILD} 34 | WORKDIR ${DIR}/${SOURCE}/${BUILD} 35 | 36 | RUN cmake .. 37 | RUN make 38 | 39 | # symlink binary in /usr/local/bin 40 | RUN ln -s ${DIR}/${SOURCE}/${BUILD}/bin/MindTheGap /usr/local/bin 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /container/MTG_recipes: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:16.04 3 | 4 | 5 | %environment 6 | export PATH="$PATH:/opt/MindTheGap/build/ext/gatb-core/bin" 7 | 8 | %post 9 | apt-get -y update && \ 10 | apt-get install -y --no-install-recommends \ 11 | wget \ 12 | gcc \ 13 | g++ \ 14 | make \ 15 | cmake \ 16 | zlib1g-dev \ 17 | libboost-dev \ 18 | bzip2 \ 19 | python2.7 \ 20 | default-jre \ 21 | libbz2-dev \ 22 | liblzma-dev \ 23 | default-jdk \ 24 | python3 \ 25 | python3-pip \ 26 | cython \ 27 | unzip \ 28 | git;\ 29 | git config --global http.sslVerify false;\ 30 | cd /opt;\ 31 | git clone --recursive https://github.com/GATB/MindTheGap.git;\ 32 | cd MindTheGap;\ 33 | git submodule init;\ 34 | git submodule update;\ 35 | mkdir build;\ 36 | cd build;\ 37 | cmake ..;\ 38 | make;\ 39 | cd ..;\ 40 | git config --global http.sslVerify false;\ 41 | pip3 install pyGATB;\ 42 | pip3 install biopython;\ 43 | pip3 install pandas;\ 44 | pip3 install numpy;\ 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /test/full_test/gold.breakpoints: -------------------------------------------------------------------------------- 1 | >bkpt2_Seq0_pos_123_fuzzy_0_HET left_kmer 2 | CTCCGGATCTCCGTGTTCTTCGGAAGCTTAG 3 | >bkpt2_Seq0_pos_123_fuzzy_0_HET right_kmer 4 | GTCACGCGCGTCATACTACAGTAAGTTACTG 5 | >bkpt5_Seq1_pos_342_fuzzy_0_HET left_kmer 6 | GCCGCGCAAAGCCGGTCAACAGCGTTAGTAT 7 | >bkpt5_Seq1_pos_342_fuzzy_0_HET right_kmer 8 | GTTGAAAGTTTACTCAGATCGCTTCTGTCGG 9 | >bkpt9_Seq2_pos_535_fuzzy_0_HOM left_kmer 10 | GGCATGCGTAAGTTATCGTGAAACCATGATG 11 | >bkpt9_Seq2_pos_535_fuzzy_0_HOM right_kmer 12 | GCCCCTTACTAGACCAAATGTACTGAATGCG 13 | >bkpt10_Seq2_pos_835_fuzzy_1_HOM left_kmer 14 | GAGCTACCCGCCCTCGGTGAGAAGGTAGTAT 15 | >bkpt10_Seq2_pos_835_fuzzy_1_HOM right_kmer 16 | ACCCAAACGCGTCCTATGCAGTTTTGGGCTT 17 | >bkpt14_Seq3_pos_781_fuzzy_0_HOM left_kmer 18 | CGGCCCATGGGAACAAGTATCCTTACTTTCG 19 | >bkpt14_Seq3_pos_781_fuzzy_0_HOM right_kmer 20 | GTACAAATGAGGCTCCAAAATAGCACGCTTG 21 | >bkpt16_Seq4_pos_351_fuzzy_2_HET left_kmer 22 | GTAATCGAGATTCTCCACCATAACCTGCGCA 23 | >bkpt16_Seq4_pos_351_fuzzy_2_HET right_kmer 24 | ATGCATCGTGAAGCTTTACCGCGCCCAAGGG 25 | >bkpt18_Seq4_pos_603_fuzzy_3_HOM left_kmer 26 | CTTTTGGGTGCGCAACATTGCTATACTTAGG 27 | >bkpt18_Seq4_pos_603_fuzzy_3_HOM right_kmer 28 | ATCCATTGACATCTGTCAGCCGTCTTTCCAG 29 | >bkpt20_Seq4_pos_821_fuzzy_0_HOM left_kmer 30 | AGCGCGCTAAATTACCGCTACGAGCCATACC 31 | >bkpt20_Seq4_pos_821_fuzzy_0_HOM right_kmer 32 | CCGAACATTGAGACCTGGCTAGTAGGTAGGT 33 | -------------------------------------------------------------------------------- /scripts/python3/README.txt: -------------------------------------------------------------------------------- 1 | Two scripts are available to improve performance of MindTheGap for human genome analysis : 2 | 3 | Inser_snp_in_ref.py : 4 | It allows user to integrate SNP called from GATK HaplotypeCaller in a reference genome. 5 | Three paramaters are required : -s GATK.vcf, -g reference_genome.fa, -o altered_genome.fa 6 | 7 | Context_genome.py : 8 | It allows user to filter potential false positive. 9 | The script will check k-mer connectivity around each breakpoints. 10 | By default, if more than 20% of the last 50 k-mers contain unusual connectivity (number of branching k-mer for a k-mer is greater than 2) the breakpoints is not kept. 11 | Four parameters are required : 12 | -g MindTheGap_file.h5 13 | -p Reference_genome.fa 14 | -b Breakpoint_file.breakpoints 15 | -o Breakpoints_filtered.breakpoints 16 | 17 | Use -m to set a specific threshold of connectivity (0 to 1) 18 | 19 | Example of running pipeline : 20 | python3.5 /MindTheGap/script/python3/Inser_snp_in_ref.py -g genome.fa -s GATKHC.vcf -o altered_genome.fa 21 | /MindTheGap/build/bin/MindTheGap find -ref altered_genome.fa -in part1.fastq.gz,part2.fastq.gz -abundance-min auto -out OUTPUT_FIND 22 | python3.5 /MindTheGap/script/python3/Context_genome_WG.py -g OUTPUT_FIND.h5 -p altered_genome.fa -b OUTPUT_FIND.breakpoints -o OUTPUT_FIND_filter.breakpoints 23 | /MindTheGap/build/bin/MindTheGap fill -graph OUTPUT_FIND.h5 -bkpt OUTPUT_FIND_filter.breakpoints -out OUTPUT_FIND_filter -filter 24 | -------------------------------------------------------------------------------- /src/nwAlign/nwalign.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************* 2 | Minimalist utility to use perform fast Needleman_Wunsch alignment outside of MindTheGap 3 | 4 | Usage : nwalign < infile 5 | Where infile is a two lines file with the two sequences to compare 6 | Outputs identity score in stdout 7 | *********************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | int main (int argc, char* argv[]) 16 | { 17 | // We use a try/catch block since GATB functions may throw exceptions 18 | try 19 | { 20 | int nbLine = 0; 21 | string seq1; 22 | string seq2; 23 | 24 | float score; 25 | 26 | for (std::string line; std::getline(std::cin, line);) { 27 | nbLine += 1; 28 | if (nbLine == 1){ 29 | seq1 = line; 30 | } else if (nbLine == 2){ 31 | seq2 = line; 32 | } else{ 33 | cout << "Only two lines expected" << endl; 34 | break; 35 | } 36 | } 37 | score = needleman_wunsch(seq1,seq2, NULL, NULL, NULL); 38 | cout << score << endl; 39 | return 0; 40 | 41 | } 42 | catch (Exception& e) 43 | { 44 | std::cout << "EXCEPTION: " << e.getMessage() << std::endl; 45 | return EXIT_FAILURE; 46 | } 47 | } -------------------------------------------------------------------------------- /scripts/script_human_analysis/README.txt: -------------------------------------------------------------------------------- 1 | Two scripts are available to improve performance of MindTheGap for human genome analysis : 2 | 3 | Inser_snp_in_ref.py : 4 | It allows user to integrate SNP called from GATK HaplotypeCaller in a reference genome. 5 | Three paramaters are required : -s GATK.vcf, -g reference_genome.fa, -o altered_genome.fa 6 | 7 | Context_genome.py : 8 | It allows user to filter potential false positive. 9 | The script will check k-mer connectivity around each breakpoints. 10 | By default, if more than 20% of the last 50 k-mers contain unusual connectivity (number of branching k-mer for a k-mer is greater than 2) the breakpoints is not kept. 11 | Four parameters are required : 12 | -g MindTheGap_file.h5 13 | -p Reference_genome.fa 14 | -b Breakpoint_file.breakpoints 15 | -o Breakpoints_filtered.breakpoints 16 | 17 | Use -m to set a specific threshold of connectivity (0 to 1) 18 | 19 | Example of running pipeline : 20 | python3.5 /MindTheGap/script/python3/Inser_snp_in_ref.py -g genome.fa -s GATKHC.vcf -o altered_genome.fa 21 | /MindTheGap/build/bin/MindTheGap find -ref altered_genome.fa -in part1.fastq.gz,part2.fastq.gz -abundance-min auto -out OUTPUT_FIND 22 | python3.5 /MindTheGap/script/python3/Context_genome_WG.py -g OUTPUT_FIND.h5 -p altered_genome.fa -b OUTPUT_FIND.breakpoints -o OUTPUT_FIND_filter.breakpoints 23 | /MindTheGap/build/bin/MindTheGap fill -graph OUTPUT_FIND.h5 -bkpt OUTPUT_FIND_filter.breakpoints -out OUTPUT_FIND_filter -filter 24 | -------------------------------------------------------------------------------- /test/full_test/gold_bed.insertions.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##filedate=Thu May 9 11:40:23 2019 3 | ##source=MindTheGap fill version 2.2.0 4 | ##SAMPLE=file:gold_bed.h5 5 | ##REF=file:gold_bed 6 | ##INFO= 7 | ##INFO= 8 | ##INFO=<=QUAL,Number=.,Type=Integer,Description="Quality of the insertion"> 9 | ##INFO=<=AVK,Number=.,Type=Float,Description="Average k-mer coverage along the insertion"> 10 | ##INFO=<=MDK,Number=.,Type=Float,Description="Median k-mer coverage along the insertion"> 11 | ##INFO=<=NSOL,Number=1,Type=String,Description="number of alternative insertion sequences for the breakpoint"> 12 | ##INFO= 13 | ##FORMAT= 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G1 15 | Seq0 123 bkpt2 G GATCTAAGCTGTGACCTTGTGGCCGAGGCGCTTTTCACGCCTACATTAACTCCTGGGAAGCTCTCTGCTCTAGTTTCAGTGCACATCTCCAGGTGAGCAACCCTGGCAAGCAGCCCCTTCCTGTAGAAATTACTTAGC . PASS TYPE=INS;LEN=137;QUAL=50;NSOL=1;NPOS=1;AVK=8.38;MDK=8.00 GT 0/1 16 | Seq1 342 bkpt3 T TATGGTTTATAGAACCCGGGCGTTCATGTCCGTCAGAACGATCTTGGCACGGTAGCCCCTGGTCCAGAGAGCCAAGGTGACTCAGCCCCACGATGGTGGTCTAGAGCGAAATAACCCTCGCCGAGA . PASS TYPE=INS;LEN=125;QUAL=50;NSOL=1;NPOS=1;AVK=9.91;MDK=9.00 GT 0/1 17 | Seq2 535 bkpt4 G GTAACGTTCGCTGAACATCGACTCCGGTGACGACATACGATTCAAGAAGAGAGTGACTCTGTAGGATAACATCCCGCAACGCCTAATCCATCCAGCCTGGCACCATGTATAAAGGGCGTCAGGTATGTTAACGAGACTATT . PASS TYPE=INS;LEN=140;QUAL=50;NSOL=1;NPOS=1;AVK=21.63;MDK=22.00 GT 1/1 18 | -------------------------------------------------------------------------------- /test/full_test/gold_fill.output: -------------------------------------------------------------------------------- 1 | MindTheGap fill 2 | version : 2.2.3 3 | gatb-core-library : 1.4.2 4 | supported_kmer_sizes : 32 64 96 128 5 | Parameters 6 | Input data 7 | Graph : gold.h5 8 | Breakpoints : gold.breakpoints 9 | Graph 10 | kmer-size : 31 11 | abundance_min (auto inferred) : 7 12 | abundance_min (used) : 7 13 | nb_solid_kmers : 7419 14 | nb_branching_nodes : 36 15 | Assembly options 16 | max_depth : 10000 17 | max_nodes : 100 18 | Results 19 | Breakpoints 20 | nb_input_breakpoints : 8 21 | nb_filled_breakpoints : 8 22 | as_unique_sequence : 8 23 | as_multiple_sequence : 0 24 | Time : 1.0 s 25 | Output files 26 | assembled sequence file : gold.insertions.fasta 27 | insertion variant vcf file : gold.insertions.vcf 28 | assembly statistics file : gold.info.txt 29 | -------------------------------------------------------------------------------- /test/full_test/gold.insertions.fasta: -------------------------------------------------------------------------------- 1 | >bkpt2_Seq0_pos_123_fuzzy_0_HET_len_137_qual_50_avg_cov_21.59_median_cov_21.00 2 | ATCTAAGCTGTGACCTTGTGGCCGAGGCGCTTTTCACGCCTACATTAACTCCTGGGAAGCTCTCTGCTCTAGTTTCAGTGCACATCTCCAGGTGAGCAACCCTGGCAAGCAGCCCCTTCCTGTAGAAATTACTTAGC 3 | >bkpt5_Seq1_pos_342_fuzzy_0_HET_len_125_qual_50_avg_cov_25.17_median_cov_24.00 4 | ATGGTTTATAGAACCCGGGCGTTCATGTCCGTCAGAACGATCTTGGCACGGTAGCCCCTGGTCCAGAGAGCCAAGGTGACTCAGCCCCACGATGGTGGTCTAGAGCGAAATAACCCTCGCCGAGA 5 | >bkpt9_Seq2_pos_535_fuzzy_0_HOM_len_140_qual_50_avg_cov_41.66_median_cov_43.00 6 | TAACGTTCGCTGAACATCGACTCCGGTGACGACATACGATTCAAGAAGAGAGTGACTCTGTAGGATAACATCCCGCAACGCCTAATCCATCCAGCCTGGCACCATGTATAAAGGGCGTCAGGTATGTTAACGAGACTATT 7 | >bkpt10_Seq2_pos_835_fuzzy_1_HOM_len_207_qual_50_avg_cov_40.42_median_cov_42.00 8 | GCACGCTGCAGGATTGGAACCACAATGTACGCCGATCCAAGCAGTAGTGGTTCATTGTATAAGTATCCTCCCTTGATTGGTCGAATATTAGGCATGCCCCGGGAGCATGTGGGCTCGAGCCACGGAGAGCAACTAATCGCGCATAAAACAAATACCTCATGGTTTTTGTGCGGAAAACCGTTGGGTGGACCATCAGCGGTTGTGATT 9 | >bkpt14_Seq3_pos_781_fuzzy_0_HOM_len_111_qual_50_avg_cov_50.54_median_cov_53.00 10 | TTTGCAGCACTAGCCGTTCCTTGACATCTGCGGCCAACTTGTGCCTGAACCTGGAGTTTCGACAGCGTGGCGCTCTGGCCTAGTTCTTCGCTGGCACCTGGAAGAGCCGCC 11 | >bkpt16_Seq4_pos_351_fuzzy_2_HET_len_120_qual_50_avg_cov_25.67_median_cov_26.00 12 | GTCTTAACCTTAAGACCGTTCATTGATAAAACTTGCTCACGCTCTAGATGGCGTGAAGCGAAACCTAGGAAAAAGTTTTGCAGATAATTAGATTATGCGCGATACTCCGCCGTGTGTTCA 13 | >bkpt18_Seq4_pos_603_fuzzy_3_HOM_len_57_qual_50_avg_cov_46.66_median_cov_47.00 14 | TGTATTCCTGGGTTGAGTGGCAGGTTTCTCTTAATTCTTCCCTAAGTAGCTCCGAGG 15 | >bkpt20_Seq4_pos_821_fuzzy_0_HOM_len_40_qual_50_avg_cov_37.63_median_cov_38.00 16 | GGATGGCGGCCGGAGAGCGCTGCAATCGCATGGCTCGGGA 17 | -------------------------------------------------------------------------------- /test/contig_test/gold.log: -------------------------------------------------------------------------------- 1 | MindTheGap fill 2 | version : 2.2.0 3 | gatb-core-library : 1.4.1 4 | supported_kmer_sizes : 32 64 96 128 5 | Parameters 6 | Input data 7 | Reads : reads.fasta 8 | Contigs : contigs.fasta 9 | Graph 10 | kmer-size : 31 11 | abundance_min (used) : 3 12 | nb_solid_kmers : 10194 13 | nb_branching_nodes : 46 14 | Assembly options 15 | max_depth : 10000 16 | max_nodes : 100 17 | contig trim size before gap-filling : 31 18 | Results 19 | Contigs 20 | nb_input_contigs : 10 21 | nb_used_contigs : 9 22 | nb_input_seeds : 18 23 | nb_filled_seeds : 13 24 | as_unique_sequence : 9 25 | as_multiple_sequence : 4 26 | Time : 0.0 s 27 | Output files 28 | assembled sequence file : gold.insertions.fasta 29 | assembly graph file : gold.gfa 30 | assembly statistics file : gold.info.txt 31 | -------------------------------------------------------------------------------- /test/contig_test/README: -------------------------------------------------------------------------------- 1 | # Claire Lemaitre 2 | # 06/07/2018 3 | 4 | # Creates a small dataset to test MindTheGap fill -contig 5 | # With : 6 | # - several contigs, with 1 too small one 7 | # - errors in anchors 8 | # - 2 contigs that overlap of 30 nt. 9 | # - 2 structural variants 10 | # - 1 noise contig 11 | 12 | 13 | # 1. Generates a random genome 14 | ~/workspace/divers_scripts/gener_alea 10000 1 15 | mv alea.seq genome.fasta 16 | 17 | # 2. Extract 10 contigs: 18 | - contig1: 0:521 19 | - contig2: 1000:1065 = contig too small 20 | - contig3: 1156:2115 21 | - contig4: 2268:3150 22 | - contig5: 3120:3720 = overlap de 30 nt with last contig 23 | - contig6: 6271:6911 24 | - contig7: 7071:7796 25 | - contig8: 8274:8679 26 | - contig9: noise contig, comes from another random sequence : ~/workspace/divers_scripts/gener_alea 500 1; mv alea.seq contig9.fasta 27 | - contig10: 9486:9999 : with 2 substitutions pos 40 G->C pos 57 T->C 28 | 29 | # 3. creates 2 structural variants : 30 | cp genome.fasta genome-variant.fasta, 2 deletions in genome-variant.fasta : 31 | - deletion that removes contig7 : 6960-8080 --> gap-fill contig6->contig7, contig7->contig8 and contig6->contig8. 32 | - deletion that removes a large portion between contig5 and contig6 : 4000:5600 => 2 alternative gap-fill sequences. 33 | 34 | # 4. generates reads : 35 | 2x30X : N*100/10000 = 30 => N=3000 36 | cp /Users/clemaitr/workspace/mutareads/*.qual . 37 | /Users/clemaitr/workspace/mutareads/mutareads genome.fasta temp1 3000 100 0.01 0 0 38 | /Users/clemaitr/workspace/mutareads/mutareads genome-variant.fasta temp2 3000 100 0.01 0 0 39 | cat temp1.fasta temp2.fasta > reads.fasta 40 | gzip reads.fasta 41 | rm -f *.qual 42 | 43 | # 5. put the data in data dir : 44 | cp reads.fasta.gz ../../data/contig-reads.fasta.gz 45 | cp contigs.fasta ../../data/ 46 | 47 | 48 | # 5. generates gold results : 49 | ../../build/bin/MindTheGap fill -in ../../data/contig-reads.fasta.gz -contig ../../data/contigs.fasta -abundance-min 3 -out gold -nb-cores 1 > gold.log 50 | 51 | # 6. dir cleaning : 52 | rm -f temp*.fasta 53 | rm -f contig9.fasta 54 | 55 | -------------------------------------------------------------------------------- /test/truths/insert_ref10K.fasta: -------------------------------------------------------------------------------- 1 | > insertion ( len= 61 ) for breakpoint "bkpt0_left_kmer_Seq0_pos_1789_repeat_0_HOM" 2 | AGCTCAATCGAAGCTTGTTCTTCTTGTCCACACCTGGTATATAGCGCGGAAAAGCATGACT 3 | > insertion ( len= 34 ) for breakpoint "bkpt1_left_kmer_Seq0_pos_2133_repeat_0_HOM" 4 | GGTTGGGAATGCAAGGTAGCCTGAGCCGACCCTG 5 | > insertion ( len= 49 ) for breakpoint "bkpt2_left_kmer_Seq0_pos_2366_repeat_0_HOM" 6 | CCTTTAAAGTAATTTAAGATGTCCCATAGCAATTATTCCTGATAGATTC 7 | > insertion ( len= 116 ) for breakpoint "bkpt3_left_kmer_Seq0_pos_2852_repeat_1_HOM" 8 | AGTGCTAGAGGCGTGAGACTTAGATATATTGCTCGGATCGATCCTAGAGCTACACCTAATGCCACCAGTCCCGCGGTCAGCAATGATGGAGCATGCTATGTTCGACGGCACTTGAG 9 | > insertion ( len= 28 ) for breakpoint "bkpt4_left_kmer_Seq0_pos_4513_repeat_2_HOM" 10 | AGCTGTATCCCAATGGAGATTCAACGTG 11 | > insertion ( len= 105 ) for breakpoint "bkpt5_left_kmer_Seq0_pos_4616_repeat_1_HOM" 12 | CCTAACACGGTCCAAGGAGCGAGTGGCTTCTGGAAGGCAAAGGTAGCATCATGGCACATTCAACGTGACAAGATATTTTTCTTACCAAAAAATGTGACACCAAAG 13 | > insertion ( len= 27 ) for breakpoint "bkpt6_left_kmer_Seq0_pos_4716_repeat_0_HOM" 14 | CGCCGGCAATCCTTCAGCGATCAGGAA 15 | > insertion ( len= 60 ) for breakpoint "bkpt7_left_kmer_Seq0_pos_5310_repeat_0_HOM" 16 | TATTATAGGCTGAAAGTAGAGAGAGCCGTGTCGGAGTTGTGAGGAAGAAACTATCGGCTG 17 | > insertion ( len= 266 ) for breakpoint "bkpt8_left_kmer_Seq0_pos_6090_repeat_1_HOM" 18 | ATAGGAACGGAGAGTTATAAAGAAGTAATACTCAATCAATTATACACCTTATCCGTCAGAGTGGTTATTAACCGGGCTTCCCAACCCCATTGTGCATTGCCCATTTAAGTTAACTTTACACTACAGGTGGAGCGGGAGGGTACCTGGCGCATTATTGCGGTCAACTTGGAAGTCCTGTATACAGGCATACCAAATGCACCCAGACGATGGTTCCCTCTAAATACTTGTGATGCCCAAAATTGGGTTTTCAGGAGGTTTTCGAATCC 19 | > insertion ( len= 35 ) for breakpoint "bkpt9_left_kmer_Seq0_pos_7160_repeat_2_HOM" 20 | AACTCTGTGATTCTACATTGCAGCTTGGTCACGCA 21 | > insertion ( len= 71 ) for breakpoint "bkpt10_left_kmer_Seq0_pos_7311_repeat_0_HOM" 22 | AATACGGATATGGCCGAGGTTTCAACTGAGACGAAAAACGCCAAATCCTATAACACGCCCGTTTCGTATCG 23 | > insertion ( len= 22 ) for breakpoint "bkpt11_left_kmer_Seq0_pos_8034_repeat_3_HOM" 24 | TTCGGAAGAAACCAATATAGAA 25 | > insertion ( len= 59 ) for breakpoint "bkpt12_left_kmer_Seq0_pos_8805_repeat_1_HOM" 26 | GTTCAGCGAGTCGTAAAAGTATAAGGAGACTGTTGCCTACCCGCCAGCATTACGTCGAA 27 | -------------------------------------------------------------------------------- /test/full_test/gold_find.output: -------------------------------------------------------------------------------- 1 | MindTheGap find 2 | version : 2.2.3 3 | gatb-core-library : 1.4.2 4 | supported_kmer_sizes : 32 64 96 128 5 | Parameters 6 | Input data 7 | Reads : ../../data/reads_r1.fastq,../../data/reads_r2.fastq 8 | Reference : ../../data/reference.fasta 9 | Graph 10 | kmer-size : 31 11 | abundance_min (auto inferred) : 7 12 | abundance_min (used) : 7 13 | abundance_max : 2147483647 14 | nb_solid_kmers : 7419 15 | nb_branching_nodes : 36 16 | Breakpoint detection options 17 | max_repeat : 5 18 | hetero_max_occ : 1 19 | homo_insertions : yes 20 | hete_insertions : yes 21 | snp : yes 22 | deletion : yes 23 | Results 24 | Insertion breakpoints 25 | homozygous : 5 26 | clean : 3 27 | fuzzy : 2 28 | heterozygous : 3 29 | clean : 2 30 | fuzzy : 1 31 | Other variants 32 | deletions : 2 33 | Homozygous insertions 1-2 bp size : 9 34 | Heterozygous insertions 1-2 bp size : 7 35 | SNPs : 11 36 | Time : 0.0 s 37 | Output files 38 | graph_file : gold.h5 39 | breakpoint_file : gold.breakpoints 40 | othervariants_file : gold.othervariants.vcf 41 | -------------------------------------------------------------------------------- /test/full_test/gold.othervariants.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##filedate=Tue Feb 8 12:29:50 2022 3 | ##source=MindTheGap find version 2.2.3 4 | ##SAMPLE=file:../../data/reads_r1.fastq,../../data/reads_r2.fastq 5 | ##REF=file:../../data/reference.fasta 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##FORMAT= 10 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G1 11 | Seq0 101 bkpt1 T C . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 12 | Seq0 297 bkpt3 CTAGCTTGAGAGTGCGTATCTCACCGATCCCCTGGCTATGCTCCGCGATTCACTAGTAGTTTCACGCCGACAGAGCGAAACCGTGATAGGTCATCATGCCGGTCTGCAGTCACGT C . PASS TYPE=DEL;LEN=114;FUZZY=0 GT 1/1 13 | Seq0 816 bkpt4 C A . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 14 | Seq2 320 bkpt6 T C . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 15 | Seq2 344 bkpt7 C A . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 16 | Seq2 379 bkpt8 G C . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 17 | Seq3 256 bkpt11 A T . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 18 | Seq3 511 bkpt12 C A . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 19 | Seq3 766 bkpt13 G A . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 20 | Seq4 257 bkpt15 C T . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 21 | Seq4 512 bkpt17 A G . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 22 | Seq4 841 bkpt19 C T . PASS TYPE=SNP;LEN=1;FUZZY=0 GT 1/1 23 | Seq4 884 bkpt21 CTAGGGACCTAGACGCAACAGTAACCGCCTCGGAGTAAGCCCTGG C . PASS TYPE=DEL;LEN=44;FUZZY=2 GT 1/1 24 | Seq5 100 bkpt22 T TC . PASS TYPE=INS;LEN=1;FUZZY=1 GT 1/1 25 | Seq5 199 bkpt23 T TG . PASS TYPE=INS;LEN=1;FUZZY=1 GT 1/1 26 | Seq5 300 bkpt24 A AC . PASS TYPE=INS;LEN=1;FUZZY=0 GT 1/1 27 | Seq5 400 bkpt25 G GT . PASS TYPE=INS;LEN=1;FUZZY=2 GT 1/1 28 | Seq5 500 bkpt26 C CT . PASS TYPE=INS;LEN=1;FUZZY=0 GT 1/1 29 | Seq5 600 bkpt27 G GA . PASS TYPE=INS;LEN=1;FUZZY=0 GT 0/1 30 | Seq5 700 bkpt28 A AC . PASS TYPE=INS;LEN=1;FUZZY=0 GT 0/1 31 | Seq5 800 bkpt29 A AC . PASS TYPE=INS;LEN=1;FUZZY=2 GT 0/1 32 | Seq5 900 bkpt30 T TG . PASS TYPE=INS;LEN=1;FUZZY=2 GT 0/1 33 | Seq6 98 bkpt31 T TAT . PASS TYPE=INS;LEN=2;FUZZY=3 GT 1/1 34 | Seq6 200 bkpt32 A ACG . PASS TYPE=INS;LEN=2;FUZZY=1 GT 1/1 35 | Seq6 300 bkpt33 A ATA . PASS TYPE=INS;LEN=2;FUZZY=1 GT 1/1 36 | Seq6 400 bkpt34 T TCA . PASS TYPE=INS;LEN=2;FUZZY=0 GT 1/1 37 | Seq6 600 bkpt35 T TCA . PASS TYPE=INS;LEN=2;FUZZY=0 GT 0/1 38 | Seq6 699 bkpt36 C CGT . PASS TYPE=INS;LEN=2;FUZZY=2 GT 0/1 39 | Seq6 800 bkpt37 T TGG . PASS TYPE=INS;LEN=2;FUZZY=0 GT 0/1 40 | -------------------------------------------------------------------------------- /src/GraphOutputDot.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * GATB : Genome Assembly Tool Box 3 | * Copyright (C) 2014 INRIA 4 | * Authors: R.Chikhi, G.Rizk, E.Drezen, C.Lemaitre 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Affero General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Affero General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Affero General Public License 17 | * along with this program. If not, see . 18 | *****************************************************************************/ 19 | 20 | #ifndef _GRAPHOUTPUTDOT_H 21 | #define _GRAPHOUTPUTDOT_H 22 | 23 | /********************************************************************************/ 24 | #include 25 | /********************************************************************************/ 26 | 27 | 28 | 29 | 30 | template 31 | class GraphOutputDot : public IGraphOutput 32 | { 33 | public: 34 | 35 | /** Constructor. 36 | * \param[in] kmerSize : size of the kmer 37 | * \param[in] prefix : prefix of the file name 38 | * */ 39 | GraphOutputDot (size_t kmerSize, const std::string& prefix); 40 | 41 | /** Finish the output. */ 42 | virtual void close(); 43 | 44 | virtual void print_starter_head (int index, char* sequence, size_t sequenceLen); 45 | virtual void print_starter_end (); 46 | 47 | virtual void print_sequence_head (const std::string& filename, const std::string& direction); 48 | virtual void print_sequence_end (); 49 | 50 | virtual void print_node (long index, const std::string& seq); 51 | virtual void print_edge (long index, long id, long id2, const std::string& label, const std::string& comment); 52 | 53 | std::string get_dot_file_name() {return _dot_file_name;}; 54 | 55 | private: 56 | 57 | void init (bool erase); 58 | 59 | FILE* _graph_file; 60 | 61 | std::string _dot_file_name; 62 | 63 | std::string _dot_file_suffix; 64 | }; 65 | 66 | /********************************************************************************/ 67 | 68 | #endif //_GRAPHOUTPUTDOT_H 69 | -------------------------------------------------------------------------------- /test/full_test/gold.insertions.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##filedate=Tue Feb 8 12:29:50 2022 3 | ##source=MindTheGap fill version 2.2.3 4 | ##SAMPLE=file:gold.h5 5 | ##REF=file:gold 6 | ##INFO= 7 | ##INFO= 8 | ##INFO=<=QUAL,Number=.,Type=Integer,Description="Quality of the insertion"> 9 | ##INFO=<=AVK,Number=.,Type=Float,Description="Average k-mer coverage along the insertion"> 10 | ##INFO=<=MDK,Number=.,Type=Float,Description="Median k-mer coverage along the insertion"> 11 | ##INFO=<=NSOL,Number=1,Type=String,Description="number of alternative insertion sequences for the breakpoint"> 12 | ##INFO= 13 | ##FORMAT= 14 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G1 15 | Seq0 123 bkpt2 G GATCTAAGCTGTGACCTTGTGGCCGAGGCGCTTTTCACGCCTACATTAACTCCTGGGAAGCTCTCTGCTCTAGTTTCAGTGCACATCTCCAGGTGAGCAACCCTGGCAAGCAGCCCCTTCCTGTAGAAATTACTTAGC . PASS TYPE=INS;LEN=137;QUAL=50;NSOL=1;NPOS=1;AVK=21.59;MDK=21.00 GT 0/1 16 | Seq1 342 bkpt5 T TATGGTTTATAGAACCCGGGCGTTCATGTCCGTCAGAACGATCTTGGCACGGTAGCCCCTGGTCCAGAGAGCCAAGGTGACTCAGCCCCACGATGGTGGTCTAGAGCGAAATAACCCTCGCCGAGA . PASS TYPE=INS;LEN=125;QUAL=50;NSOL=1;NPOS=1;AVK=25.17;MDK=24.00 GT 0/1 17 | Seq2 535 bkpt9 G GTAACGTTCGCTGAACATCGACTCCGGTGACGACATACGATTCAAGAAGAGAGTGACTCTGTAGGATAACATCCCGCAACGCCTAATCCATCCAGCCTGGCACCATGTATAAAGGGCGTCAGGTATGTTAACGAGACTATT . PASS TYPE=INS;LEN=140;QUAL=50;NSOL=1;NPOS=1;AVK=41.66;MDK=43.00 GT 1/1 18 | Seq2 834 bkpt10 A ATGCACGCTGCAGGATTGGAACCACAATGTACGCCGATCCAAGCAGTAGTGGTTCATTGTATAAGTATCCTCCCTTGATTGGTCGAATATTAGGCATGCCCCGGGAGCATGTGGGCTCGAGCCACGGAGAGCAACTAATCGCGCATAAAACAAATACCTCATGGTTTTTGTGCGGAAAACCGTTGGGTGGACCATCAGCGGTTGTGAT . PASS TYPE=INS;LEN=207;QUAL=50;NSOL=1;NPOS=2;AVK=40.42;MDK=42.00 GT 1/1 19 | Seq3 781 bkpt14 G GTTTGCAGCACTAGCCGTTCCTTGACATCTGCGGCCAACTTGTGCCTGAACCTGGAGTTTCGACAGCGTGGCGCTCTGGCCTAGTTCTTCGCTGGCACCTGGAAGAGCCGCC . PASS TYPE=INS;LEN=111;QUAL=50;NSOL=1;NPOS=1;AVK=50.54;MDK=53.00 GT 1/1 20 | Seq4 349 bkpt16 G GCAGTCTTAACCTTAAGACCGTTCATTGATAAAACTTGCTCACGCTCTAGATGGCGTGAAGCGAAACCTAGGAAAAAGTTTTGCAGATAATTAGATTATGCGCGATACTCCGCCGTGTGTT . PASS TYPE=INS;LEN=120;QUAL=50;NSOL=1;NPOS=3;AVK=25.67;MDK=26.00 GT 0/1 21 | Seq4 600 bkpt18 T TAGGTGTATTCCTGGGTTGAGTGGCAGGTTTCTCTTAATTCTTCCCTAAGTAGCTCCG . PASS TYPE=INS;LEN=57;QUAL=50;NSOL=1;NPOS=4;AVK=46.66;MDK=47.00 GT 1/1 22 | Seq4 821 bkpt20 C CGGATGGCGGCCGGAGAGCGCTGCAATCGCATGGCTCGGGA . PASS TYPE=INS;LEN=40;QUAL=50;NSOL=1;NPOS=1;AVK=37.63;MDK=38.00 GT 1/1 23 | -------------------------------------------------------------------------------- /src/FindBackup.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, P.Marijon 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #ifndef _TOOL_FindBackup_HPP_ 22 | #define _TOOL_FindBackup_HPP_ 23 | 24 | /*******************************************************************************/ 25 | #include 26 | #include 27 | 28 | template 29 | class FindBackup : public IFindObserver 30 | { 31 | public : 32 | 33 | /** \copydoc IFindObserver::IFindObserver 34 | */ 35 | FindBackup(FindBreakpoints * find); 36 | 37 | /** \copydoc IFindObserver::update 38 | */ 39 | bool update(); 40 | }; 41 | 42 | template 43 | FindBackup::FindBackup(FindBreakpoints * find) : IFindObserver(find){} 44 | 45 | template 46 | bool FindBackup::update() 47 | { 48 | if((this->_find->kmer_begin().isValid() && this->_find->kmer_end().isValid()) == false) 49 | { 50 | return false; 51 | } 52 | 53 | if(this->_find->gap_stretch_size() > (this->_find->kmer_size() / 2)) { 54 | string kmer_begin_str = this->_find->model().toString(this->_find->kmer_begin().forward()); 55 | string kmer_end_str = this->_find->model().toString(this->_find->kmer_end().forward()); 56 | string chrom_name_bak = this->_find->chrom_name()+"_backup"; 57 | 58 | this->_find->writeBreakpoint(this->_find->breakpoint_id(), chrom_name_bak, this->_find->position() - 1, kmer_begin_str, kmer_end_str, 0, STR_BKP_TYPE); 59 | 60 | this->_find->breakpoint_id_iterate(); 61 | this->_find->backup_iterate(); 62 | 63 | return true; 64 | } 65 | 66 | return false; 67 | } 68 | 69 | #endif /* _TOOL_FindBackup_HPP_ */ 70 | -------------------------------------------------------------------------------- /scripts/python3/Inser_snp_in_ref.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | from Bio.Seq import Seq 4 | from Bio import SeqIO 5 | from collections import defaultdict 6 | import getopt 7 | csv.field_size_limit(100000000) 8 | 9 | 10 | def main(): 11 | print(sys.argv[1:]) 12 | try: 13 | opts, args = getopt.getopt(sys.argv[1:], "s:g:o", ["snp=", "genome=", "genome_altered="]) 14 | except getopt.GetoptError: 15 | # print help information and exit: 16 | #print ('error') # will print something like "option -a not recognized" 17 | sys.exit(2) 18 | 19 | genome_parser="" 20 | vcf_reader="" 21 | out_m=csv.writer(open(sys.argv[3],"w"),delimiter="\n") 22 | dic_snp={} 23 | 24 | for opt, arg in opts: 25 | print(opt, arg) 26 | if opt in ('-s', "--snp"): 27 | vcf_reader = arg 28 | #print(i) 29 | elif opt in ('-g', "--genome"): 30 | genome_parser = arg 31 | #print(r) 32 | elif opt in ('-o', "--genome_altered"): 33 | out_m = arg 34 | dic_snp=insert_snp( vcf_reader) 35 | alter_genome(genome_parser,dic_snp,out_m) 36 | 37 | def is_valid(inser) : 38 | allowed="ATCGatcg" 39 | if all(c in allowed for c in inser ) : 40 | return True 41 | else : 42 | return False 43 | 44 | def insert_snp(genome,chromosome,dic_snp) : 45 | nb_error=0 46 | #print (chromosome) 47 | #print ("len before", len(genome)) 48 | if chromosome in dic_snp : 49 | for listing in dic_snp[chromosome] : 50 | if listing[1]==genome[listing[0]-1] : 51 | genome[listing[0]-1]=listing[2] 52 | else : 53 | print( "Error SNP: in genome ",genome[listing[0]-2:listing[0]+2], "in vcf ", listing[1],"at position", listing[0] ) 54 | nb_error+=1 55 | print (" nb SNP substitution failed",nb_error) 56 | return genome 57 | 58 | def extract_snp(vcf_reader): 59 | vcf_readers=csv.reader(open(vcf_reader,'r'),delimiter='\t') 60 | for elits in vcf_readers : 61 | if '#' not in elits[0] and '@' not in elits[0] : 62 | if len(elits[3])==1 and len(elits[4])==1 and is_valid(elits[3])==True and is_valid(elits[4])==True: 63 | dic_snp.setdefault(elits[0],[]).append((int(elits[1]),elits[3],elits[4])) 64 | return dic_snp 65 | 66 | def alter_genome(genome_parser,dic_snp,out_m): 67 | genome_parsers=SeqIO.parse(genome_parser, "fasta") 68 | out_ms=csv.writer(open(out_m,"w"),delimiter="\n") 69 | for record in genome_parsers : 70 | elts=str(record.description) 71 | head=">" + str(record.description) 72 | old_sequence=list(str(record.seq).upper()) 73 | old_length=len(old_sequence) 74 | sequence="" 75 | unmasked_seq="" 76 | num_exon=0 77 | total_length=0 78 | work_seq=insert_snp(old_sequence,elts,dic_snp) 79 | finals=''.join(work_seq) 80 | print( 'final', len(finals)) 81 | out_m_lists=[head,finals] 82 | out_ms.writerow(out_m_lists) 83 | 84 | if __name__ == "__main__": 85 | main() -------------------------------------------------------------------------------- /src/GraphAnalysis.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, R. Chikhi 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | #include 21 | 22 | 23 | //USE_NEW_CXX variable defined in CMakeList.txt of gatb-core : depending on the compil version unordered_map is not in the same location... 24 | #include 25 | #define NS_TR1_PREFIX std 26 | 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | using namespace std; 35 | 36 | // path type 37 | typedef vector unlabeled_path; 38 | 39 | 40 | class GraphAnalysis { 41 | 42 | public: 43 | static const int max_breadth = 20; //changed from 10 to 20 44 | 45 | string prefix; 46 | FILE *graph_file; 47 | 48 | int nb_nodes, nb_edges; 49 | 50 | string node_identifier(int node); 51 | int revcomp_node(int node); 52 | 53 | NS_TR1_PREFIX::unordered_map node_sequences; 54 | NS_TR1_PREFIX::unordered_map > out_edges; 55 | NS_TR1_PREFIX::unordered_map > in_edges; 56 | 57 | size_t _sizeKmer; 58 | 59 | 60 | GraphAnalysis(string graph_file_name,size_t kmerSize); 61 | 62 | 63 | 64 | set> find_all_paths(set terminal_nodes_with_endpos, bool &success); 65 | set> find_all_paths(int start_node, set terminal_nodes_with_endpos, unlabeled_path current_path, int &nb_calls, bool &success); 66 | 67 | set> find_all_paths_rev(set< info_node_t > terminal_nodes_with_endpos); 68 | set> find_all_paths_rev(int start_node, set< info_node_t > terminal_nodes_with_endpos, unlabeled_path current_path, int &nb_calls, bool &success, int &terminal_node, bkpt_t &target_id); 69 | 70 | 71 | static int debug; // 0: no debug, 1: node id debug, 2: ful sequence debug; useful to see the sequences of the traversed paths 72 | std::vector paths_to_sequences(set paths, set< info_node_t > terminal_nodes_with_endpos); 73 | }; 74 | 75 | -------------------------------------------------------------------------------- /scripts/script_human_analysis/Inser_snp_in_ref.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | from Bio.Seq import Seq 4 | from Bio import SeqIO 5 | from collections import defaultdict 6 | import getopt 7 | csv.field_size_limit(100000000) 8 | 9 | 10 | def main(): 11 | print(sys.argv[1:]) 12 | try: 13 | opts, args = getopt.getopt(sys.argv[1:], "s:g:o:", ["snp=", "genome=", "genome_altered="]) 14 | except getopt.GetoptError: 15 | # print help information and exit: 16 | #print ('error') # will print something like "option -a not recognized" 17 | sys.exit(2) 18 | print('start') 19 | genome_parser="" 20 | vcf_reader="" 21 | out_m=csv.writer(open(sys.argv[3],"w"),delimiter="\n") 22 | dic_snp={} 23 | 24 | for opt, arg in opts: 25 | print(opt, arg) 26 | if opt in ('-s', "--snp"): 27 | vcf_reader = arg 28 | #print(i) 29 | elif opt in ('-g', "--genome"): 30 | genome_parser = arg 31 | #print(r) 32 | elif opt in ('-o', "--genome_altered"): 33 | out_m = arg 34 | else: 35 | assert False, "unhandled option" 36 | print('start') 37 | 38 | dic_snp=extract_snp(vcf_reader) 39 | alter_genome(genome_parser,dic_snp,out_m) 40 | 41 | def is_valid(inser) : 42 | allowed="ATCGatcg" 43 | if all(c in allowed for c in inser ) : 44 | return True 45 | else : 46 | return False 47 | 48 | def insert_snp(genome,chromosome,dic_snp) : 49 | nb_error=0 50 | #print (chromosome) 51 | #print ("len before", len(genome)) 52 | if chromosome in dic_snp : 53 | for listing in dic_snp[chromosome] : 54 | if listing[1]==genome[listing[0]-1] : 55 | genome[listing[0]-1]=listing[2] 56 | else : 57 | print( "Error SNP: in genome ",genome[listing[0]-2:listing[0]+2], "in vcf ", listing[1],"at position", listing[0]) 58 | nb_error+=1 59 | print (" nb SNP substitution failed",nb_error) 60 | return genome 61 | 62 | def extract_snp(vcf_reader): 63 | dic_snp={} 64 | vcf_readers=csv.reader(open(vcf_reader,'r'),delimiter='\t') 65 | count_snp=0 66 | for elits in vcf_readers : 67 | if '#' not in elits[0] and '@' not in elits[0] : 68 | if len(elits[3])==1 and len(elits[4])==1 and is_valid(elits[3])==True and is_valid(elits[4])==True: 69 | dic_snp.setdefault(elits[0],[]).append((int(elits[1]),elits[3],elits[4])) 70 | count_snp+=1 71 | print("total_snp in vcf : ", count_snp) 72 | return dic_snp 73 | 74 | def alter_genome(genome_parser,dic_snp,out_m): 75 | genome_parsers=SeqIO.parse(genome_parser, "fasta") 76 | out_ms=csv.writer(open(out_m,"w"),delimiter="\n") 77 | for record in genome_parsers : 78 | elts=str(record.description) 79 | head=">" + str(record.description) 80 | old_sequence=list(str(record.seq).upper()) 81 | old_length=len(old_sequence) 82 | sequence="" 83 | unmasked_seq="" 84 | num_exon=0 85 | total_length=0 86 | work_seq=insert_snp(old_sequence,elts,dic_snp) 87 | finals=''.join(work_seq) 88 | print( 'final', len(finals)) 89 | out_m_lists=[head,finals] 90 | out_ms.writerow(out_m_lists) 91 | 92 | if __name__ == "__main__": 93 | main() -------------------------------------------------------------------------------- /test/scripts/generate_read.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | Program to generate reads from a fasta file (for tests). P. Marijon based one 5 | get_subsequence.py write by C. Lemaitre. 6 | ''' 7 | 8 | import getopt, sys 9 | import random 10 | 11 | 12 | def usage(): 13 | '''Usage''' 14 | print "-----------------------------------------------------------------------------" 15 | print sys.argv[0]," : sub-sequence" 16 | print "-----------------------------------------------------------------------------" 17 | print "usage: ",sys.argv[0]," -f fasta_file -n numbre -l length" 18 | print " -f: input fasta file" 19 | print " -n: numbre of read you want(def : 1)" 20 | print " -l: length of sub-sequence (def : 1)" 21 | print " -h: help" 22 | print "-----------------------------------------------------------------------------" 23 | sys.exit(2) 24 | 25 | 26 | def main(): 27 | try: 28 | opts, args = getopt.getopt(sys.argv[1:], "hf:n:l:", ["help", "fasta=", "num=", "len="]) 29 | except getopt.GetoptError, err: 30 | # print help information and exit: 31 | print str(err) # will print something like "option -a not recognized" 32 | usage() 33 | sys.exit(2) 34 | 35 | # Default parameters 36 | read_len=1 37 | fasta_file=0 38 | num_loop=1 39 | for opt, arg in opts: 40 | if opt in ("-h", "--help"): 41 | usage() 42 | sys.exit() 43 | elif opt in ("-f", "--fasta"): 44 | fasta_file = arg 45 | elif opt in ("-l", "--len"): 46 | read_len = int(arg) 47 | elif opt in ("-n", "--num"): 48 | num_loop = int(arg) 49 | else: 50 | assert False, "unhandled option" 51 | 52 | if fasta_file == 0 : 53 | print "Missing arguments" 54 | usage() 55 | return 2 56 | 57 | else: 58 | header = "base_header" 59 | sequence = "" 60 | filin = open(fasta_file,"r") 61 | 62 | for line in filin: 63 | if line[0] ==">": 64 | # header 65 | header = (line.lstrip(">")).rstrip("\n").rstrip(" ") 66 | else: 67 | sequence+=(line.rstrip("\n")).upper() 68 | 69 | filin.close() 70 | 71 | sequence_len = len(sequence) 72 | 73 | if sequence_len == 0 : 74 | print "warning we didn't find fasta sequence in file." 75 | return 1 76 | 77 | if sequence_len < read_len : 78 | print "warning read length is upper than sequence length we can't generate read." 79 | return 1 80 | 81 | for i in range(num_loop) : 82 | pos = sequence_len 83 | while pos + read_len > sequence_len : 84 | pos = random.randint(0, sequence_len) 85 | 86 | print ">"+header+"_read"+str(i)+"_pos_"+str(pos)+":"+str(pos+read_len) 87 | print sequence[pos:pos+read_len] 88 | 89 | return 0 90 | 91 | if __name__ == "__main__": 92 | exit(main()) 93 | 94 | # exemple : 95 | # ./get_subsequence -f random_1Mb.fa -b 44444 96 | 97 | -------------------------------------------------------------------------------- /src/CircularBuffer.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #ifndef CircularBuffer_hpp 22 | #define CircularBuffer_hpp 23 | 24 | #include 25 | 26 | 27 | #include 28 | 29 | 30 | 31 | template class CircularBuffer 32 | { 33 | 34 | friend class itCB; 35 | 36 | public: 37 | 38 | //constructor, len in power of two : 2^powlen 39 | CircularBuffer(int powlen) : _idx(this) 40 | { 41 | _tai = (1LL << powlen); 42 | _buffer = (elem_t* ) calloc(_tai,sizeof(elem_t)); 43 | _mask = _tai -1; 44 | } 45 | 46 | //def construc 47 | CircularBuffer() : _idx(this) 48 | { 49 | int powlen = 10; 50 | _tai = (1LL << powlen); 51 | _buffer = (elem_t* ) calloc(_tai,sizeof(elem_t)); 52 | _mask = _tai -1; 53 | } 54 | 55 | void resize(int powlen) 56 | { 57 | _tai = (1LL << powlen); 58 | _buffer = (elem_t* ) realloc(_buffer,sizeof(elem_t)*_tai); 59 | _mask = _tai -1; 60 | } 61 | 62 | 63 | void push(elem_t new_elem) 64 | { 65 | _idx() = new_elem ; 66 | //_buffer[_idx] = new_elem; 67 | _idx++; 68 | } 69 | 70 | ~CircularBuffer() 71 | { 72 | free(_buffer); 73 | }; 74 | 75 | void clear() 76 | { 77 | memset(_buffer,0,sizeof(elem_t)*_tai); 78 | _idx = itCB(this); 79 | } 80 | 81 | 82 | class itCB // iterator of circularbuffer 83 | { 84 | public: 85 | u_int64_t _idx; 86 | CircularBuffer * _ref; 87 | 88 | itCB (CircularBuffer *ref) 89 | { 90 | _idx = 0; 91 | _ref=ref; 92 | } 93 | 94 | elem_t & operator()() 95 | { 96 | return _ref->_buffer[_idx]; 97 | } 98 | elem_t & item() 99 | { 100 | return _ref->_buffer[_idx]; 101 | } 102 | 103 | void set(u_int64_t val) 104 | { 105 | _idx = val & _ref->_mask; 106 | } 107 | 108 | itCB operator+( u_int64_t rhs) //member func 109 | { 110 | itCB nit = itCB(this->_ref); 111 | nit._idx = (this->_idx + rhs) & _ref->_mask; 112 | return nit; 113 | } 114 | itCB operator-( u_int64_t rhs) //member func 115 | { 116 | itCB nit = itCB(this->_ref); 117 | nit._idx = (this->_idx - rhs) & _ref->_mask; 118 | return nit; 119 | } 120 | void operator++(int) //postfix operator 121 | { 122 | _idx = (_idx+1) & _ref->_mask; 123 | } 124 | void operator--(int) 125 | { 126 | _idx = (_idx-1) & _ref->_mask; 127 | } 128 | }; 129 | 130 | private: 131 | 132 | elem_t * _buffer; 133 | u_int64_t _tai; 134 | u_int64_t _mask; 135 | itCB _idx; //index to next free cell 136 | 137 | }; 138 | 139 | 140 | 141 | 142 | #endif /* CircularBuffer_hpp */ 143 | 144 | -------------------------------------------------------------------------------- /test/simple_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # look for MindTheGap binary. In devel mode, it's in ../build/bin directory. 4 | # In production mode, it's in ../bin directory. 5 | if [ -f "../bin/MindTheGap" ] 6 | then 7 | bindir="../bin" 8 | elif [ -f "../build/bin/MindTheGap" ] 9 | then 10 | bindir="../build/bin" 11 | else 12 | echo "could not find a compiled mindthegap binary" 13 | exit 1 14 | fi 15 | 16 | testOK="true" 17 | 18 | run_test() 19 | { 20 | # param : reads_file ref_file true_result prefix 21 | $bindir/MindTheGap find -in $1 -ref $2 -kmer-size 31 -out output/$4_find $5 1> output/$4_find.out 2> output/$4_find.err 22 | 23 | $bindir/MindTheGap fill -bkpt output/$4_find.breakpoints -graph output/$4_find.h5 -out output/$4_fill 1> output/$4_fill.out 2> output/$4_fill.err 24 | 25 | tmp1=output/$4_fill.insertions.fasta.tmp 26 | tmp2=output/tmp2 27 | 28 | grep -v "^>" output/$4_fill.insertions.fasta > $tmp1 29 | grep -v "^>" $3 > $tmp2 30 | 31 | 32 | diff $tmp1 $tmp2 1> /dev/null 2>&1 33 | 34 | var=$? 35 | if [ $var -eq 0 ] 36 | then 37 | eval $6="passed" 38 | else 39 | testOK="false" ; eval $6="FAILED" 40 | fi 41 | } 42 | 43 | 44 | 45 | run_test_vcf() 46 | { 47 | # param : reads_file ref_file true_result prefix 48 | $bindir/MindTheGap find -in $1 -ref $2 -kmer-size 31 -out output/$4_find $5 1> output/$4_find.out 2> output/$4_find.err 49 | 50 | sh compare_vcf.sh output/$4_find.othervariants.vcf $3 1> /dev/null 2>&1 51 | 52 | var=$? 53 | if [ $var -eq 0 ] 54 | then 55 | eval $6="passed" 56 | else 57 | testOK="false" ; eval $6="FAILED" 58 | fi 59 | } 60 | 61 | 62 | mkdir -p output 63 | output="" 64 | 65 | output=$output"clean-insert : " 66 | run_test reads/master.fasta references/deleted.fasta truths/insertion.fasta k-1 "-insert-only" retvalue 67 | output=${output}${retvalue} 68 | 69 | output=$output"\n13-inserts-ref10k : " 70 | run_test reads/readref10K.fasta references/g10K_del.fasta truths/insert_ref10K.fasta 13i "-insert-only" retvalue 71 | output=${output}${retvalue} 72 | 73 | output=$output"\n1-SNP : " 74 | run_test_vcf reads/master.fasta references/sSNP.fasta truths/truth_snp.vcf sSNP "-snp-only" retvalue 75 | output=${output}${retvalue} 76 | 77 | output=$output"\n3-SNP*2 : " 78 | run_test_vcf reads/master.fasta references/multiSNP.fasta truths/multiSNP.vcf multiSNP "-snp-only" retvalue 79 | output=${output}${retvalue} 80 | 81 | 82 | output=$output"\nsnp-before-clean-insert : " 83 | run_test reads/master.fasta references/deleted_before_SNP.fasta truths/insertion_before_SNP.fasta k-1_before_SNP "-no-deletion -homo-only" retvalue 84 | output=${output}${retvalue} 85 | 86 | #output=$output"\nsnp-begin-fuzzy : " 87 | #run_test reads/beginfuzzySNP.fasta references/beginfuzzySNP.fasta truths/beginfuzzySNP.fasta beginfuzzySNP "-snp-only" retvalue 88 | #output=${output}${retvalue} 89 | 90 | output=$output"\nhetero-insert : " 91 | run_test reads/deleted.fasta,reads/master.fasta references/deleted.fasta truths/insertion.fasta hete "-hete-only -max-rep 2" retvalue 92 | output=${output}${retvalue} 93 | 94 | output=$output"\ndeletion : " 95 | run_test reads/deleted.fasta references/master.fasta truths/deletion.fasta deletion "-deletion-only" retvalue 96 | output=${output}${retvalue} 97 | 98 | output=$output"\nfuzzy-deletion : " 99 | run_test reads/deletionfuzzy.fasta references/deletionfuzzy.fasta truths/deletionfuzzy.fasta deletionfuzzy "-deletion-only" retvalue 100 | output=${output}${retvalue} 101 | 102 | output=$output"\nn-in-solid-stretch : " 103 | run_test reads/master.fasta references/n_in_stretch.fasta truths/n_in_stretch.fasta n_in_stretch "-insert-only" retvalue 104 | output=${output}${retvalue} 105 | 106 | output=$output"\nn-in-before-clean-insert : " 107 | run_test reads/master.fasta references/n_before_gap.fasta truths/n_before_gap.fasta n_before_gap "-insert-only" retvalue 108 | output=${output}${retvalue} 109 | 110 | output=$output"\nn-after-clean-insert : " 111 | run_test reads/master.fasta references/n_after_gap.fasta truths/n_after_gap.fasta n_after_gap "-insert-only" retvalue 112 | output=${output}${retvalue} 113 | 114 | echo -e $output | column -t 115 | 116 | 117 | rm -rf output/ 118 | 119 | if [ $testOK == "false" ]; then 120 | exit 1 121 | fi 122 | -------------------------------------------------------------------------------- /src/Utils.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, R. Chikhi 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #ifndef _Utils_HPP_ 22 | #define _Utils_HPP_ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | using namespace std; 32 | 33 | 34 | // TODO factoriser 35 | // this one is used in GraphAnalysis (modifies s) 36 | void revcomp_sequence(char s[], int len); 37 | // this one is used to reverse source and target Sequence (copies the sequence) 38 | string revcomp_sequence(const string& dna); 39 | 40 | 41 | 42 | #include 43 | typedef pair bkpt_t; 44 | typedef unordered_map bkpt_dict_t; 45 | 46 | class filled_insertion_t 47 | { 48 | public: 49 | 50 | filled_insertion_t(string insert, int nb_errors, bkpt_t targetId) : nb_errors_in_anchor(nb_errors), targetId_anchor(targetId) { 51 | seq = insert; 52 | } 53 | filled_insertion_t(string insert, int nb_errors) : nb_errors_in_anchor(nb_errors) { 54 | seq = insert; 55 | } 56 | 57 | string seq; 58 | int nb_errors_in_anchor; 59 | //bool is_anchor_repeated; 60 | 61 | float avg_coverage; 62 | float median_coverage; 63 | bkpt_t targetId_anchor; 64 | 65 | int qual; 66 | int solution_count; 67 | int solution_rank; 68 | 69 | //required to be inserted in set 70 | bool operator< (const filled_insertion_t & other) const 71 | { 72 | if (this->targetId_anchor != other.targetId_anchor) 73 | return this->targetId_anchor < other.targetId_anchor; 74 | else 75 | return this->seq < other.seq; 76 | } 77 | 78 | void reverse() 79 | { 80 | string reversed = revcomp_sequence(this->seq); 81 | this->seq = reversed; 82 | 83 | } 84 | 85 | void compute_qual(bool is_anchor_repeated) 86 | { 87 | int quality = 50; 88 | 89 | if(is_anchor_repeated) 90 | quality = 25; 91 | 92 | if(solution_count>1) 93 | quality = 15; 94 | 95 | if(nb_errors_in_anchor==1) 96 | quality = 10; 97 | 98 | if(nb_errors_in_anchor==2) 99 | quality = 5; 100 | 101 | this->qual = quality; 102 | 103 | } 104 | }; 105 | 106 | 107 | 108 | 109 | /** 110 | * verifies if a and b are identical (tolerant to case), if one equals N returns false (even if both N) 111 | */ 112 | int identNT(char a, char b); 113 | 114 | /** 115 | * gapped alignment 116 | * used by find_nodes_containingR : need to get the details of differences 117 | */ 118 | float needleman_wunsch(string a, string b, int * nbmatch,int * nbmis,int * nbgaps); 119 | 120 | 121 | 122 | 123 | /** 124 | * returns true if all pairs of sequences have identity percent > threshold 125 | */ 126 | bool all_consensuses_almost_identical(set consensuses, int identity_threshold); 127 | 128 | /** 129 | * reduces the redundancy int the vector of filled sequences : removes from the input vectot the sequences that are more than 90% similar with at least an other one in the vector. 130 | */ 131 | void remove_almost_identical_solutions(std::vector& consensuses, int identity_threshold); 132 | 133 | 134 | double median(std::vector &v); 135 | 136 | #endif /* _Utils_HPP_ */ 137 | -------------------------------------------------------------------------------- /src/IFindObserver.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, P.Marijon 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | /** 22 | * \file IFindObserver.hpp 23 | * \date 09/04/2015 24 | * \author pmarijon 25 | * \brief Interface definition for FindBreakpoints observer 26 | */ 27 | #ifndef _TOOL_IFindObserver_HPP_ 28 | #define _TOOL_IFindObserver_HPP_ 29 | 30 | /********************************************************************************/ 31 | #include 32 | #include 33 | 34 | /********************************************************************************/ 35 | 36 | template 37 | class FindBreakpoints; 38 | 39 | /** \brief Interface for FindBreakpoints observer 40 | * Implementation can be add in FindBreakpoints observer list and call by update. 41 | */ 42 | template 43 | class IFindObserver : public SmartPointer 44 | { 45 | public: 46 | typedef typename gatb::core::kmer::impl::Kmer Kmer; 47 | 48 | typedef typename Kmer::Type KmerType; 49 | 50 | public: 51 | 52 | /** Constructor. 53 | * \param[in,out] The FindBreakpoints instance, implementation can read and 54 | * write information one this instance. 55 | */ 56 | IFindObserver(FindBreakpoints* find); 57 | 58 | /** Destructor. 59 | */ 60 | virtual ~IFindObserver() {} 61 | 62 | /** Called when FindBreakpoints::notify is called 63 | * \param[in] kmer is in graph or not 64 | */ 65 | virtual bool update() = 0; 66 | 67 | protected : 68 | 69 | /** Pointer one FindBreakpoints instance 70 | */ 71 | FindBreakpoints* _find; 72 | bool contains(KmerType kmer); 73 | int nb_in_branch(KmerType kmer); 74 | int nb_out_branch(KmerType kmer); 75 | bool suffix_is_repeated(KmerType kmer); 76 | }; 77 | 78 | template 79 | IFindObserver::IFindObserver(FindBreakpoints* find) 80 | { 81 | this->_find = find; 82 | } 83 | 84 | template 85 | bool IFindObserver::contains(KmerType kmer) 86 | { 87 | kmer = std::min(kmer, revcomp(kmer, this->_find->kmer_size())); 88 | Node node = Node(Node::Value(kmer)); 89 | return this->_find->graph_contains(node); 90 | } 91 | template 92 | int IFindObserver::nb_in_branch(KmerType kmer) 93 | { 94 | //kmer = std::min(kmer, revcomp(kmer, this->_find->kmer_size())); 95 | Node node = Node(Node::Value(kmer), STRAND_FORWARD); 96 | return this->_find->node_in_branch(node); 97 | } 98 | template 99 | int IFindObserver::nb_out_branch(KmerType kmer) 100 | { 101 | //kmer = std::min(kmer, revcomp(kmer, this->_find->kmer_size())); 102 | Node node = Node(Node::Value(kmer), STRAND_FORWARD); 103 | return this->_find->node_out_branch(node); 104 | } 105 | 106 | template 107 | bool IFindObserver::suffix_is_repeated(KmerType kmer) 108 | { 109 | 110 | KmerType one; one.setVal(1); 111 | KmerType kminus1_mask = (one << ((this->_find->kmer_size()-1)*2)) - one; 112 | KmerType suffix = kmer & kminus1_mask ; // getting the k-1 suffix (because putative kmer_begin) 113 | KmerType suffix_rev = revcomp(suffix,this->_find->kmer_size()-1); // we get its reverse complement to compute the canonical value of this k-1-mer 114 | 115 | return(this->_find->ref_bloom_contains(min(suffix,suffix_rev))); 116 | 117 | } 118 | #endif /* _TOOL_IFindObserver_HPP_ */ 119 | 120 | -------------------------------------------------------------------------------- /src/Finder.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #ifndef _TOOL_Finder_HPP_ 22 | #define _TOOL_Finder_HPP_ 23 | 24 | /********************************************************************************/ 25 | #include 26 | using namespace std; 27 | 28 | /********************************************************************************/ 29 | 30 | static const char* STR_URI_REF = "-ref"; 31 | static const char* STR_MAX_REPEAT = "-max-rep";; 32 | static const char* STR_HET_MAX_OCC = "-het-max-occ"; 33 | static const char* STR_SNP_MIN_VAL = "-snp-min-val"; 34 | static const char* STR_BRANCHING_FILTER = "-branching-filter"; 35 | 36 | static const char* STR_HOMO_ONLY = "-homo-only"; 37 | static const char* STR_INSERT_ONLY = "-insert-only"; 38 | static const char* STR_SNP_ONLY = "-snp-only"; 39 | static const char* STR_DELETION_ONLY = "-deletion-only"; 40 | static const char* STR_HETERO_ONLY = "-hete-only"; 41 | static const char* STR_NO_BACKUP = "-no-backup"; 42 | static const char* STR_WITH_BACKUP = "-backup"; 43 | static const char* STR_NO_SNP = "-no-snp"; 44 | static const char* STR_NO_INSERT = "-no-insert"; 45 | static const char* STR_NO_DELETION = "-no-deletion"; 46 | static const char* STR_NO_HETERO = "-no-hetero"; 47 | static const char* STR_BED = "-bed"; 48 | 49 | static const char* STR_HOM_TYPE = "HOM"; 50 | static const char* STR_HET_TYPE = "HET"; 51 | static const char* STR_SNP_TYPE = "SNP"; 52 | static const char* STR_MSNP_TYPE = "MSNP"; 53 | static const char* STR_DEL_TYPE = "DEL"; 54 | static const char* STR_BKP_TYPE = "BACKUP"; 55 | 56 | class Finder : public Tool 57 | { 58 | public: 59 | 60 | // Constructor 61 | Finder (); 62 | ~Finder (); 63 | 64 | void FinderHelp(); 65 | 66 | const char* _mtg_version; 67 | size_t _kmerSize; 68 | Graph _graph; 69 | 70 | //parameters 71 | int _max_repeat; 72 | int _het_max_occ; 73 | int _snp_min_val; 74 | int _branching_threshold; 75 | int _nbCores; 76 | bool _homo_only; 77 | bool _homo_insert; 78 | bool _hete_insert; 79 | bool _snp; 80 | bool _backup; 81 | bool _deletion; 82 | bool _small_homo; 83 | bool _small_hetero; 84 | 85 | //input/output files 86 | IBank* _refBank; 87 | string _breakpoint_file_name; 88 | FILE * _breakpoint_file; 89 | string _vcf_file_name; 90 | FILE * _vcf_file; 91 | 92 | string _bed_file_name; 93 | 94 | //results statistics 95 | int _nb_homo_clean; 96 | int _nb_homo_fuzzy; 97 | int _nb_hetero_clean; 98 | int _nb_hetero_fuzzy; 99 | int _nb_fuzzy_deletion; 100 | int _nb_clean_deletion; 101 | int _nb_solo_snp; 102 | int _nb_multi_snp; 103 | int _nb_backup; 104 | int _nb_homo_clean_indel; 105 | int _nb_homo_fuzzy_indel; 106 | int _nb_hetero_indel; 107 | // Actual job done by the tool is here 108 | void execute (); 109 | 110 | 111 | private: 112 | 113 | /** fills getInfo() with parameters informations 114 | */ 115 | void resumeParameters(); 116 | 117 | /** fills getInfo() with results informations 118 | * arg seconds running time 119 | */ 120 | void resumeResults(double seconds); 121 | 122 | /** writes the header of the vcf file 123 | */ 124 | void writeVcfHeader(); 125 | 126 | /** Create and use FindBreakpoints class to find gaps in the reference genome 127 | */ 128 | template 129 | struct runFindBreakpoints { void operator () (Finder* object); }; 130 | }; 131 | 132 | /********************************************************************************/ 133 | 134 | #endif /* _TOOL_Finder_HPP_ */ 135 | 136 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | // We include the header file for the tool 22 | #include 23 | #include 24 | 25 | /********************************************************************************/ 26 | 27 | using namespace std; 28 | 29 | static const char* MTG_VERSION = "2.3.0"; 30 | 31 | static const char* STR_FIND = "find"; 32 | static const char* STR_FILL = "fill"; 33 | 34 | void displayVersion(std::ostream& os){ 35 | 36 | os << "* * * * * * * * * * * * * * * * * * * * * *" << endl; 37 | os << "* MindTheGap version "<< MTG_VERSION << " *" << endl; //<< " AGPL licence" < [module options]" < | -graph ) -ref [options]" << endl; 50 | os << " help: MindTheGap find -help"<< endl; 51 | os << " fill : gap-filler or insertion assembly"<< endl; 52 | os << " usage: MindTheGap fill (-in | -graph ) (-bkpt | -contig ) [options]" << endl; 53 | os << " help: MindTheGap fill -help"<< endl; 54 | os << "[Common options]" << endl; 55 | os << " -help : display this help menu" << endl; 56 | os << " -version : display current version" << endl; 57 | os << endl; 58 | 59 | } 60 | 61 | 62 | int main (int argc, char* argv[]) 63 | { 64 | 65 | 66 | if(argc<2){ 67 | displayHelp(cout); 68 | return EXIT_FAILURE; 69 | } 70 | 71 | if(strcmp(argv[1],STR_VERSION)==0 || strcmp(argv[1],"-v")==0 ){ 72 | displayVersion(cout); 73 | return EXIT_SUCCESS; 74 | } 75 | 76 | if(strcmp(argv[1],STR_HELP)==0 || strcmp(argv[1],"-h")==0 ){ 77 | displayHelp(cout); 78 | return EXIT_SUCCESS; 79 | } 80 | 81 | if ((strcmp(argv[1],STR_FIND) != 0 && strcmp(argv[1],STR_FILL) != 0 ) || (strcmp(argv[1],STR_FIND) == 0 && strcmp(argv[1],STR_FILL) == 0 )) 82 | { 83 | cerr << "options find and fill are incompatible, but at least one of these is mandatory" << endl; 84 | return EXIT_FAILURE; 85 | 86 | } 87 | 88 | if (strcmp(argv[1],STR_FIND) == 0) 89 | { 90 | try 91 | { 92 | Finder finder = Finder(); 93 | finder._mtg_version = MTG_VERSION; 94 | finder.run (argc-1, argv+1); 95 | } 96 | catch (Exception& e) 97 | { 98 | if(strcmp(e.getMessage(),"")!=0){ 99 | std::cout << std::endl << "EXCEPTION: " << e.getMessage() << std::endl; 100 | } 101 | return EXIT_FAILURE; 102 | } 103 | } 104 | 105 | if (strcmp(argv[1],STR_FILL) == 0) 106 | { 107 | try 108 | { 109 | Filler filler = Filler(); 110 | filler._mtg_version = MTG_VERSION; 111 | filler.run (argc-1, argv+1); 112 | } 113 | catch (Exception& e) 114 | { 115 | if(strcmp(e.getMessage(),"")!=0){ 116 | std::cout << std::endl << "EXCEPTION: " << e.getMessage() << std::endl; 117 | } 118 | return EXIT_FAILURE; 119 | } 120 | } 121 | 122 | return EXIT_SUCCESS; 123 | 124 | } 125 | 126 | -------------------------------------------------------------------------------- /scripts/jenkins/tool-mindthegap-build-macos-10.9.5-gcc-4.2.1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #--------------------------------------------------------------# 3 | # Continuous integration script for Jenkins # 4 | #--------------------------------------------------------------# 5 | # 6 | # Default mode : 7 | # This script will exit with error (exit code 1) if any of its steps fails. 8 | # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). 9 | #--------------------------------------------------------------# 10 | set +xv 11 | 12 | echo " 13 | ----------------------------------------- 14 | Miscellaneous information 15 | ----------------------------------------- 16 | date : `date` 17 | hostname : `hostname` 18 | pwd : `pwd` 19 | 20 | ----------------------------------------- 21 | Jenkins build parameters (user defined) 22 | ----------------------------------------- 23 | BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} 24 | INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} 25 | DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} 26 | 27 | ----------------------------------------- 28 | Jenkins build parameters (built in) 29 | ----------------------------------------- 30 | BUILD_NUMBER : ${BUILD_NUMBER} 31 | JENKINS_HOME : ${JENKINS_HOME} 32 | WORKSPACE : ${WORKSPACE} 33 | " 34 | 35 | error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } } 36 | 37 | [ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." ; echo; } 38 | set -xv 39 | 40 | # quick look at resources 41 | #----------------------------------------------- 42 | sw_vers -productVersion 43 | #----------------------------------------------- 44 | system_profiler SPSoftwareDataType 45 | #----------------------------------------------- 46 | lstopo 47 | #----------------------------------------------- 48 | top -l 1|head -15 49 | #----------------------------------------------- 50 | 51 | ################################################################ 52 | # COMPILATION # 53 | ################################################################ 54 | 55 | gcc --version 56 | g++ --version 57 | 58 | [ `gcc -dumpversion` = 4.2.1 ] && { echo "GCC 4.2.1"; } || { echo "GCC version is not 4.2.1, we exit"; exit 1; } 59 | 60 | JENKINS_TASK=tool-${TOOL_NAME}-build-macos-10.9.5-gcc-4.2.1 61 | GIT_DIR=/builds/workspace/$JENKINS_TASK/gatb-${TOOL_NAME} 62 | #N.B. /scratchdir not yet mounted on the osx slave (ciosx). 63 | # as soon as /scratchdir is created, one has to update TEST procedure, below. 64 | # refer to linux build target to see how to do that 65 | BUILD_DIR=$GIT_DIR/build 66 | 67 | rm -rf $BUILD_DIR 68 | mkdir -p $BUILD_DIR 69 | 70 | #----------------------------------------------- 71 | # we need gatb-core submodule to be initialized 72 | cd $GIT_DIR 73 | git submodule init 74 | git submodule update 75 | 76 | ################################################################ 77 | # GIT INFO # 78 | ################################################################ 79 | echo " 80 | ----------------------------------------- 81 | GATB-Tool used : ${TOOL_NAME} 82 | ----------------------------------------- 83 | HEAD is : `git rev-parse HEAD` 84 | release is : `git describe --all` 85 | " 86 | 87 | cd thirdparty/gatb-core 88 | 89 | echo " 90 | ----------------------------------------- 91 | GATB-Core used 92 | ----------------------------------------- 93 | HEAD is : `git rev-parse HEAD` 94 | release is : `git describe --all` 95 | " 96 | 97 | #----------------------------------------------- 98 | cd $BUILD_DIR 99 | 100 | #----------------------------------------------- 101 | cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} $GIT_DIR 102 | 103 | #----------------------------------------------- 104 | make -j 2 || error_code 105 | 106 | ################################################################ 107 | # TEST # 108 | ################################################################ 109 | # run tests 110 | cd ../test 111 | ./simple_test.sh || error_code 112 | ./simple_full_test.sh || error_code 113 | # go back to build for packaging step 114 | cd ../build 115 | 116 | ################################################################ 117 | # PACKAGING # 118 | ################################################################ 119 | 120 | # Prepare and upload bin and source bundle to the forge 121 | if [ $? -eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then 122 | make package 123 | make package_source 124 | 125 | # make both tar.gz available as Jenkins build artifacts 126 | cp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz ${WORKSPACE}/ 127 | cp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz ${WORKSPACE}/ 128 | 129 | fi 130 | 131 | -------------------------------------------------------------------------------- /data/contigs.fasta: -------------------------------------------------------------------------------- 1 | >contig1 2 | GACTAATCCACTGCCTCAGGGCCATAGGATCTGGAAGCGTAATGTATTCGTGCGTCTCATTTGTCCTGGTTCTTTGGGGACGTACAATGCAATCTCAGGTACACATGCGAGAGTCTGTTATGTGCTTCACAAGGTCAAGTATACCAGAATGTAGCTCCCTCAATTGGATATTCGCGGTAATGGTACCGATGTGCATGTGAGAGATCAAGTCTTCGTCGGAGAGTTGTAGTTGATTCCTCCCAGATGAGGACTCCTGTCCTCTTGTACTCGCCGAGTAGGAAGTAGAGGCCTGTTAGGCATGATTTGCTGAAACGCCGGCTAAACTTTCAGTTTTACCCACTTGGCGAGGCAGCGTGGGTAGAACTATCCGGATCGAGGGTACGTTAGACATGGGCCTACGGGTAGCCTCTAGGGTTGACCCCGTGAGGTGTCTTACAAGAATGCTGAGGTGGAGCCCGCCAACGGGGGCTGTGGATACCCCATCTGTACCTATTCTCGCTACAGAGTTAGTGCATAGTATG 3 | >contig2 4 | CCAATTTCTTCAGCGAATCTTAGACAGGCAACTGCAACCCCAATCCTGGGAGAATGCCACATCAG 5 | >contig3 6 | ATTTGTGATTTCAACCCTACCCCCACCACCCCGTAACCAGCGAGGATAATCAAACCGTTCTGCCGTTTGTCATAGTGTTTAGCACACGGCATCCGTAATCTACCCTCATTCTCCCCCAAAGATGTTTGACGTTCATAAGCGTTAGTGACTTTGATGCTCTTTAATAAAGGTCCGTTGAGCTTGTCATTGGAGTAGACCCCCTTACGGCAATGCCGGCGGGGTGGTAGAAGGCCCACTCAAGTGAACGCAGCGATGTCCTAGAGTTTCCACTCTACCCTACGCCGGGTATACCGTTCCGCGGCATATCTCCTAGCGGGATACTATGTCACTAATTAAACCACTGGGGACAGGGTTGCGGGTCTAAGGCGATTCAGTACGGAGTCAACTATGGCCACACGCAGAACCTAGATGTTAGCCAATCCAGTGTTCGGTTATCTTGTTTCCCTCACGCTTTTGTGTCTGGTTCCCAAGCCTTTTTCCCAAGCCCTTATAGGAGATAACCCGAGGGGAATGAGTTATCCTTACTGGGTGTTAACCCTCGTTGCGCGAATCCCGGATA 7 | >contig4 8 | CATACTCGTCGTGGTGCAGGCGATTAATTAACTAGAGCAGCCTCCTGTGAGCACGTTTTCAACAGTCTGGGCCGCAGATAGCTAAGTCAATTAAACTTTGGAGCAGTCTAGTTTCAACGGCCGTGTGAGTCGACAGTAACCTACCGTGAAATACAATTGTATTGTTACATCATACAAAAAGACCCTTAAACGGTAATTGGCGTGTTGTTCCATTTTTTCTACCTAGGGCATGGACGGTAAACCACCACCGTGTCAGTGTGTTTTGGTGGGTAAGGCGCGCAAACAAAGATGGTGGGTGCAAGCACTGTCGAAGGTTAACTCCTACCATCTTTCCTAAATCCCTGACCGTCTGACGCGTGATGTCAGGGTCCTTGGCACCCAGTACTGGAGGCTAACCGAGAGCCAGGCGTATCTCTATCGTAGCCCATATTAGTTATAGGATCTGTATTTAGCTCTGTGGACCATAAAAAAAGACCTGGAGG 9 | >contig5 10 | CTCTGTGGACCATAAAAAAAGACCTGGAGGCGCTGCCAGCATTACGGACGCCCCCTCCACGCGGTCGTCACACGTTATTCGACCCGCAGTTACATTATAGATCGTCGATTCGATTACTCGCGTGGGACTTATGGTAACCTAATGTAAGGGAGCGCGAAACCGTGGCAAAAGGCGTAACTGCTAACCCTTACTGCTGCACTATCGAGCGTATCGGGCCTCGTGGCATGGCTCAAATGATTGCAGGCTAAGGCTCCCCAAATGTATCCGGTTGGTGCTGGCATCGACGCGACGGAGAGTAGAAGTATGTTTTCTGCAGTCATGAGCTAGTCTGTTATTACGTACTCCCGCTCACGGGGATTATCAAACCGCATCGCCTAGTTGGAGTACGACGTGTAGGAGTGTCTGGACGCTCAGGGGTGTCAGAACCAAGTTTGCCGCGAAACAAACTTCATTGAGGGCCTTGACCGCTCTGACTGTCAAAACAACACAAAATGACTGCGTTGGCGGTTCGACAGGATTCTCGTTTAATCTACTTTTGGCGGCTCACCAAGTTAATTACTTGCCTTGTCCGGTCGACCGAGGTCCCACTGTGTGTCGGGG 11 | >contig6 12 | GAAGAAGACACGCCAGTAAGGCCCACATTGAGAAGTCAGCTGCCCGCGTCTACAATCGCTTCACAGATTGATCAGGTTGACATTTGACAGTATGAGAGTAAGACGGTAAGATAGAGAACCTTCCCTGACACCATGTGTTCGAATCCGCTTCGATGGCTTGTCTTCTGCTTTATCGCGTATGCGGATTGCTCATTGTAGGCGCACGGAGATGTCCTCAGATCCCCAATGGTGTGCCAGGTTTGAGGGTTTCACACCATTGCTTACAGCTGATAGCAAGTCCGTGGTTGTGGTTGGGACGTTCCGCGCATCAGATAACCAAGCACACTATCGTTATGTCGAGGAAGGTCGCAAGATGGGGTCAGGTATCAATACATACGGCCGACCCAAAGACCGTCTACAGTTCGGCCTGTCGCGACGTCGTGAGGATAGTTGGCTATTCTCAGGTGCCCTAGACGGAGGCCACGGATCCACCTCTACGCATGGGAGTACGGAGTAACTGCGATATGTGAGTCCGTACGCAATTAGATAAAACTATCCAGCGGTACGTGGCTCGAAATTAAATTCAGAAAGCACGCTTCATATTCTGAGCTTCCACACCGCGGCATCTGATTAGAAGTGCGTGGGTAGGAGATCCCACACG 13 | >contig7 14 | GCAGCTCTTTGCGCAACAATTTCTTGTGCCGCGCATATATTTGCCTCTGAACAAATGGGGATCGCCTCTACCATCAGTCTTATATCTGCCCCAAAGACTTACGGCTCGATTGCCTGCCTGAGGACACCACTTCAGAAGGTGTTGTCATCAGCAAATCGAGGCGGCCTTCGCGCAATCCATGGATATGTGATCTATACTGTTACATGCTAAACGGCCGAAAGCGCTACTCCACAGATCCGCGCCACGGGATAGAATTTCGGCAAAGGGGTTTCCTCCCTTCCATAGGTGTATAACCGCATGGGACCAAGGGGATTAGTCGCGTCTCTCACGGATCCAGGCCTCTTCTCCCCCCTGACGCGAGGTCCAGAGCCCCGCACTACAGCGTGACGGCGCTAAACCTTGTACGCCAAACTATGCGGCCTGCTAATTACCCCGTAGACGTATTAAGAGATTCGAGAGATCTCGGGACGGCGCGCCATGACCCGTTAAATGAGAACCAATGGAAGAATGGGACTGGACGTTGGCAACGTCCACATCTCACCGCCCCTATCTCATCTGATTCGGCTTGATGAAATGTTGCTGAAGGTTGCGGGCCGGCACGTGAGCTCACGCGAGGCCATCTGTGCTCTTGGTTTACTCATAAGAGTGCTATATCCTGACGCCCTAGCATAACGCTTGTTCAACCTCAGGCTGGCTTGAAAGGGAACCCCCTTCGCTTACGGCAG 15 | >contig8 16 | CGAAAGTGAAAGATGGATTAATCAAGAGATTTTATGTTCTTATCTTAAACGGGCACTACCGGTACCGTACTAGTAGCGAGGGTCGGGTAACGCTTGTTCTTGTCGATGTAAAGCGCATAGACACCAGCCTTAGCACCCTGGTATGCGTGCGCCCATGATTGGTTGTAGCCCTCGTTAGCATCGCCTTATCGATCGAAAAATCGCCTGTACGGTTAGGATACACGATGTTTCTGAGCCGACGGAACACCCATCCGAATCTTTTCCAATATATCGTTAATGCTCTCTTGTCCCAGCTTAATACCATAGGTGTGGATATTGCATGTTCGGTCCCGGAACGCAAGCGATCGGTGCATCGCTTTCAGGGTAGAACTTGTCTGCGATTTACCCCTGAACTCATAAGCCACC 17 | >contig9 18 | TTCACGACCCAGACGTAAACATGACTGGTTTCACAAGGAATTGAGCCGCTTCATGGCCTACTCCAGCAGCTAAACGATATTCAATACCCCGCCTTCTGTCGGAGTTGCATGCAAAGTTGTCCCTAATCGAGTATCAGGTACCGGCCCAAGTCTAGGGAGTCCCCCCCACTAACGCGATACGCTTCGAGGGTTGAAACGCGTGCATTCCGTAAATAAATGCGGTACATTGTTCCACTACTGTTAAGCGGCCCGCCGGTGCAGTAGAATTTGGAATTCAGGTGTCTATTAGAAGACTCTACATCCCGGGAAGGGGTCCCAATGACCTTTCTGCCTCCACCATCAAGACGAACTTTGACCATCGTTAGTTGAGCCCCCTGTATCAGGCCGTAGCGCCTCGATTGAGGACTGTCCTGTCTTTGTGTAATCCTATCAGATAAAGCTCCGTCCGAGTGTACGTGCCGTCATTTTCCAACCGGAATCCTTGCCGCCATCGATCAAGT 19 | >contig10 20 | TTCCGAATTACGCCTCCTGCTGTTTACTTCCGCTGGCAAGCGAAATCGCCTCCAAGCTGTGCCAGAGTGGATCTTCGGAGCCACCTGGTCGAACCCAGCCATTCAACATGAAGGGTCCGAACCCAATTCACGTACGTGCTCTCCCTCAAAAGGCCCTGTACCCCAAAACCTCTCATGCTTAGAAGCCGTCGGATCATATCAGTGGGGAGCTCGGATATGGCTCTCGGGGCGCAGAATAAGGAGTCTCTCAAAGCAATCGACCGAACCTCGTGCTTATCCCATGCTCTGGTAGGGGGCGGACCGCATGTTAAAACCCTCAGACCCAACCTGGGTGGGTGCACAAAGTTAAACAATATTTCCGTAAGCTCGCAGATGTCAACGGATCAAGATAATCTGATAAACCATGGGCGGGCTTGTTACCCAGTGAGCACGCTATCAAGGCAGCTTTGCACGTCATCTCATCTGTAGCTCCTTGCGCTGTATGAGCTTACGGAATCTTCTCCTTACTGCCACC 21 | 22 | -------------------------------------------------------------------------------- /scripts/jenkins/tool-mindthegap-build-debian7-64bits-gcc-4.7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #--------------------------------------------------------------# 3 | # Continuous integration script for Jenkins # 4 | #--------------------------------------------------------------# 5 | # 6 | # Default mode : 7 | # This script will exit with error (exit code 1) if any of its steps fails. 8 | # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). 9 | #--------------------------------------------------------------# 10 | set +xv 11 | 12 | echo " 13 | ----------------------------------------- 14 | Miscellaneous information 15 | ----------------------------------------- 16 | date : `date` 17 | hostname : `hostname` 18 | pwd : `pwd` 19 | 20 | ----------------------------------------- 21 | Jenkins build parameters (user defined) 22 | ----------------------------------------- 23 | BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} 24 | INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} 25 | DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} 26 | 27 | ----------------------------------------- 28 | Jenkins build parameters (built in) 29 | ----------------------------------------- 30 | BUILD_NUMBER : ${BUILD_NUMBER} 31 | JENKINS_HOME : ${JENKINS_HOME} 32 | WORKSPACE : ${WORKSPACE} 33 | " 34 | 35 | error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } } 36 | 37 | [ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." ; echo; } 38 | set -xv 39 | 40 | # quick look at resources 41 | #----------------------------------------------- 42 | free -h 43 | #----------------------------------------------- 44 | lstopo 45 | #----------------------------------------------- 46 | df -kh 47 | #----------------------------------------------- 48 | 49 | 50 | ################################################################ 51 | # COMPILATION # 52 | ################################################################ 53 | 54 | gcc --version 55 | g++ --version 56 | 57 | [ `gcc -dumpversion` = 4.7 ] && { echo "GCC 4.7"; } || { echo "GCC version is not 4.7, we exit"; exit 1; } 58 | 59 | JENKINS_TASK=tool-${TOOL_NAME}-build-debian7-64bits-gcc-4.7-gitlab 60 | JENKINS_WORKSPACE=$WORKSPACE/$JENKINS_TASK/ 61 | 62 | GIT_DIR=/scratchdir/builds/workspace/gatb-${TOOL_NAME} 63 | BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-${TOOL_NAME}/build 64 | 65 | rm -rf $BUILD_DIR 66 | mkdir -p $BUILD_DIR 67 | mkdir -p $JENKINS_WORKSPACE 68 | 69 | #----------------------------------------------- 70 | # we need gatb-core submodule to be initialized 71 | cd $GIT_DIR 72 | git submodule init 73 | git submodule update 74 | 75 | ################################################################ 76 | # GIT INFO # 77 | ################################################################ 78 | echo " 79 | ----------------------------------------- 80 | GATB-Tool used : ${TOOL_NAME} 81 | ----------------------------------------- 82 | HEAD is : `git rev-parse HEAD` 83 | release is : `git describe --all` 84 | " 85 | 86 | cd thirdparty/gatb-core 87 | 88 | echo " 89 | ----------------------------------------- 90 | GATB-Core used 91 | ----------------------------------------- 92 | HEAD is : `git rev-parse HEAD` 93 | release is : `git describe --all` 94 | " 95 | 96 | #----------------------------------------------- 97 | cd $BUILD_DIR 98 | 99 | #----------------------------------------------- 100 | cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} $GIT_DIR 101 | 102 | #----------------------------------------------- 103 | make -j 2 || error_code 104 | 105 | ################################################################ 106 | # TEST # 107 | ################################################################ 108 | # prepare data and scripts 109 | cp -R $GIT_DIR/test/ .. 110 | cp -R $GIT_DIR/data/ .. 111 | # run tests 112 | cd ../test 113 | ./simple_test.sh || error_code 114 | ./simple_full_test.sh || error_code 115 | # cleanup disk space 116 | cd .. 117 | rm -rf test 118 | rm -rf data 119 | # go bask to build for packaging step 120 | cd build 121 | 122 | ################################################################ 123 | # PACKAGING # 124 | ################################################################ 125 | 126 | #-- Upload bin bundle as a build artifact 127 | # -> bin bundle *-bin-Linux.tar.gz will be archived as a build artifact 128 | # -> source package is handled by the osx task 129 | 130 | if [ $? -eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then 131 | echo "Creating a binary archive... " 132 | make package 133 | 134 | pwd 135 | ls -atlhrsF 136 | 137 | #-- Move the generated bin bundle to the workspace (so that it can be uploaded as a Jenkins job artifact) 138 | mv *-${BRANCH_TO_BUILD}-bin-Linux.tar.gz $JENKINS_WORKSPACE/ 139 | 140 | fi 141 | 142 | -------------------------------------------------------------------------------- /scripts/jenkins/tool-mindthegap-release-debian.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #--------------------------------------------------------------# 3 | # Continuous integration script for Jenkins # 4 | #--------------------------------------------------------------# 5 | # 6 | # Default mode : 7 | # This script will exit with error (exit code 1) if any of its steps fails. 8 | # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). 9 | #--------------------------------------------------------------# 10 | set +xv 11 | 12 | echo " 13 | ----------------------------------------- 14 | Miscellaneous information 15 | ----------------------------------------- 16 | date : `date` 17 | hostname : `hostname` 18 | pwd : `pwd` 19 | 20 | ----------------------------------------- 21 | Jenkins build parameters (user defined) 22 | ----------------------------------------- 23 | BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} 24 | RELEASE_TO_BUILD : ${RELEASE_TO_BUILD} 25 | INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} 26 | TEST_VARIABLE : ${TEST_VARIABLE} 27 | DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} 28 | 29 | ----------------------------------------- 30 | Jenkins build parameters (built in) 31 | ----------------------------------------- 32 | BUILD_NUMBER : ${BUILD_NUMBER} 33 | " 34 | set -xv 35 | 36 | # quick look at resources 37 | #----------------------------------------------- 38 | free -h 39 | #----------------------------------------------- 40 | lstopo 41 | #----------------------------------------------- 42 | df -kh 43 | #----------------------------------------------- 44 | 45 | 46 | ################################################################ 47 | # PREPARE RELEASE # 48 | ################################################################ 49 | 50 | # paths to access tool source code and build 51 | JENKINS_TASK=tool-${TOOL_NAME}-build-debian7-64bits-gcc-4.7 52 | BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-${TOOL_NAME}-release 53 | TOOL_GIT_HOME="/scratchdir/builds/workspace/gatb-${TOOL_NAME}" 54 | 55 | # path to 'github_release_manager.sh' script 56 | GRM_PATH="${BUILD_DIR}/github-release-api" 57 | GRM_CMD="${GRM_PATH}/github_release_manager.sh" 58 | # github credentials and repository 59 | GITHUB_REPO=${TOOL_NAME} 60 | GITHUB_OWNER=GATB 61 | GRM_CREDENTIALS="-l $GITHUB_ADMIN -t $GITHUB_TOKEN -o ${GITHUB_OWNER} -r ${GITHUB_REPO}" 62 | 63 | # Prepare build dir 64 | rm -rf $BUILD_DIR 65 | mkdir -p $BUILD_DIR 66 | 67 | #----------------------------------------------- 68 | # check tag version; 'master' is not allowed 69 | if [ ! "${BRANCH_TO_BUILD}" == "master" ] ; then 70 | cd ${TOOL_GIT_HOME} 71 | DOES_TAG_EXIST=`git tag -l | grep "^${BRANCH_TO_BUILD}$"` 72 | if [ -z ${DOES_TAG_EXIST} ] ; then 73 | echo "/!\ Error: tag '${BRANCH_TO_BUILD}' does not exist on 'gatb-tool-${TOOL_NAME}' repository" 74 | exit 1 75 | fi 76 | else 77 | echo "/!\ Error: cannot make an official release on 'master' branch" 78 | exit 1 79 | fi 80 | 81 | #----------------------------------------------- 82 | if [ "$INRIA_FORGE_LOGIN" == none ]; then 83 | echo "/!\ Error: No login name to connect to Inria Forge" 84 | exit 1 85 | fi 86 | 87 | cd $BUILD_DIR 88 | git clone https://github.com/GATB/github-release-api.git 89 | 90 | ################################################################ 91 | # RETRIEVE ARCHIVES FROM INRIA FORGE # 92 | ################################################################ 93 | 94 | CI_URL=https://ci.inria.fr/gatb-core/view/MindTheGap-gitlab/job 95 | JENKINS_TASK_DEB=tool-mindthegap-build-debian7-64bits-gcc-4.7-gitlab 96 | JENKINS_TASK_MAC=tool-mindthegap-build-macos-10.9.5-gcc-4.2.1-gitlab 97 | 98 | #retrieve last build from ci-inria (see tool-lean-build-XXX tasks) 99 | wget $CI_URL/$JENKINS_TASK_DEB/lastSuccessfulBuild/artifact/$JENKINS_TASK_DEB/${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz 100 | [ $? != 0 ] && exit 1 101 | 102 | wget $CI_URL/$JENKINS_TASK_MAC/lastSuccessfulBuild/artifact/${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz 103 | [ $? != 0 ] && exit 1 104 | 105 | wget $CI_URL/$JENKINS_TASK_MAC/lastSuccessfulBuild/artifact/${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz 106 | [ $? != 0 ] && exit 1 107 | 108 | ################################################################ 109 | # INTERACT WITH GITHUB # 110 | ################################################################ 111 | 112 | # create Github release 113 | ${GRM_CMD} ${GRM_CREDENTIALS} -d ${BRANCH_TO_BUILD} -c create 114 | if [ $? != 0 ] ; then 115 | echo "/!\ Error: unable to create release, check above error" 116 | exit 1 117 | fi 118 | 119 | #upload files 120 | function uploadFile(){ 121 | local FILE_TO_LOAD=$1 122 | echo "Uploading: ${FILE_TO_LOAD}" 123 | ${GRM_CMD} ${GRM_CREDENTIALS} -d ${BRANCH_TO_BUILD} -c upload ${FILE_TO_LOAD} 124 | if [ $? != 0 ] ; then 125 | echo "/!\ Error: unable to upload file, check above error" 126 | exit 1 127 | fi 128 | } 129 | 130 | uploadFile ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz 131 | uploadFile ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz 132 | uploadFile ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz 133 | 134 | 135 | -------------------------------------------------------------------------------- /src/IGraphOutput.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * GATB : Genome Assembly Tool Box 3 | * Copyright (C) 2014 INRIA 4 | * Authors: R.Chikhi, G.Rizk, E.Drezen 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Affero General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Affero General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Affero General Public License 17 | * along with this program. If not, see . 18 | *****************************************************************************/ 19 | 20 | #ifndef _IGRAPHOUTPUT_H 21 | #define _IGRAPHOUTPUT_H 22 | 23 | /********************************************************************************/ 24 | #include 25 | /********************************************************************************/ 26 | 27 | #include 28 | #include 29 | 30 | #define NS_TR1_BEGIN 31 | #define NS_TR1_END 32 | 33 | #define NS_TR1_PREFIX std 34 | 35 | 36 | /********************************************************************************/ 37 | 38 | //structure for print id nodes and edges in graph output 39 | struct id_els 40 | { 41 | long node; 42 | long edge; 43 | }; 44 | 45 | /********************************************************************************/ 46 | 47 | // hash functions for unordered_map with various kmer_type's 48 | 49 | // WARNING !!! The following code is not generic !!! 50 | // It is designed to cope with 4 values of supported kmer size. 51 | namespace std 52 | { 53 | NS_TR1_BEGIN 54 | template <> struct hash::Type> : public unary_function::Type, size_t> 55 | { 56 | size_t operator()(const Kmer::Type& elem) const { return hash1(elem); } 57 | }; 58 | template <> struct hash::Type> : public unary_function::Type, size_t> 59 | { 60 | size_t operator()(const Kmer::Type& elem) const { return hash1(elem); } 61 | }; 62 | template <> struct hash::Type> : public unary_function::Type, size_t> 63 | { 64 | size_t operator()(const Kmer::Type& elem) const { return hash1(elem); } 65 | }; 66 | template <> struct hash::Type> : public unary_function::Type, size_t> 67 | { 68 | size_t operator()(const Kmer::Type& elem) const { return hash1(elem); } 69 | }; 70 | NS_TR1_END 71 | } 72 | 73 | /********************************************************************************/ 74 | 75 | template 76 | class IGraphOutput 77 | { 78 | public: 79 | id_els first_id_els; 80 | 81 | // The extended kmer comes originally from the starter (true), or (false) if is it a degenerated kmer (one substitution or one indel). 82 | bool original; 83 | 84 | public: 85 | 86 | /** Constructor. 87 | * \param[in] kmerSize : size of the kmer 88 | * \param[in] prefix : prefix of the file name 89 | * */ 90 | IGraphOutput (size_t kmerSize, const std::string& prefix); 91 | 92 | /** Destructor. */ 93 | virtual ~IGraphOutput() {} 94 | 95 | /** */ 96 | void load_nodes_extremities (const std::string& linear_seqs_name,std::string & infostring); 97 | 98 | /** */ 99 | id_els construct_graph (const std::string& linear_seqs_name, const std::string& direction); 100 | 101 | /** Finish the output. */ 102 | virtual void close() = 0; 103 | 104 | virtual void print_starter_head (int index, char* sequence, size_t sequenceLen) = 0; 105 | virtual void print_starter_end () = 0; 106 | 107 | virtual void print_sequence_head (const std::string& filename, const std::string& direction) = 0; 108 | virtual void print_sequence_end () = 0; 109 | 110 | virtual void print_node (long index, const std::string& seq) = 0; 111 | virtual void print_edge (long index, long id, long id2, const std::string& label, const std::string& comment) = 0; 112 | 113 | /** */ 114 | void reset () { first_id_els.node = first_id_els.edge = 0; } 115 | 116 | protected: 117 | 118 | typedef typename Kmer::Type kmer_type; 119 | typedef typename Kmer::ModelCanonical Model; 120 | typedef typename Model::Kmer ModelKmer; 121 | 122 | enum LeftOrRight { LEFT=0, RIGHT=1 }; 123 | 124 | Model _modelKmer; 125 | Model _modelKmerMinusOne; 126 | 127 | virtual void print_edges (const ModelKmer& kmer, size_t seqLen, LeftOrRight direction, id_els& nb_els); 128 | 129 | struct node_strand { 130 | long node; 131 | Strand strand; 132 | LeftOrRight left_or_right; 133 | node_strand(long node, Strand strand, LeftOrRight left_or_right) : node(node), strand(strand), left_or_right(left_or_right) {} 134 | bool operator<(const node_strand &other) const { 135 | if (node != other.node) 136 | return (node < other.node); 137 | if (left_or_right != other.left_or_right) 138 | return left_or_right < other.left_or_right; 139 | return (strand < other.strand); 140 | } 141 | }; 142 | 143 | NS_TR1_PREFIX::unordered_map < kmer_type, std::set > kmer_links; 144 | 145 | std::string _prefix; 146 | }; 147 | 148 | #endif //_IGRAPHOUTPUT_H 149 | -------------------------------------------------------------------------------- /src/GraphOutputDot.cpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * GATB : Genome Assembly Tool Box 3 | * Copyright (C) 2014 INRIA 4 | * Authors: R.Chikhi, G.Rizk, E.Drezen 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Affero General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Affero General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Affero General Public License 17 | * along with this program. If not, see . 18 | *****************************************************************************/ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #define DEBUG(a) //printf a 26 | 27 | using namespace std; 28 | 29 | 30 | 31 | /********************************************************************* 32 | ** METHOD : 33 | ** PURPOSE : 34 | ** INPUT : 35 | ** OUTPUT : 36 | ** RETURN : 37 | ** REMARKS : 38 | *********************************************************************/ 39 | template 40 | GraphOutputDot::GraphOutputDot (size_t kmerSize, const string& prefix) 41 | : IGraphOutput (kmerSize,prefix) 42 | { 43 | init (true); 44 | } 45 | 46 | /********************************************************************* 47 | ** METHOD : 48 | ** PURPOSE : 49 | ** INPUT : 50 | ** OUTPUT : 51 | ** RETURN : 52 | ** REMARKS : 53 | *********************************************************************/ 54 | template 55 | void GraphOutputDot::init(bool erase) 56 | { 57 | 58 | _dot_file_suffix = ".graph"; //should be static (but pb with span) 59 | _dot_file_name = (this->_prefix + _dot_file_suffix); 60 | _graph_file = fopen (_dot_file_name.c_str(), erase ? "w":"a"); 61 | fprintf(_graph_file,"digraph dedebruijn {\n"); 62 | } 63 | 64 | /********************************************************************* 65 | ** METHOD : 66 | ** PURPOSE : write graph file or sequence file 67 | ** INPUT : 68 | ** OUTPUT : 69 | ** RETURN : 70 | ** REMARKS : 71 | *********************************************************************/ 72 | template 73 | void GraphOutputDot::close() 74 | { 75 | fprintf(_graph_file,"}\n"); 76 | fclose(_graph_file); 77 | } 78 | 79 | /********************************************************************* 80 | ** METHOD : 81 | ** PURPOSE : print head for a starter 82 | ** INPUT : 83 | ** OUTPUT : 84 | ** RETURN : 85 | ** REMARKS : 86 | *********************************************************************/ 87 | template 88 | void GraphOutputDot::print_starter_head (int index, char* sequence, size_t sequenceLen) 89 | { 90 | 91 | } 92 | 93 | /********************************************************************* 94 | ** METHOD : 95 | ** PURPOSE : write mark for end of nodes list 96 | ** INPUT : 97 | ** OUTPUT : 98 | ** RETURN : 99 | ** REMARKS : 100 | *********************************************************************/ 101 | template 102 | void GraphOutputDot::print_starter_end() // output a single node to a file 103 | { 104 | 105 | } 106 | 107 | /********************************************************************* 108 | ** METHOD : 109 | ** PURPOSE : 110 | ** INPUT : 111 | ** OUTPUT : 112 | ** RETURN : 113 | ** REMARKS : 114 | *********************************************************************/ 115 | template 116 | void GraphOutputDot::print_sequence_head (const string& linear_seqs_name, const string& direction) 117 | { 118 | 119 | } 120 | 121 | /********************************************************************* 122 | ** METHOD : 123 | ** PURPOSE : 124 | ** INPUT : 125 | ** OUTPUT : 126 | ** RETURN : 127 | ** REMARKS : 128 | *********************************************************************/ 129 | template 130 | void GraphOutputDot::print_sequence_end () 131 | { 132 | 133 | } 134 | 135 | /********************************************************************* 136 | ** METHOD : 137 | ** PURPOSE : output a single node to a file 138 | ** INPUT : 139 | ** OUTPUT : 140 | ** RETURN : 141 | ** REMARKS : 142 | *********************************************************************/ 143 | template 144 | void GraphOutputDot::print_node (long index, const string& seq) // output a single node to a file 145 | { 146 | 147 | fprintf(_graph_file,"%ld [label=\"%s\"];\n",index,seq.c_str()); 148 | 149 | } 150 | 151 | /********************************************************************* 152 | ** METHOD : 153 | ** PURPOSE : output a single edges to a file 154 | ** INPUT : 155 | ** OUTPUT : 156 | ** RETURN : 157 | ** REMARKS : 158 | *********************************************************************/ 159 | template 160 | void GraphOutputDot::print_edge (long index, long id, long id2, const string& label, const string& comment) 161 | { 162 | fprintf(_graph_file,"%ld -> %ld [label=\"%s\"];\n",id,id2,label.c_str()); 163 | 164 | } 165 | 166 | // WARNING !!! The following code is not generic !!! 167 | // It is designed to cope with 4 values of supported kmer size. 168 | 169 | template class GraphOutputDot ; 170 | template class GraphOutputDot ; 171 | template class GraphOutputDot ; 172 | template class GraphOutputDot ; 173 | 174 | 175 | -------------------------------------------------------------------------------- /src/FindInsertion.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, P.Marijon 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #ifndef _TOOL_FindInsert_HPP_ 22 | #define _TOOL_FindInsert_HPP_ 23 | 24 | /*******************************************************************************/ 25 | #include 26 | #include 27 | 28 | template 29 | class FindCleanInsertion : public IFindObserver 30 | { 31 | public : 32 | 33 | /** \copydoc IFindObserver::IFindObserver 34 | */ 35 | FindCleanInsertion(FindBreakpoints * find); 36 | 37 | /** \copydoc IFindObserver::update 38 | */ 39 | bool update(); 40 | }; 41 | 42 | template 43 | FindCleanInsertion::FindCleanInsertion(FindBreakpoints * find) : IFindObserver(find){} 44 | 45 | template 46 | bool FindCleanInsertion::update() 47 | { 48 | if((this->_find->kmer_begin().isValid() && this->_find->kmer_end().isValid()) == false) 49 | { 50 | return false; 51 | } 52 | 53 | if(this->_find->gap_stretch_size() == (this->_find->kmer_size()-1)) //Check size of gap 54 | { 55 | // obtains the kmer sequence 56 | string kmer_begin_str = this->_find->model().toString(this->_find->kmer_begin().forward()); 57 | string kmer_end_str = this->_find->model().toString(this->_find->kmer_end().forward()); 58 | 59 | // Check that kmer_begin has at least one out neighbor, if not the breakpoint is not valid 60 | 61 | string kmer_begin_str_1 = this->_find->model().toString(this->_find->het_kmer_history(this->_find->het_kmer_begin_index()-1).kmer); 62 | if ((this->nb_out_branch(this->_find->kmer_begin().forward())==0) || (this->nb_in_branch(this->_find->kmer_end().forward())==0)) 63 | { 64 | return false; 65 | } 66 | else 67 | { 68 | 69 | //position : this->_find->position() is the beginning of the second found kmer after the gap : -2 ie position of the last 0, ie position just before (at the left of) the insertion site (0-based) 70 | this->_find->writeBreakpoint(this->_find->breakpoint_id(), this->_find->chrom_name(), this->_find->position() - 2, kmer_begin_str, kmer_end_str, 0,STR_HOM_TYPE, this->_find->kmer_begin_is_repeated() ,this->_find->kmer_end_is_repeated() ); 71 | 72 | // iterate counter 73 | this->_find->breakpoint_id_iterate(); 74 | this->_find->homo_clean_iterate(); 75 | return true; 76 | } 77 | } 78 | 79 | return false; 80 | } 81 | 82 | template 83 | class FindFuzzyInsertion : public IFindObserver 84 | { 85 | public : 86 | 87 | /** \copydoc IFindObserver::IFindobserver 88 | */ 89 | FindFuzzyInsertion(FindBreakpoints * find); 90 | 91 | /** \copydoc IFindObserver::update 92 | */ 93 | bool update(); 94 | }; 95 | 96 | template 97 | FindFuzzyInsertion::FindFuzzyInsertion(FindBreakpoints * find) : IFindObserver(find){} 98 | 99 | template 100 | bool FindFuzzyInsertion::update() 101 | { 102 | if((this->_find->kmer_begin().isValid() && this->_find->kmer_end().isValid()) == false) 103 | { 104 | return false; 105 | } 106 | 107 | if(this->_find->gap_stretch_size() < this->_find->kmer_size() - 1 && this->_find->gap_stretch_size() >= this->_find->kmer_size() - 1 - this->_find->max_repeat()) 108 | { 109 | // Fuzzy site, position and kmer_end are impacted by the repeat 110 | int repeat_size = this->_find->kmer_size() - 1 - this->_find->gap_stretch_size(); 111 | 112 | // obtains the kmer sequence 113 | string kmer_begin_str = this->_find->model().toString(this->_find->kmer_begin().forward()); 114 | string kmer_end_str = string(&(this->_find->chrom_seq()[this->_find->position() - 1 + repeat_size]), this->_find->kmer_size()); 115 | if ((this->nb_out_branch(this->_find->kmer_begin().forward())==0) || (this->nb_in_branch(this->_find->kmer_end().forward())==0) || (!this->_find->model().codeSeed(&(this->_find->chrom_seq()[this->_find->position() - 1 + repeat_size]),Data::ASCII).isValid())) 116 | { 117 | return false; 118 | } 119 | else 120 | { 121 | //position : this->_find->position() is the beginning of the second found kmer after the gap : -2 ie position of the last 0, ie position just before (at the left of) the insertion site (0-based) 122 | this->_find->writeBreakpoint(this->_find->breakpoint_id(), this->_find->chrom_name(), this->_find->position() - 2 + repeat_size, kmer_begin_str, kmer_end_str, repeat_size, STR_HOM_TYPE, this->_find->kmer_begin_is_repeated() , this->_find->kmer_end_is_repeated()); 123 | 124 | //iterate counter 125 | this->_find->breakpoint_id_iterate(); 126 | this->_find->homo_fuzzy_iterate(); 127 | 128 | return true; 129 | } 130 | } 131 | 132 | return false; 133 | } 134 | 135 | #endif /* _TOOL_FindInsert_HPP_ */ 136 | -------------------------------------------------------------------------------- /data/reference.fasta: -------------------------------------------------------------------------------- 1 | >Seq0 2 | ACCTATAAGGATCCACGTCTTGATGTGACCGACGATGTTATCTGCCTATAGCGAACATTTCCTGGTAGGAATACTATTATATTACCTAAATTCTCCGGATTTCCGTGTTCTTCGGAAGCTTAGGTCACGCGCGTCATACTACAGTAAGTTACTGAATTTTGGAGTAATGCATATCGACCGAGTCCGGGGAATCGTCCGTATCCCCGCCAGGCGTATCCAATAATTAGGTCGGACCCCTAGCGCGTACGAGTCCTATAGCTGACATGCGAATTGGCCGTGCATCGAGCTCAAATGTCCTAGCTTGAGAGTGCGTATCTCACCGATCCCCTGGCTATGCTCCGCGATTCACTAGTAGTTTCACGCCGACAGAGCGAAACCGTGATAGGTCATCATGCCGGTCTGCAGTCACGTGCCCAGCAGTCTCCAATATAGCCACTTATGTCGGAATGTAATCGGCTCCAAATTACTGTGAACCTGATGATCGACGACGCGCTGGGTGCGGGATAGTGTAACTGTCCCGTATAAGGCTTAAATTGAGTATGCAGGATACACTTTTAGTATGACTCGGAAAATCCCACTACAGTATGAACCGTGACTGTCGTTGGGCGGTGCTTGTTGCTATTAACTGGATGGTAAGTCAATTTTAAGTGGAGCCCCCTGCAAGATGGAAGAGAACTATATGTGGAACCGGATGTCACGTCAAGACCACCGCGCACGCTTAACACAACGCATGGTTCGGCGACTGCTCCGGTTCTGGAATTCCTCTTTTTTATCGAGATTAGCATCTCATTAGCGCGCAGTGTGTCCTAGTGTCCCAGAACGTAAAGACCAGATTTGACATTAGTAAGTTGCATTCGCGGGTCTTTAGAGCTCCTACTAAGCCCATTCAGATTGAAAAAGGCGCGCCTGCGGTTCCATGTCATAGTTTAACTTGGTTGAGACGGCGCACGAAAGCATACTACCACCATCCATGTCAAAACTGGCGACGATCTTGCTGGGT 3 | >Seq1 4 | GTTTTATAACAAGTGAAGAAAGAACAGTCATTAGGCCCGAAGGCGTACACGGAATCCGCCTAAACTAATGCTTATGGTCCCTCGTAGGCCCCGACGGCATAGGGTGCCCCGTAATTGCGTAGACAGCGAAAGGAAAGTGTACCGACGACGAGATCAAGTGGCAGTGAACCCCTTGGAAAACTTCATATCGACAGACACTGTCCGAGTGATAAATCTACTATTCGCGGAACGGCTCCATAAACGGTGACTATGACCTCATCAGTTAAATTCATTAACCTATAGATGTTGAGAATCCGCCTAAATTGGAGGACGCCGCGCAAAGCCGGTCAACAGCGTTAGTATGTTGAAAGTTTACTCAGATCGCTTCTGTCGGCCTAAGTTATGAATACAGTCAGGGGTTTTCCTGGATACAGATCTTCGGAAGTCTGCTAGTGCCGGTCACGCCACGTATAAGACTAGAGACCTGACACTCATCTCGGACTGTAATTATTTTCTAAGCTAGCTGTCCTATTGTAAGGTAAGTCGGGGTGCCGGACCTTGGCAGCCGTCCGTACGTACATTAATGCATCAGACCAGTAATCTTGCTCGCTAGAGTCGGATCGGCGAGGTCCTCTCAGCTATGAAAAGTAGAATTCAGTCTTGCCCGGGGTAGGGCTAGAGACAAGGGTTGATATTGTCTGAAATGGCGCTCCCATCCGATAACTAAACTACCGTACGGGTGCACGCGAGTAGCCTTAATCCTGTTGGGAAGGAATTGCAATACTCTCCGAACCAGCTTAGGGCCCCCCGCCGCCGCAATTCGAGCGTTATGCCCGGAGCATTTGCACGATGCCATTAAACTATATCAACTACATATACTAGACTATCATCCTAGGGCGTAGAACTCCCTTTGCGTTACCACTCACAACGCGTGTGTTTCTCGTGCGGTGATCCCGGGAGTATTTTATCGGCGCCCCAACTGCCGGCAACTCGAGTGCACCGGAGCTGAGGAACGCTTCAC 5 | >Seq2 6 | TGCTGCCGATCGCTACGACGTCCTACCTTACACACAACGGGCCGCGTTCATACCCACGTATGAAGACATGCGGTTATCCGTTAGTTGGGGCCCGCGATGGCTGTTGTATGTAATGAGTGGAGTAAAAGAACACGGGAGATGTTCAATTAGGCTCAACCATTTGTATTGAAGGGGTCAAAACCCACCTGTGTAAGTAGCATTGAGTGCTGCTCATGGCCGATTCTCTTACTTTATTTAAGCCCATACCACTGATGGAGGAGACTCCAAGGAATAGTCCAACCGTTGTTCCGGGAAAAACGAACTCGGTCCGCTAGCGTGCTTGTCGTCGCAGCGCAGATTCCCTCCTAAGTTATTTTACCGGTTGACTCAGTCGCAGGGGTGCTGAACGCCATGGGGCGGACCGAGCGGAGATCTTTCAGGTGTAATTGGATGATCTTCACCCTAAGGAGGGAGCGGGTTATTGGGCAGGCCCGACCGCCGCTGGAAAGATGCAAGTATGAACAAGGCATGCGTAAGTTATCGTGAAACCATGATGGCCCCTTACTAGACCAAATGTACTGAATGCGTCTTGATTAGACCCCGAAAAGGCATATCACATCAACTTGTCGTGTTAGAGATGTCGCGAGAACCCTCGCCTATTCACGAAGTGCCCATGAGCCTTTGGGCCTGGTTGACAATATGCGCACCCTGTAGCCTACCAACCCTTCATAATCTATCTTCAATTTAGCCTGTCGAGAACGTCCTAACAAGGCTTTTGGCGCTCCAGATGGGACAGTCACTCTCTAGCATCAACCCATAGTTTTTGAGCTACCCGCCCTCGGTGAGAAGGTAGTATACCCAAACGCGTCCTATGCAGTTTTGGGCTTGAGGAACTAAAATCAATTCGTTAAGTCCAGTAGTCCTTATGTGTGGCCGTTCCAAGAATTAATCATATCGCGGAATCACCGGCGATTCATTTTCCGCACCAGAGCTTTCAGGTGCGGCGCGGCCCTAAAAGGTC 7 | >Seq3 8 | TATTGCGCCCTTCAAGAAGCTTCTGCTGACCGTAGGCGTCTCGGCGGTTTGTACTTTGAAAAATTAGCTGCACTACATCCGATGGGTATCCCTCCTCAATCTCAGCAGACCCGGAAAGCGATAGAATCAGCCACGCGGTCGTCCGGGCTAGGGGCCCTGCGCAAGGAAGGTTGGACAGGGCTAGACCCGGAAGCATCGGCTTTTCCTAAATGGTGACGGAGTTATATAGGGTAAGCCTGATAGCGCGGTAGGTGTAATGGCCATCCCCTCGCCTAGCGTGCGCGCAGACAAGTCCAGTCCCGGAGGAGGCATAGGCCTCATTATCATTTCCCTAGAATCGCTCTTGACATCTAGGTTGTACTAGGGACCAGGCGCCCAAAGCGGACGGTTCTCCGTGCTTTCGTGCCGTTTCAGCGTAAGATGCTATTTTTTGGGGAAATGGTCGGCGTGTGCGGGGGAGAACCACGGTACCAACTACGATAAGTCCGTCGTGTAACTTACGTGAAGGTGCTGTGAAGCAGGAATCCGTGCCAAAATGTCCGTGCGATATCCAACTTTCATAGTATTACACGAGAGCCTATGATTTGCCCAGGCGCGACCCGTGAATCGAGGTAATCGCCGACCAGATATTGCGAAACACCACATTACATGACTACTGTCCGCTTGAAGAGTTATATACTTGACAGTCCTGGTTGACGGCACAGCATATCTCCAATGTGTGGTTTAAAGTCTCACGTTCTTCATGCGCGCCGGCCCATGGGAACAGGTATCCTTACTTTCGGTACAAATGAGGCTCCAAAATAGCACGCTTGCAGCAGTCAAGTTGAACGCCTTAAAAGGCACCGCCGCTCGTTCATTGGGATTCCTTGAGAATCGTGACTTGTTACACTATAAGATCATGGATTGGACAAAATAGGCCAACTCCCGCACGCTGTGGCTATTCTTAAGTTGCATAGGTGGGAGTAGCCTTATACTCGATTTCTAAAAAGAGTAGGTGAGC 9 | >Seq4 10 | TTCCGGCGCCGCACTAATTGAAGTGGTGAGCTGACCAGTCGTTCAGGATCCGAAGGCGGGGATGGCGCTATAGGAGCCGGCAGGTATGCTTTGCCGCAAAATTTCGGGGTGGTGGAACCGTCTTACCGAAAGTTAGCTACAGCCTGGAATGTGAAATTCCATGACCTGCCCGTCCTGTGTCCACAGGGCGACATTTGCCACGTAGGTAGGGCGACCATTAGAATGCTGCATTATCGGGCGATAAAAAGTTTTATACCCAAGAATCCTACAAAGATGAAAATTTCGAAGAGCTGCACGCAGTTGTAAGTTGCTTTTCTGGGGTAATCGAGATTCTCCACCATAACCTGCGCAATGCATCGTGAAGCTTTACCGCGCCCAAGGGGAGCGTCTCAGTGGGGTTGCCTCCAGGGATATATTGAAAGTTGAAGAAGAAGATCACAGGTTAAGCGGTATGTTAAGTTAGAACTCACGGGGAGCCGCCTTGATTTTGTTCGACATGAACCAGAGACCAAGTGTGTTATGTTCTGGAACCTTAATACGTACGTCGCCAGCACCGAGCCGGCACTCCATCTCTTTTGGGTGCGCAACATTGCTATACTTAGGATCCATTGACATCTGTCAGCCGTCTTTCCAGAACGTTATAAGACTCGTGAGGAAATTATACAAATCGTTGCCATCATCCAAAGCAAAGTACTTCCGCTTAGGAGTGCCTTGAAGAACCGATTATCTCTGACAATGTAATGCCACAGCACCCTCGACAAAGTTCTACATTCGTTCCAGGTCATGATACAGCGCGCTAAATTACCGCTACGAGCCATACCCCGAACATTGAGACCTGGCCAGTAGGTAGGTGTCAAATCGATATCCACACCTGTCGAAGCAGCTAGGGACCTAGACGCAACAGTAACCGCCTCGGAGTAAGCCCTGGTAAAGATCGGTTGCGGCGGGAGTCCTCCATTCAGGCCAAACGTGCAGTGCTCGATGTGCTTCCTATCGCTCT 11 | >Seq5 12 | GATGTTTAGAAGTTTCCAGGTCACGCCAATGATTGGCATTTACACACGTGGATCAGCGGACATATCTAACCCTTAGTGTTCTTAAGAGCAACTCACTACTCATTTCCACTAACCCCGCCGGCGGTAATTCCAATCTAGTTGATCAGACTTCCCAGTCAATGAAAGCGACACCGTGCGTCTGTAATACCAACAAGACCCTGCTGTCGTCCCGCAGAGGACGCGGCACCTCCGGATTTTGAGTCCAGTCTGAACGATTTTCGATCACTCACCATGGATCTGGAAAACGGAGTCGAGTACTCAGAGCCAAATTGATGCATTTCCAATGACCCGATGCAGGTGCGACCGATCTTCGCCTATGCTTCCCGCCGTAATTATTGAGTCTGGGTCCCGGCCGCTAACGTTGACTCACGGGGAGGTACCCGTGCGTATTCTTCTCAAAGTGACGCTGGACAGCAGCGCATGTCCGAGCCCCATCGTCCTATCTGGTGTAGAGTCTTACCCTAATTAGAGTGATCGAACCAGTAGGTGTCGCGGTCTTAGGGCTCCCATTGTCCAAGGGAACGTGAACAGATATGAATCTGGGAGAATAGTGCAGCGTTGCCCTTCTGGTCGGTCAGCCCTTGCCTACGGCCCGTATGCGGAGAATGAAGGCGTGAAACATTCTGCTCTTTTAGAAGCAGCGGCTGCACCCGTATAACAATCGCACGATCGTACGTCTCATTTGCCGCGTTGGCGCGCCCGTGGATGATGGACCACGGTATGAACCTCTGCACTTCAAATTTGACGCAATCCTGCACTCACCGCACACAGTTCTAGTCTAACCGTCGCAGTGTCTGCTTTAAGGTAGAGATCGATACTTAGGATATGTTCATGTGTGTTTGTAGCGCTGGACCCTCTTATGGTGTGGTCACTTGTGATGGATCGAGGAACTTAGGCGGTTAACTTGTTTCGACGTCTCACCGACAATATCAGGATTTAGTATCG 13 | >Seq6 14 | ACCGAAAATGACAATGTTCACACGCATGCTCGGCGTGGAAAAGAGCCTTTTCTAAGACCGACTCGTTCCGGGCAGCAGGATTATTAGCCAATCAAAATTATCGACCGGTCATCAAGCTGCGATAGTGCAGGCGCATGCCGTCCAATGGGTCCACGGCGGAAGTGCGTTCGTCTACTCTGTCAAATCTTAACATTTTTTGAGGCTAATCCGGCCGGTAGTGTACCGTGAACCAAAGTCCTTCTACGAGCGTATTAGATTGCTCAAAAGATCCGGGAGAATTGACCAGGTCGTATCTTTAAAAACGCTGGTGCGAGCAGCTGCTGTTTTATCAACACCCATTTAGTCCTGTGAAGTTTGCTTAGCAGATACACCTTCCCGCGTGGTATGAGAGGCTGTTCTTTTAAAAACTATGAGGCTCTGGCACCTTCGACGCTAACAAAGTCCCCACGGACCATGATACCCTTACGCAACTCTCTTTGCACGCTAGGGCGAGAGTACTGCCCCTAGACTAGGTACACGCCGGGTAAACTCTCTCGCACACCTTTACGCTCGACTACAGGCTTCTAACCCTTCCGAACGCATATAATTCAAATGGCACTTAGTAACAGACGAATCACGGCTCACAGGCAGAATTCACTGGAGTAAAAGGATTCAGAACAATAGATAGTGTGTTAACTTTACAGTCATCCGTATTATAACGTAGCGAGAGGATTGAGTTCTTGTTAGGAAGGAAGGTCCTATAGACGAGTGCGGTAGCGCACCCGGTCGCCTTGCGTAGTCATGCCCGACGTGTTGATGGTTCCCTTTTAGCCGCCACACAAGGGATCCGAGGGTGAGAGACACATGGCCCTCACCGACGAGACTTACTCAGCCTGCCTCGCTATTGCCCTCTTTTTGATCGTCCCTTTGTGGCTCTCGAGGACTCGTGCAGCGTGTATCTGGGGATTTGTAAGCTTAAGACTACCTTCCATAGGA 15 | -------------------------------------------------------------------------------- /test/full_test/reference.fasta: -------------------------------------------------------------------------------- 1 | >Seq0 2 | ACCTATAAGGATCCACGTCTTGATGTGACCGACGATGTTATCTGCCTATAGCGAACATTTCCTGGTAGGAATACTATTATATTACCTAAATTCTCCGGATTTCCGTGTTCTTCGGAAGCTTAGGTCACGCGCGTCATACTACAGTAAGTTACTGAATTTTGGAGTAATGCATATCGACCGAGTCCGGGGAATCGTCCGTATCCCCGCCAGGCGTATCCAATAATTAGGTCGGACCCCTAGCGCGTACGAGTCCTATAGCTGACATGCGAATTGGCCGTGCATCGAGCTCAAATGTCCTAGCTTGAGAGTGCGTATCTCACCGATCCCCTGGCTATGCTCCGCGATTCACTAGTAGTTTCACGCCGACAGAGCGAAACCGTGATAGGTCATCATGCCGGTCTGCAGTCACGTGCCCAGCAGTCTCCAATATAGCCACTTATGTCGGAATGTAATCGGCTCCAAATTACTGTGAACCTGATGATCGACGACGCGCTGGGTGCGGGATAGTGTAACTGTCCCGTATAAGGCTTAAATTGAGTATGCAGGATACACTTTTAGTATGACTCGGAAAATCCCACTACAGTATGAACCGTGACTGTCGTTGGGCGGTGCTTGTTGCTATTAACTGGATGGTAAGTCAATTTTAAGTGGAGCCCCCTGCAAGATGGAAGAGAACTATATGTGGAACCGGATGTCACGTCAAGACCACCGCGCACGCTTAACACAACGCATGGTTCGGCGACTGCTCCGGTTCTGGAATTCCTCTTTTTTATCGAGATTAGCATCTCATTAGCGCGCAGTGTGTCCTAGTGTCCCAGAACGTAAAGACCAGATTTGACATTAGTAAGTTGCATTCGCGGGTCTTTAGAGCTCCTACTAAGCCCATTCAGATTGAAAAAGGCGCGCCTGCGGTTCCATGTCATAGTTTAACTTGGTTGAGACGGCGCACGAAAGCATACTACCACCATCCATGTCAAAACTGGCGACGATCTTGCTGGGT 3 | >Seq1 4 | GTTTTATAACAAGTGAAGAAAGAACAGTCATTAGGCCCGAAGGCGTACACGGAATCCGCCTAAACTAATGCTTATGGTCCCTCGTAGGCCCCGACGGCATAGGGTGCCCCGTAATTGCGTAGACAGCGAAAGGAAAGTGTACCGACGACGAGATCAAGTGGCAGTGAACCCCTTGGAAAACTTCATATCGACAGACACTGTCCGAGTGATAAATCTACTATTCGCGGAACGGCTCCATAAACGGTGACTATGACCTCATCAGTTAAATTCATTAACCTATAGATGTTGAGAATCCGCCTAAATTGGAGGACGCCGCGCAAAGCCGGTCAACAGCGTTAGTATGTTGAAAGTTTACTCAGATCGCTTCTGTCGGCCTAAGTTATGAATACAGTCAGGGGTTTTCCTGGATACAGATCTTCGGAAGTCTGCTAGTGCCGGTCACGCCACGTATAAGACTAGAGACCTGACACTCATCTCGGACTGTAATTATTTTCTAAGCTAGCTGTCCTATTGTAAGGTAAGTCGGGGTGCCGGACCTTGGCAGCCGTCCGTACGTACATTAATGCATCAGACCAGTAATCTTGCTCGCTAGAGTCGGATCGGCGAGGTCCTCTCAGCTATGAAAAGTAGAATTCAGTCTTGCCCGGGGTAGGGCTAGAGACAAGGGTTGATATTGTCTGAAATGGCGCTCCCATCCGATAACTAAACTACCGTACGGGTGCACGCGAGTAGCCTTAATCCTGTTGGGAAGGAATTGCAATACTCTCCGAACCAGCTTAGGGCCCCCCGCCGCCGCAATTCGAGCGTTATGCCCGGAGCATTTGCACGATGCCATTAAACTATATCAACTACATATACTAGACTATCATCCTAGGGCGTAGAACTCCCTTTGCGTTACCACTCACAACGCGTGTGTTTCTCGTGCGGTGATCCCGGGAGTATTTTATCGGCGCCCCAACTGCCGGCAACTCGAGTGCACCGGAGCTGAGGAACGCTTCAC 5 | >Seq2 6 | TGCTGCCGATCGCTACGACGTCCTACCTTACACACAACGGGCCGCGTTCATACCCACGTATGAAGACATGCGGTTATCCGTTAGTTGGGGCCCGCGATGGCTGTTGTATGTAATGAGTGGAGTAAAAGAACACGGGAGATGTTCAATTAGGCTCAACCATTTGTATTGAAGGGGTCAAAACCCACCTGTGTAAGTAGCATTGAGTGCTGCTCATGGCCGATTCTCTTACTTTATTTAAGCCCATACCACTGATGGAGGAGACTCCAAGGAATAGTCCAACCGTTGTTCCGGGAAAAACGAACTCGGTCCGCTAGCGTGCTTGTCGTCGCAGCGCAGATTCCCTCCTAAGTTATTTTACCGGTTGACTCAGTCGCAGGGGTGCTGAACGCCATGGGGCGGACCGAGCGGAGATCTTTCAGGTGTAATTGGATGATCTTCACCCTAAGGAGGGAGCGGGTTATTGGGCAGGCCCGACCGCCGCTGGAAAGATGCAAGTATGAACAAGGCATGCGTAAGTTATCGTGAAACCATGATGGCCCCTTACTAGACCAAATGTACTGAATGCGTCTTGATTAGACCCCGAAAAGGCATATCACATCAACTTGTCGTGTTAGAGATGTCGCGAGAACCCTCGCCTATTCACGAAGTGCCCATGAGCCTTTGGGCCTGGTTGACAATATGCGCACCCTGTAGCCTACCAACCCTTCATAATCTATCTTCAATTTAGCCTGTCGAGAACGTCCTAACAAGGCTTTTGGCGCTCCAGATGGGACAGTCACTCTCTAGCATCAACCCATAGTTTTTGAGCTACCCGCCCTCGGTGAGAAGGTAGTATACCCAAACGCGTCCTATGCAGTTTTGGGCTTGAGGAACTAAAATCAATTCGTTAAGTCCAGTAGTCCTTATGTGTGGCCGTTCCAAGAATTAATCATATCGCGGAATCACCGGCGATTCATTTTCCGCACCAGAGCTTTCAGGTGCGGCGCGGCCCTAAAAGGTC 7 | >Seq3 8 | TATTGCGCCCTTCAAGAAGCTTCTGCTGACCGTAGGCGTCTCGGCGGTTTGTACTTTGAAAAATTAGCTGCACTACATCCGATGGGTATCCCTCCTCAATCTCAGCAGACCCGGAAAGCGATAGAATCAGCCACGCGGTCGTCCGGGCTAGGGGCCCTGCGCAAGGAAGGTTGGACAGGGCTAGACCCGGAAGCATCGGCTTTTCCTAAATGGTGACGGAGTTATATAGGGTAAGCCTGATAGCGCGGTAGGTGTAATGGCCATCCCCTCGCCTAGCGTGCGCGCAGACAAGTCCAGTCCCGGAGGAGGCATAGGCCTCATTATCATTTCCCTAGAATCGCTCTTGACATCTAGGTTGTACTAGGGACCAGGCGCCCAAAGCGGACGGTTCTCCGTGCTTTCGTGCCGTTTCAGCGTAAGATGCTATTTTTTGGGGAAATGGTCGGCGTGTGCGGGGGAGAACCACGGTACCAACTACGATAAGTCCGTCGTGTAACTTACGTGAAGGTGCTGTGAAGCAGGAATCCGTGCCAAAATGTCCGTGCGATATCCAACTTTCATAGTATTACACGAGAGCCTATGATTTGCCCAGGCGCGACCCGTGAATCGAGGTAATCGCCGACCAGATATTGCGAAACACCACATTACATGACTACTGTCCGCTTGAAGAGTTATATACTTGACAGTCCTGGTTGACGGCACAGCATATCTCCAATGTGTGGTTTAAAGTCTCACGTTCTTCATGCGCGCCGGCCCATGGGAACAGGTATCCTTACTTTCGGTACAAATGAGGCTCCAAAATAGCACGCTTGCAGCAGTCAAGTTGAACGCCTTAAAAGGCACCGCCGCTCGTTCATTGGGATTCCTTGAGAATCGTGACTTGTTACACTATAAGATCATGGATTGGACAAAATAGGCCAACTCCCGCACGCTGTGGCTATTCTTAAGTTGCATAGGTGGGAGTAGCCTTATACTCGATTTCTAAAAAGAGTAGGTGAGC 9 | >Seq4 10 | TTCCGGCGCCGCACTAATTGAAGTGGTGAGCTGACCAGTCGTTCAGGATCCGAAGGCGGGGATGGCGCTATAGGAGCCGGCAGGTATGCTTTGCCGCAAAATTTCGGGGTGGTGGAACCGTCTTACCGAAAGTTAGCTACAGCCTGGAATGTGAAATTCCATGACCTGCCCGTCCTGTGTCCACAGGGCGACATTTGCCACGTAGGTAGGGCGACCATTAGAATGCTGCATTATCGGGCGATAAAAAGTTTTATACCCAAGAATCCTACAAAGATGAAAATTTCGAAGAGCTGCACGCAGTTGTAAGTTGCTTTTCTGGGGTAATCGAGATTCTCCACCATAACCTGCGCAATGCATCGTGAAGCTTTACCGCGCCCAAGGGGAGCGTCTCAGTGGGGTTGCCTCCAGGGATATATTGAAAGTTGAAGAAGAAGATCACAGGTTAAGCGGTATGTTAAGTTAGAACTCACGGGGAGCCGCCTTGATTTTGTTCGACATGAACCAGAGACCAAGTGTGTTATGTTCTGGAACCTTAATACGTACGTCGCCAGCACCGAGCCGGCACTCCATCTCTTTTGGGTGCGCAACATTGCTATACTTAGGATCCATTGACATCTGTCAGCCGTCTTTCCAGAACGTTATAAGACTCGTGAGGAAATTATACAAATCGTTGCCATCATCCAAAGCAAAGTACTTCCGCTTAGGAGTGCCTTGAAGAACCGATTATCTCTGACAATGTAATGCCACAGCACCCTCGACAAAGTTCTACATTCGTTCCAGGTCATGATACAGCGCGCTAAATTACCGCTACGAGCCATACCCCGAACATTGAGACCTGGCCAGTAGGTAGGTGTCAAATCGATATCCACACCTGTCGAAGCAGCTAGGGACCTAGACGCAACAGTAACCGCCTCGGAGTAAGCCCTGGTAAAGATCGGTTGCGGCGGGAGTCCTCCATTCAGGCCAAACGTGCAGTGCTCGATGTGCTTCCTATCGCTCT 11 | >Seq5 12 | GATGTTTAGAAGTTTCCAGGTCACGCCAATGATTGGCATTTACACACGTGGATCAGCGGACATATCTAACCCTTAGTGTTCTTAAGAGCAACTCACTACTCATTTCCACTAACCCCGCCGGCGGTAATTCCAATCTAGTTGATCAGACTTCCCAGTCAATGAAAGCGACACCGTGCGTCTGTAATACCAACAAGACCCTGCTGTCGTCCCGCAGAGGACGCGGCACCTCCGGATTTTGAGTCCAGTCTGAACGATTTTCGATCACTCACCATGGATCTGGAAAACGGAGTCGAGTACTCAGAGCCAAATTGATGCATTTCCAATGACCCGATGCAGGTGCGACCGATCTTCGCCTATGCTTCCCGCCGTAATTATTGAGTCTGGGTCCCGGCCGCTAACGTTGACTCACGGGGAGGTACCCGTGCGTATTCTTCTCAAAGTGACGCTGGACAGCAGCGCATGTCCGAGCCCCATCGTCCTATCTGGTGTAGAGTCTTACCCTAATTAGAGTGATCGAACCAGTAGGTGTCGCGGTCTTAGGGCTCCCATTGTCCAAGGGAACGTGAACAGATATGAATCTGGGAGAATAGTGCAGCGTTGCCCTTCTGGTCGGTCAGCCCTTGCCTACGGCCCGTATGCGGAGAATGAAGGCGTGAAACATTCTGCTCTTTTAGAAGCAGCGGCTGCACCCGTATAACAATCGCACGATCGTACGTCTCATTTGCCGCGTTGGCGCGCCCGTGGATGATGGACCACGGTATGAACCTCTGCACTTCAAATTTGACGCAATCCTGCACTCACCGCACACAGTTCTAGTCTAACCGTCGCAGTGTCTGCTTTAAGGTAGAGATCGATACTTAGGATATGTTCATGTGTGTTTGTAGCGCTGGACCCTCTTATGGTGTGGTCACTTGTGATGGATCGAGGAACTTAGGCGGTTAACTTGTTTCGACGTCTCACCGACAATATCAGGATTTAGTATCG 13 | >Seq6 14 | ACCGAAAATGACAATGTTCACACGCATGCTCGGCGTGGAAAAGAGCCTTTTCTAAGACCGACTCGTTCCGGGCAGCAGGATTATTAGCCAATCAAAATTATCGACCGGTCATCAAGCTGCGATAGTGCAGGCGCATGCCGTCCAATGGGTCCACGGCGGAAGTGCGTTCGTCTACTCTGTCAAATCTTAACATTTTTTGAGGCTAATCCGGCCGGTAGTGTACCGTGAACCAAAGTCCTTCTACGAGCGTATTAGATTGCTCAAAAGATCCGGGAGAATTGACCAGGTCGTATCTTTAAAAACGCTGGTGCGAGCAGCTGCTGTTTTATCAACACCCATTTAGTCCTGTGAAGTTTGCTTAGCAGATACACCTTCCCGCGTGGTATGAGAGGCTGTTCTTTTAAAAACTATGAGGCTCTGGCACCTTCGACGCTAACAAAGTCCCCACGGACCATGATACCCTTACGCAACTCTCTTTGCACGCTAGGGCGAGAGTACTGCCCCTAGACTAGGTACACGCCGGGTAAACTCTCTCGCACACCTTTACGCTCGACTACAGGCTTCTAACCCTTCCGAACGCATATAATTCAAATGGCACTTAGTAACAGACGAATCACGGCTCACAGGCAGAATTCACTGGAGTAAAAGGATTCAGAACAATAGATAGTGTGTTAACTTTACAGTCATCCGTATTATAACGTAGCGAGAGGATTGAGTTCTTGTTAGGAAGGAAGGTCCTATAGACGAGTGCGGTAGCGCACCCGGTCGCCTTGCGTAGTCATGCCCGACGTGTTGATGGTTCCCTTTTAGCCGCCACACAAGGGATCCGAGGGTGAGAGACACATGGCCCTCACCGACGAGACTTACTCAGCCTGCCTCGCTATTGCCCTCTTTTTGATCGTCCCTTTGTGGCTCTCGAGGACTCGTGCAGCGTGTATCTGGGGATTTGTAAGCTTAAGACTACCTTCCATAGGA 15 | -------------------------------------------------------------------------------- /src/FindDeletion.hpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, P.Marijon 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #ifndef _TOOL_FindDeletion_HPP_ 22 | #define _TOOL_FindDeletion_HPP_ 23 | 24 | /*****************************************************************************/ 25 | #include 26 | #include 27 | 28 | 29 | template 30 | class FindDeletion : public IFindObserver 31 | { 32 | public : 33 | typedef typename gatb::core::kmer::impl::Kmer Kmer; 34 | 35 | typedef typename Kmer::ModelCanonical KmerModel; 36 | typedef typename KmerModel::Iterator KmerIterator; 37 | 38 | public: 39 | 40 | /** \copydoc IFindObserver 41 | */ 42 | FindDeletion(FindBreakpoints * find); 43 | 44 | /** \copydoc IFindObserver::IFindObserver 45 | */ 46 | bool update(); 47 | 48 | private: 49 | 50 | /** Detect if the end of a kmer is equal to the begin of other 51 | * \param[in] begin first kmer 52 | * \param[in] end the other kmer 53 | * \return The size of repetition 54 | */ 55 | unsigned int fuzzy_site(std::string begin, std::string end); 56 | }; 57 | 58 | template 59 | FindDeletion::FindDeletion(FindBreakpoints * find) : IFindObserver(find){} 60 | 61 | template 62 | bool FindDeletion::update() 63 | { 64 | if((this->_find->kmer_begin().isValid() && this->_find->kmer_end().isValid()) == false) 65 | { 66 | return false; 67 | } 68 | if( this->_find->gap_stretch_size() < (this->_find->kmer_size() - this->_find->max_repeat() ) ) 69 | { 70 | return false; 71 | } 72 | 73 | // Test if deletion is a fuzzy deletion 74 | std::string begin = this->_find->model().toString(this->_find->kmer_begin().forward()); 75 | std::string end = this->_find->model().toString(this->_find->kmer_end().forward()); 76 | 77 | unsigned int repeat_size = this->fuzzy_site(begin, end); 78 | 79 | if(repeat_size > (unsigned)this->_find->max_repeat()) 80 | { 81 | return false; 82 | } 83 | 84 | if(repeat_size != 0) 85 | { 86 | begin = begin.substr(0, begin.length() - repeat_size); 87 | } 88 | 89 | // Compute del_size 90 | int del_size = (int) this->_find->gap_stretch_size() - (int) this->_find->kmer_size() + (int) repeat_size + 1; 91 | //was size_t, caused computation bug 92 | 93 | 94 | // Create a sequence maybe is in graphe 95 | std::string seq = begin + end; 96 | 97 | // Create variable required for iterate on kmer 98 | KmerModel local_m(this->_find->kmer_size()); 99 | KmerIterator local_it(local_m); 100 | Data local_d(const_cast(seq.c_str())); 101 | 102 | // Init this variable 103 | local_d.setRef(const_cast(seq.c_str()), (size_t)seq.length()); 104 | local_it.setData(local_d); 105 | 106 | bool is_deletion = true; 107 | for(local_it.first(); !local_it.isDone(); local_it.next()) 108 | { 109 | if(!this->contains(local_it->forward())) 110 | { 111 | is_deletion = false; 112 | break; 113 | } 114 | } 115 | 116 | if(is_deletion == false) 117 | { 118 | if(repeat_size == 0) 119 | { 120 | return false; 121 | } 122 | else // Maybee isn't a fuzzy deletion 123 | { 124 | seq = this->_find->model().toString(this->_find->kmer_begin().forward()) + end; 125 | local_d.setRef(const_cast(seq.c_str()), (size_t)seq.length()); 126 | local_it.setData(local_d); 127 | 128 | for(local_it.first(); !local_it.isDone(); local_it.next()) 129 | { 130 | if(!this->contains(local_it->forward())) 131 | { 132 | return false; 133 | } 134 | } 135 | 136 | del_size -= repeat_size; 137 | repeat_size = 0; 138 | } 139 | } 140 | 141 | //printf("FindDeletion repeat_size %u del_size %i %i %llu\n",repeat_size,del_size,this->_find->position(),this->_find->position()); 142 | 143 | if(del_size<=0) return false; //just in case 144 | 145 | // Write the breakpoint 146 | //this->_find->writeBreakpoint(this->_find->breakpoint_id(), this->_find->chrom_name(), this->_find->position() - del_size - 1, begin, end, repeat_size, STR_DEL_TYPE); 147 | //NOTE : position will always be the left-most when repeat_size>0. 148 | size_t del_start_pos = this->_find->position() - 2 - del_size; //begining position of the deletion -1 (0-based): because in VCF we need to put the letter just before the deleted sequence 149 | 150 | //cout << "start pos = " << del_start_pos << "size = " << del_size << endl; 151 | char *del_sequence = new char[del_size+2]; 152 | sprintf(del_sequence,"%.*s", del_size+1, this->_find->chrom_seq()+del_start_pos); 153 | char *alt_char = new char[2]; 154 | sprintf(alt_char,"%.*s", 1, del_sequence); 155 | //cout << del_sequence << endl; 156 | //cout << alt_char << endl; 157 | // here position is 0-based 158 | this->_find->writeVcfVariant(this->_find->breakpoint_id(), 159 | this->_find->chrom_name(), 160 | del_start_pos, del_sequence, alt_char, repeat_size, STR_DEL_TYPE); 161 | 162 | delete [](del_sequence); 163 | delete [] (alt_char); 164 | 165 | this->_find->breakpoint_id_iterate(); 166 | 167 | if(repeat_size != 0) 168 | this->_find->fuzzy_deletion_iterate(); 169 | else 170 | this->_find->clean_deletion_iterate(); 171 | 172 | return true; 173 | } 174 | 175 | /* 176 | with max_repeat = 5 177 | good case 1 + 5 + 1 = 6 operation exemple AAAAATTCGG TTCGGCCCCC 178 | */ 179 | template 180 | unsigned int FindDeletion::fuzzy_site(std::string begin, std::string end) 181 | { 182 | for(unsigned int i = this->_find->max_repeat(); i != 0; i--) 183 | for(unsigned int j = 1; begin.substr(begin.length() - i, j) == end.substr(0, j); j++) 184 | if(i == j) 185 | return j; 186 | 187 | return 0; 188 | } 189 | 190 | #endif /* _TOOL_FindDeletion_HPP_ */ 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # MindTheGap: Integrated detection and assembly of insertion variants 3 | # A tool from the GATB (Genome Assembly Tool Box) 4 | # Copyright (C) 2014 INRIA 5 | # Authors: C.Lemaitre, G. Rizk 6 | # 7 | # This program is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU Affero General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU Affero General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU Affero General Public License 18 | # along with this program. If not, see . 19 | ################################################################################ 20 | 21 | project(MindTheGap) 22 | 23 | cmake_minimum_required(VERSION 3.1) 24 | 25 | ################################################################################ 26 | # The version number. 27 | ################################################################################ 28 | # The default version number is the latest official build 29 | SET (gatb-tool_VERSION_MAJOR 2) 30 | SET (gatb-tool_VERSION_MINOR 3) 31 | SET (gatb-tool_VERSION_PATCH 0) 32 | 33 | # But, it is possible to define another release number during a local build 34 | IF (DEFINED MAJOR) 35 | SET (gatb-tool_VERSION_MAJOR ${MAJOR}) 36 | ENDIF() 37 | IF (DEFINED MINOR) 38 | SET (gatb-tool_VERSION_MINOR ${MINOR}) 39 | ENDIF() 40 | IF (DEFINED PATCH) 41 | SET (gatb-tool_VERSION_PATCH ${PATCH}) 42 | ENDIF() 43 | 44 | set (gatb-tool-version ${gatb-tool_VERSION_MAJOR}.${gatb-tool_VERSION_MINOR}.${gatb-tool_VERSION_PATCH}) 45 | 46 | # However, continuous integration has priority over local compilation 47 | IF (DEFINED JENKINS_TAG) 48 | SET (gatb-tool-version ${JENKINS_TAG}) 49 | ENDIF() 50 | 51 | ################################################################################ 52 | # Define cmake modules directory 53 | ################################################################################ 54 | SET (GATB_CORE_HOME ${PROJECT_SOURCE_DIR}/thirdparty/gatb-core/gatb-core) 55 | SET (CMAKE_MODULE_PATH ${GATB_CORE_HOME}/cmake) 56 | 57 | ################################################################################ 58 | # SUPPORTED KMER SIZES 59 | ################################################################################ 60 | 61 | # One can uncomment this line and set the wanted values 62 | #set (KSIZE_LIST "32 64 96 128 160 192 224 256") 63 | 64 | ################################################################################ 65 | # THIRD PARTIES 66 | ################################################################################ 67 | 68 | # We don't want to install some GATB-CORE artifacts 69 | #SET (GATB_CORE_EXCLUDE_TOOLS 1) 70 | SET (GATB_CORE_EXCLUDE_TESTS 1) 71 | SET (GATB_CORE_EXCLUDE_EXAMPLES 1) 72 | 73 | # GATB CORE 74 | include (GatbCore) 75 | 76 | ################################################################################ 77 | # TOOL 78 | ################################################################################ 79 | 80 | 81 | # We also set a flag for TR1 management 82 | if (use_new_cxx) 83 | set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NEW_CXX ") 84 | endif() 85 | 86 | message("-- cxx: ${CMAKE_CXX_FLAGS}") 87 | 88 | 89 | 90 | # we get compilation definitions from the gatb-core part 91 | add_definitions (${gatb-core-flags}) 92 | 93 | # SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra" ) 94 | SET( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") 95 | 96 | # we give the headers directories from : 97 | # - from project source 98 | # - from GATB-CORE source 99 | # - from dsk source 100 | set (PROGRAM_SOURCE_DIR ${PROJECT_SOURCE_DIR}/src) 101 | 102 | include_directories (${PROGRAM_SOURCE_DIR} ${gatb-core-includes}) 103 | 104 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 105 | 106 | # we define the files to be compiled 107 | file (GLOB ProjectFiles src/*) 108 | 109 | 110 | # we define the artifact to be built: the project binary 111 | add_executable (${PROJECT_NAME} src/main.cpp ${ProjectFiles}) 112 | add_executable(nwalign src/nwAlign/nwalign.cpp) 113 | 114 | # we define which libraries to be linked with project binary 115 | target_link_libraries (${PROJECT_NAME} ${gatb-core-libraries}) 116 | target_link_libraries (nwalign ${gatb-core-libraries}) 117 | 118 | 119 | 120 | ################################################################################ 121 | # PACKAGING 122 | ################################################################################ 123 | 124 | # We set the version number 125 | SET (CPACK_PACKAGE_DESCRIPTION_SUMMARY "gatb-tool ${PROJECT_NAME}") 126 | SET (CPACK_PACKAGE_VENDOR "Genscale team (INRIA)") 127 | SET (CPACK_PACKAGE_VERSION_MAJOR "${gatb-tool_VERSION_MAJOR}") 128 | SET (CPACK_PACKAGE_VERSION_MINOR "${gatb-tool_VERSION_MINOR}") 129 | SET (CPACK_PACKAGE_VERSION_PATCH "${gatb-tool_VERSION_PATCH}") 130 | SET (CPACK_PACKAGE_VERSION "${gatb-tool-version}") 131 | 132 | # We set the kind of archive 133 | SET (CPACK_GENERATOR "TGZ") 134 | SET (CPACK_SOURCE_GENERATOR "TGZ") 135 | 136 | # We ignore unwated files for the source archive 137 | SET (CPACK_SOURCE_IGNORE_FILES 138 | "^${PROJECT_SOURCE_DIR}/\\.git/" ; 139 | "^${PROJECT_SOURCE_DIR}/\\.gitmodules" ; 140 | "^${PROJECT_SOURCE_DIR}/\\.gitignore"; 141 | "^${PROJECT_SOURCE_DIR}/build/" ; 142 | "^${GATB_CORE_HOME}/\\.cproject" ; 143 | "^${GATB_CORE_HOME}/\\.git/" ; 144 | "^${GATB_CORE_HOME}/\\.project" ; 145 | "^${GATB_CORE_HOME}/\\.gitignore"; 146 | "^${GATB_CORE_HOME}/doc/design" ; 147 | "^${GATB_CORE_HOME}/DELIVERY.md" 148 | ) 149 | 150 | # We copy the project binary to the 'bin' directory 151 | INSTALL (TARGETS ${PROJECT_NAME} DESTINATION bin) 152 | INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test DESTINATION .) 153 | INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/data DESTINATION .) 154 | #INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doc DESTINATION .) 155 | INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION .) 156 | INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/CHANGELOG.md DESTINATION .) 157 | INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE DESTINATION .) 158 | 159 | # We include the "bin" tag into binary archive file name 160 | set (CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME}-${CPACK_PACKAGE_VERSION}-bin-${CMAKE_SYSTEM_NAME}) 161 | 162 | include (CPack) 163 | -------------------------------------------------------------------------------- /test/contig_test/genome-variant.fasta: -------------------------------------------------------------------------------- 1 | >Seq0 2 | GACTAATCCACTGCCTCAGGGCCATAGGATCTGGAAGCGTAATGTATTCGTGCGTCTCATTTGTCCTGGTTCTTTGGGGACGTACAATGCAATCTCAGGTACACATGCGAGAGTCTGTTATGTGCTTCACAAGGTCAAGTATACCAGAATGTAGCTCCCTCAATTGGATATTCGCGGTAATGGTACCGATGTGCATGTGAGAGATCAAGTCTTCGTCGGAGAGTTGTAGTTGATTCCTCCCAGATGAGGACTCCTGTCCTCTTGTACTCGCCGAGTAGGAAGTAGAGGCCTGTTAGGCATGATTTGCTGAAACGCCGGCTAAACTTTCAGTTTTACCCACTTGGCGAGGCAGCGTGGGTAGAACTATCCGGATCGAGGGTACGTTAGACATGGGCCTACGGGTAGCCTCTAGGGTTGACCCCGTGAGGTGTCTTACAAGAATGCTGAGGTGGAGCCCGCCAACGGGGGCTGTGGATACCCCATCTGTACCTATTCTCGCTACAGAGTTAGTGCATAGTATGGCTCTTTGGATTTGGAAAAGCACGAACAACCGGTTGTATTTCCTAGGATAACCTCAGCTGTACACGTGCGTGCGCAGGGCAATAGTTGTCAGAAGCCTCTTCTTGGACGTTCGTGCCATTAATCAGAGCGTACTCTGACCAAAAGTAGCTCATCCGATTTATCCCGAATACAGCTTGCATAGCTTAGCCCTAGAGGAGTAACAATTCATGTTACCTTTTGCAGATCGTCGTGATCAGTGGCAGCTGAAACTCATTCTTCGGCGGCAGGTATTAGACTTCATACGATCCGGTCGTGCATAAATGGACTTTGGTAAATCAACTCCCGTCGTAATTGACTTAGTCAGTATAACACAAAAGCACGGCAAGGTTGACTATTACATGCGTATTGACTGCAAGACGGTTCGGAACTATGAAACATCTGGAATAAAAGCAGAGAGGATTCGTTCCAACGGAAGCAGGCTTGAAGCCAATTGGCCGGCCCAATTTCTTCAGCGAATCTTAGACAGGCAACTGCAACCCCAATCCTGGGAGAATGCCACATCAGAAGTCACCTAAGGACATCACGCTAGGCAAACAATTCACGGGCGTTGTTGCACGAGGTAAGGGAAGTCTAGAACGTATGCACGTTGAATTATTTTCACGAGGTCGATGTCAGCCTCCTGATGACACAGCAACGACCAAGAAATGTATCGGGACGTCTTTGCCGGCCTGGATCTGAATCCTCAAGCTCTGCGTCTCATGTTCCAGGCCGCTCGTTGTTCCTCGTCACGCCGAGAGAAGTCCGCTTGACTGTTGACTCAGGCGGAACTTTATTCCACCCCTGTAAGCAGTGAACAGTATGATACCTGACTTTACTGTAAACGTATGCTCTATTTTCGACTCGCGTTTCATAACACTGTGAATACTAGTGCTAAGCACTCCCCCTCCACGCATGTAGAGGGATATGGGCCAGCTATTGAGGTGGATTATTTCTAGTCATTTTCTGTTGTATATAGATAGGGACCACCACCGCCTCAGCGCGGTCAGCCCTGTGTTATTTGTGATTTCAACCCTACCCCCACCACCCCGTAACCAGCGAGGATAATCAAACCGTTCTGCCGTTTGTCATAGTGTTTAGCACACGGCATCCGTAATCTACCCTCATTCTCCCCCAAAGATGTTTGACGTTCATAAGCGTTAGTGACTTTGATGCTCTTTAATAAAGGTCCGTTGAGCTTGTCATTGGAGTAGACCCCCTTACGGCAATGCCGGCGGGGTGGTAGAAGGCCCACTCAAGTGAACGCAGCGATGTCCTAGAGTTTCCACTCTACCCTACGCCGGGTATACCGTTCCGCGGCATATCTCCTAGCGGGATACTATGTCACTAATTAAACCACTGGGGACAGGGTTGCGGGTCTAAGGCGATTCAGTACGGAGTCAACTATGGCCACACGCAGAACCTAGATGTTAGCCAATCCAGTGTTCGGTTATCTTGTTTCCCTCACGCTTTTGTGTCTGGTTCCCAAGCCTTTTTCCCAAGCCCTTATAGGAGATAACCCGAGGGGAATGAGTTATCCTTACTGGGTGTTAACCCTCGTTGCGCGAATCCCGGATAAAAAGGTAGGTCTAAAATAAATATCGGGTTTGGAGGTACAGGTAGAGTATCGCACTTTGTGTCTTGAAGCCCCGTCATATTTCCGACGTCCGTAAATATAGCCTCCCCCGTGTCCGCTTGAAATAAGAAATTAAACCTACTCTTACTCAGTGTATTCTTACTACAAATAGTCTAGTCTATGCCATGGATTTTGGGTCACATGTGTCAGCGGGTTCTGGTCTTCGCCGGTCTGGTTAACAACTGGACGGAAGTTGCTCGACTGCTATCTTAGCGTTTTCCGACAGTCTCTTGCCAGCGGGACATGGTTTTCTGTGAACACAATGAGATGAAATGCTGTGTGAGACCTGCCAGTCTCCGCGAGCTAGAAGAAGCATTGCCGGGGCACGCTAGGCGAAGGCTAGTTTGACTGCGGCGCTAGGGGGCGAGGTAAAATGTCTGAAATAACCGAAGTAACACTGAATCCGACATTGTCACATAGCGATTGAATCAACCGAGTGATCAGTTAAAGTGTTAACACTCATATGCCACGAAGCGAACGAATCTGTCATGGTTACATACTCGTCGTGGTGCAGGCGATTAATTAACTAGAGCAGCCTCCTGTGAGCACGTTTTCAACAGTCTGGGCCGCAGATAGCTAAGTCAATTAAACTTTGGAGCAGTCTAGTTTCAACGGCCGTGTGAGTCGACAGTAACCTACCGTGAAATACAATTGTATTGTTACATCATACAAAAAGACCCTTAAACGGTAATTGGCGTGTTGTTCCATTTTTTCTACCTAGGGCATGGACGGTAAACCACCACCGTGTCAGTGTGTTTTGGTGGGTAAGGCGCGCAAACAAAGATGGTGGGTGCAAGCACTGTCGAAGGTTAACTCCTACCATCTTTCCTAAATCCCTGACCGTCTGACGCGTGATGTCAGGGTCCTTGGCACCCAGTACTGGAGGCTAACCGAGAGCCAGGCGTATCTCTATCGTAGCCCATATTAGTTATAGGATCTGTATTTAGCTCTGTGGACCATAAAAAAAGACCTGGAGGCGCTGCCAGCATTACGGACGCCCCCTCCACGCGGTCGTCACACGTTATTCGACCCGCAGTTACATTATAGATCGTCGATTCGATTACTCGCGTGGGACTTATGGTAACCTAATGTAAGGGAGCGCGAAACCGTGGCAAAAGGCGTAACTGCTAACCCTTACTGCTGCACTATCGAGCGTATCGGGCCTCGTGGCATGGCTCAAATGATTGCAGGCTAAGGCTCCCCAAATGTATCCGGTTGGTGCTGGCATCGACGCGACGGAGAGTAGAAGTATGTTTTCTGCAGTCATGAGCTAGTCTGTTATTACGTACTCCCGCTCACGGGGATTATCAAACCGCATCGCCTAGTTGGAGTACGACGTGTAGGAGTGTCTGGACGCTCAGGGGTGTCAGAACCAAGTTTGCCGCGAAACAAACTTCATTGAGGGCCTTGACCGCTCTGACTGTCAAAACAACACAAAATGACTGCGTTGGCGGTTCGACAGGATTCTCGTTTAATCTACTTTTGGCGGCTCACCAAGTTAATTACTTGCCTTGTCCGGTCGACCGAGGTCCCACTGTGTGTCGGGGCCTAGAGGCGAAACTCCGGCGATTTTACTCAAGAGCATTCGTCGTCAATACAAGCATTAATGGTGTAGGCCTTTATATGCGAGACCTATACTCCCCCGCTGAGCAGGACCTCTTGATGCACAACGCCAAACCCATGGTTGATACTGCGCCGAAGTCGTCCACTGGCTGTAGCGTGCCGGTTGAACCATCTACTGAAAATATGACCCGCCTGCCGAGGTACAGTAGAATCGCACAGCCATTACTGAGGCCAGCCCCCAATGCCAGACCTATGGAGGATTGTAAGCCATCATTGTCTTTTTGGATTACTGCCTGCTCCGACCATAAGAGCGCAGGAAAGGAGTTTGAGATTCCAAGGCGCCAAGTACGGCCATGCAGTAAGACCTATATTCATCGGGCGAAAGCATCTGTCTGATACTACAGTAAATTTAGCAATCAGTTCGATAGCTCACACTGAAGCACGTCTAACAGCGGCGACCGCGTTACGTCTAAAGCCCGTAAAGAGCTAAGCATCTGACGGACAAAAACGCTATTATACTACGGGACGGTTACACTTAAGGGTTAGTGGGGGTGTAACCGACTAGGAAGCCAGCTGAAAACCAATACGTTTTTCCACCAAGGCGCAACCTCGAAGGCAAACCGGCTGTCTTATCAGTTGTTCCCTTGCATTCTCCTTCGTTAATTGTTCCTGCTATAGAGGTAGTGCAAATAGACCAGTACTAACCTACCGTTCGTTGCGACTTGATTCAAATAAACCGATCCGCCTGTGTTATAACTCAGCCTAAATTTCAAAACTCCAATGTGCGATTATTGACGGCGCAAGCGTCCGGGGTCAGTATGATGGGCCATTTGACGCTACTGTGCCGCTACGGTCCGTACTCGTTATGGGTGTGCATGTGGGTGTAACGTCTTTCGTTTAGTCATTTTCGCCAAGTCGGCGGGCTCACTTATTTAGAAGAAGACACGCCAGTAAGGCCCACATTGAGAAGTCAGCTGCCCGCGTCTACAATCGCTTCACAGATTGATCAGGTTGACATTTGACAGTATGAGAGTAAGACGGTAAGATAGAGAACCTTCCCTGACACCATGTGTTCGAATCCGCTTCGATGGCTTGTCTTCTGCTTTATCGCGTATGCGGATTGCTCATTGTAGGCGCACGGAGATGTCCTCAGATCCCCAATGGTGTGCCAGGTTTGAGGGTTTCACACCATTGCTTACAGCTGATAGCAAGTCCGTGGTTGTGGTTGGGACGTTCCGCGCATCAGATAACCAAGCACACTATCGTTATGTCGAGGAAGGTCGCAAGATGGGGTCAGGTATCAATACATACGGCCGACCCAAAGACCGTCTACAGTTCGGCCTGTCGCGACGTCGTGAGGATAGTTGGCTATTCTCAGGTGCCCTAGACGGAGGCCACGGATCCACCTCTACGCATGGGAGTACGGAGTAACTGCGATATGTGAGTCCGTACGCAATTAGATAAAACTATCCAGCGGTACGTGGCTCGAAATTAAATTCAGAAAGCACGCTTCATATTCTGAGCTTCCACACCGCGGCATCTGATTAGAAGTGCGTGGGTAGGAGATCCCACACGGACTGAGGTGCACGCCTACTACGCCGGAACACCAGCGAGTGCTGTGAGAAAACTTCAGAACCTGATGACTTGCCGGATATCACCTAAACCCGCCAGCGCATTACTGCTGTATTTGTAGATGTCAACGGATCTGGTCCAGATCACTTCCTGACTGGACGTGAGTCACCTATAGAAGCGTGGTATACGAAAACAGCTTAGCTGCATCGCGATCTTAAAACATACGTTACGCACGTAAATTAGGCCCGAAAGTGAAAGATGGATTAATCAAGAGATTTTATGTTCTTATCTTAAACGGGCACTACCGGTACCGTACTAGTAGCGAGGGTCGGGTAACGCTTGTTCTTGTCGATGTAAAGCGCATAGACACCAGCCTTAGCACCCTGGTATGCGTGCGCCCATGATTGGTTGTAGCCCTCGTTAGCATCGCCTTATCGATCGAAAAATCGCCTGTACGGTTAGGATACACGATGTTTCTGAGCCGACGGAACACCCATCCGAATCTTTTCCAATATATCGTTAATGCTCTCTTGTCCCAGCTTAATACCATAGGTGTGGATATTGCATGTTCGGTCCCGGAACGCAAGCGATCGGTGCATCGCTTTCAGGGTAGAACTTGTCTGCGATTTACCCCTGAACTCATAAGCCACCTTAGAACGAGTTTGGAGATGCCATAAACAAAGGACAGATTTTTATAGTAACCGCTCCGGACGTCGATTGTTACGGCGCATATTCGGGATTGGTCAATCCCATAGATCCGGCAATCAAATCCATATTAATTCGAGCGTGTATATTAATGGGGCCACATTCATTCAGATTTCAGGAAGGTGCGGTCTTGCTAAGCGCTACCAGGCACCTTCCCTGTTTAAAGGCTAAACAGACAGCTAGGCCCCATGGGGGCTCTCAAGAATAATTGGGGACCACACGGAGCCACACACCGACACCGGCACAAAGCAGTGAACCGGATACATCCTGATGACATTCTGGCAGGCGTTCCAGCATTGGCATCCATTTGTCAACATTGTCAAAGTTTGACACGATATCGAATATATGTGCTACACTCAGGTCGGTTTTCTGCCACTGCTCAGCTCTGCCACCCTCTGGATTATCAAAACGGCGCGCACTAGTCATTCCAGCTCACGACATCACTCTCTTACACAGAAATCGCCTAAGATGATATCGTGCTGAACACGGTTGCGGGGTTTCGCGAGCCCAAGGCCTATGCACAACTCCCCCTGCTTGTATCCTCCCTTATGACCTCAAAAAAAGTGCGCCGTACTAGCATCTATAGGAGTACCTCAAGATCGGATAGAGTAGCATCCGTCCGCTGGAATATACAAGGTCCCACGCCAGCGGATTAAATGAGCCTGGAGCAATCCGTACTGACGGGAGCGAGCCAAAACTGAGTATGCAGATATTCTCCATGCTCACGGGGTCGGCAGCCGCATAACCCTATTCTTCCGAATTACGCCTCCTGCTGTTTACTTCCGCTGGCAAGGGAAATCGCCTCCAAGTTGTGCCAGAGTGGATCTTCGGAGCCACCTGGTCGAACCCAGCCATTCAACATGAAGGGTCCGAACCCAATTCACGTACGTGCTCTCCCTCAAAAGGCCCTGTACCCCAAAACCTCTCATGCTTAGAAGCCGTCGGATCATATCAGTGGGGAGCTCGGATATGGCTCTCGGGGCGCAGAATAAGGAGTCTCTCAAAGCAATCGACCGAACCTCGTGCTTATCCCATGCTCTGGTAGGGGGCGGACCGCATGTTAAAACCCTCAGACCCAACCTGGGTGGGTGCACAAAGTTAAACAATATTTCCGTAAGCTCGCAGATGTCAACGGATCAAGATAATCTGATAAACCATGGGCGGGCTTGTTACCCAGTGAGCACGCTATCAAGGCAGCTTTGCACGTCATCTCATCTGTAGCTCCTTGCGCTGTATGAGCTTACGGAATCTTCTCCTTACTGCCACC 3 | -------------------------------------------------------------------------------- /doc/MindTheGap_assembly.md: -------------------------------------------------------------------------------- 1 | # Genome assembly gap-filling using *MindTheGap* 2 | 3 | ## *MindTheGap* contig mode 4 | 5 | In addition to the assembly of insertion variants, the `fill` module of MindTheGap can be used as a genome assembly finishing tool, to fill the gaps between a given set of contigs, wihtout any apriori on their relative order and orientation. 6 | 7 | The basic usage of this mode is : 8 | 9 | ```bash 10 | MindTheGap fill (-in | -graph ) -contig [options] 11 | ``` 12 | 13 | It takes as input 2 mandatory files : the sequencing reads or their de bruijn graph if already computed (options `-in` and `-graph` respectively) and the set of contigs in fasta format (option `-contig`). 14 | 15 | ### Specific input parameters 16 | 17 | Most options are similar to those of the standard mode of MindTheGap, notably for the de Bruijn graph construction or for computational resource settings (see [../README.md](../README.md)). Specific options of the `contig`mode are: 18 | 19 | - `-contig`: the contig file path in fasta format. Note that only contigs larger than $3*kmerSize$ will be used. 20 | Although MindTheGap has been tested with contigs obtained with the assembler [Minia](https://github.com/GATB/minia), which uses similar assembly heuristics, contigs from any assembler may be used. 21 | - `-overlap`: the maximal potential sequence overlap between input contigs (default = $k$). MindTheGap extract from contig extremities seed and target kmers to perform local assembly between these kmers. To ensure a non-null gap-filled sequence even between contigs that overlap, seed and target kmers are extracted at `overlap` bp from the contig extremities. In case the overlap between your contigs exceeds the 'k' value chosen for *MindTheGap*, this can be specified using the `-overlap` option. 22 | 23 | - Local assembly limitations: local assembly may be tuned to allow larger and more complex assemblies between the contigs (than for insertion variants), with the following options: 24 | 25 | - `-max-nodes`: maximum number of nodes in the contig graph for each gap-filling assembly [default '100']. This arguments limits the computational time, but it can be safely set to $300$ or $1000$ in contig mode. 26 | - `-max-length`: maximum number of assembled nucleotides in the contig graph (nt) [default '10000']. This arguments limits the computational time, but if gaps are large, it must be increased. 27 | 28 | Increasing these two parameters may improve the results for gapfilling of assemblies much shorter than their expected size. 29 | 30 | ### Output 31 | 32 | In contig mode, *MindTheGap* returns 3 files ; 33 | 1. GFA file : `out.gfa` 34 | The assembly is returned in a [GFA format graph](https://github.com/GFA-spec/GFA-spec). Both initial contigs and gapfilling sequences are represented by segments. Links indicate sequence overlaps between segments. 35 | 36 | 2. Insertion sequences `out.insertions.fa` 37 | In addition to the GFA file, gap-filling sequences are reported in a fasta format file (see the header format below). 38 | 39 | 3. Gap-filling information file : `out.info.txt` 40 | 41 | For each gap-fill, some informations about the filling process are given in the file `.info.txt`, whether it has been successfully filled or not. 42 | 43 | 44 | ### Output formats 45 | 46 | **Gap-filling sequence header specificies**: 47 | 48 | MindTheGap fill outputs a file in fasta format containing the obtained gap-filling sequences (`.insertions.fasta`). Source and target kmers are not included in the output sequences. For each pair of contigs for which the filling succeeded, one can find in this file either one or several sequences with the following header: 49 | 50 | ``` 51 | >contig3_len_3652;contig18_len_19822_Rc;len_117_qual_50_median_cov_1350 52 | #contig3_len_3652: header of the source contig, contig3 in the original input file contigs.fa 53 | #contig18_len_19822: header of the target contig, contig18 in the original input file contigs.fa 54 | #_Rc: absent for the source contig and present for the target contig, this means that the end of contig3 is gap-filled with the end of contig18 (that is with the beginning of the reverse complement of contig18). 55 | #len_117_qual_50_median_cov_1350: information about the assembled gap-fill sequence, median_ cov giving the median kmer abundance of the sequence in the sample. 56 | ``` 57 | 58 | it contains notably two contig identifiers (their fasta headers in the original contig file) with optionnally a suffix "_Rc" if it is reversed. 59 | 60 | **Gap-filling information file** 61 | 62 | For each gap-fill, some informations about the filling process are given in the file `.info.txt`, whether it has been successfully filled or not. This can help understand why some gaps could not be filled. Here are the description of the columns: 63 | 64 | - column 1 : gap-filling name 65 | - column 2-4 : number of nodes in the contig graph, total nt assembled, number of nodes containing the right breakpoint kmer 66 | - (optionnally) column 5-7 : same informations as in column 2-4 but for the filling process in the reverse direction from right to left kmer, activated only if the filling failed in the forward direction 67 | - last 2 columns : number of alternative filled sequences before comparison, number of output filled sequences (can be reduced if some pairs of alternative sequences are more than 90% identical). 68 | 69 | ### Dealing and analysing genome graphs (GFA files) 70 | 71 | The graph output by MindTheGap can be easily visualized using [Bandage](https://github.com/rrwick/Bandage). 72 | 73 | Note that GFA Graphs supplied by MindTheGap may contain redundant sequence information (for instance this is likely that two contigs are linked in the graph by two gapfillings with reverse-complement sequences). Before further analyses, we recommend to simplify and reduce the redundancy in the graph using the scripts available in (MinYS github repository)[https://github.com/cguyomar/MinYS] : 74 | 75 | ``` 76 | git clone https://github.com/cguyomar/MinYS.git 77 | python3 MinYS/graph_simplification/graph_simplification.py MindTheGap_output.gfa simplified_graph.gfa 78 | ``` 79 | 80 | Other usefull scripts are available in the (MinYS github repository)[https://github.com/cguyomar/MinYS] to deal with this graph data structure : to enumerate paths, to convert the segments to a fasta file, to filter connected components according to their size... 81 | 82 | 83 | 84 | ## *MinYS*: targeted assembly pipeline 85 | 86 | *MindTheGap* in contig-mode is an essential step of the targeted assembly tool MinYS which is freely available here: [https://github.com/cguyomar/MinYS](https://github.com/cguyomar/MinYS). 87 | 88 | MinYS stands for *Mine Your Symbiont* and was designed for de novo assembly of a bacterial genome of interest, sequenced in a metagenomic context, and with the help of a (potentially distant) reference genome. A typical situation when studying symbiont genomes within their eukaryotic host sequencing. It consists in three steps : 89 | 90 | - Recruiting reads by mapping onto the reference genome, 91 | - Assembly of those reads in *backbone* contigs, 92 | - Gap-filling of these *backbone* contigs with the whole readset. 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /test/full_test/allele1.fasta: -------------------------------------------------------------------------------- 1 | >Seq0 2 | ACCTATAAGGATCCACGTCTTGATGTGACCGACGATGTTATCTGCCTATAGCGAACATTTCCTGGTAGGAATACTATTATATTACCTAAATTCTCCGGATCTCCGTGTTCTTCGGAAGCTTAGGTCACGCGCGTCATACTACAGTAAGTTACTGAATTTTGGAGTAATGCATATCGACCGAGTCCGGGGAATCGTCCGTATCCCCGCCAGGCGTATCCAATAATTAGGTCGGACCCCTAGCGCGTACGAGTCCTATAGCTGACATGCGAATTGGCCGTGCATCGAGCTCAAATGTCCGCCCAGCAGTCTCCAATATAGCCACTTATGTCGGAATGTAATCGGCTCCAAATTACTGTGAACCTGATGATCGACGACGCGCTGGGTGCGGGATAGTGTAACTGTCCCGTATAAGGCTTAAATTGAGTATGCAGGATACACTTTTAGTATGACTCGGAAAATCCCACTACAGTATGAACCGTGACTGTCGTTGGGCGGTGCTTGTTGCTATTAACTGGATGGTAAGTCAATTTTAAGTGGAGCCCCCTGCAAGATGGAAGAGAACTATATGTGGAACCGGATGTCACGTCAAGACCACCGCGCACGCTTAACACAACGCATGGTTCGGCGACTGCTCCGGTTCTGGAATTCCTCTTTTTTATCGAGATTAGCATCTCATTAGCGCGCAGTGTGTCCTAGTGTCCAAGAACGTAAAGACCAGATTTGACATTAGTAAGTTGCATTCGCGGGTCTTTAGAGCTCCTACTAAGCCCATTCAGATTGAAAAAGGCGCGCCTGCGGTTCCATGTCATAGTTTAACTTGGTTGAGACGGCGCACGAAAGCATACTACCACCATCCATGTCAAAACTGGCGACGATCTTGCTGGGT 3 | >Seq1 4 | GTTTTATAACAAGTGAAGAAAGAACAGTCATTAGGCCCGAAGGCGTACACGGAATCCGCCTAAACTAATGCTTATGGTCCCTCGTAGGCCCCGACGGCATAGGGTGCCCCGTAATTGCGTAGACAGCGAAAGGAAAGTGTACCGACGACGAGATCAAGTGGCAGTGAACCCCTTGGAAAACTTCATATCGACAGACACTGTCCGACTGATAAATCTACAATTCGCGGAACGGCTCCATAAACGGTGACTATGACCTCATCAGTTAAATTCATTAACCTATAGATGTTGAGAATCCGCCTAAATTGGAGGACGCCGCGCAAAGCCGGTCAACAGCGTTAGTATGTTGAAAGTTTACTCAGATCGCTTCTGTCGGCCTAAGTTATGAATACAGTCAGGGGTTTTCCTGGATACAGATCTTCGGAAGTCTGCTAGTGCCGGTCACGCCACGTATAAGACTAGAGACCTGACACTCATCTCGGACTGTAATTATTTTCTAAGCTAGCTGTCCTATTGTAAGGTAAGTCGGGGTGCCGGACCTTGGCAGCCGTCCGTACGTACATTAATGCATCAGACCAGTAATCTTGCTCGCTAGAGTCGGATCGGCGAGGTCCTCTCAGCTATGAAAAGTAGAATTCAGTCTTGCCCGGGGTAGGGCTAGAGACAAGGGTTGATATTGTCTGAAATGGCGCTCCCATCCGATAACTAAACTACCGTACGGGTGCACGCGAGTAGCCTTAATCCTACATATACTAGACTATCATCCTAGGGCGTAGAACTCCCTTTGCGTTACCACTCACAACGCGTGTGTTTCTCGTGCGGTGATCCCGGGAGTATTTTATCGGCGCCCCAACTGCCGGCAACTCGAGTGCACCGGAGCTGAGGAACGCTTCAC 5 | >Seq2 6 | TGCTGCCGATCGCTACGACGTCCTACCTTACACACAACGGGCCGCGTTCATACCCACGTATGAAGACATGCGGTTATCCGTTAGTTGGGGCCCGCGATGGCTGTTGTATGTAATGAGTGGAGTAAAAGAACACGGGAGATGTTCAATTAGGCTCAACCATTTGTATTGAAGGGGTCAAAACCCACCTGTGTAAGTAGCATTGAGTGCTGCTCATGGCCGATTCTCTTACTTTATTTAAGCCCATACCACTGATGGAGGAGACTCCAAGGAATAGTCCAACCGTTGTTCCGGGAAAAACGAACTCGGTCCGCTAGCGTGCCTGTCGTCGCAGCGCAGATTCCCTACTAAGTTATTTTACCGGTTGACTCAGTCGCAGGGCTGCTGAACGCCATGGGGCGGACCGAGCGGAGATCTTTCAGGTGTAATTGGATGATCTTCACCCTAAGGAGGGAGCGGGTTATTGGGCAGGCCCGACCGCCGCTGGAAAGATGCAAGTATGAACAAGGCATGCGTAAGTTATCGTGAAACCATGATGTAACGTTCGCTGAACATCGACTCCGGTGACGACATACGATTCAAGAAGAGAGTGACTCTGTAGGATAACATCCCGCAACGCCTAATCCATCCAGCCTGGCACCATGTATAAAGGGCGTCAGGTATGTTAACGAGACTATTGCCCCTTACTAGACCAAATGTACTGAATGCGTCTTGATTAGACCCCGAAAAGGCATATCACATCAACTTGTCGTGTTAGAGATGTCGCGAGAACCCTCGCCTATTCACGAAGTGCCCATGAGCCTTTGGGCCTGGTTGACAATATGCGCACCCTGTAGCCTACCAACCCTTCATAATCTATCTTCAATTTAGCCTGTCGAGAACGTCCTAACAAGGCTTTTGGCGCTCCAGATGGGACAGTCACTCTCTAGCATCAACCCATAGTTTTTGAGCTACCCGCCCTCGGTGAGAAGGTAGTATGCACGCTGCAGGATTGGAACCACAATGTACGCCGATCCAAGCAGTAGTGGTTCATTGTATAAGTATCCTCCCTTGATTGGTCGAATATTAGGCATGCCCCGGGAGCATGTGGGCTCGAGCCACGGAGAGCAACTAATCGCGCATAAAACAAATACCTCATGGTTTTTGTGCGGAAAACCGTTGGGTGGACCATCAGCGGTTGTGATTACCCAAACGCGTCCTATGCAGTTTTGGGCTTGAGGAACTAAAATCAATTCGTTAAGTCCAGTAGTCCTTATGTGTGGCCGTTCCAAGAATTAATCATATCGCGGAATCACCGGCGATTCATTTTCCGCACCAGAGCTTTCAGGTGCGGCGCGGCCCTAAAAGGTC 7 | >Seq3 8 | TATTGCGCCCTTCAAGAAGCTTCTGCTGACCGTAGGCGTCTCGGCGGTTTGTACTTTGAAAAATTAGCTGCACTACATCCGATGGGTATCCCTCCTCAATCTCAGCAGACCCGGAAAGCGATAGAATCAGCCACGCGGTCGTCCGGGCTAGGGGCCCTGCGCAAGGAAGGTTGGACAGGGCTAGACCCGGAAGCATCGGCTTTTCCTAAATGGTGACGGAGTTATATAGGGTAAGCCTGATAGCGCGGTAGGTGTTATGGCCATCCCCTCGCCTAGCGTGCGCGCAGACAAGTCCAGTCCCGGAGGAGGCATAGGCCTCATTATCATTTCCCTAGAATCGCTCTTGACATCTAGGTTGTACTAGGGACCAGGCGCCCAAAGCGGACGGTTCTCCGTGCTTTCGTGCCGTTTCAGCGTAAGATGCTATTTTTTGGGGAAATGGTCGGCGTGTGCGGGGGAGAACCACGGTACCAACTACGATAAGTCCGTCGTGTAACTTACGTGAAGGTGATGTGAAGCAGGAATCCGTGCCAAAATGTCCGTGCGATATCCAACTTTCATAGTATTACACGAGAGCCTATGATTTGCCCAGGCGCGACCCGTGAATCGAGGTAATCGCCGACCAGATATTGCGAAACACCACATTACATGACTACTGTCCGCTTGAAGAGTTATATACTTGACAGTCCTGGTTGACGGCACAGCATATCTCCAATGTGTGGTTTAAAGTCTCACGTTCTTCATGCGCGCCGGCCCATGGGAACAAGTATCCTTACTTTCGTTTGCAGCACTAGCCGTTCCTTGACATCTGCGGCCAACTTGTGCCTGAACCTGGAGTTTCGACAGCGTGGCGCTCTGGCCTAGTTCTTCGCTGGCACCTGGAAGAGCCGCCGTACAAATGAGGCTCCAAAATAGCACGCTTGCAGCAGTCAAGTTGAACGCCTTAAAAGGCACCGCCGCTCGTTCATTGGGATTCCTTGAGAATCGTGACTTGTTACACTATAAGATCATGGATTGGACAAAATAGGCCAACTCCCGCACGCTGTGGCTATTCTTAAGTTGCATAGGTGGGAGTAGCCTTATACTCGATTTCTAAAAAGAGTAGGTGAGC 9 | >Seq4 10 | TTCCGGCGCCGCACTAATTGAAGTGGTGAGCTGACCAGTCGTTCAGGATCCGAAGGCGGGGATGGCGCTATAGGAGCCGGCAGGTATGCTTTGCCGCAAAATTTCGGGGTGGTGGAACCGTCTTACCGAAAGTTAGCTACAGCCTGGAATGTGAAATTCCATGACCTGCCCGTCCTGTGTCCACAGGGCGACATTTGCCACGTAGGTAGGGCGACCATTAGAATGCTGCATTATCGGGCGATAAAAAGTTTTATACTCAAGAATCCTACAAAGATGAAAATTTCGAAGAGCTGCACGCAGTTGTAAGTTGCTTTTCTGGGGTAATCGAGATTCTCCACCATAACCTGCGCAATGCATCGTGAAGCTTTACCGCGCCCAAGGGGAGCGTCTCAGTGGGGTTGCCTCCAGGGATATATTGAAAGTTGAAGAAGAAGATCACAGGTTAAGCGGTATGTTAAGTTAGAACTCACGGGGAGCCGCCTTGATTTTGTTCGACATGAACCAGAGACCAGGTGTGTTATGTTCTGGAACCTTAATACGTACGTCGCCAGCACCGAGCCGGCACTCCATCTCTTTTGGGTGCGCAACATTGCTATACTTAGGTGTATTCCTGGGTTGAGTGGCAGGTTTCTCTTAATTCTTCCCTAAGTAGCTCCGAGGATCCATTGACATCTGTCAGCCGTCTTTCCAGAACGTTATAAGACTCGTGAGGAAATTATACAAATCGTTGCCATCATCCAAAGCAAAGTACTTCCGCTTAGGAGTGCCTTGAAGAACCGATTATCTCTGACAATGTAATGCCACAGCACCCTCGACAAAGTTCTACATTCGTTCCAGGTCATGATACAGCGCGCTAAATTACCGCTACGAGCCATACCGGATGGCGGCCGGAGAGCGCTGCAATCGCATGGCTCGGGACCGAACATTGAGACCTGGCTAGTAGGTAGGTGTCAAATCGATATCCACACCTGTCGAAGCAGCTAAAGATCGGTTGCGGCGGGAGTCCTCCATTCAGGCCAAACGTGCAGTGCTCGATGTGCTTCCTATCGCTCT 11 | >Seq5 12 | GATGTTTAGAAGTTTCCAGGTCACGCCAATGATTGGCATTTACACACGTGGATCAGCGGACATATCTAACCCTTAGTGTTCTTAAGAGCAACTCACTACTCCATTTCCACTAACCCCGCCGGCGGTAATTCCAATCTAGTTGATCAGACTTCCCAGTCAATGAAAGCGACACCGTGCGTCTGTAATACCAACAAGACCCTGGCTGTCGTCCCGCAGAGGACGCGGCACCTCCGGATTTTGAGTCCAGTCTGAACGATTTTCGATCACTCACCATGGATCTGGAAAACGGAGTCGAGTACTCACGAGCCAAATTGATGCATTTCCAATGACCCGATGCAGGTGCGACCGATCTTCGCCTATGCTTCCCGCCGTAATTATTGAGTCTGGGTCCCGGCCGCTAACGTTTGACTCACGGGGAGGTACCCGTGCGTATTCTTCTCAAAGTGACGCTGGACAGCAGCGCATGTCCGAGCCCCATCGTCCTATCTGGTGTAGAGTCTTACCTCTAATTAGAGTGATCGAACCAGTAGGTGTCGCGGTCTTAGGGCTCCCATTGTCCAAGGGAACGTGAACAGATATGAATCTGGGAGAATAGTGCAGCGTTGACCCTTCTGGTCGGTCAGCCCTTGCCTACGGCCCGTATGCGGAGAATGAAGGCGTGAAACATTCTGCTCTTTTAGAAGCAGCGGCTGCACCCGTATAACAACTCGCACGATCGTACGTCTCATTTGCCGCGTTGGCGCGCCCGTGGATGATGGACCACGGTATGAACCTCTGCACTTCAAATTTGACGCAATCCTGCACTCACCCGCACACAGTTCTAGTCTAACCGTCGCAGTGTCTGCTTTAAGGTAGAGATCGATACTTAGGATATGTTCATGTGTGTTTGTAGCGCTGGACCCTCTTATGGGTGTGGTCACTTGTGATGGATCGAGGAACTTAGGCGGTTAACTTGTTTCGACGTCTCACCGACAATATCAGGATTTAGTATCG 13 | >Seq6 14 | ACCGAAAATGACAATGTTCACACGCATGCTCGGCGTGGAAAAGAGCCTTTTCTAAGACCGACTCGTTCCGGGCAGCAGGATTATTAGCCAATCAAAATTATATCGACCGGTCATCAAGCTGCGATAGTGCAGGCGCATGCCGTCCAATGGGTCCACGGCGGAAGTGCGTTCGTCTACTCTGTCAAATCTTAACATTTTTTGAGCGGCTAATCCGGCCGGTAGTGTACCGTGAACCAAAGTCCTTCTACGAGCGTATTAGATTGCTCAAAAGATCCGGGAGAATTGACCAGGTCGTATCTTTAAAATAACGCTGGTGCGAGCAGCTGCTGTTTTATCAACACCCATTTAGTCCTGTGAAGTTTGCTTAGCAGATACACCTTCCCGCGTGGTATGAGAGGCTGTTCTTCATTAAAAACTATGAGGCTCTGGCACCTTCGACGCTAACAAAGTCCCCACGGACCATGATACCCTTACGCAACTCTCTTTGCACGCTAGGGCGAGAGTACTGTCCCCCTAGACTAGGTACACGCCGGGTAAACTCTCTCGCACACCTTTACGCTCGACTACAGGCTTCTAACCCTTCCGAACGCATATAATTCAAATGGCACTTCAAGTAACAGACGAATCACGGCTCACAGGCAGAATTCACTGGAGTAAAAGGATTCAGAACAATAGATAGTGTGTTAACTTTACAGTCATCCGTATTATAACGTGTAGCGAGAGGATTGAGTTCTTGTTAGGAAGGAAGGTCCTATAGACGAGTGCGGTAGCGCACCCGGTCGCCTTGCGTAGTCATGCCCGACGTGTTGATGGTGGTCCCTTTTAGCCGCCACACAAGGGATCCGAGGGTGAGAGACACATGGCCCTCACCGACGAGACTTACTCAGCCTGCCTCGCTATTGCCCTCTTTTTGATCACGTCCCTTTGTGGCTCTCGAGGACTCGTGCAGCGTGTATCTGGGGATTTGTAAGCTTAAGACTACCTTCCATAGGA 15 | -------------------------------------------------------------------------------- /test/vde.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | # std import 6 | import sys 7 | import argparse 8 | import os.path 9 | from collections import defaultdict 10 | import csv 11 | import re 12 | import math 13 | 14 | 15 | class Variant(object): 16 | 17 | def __init__(self, type, comment): 18 | self.type = type 19 | self.comment = comment 20 | 21 | def __eq__(self, other): 22 | if self.type != other.type: 23 | return False 24 | 25 | if self.comment != other.comment: 26 | return False 27 | 28 | return True 29 | 30 | def __str__(self): 31 | return "%s_%s" % (str(self.type), str(self.comment)) 32 | 33 | def __repr__(self): 34 | return "<%s - %s>" % (self.type, self.comment) 35 | 36 | def __hash__(self): 37 | return hash(self.type + self.comment) 38 | 39 | 40 | def main(): 41 | """ The main function of vde no argument """ 42 | 43 | enable_input = [name.split("2")[0] for name in globals().keys() 44 | if name.endswith("2eva")] 45 | 46 | parser = argparse.ArgumentParser( 47 | prog="vde", 48 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 49 | 50 | parser.add_argument("-e", "--experiment", 51 | type=str, 52 | help="File of experimente result.", 53 | required=True) 54 | parser.add_argument("-t", "--truth", 55 | type=str, 56 | help="File of truth result.", 57 | required=True) 58 | parser.add_argument("-d", "--delta", 59 | type=int, 60 | help="Acceptable diff betwen truth and experimente.", 61 | default=5) 62 | parser.add_argument("-ef", "--experiment-format", 63 | type=str, 64 | help="Format of experiment file", 65 | choices=enable_input, 66 | default="eva") 67 | parser.add_argument("-tf", "--truth-format", 68 | type=str, 69 | help="Format of truth file", 70 | choices=enable_input, 71 | default="eva") 72 | 73 | # parsing cli argument 74 | argument = vars(parser.parse_args()) 75 | 76 | expfunc = globals()[argument["experiment_format"]+"2eva"] 77 | truthfunc = globals()[argument["truth_format"]+"2eva"] 78 | 79 | experiment, count = expfunc(argument["experiment"]) 80 | truth, count = truthfunc(argument["truth"]) 81 | 82 | result = compare(experiment, truth, argument["delta"]) 83 | 84 | result_printing(result, count) 85 | 86 | 87 | def result_printing(result, count): 88 | """ Printing the result in csv format """ 89 | 90 | head = ",".join(("type", "TP", "FP", "recall", "precision")) 91 | print(head) 92 | for gap in result.keys(): 93 | total = result[gap]["TP"] + result[gap]["FP"] 94 | 95 | prec = 1 if total == 0 else result[gap]["TP"]/float(total) 96 | recall = 1 if count[gap] == 0 else result[gap]["TP"]/float(count[gap]) 97 | 98 | print(",".join((str(gap), 99 | str(result[gap]["TP"]), 100 | str(result[gap]["FP"]), 101 | str(recall), 102 | str(prec)))) 103 | 104 | 105 | def compare(exp, truth, delta): 106 | """ Compare experimente and truth return TP FP precision and recall 107 | for each type """ 108 | 109 | result = defaultdict(lambda: defaultdict(int)) 110 | 111 | exp_pos = set(exp.keys()) 112 | tru_pos = set(truth.keys()) 113 | 114 | for exact_pos in set(exp_pos & tru_pos): 115 | for variant in exp[exact_pos]: 116 | if variant in truth[exact_pos]: 117 | result[variant.type]["TP"] += 1 118 | else: 119 | result[variant.type]["FP"] += 1 120 | 121 | not_found = set(exp_pos - (exp_pos & tru_pos)) 122 | for fuzzy_pos in set(exp_pos - (exp_pos & tru_pos)): 123 | end = False 124 | for pos in range(fuzzy_pos - delta, fuzzy_pos + delta + 1): 125 | for variant in exp[fuzzy_pos]: 126 | if variant.type in ("snp", "multi_snp"): 127 | result[variant.type]["FP"] += 1 128 | try: 129 | not_found.remove(fuzzy_pos) 130 | except KeyError: 131 | pass 132 | end = True 133 | break 134 | 135 | if variant in truth[pos]: 136 | result[variant.type]["TP"] += 1 137 | try: 138 | not_found.remove(fuzzy_pos) 139 | except KeyError: 140 | pass 141 | end = True 142 | break 143 | if end : 144 | break 145 | 146 | for pos in not_found: 147 | for variant in set(exp[pos]): 148 | result[variant.type]["FP"] += 1 149 | 150 | return result 151 | 152 | 153 | def eva2eva(filename): 154 | """ Read eva file and return value in dict 155 | position is key and type is value """ 156 | 157 | __check_file_exist(filename) 158 | 159 | data = defaultdict(list) 160 | count = defaultdict(int) 161 | 162 | with open(filename) as csvfile: 163 | linereader = csv.reader(csvfile) 164 | for val in linereader: 165 | data[int(val[0])].append(Variant(val[1], val[2])) 166 | count[val[1]] += 1 167 | 168 | return data, count 169 | 170 | 171 | def breakpoints2eva(filename): 172 | """ Read breakpoint file and return value in dict 173 | position is key and type is value """ 174 | 175 | __check_file_exist(filename) 176 | 177 | data = defaultdict(list) 178 | count = defaultdict(int) 179 | 180 | mtg2eva = {"HOM": "homo", 181 | "HET": "hete", 182 | "SNP": "snp", 183 | "MSNP": "multi_snp", 184 | "DEL": "deletion", 185 | "BACKUP": "backup"} 186 | 187 | findpos = re.compile(r'pos_([-\d]+)') 188 | findtype = re.compile(r'_([a-zA-Z]+)$') 189 | findcomment = re.compile(r'contig_\d+_(.+)_pos') 190 | 191 | with open(filename) as filehand: 192 | for line in filehand: 193 | line = line.strip() 194 | if line.startswith(">left_contig_"): 195 | data[int(findpos.search(line).group(1))].append(Variant( 196 | mtg2eva[findtype.search(line).group(1)], 197 | findcomment.search(line).group(1))) 198 | 199 | count[mtg2eva[findtype.search(line).group(1)]] += 1 200 | 201 | return data, count 202 | 203 | 204 | def __add_in_data_count(pos, type_gap, data, counter): 205 | """ Add value pos: type_gap in data and increment counter[data] """ 206 | 207 | data[pos].add(type_gap) 208 | counter[type_gap] += 1 209 | 210 | 211 | def __check_file_exist(filename): 212 | """ If file doesn't exist trow assert """ 213 | 214 | assert os.path.isfile(filename), "Error when I try open " + filename 215 | 216 | 217 | if(__name__ == '__main__'): 218 | main() 219 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | -------------------------------------------------------------------------------- 4 | ## [Unreleased] 5 | 6 | -------------------------------------------------------------------------------- 7 | ## [2.3.0] - 2022-04-20 8 | 9 | Improving the `Find` (insertion breakpoint finder) module: 10 | 11 | * very small insertions (1 or 2 bp) are now directly assembled in the `Find` module and are output in the `.othervariants.vcf` file. This may increase the running time of the `Find` module but the overall running time of MindTheGap (Find+Fill) is drastically reduced. Indeed, these numerous small insertions are no longer output in the breakpoint file, nor given as input for the `Fill` assembly module which performs a deeper traversal of the de Bruijn graph (designed for longer insertions). 12 | * a novel filter is implemented to reduce the amount of False Positive insertion sites. It is based on the number of branching kmers in a 100-bp window before a heterozygous site. It can be tuned with the novel option `-branching-filter`. It is now activated by default, so this may modify the amount of heterozygous sites detected with respect to previous versions. 13 | 14 | With this new version, the running time of MindTheGap as an insertion variant caller is reduced for real large datasets, such as human genome re-sequencing data. 15 | 16 | -------------------------------------------------------------------------------- 17 | ## [2.2.3] - 2021-06-11 18 | 19 | * 2 novel options in the `Fill` (local assembly) module : 20 | * `-fwd-only`: output the first-contig extensions of failed gap-fillings in a separate file, it can be useful for the assembly of the extremities of linear genomes (as in the [MinYS](https://github.com/cguyomar/MinYS) tool) 21 | * `-extend`: do not try in reverse direction if no inserted sequence is assembled (bkpt mode), it can improve the running time and/or help the user controlling the direction of assembly (as in the [MTG-link](https://github.com/anne-gcd/MTG-Link) tool) 22 | 23 | -------------------------------------------------------------------------------- 24 | 25 | ## [2.2.2] - 2020-06-19 26 | 27 | * A bug fix: updating gatb-core version, notably this fixes a bug in the `fill` module: nodes at extremities of contigs of size exactly `k` were not marked correctly, potentially leading to duplicated contigs in the contig graph. This could prevent exploring some parts of the graph, and if graph exploration parameters where set too large (`-max-nodes` and `-max-length`), it could lead in some rare cases to extreme running times and/or memory consumptions. This should no longer happen now. 28 | 29 | -------------------------------------------------------------------------------- 30 | ## [2.2.1] - 2019-11-29 31 | 32 | * Some bug fixes: 33 | * updating gatb-core version, notably this fixes a potentially important bug, where the de bruijn graph was erroneous for large datasets (such as human re-sequencing ones) 34 | * bug fix in the fill module, node marking in contig graph construction was not working properly leading to obtain too many solutions, in the case of multiple solutions 35 | * Some improvements: 36 | * optimization of the algorithm to find paths in the fill module (should be faster); 37 | * new options for insertion variant detection: 38 | * `-bed` (find module): to limit the search of insertion breakpoint in specific regions. 39 | * `-filter` (fill module): to remove insertions with multiple sequence solutions from the final vcf file (since they most often are a sign of false positive) 40 | 41 | Note that the targeted assembly pipeline, including the gfa graph simplification scripts, is no longer included in this repository. This is now a proper tool, called MinYS (for MineYourSymbiont), which is distributed independently of MindTheGap and has now its own github repository : [https://github.com/cguyomar/MinYS](https://github.com/cguyomar/MinYS) 42 | 43 | 44 | -------------------------------------------------------------------------------- 45 | ## [2.2.0] - 2018-07-06 46 | 47 | * A nice novel feature: insertion variants are now output in vcf format! and with left-normalization (ie. if several equivalent positions are possible for a given insertion event, the left-most is output and the size of the ambiguity is indicated). 48 | * Some improvements and bug fixes: 49 | * faster graph loading in fill module; 50 | * if multiple inserted sequence solutions, better handling of very similar ones, the number of output solutions can be reduced; 51 | * better handling of N stretches in the reference genome, resulting in less False Positive calls in find module; 52 | * better recall for very small heterozygous insertion variants (bug fix when the insertion is size smaller or equal than the ambiguity size). 53 | * a CI simple test for the Fill module with option `-contig`. 54 | 55 | 56 | -------------------------------------------------------------------------------- 57 | ## [2.1.0] - 2018-06-13 58 | 59 | A nice novel feature: 60 | 61 | MindTheGap can now also be used as a genome assembly finishing tool: it can fill the gaps between a set of input contigs without any a priori on their relative order and orientation. This new feature is available in the Fill module with option `-contig`. 62 | 63 | Some bug and compilation fixes, by updating the gatb-core version to 1.4.1 and more. 64 | 65 | 66 | -------------------------------------------------------------------------------- 67 | ## [2.0.2] - 2017-07-06 68 | 69 | Some new features: 70 | * the Fill module is now parallelized and can use several cores 71 | * additional information is output by the fill module: 72 | * the abundance of each filled sequence is now computed and written in the fasta file 73 | * a log file is output giving details about each gap-filling process 74 | 75 | Bug fix: 76 | * some gap-filled sequences were incorrect (this happened only for multiple filled sequences in rare cases) 77 | 78 | -------------------------------------------------------------------------------- 79 | ## [2.0.1] - 2016-07-21 80 | 81 | This is a bug-fix release : 82 | * fixed a compilation issue with old version of clang compilers (prior to clang 4.3 on mac), by updating the gatb-core version to 1.2.2. 83 | 84 | -------------------------------------------------------------------------------- 85 | ## [2.0.0] - 2016-06-29 86 | 87 | * Initial release after refactoring the whole code of MindTheGap to use the **GATB library**. 88 | Some of the benefits: 89 | * faster (thanks to GATB improvements in kmer counting!); 90 | * no longer need to recompile for changing the `k` parameter; 91 | * automatic estimation of the paramater `abundance-min`; 92 | * more user-friendly usage, with more readable help, progress bars, input-output summaries, etc. 93 | * compatibility with other GATB tools: input-output graph in `h5` format. 94 | * **New features** (with respect to the published version, August 2014): 95 | * detection of homozygous SNPs and deletions (output in a separate VCF file); this should also improve the recall of insertion event detection; 96 | * a quality score is now associated to each insertion prediction, this enables to filter out some predictions and to obtain a high-confidence subset. 97 | 98 | Have a look at a comparison between the published and the 2.0.0 versions on simulated data [here](https://gatb.inria.fr/mindthegap-insertion-event-detection/). 99 | -------------------------------------------------------------------------------- /test/full_test/allele2.fasta: -------------------------------------------------------------------------------- 1 | >Seq0 2 | ACCTATAAGGATCCACGTCTTGATGTGACCGACGATGTTATCTGCCTATAGCGAACATTTCCTGGTAGGAATACTATTATATTACCTAAATTCTCCGGATCTCCGTGTTCTTCGGAAGCTTAGATCTAAGCTGTGACCTTGTGGCCGAGGCGCTTTTCACGCCTACATTAACTCCTGGGAAGCTCTCTGCTCTAGTTTCAGTGCACATCTCCAGGTGAGCAACCCTGGCAAGCAGCCCCTTCCTGTAGAAATTACTTAGCGTCACGCGCGTCATACTACAGTAAGTTACTGAATTTTGGAGTAATGCATATCGACCGAGTCCGGGGAATCGTCCGTATCCCCGCCAGGCGTATCCAATAATTAGGTCGGACCCCTAGCGCGTACGAGTCCTATAGCTGACATGCGAATTGGCCGTGCATCGAGCTCAAATGTCCGCCCAGCAGTCTCCAATATAGCCACTTATGTCGGAATGTAATCGGCTCCAAATTACTGTGAACCTGATGATCGACGACGCGCTGGGTGCGGGATAGTGTAACTGTCCCGTATAAGGCTTAAATTGAGTATGCAGGATACACTTTTAGTATGACTCGGAAAATCCCACTACAGTATGAACCGTGACTGTCGTTGGGCGGTGCTTGTTGCTATTAACTGGATGGTAAGTCAATTTTAAGTGGAGCCCCCTGCAAGATGGAAGAGAACTATATGTGGAACCGGATGTCACGTCAAGACCACCGCGCACGCTTAACACAACGCATGGTTCGGCGACTGCTCCGGTTCTGGAATTCCTCTTTTTTATCGAGATTAGCATCTCATTAGCGCGCAGTGTGTCCTAGTGTCCAAGAACGTAAAGACCAGATTTGACATTAGTAAGTTGCATTCGCGGGTCTTTAGAGCTCCTACTAAGCCCATTCAGATTGAAAAAGGCGCGCCTGCGGTTCCATGTCATAGTTTAACTTGGTTGAGACGGCGCACGAAAGCATACTACCACCATCCATGTCAAAACTGGCGACGATCTTGCTGGGT 3 | >Seq1 4 | GTTTTATAACAAGTGAAGAAAGAACAGTCATTAGGCCCGAAGGCGTACACGGAATCCGCCTAAACTAATGCTTATGGTCCCTCGTAGGCCCCGACGGCATAGGGTGCCCCGTAATTGCGTAGACAGCGAAAGGAAAGTGTACCGACGACGAGATCAAGTGGCAGTGAACCCCTTGGAAAACTTCATATCGACAGACACTGTCCGACTGATAAATCTACAATTCGCGGAACGGCTCCATAAACGGTGACTATGACCTCATCAGTTAAATTCATTAACCTATAGATGTTGAGAATCCGCCTAAATTGGAGGACGCCGCGCAAAGCCGGTCAACAGCGTTAGTATATGGTTTATAGAACCCGGGCGTTCATGTCCGTCAGAACGATCTTGGCACGGTAGCCCCTGGTCCAGAGAGCCAAGGTGACTCAGCCCCACGATGGTGGTCTAGAGCGAAATAACCCTCGCCGAGAGTTGAAAGTTTACTCAGATCGCTTCTGTCGGCCTAAGTTATGAATACAGTCAGGGGTTTTCCTGGATACAGATCTTCGGAAGTCTGCTAGTGCCGGTCACGCCACGTATAAGACTAGAGACCTGACACTCATCTCGGACTGTAATTATTTTCTAAGCTAGCTGTCCTATTGTAAGGTAAGTCGGGGTGCCGGACCTTGGCAGCCGTCCGTACGTACATTAATGCATCAGACCAGTAATCTTGCTCGCTAGAGTCGGATCGGCGAGGTCCTCTCAGCTATGAAAAGTAGAATTCAGTCTTGCCCGGGGTAGGGCTAGAGACAAGGGTTGATATTGTCTGAAATGGCGCTCCCATCCGATAACTAAACTACCGTACGGGTGCACGCGAGTAGCCTTAATCCTACATATACTAGACTATCATCCTAGGGCGTAGAACTCCCTTTGCGTTACCACTCACAACGCGTGTGTTTCTCGTGCGGTGATCCCGGGAGTATTTTATCGGCGCCCCAACTGCCGGCAACTCGAGTGCACCGGAGCTGAGGAACGCTTCAC 5 | >Seq2 6 | TGCTGCCGATCGCTACGACGTCCTACCTTACACACAACGGGCCGCGTTCATACCCACGTATGAAGACATGCGGTTATCCGTTAGTTGGGGCCCGCGATGGCTGTTGTATGTAATGAGTGGAGTAAAAGAACACGGGAGATGTTCAATTAGGCTCAACCATTTGTATTGAAGGGGTCAAAACCCACCTGTGTAAGTAGCATTGAGTGCTGCTCATGGCCGATTCTCTTACTTTATTTAAGCCCATACCACTGATGGAGGAGACTCCAAGGAATAGTCCAACCGTTGTTCCGGGAAAAACGAACTCGGTCCGCTAGCGTGCCTGTCGTCGCAGCGCAGATTCCCTACTAAGTTATTTTACCGGTTGACTCAGTCGCAGGGCTGCTGAACGCCATGGGGCGGACCGAGCGGAGATCTTTCAGGTGTAATTGGATGATCTTCACCCTAAGGAGGGAGCGGGTTATTGGGCAGGCCCGACCGCCGCTGGAAAGATGCAAGTATGAACAAGGCATGCGTAAGTTATCGTGAAACCATGATGTAACGTTCGCTGAACATCGACTCCGGTGACGACATACGATTCAAGAAGAGAGTGACTCTGTAGGATAACATCCCGCAACGCCTAATCCATCCAGCCTGGCACCATGTATAAAGGGCGTCAGGTATGTTAACGAGACTATTGCCCCTTACTAGACCAAATGTACTGAATGCGTCTTGATTAGACCCCGAAAAGGCATATCACATCAACTTGTCGTGTTAGAGATGTCGCGAGAACCCTCGCCTATTCACGAAGTGCCCATGAGCCTTTGGGCCTGGTTGACAATATGCGCACCCTGTAGCCTACCAACCCTTCATAATCTATCTTCAATTTAGCCTGTCGAGAACGTCCTAACAAGGCTTTTGGCGCTCCAGATGGGACAGTCACTCTCTAGCATCAACCCATAGTTTTTGAGCTACCCGCCCTCGGTGAGAAGGTAGTATGCACGCTGCAGGATTGGAACCACAATGTACGCCGATCCAAGCAGTAGTGGTTCATTGTATAAGTATCCTCCCTTGATTGGTCGAATATTAGGCATGCCCCGGGAGCATGTGGGCTCGAGCCACGGAGAGCAACTAATCGCGCATAAAACAAATACCTCATGGTTTTTGTGCGGAAAACCGTTGGGTGGACCATCAGCGGTTGTGATTACCCAAACGCGTCCTATGCAGTTTTGGGCTTGAGGAACTAAAATCAATTCGTTAAGTCCAGTAGTCCTTATGTGTGGCCGTTCCAAGAATTAATCATATCGCGGAATCACCGGCGATTCATTTTCCGCACCAGAGCTTTCAGGTGCGGCGCGGCCCTAAAAGGTC 7 | >Seq3 8 | TATTGCGCCCTTCAAGAAGCTTCTGCTGACCGTAGGCGTCTCGGCGGTTTGTACTTTGAAAAATTAGCTGCACTACATCCGATGGGTATCCCTCCTCAATCTCAGCAGACCCGGAAAGCGATAGAATCAGCCACGCGGTCGTCCGGGCTAGGGGCCCTGCGCAAGGAAGGTTGGACAGGGCTAGACCCGGAAGCATCGGCTTTTCCTAAATGGTGACGGAGTTATATAGGGTAAGCCTGATAGCGCGGTAGGTGTTATGGCCATCCCCTCGCCTAGCGTGCGCGCAGACAAGTCCAGTCCCGGAGGAGGCATAGGCCTCATTATCATTTCCCTAGAATCGCTCTTGACATCTAGGTTGTACTAGGGACCAGGCGCCCAAAGCGGACGGTTCTCCGTGCTTTCGTGCCGTTTCAGCGTAAGATGCTATTTTTTGGGGAAATGGTCGGCGTGTGCGGGGGAGAACCACGGTACCAACTACGATAAGTCCGTCGTGTAACTTACGTGAAGGTGATGTGAAGCAGGAATCCGTGCCAAAATGTCCGTGCGATATCCAACTTTCATAGTATTACACGAGAGCCTATGATTTGCCCAGGCGCGACCCGTGAATCGAGGTAATCGCCGACCAGATATTGCGAAACACCACATTACATGACTACTGTCCGCTTGAAGAGTTATATACTTGACAGTCCTGGTTGACGGCACAGCATATCTCCAATGTGTGGTTTAAAGTCTCACGTTCTTCATGCGCGCCGGCCCATGGGAACAAGTATCCTTACTTTCGTTTGCAGCACTAGCCGTTCCTTGACATCTGCGGCCAACTTGTGCCTGAACCTGGAGTTTCGACAGCGTGGCGCTCTGGCCTAGTTCTTCGCTGGCACCTGGAAGAGCCGCCGTACAAATGAGGCTCCAAAATAGCACGCTTGCAGCAGTCAAGTTGAACGCCTTAAAAGGCACCGCCGCTCGTTCATTGGGATTCCTTGAGAATCGTGACTTGTTACACTATAAGATCATGGATTGGACAAAATAGGCCAACTCCCGCACGCTGTGGCTATTCTTAAGTTGCATAGGTGGGAGTAGCCTTATACTCGATTTCTAAAAAGAGTAGGTGAGC 9 | >Seq4 10 | TTCCGGCGCCGCACTAATTGAAGTGGTGAGCTGACCAGTCGTTCAGGATCCGAAGGCGGGGATGGCGCTATAGGAGCCGGCAGGTATGCTTTGCCGCAAAATTTCGGGGTGGTGGAACCGTCTTACCGAAAGTTAGCTACAGCCTGGAATGTGAAATTCCATGACCTGCCCGTCCTGTGTCCACAGGGCGACATTTGCCACGTAGGTAGGGCGACCATTAGAATGCTGCATTATCGGGCGATAAAAAGTTTTATACTCAAGAATCCTACAAAGATGAAAATTTCGAAGAGCTGCACGCAGTTGTAAGTTGCTTTTCTGGGGTAATCGAGATTCTCCACCATAACCTGCGCAGTCTTAACCTTAAGACCGTTCATTGATAAAACTTGCTCACGCTCTAGATGGCGTGAAGCGAAACCTAGGAAAAAGTTTTGCAGATAATTAGATTATGCGCGATACTCCGCCGTGTGTTCAATGCATCGTGAAGCTTTACCGCGCCCAAGGGGAGCGTCTCAGTGGGGTTGCCTCCAGGGATATATTGAAAGTTGAAGAAGAAGATCACAGGTTAAGCGGTATGTTAAGTTAGAACTCACGGGGAGCCGCCTTGATTTTGTTCGACATGAACCAGAGACCAGGTGTGTTATGTTCTGGAACCTTAATACGTACGTCGCCAGCACCGAGCCGGCACTCCATCTCTTTTGGGTGCGCAACATTGCTATACTTAGGTGTATTCCTGGGTTGAGTGGCAGGTTTCTCTTAATTCTTCCCTAAGTAGCTCCGAGGATCCATTGACATCTGTCAGCCGTCTTTCCAGAACGTTATAAGACTCGTGAGGAAATTATACAAATCGTTGCCATCATCCAAAGCAAAGTACTTCCGCTTAGGAGTGCCTTGAAGAACCGATTATCTCTGACAATGTAATGCCACAGCACCCTCGACAAAGTTCTACATTCGTTCCAGGTCATGATACAGCGCGCTAAATTACCGCTACGAGCCATACCGGATGGCGGCCGGAGAGCGCTGCAATCGCATGGCTCGGGACCGAACATTGAGACCTGGCTAGTAGGTAGGTGTCAAATCGATATCCACACCTGTCGAAGCAGCTAAAGATCGGTTGCGGCGGGAGTCCTCCATTCAGGCCAAACGTGCAGTGCTCGATGTGCTTCCTATCGCTCT 11 | >Seq5 12 | GATGTTTAGAAGTTTCCAGGTCACGCCAATGATTGGCATTTACACACGTGGATCAGCGGACATATCTAACCCTTAGTGTTCTTAAGAGCAACTCACTACTCCATTTCCACTAACCCCGCCGGCGGTAATTCCAATCTAGTTGATCAGACTTCCCAGTCAATGAAAGCGACACCGTGCGTCTGTAATACCAACAAGACCCTGGCTGTCGTCCCGCAGAGGACGCGGCACCTCCGGATTTTGAGTCCAGTCTGAACGATTTTCGATCACTCACCATGGATCTGGAAAACGGAGTCGAGTACTCACGAGCCAAATTGATGCATTTCCAATGACCCGATGCAGGTGCGACCGATCTTCGCCTATGCTTCCCGCCGTAATTATTGAGTCTGGGTCCCGGCCGCTAACGTTTGACTCACGGGGAGGTACCCGTGCGTATTCTTCTCAAAGTGACGCTGGACAGCAGCGCATGTCCGAGCCCCATCGTCCTATCTGGTGTAGAGTCTTACCTCTAATTAGAGTGATCGAACCAGTAGGTGTCGCGGTCTTAGGGCTCCCATTGTCCAAGGGAACGTGAACAGATATGAATCTGGGAGAATAGTGCAGCGTTGCCCTTCTGGTCGGTCAGCCCTTGCCTACGGCCCGTATGCGGAGAATGAAGGCGTGAAACATTCTGCTCTTTTAGAAGCAGCGGCTGCACCCGTATAACAATCGCACGATCGTACGTCTCATTTGCCGCGTTGGCGCGCCCGTGGATGATGGACCACGGTATGAACCTCTGCACTTCAAATTTGACGCAATCCTGCACTCACCGCACACAGTTCTAGTCTAACCGTCGCAGTGTCTGCTTTAAGGTAGAGATCGATACTTAGGATATGTTCATGTGTGTTTGTAGCGCTGGACCCTCTTATGGTGTGGTCACTTGTGATGGATCGAGGAACTTAGGCGGTTAACTTGTTTCGACGTCTCACCGACAATATCAGGATTTAGTATCG 13 | >Seq6 14 | ACCGAAAATGACAATGTTCACACGCATGCTCGGCGTGGAAAAGAGCCTTTTCTAAGACCGACTCGTTCCGGGCAGCAGGATTATTAGCCAATCAAAATTATATCGACCGGTCATCAAGCTGCGATAGTGCAGGCGCATGCCGTCCAATGGGTCCACGGCGGAAGTGCGTTCGTCTACTCTGTCAAATCTTAACATTTTTTGAGCGGCTAATCCGGCCGGTAGTGTACCGTGAACCAAAGTCCTTCTACGAGCGTATTAGATTGCTCAAAAGATCCGGGAGAATTGACCAGGTCGTATCTTTAAAATAACGCTGGTGCGAGCAGCTGCTGTTTTATCAACACCCATTTAGTCCTGTGAAGTTTGCTTAGCAGATACACCTTCCCGCGTGGTATGAGAGGCTGTTCTTCATTAAAAACTATGAGGCTCTGGCACCTTCGACGCTAACAAAGTCCCCACGGACCATGATACCCTTACGCAACTCTCTTTGCACGCTAGGGCGAGAGTACTGTCCCCCTAGACTAGGTACACGCCGGGTAAACTCTCTCGCACACCTTTACGCTCGACTACAGGCTTCTAACCCTTCCGAACGCATATAATTCAAATGGCACTTAGTAACAGACGAATCACGGCTCACAGGCAGAATTCACTGGAGTAAAAGGATTCAGAACAATAGATAGTGTGTTAACTTTACAGTCATCCGTATTATAACGTAGCGAGAGGATTGAGTTCTTGTTAGGAAGGAAGGTCCTATAGACGAGTGCGGTAGCGCACCCGGTCGCCTTGCGTAGTCATGCCCGACGTGTTGATGGTTCCCTTTTAGCCGCCACACAAGGGATCCGAGGGTGAGAGACACATGGCCCTCACCGACGAGACTTACTCAGCCTGCCTCGCTATTGCCCTCTTTTTGATCGTCCCTTTGTGGCTCTCGAGGACTCGTGCAGCGTGTATCTGGGGATTTGTAAGCTTAAGACTACCTTCCATAGGA 15 | -------------------------------------------------------------------------------- /test/simple_full_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # look for MindTheGap binary. In devel mode, it's in ../build/bin directory. 4 | # In production mode, it's in ../bin directory. 5 | if [ -f "../bin/MindTheGap" ] 6 | then 7 | bindir="../bin" 8 | elif [ -f "../build/bin/MindTheGap" ] 9 | then 10 | bindir="../build/bin" 11 | else 12 | echo "could not find a compiled MindTheGap binary" 13 | exit 1 14 | fi 15 | 16 | RETVAL=0 17 | testDir="test-output" 18 | outputPrefix=$testDir/full-test 19 | goldPrefix="full_test/gold" 20 | 21 | outputPrefix2=$testDir/contig-test 22 | goldPrefix2="contig_test/gold" 23 | 24 | 25 | ## First cleaning test dir 26 | if [ -d $testDir ]; then 27 | rm -rf $testDir 28 | fi 29 | mkdir $testDir 30 | 31 | 32 | 33 | ################################################################################ 34 | # we launch the find module 35 | ################################################################################ 36 | ${bindir}/MindTheGap find -in ../data/reads_r1.fastq,../data/reads_r2.fastq -ref ../data/reference.fasta -out $outputPrefix >$outputPrefix.out -nb-cores 1 2> /dev/null 37 | 38 | ################################################################################ 39 | # we check the results 40 | ################################################################################ 41 | 42 | # Checking the .othervariants.vcf : 43 | sh compare_vcf.sh $outputPrefix.othervariants.vcf $goldPrefix.othervariants.vcf 1> /dev/null 2>&1 44 | var=$? 45 | 46 | if [ $var -eq 0 ] 47 | then 48 | echo "full-test find vcf : PASS" 49 | else 50 | echo "full-test find vcf : FAILED" 51 | RETVAL=1 52 | fi 53 | 54 | # Checking the .breakpoints : 55 | #diff --ignore-matching-lines=">" $outputPrefix.breakpoints $goldPrefix.breakpoints 1> /dev/null 2>&1 56 | #var=$? 57 | 58 | tmp1=$outputPrefix.breakpoints.tmp 59 | tmp2=$testDir/tmp2 60 | 61 | grep -v "^>" $outputPrefix.breakpoints > $tmp1 62 | grep -v "^>" $goldPrefix.breakpoints > $tmp2 63 | 64 | 65 | diff $tmp1 $tmp2 1> /dev/null 2>&1 66 | var=$? 67 | 68 | 69 | if [ $var -eq 0 ] 70 | then 71 | echo "full-test find breakpoints : PASS" 72 | else 73 | echo "full-test find breakpoints : FAILED" 74 | RETVAL=1 75 | fi 76 | 77 | ################################################################################ 78 | # we launch the find module with bed option 79 | ################################################################################ 80 | ${bindir}/MindTheGap find -in ../data/reads_r1.fastq,../data/reads_r2.fastq -ref ../data/reference.fasta -bed ${goldPrefix}.bed -out ${outputPrefix}_bed >${outputPrefix}_bed.out -nb-cores 1 2> /dev/null 81 | 82 | ################################################################################ 83 | # we check the results 84 | ################################################################################ 85 | 86 | # Checking the .othervariants.vcf : 87 | sh compare_vcf.sh ${outputPrefix}_bed.othervariants.vcf ${goldPrefix}_bed.othervariants.vcf 1> /dev/null 2>&1 88 | var=$? 89 | 90 | if [ $var -eq 0 ] 91 | then 92 | echo "full-test find with bed option vcf : PASS" 93 | else 94 | echo "full-test find with bed option vcf : FAILED" 95 | RETVAL=1 96 | fi 97 | 98 | # Checking the .breakpoints : 99 | #diff --ignore-matching-lines=">" $outputPrefix.breakpoints $goldPrefix.breakpoints 1> /dev/null 2>&1 100 | #var=$? 101 | 102 | tmp1=${outputPrefix}_bed.breakpoints.tmp 103 | tmp2=$testDir/tmp2 104 | 105 | grep -v "^>" ${outputPrefix}_bed.breakpoints > $tmp1 106 | grep -v "^>" ${goldPrefix}_bed.breakpoints > $tmp2 107 | 108 | 109 | diff $tmp1 $tmp2 1> /dev/null 2>&1 110 | var=$? 111 | 112 | 113 | if [ $var -eq 0 ] 114 | then 115 | echo "full-test find breakpoints with bed otpion : PASS" 116 | else 117 | echo "full-test find breakpoints with bed otpion : FAILED" 118 | RETVAL=1 119 | fi 120 | 121 | ################################################################################ 122 | # we launch the fill module 123 | ################################################################################ 124 | ${bindir}/MindTheGap fill -graph $outputPrefix.h5 -bkpt $outputPrefix.breakpoints -out $outputPrefix -nb-cores 1 >>$outputPrefix.out 2> /dev/null 125 | 126 | ################################################################################ 127 | # we check the results 128 | ################################################################################ 129 | tmp1=$outputPrefix.insertions.fasta.tmp 130 | tmp2=$testDir/tmp2 131 | 132 | grep -v "^>" $outputPrefix.insertions.fasta > $tmp1 133 | grep -v "^>" $goldPrefix.insertions.fasta > $tmp2 134 | 135 | 136 | diff $tmp1 $tmp2 1> /dev/null 2>&1 137 | var=$? 138 | 139 | if [ $var -eq 0 ] 140 | then 141 | echo "full-test fill fasta : PASS" 142 | else 143 | echo "full-test fill fasta : FAILED" 144 | RETVAL=1 145 | fi 146 | 147 | # Checking the .insertions.vcf : 148 | sh compare_vcf.sh $outputPrefix.insertions.vcf $goldPrefix.insertions.vcf 1> /dev/null 2>&1 149 | var=$? 150 | 151 | if [ $var -eq 0 ] 152 | then 153 | echo "full-test fill vcf : PASS" 154 | else 155 | echo "full-test fill vcf : FAILED" 156 | RETVAL=1 157 | fi 158 | 159 | 160 | ################################################################################ 161 | # we launch the fill module in contig mode 162 | ################################################################################ 163 | ${bindir}/MindTheGap fill -in ../data/contig-reads.fasta.gz -contig ../data/contigs.fasta -abundance-min 3 -out $outputPrefix2 -nb-cores 1 >>$outputPrefix2.out 2> /dev/null 164 | 165 | ################################################################################ 166 | # we check the results 167 | ################################################################################ 168 | tmp1=$outputPrefix2.insertions.fasta.tmp 169 | tmp2=$testDir/tmp2 170 | 171 | grep -v "^>" $outputPrefix2.insertions.fasta > $tmp1 172 | grep -v "^>" $goldPrefix2.insertions.fasta > $tmp2 173 | 174 | 175 | diff $tmp1 $tmp2 1> /dev/null 2>&1 176 | var=$? 177 | 178 | if [ $var -eq 0 ] 179 | then 180 | echo "contig-test fill fasta : PASS" 181 | else 182 | echo "contig-test fill fasta : FAILED" 183 | RETVAL=1 184 | fi 185 | 186 | # Checking the .gfa : 187 | diff $outputPrefix2.gfa $goldPrefix2.gfa 1> /dev/null 2>&1 188 | var=$? 189 | 190 | if [ $var -eq 0 ] 191 | then 192 | echo "contig-test fill gfa : PASS" 193 | else 194 | echo "contig-test fill gfa : FAILED" 195 | RETVAL=1 196 | fi 197 | 198 | ################################################################################ 199 | # we launch the graph simplifications script --> no longer in this repo (see MinYS) 200 | ################################################################################ 201 | #../pipeline/genome_graph/graph_simplification.py ../pipeline/genome_graph/data/simple4.gfa $outputPrefix.simplified.gfa 1> /dev/null 2>&1 202 | 203 | ################################################################################ 204 | # we check the results 205 | ################################################################################ 206 | 207 | #nblink=$(grep "^[^L]" $outputPrefix.simplified.gfa | wc -l) 208 | #nbsegment=$(grep "^[^S]" $outputPrefix.simplified.gfa | wc -l) 209 | 210 | #if [ $nblink -eq 4 ] && [ $nbsegment -eq 4 ] 211 | #then 212 | #echo "graph simplification : PASS" 213 | #else 214 | #echo "graph simplification : FAILED" 215 | #RETVAL=1 216 | #fi 217 | 218 | 219 | 220 | ################################################################################ 221 | # clean up 222 | ################################################################################ 223 | rm -rf $testDir 224 | 225 | # for Jenkins CI platform, we need an exit code: PASS (0) vs. FAILED (1) 226 | exit $RETVAL 227 | -------------------------------------------------------------------------------- /src/IGraphOutput.cpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * GATB : Genome Assembly Tool Box 3 | * Copyright (C) 2014 INRIA 4 | * Authors: R.Chikhi, G.Rizk, E.Drezen 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Affero General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Affero General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Affero General Public License 17 | * along with this program. If not, see . 18 | *****************************************************************************/ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #define DEBUG(a) //printf a 26 | 27 | using namespace std; 28 | 29 | /********************************************************************* 30 | ** METHOD : 31 | ** PURPOSE : Initialize first elements and files (files are erasing) 32 | ** INPUT : 33 | ** OUTPUT : 34 | ** RETURN : 35 | ** REMARKS : 36 | *********************************************************************/ 37 | template 38 | IGraphOutput::IGraphOutput (size_t kmerSize, const string& prefix) 39 | : original(true), _modelKmer(kmerSize), _modelKmerMinusOne(kmerSize-1), _prefix(prefix) 40 | { 41 | first_id_els.node = 0; 42 | first_id_els.edge = 0; 43 | } 44 | 45 | /********************************************************************* 46 | ** METHOD : 47 | ** PURPOSE : load nodes extremities 48 | ** INPUT : 49 | ** OUTPUT : 50 | ** RETURN : 51 | ** REMARKS : 52 | *********************************************************************/ 53 | template 54 | void IGraphOutput::load_nodes_extremities (const string& linear_seqs_name,std::string & infostring) 55 | { 56 | kmer_links.clear(); 57 | 58 | /** We open the bank that holds the extensions. */ 59 | IBank* Nodes = Bank::open (linear_seqs_name); LOCAL (Nodes); 60 | 61 | long nb_nodes = first_id_els.node; 62 | long totalnt =0; 63 | DEBUG (("[GraphOutput::load_nodes_extremities] kmerSize=%ld name=%s\n", _modelKmerMinusOne.getKmerSize(), linear_seqs_name.c_str())); 64 | 65 | Iterator* itSeq = Nodes->iterator(); LOCAL (itSeq); 66 | for (itSeq->first(); !itSeq->isDone(); itSeq->next()) 67 | { 68 | char* rseq = itSeq->item().getDataBuffer(); 69 | int readlen = itSeq->item().getDataSize(); 70 | totalnt += readlen; 71 | DEBUG (("[GraphOutput::load_nodes_extremities] seq.size=%ld\n", readlen)); 72 | 73 | ModelKmer leftKmer = _modelKmerMinusOne.codeSeed (rseq, Data::ASCII, 0); 74 | ModelKmer rightKmer = _modelKmerMinusOne.codeSeed (rseq, Data::ASCII, readlen-_modelKmerMinusOne.getKmerSize()); 75 | 76 | kmer_links[leftKmer. value()].insert (node_strand(nb_nodes, leftKmer.strand(), LEFT)); 77 | kmer_links[rightKmer.value()].insert (node_strand(nb_nodes, rightKmer.strand(), RIGHT)); 78 | 79 | nb_nodes++; 80 | } 81 | 82 | infostring += Stringify::format ("\t%i", nb_nodes) ; 83 | infostring += Stringify::format ("\t%i", totalnt) ; 84 | 85 | DEBUG (("[GraphOutput::load_nodes_extremities] nbNodes=%ld\n", nb_nodes)); 86 | } 87 | 88 | /********************************************************************* 89 | ** METHOD : 90 | ** PURPOSE : construct node file and edge file for graph file 91 | ** INPUT : 92 | ** OUTPUT : 93 | ** RETURN : 94 | ** REMARKS : 95 | *********************************************************************/ 96 | template 97 | id_els IGraphOutput::construct_graph (const string& linear_seqs_name, const string& direction) 98 | { 99 | DEBUG (("[GraphOutput::construct_graph] linear_seqs_name=%s direction=%s\n", linear_seqs_name.c_str(), direction.c_str() )); 100 | 101 | /** We open the bank that holds the extensions. */ 102 | IBank* Nodes = Bank::open (linear_seqs_name); LOCAL (Nodes); 103 | 104 | id_els nb_els = first_id_els; 105 | bool found = false; 106 | 107 | print_sequence_head (linear_seqs_name, direction); 108 | 109 | Iterator* itSeq = Nodes->iterator(); LOCAL (itSeq); 110 | for (itSeq->first(); !itSeq->isDone(); itSeq->next()) 111 | { 112 | /** We get the current sequence as a string object. 113 | * NOTE: we can't rely on strlen() on itSeq->item().getDataBuffer() because the buffer may be not 114 | * 0 terminated (it's just a buffer, not a C-like string). */ 115 | string seq; seq.assign (itSeq->item().getDataBuffer(), itSeq->item().getDataSize()); 116 | 117 | ModelKmer leftKmer = _modelKmerMinusOne.codeSeed (seq.c_str(), Data::ASCII, 0); 118 | ModelKmer rightKmer = _modelKmerMinusOne.codeSeed (seq.c_str(), Data::ASCII, seq.size()-_modelKmerMinusOne.getKmerSize()); 119 | 120 | print_edges (leftKmer, seq.size(), LEFT, nb_els); // left edges (revcomp extensions) 121 | print_edges (rightKmer, seq.size(), RIGHT, nb_els); // right edges 122 | 123 | print_node (nb_els.node, seq); 124 | nb_els.node++; 125 | 126 | found = true ; 127 | 128 | } /* end of for (itSeq->first(); !itSeq->isDone(); itSeq->next()) */ 129 | 130 | if (found) { print_sequence_end (); } 131 | 132 | return nb_els; 133 | } 134 | 135 | /********************************************************************* 136 | ** METHOD : 137 | ** PURPOSE : construct node file and edge file for graph file 138 | ** INPUT : 139 | ** OUTPUT : 140 | ** RETURN : 141 | ** REMARKS : 142 | *********************************************************************/ 143 | template 144 | void IGraphOutput::print_edges (const ModelKmer& kmer, size_t seqLen, LeftOrRight direction, id_els& nb_els) 145 | { 146 | /** Shortcut. */ 147 | std::set& nodes = kmer_links[kmer.value()]; 148 | 149 | size_t sizeKmer = _modelKmerMinusOne.getKmerSize(); 150 | 151 | static const char* table0[] = { "R", "F" }; 152 | static const char* table1[] = { "F", "R" }; 153 | 154 | for (typename set::iterator it = nodes.begin(); it != nodes.end(); it++) 155 | { 156 | long cur_node = it->node; 157 | Strand cur_strand = it->strand; 158 | LeftOrRight cur_left_or_right = it->left_or_right; 159 | 160 | // prevent self loops on same kmer 161 | if (cur_node == nb_els.node) { if (seqLen == sizeKmer) { continue; } } 162 | 163 | string label = table0[direction]; 164 | 165 | if (cur_left_or_right == direction) 166 | { 167 | if (cur_strand != kmer.strand()) { label += table1[direction]; } 168 | else { continue; } 169 | } 170 | else 171 | { 172 | if (cur_strand == kmer.strand()) { label += table0[direction]; } 173 | else { continue; } 174 | } 175 | 176 | print_edge (nb_els.edge, nb_els.node, cur_node, label, ""); 177 | nb_els.edge++; 178 | } 179 | } 180 | 181 | // WARNING !!! The following code is not generic !!! 182 | // It is designed to cope with 4 values of supported kmer size. 183 | 184 | template class IGraphOutput ; 185 | template class IGraphOutput ; 186 | template class IGraphOutput ; 187 | template class IGraphOutput ; 188 | 189 | 190 | -------------------------------------------------------------------------------- /src/Utils.cpp: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * MindTheGap: Integrated detection and assembly of insertion variants 3 | * A tool from the GATB (Genome Assembly Tool Box) 4 | * Copyright (C) 2014 INRIA 5 | * Authors: C.Lemaitre, G.Rizk, R. Chikhi 6 | * 7 | * This program is free software: you can redistribute it and/or modify 8 | * it under the terms of the GNU Affero General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU Affero General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Affero General Public License 18 | * along with this program. If not, see . 19 | *****************************************************************************/ 20 | 21 | #include 22 | 23 | void revcomp_sequence(char s[], int len) 24 | { 25 | #define CHAR_REVCOMP(a,b) {switch(a){\ 26 | case 'A': b='T';break;case 'C': b='G';break;case 'G': b='C';break;case 'T': b='A';break;default: b=a;break;}} 27 | int i; 28 | unsigned char t; 29 | for (i=0;i %.2f\n--------\n",a.c_str(),b.c_str(),identity); 188 | return identity; 189 | } 190 | 191 | bool all_consensuses_almost_identical(set consensuses, int identity_threshold) 192 | { 193 | for (set::iterator it_a = consensuses.begin(); it_a != consensuses.end(); it_a++) 194 | { 195 | set::iterator it_b = it_a; 196 | advance(it_b,1); 197 | while (it_b != consensuses.end()) 198 | { 199 | if (needleman_wunsch(it_a->seq,it_b->seq, NULL, NULL, NULL) * 100 < identity_threshold) 200 | return false; 201 | advance(it_b,1); 202 | } 203 | } 204 | return true; 205 | } 206 | 207 | 208 | void remove_almost_identical_solutions(std::vector& consensuses, int identity_threshold) 209 | { 210 | // heuristic : add first seq to final set. Compare every seq to final_seq and add to final_set if different 211 | 212 | std::vector final_set; 213 | final_set.push_back(*consensuses.begin() ); 214 | //std::cerr << "remove_almost... after first push_back" << std::endl; 215 | 216 | for (std::vector::iterator it_a=consensuses.begin(); it_a!=consensuses.end(); ++it_a) // could be improved : no need to compare first seq 217 | { 218 | bool found_a_similar_seq = false; 219 | for (std::vector::iterator it_b=final_set.begin(); it_b!=final_set.end(); ++it_b){ 220 | if (it_a->seq.compare(it_b->seq) == 0 || needleman_wunsch(it_a->seq,it_b->seq, NULL, NULL, NULL) * 100 >= identity_threshold){ // time optimisation ? if identical sequences, will not run needleman 221 | 222 | //This insertion is removed, but we select the one with nb_errors_in_anchor minimal 223 | if(it_a->nb_errors_in_anchor < it_b->nb_errors_in_anchor){ 224 | it_b->seq = it_a->seq; 225 | it_b->nb_errors_in_anchor = it_a->nb_errors_in_anchor; 226 | } 227 | found_a_similar_seq = true; 228 | break; 229 | } 230 | } 231 | // if the sequence has a %id always < threashold for all sequences in final_set, we add it to the final set. 232 | if(!found_a_similar_seq){ 233 | final_set.push_back(*it_a); 234 | } 235 | } 236 | 237 | consensuses = final_set; 238 | } 239 | 240 | 241 | double median(std::vector &v) 242 | { 243 | size_t n = v.size() / 2; 244 | std::nth_element(v.begin(), v.begin()+n, v.end()); 245 | unsigned int vn = v[n]; 246 | if(v.size()%2 == 1) 247 | { 248 | return vn; 249 | }else 250 | { 251 | std::nth_element(v.begin(), v.begin()+n-1, v.end()); 252 | return 0.5*(vn+v[n-1]); 253 | } 254 | } 255 | --------------------------------------------------------------------------------