├── src ├── .DS_Store ├── scrup_vcf │ ├── scrup_svs.cpp │ └── scrup_vcf.h ├── convert │ ├── Convert_Pindel.h │ ├── Convert_MUMmer.h │ ├── Convert_Bionano.h │ ├── Convert_Honey_tails.h │ ├── Convert_Assemblytics.h │ ├── ConvertMQ0Bed.h │ ├── Process_Coverage.h │ ├── Convert_hapcut2.h │ ├── Update_bam_pacbio.h │ ├── Process_Lumpy.h │ ├── Convert_VCF_to_BED.h │ ├── ConvertMQ0Bed.cpp │ ├── Convert_MUMmer.cpp │ ├── Process_Coverage.cpp │ ├── Update_bam_pacbio.cpp │ ├── Convert_Honey_tails.cpp │ ├── Convert_Pindel.cpp │ ├── Convert_hapcut2.cpp │ ├── Convert_Assemblytics.cpp │ └── Convert_Bionano.cpp ├── Extract_Seq.h ├── simulator │ ├── Pac_Simulator.h │ ├── Sim_reads.h │ ├── test_cov.h │ ├── Eval_vcf.h │ ├── Error_scanner.h │ ├── SV_Simulator.h │ ├── test_cov.cpp │ ├── Pac_Simulator.cpp │ └── Sim_reads.cpp ├── vcfs │ ├── Compoverlap_VCF.h │ ├── Combine_3_VCF.h │ ├── Detect_nested.h │ ├── Generate_distMat.h │ ├── Filter_vcf.h │ ├── Annotate_vcf.h │ ├── Merge_VCF.h │ ├── Detect_nested.cpp │ ├── Generate_distMat.cpp │ ├── Combine_3_VCF.cpp │ ├── Compoverlap_VCF.cpp │ └── Filter_vcf.cpp ├── CorrectAllele.h ├── analysis_sv │ ├── Select_samples.h │ ├── MT_identifier.h │ ├── Summ_mat.h │ ├── MUMmer_overlap.h │ ├── GIAB_summary.h │ ├── Density_VCF.h │ ├── Simplify_SVs.h │ ├── Density_VCF.cpp │ ├── MT_identifier.cpp │ ├── Select_samples.cpp │ ├── MUMmer_overlap.cpp │ ├── Summ_mat.cpp │ └── Simplify_SVs.cpp ├── DetectDif.h ├── Summarize_SV.h ├── phasing │ ├── Phasing_vcf.h │ └── Phasing_vcf.cpp ├── R-scripts │ └── SUR_plots.R ├── snp_overlap │ └── Overlap_snps.h ├── merge_vcf │ ├── Paramer.h │ ├── IntervallTree.h │ ├── combine_svs.h │ └── TNode.h ├── structs.h ├── GzipStream.h ├── DetectDif.cpp ├── CorrectAllele.cpp └── Extract_Seq.cpp ├── Debug ├── src │ ├── .DS_Store │ ├── phasing │ │ └── subdir.mk │ ├── scrup_vcf │ │ └── subdir.mk │ ├── snp_overlap │ │ └── subdir.mk │ ├── merge_vcf │ │ └── subdir.mk │ ├── subdir.mk │ ├── simulator │ │ └── subdir.mk │ ├── vcfs │ │ └── subdir.mk 
│ ├── analysis_sv │ │ └── subdir.mk │ └── convert │ │ └── subdir.mk ├── objects.mk ├── sources.mk └── makefile ├── HG002_Pac_error_profile_bwa.txt.zip ├── NA12878_nano_error_profile_bwa.txt.zip ├── INSTALL ├── .project ├── LICENSE ├── .settings └── language.settings.xml └── README.md /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fritzsedlazeck/SURVIVOR/HEAD/src/.DS_Store -------------------------------------------------------------------------------- /Debug/src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fritzsedlazeck/SURVIVOR/HEAD/Debug/src/.DS_Store -------------------------------------------------------------------------------- /HG002_Pac_error_profile_bwa.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fritzsedlazeck/SURVIVOR/HEAD/HG002_Pac_error_profile_bwa.txt.zip -------------------------------------------------------------------------------- /NA12878_nano_error_profile_bwa.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fritzsedlazeck/SURVIVOR/HEAD/NA12878_nano_error_profile_bwa.txt.zip -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | change in the Debug directory. 2 | type: 3 | make 4 | 5 | you should find an executable in the same directory. 6 | If you have problems please let me know: 7 | fritz.sedlazeck@gmail.com 8 | -------------------------------------------------------------------------------- /Debug/objects.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. 
Do not edit! 3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := 8 | 9 | -------------------------------------------------------------------------------- /src/scrup_vcf/scrup_svs.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "scrup_vcf.h" 3 | 4 | 5 | //void scrup_svs(std::string svs_vcf, string snp_vcf, std::string output){ 6 | 7 | //parse SVs in tree structure. 8 | 9 | 10 | //run over SNP and intersect with SVs 11 | // hom del + SNP -> delete del 12 | // het del + het SNP -> flag del 13 | // 14 | 15 | 16 | //output vcf file. 17 | 18 | //} 19 | -------------------------------------------------------------------------------- /src/convert/Convert_Pindel.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Pindel.h 3 | * 4 | * Created on: Mar 3, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERT_PINDEL_H_ 9 | #define CONVERT_PINDEL_H_ 10 | 11 | #include "Process_Lumpy.h" 12 | 13 | void process_Pindel( std::string pindel_vcf,int min_support, int min_length,std::string output); 14 | 15 | 16 | #endif /* CONVERT_PINDEL_H_ */ 17 | -------------------------------------------------------------------------------- /src/Extract_Seq.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Extract_Seq.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef EXTRACT_SEQ_H_ 9 | #define EXTRACT_SEQ_H_ 10 | 11 | #include "vcfs/Merge_VCF.h" 12 | #include "simulator/Eval_vcf.h" 13 | void extract_breakpoint_seq(std::string vcf_file, std::string reference_file, int len,std::string output); 14 | 15 | 16 | #endif /* EXTRACT_SEQ_H_ */ 17 | -------------------------------------------------------------------------------- /src/convert/Convert_MUMmer.h: -------------------------------------------------------------------------------- 1 | /* 2 | 
* Convert_MUMmer.h 3 | * 4 | * Created on: Jul 31, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef CONVERT_CONVERT_MUMMER_H_ 9 | #define CONVERT_CONVERT_MUMMER_H_ 10 | 11 | 12 | #include "Process_Lumpy.h" 13 | 14 | 15 | void convert_mummer_svs(std::string mummer, int min_len, std::string output); 16 | 17 | 18 | 19 | 20 | #endif /* CONVERT_CONVERT_MUMMER_H_ */ 21 | -------------------------------------------------------------------------------- /src/convert/Convert_Bionano.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Bionano.h 3 | * 4 | * Created on: Jun 20, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERT_CONVERT_BIONANO_H_ 9 | #define CONVERT_CONVERT_BIONANO_H_ 10 | #include "Process_Lumpy.h" 11 | void process_Bionano(std::string bionano, std::string output); 12 | void process_CG(std::string gc_file, std::string output) ; 13 | #endif /* CONVERT_CONVERT_BIONANO_H_ */ 14 | -------------------------------------------------------------------------------- /src/convert/Convert_Honey_tails.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Honey_tails.h 3 | * 4 | * Created on: Jun 6, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERT_CONVERT_HONEY_TAILS_H_ 9 | #define CONVERT_CONVERT_HONEY_TAILS_H_ 10 | 11 | 12 | #include "Process_Lumpy.h" 13 | void process_Honey( std::string assemblytics, int minlen, std::string output); 14 | 15 | 16 | 17 | 18 | #endif /* CONVERT_CONVERT_HONEY_TAILS_H_ */ 19 | -------------------------------------------------------------------------------- /src/convert/Convert_Assemblytics.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Assemblytics.h 3 | * 4 | * Created on: May 26, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERT_CONVERT_ASSEMBLYTICS_H_ 9 | #define CONVERT_CONVERT_ASSEMBLYTICS_H_ 10 | 11 | 12 | #include "Process_Lumpy.h" 13 | 14 | void 
process_Assemblytics( std::string assemblytics, int minlen, std::string output); 15 | 16 | 17 | 18 | #endif /* CONVERT_CONVERT_ASSEMBLYTICS_H_ */ 19 | -------------------------------------------------------------------------------- /src/convert/ConvertMQ0Bed.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ConvertMQ0Bed.h 3 | * 4 | * Created on: Mar 16, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERTMQ0BED_H_ 9 | #define CONVERTMQ0BED_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | void comp_mq0bed(std::string cov_file,int border, int cov_tresh); 18 | 19 | 20 | 21 | #endif /* CONVERTMQ0BED_H_ */ 22 | -------------------------------------------------------------------------------- /src/simulator/Pac_Simulator.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Pac_Simulator.h 3 | * 4 | * Created on: Feb 1, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef SIMULATOR_PAC_SIMULATOR_H_ 9 | #define SIMULATOR_PAC_SIMULATOR_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | void simulate_pac(std::string genome,std::string out); 19 | 20 | #endif /* SIMULATOR_PAC_SIMULATOR_H_ */ 21 | -------------------------------------------------------------------------------- /src/vcfs/Compoverlap_VCF.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Compoverlap_VCF.h 3 | * 4 | * Created on: Feb 27, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef COMPOVERLAP_VCF_H_ 9 | #define COMPOVERLAP_VCF_H_ 10 | #include "Merge_VCF.h" 11 | #include "../simulator/Eval_vcf.h" 12 | 13 | void comp_overlap_vcf(std::string vcf1, std::string vcf2,int max_dis,std::string output); 14 | void print_entry(strvcfentry entry, FILE *& out); 15 | void print_header(std::string vcf_file, FILE *& out); 16 | 17 | #endif /* COMPOVERLAP_VCF_H_ */ 18 | 
-------------------------------------------------------------------------------- /src/convert/Process_Coverage.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Process_Coverage.h 3 | * 4 | * Created on: Apr 13, 2017 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERT_PROCESS_COVERAGE_H_ 9 | #define CONVERT_PROCESS_COVERAGE_H_ 10 | #include "../vcfs/Merge_VCF.h" 11 | #include "../structs.h" 12 | #include "../simulator/Eval_vcf.h" 13 | #include 14 | #include 15 | void summarize_badcoverage(std::string filename,int win_size,int min_cov, std::string output); 16 | 17 | 18 | #endif /* CONVERT_PROCESS_COVERAGE_H_ */ 19 | -------------------------------------------------------------------------------- /src/vcfs/Combine_3_VCF.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Combine_3_VCF.h 3 | * 4 | * Created on: Mar 10, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef COMBINE_3_VCF_H_ 9 | #define COMBINE_3_VCF_H_ 10 | #include "../simulator/Eval_vcf.h" 11 | #include "Compoverlap_VCF.h" 12 | 13 | void combine_calls(std::string vcf_delly, std::string vcf_lumpy,std::string vcf_pindel,int max_dist,std::string output); 14 | void combine_calls_new(std::string files, int max_dist,int min_caller, std::string output); 15 | 16 | #endif /* COMBINE_3_VCF_H_ */ 17 | -------------------------------------------------------------------------------- /src/vcfs/Detect_nested.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Detect_nested.h 3 | * 4 | * Created on: Apr 27, 2017 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef VCFS_DETECT_NESTED_H_ 9 | #define VCFS_DETECT_NESTED_H_ 10 | 11 | #include "Merge_VCF.h" 12 | #include "../simulator/Eval_vcf.h" 13 | 14 | struct nested_sv{ 15 | std::string chr; 16 | int id; 17 | int del; 18 | int inv; 19 | int dup; 20 | int others; 21 | }; 22 | 23 | void detect_nested(std::string vcf_file, std::string output ); 
24 | 25 | #endif /* VCFS_DETECT_NESTED_H_ */ 26 | -------------------------------------------------------------------------------- /src/convert/Convert_hapcut2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_hapcut2.h 3 | * 4 | * Created on: Mar 14, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef CONVERT_CONVERT_HAPCUT2_H_ 9 | #define CONVERT_CONVERT_HAPCUT2_H_ 10 | 11 | 12 | #include "../vcfs/Merge_VCF.h" 13 | #include "../structs.h" 14 | #include "../simulator/Eval_vcf.h" 15 | #include 16 | #include 17 | using namespace std; 18 | 19 | 20 | void process_hapcut(std::string orig_snp, std::string hapcut2, std::string output); 21 | #endif /* CONVERT_CONVERT_HAPCUT2_H_ */ 22 | -------------------------------------------------------------------------------- /src/convert/Update_bam_pacbio.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Update_bam_pacbio.h 3 | * 4 | * Created on: Mar 15, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef CONVERT_UPDATE_BAM_PACBIO_H_ 9 | #define CONVERT_UPDATE_BAM_PACBIO_H_ 10 | 11 | #include "../vcfs/Merge_VCF.h" 12 | #include "../structs.h" 13 | #include "../simulator/Eval_vcf.h" 14 | #include 15 | #include 16 | 17 | void process_sam_forpacbio(std::string unmapped_sam, std::string mapped_sam, std::string output_sam); 18 | 19 | 20 | #endif /* CONVERT_UPDATE_BAM_PACBIO_H_ */ 21 | -------------------------------------------------------------------------------- /src/CorrectAllele.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CorrectAllele.h 3 | * 4 | * Created on: Jun 18, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CORRECTALLELE_H_ 9 | #define CORRECTALLELE_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "structs.h" 21 | #include "vcfs/Merge_VCF.h" 22 | 23 | void 
correct_alleles(std::string vcf_file,std::string table, std::string output); 24 | 25 | #endif /* CORRECTALLELE_H_ */ 26 | -------------------------------------------------------------------------------- /src/vcfs/Generate_distMat.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Generate_distMat.h 3 | * 4 | * Created on: Jul 17, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef VCFS_GENERATE_DISTMAT_H_ 9 | #define VCFS_GENERATE_DISTMAT_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "Merge_VCF.h" 20 | 21 | void generate_dist_mat(std::string svs_vcf, std::string snp_vcf, std::string weighted_file, std::string output); 22 | 23 | #endif /* VCFS_GENERATE_DISTMAT_H_ */ 24 | -------------------------------------------------------------------------------- /src/analysis_sv/Select_samples.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Select_samples.h 3 | * 4 | * Created on: Feb 27, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef ANALYSIS_SV_SELECT_SAMPLES_H_ 9 | #define ANALYSIS_SV_SELECT_SAMPLES_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | //#include"../vcfs/Merge_VCF.h" 20 | using namespace std; 21 | 22 | 23 | void select_greedy(std::string vcf_file, std::string output); 24 | 25 | #endif /* ANALYSIS_SV_SELECT_SAMPLES_H_ */ 26 | -------------------------------------------------------------------------------- /src/DetectDif.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DetectDif.h 3 | * 4 | * Created on: Oct 30, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef DETECTDIF_H_ 9 | #define DETECTDIF_H_ 10 | 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace std; 21 | struct 
svs_str{ 22 | std::string chr; 23 | int start; 24 | int stop; 25 | int type; 26 | bool joined; 27 | }; 28 | 29 | void detect_divergence(std::string file, float precent_overlap, std::string output); 30 | 31 | 32 | #endif /* DETECTDIF_H_ */ 33 | -------------------------------------------------------------------------------- /src/scrup_vcf/scrup_vcf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * scrup_vcf.h 3 | * 4 | * Created on: Jun 25, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef SRC_SCRUP_VCF_SCRUP_VCF_H_ 9 | #define SRC_SCRUP_VCF_SCRUP_VCF_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include "../simulator/Eval_vcf.h" 21 | //#include "Paramer.h" 22 | 23 | //void scrup_svs(std::string svs_vcf, string snp_vcf, std::string output); 24 | 25 | 26 | 27 | #endif /* SRC_SCRUP_VCF_SCRUP_VCF_H_ */ 28 | -------------------------------------------------------------------------------- /src/analysis_sv/MT_identifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MT_identifier.h 3 | * 4 | * Created on: Aug 15, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef ANALYSIS_SV_MT_IDENTIFIER_H_ 9 | #define ANALYSIS_SV_MT_IDENTIFIER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | using namespace std; 21 | 22 | struct segment_str{ 23 | int pos; 24 | std::string chr; 25 | bool strand; 26 | int MQ; 27 | int read_start; 28 | }; 29 | 30 | #endif /* ANALYSIS_SV_MT_IDENTIFIER_H_ */ 31 | -------------------------------------------------------------------------------- /src/simulator/Sim_reads.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Nanopore_sim.h 3 | * 4 | * Created on: May 30, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef NANOPORE_SIM_H_ 9 | 
#define NANOPORE_SIM_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "Error_scanner.h" 23 | using namespace std; 24 | void simulate_reads(std::string genome,std::string error_profile,int coverage, std::string output); 25 | 26 | #endif /* NANOPORE_SIM_H_ */ 27 | -------------------------------------------------------------------------------- /src/analysis_sv/Summ_mat.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Summ_mat.h 3 | * 4 | * Created on: Jul 5, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef ANALYSIS_SV_SUMM_MAT_H_ 9 | #define ANALYSIS_SV_SUMM_MAT_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | using namespace std; 21 | 22 | void summarize_svs_table_window (std::string venn_file,int window,std::string output); 23 | void summarize_svs_table_window_stream(int window, std::string output); 24 | 25 | #endif /* ANALYSIS_SV_SUMM_MAT_H_ */ 26 | -------------------------------------------------------------------------------- /src/analysis_sv/MUMmer_overlap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MUMmer_overlap.h 3 | * 4 | * Created on: Dec 27, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef ANALYSIS_SV_MUMMER_OVERLAP_H_ 9 | #define ANALYSIS_SV_MUMMER_OVERLAP_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include"../vcfs/Merge_VCF.h" 21 | using namespace std; 22 | 23 | void overlapp_mummer(std::string vcf_SVs_file, std::string mummer_files, int max_dist, std::string output); 24 | 25 | 26 | 27 | #endif /* ANALYSIS_SV_MUMMER_OVERLAP_H_ */ 28 | -------------------------------------------------------------------------------- 
/src/analysis_sv/GIAB_summary.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GIAB_summary.h 3 | * 4 | * Created on: Apr 13, 2017 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef GIAB_SUMMARY_H_ 9 | #define GIAB_SUMMARY_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include"../vcfs/Merge_VCF.h" 20 | using namespace std; 21 | 22 | struct svstruct{ 23 | std::vector support; 24 | std::string type; 25 | double size; 26 | }; 27 | 28 | void summary_giab(std::string venn_file, std::string output); 29 | 30 | 31 | #endif /* GIAB_SUMMARY_H_ */ 32 | -------------------------------------------------------------------------------- /src/vcfs/Filter_vcf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Filter_vcf.h 3 | * 4 | * Created on: Feb 12, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef FILTER_VCF_H_ 9 | #define FILTER_VCF_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "../structs.h" 17 | #include "Merge_VCF.h" 18 | 19 | void filter_vcf(std::string vcf_file,std::string genomic_regions,int min_size, int max_size,double min_AF,int min_reads,std::string outputvcf); 20 | 21 | void filter_vcf_sniffles(std::string vcf_file,int min_lenght, std::string outputvcf); 22 | void summarize_paper_gaib(std::string venn_file); 23 | #endif /* FILTER_VCF_H_ */ 24 | -------------------------------------------------------------------------------- /src/analysis_sv/Density_VCF.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Density_VCF.h 3 | * 4 | * Created on: May 15, 2020 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef ANALYSIS_SV_DENSITY_VCF_H_ 9 | #define ANALYSIS_SV_DENSITY_VCF_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 
20 | #include "../simulator/Eval_vcf.h" 21 | #include "../merge_vcf/Paramer.h" 22 | #include "../merge_vcf/combine_svs.h" 23 | using namespace std; 24 | 25 | void density_VCF(std::string vcf_file, int window, std::string output); 26 | 27 | 28 | #endif /* ANALYSIS_SV_DENSITY_VCF_H_ */ 29 | -------------------------------------------------------------------------------- /src/simulator/test_cov.h: -------------------------------------------------------------------------------- 1 | /* 2 | * test_cov.h 3 | * 4 | * Created on: Jun 17, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef TEST_COV_H_ 9 | #define TEST_COV_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | //#include 20 | 21 | using namespace std; 22 | 23 | ///void est_cov(int coverage,int read_length,int num_SV,int genome,int min_overlap,int min_support); 24 | void est_cov(int read_length, int num_SV, int min_overlap, int min_support,int cov); 25 | void count_valid_reads(double allowed_n_ratio); 26 | 27 | 28 | #endif /* TEST_COV_H_ */ 29 | -------------------------------------------------------------------------------- /src/Summarize_SV.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Summarize_SV.h 3 | * 4 | * Created on: Nov 18, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef SUMMARIZE_SV_H_ 9 | #define SUMMARIZE_SV_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "vcfs/Merge_VCF.h" 20 | 21 | using namespace std; 22 | void summary_SV(std::string filename, int min_size, int max_size, int min_reads,std::string output); 23 | void summary_venn(std::string filename, bool normalize, std::string output); 24 | void summary_SV_stream(int min_size, int max_size, std::string output); 25 | 26 | #endif /* SUMMARIZE_SV_H_ */ 27 | -------------------------------------------------------------------------------- 
/src/simulator/Eval_vcf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Eval_vcf.h 3 | * 4 | * Created on: Feb 12, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef EVAL_VCF_H_ 9 | #define EVAL_VCF_H_ 10 | #include 11 | 12 | #include "../vcfs/Merge_VCF.h" 13 | #include "../structs.h" 14 | struct strreport{ 15 | short del; 16 | short dup; 17 | short inv; 18 | short tra; 19 | short ins; 20 | short other; 21 | }; 22 | void eval_vcf(std::string vcf_file,std::string bed_file,int max_allowed_dist,std::string output); 23 | bool match_coords(strsimul c1, strvcfentry c2, int max_allowed_dist); 24 | std::string trans_type(short type); 25 | 26 | void eval_paper(std::string vcf_file,std::string bed_file,int max_allowed_dist); 27 | 28 | #endif /* EVAL_VCF_H_ */ 29 | -------------------------------------------------------------------------------- /src/convert/Process_Lumpy.h: -------------------------------------------------------------------------------- 1 | /* 2 | * process_Lumpy.h 3 | * 4 | * Created on: Feb 24, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef PROCESS_LUMPY_H_ 9 | #define PROCESS_LUMPY_H_ 10 | 11 | #include "../vcfs/Merge_VCF.h" 12 | #include "../structs.h" 13 | #include "../simulator/Eval_vcf.h" 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | void print_header(std::string name, std::string output); 24 | void print_entries(std::string output, std::vector& entries); 25 | void process_Lumpy( std::string lumpy_bede, std::string output); 26 | void trans_vcf(std::string in_vcf, std::string out_vcf); 27 | 28 | #endif /* PROCESS_LUMPY_H_ */ 29 | -------------------------------------------------------------------------------- /Debug/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. 
Do not edit! 3 | ################################################################################ 4 | 5 | C_UPPER_SRCS := 6 | CXX_SRCS := 7 | C++_SRCS := 8 | OBJ_SRCS := 9 | CC_SRCS := 10 | ASM_SRCS := 11 | C_SRCS := 12 | CPP_SRCS := 13 | O_SRCS := 14 | S_UPPER_SRCS := 15 | CC_DEPS := 16 | C++_DEPS := 17 | EXECUTABLES := 18 | OBJS := 19 | C_UPPER_DEPS := 20 | CXX_DEPS := 21 | C_DEPS := 22 | CPP_DEPS := 23 | 24 | # Every subdirectory with source files must be described here 25 | SUBDIRS := \ 26 | src \ 27 | src/analysis_sv \ 28 | src/convert \ 29 | src/merge_vcf \ 30 | src/phasing \ 31 | src/scrup_vcf \ 32 | src/simulator \ 33 | src/snp_overlap \ 34 | src/vcfs \ 35 | 36 | -------------------------------------------------------------------------------- /src/analysis_sv/Simplify_SVs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Simplify_SVs.h 3 | * 4 | * Created on: Nov 28, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef ANALYSIS_SV_SIMPLIFY_SVS_H_ 9 | #define ANALYSIS_SV_SIMPLIFY_SVS_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include"../vcfs/Merge_VCF.h" 20 | using namespace std; 21 | 22 | struct sv_simple_str { 23 | strcoordinate start; 24 | strcoordinate stop; 25 | std::string svtype; 26 | std::map< std::string,bool > accessions; 27 | pair strands; 28 | }; 29 | 30 | void simplify_svs(std::string file, std::string pop_file, int min_size, std::string output); 31 | 32 | 33 | #endif /* ANALYSIS_SV_SIMPLIFY_SVS_H_ */ 34 | -------------------------------------------------------------------------------- /Debug/src/phasing/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/phasing/Phasing_vcf.cpp 8 | 9 | OBJS += \ 10 | ./src/phasing/Phasing_vcf.o 11 | 12 | CPP_DEPS += \ 13 | ./src/phasing/Phasing_vcf.d 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | 18 | src/phasing/%.o: ../src/phasing/%.cpp src/phasing/subdir.mk 19 | @echo 'Building file: $<' 20 | @echo 'Invoking: Cross G++ Compiler' 21 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 22 | @echo 'Finished building: $<' 23 | @echo ' ' 24 | 25 | 26 | -------------------------------------------------------------------------------- /Debug/src/scrup_vcf/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/scrup_vcf/scrup_svs.cpp 8 | 9 | OBJS += \ 10 | ./src/scrup_vcf/scrup_svs.o 11 | 12 | CPP_DEPS += \ 13 | ./src/scrup_vcf/scrup_svs.d 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | src/scrup_vcf/%.o: ../src/scrup_vcf/%.cpp src/scrup_vcf/subdir.mk 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: Cross G++ Compiler' 20 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 21 | @echo 'Finished building: $<' 22 | @echo ' ' 23 | 24 | 25 | -------------------------------------------------------------------------------- /Debug/src/snp_overlap/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/snp_overlap/Overlap_snps.cpp 8 | 9 | OBJS += \ 10 | ./src/snp_overlap/Overlap_snps.o 11 | 12 | CPP_DEPS += \ 13 | ./src/snp_overlap/Overlap_snps.d 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | src/snp_overlap/%.o: ../src/snp_overlap/%.cpp src/snp_overlap/subdir.mk 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: Cross G++ Compiler' 20 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 21 | @echo 'Finished building: $<' 22 | @echo ' ' 23 | 24 | 25 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | SURVIVOR 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 10 | clean,full,incremental, 11 | 12 | 13 | 14 | 15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 16 | full,incremental, 17 | 18 | 19 | 20 | 21 | 22 | org.eclipse.cdt.core.cnature 23 | org.eclipse.cdt.core.ccnature 24 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 25 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/phasing/Phasing_vcf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Phasing_vcf.h 3 | * 4 | * Created on: Sep 26, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef SRC_PHASING_PHASING_VCF_H_ 9 | #define SRC_PHASING_PHASING_VCF_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | struct snp_str{ 21 | std::string chr; 22 | int position; 23 | int phase_block; 24 | bool haplotype; //true =1 25 | char 
alt_allele; 26 | short parental; //0=na ; 1=father; 2=mother; 27 | short gatk; //0=na ; 1=father; 2=mother; 28 | std::string qual; 29 | double ratio; //allele ratio; 30 | }; 31 | 32 | void parental_phasing(std::string parents_vcf, std::string hapcut_output, std::string gatk_output, std::string snp_file, std::string output); 33 | 34 | 35 | #endif /* SRC_PHASING_PHASING_VCF_H_ */ 36 | -------------------------------------------------------------------------------- /src/R-scripts/SUR_plots.R: -------------------------------------------------------------------------------- 1 | #! /usr/local/bin/Rscript 2 | 3 | ## This is the automated script for creating the plots over the summary stats using SURVIVOR. 4 | ## Usage: Rscript SUR_plots.R summary_file_chr 5 | ## Reads the table summary_file_chr (with a header row) and writes the plot to that name with "_plot.pdf" appended. 6 | 7 | args<-commandArgs(TRUE) 8 | 9 | if(length(args) != 1) { 10 | cat("USAGE: SUR_plots.R summary_file_chr\n") 11 | }else{ 12 | 13 | if (!require("RColorBrewer")) { 14 | #install.packages("RColorBrewer") 15 | library(RColorBrewer) 16 | } 17 | 18 | # The single argument is the summary table; it also names the output PDF. 19 | file_chr = args[[1]] 20 | 21 | cols=(brewer.pal(5,"Set1")) 22 | #Plot2: column 1 on the x axis, columns 2-6 drawn as one point series each; ylim spans all five series 23 | pdf(paste(file_chr,"_plot.pdf",sep="")) 24 | t=read.table(file_chr,header=T) 25 | plot(t[,c(1,2)],ylim=c(0,max(t[,c(2:6)])),ylab="# of SVs",col=cols[1],xlab="chromosome",main=file_chr,type='p') 26 | points(t[,c(1,3)],col=cols[2]) 27 | points(t[,c(1,4)],col=cols[3]) 28 | points(t[,c(1,5)],col=cols[4]) 29 | points(t[,c(1,6)],col=cols[5]) 30 | legend('topright',legend=c('DEL','DUP','INV','INS','TRA'),lwd=2,col=cols) 31 | dev.off() 32 | } 33 | -------------------------------------------------------------------------------- /src/simulator/Error_scanner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Error_scanner.h 3 | * 4 | * Created on: Jun 30, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef SIMULATOR_ERROR_SCANNER_H_ 9 | #define SIMULATOR_ERROR_SCANNER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 
#include 19 | //#include 20 | 21 | using namespace std; 22 | 23 | struct differences_str { 24 | int position; 25 | short type; 26 | }; 27 | 28 | struct CigarOp { 29 | char Type; //!< CIGAR operation type (MIDNSHPX=) 30 | int Length; //!< CIGAR operation length (number of bases) 31 | }; 32 | 33 | struct read_position { 34 | double match; 35 | double mismatch; 36 | double ins; 37 | double del; 38 | double total; 39 | }; 40 | 41 | void generate_error_profile(int min_length,bool comp_error_mat, std::string output); 42 | 43 | #endif /* SIMULATOR_ERROR_SCANNER_H_ */ 44 | -------------------------------------------------------------------------------- /Debug/src/merge_vcf/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/merge_vcf/IntervallTree.cpp \ 8 | ../src/merge_vcf/combine_svs.cpp 9 | 10 | OBJS += \ 11 | ./src/merge_vcf/IntervallTree.o \ 12 | ./src/merge_vcf/combine_svs.o 13 | 14 | CPP_DEPS += \ 15 | ./src/merge_vcf/IntervallTree.d \ 16 | ./src/merge_vcf/combine_svs.d 17 | 18 | 19 | # Each subdirectory must supply rules for building sources it contributes 20 | src/merge_vcf/%.o: ../src/merge_vcf/%.cpp src/merge_vcf/subdir.mk 21 | @echo 'Building file: $<' 22 | @echo 'Invoking: Cross G++ Compiler' 23 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 24 | @echo 'Finished building: $<' 25 | @echo ' ' 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/vcfs/Annotate_vcf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Annotate_vcf.h 3 | * 4 | * Created on: Feb 12, 2015 
5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef ANNOTATE_VCF_H_ 9 | #define ANNOTATE_VCF_H_ 10 | #include "../structs.h" 11 | #include "Merge_VCF.h" 12 | #include "../simulator/Eval_vcf.h" 13 | struct LTR_reg{ 14 | std::string chr; 15 | int start; 16 | int stop; 17 | }; 18 | struct SV_reg{ 19 | std::string header; 20 | std::string chr; 21 | strcoordinate start; 22 | strcoordinate stop; 23 | int type; 24 | }; 25 | void generate_gene_list(std::string vcf_file, std::string annotation,int max_distance, std::string output); 26 | 27 | int get_num_strains(strvcfentry entry); 28 | void overlap_gtf(std::string vcf_file, std::string gtf_file,int max_distance,int min_num_occurance,int max_num_occurance,int type, std::string output); 29 | 30 | void gene_overlap(std::string SV_file,std::string LTR_file,std::string gtf_file, int min_dist_LTR, int min_dist_gene,std::string output); 31 | #endif /* ANNOTATE_VCF_H_ */ 32 | -------------------------------------------------------------------------------- /src/convert/Convert_VCF_to_BED.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_VCF_to_BED.h 3 | * 4 | * Created on: Mar 3, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef CONVERT_VCF_TO_BED_H_ 9 | #define CONVERT_VCF_TO_BED_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "../merge_vcf/combine_svs.h" 18 | #include "../vcfs/Merge_VCF.h" 19 | #include "../simulator/Eval_vcf.h" 20 | #include "../vcfs/Annotate_vcf.h" 21 | 22 | void convert_vcf(std::string vcf_file, std::string output); 23 | void convert_vcf_bede(std::string vcffile,int min_length, std::string output); 24 | void process_bed_file(std::string bedfile,std::string type,std::string output); 25 | void parse_VCF_to_bed(std::string vcffile,int min_length,int max_length, std::string output); 26 | void change_insert_pos(std::string vcffile, std::string output); 27 | void prepare_svviz(std::string vcffile, std::string 
bam, std::string ref, std::string output); 28 | #endif /* CONVERT_VCF_TO_BED_H_ */ 29 | -------------------------------------------------------------------------------- /src/vcfs/Merge_VCF.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Merge_VCF.h 3 | * 4 | * Created on: Feb 12, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef MERGE_VCF_H_ 9 | #define MERGE_VCF_H_ 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "../merge_vcf/Paramer.h" 19 | #include "../merge_vcf/combine_svs.h" 20 | 21 | #include "../structs.h" 22 | std::vector parse_vcf(std::string & filename,int min_svs); 23 | strvcfentry parse_vcf_entry(std::string buffer); 24 | strcoordinate parse_stop(const char * buffer); 25 | void merge_vcf(std::string filenames, int max_dist, int min_observed, std::string outputfile); 26 | int overlap(strvcfentry tmp, std::vector & final_vcf,int max_dist); 27 | strcoordinate parse_stop(const char * buffer); 28 | std::pair parse_strands(const char * buffer); 29 | std::vector parse_filename(std::string filename); 30 | short get_type(std::string type); 31 | 32 | #endif /* MERGE_VCF_H_ */ 33 | -------------------------------------------------------------------------------- /src/snp_overlap/Overlap_snps.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Overlap_snps.h 3 | * 4 | * Created on: Jul 18, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #ifndef SNP_OVERLAP_OVERLAP_SNPS_H_ 9 | #define SNP_OVERLAP_OVERLAP_SNPS_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "../simulator/Eval_vcf.h" 23 | #include "../merge_vcf/Paramer.h" 24 | using namespace std; 25 | void overlap_snpsGWASDB(std::string svs_file, std::string snp_file, int max_dist, int min_svs, int allele, std::string output); 
26 | void overlap_snps(std::string svs_file, std::string snp_file, int max_dist, int min_svs, int allele, std::string output); 27 | void overlap_snps_gwas(std::string svs_file, std::string random_SV,int max_dist, int min_svs, std::string output); 28 | void generate_random_regions(std::string genome_file, std::string svs_vcf, int min_svs, std::string output); 29 | #endif /* SNP_OVERLAP_OVERLAP_SNPS_H_ */ 30 | -------------------------------------------------------------------------------- /Debug/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/CorrectAllele.cpp \ 8 | ../src/DetectDif.cpp \ 9 | ../src/Extract_Seq.cpp \ 10 | ../src/SURVIVOR.cpp \ 11 | ../src/Summarize_SV.cpp 12 | 13 | OBJS += \ 14 | ./src/CorrectAllele.o \ 15 | ./src/DetectDif.o \ 16 | ./src/Extract_Seq.o \ 17 | ./src/SURVIVOR.o \ 18 | ./src/Summarize_SV.o 19 | 20 | CPP_DEPS += \ 21 | ./src/CorrectAllele.d \ 22 | ./src/DetectDif.d \ 23 | ./src/Extract_Seq.d \ 24 | ./src/SURVIVOR.d \ 25 | ./src/Summarize_SV.d 26 | 27 | 28 | # Each subdirectory must supply rules for building sources it contributes 29 | src/%.o: ../src/%.cpp src/subdir.mk 30 | @echo 'Building file: $<' 31 | @echo 'Invoking: Cross G++ Compiler' 32 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 33 | @echo 'Finished building: $<' 34 | @echo ' ' 35 | 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Fritz Sedlazeck 4 | 5 | Permission is hereby 
granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /src/merge_vcf/Paramer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Paramer.h 3 | * 4 | * Created on: Aug 20, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef PARAMER_H_ 9 | #define PARAMER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | 19 | 20 | class Parameter { 21 | private: 22 | Parameter() { 23 | min_freq=-1; 24 | version ="1.0.7"; 25 | } 26 | ~Parameter() { 27 | 28 | 29 | } 30 | static Parameter* m_pInstance; 31 | 32 | public: 33 | std::string version; 34 | double max_dist; 35 | int max_caller; 36 | bool use_type; 37 | bool use_strand; 38 | bool dynamic_size; 39 | int min_length; 40 | float min_freq; 41 | int min_support; 42 | 43 | static Parameter* Instance() { 44 | if (!m_pInstance) { 45 | m_pInstance = new Parameter; 46 | } 47 | return m_pInstance; 48 | } 49 | 50 | double meassure_time(clock_t begin ,std::string msg){ 51 | return 0; 52 | clock_t end = clock(); 53 | double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; 54 | std::cout << msg<<" " << elapsed_secs< 13 | #include 14 | #include 15 | 16 | 17 | struct strcoordinate{ 18 | int pos; 19 | std::string chr; 20 | }; 21 | 22 | struct strvcfentry{ 23 | std::string header; 24 | strcoordinate start; 25 | strcoordinate stop; 26 | short type; //0=DEL,1=DUP,2=INV,3=TRA 27 | std::map calls; 28 | int sup_lumpy; 29 | int caller_id; 30 | std::vector caller_supports; 31 | std::pair strands; 32 | std::pair num_reads; //ref alt 33 | std::string genotype; 34 | int sv_len; 35 | std::string sv_id; 36 | double af; 37 | std::string prev_support_vec; 38 | int quality; 39 | std::pair alleles; 40 | std::pair cpos; 41 | std::pair cend; 42 | int supp; 43 | //int num_reads; 44 | }; 45 | 46 | 47 | struct strentry{ 48 | int valid; 49 | int not_covered; 50 | int not_valid; 51 | }; 52 | 53 | struct strregion{ 54 | strcoordinate 
start; 55 | strcoordinate stop; 56 | }; 57 | 58 | struct strsimul{ 59 | strcoordinate start; 60 | strcoordinate stop; 61 | short type; 62 | bool identified; 63 | bool wrong; 64 | }; 65 | 66 | struct strgene{ 67 | int count; 68 | strregion region; 69 | std::string gene_name; 70 | }; 71 | 72 | #endif /* STRUCTS_H_ */ 73 | -------------------------------------------------------------------------------- /Debug/src/vcfs/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/vcfs/Annotate_vcf.cpp \ 8 | ../src/vcfs/Combine_3_VCF.cpp \ 9 | ../src/vcfs/Compoverlap_VCF.cpp \ 10 | ../src/vcfs/Detect_nested.cpp \ 11 | ../src/vcfs/Filter_vcf.cpp \ 12 | ../src/vcfs/Generate_distMat.cpp \ 13 | ../src/vcfs/Merge_VCF.cpp 14 | 15 | OBJS += \ 16 | ./src/vcfs/Annotate_vcf.o \ 17 | ./src/vcfs/Combine_3_VCF.o \ 18 | ./src/vcfs/Compoverlap_VCF.o \ 19 | ./src/vcfs/Detect_nested.o \ 20 | ./src/vcfs/Filter_vcf.o \ 21 | ./src/vcfs/Generate_distMat.o \ 22 | ./src/vcfs/Merge_VCF.o 23 | 24 | CPP_DEPS += \ 25 | ./src/vcfs/Annotate_vcf.d \ 26 | ./src/vcfs/Combine_3_VCF.d \ 27 | ./src/vcfs/Compoverlap_VCF.d \ 28 | ./src/vcfs/Detect_nested.d \ 29 | ./src/vcfs/Filter_vcf.d \ 30 | ./src/vcfs/Generate_distMat.d \ 31 | ./src/vcfs/Merge_VCF.d 32 | 33 | 34 | # Each subdirectory must supply rules for building sources it contributes 35 | src/vcfs/%.o: ../src/vcfs/%.cpp src/vcfs/subdir.mk 36 | @echo 'Building file: $<' 37 | @echo 'Invoking: Cross G++ Compiler' 38 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 39 | @echo 'Finished building: $<' 40 | @echo ' ' 41 | 42 | 43 | 
-------------------------------------------------------------------------------- /Debug/src/analysis_sv/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/analysis_sv/Density_VCF.cpp \ 8 | ../src/analysis_sv/GIAB_summary.cpp \ 9 | ../src/analysis_sv/MT_identifier.cpp \ 10 | ../src/analysis_sv/MUMmer_overlap.cpp \ 11 | ../src/analysis_sv/Select_samples.cpp \ 12 | ../src/analysis_sv/Simplify_SVs.cpp \ 13 | ../src/analysis_sv/Summ_mat.cpp 14 | 15 | OBJS += \ 16 | ./src/analysis_sv/Density_VCF.o \ 17 | ./src/analysis_sv/GIAB_summary.o \ 18 | ./src/analysis_sv/MT_identifier.o \ 19 | ./src/analysis_sv/MUMmer_overlap.o \ 20 | ./src/analysis_sv/Select_samples.o \ 21 | ./src/analysis_sv/Simplify_SVs.o \ 22 | ./src/analysis_sv/Summ_mat.o 23 | 24 | CPP_DEPS += \ 25 | ./src/analysis_sv/Density_VCF.d \ 26 | ./src/analysis_sv/GIAB_summary.d \ 27 | ./src/analysis_sv/MT_identifier.d \ 28 | ./src/analysis_sv/MUMmer_overlap.d \ 29 | ./src/analysis_sv/Select_samples.d \ 30 | ./src/analysis_sv/Simplify_SVs.d \ 31 | ./src/analysis_sv/Summ_mat.d 32 | 33 | 34 | # Each subdirectory must supply rules for building sources it contributes 35 | src/analysis_sv/%.o: ../src/analysis_sv/%.cpp src/analysis_sv/subdir.mk 36 | @echo 'Building file: $<' 37 | @echo 'Invoking: Cross G++ Compiler' 38 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 39 | @echo 'Finished building: $<' 40 | @echo ' ' 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/simulator/SV_Simulator.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * SV_Simulator.h 3 | * 4 | * Created on: Jan 30, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef SIMULATOR_SV_SIMULATOR_H_ 9 | #define SIMULATOR_SV_SIMULATOR_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | struct parameter { 22 | int dup_min; 23 | int dup_max; 24 | int dup_max_amp; 25 | int dup_num; 26 | 27 | int indel_min; 28 | int indel_max; 29 | int indel_num; 30 | 31 | int translocations_min; 32 | int translocations_max; 33 | int translocations_num; 34 | 35 | int inv_min; 36 | int inv_max; 37 | int inv_num; 38 | 39 | int inv_del_min; 40 | int inv_del_max; 41 | int inv_del_num; 42 | 43 | int inv_dup_min; 44 | int inv_dup_max; 45 | int inv_dup_num; 46 | 47 | int intrachr_num; 48 | int intrachr_min; 49 | int intrachr_max; 50 | 51 | bool diploid; 52 | float hom_rate; 53 | }; 54 | 55 | struct position { 56 | std::string chr; 57 | int start; 58 | int stop; 59 | }; 60 | 61 | struct struct_var { 62 | int type; //0:dup;1:del;2:ins;3:inv;4:tra 63 | position pos; 64 | position target; 65 | std::string seq; //not mandadory! 
66 | std::string ref; 67 | bool print; 68 | int copy_num; 69 | }; 70 | 71 | struct insertions { 72 | position target; 73 | std::string seq; 74 | }; 75 | 76 | void simulate_SV(std::string ref_file, std::string parameter_file,float snp_freq, bool coordinates, std::string output_prefix); 77 | void generate_parameter_file(std::string parameter_file); 78 | #endif /* SIMULATOR_SV_SIMULATOR_H_ */ 79 | -------------------------------------------------------------------------------- /src/merge_vcf/IntervallTree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * IntervallTree.h 3 | * 4 | * Created on: Jun 23, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #ifndef TREE_INTERVALLTREE_H_ 9 | #define TREE_INTERVALLTREE_H_ 10 | 11 | #include 12 | #include 13 | #include "TNode.h" 14 | #include "Paramer.h" 15 | 16 | 17 | 18 | class IntervallTree { 19 | private: 20 | int max(int, int); 21 | TNode * srl(TNode *&); 22 | TNode * drl(TNode *&); 23 | TNode * srr(TNode *&); 24 | TNode * drr(TNode *&); 25 | int overlap(breakpoint_str start, breakpoint_str stop,short type, std::pair strands,SVS_Node * curr_svs); 26 | bool same_breakpoint(breakpoint_str first, breakpoint_str second,int max_dist); 27 | void careful_screening(breakpoint_str &start, breakpoint_str& stop, short type, std::pair strands, meta_data_str meta_info, TNode *&p); 28 | long overlap_SNP(breakpoint_str start, SVS_Node * curr_svs); 29 | public: 30 | void insert(breakpoint_str &start, breakpoint_str& stop, short type, std::pair strands, meta_data_str meta_info, TNode *&p); 31 | void del(SVS_Node * point, TNode *&); 32 | int deletemin(TNode *&); 33 | void find(SVS_Node * point, TNode *&); 34 | TNode * findmin(TNode*); 35 | TNode * findmax(TNode*); 36 | void makeempty(TNode *&); 37 | void copy(TNode * &, TNode *&); 38 | TNode * nodecopy(TNode *&); 39 | void preorder(TNode*); 40 | void inorder(TNode*,TNode * root); 41 | void postorder(TNode*); 42 | int bsheight(TNode*); 43 | void 
get_breakpoints(TNode *p,std::vector & points); 44 | void get_breakpoints(TNode *p, std::map > & points); 45 | int nonodes(TNode*); 46 | void collapse_intervalls(TNode *&p); 47 | 48 | std::string findSNP(breakpoint_str &snp, TNode *&p); 49 | }; 50 | 51 | #endif /* TREE_INTERVALLTREE_H_ */ 52 | -------------------------------------------------------------------------------- /src/convert/ConvertMQ0Bed.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ConvertMQ0Bed.cpp 3 | * 4 | * Created on: Mar 16, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "ConvertMQ0Bed.h" 9 | 10 | void comp_mq0bed(std::string cov_file, int border, int cov_tresh) { 11 | size_t buffer_size = 2000000; 12 | char*buffer = new char[buffer_size]; 13 | std::ifstream myfile; 14 | 15 | myfile.open(cov_file.c_str(), std::ifstream::in); 16 | if (!myfile.good()) { 17 | std::cout << "Cov Parser: could not open file: " << cov_file.c_str() << std::endl; 18 | exit(0); 19 | } 20 | 21 | myfile.getline(buffer, buffer_size); 22 | int start = 0; 23 | int current = -1; 24 | int prev = current; 25 | std::string start_chr; 26 | std::string chr; 27 | while (!myfile.eof()) { 28 | int count = 0; 29 | int cov = 0; 30 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 31 | 32 | if (count == 0 && buffer[i] != '\t') { 33 | chr += buffer[i]; 34 | } 35 | if (count == 1 && buffer[i - 1] == '\t') { 36 | current = atoi(&buffer[i]); 37 | } 38 | if (count == 2 && buffer[i - 1] == '\t') { 39 | cov = atoi(&buffer[i]); 40 | break; 41 | } 42 | 43 | if (buffer[i] == '\t') { 44 | count++; 45 | } 46 | } 47 | //std::cout< cov_tresh) { 49 | if (prev != -1 && abs(current - prev) > border) { 50 | //print 51 | if (start - border > 1) { 52 | std::cout << start_chr << "\t" << start - border << "\t" << prev + border << std::endl; 53 | } else { 54 | std::cout << start_chr << "\t" << 1 << "\t" << prev + border << std::endl; 55 | } 56 | start = current; 57 | 
start_chr = chr; 58 | } else if (prev == -1) { 59 | start = current; 60 | start_chr = chr; 61 | } 62 | prev = current; 63 | } 64 | chr.clear(); 65 | myfile.getline(buffer, buffer_size); 66 | } 67 | myfile.close(); 68 | } 69 | -------------------------------------------------------------------------------- /Debug/src/convert/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/convert/ConvertMQ0Bed.cpp \ 8 | ../src/convert/Convert_Assemblytics.cpp \ 9 | ../src/convert/Convert_Bionano.cpp \ 10 | ../src/convert/Convert_Honey_tails.cpp \ 11 | ../src/convert/Convert_MUMmer.cpp \ 12 | ../src/convert/Convert_Pindel.cpp \ 13 | ../src/convert/Convert_VCF_to_BED.cpp \ 14 | ../src/convert/Convert_hapcut2.cpp \ 15 | ../src/convert/Process_Coverage.cpp \ 16 | ../src/convert/Process_Lumpy.cpp \ 17 | ../src/convert/Update_bam_pacbio.cpp 18 | 19 | OBJS += \ 20 | ./src/convert/ConvertMQ0Bed.o \ 21 | ./src/convert/Convert_Assemblytics.o \ 22 | ./src/convert/Convert_Bionano.o \ 23 | ./src/convert/Convert_Honey_tails.o \ 24 | ./src/convert/Convert_MUMmer.o \ 25 | ./src/convert/Convert_Pindel.o \ 26 | ./src/convert/Convert_VCF_to_BED.o \ 27 | ./src/convert/Convert_hapcut2.o \ 28 | ./src/convert/Process_Coverage.o \ 29 | ./src/convert/Process_Lumpy.o \ 30 | ./src/convert/Update_bam_pacbio.o 31 | 32 | CPP_DEPS += \ 33 | ./src/convert/ConvertMQ0Bed.d \ 34 | ./src/convert/Convert_Assemblytics.d \ 35 | ./src/convert/Convert_Bionano.d \ 36 | ./src/convert/Convert_Honey_tails.d \ 37 | ./src/convert/Convert_MUMmer.d \ 38 | ./src/convert/Convert_Pindel.d \ 39 | ./src/convert/Convert_VCF_to_BED.d \ 40 | 
./src/convert/Convert_hapcut2.d \ 41 | ./src/convert/Process_Coverage.d \ 42 | ./src/convert/Process_Lumpy.d \ 43 | ./src/convert/Update_bam_pacbio.d 44 | 45 | 46 | # Each subdirectory must supply rules for building sources it contributes 47 | src/convert/%.o: ../src/convert/%.cpp src/convert/subdir.mk 48 | @echo 'Building file: $<' 49 | @echo 'Invoking: Cross G++ Compiler' 50 | g++ -O3 -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$@" -o "$@" "$<" 51 | @echo 'Finished building: $<' 52 | @echo ' ' 53 | 54 | 55 | -------------------------------------------------------------------------------- /.settings/language.settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /Debug/makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | -include ../makefile.init 6 | 7 | RM := rm -rf 8 | 9 | # All of the sources participating in the build are defined here 10 | -include sources.mk 11 | -include src/vcfs/subdir.mk 12 | -include src/snp_overlap/subdir.mk 13 | -include src/simulator/subdir.mk 14 | -include src/scrup_vcf/subdir.mk 15 | -include src/phasing/subdir.mk 16 | -include src/merge_vcf/subdir.mk 17 | -include src/convert/subdir.mk 18 | -include src/analysis_sv/subdir.mk 19 | -include src/subdir.mk 20 | -include subdir.mk 21 | -include objects.mk 22 | 23 | ifneq ($(MAKECMDGOALS),clean) 24 | ifneq ($(strip $(CC_DEPS)),) 25 | -include $(CC_DEPS) 26 | endif 27 | ifneq ($(strip $(C++_DEPS)),) 28 | -include $(C++_DEPS) 29 | endif 30 | ifneq ($(strip $(C_UPPER_DEPS)),) 31 | -include $(C_UPPER_DEPS) 32 | endif 33 | ifneq ($(strip $(CXX_DEPS)),) 34 | -include $(CXX_DEPS) 35 | endif 36 | ifneq ($(strip $(C_DEPS)),) 37 | -include $(C_DEPS) 38 | endif 39 | ifneq ($(strip $(CPP_DEPS)),) 40 | -include $(CPP_DEPS) 41 | endif 42 | endif 43 | 44 | -include ../makefile.defs 45 | 46 | OPTIONAL_TOOL_DEPS := \ 47 | $(wildcard ../makefile.defs) \ 48 | $(wildcard ../makefile.init) \ 49 | $(wildcard ../makefile.targets) \ 50 | 51 | 52 | BUILD_ARTIFACT_NAME := SURVIVOR 53 | BUILD_ARTIFACT_EXTENSION := 54 | BUILD_ARTIFACT_PREFIX := 55 | BUILD_ARTIFACT := $(BUILD_ARTIFACT_PREFIX)$(BUILD_ARTIFACT_NAME)$(if $(BUILD_ARTIFACT_EXTENSION),.$(BUILD_ARTIFACT_EXTENSION),) 56 | 57 | # Add inputs and outputs from these tool invocations to the build variables 58 | 59 | # All Target 60 | all: main-build 61 | 62 | # Main-build Target 63 | main-build: SURVIVOR 64 | 65 | # Tool invocations 66 | SURVIVOR: $(OBJS) $(USER_OBJS) makefile objects.mk $(OPTIONAL_TOOL_DEPS) 67 | @echo 'Building target: $@' 68 | @echo 'Invoking: Cross G++ Linker' 69 | g++ -o "SURVIVOR" $(OBJS) $(USER_OBJS) $(LIBS) 70 | @echo 'Finished building target: $@' 71 | @echo ' ' 
72 | 73 | # Other Targets 74 | clean: 75 | -$(RM) $(CC_DEPS)$(C++_DEPS)$(EXECUTABLES)$(OBJS)$(C_UPPER_DEPS)$(CXX_DEPS)$(C_DEPS)$(CPP_DEPS) SURVIVOR 76 | -@echo ' ' 77 | 78 | .PHONY: all clean dependents 79 | 80 | -include ../makefile.targets 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SURVIVOR 2 | SURVIVOR is a tool set for simulating/evaluating SVs, merging and comparing SVs within and among samples, and includes various methods to reformat or summarize SVs. 3 | 4 | **Please see our github wiki for more information** (https://github.com/fritzsedlazeck/SURVIVOR/wiki ) 5 | ************************************** 6 | ## Cite: 7 | 8 | If you use it in your study please cite: 9 | 10 | **Transient structural variations have strong effects on quantitative traits and reproductive isolation in fission yeast.** 11 | Jeffares, Daniel C; Jolly, Clemency; Hoti, Mimoza; Speed, Doug; Shaw, Liam; Rallis, Charalampos; Balloux, Francois; Dessimoz, Christophe; Bähler, Jürg; Sedlazeck, Fritz J. 12 | Nature communications, Vol. 8, 14061, 24.01.2017, p. 1-11. DOI:10.1038/NCOMMS14061 13 | 14 | ************************************** 15 | 16 | ## INSTALL: 17 | 18 | Download SURVIVOR: 19 |
20 | git clone https://github.com/fritzsedlazeck/SURVIVOR.git
21 | cd SURVIVOR/Debug
22 | make
23 | 
24 | 25 | ************************************** 26 | 27 | ## USAGE: 28 | ``` 29 | ./SURVIVOR ID 30 | ``` 31 | to see the individual parameters for each option. 32 | 33 | choose the ID from these options: 34 | ``` 35 | Program: SURVIVOR (Tools for Structural Variations in the VCF format) 36 | Version: 1.0.3 37 | 38 | Usage: SURVIVOR [options] 39 | 40 | Commands: 41 | -- Simulation/ Evaluation 42 | simSV Simulates SVs and SNPs on a reference genome. 43 | scanreads Obtain error profiles from mapped reads for simulation. 44 | simreads Simulates long reads (PacBio or ONT). 45 | eval Evaluates a VCF file after SV calling over simulated data. 46 | 47 | -- Comparison/filtering 48 | merge Compare or merge VCF files to generate a consensus or multi sample vcf files. 49 | filter Filter a vcf file based on size and/or regions to ignore 50 | stats Report multiple stats over a VCF file 51 | compMUMMer Annotates a VCF file with the breakpoints found with MUMMer (Show-diff). 52 | 53 | -- Conversion 54 | bincov Bins coverage vector to a bed file to filter SVs in low MQ regions 55 | vcftobed Converts a VCF file to a bed file 56 | bedtovcf Converts a bed file to a VCF file 57 | smaptovcf Converts the smap file to a VCF file (beta version) 58 | bedpetovcf Converts a bedpe file to a VCF file (beta version) 59 | hapcuttovcf Converts the Hapcut2 final file to a VCF file using the original SNP file provided to Hapcut2 60 | convertAssemblytics Converts Assemblytics to a VCF file 61 | ``` 62 | ************************************** 63 | ## CONTACT: 64 | 65 | If you have questions or encounter a problem please contact: 66 | fritz.sedlazeck@gmail.com 67 | -------------------------------------------------------------------------------- /src/simulator/test_cov.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * test_cov.cpp 3 | * 4 | * Created on: Jun 17, 2016 5 | * Author: fsedlaze 6 | */ 7 | #include "test_cov.h" 8 | 9 | struct SV_sim { 10 | 
/*
 * Formats one MUMmer-derived structural variant as a single tab-separated
 * VCF record (no trailing newline).
 *
 *  chr   - chromosome name; written to the CHROM column and the CHR2 INFO key
 *  type  - SV type label; used for the ID prefix, the symbolic ALT allele and SVTYPE
 *  start - written to the POS column
 *  stop  - written to the END INFO key
 *  len   - written to the SVLEN INFO key
 *
 * Returns the record text: CHROM..INFO, the FORMAT column and one sample column.
 */
std::string print_entry_mummer(std::string chr, std::string type, int start, int stop, int len) {
	std::ostringstream record;

	// CHROM, POS
	record << chr << '\t' << start << '\t';
	// ID ("<type>00MUMmer"), REF, symbolic ALT, QUAL, FILTER
	record << type << "00" << "MUMmer" << "\tN\t<" << type << ">\t.\tLowQual\t";
	// INFO
	record << "IMPRECISE;SVTYPE=" << type << ";SVMETHOD=MUMmer;CHR2=" << chr;
	record << ";END=" << stop << ";SVLEN=" << len << ";PE=" << 1;
	// FORMAT
	record << "\tGT:GL:GQ:FT:RC:DR:DV:RR:RV\t";
	// Sample column. It used to be assembled in a separate stream that was never
	// appended, so every record ended right after the FORMAT column.
	record << "1/1:0,0,0:0:PASS:0:0:" << 1 << ":0:0";

	return record.str();
}
82 | if (buffer[i] == '\t') { 83 | count++; 84 | } 85 | } 86 | 87 | if (len > min_len) { 88 | fprintf(file, "%s", print_entry_mummer(chr, type, start, stop, len).c_str()); 89 | fprintf(file, "%c", '\n'); 90 | } 91 | 92 | myfile.getline(buffer, buffer_size); 93 | } 94 | fclose(file); 95 | myfile.close(); 96 | } 97 | 98 | -------------------------------------------------------------------------------- /src/analysis_sv/Density_VCF.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Density_VCF.cpp 3 | * 4 | * Created on: May 15, 2020 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Density_VCF.h" 9 | 10 | void density_VCF(std::string vcf_file, int window, std::string output) { 11 | 12 | std::string buffer; 13 | std::ifstream myfile; 14 | 15 | myfile.open(vcf_file.c_str(), std::ifstream::in); 16 | if (!myfile.good()) { 17 | std::cout << "Annotation Parser: could not open file: " << vcf_file.c_str() << std::endl; 18 | exit(0); 19 | } 20 | 21 | std::vector calls; 22 | getline(myfile, buffer); 23 | 24 | int num_samples = 0; 25 | while (!myfile.eof()) { 26 | if (buffer[0] == '#' && buffer[1] == 'C') { //find out how many samples: 27 | int count = 0; 28 | 29 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 30 | if (count >= 9 && buffer[i - 1] == '\t') { 31 | num_samples++; 32 | } 33 | if (buffer[i] == '\t') { 34 | count++; 35 | } 36 | } 37 | } 38 | getline(myfile, buffer); 39 | } 40 | myfile.close(); 41 | 42 | std::cerr << "Found: " << num_samples << " Samples" << std::endl; 43 | std::vector entries = parse_vcf(vcf_file, -1); 44 | 45 | std::vector dups; 46 | std::vector dels; 47 | std::vector inv; 48 | std::vector ins; 49 | 50 | for (size_t i = 0; i < entries.size(); i++) { 51 | int start = entries[i].start.pos / window; 52 | int stop = entries[i].stop.pos / window; 53 | 54 | while (stop + 1 > dups.size()) { 55 | dups.push_back(0); 56 | dels.push_back(0); 57 | inv.push_back(0); 58 | 
/* Writes one "chr<TAB>start<TAB>end" line to the open BED output. */
static void write_bed_interval(FILE *out, const std::string &chr, int start, int end) {
	fprintf(out, "%s\t%i\t%i\n", chr.c_str(), start, end);
}

/*
 * Collapses a per-position coverage table into BED intervals of low coverage.
 *
 *  filename - tab-separated input with columns REF, POS, COV; the first line is skipped
 *  win_size - padding subtracted from the first low-coverage position of an interval
 *             (clamped at 0) and the gap that must be exceeded before an interval is reported
 *  min_cov  - positions with COV <= min_cov count as low coverage
 *  output   - path of the BED file to write
 *
 * An open interval is written when a covered position lies more than win_size
 * past the last low-coverage position, when the chromosome name changes, and
 * once more at the end of the input. Exits the process if either file cannot
 * be opened.
 */
void summarize_badcoverage(std::string filename, int win_size, int min_cov, std::string output) {
	std::ifstream myfile;
	myfile.open(filename.c_str(), std::ifstream::in);
	if (!myfile.good()) {
		// NOTE: the "Lumpy Parser" prefix is kept as-is; it is what the tool has always printed here.
		std::cout << "Lumpy Parser: could not open file: " << filename.c_str() << std::endl;
		exit(0);
	}

	FILE *file = fopen(output.c_str(), "w");
	if (file == NULL) {
		// Previously unchecked: every fprintf below would have dereferenced a null FILE*.
		std::cerr << "Coverage Parser: could not open output file: " << output.c_str() << std::endl;
		exit(1);
	}

	const size_t buffer_size = 2000000;
	char *buffer = new char[buffer_size];
	const int no_window = win_size * -1; // sentinel value of `start` meaning "no interval open"

	myfile.getline(buffer, buffer_size);
	myfile.getline(buffer, buffer_size); //avoiding header
	int start = no_window;
	int stop = 0;
	int pos = 0;
	std::string chr = "";
	std::string chr_prev = "";

	// good() instead of !eof(): a failed read that is not EOF (e.g. a line longer
	// than the buffer) used to leave this loop spinning forever.
	while (myfile.good()) {
		chr.clear();
		int count = 0; // index of the column currently being scanned
		int cov = -1;  // stays -1 when the line has no third column

		//REF POS COV
		for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) {
			if (count == 0 && buffer[i] != '\t') {
				chr += buffer[i];
			}
			if (count == 1 && buffer[i - 1] == '\t') {
				if (!chr_prev.empty() && chr_prev != chr) {
					// Chromosome changed: flush the open interval under the previous
					// name, ending at the last position parsed on that chromosome.
					if (start != no_window) {
						write_bed_interval(file, chr_prev, start, pos);
						start = no_window;
						stop = 1;
					}
					chr_prev = chr;
				}
				pos = atoi(&buffer[i]);
			}
			if (count == 2 && buffer[i - 1] == '\t') {
				cov = atoi(&buffer[i]);
			}
			if (buffer[i] == '\t') {
				count++;
			}
		}

		if (cov <= min_cov) {
			if (start == no_window) {
				// open a new interval, padded to the left by win_size
				start = (pos - win_size > 0) ? pos - win_size : 0;
				stop = pos;
			}
			if (stop - pos < win_size) { // extend current window.
				stop = pos;
			}
		} else if (start != no_window && pos - stop > win_size) { //report:
			write_bed_interval(file, chr, start, stop);
			start = no_window;
			stop = pos;
		}
		chr_prev = chr;
		myfile.getline(buffer, buffer_size);
	}

	if (start != no_window) {
		write_bed_interval(file, chr, start, pos);
	}

	delete[] buffer; // was leaked on every call
	fclose(file);
	myfile.close();
}
types; 61 | std::vector sv_lengths; 62 | std::vector starts; 63 | std::vector stops; 64 | std::pair num_support; 65 | std::pair strand; 66 | std::string genotype; 67 | std::string pre_supp_vec; 68 | std::pair alleles; 69 | std::string vcf_ID; 70 | }; 71 | 72 | class SVS_Node { 73 | public: 74 | //just for testing! 75 | 76 | SVS_Node() { 77 | 78 | type=-1; 79 | num_support.first=-1; 80 | num_support.second=-1; 81 | strand.first=false; 82 | strand.second=false; 83 | caller_info.clear(); 84 | genotype="./."; 85 | 86 | types[0]=false; //DEL 87 | types[1]=false; //DUP 88 | types[2]=false; //INV 89 | types[3]=false; //TRA 90 | types[4]=false; //UNK 91 | 92 | strands[0]=false; //+ 93 | strands[1]=false; //- 94 | strands[2]=false; //+ 95 | strands[3]=false; //- 96 | 97 | } 98 | ~SVS_Node() { 99 | caller_info.clear(); 100 | } 101 | //TODO change that to getter and setter! 102 | short type; 103 | breakpoint_str first; 104 | breakpoint_str second; 105 | std::vector caller_info; 106 | std::string entry; 107 | std::pair num_support; 108 | std::pair strand; 109 | std::string genotype; 110 | bool types[5]; 111 | bool strands[4]; 112 | }; 113 | 114 | #include "../structs.h" 115 | #include "IntervallTree.h" 116 | #include "../vcfs/Merge_VCF.h" 117 | void parse_vcf_header(std::map &chrs, std::string filename); 118 | void combine_calls_svs(std::string file, double max_dist, int min_support, int type_save, int strand_save,int dynamic_size,int min_svs, std::string output); 119 | breakpoint_str convert_position(strcoordinate pos); 120 | void summarize_VCF_files(std::string filename, int min_size, std::string output); 121 | void print_entry_overlap(FILE *& file, SVS_Node * entry, int id); 122 | #endif /* MERGE_VCF_COMBINE_SVS_H_ */ 123 | -------------------------------------------------------------------------------- /src/convert/Update_bam_pacbio.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Update_bam_pacbio.cpp 3 | * 4 | * Created 
on: Mar 15, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Update_bam_pacbio.h" 9 | 10 | 11 | std::vector parse_header(std::string unmapped_sam){ 12 | std::vector header; 13 | 14 | 15 | return header; 16 | } 17 | 18 | 19 | 20 | void merge_header(std::string unmapped_sam,std::string mapped_sam,FILE *& file2) { 21 | std::vector header_un=parse_header(unmapped_sam); 22 | 23 | 24 | } 25 | 26 | void update_entries(std::map & entries, std::string unmapped_sam) { 27 | std::cout<<"check unmapped"< 9) { 44 | entries[id] += buffer[i]; 45 | } 46 | if (buffer[i] == '\t') { 47 | count++; 48 | } 49 | } 50 | } 51 | } 52 | getline(myfile, buffer); 53 | } 54 | 55 | } 56 | 57 | void process_sam_forpacbio(std::string unmapped_sam, std::string mapped_sam, std::string output_sam) { 58 | 59 | 60 | std::string buffer; 61 | std::ifstream myfile; 62 | myfile.open(mapped_sam.c_str(), std::ifstream::in); 63 | if (!myfile.good()) { 64 | std::cout << "Sam Parser: could not open file: " << mapped_sam.c_str() << std::endl; 65 | exit(0); 66 | } 67 | 68 | FILE *file2; 69 | file2 = fopen(output_sam.c_str(), "w"); 70 | 71 | merge_header(unmapped_sam,mapped_sam,file2); 72 | 73 | std::map entries; 74 | getline(myfile, buffer); 75 | while (!myfile.eof()) { //avoid header. 76 | if (buffer[0] != '@') { 77 | //parse part of the mapped entries into a map (e.g. 
step size =100000) 78 | size_t found = buffer.find_first_of('\t'); 79 | std::string id = buffer.substr(0, found); 80 | entries[id] = buffer; 81 | if (entries.size() > 1000) { 82 | std::cout<<"check entries"<::iterator i = entries.begin(); i != entries.end(); i++) { 86 | fprintf(file2, "%s", (*i).second.c_str()); 87 | fprintf(file2, "%c", '\n'); 88 | } 89 | entries.clear(); 90 | } 91 | } 92 | 93 | getline(myfile, buffer); 94 | } 95 | myfile.close(); 96 | //check orig file and update them 97 | update_entries(entries, unmapped_sam); 98 | for (std::map::iterator i = entries.begin(); i != entries.end(); i++) { 99 | fprintf(file2, "%s", (*i).second.c_str()); 100 | fprintf(file2, "%c", '\n'); 101 | } 102 | fclose(file2); 103 | 104 | 105 | } 106 | 107 | -------------------------------------------------------------------------------- /src/vcfs/Detect_nested.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Detect_nested.cpp 3 | * 4 | * Created on: Apr 27, 2017 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "Detect_nested.h" 9 | void detect_nested(std::string vcf_file, std::string output) { 10 | std::vector simulated; 11 | size_t buffer_size = 2000000; 12 | char*buffer = new char[buffer_size]; 13 | std::ifstream myfile; 14 | myfile.open(vcf_file.c_str(), std::ifstream::in); 15 | if (!myfile.good()) { 16 | std::cout << "BED Parser: could not open file: " << vcf_file.c_str() << std::endl; 17 | exit(0); 18 | } 19 | myfile.getline(buffer, buffer_size); 20 | 21 | //UTURN 22 | //INVDEL 23 | //INVDUP 24 | nested_sv tmp; 25 | tmp.del = 0; 26 | tmp.dup = 0; 27 | tmp.id = -1; 28 | tmp.inv = 0; 29 | tmp.others=0; 30 | tmp.chr=""; 31 | 32 | std::vector nested_stuff; 33 | int invdups = 0; 34 | while (!myfile.eof()) { 35 | if (buffer[0] != '#') { 36 | int count = 0; 37 | std::string type; 38 | int id=0; 39 | std::string chr; 40 | bool flag = false; 41 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 42 
| if(count==0 && buffer[i]!='\t'){ 43 | chr+=buffer[i]; 44 | } 45 | if (count == 2 && buffer[i - 1] == '\t') { 46 | id = atoi(&buffer[i]); 47 | } 48 | if (count == 2 && buffer[i] != '\t') { 49 | if (buffer[i] == '_') { 50 | flag = true; 51 | } 52 | } 53 | 54 | if (count == 4 && buffer[i] != '\t') { 55 | type += buffer[i]; 56 | } 57 | if (buffer[i] == '\t') { 58 | count++; 59 | } 60 | } 61 | if (flag) { 62 | //std::cout<<"HIT "<") == 0) { 75 | nested_stuff[i].del++; 76 | } else if (strcmp(type.c_str(), "") == 0) { 77 | nested_stuff[i].inv++; 78 | } else if (strcmp(type.c_str(), "") == 0) { 79 | nested_stuff[i].dup++; 80 | }else{ 81 | nested_stuff[i].others++; 82 | } 83 | if(nested_stuff[i].chr.empty()){ 84 | nested_stuff[i].chr=chr; 85 | }else if(strcmp(nested_stuff[i].chr.c_str(),chr.c_str())!=0){ 86 | nested_stuff[i].others=100; 87 | } 88 | } 89 | //UTURN 90 | if (strcmp(type.c_str(), "") == 0) { 91 | invdups++; 92 | } 93 | } 94 | myfile.getline(buffer, buffer_size); 95 | } 96 | myfile.close(); 97 | int invdel=0; 98 | int invdup=0; 99 | for(size_t i=0;i < nested_stuff.size();i++){ 100 | if(nested_stuff[i].others==0 && ( nested_stuff[i].del > 1 &&nested_stuff[i].inv >0 && nested_stuff[i].dup==0) ){ 101 | invdel++; 102 | std::cout<<"invdel ID: "<0 && nested_stuff[i].dup>0) ){ 104 | invdup++; 105 | std::cout<<"invdup ID: "< 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | using namespace std; 43 | 44 | class GzipStreamBuf : public std::basic_streambuf 45 | { 46 | private: 47 | unsigned int buff_size; 48 | char* buffer; 49 | gzFile gzin; 50 | 51 | void checkError() 52 | { 53 | int ret = 0; 54 | const char* msg = ::gzerror(this->gzin,&ret); 55 | if(ret==0) return; 56 | if(msg==NULL) 57 | { 58 | throw std::runtime_error("GZLIB: I/O error"); 59 | } 60 | else 61 | { 62 | throw std::runtime_error(msg); 63 | } 64 | } 65 | 66 | 67 | 68 | 69 | 70 | void _init(const char* 
fname,unsigned int buff_size) 71 | { 72 | assert(buff_size>0); 73 | this->gzin = ::gzopen(fname,"r"); 74 | if(this->gzin == NULL) 75 | { 76 | std::ostringstream msg; 77 | msg << "File Parser: could not open file: " << fname << std::endl; 78 | throw std::runtime_error(msg.str()); 79 | } 80 | 81 | this->buff_size=buff_size; 82 | this->buffer=new char[buff_size]; 83 | 84 | setg( (char*)&this->buffer[0], 85 | (char*)&this->buffer[this->buff_size], 86 | (char*)&this->buffer[this->buff_size] 87 | ); 88 | 89 | } 90 | 91 | 92 | 93 | public: 94 | GzipStreamBuf(const char* fname) 95 | { 96 | _init(fname,BUFSIZ); 97 | } 98 | 99 | virtual ~GzipStreamBuf() 100 | { 101 | if(gzin!=NULL) ::gzclose(this->gzin); 102 | if(this->buffer!=NULL) delete [] this->buffer; 103 | } 104 | 105 | virtual int underflow ( ) 106 | { 107 | int nRead =0; 108 | if(gzeof(this->gzin)) return EOF; 109 | 110 | if( ( nRead = ::gzread(this->gzin,this->buffer,this->buff_size) ) <= 0 ) { 111 | checkError(); 112 | return EOF; 113 | } 114 | 115 | setg( (char*)this->buffer, 116 | (char*)&this->buffer[1], 117 | (char*)&this->buffer[nRead+1] 118 | ); 119 | 120 | return this->buffer[0]; 121 | } 122 | }; 123 | 124 | 125 | 126 | #endif 127 | -------------------------------------------------------------------------------- /src/DetectDif.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * DetectDif.cpp 3 | * 4 | * Created on: Oct 30, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "DetectDif.h" 9 | 10 | int get_stop(char * buffer, int & type) { 11 | size_t i = 0; 12 | int stop=0; 13 | if (strncmp(&buffer[i], "TRA",3) == 0) { 14 | type = 3; 15 | } else if (strncmp(&buffer[i], "INV",3) == 0) { 16 | type = 2; 17 | } else if (strncmp(&buffer[i], "DUP",3) == 0) { 18 | type = 1; 19 | } else if (strncmp(&buffer[i], "DEL",3) == 0) { 20 | type = 0; 21 | } 22 | i+=3; 23 | while (buffer[i] != '\t') { 24 | if (buffer[i] == '.' 
&& buffer[i + 1] == '.') { 25 | 26 | stop = atoi(&buffer[i + 2]); 27 | 28 | } 29 | i++; 30 | } 31 | return stop; 32 | } 33 | 34 | std::vector parse_strains(std::string file, int call) { 35 | std::ifstream myfile; 36 | 37 | myfile.open(file.c_str(), std::ifstream::in); 38 | if (!myfile.good()) { 39 | std::cout << "Could not open file: " << file.c_str() << std::endl; 40 | exit(0); 41 | } 42 | 43 | size_t buffer_size = 2000000; 44 | char * buffer = new char[buffer_size]; 45 | myfile.getline(buffer, buffer_size); 46 | std::vector strain; 47 | while (!myfile.eof()) { 48 | int count = 0; 49 | svs_str svs; 50 | svs.joined = false; 51 | bool called=false; 52 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 53 | if (count == 0 && buffer[i] != '\t') { 54 | svs.chr += buffer[i]; 55 | } 56 | if (count == 1 && buffer[i - 1] == '\t') { 57 | svs.start = atoi(&buffer[i]); 58 | } 59 | if (count == 2 && buffer[i - 1] == '\t') { 60 | svs.stop = get_stop(&buffer[i], svs.type); 61 | 62 | } 63 | if (count == 3 + call && buffer[i - 1] == '\t') { 64 | if (buffer[i + 2] == '1') { 65 | called = true; 66 | } 67 | break; 68 | } 69 | if (buffer[i] == '\t') { 70 | count++; 71 | } 72 | } 73 | if (called) { 74 | strain.push_back(svs); 75 | } 76 | myfile.getline(buffer, buffer_size); 77 | } 78 | myfile.close(); 79 | return strain; 80 | } 81 | 82 | void detect_divergence(std::string file, float precent_overlap, std::string output) { 83 | //parse info from vcf file. 
84 | std::vector p1, p2; 85 | p1 = parse_strains(file, 0); 86 | p2 = parse_strains(file, 1); 87 | 88 | 89 | 90 | for (size_t i = 0; i < p1.size(); i++) { 91 | for (size_t j = 0; j < p2.size(); j++) { 92 | if (p1[i].type < 2 && p1[i].type == p2[j].type) { //only del and dups: 93 | if (strcmp(p1[i].chr.c_str(), p2[i].chr.c_str()) == 0) { 94 | double dist = 0; 95 | if ((p1[i].start < p2[j].start && p2[j].start < p1[i].stop) || (p1[i].start < p2[j].stop && p2[j].stop < p1[i].stop)) { 96 | int start = max(p1[i].start, p2[j].start); 97 | int stop = min(p1[i].stop, p2[j].stop); 98 | dist = stop - start; 99 | double len = max((p1[i].stop - p1[i].start), (p2[j].stop - p2[j].start)); 100 | if (dist / len > precent_overlap) { 101 | //mark them as joined! 102 | p1[i].joined = true; 103 | p2[i].joined = true; 104 | } 105 | } 106 | } 107 | } 108 | } 109 | } 110 | cout << "P1:" << endl; 111 | for (size_t i = 0; i < p1.size(); i++) { 112 | cout << "\t"; 113 | if (p1[i].joined) { 114 | cout << "Joined "; 115 | } else { 116 | cout << "NOT "; 117 | } 118 | cout << p1[i].start << " " << p1[i].stop <<" "<< p1[i].type << endl; 119 | } 120 | 121 | cout << "P2:" << endl; 122 | for (size_t i = 0; i < p2.size(); i++) { 123 | cout << "\t"; 124 | if (p2[i].joined) { 125 | cout << "Joined "; 126 | } else { 127 | cout << "NOT "; 128 | } 129 | cout << p2[i].start << " " << p2[i].stop <<" "<< p2[i].type << endl; 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/convert/Convert_Honey_tails.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Honey_tails.cpp 3 | * 4 | * Created on: Jun 6, 2016 5 | * Author: fsedlaze 6 | */ 7 | #include "Convert_Honey_tails.h" 8 | 9 | short get_type_honey(std::string type){ 10 | 11 | if (strncmp(type.c_str(), "DEL", 3) == 0 ) { 12 | return 0; 13 | } else if (strncmp(type.c_str(), "DUP", 3) == 0) { 14 | return 1; 15 | } else if (strncmp(type.c_str(), 
"INV", 3) == 0) { 16 | return 2; 17 | } else if (strncmp(type.c_str(), "TLOC", 4) == 0) { 18 | return 3; 19 | } else if (strncmp(type.c_str(), "INS", 3) == 0) { 20 | return 4; 21 | } else { 22 | std::cerr << "Unknown type! "< & entries) { 29 | size_t buffer_size = 2000000; 30 | char*buffer = new char[buffer_size]; 31 | std::ifstream myfile; 32 | myfile.open(assemblytics.c_str(), std::ifstream::in); 33 | if (!myfile.good()) { 34 | std::cout << "Pindel Parser: could not open file: " << assemblytics.c_str() << std::endl; 35 | exit(0); 36 | } 37 | myfile.getline(buffer, buffer_size); //avoid header 38 | myfile.getline(buffer, buffer_size); 39 | myfile.getline(buffer, buffer_size); 40 | while (!myfile.eof()) { 41 | // std::cout< minlen){ 75 | entries.push_back(tmp); 76 | } 77 | myfile.getline(buffer, buffer_size); 78 | } 79 | } 80 | 81 | std::string print_entry_honey(strvcfentry & region) { 82 | 83 | // III 5104 DEL00000002 N . LowQual IMPRECISE;CIEND=-305,305;CIPOS=-305,305;SVTYPE=DEL;SVMETHOD=EMBL.DELLYv0.5.9;CHR2=III;END=15991;SVLEN=10887;CT=3to5;PE=2;MAPQ=60 GT:GL:GQ:FT:RC:DR:DV:RR:RV 1/1:-12,-0.602059,0:6:LowQual:816:0:2:0:0 84 | 85 | std::ostringstream convert; // stream used for the conversion 86 | convert << region.start.chr; 87 | convert << "\t"; 88 | convert << region.start.pos; // insert the textual representation of 'Number' in the characters in the stream 89 | convert << "\t"; 90 | convert << trans_type(region.type); 91 | convert << "00"; 92 | convert << "Honey\tN\t<"; 93 | convert << trans_type(region.type); 94 | convert << ">\t.\tLowQual\tIMPRECISE;SVTYPE="; 95 | convert << trans_type(region.type); 96 | convert << ";SVMETHOD=Honey_tails;CHR2="; 97 | convert << region.stop.chr; 98 | convert << ";END="; 99 | convert << region.stop.pos; 100 | convert << ";SVLEN="; 101 | convert << region.stop.pos - region.start.pos; 102 | convert << ";PE="; 103 | convert << 1; 104 | convert << "\tGT:GL:GQ:FT:RC:DR:DV:RR:RV\t"; 105 | std::stringstream s; 106 | s << 
"1/1:0,0,0:0:PASS:0:0:"; 107 | s << 1; 108 | s << ":0:0"; 109 | //std::cout< entries; 115 | parse_honey_tails(assemblytics, minlen, entries); 116 | FILE *file; 117 | file = fopen(output.c_str(), "w"); 118 | for (size_t i = 0; i < entries.size(); i++) { 119 | fprintf(file, "%s", print_entry_honey(entries[i]).c_str()); 120 | fprintf(file, "%c", '\n'); 121 | } 122 | 123 | fclose(file); 124 | } 125 | -------------------------------------------------------------------------------- /src/analysis_sv/MT_identifier.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MT_identifier.cpp 3 | * 4 | * Created on: Aug 15, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "MT_identifier.h" 9 | int parse_read_start(std::string cigar, bool strand) { 10 | //std::string::size_type sz; // alias of size_t 11 | int pos = 0; 12 | if (strand) { 13 | //size_t found = cigar.find_first_of("S"); 14 | pos = 0;//stoi(cigar.substr(0, found), &sz); 15 | } else { 16 | std::string tmp=""; 17 | std::string::reverse_iterator i = cigar.rbegin(); 18 | while (i != cigar.rend() && (*i) != 'M') { 19 | i++; 20 | tmp+=(*i); 21 | } 22 | cout<<"Check Cigar backend:" << tmp< & segments) { 29 | 30 | size_t i = 0; 31 | while (i < segments.size()) { 32 | // if (tmp.pos<) 33 | i++; 34 | } 35 | } 36 | void parse_entries(const char * buffer, std::vector & segments) { 37 | size_t i = 5; //avoid SA:Z: 38 | int count = 0; 39 | segment_str tmp; 40 | std::string cigar = ""; 41 | while (buffer[i] != '\t') { 42 | if (count == 0 && buffer[i] != ',') { 43 | tmp.chr += buffer[i]; 44 | } 45 | if (count == 1 && buffer[i - 1] == ',') { 46 | tmp.pos = atoi(&buffer[i]); 47 | } 48 | if (count == 2 && buffer[i - 1] == ',') { 49 | tmp.strand = (buffer[i] == '+'); 50 | } 51 | if (count == 3 && buffer[i] != ',') { 52 | cigar += buffer[i]; 53 | } 54 | if (count == 4 && buffer[i - 1] == ',') { 55 | //parse cigar and set read start: 56 | tmp.read_start=parse_read_start(cigar, tmp.strand); 57 
| tmp.MQ = atoi(&buffer[i]); 58 | } 59 | if (buffer[i] == ';') { 60 | //store; 61 | insert_sort(tmp, segments); 62 | count = 0; 63 | } 64 | if (buffer[i] == ',') { 65 | count++; 66 | } 67 | } 68 | } 69 | void detect_MT_copies(std::string chr_identifier) { 70 | //Q1: Start lockations always full lenght? 71 | //Q2: Avg vs. max copies per read 72 | 73 | chr_identifier = "MT"; 74 | 75 | std::vector start_pos; //we can do that interactively... 76 | 77 | int min_len = 0; 78 | while (!cin.eof()) { 79 | string line; 80 | getline(cin, line); 81 | if (!cin.fail()) { 82 | size_t found = 0; 83 | found = line.find_first_of(chr_identifier); 84 | if (found != std::string::npos) { //only if the line includes MT! 85 | if (line[0] == '@') { 86 | //get the length of the chr! 87 | //found: @SQ SN:MT LN:19431 88 | found = line.find_first_of("LN:"); 89 | found += 3; 90 | // std::string::size_type sz; // alias of size_t 91 | int len = 0;//std::stoi(line.substr(found), &sz); //get chr size 92 | min_len = len * 2; 93 | std::cout << "LEN: " << min_len << std::endl; 94 | } else { 95 | int count = 0; 96 | int sequence = 0; 97 | std::string cigar = ""; 98 | std::vector segments; 99 | segment_str tmp; 100 | for (size_t i = 0; i < line.size(); i++) { 101 | if (count == 2 && line[i] != '\t') { 102 | tmp.chr += line[i]; 103 | } 104 | if (count == 1 && line[i - 1] == '\t') { 105 | tmp.strand = (line[i] == '0'); //should be 0 (+) or 16 (-) 106 | } 107 | 108 | //parse cigar and read start ; 109 | if (count == 5 && line[i] != '\t') { 110 | cigar += line[i]; 111 | } 112 | if (count == 10 && line[i] != '\t') { 113 | sequence++; 114 | } 115 | if (count == 11 && line[i - 1] == '\t') { 116 | 117 | if (sequence > min_len || strncmp(chr_identifier.c_str(), tmp.chr.c_str(), chr_identifier.size()) != 0) { 118 | sequence = 0; 119 | break; //early terminate we dont need to parse the rest. 
120 | } 121 | //parse cigar and read start ; 122 | tmp.read_start = parse_read_start(cigar, tmp.strand); 123 | segments.push_back(tmp); 124 | } 125 | if (count == 20 && line[i] == '\t') { 126 | parse_entries(line.substr(i).c_str(), segments); 127 | } 128 | 129 | if (line[i] == '\t') { 130 | count++; 131 | } 132 | } 133 | if (sequence > min_len) { 134 | 135 | } 136 | } 137 | } 138 | } else { 139 | break; 140 | } 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /src/convert/Convert_Pindel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Pindel.cpp 3 | * 4 | * Created on: Mar 3, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "Convert_Pindel.h" 9 | 10 | short get_type_pind(const char * type){ 11 | if (strncmp(type, "DEL", 3) == 0 || strncmp(type, "RPL", 3) == 0) { 12 | return 0; 13 | } else if (strncmp(type, "DUP", 3) == 0) { 14 | return 1; 15 | } else if (strncmp(type, "INV", 3) == 0) { 16 | return 2; 17 | //no interchrom! 18 | } else if (strncmp(type, "INS", 3) == 0) { 19 | return 4; 20 | } else { 21 | std::cerr << "Unknown type! "< . 
LowQual IMPRECISE;CIEND=-305,305;CIPOS=-305,305;SVTYPE=DEL;SVMETHOD=EMBL.DELLYv0.5.9;CHR2=III;END=15991;SVLEN=10887;CT=3to5;PE=2;MAPQ=60 GT:GL:GQ:FT:RC:DR:DV:RR:RV 1/1:-12,-0.602059,0:6:LowQual:816:0:2:0:0 29 | tmp.start = region.start; 30 | tmp.stop= region.stop; 31 | tmp.type=type; 32 | tmp.sup_lumpy=support; 33 | 34 | std::ostringstream convert; // stream used for the conversion 35 | convert << region.start.chr; 36 | convert << "\t"; 37 | convert << region.start.pos; // insert the textual representation of 'Number' in the characters in the stream 38 | convert << "\t"; 39 | convert << trans_type(type); 40 | convert << "00"; 41 | convert << id; 42 | convert << "PIN\tN\t<"; 43 | convert << trans_type(type) ; 44 | if(tmp.sup_lumpy<4){ 45 | convert << ">\t.\tLowQual\tIMPRECISE;SVTYPE="; 46 | }else{ 47 | convert << ">\t.\tPASS\tIMPRECISE;SVTYPE="; 48 | } 49 | convert << trans_type(type); 50 | convert << ";SVMETHOD=PINDELv0.2.5a8;CHR2="; 51 | convert << region.stop.chr; 52 | convert << ";END="; 53 | convert << region.stop.pos; 54 | 55 | if(tmp.type==3){ 56 | convert << ";SVLEN=0;PE="; 57 | }else{ 58 | convert << ";SVLEN="; 59 | convert << region.stop.pos-region.start.pos; 60 | convert << ";PE="; 61 | } 62 | convert < & entries, 74 | int min_number_supporting, int min_length) { 75 | 76 | size_t buffer_size = 2000000; 77 | char*buffer = new char[buffer_size]; 78 | std::ifstream myfile; 79 | myfile.open(pindel_vcf.c_str(), std::ifstream::in); 80 | if (!myfile.good()) { 81 | std::cout << "Pindel Parser: could not open file: " << pindel_vcf.c_str() 82 | << std::endl; 83 | exit(0); 84 | } 85 | int call_id = entries.size(); 86 | myfile.getline(buffer, buffer_size); 87 | while (!myfile.eof()) { 88 | if(buffer[0]!='#'){ 89 | int count=0; 90 | strregion region; 91 | int support = 0; 92 | short type=-2; 93 | bool flag=false; 94 | for (size_t i = 0;i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n';i++) { 95 | if(count==0 && buffer[i]!='\t'){ 96 | 
region.start.chr+=buffer[i]; 97 | region.stop.chr+=buffer[i]; 98 | } 99 | if(count==1 && buffer[i-1]=='\t'){ 100 | region.start.pos=atoi(&buffer[i]); 101 | } 102 | if(count==7 && strncmp(&buffer[i],"END=",4)==0){ 103 | region.stop.pos=atoi(&buffer[i+4]); 104 | } 105 | if(count==7 && strncmp(&buffer[i],"SVLEN=0",7)==0){ 106 | flag=true; 107 | } 108 | if(count==7 && strncmp(&buffer[i],"SVTYPE=",7)==0){ 109 | type=get_type_pind(&buffer[i+7]); 110 | } 111 | if(count==9 && (buffer[i-1]==',' || (flag && buffer[i-1]==':'))){ 112 | support=atoi(&buffer[i]); 113 | 114 | } 115 | if(buffer[i]=='\t'){ 116 | count++; 117 | } 118 | } 119 | //std::cout<min_number_supporting && (region.stop.pos-region.start.pos > min_length || flag) ){ 121 | entries.push_back(create_entry(region,support,type,call_id)); 122 | call_id++; 123 | } 124 | 125 | } 126 | myfile.getline(buffer, buffer_size); 127 | } 128 | } 129 | 130 | 131 | void process_Pindel(std::string pindel_vcf, int min_number_supporting, 132 | int min_length, std::string output) { 133 | std::vector entries; //= parse_vcf(delly_vcf); //get delly calls 134 | 135 | parse_pindel(pindel_vcf, entries, min_number_supporting, min_length); 136 | 137 | print_header(pindel_vcf, output); 138 | print_entries(output, entries); 139 | } 140 | -------------------------------------------------------------------------------- /src/vcfs/Generate_distMat.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Generate_distMat.cpp 3 | * 4 | * Created on: Jul 17, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Generate_distMat.h" 9 | bool is_file_exist(const char *fileName) { 10 | std::ifstream infile(fileName); 11 | return infile.good(); 12 | } 13 | 14 | std::vector parse_sample_names(std::string filename) { 15 | size_t buffer_size = 200000000; 16 | char*buffer = new char[buffer_size]; 17 | std::ifstream myfile; 18 | 19 | myfile.open(filename.c_str(), std::ifstream::in); 20 | if (!myfile.good()) { 21 | 
std::cout << "VCF Parser: could not open file: " << filename.c_str() << std::endl; 22 | exit(0); 23 | } 24 | 25 | myfile.getline(buffer, buffer_size); 26 | 27 | std::vector names; 28 | while (!myfile.eof()) { 29 | if (buffer[0] == '#' && buffer[1] != '#') { 30 | //parse line! 31 | int count = 0; 32 | std::string name = ""; 33 | for (size_t i = 0; i < buffer_size && buffer[i] != '\n' && buffer[i] != '\0'; i++) { 34 | if (count > 8 && buffer[i] != '\t' && name.size()<64) { 35 | name += buffer[i]; 36 | } 37 | if (buffer[i] == '\t') { 38 | if (!name.empty()) { 39 | names.push_back(name); 40 | name.clear(); 41 | } 42 | count++; 43 | } 44 | } 45 | 46 | if (!name.empty()) { 47 | names.push_back(name); 48 | name.clear(); 49 | } 50 | } else if (buffer[0] != '#') { 51 | break; 52 | } 53 | myfile.getline(buffer, buffer_size); 54 | } 55 | myfile.close(); 56 | return names; 57 | } 58 | bool not_set(char * buffer){ 59 | size_t i=0; 60 | while(buffer[i]!='\t' && buffer[i]!='\0'){ 61 | if(strncmp("NaN",&buffer[i],3)==0){ 62 | return true; 63 | } 64 | i++; 65 | } 66 | return false; 67 | } 68 | void update_mat(std::string filename, std::vector > &samples_mat, std::vector sample_names) { 69 | //std::cerr<<"Parser was adapted for bug with two tabs!"< ids; 87 | for (size_t i = 0; i < buffer_size && buffer[i] != '\n' && buffer[i] != '\0'; i++) { 88 | if (count > 8 && buffer[i - 1] == '\t') { 89 | if (!not_set(&buffer[i]) && !(buffer[i] == '0' && buffer[i + 2] == '0')) { 90 | ids.push_back(id); 91 | } 92 | id++; 93 | 94 | } 95 | if (buffer[i] == '\t') { 96 | count++; 97 | } 98 | } 99 | //std::cout<<"ID: "< sample_names; 118 | if (is_file_exist(svs_vcf.c_str())) { 119 | sample_names = parse_sample_names(svs_vcf); 120 | } else if (is_file_exist(snp_vcf.c_str())) { 121 | sample_names = parse_sample_names(snp_vcf); 122 | } else { 123 | std::cerr << "We need at least one SNP/SVs file with all the samples!" 
<< std::endl; 124 | exit(1); 125 | } 126 | std::cout << "We detected " << sample_names.size() << " Samples" << std::endl; 127 | 128 | //initialize: 129 | std::vector > samples_mat; 130 | std::vector tmp; 131 | tmp.assign(sample_names.size(), 0); 132 | samples_mat.assign(sample_names.size(), tmp); 133 | 134 | std::cout<<"Finished int"< parse_hapcut(std::string hapcut2, std::string target_chr, int & phaseblock_id) { 11 | std::string buffer; 12 | std::ifstream myfile; 13 | 14 | map hapcut; 15 | 16 | myfile.open(hapcut2.c_str(), std::ifstream::in); 17 | if (!myfile.good()) { 18 | std::cout << "Hapcut2 Parser: could not open file: " << hapcut2.c_str() << std::endl; 19 | exit(0); 20 | } 21 | 22 | getline(myfile, buffer); 23 | while (!myfile.eof()) { 24 | if (buffer[0] != 'B' && buffer[0] != '*') { //avoid headers! 25 | int count = 0; 26 | std::string chr = ""; 27 | int pos = -1; 28 | bool first_gt = true; 29 | bool second_gt = true; //not needed but good to check! 30 | for (size_t i = 0; i < buffer.size(); i++) { 31 | if (count == 1 && buffer[i - 1] == '\t') { 32 | first_gt = (bool) (buffer[i] != '0'); 33 | } 34 | if (count == 2 && buffer[i - 1] == '\t') { 35 | second_gt = (bool) (buffer[i] != '0'); 36 | } 37 | 38 | if (count == 3 && buffer[i] != '\t') { 39 | chr += buffer[i]; 40 | } 41 | if (count == 4 && buffer[i - 1] == '\t') { 42 | pos = atoi(&buffer[i]); 43 | break; 44 | } 45 | 46 | if (buffer[i] == '\t') { 47 | count++; 48 | } 49 | } 50 | if (strcmp(chr.c_str(), target_chr.c_str()) == 0) { 51 | if ((first_gt && !second_gt) || (!first_gt && second_gt)) { 52 | std::stringstream ss; 53 | ss << chr; 54 | ss << "_"; 55 | ss << pos; 56 | 57 | if (hapcut.find(ss.str()) == hapcut.end()) { 58 | hapcut[ss.str()] = phaseblock_id; 59 | if (!first_gt) { //negative ID for none 60 | hapcut[ss.str()] = hapcut[ss.str()] * -1; 61 | } 62 | } else { 63 | cerr << "A position was found twice: " << ss.str() << endl; 64 | } 65 | } 66 | } 67 | } else if(buffer[0] != 'B') { 68 | 
phaseblock_id++; 69 | } 70 | getline(myfile, buffer); 71 | } 72 | myfile.close(); 73 | return hapcut; 74 | } 75 | 76 | void process_hapcut(std::string orig_snp, std::string hapcut2, std::string output) { 77 | 78 | //parse VCF file. just if we dected a 0/1 we check the hapcut results. 79 | std::string buffer; 80 | std::ifstream myfile; 81 | 82 | myfile.open(orig_snp.c_str(), std::ifstream::in); 83 | if (!myfile.good()) { 84 | std::cout << "SNP Parser: could not open file: " << orig_snp.c_str() << std::endl; 85 | exit(0); 86 | } 87 | FILE * file = fopen(output.c_str(), "w"); 88 | map hapcut_res; 89 | std::string old_chr; 90 | getline(myfile, buffer); 91 | int phaseblock = 1; 92 | //int num = 0; 93 | while (!myfile.eof()) { 94 | if (buffer[0] == '#') { 95 | if (buffer[1] == 'C') { 96 | fprintf(file, "%s", "##FORMAT=\n"); 97 | } 98 | fprintf(file, "%s", buffer.c_str()); 99 | fprintf(file, "%c", '\n'); 100 | } else { 101 | // num++; 102 | 103 | std::size_t found = buffer.find_last_of('\t'); 104 | //check found + 1 if '1' && if found+3 !='0' -> 105 | if ((buffer[found + 1] == '0' && buffer[found + 3] == '1') || (buffer[found + 1] == '1' && buffer[found + 3] == '0')) { 106 | //search pos and replace 3 chars. 
107 | int count = 0; 108 | std::string chr = ""; 109 | int pos = 0; 110 | for (size_t i = 0; i < buffer.size(); i++) { 111 | if (count == 0 && buffer[i] != '\t') { 112 | chr += buffer[i]; 113 | } 114 | if (count == 1 && buffer[i - 1] == '\t') { 115 | pos = atoi(&buffer[i]); 116 | break; 117 | } 118 | if (buffer[i] == '\t') { 119 | count++; 120 | } 121 | } 122 | if (strcmp(chr.c_str(), old_chr.c_str()) != 0) { 123 | //load new chr set: 124 | cout << "Parsing hapcut2 output for " << chr; 125 | hapcut_res = parse_hapcut(hapcut2, chr, phaseblock); 126 | cout << " SNPs parsed " << hapcut_res.size() << endl; 127 | old_chr = chr; 128 | } 129 | 130 | if (!chr.empty()) { 131 | 132 | std::stringstream ss; 133 | ss << chr; 134 | ss << "_"; 135 | ss << pos; 136 | if (hapcut_res.find(ss.str()) != hapcut_res.end()) { 137 | 138 | buffer.insert(found, ":PS"); 139 | found += 3; 140 | // cout << "MATCH: " << ss.str() << endl; 141 | if (hapcut_res[ss.str()] > 0) { 142 | buffer[found + 1] = '1'; 143 | buffer[found + 2] = '|'; 144 | buffer[found + 3] = '0'; 145 | } else { 146 | buffer[found + 1] = '0'; 147 | buffer[found + 2] = '|'; 148 | buffer[found + 3] = '1'; 149 | } 150 | std::stringstream id; 151 | id << ":"; 152 | id << abs(hapcut_res[ss.str()]); 153 | 154 | buffer.append(id.str()); 155 | 156 | } 157 | } 158 | 159 | } 160 | fprintf(file, "%s", buffer.c_str()); 161 | fprintf(file, "%c", '\n'); 162 | 163 | } 164 | 165 | getline(myfile, buffer); 166 | } 167 | myfile.close(); 168 | fclose(file); 169 | } 170 | -------------------------------------------------------------------------------- /src/analysis_sv/Select_samples.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Select_samples.cpp 3 | * 4 | * Created on: Feb 27, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Select_samples.h" 9 | 10 | bool genotype_parse(char * buffer) { 11 | //cout << "buffer: " << buffer[0] << buffer[1] << buffer[2] << endl; 12 | 13 | if ((buffer[0] == 
'0' && buffer[2] == '1') || (buffer[0] == '1' && buffer[2] == '1')) { 14 | return true; 15 | } else if (buffer[0] == '0' && buffer[2] == '0') { 16 | return false; 17 | } 18 | if (strncmp(buffer, "./.:0:0,0:--:NaN:NaN", 20) != 0) { 19 | return false; 20 | } 21 | //0/0 ./. 22 | return false; 23 | } 24 | 25 | std::vector parase_matrix(std::string vcf_file, std::vector & names, std::map taken_ids, int &num) { 26 | std::cout<<"Parsing... "< matrix; 30 | myfile.open(vcf_file.c_str(), std::ifstream::in); 31 | if (!myfile.good()) { 32 | std::cout << "VCF Parser: could not open file: " << vcf_file.c_str() << std::endl; 33 | exit(0); 34 | } 35 | 36 | getline(myfile, buffer); 37 | int line=0; 38 | while (!myfile.eof()) { 39 | if (names.empty() && (buffer[0] == '#' && buffer[1] == 'C')) { //parse names 40 | int count = 0; 41 | std::string id = ""; 42 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 43 | if (count >= 9 && buffer[i] != '\t') { 44 | id += buffer[i]; 45 | } 46 | if (buffer[i] == '\t') { 47 | if (!id.empty()) { 48 | names.push_back(id); 49 | id = ""; 50 | } 51 | count++; 52 | } 53 | } 54 | if (!id.empty()) { 55 | names.push_back(id); 56 | } 57 | 58 | } else if (buffer[0] != '#') { //parse svs; 59 | if (matrix.empty()) { //init pairwise matrix; 60 | std::vector tmp; 61 | matrix.resize(names.size(), 0); 62 | } 63 | line++; 64 | num++; 65 | //bool discard = false; 66 | int count = 0; 67 | bool include = false; 68 | int num=0; 69 | 70 | std::string entries; 71 | entries.resize(names.size(),'0'); 72 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 73 | if (count >= 9 && buffer[i - 1] == '\t') { 74 | if (genotype_parse(&buffer[i])) { 75 | if (taken_ids.find(num) != taken_ids.end()) { 76 | include=false; 77 | break; 78 | } 79 | include=true; 80 | entries[num]='1'; 81 | } 82 | num++; 83 | } 84 | if (buffer[i] == '\t') { 85 | count++; 86 | } 87 | } 88 | if(include){ 89 | for(size_t j=0;j 
svs_count_mat) { 108 | for (size_t i = 0; i < svs_count_mat.size(); i++) { 109 | std::cout << svs_count_mat[i] << "\t"; 110 | } 111 | std::cout << std::endl; 112 | std::cout << std::endl; 113 | 114 | } 115 | void select_greedy(std::string vcf_file, std::string output) { 116 | std::map taken_ids; 117 | std::vector sample_names; 118 | int total_svs = 0; 119 | 120 | //we can actually just use a vector instead! 121 | std::vector svs_count_mat = parase_matrix(vcf_file, sample_names, taken_ids, total_svs); //span a NxN matrix and stores the shared SVs 122 | //print_mat(svs_count_mat); 123 | 124 | FILE *file; 125 | file = fopen(output.c_str(), "w"); 126 | 127 | fprintf(file, "%s", "Sample\t#SVs\t#_SVs_captured\t%_SVs_captured\n"); 128 | std::cout << "Parsed vcf file with " << sample_names.size() << " samples" << endl; 129 | int captured_svs = 0; 130 | for (size_t i = 0; i < sample_names.size(); i++) { 131 | //select max on main diag 132 | int max = 0; 133 | int max_id = -1; 134 | 135 | for (size_t j = 0; j < sample_names.size(); j++) { 136 | // cout << svs_count_mat[j][j] << "\t"; 137 | if (max < svs_count_mat[j]) { 138 | max = svs_count_mat[j]; 139 | max_id = j; 140 | } 141 | } 142 | captured_svs += max; 143 | std::cout <<"RANK:\t"< 1.8) 18 | return ((p22 < 1e-10 && p32 < 1e-10) && (ratiop22 < 1.8 && ratiop23 < 1.8)); 19 | } 20 | 21 | std::map > get_entries( 22 | std::string table) { 23 | size_t buffer_size = 2000000; 24 | char*buffer = new char[buffer_size]; 25 | std::ifstream myfile; 26 | myfile.open(table.c_str(), std::ifstream::in); 27 | myfile.getline(buffer, buffer_size); 28 | std::map > entries; 29 | std::string prevname = ""; 30 | bool flag = (bool) (buffer[2] == 'E'); 31 | while (!myfile.eof()) { 32 | int count = 0; 33 | std::string name = ""; 34 | double p22 = 100; 35 | double p32 = 100; 36 | double ratiop22 = 0; 37 | double ratiop32 = 0; 38 | std::string id; 39 | for (size_t i = 0; buffer[i] != '\0' && buffer[i] != '\n'; i++) { 40 | if (count == 0 && buffer[i] 
!= '\t') { 41 | id += buffer[i]; 42 | } 43 | if (count == 2 && buffer[i] != '\t') { 44 | name += buffer[i]; 45 | } 46 | if (count == 4 && buffer[i - 1] == '\t') { 47 | p22 = atof(&buffer[i]); 48 | } 49 | if (count == 5 && buffer[i - 1] == '\t') { 50 | p32 = atof(&buffer[i]); 51 | } 52 | if (count == 6 && buffer[i - 1] == '\t') { 53 | ratiop22 = atof(&buffer[i]); 54 | } 55 | if (count == 7 && buffer[i - 1] == '\t') { 56 | ratiop32 = atof(&buffer[i]); 57 | bool test=eval_sv(flag, p22, p32, ratiop22, ratiop32); 58 | if(test){ 59 | std::cout< > entries = get_entries( 81 | table); 82 | size_t buffer_size = 2000000; 83 | char*buffer = new char[buffer_size]; 84 | std::ifstream myfile; 85 | myfile.open(vcf_file.c_str(), std::ifstream::in); 86 | myfile.getline(buffer, buffer_size); 87 | FILE *file; 88 | file = fopen(output.c_str(), "w"); 89 | while (!myfile.eof() && buffer[0] == '#' && buffer[1] == '#') { 90 | fprintf(file, "%s", buffer); 91 | fprintf(file, "%c", '\n'); 92 | myfile.getline(buffer, buffer_size); 93 | } 94 | fprintf(file, "%s", buffer); 95 | fprintf(file, "%c", '\n'); 96 | std::vector names; 97 | std::string name; 98 | int count = 0; 99 | for (size_t i = 0; 100 | i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 101 | if (count > 8 && buffer[i] != '\t') { 102 | name += buffer[i]; 103 | } 104 | if (buffer[i] == '\t') { 105 | count++; 106 | if (!name.empty()) { 107 | std::cout< 12 | #include 13 | #include 14 | #include "combine_svs.h" 15 | #include "Paramer.h" 16 | //#include "TNode.h" 17 | //struct svs_str; 18 | //struct breakpoint_str; 19 | // struct support_str; 20 | class TNode { 21 | private: 22 | SVS_Node * data; 23 | //int value; 24 | int height; 25 | void init() { 26 | this->parent = NULL; 27 | this->left = NULL; 28 | this->right = NULL; 29 | } 30 | public: 31 | TNode * parent; 32 | TNode * left; 33 | TNode * right; 34 | TNode() { 35 | height = 0; 36 | init(); 37 | this->data = NULL; 38 | } 39 | TNode(SVS_Node * point) { 40 | init(); 41 
| this->data = point; 42 | //this->data->caller_info[caller_id].num_reads= //TODO! 43 | height = 0; 44 | } 45 | 46 | TNode(breakpoint_str start, breakpoint_str stop, short type, std::pair strands, meta_data_str meta_info) { 47 | this->data = new SVS_Node(); 48 | this->data->first = start; 49 | this->data->second = stop; 50 | this->data->type = type; 51 | this->data->strand = strands; 52 | if (strands.first) { 53 | this->data->strands[0] = true; 54 | } else { 55 | this->data->strands[1] = true; 56 | } 57 | 58 | if (strands.second) { 59 | this->data->strands[2] = true; 60 | } else { 61 | this->data->strands[3] = true; 62 | } 63 | 64 | this->data->genotype = meta_info.genotype; //do I need this? 65 | this->data->types[type] = true; 66 | 67 | init(); 68 | Support_Node * tmp = new Support_Node(); 69 | if (meta_info.sv_len == -1) { 70 | tmp->len = stop.position - start.position; 71 | } else { 72 | tmp->len = meta_info.sv_len; 73 | } 74 | tmp->quality.push_back(meta_info.QV); 75 | tmp->num_support = meta_info.num_reads; 76 | tmp->id = meta_info.caller_id; 77 | tmp->starts.push_back(start.position); 78 | tmp->sv_lengths.push_back(meta_info.sv_len); 79 | tmp->stops.push_back(stop.position); 80 | tmp->types.push_back(type); 81 | tmp->genotype = meta_info.genotype; 82 | tmp->strand = strands; 83 | tmp->pre_supp_vec = meta_info.pre_supp_vec; 84 | tmp->alleles = meta_info.allleles; 85 | tmp->vcf_ID = meta_info.vcf_ID; 86 | data->caller_info.push_back(tmp); 87 | height = 0; 88 | } 89 | 90 | ~TNode() { 91 | 92 | } 93 | 94 | SVS_Node * get_data() { 95 | return data; 96 | } 97 | int get_height() { 98 | return height; 99 | } 100 | void set_height(int val) { 101 | this->height = val; 102 | } 103 | 104 | void add(breakpoint_str start, breakpoint_str stop, short type, std::pair strands, meta_data_str meta_info) { 105 | 106 | if (start.position == 55986511 || start.position == 55986511) { 107 | std::cout << "ADD " << type << " " << this->data->type << std::endl; 108 | std::cout << 
std::endl; 109 | } 110 | int index = -1; 111 | for (size_t i = 0; i < this->data->caller_info.size(); i++) { 112 | if (this->data->caller_info[i]->id == meta_info.caller_id) { 113 | index = i; 114 | } 115 | } 116 | 117 | if (index == -1) { 118 | index = this->data->caller_info.size(); //todo check! 119 | Support_Node * tmp = new Support_Node(); 120 | tmp->id = meta_info.caller_id; 121 | this->data->caller_info.push_back(tmp); 122 | } 123 | 124 | this->data->types[type] = true; //extend if there is an in sample merge! 125 | if (strands.first) { 126 | this->data->strands[0] = true; 127 | } else { 128 | this->data->strands[1] = true; 129 | } 130 | 131 | if (strands.second) { 132 | this->data->strands[2] = true; 133 | } else { 134 | this->data->strands[3] = true; 135 | } 136 | 137 | this->data->caller_info[index]->starts.push_back(start.position); 138 | this->data->caller_info[index]->stops.push_back(stop.position); 139 | this->data->caller_info[index]->types.push_back(type); 140 | this->data->caller_info[index]->sv_lengths.push_back(meta_info.sv_len); 141 | this->data->caller_info[index]->num_support.first = std::max(meta_info.num_reads.first, this->data->caller_info[index]->num_support.first); 142 | this->data->caller_info[index]->num_support.second = std::max(meta_info.num_reads.second, this->data->caller_info[index]->num_support.second); 143 | this->data->caller_info[index]->genotype = meta_info.genotype; 144 | this->data->caller_info[index]->strand = strands; 145 | this->data->caller_info[index]->pre_supp_vec = meta_info.pre_supp_vec; 146 | this->data->caller_info[index]->quality.push_back(meta_info.QV); 147 | 148 | if (meta_info.allleles.first.size() > this->data->caller_info[index]->alleles.first.size() || meta_info.allleles.second.size() > this->data->caller_info[index]->alleles.second.size()) { 149 | this->data->caller_info[index]->alleles.first = meta_info.allleles.first; 150 | this->data->caller_info[index]->alleles.second = meta_info.allleles.second; 151 | 
} 152 | 153 | if (meta_info.vcf_ID[0] != '.') { 154 | if (!this->data->caller_info[index]->vcf_ID.empty()) { 155 | this->data->caller_info[index]->vcf_ID += ";"; // meta_info.vcf_ID; 156 | } 157 | this->data->caller_info[index]->vcf_ID = meta_info.vcf_ID; 158 | } 159 | 160 | if (this->data->caller_info[index]->len == 0) { //first time 161 | this->data->caller_info[index]->len = meta_info.sv_len; //stop.position-start.position; // take the length of the svs as identifier. 162 | } else { 163 | this->data->caller_info[index]->len = std::max(meta_info.sv_len, this->data->caller_info[index]->len); //stop.position-start.position; 164 | } 165 | } 166 | }; 167 | 168 | #endif /* TREE_TNODE_H_ */ 169 | -------------------------------------------------------------------------------- /src/vcfs/Combine_3_VCF.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Combine_3_VCF.cpp 3 | * 4 | * Created on: Mar 10, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "Combine_3_VCF.h" 9 | bool match_coords(strvcfentry c1, strvcfentry c2, int max_allowed_dist) { 10 | 11 | if ((strcmp(c1.start.chr.c_str(), c2.start.chr.c_str()) == 0 && abs(c1.start.pos - c2.start.pos) < max_allowed_dist)) { 12 | if (c1.type == 4) { 13 | return true; 14 | } 15 | return (strcmp(c1.stop.chr.c_str(), c2.stop.chr.c_str()) == 0 && abs(c1.stop.pos - c2.stop.pos) < max_allowed_dist); 16 | 17 | } else if ((strcmp(c1.stop.chr.c_str(), c2.start.chr.c_str()) == 0 && abs(c1.stop.pos - c2.start.pos) < max_allowed_dist)) { 18 | if (c1.type == 4) { 19 | return true; 20 | } 21 | return (strcmp(c1.start.chr.c_str(), c2.stop.chr.c_str()) == 0 && abs(c1.start.pos - c2.stop.pos) < max_allowed_dist); 22 | 23 | } 24 | return false; 25 | 26 | } 27 | int find_SV(strvcfentry caller, std::vector & merged, int max_dist) { 28 | 29 | for (size_t i = 0; i < merged.size(); i++) { 30 | //std::cout< init_vec(int length) { 41 | std::vector tmp; 42 | 43 | for (int i = 0; i < length; i++) { 44 
| tmp.push_back(0); 45 | } 46 | return tmp; 47 | } 48 | 49 | void process_SV(std::vector caller, std::vector & merged, int max_dist, int caller_id, int num_caller) { 50 | //std::vector new_merged = merged; 51 | std::vector blank = init_vec(num_caller); 52 | for (size_t i = 0; i < caller.size(); i++) { 53 | int id = find_SV(caller[i], merged, max_dist); 54 | if (id == -1) { //not found: 55 | caller[i].caller_supports = blank; 56 | caller[i].caller_supports[caller_id] = caller[i].stop.pos - caller[i].start.pos; 57 | caller[i].sup_lumpy = 1; 58 | merged.push_back(caller[i]); 59 | //new_merged.push_back(caller[i]); 60 | } else { 61 | merged[id].caller_supports[caller_id] = caller[i].stop.pos - caller[i].start.pos; 62 | merged[id].sup_lumpy++; 63 | //new_merged[id].caller_supports[caller_id] = caller[i].stop.pos - caller[i].start.pos; 64 | //std::cout<<"Match"<= min_caller) { //two callers must support the calls 114 | //modify_entry(merged[i]); 115 | print_entry(merged[i], final); 116 | } 117 | } 118 | fclose(final); 119 | } 120 | 121 | void combine_calls(std::string vcf_delly, std::string vcf_lumpy, std::string vcf_pindel, int max_dist, std::string output) { 122 | 123 | std::vector delly = parse_vcf(vcf_delly,0); 124 | std::vector lumpy = parse_vcf(vcf_lumpy,0); 125 | std::vector pindel = parse_vcf(vcf_pindel,0); 126 | 127 | std::vector merged; 128 | process_SV(pindel, merged, max_dist, 1, 3); 129 | std::cout << "merged: " << merged.size() << std::endl; 130 | process_SV(delly, merged, max_dist, 2, 3); 131 | std::cout << "merged: " << merged.size() << std::endl; 132 | process_SV(lumpy, merged, max_dist, 3, 3); 133 | std::cout << "merged: " << merged.size() << std::endl; 134 | 135 | FILE * final; 136 | FILE * unique; 137 | std::string out = output; 138 | out += "_overlap.vcf"; 139 | final = fopen(out.c_str(), "w"); 140 | 141 | out = output; 142 | out += "_uniq.vcf"; 143 | unique = fopen(out.c_str(), "w"); 144 | 145 | print_header(vcf_delly, final); 146 | 
print_header(vcf_delly, unique); 147 | 148 | for (size_t i = 0; i < merged.size(); i++) { 149 | if (num_support(merged[i].caller_supports) > 1) { //two callers must support the calls 150 | //modify_entry(merged[i]); 151 | print_entry(merged[i], final); 152 | } else { 153 | print_entry(merged[i], unique); 154 | } 155 | } 156 | fclose(final); 157 | fclose(unique); 158 | 159 | } 160 | 161 | -------------------------------------------------------------------------------- /src/analysis_sv/MUMmer_overlap.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MUMmer_overlap.cpp 3 | * 4 | * Created on: Dec 27, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "MUMmer_overlap.h" 9 | 10 | vector parse_filenames(std::string filename) { 11 | std::vector names; 12 | size_t buffer_size = 2000000; 13 | char*buffer = new char[buffer_size]; 14 | std::ifstream myfile; 15 | 16 | myfile.open(filename.c_str(), std::ifstream::in); 17 | if (!myfile.good()) { 18 | std::cout << "File Parser: could not open file: " << filename.c_str() << std::endl; 19 | exit(0); 20 | } 21 | myfile.getline(buffer, buffer_size); 22 | while (!myfile.eof()) { 23 | names.push_back(std::string(buffer)); 24 | myfile.getline(buffer, buffer_size); 25 | } 26 | myfile.close(); 27 | 28 | return names; 29 | } 30 | 31 | void comp_entries_mummer(std::vector & entries, std::string filename, int max_dist) { 32 | 33 | std::size_t found = filename.find("short_"); 34 | std::string id_denovo = ""; 35 | if (found != std::string::npos) { 36 | id_denovo = filename.substr(found + 6); 37 | } else { 38 | //cout << "not found" << endl; 39 | id_denovo = filename; 40 | } 41 | 42 | size_t buffer_size = 2000000; 43 | char*buffer = new char[buffer_size]; 44 | std::ifstream myfile; 45 | 46 | myfile.open(filename.c_str(), std::ifstream::in); 47 | if (!myfile.good()) { 48 | std::cout << "File Parser: could not open file: " << filename.c_str() << std::endl; 49 | exit(0); 50 | } 51 | 
myfile.getline(buffer, buffer_size); 52 | while (!myfile.eof() && buffer[0] != '[') { //avoid headers! 53 | myfile.getline(buffer, buffer_size); 54 | } 55 | myfile.getline(buffer, buffer_size); 56 | while (!myfile.eof()) { 57 | int count = 0; 58 | std::string chr = ""; 59 | int start = 0; 60 | int stop = 0; 61 | int len = 0; 62 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 63 | if (count == 0 && buffer[i] != '\t') { 64 | chr += buffer[i]; 65 | } 66 | if (count == 2 && buffer[i - 1] == '\t') { 67 | start = atoi(&buffer[i]); 68 | } 69 | if (count == 3 && buffer[i - 1] == '\t') { 70 | stop = atoi(&buffer[i]); 71 | 72 | } 73 | if (count == 4 && buffer[i - 1] == '\t') { 74 | len = atoi(&buffer[i]); 75 | break; 76 | } 77 | if (buffer[i] == '\t') { 78 | count++; 79 | } 80 | } 81 | for (size_t i = 0; i < entries.size(); i++) { 82 | if (entries[i].num_reads.second == 0) { 83 | if (strcmp(entries[i].start.chr.c_str(), chr.c_str()) == 0) { 84 | if (abs(entries[i].start.pos - start) < max_dist) { 85 | //match! 86 | 87 | //cout<<"HIT1 "< entries = parse_vcf(vcf_SVs_file, 0); 116 | 117 | for (size_t i = 0; i < entries.size(); i++) { 118 | //init to use it later! 119 | entries[i].num_reads.first = 0; //total counts 120 | entries[i].num_reads.second = 0; //flag to no count twice! 
121 | } 122 | //compare to MUMMer files: 123 | vector filenames = parse_filenames(mummer_files); 124 | for (size_t i = 0; i < filenames.size(); i++) { 125 | comp_entries_mummer(entries, filenames[i], max_dist); 126 | } 127 | 128 | //combine info: 129 | size_t buffer_size = 2000000; 130 | char*buffer = new char[buffer_size]; 131 | std::ifstream myfile; 132 | 133 | myfile.open(vcf_SVs_file.c_str(), std::ifstream::in); 134 | if (!myfile.good()) { 135 | std::cout << "File Parser: could not open file: " << vcf_SVs_file.c_str() << std::endl; 136 | exit(0); 137 | } 138 | myfile.getline(buffer, buffer_size); 139 | 140 | FILE *file; 141 | file = fopen(output.c_str(), "w"); 142 | int line = 0; 143 | while (!myfile.eof()) { 144 | if (buffer[0] != '#') { 145 | int count = 0; 146 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 147 | fprintf(file, "%c", buffer[i]); 148 | if (count == 7 && buffer[i + 1] == '\t') { 149 | if (entries[line].calls.size() > 1) { 150 | fprintf(file, "%s", ";DeNovo_mummer="); 151 | bool draw_comma=false; 152 | for (std::map::iterator j = entries[line].calls.begin(); j != entries[line].calls.end(); j++) { 153 | if (strcmp((*j).first.c_str(), vcf_SVs_file.c_str()) != 0) { 154 | if (draw_comma) { 155 | fprintf(file, "%c", ','); 156 | } 157 | fprintf(file, "%s", (*j).first.c_str()); 158 | draw_comma=true; 159 | } 160 | } 161 | } 162 | 163 | } 164 | if (buffer[i] == '\t') { 165 | count++; 166 | } 167 | 168 | } 169 | fprintf(file, "%c", '\n'); 170 | line++; 171 | } else { 172 | fprintf(file, "%s", buffer); 173 | fprintf(file, "%c", '\n'); 174 | } 175 | myfile.getline(buffer, buffer_size); 176 | } 177 | myfile.close(); 178 | fclose(file); 179 | } 180 | 181 | -------------------------------------------------------------------------------- /src/vcfs/Compoverlap_VCF.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Compoverla_VCF.cpp 3 | * 4 | * Created on: Feb 27, 2015 5 | * 
Author: fsedlaze 6 | */ 7 | 8 | #include "Compoverlap_VCF.h" 9 | 10 | int overlap_bothdir(strvcfentry vcf1, std::vector vcf2, int max_dist) { 11 | for (size_t i = 0; i < vcf2.size(); i++) { 12 | //check type: 13 | if (vcf2[i].type == vcf1.type) { 14 | //check chrs: 15 | if (strcmp(vcf2[i].stop.chr.c_str(), vcf1.stop.chr.c_str()) == 0 && strcmp(vcf2[i].start.chr.c_str(), vcf1.start.chr.c_str()) == 0) { 16 | //check coordinates: 17 | if (abs(vcf2[i].stop.pos - vcf1.stop.pos) < max_dist && abs(vcf2[i].start.pos - vcf1.start.pos) < max_dist) { 18 | return i; 19 | } 20 | } 21 | 22 | if (strcmp(vcf2[i].start.chr.c_str(), vcf1.stop.chr.c_str()) == 0 && strcmp(vcf2[i].stop.chr.c_str(), vcf1.start.chr.c_str()) == 0) { 23 | //check coordinates: 24 | if (abs(vcf2[i].start.pos - vcf1.stop.pos) < max_dist && abs(vcf2[i].stop.pos - vcf1.start.pos) < max_dist) { 25 | return i; 26 | } 27 | } 28 | } 29 | } 30 | return -1; 31 | } 32 | 33 | void print_header(std::string vcf_file, FILE *& out) { 34 | std::vector simulated; 35 | size_t buffer_size = 2000000; 36 | char*buffer = new char[buffer_size]; 37 | std::ifstream myfile; 38 | myfile.open(vcf_file.c_str(), std::ifstream::in); 39 | if (!myfile.good()) { 40 | std::cout << "VCF Parser: could not open file: " << vcf_file.c_str() << std::endl; 41 | exit(0); 42 | } 43 | myfile.getline(buffer, buffer_size); 44 | while (!myfile.eof()) { 45 | if (buffer[0] == '#') { 46 | fprintf(out, "%s", buffer); 47 | fprintf(out, "%c", '\n'); 48 | } 49 | myfile.getline(buffer, buffer_size); 50 | } 51 | myfile.close(); 52 | } 53 | void print_entry(strvcfentry entry, FILE *& out) { 54 | std::string tmp = entry.header; 55 | int count = 0; 56 | for (size_t i = 0; i < tmp.size(); i++) { 57 | if (count == 7&&tmp[i-1]=='\t') { 58 | if(!entry.caller_supports.empty()){ 59 | fprintf(out, "%s", "SUP="); 60 | if(entry.caller_supports[0]>0){ 61 | fprintf(out, "%i", entry.caller_supports[0]); 62 | }else{ 63 | fprintf(out, "%c", '.'); 64 | } 65 | for (size_t j = 1; j < 
entry.caller_supports.size(); j++) { 66 | fprintf(out, "%c", ','); 67 | if(entry.caller_supports[j]>0){ 68 | fprintf(out, "%i", entry.caller_supports[j]); 69 | }else{ 70 | fprintf(out, "%c", '.'); 71 | } 72 | } 73 | fprintf(out, "%c", ';'); 74 | } 75 | } 76 | fprintf(out, "%c", tmp[i]); 77 | if (tmp[i] == '\t') { 78 | count++; 79 | } 80 | } 81 | //fprintf(out, "%s", entry.header.c_str()); 82 | for (std::map::iterator tz = entry.calls.begin(); tz != entry.calls.end(); tz++) { 83 | fprintf(out, "%s", (*tz).second.c_str()); 84 | } 85 | fprintf(out, "%c", '\n'); 86 | } 87 | 88 | void comp_overlap_vcf(std::string vcf1_file, std::string vcf2_file, int max_dist, std::string output) { 89 | std::vector vcf1 = parse_vcf(vcf1_file,0); 90 | std::vector vcf2 = parse_vcf(vcf2_file,0); 91 | 92 | std::cout << vcf1.size() << " " << vcf2.size() << std::endl; 93 | 94 | FILE * combined; 95 | FILE * unique_lumpy; 96 | FILE * unique_delly; 97 | std::string out = output; 98 | out += "_overlap.vcf"; 99 | combined = fopen(out.c_str(), "w"); 100 | 101 | out = output; 102 | out += "_uniq_delly.vcf"; 103 | unique_delly = fopen(out.c_str(), "w"); 104 | 105 | out = output; 106 | out += "_uniq_lumpy.vcf"; 107 | unique_lumpy = fopen(out.c_str(), "w"); 108 | 109 | print_header(vcf1_file, combined); 110 | print_header(vcf1_file, unique_delly); 111 | print_header(vcf2_file, unique_lumpy); 112 | 113 | for (size_t i = 0; i < vcf1.size(); i++) { 114 | vcf1[i].sup_lumpy = 0; 115 | } 116 | for (size_t i = 0; i < vcf2.size(); i++) { 117 | vcf2[i].sup_lumpy = 0; 118 | } 119 | for (size_t i = 0; i < vcf1.size(); i++) { 120 | int id = overlap_bothdir(vcf1[i], vcf2, max_dist); 121 | if (id > -1) { 122 | vcf1[i].sup_lumpy = 1; 123 | vcf2[id].sup_lumpy = 1; 124 | } 125 | } 126 | 127 | std::vector tmp; 128 | tmp.resize(5, 0); 129 | std::vector overlap_vcf1 = tmp; 130 | std::vector overlap_vcf2 = tmp; 131 | std::vector unique_vcf1 = tmp; 132 | std::vector unique_vcf2 = tmp; 133 | 134 | std::cout << "vcf1.size " 
<< vcf1.size() << std::endl; 135 | for (size_t i = 0; i < vcf1.size(); i++) { 136 | if (vcf1[i].sup_lumpy == 1) { 137 | overlap_vcf1[vcf1[i].type]++; 138 | overlap_vcf2[vcf2[i].type]++; 139 | // print_entry( vcf1[i],combined); 140 | } else { 141 | unique_vcf1[vcf1[i].type]++; 142 | // print_entry( vcf1[i],unique_delly); 143 | } 144 | } 145 | for (size_t i = 0; i < vcf2.size(); i++) { 146 | if (vcf2[i].sup_lumpy == 0) { 147 | unique_vcf2[vcf2[i].type]++; 148 | //std::cout << "Not " << trans_type(vcf2[i].type) << " " 149 | // << vcf2[i].start.chr << " " << vcf2[i].start.pos << " " 150 | // << vcf2[i].stop.chr << " " << vcf2[i].stop.pos << std::endl; 151 | // print_entry( vcf2[i],unique_lumpy); 152 | } 153 | } 154 | fclose(unique_delly); 155 | fclose(unique_lumpy); 156 | fclose(combined); 157 | 158 | //0=DEL,1=DUP,2=INV,3=TRA 159 | std::cout << "Overlap VCF1:" << " DEL " << overlap_vcf1[0] << " DUP " << overlap_vcf1[1] << " INV " << overlap_vcf1[2] << " TRA " << overlap_vcf1[3] << std::endl; 160 | std::cout << "Overlap VCF2:" << " DEL " << overlap_vcf2[0] << " DUP " << overlap_vcf2[1] << " INV " << overlap_vcf2[2] << " TRA " << overlap_vcf2[3] << std::endl; 161 | std::cout << "Uniqe VCF1:" << " DEL " << unique_vcf1[0] << " DUP " << unique_vcf1[1] << " INV " << unique_vcf1[2] << " TRA " << unique_vcf1[3] << std::endl; 162 | std::cout << "Uniqe VCF2:" << " DEL " << unique_vcf2[0] << " DUP " << unique_vcf2[1] << " INV " << unique_vcf2[2] << " TRA " << unique_vcf2[3] << std::endl; 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/simulator/Pac_Simulator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Pac_Simulator.cpp 3 | * 4 | * Created on: Feb 1, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "Pac_Simulator.h" 9 | char ins() { 10 | switch (rand() % 4) { 11 | case 0: 12 | return 'A'; 13 | break; 14 | case 1: 15 | return 'C'; 16 | break; 17 | case 2: 18 | return 
/**
 * Write one SAM record for a simulated read.
 *
 * @param file    open, writable stream of the SAM file.
 * @param name    reference sequence name; also used as prefix of the read name.
 * @param pos     0-based start of the read; written 1-based in the POS column.
 * @param new_seq read sequence.
 * @param cigar   CIGAR string of the read.
 * @param strand  true writes FLAG 0, false writes FLAG 16.
 *
 * MAPQ is always 255, every base gets the quality character 'H' and the
 * NM tag is the constant "NM:i:4" (it is not derived from the CIGAR).
 */
void print_sam(FILE*& file, std::string name, int pos, std::string new_seq, std::string cigar, bool strand) {
	// QNAME is "<name>_<pos>".
	fprintf(file, "%s", name.c_str());
	fprintf(file, "%c", '_');
	fprintf(file, "%i", pos);
	fprintf(file, "%s", strand ? "\t0\t" : "\t16\t");
	fprintf(file, "%s", name.c_str());
	fprintf(file, "%c", '\t');
	fprintf(file, "%i", pos + 1);
	fprintf(file, "%s", "\t255\t");
	fprintf(file, "%s", cigar.c_str());
	fprintf(file, "%s", "\t*\t0\t0\t");
	fprintf(file, "%s", new_seq.c_str());
	fprintf(file, "%c", '\t');
	// One quality character per base; built as a string so that no loop
	// variable shadows the `pos` parameter.
	const std::string qualities(new_seq.size(), 'H');
	fprintf(file, "%s", qualities.c_str());
	fprintf(file, "%s", "\tNM:i:4");
	fprintf(file, "%c", '\n');
}

/**
 * Return the complement of a nucleotide.
 *
 * A/C/G/T are complemented in upper and lower case (the case is kept, so
 * soft-masked sequence stays soft-masked). Any other character, e.g. 'N',
 * is returned unchanged.
 */
char comp(char base) {
	switch (base) {
	case 'A':
		return 'T';
	case 'C':
		return 'G';
	case 'G':
		return 'C';
	case 'T':
		return 'A';
	case 'a':
		return 't';
	case 'c':
		return 'g';
	case 'g':
		return 'c';
	case 't':
		return 'a';
	}
	return base;
}

/**
 * Replace `read` by its reverse complement (in place).
 */
void rev_comp(std::string & read) {
	std::string new_read;
	new_read.reserve(read.size());
	for (std::string::reverse_iterator i = read.rbegin(); i != read.rend(); ++i) {
		new_read += comp((*i));
	}
	read.swap(new_read);
}
'\n'); 91 | 92 | std::string read = seq.substr(i, len * 2); 93 | bool strand = true; 94 | if (rand() % 100 < 50) { 95 | strand = false; 96 | rev_comp(read); 97 | } 98 | std::string new_seq=""; 99 | int tmp = 1; 100 | std::stringstream ss; 101 | 102 | //14M2D3M2D3M * 0 0 103 | //AGCTTTTCATTCTA--CGC--A 104 | // 14M1D2M1D3M * 0 0 105 | //AGCTTTTCATTCTA CG CA 106 | char mod = ' '; 107 | size_t pos = 0; 108 | while ((int)new_seq.size() < len && pos < read.size()) { 109 | if (rand() % 100 < 15 && (pos > 0 && pos < read.size() - 1)) { //why 4?? // ADD MISMATCHES 110 | if (rand() % 100 < 40) { 111 | if (mod != 'D' && mod != ' ') { 112 | ss << tmp; 113 | ss << mod; 114 | tmp = 0; 115 | } 116 | tmp++; 117 | mod = 'D'; 118 | pos++; 119 | //deletion 120 | } else { 121 | if (mod != 'I' && mod != ' ') { 122 | ss << tmp; 123 | ss << mod; 124 | tmp = 0; 125 | } 126 | //insertion 127 | tmp++; 128 | new_seq += ins(); 129 | mod = 'I'; 130 | } 131 | } else { 132 | if (mod != 'M' && mod != ' ') { 133 | ss << tmp; 134 | ss << mod; 135 | tmp = 0; 136 | } 137 | mod = 'M'; 138 | tmp++; 139 | new_seq += read[pos]; 140 | pos++; 141 | } 142 | } 143 | if (tmp - 1 > 0) { 144 | ss << tmp - 1; 145 | ss << mod; 146 | } 147 | for (size_t pos = 0; pos < new_seq.size(); pos++) { 148 | fprintf(file, "%c", new_seq[pos]); 149 | fprintf(file2, "%c", new_seq[pos]); 150 | } 151 | fprintf(file, "%s", "\n+\n"); 152 | fprintf(file2, "%s", "\n"); 153 | for (size_t pos = 0; pos < new_seq.size(); pos++) { 154 | fprintf(file, "%c", 'H'); 155 | } 156 | fprintf(file, "%c", '\n'); 157 | 158 | print_sam(sam, name, i, new_seq, ss.str(), strand); 159 | } 160 | //i += 286; 161 | i+=1000; 162 | } 163 | } 164 | 165 | void simulate_pac(std::string genome, std::string output) { 166 | size_t buffer_size = 2000000; 167 | char*buffer = new char[buffer_size]; 168 | std::ifstream myfile; 169 | 170 | myfile.open(genome.c_str(), std::ifstream::in); 171 | if (!myfile.good()) { 172 | std::cout << "Fasta Parser: could not open file: 
" << genome.c_str() << std::endl; 173 | exit(0); 174 | } 175 | 176 | myfile.getline(buffer, buffer_size); 177 | std::string name; 178 | std::string seq; 179 | std::string out = output; 180 | out += ".fq"; 181 | FILE * file; 182 | file = fopen(out.c_str(), "w"); 183 | FILE * file2; 184 | out = output; 185 | out += ".fa"; 186 | file2 = fopen(out.c_str(), "w"); 187 | FILE * sam; 188 | out = output; 189 | out += ".sam"; 190 | sam = fopen(out.c_str(), "w"); 191 | 192 | srand(time(NULL)); 193 | while (!myfile.eof()) { 194 | if (buffer[0] == '>') { 195 | if (!seq.empty()) { 196 | simulate_reads(name, seq, file, sam, file2); 197 | name.clear(); 198 | seq.clear(); 199 | } 200 | for (size_t i = 1; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n' && buffer[i] != ' '; i++) { 201 | name += buffer[i]; 202 | } 203 | } else { 204 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n' && buffer[i] != ' '; i++) { 205 | seq += buffer[i]; 206 | } 207 | } 208 | myfile.getline(buffer, buffer_size); 209 | } 210 | if (!seq.empty()) { 211 | simulate_reads(name, seq, file, sam, file2); 212 | } 213 | myfile.close(); 214 | fclose(file); 215 | fclose(sam); 216 | 217 | } 218 | -------------------------------------------------------------------------------- /src/convert/Convert_Assemblytics.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Convert_Assemblytics.cpp 3 | * 4 | * Created on: May 26, 2016 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "Convert_Assemblytics.h" 9 | 10 | 11 | void print_header_ass(std::string name, FILE *&file) { 12 | 13 | 14 | fprintf(file, "%s", "##fileformat=VCFv4.1\n"); 15 | fprintf(file, "%s", "##fileDate=20150217\n"); 16 | fprintf(file, "%s", "##ALT=\n"); 17 | fprintf(file, "%s", "##ALT=\n"); 18 | fprintf(file, "%s", "##ALT=\n"); 19 | fprintf(file, "%s", "##ALT=\n"); 20 | fprintf(file, "%s", "##ALT=\n"); 21 | 22 | fprintf(file, "%s", "##FILTER=\n"); 23 | fprintf(file, "%s", 
"##INFO=\n"); 24 | fprintf(file, "%s", "##INFO=\n"); 25 | fprintf(file, "%s", "##INFO=\n"); 26 | fprintf(file, "%s", "##INFO=\n"); 27 | fprintf(file, "%s", "##INFO=\n"); 28 | fprintf(file, "%s", "##INFO=\n"); 29 | fprintf(file, "%s", "##INFO=\n"); 30 | fprintf(file, "%s", "##INFO=\n"); 31 | fprintf(file, "%s", "##INFO=\n"); 32 | fprintf(file, "%s", "##INFO=\n"); 33 | 34 | fprintf(file, "%s", "##FORMAT=\n"); 35 | fprintf(file, "%s", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"); 36 | fprintf(file, "%s", name.c_str()); 37 | fprintf(file, "%c", '\n'); 38 | 39 | } 40 | 41 | 42 | 43 | short get_type_assemblytics(std::string type) { 44 | if(strcmp(type.c_str(), "Tandem_expansion") == 0){ 45 | return 1; 46 | }else if (strcmp(type.c_str(), "Deletion") == 0 || (strcmp(type.c_str(), "Repeat_contraction") == 0 || strcmp(type.c_str(), "Tandem_contraction") == 0)) { 47 | return 0; 48 | } else if (strcmp(type.c_str(), "Insertion") == 0 || strcmp(type.c_str(), "Repeat_expansion") == 0 ) { 49 | return 4; 50 | } else { 51 | std::cerr << "Unknown type! " << type << std::endl; 52 | } 53 | return -1; 54 | } 55 | void parse_assemblytics(std::string assemblytics,int minlen, std::vector & entries) { 56 | size_t buffer_size = 2000000; 57 | char*buffer = new char[buffer_size]; 58 | std::ifstream myfile; 59 | myfile.open(assemblytics.c_str(), std::ifstream::in); 60 | if (!myfile.good()) { 61 | std::cout << "Pindel Parser: could not open file: " << assemblytics.c_str() << std::endl; 62 | exit(0); 63 | } 64 | myfile.getline(buffer, buffer_size); //avoid header 65 | myfile.getline(buffer, buffer_size); 66 | while (!myfile.eof()) { 67 | // std::cout< minlen){ 104 | entries.push_back(tmp); 105 | } 106 | myfile.getline(buffer, buffer_size); 107 | } 108 | myfile.close(); 109 | } 110 | 111 | std::string print_entry(strvcfentry & region) { 112 | 113 | // III 5104 DEL00000002 N . 
LowQual IMPRECISE;CIEND=-305,305;CIPOS=-305,305;SVTYPE=DEL;SVMETHOD=EMBL.DELLYv0.5.9;CHR2=III;END=15991;SVLEN=10887;CT=3to5;PE=2;MAPQ=60 GT:GL:GQ:FT:RC:DR:DV:RR:RV 1/1:-12,-0.602059,0:6:LowQual:816:0:2:0:0 114 | 115 | std::ostringstream convert; // stream used for the conversion 116 | convert << region.start.chr; 117 | convert << "\t"; 118 | convert << region.start.pos; // insert the textual representation of 'Number' in the characters in the stream 119 | convert << "\t"; 120 | convert << region.sv_id; 121 | convert << "\tN\t<"; 122 | convert << trans_type(region.type); 123 | convert << ">\t.\tLowQual\tIMPRECISE;SVTYPE="; 124 | convert << trans_type(region.type); 125 | convert << ";SVMETHOD=Assemblytics;CHR2="; 126 | convert << region.stop.chr; 127 | convert << ";END="; 128 | convert << region.stop.pos; 129 | convert << ";SVLEN="; 130 | convert << region.stop.pos - region.start.pos; 131 | convert << ";PE="; 132 | convert << 1; 133 | convert << "\tGT\t1/1"; 134 | return convert.str(); 135 | } 136 | 137 | void process_Assemblytics(std::string assemblytics,int minlen, std::string output) { 138 | 139 | std::vector entries; 140 | parse_assemblytics(assemblytics,minlen, entries); 141 | FILE *file; 142 | file = fopen(output.c_str(), "w"); 143 | print_header_ass(assemblytics,file); 144 | for (size_t i = 0; i < entries.size(); i++) { 145 | fprintf(file, "%s", print_entry(entries[i]).c_str()); 146 | fprintf(file, "%c", '\n'); 147 | } 148 | 149 | fclose(file); 150 | 151 | } 152 | -------------------------------------------------------------------------------- /src/analysis_sv/Summ_mat.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Summ_mat.cpp 3 | * 4 | * Created on: Jul 5, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Summ_mat.h" 9 | 10 | void process_patterns(std::vector mat, FILE *& file) { 11 | 12 | std::map patterns; 13 | //std::map > patterns2; 14 | std::vector vec; 15 | vec.assign(mat[0].size(),0); 16 | 
/**
 * Classify one tab-separated column as '0' or '1'.
 *
 * @param buffer pointer to the first character of the column; the column
 *               ends at the next tab, newline or at the end of the string.
 * @return '0' if the text "NaN" starts anywhere inside the column,
 *         otherwise '1'.
 */
char parse_inf(char * buffer) {
	// The '\0' test matters for the last column of a line: it has no
	// trailing tab, and a line read with getline() has no '\n' either, so
	// without it the scan would run past the end of the string.
	for (size_t i = 0; buffer[i] != '\0' && buffer[i] != '\t' && buffer[i] != '\n'; i++) {
		if (strncmp("NaN", &buffer[i], 3) == 0) {
			return '0';
		}
	}
	return '1';
}
buffer[i] != '\t') { 108 | chr += buffer[i]; 109 | } 110 | if (count == 1 && buffer[i - 1] == '\t') { 111 | pos = atoi(&buffer[i]); 112 | 113 | if (pos - last_pos > window || (strcmp(chr.c_str(), last_chr.c_str()) != 0)) { 114 | //process entries; 115 | if (mat.size() > 0) { 116 | fprintf(file, "%s", last_chr.c_str()); 117 | fprintf(file, "%c", ':'); 118 | fprintf(file, "%i", (int) last_pos); 119 | fprintf(file, "%c", ':'); 120 | // fprintf(file, "%s", (*patterns.begin()).first.c_str()); 121 | // fprintf(file, "%c", ':'); 122 | // fprintf(file2, "%s", last_chr.c_str()); 123 | // fprintf(file2, "%c", ':'); 124 | // fprintf(file2, "%i", (int) last_pos); 125 | process_patterns(mat, file); 126 | 127 | vector patients; 128 | patients.assign(mat[0].size(), 0); 129 | for (size_t i = 0; i < mat.size(); i++) { 130 | for (size_t j = 0; j < mat[i].size(); j++) { 131 | if (mat[i][j] == '1') { 132 | patients[j]++; 133 | } 134 | } 135 | } 136 | 137 | // for (size_t i = 0; i < patients.size(); i++) { 138 | // fprintf(file2, "%c", '\t'); 139 | // fprintf(file2, "%i", (int) patients[i]); 140 | 141 | // } 142 | // fprintf(file2, "%c", '\n'); 143 | mat.clear(); 144 | 145 | } 146 | last_pos = pos; 147 | last_chr = chr; 148 | 149 | } 150 | } 151 | if (count > 9 && buffer[i - 1] == '\t') { 152 | pattern += parse_inf(&buffer[i]); 153 | } 154 | if (buffer[i] == '\t') { 155 | count++; 156 | } 157 | } 158 | mat.push_back(pattern); 159 | } 160 | myfile.getline(buffer, buffer_size); 161 | } 162 | fclose(file); 163 | } 164 | 165 | void summarize_svs_table_window_stream(int window, std::string output) { 166 | 167 | FILE *file; 168 | file = fopen(output.c_str(), "w"); 169 | 170 | // FILE* file2; 171 | // std::string out = output; 172 | // out += "perpatient"; 173 | // file2 = fopen(out.c_str(), "w"); 174 | 175 | int last_pos = 0; 176 | 177 | std::vector mat; 178 | std::string last_chr = ""; 179 | while (!cin.eof()) { 180 | std::string buffer; 181 | getline(cin, buffer); 182 | if (!cin.fail()) { 
183 | if (buffer[0] != '#') { 184 | int pos = 0; 185 | std::string chr; 186 | int count = 0; 187 | std::string pattern; 188 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 189 | if (count == 0 && buffer[i] != '\t') { 190 | chr += buffer[i]; 191 | } 192 | if (count == 1 && buffer[i - 1] == '\t') { 193 | pos = atoi(&buffer[i]); 194 | 195 | if (pos - last_pos > window || (strcmp(chr.c_str(), last_chr.c_str()) != 0)) { 196 | //process entries; 197 | if (mat.size() > 0) { 198 | fprintf(file, "%s", last_chr.c_str()); 199 | fprintf(file, "%c", ':'); 200 | fprintf(file, "%i", (int) last_pos); 201 | fprintf(file, "%c", ':'); 202 | // fprintf(file, "%s", (*patterns.begin()).first.c_str()); 203 | // fprintf(file, "%c", ':'); 204 | 205 | // fprintf(file2, "%s", last_chr.c_str()); 206 | // fprintf(file2, "%c", ':'); 207 | // fprintf(file2, "%i", (int) last_pos); 208 | 209 | process_patterns(mat, file); 210 | 211 | /*vector patients; 212 | patients.assign(mat[0].size(), 0); 213 | for (size_t i = 0; i < mat.size(); i++) { 214 | for (size_t j = 0; j < mat[i].size(); j++) { 215 | if (mat[i][j] == '1') { 216 | patients[j]++; 217 | } 218 | } 219 | }*/ 220 | mat.clear(); 221 | 222 | } 223 | last_pos = pos; 224 | last_chr = chr; 225 | 226 | } 227 | } 228 | if (count == 7 && strncmp(&buffer[i], "SUPP_VEC=", 9) == 0) { 229 | std::string tmp = buffer.substr(i + 9); 230 | std::size_t found = tmp.find_first_of(";"); 231 | pattern = tmp.substr(0, found); 232 | } 233 | 234 | if(count>8 && buffer[i-1]=='\t'){ 235 | 236 | if(buffer[i]=='1' || buffer[i+2]=='1'){ 237 | pattern+='1'; 238 | }else{ 239 | pattern+='0'; 240 | } 241 | } 242 | 243 | if (buffer[i] == '\t') { 244 | count++; 245 | } 246 | } 247 | // cout<<"pat "< . 
LowQual IMPRECISE;CIEND=-305,305;CIPOS=-305,305;SVTYPE=DEL;SVMETHOD=EMBL.DELLYv0.5.9;CHR2=III;END=15991;SVLEN=10887;CT=3to5;PE=2;MAPQ=60 GT:GL:GQ:FT:RC:DR:DV:RR:RV 1/1:-12,-0.602059,0:6:LowQual:816:0:2:0:0 30 | 31 | std::ostringstream convert; // stream used for the conversion 32 | convert << region.start.chr; 33 | convert << "\t"; 34 | convert << region.start.pos; // insert the textual representation of 'Number' in the characters in the stream 35 | convert << "\t"; 36 | convert << trans_type(region.type); 37 | convert << "00"; 38 | convert << "Bionanom\tN\t<"; 39 | convert << trans_type(region.type); 40 | convert << ">\t.\tLowQual\tIMPRECISE;SVTYPE="; 41 | convert << trans_type(region.type); 42 | convert << ";CHR2="; 43 | convert << region.stop.chr; 44 | convert << ";END="; 45 | convert << region.stop.pos; 46 | convert << ";SVLEN="; 47 | convert << region.sv_len; 48 | convert << ";PE="; 49 | convert << 1; 50 | convert << "\tGT:GL:GQ:FT:RC:DR:DV:RR:RV\t"; 51 | std::stringstream s; 52 | s << "1/1:0,0,0:0:PASS:0:0:"; 53 | s << 1; 54 | s << ":0:0"; 55 | //std::cout<& entries) { 60 | size_t buffer_size = 2000000; 61 | char*buffer = new char[buffer_size]; 62 | std::ifstream myfile; 63 | myfile.open(bionano.c_str(), std::ifstream::in); 64 | if (!myfile.good()) { 65 | std::cout << "Bionano Parser: could not open file: " << bionano.c_str() << std::endl; 66 | exit(0); 67 | } 68 | myfile.getline(buffer, buffer_size); //avoid header 69 | myfile.getline(buffer, buffer_size); 70 | while (buffer[0] == '#' && !myfile.eof()) { 71 | myfile.getline(buffer, buffer_size); 72 | } 73 | 74 | while (!myfile.eof()) { 75 | int count = 0; 76 | strvcfentry tmp; 77 | std::string type; 78 | int query_start = 0; 79 | int query_stop = 0; 80 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 81 | if (count == 2 && buffer[i] != '\t') { 82 | tmp.start.chr += buffer[i]; 83 | } 84 | if (count == 3 && buffer[i] != '\t') { 85 | tmp.stop.chr += buffer[i]; 86 | } 87 | 
if (count == 4 && buffer[i - 1] == '\t') { 88 | query_start = atoi(&buffer[i]); 89 | } 90 | if (count == 5 && buffer[i - 1] == '\t') { 91 | query_stop = atoi(&buffer[i]); 92 | } 93 | if (count == 6 && buffer[i - 1] == '\t') { 94 | tmp.start.pos = atoi(&buffer[i]); 95 | } 96 | if (count == 7 && buffer[i - 1] == '\t') { 97 | tmp.stop.pos = atoi(&buffer[i]); 98 | } 99 | if (count == 9 && buffer[i] != '\t') { 100 | type += buffer[i]; 101 | } 102 | if (count == 10 && buffer[i - 1] == '\t') { 103 | tmp.type = trans_type_bio(type); 104 | break; 105 | } 106 | if (buffer[i] == '\t') { 107 | count++; 108 | } 109 | } 110 | double factor = (query_stop - query_start) / 2; 111 | if (tmp.type == 0) { 112 | tmp.start.pos = tmp.start.pos + factor; 113 | tmp.stop.pos = tmp.stop.pos - factor; 114 | tmp.sv_len=tmp.stop.pos-tmp.start.pos; 115 | } else if (tmp.type == 4) { 116 | tmp.sv_len=query_start - query_stop + tmp.stop.pos - tmp.start.pos; 117 | 118 | //tmp.sv_len=abs((query_start - query_stop) - (tmp.start.pos - tmp.stop.pos)); 119 | tmp.start.pos=(query_stop + query_start)/2; 120 | tmp.stop.pos = tmp.start.pos+1; 121 | } 122 | entries.push_back(tmp); 123 | myfile.getline(buffer, buffer_size); 124 | } 125 | myfile.close(); 126 | 127 | } 128 | 129 | void parse_GC(std::string bionano, std::vector& entries) { 130 | size_t buffer_size = 2000000; 131 | char*buffer = new char[buffer_size]; 132 | std::ifstream myfile; 133 | myfile.open(bionano.c_str(), std::ifstream::in); 134 | if (!myfile.good()) { 135 | std::cout << "Bionano Parser: could not open file: " << bionano.c_str() << std::endl; 136 | exit(0); 137 | } 138 | myfile.getline(buffer, buffer_size); //avoid header 139 | myfile.getline(buffer, buffer_size); 140 | while (buffer[0] == '#' && !myfile.eof()) { 141 | myfile.getline(buffer, buffer_size); 142 | } 143 | 144 | while (!myfile.eof()) { 145 | if (buffer[0] != '#' && buffer[0] != '>') { 146 | int count = 0; 147 | strvcfentry tmp; 148 | std::string type = ""; 149 | tmp.type = -1; 
/**
 * True for the characters that end a value inside the INFO column:
 * ';' (next key), tab (next column) and the string terminator.
 */
static bool is_info_value_end(char c) {
	return c == ';' || c == '\t' || c == '\0';
}

/**
 * Translate a support vector into the set of supporting samples.
 *
 * @param vec   pointer to the first character of the vector (the text that
 *              follows "SUPP_VEC="); character i refers to names[i].
 * @param names sample names in column order.
 * @return map holding names[i] -> true for every position i whose
 *         character is '1'.
 *
 * Scanning stops at ';', at a tab and at the end of the string, and
 * positions without a matching entry in `names` are ignored.
 */
std::map<std::string, bool> parse_support(char * vec, std::vector<std::string> names) {
	std::map<std::string, bool> accessions;
	for (size_t i = 0; !is_info_value_end(vec[i]); i++) {
		if (vec[i] == '1' && i < names.size()) {
			accessions[names[i]] = true;
		}
	}
	return accessions;
}

/**
 * Return the value that starts at `vec` and ends before the next ';',
 * tab or the end of the string.
 */
std::string parse_seq(char * vec) {
	size_t len = 0;
	while (!is_info_value_end(vec[len])) {
		len++;
	}
	return std::string(vec, len);
}

/**
 * Collect the gene names of one annotation value.
 *
 * The value is scanned up to the next ';', tab or end of string. After
 * each "Name=" the following characters are collected up to and including
 * the next ','.
 *
 * @return the collected names joined by ','; a trailing ',' is removed.
 *         Empty if the value contains no "Name=".
 */
std::string parse_gene_name(char * vec) {
	size_t i = 0;
	std::string name = "";
	bool parse = false;
	while (!is_info_value_end(vec[i])) {
		if (strncmp("Name=", &vec[i], 5) == 0) {
			i = i + 5;
			parse = true;
			if (is_info_value_end(vec[i])) {
				// "Name=" directly followed by the end of the value:
				// do not consume the terminator.
				break;
			}
		}
		if (parse) {
			name += vec[i];
		}
		if (vec[i] == ',') {
			parse = false;
		}
		i++;
	}
	// Only drop the last character when it really is the separator.
	if (!name.empty() && name[name.size() - 1] == ',') {
		name.erase(name.size() - 1);
	}
	return name;
}
/**
 * Read a sample-to-population table.
 *
 * Each line of `pop_file` is tab separated: the first column is the sample
 * name, every further non-empty column is a population the sample belongs
 * to. Spaces inside a population name are replaced by '_'.
 *
 * @param pop_file path of the table.
 * @return map from population name to the samples listed for it, in file
 *         order.
 *
 * If the file cannot be opened a message is printed to stdout and the
 * process exits via exit(0).
 */
std::map<std::string, std::vector<std::string> > parse_populations(std::string pop_file) {
	std::map<std::string, std::vector<std::string> > pop;
	std::ifstream myfile;
	myfile.open(pop_file.c_str(), std::ifstream::in);
	if (!myfile.good()) {
		std::cout << "Pop Parser: could not open file: " << pop_file.c_str() << std::endl;
		exit(0);
	}

	// Reading into a std::string removes the fixed-size heap buffer (which
	// was never freed) and also returns a last line without a newline.
	std::string line;
	while (std::getline(myfile, line)) {
		std::string sample = "";
		std::string pop_id = "";
		int count = 0;
		for (size_t i = 0; i < line.size() && line[i] != '\0'; i++) {
			const char c = line[i];
			if (c == '\t') {
				// End of a column: store the population collected so far.
				if (!pop_id.empty()) {
					pop[pop_id].push_back(sample);
					pop_id = "";
				}
				count++;
			} else if (count == 0) {
				sample += c;
			} else if (c == ' ') {
				pop_id += "_";
			} else {
				pop_id += c;
			}
		}
		// The last column has no closing tab.
		if (!pop_id.empty()) {
			pop[pop_id].push_back(sample);
		}
	}
	myfile.close();

	return pop;
}
224 | int count = 0; 225 | std::string name = ""; 226 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 227 | if (count > 8 && buffer[i] != '\t') { 228 | name += buffer[i]; 229 | } 230 | if (buffer[i] == '\t') { 231 | if (!name.empty()) { 232 | names.push_back(name); 233 | name.clear(); 234 | } 235 | count++; 236 | } 237 | } 238 | if (!name.empty()) { 239 | names.push_back(name); 240 | name.clear(); 241 | } 242 | cout << "Names: " << names.size() << endl; 243 | } else if (buffer[0] != '#') { 244 | std::string gene_name = "NA"; 245 | sv_simple_str tmp = parse_line_sv(buffer, buffer_size, names, gene_name); 246 | print_gene_sv(gene_name, tmp, file2, names.size(), population); 247 | } 248 | myfile.getline(buffer, buffer_size); 249 | } 250 | myfile.close(); 251 | fclose(file2); 252 | } 253 | -------------------------------------------------------------------------------- /src/Extract_Seq.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Extract_Seq.cpp 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: fsedlaze 6 | */ 7 | 8 | #include "Extract_Seq.h" 9 | std::map parse_ref(std::string reference_file) { 10 | std::map genome; 11 | 12 | size_t buffer_size = 2000000; 13 | char*buffer = new char[buffer_size]; 14 | std::ifstream myfile; 15 | 16 | myfile.open(reference_file.c_str(), std::ifstream::in); 17 | if (!myfile.good()) { 18 | std::cout << "Annotation Parser: could not open file: " 19 | << reference_file.c_str() << std::endl; 20 | exit(0); 21 | } 22 | 23 | myfile.getline(buffer, buffer_size); 24 | std::string seq; 25 | std::string name; 26 | while (!myfile.eof()) { 27 | if (buffer[0] == '>') { 28 | if (!seq.empty()) { 29 | genome[name] = seq; 30 | std::cout< max_len) { 55 | start = (max_len - len) - 1; 56 | } 57 | } 58 | std::string create_name(strvcfentry entry, std::string prefix) { 59 | std::ostringstream tmp; 60 | tmp << ">"; 61 | tmp << prefix; 62 | tmp << "_"; 63 | tmp << 
/**
 * Return `seq` with its characters in reverse order.
 *
 * The characters themselves are not changed (no complementing).
 */
std::string rev(std::string seq) {
	// Construct the result directly from the reversed range.
	return std::string(seq.rbegin(), seq.rend());
}
120 | */ 121 | int region = (len / 2); 122 | 123 | fprintf(file, "%s", create_name(entry, "ref").c_str()); 124 | fprintf(file, "%c", '\n'); 125 | 126 | int start = entry.start.pos - region; 127 | check_coords(start, len, ref[entry.start.chr].size()); 128 | fprintf(file, "%s", ref[entry.start.chr].substr(start, len).c_str()); 129 | fprintf(file, "%c", '\n'); 130 | fprintf(file, "%s", create_name(entry, "alt").c_str()); 131 | fprintf(file, "%c", '\n'); 132 | 133 | start = entry.start.pos - region; 134 | check_coords(start, region, ref[entry.start.chr].size()); 135 | fprintf(file, "%s", ref[entry.start.chr].substr(start, region).c_str()); 136 | 137 | start = entry.stop.pos - region; 138 | check_coords(start, region, ref[entry.stop.chr].size()); 139 | fprintf(file, "%s", rev(ref[entry.stop.chr].substr(start, region)).c_str()); 140 | fprintf(file, "%c", '\n'); 141 | } 142 | 143 | void extract_TRA(strvcfentry entry, std::map &ref, 144 | int len, FILE *&file) { 145 | /* 146 | #test for ref: 147 | ref ....1234.... [another place] ,,,,,,,, 148 | ...>..<..... 149 | 150 | #test for alt: 151 | alt ....12,,,,,, [another place] ,,,,34.... 
152 | ...>..<,,,,, 153 | */ 154 | 155 | int region = (len / 2); 156 | 157 | fprintf(file, "%s", create_name(entry, "ref").c_str()); 158 | fprintf(file, "%c", '\n'); 159 | int start = entry.start.pos - region; 160 | check_coords(start, len, ref[entry.start.chr].size()); 161 | fprintf(file, "%s", ref[entry.start.chr].substr(start, len).c_str()); 162 | fprintf(file, "%c", '\n'); 163 | fprintf(file, "%s", create_name(entry, "alt").c_str()); 164 | fprintf(file, "%c", '\n'); 165 | 166 | start = entry.start.pos - region; 167 | check_coords(start, len, ref[entry.start.chr].size()); 168 | fprintf(file, "%s", ref[entry.start.chr].substr(start, region).c_str()); 169 | 170 | start = entry.stop.pos - region; 171 | check_coords(start, len, ref[entry.stop.chr].size()); 172 | fprintf(file, "%s", ref[entry.stop.chr].substr(start, region).c_str()); 173 | fprintf(file, "%c", '\n'); 174 | } 175 | 176 | void extract_DUP(strvcfentry entry, std::map &ref, 177 | int len, FILE *&file) { 178 | std::cout << entry.start.chr << " " << entry.start.pos << " " 179 | << entry.stop.chr << " " << entry.stop.pos << std::endl; 180 | 181 | if (entry.stop.pos - entry.start.pos < 500) { 182 | /* 183 | * #duplications 184 | ref ....1234.... 185 | alt ....12341234.... 186 | 187 | #test for ref and alt: for short DUPs (up to ~500nt) 188 | ref .....1234..... 189 | ...>......<... 190 | */ 191 | 192 | //TODO think about the region vs. two seq for breakpoints? 
193 | fprintf(file, "%s", create_name(entry, "Dup_small_1").c_str()); 194 | fprintf(file, "%c", '\n'); 195 | 196 | int start = entry.start.pos - len; 197 | check_coords(start, len, ref[entry.start.chr].size()); 198 | fprintf(file, "%s", ref[entry.start.chr].substr(start, len).c_str()); 199 | fprintf(file, "%c", '\n'); 200 | fprintf(file, "%s", create_name(entry, "Dup_small_2").c_str()); 201 | fprintf(file, "%c", '\n'); 202 | 203 | start = entry.stop.pos; 204 | check_coords(start, len, ref[entry.stop.chr].size()); 205 | fprintf(file, "%s", ref[entry.stop.chr].substr(start, len).c_str()); 206 | fprintf(file, "%c", '\n'); 207 | 208 | } else { 209 | /* 210 | #for for longer dups (> 500nt) 211 | test for alt: 212 | alt ....12341234.... 213 | var ......>..<...... 214 | 215 | #in the ref this should not make a product: 216 | ref .....1234..... 217 | ......<>...... 218 | * 219 | */ 220 | //half into the Dup: 221 | int pos = entry.start.pos + ((entry.stop.pos - entry.start.pos) / 2); 222 | std::cout << pos << " " << len << std::endl; 223 | fprintf(file, "%s", create_name(entry, "Dup_large_1").c_str()); 224 | fprintf(file, "%c", '\n'); 225 | int start = pos - len; 226 | check_coords(start, len, ref[entry.start.chr].size()); 227 | fprintf(file, "%s", ref[entry.start.chr].substr(start, len).c_str()); 228 | fprintf(file, "%c", '\n'); 229 | 230 | fprintf(file, "%s", create_name(entry, "Dup_large_2").c_str()); 231 | fprintf(file, "%c", '\n'); 232 | start = pos; 233 | check_coords(start, len, ref[entry.stop.chr].size()); 234 | fprintf(file, "%s", ref[entry.stop.chr].substr(start, len).c_str()); 235 | fprintf(file, "%c", '\n'); 236 | 237 | } 238 | 239 | } 240 | void extract_breakpoint_seq(std::string vcf_file, std::string reference_file, 241 | int len, std::string outputfile) { 242 | 243 | std::vector svs = parse_vcf(vcf_file,0); 244 | std::map ref = parse_ref(reference_file); 245 | std::cout<<"REF "< 0 && (dv + rv) / (dr + rr) < min_alt_ref_ratio) { 50 | result.not_valid++; 51 | } 
else if (gq > max_genotype) { 52 | result.not_valid++; 53 | } else { 54 | //std::cout<<"valid"< ignore_regions) { 61 | //TODO: compare to bed file to filter out regions! 62 | for (size_t i = 0; i < ignore_regions.size(); i++) { 63 | if (strcmp(region.start.chr.c_str(), ignore_regions[i].start.chr.c_str()) == 0) { 64 | if (region.start.pos > ignore_regions[i].start.pos && region.start.pos < ignore_regions[i].stop.pos) { 65 | return false; 66 | } 67 | } 68 | 69 | if (strcmp(region.stop.chr.c_str(), ignore_regions[i].start.chr.c_str()) == 0) { 70 | if (region.stop.pos > ignore_regions[i].start.pos && region.stop.pos < ignore_regions[i].stop.pos) { 71 | return false; 72 | } 73 | } 74 | 75 | } 76 | 77 | return true; 78 | } 79 | 80 | //parse the bed file that defines regions that should be ignored: 81 | std::vector parse_bed(std::string filename) { 82 | std::vector ignore_regions; 83 | size_t buffer_size = 2000000; 84 | char*buffer = new char[buffer_size]; 85 | std::ifstream myfile; 86 | myfile.open(filename.c_str(), std::ifstream::in); 87 | if (!myfile.good()) { 88 | std::cout << "BED Parser: could not open file: " << filename.c_str() << std::endl; 89 | return ignore_regions; 90 | } 91 | myfile.getline(buffer, buffer_size); 92 | while (!myfile.eof()) { 93 | int count = 0; 94 | strregion tmp; 95 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 96 | if (count == 0 && buffer[i] != '\t') { 97 | tmp.start.chr += buffer[i]; 98 | tmp.stop.chr += buffer[i]; 99 | } 100 | if (count == 1 && buffer[i - 1] == '\t') { 101 | tmp.start.pos = atoi(&buffer[i]); 102 | } 103 | if (count == 2 && buffer[i - 1] == '\t') { 104 | tmp.stop.pos = atoi(&buffer[i]); 105 | break; 106 | } 107 | if (buffer[i] == '\t') { 108 | count++; 109 | } 110 | } 111 | ignore_regions.push_back(tmp); 112 | myfile.getline(buffer, buffer_size); 113 | } 114 | 115 | myfile.close(); 116 | return ignore_regions; 117 | } 118 | 119 | void filter_vcf(std::string vcf_file, std::string 
genomic_regions,int min_size, int max_size,double min_AF,int min_reads, std::string outputvcf) { 120 | 121 | if(max_size!=-1){ 122 | std::cerr<<"Warning: Max size threshold set, TRA wont be reported as their size cannot be assesst."< ignore_regions; 125 | if(strncmp(genomic_regions.c_str(),"NA",2)!=0){ 126 | ignore_regions = parse_bed(genomic_regions); 127 | } 128 | //parse vcf file and write vcf file in one go: 129 | 130 | std::vector names; 131 | std::string buffer; 132 | std::ifstream myfile; 133 | 134 | myfile.open(vcf_file.c_str(), std::ifstream::in); 135 | if (!myfile.good()) { 136 | std::cout << "Annotation Parser: could not open file: " << vcf_file.c_str() << std::endl; 137 | exit(0); 138 | } 139 | FILE *file; 140 | file = fopen(outputvcf.c_str(), "w"); 141 | 142 | getline(myfile,buffer); 143 | int deleted = 0; 144 | while (!myfile.eof()) { 145 | if (buffer[0] == '#') { //write header info: 146 | fprintf(file, "%s", buffer.c_str()); 147 | fprintf(file, "%c", '\n'); 148 | } else { 149 | strvcfentry sv= parse_vcf_entry(buffer); 150 | int size=(min_size)+1; 151 | 152 | if(sv.type!=3 && sv.type!=5 && sv.type!=-1){ 153 | size=sv.sv_len; 154 | } 155 | //std::cout<min_size && (size< max_size || max_size==-1))){ 157 | // std::cout<<"size_pass: "<min_AF) ) && (ignore_regions.empty() || pass_filter(sv, ignore_regions) )) && ( (size>min_size && (size< max_size || max_size==-1)) && (sv.num_reads.second == -1 || sv.num_reads.second >=min_reads)) ) { 161 | fprintf(file, "%s", buffer.c_str()); 162 | fprintf(file, "%c", '\n'); 163 | } else { 164 | deleted++; 165 | } 166 | } 167 | getline(myfile,buffer); 168 | } 169 | std::cout << "SVs ignored: " << deleted << std::endl; 170 | myfile.close(); 171 | fclose(file); 172 | } 173 | 174 | void filter_vcf_sniffles(std::string vcf_file, int min_lenght, std::string outputvcf) { 175 | 176 | std::vector names; 177 | size_t buffer_size = 2000000; 178 | char*buffer = new char[buffer_size]; 179 | std::ifstream myfile; 180 | 181 | 
myfile.open(vcf_file.c_str(), std::ifstream::in); 182 | if (!myfile.good()) { 183 | std::cout << "Annotation Parser: could not open file: " << vcf_file.c_str() << std::endl; 184 | exit(0); 185 | } 186 | FILE *file; 187 | file = fopen(outputvcf.c_str(), "w"); 188 | 189 | myfile.getline(buffer, buffer_size); 190 | int deleted = 0; 191 | while (!myfile.eof()) { 192 | if (buffer[0] == '#') { //write header info: 193 | fprintf(file, "%s", buffer); 194 | fprintf(file, "%c", '\n'); 195 | } else { 196 | int count = 0; 197 | int len = 0; 198 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 199 | if (count == 7 && strncmp(&buffer[i], "SVLEN=NA", 8) == 0) { 200 | 201 | len = min_lenght * 2; 202 | break; 203 | } else if (count == 7 && strncmp(&buffer[i], "SVLEN=", 6) == 0) { 204 | len = atoi(&buffer[i + 6]); 205 | //std::cout<<"len "<= min_lenght) { 215 | fprintf(file, "%s", buffer); 216 | fprintf(file, "%c", '\n'); 217 | } else { 218 | deleted++; 219 | } 220 | } 221 | myfile.getline(buffer, buffer_size); 222 | } 223 | std::cout << "WE deleted: " << deleted << std::endl; 224 | myfile.close(); 225 | fclose(file); 226 | } 227 | struct sv_trio { 228 | bool son; 229 | bool mother; 230 | bool father; 231 | }; 232 | double report_norm(double e1, double e2) { 233 | if (e2 == 0) { 234 | return 0; 235 | } 236 | return e1;//round((e1 / e2)* 10000)/100 ; 237 | } 238 | void summarize_paper_gaib(std::string venn_file) { 239 | size_t buffer_size = 2000000; 240 | char*buffer = new char[buffer_size]; 241 | std::ifstream myfile; 242 | 243 | myfile.open(venn_file.c_str(), std::ifstream::in); 244 | if (!myfile.good()) { 245 | std::cout << "Annotation Parser: could not open file: " << venn_file.c_str() << std::endl; 246 | exit(0); 247 | } 248 | myfile.getline(buffer, buffer_size); 249 | myfile.getline(buffer, buffer_size); 250 | 251 | std::map > trio_summary; 252 | std::vector tmp; 253 | tmp.resize(8, 0); 254 | trio_summary["DEL"] = tmp; 255 | 
trio_summary["DUP"] = tmp; 256 | trio_summary["INS"] = tmp; 257 | trio_summary["INV"] = tmp; 258 | trio_summary["TRA"] = tmp; 259 | 260 | while (!myfile.eof()) { 261 | int count = 0; 262 | 263 | std::string key = ""; 264 | sv_trio trio; 265 | trio.father = false; 266 | trio.mother = false; 267 | trio.son = false; 268 | 269 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 270 | if (count == 0 && i < 3) { 271 | key += buffer[i]; 272 | } 273 | if (count == 1 && buffer[i - 1] == '\t') { 274 | trio.son = (bool) (buffer[i] == '1'); 275 | } 276 | if (count == 2 && buffer[i - 1] == '\t') { 277 | trio.father = (bool) (buffer[i] == '1'); 278 | } 279 | if (count == 3 && buffer[i - 1] == '\t') { 280 | trio.mother = (bool) (buffer[i] == '1'); 281 | break; 282 | } 283 | if (buffer[i] == '\t') { 284 | count++; 285 | } 286 | } 287 | if (trio_summary.find(key) == trio_summary.end()) { 288 | trio_summary[key] = tmp; 289 | } 290 | trio_summary[key][0]++; // total 291 | if (trio.son) { 292 | trio_summary[key][1]++; // son 293 | } 294 | if (trio.father) { 295 | trio_summary[key][2]++; // father 296 | } 297 | if (trio.mother) { 298 | trio_summary[key][3]++; // mother 299 | } 300 | if (trio.son && (trio.mother && trio.father)) { 301 | trio_summary[key][4]++; 302 | } 303 | if (trio.son && (trio.mother || trio.father)) { 304 | trio_summary[key][5]++; 305 | } 306 | if (!trio.son && (trio.mother && trio.father)) { 307 | trio_summary[key][6]++; 308 | } 309 | if (trio.son && (!trio.mother && !trio.father)) { 310 | trio_summary[key][7]++; 311 | } 312 | 313 | myfile.getline(buffer, buffer_size); 314 | } 315 | std::cout << "DEL/DUP/INS/INV/TRA" << std::endl; 316 | std::cout << trio_summary["DEL"][0] << '/' << trio_summary["DUP"][0] << '/' << trio_summary["INS"][0] << '/' << trio_summary["INV"][0] << '/' << trio_summary["TRA"][0] << '\t'; 317 | std::cout << trio_summary["DEL"][1] << '/' << trio_summary["DUP"][1] << '/' << trio_summary["INS"][1] << '/' << 
trio_summary["INV"][1] << '/' << trio_summary["TRA"][1] << '\t'; 318 | std::cout << trio_summary["DEL"][2] << '/' << trio_summary["DUP"][2] << '/' << trio_summary["INS"][2] << '/' << trio_summary["INV"][2] << '/' << trio_summary["TRA"][2] << '\t'; 319 | std::cout << trio_summary["DEL"][3] << '/' << trio_summary["DUP"][3] << '/' << trio_summary["INS"][3] << '/' << trio_summary["INV"][3] << '/' << trio_summary["TRA"][3] << '\t'; 320 | 321 | std::cout << report_norm(trio_summary["DEL"][4], trio_summary["DEL"][0]) << '/' << report_norm(trio_summary["DUP"][4], trio_summary["DUP"][0]) << '/' << report_norm(trio_summary["INS"][4], trio_summary["INS"][0]) << '/' << report_norm(trio_summary["INV"][4], trio_summary["INV"][0]) << '/'<< report_norm(trio_summary["TRA"][4], trio_summary["TRA"][0]) << '\t'; 322 | std::cout << report_norm(trio_summary["DEL"][5], trio_summary["DEL"][0]) << '/' << report_norm(trio_summary["DUP"][5], trio_summary["DUP"][0]) << '/' << report_norm(trio_summary["INS"][5], trio_summary["INS"][0]) << '/' << report_norm(trio_summary["INV"][5], trio_summary["INV"][0]) << '/'<< report_norm(trio_summary["TRA"][5], trio_summary["TRA"][0]) << '\t'; 323 | std::cout << report_norm(trio_summary["DEL"][6], trio_summary["DEL"][0]) << '/' << report_norm(trio_summary["DUP"][6], trio_summary["DUP"][0]) << '/' << report_norm(trio_summary["INS"][6], trio_summary["INS"][0]) << '/' << report_norm(trio_summary["INV"][6], trio_summary["INV"][0]) << '/'<< report_norm(trio_summary["TRA"][6], trio_summary["TRA"][0]) << '\t'; 324 | std::cout << report_norm(trio_summary["DEL"][7], trio_summary["DEL"][0]) << '/' << report_norm(trio_summary["DUP"][7], trio_summary["DUP"][0]) << '/' << report_norm(trio_summary["INS"][7], trio_summary["INS"][0]) << '/' << report_norm(trio_summary["INV"][7], trio_summary["INV"][0]) << '/'<< report_norm(trio_summary["TRA"][7], trio_summary["TRA"][0]) << std::endl; 325 | } 326 | 327 | 
-------------------------------------------------------------------------------- /src/simulator/Sim_reads.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Nanopore_sim.cpp 3 | * 4 | * Created on: May 30, 2017 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Sim_reads.h" 9 | std::map parse_genome(std::string genome_file, int min_length) { 10 | size_t buffer_size; 11 | char *buffer; 12 | ifstream myfile; 13 | 14 | myfile.open(genome_file.c_str(), ifstream::in); 15 | if (!myfile.good()) { 16 | cout << "Fasta Parser: could not open file: " << genome_file.c_str() << endl; 17 | exit(0); 18 | } 19 | 20 | buffer_size = 20000; 21 | buffer = new char[buffer_size]; 22 | 23 | myfile.getline(buffer, buffer_size); 24 | string seq = ""; 25 | string name; 26 | 27 | // std::default_random_engine generator; 28 | // std::poisson_distribution distribution(2); 29 | 30 | std::map genome; 31 | while (!myfile.eof()) { 32 | if (buffer[0] == '>') { 33 | if ((int) seq.size() > min_length) { 34 | 35 | // int border=distribution(generator); 36 | //cout<<"max copies: "< min_length) { 61 | stringstream ss; 62 | ss << name; 63 | // ss << i; 64 | // std::cout< 1.0); 141 | { 142 | double d = sqrt(-2.0 * log(r) / r); 143 | double n1 = x * d; 144 | n2 = y * d; 145 | double result = n1 * stddev + mean; 146 | n2_cached = 1; 147 | return result; 148 | } 149 | } else { 150 | n2_cached = 0; 151 | return n2 * stddev + mean; 152 | } 153 | } 154 | 155 | std::vector parse_error_profile(std::string error_profile_file) { 156 | char *buffer; 157 | ifstream myfile; 158 | 159 | myfile.open(error_profile_file.c_str(), ifstream::in); 160 | if (!myfile.good()) { 161 | cout << "Fasta Parser: could not open file: " << error_profile_file.c_str() << endl; 162 | exit(0); 163 | } 164 | 165 | size_t buffer_size = 2000; 166 | buffer = new char[buffer_size]; 167 | 168 | myfile.getline(buffer, buffer_size); //avoid header 169 | myfile.getline(buffer, buffer_size); 170 | 171 | 
std::vector error_profile; 172 | while (!myfile.eof()) { 173 | int count = 0; 174 | read_position tmp; 175 | for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 176 | if (count == 1 && buffer[i - 1] == '\t') { 177 | tmp.total = atof(&buffer[i]); 178 | } 179 | if (count == 2 && buffer[i - 1] == '\t') { 180 | tmp.match = atof(&buffer[i]); 181 | } 182 | if (count == 3 && buffer[i - 1] == '\t') { 183 | tmp.mismatch = atof(&buffer[i]); 184 | } 185 | if (count == 4 && buffer[i - 1] == '\t') { 186 | tmp.ins = atof(&buffer[i]); 187 | } 188 | if (count == 5 && buffer[i - 1] == '\t') { 189 | tmp.del = atof(&buffer[i]); 190 | } 191 | if (buffer[i] == '\t') { 192 | count++; 193 | } 194 | } 195 | error_profile.push_back(tmp); 196 | myfile.getline(buffer, buffer_size); 197 | } 198 | 199 | return error_profile; 200 | } 201 | 202 | void simulate_reads(std::string genome_file, std::string error_profile_file, int coverage, std::string output) { 203 | srand(time(NULL)); 204 | 205 | std::vector error_profile = parse_error_profile(error_profile_file); 206 | int avg_readlen = 0; 207 | ///int max_size = error_profile.size() - 1; 208 | for (size_t i = 0; i < error_profile.size() && error_profile[i].total < 0.6; i++) { 209 | avg_readlen++; 210 | } 211 | 212 | std::map genome = parse_genome(genome_file, avg_readlen * 3); 213 | std::cout << "\tParsing done: " << genome.size() << " chrs " << std::endl; 214 | long genome_size = 0; 215 | int hits = 0; 216 | for (std::map::iterator i = genome.begin(); i != genome.end(); i++) { 217 | hits++; 218 | genome_size += (*i).second.size(); 219 | } 220 | 221 | std::cout << "\tAVG read length: " << avg_readlen << std::endl; 222 | int num_reads = ((double) genome_size / (double) avg_readlen) * (double) coverage; 223 | std::cout << "\tNum of reads: " << num_reads << std::endl; 224 | FILE *file; 225 | file = fopen(output.c_str(), "w"); 226 | double prev = 0; 227 | // cout << "\t\tReads simulated: " << prev << "%" << std::endl; 
228 | 229 | std::map cov_reported; 230 | 231 | for (std::map::iterator i = genome.begin(); i != genome.end(); i++) { 232 | 233 | cov_reported[(*i).first] = ((long) (*i).second.size()) * coverage; 234 | if (cov_reported[(*i).first] < 0) { 235 | cerr << "Error in genome size * cov recording. Apport. Please report this!" << endl; 236 | exit(1); 237 | } 238 | } 239 | 240 | std::map::iterator current_chr = genome.begin(); // we just go one chr after another. 241 | 242 | for (int i = 0; i < num_reads && current_chr != genome.end(); i++) { //start to simulate reads: 243 | double bp = (rand() % 1000000); 244 | 245 | bp = bp / 1000000; 246 | size_t size = 0; 247 | while (size < error_profile.size()) { //1: Pick a read size based on the profile 248 | if (size > 10 && bp < error_profile[size].total) { 249 | break; 250 | } 251 | size++; 252 | } 253 | 254 | //cout << size << ": " << bp << endl; 255 | 256 | //std::cout<<"Read init: "< (*current_chr).second.size()) { 262 | current_chr++; 263 | cout << "De: skip" << endl; 264 | } 265 | 266 | chr = (*current_chr).first; 267 | //cout<<"chr: "< len) { //2: Pick a chromosome: 272 | int pos = rand() % (int) (genome.size()); 273 | 274 | //check if selected chr is not already covered enough: 275 | 276 | 277 | chr = "";//(*genome.begin()).first; //check that again... 
278 | for (std::map::iterator j = genome.begin(); j != genome.end() && pos >= 0; j++) { 279 | if (pos == 0) { 280 | while(cov_reported[(*j).first]<0){ 281 | j++; 282 | if(j==genome.end()){ 283 | j=genome.begin(); 284 | } 285 | } 286 | chr = (*j).first; 287 | } 288 | pos--; 289 | } 290 | if(!chr.empty()){ 291 | len = genome[chr].size(); 292 | 293 | }else if(counter>30){ 294 | break; 295 | }else{ 296 | counter++; 297 | } 298 | } 299 | if(chr.empty()){ 300 | std::cerr<<"Nothing left to simulate from."<::iterator i = genome.begin(); i != genome.end(); i++) { 302 | // cout<< (*i).first <<" left: "< 0.1) { 331 | break; 332 | } 333 | read[j] = new_nuc('N'); 334 | } 335 | double bp = (rand() % 1000000); //bp probability 336 | bp = bp / 1000000; 337 | if (bp < error_profile[j].match) { 338 | final_read += read[j]; 339 | } else if (bp < error_profile[j].mismatch + error_profile[j].match) { 340 | final_read += new_nuc(read[j]); 341 | } else if (bp < error_profile[j].ins + error_profile[j].match + error_profile[j].mismatch) { 342 | final_read += read[j]; 343 | final_read += new_nuc('N'); 344 | } 345 | } 346 | attempts--; 347 | // cout << "Ns: " << num_N <<" size: "<<(double) read.size() << endl; 348 | } while (num_N / (double) read.size() > 0.1 && attempts != 0);*/ 349 | 350 | //ignore if there are N's because of even out the coverage.. 351 | start_pos = (size_t) rand() % (genome[chr].size() - size + 2); 352 | read = genome[chr].substr(start_pos, size); 353 | if (read.empty()) { 354 | cerr << "ERROR! Read is empty!" 
<< endl; 355 | exit(1); 356 | } 357 | //apply sequencing errors: 358 | bool do_not_report = false; 359 | for (size_t j = 0; j < read.size(); j++) { 360 | if (read[j] == 'N' || read[j] == 'n') { 361 | num_N++; 362 | if (num_N / (double) read.size() > 0.8) { 363 | do_not_report = true; 364 | break; 365 | } 366 | // read[j] = new_nuc('N'); 367 | } 368 | 369 | double bp = (rand() % 1000000); //bp probability 370 | bp = bp / 1000000; 371 | if (bp < error_profile[j].match) { 372 | final_read += read[j]; 373 | } else if (bp < error_profile[j].mismatch + error_profile[j].match) { 374 | final_read += new_nuc(read[j]); 375 | } else if (bp < error_profile[j].ins + error_profile[j].match + error_profile[j].mismatch) { 376 | final_read += read[j]; 377 | final_read += new_nuc('N'); 378 | } 379 | 380 | } 381 | 382 | if (!do_not_report) { 383 | //if (attempts != 0) { 384 | bool flag = true; 385 | if (rand() % 100 < 51) { 386 | flag = false; 387 | //reverse read. 388 | std::string new_read; 389 | for (std::string::reverse_iterator ri = final_read.rbegin(); ri != final_read.rend(); ri++) { 390 | new_read += complementbp((*ri)); 391 | } 392 | final_read = new_read; 393 | } 394 | 395 | std::stringstream name; 396 | name << chr; 397 | name << "_"; 398 | name << start_pos; 399 | if (flag) { 400 | name << "_+"; 401 | } else { 402 | name << "_-"; 403 | } 404 | 405 | fprintf(file, "%s", name.str().c_str()); 406 | fprintf(file, "%c", '\n'); 407 | fprintf(file, "%s", final_read.c_str()); 408 | fprintf(file, "%c", '\n'); 409 | 410 | if (i % 10000 == 0 && prev < (i * 100) / num_reads) { 411 | prev = (i * 100) / num_reads; 412 | std::cout << "\t\tReads simulated: " << prev << "%" << '\r' << std::flush; 413 | //cout << "\t\tReads simulated: " << prev << "%" << std::endl; 414 | } 415 | } 416 | 417 | cov_reported[chr] -= final_read.size(); 418 | if (cov_reported[chr] < 0) { 419 | current_chr++; // we reached out targeted coverage lets select the next one. 
420 | } 421 | if (current_chr == genome.end()) { 422 | std::cerr << "End of genome" << endl; 423 | break; 424 | } 425 | // } 426 | } 427 | 428 | fclose(file); 429 | } 430 | -------------------------------------------------------------------------------- /src/phasing/Phasing_vcf.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Phasing_vcf.cpp 3 | * 4 | * Created on: Sep 26, 2018 5 | * Author: sedlazec 6 | */ 7 | 8 | #include "Phasing_vcf.h" 9 | std::vector parse_hapcut2(std::string hapcut_output) { 10 | std::vector snps; 11 | 12 | std::string buffer; 13 | std::ifstream myfile; 14 | myfile.open(hapcut_output.c_str(), std::ifstream::in); 15 | if (!myfile.good()) { 16 | std::cout << "Hapcut Parser: could not open file: " << hapcut_output.c_str() << std::endl; 17 | exit(0); 18 | } 19 | getline(myfile, buffer); 20 | int phase_block_id = 0; 21 | while (!myfile.eof()) { 22 | if (buffer[0] != 'B') { 23 | 24 | if (buffer[0] == '*') { 25 | //store new 26 | phase_block_id++; 27 | } else { 28 | snp_str tmp; 29 | tmp.parental = 0; 30 | tmp.gatk = 0; 31 | tmp.phase_block = phase_block_id; 32 | tmp.ratio = -1; 33 | //parse the actual snps: 34 | int count = 0; 35 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 36 | if (count == 1 && buffer[i] == '1') { 37 | tmp.haplotype = true; 38 | } 39 | if (count == 2 && buffer[i] == '1') { 40 | tmp.haplotype = false; 41 | } 42 | if (count == 3 && buffer[i] != '\t') { 43 | tmp.chr += buffer[i]; 44 | } 45 | if (count == 4 && buffer[i - 1] == '\t') { 46 | tmp.position = atoi(&buffer[i]); 47 | } 48 | if (count == 6 && buffer[i] != '\t') { 49 | tmp.alt_allele = buffer[i]; 50 | break; 51 | } 52 | if (buffer[i] == '\t') { 53 | count++; 54 | } 55 | } 56 | // if (strcmp(tmp.chr.c_str(), "17") == 0) { 57 | snps.push_back(tmp); 58 | 59 | // } 60 | 61 | } 62 | 63 | } 64 | getline(myfile, buffer); 65 | } 66 | 67 | myfile.close(); 68 | return snps; 69 | } 70 | 71 | 
void update_parents_xatlas(std::string parents_vcf, std::vector &snps) { 72 | std::string buffer; 73 | std::ifstream myfile; 74 | myfile.open(parents_vcf.c_str(), std::ifstream::in); 75 | if (!myfile.good()) { 76 | std::cout << "Hapcut Parser: could not open file: " << parents_vcf.c_str() << std::endl; 77 | // exit(0); 78 | } else { 79 | getline(myfile, buffer); 80 | while (!myfile.eof()) { 81 | int pos = 0; 82 | std::string chr; 83 | char alt_allele = ' '; 84 | short parental = -1; //0=na ; 1=father; 2=mother; 85 | int count = 0; 86 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 87 | if (count == 0 && buffer[i] != '\t') { 88 | chr += buffer[i]; 89 | } 90 | if (count == 1 && buffer[i - 1] == '\t') { 91 | pos = atoi(&buffer[i]); 92 | } 93 | if (count == 4 && buffer[i] != '\t') { 94 | alt_allele = buffer[i]; 95 | } 96 | if (count == 9 && buffer[i - 1] == '\t') { 97 | if (buffer[i] != '.') { 98 | // std::cout<<"SET mother: "< &snps) { 234 | 235 | std::string buffer; 236 | std::ifstream myfile; 237 | myfile.open(snp_output.c_str(), std::ifstream::in); 238 | if (!myfile.good()) { 239 | std::cout << "SNP Parser: could not open file: " << snp_output.c_str() << std::endl; 240 | //exit(0); 241 | } else { 242 | getline(myfile, buffer); 243 | while (!myfile.eof()) { 244 | int count = 0; 245 | int pos = 0; 246 | int index = -1; 247 | std::string chr = ""; 248 | 249 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 250 | 251 | if (count == 0 && buffer[i] != '\t') { 252 | chr += buffer[i]; 253 | } 254 | if (count == 1 && buffer[i - 1] == '\t') { 255 | pos = atoi(&buffer[i]); 256 | index = is_in(chr, pos, snps); 257 | 258 | } 259 | if (count == 6 && buffer[i] != '\t') { 260 | if (index != -1) { 261 | snps[index].qual += buffer[i]; 262 | } 263 | } 264 | if (count == 9 && buffer[i - 1] == '\t') { 265 | if (index != -1) { 266 | snps[index].ratio = parse_gt(&buffer[i]); 267 | } 268 | } 269 | 270 | if 
(buffer[i] == '\t') { 271 | count++; 272 | } 273 | 274 | } 275 | getline(myfile, buffer); 276 | } 277 | 278 | myfile.close(); 279 | } 280 | } 281 | 282 | void update_parents_snp_array(std::string parents_vcf, std::vector & snps) { 283 | std::string buffer; 284 | std::ifstream myfile; 285 | myfile.open(parents_vcf.c_str(), std::ifstream::in); 286 | if (!myfile.good()) { 287 | std::cout << "GATK Parser: could not open file: " << parents_vcf.c_str() << std::endl; 288 | //exit(0); 289 | } else { 290 | getline(myfile, buffer); 291 | while (!myfile.eof()) { 292 | int pos = 0; 293 | std::string chr; 294 | std::pair father; 295 | std::pair mother; 296 | int count = 0; 297 | for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) { 298 | if (count == 0 && buffer[i] != '\t') { 299 | chr += buffer[i]; 300 | } 301 | if (count == 1 && buffer[i - 1] == '\t') { 302 | pos = atoi(&buffer[i]); 303 | } 304 | //magic: record all alleles (father + mother) and determine the ref allele later: 305 | if (count == 4 && buffer[i - 1] == '\t') { 306 | father.first = buffer[i]; 307 | } 308 | if (count == 5 && buffer[i - 1] == '\t') { 309 | father.second = buffer[i]; 310 | } 311 | if (count == 7 && buffer[i - 1] == '\t') { 312 | mother.first = buffer[i]; 313 | } 314 | if (count == 8 && buffer[i - 1] == '\t') { 315 | mother.second = buffer[i]; 316 | } 317 | 318 | if (buffer[i] == '\t') { 319 | count++; 320 | } 321 | } 322 | 323 | if (strncmp(chr.c_str(), "14", 2) == 0 && (mother.first!=father.first ||mother.second!=father.second ) ) { 324 | for (size_t i = 0; i < snps.size(); i++) { 325 | if (snps[i].position == pos && strncmp(snps[i].chr.c_str(), chr.c_str(), chr.size()) == 0) { 326 | //found the snp in the offspring (hapcut2 output) 327 | if((snps[i].alt_allele== father.first ||snps[i].alt_allele== father.second) && !(snps[i].alt_allele== mother.first ||snps[i].alt_allele== mother.second)){ 328 | snps[i].gatk=1; 329 | }else if(!(snps[i].alt_allele== father.first 
||snps[i].alt_allele== father.second) && (snps[i].alt_allele== mother.first ||snps[i].alt_allele== mother.second)){ 330 | snps[i].gatk=2; 331 | }else{ 332 | snps[i].gatk=0; 333 | } 334 | 335 | break; 336 | } 337 | } 338 | } 339 | 340 | getline(myfile, buffer); 341 | } 342 | myfile.close(); 343 | } 344 | 345 | } 346 | void parental_phasing(std::string parents_vcf, std::string hapcut_output, std::string gatk_output, std::string snp_file, std::string output) { 347 | std::vector snps = parse_hapcut2(hapcut_output); 348 | std::cout<< "Extracted: "<